@amaster.ai/tts-client 1.1.9 → 1.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,176 +1,1002 @@
1
1
  // src/tts-client.ts
2
2
  var TTS_PATH = "/api/proxy/builtin/platform/qwen-tts/api-ws/v1/realtime";
3
- function createTTSClient(config) {
3
// Default upper bound (in weighted characters) for one fragment produced by splitTextIntoFragments.
+ var TTS_MAX_FRAGMENT_LENGTH = 2e3;
4
/**
 * Reports whether a global `window` binding is defined.
 *
 * @returns {boolean} `true` when `window` exists, otherwise `false`.
 */
function isBrowserEnvironment() {
  const hasWindow = typeof window !== "undefined";
  return hasWindow;
}
7
+ function getWeightedTextLength(text) {
8
+ let length = 0;
9
+ for (const char of text) {
10
+ length += /[\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]/.test(char) ? 2 : 1;
11
+ }
12
+ return length;
13
+ }
14
+ function splitOversizedSegment(segment, maxLength) {
15
+ const fragments = [];
16
+ let current = "";
17
+ let currentLength = 0;
18
+ for (const char of segment) {
19
+ const charLength = getWeightedTextLength(char);
20
+ if (current && currentLength + charLength > maxLength) {
21
+ fragments.push(current);
22
+ current = char;
23
+ currentLength = charLength;
24
+ continue;
25
+ }
26
+ current += char;
27
+ currentLength += charLength;
28
+ }
29
+ if (current) {
30
+ fragments.push(current);
31
+ }
32
+ return fragments;
33
+ }
34
+ function splitTextIntoFragments(text, maxLength = TTS_MAX_FRAGMENT_LENGTH) {
35
+ const fragments = [];
36
+ const segments = text.match(/.+?(?:\r?\n+|$)/gs) ?? [];
37
+ const softBreakPattern = /(?<=[。!?;.!?;])|(?<=[,、,::])|(?<=\s)/;
38
+ let current = "";
39
+ const pushFragment = (fragment) => {
40
+ const trimmed = fragment.trim();
41
+ if (trimmed) {
42
+ fragments.push(trimmed);
43
+ }
44
+ };
45
+ const appendPart = (part) => {
46
+ if (!part.trim()) {
47
+ return;
48
+ }
49
+ if (getWeightedTextLength(part) > maxLength) {
50
+ for (const fragment of splitOversizedSegment(part, maxLength)) {
51
+ pushFragment(fragment);
52
+ }
53
+ current = "";
54
+ return;
55
+ }
56
+ const next = current ? `${current}${part}` : part;
57
+ if (getWeightedTextLength(next) <= maxLength) {
58
+ current = next;
59
+ return;
60
+ }
61
+ pushFragment(current);
62
+ current = part;
63
+ };
64
+ for (const segment of segments) {
65
+ const parts = segment.split(softBreakPattern).filter((part) => part.trim());
66
+ if (!parts.length) {
67
+ continue;
68
+ }
69
+ for (const part of parts) {
70
+ appendPart(part);
71
+ }
72
+ }
73
+ pushFragment(current);
74
+ return fragments;
75
+ }
76
/**
 * Normalizes line endings and collapses excess blank lines and spacing.
 *
 * @param {string} text - Raw input text.
 * @returns {string} Text with CRLF turned into LF, runs of 3+ newlines
 *   reduced to two, and runs of 2+ spaces/tabs reduced to one space.
 */
function normalizeWhitespace(text) {
  const unixNewlines = text.replace(/\r\n/g, "\n");
  const limitedBlankLines = unixNewlines.replace(/\n{3,}/g, "\n\n");
  const singleSpaced = limitedBlankLines.replace(/[ \t]{2,}/g, " ");
  return singleSpaced;
}
79
/**
 * Flattens Markdown table rows (lines that start and end with `|`) into
 * their cell texts joined by a fullwidth comma.
 *
 * Header separator rows such as `|---|:-:|` carry no content, so they are
 * replaced with an empty line instead of being emitted as dash/colon text.
 *
 * @param {string} text - Text that may contain Markdown tables.
 * @returns {string} Text with table rows flattened.
 */
function stripMarkdownTables(text) {
  const separatorCell = /^:?-+:?$/;
  return text.replace(/^\|(.+)\|$/gm, (_, row) => {
    const cells = row
      .split("|")
      .map((cell) => cell.trim())
      .filter(Boolean);
    if (cells.length > 0 && cells.every((cell) => separatorCell.test(cell))) {
      return "";
    }
    return cells.join("\uFF0C");
  });
}
85
/**
 * Removes common Markdown syntax while keeping the readable text.
 *
 * The rules run in order: fenced code blocks, inline code, images, links,
 * heading markers, block-quote markers, bullet markers, numbered-list
 * markers, and finally any remaining runs of `*`, `_`, `~`, `#`, `>`.
 *
 * @param {string} text - Markdown text.
 * @returns {string} Text with the markup removed.
 */
function stripMarkdown(text) {
  const rules = [
    [/```[\s\S]*?```/g, " "],
    [/`([^`]+)`/g, "$1"],
    [/!\[([^\]]*)\]\([^)]+\)/g, "$1"],
    [/\[([^\]]+)\]\(([^)]+)\)/g, "$1"],
    [/^#{1,6}\s+/gm, ""],
    [/^\s*>\s?/gm, ""],
    [/^\s*[-*+]\s+/gm, ""],
    [/^\s*\d+\.\s+/gm, ""],
    [/[*_~#>]+/g, ""]
  ];
  let result = text;
  for (const [pattern, replacement] of rules) {
    result = result.replace(pattern, replacement);
  }
  return result;
}
88
/**
 * Replaces every `http://` / `https://` URL (up to the next whitespace)
 * with a single space.
 *
 * @param {string} text - Input text.
 * @returns {string} Text without URLs.
 */
function stripUrls(text) {
  const urlPattern = /https?:\/\/\S+/gi;
  return text.replace(urlPattern, " ");
}
91
/**
 * Replaces each code point in U+1F000–U+1FAFF or U+2600–U+27BF with a
 * single space.
 *
 * @param {string} text - Input text.
 * @returns {string} Text with those code points blanked out.
 */
function stripEmojiAndSymbols(text) {
  const symbolPattern = /[\u{1F000}-\u{1FAFF}\u{2600}-\u{27BF}]/gu;
  return text.replace(symbolPattern, " ");
}
94
/**
 * Prepares text for speech synthesis.
 *
 * Runs the cleanup helpers in order (whitespace normalization, table
 * flattening, Markdown removal, URL removal, emoji/symbol removal) and then
 * tidies the result: remaining `|` become fullwidth commas, trailing
 * spaces/tabs before a newline are dropped, newline runs collapse to one,
 * runs of spaces collapse to one, and the ends are trimmed.
 *
 * @param {string} text - Raw text.
 * @returns {string} Cleaned text (possibly empty).
 */
function preprocessTTSContent(text) {
  const stages = [
    normalizeWhitespace,
    stripMarkdownTables,
    stripMarkdown,
    stripUrls,
    stripEmojiAndSymbols
  ];
  let cleaned = text;
  for (const stage of stages) {
    cleaned = stage(cleaned);
  }
  cleaned = cleaned.replace(/[|]/g, "\uFF0C");
  cleaned = cleaned.replace(/[ \t]+\n/g, "\n");
  cleaned = cleaned.replace(/\n+/g, "\n");
  cleaned = cleaned.replace(/[ ]{2,}/g, " ");
  return cleaned.trim();
}
102
/**
 * Decodes a base64 string into its raw bytes.
 *
 * @param {string} chunk - Base64-encoded data.
 * @returns {Uint8Array} Decoded bytes.
 */
function decodeBase64Chunk(chunk) {
  // `atob` yields one character per byte, so each char code is a byte value.
  const decoded = atob(chunk);
  return Uint8Array.from(decoded, (ch) => ch.charCodeAt(0));
}
110
/**
 * Joins several byte arrays into one contiguous `Uint8Array`.
 *
 * @param {Uint8Array[]} items - Arrays to join, in order.
 * @returns {Uint8Array} A new array holding all bytes back to back.
 */
function concatUint8Arrays(items) {
  const totalLength = items.reduce((sum, item) => sum + item.length, 0);
  const joined = new Uint8Array(totalLength);
  let writeAt = 0;
  items.forEach((item) => {
    joined.set(item, writeAt);
    writeAt += item.length;
  });
  return joined;
}
123
/**
 * Prepends a 44-byte RIFF/WAVE header to raw PCM bytes.
 *
 * The header written here declares format tag 1, one channel, 16 bits per
 * sample, a block align of 2 and a byte rate of `sampleRate * 2`.
 *
 * @param {Uint8Array} pcmBytes - Raw PCM payload.
 * @param {number} sampleRate - Sample rate written into the header.
 * @returns {Uint8Array} Header followed by the PCM payload.
 */
function pcmToWav(pcmBytes, sampleRate) {
  const payloadSize = pcmBytes.byteLength;
  const headerBuffer = new ArrayBuffer(44);
  const header = new DataView(headerBuffer);
  const putTag = (at, tag) => {
    [...tag].forEach((ch, position) => {
      header.setUint8(at + position, ch.charCodeAt(0));
    });
  };

  putTag(0, "RIFF");
  header.setUint32(4, 36 + payloadSize, true); // bytes following this field
  putTag(8, "WAVE");
  putTag(12, "fmt ");
  header.setUint32(16, 16, true); // size of the fmt chunk
  header.setUint16(20, 1, true); // format tag
  header.setUint16(22, 1, true); // channel count
  header.setUint32(24, sampleRate, true);
  header.setUint32(28, sampleRate * 2, true); // byte rate
  header.setUint16(32, 2, true); // block align
  header.setUint16(34, 16, true); // bits per sample
  putTag(36, "data");
  header.setUint32(40, payloadSize, true);

  return concatUint8Arrays([new Uint8Array(headerBuffer), pcmBytes]);
}
147
/**
 * Encodes bytes as a base64 `data:` URI.
 *
 * @param {Uint8Array} bytes - Bytes to encode.
 * @param {string} mimeType - MIME type placed in the URI.
 * @returns {string} `data:<mimeType>;base64,<payload>`.
 */
function bytesToDataUri(bytes, mimeType) {
  // Build the one-character-per-byte string that `btoa` expects.
  const binary = Array.from(bytes, (byte) => String.fromCharCode(byte)).join("");
  const payload = btoa(binary);
  return `data:${mimeType};base64,${payload}`;
}
154
/**
 * Creates a playback backend built on the Web Audio API.
 *
 * Supports two modes: buffered playback of a complete list of base64 PCM
 * chunks, and streaming playback where each chunk is scheduled right after
 * the previous one.
 *
 * Compared with a naive scheduler this version
 *  - skips chunks that hold less than one 16-bit sample, because an
 *    `AudioBuffer` cannot be constructed with length 0, and
 *  - re-anchors the schedule cursor when it has fallen behind the context
 *    clock, so late chunks do not all start at once and overlap.
 *
 * @param {object} config
 * @param {number} config.sampleRate - Sample rate used for the context and buffers.
 * @param {() => void} [config.onAudioStart] - Called when playback starts.
 * @param {() => void} [config.onAudioEnd] - Called when playback has finished.
 * @param {(error: Error) => void} [config.onError] - Called for unsupported formats.
 * @returns {object} Backend with `kind`, `hasStreamingPlayback`, `isPlaying`,
 *   `stop`, `playBuffered`, `scheduleStreamingChunk`, `finalizeStreaming`, `close`.
 */
function createBrowserPlaybackBackend(config) {
  const { sampleRate, onAudioStart, onAudioEnd, onError } = config;
  // Delay between "now" and the start of a freshly anchored chunk.
  const leadTime = 0.05;
  const audioSources = new Set();
  let audioContext = null;
  let nextScheduleTime = 0;
  let streamEndHandled = false;
  let streamingStarted = false;

  // Lazily creates the AudioContext; returns null where the API is missing.
  const ensureAudioContext = () => {
    if (!audioContext && typeof AudioContext !== "undefined") {
      audioContext = new AudioContext({ sampleRate });
    }
    return audioContext;
  };

  // Converts little-endian 16-bit PCM into a mono AudioBuffer, or returns
  // null when the input holds no complete sample.
  const createAudioBufferFromPCM = (bytes) => {
    const numSamples = Math.floor(bytes.length / 2);
    if (numSamples === 0) {
      return null;
    }
    const buffer = new AudioBuffer({ length: numSamples, sampleRate });
    const channelData = buffer.getChannelData(0);
    const dataView = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
    for (let index = 0; index < numSamples; index += 1) {
      channelData[index] = dataView.getInt16(index * 2, true) / 32768;
    }
    return buffer;
  };

  const resetStreamingState = () => {
    nextScheduleTime = 0;
    streamEndHandled = false;
    streamingStarted = false;
  };

  const stop = () => {
    for (const source of audioSources) {
      try {
        // Detach the handler first so stopping does not report an "end".
        source.onended = null;
        source.stop();
      } catch {
        // Best effort: stopping a source can throw; it is disconnected below.
      }
      source.disconnect();
    }
    audioSources.clear();
    resetStreamingState();
  };

  const reportUnsupportedFormat = (audioFormat) => {
    onError?.(new Error(`Built-in playback only supports pcm, got ${audioFormat}`));
  };

  return {
    kind: "browser",
    hasStreamingPlayback: true,
    isPlaying() {
      return audioSources.size > 0;
    },
    stop,
    async playBuffered(input) {
      if (input.audioFormat !== "pcm") {
        reportUnsupportedFormat(input.audioFormat);
        return;
      }
      const context = ensureAudioContext();
      if (!context || !input.chunks.length) {
        return;
      }
      stop();
      const combined = concatUint8Arrays(input.chunks.map((chunk) => decodeBase64Chunk(chunk)));
      const buffer = createAudioBufferFromPCM(combined);
      if (!buffer) {
        return;
      }
      const source = context.createBufferSource();
      if (context.state === "suspended") {
        await context.resume();
      }
      source.buffer = buffer;
      source.connect(context.destination);
      audioSources.add(source);
      source.onended = () => {
        audioSources.delete(source);
        source.disconnect();
        onAudioEnd?.();
      };
      onAudioStart?.();
      source.start(0);
    },
    async scheduleStreamingChunk(input) {
      if (input.audioFormat !== "pcm") {
        reportUnsupportedFormat(input.audioFormat);
        return;
      }
      const context = ensureAudioContext();
      if (!context) {
        return;
      }
      if (context.state === "suspended") {
        await context.resume();
      }
      const chunkBuffer = createAudioBufferFromPCM(decodeBase64Chunk(input.chunk));
      if (!chunkBuffer) {
        return;
      }
      const source = context.createBufferSource();
      // Anchor the first chunk, and re-anchor after an underrun: a start
      // time in the past would begin immediately and overlap later chunks.
      if (!nextScheduleTime || nextScheduleTime < context.currentTime) {
        nextScheduleTime = context.currentTime + leadTime;
      }
      source.buffer = chunkBuffer;
      source.connect(context.destination);
      audioSources.add(source);
      source.onended = () => {
        audioSources.delete(source);
        source.disconnect();
        // Only the last source of a finalized stream reports the end.
        if (streamEndHandled && audioSources.size === 0) {
          resetStreamingState();
          onAudioEnd?.();
        }
      };
      if (!streamingStarted) {
        streamingStarted = true;
        onAudioStart?.();
      }
      source.start(nextScheduleTime);
      nextScheduleTime += chunkBuffer.duration;
    },
    finalizeStreaming() {
      streamEndHandled = true;
      // Everything already finished playing: report the end right away.
      if (audioSources.size === 0 && streamingStarted) {
        resetStreamingState();
        onAudioEnd?.();
      }
    },
    close() {
      stop();
      if (audioContext) {
        // Closing is best effort; a failure here is deliberately ignored.
        void audioContext.close().catch(() => {});
        audioContext = null;
      }
    }
  };
}
285
/**
 * Creates a playback backend for a Taro mini-program runtime.
 *
 * Audio is played through `Taro.createInnerAudioContext()`. PCM chunks are
 * wrapped into a WAV container and written to a temp file under
 * `Taro.env.USER_DATA_PATH`; when no file system is available a `data:` URI
 * is used instead. Only buffered playback is offered.
 *
 * A failure to write the temp file is reported through `onError` instead of
 * rejecting `playBuffered`, so fire-and-forget callers do not produce an
 * unhandled rejection.
 *
 * @param {object} config
 * @param {object} [config.runtime] - Object exposing the `Taro` API.
 * @param {() => void} [config.onAudioStart] - Called when the audio context reports play.
 * @param {() => void} [config.onAudioEnd] - Called when the audio context reports ended.
 * @param {(error: Error) => void} [config.onError] - Called on playback or file errors.
 * @returns {object} Backend with `kind`, `hasStreamingPlayback`, `isPlaying`,
 *   `stop`, `playBuffered`, `close`.
 */
function createMiniProgramPlaybackBackend(config) {
  const { runtime, onAudioStart, onAudioEnd, onError } = config;
  const taro = runtime?.Taro;
  const createInnerAudioContext = taro?.createInnerAudioContext;
  const getFileSystemManager = taro?.getFileSystemManager;
  const userDataPath = taro?.env?.USER_DATA_PATH;
  let innerAudio = createInnerAudioContext?.() ?? null;
  let currentTempFile = null;
  let playing = false;

  // Wires the current inner audio context to the `playing` flag and callbacks.
  const bindEvents = () => {
    innerAudio?.onPlay?.(() => {
      playing = true;
      onAudioStart?.();
    });
    innerAudio?.onEnded?.(() => {
      playing = false;
      onAudioEnd?.();
    });
    innerAudio?.onStop?.(() => {
      playing = false;
    });
    innerAudio?.onError?.((error) => {
      playing = false;
      onError?.(new Error(error?.errMsg || "Mini-program audio playback failed"));
    });
  };
  bindEvents();

  // Deletes the temp file of the previous playback, ignoring failures.
  const cleanupTempFile = () => {
    if (!currentTempFile) {
      return;
    }
    const pathToDelete = currentTempFile;
    currentTempFile = null;
    getFileSystemManager?.()?.unlink?.({
      filePath: pathToDelete,
      fail: () => {}
    });
  };

  // Returns a playable source: a temp file path, or a data URI as fallback.
  const writeTempAudioFile = async (bytes) => {
    if (!userDataPath || !getFileSystemManager) {
      return bytesToDataUri(bytes, "audio/wav");
    }
    const filePath = `${userDataPath}/amaster-tts-${Date.now()}-${Math.random().toString(16).slice(2)}.wav`;
    const fsManager = getFileSystemManager();
    if (!fsManager?.writeFile) {
      return bytesToDataUri(bytes, "audio/wav");
    }
    await new Promise((resolve, reject) => {
      fsManager.writeFile({
        filePath,
        // Copy exactly the view's bytes; the backing buffer may be larger.
        data: bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.byteLength),
        success: () => resolve(),
        fail: (error) => reject(error)
      });
    });
    cleanupTempFile();
    currentTempFile = filePath;
    return filePath;
  };

  return {
    kind: "mini-program",
    hasStreamingPlayback: false,
    isPlaying() {
      return playing;
    },
    stop() {
      innerAudio?.stop?.();
      playing = false;
    },
    async playBuffered(input) {
      if (input.audioFormat !== "pcm") {
        onError?.(new Error(`Mini-program built-in playback only supports pcm, got ${input.audioFormat}`));
        return;
      }
      const pcmChunks = input.chunks.map((chunk) => decodeBase64Chunk(chunk));
      const wavBytes = pcmToWav(concatUint8Arrays(pcmChunks), input.sampleRate);
      let source;
      try {
        source = await writeTempAudioFile(wavBytes);
      } catch (error) {
        // The file-system callback hands over a plain object, not an Error.
        onError?.(
          new Error(error?.errMsg || error?.message || "Failed to write mini-program audio file")
        );
        return;
      }
      if (!innerAudio && createInnerAudioContext) {
        // Recreate the context after `close()` destroyed the previous one.
        innerAudio = createInnerAudioContext();
        bindEvents();
      }
      if (!innerAudio?.play) {
        onError?.(new Error("Mini-program audio context is unavailable"));
        return;
      }
      innerAudio.src = source;
      innerAudio.autoplay = false;
      innerAudio.obeyMuteSwitch = false;
      innerAudio.play();
    },
    close() {
      innerAudio?.stop?.();
      innerAudio?.destroy?.();
      innerAudio = null;
      playing = false;
      cleanupTempFile();
    }
  };
}
385
/**
 * Picks the playback backend for the current environment.
 *
 * The mini-program backend is chosen when the supplied runtime exposes
 * `Taro.createInnerAudioContext` and no `window` exists; otherwise the
 * browser backend is chosen when `AudioContext` is defined.
 *
 * @param {object} config - Backend configuration (runtime, sample rate, callbacks).
 * @returns {object|null} A playback backend, or `null` when none applies.
 */
function resolvePlaybackBackend(config) {
  const hasInnerAudio = Boolean(config.runtime?.Taro?.createInnerAudioContext);
  if (hasInnerAudio && !isBrowserEnvironment()) {
    return createMiniProgramPlaybackBackend(config);
  }
  const hasWebAudio = typeof AudioContext !== "undefined";
  return hasWebAudio ? createBrowserPlaybackBackend(config) : null;
}
394
/**
 * Creates a TTS client that talks to the realtime endpoint over a WebSocket
 * and optionally plays the returned audio through a playback backend.
 *
 * Connection handling is tied to the specific socket instance:
 *  - a close/settle event from a socket that is no longer the current one
 *    does not touch the shared `ws` / `connected` state, and
 *  - `connect()` rejects if its socket closes before the session is ready,
 *    instead of leaving the promise pending forever.
 *
 * @param {object} config
 * @param {Function} [config.getAccessToken] - Accepted but not used by this function.
 * @param {string} [config.voice="Cherry"] - Voice sent in `session.update`.
 * @param {boolean} [config.autoPlay=true] - Play audio as it arrives / when done.
 * @param {string} [config.audioFormat="pcm"] - Requested response format.
 * @param {number} [config.sampleRate=24000] - Requested sample rate.
 * @param {object} [config.runtime] - Runtime handed to the playback backend resolver.
 * @param {() => void} [config.onReady] - Called on `session.updated`.
 * @param {() => void} [config.onAudioStart] - Forwarded to the playback backend.
 * @param {() => void} [config.onAudioEnd] - Forwarded to the playback backend.
 * @param {(chunks: string[]) => void} [config.onAudioChunk] - Receives a copy of all chunks so far.
 * @param {(error: Error) => void} [config.onError] - Called on server, socket or handler errors.
 * @param {() => void} [config.onClose] - Called at the end of `close()`.
 * @returns {object} Client API (`connect`, `speak`, `startStream`, `appendText`,
 *   `commitText`, `play`, `stop`, `close` and state getters).
 */
function createRawTTSClient(config) {
  const {
    getAccessToken,
    voice = "Cherry",
    autoPlay = true,
    audioFormat = "pcm",
    sampleRate = 24e3,
    runtime,
    onReady,
    onAudioStart,
    onAudioEnd,
    onAudioChunk,
    onError,
    onClose
  } = config;
  // Kept for interface compatibility; the URL below is built without a token.
  void getAccessToken;

  const backendConfig = { runtime, sampleRate, onAudioStart, onAudioEnd, onError };
  let ws = null;
  let connected = false;
  let audioChunks = [];
  let responseDone = false;
  let autoPlayed = false;
  let playbackSuppressed = false;
  let playbackBackend = resolvePlaybackBackend({ ...backendConfig });

  function buildWsUrl() {
    return TTS_PATH;
  }

  function play() {
    playbackSuppressed = false;
    if (!audioChunks.length || !playbackBackend) {
      return;
    }
    // While a streaming backend is still receiving audio, chunks are being
    // scheduled as they arrive, so there is nothing to replay yet.
    if (!responseDone && playbackBackend.hasStreamingPlayback) {
      return;
    }
    void playbackBackend.playBuffered({
      chunks: [...audioChunks],
      sampleRate,
      audioFormat
    });
  }

  function stop() {
    playbackSuppressed = true;
    playbackBackend?.stop();
  }

  function close() {
    stop();
    if (ws) {
      ws.close();
      ws = null;
    }
    connected = false;
    playbackBackend?.close();
    // A fresh backend is resolved so the client object can be reused.
    playbackBackend = resolvePlaybackBackend({ ...backendConfig });
    onClose?.();
  }

  async function handleMessage(socket, event, settleResolve, settleReject) {
    const data = JSON.parse(event.data);
    switch (data.type) {
      case "session.created":
        socket.send(
          JSON.stringify({
            type: "session.update",
            session: {
              mode: "server_commit",
              voice,
              response_format: audioFormat,
              sample_rate: sampleRate
            }
          })
        );
        return;
      case "session.updated":
        onReady?.();
        settleResolve();
        return;
      case "response.audio.delta":
        audioChunks.push(data.delta);
        onAudioChunk?.([...audioChunks]);
        if (autoPlay && !playbackSuppressed && playbackBackend?.hasStreamingPlayback) {
          autoPlayed = true;
          await playbackBackend.scheduleStreamingChunk?.({
            chunk: data.delta,
            sampleRate,
            audioFormat
          });
        }
        return;
      case "response.audio.done":
        responseDone = true;
        onAudioChunk?.([...audioChunks]);
        if (playbackBackend?.hasStreamingPlayback) {
          playbackBackend.finalizeStreaming?.();
          return;
        }
        if (autoPlay && !playbackSuppressed && !autoPlayed) {
          autoPlayed = true;
          await playbackBackend?.playBuffered({
            chunks: [...audioChunks],
            sampleRate,
            audioFormat
          });
        }
        return;
      case "error": {
        const error = new Error(data.error?.message || "Unknown TTS error");
        onError?.(error);
        settleReject(error);
        return;
      }
      default:
        return;
    }
  }

  async function connect() {
    if (connected && ws?.readyState === WebSocket.OPEN) {
      return;
    }
    await new Promise((resolve, reject) => {
      const socket = new WebSocket(buildWsUrl());
      ws = socket;
      let settled = false;
      // Shared state may only be changed on behalf of the current socket.
      const isCurrent = () => ws === socket;
      const settleResolve = () => {
        if (settled) {
          return;
        }
        settled = true;
        if (isCurrent()) {
          connected = true;
        }
        resolve();
      };
      const settleReject = (error) => {
        if (settled) {
          return;
        }
        settled = true;
        if (isCurrent()) {
          connected = false;
        }
        reject(error);
      };
      socket.onmessage = async (event) => {
        try {
          await handleMessage(socket, event, settleResolve, settleReject);
        } catch (error) {
          const parsedError = error instanceof Error ? error : new Error(String(error));
          onError?.(parsedError);
          settleReject(parsedError);
        }
      };
      socket.onerror = () => {
        const error = new Error("WebSocket connection error");
        onError?.(error);
        settleReject(error);
      };
      socket.onclose = () => {
        if (isCurrent()) {
          connected = false;
          ws = null;
        }
        // No-op once the handshake settled; otherwise unblocks the caller.
        settleReject(new Error("WebSocket closed before the session was ready"));
      };
    });
  }

  function resetSynthesisState() {
    stop();
    audioChunks = [];
    responseDone = false;
    autoPlayed = false;
    playbackSuppressed = false;
  }

  function ensureSocketReady() {
    if (!ws || ws.readyState !== WebSocket.OPEN) {
      throw new Error("WebSocket not connected");
    }
  }

  function appendText(text) {
    const fragments = splitTextIntoFragments(preprocessTTSContent(text));
    if (!fragments.length) {
      return;
    }
    ensureSocketReady();
    const socket = ws;
    for (const fragment of fragments) {
      socket?.send(
        JSON.stringify({
          type: "input_text_buffer.append",
          text: fragment
        })
      );
    }
  }

  function commitText() {
    ensureSocketReady();
    ws?.send(
      JSON.stringify({
        type: "input_text_buffer.commit"
      })
    );
  }

  function startStream() {
    resetSynthesisState();
  }

  async function speak(text) {
    startStream();
    appendText(text);
    commitText();
  }

  return {
    connect,
    speak,
    startStream,
    appendText,
    commitText,
    play,
    stop,
    close,
    isConnected() {
      return connected && ws?.readyState === WebSocket.OPEN;
    },
    hasAudio() {
      return audioChunks.length > 0;
    },
    isResponseDone() {
      return responseDone;
    },
    isPlaying() {
      return playbackBackend?.isPlaying() ?? false;
    },
    isStreamingPlayback() {
      return playbackBackend?.hasStreamingPlayback ?? false;
    }
  };
}
168
- var tts_client_default = (authConfig) => {
169
- return (config) => {
170
- return createTTSClient({ ...authConfig, ...config });
631
/**
 * Builds the initial controller snapshot.
 *
 * @param {string|null} voice - Voice to store in the snapshot.
 * @returns {object} Idle snapshot with request id 0 and no text or error.
 */
function defaultSnapshot(voice) {
  const initial = {
    status: "idle",
    activeId: null,
    error: null,
    requestId: 0,
    text: null,
    voice,
    fallbackMode: "none"
  };
  return initial;
}
642
/**
 * Reports whether the browser speech synthesis API is available.
 *
 * @returns {boolean} `true` when running with a `window` that exposes both
 *   `speechSynthesis` and `SpeechSynthesisUtterance`.
 */
function canUseSystemSpeech() {
  if (!isBrowserEnvironment()) {
    return false;
  }
  const requiredApis = ["speechSynthesis", "SpeechSynthesisUtterance"];
  return requiredApis.every((api) => api in window);
}
645
/**
 * Speaks text through the browser's speech synthesis, cancelling whatever
 * is currently queued first.
 *
 * @param {string} text - Text to speak.
 * @param {object} options
 * @param {() => void} [options.onStart] - Called when the utterance starts.
 * @param {() => void} [options.onEnd] - Called when the utterance ends.
 * @param {(error: Error) => void} [options.onError] - Called when the utterance fails.
 * @throws {Error} When speech synthesis is not available.
 */
function systemSpeak(text, options) {
  if (!canUseSystemSpeech()) {
    throw new Error("SpeechSynthesis is not supported");
  }
  const synthesis = window.speechSynthesis;
  const utterance = new SpeechSynthesisUtterance(text);
  utterance.onstart = () => {
    options.onStart?.();
  };
  utterance.onerror = (event) => {
    const failure = new Error(event.error || "Speech synthesis failed");
    options.onError?.(failure);
  };
  utterance.onend = () => {
    options.onEnd?.();
  };
  synthesis.cancel();
  synthesis.speak(utterance);
}
662
/**
 * Cancels browser speech synthesis when that API is available; does
 * nothing otherwise.
 */
function stopSystemSpeech() {
  if (!canUseSystemSpeech()) {
    return;
  }
  window.speechSynthesis.cancel();
}
667
/**
 * Creates a controller that manages one TTS client at a time and exposes an
 * observable snapshot (`status`, `activeId`, `error`, `requestId`, `text`,
 * `voice`, `fallbackMode`).
 *
 * All returned methods are closures and do not rely on `this`, so they can
 * be detached or destructured safely (e.g. `const { toggle } = controller`).
 *
 * @param {(config: object) => object} createClient - Factory producing a TTS client.
 * @param {object} [options]
 * @param {string} [options.voiceStorageKey] - Key under which the voice is persisted.
 * @param {object} [options.storage] - Storage with `getItem` / `setItem` / optional `removeItem`.
 * @param {object} [options.runtime] - Runtime forwarded to created clients.
 * @param {boolean} [options.fallbackToSystemSpeech] - Set to `false` to disable the system-speech fallback.
 * @returns {object} Controller API.
 */
function createTTSSpeakController(createClient, options = {}) {
  const listeners = new Set();
  const persistedVoice =
    options.voiceStorageKey && options.storage
      ? options.storage.getItem(options.voiceStorageKey) || null
      : null;
  let client = null;
  let snapshot = defaultSnapshot(persistedVoice);
  let streamActive = false;
  let streamId = null;

  const emit = () => {
    for (const listener of listeners) {
      listener(snapshot);
    }
  };

  const setSnapshot = (next) => {
    snapshot = { ...snapshot, ...next };
    emit();
  };

  const persistVoice = (voice) => {
    if (!options.voiceStorageKey || !options.storage) {
      return;
    }
    if (!voice) {
      options.storage.removeItem?.(options.voiceStorageKey);
      return;
    }
    options.storage.setItem(options.voiceStorageKey, voice);
  };

  // Returns to the idle state, optionally keeping the last text and voice.
  const reset = (requestId, preserved) => {
    snapshot = {
      status: "idle",
      activeId: null,
      error: null,
      requestId,
      text: preserved?.text ?? null,
      voice: preserved?.voice ?? snapshot.voice,
      fallbackMode: "none"
    };
    emit();
  };

  const isBusy = () => snapshot.status === "connecting" || snapshot.status === "speaking";

  const isActive = (id) => {
    if (!id) {
      return isBusy();
    }
    return snapshot.activeId === id && isBusy();
  };

  const stop = (stopOptions) => {
    const preserveClient = stopOptions?.preserveClient ?? true;
    const nextRequestId = snapshot.requestId + 1;
    streamActive = false;
    streamId = null;
    client?.stop();
    stopSystemSpeech();
    if (!preserveClient || !client || !snapshot.text) {
      client?.close();
      client = null;
      reset(nextRequestId, { voice: snapshot.voice });
      return;
    }
    // The client (and its audio) is kept so the same text can be replayed.
    reset(nextRequestId, {
      text: snapshot.text,
      voice: snapshot.voice
    });
  };

  const createManagedClient = (input, requestId, content) => {
    const nextVoice = input.voice ?? snapshot.voice ?? void 0;
    // Callbacks from a client that has been replaced are ignored.
    const isCurrent = () => client === nextClient;
    const nextClient = createClient({
      voice: nextVoice,
      autoPlay: true,
      audioFormat: input.audioFormat,
      sampleRate: input.sampleRate,
      runtime: options.runtime,
      onReady: () => {
        if (!isCurrent()) {
          return;
        }
        setSnapshot({ status: "connecting", error: null });
      },
      onAudioStart: () => {
        if (!isCurrent()) {
          return;
        }
        setSnapshot({ status: "speaking", error: null, fallbackMode: "none" });
      },
      onAudioEnd: () => {
        if (!isCurrent()) {
          return;
        }
        setSnapshot({
          status: streamActive ? "connecting" : "idle",
          activeId: streamActive ? streamId : null,
          error: null,
          fallbackMode: "none"
        });
      },
      onError: (error) => {
        if (!isCurrent()) {
          return;
        }
        streamActive = false;
        streamId = null;
        setSnapshot({ status: "error", error: error.message, fallbackMode: "none" });
      },
      onClose: () => {
        if (!isCurrent()) {
          return;
        }
        client = null;
      }
    });
    client = nextClient;
    setSnapshot({
      status: "connecting",
      activeId: input.id ?? null,
      error: null,
      requestId,
      text: content,
      voice: nextVoice ?? null,
      fallbackMode: "none"
    });
    return { nextClient, nextVoice };
  };

  const ensureStreamClient = async (streamOptions) => {
    const nextRequestId = snapshot.requestId + 1;
    const nextVoice = streamOptions.voice ?? snapshot.voice ?? void 0;
    if (client && snapshot.voice === (nextVoice ?? null)) {
      streamActive = true;
      streamId = streamOptions.id ?? null;
      setSnapshot({
        status: client.isPlaying() ? "speaking" : "connecting",
        activeId: streamId,
        error: null,
        requestId: nextRequestId,
        voice: nextVoice ?? null,
        fallbackMode: "none"
      });
      client.startStream();
      return;
    }
    stop({ preserveClient: false });
    const { nextClient } = createManagedClient(
      {
        id: streamOptions.id,
        voice: streamOptions.voice,
        audioFormat: streamOptions.audioFormat,
        sampleRate: streamOptions.sampleRate
      },
      nextRequestId,
      ""
    );
    streamActive = true;
    streamId = streamOptions.id ?? null;
    await nextClient.connect();
    if (client !== nextClient) {
      // Replaced or released while connecting; do not touch the stale client.
      return;
    }
    nextClient.startStream();
  };

  const speak = async (speakOptions) => {
    const content = preprocessTTSContent(speakOptions.text);
    if (!content) {
      stop({ preserveClient: false });
      return;
    }
    const nextRequestId = snapshot.requestId + 1;
    const nextVoice = speakOptions.voice ?? snapshot.voice ?? void 0;
    const canReplay =
      client &&
      snapshot.text === content &&
      snapshot.voice === (nextVoice ?? null) &&
      client.hasAudio();
    if (canReplay) {
      setSnapshot({
        status: "speaking",
        activeId: speakOptions.id ?? null,
        error: null,
        requestId: nextRequestId,
        text: content,
        voice: nextVoice ?? null,
        fallbackMode: "none"
      });
      client.play();
      return;
    }
    stop({ preserveClient: false });
    streamActive = false;
    streamId = null;
    const { nextClient } = createManagedClient(
      {
        id: speakOptions.id,
        voice: speakOptions.voice,
        audioFormat: speakOptions.audioFormat,
        sampleRate: speakOptions.sampleRate
      },
      nextRequestId,
      content
    );
    try {
      await nextClient.connect();
      await nextClient.speak(content);
    } catch (error) {
      if (client !== nextClient) {
        // A newer request took over; its state must not be overwritten.
        return;
      }
      if (options.fallbackToSystemSpeech !== false && canUseSystemSpeech()) {
        client?.close();
        client = null;
        systemSpeak(content, {
          onStart: () => {
            setSnapshot({
              status: "speaking",
              error: null,
              activeId: speakOptions.id ?? null,
              requestId: nextRequestId,
              text: content,
              voice: nextVoice ?? null,
              fallbackMode: "system"
            });
          },
          onEnd: () => {
            setSnapshot({ status: "idle", activeId: null, error: null, fallbackMode: "none" });
          },
          onError: (fallbackError) => {
            setSnapshot({ status: "error", error: fallbackError.message, fallbackMode: "none" });
          }
        });
        return;
      }
      client?.close();
      client = null;
      setSnapshot({
        status: "error",
        error: error instanceof Error ? error.message : String(error),
        fallbackMode: "none"
      });
    }
  };

  return {
    getSnapshot() {
      return snapshot;
    },
    subscribe(listener) {
      listeners.add(listener);
      return () => {
        listeners.delete(listener);
      };
    },
    speak,
    async startStream(streamOptions) {
      await ensureStreamClient(streamOptions);
    },
    async appendStreamText(streamOptions) {
      const content = preprocessTTSContent(streamOptions.text);
      if (!content) {
        return;
      }
      if (!streamActive || streamId !== (streamOptions.id ?? null) || !client) {
        await ensureStreamClient(streamOptions);
      }
      client?.appendText(content);
      setSnapshot({
        status: snapshot.status === "speaking" ? "speaking" : "connecting",
        activeId: streamOptions.id ?? null,
        error: null,
        text: `${snapshot.text || ""}${content}`,
        fallbackMode: "none"
      });
    },
    commitStream() {
      if (!client || !streamActive) {
        return;
      }
      client.commitText();
      setSnapshot({
        status: snapshot.status === "speaking" ? "speaking" : "connecting",
        activeId: streamId,
        error: null,
        fallbackMode: "none"
      });
    },
    finishStream() {
      streamActive = false;
      streamId = null;
      setSnapshot({
        status: client?.isPlaying() ? "speaking" : "idle",
        activeId: client?.isPlaying() ? snapshot.activeId : null,
        error: null,
        fallbackMode: "none"
      });
    },
    stop,
    release() {
      stop({ preserveClient: false });
    },
    async toggle(toggleOptions) {
      if (isActive(toggleOptions.id)) {
        stop();
        return;
      }
      await speak(toggleOptions);
    },
    isActive,
    setVoice(voice) {
      setSnapshot({ voice });
      persistVoice(voice);
    },
    getVoice() {
      return snapshot.voice;
    }
  };
}
993
/**
 * Binds shared auth/base configuration and returns a client factory.
 *
 * @param {object} authConfig - Settings applied to every client.
 * @returns {(config: object) => object} Factory that merges `config` over
 *   `authConfig` (per-call values win) and creates a raw TTS client.
 */
function createTTSClient(authConfig) {
  return function createConfiguredClient(config) {
    const merged = { ...authConfig, ...config };
    return createRawTTSClient(merged);
  };
}
173
996
  export {
174
- tts_client_default as createTTSClient
997
+ createTTSClient,
998
+ createTTSSpeakController,
999
+ preprocessTTSContent,
1000
+ splitTextIntoFragments
175
1001
  };
176
1002
  //# sourceMappingURL=index.js.map