@supertone/supertone 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/FUNCTIONS.md +2 -2
- package/README.md +59 -45
- package/dist/commonjs/lib/config.d.ts +2 -2
- package/dist/commonjs/lib/config.d.ts.map +1 -1
- package/dist/commonjs/lib/config.js +2 -2
- package/dist/commonjs/lib/config.js.map +1 -1
- package/dist/commonjs/sdk/texttospeech.d.ts.map +1 -1
- package/dist/commonjs/sdk/texttospeech.js +12 -9
- package/dist/commonjs/sdk/texttospeech.js.map +1 -1
- package/dist/esm/lib/config.d.ts +2 -2
- package/dist/esm/lib/config.d.ts.map +1 -1
- package/dist/esm/lib/config.js +2 -2
- package/dist/esm/lib/config.js.map +1 -1
- package/dist/esm/sdk/texttospeech.d.ts.map +1 -1
- package/dist/esm/sdk/texttospeech.js +12 -9
- package/dist/esm/sdk/texttospeech.js.map +1 -1
- package/examples/package.json +2 -2
- package/examples/textToSpeechCreateSpeech.example.ts +2 -2
- package/jsr.json +2 -2
- package/package.json +26 -10
- package/src/lib/config.ts +3 -2
- package/src/sdk/texttospeech.ts +474 -465
package/src/sdk/texttospeech.ts
CHANGED
|
@@ -3,27 +3,26 @@
|
|
|
3
3
|
*/
|
|
4
4
|
|
|
5
5
|
import {
|
|
6
|
-
|
|
7
|
-
|
|
6
|
+
CreateSpeechAcceptEnum,
|
|
7
|
+
textToSpeechCreateSpeech,
|
|
8
8
|
} from "../funcs/textToSpeechCreateSpeech.js";
|
|
9
9
|
import { textToSpeechPredictDuration } from "../funcs/textToSpeechPredictDuration.js";
|
|
10
10
|
import {
|
|
11
|
-
|
|
12
|
-
|
|
11
|
+
StreamSpeechAcceptEnum,
|
|
12
|
+
textToSpeechStreamSpeech,
|
|
13
13
|
} from "../funcs/textToSpeechStreamSpeech.js";
|
|
14
14
|
import { ClientSDK, RequestOptions } from "../lib/sdks.js";
|
|
15
15
|
import * as operations from "../models/operations/index.js";
|
|
16
16
|
import { unwrapAsync } from "../types/fp.js";
|
|
17
|
-
|
|
18
17
|
// #region imports
|
|
19
18
|
import {
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
19
|
+
chunkText,
|
|
20
|
+
DEFAULT_MAX_TEXT_LENGTH,
|
|
21
|
+
detectAudioFormat,
|
|
22
|
+
mergeMp3Binary,
|
|
23
|
+
mergeWavBinary,
|
|
24
|
+
removeMp3Header,
|
|
25
|
+
removeWavHeader,
|
|
27
26
|
} from "../lib/custom_utils/index.js";
|
|
28
27
|
// #endregion imports
|
|
29
28
|
|
|
@@ -32,457 +31,467 @@ export { CreateSpeechAcceptEnum } from "../funcs/textToSpeechCreateSpeech.js";
|
|
|
32
31
|
export { StreamSpeechAcceptEnum } from "../funcs/textToSpeechStreamSpeech.js";
|
|
33
32
|
|
|
34
33
|
export class TextToSpeech extends ClientSDK {
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
34
|
+
// #region sdk-class-body
|
|
35
|
+
/** Bound copy of the prototype `createSpeech`, captured by the constructor before it is overridden. */
private _createSpeechOriginal?: typeof TextToSpeech.prototype.createSpeech;
/** Bound copy of the prototype `streamSpeech`, captured by the constructor before it is overridden. */
private _streamSpeechOriginal?: typeof TextToSpeech.prototype.streamSpeech;

/**
 * Constructs the SDK and installs the auto-chunking wrappers on this instance.
 *
 * After the base class is initialised, the prototype implementations of
 * `createSpeech` and `streamSpeech` are bound and stored privately, and the
 * instance properties of the same names are then pointed at
 * `createSpeechWithChunking` / `streamSpeechWithChunking`.
 *
 * @param options - Passed straight through to the `ClientSDK` constructor.
 */
constructor(options: any) {
  super(options);

  // Capture first, overwrite second: reading these after the reassignment
  // below would yield the wrappers instead of the raw implementations.
  const rawCreateSpeech = this.createSpeech.bind(this);
  const rawStreamSpeech = this.streamSpeech.bind(this);
  this._createSpeechOriginal = rawCreateSpeech;
  this._streamSpeechOriginal = rawStreamSpeech;

  // Shadow the prototype methods with the chunk-aware variants.
  this.createSpeech = this.createSpeechWithChunking.bind(this) as any;
  this.streamSpeech = this.streamSpeechWithChunking.bind(this) as any;
}
|
|
48
|
+
|
|
49
|
+
/**
 * Decides whether a text is too long to be sent in a single request.
 *
 * @param text - The text that is about to be synthesised.
 * @param maxLength - Largest `text.length` that may be sent as one request.
 * @returns `true` only when `text.length` is strictly greater than `maxLength`.
 */
private shouldChunkText(text: string, maxLength: number): boolean {
  const exceedsLimit = maxLength < text.length;
  return exceedsLimit;
}
|
|
55
|
+
|
|
56
|
+
/**
 * Normalises the `result` of a speech response into raw bytes.
 *
 * The payload shapes are tried in this order:
 *  1. `Uint8Array` - returned as-is.
 *  2. `Blob` - read through `arrayBuffer()`.
 *  3. `ArrayBuffer` - wrapped in a `Uint8Array`.
 *  4. Any object exposing `getReader` (treated as a `ReadableStream`) - drained
 *     to the end and concatenated.
 *  5. Object with a string `audioBase64` - base64-decoded.
 *  6. Object with `content` - returned if it is a `Uint8Array`, UTF-8 encoded
 *     if it is a string.
 *  7. Object with a `Uint8Array` `_content`.
 *
 * @param response - Response whose `result` holds the audio.
 * @returns The audio bytes.
 * @throws Error when `result` matches none of the shapes above; the message
 *   lists the runtime type, constructor name and own keys of `result`.
 */
private async extractAudioFromResponse(
  response: operations.CreateSpeechResponse | operations.StreamSpeechResponse,
): Promise<Uint8Array> {
  const result = response.result;

  if (result instanceof Uint8Array) {
    return result;
  }

  if (result instanceof Blob) {
    return new Uint8Array(await result.arrayBuffer());
  }

  if (result instanceof ArrayBuffer) {
    return new Uint8Array(result);
  }

  if (
    typeof result === "object"
    && result !== null
    && "getReader" in result
  ) {
    // ReadableStream: drain it completely, then concatenate the pieces.
    const reader = (result as ReadableStream<Uint8Array>).getReader();
    const chunks: Uint8Array[] = [];
    let totalLength = 0;

    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        if (value) {
          chunks.push(value);
          totalLength += value.length;
        }
      }
    } finally {
      // Always give the lock back, including when a read rejects, so the
      // stream is not left permanently locked for the caller.
      reader.releaseLock();
    }

    const merged = new Uint8Array(totalLength);
    let offset = 0;
    for (const chunk of chunks) {
      merged.set(chunk, offset);
      offset += chunk.length;
    }
    return merged;
  }

  // Handle CreateSpeechResponseBody with audioBase64
  if (typeof result === "object" && result !== null) {
    if ("audioBase64" in result) {
      const audioBase64 = (result as any).audioBase64;
      if (typeof audioBase64 === "string") {
        // atob yields one character per byte; copy the char codes out.
        const binaryString = atob(audioBase64);
        const bytes = new Uint8Array(binaryString.length);
        for (let i = 0; i < binaryString.length; i++) {
          bytes[i] = binaryString.charCodeAt(i);
        }
        return bytes;
      }
    }

    // Fallback: try to access other properties
    if ("content" in result) {
      const content = (result as any).content;
      if (content instanceof Uint8Array) return content;
      if (typeof content === "string") {
        return new TextEncoder().encode(content);
      }
    }
    if ("_content" in result) {
      const content = (result as any)._content;
      if (content instanceof Uint8Array) return content;
    }
  }

  // Nothing matched: describe what was received to make the failure debuggable.
  const resultType = typeof result;
  const resultConstructor = result?.constructor?.name || "unknown";
  const resultKeys = result && typeof result === "object"
    ? Object.keys(result).join(", ")
    : "N/A";

  throw new Error(
    `Unsupported result type: ${resultType}, `
      + `constructor: ${resultConstructor}, `
      + `keys: [${resultKeys}]`,
  );
}
|
|
144
|
+
|
|
145
|
+
/**
 * Combines the audio of several `createSpeech` responses into one response.
 *
 * A single response is handed back untouched. Otherwise every response is
 * reduced to bytes with `extractAudioFromResponse`, the format of the first
 * payload is detected, and the payloads are joined with `mergeWavBinary`
 * (for `"wav"`) or `mergeMp3Binary` (for anything else). The joined bytes are
 * exposed as a one-shot `ReadableStream`.
 *
 * @param responses - Responses in playback order.
 * @returns A response carrying the merged audio stream and the headers of the
 *   first response (or `{}` when it has none).
 * @throws Error when `responses` is empty or its first entry / first extracted
 *   payload is missing.
 */
private async mergeAudioResponses(
  responses: operations.CreateSpeechResponse[],
): Promise<operations.CreateSpeechResponse> {
  const [head] = responses;

  if (responses.length === 0) throw new Error("No responses to merge");
  if (!head) throw new Error("First response is undefined");
  if (responses.length === 1) return head;

  // Pull the raw bytes out of every response.
  const payloads: Uint8Array[] = await Promise.all(
    responses.map((response) => this.extractAudioFromResponse(response)),
  );

  const leadingPayload = payloads[0];
  if (!leadingPayload) throw new Error("First audio chunk is undefined");

  // The first payload decides how all of them are joined; non-WAV data
  // (MP3 or other formats) goes through the MP3 merger.
  const isWav = detectAudioFormat(leadingPayload) === "wav";
  const combined: Uint8Array = isWav
    ? mergeWavBinary(payloads)
    : mergeMp3Binary(payloads);

  // CreateSpeechResponse expects a stream, so wrap the bytes in one that
  // emits them once and then closes.
  const body = new ReadableStream<Uint8Array>({
    start: (controller) => {
      controller.enqueue(combined);
      controller.close();
    },
  });

  return { result: body, headers: head.headers ?? {} };
}
|
|
200
|
+
|
|
201
|
+
/**
 * Builds one streaming response that emits the audio of every text chunk back
 * to back, without waiting for the whole text to be synthesised.
 *
 * The already-open response for the first text chunk is forwarded unchanged;
 * its first non-empty read is passed to `detectAudioFormat`. Each remaining
 * text chunk is then requested, one at a time, through the original
 * `streamSpeech`. For those follow-up responses the first non-empty read is
 * passed through `removeWavHeader` / `removeMp3Header` (depending on the
 * detected format) before being forwarded; later reads are forwarded as-is.
 * Responses whose `result` does not expose `getReader` are skipped.
 *
 * If the consumer cancels the returned stream, the upstream body currently
 * being read is cancelled and no further text chunks are requested.
 *
 * @param firstResponse - Response already obtained for the first text chunk.
 * @param remainingChunks - Text chunks still to be synthesised, in order.
 * @param originalRequest - Request to clone for each remaining chunk; only
 *   `text` is replaced.
 * @param options - Forwarded unchanged to every follow-up request.
 * @returns A response whose `result` is the combined stream and whose headers
 *   are those of `firstResponse` (or `{}` when it has none).
 */
private createExtendedStreamingResponse(
  firstResponse: operations.StreamSpeechResponse,
  remainingChunks: string[],
  originalRequest: operations.StreamSpeechRequest,
  options?: RequestOptions & {
    acceptHeaderOverride?: StreamSpeechAcceptEnum;
  },
): operations.StreamSpeechResponse {
  let audioFormat: "wav" | "mp3" | null = null;
  // Set by the stream's cancel handler; checked before every request/enqueue.
  let cancelled = false;
  // Reader of the upstream body currently being drained, if any.
  let activeReader: ReadableStreamDefaultReader<Uint8Array> | null = null;

  // Returns the value as a byte stream when it looks like one, else null.
  const asByteStream = (
    candidate: unknown,
  ): ReadableStream<Uint8Array> | null =>
    typeof candidate === "object"
      && candidate !== null
      && "getReader" in candidate
      ? (candidate as ReadableStream<Uint8Array>)
      : null;

  // Strips the container header according to the format seen on the first response.
  const stripHeader = (bytes: Uint8Array): Uint8Array => {
    if (audioFormat === "wav") return removeWavHeader(bytes);
    if (audioFormat === "mp3") return removeMp3Header(bytes);
    return bytes;
  };

  // Drains `source` into `controller`, letting `prepare` rewrite each read.
  const forward = async (
    source: ReadableStream<Uint8Array>,
    controller: ReadableStreamDefaultController<Uint8Array>,
    prepare: (bytes: Uint8Array, isFirstRead: boolean) => Uint8Array,
  ): Promise<void> => {
    const reader = source.getReader();
    activeReader = reader;
    let isFirstRead = true;

    try {
      while (!cancelled) {
        const { done, value } = await reader.read();
        if (done) break;
        if (!value) continue;

        const outgoing = prepare(value, isFirstRead);
        isFirstRead = false;

        // The consumer may have cancelled while the read was pending;
        // enqueueing on a cancelled stream would throw.
        if (cancelled) break;
        controller.enqueue(outgoing);
      }
    } finally {
      activeReader = null;
      reader.releaseLock();
    }
  };

  // Use arrow function to preserve 'this' context
  const processStream = async (
    controller: ReadableStreamDefaultController<Uint8Array>,
  ): Promise<void> => {
    try {
      // Stream first response (first text chunk) and learn the audio format.
      const firstBody = asByteStream(firstResponse.result);
      if (firstBody) {
        await forward(firstBody, controller, (bytes, isFirstRead) => {
          if (isFirstRead) {
            audioFormat = detectAudioFormat(bytes) as "wav" | "mp3";
          }
          return bytes;
        });
      }

      // Process remaining text chunks sequentially
      for (const chunk of remainingChunks) {
        if (cancelled) return;

        const chunkRequest: operations.StreamSpeechRequest = {
          ...originalRequest,
          apiConvertTextToSpeechUsingCharacterRequest: {
            ...originalRequest.apiConvertTextToSpeechUsingCharacterRequest,
            text: chunk,
          },
        };

        // Call original streaming method
        if (!this._streamSpeechOriginal) {
          throw new Error("Original streamSpeech method not found");
        }
        const chunkResponse = await this._streamSpeechOriginal(
          chunkRequest,
          options,
        );

        const chunkBody = asByteStream(chunkResponse.result);
        if (!chunkBody) continue;

        // Cancelled while the request was in flight: drop the new body
        // instead of leaving it open and unread.
        if (cancelled) {
          await chunkBody.cancel();
          return;
        }

        // Only the first read of a follow-up response carries a header.
        await forward(
          chunkBody,
          controller,
          (bytes, isFirstRead) => (isFirstRead ? stripHeader(bytes) : bytes),
        );
      }

      if (!cancelled) controller.close();
    } catch (error) {
      // After a cancel there is no consumer left to report the failure to.
      if (!cancelled) controller.error(error);
    }
  };

  const stream = new ReadableStream<Uint8Array>({
    start: (controller) => processStream(controller),
    cancel: async (reason) => {
      cancelled = true;
      const reader = activeReader;
      if (reader) await reader.cancel(reason);
    },
  });

  return {
    result: stream,
    headers: firstResponse.headers ?? {},
  };
}
|
|
320
|
+
|
|
321
|
+
/**
 * `createSpeech` replacement that transparently splits over-long text.
 *
 * Text no longer than the limit is passed straight to the original
 * `createSpeech`. Longer text is split with `chunkText`, each piece is
 * requested in turn with the accept header forced to match the request's
 * `outputFormat` (`audioMpeg` for `"mp3"`, `audioWav` otherwise), and the
 * responses are combined with `mergeAudioResponses`.
 *
 * @param request - The speech request; a missing text is treated as `""`.
 * @param options - Request options. `maxTextLength` overrides
 *   `DEFAULT_MAX_TEXT_LENGTH` as the split threshold.
 * @returns The original response for short text, otherwise the merged one.
 * @throws Error when the original `createSpeech` has not been captured.
 */
private async createSpeechWithChunking(
  request: operations.CreateSpeechRequest,
  options?: RequestOptions & {
    acceptHeaderOverride?: CreateSpeechAcceptEnum;
    maxTextLength?: number;
  },
): Promise<operations.CreateSpeechResponse> {
  const limit = options?.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH;
  const body = request.apiConvertTextToSpeechUsingCharacterRequest;
  const fullText = body?.text ?? "";

  const fitsInOneRequest = !this.shouldChunkText(fullText, limit);
  if (fitsInOneRequest) {
    // Nothing to split: behave exactly like the unwrapped method.
    if (!this._createSpeechOriginal) {
      throw new Error("Original createSpeech method not found");
    }
    return this._createSpeechOriginal(request, options);
  }

  const pieces = chunkText(fullText, limit);

  // Every piece is requested with an accept header derived from the output format.
  const wantsMp3 = body?.outputFormat === "mp3";
  const acceptHeader: CreateSpeechAcceptEnum = wantsMp3
    ? CreateSpeechAcceptEnum.audioMpeg
    : CreateSpeechAcceptEnum.audioWav;

  // One request at a time, in order (sequential to avoid race conditions in
  // schema parsing).
  const collected: operations.CreateSpeechResponse[] = [];
  for (const piece of pieces) {
    const pieceRequest: operations.CreateSpeechRequest = {
      ...request,
      apiConvertTextToSpeechUsingCharacterRequest: { ...body, text: piece },
    };
    const pieceOptions = { ...options, acceptHeaderOverride: acceptHeader };

    if (!this._createSpeechOriginal) {
      throw new Error("Original createSpeech method not found");
    }
    collected.push(
      await this._createSpeechOriginal(pieceRequest, pieceOptions),
    );
  }

  return this.mergeAudioResponses(collected);
}
|
|
375
|
+
|
|
376
|
+
/**
 * `streamSpeech` replacement that transparently splits over-long text.
 *
 * Text no longer than the limit is passed straight to the original
 * `streamSpeech`. Longer text is split with `chunkText`; the first piece is
 * requested immediately and, when more pieces exist, the rest are chained
 * onto it through `createExtendedStreamingResponse`.
 *
 * @param request - The streaming request; a missing text is treated as `""`.
 * @param options - Request options. `maxTextLength` overrides
 *   `DEFAULT_MAX_TEXT_LENGTH` as the split threshold.
 * @returns The original response when one request suffices, otherwise the
 *   extended streaming response.
 * @throws Error when splitting yields no usable first piece, or when the
 *   original `streamSpeech` has not been captured.
 */
private async streamSpeechWithChunking(
  request: operations.StreamSpeechRequest,
  options?: RequestOptions & {
    acceptHeaderOverride?: StreamSpeechAcceptEnum;
    maxTextLength?: number;
  },
): Promise<operations.StreamSpeechResponse> {
  const limit = options?.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH;
  const body = request.apiConvertTextToSpeechUsingCharacterRequest;
  const fullText = body?.text ?? "";

  const fitsInOneRequest = !this.shouldChunkText(fullText, limit);
  if (fitsInOneRequest) {
    // Nothing to split: behave exactly like the unwrapped method.
    if (!this._streamSpeechOriginal) {
      throw new Error("Original streamSpeech method not found");
    }
    return this._streamSpeechOriginal(request, options);
  }

  const pieces = chunkText(fullText, limit);
  const [leadingPiece, ...followingPieces] = pieces;

  if (pieces.length === 0) {
    throw new Error("No text chunks to process");
  }
  if (!leadingPiece) {
    throw new Error("First text chunk is undefined");
  }

  // Open the stream for the first piece right away.
  const leadingRequest: operations.StreamSpeechRequest = {
    ...request,
    apiConvertTextToSpeechUsingCharacterRequest: {
      ...body,
      text: leadingPiece,
    },
  };

  if (!this._streamSpeechOriginal) {
    throw new Error("Original streamSpeech method not found");
  }
  const leadingResponse = await this._streamSpeechOriginal(
    leadingRequest,
    options,
  );

  // A lone piece needs no stitching; otherwise chain the rest behind it.
  return pieces.length === 1
    ? leadingResponse
    : this.createExtendedStreamingResponse(
      leadingResponse,
      followingPieces,
      request,
      options,
    );
}
|
|
441
|
+
// #endregion sdk-class-body
|
|
442
|
+
|
|
443
|
+
/**
 * Convert text to speech
 *
 * @remarks
 * Convert text to speech using the specified voice.
 *
 * This is the single-request implementation: it delegates to
 * `textToSpeechCreateSpeech` and unwraps the outcome with `unwrapAsync`.
 * The constructor replaces this method on each instance with the
 * auto-chunking wrapper and keeps this one as the underlying call.
 */
async createSpeech(
  request: operations.CreateSpeechRequest,
  options?: RequestOptions & {
    acceptHeaderOverride?: CreateSpeechAcceptEnum;
  },
): Promise<operations.CreateSpeechResponse> {
  const outcome = textToSpeechCreateSpeech(this, request, options);
  return unwrapAsync(outcome);
}
|
|
461
|
+
|
|
462
|
+
/**
 * Convert text to speech with streaming response
 *
 * @remarks
 * Convert text to speech using the specified voice with streaming response. Returns binary audio stream.
 *
 * This is the single-request implementation: it delegates to
 * `textToSpeechStreamSpeech` and unwraps the outcome with `unwrapAsync`.
 * The constructor replaces this method on each instance with the
 * auto-chunking wrapper and keeps this one as the underlying call.
 */
async streamSpeech(
  request: operations.StreamSpeechRequest,
  options?: RequestOptions & {
    acceptHeaderOverride?: StreamSpeechAcceptEnum;
  },
): Promise<operations.StreamSpeechResponse> {
  const outcome = textToSpeechStreamSpeech(this, request, options);
  return unwrapAsync(outcome);
}
|
|
480
|
+
|
|
481
|
+
/**
 * Predict text-to-speech duration
 *
 * @remarks
 * Predict the duration of text-to-speech conversion without generating audio.
 *
 * Delegates to `textToSpeechPredictDuration` and unwraps the outcome with
 * `unwrapAsync`.
 */
async predictDuration(
  request: operations.PredictDurationRequest,
  options?: RequestOptions,
): Promise<operations.PredictDurationResponse> {
  const outcome = textToSpeechPredictDuration(this, request, options);
  return unwrapAsync(outcome);
}
|
|
488
497
|
}
|