@supertone/supertone 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,27 +3,26 @@
  */

 import {
-  CreateSpeechAcceptEnum,
-  textToSpeechCreateSpeech,
+  CreateSpeechAcceptEnum,
+  textToSpeechCreateSpeech,
 } from "../funcs/textToSpeechCreateSpeech.js";
 import { textToSpeechPredictDuration } from "../funcs/textToSpeechPredictDuration.js";
 import {
-  StreamSpeechAcceptEnum,
-  textToSpeechStreamSpeech,
+  StreamSpeechAcceptEnum,
+  textToSpeechStreamSpeech,
 } from "../funcs/textToSpeechStreamSpeech.js";
 import { ClientSDK, RequestOptions } from "../lib/sdks.js";
 import * as operations from "../models/operations/index.js";
 import { unwrapAsync } from "../types/fp.js";
-
 // #region imports
 import {
-  chunkText,
-  DEFAULT_MAX_TEXT_LENGTH,
-  detectAudioFormat,
-  mergeMp3Binary,
-  mergeWavBinary,
-  removeMp3Header,
-  removeWavHeader,
+  chunkText,
+  DEFAULT_MAX_TEXT_LENGTH,
+  detectAudioFormat,
+  mergeMp3Binary,
+  mergeWavBinary,
+  removeMp3Header,
+  removeWavHeader,
 } from "../lib/custom_utils/index.js";
 // #endregion imports

@@ -32,457 +31,467 @@ export { CreateSpeechAcceptEnum } from "../funcs/textToSpeechCreateSpeech.js";
 export { StreamSpeechAcceptEnum } from "../funcs/textToSpeechStreamSpeech.js";

 export class TextToSpeech extends ClientSDK {
-  // #region sdk-class-body
-  private _createSpeechOriginal?: typeof TextToSpeech.prototype.createSpeech;
-  private _streamSpeechOriginal?: typeof TextToSpeech.prototype.streamSpeech;
-
-  constructor(options: any) {
-    super(options);
-    // Store original methods before overriding
-    this._createSpeechOriginal = this.createSpeech.bind(this);
-    this._streamSpeechOriginal = this.streamSpeech.bind(this);
-
-    // Override with auto-chunking versions
-    this.createSpeech = this.createSpeechWithChunking.bind(this) as any;
-    this.streamSpeech = this.streamSpeechWithChunking.bind(this) as any;
-  }
-
-  /**
-   * Check if text needs to be chunked
-   */
-  private shouldChunkText(text: string, maxLength: number): boolean {
-    return text.length > maxLength;
-  }
-
-  /**
-   * Extract audio data from response
-   */
-  private async extractAudioFromResponse(
-    response: operations.CreateSpeechResponse | operations.StreamSpeechResponse
-  ): Promise<Uint8Array> {
-    const result = response.result;
-
-    if (result instanceof Uint8Array) {
-      return result;
-    }
-
-    if (result instanceof Blob) {
-      return new Uint8Array(await result.arrayBuffer());
-    }
-
-    if (result instanceof ArrayBuffer) {
-      return new Uint8Array(result);
-    }
-
-    if (
-      typeof result === "object" &&
-      result !== null &&
-      "getReader" in result
-    ) {
-      // ReadableStream
-      const reader = (result as ReadableStream<Uint8Array>).getReader();
-      const chunks: Uint8Array[] = [];
-
-      while (true) {
-        const { done, value } = await reader.read();
-        if (done) break;
-        if (value) chunks.push(value);
-      }
-
-      // Merge all chunks
-      const totalLength = chunks.reduce((sum, chunk) => sum + chunk.length, 0);
-      const merged = new Uint8Array(totalLength);
-      let offset = 0;
-      for (const chunk of chunks) {
-        merged.set(chunk, offset);
-        offset += chunk.length;
-      }
-      return merged;
-    }
-
-    // Handle CreateSpeechResponseBody with audioBase64
-    if (typeof result === "object" && result !== null) {
-      if ("audioBase64" in result) {
-        const audioBase64 = (result as any).audioBase64;
-        if (typeof audioBase64 === "string") {
-          // Decode base64 to binary
-          const binaryString = atob(audioBase64);
-          const bytes = new Uint8Array(binaryString.length);
-          for (let i = 0; i < binaryString.length; i++) {
-            bytes[i] = binaryString.charCodeAt(i);
-          }
-          return bytes;
-        }
-      }
-
-      // Fallback: try to access other properties
-      if ("content" in result) {
-        const content = (result as any).content;
-        if (content instanceof Uint8Array) return content;
-        if (typeof content === "string") {
-          return new TextEncoder().encode(content);
-        }
-      }
-      if ("_content" in result) {
-        const content = (result as any)._content;
-        if (content instanceof Uint8Array) return content;
-      }
-    }
-
-    // Enhanced error message with object inspection
-    const resultType = typeof result;
-    const resultConstructor = result?.constructor?.name || "unknown";
-    const resultKeys =
-      result && typeof result === "object"
-        ? Object.keys(result).join(", ")
-        : "N/A";
-
-    throw new Error(
-      `Unsupported result type: ${resultType}, ` +
-        `constructor: ${resultConstructor}, ` +
-        `keys: [${resultKeys}]`
-    );
-  }
-
-  /**
-   * Merge multiple audio responses into one
-   */
-  private async mergeAudioResponses(
-    responses: operations.CreateSpeechResponse[]
-  ): Promise<operations.CreateSpeechResponse> {
-    if (responses.length === 0) {
-      throw new Error("No responses to merge");
-    }
-
-    const firstResponse = responses[0];
-    if (!firstResponse) {
-      throw new Error("First response is undefined");
-    }
-
-    if (responses.length === 1) {
-      return firstResponse;
-    }
-
-    // Extract audio data from all responses
-    const audioChunks: Uint8Array[] = await Promise.all(
-      responses.map((r) => this.extractAudioFromResponse(r))
-    );
-
-    const firstChunk = audioChunks[0];
-    if (!firstChunk) {
-      throw new Error("First audio chunk is undefined");
-    }
-
-    // Detect format from first chunk
-    const audioFormat = detectAudioFormat(firstChunk);
-
-    // Merge based on format
-    let mergedAudio: Uint8Array;
-    if (audioFormat === "wav") {
-      mergedAudio = mergeWavBinary(audioChunks);
-    } else {
-      // MP3 or other formats
-      mergedAudio = mergeMp3Binary(audioChunks);
-    }
-
-    // Convert Uint8Array to ReadableStream for compatibility with CreateSpeechResponse type
-    const stream = new ReadableStream<Uint8Array>({
-      start(controller) {
-        controller.enqueue(mergedAudio);
-        controller.close();
-      },
-    });
-
-    // Return merged response with proper type
-    return {
-      result: stream,
-      headers: firstResponse.headers ?? {},
-    };
-  }
-
-  /**
-   * Create extended streaming response for long text
-   * Streams each text chunk progressively without waiting for full completion
-   */
-  private createExtendedStreamingResponse(
-    firstResponse: operations.StreamSpeechResponse,
-    remainingChunks: string[],
-    originalRequest: operations.StreamSpeechRequest,
-    options?: RequestOptions & {
-      acceptHeaderOverride?: StreamSpeechAcceptEnum;
-    }
-  ): operations.StreamSpeechResponse {
-    let audioFormat: "wav" | "mp3" | null = null;
-    let isFirstAudioChunk = true;
-
-    // Use arrow function to preserve 'this' context
-    const processStream = async (
-      controller: ReadableStreamDefaultController<Uint8Array>
-    ) => {
-      try {
-        // Stream first response (first text chunk)
-        if (firstResponse.result) {
-          const result = firstResponse.result;
-
-          if (typeof result === "object" && "getReader" in result) {
-            const reader = (result as ReadableStream<Uint8Array>).getReader();
-
-            try {
-              while (true) {
-                const { done, value } = await reader.read();
-                if (done) break;
-
-                if (value) {
-                  // Detect format from first audio chunk
-                  if (isFirstAudioChunk) {
-                    const detectedFormat = detectAudioFormat(value);
-                    audioFormat = detectedFormat as "wav" | "mp3";
-                    isFirstAudioChunk = false;
-                  }
-
-                  controller.enqueue(value);
-                }
-              }
-            } finally {
-              reader.releaseLock();
-            }
-          }
-        }
-
-        // Process remaining text chunks sequentially
-        for (const chunk of remainingChunks) {
-          const chunkRequest: operations.StreamSpeechRequest = {
-            ...originalRequest,
-            apiConvertTextToSpeechUsingCharacterRequest: {
-              ...originalRequest.apiConvertTextToSpeechUsingCharacterRequest,
-              text: chunk,
-            },
-          };
-
-          // Call original streaming method
-          if (!this._streamSpeechOriginal) {
-            throw new Error("Original streamSpeech method not found");
-          }
-          const chunkResponse = await this._streamSpeechOriginal(
-            chunkRequest,
-            options
-          );
-
-          // Stream this text chunk's audio
-          if (chunkResponse.result) {
-            const result = chunkResponse.result;
-
-            if (typeof result === "object" && "getReader" in result) {
-              const reader = (result as ReadableStream<Uint8Array>).getReader();
-              let isFirstChunkOfThisText = true;
-
-              try {
-                while (true) {
-                  const { done, value } = await reader.read();
-                  if (done) break;
-
-                  if (value) {
-                    let processedAudio = value;
-
-                    // Remove header from first chunk of subsequent text chunks
-                    if (isFirstChunkOfThisText && audioFormat) {
-                      if (audioFormat === "wav") {
-                        processedAudio = removeWavHeader(value);
-                      } else if (audioFormat === "mp3") {
-                        processedAudio = removeMp3Header(value);
-                      }
-                      isFirstChunkOfThisText = false;
-                    }
-
-                    controller.enqueue(processedAudio);
-                  }
-                }
-              } finally {
-                reader.releaseLock();
-              }
-            }
-          }
-        }
-
-        controller.close();
-      } catch (error) {
-        controller.error(error);
-      }
-    };
-
-    const stream = new ReadableStream<Uint8Array>({
-      start: (controller) => processStream(controller),
-    });
-
-    return {
-      result: stream,
-      headers: firstResponse.headers ?? {},
-    };
-  }
-
-  /**
-   * Create speech with auto-chunking support (internal implementation)
-   */
-  private async createSpeechWithChunking(
-    request: operations.CreateSpeechRequest,
-    options?: RequestOptions & {
-      acceptHeaderOverride?: CreateSpeechAcceptEnum;
-      maxTextLength?: number;
-    }
-  ): Promise<operations.CreateSpeechResponse> {
-    const maxLength = options?.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH;
-    const text =
-      request.apiConvertTextToSpeechUsingCharacterRequest?.text ?? "";
-
-    // Short text: call original method directly
-    if (!this.shouldChunkText(text, maxLength)) {
-      if (!this._createSpeechOriginal) {
-        throw new Error("Original createSpeech method not found");
-      }
-      return this._createSpeechOriginal(request, options);
-    }
-
-    // Long text: chunk, process sequentially (to avoid schema parsing issues), and merge
-    const textChunks = chunkText(text, maxLength);
-
-    // Determine Accept header based on output format
-    const outputFormat =
-      request.apiConvertTextToSpeechUsingCharacterRequest?.outputFormat;
-    const acceptHeader: CreateSpeechAcceptEnum =
-      outputFormat === "mp3"
-        ? CreateSpeechAcceptEnum.audioMpeg
-        : CreateSpeechAcceptEnum.audioWav;
-
-    // Process chunks sequentially to avoid race conditions in schema parsing
-    const responses: operations.CreateSpeechResponse[] = [];
-    for (const chunk of textChunks) {
-      const chunkRequest: operations.CreateSpeechRequest = {
-        ...request,
-        apiConvertTextToSpeechUsingCharacterRequest: {
-          ...request.apiConvertTextToSpeechUsingCharacterRequest,
-          text: chunk,
-        },
-      };
-      if (!this._createSpeechOriginal) {
-        throw new Error("Original createSpeech method not found");
-      }
-      const response = await this._createSpeechOriginal(chunkRequest, {
-        ...options,
-        acceptHeaderOverride: acceptHeader,
-      });
-      responses.push(response);
-    }
-
-    return this.mergeAudioResponses(responses);
-  }
-
-  /**
-   * Stream speech with auto-chunking support (internal implementation)
-   */
-  private async streamSpeechWithChunking(
-    request: operations.StreamSpeechRequest,
-    options?: RequestOptions & {
-      acceptHeaderOverride?: StreamSpeechAcceptEnum;
-      maxTextLength?: number;
-    }
-  ): Promise<operations.StreamSpeechResponse> {
-    const maxLength = options?.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH;
-    const text =
-      request.apiConvertTextToSpeechUsingCharacterRequest?.text ?? "";
-
-    // Short text: call original method directly
-    if (!this.shouldChunkText(text, maxLength)) {
-      if (!this._streamSpeechOriginal) {
-        throw new Error("Original streamSpeech method not found");
-      }
-      return this._streamSpeechOriginal(request, options);
-    }
-
-    // Long text: chunk and stream sequentially
-    const textChunks = chunkText(text, maxLength);
-
-    if (textChunks.length === 0) {
-      throw new Error("No text chunks to process");
-    }
-
-    const firstChunk = textChunks[0];
-    if (!firstChunk) {
-      throw new Error("First text chunk is undefined");
-    }
-
-    // Get first response to start streaming
-    const firstChunkRequest: operations.StreamSpeechRequest = {
-      ...request,
-      apiConvertTextToSpeechUsingCharacterRequest: {
-        ...request.apiConvertTextToSpeechUsingCharacterRequest,
-        text: firstChunk,
-      },
-    };
-
-    if (!this._streamSpeechOriginal) {
-      throw new Error("Original streamSpeech method not found");
-    }
-    const firstResponse = await this._streamSpeechOriginal(
-      firstChunkRequest,
-      options
-    );
-
-    // Single chunk: return as-is
-    if (textChunks.length === 1) {
-      return firstResponse;
-    }
-
-    // Multiple chunks: create extended streaming response
-    const remainingChunks = textChunks.slice(1);
-    return this.createExtendedStreamingResponse(
-      firstResponse,
-      remainingChunks,
-      request,
-      options
-    );
-  }
-  // #endregion sdk-class-body
-
-  /**
-   * Convert text to speech
-   *
-   * @remarks
-   * Convert text to speech using the specified voice
-   */
-  async createSpeech(
-    request: operations.CreateSpeechRequest,
-    options?: RequestOptions & {
-      acceptHeaderOverride?: CreateSpeechAcceptEnum;
-    }
-  ): Promise<operations.CreateSpeechResponse> {
-    return unwrapAsync(textToSpeechCreateSpeech(this, request, options));
-  }
-
-  /**
-   * Convert text to speech with streaming response
-   *
-   * @remarks
-   * Convert text to speech using the specified voice with streaming response. Returns binary audio stream.
-   */
-  async streamSpeech(
-    request: operations.StreamSpeechRequest,
-    options?: RequestOptions & {
-      acceptHeaderOverride?: StreamSpeechAcceptEnum;
-    }
-  ): Promise<operations.StreamSpeechResponse> {
-    return unwrapAsync(textToSpeechStreamSpeech(this, request, options));
-  }
-
-  /**
-   * Predict text-to-speech duration
-   *
-   * @remarks
-   * Predict the duration of text-to-speech conversion without generating audio
-   */
-  async predictDuration(
-    request: operations.PredictDurationRequest,
-    options?: RequestOptions
-  ): Promise<operations.PredictDurationResponse> {
-    return unwrapAsync(textToSpeechPredictDuration(this, request, options));
-  }
+  // #region sdk-class-body
+  private _createSpeechOriginal?: typeof TextToSpeech.prototype.createSpeech;
+  private _streamSpeechOriginal?: typeof TextToSpeech.prototype.streamSpeech;
+
+  constructor(options: any) {
+    super(options);
+    // Store original methods before overriding
+    this._createSpeechOriginal = this.createSpeech.bind(this);
+    this._streamSpeechOriginal = this.streamSpeech.bind(this);
+
+    // Override with auto-chunking versions
+    this.createSpeech = this.createSpeechWithChunking.bind(this) as any;
+    this.streamSpeech = this.streamSpeechWithChunking.bind(this) as any;
+  }
+
+  /**
+   * Check if text needs to be chunked
+   */
+  private shouldChunkText(text: string, maxLength: number): boolean {
+    return text.length > maxLength;
+  }
+
+  /**
+   * Extract audio data from response
+   */
+  private async extractAudioFromResponse(
+    response: operations.CreateSpeechResponse | operations.StreamSpeechResponse,
+  ): Promise<Uint8Array> {
+    const result = response.result;
+
+    if (result instanceof Uint8Array) {
+      return result;
+    }
+
+    if (result instanceof Blob) {
+      return new Uint8Array(await result.arrayBuffer());
+    }
+
+    if (result instanceof ArrayBuffer) {
+      return new Uint8Array(result);
+    }
+
+    if (
+      typeof result === "object"
+      && result !== null
+      && "getReader" in result
+    ) {
+      // ReadableStream
+      const reader = (result as ReadableStream<Uint8Array>).getReader();
+      const chunks: Uint8Array[] = [];
+
+      while (true) {
+        const { done, value } = await reader.read();
+        if (done) break;
+        if (value) chunks.push(value);
+      }
+
+      // Merge all chunks
+      const totalLength = chunks.reduce((sum, chunk) => sum + chunk.length, 0);
+      const merged = new Uint8Array(totalLength);
+      let offset = 0;
+      for (const chunk of chunks) {
+        merged.set(chunk, offset);
+        offset += chunk.length;
+      }
+      return merged;
+    }
+
+    // Handle CreateSpeechResponseBody with audioBase64
+    if (typeof result === "object" && result !== null) {
+      if ("audioBase64" in result) {
+        const audioBase64 = (result as any).audioBase64;
+        if (typeof audioBase64 === "string") {
+          // Decode base64 to binary
+          const binaryString = atob(audioBase64);
+          const bytes = new Uint8Array(binaryString.length);
+          for (let i = 0; i < binaryString.length; i++) {
+            bytes[i] = binaryString.charCodeAt(i);
+          }
+          return bytes;
+        }
+      }
+
+      // Fallback: try to access other properties
+      if ("content" in result) {
+        const content = (result as any).content;
+        if (content instanceof Uint8Array) return content;
+        if (typeof content === "string") {
+          return new TextEncoder().encode(content);
+        }
+      }
+      if ("_content" in result) {
+        const content = (result as any)._content;
+        if (content instanceof Uint8Array) return content;
+      }
+    }
+
+    // Enhanced error message with object inspection
+    const resultType = typeof result;
+    const resultConstructor = result?.constructor?.name || "unknown";
+    const resultKeys = result && typeof result === "object"
+      ? Object.keys(result).join(", ")
+      : "N/A";
+
+    throw new Error(
+      `Unsupported result type: ${resultType}, `
+        + `constructor: ${resultConstructor}, `
+        + `keys: [${resultKeys}]`,
+    );
+  }
+
+  /**
+   * Merge multiple audio responses into one
+   */
+  private async mergeAudioResponses(
+    responses: operations.CreateSpeechResponse[],
+  ): Promise<operations.CreateSpeechResponse> {
+    if (responses.length === 0) {
+      throw new Error("No responses to merge");
+    }
+
+    const firstResponse = responses[0];
+    if (!firstResponse) {
+      throw new Error("First response is undefined");
+    }
+
+    if (responses.length === 1) {
+      return firstResponse;
+    }
+
+    // Extract audio data from all responses
+    const audioChunks: Uint8Array[] = await Promise.all(
+      responses.map((r) => this.extractAudioFromResponse(r)),
+    );
+
+    const firstChunk = audioChunks[0];
+    if (!firstChunk) {
+      throw new Error("First audio chunk is undefined");
+    }
+
+    // Detect format from first chunk
+    const audioFormat = detectAudioFormat(firstChunk);
+
+    // Merge based on format
+    let mergedAudio: Uint8Array;
+    if (audioFormat === "wav") {
+      mergedAudio = mergeWavBinary(audioChunks);
+    } else {
+      // MP3 or other formats
+      mergedAudio = mergeMp3Binary(audioChunks);
+    }
+
+    // Convert Uint8Array to ReadableStream for compatibility with CreateSpeechResponse type
+    const stream = new ReadableStream<Uint8Array>({
+      start(controller) {
+        controller.enqueue(mergedAudio);
+        controller.close();
+      },
+    });
+
+    // Return merged response with proper type
+    return {
+      result: stream,
+      headers: firstResponse.headers ?? {},
+    };
+  }
+
+  /**
+   * Create extended streaming response for long text
+   * Streams each text chunk progressively without waiting for full completion
+   */
+  private createExtendedStreamingResponse(
+    firstResponse: operations.StreamSpeechResponse,
+    remainingChunks: string[],
+    originalRequest: operations.StreamSpeechRequest,
+    options?: RequestOptions & {
+      acceptHeaderOverride?: StreamSpeechAcceptEnum;
+    },
+  ): operations.StreamSpeechResponse {
+    let audioFormat: "wav" | "mp3" | null = null;
+    let isFirstAudioChunk = true;
+
+    // Use arrow function to preserve 'this' context
+    const processStream = async (
+      controller: ReadableStreamDefaultController<Uint8Array>,
+    ) => {
+      try {
+        // Stream first response (first text chunk)
+        if (firstResponse.result) {
+          const result = firstResponse.result;
+
+          if (typeof result === "object" && "getReader" in result) {
+            const reader = (result as ReadableStream<Uint8Array>).getReader();
+
+            try {
+              while (true) {
+                const { done, value } = await reader.read();
+                if (done) break;
+
+                if (value) {
+                  // Detect format from first audio chunk
+                  if (isFirstAudioChunk) {
+                    const detectedFormat = detectAudioFormat(value);
+                    audioFormat = detectedFormat as "wav" | "mp3";
+                    isFirstAudioChunk = false;
+                  }
+
+                  controller.enqueue(value);
+                }
+              }
+            } finally {
+              reader.releaseLock();
+            }
+          }
+        }
+
+        // Process remaining text chunks sequentially
+        for (const chunk of remainingChunks) {
+          const chunkRequest: operations.StreamSpeechRequest = {
+            ...originalRequest,
+            apiConvertTextToSpeechUsingCharacterRequest: {
+              ...originalRequest.apiConvertTextToSpeechUsingCharacterRequest,
+              text: chunk,
+            },
+          };
+
+          // Call original streaming method
+          if (!this._streamSpeechOriginal) {
+            throw new Error("Original streamSpeech method not found");
+          }
+          const chunkResponse = await this._streamSpeechOriginal(
+            chunkRequest,
+            options,
+          );
+
+          // Stream this text chunk's audio
+          if (chunkResponse.result) {
+            const result = chunkResponse.result;
+
+            if (typeof result === "object" && "getReader" in result) {
+              const reader = (result as ReadableStream<Uint8Array>).getReader();
+              let isFirstChunkOfThisText = true;
+
+              try {
+                while (true) {
+                  const { done, value } = await reader.read();
+                  if (done) break;
+
+                  if (value) {
+                    let processedAudio = value;
+
+                    // Remove header from first chunk of subsequent text chunks
+                    if (isFirstChunkOfThisText && audioFormat) {
+                      if (audioFormat === "wav") {
+                        processedAudio = removeWavHeader(value);
+                      } else if (audioFormat === "mp3") {
+                        processedAudio = removeMp3Header(value);
+                      }
+                      isFirstChunkOfThisText = false;
+                    }
+
+                    controller.enqueue(processedAudio);
+                  }
+                }
+              } finally {
+                reader.releaseLock();
+              }
+            }
+          }
+        }
+
+        controller.close();
+      } catch (error) {
+        controller.error(error);
+      }
+    };
+
+    const stream = new ReadableStream<Uint8Array>({
+      start: (controller) => processStream(controller),
+    });
+
+    return {
+      result: stream,
+      headers: firstResponse.headers ?? {},
+    };
+  }
+
+  /**
+   * Create speech with auto-chunking support (internal implementation)
+   */
+  private async createSpeechWithChunking(
+    request: operations.CreateSpeechRequest,
+    options?: RequestOptions & {
+      acceptHeaderOverride?: CreateSpeechAcceptEnum;
+      maxTextLength?: number;
+    },
+  ): Promise<operations.CreateSpeechResponse> {
+    const maxLength = options?.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH;
+    const text = request.apiConvertTextToSpeechUsingCharacterRequest?.text
+      ?? "";
+
+    // Short text: call original method directly
+    if (!this.shouldChunkText(text, maxLength)) {
+      if (!this._createSpeechOriginal) {
+        throw new Error("Original createSpeech method not found");
+      }
+      return this._createSpeechOriginal(request, options);
+    }
+
+    // Long text: chunk, process sequentially (to avoid schema parsing issues), and merge
+    const textChunks = chunkText(text, maxLength);
+
+    // Determine Accept header based on output format
+    const outputFormat = request.apiConvertTextToSpeechUsingCharacterRequest
+      ?.outputFormat;
+    const acceptHeader: CreateSpeechAcceptEnum = outputFormat === "mp3"
+      ? CreateSpeechAcceptEnum.audioMpeg
+      : CreateSpeechAcceptEnum.audioWav;
+
+    // Process chunks sequentially to avoid race conditions in schema parsing
+    const responses: operations.CreateSpeechResponse[] = [];
+    for (const chunk of textChunks) {
+      const chunkRequest: operations.CreateSpeechRequest = {
+        ...request,
+        apiConvertTextToSpeechUsingCharacterRequest: {
+          ...request.apiConvertTextToSpeechUsingCharacterRequest,
+          text: chunk,
+        },
+      };
+      if (!this._createSpeechOriginal) {
+        throw new Error("Original createSpeech method not found");
+      }
+      const response = await this._createSpeechOriginal(chunkRequest, {
+        ...options,
+        acceptHeaderOverride: acceptHeader,
+      });
+      responses.push(response);
+    }
+
+    return this.mergeAudioResponses(responses);
+  }
+
+  /**
+   * Stream speech with auto-chunking support (internal implementation)
+   */
+  private async streamSpeechWithChunking(
+    request: operations.StreamSpeechRequest,
+    options?: RequestOptions & {
+      acceptHeaderOverride?: StreamSpeechAcceptEnum;
+      maxTextLength?: number;
+    },
+  ): Promise<operations.StreamSpeechResponse> {
+    const maxLength = options?.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH;
+    const text = request.apiConvertTextToSpeechUsingCharacterRequest?.text
+      ?? "";
+
+    // Short text: call original method directly
+    if (!this.shouldChunkText(text, maxLength)) {
+      if (!this._streamSpeechOriginal) {
+        throw new Error("Original streamSpeech method not found");
+      }
+      return this._streamSpeechOriginal(request, options);
+    }
+
+    // Long text: chunk and stream sequentially
+    const textChunks = chunkText(text, maxLength);
+
+    if (textChunks.length === 0) {
+      throw new Error("No text chunks to process");
+    }
+
+    const firstChunk = textChunks[0];
+    if (!firstChunk) {
+      throw new Error("First text chunk is undefined");
+    }
+
+    // Get first response to start streaming
+    const firstChunkRequest: operations.StreamSpeechRequest = {
+      ...request,
+      apiConvertTextToSpeechUsingCharacterRequest: {
+        ...request.apiConvertTextToSpeechUsingCharacterRequest,
+        text: firstChunk,
+      },
+    };
+
+    if (!this._streamSpeechOriginal) {
+      throw new Error("Original streamSpeech method not found");
+    }
+    const firstResponse = await this._streamSpeechOriginal(
+      firstChunkRequest,
+      options,
+    );
+
+    // Single chunk: return as-is
+    if (textChunks.length === 1) {
+      return firstResponse;
+    }
+
+    // Multiple chunks: create extended streaming response
+    const remainingChunks = textChunks.slice(1);
+    return this.createExtendedStreamingResponse(
+      firstResponse,
+      remainingChunks,
+      request,
+      options,
+    );
+  }
+  // #endregion sdk-class-body
+
+  /**
+   * Convert text to speech
+   *
+   * @remarks
+   * Convert text to speech using the specified voice
+   */
+  async createSpeech(
+    request: operations.CreateSpeechRequest,
+    options?: RequestOptions & {
+      acceptHeaderOverride?: CreateSpeechAcceptEnum;
+    },
+  ): Promise<operations.CreateSpeechResponse> {
+    return unwrapAsync(textToSpeechCreateSpeech(
+      this,
+      request,
+      options,
+    ));
+  }
+
+  /**
+   * Convert text to speech with streaming response
+   *
+   * @remarks
+   * Convert text to speech using the specified voice with streaming response. Returns binary audio stream.
+   */
+  async streamSpeech(
+    request: operations.StreamSpeechRequest,
+    options?: RequestOptions & {
+      acceptHeaderOverride?: StreamSpeechAcceptEnum;
+    },
+  ): Promise<operations.StreamSpeechResponse> {
+    return unwrapAsync(textToSpeechStreamSpeech(
+      this,
+      request,
+      options,
+    ));
+  }
+
+  /**
+   * Predict text-to-speech duration
+   *
+   * @remarks
+   * Predict the duration of text-to-speech conversion without generating audio
+   */
+  async predictDuration(
+    request: operations.PredictDurationRequest,
+    options?: RequestOptions,
+  ): Promise<operations.PredictDurationResponse> {
+    return unwrapAsync(textToSpeechPredictDuration(
+      this,
+      request,
+      options,
+    ));
+  }
 }
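
For orientation, the class in this diff replaces `createSpeech` and `streamSpeech` in its constructor with auto-chunking wrappers: text longer than the cap is split with `chunkText`, synthesized chunk by chunk, and the audio either merged (non-streaming) or concatenated on the fly with redundant WAV/MP3 headers stripped (streaming). A minimal sketch of how the chunking path might be exercised follows; it is illustrative only. Client construction is elided, the request is cast because required fields beyond those visible in the diff are not shown, and `maxTextLength` is honored at runtime even though the public `createSpeech` signature omits it (the override is installed with an `as any` cast). The value 300 is hypothetical; omitting the option falls back to DEFAULT_MAX_TEXT_LENGTH.

// Illustrative sketch, not the package's documented usage.
declare const tts: TextToSpeech; // assume a configured instance

async function synthesizeLongText(longText: string): Promise<Uint8Array> {
  const response = await tts.createSpeech(
    {
      apiConvertTextToSpeechUsingCharacterRequest: {
        text: longText, // chunked transparently when over the cap
        outputFormat: "mp3", // selects the audio/mpeg Accept header per chunk
      },
    } as any, // other required request fields elided in this sketch
    { maxTextLength: 300 } as any, // runtime-only option, see note above
  );

  // The merged result is surfaced as a ReadableStream<Uint8Array>;
  // drain it the same way extractAudioFromResponse does above.
  const reader = (response.result as ReadableStream<Uint8Array>).getReader();
  const parts: Uint8Array[] = [];
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    if (value) parts.push(value);
  }
  const merged = new Uint8Array(parts.reduce((n, p) => n + p.length, 0));
  let offset = 0;
  for (const p of parts) {
    merged.set(p, offset);
    offset += p.length;
  }
  return merged;
}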