kugelaudio 0.2.0 → 0.2.3
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/README.md +6 -8
- package/dist/index.d.mts +207 -17
- package/dist/index.d.ts +207 -17
- package/dist/index.js +323 -16
- package/dist/index.mjs +330 -16
- package/package.json +5 -1
- package/src/client.ts +391 -18
- package/src/index.ts +8 -3
- package/src/types.ts +116 -12
- package/src/websocket.ts +44 -0
package/README.md
CHANGED
|
@@ -31,7 +31,7 @@ const client = new KugelAudio({ apiKey: 'your_api_key' });
|
|
|
31
31
|
// Generate speech
|
|
32
32
|
const audio = await client.tts.generate({
|
|
33
33
|
text: 'Hello, world!',
|
|
34
|
-
|
|
34
|
+
modelId: 'kugel-1-turbo',
|
|
35
35
|
});
|
|
36
36
|
|
|
37
37
|
// Create a playable blob (browser)
|
|
@@ -144,12 +144,11 @@ Generate complete audio and receive it all at once:
|
|
|
144
144
|
```typescript
|
|
145
145
|
const audio = await client.tts.generate({
|
|
146
146
|
text: 'Hello, this is a test of the KugelAudio text-to-speech system.',
|
|
147
|
-
|
|
147
|
+
modelId: 'kugel-1-turbo', // 'kugel-1-turbo' (fast) or 'kugel-1' (quality)
|
|
148
148
|
voiceId: 123, // Optional: specific voice ID
|
|
149
149
|
cfgScale: 2.0, // Guidance scale (1.0-5.0)
|
|
150
150
|
maxNewTokens: 2048, // Maximum tokens to generate
|
|
151
151
|
sampleRate: 24000, // Output sample rate
|
|
152
|
-
speakerPrefix: true, // Add speaker prefix for better quality
|
|
153
152
|
normalize: true, // Enable text normalization (see below)
|
|
154
153
|
language: 'en', // Language for normalization
|
|
155
154
|
});
|
|
@@ -171,7 +170,7 @@ import { createWavBlob } from 'kugelaudio';
|
|
|
171
170
|
|
|
172
171
|
const audio = await client.tts.generate({
|
|
173
172
|
text: 'Hello, world!',
|
|
174
|
-
|
|
173
|
+
modelId: 'kugel-1-turbo',
|
|
175
174
|
});
|
|
176
175
|
|
|
177
176
|
// Create WAV blob for playback
|
|
@@ -200,7 +199,7 @@ Receive audio chunks as they are generated for lower latency:
|
|
|
200
199
|
await client.tts.stream(
|
|
201
200
|
{
|
|
202
201
|
text: 'Hello, this is streaming audio.',
|
|
203
|
-
|
|
202
|
+
modelId: 'kugel-1-turbo',
|
|
204
203
|
},
|
|
205
204
|
{
|
|
206
205
|
onOpen: () => {
|
|
@@ -354,8 +353,7 @@ interface GenerateOptions {
|
|
|
354
353
|
cfgScale?: number; // Default: 2.0
|
|
355
354
|
maxNewTokens?: number; // Default: 2048
|
|
356
355
|
sampleRate?: number; // Default: 24000
|
|
357
|
-
|
|
358
|
-
normalize?: boolean; // Default: false - Enable text normalization
|
|
356
|
+
normalize?: boolean; // Default: true - Enable text normalization
|
|
359
357
|
language?: string; // ISO 639-1 code for normalization (e.g., 'en', 'de')
|
|
360
358
|
}
|
|
361
359
|
```
|
|
@@ -520,7 +518,7 @@ async function main() {
|
|
|
520
518
|
await client.tts.stream(
|
|
521
519
|
{
|
|
522
520
|
text: 'Welcome to KugelAudio. This is an example of high-quality text-to-speech synthesis.',
|
|
523
|
-
|
|
521
|
+
modelId: 'kugel-1-turbo',
|
|
524
522
|
},
|
|
525
523
|
{
|
|
526
524
|
onChunk: (chunk) => {
|
package/dist/index.d.mts
CHANGED
|
@@ -15,7 +15,7 @@ interface Model {
|
|
|
15
15
|
/**
|
|
16
16
|
* Voice category types.
|
|
17
17
|
*/
|
|
18
|
-
type VoiceCategory = 'premade' | 'cloned' | 'designed';
|
|
18
|
+
type VoiceCategory = 'premade' | 'cloned' | 'designed' | 'conversational' | 'narrative' | 'narrative_story' | 'characters';
|
|
19
19
|
/**
|
|
20
20
|
* Voice sex types.
|
|
21
21
|
*/
|
|
@@ -41,6 +41,23 @@ interface Voice {
|
|
|
41
41
|
isPublic: boolean;
|
|
42
42
|
verified: boolean;
|
|
43
43
|
}
|
|
44
|
+
/**
|
|
45
|
+
* Word-level timestamp from server-side forced alignment.
|
|
46
|
+
*/
|
|
47
|
+
interface WordTimestamp {
|
|
48
|
+
/** The aligned word */
|
|
49
|
+
word: string;
|
|
50
|
+
/** Start time in milliseconds (relative to chunk/audio start) */
|
|
51
|
+
startMs: number;
|
|
52
|
+
/** End time in milliseconds (relative to chunk/audio start) */
|
|
53
|
+
endMs: number;
|
|
54
|
+
/** Start character offset in the original text */
|
|
55
|
+
charStart: number;
|
|
56
|
+
/** End character offset in the original text */
|
|
57
|
+
charEnd: number;
|
|
58
|
+
/** Alignment confidence score (0.0 - 1.0) */
|
|
59
|
+
score: number;
|
|
60
|
+
}
|
|
44
61
|
/**
|
|
45
62
|
* TTS generation request options.
|
|
46
63
|
*/
|
|
@@ -48,7 +65,7 @@ interface GenerateOptions {
|
|
|
48
65
|
/** Text to synthesize */
|
|
49
66
|
text: string;
|
|
50
67
|
/** Model to use: 'kugel-1-turbo' (1.5B, fast) or 'kugel-1' (7B, premium). Default: 'kugel-1-turbo' */
|
|
51
|
-
|
|
68
|
+
modelId?: string;
|
|
52
69
|
/** Voice ID to use */
|
|
53
70
|
voiceId?: number;
|
|
54
71
|
/** CFG scale for generation (default: 2.0) */
|
|
@@ -57,27 +74,30 @@ interface GenerateOptions {
|
|
|
57
74
|
maxNewTokens?: number;
|
|
58
75
|
/** Output sample rate (default: 24000) */
|
|
59
76
|
sampleRate?: number;
|
|
60
|
-
/** Whether to add speaker prefix (default: true) */
|
|
61
|
-
speakerPrefix?: boolean;
|
|
62
77
|
/**
|
|
63
78
|
* Enable text normalization (converts numbers, dates, etc. to spoken words).
|
|
64
79
|
* When true, text will be normalized before TTS generation.
|
|
65
|
-
* Default:
|
|
80
|
+
* Default: true
|
|
66
81
|
*
|
|
67
|
-
* ⚠️
|
|
68
|
-
*
|
|
69
|
-
* the language parameter when using normalization.
|
|
82
|
+
* ⚠️ For best performance, always specify the language parameter when using
|
|
83
|
+
* normalization. Without it, language auto-detection adds ~150ms latency.
|
|
70
84
|
*/
|
|
71
85
|
normalize?: boolean;
|
|
72
86
|
/**
|
|
73
87
|
* ISO 639-1 language code for text normalization (e.g., 'de', 'en', 'fr').
|
|
74
|
-
* If not provided and normalize is true, language will be auto-detected
|
|
88
|
+
* If not provided and normalize is true (default), language will be auto-detected
|
|
75
89
|
* (adds ~150ms latency).
|
|
76
90
|
*
|
|
77
91
|
* Supported: de, en, fr, es, it, pt, nl, pl, sv, da, no, fi, cs, hu, ro,
|
|
78
92
|
* el, uk, bg, tr, vi, ar, hi, zh, ja, ko
|
|
79
93
|
*/
|
|
80
94
|
language?: string;
|
|
95
|
+
/**
|
|
96
|
+
* Request word-level timestamps alongside audio.
|
|
97
|
+
* When true, the server performs forced alignment and returns per-word timing boundaries.
|
|
98
|
+
* Default: false
|
|
99
|
+
*/
|
|
100
|
+
wordTimestamps?: boolean;
|
|
81
101
|
}
|
|
82
102
|
/**
|
|
83
103
|
* Streaming session configuration.
|
|
@@ -91,12 +111,25 @@ interface StreamConfig {
|
|
|
91
111
|
maxNewTokens?: number;
|
|
92
112
|
/** Output sample rate */
|
|
93
113
|
sampleRate?: number;
|
|
94
|
-
/** Whether to add speaker prefix */
|
|
95
|
-
speakerPrefix?: boolean;
|
|
96
114
|
/** Auto-flush timeout in milliseconds */
|
|
97
115
|
flushTimeoutMs?: number;
|
|
98
116
|
/** Maximum buffer length */
|
|
99
117
|
maxBufferLength?: number;
|
|
118
|
+
/**
|
|
119
|
+
* Enable text normalization (converts numbers, dates, etc. to spoken words).
|
|
120
|
+
* Default: true
|
|
121
|
+
*/
|
|
122
|
+
normalize?: boolean;
|
|
123
|
+
/**
|
|
124
|
+
* ISO 639-1 language code for text normalization (e.g., 'de', 'en', 'fr').
|
|
125
|
+
* Specify to avoid ~150ms auto-detection latency.
|
|
126
|
+
*/
|
|
127
|
+
language?: string;
|
|
128
|
+
/**
|
|
129
|
+
* Request word-level timestamps alongside audio.
|
|
130
|
+
* Default: false
|
|
131
|
+
*/
|
|
132
|
+
wordTimestamps?: boolean;
|
|
100
133
|
}
|
|
101
134
|
/**
|
|
102
135
|
* Audio chunk from streaming TTS.
|
|
@@ -150,6 +183,8 @@ interface AudioResponse {
|
|
|
150
183
|
generationMs: number;
|
|
151
184
|
/** Real-time factor */
|
|
152
185
|
rtf: number;
|
|
186
|
+
/** Per-word timing boundaries (populated when `wordTimestamps: true`) */
|
|
187
|
+
wordTimestamps: WordTimestamp[];
|
|
153
188
|
}
|
|
154
189
|
/**
|
|
155
190
|
* Event callbacks for streaming.
|
|
@@ -157,6 +192,8 @@ interface AudioResponse {
|
|
|
157
192
|
interface StreamCallbacks {
|
|
158
193
|
/** Called when an audio chunk is received */
|
|
159
194
|
onChunk?: (chunk: AudioChunk) => void;
|
|
195
|
+
/** Called when word-level timestamps are received (requires `wordTimestamps: true`) */
|
|
196
|
+
onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
|
|
160
197
|
/** Called when generation is complete */
|
|
161
198
|
onFinal?: (stats: GenerationStats) => void;
|
|
162
199
|
/** Called on error */
|
|
@@ -176,17 +213,75 @@ interface KugelAudioOptions {
|
|
|
176
213
|
isMasterKey?: boolean;
|
|
177
214
|
/** Whether apiKey is a JWT token (for user authentication). Takes precedence over isMasterKey. */
|
|
178
215
|
isToken?: boolean;
|
|
216
|
+
/** Organisation ID to bill usage against (required for token auth to enable usage recording). */
|
|
217
|
+
orgId?: number;
|
|
179
218
|
/** API base URL (default: https://api.kugelaudio.com) */
|
|
180
219
|
apiUrl?: string;
|
|
181
|
-
/** TTS server URL (default:
|
|
220
|
+
/** TTS server URL (default: same as apiUrl) */
|
|
182
221
|
ttsUrl?: string;
|
|
183
222
|
/** Request timeout in milliseconds (default: 60000) */
|
|
184
223
|
timeout?: number;
|
|
185
224
|
}
|
|
186
|
-
|
|
187
225
|
/**
|
|
188
|
-
*
|
|
226
|
+
* Multi-context session configuration.
|
|
189
227
|
*/
|
|
228
|
+
interface MultiContextConfig {
|
|
229
|
+
/** Default voice ID for new contexts */
|
|
230
|
+
defaultVoiceId?: number;
|
|
231
|
+
/** Output sample rate (default: 24000) */
|
|
232
|
+
sampleRate?: number;
|
|
233
|
+
/** CFG scale for generation (default: 2.0) */
|
|
234
|
+
cfgScale?: number;
|
|
235
|
+
/** Maximum tokens to generate (default: 2048) */
|
|
236
|
+
maxNewTokens?: number;
|
|
237
|
+
/** Enable text normalization (default: true) */
|
|
238
|
+
normalize?: boolean;
|
|
239
|
+
/** Seconds before context auto-closes (default: 20.0) */
|
|
240
|
+
inactivityTimeout?: number;
|
|
241
|
+
}
|
|
242
|
+
/**
|
|
243
|
+
* Voice settings for a specific context.
|
|
244
|
+
*/
|
|
245
|
+
interface ContextVoiceSettings {
|
|
246
|
+
/** Stability (0.0-1.0) */
|
|
247
|
+
stability?: number;
|
|
248
|
+
/** Similarity boost (0.0-1.0) */
|
|
249
|
+
similarityBoost?: number;
|
|
250
|
+
/** Style (0.0-1.0) */
|
|
251
|
+
style?: number;
|
|
252
|
+
/** Use speaker boost */
|
|
253
|
+
useSpeakerBoost?: boolean;
|
|
254
|
+
/** Speed multiplier */
|
|
255
|
+
speed?: number;
|
|
256
|
+
}
|
|
257
|
+
/**
|
|
258
|
+
* Audio chunk from multi-context streaming.
|
|
259
|
+
*/
|
|
260
|
+
interface MultiContextAudioChunk extends AudioChunk {
|
|
261
|
+
/** Context ID this audio belongs to */
|
|
262
|
+
contextId: string;
|
|
263
|
+
}
|
|
264
|
+
/**
|
|
265
|
+
* Event callbacks for multi-context streaming.
|
|
266
|
+
*/
|
|
267
|
+
interface MultiContextCallbacks {
|
|
268
|
+
/** Called when session is started */
|
|
269
|
+
onSessionStarted?: (sessionId: string) => void;
|
|
270
|
+
/** Called when a context is created */
|
|
271
|
+
onContextCreated?: (contextId: string) => void;
|
|
272
|
+
/** Called when an audio chunk is received */
|
|
273
|
+
onChunk?: (chunk: MultiContextAudioChunk) => void;
|
|
274
|
+
/** Called when a context finishes generating */
|
|
275
|
+
onContextFinal?: (contextId: string) => void;
|
|
276
|
+
/** Called when a context is closed */
|
|
277
|
+
onContextClosed?: (contextId: string) => void;
|
|
278
|
+
/** Called when a context times out */
|
|
279
|
+
onContextTimeout?: (contextId: string) => void;
|
|
280
|
+
/** Called when session is closed */
|
|
281
|
+
onSessionClosed?: (stats: Record<string, unknown>) => void;
|
|
282
|
+
/** Called on error */
|
|
283
|
+
onError?: (error: Error, contextId?: string) => void;
|
|
284
|
+
}
|
|
190
285
|
|
|
191
286
|
/**
|
|
192
287
|
* Models resource for listing TTS models.
|
|
@@ -290,6 +385,98 @@ declare class TTSResource {
|
|
|
290
385
|
*/
|
|
291
386
|
close(): void;
|
|
292
387
|
private parseError;
|
|
388
|
+
/**
|
|
389
|
+
* Create a multi-context session for concurrent TTS streams.
|
|
390
|
+
*
|
|
391
|
+
* Allows managing up to 5 independent audio generation contexts
|
|
392
|
+
* over a single WebSocket connection. Each context has its own
|
|
393
|
+
* text buffer, voice settings, and generation queue.
|
|
394
|
+
*
|
|
395
|
+
* @example
|
|
396
|
+
* ```typescript
|
|
397
|
+
* const session = client.tts.createMultiContextSession({
|
|
398
|
+
* defaultVoiceId: 123,
|
|
399
|
+
* });
|
|
400
|
+
*
|
|
401
|
+
* session.connect({
|
|
402
|
+
* onChunk: (chunk) => {
|
|
403
|
+
* console.log(`Audio from ${chunk.contextId}`);
|
|
404
|
+
* playAudio(chunk.audio);
|
|
405
|
+
* },
|
|
406
|
+
* onContextFinal: (contextId) => {
|
|
407
|
+
* console.log(`${contextId} finished`);
|
|
408
|
+
* },
|
|
409
|
+
* });
|
|
410
|
+
*
|
|
411
|
+
* // Create contexts with different voices
|
|
412
|
+
* session.createContext('narrator', { voiceId: 123 });
|
|
413
|
+
* session.createContext('character', { voiceId: 456 });
|
|
414
|
+
*
|
|
415
|
+
* // Send text to different speakers
|
|
416
|
+
* session.send('narrator', 'The story begins.', true);
|
|
417
|
+
* session.send('character', 'Hello!', true);
|
|
418
|
+
*
|
|
419
|
+
* // Close when done
|
|
420
|
+
* session.close();
|
|
421
|
+
* ```
|
|
422
|
+
*/
|
|
423
|
+
createMultiContextSession(config?: MultiContextConfig): MultiContextSession;
|
|
424
|
+
}
|
|
425
|
+
/**
|
|
426
|
+
* Multi-context WebSocket session for concurrent TTS streams.
|
|
427
|
+
*/
|
|
428
|
+
declare class MultiContextSession {
|
|
429
|
+
private client;
|
|
430
|
+
private ws;
|
|
431
|
+
private config;
|
|
432
|
+
private callbacks;
|
|
433
|
+
private contexts;
|
|
434
|
+
private _sessionId;
|
|
435
|
+
private isStarted;
|
|
436
|
+
constructor(client: KugelAudio, config?: MultiContextConfig);
|
|
437
|
+
/**
|
|
438
|
+
* Get the current session ID, or null if not connected.
|
|
439
|
+
*/
|
|
440
|
+
get sessionId(): string | null;
|
|
441
|
+
/**
|
|
442
|
+
* Connect to the multi-context WebSocket endpoint.
|
|
443
|
+
*/
|
|
444
|
+
connect(callbacks: MultiContextCallbacks): void;
|
|
445
|
+
/**
|
|
446
|
+
* Create a new context with optional voice settings.
|
|
447
|
+
*/
|
|
448
|
+
createContext(contextId: string, options?: {
|
|
449
|
+
voiceId?: number;
|
|
450
|
+
voiceSettings?: ContextVoiceSettings;
|
|
451
|
+
}): void;
|
|
452
|
+
/**
|
|
453
|
+
* Send text to a specific context.
|
|
454
|
+
*/
|
|
455
|
+
send(contextId: string, text: string, flush?: boolean): void;
|
|
456
|
+
/**
|
|
457
|
+
* Flush a context's buffer.
|
|
458
|
+
*/
|
|
459
|
+
flush(contextId: string): void;
|
|
460
|
+
/**
|
|
461
|
+
* Close a specific context.
|
|
462
|
+
*/
|
|
463
|
+
closeContext(contextId: string): void;
|
|
464
|
+
/**
|
|
465
|
+
* Send keep-alive to reset a context's inactivity timeout.
|
|
466
|
+
*/
|
|
467
|
+
keepAlive(contextId: string): void;
|
|
468
|
+
/**
|
|
469
|
+
* Close the session and all contexts.
|
|
470
|
+
*/
|
|
471
|
+
close(): void;
|
|
472
|
+
/**
|
|
473
|
+
* Get active context IDs.
|
|
474
|
+
*/
|
|
475
|
+
get activeContexts(): string[];
|
|
476
|
+
/**
|
|
477
|
+
* Check if connected.
|
|
478
|
+
*/
|
|
479
|
+
get isConnected(): boolean;
|
|
293
480
|
}
|
|
294
481
|
/**
|
|
295
482
|
* KugelAudio API client.
|
|
@@ -307,13 +494,13 @@ declare class TTSResource {
|
|
|
307
494
|
* // Generate audio with fast model (1.5B params)
|
|
308
495
|
* const audio = await client.tts.generate({
|
|
309
496
|
* text: 'Hello, world!',
|
|
310
|
-
*
|
|
497
|
+
* modelId: 'kugel-1-turbo',
|
|
311
498
|
* });
|
|
312
499
|
*
|
|
313
500
|
* // Generate audio with premium model (7B params)
|
|
314
501
|
* const audio = await client.tts.generate({
|
|
315
502
|
* text: 'Hello, world!',
|
|
316
|
-
*
|
|
503
|
+
* modelId: 'kugel-1',
|
|
317
504
|
* });
|
|
318
505
|
* ```
|
|
319
506
|
*/
|
|
@@ -321,6 +508,7 @@ declare class KugelAudio {
|
|
|
321
508
|
private _apiKey;
|
|
322
509
|
private _isMasterKey;
|
|
323
510
|
private _isToken;
|
|
511
|
+
private _orgId;
|
|
324
512
|
private _apiUrl;
|
|
325
513
|
private _ttsUrl;
|
|
326
514
|
private _timeout;
|
|
@@ -354,6 +542,8 @@ declare class KugelAudio {
|
|
|
354
542
|
get isMasterKey(): boolean;
|
|
355
543
|
/** Check if using JWT token authentication */
|
|
356
544
|
get isToken(): boolean;
|
|
545
|
+
/** Get organisation ID for billing */
|
|
546
|
+
get orgId(): number | undefined;
|
|
357
547
|
/** Get TTS URL */
|
|
358
548
|
get ttsUrl(): string;
|
|
359
549
|
/**
|
|
@@ -451,4 +641,4 @@ declare function createWavFile(audio: ArrayBuffer, sampleRate: number): ArrayBuf
|
|
|
451
641
|
*/
|
|
452
642
|
declare function createWavBlob(audio: ArrayBuffer, sampleRate: number): Blob;
|
|
453
643
|
|
|
454
|
-
export { type AudioChunk, type AudioResponse, AuthenticationError, ConnectionError, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioOptions, type Model, RateLimitError, type StreamCallbacks, type StreamConfig, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceSex, base64ToArrayBuffer, createWavBlob, createWavFile, decodePCM16 };
|
|
644
|
+
export { type AudioChunk, type AudioResponse, AuthenticationError, ConnectionError, type ContextVoiceSettings, type GenerateOptions, type GenerationStats, InsufficientCreditsError, KugelAudio, KugelAudioError, type KugelAudioOptions, type Model, type MultiContextAudioChunk, type MultiContextCallbacks, type MultiContextConfig, RateLimitError, type StreamCallbacks, type StreamConfig, ValidationError, type Voice, type VoiceAge, type VoiceCategory, type VoiceSex, type WordTimestamp, base64ToArrayBuffer, createWavBlob, createWavFile, decodePCM16 };
|