kugelaudio 0.2.0 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -8
- package/dist/index.d.mts +207 -17
- package/dist/index.d.ts +207 -17
- package/dist/index.js +323 -16
- package/dist/index.mjs +330 -16
- package/package.json +5 -1
- package/src/client.ts +391 -18
- package/src/index.ts +8 -3
- package/src/types.ts +116 -12
- package/src/websocket.ts +44 -0
package/src/types.ts
CHANGED
|
@@ -17,7 +17,7 @@ export interface Model {
|
|
|
17
17
|
/**
|
|
18
18
|
* Voice category types.
|
|
19
19
|
*/
|
|
20
|
-
export type VoiceCategory = 'premade' | 'cloned' | 'designed';
|
|
20
|
+
export type VoiceCategory =
  | 'premade'
  | 'cloned'
  | 'designed'
  | 'conversational'
  | 'narrative'
  | 'narrative_story'
  | 'characters';
|
|
21
21
|
|
|
22
22
|
/**
|
|
23
23
|
* Voice sex types.
|
|
@@ -47,6 +47,24 @@ export interface Voice {
|
|
|
47
47
|
verified: boolean;
|
|
48
48
|
}
|
|
49
49
|
|
|
50
|
+
/**
 * Timing and text position of a single word, as produced by server-side
 * forced alignment.
 */
export interface WordTimestamp {
  /** The aligned word. */
  word: string;
  /** Start time in milliseconds, relative to the start of the chunk/audio. */
  startMs: number;
  /** End time in milliseconds, relative to the start of the chunk/audio. */
  endMs: number;
  /** Character offset in the original text where the word starts. */
  charStart: number;
  /** Character offset in the original text where the word ends. */
  charEnd: number;
  /** Alignment confidence score in the range 0.0 - 1.0. */
  score: number;
}
|
|
67
|
+
|
|
50
68
|
/**
|
|
51
69
|
* TTS generation request options.
|
|
52
70
|
*/
|
|
@@ -54,7 +72,7 @@ export interface GenerateOptions {
|
|
|
54
72
|
/** Text to synthesize */
|
|
55
73
|
text: string;
|
|
56
74
|
/** Model to use: 'kugel-1-turbo' (1.5B, fast) or 'kugel-1' (7B, premium). Default: 'kugel-1-turbo' */
|
|
57
|
-
|
|
75
|
+
modelId?: string;
|
|
58
76
|
/** Voice ID to use */
|
|
59
77
|
voiceId?: number;
|
|
60
78
|
/** CFG scale for generation (default: 2.0) */
|
|
@@ -63,27 +81,30 @@ export interface GenerateOptions {
|
|
|
63
81
|
maxNewTokens?: number;
|
|
64
82
|
/** Output sample rate (default: 24000) */
|
|
65
83
|
sampleRate?: number;
|
|
66
|
-
/** Whether to add speaker prefix (default: true) */
|
|
67
|
-
speakerPrefix?: boolean;
|
|
68
84
|
/**
|
|
69
85
|
* Enable text normalization (converts numbers, dates, etc. to spoken words).
|
|
70
86
|
* When true, text will be normalized before TTS generation.
|
|
71
|
-
* Default:
|
|
87
|
+
* Default: true
|
|
72
88
|
*
|
|
73
|
-
* ⚠️
|
|
74
|
-
*
|
|
75
|
-
* the language parameter when using normalization.
|
|
89
|
+
* ⚠️ For best performance, always specify the language parameter when using
|
|
90
|
+
* normalization. Without it, language auto-detection adds ~150ms latency.
|
|
76
91
|
*/
|
|
77
92
|
normalize?: boolean;
|
|
78
93
|
/**
|
|
79
94
|
* ISO 639-1 language code for text normalization (e.g., 'de', 'en', 'fr').
|
|
80
|
-
* If not provided and normalize is true, language will be auto-detected
|
|
95
|
+
* If not provided and normalize is true (default), language will be auto-detected
|
|
81
96
|
* (adds ~150ms latency).
|
|
82
97
|
*
|
|
83
98
|
* Supported: de, en, fr, es, it, pt, nl, pl, sv, da, no, fi, cs, hu, ro,
|
|
84
99
|
* el, uk, bg, tr, vi, ar, hi, zh, ja, ko
|
|
85
100
|
*/
|
|
86
101
|
language?: string;
|
|
102
|
+
/**
|
|
103
|
+
* Request word-level timestamps alongside audio.
|
|
104
|
+
* When true, the server performs forced alignment and returns per-word timing boundaries.
|
|
105
|
+
* Default: false
|
|
106
|
+
*/
|
|
107
|
+
wordTimestamps?: boolean;
|
|
87
108
|
}
|
|
88
109
|
|
|
89
110
|
/**
|
|
@@ -98,12 +119,25 @@ export interface StreamConfig {
|
|
|
98
119
|
maxNewTokens?: number;
|
|
99
120
|
/** Output sample rate */
|
|
100
121
|
sampleRate?: number;
|
|
101
|
-
/** Whether to add speaker prefix */
|
|
102
|
-
speakerPrefix?: boolean;
|
|
103
122
|
/** Auto-flush timeout in milliseconds */
|
|
104
123
|
flushTimeoutMs?: number;
|
|
105
124
|
/** Maximum buffer length */
|
|
106
125
|
maxBufferLength?: number;
|
|
126
|
+
/**
|
|
127
|
+
* Enable text normalization (converts numbers, dates, etc. to spoken words).
|
|
128
|
+
* Default: true
|
|
129
|
+
*/
|
|
130
|
+
normalize?: boolean;
|
|
131
|
+
/**
|
|
132
|
+
* ISO 639-1 language code for text normalization (e.g., 'de', 'en', 'fr').
|
|
133
|
+
* Specify to avoid ~150ms auto-detection latency.
|
|
134
|
+
*/
|
|
135
|
+
language?: string;
|
|
136
|
+
/**
|
|
137
|
+
* Request word-level timestamps alongside audio.
|
|
138
|
+
* Default: false
|
|
139
|
+
*/
|
|
140
|
+
wordTimestamps?: boolean;
|
|
107
141
|
}
|
|
108
142
|
|
|
109
143
|
/**
|
|
@@ -160,6 +194,8 @@ export interface AudioResponse {
|
|
|
160
194
|
generationMs: number;
|
|
161
195
|
/** Real-time factor */
|
|
162
196
|
rtf: number;
|
|
197
|
+
/** Per-word timing boundaries (populated when `wordTimestamps: true`) */
|
|
198
|
+
wordTimestamps: WordTimestamp[];
|
|
163
199
|
}
|
|
164
200
|
|
|
165
201
|
/**
|
|
@@ -168,6 +204,8 @@ export interface AudioResponse {
|
|
|
168
204
|
export interface StreamCallbacks {
|
|
169
205
|
/** Called when an audio chunk is received */
|
|
170
206
|
onChunk?: (chunk: AudioChunk) => void;
|
|
207
|
+
/** Called when word-level timestamps are received (requires `wordTimestamps: true`) */
|
|
208
|
+
onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
|
|
171
209
|
/** Called when generation is complete */
|
|
172
210
|
onFinal?: (stats: GenerationStats) => void;
|
|
173
211
|
/** Called on error */
|
|
@@ -188,9 +226,11 @@ export interface KugelAudioOptions {
|
|
|
188
226
|
isMasterKey?: boolean;
|
|
189
227
|
/** Whether apiKey is a JWT token (for user authentication). Takes precedence over isMasterKey. */
|
|
190
228
|
isToken?: boolean;
|
|
229
|
+
/** Organisation ID to bill usage against (required for token auth to enable usage recording). */
|
|
230
|
+
orgId?: number;
|
|
191
231
|
/** API base URL (default: https://api.kugelaudio.com) */
|
|
192
232
|
apiUrl?: string;
|
|
193
|
-
/** TTS server URL (default:
|
|
233
|
+
/** TTS server URL (default: same as apiUrl) */
|
|
194
234
|
ttsUrl?: string;
|
|
195
235
|
/** Request timeout in milliseconds (default: 60000) */
|
|
196
236
|
timeout?: number;
|
|
@@ -205,3 +245,67 @@ export interface ApiError {
|
|
|
205
245
|
statusCode?: number;
|
|
206
246
|
}
|
|
207
247
|
|
|
248
|
+
/**
 * Configuration for a multi-context session.
 *
 * Every field is optional; the defaults quoted below are the ones stated by
 * the package's own field documentation.
 */
export interface MultiContextConfig {
  /** Voice ID used for new contexts by default. */
  defaultVoiceId?: number;
  /** Output sample rate (default: 24000). */
  sampleRate?: number;
  /** CFG scale for generation (default: 2.0). */
  cfgScale?: number;
  /** Maximum number of tokens to generate (default: 2048). */
  maxNewTokens?: number;
  /** Whether text normalization is enabled (default: true). */
  normalize?: boolean;
  /** Seconds before a context auto-closes (default: 20.0). */
  inactivityTimeout?: number;
}
|
|
265
|
+
|
|
266
|
+
/**
 * Voice settings that apply to one specific context.
 *
 * Every field is optional.
 */
export interface ContextVoiceSettings {
  /** Stability, in the range 0.0-1.0. */
  stability?: number;
  /** Similarity boost, in the range 0.0-1.0. */
  similarityBoost?: number;
  /** Style, in the range 0.0-1.0. */
  style?: number;
  /** Whether to use speaker boost. */
  useSpeakerBoost?: boolean;
  /** Speed multiplier. */
  speed?: number;
}
|
|
281
|
+
|
|
282
|
+
/**
 * An audio chunk delivered during multi-context streaming: a regular
 * `AudioChunk` tagged with the context it was generated for.
 */
export interface MultiContextAudioChunk extends AudioChunk {
  /** ID of the context this audio belongs to. */
  contextId: string;
}
|
|
289
|
+
|
|
290
|
+
/**
 * Event callbacks for multi-context streaming.
 *
 * Every callback is optional.
 */
export interface MultiContextCallbacks {
  /** Called when the session is started. */
  onSessionStarted?: (sessionId: string) => void;
  /** Called when a context is created. */
  onContextCreated?: (contextId: string) => void;
  /** Called when an audio chunk is received. */
  onChunk?: (chunk: MultiContextAudioChunk) => void;
  /** Called when a context finishes generating. */
  onContextFinal?: (contextId: string) => void;
  /** Called when a context is closed. */
  onContextClosed?: (contextId: string) => void;
  /** Called when a context times out. */
  onContextTimeout?: (contextId: string) => void;
  /** Called when the session is closed. */
  onSessionClosed?: (stats: Record<string, unknown>) => void;
  /** Called on error; `contextId` is an optional second argument. */
  onError?: (error: Error, contextId?: string) => void;
}
|
|
311
|
+
|
package/src/websocket.ts
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* WebSocket compatibility layer for browser and Node.js environments.
|
|
3
|
+
*
|
|
4
|
+
* IMPORTANT: WebSocket resolution is lazy to avoid top-level side-effects
|
|
5
|
+
* that break server-side bundlers (Turbopack / Webpack) when this module
|
|
6
|
+
* is imported in a Node.js (API route) context.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
/** Memoized constructor; stays `null` until a lookup succeeds. */
let _cachedWs: typeof WebSocket | null = null;

/**
 * Get the WebSocket constructor for the current environment.
 *
 * Lookup order:
 *  1. `globalThis.WebSocket`, when it is a constructor (browsers, and any
 *     other runtime that ships a global WebSocket).
 *  2. The `ws` package, loaded through an indirect `require` so bundlers do
 *     not try to resolve it statically.
 *
 * Only a value that is actually a function is accepted and cached. A stubbed
 * global (e.g. `null`) or a non-constructor `ws` export is skipped instead of
 * being handed back to the caller as if it were usable.
 *
 * @returns The WebSocket constructor; the same value on every later call.
 * @throws Error when no constructor can be found. If loading `ws` threw, the
 *   underlying error is attached as `cause`.
 */
export function getWebSocket(): typeof WebSocket {
  if (_cachedWs !== null) return _cachedWs;

  // The typeof guard keeps this safe on engines that predate `globalThis`.
  const globalWs: unknown =
    typeof globalThis !== 'undefined'
      ? (globalThis as { WebSocket?: unknown }).WebSocket
      : undefined;
  if (typeof globalWs === 'function') {
    const resolved = globalWs as typeof WebSocket;
    _cachedWs = resolved;
    return resolved;
  }

  // Fall back to the `ws` package.
  let loadFailure: unknown;
  try {
    // Prefer the module-scoped `require` (CommonJS). Otherwise probe the
    // global scope via the Function constructor, which hides the lookup from
    // bundler static analysis.
    const nodeRequire: unknown =
      typeof require !== 'undefined'
        ? require
        : // eslint-disable-next-line no-new-func
          Function('return typeof require !== "undefined" ? require : undefined')();
    if (typeof nodeRequire === 'function') {
      const loaded: unknown = nodeRequire('ws');
      // Accept both a bare constructor export and an interop `{ default }` wrapper.
      const viaDefault = (loaded as { default?: unknown } | null | undefined)?.default;
      const candidate = typeof viaDefault === 'function' ? viaDefault : loaded;
      if (typeof candidate === 'function') {
        const resolved = candidate as typeof WebSocket;
        _cachedWs = resolved;
        return resolved;
      }
    }
  } catch (err: unknown) {
    // Keep the original failure so it can be surfaced on the final error.
    loadFailure = err;
  }

  const error: Error & { cause?: unknown } = new Error(
    'WebSocket not available. In Node.js, install the "ws" package: npm install ws'
  );
  if (loadFailure !== undefined) error.cause = loadFailure;
  throw error;
}
|