kugelaudio 0.2.0 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/types.ts CHANGED
@@ -17,7 +17,7 @@ export interface Model {
17
17
  /**
18
18
  * Voice category types.
19
19
  */
20
- export type VoiceCategory = 'premade' | 'cloned' | 'designed';
20
+ export type VoiceCategory = 'premade' | 'cloned' | 'designed' | 'conversational' | 'narrative' | 'narrative_story' | 'characters';
21
21
 
22
22
  /**
23
23
  * Voice sex types.
@@ -47,6 +47,24 @@ export interface Voice {
47
47
  verified: boolean;
48
48
  }
49
49
 
/**
 * Timing information for a single word, produced by server-side forced
 * alignment.
 */
export interface WordTimestamp {
  /**
   * The word that was aligned.
   */
  word: string;

  /**
   * Where the word begins, in milliseconds, measured from the start of the
   * chunk/audio.
   */
  startMs: number;

  /**
   * Where the word ends, in milliseconds, measured from the start of the
   * chunk/audio.
   */
  endMs: number;

  /**
   * Character offset in the original text at which the word starts.
   */
  charStart: number;

  /**
   * Character offset in the original text at which the word ends.
   */
  charEnd: number;

  /**
   * Confidence score of the alignment, from 0.0 to 1.0.
   */
  score: number;
}
67
+
50
68
  /**
51
69
  * TTS generation request options.
52
70
  */
@@ -54,7 +72,7 @@ export interface GenerateOptions {
54
72
  /** Text to synthesize */
55
73
  text: string;
56
74
  /** Model to use: 'kugel-1-turbo' (1.5B, fast) or 'kugel-1' (7B, premium). Default: 'kugel-1-turbo' */
57
- model?: string;
75
+ modelId?: string;
58
76
  /** Voice ID to use */
59
77
  voiceId?: number;
60
78
  /** CFG scale for generation (default: 2.0) */
@@ -63,27 +81,30 @@ export interface GenerateOptions {
63
81
  maxNewTokens?: number;
64
82
  /** Output sample rate (default: 24000) */
65
83
  sampleRate?: number;
66
- /** Whether to add speaker prefix (default: true) */
67
- speakerPrefix?: boolean;
68
84
  /**
69
85
  * Enable text normalization (converts numbers, dates, etc. to spoken words).
70
86
  * When true, text will be normalized before TTS generation.
71
- * Default: false
87
+ * Default: true
72
88
  *
73
- * ⚠️ WARNING: Using normalize=true without specifying language adds ~150ms
74
- * latency for language auto-detection. For best performance, always specify
75
- * the language parameter when using normalization.
89
+ * ⚠️ For best performance, always specify the language parameter when using
90
+ * normalization. Without it, language auto-detection adds ~150ms latency.
76
91
  */
77
92
  normalize?: boolean;
78
93
  /**
79
94
  * ISO 639-1 language code for text normalization (e.g., 'de', 'en', 'fr').
80
- * If not provided and normalize is true, language will be auto-detected
95
+ * If not provided and normalize is true (default), language will be auto-detected
81
96
  * (adds ~150ms latency).
82
97
  *
83
98
  * Supported: de, en, fr, es, it, pt, nl, pl, sv, da, no, fi, cs, hu, ro,
84
99
  * el, uk, bg, tr, vi, ar, hi, zh, ja, ko
85
100
  */
86
101
  language?: string;
102
+ /**
103
+ * Request word-level timestamps alongside audio.
104
+ * When true, the server performs forced alignment and returns per-word timing boundaries.
105
+ * Default: false
106
+ */
107
+ wordTimestamps?: boolean;
87
108
  }
88
109
 
89
110
  /**
@@ -98,12 +119,25 @@ export interface StreamConfig {
98
119
  maxNewTokens?: number;
99
120
  /** Output sample rate */
100
121
  sampleRate?: number;
101
- /** Whether to add speaker prefix */
102
- speakerPrefix?: boolean;
103
122
  /** Auto-flush timeout in milliseconds */
104
123
  flushTimeoutMs?: number;
105
124
  /** Maximum buffer length */
106
125
  maxBufferLength?: number;
126
+ /**
127
+ * Enable text normalization (converts numbers, dates, etc. to spoken words).
128
+ * Default: true
129
+ */
130
+ normalize?: boolean;
131
+ /**
132
+ * ISO 639-1 language code for text normalization (e.g., 'de', 'en', 'fr').
133
+ * Specify to avoid ~150ms auto-detection latency.
134
+ */
135
+ language?: string;
136
+ /**
137
+ * Request word-level timestamps alongside audio.
138
+ * Default: false
139
+ */
140
+ wordTimestamps?: boolean;
107
141
  }
108
142
 
109
143
  /**
@@ -160,6 +194,8 @@ export interface AudioResponse {
160
194
  generationMs: number;
161
195
  /** Real-time factor */
162
196
  rtf: number;
197
+ /** Per-word timing boundaries (populated when `wordTimestamps: true`) */
198
+ wordTimestamps: WordTimestamp[];
163
199
  }
164
200
 
165
201
  /**
@@ -168,6 +204,8 @@ export interface AudioResponse {
168
204
  export interface StreamCallbacks {
169
205
  /** Called when an audio chunk is received */
170
206
  onChunk?: (chunk: AudioChunk) => void;
207
+ /** Called when word-level timestamps are received (requires `wordTimestamps: true`) */
208
+ onWordTimestamps?: (timestamps: WordTimestamp[]) => void;
171
209
  /** Called when generation is complete */
172
210
  onFinal?: (stats: GenerationStats) => void;
173
211
  /** Called on error */
@@ -188,9 +226,11 @@ export interface KugelAudioOptions {
188
226
  isMasterKey?: boolean;
189
227
  /** Whether apiKey is a JWT token (for user authentication). Takes precedence over isMasterKey. */
190
228
  isToken?: boolean;
229
+ /** Organisation ID to bill usage against (required for token auth to enable usage recording). */
230
+ orgId?: number;
191
231
  /** API base URL (default: https://api.kugelaudio.com) */
192
232
  apiUrl?: string;
193
- /** TTS server URL (default: https://eu.kugelaudio.com) */
233
+ /** TTS server URL (default: same as apiUrl) */
194
234
  ttsUrl?: string;
195
235
  /** Request timeout in milliseconds (default: 60000) */
196
236
  timeout?: number;
@@ -205,3 +245,67 @@ export interface ApiError {
205
245
  statusCode?: number;
206
246
  }
207
247
 
/**
 * Configuration for a multi-context session.
 *
 * Every field is optional.
 */
export interface MultiContextConfig {
  /**
   * Voice ID used by default for new contexts.
   */
  defaultVoiceId?: number;

  /**
   * Output sample rate.
   *
   * @defaultValue 24000
   */
  sampleRate?: number;

  /**
   * CFG scale for generation.
   *
   * @defaultValue 2.0
   */
  cfgScale?: number;

  /**
   * Maximum number of tokens to generate.
   *
   * @defaultValue 2048
   */
  maxNewTokens?: number;

  /**
   * Whether text normalization is enabled.
   *
   * @defaultValue true
   */
  normalize?: boolean;

  /**
   * Number of seconds before a context auto-closes.
   *
   * @defaultValue 20.0
   */
  inactivityTimeout?: number;
}
265
+
/**
 * Voice settings that apply to one specific context.
 *
 * Every field is optional.
 */
export interface ContextVoiceSettings {
  /**
   * Stability, in the range 0.0-1.0.
   */
  stability?: number;

  /**
   * Similarity boost, in the range 0.0-1.0.
   */
  similarityBoost?: number;

  /**
   * Style, in the range 0.0-1.0.
   */
  style?: number;

  /**
   * Whether to use speaker boost.
   */
  useSpeakerBoost?: boolean;

  /**
   * Speed multiplier.
   */
  speed?: number;
}
281
+
/**
 * An audio chunk delivered by multi-context streaming: a regular
 * `AudioChunk` tagged with the context it was produced for.
 */
export interface MultiContextAudioChunk extends AudioChunk {
  /**
   * ID of the context this audio belongs to.
   */
  contextId: string;
}
289
+
/**
 * Optional event handlers for multi-context streaming.
 */
export interface MultiContextCallbacks {
  /**
   * Invoked when the session is started; receives the session ID.
   */
  onSessionStarted?: (sessionId: string) => void;

  /**
   * Invoked when a context is created; receives the context ID.
   */
  onContextCreated?: (contextId: string) => void;

  /**
   * Invoked for each audio chunk that is received.
   */
  onChunk?: (chunk: MultiContextAudioChunk) => void;

  /**
   * Invoked when a context finishes generating; receives the context ID.
   */
  onContextFinal?: (contextId: string) => void;

  /**
   * Invoked when a context is closed; receives the context ID.
   */
  onContextClosed?: (contextId: string) => void;

  /**
   * Invoked when a context times out; receives the context ID.
   */
  onContextTimeout?: (contextId: string) => void;

  /**
   * Invoked when the session is closed; receives the session stats.
   */
  onSessionClosed?: (stats: Record<string, unknown>) => void;

  /**
   * Invoked on error. `contextId` is optional and may be omitted.
   */
  onError?: (error: Error, contextId?: string) => void;
}
311
+
@@ -0,0 +1,44 @@
1
+ /**
2
+ * WebSocket compatibility layer for browser and Node.js environments.
3
+ *
4
+ * IMPORTANT: WebSocket resolution is lazy to avoid top-level side-effects
5
+ * that break server-side bundlers (Turbopack / Webpack) when this module
6
+ * is imported in a Node.js (API route) context.
7
+ */
8
+
/** Lazily resolved WebSocket constructor; stays `null` until a lookup succeeds. */
let _cachedWs: typeof WebSocket | null = null;

/**
 * Check that a resolved value can actually be used as a WebSocket constructor.
 *
 * Guards against caching something that is merely truthy (for example an
 * empty-object module stub), which would otherwise only fail later with an
 * unrelated "is not a constructor" error at the call site.
 */
function isWebSocketConstructor(value: unknown): value is typeof WebSocket {
  return typeof value === 'function';
}

/**
 * Get the WebSocket constructor for the current environment.
 *
 * Resolution order:
 * 1. A global `WebSocket` (browsers, and any other runtime that defines one).
 * 2. The `ws` package, loaded through a dynamically obtained `require`.
 *
 * Only a successful lookup is cached; a failed call leaves the cache empty,
 * so a later call retries the resolution.
 *
 * @returns The WebSocket constructor to instantiate connections with.
 * @throws Error if neither a global `WebSocket` nor a usable `ws` module can
 *   be found.
 */
export function getWebSocket(): typeof WebSocket {
  if (_cachedWs) return _cachedWs;

  // Global implementation. Reflect.get avoids an `any` cast on globalThis.
  if (typeof globalThis !== 'undefined') {
    const nativeCtor: unknown = Reflect.get(globalThis, 'WebSocket');
    if (isWebSocketConstructor(nativeCtor)) {
      _cachedWs = nativeCtor;
      return nativeCtor;
    }
  }

  // Node.js fallback: load the `ws` package via a dynamically obtained require.
  try {
    // `require` is called through an alias rather than as a literal
    // `require('ws')`; this is intended to keep the dependency out of
    // bundlers' static analysis.
    let nodeRequire = typeof require !== 'undefined' ? require : undefined;
    if (!nodeRequire) {
      // No module-scoped `require` here, so probe for a global one at runtime.
      // eslint-disable-next-line no-new-func
      nodeRequire = Function('return typeof require !== "undefined" ? require : undefined')();
    }

    if (nodeRequire) {
      const wsModule: unknown = nodeRequire('ws');
      // Prefer an interop `default` export, then the module value itself.
      const candidates: unknown[] = [
        (wsModule as { default?: unknown } | null | undefined)?.default,
        wsModule,
      ];
      const wsCtor = candidates.find(isWebSocketConstructor);
      if (wsCtor) {
        _cachedWs = wsCtor;
        return wsCtor;
      }
    }
  } catch {
    // `ws` could not be loaded (or dynamic evaluation is blocked); fall
    // through to the descriptive error below.
  }

  throw new Error(
    'WebSocket not available. In Node.js, install the "ws" package: npm install ws'
  );
}