@omote/core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +584 -0
- package/dist/index.d.mts +4173 -0
- package/dist/index.d.ts +4173 -0
- package/dist/index.js +7172 -0
- package/dist/index.js.map +1 -0
- package/dist/index.mjs +7048 -0
- package/dist/index.mjs.map +1 -0
- package/package.json +68 -0
package/dist/index.d.mts
ADDED
@@ -0,0 +1,4173 @@
import { InferenceSession, Tensor, Env } from 'onnxruntime-common';

/**
 * Type-safe event emitter for Omote core events
 *
 * @category Events
 */
type EventCallback<T = unknown> = (data: T) => void;
declare class EventEmitter<TEvents extends {
    [key: string]: unknown;
}> {
    private listeners;
    on<K extends keyof TEvents>(event: K, callback: EventCallback<TEvents[K]>): () => void;
    off<K extends keyof TEvents>(event: K, callback: EventCallback<TEvents[K]>): void;
    emit<K extends keyof TEvents>(event: K, data: TEvents[K]): void;
    once<K extends keyof TEvents>(event: K, callback: EventCallback<TEvents[K]>): () => void;
    removeAllListeners(event?: keyof TEvents): void;
}
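/**
 * @example Typed subscription sketch (assumes EventEmitter is exported by the
 * package; the event map here is illustrative)
 * ```typescript
 * const events = new EventEmitter<{ tick: number }>();
 * const off = events.on('tick', (n) => console.log('tick', n)); // n is number
 * events.emit('tick', 1);
 * off(); // on() returns an unsubscribe function
 * ```
 */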

/**
 * Core Omote event types - the contract between core and renderers
 *
 * Renderers subscribe to these events and apply them to their specific
 * rendering system (R3F, Three.js, Babylon, Unity, etc.)
 */
/** Animation frame with blendshape weights */
interface AnimationEvent {
    /** 52 ARKit blendshape weights (0-1 range) */
    blendshapes: Float32Array;
    /** Named blendshape access */
    get(name: string): number;
    /** Raw model output weights (for debugging) */
    rawWeights?: Float32Array;
    /** Timestamp in ms */
    timestamp: number;
    /** Inference latency in ms */
    inferenceMs: number;
    /** Frame index within the current batch (for LAM multi-frame output) */
    frameIndex?: number;
    /** Total frames in the current batch (for LAM multi-frame output) */
    totalFrames?: number;
}
/** Viseme for lip sync */
interface VisemeEvent {
    /** Viseme ID or phoneme */
    viseme: string;
    /** Weight 0-1 */
    weight: number;
    /** Duration in ms */
    duration: number;
}
/** Emotion state change */
interface EmotionEvent {
    /** Emotion weights by name */
    values: Record<string, number>;
    /** Transition duration in ms */
    transitionMs: number;
}
/** Gaze target change */
interface GazeEvent {
    /** Target type */
    target: 'camera' | 'wander' | 'position';
    /** Position if target is 'position' */
    position?: {
        x: number;
        y: number;
        z: number;
    };
}
/** Audio playback events */
interface TTSStartEvent {
    /** Audio duration in ms */
    durationMs: number;
    /** Text being spoken */
    text: string;
}
interface TTSMarkEvent {
    /** Mark name/type */
    name: string;
    /** Time offset in ms */
    timeMs: number;
}
interface TTSEndEvent {
    /** Whether playback completed normally */
    completed: boolean;
}
/** STT transcription events */
interface STTPartialEvent {
    /** Partial transcription */
    text: string;
    /** Confidence 0-1 */
    confidence: number;
}
interface STTFinalEvent {
    /** Final transcription */
    text: string;
    /** Confidence 0-1 */
    confidence: number;
}
/** Session state events */
interface SessionStateEvent {
    state: 'connecting' | 'connected' | 'ready' | 'streaming' | 'error' | 'disconnected';
    error?: Error;
}
/** Backend info */
interface BackendEvent {
    type: 'webgpu' | 'wasm' | 'remote';
    modelLoaded: boolean;
    loadTimeMs?: number;
}
/** AI adapter state */
type AISessionState$1 = 'idle' | 'listening' | 'thinking' | 'speaking' | 'interrupted' | 'error' | 'disconnected';
/** AI state change event */
interface AIStateChangeEvent {
    state: AISessionState$1;
    previousState: AISessionState$1;
}
/** User speech events */
interface UserSpeechStartEvent {
    timestamp: number;
}
interface UserSpeechEndEvent {
    timestamp: number;
    durationMs: number;
}
interface UserTranscriptEvent {
    text: string;
    confidence: number;
}
/** AI response events */
interface AIThinkingStartEvent {
    timestamp: number;
}
interface AIResponseStartEvent {
    text?: string;
    emotion?: string;
}
interface AIResponseChunkEvent {
    text: string;
    isLast: boolean;
}
interface AIResponseEndEvent {
    fullText: string;
    durationMs: number;
}
/** Audio output events (for lip sync processing) */
interface AudioOutputChunkEvent {
    audio: ArrayBuffer;
    sampleRate: number;
    timestamp: number;
}
interface AudioOutputEndEvent {
    durationMs: number;
}
/** Adapter events */
interface AdapterSwitchEvent {
    from: string;
    to: string;
    reason: string;
}
interface AdapterFallbackEvent {
    adapter: string;
    reason: string;
}
interface InterruptionEvent {
    timestamp: number;
    action?: 'stop' | 'continue';
}
/**
 * Complete event map for OmoteCore
 */
type OmoteEvents = {
    'animation': AnimationEvent;
    'animation.ready': {
        backend: 'webgpu' | 'wasm';
    };
    'viseme': VisemeEvent;
    'emotion': EmotionEvent;
    'gaze': GazeEvent;
    'tts.start': TTSStartEvent;
    'tts.mark': TTSMarkEvent;
    'tts.end': TTSEndEvent;
    'stt.partial': STTPartialEvent;
    'stt.final': STTFinalEvent;
    'session.state': SessionStateEvent;
    'backend': BackendEvent;
    'audio.chunk': {
        pcm: Int16Array;
        timestamp: number;
    };
    'audio.level': {
        rms: number;
        peak: number;
    };
    'audio.output.chunk': AudioOutputChunkEvent;
    'audio.output.end': AudioOutputEndEvent;
    'ai.state.change': AIStateChangeEvent;
    'ai.thinking.start': AIThinkingStartEvent;
    'ai.response.start': AIResponseStartEvent;
    'ai.response.chunk': AIResponseChunkEvent;
    'ai.response.end': AIResponseEndEvent;
    'user.speech.start': UserSpeechStartEvent;
    'user.speech.end': UserSpeechEndEvent;
    'user.transcript.partial': UserTranscriptEvent;
    'user.transcript.final': UserTranscriptEvent;
    'adapter.switch': AdapterSwitchEvent;
    'adapter.fallback': AdapterFallbackEvent;
    'adapter.recovered': {
        adapter: string;
    };
    'interruption.detected': InterruptionEvent;
    'interruption.handled': InterruptionEvent;
    'memory.updated': {
        messageCount: number;
        tokenCount?: number;
    };
    'connection.opened': {
        sessionId: string;
        adapter?: string;
    };
    'connection.closed': {
        reason: string;
    };
    'connection.error': {
        error: Error;
        recoverable: boolean;
    };
    'error': {
        code: string;
        message: string;
        details?: unknown;
    };
};
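/**
 * @example Listening to core events sketch (assumes an EventEmitter<OmoteEvents>
 * instance, here called `events`, is exposed by the hosting core object)
 * ```typescript
 * events.on('animation', (frame) => {
 *   // frame is an AnimationEvent: apply the 52 blendshape weights
 *   const jawOpen = frame.get('jawOpen');
 * });
 * events.once('session.state', ({ state }) => console.log('state:', state));
 * ```
 */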

/**
 * Microphone capture - renderer-agnostic audio input
 *
 * Captures audio from the microphone and emits PCM chunks.
 * Works in any JavaScript environment with the Web Audio API.
 *
 * @category Audio
 */

interface MicrophoneCaptureConfig {
    /** Target sample rate (default: 16000 for speech processing) */
    sampleRate?: number;
    /** Chunk size in samples (default: 1600 = 100ms at 16kHz) */
    chunkSize?: number;
}
declare class MicrophoneCapture {
    private events;
    private config;
    private stream;
    private context;
    private processor;
    private buffer;
    private _isRecording;
    private _loggedFirstChunk;
    constructor(events: EventEmitter<OmoteEvents>, config?: MicrophoneCaptureConfig);
    get isRecording(): boolean;
    get isSupported(): boolean;
    start(): Promise<void>;
    stop(): void;
    private floatToPCM16;
}
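/**
 * @example Capture sketch based on the declarations above ('audio.chunk'
 * payloads arrive on the shared event emitter)
 * ```typescript
 * const events = new EventEmitter<OmoteEvents>();
 * const mic = new MicrophoneCapture(events, { sampleRate: 16000, chunkSize: 1600 });
 * events.on('audio.chunk', ({ pcm, timestamp }) => {
 *   // pcm is Int16Array; forward to VAD/ASR
 * });
 * if (mic.isSupported) await mic.start(); // prompts for mic permission
 * // ...later
 * mic.stop();
 * ```
 */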

/**
 * Ring buffer for audio sample accumulation
 *
 * Efficiently accumulates audio samples and provides
 * contiguous buffers for inference without memory allocation churn.
 *
 * @category Audio
 */
declare class RingBuffer {
    private readonly size;
    private buffer;
    private writeIndex;
    private isFull;
    constructor(size: number);
    /**
     * Write samples to the ring buffer
     * Converts Int16Array PCM to Float32
     */
    write(pcm: Int16Array): void;
    /**
     * Write float samples directly
     */
    writeFloat(samples: Float32Array): void;
    /**
     * Get a contiguous copy of the buffer contents in chronological order
     * Returns null if buffer isn't full yet
     */
    read(): Float32Array | null;
    /**
     * Check if buffer has enough samples
     */
    get hasData(): boolean;
    /**
     * Get current fill level (0-1)
     */
    get fillLevel(): number;
    /**
     * Reset the buffer
     */
    reset(): void;
}
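/**
 * @example Accumulation sketch: fill one second of 16kHz audio, then read a
 * contiguous copy for inference (sizes are illustrative)
 * ```typescript
 * const ring = new RingBuffer(16000); // 1s at 16kHz
 * ring.write(pcmChunk);               // Int16 PCM, converted to Float32 internally
 * if (ring.hasData) {
 *   const samples = ring.read();      // Float32Array | null
 *   // run inference on samples, then start a fresh window
 *   ring.reset();
 * }
 * ```
 */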

/**
 * AudioScheduler - Enterprise-grade Web Audio API scheduling
 *
 * Implements the lookahead scheduling pattern from Chris Wilson's
 * "A Tale of Two Clocks" - the authoritative guide on Web Audio timing.
 *
 * Key Features:
 * - Uses AudioContext.currentTime (hardware clock) for sample-accurate timing
 * - Pre-schedules audio chunks for gapless playback
 * - Tracks scheduled sources for cleanup
 * - Provides playback state monitoring
 *
 * @see https://web.dev/articles/audio-scheduling
 * @category Audio
 */
interface AudioSchedulerOptions {
    /** Sample rate in Hz (default: 16000 for speech) */
    sampleRate?: number;
    /** Number of audio channels (default: 1 for mono) */
    channels?: number;
}
declare class AudioScheduler {
    private readonly options;
    private context;
    private nextPlayTime;
    private scheduledSources;
    private isPlaying;
    constructor(options?: AudioSchedulerOptions);
    /**
     * Initialize AudioContext with specified sample rate
     *
     * Note: This is now a no-op. AudioContext is created lazily on first schedule()
     * to avoid browser autoplay policy issues (requires user gesture).
     */
    initialize(): Promise<void>;
    /**
     * Ensure AudioContext is created and ready
     * Called lazily on first schedule() - requires user gesture
     */
    private ensureContext;
    /**
     * Schedule an audio chunk for playback
     *
     * Uses Web Audio's hardware-accurate clock for sample-perfect timing.
     * Chunks are scheduled immediately, not when they should play - this
     * ensures gapless playback even if the main thread stalls.
     *
     * @param audioData - Float32Array of audio samples
     * @returns Scheduled playback time in AudioContext seconds
     */
    schedule(audioData: Float32Array): Promise<number>;
    /**
     * Get current audio clock time
     *
     * This is the hardware-accurate time, NOT JavaScript time.
     * Use this for synchronizing visual animations to audio.
     *
     * @returns Current time in AudioContext seconds
     */
    getCurrentTime(): number;
    /**
     * Get scheduled playback end time
     */
    getPlaybackEndTime(): number;
    /**
     * Check if all scheduled audio has finished playing
     */
    isComplete(): boolean;
    /**
     * Cancel all scheduled audio with smooth fade-out
     *
     * Applies a linear fade-out to all playing sources and stops them gracefully.
     * Prevents audio clicks/pops by ramping gain to zero before stopping.
     *
     * @param fadeOutMs - Fade-out duration in milliseconds (default: 50ms)
     * @returns Promise that resolves when fade-out completes
     */
    cancelAll(fadeOutMs?: number): Promise<void>;
    /**
     * Reset scheduler state for new playback session
     */
    reset(): void;
    /**
     * Cleanup resources
     */
    dispose(): void;
}
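/**
 * @example Lookahead scheduling sketch (the chunk source is illustrative)
 * ```typescript
 * const scheduler = new AudioScheduler({ sampleRate: 16000, channels: 1 });
 * for (const chunk of float32Chunks) {
 *   await scheduler.schedule(chunk); // pre-scheduled gaplessly on the hardware clock
 * }
 * // Drive visuals from the audio clock, not Date.now()
 * const audioNow = scheduler.getCurrentTime();
 * // Interrupt with a short fade-out to avoid clicks
 * await scheduler.cancelAll(50);
 * ```
 */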

/**
 * AudioChunkCoalescer - Combine small network chunks into optimal buffers
 *
 * Network streaming often delivers audio in small chunks (e.g., 32ms from TTS APIs).
 * Creating an AudioBufferSourceNode for each tiny chunk is inefficient and can cause
 * overhead from object creation/GC.
 *
 * This class implements a double-buffering pattern: accumulate small chunks in a
 * temporary buffer, then flush to playback queue when threshold is reached.
 *
 * Benefits:
 * - Reduces AudioBufferSourceNode overhead (fewer nodes = less GC pressure)
 * - Configurable buffer size for optimal playback chunk duration
 * - Maintains sample-accurate timing despite buffering
 *
 * Based on patterns from HLS.js and production streaming implementations.
 *
 * @category Audio
 */
interface AudioChunkCoalescerOptions {
    /**
     * Target duration in milliseconds for combined chunks
     * Default: 200ms (balances latency vs overhead)
     *
     * Smaller values = lower latency, more overhead
     * Larger values = higher latency, less overhead
     */
    targetDurationMs?: number;
    /**
     * Sample rate in Hz
     * Default: 16000 (speech quality)
     */
    sampleRate?: number;
}
declare class AudioChunkCoalescer {
    private readonly options;
    private tempBuffer;
    private readonly targetBytes;
    constructor(options?: AudioChunkCoalescerOptions);
    /**
     * Add a chunk to the temporary buffer
     *
     * @param chunk - Uint8Array containing Int16 PCM audio
     * @returns Combined buffer if threshold reached, null otherwise
     */
    add(chunk: Uint8Array): ArrayBuffer | null;
    /**
     * Flush remaining buffered data
     *
     * Call this when the stream ends to ensure all audio is processed,
     * even if it doesn't reach the target threshold.
     *
     * @returns Combined buffer, or null if buffer is empty
     */
    flush(): ArrayBuffer | null;
    /**
     * Get current buffer fill level (0-1)
     */
    get fillLevel(): number;
    /**
     * Get current buffered duration in milliseconds
     */
    getBufferedDurationMs(): number;
    /**
     * Get number of chunks currently buffered
     */
    get chunkCount(): number;
    /**
     * Reset the coalescer
     */
    reset(): void;
}
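/**
 * @example Coalescing sketch: combine small network chunks into ~200ms buffers
 * before playback (the WebSocket source and playInt16Buffer are placeholders)
 * ```typescript
 * const coalescer = new AudioChunkCoalescer({ targetDurationMs: 200, sampleRate: 16000 });
 * socket.onmessage = (e) => {
 *   const combined = coalescer.add(new Uint8Array(e.data)); // null until threshold
 *   if (combined) playInt16Buffer(combined); // placeholder for your playback path
 * };
 * // On stream end, drain whatever is left below the threshold
 * const tail = coalescer.flush();
 * if (tail) playInt16Buffer(tail);
 * ```
 */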

/**
 * Runtime detection utilities for platform-specific inference configuration
 *
 * These utilities help determine the optimal backend (WebGPU vs WASM) based on
 * the current platform's capabilities and known limitations.
 *
 * Key considerations:
 * - iOS Safari: WebGPU crashes due to JSEP bugs (GitHub #22776, #26827)
 * - Android Chrome: WebGPU works well (Chrome 121+)
 * - Desktop: WebGPU preferred for performance
 *
 * @module utils/runtime
 */
/**
 * Supported inference backends
 */
type RuntimeBackend = 'webgpu' | 'wasm';
/**
 * User-configurable backend preference
 */
type BackendPreference = 'auto' | 'webgpu' | 'wasm' | 'webgpu-only' | 'wasm-only';
/**
 * Detect iOS Safari browser
 *
 * iOS Safari has severe WebGPU issues:
 * - JSEP compilation bugs cause OOM during session creation
 * - Threading bugs require numThreads=1
 * - Proxy mode triggers memory leaks
 *
 * @returns true if running in iOS Safari
 */
declare function isIOSSafari(): boolean;
/**
 * Detect any iOS device (regardless of browser)
 *
 * On iOS, all browsers use WebKit, so Chrome/Firefox on iOS
 * have the same limitations as Safari.
 *
 * @returns true if running on any iOS device
 */
declare function isIOS(): boolean;
/**
 * Detect Android device
 *
 * Android Chrome 121+ has good WebGPU support with Qualcomm/ARM GPUs.
 *
 * @returns true if running on Android
 */
declare function isAndroid(): boolean;
/**
 * Detect any mobile device (iOS or Android)
 *
 * Mobile devices have different performance characteristics:
 * - Lower memory limits
 * - Thermal throttling
 * - Different GPU architectures
 *
 * @returns true if running on mobile
 */
declare function isMobile(): boolean;
/**
 * Check if WebGPU API is available in the browser
 *
 * Note: This only checks if the API exists, not if it works reliably.
 * iOS has navigator.gpu but ONNX Runtime's WebGPU backend crashes.
 *
 * @returns true if navigator.gpu exists
 */
declare function hasWebGPUApi(): boolean;
/**
 * Get the recommended backend for the current platform
 *
 * Decision tree:
 * 1. iOS (any browser): Force WASM (WebGPU crashes)
 * 2. Android: WebGPU preferred (works in Chrome 121+)
 * 3. Desktop: WebGPU preferred (best performance)
 *
 * @returns 'wasm' for iOS, 'webgpu' for everything else
 */
declare function getRecommendedBackend(): RuntimeBackend;
/**
 * Resolve user preference to actual backend
 *
 * @param preference User's backend preference
 * @param webgpuAvailable Whether WebGPU is available and working
 * @returns The backend to use
 */
declare function resolveBackend(preference: BackendPreference, webgpuAvailable: boolean): RuntimeBackend;
/**
 * Get optimal WASM thread count for current platform
 *
 * @returns Recommended number of WASM threads
 */
declare function getOptimalWasmThreads(): number;
/**
 * Check if WASM proxy mode should be enabled
 *
 * Proxy mode offloads inference to a Web Worker, but has issues:
 * - iOS: Triggers Safari 26 JSEP memory leak
 * - Mobile: Generally unstable
 *
 * @returns true if proxy mode is safe to enable
 */
declare function shouldEnableWasmProxy(): boolean;
/**
 * Check if Web Speech API is available in the browser
 *
 * The Web Speech API provides native speech recognition in Safari and Chrome.
 * On iOS Safari, this is significantly faster than Whisper WASM.
 *
 * @returns true if SpeechRecognition API is available
 */
declare function isSpeechRecognitionAvailable(): boolean;
/**
 * Recommend using native Safari Speech API over Whisper on iOS
 *
 * On iOS, Whisper ASR via WASM takes ~1.3s per inference (30% over target).
 * Safari's native Web Speech API is:
 * - Much faster (native implementation)
 * - Battery-efficient (no WASM overhead)
 * - No model download needed (saves 30-150MB)
 *
 * @returns true if on iOS with Speech API available
 */
declare function shouldUseNativeASR(): boolean;
/**
 * Recommend using server-side LAM over client-side on iOS
 *
 * On iOS, LAM lip sync via WASM takes ~332ms per second of audio (3.3x over target).
 * Server-side inference with GPU can achieve ~50ms, providing:
 * - Real-time lip sync (under 100ms target)
 * - Reduced iOS device thermal/battery impact
 * - Better user experience
 *
 * @returns true if on iOS (should use server-side lip sync)
 */
declare function shouldUseServerLipSync(): boolean;
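/**
 * @example Backend selection sketch combining the helpers above (the rough
 * availability check is illustrative; the loader below offers a thorough one)
 * ```typescript
 * const webgpuLikely = hasWebGPUApi() && !isIOS(); // API present and not iOS WebKit
 * const backend = resolveBackend('auto', webgpuLikely);
 * if (backend === 'wasm') {
 *   console.log('threads:', getOptimalWasmThreads(), 'proxy:', shouldEnableWasmProxy());
 * }
 * if (shouldUseNativeASR()) {
 *   // prefer the Web Speech API over Whisper WASM on iOS
 * }
 * ```
 */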

/**
 * Lazy ONNX Runtime loader with conditional WebGPU/WASM bundle loading
 *
 * This module provides a way to dynamically load the appropriate ONNX Runtime bundle
 * based on the platform's capabilities. This is critical for iOS support because:
 *
 * 1. iOS Safari has WebGPU API but ONNX Runtime's WebGPU backend crashes
 * 2. Loading the WebGPU bundle wastes bandwidth and can cause issues
 * 3. WASM-only bundle is smaller and more reliable on iOS
 *
 * Usage:
 * ```typescript
 * const ort = await getOnnxRuntime('wasm'); // Load WASM-only bundle
 * const ort = await getOnnxRuntime('webgpu'); // Load WebGPU bundle (includes WASM)
 * ```
 *
 * @module inference/onnxLoader
 */

type OrtModule = {
    InferenceSession: typeof InferenceSession;
    Tensor: typeof Tensor;
    env: Env;
};
type SessionOptions = InferenceSession.SessionOptions;

/**
 * Check if WebGPU is available and likely to work
 *
 * This is more thorough than just checking navigator.gpu exists.
 * It actually requests an adapter to verify the GPU is accessible.
 *
 * @returns true if WebGPU is available and working
 */
declare function isWebGPUAvailable(): Promise<boolean>;
/**
 * Load ONNX Runtime with the specified backend
 *
 * This lazily loads the appropriate bundle:
 * - 'wasm': Loads onnxruntime-web (WASM-only, smaller)
 * - 'webgpu': Loads onnxruntime-web/webgpu (includes WebGPU + WASM fallback)
 *
 * Once loaded, the same instance is reused for all subsequent calls.
 * If you need to switch backends, you must reload the page.
 *
 * @param backend The backend to load ('webgpu' or 'wasm')
 * @returns The ONNX Runtime module
 */
declare function getOnnxRuntime(backend: RuntimeBackend): Promise<OrtModule>;
/**
 * Get the appropriate ONNX Runtime based on user preference
 *
 * This resolves the user's preference against platform capabilities
 * and loads the appropriate bundle.
 *
 * @param preference User's backend preference
 * @returns The ONNX Runtime module and the resolved backend
 */
declare function getOnnxRuntimeForPreference(preference?: BackendPreference): Promise<{
    ort: OrtModule;
    backend: RuntimeBackend;
}>;
/**
 * Get session options for creating an inference session
 *
 * This returns optimized session options based on the backend and platform.
 *
 * @param backend The backend being used
 * @returns Session options for InferenceSession.create()
 */
declare function getSessionOptions(backend: RuntimeBackend): SessionOptions;
/**
 * Create an inference session with automatic fallback
 *
 * If WebGPU session creation fails, automatically falls back to WASM.
 *
 * @param modelBuffer The model data as ArrayBuffer
 * @param preferredBackend The preferred backend
 * @returns The created session and the backend used
 */
declare function createSessionWithFallback(modelBuffer: ArrayBuffer, preferredBackend: RuntimeBackend): Promise<{
    session: InferenceSession;
    backend: RuntimeBackend;
}>;
/**
 * Get the currently loaded backend (if any)
 */
declare function getLoadedBackend(): RuntimeBackend | null;
/**
 * Check if ONNX Runtime has been loaded
 */
declare function isOnnxRuntimeLoaded(): boolean;
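/**
 * @example Loader sketch: resolve the preference, load the matching bundle,
 * and create a session with automatic WASM fallback (model path is illustrative)
 * ```typescript
 * const { ort, backend } = await getOnnxRuntimeForPreference('auto');
 * const buffer = await (await fetch('/models/model.onnx')).arrayBuffer();
 * const { session, backend: used } = await createSessionWithFallback(buffer, backend);
 * console.log('loaded with', used, isOnnxRuntimeLoaded()); // true after load
 * ```
 */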

/**
 * Unified Wav2Vec2 inference engine for Audio-to-Expression + ASR
 *
 * Runs entirely in the browser using WebGPU or WASM.
 * Takes raw 16kHz audio and outputs:
 * - 52 ARKit blendshapes (lip sync)
 * - 32-token CTC logits (speech recognition)
 *
 * @category Inference
 *
 * @example Basic usage
 * ```typescript
 * import { Wav2Vec2Inference } from '@omote/core';
 *
 * const wav2vec = new Wav2Vec2Inference({ modelUrl: '/models/unified_wav2vec2_asr_a2e.onnx' });
 * await wav2vec.load();
 *
 * // Process 1 second of audio (16kHz = 16000 samples)
 * const result = await wav2vec.infer(audioSamples);
 *
 * console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
 * console.log('ASR text:', result.text); // Decoded transcription
 * ```
 */

type InferenceBackend = BackendPreference;
interface Wav2Vec2InferenceConfig {
    /** Path or URL to the ONNX model */
    modelUrl: string;
    /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
    backend?: InferenceBackend;
    /** Number of identity classes (default: 12 for streaming model) */
    numIdentityClasses?: number;
}
interface ModelInfo {
    backend: 'webgpu' | 'wasm';
    loadTimeMs: number;
    inputNames: string[];
    outputNames: string[];
}
/**
 * LAM model blendshape names in order (52 total)
 * NOTE: This is alphabetical ordering used by LAM, different from standard ARKit order
 */
declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
/** CTC vocabulary (32 tokens from wav2vec2-base-960h) */
declare const CTC_VOCAB: string[];
interface Wav2Vec2Result {
    /** Blendshape weights [frames, 52] - 30fps */
    blendshapes: Float32Array[];
    /** Raw CTC logits [frames, 32] - 50fps */
    asrLogits: Float32Array[];
    /** Decoded text from CTC */
    text: string;
    /** Number of A2E frames (30fps) */
    numA2EFrames: number;
    /** Number of ASR frames (50fps) */
    numASRFrames: number;
    /** Inference time in ms */
    inferenceTimeMs: number;
}
declare class Wav2Vec2Inference {
    private session;
    private ort;
    private config;
    private _backend;
    private isLoading;
    private numIdentityClasses;
    private inferenceQueue;
    constructor(config: Wav2Vec2InferenceConfig);
    /**
     * Check if WebGPU is available and working
     * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
     */
    static isWebGPUAvailable: typeof isWebGPUAvailable;
    get backend(): 'webgpu' | 'wasm' | null;
    get isLoaded(): boolean;
    /**
     * Load the ONNX model
     */
    load(): Promise<ModelInfo>;
    /**
     * Run inference on raw audio
     * @param audioSamples - Float32Array of raw audio at 16kHz (16000 samples = 1 second)
     * @param identityIndex - Optional identity index (0-11, default 0 = neutral)
     *
     * Note: Model expects 1-second chunks (16000 samples) for optimal performance.
     * Audio will be zero-padded or truncated to 16000 samples.
     */
    infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
    /**
     * Decode CTC logits to text using greedy decoding
     */
    private decodeCTC;
    /**
     * Queue inference to serialize ONNX session calls
     */
    private queueInference;
    /**
     * Get blendshape value by name for a specific frame
     */
    getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
    /**
     * Dispose of the model and free resources
     */
    dispose(): Promise<void>;
}
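/**
 * @example Per-frame blendshape access sketch, extending the class example
 * above (frame pacing is left to the renderer)
 * ```typescript
 * const result = await wav2vec.infer(audioSamples); // 1s of 16kHz audio
 * for (const frame of result.blendshapes) {         // ~30 frames per second of audio
 *   const jawOpen = wav2vec.getBlendshape(frame, 'jawOpen'); // LAM alphabetical order
 * }
 * await wav2vec.dispose();
 * ```
 */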

/**
 * LAMPipeline - Coordinate LAM (Wav2Vec2) inference with frame synchronization
 *
 * Manages the buffering and processing pipeline for LAM lip sync:
 * 1. Accumulates audio samples in a ring buffer
 * 2. Triggers LAM inference when buffer reaches required size (16000 samples @ 16kHz = 1.0s)
 * 3. Queues resulting blendshape frames with precise timestamps
 * 4. Provides frames synchronized to AudioContext clock
 *
 * Key Design Decisions:
 * - Ring buffer pattern for efficient sample accumulation (no allocation churn)
 * - Frame queue with timestamps for deterministic playback
 * - Timestamp-based frame retrieval (not callback) for renderer flexibility
 *
 * Based on patterns from Chrome Audio Worklet design and Web Audio clock management.
 *
 * @see https://developer.chrome.com/blog/audio-worklet-design-pattern
 * @category Audio
 */

interface LAMFrame {
    /** 52 ARKit blendshape weights */
    frame: Float32Array;
    /** AudioContext time when this frame should be displayed */
    timestamp: number;
}
interface LAMPipelineOptions {
    /**
     * Sample rate in Hz (must match audio playback)
     * Default: 16000
     */
    sampleRate?: number;
    /**
     * LAM inference callback
     * Called each time LAM processes a buffer
     */
    onInference?: (frameCount: number) => void;
    /**
     * Error callback for inference failures
     */
    onError?: (error: Error) => void;
}
declare class LAMPipeline {
    private readonly options;
    private readonly REQUIRED_SAMPLES;
    private readonly FRAME_RATE;
    private buffer;
    private bufferStartTime;
    private frameQueue;
    /**
     * Last successfully retrieved frame
     * Used as fallback when no new frame is available to prevent avatar freezing
     */
    private lastFrame;
    constructor(options?: LAMPipelineOptions);
    /**
     * Push audio samples into the pipeline
     *
     * Accumulates samples and triggers LAM inference when buffer is full.
     * Multiple calls may be needed to accumulate enough samples.
     *
     * @param samples - Float32Array of audio samples
     * @param timestamp - AudioContext time when these samples start playing
     * @param lam - LAM inference engine
     */
    push(samples: Float32Array, timestamp: number, lam: Wav2Vec2Inference): Promise<void>;
    /**
     * Process accumulated buffer through LAM inference
     */
    private processBuffer;
    /**
     * Get the frame that should be displayed at the current time
     *
     * Automatically removes frames that have already been displayed.
     * This prevents memory leaks from accumulating old frames.
     *
     * Discard Window (prevents premature frame discarding):
     * - WebGPU: 0.5s (LAM inference 20-100ms + RAF jitter + React stalls)
     * - WASM: 1.0s (LAM inference 50-500ms + higher variability)
     *
     * Last-Frame-Hold: Returns last valid frame instead of null to prevent
     * avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
     *
     * @param currentTime - Current AudioContext time
     * @param lam - LAM inference engine (optional, for backend detection)
     * @returns Current frame, or last frame as fallback, or null if no frames yet
     */
    getFrameForTime(currentTime: number, lam?: {
        backend: 'webgpu' | 'wasm' | null;
    }): Float32Array | null;
    /**
     * Get all frames in the queue (for debugging/monitoring)
     */
    getQueuedFrames(): LAMFrame[];
    /**
     * Get current buffer fill level (0-1)
     */
    get fillLevel(): number;
    /**
     * Get number of frames queued
     */
    get queuedFrameCount(): number;
    /**
     * Get buffered audio duration in seconds
     */
    get bufferedDuration(): number;
    /**
     * Flush remaining buffered audio
     *
     * Processes any remaining audio in the buffer, even if less than REQUIRED_SAMPLES.
     * This ensures the final audio chunk generates blendshape frames.
     *
     * Should be called when audio stream ends to prevent losing the last 0-1 seconds.
     *
     * @param lam - LAM inference engine
     */
    flush(lam: Wav2Vec2Inference): Promise<void>;
    /**
     * Adjust all queued frame timestamps by an offset
     *
     * Used for synchronization when audio scheduling time differs from
     * the estimated time used during LAM processing.
     *
     * @param offset - Time offset in seconds to add to all timestamps
     */
    adjustTimestamps(offset: number): void;
    /**
     * Reset the pipeline
     */
    reset(): void;
}
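/**
 * @example Pipeline sketch: feed samples with their scheduled start time, then
 * pull frames on the render loop (scheduler and wav2vec are instances of the
 * classes above; applyBlendshapes is a renderer-specific placeholder)
 * ```typescript
 * const pipeline = new LAMPipeline({ sampleRate: 16000 });
 * const startTime = await scheduler.schedule(samples);      // AudioContext seconds
 * await pipeline.push(samples, startTime, wav2vec);         // may trigger inference
 * function onFrame() {
 *   const frame = pipeline.getFrameForTime(scheduler.getCurrentTime(), wav2vec);
 *   if (frame) applyBlendshapes(frame);
 *   requestAnimationFrame(onFrame);
 * }
 * await pipeline.flush(wav2vec); // at stream end, process the remainder
 * ```
 */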

/**
 * SyncedAudioPipeline - Enterprise-grade audio + LAM synchronization coordinator
 *
 * Orchestrates the complete pipeline for synchronized audio playback and lip sync:
 * 1. Network chunks → Coalescer → Optimized buffers
 * 2. Audio buffers → Scheduler → Gapless playback
 * 3. Audio buffers → LAM Pipeline → Blendshape frames
 * 4. Frames synchronized to AudioContext clock → Renderer
 *
 * Key Architecture Pattern: Wait-for-First-LAM
 * - Buffers incoming audio chunks without scheduling playback
 * - Waits for first LAM inference to complete (ensures LAM frames are ready)
 * - Then schedules all buffered audio + LAM frames together
 * - Result: Perfect synchronization from frame 1, no lag compensation needed
 *
 * This is a deterministic, enterprise-grade solution suitable for production use.
 * No hacks, no lag detection, no frame skipping - just guaranteed synchronization.
 *
 * @see https://web.dev/articles/audio-scheduling (Web Audio clock patterns)
 * @see https://developer.chrome.com/blog/audio-worklet-design-pattern (Ring buffer patterns)
 * @category Audio
 */

interface SyncedAudioPipelineOptions {
    /** Sample rate in Hz (default: 16000) */
    sampleRate?: number;
    /** Target chunk duration in ms for coalescing (default: 200) */
    chunkTargetMs?: number;
    /** LAM inference engine */
    lam: Wav2Vec2Inference;
}
interface SyncedAudioPipelineEvents {
    /** New frame ready for display */
    frame_ready: Float32Array;
    /** Playback has completed */
    playback_complete: void;
    /** First LAM inference completed, playback starting */
    playback_start: number;
    /** Error occurred */
    error: Error;
    /** Index signature for EventEmitter compatibility */
    [key: string]: unknown;
}
declare class SyncedAudioPipeline extends EventEmitter<SyncedAudioPipelineEvents> {
    private readonly options;
    private scheduler;
    private coalescer;
    private lamPipeline;
    private waitingForFirstLAM;
    private bufferedChunks;
    private monitorInterval;
    private frameAnimationId;
    constructor(options: SyncedAudioPipelineOptions);
    /**
     * Initialize the pipeline
     */
    initialize(): Promise<void>;
    /**
     * Start a new playback session
     *
     * Resets all state and prepares for incoming audio chunks.
     * Enables wait-for-first-LAM synchronization.
     */
    start(): void;
    /**
     * Receive audio chunk from network
     *
     * Implements wait-for-first-LAM pattern:
     * - Chunks are coalesced into optimal buffers
     * - Buffers are sent to LAM for processing
     * - Audio scheduling waits until first LAM completes
     * - Then all buffered audio is scheduled together with LAM frames
     *
     * @param chunk - Uint8Array containing Int16 PCM audio
     */
    onAudioChunk(chunk: Uint8Array): Promise<void>;
    /**
     * Handle first LAM inference completion
     *
     * This is the critical synchronization point:
     * - LAM frames are now ready in the queue
     * - Schedule all buffered audio chunks
     * - Adjust LAM frame timestamps to match actual schedule time
     * - Audio and LAM start playing together, perfectly synchronized
     */
    private onFirstLAMComplete;
    /**
     * End of audio stream
     *
     * Flushes any remaining buffered data.
     */
    end(): Promise<void>;
    /**
     * Stop playback immediately with smooth fade-out
     *
     * Gracefully cancels all audio playback and LAM processing:
     * - Fades out audio over specified duration (default: 50ms)
     * - Cancels pending LAM inferences
     * - Clears all buffers and queues
     * - Emits 'playback_complete' event
     *
     * Use this for interruptions (e.g., user barge-in during AI speech).
     *
     * @param fadeOutMs - Fade-out duration in milliseconds (default: 50ms)
     * @returns Promise that resolves when fade-out completes
     */
    stop(fadeOutMs?: number): Promise<void>;
    /**
     * Start frame animation loop
     *
     * Uses requestAnimationFrame to check for new LAM frames.
     * Synchronized to AudioContext clock (not visual refresh rate).
     *
     * Frame Emission Strategy:
     * - LAMPipeline uses last-frame-hold to prevent null returns
     * - Always emit frames (even repeated frames) to maintain smooth animation
     * - Renderer is responsible for detecting duplicate frames if needed
     */
    private startFrameLoop;
    /**
     * Start monitoring for playback completion
     */
    private startMonitoring;
    /**
     * Stop monitoring
     */
    private stopMonitoring;
    /**
     * Get current pipeline state (for debugging/monitoring)
     */
    getState(): {
        waitingForFirstLAM: boolean;
        bufferedChunks: number;
        coalescerFill: number;
        lamFill: number;
        queuedFrames: number;
        currentTime: number;
        playbackEndTime: number;
    };
    /**
     * Cleanup resources
     */
    dispose(): void;
}
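/**
 * @example End-to-end sketch of the wait-for-first-LAM flow (the network
 * chunk source and applyBlendshapes are placeholders)
 * ```typescript
 * const synced = new SyncedAudioPipeline({ lam: wav2vec, sampleRate: 16000 });
 * await synced.initialize();
 * synced.on('frame_ready', (frame) => applyBlendshapes(frame)); // renderer hook
 * synced.on('playback_complete', () => console.log('done'));
 * synced.start();                       // new session; buffers until first LAM
 * for await (const chunk of networkChunks) {
 *   await synced.onAudioChunk(chunk);   // Uint8Array of Int16 PCM
 * }
 * await synced.end();                   // flush the tail
 * // On user barge-in: await synced.stop(50);
 * ```
 */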

/**
 * Whisper Automatic Speech Recognition using transformers.js
 * Uses Xenova's proven pipeline API for reliable transcription
 */
type WhisperModel = 'tiny' | 'base' | 'small' | 'medium';
type WhisperDtype = 'fp32' | 'fp16' | 'q8' | 'int8' | 'uint8' | 'q4' | 'q4f16' | 'bnb4';
interface WhisperConfig {
    /** Model size: tiny (~75MB), base (~150MB), small (~500MB), medium (~1.5GB) */
    model?: WhisperModel;
    /** Use multilingual model (default: false, uses .en models) */
    multilingual?: boolean;
    /** Language code (e.g., 'en', 'es', 'fr') - for multilingual models */
    language?: string;
    /** Task: transcribe or translate (default: transcribe) */
    task?: 'transcribe' | 'translate';
    /** Model quantization format (default: 'q8' for balance of speed/quality) */
    dtype?: WhisperDtype;
    /** Use WebGPU acceleration if available (default: auto-detect) */
    device?: 'auto' | 'webgpu' | 'wasm';
    /** Local model path (e.g., '/models/whisper-tiny.en') - overrides HuggingFace CDN */
    localModelPath?: string;
    /** HuggingFace API token to bypass rate limits (get from https://huggingface.co/settings/tokens) */
    token?: string;
    /** Suppress non-speech tokens like [LAUGHTER], [CLICKING], etc. (default: true) */
    suppressNonSpeech?: boolean;
}
interface TranscriptionResult {
    /** Transcribed text */
    text: string;
    /** Detected/used language */
    language: string;
    /** Inference time in ms */
    inferenceTimeMs: number;
    /** Full chunks with timestamps (if requested) */
    chunks?: Array<{
        text: string;
        timestamp: [number, number | null];
    }>;
}
/**
 * Whisper ASR inference using transformers.js pipeline API
 *
 * Features:
 * - Automatic WebGPU/WASM backend selection
 * - Streaming support with chunk callbacks
 * - Proven implementation from Xenova's demo
 * - Handles all audio preprocessing automatically
 */
declare class WhisperInference {
    private config;
    private pipeline;
    private currentModel;
    private isLoading;
    private actualBackend;
    constructor(config?: WhisperConfig);
    /**
     * Check if WebGPU is available in this browser
     */
    static isWebGPUAvailable(): Promise<boolean>;
    /**
     * Load the Whisper model pipeline
     */
    load(onProgress?: (progress: {
        status: string;
        progress?: number;
        file?: string;
    }) => void): Promise<void>;
    /**
     * Transcribe audio to text
     *
     * @param audio Audio samples (Float32Array, 16kHz mono)
     * @param options Transcription options
     */
    transcribe(audio: Float32Array, options?: {
        /** Return timestamps for each chunk */
        returnTimestamps?: boolean;
        /** Chunk length in seconds (default: 30) */
        chunkLengthS?: number;
        /** Stride length in seconds for overlapping chunks (default: 5) */
        strideLengthS?: number;
        /** Language override */
        language?: string;
        /** Task override */
        task?: 'transcribe' | 'translate';
    }): Promise<TranscriptionResult>;
    /**
     * Transcribe with streaming chunks (progressive results)
     *
     * @param audio Audio samples
     * @param onChunk Called when each chunk is finalized
     * @param onUpdate Called after each generation step (optional)
     */
    transcribeStreaming(audio: Float32Array, onChunk: (chunk: {
        text: string;
        timestamp: [number, number | null];
    }) => void, onUpdate?: (text: string) => void, options?: {
        chunkLengthS?: number;
        strideLengthS?: number;
        language?: string;
        task?: 'transcribe' | 'translate';
    }): Promise<TranscriptionResult>;
    /**
     * Dispose of the model and free resources
     */
    dispose(): Promise<void>;
    /**
     * Check if model is loaded
     */
    get isLoaded(): boolean;
    /**
     * Get the backend being used (webgpu or wasm)
     */
    get backend(): string;
    /**
     * Get the full model name used by transformers.js
     */
    private getModelName;
    /**
     * Remove non-speech event tokens from transcription
     *
     * Whisper outputs special tokens for non-speech events like:
     * [LAUGHTER], [APPLAUSE], [MUSIC], [BLANK_AUDIO], [CLICKING], etc.
     *
     * This method strips these tokens and cleans up extra whitespace.
     */
    private removeNonSpeechTokens;
}
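/**
 * @example Transcription sketch (model size and options are illustrative)
 * ```typescript
 * const whisper = new WhisperInference({ model: 'tiny', dtype: 'q8' });
 * await whisper.load(({ status, progress }) => console.log(status, progress));
 * const { text, inferenceTimeMs } = await whisper.transcribe(audio16k, {
 *   returnTimestamps: true,
 * });
 * console.log(text, `${inferenceTimeMs}ms via ${whisper.backend}`);
 * ```
 */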

/**
 * Silero VAD (Voice Activity Detection) inference
 *
 * Neural network-based VAD running in browser via ONNX Runtime Web.
 * Much more accurate than RMS-based energy detection.
 *
 * Uses lazy loading to conditionally load WebGPU or WASM-only bundle:
 * - iOS: Loads WASM-only bundle (WebGPU crashes due to Safari bugs)
 * - Android/Desktop: Loads WebGPU bundle (with WASM fallback)
 *
 * @category Inference
 *
 * @example Basic usage
 * ```typescript
 * import { SileroVADInference } from '@omote/core';
 *
 * const vad = new SileroVADInference({
 *   modelUrl: '/models/silero-vad.onnx'
 * });
 * await vad.load();
 *
 * // Process 32ms chunks (512 samples at 16kHz)
 * const result = await vad.process(audioChunk);
 * if (result.isSpeech) {
 *   console.log('Speech detected!');
 * }
 * ```
 *
 * @example Streaming with state management
 * ```typescript
 * // State is automatically maintained between process() calls
 * // Call reset() when starting a new audio stream
 * vad.reset();
 *
 * for (const chunk of audioChunks) {
 *   const { probability } = await vad.process(chunk);
 *   // probability is speech probability [0, 1]
 * }
 * ```
 */

type VADBackend = BackendPreference;
/**
 * Configuration for Silero VAD
 */
interface SileroVADConfig {
    /** Path or URL to the ONNX model */
    modelUrl: string;
    /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
    backend?: VADBackend;
    /** Sample rate (8000 or 16000, default: 16000) */
    sampleRate?: 8000 | 16000;
    /** Speech probability threshold (default: 0.5) */
    threshold?: number;
    /**
     * Number of audio chunks to keep in pre-speech buffer.
     * When VAD triggers, these chunks are prepended to the speech buffer
     * to capture the beginning of speech that occurred before detection.
     *
     * At 512 samples/chunk and 16kHz:
     * - 10 chunks = 320ms of pre-speech audio
     * - 15 chunks = 480ms of pre-speech audio
     *
     * Default: 10 chunks (320ms)
     */
    preSpeechBufferChunks?: number;
}
/**
 * VAD model loading information
 */
interface VADModelInfo {
    backend: 'webgpu' | 'wasm';
    loadTimeMs: number;
    inputNames: string[];
    outputNames: string[];
    sampleRate: number;
    chunkSize: number;
}
/**
 * Result from a single VAD inference
 */
interface VADResult$1 {
    /** Speech probability (0-1) */
    probability: number;
    /** Whether speech is detected (probability > threshold) */
    isSpeech: boolean;
    /** Inference time in milliseconds */
    inferenceTimeMs: number;
    /**
     * Pre-speech audio chunks (only present on first speech detection).
     * These are the N chunks immediately before VAD triggered, useful for
     * capturing the beginning of speech that occurred before detection.
     *
     * Only populated when transitioning from silence to speech.
     */
    preSpeechChunks?: Float32Array[];
}
/**
 * Speech segment detected by VAD
 */
interface SpeechSegment {
    /** Start time in seconds */
    start: number;
    /** End time in seconds */
    end: number;
    /** Average probability during segment */
    avgProbability: number;
}
/**
 * Silero VAD - Neural network voice activity detection
 *
 * Based on snakers4/silero-vad ONNX model.
 * Processes 32ms chunks (512 samples at 16kHz) with LSTM state.
 *
 * @see https://github.com/snakers4/silero-vad
 */
declare class SileroVADInference {
    private session;
    private ort;
    private config;
    private _backend;
    private isLoading;
    private state;
    private context;
    private readonly chunkSize;
    private readonly contextSize;
    private inferenceQueue;
    private preSpeechBuffer;
    private wasSpeaking;
    constructor(config: SileroVADConfig);
    get backend(): RuntimeBackend | null;
    get isLoaded(): boolean;
    get sampleRate(): number;
    get threshold(): number;
    /**
     * Get required chunk size in samples
     */
    getChunkSize(): number;
    /**
     * Get chunk duration in milliseconds
     */
    getChunkDurationMs(): number;
    /**
     * Check if WebGPU is available and working
     * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
     */
    static isWebGPUAvailable: typeof isWebGPUAvailable;
    /**
     * Load the ONNX model
     */
    load(): Promise<VADModelInfo>;
    /**
     * Reset state for new audio stream
     */
    reset(): void;
    /**
     * Process a single audio chunk
     *
     * @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
     * @returns VAD result with speech probability
     */
    process(audioChunk: Float32Array): Promise<VADResult$1>;
    /**
     * Process audio and detect speech segments
     *
     * @param audio - Complete audio buffer
     * @param options - Detection options
     * @returns Array of speech segments
     */
    detectSpeech(audio: Float32Array, options?: {
        /** Minimum speech duration in ms (default: 250) */
        minSpeechDurationMs?: number;
        /** Minimum silence duration to end segment in ms (default: 300) */
        minSilenceDurationMs?: number;
        /** Padding to add before/after speech in ms (default: 30) */
        speechPadMs?: number;
    }): Promise<SpeechSegment[]>;
    /**
     * Calculate RMS energy of audio chunk
     */
    private calculateRMS;
    /**
     * Queue inference to serialize ONNX session calls
     */
    private queueInference;
    /**
     * Dispose of the model and free resources
     */
    dispose(): Promise<void>;
}
|
|
1404
|
+
|
|
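To make the surface above concrete, here is a minimal offline sketch: load the model, segment a pre-decoded 16 kHz buffer with `detectSpeech`, and release the session. The model path is a placeholder, and the config shape is assumed to mirror `VADWorkerConfig`'s `modelUrl`/`threshold` fields, since `SileroVADConfig` is declared elsewhere in the bundle.

```typescript
import { SileroVADInference } from '@omote/core';

// Placeholder input: 5 s of 16 kHz mono audio (in practice, decode a file
// via AudioContext.decodeAudioData and resample to 16 kHz).
const audio = new Float32Array(16000 * 5);

const vad = new SileroVADInference({ modelUrl: '/models/silero-vad.onnx', threshold: 0.5 });
const info = await vad.load();
console.log(`VAD loaded on ${info.backend} in ${info.loadTimeMs.toFixed(0)} ms`);

const segments = await vad.detectSpeech(audio, {
  minSpeechDurationMs: 250,
  minSilenceDurationMs: 300,
  speechPadMs: 30,
});
for (const s of segments) {
  console.log(`speech from ${s.start.toFixed(2)}s to ${s.end.toFixed(2)}s (p=${s.avgProbability.toFixed(2)})`);
}

await vad.dispose();
```
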
/**
 * Configuration for Silero VAD Worker
 */
interface VADWorkerConfig {
    /** Path or URL to the ONNX model */
    modelUrl: string;
    /** Sample rate (8000 or 16000, default: 16000) */
    sampleRate?: 8000 | 16000;
    /** Speech probability threshold (default: 0.5) */
    threshold?: number;
    /**
     * Number of audio chunks to keep in the pre-speech buffer.
     * When VAD triggers, these chunks are prepended to the speech buffer
     * to capture the beginning of speech that occurred before detection.
     *
     * At 512 samples/chunk and 16kHz:
     * - 10 chunks = 320ms of pre-speech audio
     * - 15 chunks = 480ms of pre-speech audio
     *
     * Default: 10 chunks (320ms)
     */
    preSpeechBufferChunks?: number;
}
/**
 * VAD model loading information from worker
 */
interface VADWorkerModelInfo {
    backend: 'wasm';
    loadTimeMs: number;
    inputNames: string[];
    outputNames: string[];
    sampleRate: number;
    chunkSize: number;
}
/**
 * Result from a single VAD inference
 */
interface VADResult {
    /** Speech probability (0-1) */
    probability: number;
    /** Whether speech is detected (probability > threshold) */
    isSpeech: boolean;
    /** Inference time in milliseconds */
    inferenceTimeMs: number;
    /**
     * Pre-speech audio chunks (only present on first speech detection).
     * These are the N chunks immediately before VAD triggered, useful for
     * capturing the beginning of speech that occurred before detection.
     *
     * Only populated when transitioning from silence to speech.
     */
    preSpeechChunks?: Float32Array[];
}
/**
 * Silero VAD Worker - Voice Activity Detection in a Web Worker
 *
 * Runs Silero VAD inference off the main thread to prevent UI blocking.
 * Feature parity with SileroVADInference, but runs in a dedicated worker.
 *
 * @see SileroVADInference for the main-thread version
 */
declare class SileroVADWorker {
    private worker;
    private config;
    private isLoading;
    private _isLoaded;
    private state;
    private context;
    private readonly chunkSize;
    private readonly contextSize;
    private inferenceQueue;
    private preSpeechBuffer;
    private wasSpeaking;
    private pendingResolvers;
    private messageId;
    constructor(config: VADWorkerConfig);
    get isLoaded(): boolean;
    /**
     * Backend type (always 'wasm'; WebGPU is not supported in Workers)
     */
    get backend(): 'wasm' | null;
    get sampleRate(): number;
    get threshold(): number;
    /**
     * Get required chunk size in samples
     */
    getChunkSize(): number;
    /**
     * Get chunk duration in milliseconds
     */
    getChunkDurationMs(): number;
    /**
     * Create the worker from an inline script
     */
    private createWorker;
    /**
     * Handle messages from the worker
     */
    private handleWorkerMessage;
    /**
     * Send a message to the worker and wait for the response
     */
    private sendMessage;
    /**
     * Load the ONNX model in the worker
     */
    load(): Promise<VADWorkerModelInfo>;
    /**
     * Reset state for new audio stream
     */
    reset(): Promise<void>;
    /**
     * Process a single audio chunk
     *
     * @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
     * @returns VAD result with speech probability
     */
    process(audioChunk: Float32Array): Promise<VADResult>;
    /**
     * Queue inference to serialize worker calls
     */
    private queueInference;
    /**
     * Dispose of the worker and free resources
     */
    dispose(): Promise<void>;
    /**
     * Check if Web Workers are supported
     */
    static isSupported(): boolean;
}

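A sketch of chunk-by-chunk streaming through the worker variant. Chunk framing (exactly `getChunkSize()` samples per call) is the caller's responsibility; the `onChunk` hook is a hypothetical integration point, not part of the package.

```typescript
import { SileroVADWorker } from '@omote/core';

if (SileroVADWorker.isSupported()) {
  const vad = new SileroVADWorker({
    modelUrl: '/models/silero-vad.onnx',
    sampleRate: 16000,
    preSpeechBufferChunks: 10, // retain ~320 ms from before the VAD trigger
  });
  await vad.load();

  const captured: Float32Array[] = [];
  // Call this with exactly vad.getChunkSize() samples (512 at 16 kHz).
  async function onChunk(chunk: Float32Array): Promise<void> {
    const result = await vad.process(chunk);
    if (result.isSpeech) {
      // The first speech result carries the pre-speech buffer; prepend it
      // so the start of the utterance is not clipped.
      if (result.preSpeechChunks) captured.push(...result.preSpeechChunks);
      captured.push(chunk);
    }
  }
}
```
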
/**
 * Factory function for Silero VAD with automatic Worker vs main thread selection
 *
 * Provides a unified API that automatically selects the optimal implementation:
 * - Desktop browsers: Uses SileroVADWorker (off-main-thread inference)
 * - Mobile devices: Uses SileroVADInference (main thread, avoids memory overhead)
 * - Fallback: Gracefully falls back to main thread if Worker fails
 *
 * @category Inference
 *
 * @example Basic usage (auto-detect)
 * ```typescript
 * import { createSileroVAD } from '@omote/core';
 *
 * const vad = createSileroVAD({
 *   modelUrl: '/models/silero-vad.onnx',
 *   threshold: 0.5,
 * });
 *
 * await vad.load();
 * const result = await vad.process(audioChunk);
 * if (result.isSpeech) {
 *   console.log('Speech detected!', result.probability);
 * }
 * ```
 *
 * @example Force worker usage
 * ```typescript
 * const vad = createSileroVAD({
 *   modelUrl: '/models/silero-vad.onnx',
 *   useWorker: true, // Force Worker even on mobile
 * });
 * ```
 *
 * @example Force main thread
 * ```typescript
 * const vad = createSileroVAD({
 *   modelUrl: '/models/silero-vad.onnx',
 *   useWorker: false, // Force main thread
 * });
 * ```
 */

/**
 * Common interface for both SileroVADInference and SileroVADWorker
 *
 * This interface defines the shared API that both implementations provide,
 * allowing consumers to use either interchangeably.
 */
interface SileroVADBackend {
    /** Current backend type (webgpu, wasm, or null if not loaded) */
    readonly backend: RuntimeBackend | null;
    /** Whether the model is loaded and ready for inference */
    readonly isLoaded: boolean;
    /** Audio sample rate (8000 or 16000 Hz) */
    readonly sampleRate: number;
    /** Speech detection threshold (0-1) */
    readonly threshold: number;
    /**
     * Load the ONNX model
     * @returns Model loading information
     */
    load(): Promise<VADModelInfo | VADWorkerModelInfo>;
    /**
     * Process a single audio chunk
     * @param audioChunk - Float32Array of exactly chunkSize samples
     * @returns VAD result with speech probability
     */
    process(audioChunk: Float32Array): Promise<VADResult$1>;
    /**
     * Reset state for new audio stream
     */
    reset(): void | Promise<void>;
    /**
     * Dispose of the model and free resources
     */
    dispose(): Promise<void>;
    /**
     * Get required chunk size in samples
     */
    getChunkSize(): number;
    /**
     * Get chunk duration in milliseconds
     */
    getChunkDurationMs(): number;
}
/**
 * Configuration for the Silero VAD factory
 *
 * Extends SileroVADConfig with worker-specific options.
 */
interface SileroVADFactoryConfig extends SileroVADConfig {
    /**
     * Force worker usage (true), main thread (false), or auto-detect (undefined).
     *
     * Auto-detection behavior:
     * - Desktop: Uses Worker (better responsiveness, off-main-thread)
     * - Mobile: Uses main thread (avoids 5MB memory overhead)
     *
     * You can override this to:
     * - `true`: Force Worker even on mobile (if you have memory headroom)
     * - `false`: Force main thread even on desktop (for debugging)
     *
     * Default: undefined (auto-detect)
     */
    useWorker?: boolean;
    /**
     * Fallback to main thread on worker errors.
     *
     * When true (default), if the Worker fails to load or encounters an error,
     * the factory will automatically create a main thread instance instead.
     *
     * When false, worker errors will propagate as exceptions.
     *
     * Default: true
     */
    fallbackOnError?: boolean;
}
/**
 * Check if the current environment supports VAD Web Workers
 *
 * Requirements:
 * - Worker constructor must exist
 * - Blob URL support (for inline worker script)
 *
 * @returns true if VAD Worker is supported
 */
declare function supportsVADWorker(): boolean;
/**
 * Create a Silero VAD instance with automatic implementation selection
 *
 * This factory function automatically selects between:
 * - **SileroVADWorker**: Off-main-thread inference (better for desktop)
 * - **SileroVADInference**: Main thread inference (better for mobile)
 *
 * The selection is based on:
 * 1. Explicit `useWorker` config (if provided)
 * 2. Platform detection (mobile vs desktop)
 * 3. Worker API availability
 *
 * Both implementations share the same interface (SileroVADBackend),
 * so consumers can use either interchangeably.
 *
 * @param config - Factory configuration
 * @returns A SileroVAD instance (either Worker or main thread)
 *
 * @example
 * ```typescript
 * // Auto-detect (recommended)
 * const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
 *
 * // Force Worker
 * const vadWorker = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: true });
 *
 * // Force main thread
 * const vadMain = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: false });
 * ```
 */
declare function createSileroVAD(config: SileroVADFactoryConfig): SileroVADBackend;

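The factory returns a `SileroVADBackend` either way, so microphone plumbing is identical for both implementations. A hedged sketch of live capture follows; it assumes the browser honours a 16 kHz `AudioContext` (otherwise you would resample), and it uses the deprecated-but-simple `createScriptProcessor` purely for brevity; an AudioWorklet is the production-grade route.

```typescript
import { createSileroVAD } from '@omote/core';

const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
await vad.load();

const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
const ctx = new AudioContext({ sampleRate: vad.sampleRate }); // 16 kHz requested
const source = ctx.createMediaStreamSource(stream);
const processor = ctx.createScriptProcessor(4096, 1, 1);

const chunkSize = vad.getChunkSize(); // 512 samples at 16 kHz
let pending = new Float32Array(0);

processor.onaudioprocess = (e) => {
  const input = e.inputBuffer.getChannelData(0);
  const merged = new Float32Array(pending.length + input.length);
  merged.set(pending);
  merged.set(input, pending.length);

  let offset = 0;
  while (merged.length - offset >= chunkSize) {
    // Fire and forget; process() serializes calls internally via its queue.
    void vad.process(merged.slice(offset, offset + chunkSize));
    offset += chunkSize;
  }
  pending = merged.slice(offset); // carry the remainder to the next callback
};

source.connect(processor);
processor.connect(ctx.destination);
```
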
/**
 * Safari Web Speech API wrapper for iOS speech recognition
 *
 * Provides a similar interface to WhisperInference for easy substitution on iOS.
 * Uses the native Web Speech API, which is significantly faster than Whisper WASM on iOS.
 *
 * Key differences from WhisperInference:
 * - Real-time streaming (not batch processing)
 * - No audio buffer input (microphone handled by browser)
 * - transcribe() throws an error (use the start/stop pattern instead)
 *
 * @category Inference
 *
 * @example Basic usage
 * ```typescript
 * import { SafariSpeechRecognition, shouldUseNativeASR } from '@omote/core';
 *
 * // Use native ASR on iOS, Whisper elsewhere
 * if (shouldUseNativeASR()) {
 *   const speech = new SafariSpeechRecognition({ language: 'en-US' });
 *
 *   speech.onResult((result) => {
 *     console.log('Transcript:', result.text);
 *   });
 *
 *   await speech.start();
 *   // ... user speaks ...
 *   const finalResult = await speech.stop();
 * }
 * ```
 *
 * @example Platform-aware initialization
 * ```typescript
 * const asr = shouldUseNativeASR()
 *   ? new SafariSpeechRecognition({ language: 'en-US' })
 *   : new WhisperInference({ model: 'tiny' });
 * ```
 */
/**
 * Configuration for Safari Speech Recognition
 */
interface SafariSpeechConfig {
    /** Language code (default: 'en-US') */
    language?: string;
    /** Continuous mode for ongoing conversation (default: true) */
    continuous?: boolean;
    /** Interim results before speech ends (default: true) */
    interimResults?: boolean;
    /** Max alternatives (default: 1) */
    maxAlternatives?: number;
}
/**
 * Result from speech recognition (matches WhisperInference TranscriptionResult)
 */
interface SpeechRecognitionResult {
    /** Transcribed text */
    text: string;
    /** Detected/used language */
    language: string;
    /** Time since start in ms (not inference time; this is a native API) */
    inferenceTimeMs: number;
    /** Whether this is a final result or interim */
    isFinal: boolean;
    /** Confidence score (0-1) if available */
    confidence?: number;
}
/**
 * Callback for receiving recognition results
 */
type SpeechResultCallback = (result: SpeechRecognitionResult) => void;
/**
 * Callback for receiving recognition errors
 */
type SpeechErrorCallback = (error: Error) => void;
interface SpeechRecognitionEvent extends Event {
    resultIndex: number;
    results: SpeechRecognitionResultList;
}
interface SpeechRecognitionResultList {
    length: number;
    item(index: number): SpeechRecognitionResult;
    [index: number]: SpeechRecognitionResultItem;
}
interface SpeechRecognitionResultItem {
    isFinal: boolean;
    length: number;
    item(index: number): SpeechRecognitionAlternative;
    [index: number]: SpeechRecognitionAlternative;
}
interface SpeechRecognitionAlternative {
    transcript: string;
    confidence: number;
}
interface SpeechRecognitionErrorEvent extends Event {
    error: string;
    message: string;
}
interface SpeechRecognitionInterface extends EventTarget {
    continuous: boolean;
    interimResults: boolean;
    lang: string;
    maxAlternatives: number;
    start(): void;
    stop(): void;
    abort(): void;
    onresult: ((event: SpeechRecognitionEvent) => void) | null;
    onerror: ((event: SpeechRecognitionErrorEvent) => void) | null;
    onend: (() => void) | null;
    onstart: (() => void) | null;
    onaudiostart: (() => void) | null;
    onaudioend: (() => void) | null;
    onspeechstart: (() => void) | null;
    onspeechend: (() => void) | null;
}
declare global {
    interface Window {
        SpeechRecognition?: new () => SpeechRecognitionInterface;
        webkitSpeechRecognition?: new () => SpeechRecognitionInterface;
    }
}
/**
 * Safari Web Speech API wrapper
 *
 * Provides native speech recognition on iOS Safari.
 * Much faster than Whisper WASM and more battery-efficient.
 */
declare class SafariSpeechRecognition {
    private config;
    private recognition;
    private isListening;
    private startTime;
    private accumulatedText;
    private resultCallbacks;
    private errorCallbacks;
    private stopResolver;
    private stopRejecter;
    constructor(config?: SafariSpeechConfig);
    /**
     * Check if the Web Speech API is available
     */
    static isAvailable(): boolean;
    /**
     * Check if currently listening
     */
    get listening(): boolean;
    /**
     * Get the language being used
     */
    get language(): string;
    /**
     * Register a callback for receiving results
     */
    onResult(callback: SpeechResultCallback): void;
    /**
     * Register a callback for receiving errors
     */
    onError(callback: SpeechErrorCallback): void;
    /**
     * Remove a result callback
     */
    offResult(callback: SpeechResultCallback): void;
    /**
     * Remove an error callback
     */
    offError(callback: SpeechErrorCallback): void;
    /**
     * Start listening for speech
     *
     * On iOS Safari, this will trigger the microphone permission prompt
     * if not already granted.
     */
    start(): Promise<void>;
    /**
     * Stop listening and return the final transcript
     */
    stop(): Promise<SpeechRecognitionResult>;
    /**
     * Abort recognition without waiting for a final result
     */
    abort(): void;
    /**
     * NOT SUPPORTED: Transcribe audio buffer
     *
     * The Safari Speech API does not support transcribing pre-recorded audio.
     * It only works with live microphone input.
     *
     * For batch transcription on iOS, use server-side Whisper or a cloud ASR service.
     *
     * @throws Error always - this method is not supported
     */
    transcribe(_audio: Float32Array): Promise<SpeechRecognitionResult>;
    /**
     * Dispose of recognition resources
     */
    dispose(): void;
    /**
     * Set up event handlers for the recognition instance
     */
    private setupEventHandlers;
    /**
     * Emit result to all registered callbacks
     */
    private emitResult;
    /**
     * Emit error to all registered callbacks
     */
    private emitError;
}

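One detail the examples above gloss over: with `interimResults` enabled (the default), the same utterance arrives several times before a final result. A small sketch of separating the two:

```typescript
import { SafariSpeechRecognition } from '@omote/core';

if (SafariSpeechRecognition.isAvailable()) {
  const speech = new SafariSpeechRecognition({ language: 'en-US', interimResults: true });

  speech.onResult((result) => {
    if (result.isFinal) {
      console.log('final:', result.text, 'confidence:', result.confidence ?? 'n/a');
    } else {
      // Interim text may be revised by later events; show it but do not commit it.
      console.log('interim:', result.text);
    }
  });
  speech.onError((err) => console.error('recognition error:', err.message));

  await speech.start();
  // ...later, e.g. when the user taps "stop":
  const finalResult = await speech.stop();
  console.log('full transcript:', finalResult.text);
}
```
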
/**
 * Emotion - Helper for creating emotion vectors for avatar animation
 *
 * Provides 10 explicit emotion channels that can be used to control
 * avatar expressions and emotional states.
 *
 * @category Emotion
 *
 * @example Creating emotion vectors
 * ```typescript
 * import { createEmotionVector, EmotionPresets } from '@omote/core';
 *
 * // Named weights
 * const happy = createEmotionVector({ joy: 0.8, amazement: 0.2 });
 *
 * // Use preset
 * const surprised = EmotionPresets.surprised;
 * ```
 *
 * @example Smooth transitions
 * ```typescript
 * import { EmotionController } from '@omote/core';
 *
 * const controller = new EmotionController();
 * controller.setPreset('happy');
 * controller.transitionTo({ sadness: 0.7 }, 500);
 *
 * // In animation loop
 * controller.update();
 * const emotion = controller.emotion;
 * ```
 */
/** The 10 explicit emotion channels */
declare const EMOTION_NAMES: readonly ["amazement", "anger", "cheekiness", "disgust", "fear", "grief", "joy", "outofbreath", "pain", "sadness"];
type EmotionName = typeof EMOTION_NAMES[number];
/** Emotion weights by name */
type EmotionWeights = Partial<Record<EmotionName, number>>;
/** Total emotion vector size */
declare const EMOTION_VECTOR_SIZE = 26;
/**
 * Create an emotion vector from named weights
 *
 * @param weights - Named emotion weights (0-1)
 * @returns Float32Array of emotion values
 *
 * @example
 * ```ts
 * const emotion = createEmotionVector({ joy: 0.8, amazement: 0.3 });
 * ```
 */
declare function createEmotionVector(weights?: EmotionWeights): Float32Array;
/**
 * Pre-built emotion presets for common expressions
 */
declare const EmotionPresets: {
    /** Neutral/default - no emotional expression */
    readonly neutral: Float32Array<ArrayBufferLike>;
    /** Happy - joy with slight amazement */
    readonly happy: Float32Array<ArrayBufferLike>;
    /** Sad - grief and sadness */
    readonly sad: Float32Array<ArrayBufferLike>;
    /** Angry - anger with disgust */
    readonly angry: Float32Array<ArrayBufferLike>;
    /** Surprised - high amazement */
    readonly surprised: Float32Array<ArrayBufferLike>;
    /** Scared - fear with pain */
    readonly scared: Float32Array<ArrayBufferLike>;
    /** Disgusted - disgust with anger */
    readonly disgusted: Float32Array<ArrayBufferLike>;
    /** Excited - joy with amazement and cheekiness */
    readonly excited: Float32Array<ArrayBufferLike>;
    /** Tired - out of breath with sadness */
    readonly tired: Float32Array<ArrayBufferLike>;
    /** Playful - cheekiness with joy */
    readonly playful: Float32Array<ArrayBufferLike>;
    /** Pained - pain with grief */
    readonly pained: Float32Array<ArrayBufferLike>;
    /** Contemplative - slight sadness, calm */
    readonly contemplative: Float32Array<ArrayBufferLike>;
};
type EmotionPresetName = keyof typeof EmotionPresets;
/**
 * Get an emotion preset by name
 */
declare function getEmotionPreset(name: EmotionPresetName): Float32Array;
/**
 * Blend multiple emotion vectors together
 *
 * @param emotions - Array of { vector, weight } pairs
 * @returns Blended emotion vector
 *
 * @example
 * ```ts
 * const blended = blendEmotions([
 *   { vector: EmotionPresets.happy, weight: 0.7 },
 *   { vector: EmotionPresets.surprised, weight: 0.3 },
 * ]);
 * ```
 */
declare function blendEmotions(emotions: Array<{
    vector: Float32Array;
    weight: number;
}>): Float32Array;
/**
 * Interpolate between two emotion vectors
 *
 * @param from - Starting emotion
 * @param to - Target emotion
 * @param t - Interpolation factor (0-1)
 * @returns Interpolated emotion vector
 */
declare function lerpEmotion(from: Float32Array, to: Float32Array, t: number): Float32Array;
/**
 * EmotionController - Manages emotion state with smooth transitions
 */
declare class EmotionController {
    private currentEmotion;
    private targetEmotion;
    private transitionProgress;
    private transitionDuration;
    private transitionStartTime;
    /**
     * Get the current emotion vector
     */
    get emotion(): Float32Array;
    /**
     * Set emotion immediately (no transition)
     */
    set(weights: EmotionWeights): void;
    /**
     * Set emotion from preset immediately
     */
    setPreset(preset: EmotionPresetName): void;
    /**
     * Transition to new emotion over time
     *
     * @param weights - Target emotion weights
     * @param durationMs - Transition duration in milliseconds
     */
    transitionTo(weights: EmotionWeights, durationMs: number): void;
    /**
     * Transition to preset over time
     */
    transitionToPreset(preset: EmotionPresetName, durationMs: number): void;
    /**
     * Update transition progress (call each frame)
     */
    update(): void;
    /**
     * Check if currently transitioning
     */
    get isTransitioning(): boolean;
    /**
     * Reset to neutral
     */
    reset(): void;
}

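A sketch of driving `EmotionController` from a render loop; the renderer hookup is left as a comment since it depends on the consuming package:

```typescript
import { EmotionController, blendEmotions, EmotionPresets } from '@omote/core';

const controller = new EmotionController();
controller.setPreset('happy');
controller.transitionToPreset('sad', 800); // cross-fade over 800 ms

function tick(): void {
  controller.update(); // advances the active transition
  const vector = controller.emotion; // Float32Array of EMOTION_VECTOR_SIZE values
  // ...feed `vector` to the renderer or model input here...
  if (controller.isTransitioning) requestAnimationFrame(tick);
}
requestAnimationFrame(tick);

// One-off blends work without a controller:
const bittersweet = blendEmotions([
  { vector: EmotionPresets.happy, weight: 0.6 },
  { vector: EmotionPresets.sad, weight: 0.4 },
]);
```
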
/**
 * AI Adapter Interface
 *
 * Common interface for AI backends (AWS AgentCore, OpenAI Realtime).
 * Adapters handle the conversation flow and emit events for animation.
 *
 * @category AI
 */

/**
 * Tenant configuration for multi-tenant isolation
 */
interface TenantConfig {
    /** Unique tenant identifier */
    tenantId: string;
    /** Customer-specific API credentials */
    credentials: {
        apiKey?: string;
        authToken?: string;
        refreshToken?: string;
    };
    /** Character configuration for this tenant */
    characterId: string;
    /** Optional custom endpoint override */
    endpoint?: string;
}
/**
 * Voice configuration for TTS
 */
interface VoiceConfig {
    /** TTS provider */
    provider: 'elevenlabs' | 'openai';
    /** Voice ID */
    voiceId: string;
    /** Stability (0-1, ElevenLabs) */
    stability?: number;
    /** Similarity boost (0-1, ElevenLabs) */
    similarityBoost?: number;
}
/**
 * Session configuration
 */
interface SessionConfig {
    /** Session ID (generated or provided) */
    sessionId: string;
    /** Tenant this session belongs to */
    tenant: TenantConfig;
    /** Initial system prompt / personality */
    systemPrompt?: string;
    /** Voice configuration for TTS */
    voice?: VoiceConfig;
    /** Initial emotion state */
    emotion?: string;
    /** Language code */
    language?: string;
}
/**
 * Message role in conversation
 */
type MessageRole = 'user' | 'assistant' | 'system';
/**
 * Conversation message in session history
 */
interface ConversationMessage {
    /** Message role */
    role: MessageRole;
    /** Text content */
    content: string;
    /** Timestamp (ms) */
    timestamp: number;
    /** Emotion detected/expressed */
    emotion?: string;
    /** Audio duration if applicable (ms) */
    audioDurationMs?: number;
}
/**
 * Session state
 */
type AISessionState = 'idle' | 'listening' | 'thinking' | 'speaking' | 'interrupted' | 'error' | 'disconnected';
/**
 * Events emitted by AI adapters
 */
interface AIAdapterEvents {
    [key: string]: unknown;
    'state.change': {
        state: AISessionState;
        previousState: AISessionState;
    };
    'user.speech.start': {
        timestamp: number;
    };
    'user.speech.end': {
        timestamp: number;
        durationMs: number;
    };
    'user.transcript.partial': {
        text: string;
        confidence: number;
    };
    'user.transcript.final': {
        text: string;
        confidence: number;
    };
    'ai.thinking.start': {
        timestamp: number;
    };
    'ai.response.start': {
        text?: string;
        emotion?: string;
    };
    'ai.response.chunk': {
        text: string;
        isLast: boolean;
    };
    'ai.response.end': {
        fullText: string;
        durationMs: number;
    };
    'audio.output.chunk': {
        audio: ArrayBuffer;
        sampleRate: number;
        timestamp: number;
    };
    'audio.output.end': {
        durationMs: number;
    };
    'animation': AnimationEvent;
    'memory.updated': {
        messageCount: number;
        tokenCount?: number;
    };
    'connection.opened': {
        sessionId: string;
        adapter: string;
    };
    'connection.closed': {
        reason: string;
    };
    'connection.error': {
        error: Error;
        recoverable: boolean;
    };
    'interruption.detected': {
        timestamp: number;
    };
    'interruption.handled': {
        action: 'stop' | 'continue';
        timestamp: number;
    };
}
/**
 * Base interface for all AI adapters
 */
interface AIAdapter {
    /** Adapter name for logging/debugging */
    readonly name: string;
    /** Current session state */
    readonly state: AISessionState;
    /** Current session ID (null if not connected) */
    readonly sessionId: string | null;
    /** Whether the adapter is connected */
    readonly isConnected: boolean;
    /**
     * Initialize and connect the adapter
     */
    connect(config: SessionConfig): Promise<void>;
    /**
     * Disconnect and cleanup
     */
    disconnect(): Promise<void>;
    /**
     * Push user audio for processing
     * @param audio - PCM audio data (16kHz, mono)
     */
    pushAudio(audio: Int16Array | Float32Array): void;
    /**
     * Send text message directly (bypasses STT)
     */
    sendText(text: string): Promise<void>;
    /**
     * Handle user interruption
     * Stops current AI speech and prepares for new input
     */
    interrupt(): void;
    /**
     * Get conversation history
     */
    getHistory(): ConversationMessage[];
    /**
     * Clear conversation history
     */
    clearHistory(): void;
    /**
     * Check if adapter is available/healthy
     */
    healthCheck(): Promise<boolean>;
    on<K extends keyof AIAdapterEvents>(event: K, callback: (data: AIAdapterEvents[K]) => void): () => void;
    off<K extends keyof AIAdapterEvents>(event: K, callback: (data: AIAdapterEvents[K]) => void): void;
    once<K extends keyof AIAdapterEvents>(event: K, callback: (data: AIAdapterEvents[K]) => void): () => void;
}

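Since every backend implements this interface, event wiring can be written once against `AIAdapter`. A sketch, with `playPcmChunk` standing in for whatever audio-output path the host app uses:

```typescript
import type { AIAdapter } from '@omote/core';

// Hypothetical host-app helper, not part of @omote/core.
declare function playPcmChunk(audio: ArrayBuffer, sampleRate: number): void;

function wireAdapter(adapter: AIAdapter): () => void {
  let streamed = '';
  const unsubs = [
    adapter.on('user.transcript.final', ({ text }) => console.log('user:', text)),
    adapter.on('ai.response.chunk', ({ text, isLast }) => {
      streamed += text; // accumulate streamed tokens
      if (isLast) {
        console.log('assistant:', streamed);
        streamed = '';
      }
    }),
    adapter.on('audio.output.chunk', ({ audio, sampleRate }) => playPcmChunk(audio, sampleRate)),
    adapter.on('connection.error', ({ error, recoverable }) => {
      console.error('adapter error:', error.message);
      if (!recoverable) void adapter.disconnect();
    }),
  ];
  return () => unsubs.forEach((off) => off()); // on() returns an unsubscribe function
}
```
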
/**
 * Conversation Session Interface
 *
 * Represents an active conversation with memory and state.
 *
 * @category AI
 */

/**
 * Serializable session snapshot for persistence
 */
interface SessionSnapshot {
    /** Session ID */
    sessionId: string;
    /** Tenant ID */
    tenantId: string;
    /** Character ID */
    characterId: string;
    /** Conversation history */
    history: ConversationMessage[];
    /** Custom context */
    context: Record<string, string>;
    /** Emotion state */
    emotion: EmotionWeights;
    /** Creation timestamp */
    createdAt: number;
    /** Last activity timestamp */
    lastActivityAt: number;
}
/**
 * Extended session with memory management
 */
interface ConversationSession {
    /** Session identifier */
    readonly sessionId: string;
    /** Associated AI adapter */
    readonly adapter: AIAdapter;
    /** Session configuration */
    readonly config: SessionConfig;
    /** Current state */
    readonly state: AISessionState;
    /** Conversation history */
    readonly history: ConversationMessage[];
    /** Current emotion state */
    readonly emotion: EmotionWeights;
    /** Session creation timestamp */
    readonly createdAt: number;
    /** Last activity timestamp */
    readonly lastActivityAt: number;
    /**
     * Start the session (connects adapter)
     */
    start(): Promise<void>;
    /**
     * End the session (disconnects adapter)
     */
    end(): Promise<void>;
    /**
     * Push audio input
     */
    pushAudio(audio: Int16Array | Float32Array): void;
    /**
     * Send text input directly
     */
    sendText(text: string): Promise<void>;
    /**
     * Interrupt current AI response
     */
    interrupt(): void;
    /**
     * Update emotion state
     */
    setEmotion(emotion: EmotionWeights): void;
    /**
     * Add a context item (custom memory)
     */
    addContext(key: string, value: string): void;
    /**
     * Remove a context item
     */
    removeContext(key: string): void;
    /**
     * Get all context items
     */
    getContext(): Record<string, string>;
    /**
     * Export session for persistence
     */
    export(): SessionSnapshot;
    /**
     * Import session from snapshot
     */
    import(snapshot: SessionSnapshot): void;
}

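`SessionSnapshot` contains only JSON-serializable fields, so persistence can be as simple as localStorage. A sketch (the storage key naming is arbitrary):

```typescript
import type { ConversationSession, SessionSnapshot } from '@omote/core';

function saveSession(session: ConversationSession): void {
  const snapshot = session.export();
  localStorage.setItem(`omote-session:${snapshot.sessionId}`, JSON.stringify(snapshot));
}

function restoreSession(session: ConversationSession, sessionId: string): boolean {
  const raw = localStorage.getItem(`omote-session:${sessionId}`);
  if (!raw) return false;
  session.import(JSON.parse(raw) as SessionSnapshot);
  return true;
}
```
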
/**
 * AWS AgentCore Adapter
 *
 * Primary AI adapter for the Omote Platform.
 *
 * Pipeline:
 *   User Audio -> Whisper ASR (local) -> Text
 *   Text -> AgentCore (WebSocket) -> Response Text + Audio chunks (TTS handled backend-side)
 *   Audio chunks -> LAM (local) -> Blendshapes -> Render
 *
 * @category AI
 */

/**
 * AgentCore-specific configuration
 */
interface AgentCoreConfig {
    /** AgentCore WebSocket endpoint */
    endpoint: string;
    /** AWS region */
    region?: string;
    /** Model URLs */
    models?: {
        lamUrl?: string;
    };
    /** Enable observability */
    observability?: {
        tracing?: boolean;
        metrics?: boolean;
    };
}
/**
 * AWS AgentCore Adapter
 */
declare class AgentCoreAdapter extends EventEmitter<AIAdapterEvents> implements AIAdapter {
    readonly name = "AgentCore";
    private _state;
    private _sessionId;
    private _isConnected;
    private whisper;
    private vad;
    private lam;
    private emotionController;
    private pipeline;
    private ws;
    private wsReconnectAttempts;
    private readonly maxReconnectAttempts;
    private audioBuffer;
    private history;
    private currentConfig;
    private agentCoreConfig;
    private isSpeaking;
    private currentTtsAbortController;
    private tokenCache;
    constructor(config: AgentCoreConfig);
    get state(): AISessionState;
    get sessionId(): string | null;
    get isConnected(): boolean;
    /**
     * Connect to AgentCore with session configuration
     */
    connect(config: SessionConfig): Promise<void>;
    /**
     * Disconnect and cleanup
     */
    disconnect(): Promise<void>;
    /**
     * Push user audio for processing
     */
    pushAudio(audio: Int16Array | Float32Array): void;
    /**
     * Send text directly to AgentCore
     */
    sendText(text: string): Promise<void>;
    /**
     * Interrupt current AI response
     */
    interrupt(): void;
    getHistory(): ConversationMessage[];
    clearHistory(): void;
    healthCheck(): Promise<boolean>;
    private setState;
    private getAuthToken;
    private initWhisper;
    private initLAM;
    private initPipeline;
    private connectWebSocket;
    private handleAgentCoreMessage;
    private scheduleTranscription;
    /**
     * Detect voice activity using Silero VAD
     * Falls back to simple RMS if VAD is not available
     */
    private detectVoiceActivity;
    private int16ToFloat32;
    private base64ToArrayBuffer;
    private addToHistory;
    private handleDisconnect;
}

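A sketch of standing up the adapter directly (the orchestrator below is the higher-level route). Endpoint, credentials, and IDs are placeholders, and 'jawOpen' is simply one of the 52 ARKit blendshape names:

```typescript
import { AgentCoreAdapter } from '@omote/core';

const adapter = new AgentCoreAdapter({
  endpoint: 'wss://agentcore.example.com/ws',
  region: 'us-east-1',
  models: { lamUrl: '/models/lam.onnx' },
});

await adapter.connect({
  sessionId: crypto.randomUUID(),
  tenant: {
    tenantId: 'acme',
    credentials: { apiKey: 'placeholder-key' },
    characterId: 'guide-01',
  },
  systemPrompt: 'You are a friendly museum guide.',
  language: 'en-US',
});

adapter.on('animation', (frame) => {
  // 52 ARKit blendshape weights per frame, ready for a renderer.
  console.log('jawOpen =', frame.get('jawOpen').toFixed(2));
});

await adapter.sendText('Hello!');
```
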
/**
 * Conversation Orchestrator
 *
 * Manages the conversation pipeline with AgentCore:
 * - Handles session lifecycle and tenant isolation
 * - Manages adapter events and state
 *
 * @category AI
 */

/**
 * Orchestrator configuration
 */
interface OrchestratorConfig {
    /** AgentCore adapter config */
    adapter: AgentCoreConfig;
    /** Connection timeout in ms */
    connectionTimeoutMs?: number;
    /** Max retry attempts */
    maxRetries?: number;
}
/**
 * Orchestrator events (extends AI adapter events)
 */
interface OrchestratorEvents extends AIAdapterEvents {
    'session.created': {
        sessionId: string;
        tenantId: string;
    };
    'session.ended': {
        sessionId: string;
        reason: string;
    };
}
/**
 * Conversation Orchestrator
 */
declare class ConversationOrchestrator extends EventEmitter<OrchestratorEvents> {
    private config;
    private adapter;
    private sessions;
    private tenants;
    private healthCheckInterval;
    private readonly HEALTH_CHECK_INTERVAL_MS;
    constructor(config: OrchestratorConfig);
    /**
     * Register a tenant
     */
    registerTenant(tenant: TenantConfig): void;
    /**
     * Unregister a tenant
     */
    unregisterTenant(tenantId: string): void;
    /**
     * Get tenant config
     */
    getTenant(tenantId: string): TenantConfig | undefined;
    /**
     * Create a new conversation session for a tenant
     */
    createSession(tenantId: string, options?: Partial<SessionConfig>): Promise<ConversationSession>;
    /**
     * End a session
     */
    endSession(sessionId: string): Promise<void>;
    /**
     * Get session by ID
     */
    getSession(sessionId: string): ConversationSession | undefined;
    /**
     * Get all sessions for a tenant
     */
    getTenantSessions(tenantId: string): ConversationSession[];
    /**
     * Start health monitoring
     */
    startHealthMonitoring(): void;
    /**
     * Stop health monitoring
     */
    stopHealthMonitoring(): void;
    /**
     * Dispose all resources
     */
    dispose(): Promise<void>;
    private generateSessionId;
    private forwardAdapterEvents;
    private performHealthCheck;
}

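A sketch of the orchestrator path. The declarations do not say whether `createSession` auto-connects, so the sketch calls `start()` explicitly; treat that as an assumption, not documented behavior.

```typescript
import { ConversationOrchestrator } from '@omote/core';

const orchestrator = new ConversationOrchestrator({
  adapter: { endpoint: 'wss://agentcore.example.com/ws' },
  connectionTimeoutMs: 10_000,
});

orchestrator.registerTenant({
  tenantId: 'acme',
  credentials: { apiKey: 'placeholder-key' },
  characterId: 'guide-01',
});

orchestrator.on('session.created', ({ sessionId, tenantId }) => {
  console.log(`session ${sessionId} created for tenant ${tenantId}`);
});

const session = await orchestrator.createSession('acme', {
  systemPrompt: 'Keep answers short.',
});
await session.start();
await session.sendText('Hi there');
// ...
await orchestrator.endSession(session.sessionId);
```
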
/**
 * Tenant Manager
 *
 * Handles multi-tenant isolation for the Omote Platform:
 * - Credential isolation per tenant
 * - Session scoping per tenant
 * - Quota management
 * - Token refresh
 *
 * @category AI
 */

/**
 * Tenant quota configuration
 */
interface TenantQuota {
    /** Max concurrent sessions */
    maxSessions: number;
    /** Requests per minute */
    requestsPerMinute: number;
    /** Max tokens per conversation */
    maxTokensPerConversation: number;
    /** Max audio minutes per day */
    maxAudioMinutesPerDay: number;
}
/**
 * Tenant usage tracking
 */
interface TenantUsage {
    /** Current active sessions */
    currentSessions: number;
    /** Requests in the current minute */
    requestsThisMinute: number;
    /** Total tokens used */
    tokensUsed: number;
    /** Audio minutes used today */
    audioMinutesToday: number;
    /** Last per-minute reset timestamp */
    lastMinuteReset: number;
    /** Last daily reset timestamp */
    lastDailyReset: number;
}
/**
 * Token refresh callback
 */
type TokenRefreshCallback = () => Promise<string>;
/**
 * Tenant Manager
 */
declare class TenantManager {
    private tenants;
    private quotas;
    private usage;
    private tokenRefreshCallbacks;
    /**
     * Default quota for new tenants
     */
    static readonly DEFAULT_QUOTA: TenantQuota;
    /**
     * Register a tenant with quota
     */
    register(tenant: TenantConfig, quota?: TenantQuota, tokenRefreshCallback?: TokenRefreshCallback): void;
    /**
     * Unregister a tenant
     */
    unregister(tenantId: string): void;
    /**
     * Get tenant config
     */
    get(tenantId: string): TenantConfig | undefined;
    /**
     * Check if tenant exists
     */
    has(tenantId: string): boolean;
    /**
     * Get all tenant IDs
     */
    getTenantIds(): string[];
    /**
     * Check if the tenant can create a new session
     */
    canCreateSession(tenantId: string): boolean;
    /**
     * Check if the tenant can make a request
     */
    canMakeRequest(tenantId: string): boolean;
    /**
     * Check if the tenant can use audio
     */
    canUseAudio(tenantId: string, minutes: number): boolean;
    /**
     * Increment session count
     */
    incrementSessions(tenantId: string): void;
    /**
     * Decrement session count
     */
    decrementSessions(tenantId: string): void;
    /**
     * Record a request
     */
    recordRequest(tenantId: string): void;
    /**
     * Record token usage
     */
    recordTokens(tenantId: string, tokens: number): void;
    /**
     * Record audio usage
     */
    recordAudioMinutes(tenantId: string, minutes: number): void;
    /**
     * Get a fresh auth token for a tenant
     */
    getAuthToken(tenantId: string): Promise<string>;
    /**
     * Update tenant credentials
     */
    updateCredentials(tenantId: string, credentials: Partial<TenantConfig['credentials']>): void;
    /**
     * Get usage stats for a tenant
     */
    getUsage(tenantId: string): TenantUsage | undefined;
    /**
     * Get quota for a tenant
     */
    getQuota(tenantId: string): TenantQuota | undefined;
    /**
     * Update quota for a tenant
     */
    updateQuota(tenantId: string, quota: Partial<TenantQuota>): void;
    /**
     * Reset all usage stats for a tenant
     */
    resetUsage(tenantId: string): void;
    private checkMinuteReset;
    private checkDailyReset;
}

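A sketch of quota gating with `TenantManager`. The `fetchFreshToken` hook is hypothetical; any `() => Promise<string>` works:

```typescript
import { TenantManager } from '@omote/core';

declare function fetchFreshToken(): Promise<string>; // hypothetical auth helper

const tenants = new TenantManager();
tenants.register(
  { tenantId: 'acme', credentials: { apiKey: 'placeholder-key' }, characterId: 'guide-01' },
  { ...TenantManager.DEFAULT_QUOTA, maxSessions: 2 },
  fetchFreshToken,
);

if (tenants.canCreateSession('acme') && tenants.canMakeRequest('acme')) {
  tenants.incrementSessions('acme');
  tenants.recordRequest('acme');
  // ...open the session, then tenants.decrementSessions('acme') when it ends...
} else {
  console.warn('quota exceeded for acme:', tenants.getUsage('acme'));
}
```
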
/**
 * Audio Sync Manager
 *
 * Synchronizes TTS audio playback with lip sync animation:
 * - Buffers audio for inference
 * - Manages playback timing
 * - Handles the audio queue for streaming
 *
 * @category AI
 */

/**
 * Audio sync events
 */
interface AudioSyncEvents {
    [key: string]: unknown;
    'buffer.ready': {
        audio: Float32Array;
    };
    'playback.start': Record<string, never>;
    'playback.end': Record<string, never>;
    'sync.drift': {
        driftMs: number;
    };
}
/**
 * Audio sync configuration
 */
interface AudioSyncConfig {
    /** Target sample rate (default: 16000) */
    sampleRate?: number;
    /** Buffer size for inference (default: 16640) */
    bufferSize?: number;
    /** Overlap between buffers (default: 4160) */
    overlapSize?: number;
    /** Max drift before correction (default: 100ms) */
    maxDriftMs?: number;
}
/**
 * Audio Sync Manager
 */
declare class AudioSyncManager extends EventEmitter<AudioSyncEvents> {
    private config;
    private audioBuffer;
    private bufferPosition;
    private playbackQueue;
    private isPlaying;
    private audioContext;
    private playbackStartTime;
    private samplesPlayed;
    constructor(config?: AudioSyncConfig);
    /**
     * Initialize audio context
     */
    initialize(): Promise<void>;
    /**
     * Push audio chunk for processing and playback
     */
    pushAudio(audio: Float32Array): void;
    /**
     * Buffer audio for inference
     */
    private bufferForInference;
    /**
     * Start audio playback
     */
    private startPlayback;
    /**
     * Process playback queue
     */
    private processPlaybackQueue;
    /**
     * Check for audio/animation drift
     */
    private checkDrift;
    /**
     * Clear playback queue
     */
    clearQueue(): void;
    /**
     * Stop playback
     */
    stop(): void;
    /**
     * Get current playback position in seconds
     */
    getPlaybackPosition(): number;
    /**
     * Check if currently playing
     */
    getIsPlaying(): boolean;
    /**
     * Dispose resources
     */
    dispose(): void;
}

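A sketch of the intended flow: TTS chunks go in, playback is scheduled internally, and `buffer.ready` hands out inference-sized windows (16640 samples with 4160-sample overlap by default). `runLipSync` is a hypothetical stand-in for the LAM call:

```typescript
import { AudioSyncManager } from '@omote/core';

declare function runLipSync(audio: Float32Array): void; // hypothetical LAM hook

const sync = new AudioSyncManager({ sampleRate: 16000 });
await sync.initialize();

sync.on('buffer.ready', ({ audio }) => runLipSync(audio));
sync.on('sync.drift', ({ driftMs }) => console.warn(`audio/animation drift: ${driftMs} ms`));
sync.on('playback.end', () => console.log('TTS playback finished'));

// Call for each decoded TTS chunk (Float32Array at 16 kHz):
function onTtsChunk(chunk: Float32Array): void {
  sync.pushAudio(chunk);
}
```
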
/**
 * Interruption Handler
 *
 * VAD-based interruption detection for AI conversations:
 * - Monitors user audio for speech
 * - Detects when the user interrupts an AI response
 * - Triggers interruption callbacks
 *
 * @category AI
 */

/**
 * Interruption events
 */
interface InterruptionEvents {
    [key: string]: unknown;
    'speech.detected': {
        rms: number;
    };
    'speech.ended': {
        durationMs: number;
    };
    'interruption.triggered': {
        rms: number;
        durationMs: number;
    };
}
/**
 * Interruption handler configuration
 *
 * Industry standards applied:
 * - vadThreshold: 0.5 (Silero VAD default)
 * - minSpeechDurationMs: 200ms (Google/Amazon barge-in standard)
 * - silenceTimeoutMs: 500ms (OpenAI Realtime API standard)
 */
interface InterruptionConfig {
    /** VAD probability threshold for speech detection (default: 0.5, Silero standard) */
    vadThreshold?: number;
    /** Minimum speech duration to trigger interruption (default: 200ms, Google/Amazon standard) */
    minSpeechDurationMs?: number;
    /** Silence duration to end speech (default: 500ms, OpenAI standard) */
    silenceTimeoutMs?: number;
    /** Enable interruption detection (default: true) */
    enabled?: boolean;
}
/**
 * Interruption Handler
 */
declare class InterruptionHandler extends EventEmitter<InterruptionEvents> {
    private config;
    private isSpeaking;
    private speechStartTime;
    private lastSpeechTime;
    private silenceTimer;
    private aiIsSpeaking;
    private interruptionTriggeredThisSession;
    constructor(config?: InterruptionConfig);
    /**
     * Process a VAD result for interruption detection
     * @param vadProbability - Speech probability from VAD (0-1)
     * @param audioEnergy - Optional RMS energy for logging (default: 0)
     */
    processVADResult(vadProbability: number, audioEnergy?: number): void;
    /**
     * @deprecated Use processVADResult() instead. This method uses naive RMS detection.
     * Process audio samples for VAD (legacy - uses simple RMS)
     */
    processAudio(samples: Float32Array | Int16Array): void;
    /**
     * Notify the handler that the AI started or stopped speaking
     */
    setAISpeaking(speaking: boolean): void;
    /**
     * Enable/disable interruption detection
     */
    setEnabled(enabled: boolean): void;
    /**
     * Update configuration
     */
    updateConfig(config: Partial<InterruptionConfig>): void;
    /**
     * Reset state
     */
    reset(): void;
    /**
     * Get current state
     */
    getState(): {
        isSpeaking: boolean;
        speechDurationMs: number;
    };
    private calculateRMS;
    private onSpeechDetected;
    private onSilenceDetected;
}

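A sketch of barge-in detection that pipes Silero probabilities into the handler. Chunk delivery is again assumed to come from the host audio pipeline:

```typescript
import { InterruptionHandler, createSileroVAD } from '@omote/core';

const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
await vad.load();

const interruptions = new InterruptionHandler({ minSpeechDurationMs: 200 });
interruptions.on('interruption.triggered', ({ durationMs }) => {
  console.log(`user barged in after ${durationMs} ms of sustained speech`);
  // e.g. adapter.interrupt() and stop local TTS playback here.
});

// Only speech during AI playback should count as an interruption,
// so keep the handler informed of the AI's speaking state:
interruptions.setAISpeaking(true);

async function onChunk(chunk: Float32Array): Promise<void> {
  const result = await vad.process(chunk);
  interruptions.processVADResult(result.probability);
}
```
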
/**
 * Model Cache
 *
 * Caches ONNX models in IndexedDB for faster subsequent loads.
 * IndexedDB can handle large files (hundreds of MB), unlike localStorage.
 *
 * @category Cache
 */
/**
 * Configuration for cache size limits and eviction behavior
 */
interface CacheConfig {
    /** Maximum total cache size in bytes (default: 1GB) */
    maxSizeBytes?: number;
    /** Maximum age in milliseconds before eviction (default: none) */
    maxAgeMs?: number;
    /** Callback when storage quota exceeds the warning threshold */
    onQuotaWarning?: (info: QuotaInfo) => void;
}
/**
 * Storage quota information
 */
interface QuotaInfo {
    /** Total bytes used across all origins */
    usedBytes: number;
    /** Total available quota in bytes */
    quotaBytes: number;
    /** Percentage of quota used (0-100) */
    percentUsed: number;
    /** Bytes used by the omote cache specifically */
    cacheBytes: number;
}
/**
 * Configure cache size limits and eviction behavior
 *
 * @param config - Cache configuration options
 *
 * @example
 * ```typescript
 * import { configureCacheLimit } from '@omote/core';
 *
 * // Set 500MB limit with 24-hour max age
 * configureCacheLimit({
 *   maxSizeBytes: 500 * 1024 * 1024,
 *   maxAgeMs: 24 * 60 * 60 * 1000,
 *   onQuotaWarning: (info) => {
 *     console.warn(`Storage ${info.percentUsed.toFixed(1)}% used`);
 *   }
 * });
 * ```
 */
declare function configureCacheLimit(config: CacheConfig): void;
/**
 * Get current cache configuration
 */
declare function getCacheConfig(): CacheConfig;
/**
 * Result from the getWithValidation() method
 */
interface ValidationResult {
    /** The cached data, or null if not found */
    data: ArrayBuffer | null;
    /** True if the cached data is stale (etag mismatch) */
    stale: boolean;
}
/**
 * Generate a version-aware cache key
 *
 * @param url - The model URL
 * @param version - Optional version string
 * @returns The cache key (url#vX.X.X if version provided, url otherwise)
 *
 * @example
 * ```typescript
 * getCacheKey('http://example.com/model.onnx', '1.0.0')
 * // Returns: 'http://example.com/model.onnx#v1.0.0'
 *
 * getCacheKey('http://example.com/model.onnx')
 * // Returns: 'http://example.com/model.onnx'
 * ```
 */
declare function getCacheKey(url: string, version?: string): string;
interface CacheStats {
    totalSize: number;
    modelCount: number;
    models: {
        url: string;
        size: number;
        cachedAt: Date;
    }[];
}
/**
 * ModelCache - IndexedDB-based cache for ONNX models
 */
declare class ModelCache {
    private db;
    private dbPromise;
    /**
     * Initialize the cache database
     */
    private getDB;
    /**
     * Check if a model is cached
     */
    has(url: string): Promise<boolean>;
    /**
     * Get a cached model
     *
     * Updates the lastAccessedAt timestamp for LRU tracking on a cache hit.
     */
    get(url: string): Promise<ArrayBuffer | null>;
    /**
     * Get a cached model with ETag validation
     *
     * Validates the cached data against the server's current ETag.
     * If the cached ETag differs from the server's, the data is marked as stale.
     *
     * @param url - The cache key
     * @param originalUrl - The original URL for the HEAD request (if different from the cache key)
     * @returns ValidationResult with data and stale flag
     *
     * @example
     * ```typescript
     * const result = await cache.getWithValidation('http://example.com/model.onnx');
     * if (result.data && !result.stale) {
     *   // Use cached data
     * } else if (result.stale) {
     *   // Refetch and update cache
     * }
     * ```
     */
    getWithValidation(url: string, originalUrl?: string): Promise<ValidationResult>;
    /**
     * Store a model in cache
     *
3016
|
+
* After storing, triggers LRU eviction if cache exceeds size limit.
|
|
3017
|
+
*
|
|
3018
|
+
* @param url - The cache key (use getCacheKey() for versioned keys)
|
|
3019
|
+
* @param data - The model data
|
|
3020
|
+
* @param etag - Optional ETag for staleness validation
|
|
3021
|
+
* @param version - Optional version string for metadata
|
|
3022
|
+
*/
|
|
3023
|
+
set(url: string, data: ArrayBuffer, etag?: string, version?: string): Promise<void>;
|
|
3024
|
+
/**
|
|
3025
|
+
* Check storage quota and trigger warnings/cleanup as needed
|
|
3026
|
+
*
|
|
3027
|
+
* - Logs warning if quota > 90% used
|
|
3028
|
+
* - Triggers LRU cleanup if quota > 95% used
|
|
3029
|
+
* - Calls onQuotaWarning callback if configured
|
|
3030
|
+
*/
|
|
3031
|
+
private checkQuota;
|
|
3032
|
+
/**
|
|
3033
|
+
* Delete a cached model
|
|
3034
|
+
*/
|
|
3035
|
+
delete(url: string): Promise<void>;
|
|
3036
|
+
/**
|
|
3037
|
+
* Clear all cached models
|
|
3038
|
+
*/
|
|
3039
|
+
clear(): Promise<void>;
|
|
3040
|
+
/**
|
|
3041
|
+
* Get cache statistics
|
|
3042
|
+
*/
|
|
3043
|
+
getStats(): Promise<CacheStats>;
|
|
3044
|
+
/**
|
|
3045
|
+
* Enforce cache size limit by evicting oldest entries (LRU)
|
|
3046
|
+
*
|
|
3047
|
+
* Called automatically after each set() operation.
|
|
3048
|
+
* Can also be called manually to trigger cleanup.
|
|
3049
|
+
*/
|
|
3050
|
+
enforceLimit(): Promise<void>;
|
|
3051
|
+
/**
|
|
3052
|
+
* Evict oldest entries (by lastAccessedAt) to free space
|
|
3053
|
+
*
|
|
3054
|
+
* @param bytesToFree - Minimum bytes to free
|
|
3055
|
+
* @returns List of evicted URLs
|
|
3056
|
+
*
|
|
3057
|
+
* @example
|
|
3058
|
+
* ```typescript
|
|
3059
|
+
* const cache = getModelCache();
|
|
3060
|
+
* const evicted = await cache.evictOldest(100 * 1024 * 1024); // Free 100MB
|
|
3061
|
+
* console.log('Evicted:', evicted);
|
|
3062
|
+
* ```
|
|
3063
|
+
*/
|
|
3064
|
+
evictOldest(bytesToFree: number): Promise<string[]>;
|
|
3065
|
+
/**
|
|
3066
|
+
* Get storage quota information
|
|
3067
|
+
*
|
|
3068
|
+
* Uses navigator.storage.estimate() to get quota details.
|
|
3069
|
+
* Returns null if the API is unavailable.
|
|
3070
|
+
*
|
|
3071
|
+
* @returns Quota info or null if unavailable
|
|
3072
|
+
*
|
|
3073
|
+
* @example
|
|
3074
|
+
* ```typescript
|
|
3075
|
+
* const cache = getModelCache();
|
|
3076
|
+
* const quota = await cache.getQuotaInfo();
|
|
3077
|
+
* if (quota) {
|
|
3078
|
+
* console.log(`Using ${quota.percentUsed.toFixed(1)}% of quota`);
|
|
3079
|
+
* }
|
|
3080
|
+
* ```
|
|
3081
|
+
*/
|
|
3082
|
+
getQuotaInfo(): Promise<QuotaInfo | null>;
|
|
3083
|
+
}
|
|
3084
|
+
/**
|
|
3085
|
+
* Get the global ModelCache instance
|
|
3086
|
+
*/
|
|
3087
|
+
declare function getModelCache(): ModelCache;
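/**
 * Usage sketch (illustrative, not part of the declarations): inspecting the
 * global cache with getStats() and formatBytes() (declared below). Assumes a
 * context where top-level await is available.
 *
 * ```typescript
 * const cache = getModelCache();
 * const stats = await cache.getStats();
 * console.log(`${stats.modelCount} models, ${formatBytes(stats.totalSize)} total`);
 * for (const model of stats.models) {
 *     console.log(`${model.url}: ${formatBytes(model.size)} (cached ${model.cachedAt.toISOString()})`);
 * }
 * ```
 */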
/**
 * Options for fetchWithCache
 */
interface FetchWithCacheOptions {
    /** Optional version string for versioned caching */
    version?: string;
    /** If true, validates cached data against server ETag and refetches if stale */
    validateStale?: boolean;
    /** Progress callback during download */
    onProgress?: (loaded: number, total: number) => void;
}
/**
 * Fetch a model with caching.
 * Uses IndexedDB cache with network fallback.
 * Files larger than 500MB are not cached to IndexedDB to avoid memory pressure
 * (structured clone during IndexedDB write temporarily doubles memory usage).
 *
 * @param url - The URL to fetch
 * @param optionsOrProgress - Options object, or a progress callback (legacy signature)
 * @returns The fetched ArrayBuffer
 *
 * @example
 * ```typescript
 * // Simple usage (backwards compatible)
 * const data = await fetchWithCache('http://example.com/model.onnx');
 *
 * // With progress callback (backwards compatible)
 * const data = await fetchWithCache('http://example.com/model.onnx', (loaded, total) => {
 *     console.log(`${loaded}/${total} bytes`);
 * });
 *
 * // With options (new API)
 * const data = await fetchWithCache('http://example.com/model.onnx', {
 *     version: '1.0.0',
 *     validateStale: true,
 *     onProgress: (loaded, total) => console.log(`${loaded}/${total}`)
 * });
 * ```
 */
declare function fetchWithCache(url: string, optionsOrProgress?: FetchWithCacheOptions | ((loaded: number, total: number) => void)): Promise<ArrayBuffer>;
/**
 * Preload models into cache without creating sessions
 */
declare function preloadModels(urls: string[], onProgress?: (current: number, total: number, url: string) => void): Promise<void>;
/**
 * Format bytes as human-readable string
 */
declare function formatBytes(bytes: number): string;
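/**
 * Usage sketch (illustrative): warming the cache at startup with
 * preloadModels(), then loading through fetchWithCache(). The model URLs are
 * placeholders, not real endpoints.
 *
 * ```typescript
 * await preloadModels(
 *     ['https://example.com/models/vad.onnx', 'https://example.com/models/lam.onnx'],
 *     (current, total, url) => console.log(`(${current}/${total}) ${url}`)
 * );
 *
 * // Subsequent loads are served from IndexedDB instead of the network
 * const data = await fetchWithCache('https://example.com/models/vad.onnx');
 * console.log(`Loaded ${formatBytes(data.byteLength)}`);
 * ```
 */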

/**
 * HuggingFace CDN Utilities
 *
 * Helper functions for working with HuggingFace CDN URLs.
 * Used by transformers.js models (Whisper, etc.) for model downloads.
 *
 * @category Cache
 */
/**
 * Test URL for HuggingFace CDN reachability check.
 * Uses a small, stable file from a well-known public model.
 */
declare const HF_CDN_TEST_URL = "https://huggingface.co/Xenova/whisper-tiny/resolve/main/config.json";
/**
 * Parsed HuggingFace URL components
 */
interface HuggingFaceUrlInfo {
    /** Organization or username */
    org: string;
    /** Model name */
    model: string;
    /** Branch, tag, or commit */
    branch: string;
    /** File path within the repository */
    file: string;
}
/**
 * Parse a HuggingFace CDN URL into its components
 *
 * @param url - The HuggingFace URL to parse
 * @returns Parsed URL info or null if not a valid HF URL
 *
 * @example
 * ```typescript
 * const info = parseHuggingFaceUrl(
 *     'https://huggingface.co/openai/whisper-tiny/resolve/main/model.onnx'
 * );
 * // Returns: { org: 'openai', model: 'whisper-tiny', branch: 'main', file: 'model.onnx' }
 * ```
 */
declare function parseHuggingFaceUrl(url: string): HuggingFaceUrlInfo | null;
/**
 * Check if HuggingFace CDN is reachable
 *
 * Performs a HEAD request to a known HuggingFace model file to verify
 * connectivity. Useful for offline detection or network diagnostics.
 *
 * @param testUrl - Optional custom URL to test (defaults to HF_CDN_TEST_URL)
 * @returns True if CDN is reachable, false otherwise
 *
 * @example
 * ```typescript
 * import { isHuggingFaceCDNReachable } from '@omote/core';
 *
 * const reachable = await isHuggingFaceCDNReachable();
 * if (!reachable) {
 *     console.log('HuggingFace CDN unreachable - running offline?');
 *     // Fall back to cached models or show error
 * }
 * ```
 */
declare function isHuggingFaceCDNReachable(testUrl?: string): Promise<boolean>;

/**
 * Utility to clear transformers.js Cache API storage
 *
 * Problem: transformers.js v4 uses the Browser Cache API, which persists across hard refreshes.
 * If an HTML error page gets cached (due to network errors, CDN issues, or dev server restarts),
 * it will be served instead of JSON files, causing JSON.parse() errors.
 *
 * Solution: Manually clear Cache API storage before loading models.
 *
 * @module utils/transformersCacheClear
 */
/**
 * Clear all transformers.js and HuggingFace caches from the Browser Cache API
 *
 * This clears:
 * - transformers-cache (default cache key)
 * - Any caches with 'transformers' or 'huggingface' in the name
 *
 * @param options Configuration options
 * @returns Promise resolving to array of deleted cache names
 */
declare function clearTransformersCache(options?: {
    /** Whether to log deletion details (default: true) */
    verbose?: boolean;
    /** Additional cache name patterns to clear (e.g., ['my-custom-cache']) */
    additionalPatterns?: string[];
}): Promise<string[]>;
/**
 * Clear a specific cache by exact name
 *
 * @param cacheName Exact cache name to delete
 * @returns Promise resolving to true if deleted, false otherwise
 */
declare function clearSpecificCache(cacheName: string): Promise<boolean>;
/**
 * List all cache names currently stored
 *
 * @returns Promise resolving to array of cache names
 */
declare function listCaches(): Promise<string[]>;
/**
 * Check if a specific cached response is valid JSON/binary (not an HTML error page)
 *
 * @param cacheName Cache name to check
 * @param requestUrl URL/key to check
 * @returns Promise resolving to validation result
 */
declare function validateCachedResponse(cacheName: string, requestUrl: string): Promise<{
    exists: boolean;
    valid: boolean;
    contentType: string | null;
    isHtml: boolean;
    reason?: string;
}>;
/**
 * Scan all caches for potentially invalid cached responses
 *
 * @returns Promise resolving to report of invalid entries
 */
declare function scanForInvalidCaches(): Promise<{
    totalCaches: number;
    scannedEntries: number;
    invalidEntries: Array<{
        cacheName: string;
        url: string;
        reason: string;
    }>;
}>;
/**
 * Clear all caches and optionally prevent re-creation (development mode)
 *
 * WARNING: This is aggressive and should only be used in development.
 * It clears ALL browser caches, not just transformers.js.
 *
 * @param preventRecreation If true, sets env.useBrowserCache = false
 * @returns Promise resolving to number of deleted caches
 */
declare function nukeBrowserCaches(preventRecreation?: boolean): Promise<number>;
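/**
 * Usage sketch (illustrative): detecting and clearing poisoned Cache API
 * entries before loading models, using only the functions declared above.
 *
 * ```typescript
 * const report = await scanForInvalidCaches();
 * if (report.invalidEntries.length > 0) {
 *     const deleted = await clearTransformersCache({ verbose: true });
 *     console.warn('Cleared caches:', deleted);
 * }
 * console.log('Remaining caches:', await listCaches());
 * ```
 */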

/**
 * Logging types for Omote SDK
 *
 * 6-level logging system with structured output:
 * - error: Critical failures that prevent operation
 * - warn: Recoverable issues or degraded performance
 * - info: Key lifecycle events (model loaded, inference complete)
 * - debug: Detailed operational info for development
 * - trace: Fine-grained tracing for performance analysis
 * - verbose: Extremely detailed output (tensor shapes, intermediate values)
 */
type LogLevel = 'error' | 'warn' | 'info' | 'debug' | 'trace' | 'verbose';
/**
 * Numeric priority for log levels (lower = more severe)
 */
declare const LOG_LEVEL_PRIORITY: Record<LogLevel, number>;
/**
 * Structured log entry
 */
interface LogEntry {
    /** Unix timestamp in milliseconds */
    timestamp: number;
    /** Log level */
    level: LogLevel;
    /** Module name (e.g., 'LocalInference', 'ModelCache') */
    module: string;
    /** Human-readable message */
    message: string;
    /** Optional structured data */
    data?: Record<string, unknown>;
    /** Optional error object */
    error?: Error;
}
/**
 * Log output sink interface
 */
interface LogSink {
    (entry: LogEntry): void;
}
/**
 * Log formatter interface
 */
interface LogFormatter {
    (entry: LogEntry): string;
}
/**
 * Global logging configuration
 */
interface LoggingConfig {
    /** Minimum log level to output (default: 'info') */
    level: LogLevel;
    /** Enable/disable logging globally (default: true) */
    enabled: boolean;
    /** Output format: 'json' for structured, 'pretty' for human-readable */
    format: 'json' | 'pretty';
    /** Custom output sink (default: console) */
    sink?: LogSink;
    /** Include timestamps in output (default: true) */
    timestamps?: boolean;
    /** Include module name in output (default: true) */
    includeModule?: boolean;
}
/**
 * Logger interface for module-specific logging
 */
interface ILogger {
    error(message: string, data?: Record<string, unknown>): void;
    warn(message: string, data?: Record<string, unknown>): void;
    info(message: string, data?: Record<string, unknown>): void;
    debug(message: string, data?: Record<string, unknown>): void;
    trace(message: string, data?: Record<string, unknown>): void;
    verbose(message: string, data?: Record<string, unknown>): void;
    /** Create a child logger with a sub-module name */
    child(subModule: string): ILogger;
    /** Get the module name for this logger */
    readonly module: string;
}
/**
 * Default configuration
 */
declare const DEFAULT_LOGGING_CONFIG: LoggingConfig;
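/**
 * Usage sketch (illustrative): routing structured entries to a custom sink
 * instead of the console. The in-memory buffer is an assumption for the
 * example; any (entry: LogEntry) => void function works as a sink.
 *
 * ```typescript
 * const buffer: string[] = [];
 * configureLogging({
 *     level: 'debug',
 *     format: 'json',
 *     sink: (entry) => {
 *         buffer.push(JSON.stringify({ ...entry, error: entry.error?.message }));
 *     },
 * });
 * ```
 */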

/**
 * Omote SDK Logger
 *
 * Unified logging system with:
 * - 6 log levels (error, warn, info, debug, trace, verbose)
 * - Structured JSON output for machine parsing
 * - Pretty output for human readability
 * - Module-based child loggers
 * - Runtime configuration
 * - Browser and Node.js compatible
 */

/**
 * Configure global logging settings
 */
declare function configureLogging(config: Partial<LoggingConfig>): void;
/**
 * Get current logging configuration
 */
declare function getLoggingConfig(): LoggingConfig;
/**
 * Reset logging configuration to defaults
 */
declare function resetLoggingConfig(): void;
/**
 * Set log level at runtime
 */
declare function setLogLevel(level: LogLevel): void;
/**
 * Enable or disable logging
 */
declare function setLoggingEnabled(enabled: boolean): void;
/**
 * Create a logger for a specific module
 *
 * @param module - Module name (e.g., 'LocalInference', 'ModelCache')
 * @returns Logger instance
 *
 * @example
 * ```typescript
 * const logger = createLogger('LocalInference');
 * logger.info('Model loaded', { backend: 'webgpu', loadTimeMs: 1234 });
 * ```
 */
declare function createLogger(module: string): ILogger;
/**
 * No-op logger for when logging is completely disabled
 */
declare const noopLogger: ILogger;
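/**
 * Usage sketch (illustrative): child loggers for sub-modules. How the child
 * name is joined to the parent's (e.g., 'Pipeline.VAD') is an assumption;
 * only child() and the level methods declared above are relied on.
 *
 * ```typescript
 * const log = createLogger('Pipeline');
 * const vadLog = log.child('VAD');
 *
 * setLogLevel('trace');
 * vadLog.trace('frame processed', { probability: 0.93 });
 * ```
 */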

/**
 * Telemetry Types
 *
 * Configuration and type definitions for OpenTelemetry instrumentation.
 *
 * @category Telemetry
 */
/**
 * Supported telemetry exporters
 */
type TelemetryExporter = 'console' | 'otlp' | 'none';
/**
 * Sampling configuration
 */
interface SamplingConfig {
    /** Sampling ratio (0.0 - 1.0). Default: 1.0 (sample everything) */
    ratio?: number;
    /** Always sample errors regardless of ratio */
    alwaysSampleErrors?: boolean;
}
/**
 * OTLP exporter configuration
 */
interface OTLPExporterConfig {
    /** OTLP endpoint URL (e.g., 'https://tempo.example.com/v1/traces') */
    endpoint: string;
    /** Optional headers for authentication */
    headers?: Record<string, string>;
    /** Request timeout in ms. Default: 10000 */
    timeoutMs?: number;
}
/**
 * Main telemetry configuration
 */
interface TelemetryConfig {
    /** Enable/disable telemetry. Default: false */
    enabled?: boolean;
    /** Service name for spans. Default: 'omote-sdk' */
    serviceName?: string;
    /** Service version. Default: SDK version */
    serviceVersion?: string;
    /** Exporter type. Default: 'none' */
    exporter?: TelemetryExporter;
    /** OTLP exporter config (required if exporter is 'otlp') */
    exporterConfig?: OTLPExporterConfig;
    /** Sampling configuration */
    sampling?: SamplingConfig;
    /** Enable metrics collection. Default: true when telemetry enabled */
    metricsEnabled?: boolean;
    /** Metrics export interval in ms. Default: 60000 */
    metricsIntervalMs?: number;
}
/**
 * Span attributes for model operations
 */
interface ModelSpanAttributes {
    /** Model URL or identifier */
    'model.url'?: string;
    /** Model name (e.g., 'whisper', 'lam', 'silero-vad') */
    'model.name'?: string;
    /** Inference backend used */
    'model.backend'?: 'webgpu' | 'wasm';
    /** Whether model was loaded from cache */
    'model.cached'?: boolean;
    /** Model size in bytes */
    'model.size_bytes'?: number;
}
/**
 * Span attributes for inference operations
 */
interface InferenceSpanAttributes extends ModelSpanAttributes {
    /** Number of input audio samples */
    'inference.input_samples'?: number;
    /** Input duration in ms */
    'inference.input_duration_ms'?: number;
    /** Number of output frames (for LAM) */
    'inference.output_frames'?: number;
    /** Inference duration in ms */
    'inference.duration_ms'?: number;
    /** Whether inference succeeded */
    'inference.success'?: boolean;
    /** Error type if failed */
    'inference.error_type'?: string;
}
/**
 * Span attributes for cache operations
 */
interface CacheSpanAttributes {
    /** Cache key (URL) */
    'cache.key'?: string;
    /** Whether it was a cache hit */
    'cache.hit'?: boolean;
    /** Size of cached item in bytes */
    'cache.size_bytes'?: number;
    /** Cache operation type */
    'cache.operation'?: 'get' | 'set' | 'delete';
}
/**
 * Combined span attributes type
 */
type SpanAttributes = ModelSpanAttributes | InferenceSpanAttributes | CacheSpanAttributes | Record<string, string | number | boolean | undefined>;
/**
 * Metric names used by the SDK
 */
declare const MetricNames: {
    /** Histogram: Inference latency in ms */
    readonly INFERENCE_LATENCY: "omote.inference.latency";
    /** Histogram: Model load time in ms */
    readonly MODEL_LOAD_TIME: "omote.model.load_time";
    /** Counter: Total inference operations */
    readonly INFERENCE_TOTAL: "omote.inference.total";
    /** Counter: Total errors */
    readonly ERRORS_TOTAL: "omote.errors.total";
    /** Counter: Cache hits */
    readonly CACHE_HITS: "omote.cache.hits";
    /** Counter: Cache misses */
    readonly CACHE_MISSES: "omote.cache.misses";
};
/**
 * Histogram buckets for inference latency (ms)
 */
declare const INFERENCE_LATENCY_BUCKETS: number[];
/**
 * Histogram buckets for model load time (ms)
 */
declare const MODEL_LOAD_TIME_BUCKETS: number[];

/**
 * Omote Telemetry
 *
 * Main orchestrator for SDK telemetry. Manages spans, metrics, and exporters.
 *
 * @category Telemetry
 */

/**
 * Span context for tracing
 */
interface SpanContext {
    traceId: string;
    spanId: string;
    parentSpanId?: string;
}
/**
 * Active span handle returned by startSpan
 */
interface ActiveSpan {
    /** End the span with success status */
    end(): void;
    /** End the span with error status */
    endWithError(error: Error): void;
    /** Add attributes to the span */
    setAttributes(attrs: Partial<SpanAttributes>): void;
    /** Get the span context */
    getContext(): SpanContext;
}
/**
 * Configure global telemetry
 *
 * @example
 * ```typescript
 * // Development
 * configureTelemetry({
 *     enabled: true,
 *     serviceName: 'omote-dev',
 *     exporter: 'console',
 * });
 *
 * // Production
 * configureTelemetry({
 *     enabled: true,
 *     serviceName: 'omote-prod',
 *     exporter: 'otlp',
 *     exporterConfig: {
 *         endpoint: 'https://tempo.example.com',
 *     },
 *     sampling: { ratio: 0.1 },
 * });
 * ```
 */
declare function configureTelemetry(config: TelemetryConfig): OmoteTelemetry;
/**
 * Get the global telemetry instance
 */
declare function getTelemetry(): OmoteTelemetry | null;
/**
 * Main telemetry class
 *
 * Manages spans, metrics, and exports to configured backends.
 */
declare class OmoteTelemetry {
    private config;
    private exporter;
    private activeTraceId;
    private metricsIntervalId;
    private counters;
    private histograms;
    constructor(config: TelemetryConfig);
    /**
     * Initialize the configured exporter
     */
    private initExporter;
    /**
     * Start periodic metrics collection
     */
    private startMetricsCollection;
    /**
     * Check if this operation should be sampled
     */
    private shouldSample;
    /**
     * Start a new span
     *
     * @example
     * ```typescript
     * const span = telemetry.startSpan('Wav2Vec2.infer', {
     *     'inference.input_samples': samples.length,
     *     'model.backend': 'webgpu',
     * });
     *
     * try {
     *     const result = await doInference();
     *     span.setAttributes({ 'inference.output_frames': result.frames });
     *     span.end();
     * } catch (error) {
     *     span.endWithError(error);
     * }
     * ```
     */
    startSpan(name: string, attributes?: Partial<SpanAttributes>, parentContext?: SpanContext): ActiveSpan;
    /**
     * Wrap an async function with a span
     *
     * @example
     * ```typescript
     * const result = await telemetry.withSpan('Model.load', async (span) => {
     *     const model = await loadModel();
     *     span.setAttributes({ 'model.size_bytes': model.size });
     *     return model;
     * });
     * ```
     */
    withSpan<T>(name: string, fn: (span: ActiveSpan) => Promise<T>, attributes?: Partial<SpanAttributes>, parentContext?: SpanContext): Promise<T>;
    /**
     * Increment a counter metric
     *
     * @example
     * ```typescript
     * telemetry.incrementCounter('omote.inference.total', 1, {
     *     model: 'wav2vec2',
     *     backend: 'webgpu',
     *     status: 'success',
     * });
     * ```
     */
    incrementCounter(name: string, value?: number, attributes?: Record<string, string | number | boolean>): void;
    /**
     * Record a histogram value
     *
     * @example
     * ```typescript
     * telemetry.recordHistogram('omote.inference.latency', durationMs, {
     *     model: 'wav2vec2',
     *     backend: 'webgpu',
     * });
     * ```
     */
    recordHistogram(name: string, value: number, attributes?: Record<string, string | number | boolean>): void;
    /**
     * Generate unique key for metric with attributes
     */
    private getMetricKey;
    /**
     * Flush accumulated metrics to exporter
     */
    private flushMetrics;
    /**
     * Force flush all pending data
     */
    flush(): Promise<void>;
    /**
     * Shutdown telemetry
     */
    shutdown(): Promise<void>;
    /**
     * Check if telemetry is enabled
     */
    isEnabled(): boolean;
    /**
     * Get current configuration
     */
    getConfig(): TelemetryConfig;
}

/**
 * Console Exporter
 *
 * Exports telemetry data to the browser console for development/debugging.
 *
 * @category Telemetry
 */

/**
 * Span data structure for export
 */
interface SpanData {
    name: string;
    traceId: string;
    spanId: string;
    parentSpanId?: string;
    startTime: number;
    endTime: number;
    durationMs: number;
    status: 'ok' | 'error';
    attributes: SpanAttributes;
    error?: Error;
}
/**
 * Metric data structure for export
 */
interface MetricData {
    name: string;
    type: 'counter' | 'histogram';
    value: number;
    attributes: Record<string, string | number | boolean>;
    timestamp: number;
}
/**
 * Exporter interface that all exporters must implement
 */
interface TelemetryExporterInterface {
    /** Export a completed span */
    exportSpan(span: SpanData): void;
    /** Export a metric */
    exportMetric(metric: MetricData): void;
    /** Flush any buffered data */
    flush(): Promise<void>;
    /** Shutdown the exporter */
    shutdown(): Promise<void>;
}
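/**
 * Sketch (illustrative): a minimal in-memory implementation of
 * TelemetryExporterInterface. Note that configureTelemetry() only wires the
 * built-in 'console' and 'otlp' exporters, so a custom exporter like this
 * would be driven directly by your own code.
 *
 * ```typescript
 * class MemoryExporter implements TelemetryExporterInterface {
 *     spans: SpanData[] = [];
 *     metrics: MetricData[] = [];
 *     exportSpan(span: SpanData): void { this.spans.push(span); }
 *     exportMetric(metric: MetricData): void { this.metrics.push(metric); }
 *     async flush(): Promise<void> {}
 *     async shutdown(): Promise<void> {}
 * }
 * ```
 */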
/**
 * Console exporter for development/debugging
 *
 * Outputs spans and metrics to the browser console with formatting.
 */
declare class ConsoleExporter implements TelemetryExporterInterface {
    private enabled;
    private prefix;
    constructor(options?: {
        enabled?: boolean;
        prefix?: string;
    });
    exportSpan(span: SpanData): void;
    exportMetric(metric: MetricData): void;
    flush(): Promise<void>;
    shutdown(): Promise<void>;
}

/**
 * OTLP Exporter
 *
 * Exports telemetry data to OTLP-compatible backends (Jaeger, Tempo, etc.)
 * using the OTLP/HTTP JSON protocol.
 *
 * @category Telemetry
 */

/**
 * OTLP exporter for production telemetry
 *
 * Sends spans and metrics to OTLP-compatible backends like:
 * - Jaeger
 * - Grafana Tempo
 * - Honeycomb
 * - Datadog
 * - AWS X-Ray (with collector)
 */
declare class OTLPExporter implements TelemetryExporterInterface {
    private config;
    private serviceName;
    private serviceVersion;
    private spanBuffer;
    private metricBuffer;
    private flushIntervalId;
    private readonly BUFFER_SIZE;
    private readonly FLUSH_INTERVAL_MS;
    private isShutdown;
    constructor(config: OTLPExporterConfig, serviceName?: string, serviceVersion?: string);
    exportSpan(span: SpanData): void;
    exportMetric(metric: MetricData): void;
    flush(): Promise<void>;
    shutdown(): Promise<void>;
    private exportSpans;
    private exportMetrics;
    private sendRequest;
}

/**
 * Animation Graph Types
 *
 * Renderer-agnostic animation state machine with emotion and audio-driven blending.
 *
 * @module animation
 */
/**
 * Emotion labels for animation blending
 * Note: These are the 8 emotion categories used for animation, separate from the
 * internal EmotionName type used by EmotionController.
 */
type EmotionLabel = 'angry' | 'calm' | 'disgust' | 'fearful' | 'happy' | 'neutral' | 'sad' | 'surprised';
/**
 * High-level animation states
 */
type AnimationStateName = 'idle' | 'listening' | 'thinking' | 'speaking';
/**
 * Events that trigger state transitions
 */
type AnimationTrigger = 'user_speech_start' | 'user_speech_end' | 'transcript_ready' | 'ai_response_start' | 'ai_audio_start' | 'ai_response_end' | 'timeout' | 'interrupt';
/**
 * Animation layer types for blending
 */
type AnimationLayer = 'base' | 'emotion' | 'gesture' | 'additive';
/**
 * A single animation clip reference
 */
interface AnimationClip {
    /** Unique identifier for the clip */
    name: string;
    /** Animation layer this clip belongs to */
    layer: AnimationLayer;
    /** Whether this clip loops */
    loop: boolean;
    /** Default duration in seconds (can be overridden by actual clip) */
    duration?: number;
}
/**
 * Blend weight for an animation clip
 */
interface BlendWeight {
    /** Clip name */
    clip: string;
    /** Weight 0-1 */
    weight: number;
    /** Playback speed multiplier */
    speed: number;
    /** Current time in the animation (0-1 normalized) */
    time: number;
}
/**
 * Animation state definition
 */
interface AnimationState {
    /** State name */
    name: AnimationStateName;
    /** Base animation clips for this state */
    baseClips: string[];
    /** Blend weights for base clips */
    baseWeights: number[];
    /** Whether emotion overlay is enabled in this state */
    emotionBlendEnabled: boolean;
    /** Whether gesture layer is enabled in this state */
    gestureBlendEnabled: boolean;
    /** Timeout in ms to auto-transition (0 = no timeout) */
    timeout: number;
    /** State to transition to on timeout */
    timeoutTarget?: AnimationStateName;
}
/**
 * Transition between states
 */
interface Transition {
    /** Source state */
    from: AnimationStateName;
    /** Target state */
    to: AnimationStateName;
    /** Event that triggers this transition */
    trigger: AnimationTrigger;
    /** Blend duration in ms */
    duration: number;
    /** Optional condition function */
    condition?: () => boolean;
}
/**
 * Emotion to animation mapping
 */
interface EmotionAnimationMap {
    /** Emotion label */
    emotion: EmotionLabel;
    /** Animation clip to blend */
    clip: string;
    /** Maximum blend weight for this emotion */
    maxWeight: number;
    /** Blend speed (weight change per second) */
    blendSpeed: number;
}
/**
 * Configuration for AnimationGraph
 */
interface AnimationGraphConfig {
    /** Available animation states */
    states: AnimationState[];
    /** Transitions between states */
    transitions: Transition[];
    /** Emotion to animation mappings */
    emotionMappings: EmotionAnimationMap[];
    /** Gesture clips for audio-driven animation */
    gestureClips: string[];
    /** Initial state */
    initialState: AnimationStateName;
    /** Global blend speed for state transitions (weight/sec) */
    transitionBlendSpeed: number;
    /** Minimum audio energy to trigger gestures (0-1) */
    gestureThreshold: number;
    /** Gesture intensity multiplier */
    gestureIntensity: number;
}
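/**
 * Sketch (illustrative): a hand-rolled AnimationGraphConfig. Clip names are
 * placeholders, and only two of the four states are shown; whether every
 * state must be declared is not specified here, so DEFAULT_ANIMATION_CONFIG
 * (below) is the safer starting point.
 *
 * ```typescript
 * const config: AnimationGraphConfig = {
 *     states: [
 *         { name: 'idle', baseClips: ['idle_loop'], baseWeights: [1], emotionBlendEnabled: true, gestureBlendEnabled: false, timeout: 0 },
 *         { name: 'speaking', baseClips: ['talk_loop'], baseWeights: [1], emotionBlendEnabled: true, gestureBlendEnabled: true, timeout: 0 },
 *     ],
 *     transitions: [
 *         { from: 'idle', to: 'speaking', trigger: 'ai_audio_start', duration: 250 },
 *         { from: 'speaking', to: 'idle', trigger: 'ai_response_end', duration: 400 },
 *     ],
 *     emotionMappings: [
 *         { emotion: 'happy', clip: 'smile_overlay', maxWeight: 0.8, blendSpeed: 2 },
 *     ],
 *     gestureClips: ['gesture_beat'],
 *     initialState: 'idle',
 *     transitionBlendSpeed: 4,
 *     gestureThreshold: 0.1,
 *     gestureIntensity: 1,
 * };
 * ```
 */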
/**
 * Current output of the animation graph
 */
interface AnimationOutput {
    /** Current state name */
    state: AnimationStateName;
    /** All blend weights to apply */
    blendWeights: BlendWeight[];
    /** Active emotion (if any) */
    activeEmotion: EmotionLabel | null;
    /** Current gesture intensity (0-1) */
    gestureIntensity: number;
    /** Whether currently transitioning between states */
    isTransitioning: boolean;
    /** Transition progress (0-1) if transitioning */
    transitionProgress: number;
}
/**
 * Events emitted by AnimationGraph
 */
type AnimationGraphEvents = {
    /** State changed */
    'state.change': {
        from: AnimationStateName;
        to: AnimationStateName;
        trigger: AnimationTrigger;
    };
    /** Transition started */
    'transition.start': {
        from: AnimationStateName;
        to: AnimationStateName;
        duration: number;
    };
    /** Transition completed */
    'transition.end': {
        state: AnimationStateName;
    };
    /** Emotion changed */
    'emotion.change': {
        emotion: EmotionLabel | null;
        confidence: number;
    };
    /** Animation output updated (every frame) */
    'output.update': AnimationOutput;
    /** Index signature for EventEmitter compatibility */
    [key: string]: unknown;
};
/**
 * Default animation graph configuration
 */
declare const DEFAULT_ANIMATION_CONFIG: AnimationGraphConfig;
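/**
 * Usage sketch (illustrative): subscribing to graph events. Payload shapes
 * come from AnimationGraphEvents above; on() returns an unsubscribe function.
 *
 * ```typescript
 * const graph = new AnimationGraph(DEFAULT_ANIMATION_CONFIG);
 *
 * graph.on('state.change', ({ from, to, trigger }) => {
 *     console.log(`state: ${from} -> ${to} (${trigger})`);
 * });
 * const off = graph.on('emotion.change', ({ emotion, confidence }) => {
 *     console.log('emotion:', emotion, confidence);
 * });
 * off(); // stop listening
 * ```
 */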

/**
 * Animation Graph
 *
 * State machine for character animation with emotion and audio-driven blending.
 * Renderer-agnostic - outputs blend weights that any 3D engine can consume.
 *
 * @example
 * ```typescript
 * import { AnimationGraph, DEFAULT_ANIMATION_CONFIG } from '@omote/core';
 *
 * const graph = new AnimationGraph(DEFAULT_ANIMATION_CONFIG);
 *
 * // Connect to voice pipeline
 * graph.on('output.update', (output) => {
 *     // Apply blend weights to your 3D character
 *     for (const { clip, weight } of output.blendWeights) {
 *         mixer.getAction(clip).setEffectiveWeight(weight);
 *     }
 * });
 *
 * // Drive from voice state
 * voiceState.on('listening', () => graph.trigger('user_speech_start'));
 * voiceState.on('thinking', () => graph.trigger('transcript_ready'));
 * voiceState.on('speaking', () => graph.trigger('ai_audio_start'));
 *
 * // Drive from emotion detection
 * emotion.on('result', ({ emotion, confidence }) => {
 *     graph.setEmotion(emotion, confidence);
 * });
 *
 * // Update every frame
 * function animate(deltaTime: number) {
 *     graph.update(deltaTime);
 * }
 * ```
 *
 * @module animation
 */

/**
 * Animation state machine with smooth blending
 */
declare class AnimationGraph extends EventEmitter<AnimationGraphEvents> {
    private config;
    private currentState;
    private previousState;
    private isTransitioning;
    private transitionProgress;
    private transitionDuration;
    private transitionStartTime;
    private currentEmotion;
    private emotionConfidence;
    private emotionBlendWeight;
    private targetEmotionWeight;
    private audioEnergy;
    private gestureWeight;
    private currentGestureClip;
    private stateEnterTime;
    private lastUpdateTime;
    private cachedOutput;
    constructor(config?: Partial<AnimationGraphConfig>);
    /**
     * Get current state name
     */
    get state(): AnimationStateName;
    /**
     * Get current animation output
     */
    get output(): AnimationOutput;
    /**
     * Trigger an animation event (may cause state transition)
     */
    trigger(event: AnimationTrigger): boolean;
    /**
     * Set current emotion (from DistilHuBERT or manual)
     */
    setEmotion(emotion: EmotionLabel, confidence: number): void;
    /**
     * Clear current emotion
     */
    clearEmotion(): void;
    /**
     * Set audio energy for gesture animation (0-1)
     */
    setAudioEnergy(energy: number): void;
    /**
     * Force transition to a specific state
     */
    setState(stateName: AnimationStateName, blendDuration?: number): void;
    /**
     * Update animation graph (call every frame)
     * @param deltaMs Time since last update in milliseconds
     */
    update(deltaMs?: number): AnimationOutput;
    /**
     * Reset to initial state
     */
    reset(): void;
    /**
     * Get all clip names used by this graph
     */
    getRequiredClips(): string[];
    private startTransition;
    private updateTransition;
    private checkTimeout;
    private updateEmotionBlend;
    private updateGesture;
    private computeOutput;
}

/**
 * Audio Energy Analysis
 *
 * Utilities for extracting energy/loudness from audio for gesture animation.
 *
 * @module animation
 */
/**
 * Calculate RMS (Root Mean Square) energy from audio samples
 * @param samples Audio samples (Float32Array, normalized -1 to 1)
 * @returns RMS energy value (0 to 1)
 */
declare function calculateRMS(samples: Float32Array): number;
/**
 * Calculate peak amplitude from audio samples
 * @param samples Audio samples (Float32Array, normalized -1 to 1)
 * @returns Peak amplitude (0 to 1)
 */
declare function calculatePeak(samples: Float32Array): number;
/**
 * Smoothed energy analyzer for gesture animation
 */
declare class AudioEnergyAnalyzer {
    private smoothedRMS;
    private smoothedPeak;
    private readonly smoothingFactor;
    private readonly noiseFloor;
    /**
     * @param smoothingFactor How much to smooth (0 = no smoothing, 1 = infinite smoothing). Default 0.85
     * @param noiseFloor Minimum energy threshold to consider as signal. Default 0.01
     */
    constructor(smoothingFactor?: number, noiseFloor?: number);
    /**
     * Process audio samples and return smoothed energy values
     * @param samples Audio samples (Float32Array)
     * @returns Object with rms, peak, and energy values
     */
    process(samples: Float32Array): {
        rms: number;
        peak: number;
        energy: number;
    };
    /**
     * Reset analyzer state
     */
    reset(): void;
    /**
     * Get current smoothed RMS value
     */
    get rms(): number;
    /**
     * Get current smoothed peak value
     */
    get peak(): number;
}
/**
 * Extract emphasis points from audio (for gesture timing)
 *
 * Detects sudden increases in energy that correspond to speech emphasis.
 */
declare class EmphasisDetector {
    private energyHistory;
    private readonly historySize;
    private readonly emphasisThreshold;
    /**
     * @param historySize Number of frames to track. Default 10
     * @param emphasisThreshold Minimum energy increase to count as emphasis. Default 0.15
     */
    constructor(historySize?: number, emphasisThreshold?: number);
    /**
     * Process energy value and detect emphasis
     * @param energy Current energy value (0-1)
     * @returns Object with isEmphasis flag and emphasisStrength
     */
    process(energy: number): {
        isEmphasis: boolean;
        emphasisStrength: number;
    };
    /**
     * Reset detector state
     */
    reset(): void;
}
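/**
 * Usage sketch (illustrative): driving the gesture layer from live audio.
 * The per-frame audio tap is assumed (e.g., an AudioWorklet); only the APIs
 * declared above are used.
 *
 * ```typescript
 * const analyzer = new AudioEnergyAnalyzer(0.85, 0.01);
 * const emphasis = new EmphasisDetector();
 *
 * function onAudioFrame(samples: Float32Array, graph: AnimationGraph) {
 *     const { energy } = analyzer.process(samples);
 *     graph.setAudioEnergy(energy); // feeds the gesture layer
 *
 *     const { isEmphasis, emphasisStrength } = emphasis.process(energy);
 *     if (isEmphasis) {
 *         // e.g., time a nod or beat gesture to the stressed syllable
 *     }
 * }
 * ```
 */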

export { type AIAdapter, type AIAdapterEvents, type AISessionState, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, type AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendEvent, type BackendPreference, type BlendWeight, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, DEFAULT_ANIMATION_CONFIG, DEFAULT_LOGGING_CONFIG, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionEvent, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, type GazeEvent, HF_CDN_TEST_URL, type HuggingFaceUrlInfo, type ILogger, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, LOG_LEVEL_PRIORITY, type LogEntry, type LogFormatter, type LogLevel, type LogSink, type LoggingConfig, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, type OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, type QuotaInfo, RingBuffer, type RuntimeBackend, type STTFinalEvent, type STTPartialEvent, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SessionConfig, type SessionOptions, type SessionSnapshot, type SessionStateEvent, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TTSEndEvent, type TTSMarkEvent, type TTSStartEvent, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type TranscriptionResult, type Transition, type VADBackend, type VADModelInfo, type VADResult$1 as VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VisemeEvent, type VoiceConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, type WhisperConfig, WhisperInference, type WhisperModel, blendEmotions, calculatePeak, calculateRMS, clearSpecificCache, clearTransformersCache, configureCacheLimit, configureLogging, configureTelemetry, createEmotionVector, createLogger, createSessionWithFallback, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getLoggingConfig, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isHuggingFaceCDNReachable, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpEmotion, listCaches, noopLogger, nukeBrowserCaches, parseHuggingFaceUrl, preloadModels, resetLoggingConfig, resolveBackend, scanForInvalidCaches, setLogLevel, setLoggingEnabled, shouldEnableWasmProxy, shouldUseNativeASR, shouldUseServerLipSync, supportsVADWorker, validateCachedResponse };