@omote/core 0.1.0

@@ -0,0 +1,4173 @@
1
+ import { InferenceSession, Tensor, Env } from 'onnxruntime-common';
2
+
3
+ /**
4
+ * Type-safe event emitter for Omote core events
5
+ *
6
+ * @category Events
7
+ */
8
+ type EventCallback<T = unknown> = (data: T) => void;
9
+ declare class EventEmitter<TEvents extends {
10
+ [key: string]: unknown;
11
+ }> {
12
+ private listeners;
13
+ on<K extends keyof TEvents>(event: K, callback: EventCallback<TEvents[K]>): () => void;
14
+ off<K extends keyof TEvents>(event: K, callback: EventCallback<TEvents[K]>): void;
15
+ emit<K extends keyof TEvents>(event: K, data: TEvents[K]): void;
16
+ once<K extends keyof TEvents>(event: K, callback: EventCallback<TEvents[K]>): () => void;
17
+ removeAllListeners(event?: keyof TEvents): void;
18
+ }
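+
+ /**
+ * Example (illustrative sketch): using the typed emitter with a small custom
+ * event map. The `MyEvents` map below is invented for the example; the real
+ * core event map is `OmoteEvents`, defined next.
+ *
+ * ```typescript
+ * type MyEvents = { tick: { frame: number } };
+ *
+ * const emitter = new EventEmitter<MyEvents>();
+ *
+ * // on() returns an unsubscribe function
+ * const off = emitter.on('tick', ({ frame }) => console.log('tick', frame));
+ * emitter.emit('tick', { frame: 1 });
+ * off(); // stop listening
+ * ```
+ */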
19
+
20
+ /**
21
+ * Core Omote event types - the contract between core and renderers
22
+ *
23
+ * Renderers subscribe to these events and apply them to their specific
24
+ * rendering system (R3F, Three.js, Babylon, Unity, etc.)
25
+ */
26
+ /** Animation frame with blendshape weights */
27
+ interface AnimationEvent {
28
+ /** 52 ARKit blendshape weights (0-1 range) */
29
+ blendshapes: Float32Array;
30
+ /** Named blendshape access */
31
+ get(name: string): number;
32
+ /** Raw model output weights (for debugging) */
33
+ rawWeights?: Float32Array;
34
+ /** Timestamp in ms */
35
+ timestamp: number;
36
+ /** Inference latency in ms */
37
+ inferenceMs: number;
38
+ /** Frame index within the current batch (for LAM multi-frame output) */
39
+ frameIndex?: number;
40
+ /** Total frames in the current batch (for LAM multi-frame output) */
41
+ totalFrames?: number;
42
+ }
43
+ /** Viseme for lip sync */
44
+ interface VisemeEvent {
45
+ /** Viseme ID or phoneme */
46
+ viseme: string;
47
+ /** Weight 0-1 */
48
+ weight: number;
49
+ /** Duration in ms */
50
+ duration: number;
51
+ }
52
+ /** Emotion state change */
53
+ interface EmotionEvent {
54
+ /** Emotion weights by name */
55
+ values: Record<string, number>;
56
+ /** Transition duration in ms */
57
+ transitionMs: number;
58
+ }
59
+ /** Gaze target change */
60
+ interface GazeEvent {
61
+ /** Target type */
62
+ target: 'camera' | 'wander' | 'position';
63
+ /** Position if target is 'position' */
64
+ position?: {
65
+ x: number;
66
+ y: number;
67
+ z: number;
68
+ };
69
+ }
70
+ /** Audio playback events */
71
+ interface TTSStartEvent {
72
+ /** Audio duration in ms */
73
+ durationMs: number;
74
+ /** Text being spoken */
75
+ text: string;
76
+ }
77
+ interface TTSMarkEvent {
78
+ /** Mark name/type */
79
+ name: string;
80
+ /** Time offset in ms */
81
+ timeMs: number;
82
+ }
83
+ interface TTSEndEvent {
84
+ /** Whether playback completed normally */
85
+ completed: boolean;
86
+ }
87
+ /** STT transcription events */
88
+ interface STTPartialEvent {
89
+ /** Partial transcription */
90
+ text: string;
91
+ /** Confidence 0-1 */
92
+ confidence: number;
93
+ }
94
+ interface STTFinalEvent {
95
+ /** Final transcription */
96
+ text: string;
97
+ /** Confidence 0-1 */
98
+ confidence: number;
99
+ }
100
+ /** Session state events */
101
+ interface SessionStateEvent {
102
+ state: 'connecting' | 'connected' | 'ready' | 'streaming' | 'error' | 'disconnected';
103
+ error?: Error;
104
+ }
105
+ /** Backend info */
106
+ interface BackendEvent {
107
+ type: 'webgpu' | 'wasm' | 'remote';
108
+ modelLoaded: boolean;
109
+ loadTimeMs?: number;
110
+ }
111
+ /** AI adapter state */
112
+ type AISessionState$1 = 'idle' | 'listening' | 'thinking' | 'speaking' | 'interrupted' | 'error' | 'disconnected';
113
+ /** AI state change event */
114
+ interface AIStateChangeEvent {
115
+ state: AISessionState$1;
116
+ previousState: AISessionState$1;
117
+ }
118
+ /** User speech events */
119
+ interface UserSpeechStartEvent {
120
+ timestamp: number;
121
+ }
122
+ interface UserSpeechEndEvent {
123
+ timestamp: number;
124
+ durationMs: number;
125
+ }
126
+ interface UserTranscriptEvent {
127
+ text: string;
128
+ confidence: number;
129
+ }
130
+ /** AI response events */
131
+ interface AIThinkingStartEvent {
132
+ timestamp: number;
133
+ }
134
+ interface AIResponseStartEvent {
135
+ text?: string;
136
+ emotion?: string;
137
+ }
138
+ interface AIResponseChunkEvent {
139
+ text: string;
140
+ isLast: boolean;
141
+ }
142
+ interface AIResponseEndEvent {
143
+ fullText: string;
144
+ durationMs: number;
145
+ }
146
+ /** Audio output events (for lip sync processing) */
147
+ interface AudioOutputChunkEvent {
148
+ audio: ArrayBuffer;
149
+ sampleRate: number;
150
+ timestamp: number;
151
+ }
152
+ interface AudioOutputEndEvent {
153
+ durationMs: number;
154
+ }
155
+ /** Adapter events */
156
+ interface AdapterSwitchEvent {
157
+ from: string;
158
+ to: string;
159
+ reason: string;
160
+ }
161
+ interface AdapterFallbackEvent {
162
+ adapter: string;
163
+ reason: string;
164
+ }
165
+ interface InterruptionEvent {
166
+ timestamp: number;
167
+ action?: 'stop' | 'continue';
168
+ }
169
+ /**
170
+ * Complete event map for OmoteCore
171
+ */
172
+ type OmoteEvents = {
173
+ 'animation': AnimationEvent;
174
+ 'animation.ready': {
175
+ backend: 'webgpu' | 'wasm';
176
+ };
177
+ 'viseme': VisemeEvent;
178
+ 'emotion': EmotionEvent;
179
+ 'gaze': GazeEvent;
180
+ 'tts.start': TTSStartEvent;
181
+ 'tts.mark': TTSMarkEvent;
182
+ 'tts.end': TTSEndEvent;
183
+ 'stt.partial': STTPartialEvent;
184
+ 'stt.final': STTFinalEvent;
185
+ 'session.state': SessionStateEvent;
186
+ 'backend': BackendEvent;
187
+ 'audio.chunk': {
188
+ pcm: Int16Array;
189
+ timestamp: number;
190
+ };
191
+ 'audio.level': {
192
+ rms: number;
193
+ peak: number;
194
+ };
195
+ 'audio.output.chunk': AudioOutputChunkEvent;
196
+ 'audio.output.end': AudioOutputEndEvent;
197
+ 'ai.state.change': AIStateChangeEvent;
198
+ 'ai.thinking.start': AIThinkingStartEvent;
199
+ 'ai.response.start': AIResponseStartEvent;
200
+ 'ai.response.chunk': AIResponseChunkEvent;
201
+ 'ai.response.end': AIResponseEndEvent;
202
+ 'user.speech.start': UserSpeechStartEvent;
203
+ 'user.speech.end': UserSpeechEndEvent;
204
+ 'user.transcript.partial': UserTranscriptEvent;
205
+ 'user.transcript.final': UserTranscriptEvent;
206
+ 'adapter.switch': AdapterSwitchEvent;
207
+ 'adapter.fallback': AdapterFallbackEvent;
208
+ 'adapter.recovered': {
209
+ adapter: string;
210
+ };
211
+ 'interruption.detected': InterruptionEvent;
212
+ 'interruption.handled': InterruptionEvent;
213
+ 'memory.updated': {
214
+ messageCount: number;
215
+ tokenCount?: number;
216
+ };
217
+ 'connection.opened': {
218
+ sessionId: string;
219
+ adapter?: string;
220
+ };
221
+ 'connection.closed': {
222
+ reason: string;
223
+ };
224
+ 'connection.error': {
225
+ error: Error;
226
+ recoverable: boolean;
227
+ };
228
+ 'error': {
229
+ code: string;
230
+ message: string;
231
+ details?: unknown;
232
+ };
233
+ };
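+
+ /**
+ * Example (sketch): subscribing to OmoteEvents. Assumes an
+ * EventEmitter<OmoteEvents> instance is available; how that instance is
+ * obtained (e.g. from the core runtime) is outside this excerpt.
+ *
+ * ```typescript
+ * declare const events: EventEmitter<OmoteEvents>; // assumed to exist
+ *
+ * events.on('animation', (frame) => {
+ *   // 52 ARKit blendshape weights, with named access via get()
+ *   console.log('jawOpen', frame.get('jawOpen'), `${frame.inferenceMs}ms`);
+ * });
+ *
+ * events.once('tts.start', ({ text, durationMs }) => {
+ *   console.log(`Speaking "${text}" for ${durationMs}ms`);
+ * });
+ * ```
+ */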
234
+
235
+ /**
236
+ * Microphone capture - renderer-agnostic audio input
237
+ *
238
+ * Captures audio from the microphone and emits PCM chunks.
239
+ * Works in any JavaScript environment with the Web Audio API.
240
+ *
241
+ * @category Audio
242
+ */
243
+
244
+ interface MicrophoneCaptureConfig {
245
+ /** Target sample rate (default: 16000 for speech processing) */
246
+ sampleRate?: number;
247
+ /** Chunk size in samples (default: 1600 = 100ms at 16kHz) */
248
+ chunkSize?: number;
249
+ }
250
+ declare class MicrophoneCapture {
251
+ private events;
252
+ private config;
253
+ private stream;
254
+ private context;
255
+ private processor;
256
+ private buffer;
257
+ private _isRecording;
258
+ private _loggedFirstChunk;
259
+ constructor(events: EventEmitter<OmoteEvents>, config?: MicrophoneCaptureConfig);
260
+ get isRecording(): boolean;
261
+ get isSupported(): boolean;
262
+ start(): Promise<void>;
263
+ stop(): void;
264
+ private floatToPCM16;
265
+ }
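+
+ /**
+ * Example (sketch): capturing microphone audio. The PCM chunks are assumed to
+ * arrive via the 'audio.chunk' event from the event map above; the values
+ * shown (16 kHz, 1600-sample chunks) are the documented defaults.
+ *
+ * ```typescript
+ * const events = new EventEmitter<OmoteEvents>();
+ * const mic = new MicrophoneCapture(events, { sampleRate: 16000, chunkSize: 1600 });
+ *
+ * events.on('audio.chunk', ({ pcm, timestamp }) => {
+ *   console.log(`got ${pcm.length} Int16 samples at ${timestamp}`);
+ * });
+ *
+ * if (mic.isSupported) {
+ *   await mic.start(); // prompts for microphone permission
+ *   // ... later
+ *   mic.stop();
+ * }
+ * ```
+ */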
266
+
267
+ /**
268
+ * Ring buffer for audio sample accumulation
269
+ *
270
+ * Efficiently accumulates audio samples and provides
271
+ * contiguous buffers for inference without memory allocation churn.
272
+ *
273
+ * @category Audio
274
+ */
275
+ declare class RingBuffer {
276
+ private readonly size;
277
+ private buffer;
278
+ private writeIndex;
279
+ private isFull;
280
+ constructor(size: number);
281
+ /**
282
+ * Write samples to the ring buffer
283
+ * Converts Int16Array PCM to Float32
284
+ */
285
+ write(pcm: Int16Array): void;
286
+ /**
287
+ * Write float samples directly
288
+ */
289
+ writeFloat(samples: Float32Array): void;
290
+ /**
291
+ * Get a contiguous copy of the buffer contents in chronological order
292
+ * Returns null if buffer isn't full yet
293
+ */
294
+ read(): Float32Array | null;
295
+ /**
296
+ * Check if buffer has enough samples
297
+ */
298
+ get hasData(): boolean;
299
+ /**
300
+ * Get current fill level (0-1)
301
+ */
302
+ get fillLevel(): number;
303
+ /**
304
+ * Reset the buffer
305
+ */
306
+ reset(): void;
307
+ }
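+
+ /**
+ * Example (sketch): accumulating one second of 16 kHz audio. read() returns
+ * null until the buffer is full, so callers can gate on hasData/fillLevel.
+ * runInference() is a placeholder for whatever consumes the samples.
+ *
+ * ```typescript
+ * declare function runInference(samples: Float32Array): void; // placeholder
+ *
+ * const ring = new RingBuffer(16000); // 1s at 16kHz
+ *
+ * function onPcmChunk(pcm: Int16Array) {
+ *   ring.write(pcm); // Int16 PCM is converted to Float32 internally
+ *   if (ring.hasData) {
+ *     const samples = ring.read(); // chronological Float32Array copy
+ *     if (samples) runInference(samples);
+ *     ring.reset();
+ *   }
+ * }
+ * ```
+ */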
308
+
309
+ /**
310
+ * AudioScheduler - Enterprise-grade Web Audio API scheduling
311
+ *
312
+ * Implements the lookahead scheduling pattern from Chris Wilson's
313
+ * "A Tale of Two Clocks" - the authoritative guide on Web Audio timing.
314
+ *
315
+ * Key Features:
316
+ * - Uses AudioContext.currentTime (hardware clock) for sample-accurate timing
317
+ * - Pre-schedules audio chunks for gapless playback
318
+ * - Tracks scheduled sources for cleanup
319
+ * - Provides playback state monitoring
320
+ *
321
+ * @see https://web.dev/articles/audio-scheduling
322
+ * @category Audio
323
+ */
324
+ interface AudioSchedulerOptions {
325
+ /** Sample rate in Hz (default: 16000 for speech) */
326
+ sampleRate?: number;
327
+ /** Number of audio channels (default: 1 for mono) */
328
+ channels?: number;
329
+ }
330
+ declare class AudioScheduler {
331
+ private readonly options;
332
+ private context;
333
+ private nextPlayTime;
334
+ private scheduledSources;
335
+ private isPlaying;
336
+ constructor(options?: AudioSchedulerOptions);
337
+ /**
338
+ * Initialize AudioContext with specified sample rate
339
+ *
340
+ * Note: This is now a no-op. AudioContext is created lazily on first schedule()
341
+ * to avoid browser autoplay policy issues (requires user gesture).
342
+ */
343
+ initialize(): Promise<void>;
344
+ /**
345
+ * Ensure AudioContext is created and ready
346
+ * Called lazily on first schedule() - requires user gesture
347
+ */
348
+ private ensureContext;
349
+ /**
350
+ * Schedule an audio chunk for playback
351
+ *
352
+ * Uses Web Audio's hardware-accurate clock for sample-perfect timing.
353
+ * Chunks are scheduled immediately, not when they should play - this
354
+ * ensures gapless playback even if the main thread stalls.
355
+ *
356
+ * @param audioData - Float32Array of audio samples
357
+ * @returns Scheduled playback time in AudioContext seconds
358
+ */
359
+ schedule(audioData: Float32Array): Promise<number>;
360
+ /**
361
+ * Get current audio clock time
362
+ *
363
+ * This is the hardware-accurate time, NOT JavaScript time.
364
+ * Use this for synchronizing visual animations to audio.
365
+ *
366
+ * @returns Current time in AudioContext seconds
367
+ */
368
+ getCurrentTime(): number;
369
+ /**
370
+ * Get scheduled playback end time
371
+ */
372
+ getPlaybackEndTime(): number;
373
+ /**
374
+ * Check if all scheduled audio has finished playing
375
+ */
376
+ isComplete(): boolean;
377
+ /**
378
+ * Cancel all scheduled audio with smooth fade-out
379
+ *
380
+ * Applies a linear fade-out to all playing sources and stops them gracefully.
381
+ * Prevents audio clicks/pops by ramping gain to zero before stopping.
382
+ *
383
+ * @param fadeOutMs - Fade-out duration in milliseconds (default: 50ms)
384
+ * @returns Promise that resolves when fade-out completes
385
+ */
386
+ cancelAll(fadeOutMs?: number): Promise<void>;
387
+ /**
388
+ * Reset scheduler state for new playback session
389
+ */
390
+ reset(): void;
391
+ /**
392
+ * Cleanup resources
393
+ */
394
+ dispose(): void;
395
+ }
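+
+ /**
+ * Example (sketch): scheduling streamed Float32 chunks for gapless playback.
+ * schedule() creates the AudioContext lazily, so the first call should happen
+ * after a user gesture.
+ *
+ * ```typescript
+ * const scheduler = new AudioScheduler({ sampleRate: 16000, channels: 1 });
+ *
+ * async function playChunk(chunk: Float32Array) {
+ *   const startsAt = await scheduler.schedule(chunk); // AudioContext seconds
+ *   console.log(`chunk starts at ${startsAt.toFixed(3)}s`);
+ * }
+ *
+ * // On interruption, fade out instead of cutting off abruptly:
+ * await scheduler.cancelAll(50);
+ * scheduler.reset();
+ * ```
+ */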
396
+
397
+ /**
398
+ * AudioChunkCoalescer - Combine small network chunks into optimal buffers
399
+ *
400
+ * Network streaming often delivers audio in small chunks (e.g., 32ms from TTS APIs).
401
+ * Creating an AudioBufferSourceNode for each tiny chunk is inefficient and can cause
402
+ * overhead from object creation/GC.
403
+ *
404
+ * This class implements a double-buffering pattern: accumulate small chunks in a
405
+ * temporary buffer, then flush to playback queue when threshold is reached.
406
+ *
407
+ * Benefits:
408
+ * - Reduces AudioBufferSourceNode overhead (fewer nodes = less GC pressure)
409
+ * - Configurable buffer size for optimal playback chunk duration
410
+ * - Maintains sample-accurate timing despite buffering
411
+ *
412
+ * Based on patterns from HLS.js and production streaming implementations.
413
+ *
414
+ * @category Audio
415
+ */
416
+ interface AudioChunkCoalescerOptions {
417
+ /**
418
+ * Target duration in milliseconds for combined chunks
419
+ * Default: 200ms (balances latency vs overhead)
420
+ *
421
+ * Smaller values = lower latency, more overhead
422
+ * Larger values = higher latency, less overhead
423
+ */
424
+ targetDurationMs?: number;
425
+ /**
426
+ * Sample rate in Hz
427
+ * Default: 16000 (speech quality)
428
+ */
429
+ sampleRate?: number;
430
+ }
431
+ declare class AudioChunkCoalescer {
432
+ private readonly options;
433
+ private tempBuffer;
434
+ private readonly targetBytes;
435
+ constructor(options?: AudioChunkCoalescerOptions);
436
+ /**
437
+ * Add a chunk to the temporary buffer
438
+ *
439
+ * @param chunk - Uint8Array containing Int16 PCM audio
440
+ * @returns Combined buffer if threshold reached, null otherwise
441
+ */
442
+ add(chunk: Uint8Array): ArrayBuffer | null;
443
+ /**
444
+ * Flush remaining buffered data
445
+ *
446
+ * Call this when the stream ends to ensure all audio is processed,
447
+ * even if it doesn't reach the target threshold.
448
+ *
449
+ * @returns Combined buffer, or null if buffer is empty
450
+ */
451
+ flush(): ArrayBuffer | null;
452
+ /**
453
+ * Get current buffer fill level (0-1)
454
+ */
455
+ get fillLevel(): number;
456
+ /**
457
+ * Get current buffered duration in milliseconds
458
+ */
459
+ getBufferedDurationMs(): number;
460
+ /**
461
+ * Get number of chunks currently buffered
462
+ */
463
+ get chunkCount(): number;
464
+ /**
465
+ * Reset the coalescer
466
+ */
467
+ reset(): void;
468
+ }
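+
+ /**
+ * Example (sketch): coalescing small network chunks into ~200ms buffers before
+ * playback. flush() drains whatever remains at end-of-stream. playBuffer() is
+ * a placeholder for the scheduling step.
+ *
+ * ```typescript
+ * declare function playBuffer(buffer: ArrayBuffer): void; // placeholder
+ *
+ * const coalescer = new AudioChunkCoalescer({ targetDurationMs: 200, sampleRate: 16000 });
+ *
+ * function onNetworkChunk(chunk: Uint8Array) {
+ *   const combined = coalescer.add(chunk); // null until the target is reached
+ *   if (combined) playBuffer(combined);
+ * }
+ *
+ * function onStreamEnd() {
+ *   const rest = coalescer.flush();
+ *   if (rest) playBuffer(rest);
+ * }
+ * ```
+ */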
469
+
470
+ /**
471
+ * Runtime detection utilities for platform-specific inference configuration
472
+ *
473
+ * These utilities help determine the optimal backend (WebGPU vs WASM) based on
474
+ * the current platform's capabilities and known limitations.
475
+ *
476
+ * Key considerations:
477
+ * - iOS Safari: WebGPU crashes due to JSEP bugs (GitHub #22776, #26827)
478
+ * - Android Chrome: WebGPU works well (Chrome 121+)
479
+ * - Desktop: WebGPU preferred for performance
480
+ *
481
+ * @module utils/runtime
482
+ */
483
+ /**
484
+ * Supported inference backends
485
+ */
486
+ type RuntimeBackend = 'webgpu' | 'wasm';
487
+ /**
488
+ * User-configurable backend preference
489
+ */
490
+ type BackendPreference = 'auto' | 'webgpu' | 'wasm' | 'webgpu-only' | 'wasm-only';
491
+ /**
492
+ * Detect iOS Safari browser
493
+ *
494
+ * iOS Safari has severe WebGPU issues:
495
+ * - JSEP compilation bugs cause OOM during session creation
496
+ * - Threading bugs require numThreads=1
497
+ * - Proxy mode triggers memory leaks
498
+ *
499
+ * @returns true if running in iOS Safari
500
+ */
501
+ declare function isIOSSafari(): boolean;
502
+ /**
503
+ * Detect any iOS device (regardless of browser)
504
+ *
505
+ * On iOS, all browsers use WebKit, so Chrome/Firefox on iOS
506
+ * have the same limitations as Safari.
507
+ *
508
+ * @returns true if running on any iOS device
509
+ */
510
+ declare function isIOS(): boolean;
511
+ /**
512
+ * Detect Android device
513
+ *
514
+ * Android Chrome 121+ has good WebGPU support with Qualcomm/ARM GPUs.
515
+ *
516
+ * @returns true if running on Android
517
+ */
518
+ declare function isAndroid(): boolean;
519
+ /**
520
+ * Detect any mobile device (iOS or Android)
521
+ *
522
+ * Mobile devices have different performance characteristics:
523
+ * - Lower memory limits
524
+ * - Thermal throttling
525
+ * - Different GPU architectures
526
+ *
527
+ * @returns true if running on mobile
528
+ */
529
+ declare function isMobile(): boolean;
530
+ /**
531
+ * Check if WebGPU API is available in the browser
532
+ *
533
+ * Note: This only checks if the API exists, not if it works reliably.
534
+ * iOS has navigator.gpu but ONNX Runtime's WebGPU backend crashes.
535
+ *
536
+ * @returns true if navigator.gpu exists
537
+ */
538
+ declare function hasWebGPUApi(): boolean;
539
+ /**
540
+ * Get the recommended backend for the current platform
541
+ *
542
+ * Decision tree:
543
+ * 1. iOS (any browser): Force WASM (WebGPU crashes)
544
+ * 2. Android: WebGPU preferred (works in Chrome 121+)
545
+ * 3. Desktop: WebGPU preferred (best performance)
546
+ *
547
+ * @returns 'wasm' for iOS, 'webgpu' for everything else
548
+ */
549
+ declare function getRecommendedBackend(): RuntimeBackend;
550
+ /**
551
+ * Resolve user preference to actual backend
552
+ *
553
+ * @param preference User's backend preference
554
+ * @param webgpuAvailable Whether WebGPU is available and working
555
+ * @returns The backend to use
556
+ */
557
+ declare function resolveBackend(preference: BackendPreference, webgpuAvailable: boolean): RuntimeBackend;
558
+ /**
559
+ * Get optimal WASM thread count for current platform
560
+ *
561
+ * @returns Recommended number of WASM threads
562
+ */
563
+ declare function getOptimalWasmThreads(): number;
564
+ /**
565
+ * Check if WASM proxy mode should be enabled
566
+ *
567
+ * Proxy mode offloads inference to a Web Worker, but has issues:
568
+ * - iOS: Triggers Safari 26 JSEP memory leak
569
+ * - Mobile: Generally unstable
570
+ *
571
+ * @returns true if proxy mode is safe to enable
572
+ */
573
+ declare function shouldEnableWasmProxy(): boolean;
574
+ /**
575
+ * Check if Web Speech API is available in the browser
576
+ *
577
+ * The Web Speech API provides native speech recognition in Safari and Chrome.
578
+ * On iOS Safari, this is significantly faster than Whisper WASM.
579
+ *
580
+ * @returns true if SpeechRecognition API is available
581
+ */
582
+ declare function isSpeechRecognitionAvailable(): boolean;
583
+ /**
584
+ * Recommend using native Safari Speech API over Whisper on iOS
585
+ *
586
+ * On iOS, Whisper ASR via WASM takes ~1.3s per inference (30% over target).
587
+ * Safari's native Web Speech API is:
588
+ * - Much faster (native implementation)
589
+ * - Battery-efficient (no WASM overhead)
590
+ * - No model download needed (saves 30-150MB)
591
+ *
592
+ * @returns true if on iOS with Speech API available
593
+ */
594
+ declare function shouldUseNativeASR(): boolean;
595
+ /**
596
+ * Recommend using server-side LAM over client-side on iOS
597
+ *
598
+ * On iOS, LAM lip sync via WASM takes ~332ms per second of audio (3.3x over target).
599
+ * Server-side inference with GPU can achieve ~50ms, providing:
600
+ * - Real-time lip sync (under 100ms target)
601
+ * - Reduced iOS device thermal/battery impact
602
+ * - Better user experience
603
+ *
604
+ * @returns true if on iOS (should use server-side lip sync)
605
+ */
606
+ declare function shouldUseServerLipSync(): boolean;
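+
+ /**
+ * Example (sketch): choosing a backend with the runtime helpers. Here the
+ * WebGPU flag comes from hasWebGPUApi(); the stricter adapter check,
+ * isWebGPUAvailable(), is declared further below.
+ *
+ * ```typescript
+ * const preference: BackendPreference = 'auto';
+ * const backend = resolveBackend(preference, hasWebGPUApi());
+ *
+ * console.log('recommended:', getRecommendedBackend()); // 'wasm' on iOS
+ * console.log('resolved:', backend);
+ * console.log('wasm threads:', getOptimalWasmThreads());
+ * console.log('use native ASR?', shouldUseNativeASR());
+ * ```
+ */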
607
+
608
+ /**
609
+ * Lazy ONNX Runtime loader with conditional WebGPU/WASM bundle loading
610
+ *
611
+ * This module provides a way to dynamically load the appropriate ONNX Runtime bundle
612
+ * based on the platform's capabilities. This is critical for iOS support because:
613
+ *
614
+ * 1. iOS Safari has WebGPU API but ONNX Runtime's WebGPU backend crashes
615
+ * 2. Loading the WebGPU bundle wastes bandwidth and can cause issues
616
+ * 3. WASM-only bundle is smaller and more reliable on iOS
617
+ *
618
+ * Usage:
619
+ * ```typescript
620
+ * const ort = await getOnnxRuntime('wasm'); // Load WASM-only bundle
621
+ * const ort = await getOnnxRuntime('webgpu'); // Load WebGPU bundle (includes WASM)
622
+ * ```
623
+ *
624
+ * @module inference/onnxLoader
625
+ */
626
+
627
+ type OrtModule = {
628
+ InferenceSession: typeof InferenceSession;
629
+ Tensor: typeof Tensor;
630
+ env: Env;
631
+ };
632
+ type SessionOptions = InferenceSession.SessionOptions;
633
+
634
+ /**
635
+ * Check if WebGPU is available and likely to work
636
+ *
637
+ * This is more thorough than just checking navigator.gpu exists.
638
+ * It actually requests an adapter to verify the GPU is accessible.
639
+ *
640
+ * @returns true if WebGPU is available and working
641
+ */
642
+ declare function isWebGPUAvailable(): Promise<boolean>;
643
+ /**
644
+ * Load ONNX Runtime with the specified backend
645
+ *
646
+ * This lazily loads the appropriate bundle:
647
+ * - 'wasm': Loads onnxruntime-web (WASM-only, smaller)
648
+ * - 'webgpu': Loads onnxruntime-web/webgpu (includes WebGPU + WASM fallback)
649
+ *
650
+ * Once loaded, the same instance is reused for all subsequent calls.
651
+ * If you need to switch backends, you must reload the page.
652
+ *
653
+ * @param backend The backend to load ('webgpu' or 'wasm')
654
+ * @returns The ONNX Runtime module
655
+ */
656
+ declare function getOnnxRuntime(backend: RuntimeBackend): Promise<OrtModule>;
657
+ /**
658
+ * Get the appropriate ONNX Runtime based on user preference
659
+ *
660
+ * This resolves the user's preference against platform capabilities
661
+ * and loads the appropriate bundle.
662
+ *
663
+ * @param preference User's backend preference
664
+ * @returns The ONNX Runtime module and the resolved backend
665
+ */
666
+ declare function getOnnxRuntimeForPreference(preference?: BackendPreference): Promise<{
667
+ ort: OrtModule;
668
+ backend: RuntimeBackend;
669
+ }>;
670
+ /**
671
+ * Get session options for creating an inference session
672
+ *
673
+ * This returns optimized session options based on the backend and platform.
674
+ *
675
+ * @param backend The backend being used
676
+ * @returns Session options for InferenceSession.create()
677
+ */
678
+ declare function getSessionOptions(backend: RuntimeBackend): SessionOptions;
679
+ /**
680
+ * Create an inference session with automatic fallback
681
+ *
682
+ * If WebGPU session creation fails, automatically falls back to WASM.
683
+ *
684
+ * @param modelBuffer The model data as ArrayBuffer
685
+ * @param preferredBackend The preferred backend
686
+ * @returns The created session and the backend used
687
+ */
688
+ declare function createSessionWithFallback(modelBuffer: ArrayBuffer, preferredBackend: RuntimeBackend): Promise<{
689
+ session: InferenceSession;
690
+ backend: RuntimeBackend;
691
+ }>;
692
+ /**
693
+ * Get the currently loaded backend (if any)
694
+ */
695
+ declare function getLoadedBackend(): RuntimeBackend | null;
696
+ /**
697
+ * Check if ONNX Runtime has been loaded
698
+ */
699
+ declare function isOnnxRuntimeLoaded(): boolean;
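+
+ /**
+ * Example (sketch): loading ONNX Runtime for the platform and creating a
+ * session with automatic WebGPU-to-WASM fallback. The model path is a
+ * placeholder for wherever your model is hosted.
+ *
+ * ```typescript
+ * const { backend } = await getOnnxRuntimeForPreference('auto');
+ * console.log('loaded backend:', backend, isOnnxRuntimeLoaded());
+ *
+ * const modelBuffer = await (await fetch('/models/example-model.onnx')).arrayBuffer();
+ * const { session, backend: used } = await createSessionWithFallback(modelBuffer, backend);
+ * console.log(`session created on ${used}`, session.inputNames);
+ * ```
+ */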
700
+
701
+ /**
702
+ * Unified Wav2Vec2 inference engine for Audio-to-Expression + ASR
703
+ *
704
+ * Runs entirely in the browser using WebGPU or WASM.
705
+ * Takes raw 16kHz audio and outputs:
706
+ * - 52 ARKit blendshapes (lip sync)
707
+ * - 32-token CTC logits (speech recognition)
708
+ *
709
+ * @category Inference
710
+ *
711
+ * @example Basic usage
712
+ * ```typescript
713
+ * import { Wav2Vec2Inference } from '@omote/core';
714
+ *
715
+ * const wav2vec = new Wav2Vec2Inference({ modelUrl: '/models/unified_wav2vec2_asr_a2e.onnx' });
716
+ * await wav2vec.load();
717
+ *
718
+ * // Process 1 second of audio (16kHz = 16000 samples)
719
+ * const result = await wav2vec.infer(audioSamples);
720
+ *
721
+ * console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
722
+ * console.log('ASR text:', result.text); // Decoded transcription
723
+ * ```
724
+ */
725
+
726
+ type InferenceBackend = BackendPreference;
727
+ interface Wav2Vec2InferenceConfig {
728
+ /** Path or URL to the ONNX model */
729
+ modelUrl: string;
730
+ /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
731
+ backend?: InferenceBackend;
732
+ /** Number of identity classes (default: 12 for streaming model) */
733
+ numIdentityClasses?: number;
734
+ }
735
+ interface ModelInfo {
736
+ backend: 'webgpu' | 'wasm';
737
+ loadTimeMs: number;
738
+ inputNames: string[];
739
+ outputNames: string[];
740
+ }
741
+ /**
742
+ * LAM model blendshape names in order (52 total)
743
+ * NOTE: This is alphabetical ordering used by LAM, different from standard ARKit order
744
+ */
745
+ declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
746
+ /** CTC vocabulary (32 tokens from wav2vec2-base-960h) */
747
+ declare const CTC_VOCAB: string[];
748
+ interface Wav2Vec2Result {
749
+ /** Blendshape weights [frames, 52] - 30fps */
750
+ blendshapes: Float32Array[];
751
+ /** Raw CTC logits [frames, 32] - 50fps */
752
+ asrLogits: Float32Array[];
753
+ /** Decoded text from CTC */
754
+ text: string;
755
+ /** Number of A2E frames (30fps) */
756
+ numA2EFrames: number;
757
+ /** Number of ASR frames (50fps) */
758
+ numASRFrames: number;
759
+ /** Inference time in ms */
760
+ inferenceTimeMs: number;
761
+ }
762
+ declare class Wav2Vec2Inference {
763
+ private session;
764
+ private ort;
765
+ private config;
766
+ private _backend;
767
+ private isLoading;
768
+ private numIdentityClasses;
769
+ private inferenceQueue;
770
+ constructor(config: Wav2Vec2InferenceConfig);
771
+ /**
772
+ * Check if WebGPU is available and working
773
+ * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
774
+ */
775
+ static isWebGPUAvailable: typeof isWebGPUAvailable;
776
+ get backend(): 'webgpu' | 'wasm' | null;
777
+ get isLoaded(): boolean;
778
+ /**
779
+ * Load the ONNX model
780
+ */
781
+ load(): Promise<ModelInfo>;
782
+ /**
783
+ * Run inference on raw audio
784
+ * @param audioSamples - Float32Array of raw audio at 16kHz (16000 samples = 1 second)
785
+ * @param identityIndex - Optional identity index (0-11, default 0 = neutral)
786
+ *
787
+ * Note: Model expects 1-second chunks (16000 samples) for optimal performance.
788
+ * Audio will be zero-padded or truncated to 16000 samples.
789
+ */
790
+ infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
791
+ /**
792
+ * Decode CTC logits to text using greedy decoding
793
+ */
794
+ private decodeCTC;
795
+ /**
796
+ * Queue inference to serialize ONNX session calls
797
+ */
798
+ private queueInference;
799
+ /**
800
+ * Get blendshape value by name for a specific frame
801
+ */
802
+ getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
803
+ /**
804
+ * Dispose of the model and free resources
805
+ */
806
+ dispose(): Promise<void>;
807
+ }
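+
+ /**
+ * Example (sketch): iterating per-frame blendshapes from an inference result.
+ * audioSamples (one second of 16 kHz audio) is assumed to come from elsewhere
+ * in the pipeline; the model path matches the basic-usage example above.
+ *
+ * ```typescript
+ * declare const audioSamples: Float32Array; // 16000 samples, assumed available
+ *
+ * const wav2vec = new Wav2Vec2Inference({ modelUrl: '/models/unified_wav2vec2_asr_a2e.onnx' });
+ * await wav2vec.load();
+ *
+ * const result = await wav2vec.infer(audioSamples);
+ * for (const frame of result.blendshapes) { // ~30 frames per second of audio
+ *   const jawOpen = wav2vec.getBlendshape(frame, 'jawOpen');
+ *   console.log('jawOpen', jawOpen);
+ * }
+ * console.log(result.text, `${result.inferenceTimeMs.toFixed(1)}ms`);
+ * ```
+ */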
808
+
809
+ /**
810
+ * LAMPipeline - Coordinate LAM (Wav2Vec2) inference with frame synchronization
811
+ *
812
+ * Manages the buffering and processing pipeline for LAM lip sync:
813
+ * 1. Accumulates audio samples in a ring buffer
814
+ * 2. Triggers LAM inference when buffer reaches required size (16000 samples @ 16kHz = 1.0s)
815
+ * 3. Queues resulting blendshape frames with precise timestamps
816
+ * 4. Provides frames synchronized to AudioContext clock
817
+ *
818
+ * Key Design Decisions:
819
+ * - Ring buffer pattern for efficient sample accumulation (no allocation churn)
820
+ * - Frame queue with timestamps for deterministic playback
821
+ * - Timestamp-based frame retrieval (not callback) for renderer flexibility
822
+ *
823
+ * Based on patterns from Chrome Audio Worklet design and Web Audio clock management.
824
+ *
825
+ * @see https://developer.chrome.com/blog/audio-worklet-design-pattern
826
+ * @category Audio
827
+ */
828
+
829
+ interface LAMFrame {
830
+ /** 52 ARKit blendshape weights */
831
+ frame: Float32Array;
832
+ /** AudioContext time when this frame should be displayed */
833
+ timestamp: number;
834
+ }
835
+ interface LAMPipelineOptions {
836
+ /**
837
+ * Sample rate in Hz (must match audio playback)
838
+ * Default: 16000
839
+ */
840
+ sampleRate?: number;
841
+ /**
842
+ * LAM inference callback
843
+ * Called each time LAM processes a buffer
844
+ */
845
+ onInference?: (frameCount: number) => void;
846
+ /**
847
+ * Error callback for inference failures
848
+ */
849
+ onError?: (error: Error) => void;
850
+ }
851
+ declare class LAMPipeline {
852
+ private readonly options;
853
+ private readonly REQUIRED_SAMPLES;
854
+ private readonly FRAME_RATE;
855
+ private buffer;
856
+ private bufferStartTime;
857
+ private frameQueue;
858
+ /**
859
+ * Last successfully retrieved frame
860
+ * Used as fallback when no new frame is available to prevent avatar freezing
861
+ */
862
+ private lastFrame;
863
+ constructor(options?: LAMPipelineOptions);
864
+ /**
865
+ * Push audio samples into the pipeline
866
+ *
867
+ * Accumulates samples and triggers LAM inference when buffer is full.
868
+ * Multiple calls may be needed to accumulate enough samples.
869
+ *
870
+ * @param samples - Float32Array of audio samples
871
+ * @param timestamp - AudioContext time when these samples start playing
872
+ * @param lam - LAM inference engine
873
+ */
874
+ push(samples: Float32Array, timestamp: number, lam: Wav2Vec2Inference): Promise<void>;
875
+ /**
876
+ * Process accumulated buffer through LAM inference
877
+ */
878
+ private processBuffer;
879
+ /**
880
+ * Get the frame that should be displayed at the current time
881
+ *
882
+ * Automatically removes frames that have already been displayed.
883
+ * This prevents memory leaks from accumulating old frames.
884
+ *
885
+ * Discard Window (prevents premature frame discarding):
886
+ * - WebGPU: 0.5s (LAM inference 20-100ms + RAF jitter + React stalls)
887
+ * - WASM: 1.0s (LAM inference 50-500ms + higher variability)
888
+ *
889
+ * Last-Frame-Hold: Returns last valid frame instead of null to prevent
890
+ * avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
891
+ *
892
+ * @param currentTime - Current AudioContext time
893
+ * @param lam - LAM inference engine (optional, for backend detection)
894
+ * @returns Current frame, or last frame as fallback, or null if no frames yet
895
+ */
896
+ getFrameForTime(currentTime: number, lam?: {
897
+ backend: 'webgpu' | 'wasm' | null;
898
+ }): Float32Array | null;
899
+ /**
900
+ * Get all frames in the queue (for debugging/monitoring)
901
+ */
902
+ getQueuedFrames(): LAMFrame[];
903
+ /**
904
+ * Get current buffer fill level (0-1)
905
+ */
906
+ get fillLevel(): number;
907
+ /**
908
+ * Get number of frames queued
909
+ */
910
+ get queuedFrameCount(): number;
911
+ /**
912
+ * Get buffered audio duration in seconds
913
+ */
914
+ get bufferedDuration(): number;
915
+ /**
916
+ * Flush remaining buffered audio
917
+ *
918
+ * Processes any remaining audio in the buffer, even if less than REQUIRED_SAMPLES.
919
+ * This ensures the final audio chunk generates blendshape frames.
920
+ *
921
+ * Should be called when the audio stream ends to prevent losing the last 0-1 seconds of audio.
922
+ *
923
+ * @param lam - LAM inference engine
924
+ */
925
+ flush(lam: Wav2Vec2Inference): Promise<void>;
926
+ /**
927
+ * Adjust all queued frame timestamps by an offset
928
+ *
929
+ * Used for synchronization when audio scheduling time differs from
930
+ * the estimated time used during LAM processing.
931
+ *
932
+ * @param offset - Time offset in seconds to add to all timestamps
933
+ */
934
+ adjustTimestamps(offset: number): void;
935
+ /**
936
+ * Reset the pipeline
937
+ */
938
+ reset(): void;
939
+ }
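+
+ /**
+ * Example (sketch): feeding scheduled audio into the LAM pipeline and pulling
+ * frames on a render loop keyed to the AudioContext clock. scheduler and
+ * wav2vec are assumed to be the AudioScheduler and (loaded) Wav2Vec2Inference
+ * instances from above; applyBlendshapes() is renderer-specific.
+ *
+ * ```typescript
+ * declare const scheduler: AudioScheduler;  // assumed
+ * declare const wav2vec: Wav2Vec2Inference; // assumed, already loaded
+ * declare function applyBlendshapes(frame: Float32Array): void; // placeholder
+ *
+ * const pipeline = new LAMPipeline({ sampleRate: 16000 });
+ *
+ * async function onAudio(samples: Float32Array) {
+ *   const startsAt = await scheduler.schedule(samples);
+ *   await pipeline.push(samples, startsAt, wav2vec);
+ * }
+ *
+ * function renderLoop() {
+ *   const frame = pipeline.getFrameForTime(scheduler.getCurrentTime(), wav2vec);
+ *   if (frame) applyBlendshapes(frame);
+ *   requestAnimationFrame(renderLoop);
+ * }
+ * renderLoop();
+ * ```
+ */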
940
+
941
+ /**
942
+ * SyncedAudioPipeline - Enterprise-grade audio + LAM synchronization coordinator
943
+ *
944
+ * Orchestrates the complete pipeline for synchronized audio playback and lip sync:
945
+ * 1. Network chunks → Coalescer → Optimized buffers
946
+ * 2. Audio buffers → Scheduler → Gapless playback
947
+ * 3. Audio buffers → LAM Pipeline → Blendshape frames
948
+ * 4. Frames synchronized to AudioContext clock → Renderer
949
+ *
950
+ * Key Architecture Pattern: Wait-for-First-LAM
951
+ * - Buffers incoming audio chunks without scheduling playback
952
+ * - Waits for first LAM inference to complete (ensures LAM frames are ready)
953
+ * - Then schedules all buffered audio + LAM frames together
954
+ * - Result: Perfect synchronization from frame 1, no lag compensation needed
955
+ *
956
+ * This is a deterministic, enterprise-grade solution suitable for production use.
957
+ * No hacks, no lag detection, no frame skipping - just guaranteed synchronization.
958
+ *
959
+ * @see https://web.dev/articles/audio-scheduling (Web Audio clock patterns)
960
+ * @see https://developer.chrome.com/blog/audio-worklet-design-pattern (Ring buffer patterns)
961
+ * @category Audio
962
+ */
963
+
964
+ interface SyncedAudioPipelineOptions {
965
+ /** Sample rate in Hz (default: 16000) */
966
+ sampleRate?: number;
967
+ /** Target chunk duration in ms for coalescing (default: 200) */
968
+ chunkTargetMs?: number;
969
+ /** LAM inference engine */
970
+ lam: Wav2Vec2Inference;
971
+ }
972
+ interface SyncedAudioPipelineEvents {
973
+ /** New frame ready for display */
974
+ frame_ready: Float32Array;
975
+ /** Playback has completed */
976
+ playback_complete: void;
977
+ /** First LAM inference completed, playback starting */
978
+ playback_start: number;
979
+ /** Error occurred */
980
+ error: Error;
981
+ /** Index signature for EventEmitter compatibility */
982
+ [key: string]: unknown;
983
+ }
984
+ declare class SyncedAudioPipeline extends EventEmitter<SyncedAudioPipelineEvents> {
985
+ private readonly options;
986
+ private scheduler;
987
+ private coalescer;
988
+ private lamPipeline;
989
+ private waitingForFirstLAM;
990
+ private bufferedChunks;
991
+ private monitorInterval;
992
+ private frameAnimationId;
993
+ constructor(options: SyncedAudioPipelineOptions);
994
+ /**
995
+ * Initialize the pipeline
996
+ */
997
+ initialize(): Promise<void>;
998
+ /**
999
+ * Start a new playback session
1000
+ *
1001
+ * Resets all state and prepares for incoming audio chunks.
1002
+ * Enables wait-for-first-LAM synchronization.
1003
+ */
1004
+ start(): void;
1005
+ /**
1006
+ * Receive audio chunk from network
1007
+ *
1008
+ * Implements wait-for-first-LAM pattern:
1009
+ * - Chunks are coalesced into optimal buffers
1010
+ * - Buffers are sent to LAM for processing
1011
+ * - Audio scheduling waits until first LAM completes
1012
+ * - Then all buffered audio is scheduled together with LAM frames
1013
+ *
1014
+ * @param chunk - Uint8Array containing Int16 PCM audio
1015
+ */
1016
+ onAudioChunk(chunk: Uint8Array): Promise<void>;
1017
+ /**
1018
+ * Handle first LAM inference completion
1019
+ *
1020
+ * This is the critical synchronization point:
1021
+ * - LAM frames are now ready in the queue
1022
+ * - Schedule all buffered audio chunks
1023
+ * - Adjust LAM frame timestamps to match actual schedule time
1024
+ * - Audio and LAM start playing together, perfectly synchronized
1025
+ */
1026
+ private onFirstLAMComplete;
1027
+ /**
1028
+ * End of audio stream
1029
+ *
1030
+ * Flushes any remaining buffered data.
1031
+ */
1032
+ end(): Promise<void>;
1033
+ /**
1034
+ * Stop playback immediately with smooth fade-out
1035
+ *
1036
+ * Gracefully cancels all audio playback and LAM processing:
1037
+ * - Fades out audio over specified duration (default: 50ms)
1038
+ * - Cancels pending LAM inferences
1039
+ * - Clears all buffers and queues
1040
+ * - Emits 'playback_complete' event
1041
+ *
1042
+ * Use this for interruptions (e.g., user barge-in during AI speech).
1043
+ *
1044
+ * @param fadeOutMs - Fade-out duration in milliseconds (default: 50ms)
1045
+ * @returns Promise that resolves when fade-out completes
1046
+ */
1047
+ stop(fadeOutMs?: number): Promise<void>;
1048
+ /**
1049
+ * Start frame animation loop
1050
+ *
1051
+ * Uses requestAnimationFrame to check for new LAM frames.
1052
+ * Synchronized to AudioContext clock (not visual refresh rate).
1053
+ *
1054
+ * Frame Emission Strategy:
1055
+ * - LAMPipeline uses last-frame-hold to prevent null returns
1056
+ * - Always emit frames (even repeated frames) to maintain smooth animation
1057
+ * - Renderer is responsible for detecting duplicate frames if needed
1058
+ */
1059
+ private startFrameLoop;
1060
+ /**
1061
+ * Start monitoring for playback completion
1062
+ */
1063
+ private startMonitoring;
1064
+ /**
1065
+ * Stop monitoring
1066
+ */
1067
+ private stopMonitoring;
1068
+ /**
1069
+ * Get current pipeline state (for debugging/monitoring)
1070
+ */
1071
+ getState(): {
1072
+ waitingForFirstLAM: boolean;
1073
+ bufferedChunks: number;
1074
+ coalescerFill: number;
1075
+ lamFill: number;
1076
+ queuedFrames: number;
1077
+ currentTime: number;
1078
+ playbackEndTime: number;
1079
+ };
1080
+ /**
1081
+ * Cleanup resources
1082
+ */
1083
+ dispose(): void;
1084
+ }
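+
+ /**
+ * Example (sketch): wiring SyncedAudioPipeline for a streamed TTS response.
+ * The chunk source and renderer hook are placeholders; chunks are assumed to
+ * be Int16 PCM bytes as described above.
+ *
+ * ```typescript
+ * declare const lam: Wav2Vec2Inference; // assumed, already loaded
+ * declare function applyBlendshapes(frame: Float32Array): void;   // placeholder
+ * declare function audioChunkStream(): AsyncIterable<Uint8Array>; // placeholder
+ *
+ * const pipeline = new SyncedAudioPipeline({ lam, sampleRate: 16000, chunkTargetMs: 200 });
+ * await pipeline.initialize();
+ *
+ * pipeline.on('frame_ready', (frame) => applyBlendshapes(frame));
+ * pipeline.on('playback_complete', () => console.log('done'));
+ *
+ * pipeline.start();
+ * for await (const chunk of audioChunkStream()) {
+ *   await pipeline.onAudioChunk(chunk);
+ * }
+ * await pipeline.end();
+ * ```
+ */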
1085
+
1086
+ /**
1087
+ * Whisper Automatic Speech Recognition using transformers.js
1088
+ * Uses Xenova's proven pipeline API for reliable transcription
1089
+ */
1090
+ type WhisperModel = 'tiny' | 'base' | 'small' | 'medium';
1091
+ type WhisperDtype = 'fp32' | 'fp16' | 'q8' | 'int8' | 'uint8' | 'q4' | 'q4f16' | 'bnb4';
1092
+ interface WhisperConfig {
1093
+ /** Model size: tiny (~75MB), base (~150MB), small (~500MB), medium (~1.5GB) */
1094
+ model?: WhisperModel;
1095
+ /** Use multilingual model (default: false, uses .en models) */
1096
+ multilingual?: boolean;
1097
+ /** Language code (e.g., 'en', 'es', 'fr') - for multilingual models */
1098
+ language?: string;
1099
+ /** Task: transcribe or translate (default: transcribe) */
1100
+ task?: 'transcribe' | 'translate';
1101
+ /** Model quantization format (default: 'q8' for balance of speed/quality) */
1102
+ dtype?: WhisperDtype;
1103
+ /** Use WebGPU acceleration if available (default: auto-detect) */
1104
+ device?: 'auto' | 'webgpu' | 'wasm';
1105
+ /** Local model path (e.g., '/models/whisper-tiny.en') - overrides HuggingFace CDN */
1106
+ localModelPath?: string;
1107
+ /** HuggingFace API token to bypass rate limits (get from https://huggingface.co/settings/tokens) */
1108
+ token?: string;
1109
+ /** Suppress non-speech tokens like [LAUGHTER], [CLICKING], etc. (default: true) */
1110
+ suppressNonSpeech?: boolean;
1111
+ }
1112
+ interface TranscriptionResult {
1113
+ /** Transcribed text */
1114
+ text: string;
1115
+ /** Detected/used language */
1116
+ language: string;
1117
+ /** Inference time in ms */
1118
+ inferenceTimeMs: number;
1119
+ /** Full chunks with timestamps (if requested) */
1120
+ chunks?: Array<{
1121
+ text: string;
1122
+ timestamp: [number, number | null];
1123
+ }>;
1124
+ }
1125
+ /**
1126
+ * Whisper ASR inference using transformers.js pipeline API
1127
+ *
1128
+ * Features:
1129
+ * - Automatic WebGPU/WASM backend selection
1130
+ * - Streaming support with chunk callbacks
1131
+ * - Proven implementation from Xenova's demo
1132
+ * - Handles all audio preprocessing automatically
1133
+ */
1134
+ declare class WhisperInference {
1135
+ private config;
1136
+ private pipeline;
1137
+ private currentModel;
1138
+ private isLoading;
1139
+ private actualBackend;
1140
+ constructor(config?: WhisperConfig);
1141
+ /**
1142
+ * Check if WebGPU is available in this browser
1143
+ */
1144
+ static isWebGPUAvailable(): Promise<boolean>;
1145
+ /**
1146
+ * Load the Whisper model pipeline
1147
+ */
1148
+ load(onProgress?: (progress: {
1149
+ status: string;
1150
+ progress?: number;
1151
+ file?: string;
1152
+ }) => void): Promise<void>;
1153
+ /**
1154
+ * Transcribe audio to text
1155
+ *
1156
+ * @param audio Audio samples (Float32Array, 16kHz mono)
1157
+ * @param options Transcription options
1158
+ */
1159
+ transcribe(audio: Float32Array, options?: {
1160
+ /** Return timestamps for each chunk */
1161
+ returnTimestamps?: boolean;
1162
+ /** Chunk length in seconds (default: 30) */
1163
+ chunkLengthS?: number;
1164
+ /** Stride length in seconds for overlapping chunks (default: 5) */
1165
+ strideLengthS?: number;
1166
+ /** Language override */
1167
+ language?: string;
1168
+ /** Task override */
1169
+ task?: 'transcribe' | 'translate';
1170
+ }): Promise<TranscriptionResult>;
1171
+ /**
1172
+ * Transcribe with streaming chunks (progressive results)
1173
+ *
1174
+ * @param audio Audio samples
1175
+ * @param onChunk Called when each chunk is finalized
1176
+ * @param onUpdate Called after each generation step (optional)
1177
+ */
1178
+ transcribeStreaming(audio: Float32Array, onChunk: (chunk: {
1179
+ text: string;
1180
+ timestamp: [number, number | null];
1181
+ }) => void, onUpdate?: (text: string) => void, options?: {
1182
+ chunkLengthS?: number;
1183
+ strideLengthS?: number;
1184
+ language?: string;
1185
+ task?: 'transcribe' | 'translate';
1186
+ }): Promise<TranscriptionResult>;
1187
+ /**
1188
+ * Dispose of the model and free resources
1189
+ */
1190
+ dispose(): Promise<void>;
1191
+ /**
1192
+ * Check if model is loaded
1193
+ */
1194
+ get isLoaded(): boolean;
1195
+ /**
1196
+ * Get the backend being used (webgpu or wasm)
1197
+ */
1198
+ get backend(): string;
1199
+ /**
1200
+ * Get the full model name used by transformers.js
1201
+ */
1202
+ private getModelName;
1203
+ /**
1204
+ * Remove non-speech event tokens from transcription
1205
+ *
1206
+ * Whisper outputs special tokens for non-speech events like:
1207
+ * [LAUGHTER], [APPLAUSE], [MUSIC], [BLANK_AUDIO], [CLICKING], etc.
1208
+ *
1209
+ * This method strips these tokens and cleans up extra whitespace.
1210
+ */
1211
+ private removeNonSpeechTokens;
1212
+ }
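+
+ /**
+ * Example (sketch): loading Whisper tiny.en and transcribing a 16 kHz buffer.
+ * The audio source is assumed; the progress callback just logs download status.
+ *
+ * ```typescript
+ * declare const audio: Float32Array; // 16kHz mono, assumed available
+ *
+ * const whisper = new WhisperInference({ model: 'tiny', dtype: 'q8', device: 'auto' });
+ * await whisper.load(({ status, progress, file }) => {
+ *   console.log(status, file ?? '', progress ?? '');
+ * });
+ *
+ * const result = await whisper.transcribe(audio, { returnTimestamps: true });
+ * console.log(result.text, result.language, `${result.inferenceTimeMs}ms`);
+ * ```
+ */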
1213
+
1214
+ /**
1215
+ * Silero VAD (Voice Activity Detection) inference
1216
+ *
1217
+ * Neural network-based VAD running in browser via ONNX Runtime Web.
1218
+ * Much more accurate than RMS-based energy detection.
1219
+ *
1220
+ * Uses lazy loading to conditionally load WebGPU or WASM-only bundle:
1221
+ * - iOS: Loads WASM-only bundle (WebGPU crashes due to Safari bugs)
1222
+ * - Android/Desktop: Loads WebGPU bundle (with WASM fallback)
1223
+ *
1224
+ * @category Inference
1225
+ *
1226
+ * @example Basic usage
1227
+ * ```typescript
1228
+ * import { SileroVADInference } from '@omote/core';
1229
+ *
1230
+ * const vad = new SileroVADInference({
1231
+ * modelUrl: '/models/silero-vad.onnx'
1232
+ * });
1233
+ * await vad.load();
1234
+ *
1235
+ * // Process 32ms chunks (512 samples at 16kHz)
1236
+ * const probability = await vad.process(audioChunk);
1237
+ * if (probability > 0.5) {
1238
+ * console.log('Speech detected!');
1239
+ * }
1240
+ * ```
1241
+ *
1242
+ * @example Streaming with state management
1243
+ * ```typescript
1244
+ * // State is automatically maintained between process() calls
1245
+ * // Call reset() when starting a new audio stream
1246
+ * vad.reset();
1247
+ *
1248
+ * for (const chunk of audioChunks) {
1249
+ * const prob = await vad.process(chunk);
1250
+ * // prob is speech probability [0, 1]
1251
+ * }
1252
+ * ```
1253
+ */
1254
+
1255
+ type VADBackend = BackendPreference;
1256
+ /**
1257
+ * Configuration for Silero VAD
1258
+ */
1259
+ interface SileroVADConfig {
1260
+ /** Path or URL to the ONNX model */
1261
+ modelUrl: string;
1262
+ /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
1263
+ backend?: VADBackend;
1264
+ /** Sample rate (8000 or 16000, default: 16000) */
1265
+ sampleRate?: 8000 | 16000;
1266
+ /** Speech probability threshold (default: 0.5) */
1267
+ threshold?: number;
1268
+ /**
1269
+ * Number of audio chunks to keep in pre-speech buffer.
1270
+ * When VAD triggers, these chunks are prepended to the speech buffer
1271
+ * to capture the beginning of speech that occurred before detection.
1272
+ *
1273
+ * At 512 samples/chunk and 16kHz:
1274
+ * - 10 chunks = 320ms of pre-speech audio
1275
+ * - 15 chunks = 480ms of pre-speech audio
1276
+ *
1277
+ * Default: 10 chunks (320ms)
1278
+ */
1279
+ preSpeechBufferChunks?: number;
1280
+ }
1281
+ /**
1282
+ * VAD model loading information
1283
+ */
1284
+ interface VADModelInfo {
1285
+ backend: 'webgpu' | 'wasm';
1286
+ loadTimeMs: number;
1287
+ inputNames: string[];
1288
+ outputNames: string[];
1289
+ sampleRate: number;
1290
+ chunkSize: number;
1291
+ }
1292
+ /**
1293
+ * Result from a single VAD inference
1294
+ */
1295
+ interface VADResult$1 {
1296
+ /** Speech probability (0-1) */
1297
+ probability: number;
1298
+ /** Whether speech is detected (probability > threshold) */
1299
+ isSpeech: boolean;
1300
+ /** Inference time in milliseconds */
1301
+ inferenceTimeMs: number;
1302
+ /**
1303
+ * Pre-speech audio chunks (only present on first speech detection).
1304
+ * These are the N chunks immediately before VAD triggered, useful for
1305
+ * capturing the beginning of speech that occurred before detection.
1306
+ *
1307
+ * Only populated when transitioning from silence to speech.
1308
+ */
1309
+ preSpeechChunks?: Float32Array[];
1310
+ }
1311
+ /**
1312
+ * Speech segment detected by VAD
1313
+ */
1314
+ interface SpeechSegment {
1315
+ /** Start time in seconds */
1316
+ start: number;
1317
+ /** End time in seconds */
1318
+ end: number;
1319
+ /** Average probability during segment */
1320
+ avgProbability: number;
1321
+ }
1322
+ /**
1323
+ * Silero VAD - Neural network voice activity detection
1324
+ *
1325
+ * Based on snakers4/silero-vad ONNX model.
1326
+ * Processes 32ms chunks (512 samples at 16kHz) with LSTM state.
1327
+ *
1328
+ * @see https://github.com/snakers4/silero-vad
1329
+ */
1330
+ declare class SileroVADInference {
1331
+ private session;
1332
+ private ort;
1333
+ private config;
1334
+ private _backend;
1335
+ private isLoading;
1336
+ private state;
1337
+ private context;
1338
+ private readonly chunkSize;
1339
+ private readonly contextSize;
1340
+ private inferenceQueue;
1341
+ private preSpeechBuffer;
1342
+ private wasSpeaking;
1343
+ constructor(config: SileroVADConfig);
1344
+ get backend(): RuntimeBackend | null;
1345
+ get isLoaded(): boolean;
1346
+ get sampleRate(): number;
1347
+ get threshold(): number;
1348
+ /**
1349
+ * Get required chunk size in samples
1350
+ */
1351
+ getChunkSize(): number;
1352
+ /**
1353
+ * Get chunk duration in milliseconds
1354
+ */
1355
+ getChunkDurationMs(): number;
1356
+ /**
1357
+ * Check if WebGPU is available and working
1358
+ * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
1359
+ */
1360
+ static isWebGPUAvailable: typeof isWebGPUAvailable;
1361
+ /**
1362
+ * Load the ONNX model
1363
+ */
1364
+ load(): Promise<VADModelInfo>;
1365
+ /**
1366
+ * Reset state for new audio stream
1367
+ */
1368
+ reset(): void;
1369
+ /**
1370
+ * Process a single audio chunk
1371
+ *
1372
+ * @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
1373
+ * @returns VAD result with speech probability
1374
+ */
1375
+ process(audioChunk: Float32Array): Promise<VADResult$1>;
1376
+ /**
1377
+ * Process audio and detect speech segments
1378
+ *
1379
+ * @param audio - Complete audio buffer
1380
+ * @param options - Detection options
1381
+ * @returns Array of speech segments
1382
+ */
1383
+ detectSpeech(audio: Float32Array, options?: {
1384
+ /** Minimum speech duration in ms (default: 250) */
1385
+ minSpeechDurationMs?: number;
1386
+ /** Minimum silence duration to end segment in ms (default: 300) */
1387
+ minSilenceDurationMs?: number;
1388
+ /** Padding to add before/after speech in ms (default: 30) */
1389
+ speechPadMs?: number;
1390
+ }): Promise<SpeechSegment[]>;
1391
+ /**
1392
+ * Calculate RMS energy of audio chunk
1393
+ */
1394
+ private calculateRMS;
1395
+ /**
1396
+ * Queue inference to serialize ONNX session calls
1397
+ */
1398
+ private queueInference;
1399
+ /**
1400
+ * Dispose of the model and free resources
1401
+ */
1402
+ dispose(): Promise<void>;
1403
+ }
1404
+
1405
+ /**
1406
+ * Configuration for Silero VAD Worker
1407
+ */
1408
+ interface VADWorkerConfig {
1409
+ /** Path or URL to the ONNX model */
1410
+ modelUrl: string;
1411
+ /** Sample rate (8000 or 16000, default: 16000) */
1412
+ sampleRate?: 8000 | 16000;
1413
+ /** Speech probability threshold (default: 0.5) */
1414
+ threshold?: number;
1415
+ /**
1416
+ * Number of audio chunks to keep in pre-speech buffer.
1417
+ * When VAD triggers, these chunks are prepended to the speech buffer
1418
+ * to capture the beginning of speech that occurred before detection.
1419
+ *
1420
+ * At 512 samples/chunk and 16kHz:
1421
+ * - 10 chunks = 320ms of pre-speech audio
1422
+ * - 15 chunks = 480ms of pre-speech audio
1423
+ *
1424
+ * Default: 10 chunks (320ms)
1425
+ */
1426
+ preSpeechBufferChunks?: number;
1427
+ }
1428
+ /**
1429
+ * VAD model loading information from worker
1430
+ */
1431
+ interface VADWorkerModelInfo {
1432
+ backend: 'wasm';
1433
+ loadTimeMs: number;
1434
+ inputNames: string[];
1435
+ outputNames: string[];
1436
+ sampleRate: number;
1437
+ chunkSize: number;
1438
+ }
1439
+ /**
1440
+ * Result from a single VAD inference
1441
+ */
1442
+ interface VADResult {
1443
+ /** Speech probability (0-1) */
1444
+ probability: number;
1445
+ /** Whether speech is detected (probability > threshold) */
1446
+ isSpeech: boolean;
1447
+ /** Inference time in milliseconds */
1448
+ inferenceTimeMs: number;
1449
+ /**
1450
+ * Pre-speech audio chunks (only present on first speech detection).
1451
+ * These are the N chunks immediately before VAD triggered, useful for
1452
+ * capturing the beginning of speech that occurred before detection.
1453
+ *
1454
+ * Only populated when transitioning from silence to speech.
1455
+ */
1456
+ preSpeechChunks?: Float32Array[];
1457
+ }
1458
+ /**
1459
+ * Silero VAD Worker - Voice Activity Detection in a Web Worker
1460
+ *
1461
+ * Runs Silero VAD inference off the main thread to prevent UI blocking.
1462
+ * Feature parity with SileroVADInference, but runs in a dedicated worker.
1463
+ *
1464
+ * @see SileroVADInference for main-thread version
1465
+ */
1466
+ declare class SileroVADWorker {
1467
+ private worker;
1468
+ private config;
1469
+ private isLoading;
1470
+ private _isLoaded;
1471
+ private state;
1472
+ private context;
1473
+ private readonly chunkSize;
1474
+ private readonly contextSize;
1475
+ private inferenceQueue;
1476
+ private preSpeechBuffer;
1477
+ private wasSpeaking;
1478
+ private pendingResolvers;
1479
+ private messageId;
1480
+ constructor(config: VADWorkerConfig);
1481
+ get isLoaded(): boolean;
1482
+ /**
1483
+ * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
1484
+ */
1485
+ get backend(): 'wasm' | null;
1486
+ get sampleRate(): number;
1487
+ get threshold(): number;
1488
+ /**
1489
+ * Get required chunk size in samples
1490
+ */
1491
+ getChunkSize(): number;
1492
+ /**
1493
+ * Get chunk duration in milliseconds
1494
+ */
1495
+ getChunkDurationMs(): number;
1496
+ /**
1497
+ * Create the worker from inline script
1498
+ */
1499
+ private createWorker;
1500
+ /**
1501
+ * Handle messages from worker
1502
+ */
1503
+ private handleWorkerMessage;
1504
+ /**
1505
+ * Send message to worker and wait for response
1506
+ */
1507
+ private sendMessage;
1508
+ /**
1509
+ * Load the ONNX model in the worker
1510
+ */
1511
+ load(): Promise<VADWorkerModelInfo>;
1512
+ /**
1513
+ * Reset state for new audio stream
1514
+ */
1515
+ reset(): Promise<void>;
1516
+ /**
1517
+ * Process a single audio chunk
1518
+ *
1519
+ * @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
1520
+ * @returns VAD result with speech probability
1521
+ */
1522
+ process(audioChunk: Float32Array): Promise<VADResult>;
1523
+ /**
1524
+ * Queue inference to serialize worker calls
1525
+ */
1526
+ private queueInference;
1527
+ /**
1528
+ * Dispose of the worker and free resources
1529
+ */
1530
+ dispose(): Promise<void>;
1531
+ /**
1532
+ * Check if Web Workers are supported
1533
+ */
1534
+ static isSupported(): boolean;
1535
+ }
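+
+ /**
+ * Example (sketch): running VAD off the main thread when Workers are
+ * available, falling back to the main-thread implementation otherwise (the
+ * createSileroVAD factory below automates this selection). The audio chunk is
+ * assumed to contain exactly getChunkSize() samples.
+ *
+ * ```typescript
+ * declare const chunk: Float32Array; // exactly one VAD chunk of samples, assumed
+ *
+ * const vad = SileroVADWorker.isSupported()
+ *   ? new SileroVADWorker({ modelUrl: '/models/silero-vad.onnx', threshold: 0.5 })
+ *   : new SileroVADInference({ modelUrl: '/models/silero-vad.onnx', threshold: 0.5 });
+ *
+ * await vad.load();
+ * const result = await vad.process(chunk);
+ * console.log('speech?', result.isSpeech, result.probability.toFixed(2));
+ * ```
+ */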
1536
+
1537
+ /**
1538
+ * Factory function for Silero VAD with automatic Worker vs main thread selection
1539
+ *
1540
+ * Provides a unified API that automatically selects the optimal implementation:
1541
+ * - Desktop browsers: Uses SileroVADWorker (off-main-thread inference)
1542
+ * - Mobile devices: Uses SileroVADInference (main thread, avoids memory overhead)
1543
+ * - Fallback: Gracefully falls back to main thread if Worker fails
1544
+ *
1545
+ * @category Inference
1546
+ *
1547
+ * @example Basic usage (auto-detect)
1548
+ * ```typescript
1549
+ * import { createSileroVAD } from '@omote/core';
1550
+ *
1551
+ * const vad = createSileroVAD({
1552
+ * modelUrl: '/models/silero-vad.onnx',
1553
+ * threshold: 0.5,
1554
+ * });
1555
+ *
1556
+ * await vad.load();
1557
+ * const result = await vad.process(audioChunk);
1558
+ * if (result.isSpeech) {
1559
+ * console.log('Speech detected!', result.probability);
1560
+ * }
1561
+ * ```
1562
+ *
1563
+ * @example Force worker usage
1564
+ * ```typescript
1565
+ * const vad = createSileroVAD({
1566
+ * modelUrl: '/models/silero-vad.onnx',
1567
+ * useWorker: true, // Force Worker even on mobile
1568
+ * });
1569
+ * ```
1570
+ *
1571
+ * @example Force main thread
1572
+ * ```typescript
1573
+ * const vad = createSileroVAD({
1574
+ * modelUrl: '/models/silero-vad.onnx',
1575
+ * useWorker: false, // Force main thread
1576
+ * });
1577
+ * ```
1578
+ */
1579
+
1580
+ /**
1581
+ * Common interface for both SileroVADInference and SileroVADWorker
1582
+ *
1583
+ * This interface defines the shared API that both implementations provide,
1584
+ * allowing consumers to use either interchangeably.
1585
+ */
1586
+ interface SileroVADBackend {
1587
+ /** Current backend type (webgpu, wasm, or null if not loaded) */
1588
+ readonly backend: RuntimeBackend | null;
1589
+ /** Whether the model is loaded and ready for inference */
1590
+ readonly isLoaded: boolean;
1591
+ /** Audio sample rate (8000 or 16000 Hz) */
1592
+ readonly sampleRate: number;
1593
+ /** Speech detection threshold (0-1) */
1594
+ readonly threshold: number;
1595
+ /**
1596
+ * Load the ONNX model
1597
+ * @returns Model loading information
1598
+ */
1599
+ load(): Promise<VADModelInfo | VADWorkerModelInfo>;
1600
+ /**
1601
+ * Process a single audio chunk
1602
+ * @param audioChunk - Float32Array of exactly chunkSize samples
1603
+ * @returns VAD result with speech probability
1604
+ */
1605
+ process(audioChunk: Float32Array): Promise<VADResult$1>;
1606
+ /**
1607
+ * Reset state for new audio stream
1608
+ */
1609
+ reset(): void | Promise<void>;
1610
+ /**
1611
+ * Dispose of the model and free resources
1612
+ */
1613
+ dispose(): Promise<void>;
1614
+ /**
1615
+ * Get required chunk size in samples
1616
+ */
1617
+ getChunkSize(): number;
1618
+ /**
1619
+ * Get chunk duration in milliseconds
1620
+ */
1621
+ getChunkDurationMs(): number;
1622
+ }
1623
+ /**
1624
+ * Configuration for the Silero VAD factory
1625
+ *
1626
+ * Extends SileroVADConfig with worker-specific options.
1627
+ */
1628
+ interface SileroVADFactoryConfig extends SileroVADConfig {
1629
+ /**
1630
+ * Force worker usage (true), main thread (false), or auto-detect (undefined).
1631
+ *
1632
+ * Auto-detection behavior:
1633
+ * - Desktop: Uses Worker (better responsiveness, off-main-thread)
1634
+ * - Mobile: Uses main thread (avoids 5MB memory overhead)
1635
+ *
1636
+ * You can override this to:
1637
+ * - `true`: Force Worker even on mobile (if you have memory headroom)
1638
+ * - `false`: Force main thread even on desktop (for debugging)
1639
+ *
1640
+ * Default: undefined (auto-detect)
1641
+ */
1642
+ useWorker?: boolean;
1643
+ /**
1644
+ * Fallback to main thread on worker errors.
1645
+ *
1646
+ * When true (default), if the Worker fails to load or encounters an error,
1647
+ * the factory will automatically create a main thread instance instead.
1648
+ *
1649
+ * When false, worker errors will propagate as exceptions.
1650
+ *
1651
+ * Default: true
1652
+ */
1653
+ fallbackOnError?: boolean;
1654
+ }
1655
+ /**
1656
+ * Check if the current environment supports VAD Web Workers
1657
+ *
1658
+ * Requirements:
1659
+ * - Worker constructor must exist
1660
+ * - Blob URL support (for inline worker script)
1661
+ *
1662
+ * @returns true if VAD Worker is supported
1663
+ */
1664
+ declare function supportsVADWorker(): boolean;
1665
+ /**
1666
+ * Create a Silero VAD instance with automatic implementation selection
1667
+ *
1668
+ * This factory function automatically selects between:
1669
+ * - **SileroVADWorker**: Off-main-thread inference (better for desktop)
1670
+ * - **SileroVADInference**: Main thread inference (better for mobile)
1671
+ *
1672
+ * The selection is based on:
1673
+ * 1. Explicit `useWorker` config (if provided)
1674
+ * 2. Platform detection (mobile vs desktop)
1675
+ * 3. Worker API availability
1676
+ *
1677
+ * Both implementations share the same interface (SileroVADBackend),
1678
+ * so consumers can use either interchangeably.
1679
+ *
1680
+ * @param config - Factory configuration
1681
+ * @returns A SileroVAD instance (either Worker or main thread)
1682
+ *
1683
+ * @example
1684
+ * ```typescript
1685
+ * // Auto-detect (recommended)
1686
+ * const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
1687
+ *
1688
+ * // Force Worker
1689
+ * const vadWorker = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: true });
1690
+ *
1691
+ * // Force main thread
1692
+ * const vadMain = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: false });
1693
+ * ```
1694
+ */
1695
+ declare function createSileroVAD(config: SileroVADFactoryConfig): SileroVADBackend;
1696
+
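For completeness, a minimal sketch of a chunked processing loop against the SileroVADBackend interface declared above. The model URL and the `nextChunk()` audio source are placeholder assumptions; only the factory, `getChunkSize()`, `process()`, `reset()`, and `dispose()` calls come from the declared API.

```typescript
import { createSileroVAD } from '@omote/core';

// Feed fixed-size chunks into the VAD and log detected speech.
async function runVAD(nextChunk: () => Float32Array | null): Promise<void> {
  const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
  await vad.load();

  const chunkSize = vad.getChunkSize(); // samples expected per process() call
  console.log(`VAD expects ${chunkSize} samples (${vad.getChunkDurationMs()} ms) per chunk`);

  let chunk = nextChunk();
  while (chunk !== null) {
    if (chunk.length === chunkSize) {
      const result = await vad.process(chunk);
      if (result.isSpeech) console.log('speech', result.probability.toFixed(2));
    }
    chunk = nextChunk();
  }

  await vad.reset();    // clear internal state between streams
  await vad.dispose();  // free the ONNX session
}
```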
1697
+ /**
1698
+ * Safari Web Speech API wrapper for iOS speech recognition
1699
+ *
1700
+ * Provides a similar interface to WhisperInference for easy substitution on iOS.
1701
+ * Uses the native Web Speech API which is significantly faster than Whisper WASM on iOS.
1702
+ *
1703
+ * Key differences from WhisperInference:
1704
+ * - Real-time streaming (not batch processing)
1705
+ * - No audio buffer input (microphone handled by browser)
1706
+ * - transcribe() throws an error (use the start/stop pattern instead)
1707
+ *
1708
+ * @category Inference
1709
+ *
1710
+ * @example Basic usage
1711
+ * ```typescript
1712
+ * import { SafariSpeechRecognition, shouldUseNativeASR } from '@omote/core';
1713
+ *
1714
+ * // Use native ASR on iOS, Whisper elsewhere
1715
+ * if (shouldUseNativeASR()) {
1716
+ * const speech = new SafariSpeechRecognition({ language: 'en-US' });
1717
+ *
1718
+ * speech.onResult((result) => {
1719
+ * console.log('Transcript:', result.text);
1720
+ * });
1721
+ *
1722
+ * await speech.start();
1723
+ * // ... user speaks ...
1724
+ * const finalResult = await speech.stop();
1725
+ * }
1726
+ * ```
1727
+ *
1728
+ * @example Platform-aware initialization
1729
+ * ```typescript
1730
+ * const asr = shouldUseNativeASR()
1731
+ * ? new SafariSpeechRecognition({ language: 'en-US' })
1732
+ * : new WhisperInference({ model: 'tiny' });
1733
+ * ```
1734
+ */
1735
+ /**
1736
+ * Configuration for Safari Speech Recognition
1737
+ */
1738
+ interface SafariSpeechConfig {
1739
+ /** Language code (default: 'en-US') */
1740
+ language?: string;
1741
+ /** Continuous mode for ongoing conversation (default: true) */
1742
+ continuous?: boolean;
1743
+ /** Interim results before speech ends (default: true) */
1744
+ interimResults?: boolean;
1745
+ /** Max alternatives (default: 1) */
1746
+ maxAlternatives?: number;
1747
+ }
1748
+ /**
1749
+ * Result from speech recognition (matches WhisperInference TranscriptionResult)
1750
+ */
1751
+ interface SpeechRecognitionResult {
1752
+ /** Transcribed text */
1753
+ text: string;
1754
+ /** Detected/used language */
1755
+ language: string;
1756
+ /** Elapsed time since start() in ms (the native Web Speech API does not expose inference time) */
1757
+ inferenceTimeMs: number;
1758
+ /** Whether this is a final result or interim */
1759
+ isFinal: boolean;
1760
+ /** Confidence score (0-1) if available */
1761
+ confidence?: number;
1762
+ }
1763
+ /**
1764
+ * Callback for receiving recognition results
1765
+ */
1766
+ type SpeechResultCallback = (result: SpeechRecognitionResult) => void;
1767
+ /**
1768
+ * Callback for receiving recognition errors
1769
+ */
1770
+ type SpeechErrorCallback = (error: Error) => void;
1771
+ interface SpeechRecognitionEvent extends Event {
1772
+ resultIndex: number;
1773
+ results: SpeechRecognitionResultList;
1774
+ }
1775
+ interface SpeechRecognitionResultList {
1776
+ length: number;
1777
+ item(index: number): SpeechRecognitionResultItem;
1778
+ [index: number]: SpeechRecognitionResultItem;
1779
+ }
1780
+ interface SpeechRecognitionResultItem {
1781
+ isFinal: boolean;
1782
+ length: number;
1783
+ item(index: number): SpeechRecognitionAlternative;
1784
+ [index: number]: SpeechRecognitionAlternative;
1785
+ }
1786
+ interface SpeechRecognitionAlternative {
1787
+ transcript: string;
1788
+ confidence: number;
1789
+ }
1790
+ interface SpeechRecognitionErrorEvent extends Event {
1791
+ error: string;
1792
+ message: string;
1793
+ }
1794
+ interface SpeechRecognitionInterface extends EventTarget {
1795
+ continuous: boolean;
1796
+ interimResults: boolean;
1797
+ lang: string;
1798
+ maxAlternatives: number;
1799
+ start(): void;
1800
+ stop(): void;
1801
+ abort(): void;
1802
+ onresult: ((event: SpeechRecognitionEvent) => void) | null;
1803
+ onerror: ((event: SpeechRecognitionErrorEvent) => void) | null;
1804
+ onend: (() => void) | null;
1805
+ onstart: (() => void) | null;
1806
+ onaudiostart: (() => void) | null;
1807
+ onaudioend: (() => void) | null;
1808
+ onspeechstart: (() => void) | null;
1809
+ onspeechend: (() => void) | null;
1810
+ }
1811
+ declare global {
1812
+ interface Window {
1813
+ SpeechRecognition?: new () => SpeechRecognitionInterface;
1814
+ webkitSpeechRecognition?: new () => SpeechRecognitionInterface;
1815
+ }
1816
+ }
1817
+ /**
1818
+ * Safari Web Speech API wrapper
1819
+ *
1820
+ * Provides native speech recognition on iOS Safari.
1821
+ * Much faster than Whisper WASM and more battery-efficient.
1822
+ */
1823
+ declare class SafariSpeechRecognition {
1824
+ private config;
1825
+ private recognition;
1826
+ private isListening;
1827
+ private startTime;
1828
+ private accumulatedText;
1829
+ private resultCallbacks;
1830
+ private errorCallbacks;
1831
+ private stopResolver;
1832
+ private stopRejecter;
1833
+ constructor(config?: SafariSpeechConfig);
1834
+ /**
1835
+ * Check if Web Speech API is available
1836
+ */
1837
+ static isAvailable(): boolean;
1838
+ /**
1839
+ * Check if currently listening
1840
+ */
1841
+ get listening(): boolean;
1842
+ /**
1843
+ * Get the language being used
1844
+ */
1845
+ get language(): string;
1846
+ /**
1847
+ * Register a callback for receiving results
1848
+ */
1849
+ onResult(callback: SpeechResultCallback): void;
1850
+ /**
1851
+ * Register a callback for receiving errors
1852
+ */
1853
+ onError(callback: SpeechErrorCallback): void;
1854
+ /**
1855
+ * Remove a result callback
1856
+ */
1857
+ offResult(callback: SpeechResultCallback): void;
1858
+ /**
1859
+ * Remove an error callback
1860
+ */
1861
+ offError(callback: SpeechErrorCallback): void;
1862
+ /**
1863
+ * Start listening for speech
1864
+ *
1865
+ * On iOS Safari, this will trigger the microphone permission prompt
1866
+ * if not already granted.
1867
+ */
1868
+ start(): Promise<void>;
1869
+ /**
1870
+ * Stop listening and return the final transcript
1871
+ */
1872
+ stop(): Promise<SpeechRecognitionResult>;
1873
+ /**
1874
+ * Abort recognition without waiting for final result
1875
+ */
1876
+ abort(): void;
1877
+ /**
1878
+ * NOT SUPPORTED: Transcribe audio buffer
1879
+ *
1880
+ * Safari Speech API does not support transcribing pre-recorded audio.
1881
+ * It only works with live microphone input.
1882
+ *
1883
+ * For batch transcription on iOS, use server-side Whisper or a cloud ASR service.
1884
+ *
1885
+ * @throws Error always - this method is not supported
1886
+ */
1887
+ transcribe(_audio: Float32Array): Promise<SpeechRecognitionResult>;
1888
+ /**
1889
+ * Dispose of recognition resources
1890
+ */
1891
+ dispose(): void;
1892
+ /**
1893
+ * Set up event handlers for the recognition instance
1894
+ */
1895
+ private setupEventHandlers;
1896
+ /**
1897
+ * Emit result to all registered callbacks
1898
+ */
1899
+ private emitResult;
1900
+ /**
1901
+ * Emit error to all registered callbacks
1902
+ */
1903
+ private emitError;
1904
+ }
1905
+
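A minimal sketch of the start/stop pattern with interim results, using the API declared above. It assumes a browser context where microphone permission can be granted, and the five-second listening window is an arbitrary placeholder.

```typescript
import { SafariSpeechRecognition, shouldUseNativeASR } from '@omote/core';

// Capture one utterance via the native Web Speech API, streaming partials as they arrive.
async function captureUtterance(): Promise<string> {
  if (!shouldUseNativeASR() || !SafariSpeechRecognition.isAvailable()) {
    throw new Error('Native ASR unavailable; fall back to Whisper here');
  }

  const speech = new SafariSpeechRecognition({ language: 'en-US', interimResults: true });

  speech.onResult((result) => {
    // Interim results stream in while the user is still speaking.
    if (!result.isFinal) console.log('partial:', result.text);
  });
  speech.onError((err) => console.warn('recognition error:', err.message));

  await speech.start();
  await new Promise((resolve) => setTimeout(resolve, 5000)); // listen for ~5 s (arbitrary)
  const finalResult = await speech.stop();

  speech.dispose();
  return finalResult.text;
}
```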
1906
+ /**
1907
+ * Emotion - Helper for creating emotion vectors for avatar animation
1908
+ *
1909
+ * Provides 10 explicit emotion channels that can be used to control
1910
+ * avatar expressions and emotional states.
1911
+ *
1912
+ * @category Emotion
1913
+ *
1914
+ * @example Creating emotion vectors
1915
+ * ```typescript
1916
+ * import { createEmotionVector, EmotionPresets } from '@omote/core';
1917
+ *
1918
+ * // Named weights
1919
+ * const happy = createEmotionVector({ joy: 0.8, amazement: 0.2 });
1920
+ *
1921
+ * // Use preset
1922
+ * const surprised = EmotionPresets.surprised;
1923
+ * ```
1924
+ *
1925
+ * @example Smooth transitions
1926
+ * ```typescript
1927
+ * import { EmotionController } from '@omote/core';
1928
+ *
1929
+ * const controller = new EmotionController();
1930
+ * controller.setPreset('happy');
1931
+ * controller.transitionTo({ sadness: 0.7 }, 500);
1932
+ *
1933
+ * // In animation loop
1934
+ * controller.update();
1935
+ * const emotion = controller.emotion;
1936
+ * ```
1937
+ */
1938
+ /** The 10 explicit emotion channels */
1939
+ declare const EMOTION_NAMES: readonly ["amazement", "anger", "cheekiness", "disgust", "fear", "grief", "joy", "outofbreath", "pain", "sadness"];
1940
+ type EmotionName = typeof EMOTION_NAMES[number];
1941
+ /** Emotion weights by name */
1942
+ type EmotionWeights = Partial<Record<EmotionName, number>>;
1943
+ /** Total emotion vector size */
1944
+ declare const EMOTION_VECTOR_SIZE = 26;
1945
+ /**
1946
+ * Create an emotion vector from named weights
1947
+ *
1948
+ * @param weights - Named emotion weights (0-1)
1949
+ * @returns Float32Array of emotion values
1950
+ *
1951
+ * @example
1952
+ * ```ts
1953
+ * const emotion = createEmotionVector({ joy: 0.8, amazement: 0.3 });
1954
+ * ```
1955
+ */
1956
+ declare function createEmotionVector(weights?: EmotionWeights): Float32Array;
1957
+ /**
1958
+ * Pre-built emotion presets for common expressions
1959
+ */
1960
+ declare const EmotionPresets: {
1961
+ /** Neutral/default - no emotional expression */
1962
+ readonly neutral: Float32Array<ArrayBufferLike>;
1963
+ /** Happy - joy with slight amazement */
1964
+ readonly happy: Float32Array<ArrayBufferLike>;
1965
+ /** Sad - grief and sadness */
1966
+ readonly sad: Float32Array<ArrayBufferLike>;
1967
+ /** Angry - anger with disgust */
1968
+ readonly angry: Float32Array<ArrayBufferLike>;
1969
+ /** Surprised - high amazement */
1970
+ readonly surprised: Float32Array<ArrayBufferLike>;
1971
+ /** Scared - fear with pain */
1972
+ readonly scared: Float32Array<ArrayBufferLike>;
1973
+ /** Disgusted - disgust with anger */
1974
+ readonly disgusted: Float32Array<ArrayBufferLike>;
1975
+ /** Excited - joy with amazement and cheekiness */
1976
+ readonly excited: Float32Array<ArrayBufferLike>;
1977
+ /** Tired - out of breath with sadness */
1978
+ readonly tired: Float32Array<ArrayBufferLike>;
1979
+ /** Playful - cheekiness with joy */
1980
+ readonly playful: Float32Array<ArrayBufferLike>;
1981
+ /** Pained - pain with grief */
1982
+ readonly pained: Float32Array<ArrayBufferLike>;
1983
+ /** Contemplative - slight sadness, calm */
1984
+ readonly contemplative: Float32Array<ArrayBufferLike>;
1985
+ };
1986
+ type EmotionPresetName = keyof typeof EmotionPresets;
1987
+ /**
1988
+ * Get an emotion preset by name
1989
+ */
1990
+ declare function getEmotionPreset(name: EmotionPresetName): Float32Array;
1991
+ /**
1992
+ * Blend multiple emotion vectors together
1993
+ *
1994
+ * @param emotions - Array of { vector, weight } pairs
1995
+ * @returns Blended emotion vector
1996
+ *
1997
+ * @example
1998
+ * ```ts
1999
+ * const blended = blendEmotions([
2000
+ * { vector: EmotionPresets.happy, weight: 0.7 },
2001
+ * { vector: EmotionPresets.surprised, weight: 0.3 },
2002
+ * ]);
2003
+ * ```
2004
+ */
2005
+ declare function blendEmotions(emotions: Array<{
2006
+ vector: Float32Array;
2007
+ weight: number;
2008
+ }>): Float32Array;
2009
+ /**
2010
+ * Interpolate between two emotion vectors
2011
+ *
2012
+ * @param from - Starting emotion
2013
+ * @param to - Target emotion
2014
+ * @param t - Interpolation factor (0-1)
2015
+ * @returns Interpolated emotion vector
2016
+ */
2017
+ declare function lerpEmotion(from: Float32Array, to: Float32Array, t: number): Float32Array;
2018
+ /**
2019
+ * EmotionController - Manages emotion state with smooth transitions
2020
+ */
2021
+ declare class EmotionController {
2022
+ private currentEmotion;
2023
+ private targetEmotion;
2024
+ private transitionProgress;
2025
+ private transitionDuration;
2026
+ private transitionStartTime;
2027
+ /**
2028
+ * Get the current emotion vector
2029
+ */
2030
+ get emotion(): Float32Array;
2031
+ /**
2032
+ * Set emotion immediately (no transition)
2033
+ */
2034
+ set(weights: EmotionWeights): void;
2035
+ /**
2036
+ * Set emotion from preset immediately
2037
+ */
2038
+ setPreset(preset: EmotionPresetName): void;
2039
+ /**
2040
+ * Transition to new emotion over time
2041
+ *
2042
+ * @param weights - Target emotion weights
2043
+ * @param durationMs - Transition duration in milliseconds
2044
+ */
2045
+ transitionTo(weights: EmotionWeights, durationMs: number): void;
2046
+ /**
2047
+ * Transition to preset over time
2048
+ */
2049
+ transitionToPreset(preset: EmotionPresetName, durationMs: number): void;
2050
+ /**
2051
+ * Update transition progress (call each frame)
2052
+ */
2053
+ update(): void;
2054
+ /**
2055
+ * Check if currently transitioning
2056
+ */
2057
+ get isTransitioning(): boolean;
2058
+ /**
2059
+ * Reset to neutral
2060
+ */
2061
+ reset(): void;
2062
+ }
2063
+
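A minimal sketch of driving EmotionController from a render loop. The `applyToAvatar` callback is a placeholder for whatever the renderer does with the emotion vector; the 400 ms transition duration is arbitrary.

```typescript
import { EmotionController } from '@omote/core';

const controller = new EmotionController();
controller.setPreset('happy');

// Later, react to conversation state with a smooth 400 ms blend.
controller.transitionToPreset('contemplative', 400);

function frame(applyToAvatar: (emotion: Float32Array) => void): void {
  controller.update();                // advance any in-flight transition
  applyToAvatar(controller.emotion);  // current EMOTION_VECTOR_SIZE (26) element vector
  requestAnimationFrame(() => frame(applyToAvatar));
}
```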
2064
+ /**
2065
+ * AI Adapter Interface
2066
+ *
2067
+ * Common interface for AI backends (AWS AgentCore, OpenAI Realtime).
2068
+ * Adapters handle the conversation flow and emit events for animation.
2069
+ *
2070
+ * @category AI
2071
+ */
2072
+
2073
+ /**
2074
+ * Tenant configuration for multi-tenant isolation
2075
+ */
2076
+ interface TenantConfig {
2077
+ /** Unique tenant identifier */
2078
+ tenantId: string;
2079
+ /** Customer-specific API credentials */
2080
+ credentials: {
2081
+ apiKey?: string;
2082
+ authToken?: string;
2083
+ refreshToken?: string;
2084
+ };
2085
+ /** Character configuration for this tenant */
2086
+ characterId: string;
2087
+ /** Optional custom endpoint override */
2088
+ endpoint?: string;
2089
+ }
2090
+ /**
2091
+ * Voice configuration for TTS
2092
+ */
2093
+ interface VoiceConfig {
2094
+ /** TTS provider */
2095
+ provider: 'elevenlabs' | 'openai';
2096
+ /** Voice ID */
2097
+ voiceId: string;
2098
+ /** Stability (0-1, ElevenLabs) */
2099
+ stability?: number;
2100
+ /** Similarity boost (0-1, ElevenLabs) */
2101
+ similarityBoost?: number;
2102
+ }
2103
+ /**
2104
+ * Session configuration
2105
+ */
2106
+ interface SessionConfig {
2107
+ /** Session ID (generated or provided) */
2108
+ sessionId: string;
2109
+ /** Tenant this session belongs to */
2110
+ tenant: TenantConfig;
2111
+ /** Initial system prompt / personality */
2112
+ systemPrompt?: string;
2113
+ /** Voice configuration for TTS */
2114
+ voice?: VoiceConfig;
2115
+ /** Initial emotion state */
2116
+ emotion?: string;
2117
+ /** Language code */
2118
+ language?: string;
2119
+ }
2120
+ /**
2121
+ * Message role in conversation
2122
+ */
2123
+ type MessageRole = 'user' | 'assistant' | 'system';
2124
+ /**
2125
+ * Conversation message in session history
2126
+ */
2127
+ interface ConversationMessage {
2128
+ /** Message role */
2129
+ role: MessageRole;
2130
+ /** Text content */
2131
+ content: string;
2132
+ /** Timestamp (ms) */
2133
+ timestamp: number;
2134
+ /** Emotion detected/expressed */
2135
+ emotion?: string;
2136
+ /** Audio duration if applicable (ms) */
2137
+ audioDurationMs?: number;
2138
+ }
2139
+ /**
2140
+ * Session state
2141
+ */
2142
+ type AISessionState = 'idle' | 'listening' | 'thinking' | 'speaking' | 'interrupted' | 'error' | 'disconnected';
2143
+ /**
2144
+ * Events emitted by AI adapters
2145
+ */
2146
+ interface AIAdapterEvents {
2147
+ [key: string]: unknown;
2148
+ 'state.change': {
2149
+ state: AISessionState;
2150
+ previousState: AISessionState;
2151
+ };
2152
+ 'user.speech.start': {
2153
+ timestamp: number;
2154
+ };
2155
+ 'user.speech.end': {
2156
+ timestamp: number;
2157
+ durationMs: number;
2158
+ };
2159
+ 'user.transcript.partial': {
2160
+ text: string;
2161
+ confidence: number;
2162
+ };
2163
+ 'user.transcript.final': {
2164
+ text: string;
2165
+ confidence: number;
2166
+ };
2167
+ 'ai.thinking.start': {
2168
+ timestamp: number;
2169
+ };
2170
+ 'ai.response.start': {
2171
+ text?: string;
2172
+ emotion?: string;
2173
+ };
2174
+ 'ai.response.chunk': {
2175
+ text: string;
2176
+ isLast: boolean;
2177
+ };
2178
+ 'ai.response.end': {
2179
+ fullText: string;
2180
+ durationMs: number;
2181
+ };
2182
+ 'audio.output.chunk': {
2183
+ audio: ArrayBuffer;
2184
+ sampleRate: number;
2185
+ timestamp: number;
2186
+ };
2187
+ 'audio.output.end': {
2188
+ durationMs: number;
2189
+ };
2190
+ 'animation': AnimationEvent;
2191
+ 'memory.updated': {
2192
+ messageCount: number;
2193
+ tokenCount?: number;
2194
+ };
2195
+ 'connection.opened': {
2196
+ sessionId: string;
2197
+ adapter: string;
2198
+ };
2199
+ 'connection.closed': {
2200
+ reason: string;
2201
+ };
2202
+ 'connection.error': {
2203
+ error: Error;
2204
+ recoverable: boolean;
2205
+ };
2206
+ 'interruption.detected': {
2207
+ timestamp: number;
2208
+ };
2209
+ 'interruption.handled': {
2210
+ action: 'stop' | 'continue';
2211
+ timestamp: number;
2212
+ };
2213
+ }
2214
+ /**
2215
+ * Base interface for all AI adapters
2216
+ */
2217
+ interface AIAdapter {
2218
+ /** Adapter name for logging/debugging */
2219
+ readonly name: string;
2220
+ /** Current session state */
2221
+ readonly state: AISessionState;
2222
+ /** Current session ID (null if not connected) */
2223
+ readonly sessionId: string | null;
2224
+ /** Whether the adapter is connected */
2225
+ readonly isConnected: boolean;
2226
+ /**
2227
+ * Initialize and connect the adapter
2228
+ */
2229
+ connect(config: SessionConfig): Promise<void>;
2230
+ /**
2231
+ * Disconnect and cleanup
2232
+ */
2233
+ disconnect(): Promise<void>;
2234
+ /**
2235
+ * Push user audio for processing
2236
+ * @param audio - PCM audio data (16kHz, mono)
2237
+ */
2238
+ pushAudio(audio: Int16Array | Float32Array): void;
2239
+ /**
2240
+ * Send text message directly (bypasses STT)
2241
+ */
2242
+ sendText(text: string): Promise<void>;
2243
+ /**
2244
+ * Handle user interruption
2245
+ * Stops current AI speech and prepares for new input
2246
+ */
2247
+ interrupt(): void;
2248
+ /**
2249
+ * Get conversation history
2250
+ */
2251
+ getHistory(): ConversationMessage[];
2252
+ /**
2253
+ * Clear conversation history
2254
+ */
2255
+ clearHistory(): void;
2256
+ /**
2257
+ * Check if adapter is available/healthy
2258
+ */
2259
+ healthCheck(): Promise<boolean>;
2260
+ on<K extends keyof AIAdapterEvents>(event: K, callback: (data: AIAdapterEvents[K]) => void): () => void;
2261
+ off<K extends keyof AIAdapterEvents>(event: K, callback: (data: AIAdapterEvents[K]) => void): void;
2262
+ once<K extends keyof AIAdapterEvents>(event: K, callback: (data: AIAdapterEvents[K]) => void): () => void;
2263
+ }
2264
+
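A minimal sketch of wiring an AIAdapter instance to a renderer and a transcript UI via the declared events. `applyBlendshapes` and `showCaption` are app-specific placeholders, and the sketch assumes the `AIAdapter` and `AnimationEvent` types are exported from the package entry point.

```typescript
import type { AIAdapter, AnimationEvent } from '@omote/core';

// Subscribe to adapter events and return a teardown function.
function wireAdapter(
  adapter: AIAdapter,
  applyBlendshapes: (frame: AnimationEvent) => void,
  showCaption: (text: string) => void,
): () => void {
  const offAnimation = adapter.on('animation', applyBlendshapes);
  const offTranscript = adapter.on('user.transcript.final', ({ text }) => showCaption(text));
  const offState = adapter.on('state.change', ({ state, previousState }) => {
    console.log(`adapter state: ${previousState} -> ${state}`);
  });

  return () => {
    offAnimation();
    offTranscript();
    offState();
  };
}
```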
2265
+ /**
2266
+ * Conversation Session Interface
2267
+ *
2268
+ * Represents an active conversation with memory and state.
2269
+ *
2270
+ * @category AI
2271
+ */
2272
+
2273
+ /**
2274
+ * Serializable session snapshot for persistence
2275
+ */
2276
+ interface SessionSnapshot {
2277
+ /** Session ID */
2278
+ sessionId: string;
2279
+ /** Tenant ID */
2280
+ tenantId: string;
2281
+ /** Character ID */
2282
+ characterId: string;
2283
+ /** Conversation history */
2284
+ history: ConversationMessage[];
2285
+ /** Custom context */
2286
+ context: Record<string, string>;
2287
+ /** Emotion state */
2288
+ emotion: EmotionWeights;
2289
+ /** Creation timestamp */
2290
+ createdAt: number;
2291
+ /** Last activity timestamp */
2292
+ lastActivityAt: number;
2293
+ }
2294
+ /**
2295
+ * Extended session with memory management
2296
+ */
2297
+ interface ConversationSession {
2298
+ /** Session identifier */
2299
+ readonly sessionId: string;
2300
+ /** Associated AI adapter */
2301
+ readonly adapter: AIAdapter;
2302
+ /** Session configuration */
2303
+ readonly config: SessionConfig;
2304
+ /** Current state */
2305
+ readonly state: AISessionState;
2306
+ /** Conversation history */
2307
+ readonly history: ConversationMessage[];
2308
+ /** Current emotion state */
2309
+ readonly emotion: EmotionWeights;
2310
+ /** Session creation timestamp */
2311
+ readonly createdAt: number;
2312
+ /** Last activity timestamp */
2313
+ readonly lastActivityAt: number;
2314
+ /**
2315
+ * Start the session (connects adapter)
2316
+ */
2317
+ start(): Promise<void>;
2318
+ /**
2319
+ * End the session (disconnects adapter)
2320
+ */
2321
+ end(): Promise<void>;
2322
+ /**
2323
+ * Push audio input
2324
+ */
2325
+ pushAudio(audio: Int16Array | Float32Array): void;
2326
+ /**
2327
+ * Send text input directly
2328
+ */
2329
+ sendText(text: string): Promise<void>;
2330
+ /**
2331
+ * Interrupt current AI response
2332
+ */
2333
+ interrupt(): void;
2334
+ /**
2335
+ * Update emotion state
2336
+ */
2337
+ setEmotion(emotion: EmotionWeights): void;
2338
+ /**
2339
+ * Add a context item (custom memory)
2340
+ */
2341
+ addContext(key: string, value: string): void;
2342
+ /**
2343
+ * Remove a context item
2344
+ */
2345
+ removeContext(key: string): void;
2346
+ /**
2347
+ * Get all context items
2348
+ */
2349
+ getContext(): Record<string, string>;
2350
+ /**
2351
+ * Export session for persistence
2352
+ */
2353
+ export(): SessionSnapshot;
2354
+ /**
2355
+ * Import session from snapshot
2356
+ */
2357
+ import(snapshot: SessionSnapshot): void;
2358
+ }
2359
+
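A minimal sketch of persisting a session across page loads with `export()` / `import()`. The localStorage key and JSON serialization are choices of this example, not part of the SDK, and the sketch assumes the `ConversationSession` and `SessionSnapshot` types are exported.

```typescript
import type { ConversationSession, SessionSnapshot } from '@omote/core';

const STORAGE_KEY = 'omote.session.snapshot'; // arbitrary key for this example

function saveSession(session: ConversationSession): void {
  const snapshot: SessionSnapshot = session.export();
  localStorage.setItem(STORAGE_KEY, JSON.stringify(snapshot));
}

function restoreSession(session: ConversationSession): boolean {
  const raw = localStorage.getItem(STORAGE_KEY);
  if (!raw) return false;
  session.import(JSON.parse(raw) as SessionSnapshot);
  return true;
}
```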
2360
+ /**
2361
+ * AWS AgentCore Adapter
2362
+ *
2363
+ * Primary AI adapter for the Omote Platform.
2364
+ *
2365
+ * Pipeline:
2366
+ * User Audio -> Whisper ASR (local) -> Text
2367
+ * Text -> AgentCore (WebSocket) -> Response Text + Audio chunks (TTS handled backend-side)
2368
+ * Audio chunks -> LAM (local) -> Blendshapes -> Render
2369
+ *
2370
+ * @category AI
2371
+ */
2372
+
2373
+ /**
2374
+ * AgentCore-specific configuration
2375
+ */
2376
+ interface AgentCoreConfig {
2377
+ /** AgentCore WebSocket endpoint */
2378
+ endpoint: string;
2379
+ /** AWS region */
2380
+ region?: string;
2381
+ /** Model URLs */
2382
+ models?: {
2383
+ lamUrl?: string;
2384
+ };
2385
+ /** Enable observability */
2386
+ observability?: {
2387
+ tracing?: boolean;
2388
+ metrics?: boolean;
2389
+ };
2390
+ }
2391
+ /**
2392
+ * AWS AgentCore Adapter
2393
+ */
2394
+ declare class AgentCoreAdapter extends EventEmitter<AIAdapterEvents> implements AIAdapter {
2395
+ readonly name = "AgentCore";
2396
+ private _state;
2397
+ private _sessionId;
2398
+ private _isConnected;
2399
+ private whisper;
2400
+ private vad;
2401
+ private lam;
2402
+ private emotionController;
2403
+ private pipeline;
2404
+ private ws;
2405
+ private wsReconnectAttempts;
2406
+ private readonly maxReconnectAttempts;
2407
+ private audioBuffer;
2408
+ private history;
2409
+ private currentConfig;
2410
+ private agentCoreConfig;
2411
+ private isSpeaking;
2412
+ private currentTtsAbortController;
2413
+ private tokenCache;
2414
+ constructor(config: AgentCoreConfig);
2415
+ get state(): AISessionState;
2416
+ get sessionId(): string | null;
2417
+ get isConnected(): boolean;
2418
+ /**
2419
+ * Connect to AgentCore with session configuration
2420
+ */
2421
+ connect(config: SessionConfig): Promise<void>;
2422
+ /**
2423
+ * Disconnect and cleanup
2424
+ */
2425
+ disconnect(): Promise<void>;
2426
+ /**
2427
+ * Push user audio for processing
2428
+ */
2429
+ pushAudio(audio: Int16Array | Float32Array): void;
2430
+ /**
2431
+ * Send text directly to AgentCore
2432
+ */
2433
+ sendText(text: string): Promise<void>;
2434
+ /**
2435
+ * Interrupt current AI response
2436
+ */
2437
+ interrupt(): void;
2438
+ getHistory(): ConversationMessage[];
2439
+ clearHistory(): void;
2440
+ healthCheck(): Promise<boolean>;
2441
+ private setState;
2442
+ private getAuthToken;
2443
+ private initWhisper;
2444
+ private initLAM;
2445
+ private initPipeline;
2446
+ private connectWebSocket;
2447
+ private handleAgentCoreMessage;
2448
+ private scheduleTranscription;
2449
+ /**
2450
+ * Detect voice activity using Silero VAD
2451
+ * Falls back to simple RMS if VAD not available
2452
+ */
2453
+ private detectVoiceActivity;
2454
+ private int16ToFloat32;
2455
+ private base64ToArrayBuffer;
2456
+ private addToHistory;
2457
+ private handleDisconnect;
2458
+ }
2459
+
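A minimal sketch of connecting the adapter. The endpoint, credentials, and IDs are placeholders; real values come from backend/tenant provisioning.

```typescript
import { AgentCoreAdapter } from '@omote/core';

async function startConversation(): Promise<AgentCoreAdapter> {
  const adapter = new AgentCoreAdapter({
    endpoint: 'wss://agentcore.example.com/ws', // placeholder endpoint
    region: 'us-east-1',
  });

  await adapter.connect({
    sessionId: crypto.randomUUID(),
    tenant: {
      tenantId: 'tenant-123',
      credentials: { authToken: '<token>' },
      characterId: 'default-character',
    },
    systemPrompt: 'You are a friendly assistant.',
  });

  await adapter.sendText('Hello!');
  return adapter;
}
```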
2460
+ /**
2461
+ * Conversation Orchestrator
2462
+ *
2463
+ * Manages the conversation pipeline with AgentCore:
2464
+ * - Handles session lifecycle and tenant isolation
2465
+ * - Manages adapter events and state
2466
+ *
2467
+ * @category AI
2468
+ */
2469
+
2470
+ /**
2471
+ * Orchestrator configuration
2472
+ */
2473
+ interface OrchestratorConfig {
2474
+ /** AgentCore adapter config */
2475
+ adapter: AgentCoreConfig;
2476
+ /** Connection timeout in ms */
2477
+ connectionTimeoutMs?: number;
2478
+ /** Max retry attempts */
2479
+ maxRetries?: number;
2480
+ }
2481
+ /**
2482
+ * Orchestrator events (extends AI adapter events)
2483
+ */
2484
+ interface OrchestratorEvents extends AIAdapterEvents {
2485
+ 'session.created': {
2486
+ sessionId: string;
2487
+ tenantId: string;
2488
+ };
2489
+ 'session.ended': {
2490
+ sessionId: string;
2491
+ reason: string;
2492
+ };
2493
+ }
2494
+ /**
2495
+ * Conversation Orchestrator
2496
+ */
2497
+ declare class ConversationOrchestrator extends EventEmitter<OrchestratorEvents> {
2498
+ private config;
2499
+ private adapter;
2500
+ private sessions;
2501
+ private tenants;
2502
+ private healthCheckInterval;
2503
+ private readonly HEALTH_CHECK_INTERVAL_MS;
2504
+ constructor(config: OrchestratorConfig);
2505
+ /**
2506
+ * Register a tenant
2507
+ */
2508
+ registerTenant(tenant: TenantConfig): void;
2509
+ /**
2510
+ * Unregister a tenant
2511
+ */
2512
+ unregisterTenant(tenantId: string): void;
2513
+ /**
2514
+ * Get tenant config
2515
+ */
2516
+ getTenant(tenantId: string): TenantConfig | undefined;
2517
+ /**
2518
+ * Create a new conversation session for a tenant
2519
+ */
2520
+ createSession(tenantId: string, options?: Partial<SessionConfig>): Promise<ConversationSession>;
2521
+ /**
2522
+ * End a session
2523
+ */
2524
+ endSession(sessionId: string): Promise<void>;
2525
+ /**
2526
+ * Get session by ID
2527
+ */
2528
+ getSession(sessionId: string): ConversationSession | undefined;
2529
+ /**
2530
+ * Get all sessions for a tenant
2531
+ */
2532
+ getTenantSessions(tenantId: string): ConversationSession[];
2533
+ /**
2534
+ * Start health monitoring
2535
+ */
2536
+ startHealthMonitoring(): void;
2537
+ /**
2538
+ * Stop health monitoring
2539
+ */
2540
+ stopHealthMonitoring(): void;
2541
+ /**
2542
+ * Dispose all resources
2543
+ */
2544
+ dispose(): Promise<void>;
2545
+ private generateSessionId;
2546
+ private forwardAdapterEvents;
2547
+ private performHealthCheck;
2548
+ }
2549
+
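A minimal sketch of multi-tenant session management through the orchestrator. The endpoint, tenant data, and prompt are placeholders.

```typescript
import { ConversationOrchestrator } from '@omote/core';

const orchestrator = new ConversationOrchestrator({
  adapter: { endpoint: 'wss://agentcore.example.com/ws' }, // placeholder endpoint
  connectionTimeoutMs: 10_000,
});

orchestrator.registerTenant({
  tenantId: 'tenant-123',
  credentials: { apiKey: '<api-key>' },
  characterId: 'default-character',
});

orchestrator.on('session.created', ({ sessionId, tenantId }) => {
  console.log(`session ${sessionId} created for ${tenantId}`);
});

async function talk(): Promise<void> {
  const session = await orchestrator.createSession('tenant-123', {
    systemPrompt: 'You are a concise assistant.',
  });
  await session.start();                       // connects the adapter
  await session.sendText('Hi there!');
  await orchestrator.endSession(session.sessionId);
}
```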
2550
+ /**
2551
+ * Tenant Manager
2552
+ *
2553
+ * Handles multi-tenant isolation for the Omote Platform:
2554
+ * - Credential isolation per tenant
2555
+ * - Session scoping per tenant
2556
+ * - Quota management
2557
+ * - Token refresh
2558
+ *
2559
+ * @category AI
2560
+ */
2561
+
2562
+ /**
2563
+ * Tenant quota configuration
2564
+ */
2565
+ interface TenantQuota {
2566
+ /** Max concurrent sessions */
2567
+ maxSessions: number;
2568
+ /** Requests per minute */
2569
+ requestsPerMinute: number;
2570
+ /** Max tokens per conversation */
2571
+ maxTokensPerConversation: number;
2572
+ /** Max audio minutes per day */
2573
+ maxAudioMinutesPerDay: number;
2574
+ }
2575
+ /**
2576
+ * Tenant usage tracking
2577
+ */
2578
+ interface TenantUsage {
2579
+ /** Current active sessions */
2580
+ currentSessions: number;
2581
+ /** Requests in current minute */
2582
+ requestsThisMinute: number;
2583
+ /** Total tokens used */
2584
+ tokensUsed: number;
2585
+ /** Audio minutes used today */
2586
+ audioMinutesToday: number;
2587
+ /** Last reset timestamp */
2588
+ lastMinuteReset: number;
2589
+ /** Last daily reset timestamp */
2590
+ lastDailyReset: number;
2591
+ }
2592
+ /**
2593
+ * Token refresh callback
2594
+ */
2595
+ type TokenRefreshCallback = () => Promise<string>;
2596
+ /**
2597
+ * Tenant Manager
2598
+ */
2599
+ declare class TenantManager {
2600
+ private tenants;
2601
+ private quotas;
2602
+ private usage;
2603
+ private tokenRefreshCallbacks;
2604
+ /**
2605
+ * Default quota for new tenants
2606
+ */
2607
+ static readonly DEFAULT_QUOTA: TenantQuota;
2608
+ /**
2609
+ * Register a tenant with quota
2610
+ */
2611
+ register(tenant: TenantConfig, quota?: TenantQuota, tokenRefreshCallback?: TokenRefreshCallback): void;
2612
+ /**
2613
+ * Unregister a tenant
2614
+ */
2615
+ unregister(tenantId: string): void;
2616
+ /**
2617
+ * Get tenant config
2618
+ */
2619
+ get(tenantId: string): TenantConfig | undefined;
2620
+ /**
2621
+ * Check if tenant exists
2622
+ */
2623
+ has(tenantId: string): boolean;
2624
+ /**
2625
+ * Get all tenant IDs
2626
+ */
2627
+ getTenantIds(): string[];
2628
+ /**
2629
+ * Check if tenant can create new session
2630
+ */
2631
+ canCreateSession(tenantId: string): boolean;
2632
+ /**
2633
+ * Check if tenant can make request
2634
+ */
2635
+ canMakeRequest(tenantId: string): boolean;
2636
+ /**
2637
+ * Check if tenant can use audio
2638
+ */
2639
+ canUseAudio(tenantId: string, minutes: number): boolean;
2640
+ /**
2641
+ * Increment session count
2642
+ */
2643
+ incrementSessions(tenantId: string): void;
2644
+ /**
2645
+ * Decrement session count
2646
+ */
2647
+ decrementSessions(tenantId: string): void;
2648
+ /**
2649
+ * Record a request
2650
+ */
2651
+ recordRequest(tenantId: string): void;
2652
+ /**
2653
+ * Record token usage
2654
+ */
2655
+ recordTokens(tenantId: string, tokens: number): void;
2656
+ /**
2657
+ * Record audio usage
2658
+ */
2659
+ recordAudioMinutes(tenantId: string, minutes: number): void;
2660
+ /**
2661
+ * Get fresh auth token for tenant
2662
+ */
2663
+ getAuthToken(tenantId: string): Promise<string>;
2664
+ /**
2665
+ * Update tenant credentials
2666
+ */
2667
+ updateCredentials(tenantId: string, credentials: Partial<TenantConfig['credentials']>): void;
2668
+ /**
2669
+ * Get usage stats for tenant
2670
+ */
2671
+ getUsage(tenantId: string): TenantUsage | undefined;
2672
+ /**
2673
+ * Get quota for tenant
2674
+ */
2675
+ getQuota(tenantId: string): TenantQuota | undefined;
2676
+ /**
2677
+ * Update quota for tenant
2678
+ */
2679
+ updateQuota(tenantId: string, quota: Partial<TenantQuota>): void;
2680
+ /**
2681
+ * Reset all usage stats for a tenant
2682
+ */
2683
+ resetUsage(tenantId: string): void;
2684
+ private checkMinuteReset;
2685
+ private checkDailyReset;
2686
+ }
2687
+
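A minimal sketch of gating session creation on quota and usage. The tenant data is a placeholder; only the declared TenantManager methods are used.

```typescript
import { TenantManager } from '@omote/core';

const tenants = new TenantManager();

tenants.register(
  { tenantId: 'tenant-123', credentials: {}, characterId: 'default-character' },
  { ...TenantManager.DEFAULT_QUOTA, maxSessions: 2 }, // override one limit
);

function tryOpenSession(tenantId: string): boolean {
  if (!tenants.canCreateSession(tenantId) || !tenants.canMakeRequest(tenantId)) {
    return false; // over quota or rate limited
  }
  tenants.incrementSessions(tenantId);
  tenants.recordRequest(tenantId);
  return true;
}

function closeSession(tenantId: string): void {
  tenants.decrementSessions(tenantId);
  console.log('usage:', tenants.getUsage(tenantId));
}
```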
2688
+ /**
2689
+ * Audio Sync Manager
2690
+ *
2691
+ * Synchronizes TTS audio playback with lip sync animation:
2692
+ * - Buffers audio for inference
2693
+ * - Manages playback timing
2694
+ * - Handles audio queue for streaming
2695
+ *
2696
+ * @category AI
2697
+ */
2698
+
2699
+ /**
2700
+ * Audio sync events
2701
+ */
2702
+ interface AudioSyncEvents {
2703
+ [key: string]: unknown;
2704
+ 'buffer.ready': {
2705
+ audio: Float32Array;
2706
+ };
2707
+ 'playback.start': Record<string, never>;
2708
+ 'playback.end': Record<string, never>;
2709
+ 'sync.drift': {
2710
+ driftMs: number;
2711
+ };
2712
+ }
2713
+ /**
2714
+ * Audio sync configuration
2715
+ */
2716
+ interface AudioSyncConfig {
2717
+ /** Target sample rate (default: 16000) */
2718
+ sampleRate?: number;
2719
+ /** Buffer size for inference (default: 16640) */
2720
+ bufferSize?: number;
2721
+ /** Overlap between buffers (default: 4160) */
2722
+ overlapSize?: number;
2723
+ /** Max drift before correction (default: 100ms) */
2724
+ maxDriftMs?: number;
2725
+ }
2726
+ /**
2727
+ * Audio Sync Manager
2728
+ */
2729
+ declare class AudioSyncManager extends EventEmitter<AudioSyncEvents> {
2730
+ private config;
2731
+ private audioBuffer;
2732
+ private bufferPosition;
2733
+ private playbackQueue;
2734
+ private isPlaying;
2735
+ private audioContext;
2736
+ private playbackStartTime;
2737
+ private samplesPlayed;
2738
+ constructor(config?: AudioSyncConfig);
2739
+ /**
2740
+ * Initialize audio context
2741
+ */
2742
+ initialize(): Promise<void>;
2743
+ /**
2744
+ * Push audio chunk for processing and playback
2745
+ */
2746
+ pushAudio(audio: Float32Array): void;
2747
+ /**
2748
+ * Buffer audio for inference
2749
+ */
2750
+ private bufferForInference;
2751
+ /**
2752
+ * Start audio playback
2753
+ */
2754
+ private startPlayback;
2755
+ /**
2756
+ * Process playback queue
2757
+ */
2758
+ private processPlaybackQueue;
2759
+ /**
2760
+ * Check for audio/animation drift
2761
+ */
2762
+ private checkDrift;
2763
+ /**
2764
+ * Clear playback queue
2765
+ */
2766
+ clearQueue(): void;
2767
+ /**
2768
+ * Stop playback
2769
+ */
2770
+ stop(): void;
2771
+ /**
2772
+ * Get current playback position in seconds
2773
+ */
2774
+ getPlaybackPosition(): number;
2775
+ /**
2776
+ * Check if currently playing
2777
+ */
2778
+ getIsPlaying(): boolean;
2779
+ /**
2780
+ * Dispose resources
2781
+ */
2782
+ dispose(): void;
2783
+ }
2784
+
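A minimal sketch of feeding streamed TTS audio into the sync manager and handing inference-sized buffers to a lip-sync model. The `chunks` stream and `runLipSync` callback are placeholders for app-specific plumbing.

```typescript
import { AudioSyncManager } from '@omote/core';

async function playStreamedAudio(
  chunks: AsyncIterable<Float32Array>,
  runLipSync: (audio: Float32Array) => void,
): Promise<void> {
  const sync = new AudioSyncManager({ sampleRate: 16000 });
  await sync.initialize();

  sync.on('buffer.ready', ({ audio }) => runLipSync(audio)); // 16640-sample windows by default
  sync.on('sync.drift', ({ driftMs }) => console.warn(`audio/animation drift: ${driftMs} ms`));

  for await (const chunk of chunks) {
    sync.pushAudio(chunk); // queues playback and accumulates the inference buffer
  }

  // Stop playback and release resources when the stream ends.
  sync.stop();
  sync.dispose();
}
```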
2785
+ /**
2786
+ * Interruption Handler
2787
+ *
2788
+ * VAD-based interruption detection for AI conversations:
2789
+ * - Monitors user audio for speech
2790
+ * - Detects when user interrupts AI response
2791
+ * - Triggers interruption callbacks
2792
+ *
2793
+ * @category AI
2794
+ */
2795
+
2796
+ /**
2797
+ * Interruption events
2798
+ */
2799
+ interface InterruptionEvents {
2800
+ [key: string]: unknown;
2801
+ 'speech.detected': {
2802
+ rms: number;
2803
+ };
2804
+ 'speech.ended': {
2805
+ durationMs: number;
2806
+ };
2807
+ 'interruption.triggered': {
2808
+ rms: number;
2809
+ durationMs: number;
2810
+ };
2811
+ }
2812
+ /**
2813
+ * Interruption handler configuration
2814
+ *
2815
+ * Industry standards applied:
2816
+ * - vadThreshold: 0.5 (Silero VAD default)
2817
+ * - minSpeechDurationMs: 200ms (Google/Amazon barge-in standard)
2818
+ * - silenceTimeoutMs: 500ms (OpenAI Realtime API standard)
2819
+ */
2820
+ interface InterruptionConfig {
2821
+ /** VAD probability threshold for speech detection (default: 0.5, Silero standard) */
2822
+ vadThreshold?: number;
2823
+ /** Minimum speech duration to trigger interruption (default: 200ms, Google/Amazon standard) */
2824
+ minSpeechDurationMs?: number;
2825
+ /** Silence duration to end speech (default: 500ms, OpenAI standard) */
2826
+ silenceTimeoutMs?: number;
2827
+ /** Enable interruption detection (default: true) */
2828
+ enabled?: boolean;
2829
+ }
2830
+ /**
2831
+ * Interruption Handler
2832
+ */
2833
+ declare class InterruptionHandler extends EventEmitter<InterruptionEvents> {
2834
+ private config;
2835
+ private isSpeaking;
2836
+ private speechStartTime;
2837
+ private lastSpeechTime;
2838
+ private silenceTimer;
2839
+ private aiIsSpeaking;
2840
+ private interruptionTriggeredThisSession;
2841
+ constructor(config?: InterruptionConfig);
2842
+ /**
2843
+ * Process VAD result for interruption detection
2844
+ * @param vadProbability - Speech probability from VAD (0-1)
2845
+ * @param audioEnergy - Optional RMS energy for logging (default: 0)
2846
+ */
2847
+ processVADResult(vadProbability: number, audioEnergy?: number): void;
2848
+ /**
2849
+ * @deprecated Use processVADResult() instead. This method uses naive RMS detection.
2850
+ * Process audio samples for VAD (legacy - uses simple RMS)
2851
+ */
2852
+ processAudio(samples: Float32Array | Int16Array): void;
2853
+ /**
2854
+ * Notify that AI started speaking
2855
+ */
2856
+ setAISpeaking(speaking: boolean): void;
2857
+ /**
2858
+ * Enable/disable interruption detection
2859
+ */
2860
+ setEnabled(enabled: boolean): void;
2861
+ /**
2862
+ * Update configuration
2863
+ */
2864
+ updateConfig(config: Partial<InterruptionConfig>): void;
2865
+ /**
2866
+ * Reset state
2867
+ */
2868
+ reset(): void;
2869
+ /**
2870
+ * Get current state
2871
+ */
2872
+ getState(): {
2873
+ isSpeaking: boolean;
2874
+ speechDurationMs: number;
2875
+ };
2876
+ private calculateRMS;
2877
+ private onSpeechDetected;
2878
+ private onSilenceDetected;
2879
+ }
2880
+
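A minimal sketch of VAD-driven barge-in detection combining Silero VAD with the InterruptionHandler. The model URL and the microphone plumbing are placeholders; `onBargeIn` would typically call something like the adapter's `interrupt()`.

```typescript
import { createSileroVAD, InterruptionHandler } from '@omote/core';

async function setupBargeIn(onBargeIn: () => void) {
  const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
  await vad.load();

  const interruptions = new InterruptionHandler({
    minSpeechDurationMs: 200, // defaults shown explicitly for clarity
    silenceTimeoutMs: 500,
  });
  interruptions.on('interruption.triggered', () => onBargeIn());

  return {
    // Call while the AI is speaking so interruptions can be detected.
    setAISpeaking: (speaking: boolean) => interruptions.setAISpeaking(speaking),
    // Feed each microphone chunk through VAD, then into the handler.
    async pushChunk(chunk: Float32Array): Promise<void> {
      const result = await vad.process(chunk);
      interruptions.processVADResult(result.probability);
    },
  };
}
```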
2881
+ /**
2882
+ * Model Cache
2883
+ *
2884
+ * Caches ONNX models in IndexedDB for faster subsequent loads.
2885
+ * Unlike localStorage, IndexedDB can handle large files (hundreds of MB).
2886
+ *
2887
+ * @category Cache
2888
+ */
2889
+ /**
2890
+ * Configuration for cache size limits and eviction behavior
2891
+ */
2892
+ interface CacheConfig {
2893
+ /** Maximum total cache size in bytes (default: 1GB) */
2894
+ maxSizeBytes?: number;
2895
+ /** Maximum age in milliseconds before eviction (default: none) */
2896
+ maxAgeMs?: number;
2897
+ /** Callback when storage quota exceeds warning threshold */
2898
+ onQuotaWarning?: (info: QuotaInfo) => void;
2899
+ }
2900
+ /**
2901
+ * Storage quota information
2902
+ */
2903
+ interface QuotaInfo {
2904
+ /** Total bytes used across all origins */
2905
+ usedBytes: number;
2906
+ /** Total available quota in bytes */
2907
+ quotaBytes: number;
2908
+ /** Percentage of quota used (0-100) */
2909
+ percentUsed: number;
2910
+ /** Bytes used by omote cache specifically */
2911
+ cacheBytes: number;
2912
+ }
2913
+ /**
2914
+ * Configure cache size limits and eviction behavior
2915
+ *
2916
+ * @param config - Cache configuration options
2917
+ *
2918
+ * @example
2919
+ * ```typescript
2920
+ * import { configureCacheLimit } from '@omote/core';
2921
+ *
2922
+ * // Set 500MB limit with 24-hour max age
2923
+ * configureCacheLimit({
2924
+ * maxSizeBytes: 500 * 1024 * 1024,
2925
+ * maxAgeMs: 24 * 60 * 60 * 1000,
2926
+ * onQuotaWarning: (info) => {
2927
+ * console.warn(`Storage ${info.percentUsed.toFixed(1)}% used`);
2928
+ * }
2929
+ * });
2930
+ * ```
2931
+ */
2932
+ declare function configureCacheLimit(config: CacheConfig): void;
2933
+ /**
2934
+ * Get current cache configuration
2935
+ */
2936
+ declare function getCacheConfig(): CacheConfig;
2937
+ /**
2938
+ * Result from getWithValidation() method
2939
+ */
2940
+ interface ValidationResult {
2941
+ /** The cached data, or null if not found */
2942
+ data: ArrayBuffer | null;
2943
+ /** True if the cached data is stale (etag mismatch) */
2944
+ stale: boolean;
2945
+ }
2946
+ /**
2947
+ * Generate a version-aware cache key
2948
+ *
2949
+ * @param url - The model URL
2950
+ * @param version - Optional version string
2951
+ * @returns The cache key (url#vX.X.X if version provided, url otherwise)
2952
+ *
2953
+ * @example
2954
+ * ```typescript
2955
+ * getCacheKey('http://example.com/model.onnx', '1.0.0')
2956
+ * // Returns: 'http://example.com/model.onnx#v1.0.0'
2957
+ *
2958
+ * getCacheKey('http://example.com/model.onnx')
2959
+ * // Returns: 'http://example.com/model.onnx'
2960
+ * ```
2961
+ */
2962
+ declare function getCacheKey(url: string, version?: string): string;
2963
+ interface CacheStats {
2964
+ totalSize: number;
2965
+ modelCount: number;
2966
+ models: {
2967
+ url: string;
2968
+ size: number;
2969
+ cachedAt: Date;
2970
+ }[];
2971
+ }
2972
+ /**
2973
+ * ModelCache - IndexedDB-based cache for ONNX models
2974
+ */
2975
+ declare class ModelCache {
2976
+ private db;
2977
+ private dbPromise;
2978
+ /**
2979
+ * Initialize the cache database
2980
+ */
2981
+ private getDB;
2982
+ /**
2983
+ * Check if a model is cached
2984
+ */
2985
+ has(url: string): Promise<boolean>;
2986
+ /**
2987
+ * Get a cached model
2988
+ *
2989
+ * Updates lastAccessedAt timestamp for LRU tracking on cache hit.
2990
+ */
2991
+ get(url: string): Promise<ArrayBuffer | null>;
2992
+ /**
2993
+ * Get a cached model with ETag validation
2994
+ *
2995
+ * Validates the cached data against the server's current ETag.
2996
+ * If the cached ETag differs from the server's, the data is marked as stale.
2997
+ *
2998
+ * @param url - The cache key
2999
+ * @param originalUrl - The original URL for HEAD request (if different from cache key)
3000
+ * @returns ValidationResult with data and stale flag
3001
+ *
3002
+ * @example
3003
+ * ```typescript
3004
+ * const result = await cache.getWithValidation('http://example.com/model.onnx');
3005
+ * if (result.data && !result.stale) {
3006
+ * // Use cached data
3007
+ * } else if (result.stale) {
3008
+ * // Refetch and update cache
3009
+ * }
3010
+ * ```
3011
+ */
3012
+ getWithValidation(url: string, originalUrl?: string): Promise<ValidationResult>;
3013
+ /**
3014
+ * Store a model in cache
3015
+ *
3016
+ * After storing, triggers LRU eviction if cache exceeds size limit.
3017
+ *
3018
+ * @param url - The cache key (use getCacheKey() for versioned keys)
3019
+ * @param data - The model data
3020
+ * @param etag - Optional ETag for staleness validation
3021
+ * @param version - Optional version string for metadata
3022
+ */
3023
+ set(url: string, data: ArrayBuffer, etag?: string, version?: string): Promise<void>;
3024
+ /**
3025
+ * Check storage quota and trigger warnings/cleanup as needed
3026
+ *
3027
+ * - Logs warning if quota > 90% used
3028
+ * - Triggers LRU cleanup if quota > 95% used
3029
+ * - Calls onQuotaWarning callback if configured
3030
+ */
3031
+ private checkQuota;
3032
+ /**
3033
+ * Delete a cached model
3034
+ */
3035
+ delete(url: string): Promise<void>;
3036
+ /**
3037
+ * Clear all cached models
3038
+ */
3039
+ clear(): Promise<void>;
3040
+ /**
3041
+ * Get cache statistics
3042
+ */
3043
+ getStats(): Promise<CacheStats>;
3044
+ /**
3045
+ * Enforce cache size limit by evicting oldest entries (LRU)
3046
+ *
3047
+ * Called automatically after each set() operation.
3048
+ * Can also be called manually to trigger cleanup.
3049
+ */
3050
+ enforceLimit(): Promise<void>;
3051
+ /**
3052
+ * Evict oldest entries (by lastAccessedAt) to free space
3053
+ *
3054
+ * @param bytesToFree - Minimum bytes to free
3055
+ * @returns List of evicted URLs
3056
+ *
3057
+ * @example
3058
+ * ```typescript
3059
+ * const cache = getModelCache();
3060
+ * const evicted = await cache.evictOldest(100 * 1024 * 1024); // Free 100MB
3061
+ * console.log('Evicted:', evicted);
3062
+ * ```
3063
+ */
3064
+ evictOldest(bytesToFree: number): Promise<string[]>;
3065
+ /**
3066
+ * Get storage quota information
3067
+ *
3068
+ * Uses navigator.storage.estimate() to get quota details.
3069
+ * Returns null if the API is unavailable.
3070
+ *
3071
+ * @returns Quota info or null if unavailable
3072
+ *
3073
+ * @example
3074
+ * ```typescript
3075
+ * const cache = getModelCache();
3076
+ * const quota = await cache.getQuotaInfo();
3077
+ * if (quota) {
3078
+ * console.log(`Using ${quota.percentUsed.toFixed(1)}% of quota`);
3079
+ * }
3080
+ * ```
3081
+ */
3082
+ getQuotaInfo(): Promise<QuotaInfo | null>;
3083
+ }
3084
+ /**
3085
+ * Get the global ModelCache instance
3086
+ */
3087
+ declare function getModelCache(): ModelCache;
3088
+ /**
3089
+ * Options for fetchWithCache
3090
+ */
3091
+ interface FetchWithCacheOptions {
3092
+ /** Optional version string for versioned caching */
3093
+ version?: string;
3094
+ /** If true, validates cached data against server ETag and refetches if stale */
3095
+ validateStale?: boolean;
3096
+ /** Progress callback during download */
3097
+ onProgress?: (loaded: number, total: number) => void;
3098
+ }
3099
+ /**
3100
+ * Fetch a model with caching
3101
+ * Uses IndexedDB cache with network fallback
3102
+ * Files larger than 500MB are not cached to IndexedDB to avoid memory pressure
3103
+ * (structured clone during IndexedDB write temporarily doubles memory usage)
3104
+ *
3105
+ * @param url - The URL to fetch
3106
+ * @param optionsOrProgress - Options object, or a progress callback (legacy signature)
3107
+ * @returns The fetched ArrayBuffer
3108
+ *
3109
+ * @example
3110
+ * ```typescript
3111
+ * // Simple usage (backwards compatible)
3112
+ * const data = await fetchWithCache('http://example.com/model.onnx');
3113
+ *
3114
+ * // With progress callback (backwards compatible)
3115
+ * const data = await fetchWithCache('http://example.com/model.onnx', (loaded, total) => {
3116
+ * console.log(`${loaded}/${total} bytes`);
3117
+ * });
3118
+ *
3119
+ * // With options (new API)
3120
+ * const data = await fetchWithCache('http://example.com/model.onnx', {
3121
+ * version: '1.0.0',
3122
+ * validateStale: true,
3123
+ * onProgress: (loaded, total) => console.log(`${loaded}/${total}`)
3124
+ * });
3125
+ * ```
3126
+ */
3127
+ declare function fetchWithCache(url: string, optionsOrProgress?: FetchWithCacheOptions | ((loaded: number, total: number) => void)): Promise<ArrayBuffer>;
3128
+ /**
3129
+ * Preload models into cache without creating sessions
3130
+ */
3131
+ declare function preloadModels(urls: string[], onProgress?: (current: number, total: number, url: string) => void): Promise<void>;
3132
+ /**
3133
+ * Format bytes as human readable string
3134
+ */
3135
+ declare function formatBytes(bytes: number): string;
3136
+
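A minimal sketch of warming the model cache at app startup and reporting usage. The model URLs and the 500 MB cap are placeholder choices.

```typescript
import { preloadModels, getModelCache, formatBytes, configureCacheLimit } from '@omote/core';

async function warmModelCache(): Promise<void> {
  configureCacheLimit({ maxSizeBytes: 500 * 1024 * 1024 }); // example 500 MB cap

  await preloadModels(
    ['/models/silero-vad.onnx', '/models/lam.onnx'],
    (current, total, url) => console.log(`preloading ${current}/${total}: ${url}`),
  );

  const stats = await getModelCache().getStats();
  console.log(`${stats.modelCount} models cached, ${formatBytes(stats.totalSize)} total`);

  const quota = await getModelCache().getQuotaInfo();
  if (quota && quota.percentUsed > 80) {
    console.warn(`storage ${quota.percentUsed.toFixed(1)}% full`);
  }
}
```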
3137
+ /**
3138
+ * HuggingFace CDN Utilities
3139
+ *
3140
+ * Helper functions for working with HuggingFace CDN URLs.
3141
+ * Used by transformers.js models (Whisper, etc.) for model downloads.
3142
+ *
3143
+ * @category Cache
3144
+ */
3145
+ /**
3146
+ * Test URL for HuggingFace CDN reachability check.
3147
+ * Uses a small, stable file from a well-known public model.
3148
+ */
3149
+ declare const HF_CDN_TEST_URL = "https://huggingface.co/Xenova/whisper-tiny/resolve/main/config.json";
3150
+ /**
3151
+ * Parsed HuggingFace URL components
3152
+ */
3153
+ interface HuggingFaceUrlInfo {
3154
+ /** Organization or username */
3155
+ org: string;
3156
+ /** Model name */
3157
+ model: string;
3158
+ /** Branch, tag, or commit */
3159
+ branch: string;
3160
+ /** File path within the repository */
3161
+ file: string;
3162
+ }
3163
+ /**
3164
+ * Parse a HuggingFace CDN URL into its components
3165
+ *
3166
+ * @param url - The HuggingFace URL to parse
3167
+ * @returns Parsed URL info or null if not a valid HF URL
3168
+ *
3169
+ * @example
3170
+ * ```typescript
3171
+ * const info = parseHuggingFaceUrl(
3172
+ * 'https://huggingface.co/openai/whisper-tiny/resolve/main/model.onnx'
3173
+ * );
3174
+ * // Returns: { org: 'openai', model: 'whisper-tiny', branch: 'main', file: 'model.onnx' }
3175
+ * ```
3176
+ */
3177
+ declare function parseHuggingFaceUrl(url: string): HuggingFaceUrlInfo | null;
3178
+ /**
3179
+ * Check if HuggingFace CDN is reachable
3180
+ *
3181
+ * Performs a HEAD request to a known HuggingFace model file to verify
3182
+ * connectivity. Useful for offline detection or network diagnostics.
3183
+ *
3184
+ * @param testUrl - Optional custom URL to test (defaults to HF_CDN_TEST_URL)
3185
+ * @returns True if CDN is reachable, false otherwise
3186
+ *
3187
+ * @example
3188
+ * ```typescript
3189
+ * import { isHuggingFaceCDNReachable } from '@omote/core';
3190
+ *
3191
+ * const reachable = await isHuggingFaceCDNReachable();
3192
+ * if (!reachable) {
3193
+ * console.log('HuggingFace CDN unreachable - running offline?');
3194
+ * // Fall back to cached models or show error
3195
+ * }
3196
+ * ```
3197
+ */
3198
+ declare function isHuggingFaceCDNReachable(testUrl?: string): Promise<boolean>;
3199
+
3200
+ /**
3201
+ * Utility to clear transformers.js Cache API storage
3202
+ *
3203
+ * Problem: transformers.js v4 uses the browser Cache API, which persists across hard refreshes.
3204
+ * If an HTML error page gets cached (due to network errors, CDN issues, or dev server restarts),
3205
+ * it will be served instead of JSON files, causing JSON.parse() errors.
3206
+ *
3207
+ * Solution: Manually clear Cache API storage before loading models.
3208
+ *
3209
+ * @module utils/transformersCacheClear
3210
+ */
3211
+ /**
3212
+ * Clear all transformers.js and HuggingFace caches from Browser Cache API
3213
+ *
3214
+ * This clears:
3215
+ * - transformers-cache (default cache key)
3216
+ * - Any caches with 'transformers' or 'huggingface' in the name
3217
+ *
3218
+ * @param options Configuration options
3219
+ * @returns Promise resolving to array of deleted cache names
3220
+ */
3221
+ declare function clearTransformersCache(options?: {
3222
+ /** Whether to log deletion details (default: true) */
3223
+ verbose?: boolean;
3224
+ /** Additional cache name patterns to clear (e.g., ['my-custom-cache']) */
3225
+ additionalPatterns?: string[];
3226
+ }): Promise<string[]>;
3227
+ /**
3228
+ * Clear a specific cache by exact name
3229
+ *
3230
+ * @param cacheName Exact cache name to delete
3231
+ * @returns Promise resolving to true if deleted, false otherwise
3232
+ */
3233
+ declare function clearSpecificCache(cacheName: string): Promise<boolean>;
3234
+ /**
3235
+ * List all cache names currently stored
3236
+ *
3237
+ * @returns Promise resolving to array of cache names
3238
+ */
3239
+ declare function listCaches(): Promise<string[]>;
3240
+ /**
3241
+ * Check if a specific cached response is valid JSON/binary (not HTML error page)
3242
+ *
3243
+ * @param cacheName Cache name to check
3244
+ * @param requestUrl URL/key to check
3245
+ * @returns Promise resolving to validation result
3246
+ */
3247
+ declare function validateCachedResponse(cacheName: string, requestUrl: string): Promise<{
3248
+ exists: boolean;
3249
+ valid: boolean;
3250
+ contentType: string | null;
3251
+ isHtml: boolean;
3252
+ reason?: string;
3253
+ }>;
3254
+ /**
3255
+ * Scan all caches for potentially invalid cached responses
3256
+ *
3257
+ * @returns Promise resolving to report of invalid entries
3258
+ */
3259
+ declare function scanForInvalidCaches(): Promise<{
3260
+ totalCaches: number;
3261
+ scannedEntries: number;
3262
+ invalidEntries: Array<{
3263
+ cacheName: string;
3264
+ url: string;
3265
+ reason: string;
3266
+ }>;
3267
+ }>;
3268
+ /**
3269
+ * Clear all caches and optionally prevent re-creation (development mode)
3270
+ *
3271
+ * WARNING: This is aggressive and should only be used in development.
3272
+ * It clears ALL browser caches, not just transformers.js.
3273
+ *
3274
+ * @param preventRecreation If true, sets env.useBrowserCache = false
3275
+ * @returns Promise resolving to number of deleted caches
3276
+ */
3277
+ declare function nukeBrowserCaches(preventRecreation?: boolean): Promise<number>;
3278
+
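A minimal sketch of detecting poisoned Cache API entries (e.g. cached HTML error pages) before loading transformers.js models, and clearing the transformers caches if any are found. Only the declared utilities are used.

```typescript
import { scanForInvalidCaches, clearTransformersCache } from '@omote/core';

async function ensureCleanTransformersCache(): Promise<void> {
  const report = await scanForInvalidCaches();
  if (report.invalidEntries.length === 0) return;

  for (const entry of report.invalidEntries) {
    console.warn(`invalid cache entry in ${entry.cacheName}: ${entry.url} (${entry.reason})`);
  }

  const deleted = await clearTransformersCache({ verbose: false });
  console.log('cleared caches:', deleted);
}
```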
3279
+ /**
3280
+ * Logging types for Omote SDK
3281
+ *
3282
+ * 6-level logging system with structured output:
3283
+ * - error: Critical failures that prevent operation
3284
+ * - warn: Recoverable issues or degraded performance
3285
+ * - info: Key lifecycle events (model loaded, inference complete)
3286
+ * - debug: Detailed operational info for development
3287
+ * - trace: Fine-grained tracing for performance analysis
3288
+ * - verbose: Extremely detailed output (tensor shapes, intermediate values)
3289
+ */
3290
+ type LogLevel = 'error' | 'warn' | 'info' | 'debug' | 'trace' | 'verbose';
3291
+ /**
3292
+ * Numeric priority for log levels (lower = more severe)
3293
+ */
3294
+ declare const LOG_LEVEL_PRIORITY: Record<LogLevel, number>;
3295
+ /**
3296
+ * Structured log entry
3297
+ */
3298
+ interface LogEntry {
3299
+ /** Unix timestamp in milliseconds */
3300
+ timestamp: number;
3301
+ /** Log level */
3302
+ level: LogLevel;
3303
+ /** Module name (e.g., 'LocalInference', 'ModelCache') */
3304
+ module: string;
3305
+ /** Human-readable message */
3306
+ message: string;
3307
+ /** Optional structured data */
3308
+ data?: Record<string, unknown>;
3309
+ /** Optional error object */
3310
+ error?: Error;
3311
+ }
3312
+ /**
3313
+ * Log output sink interface
3314
+ */
3315
+ interface LogSink {
3316
+ (entry: LogEntry): void;
3317
+ }
3318
+ /**
3319
+ * Log formatter interface
3320
+ */
3321
+ interface LogFormatter {
3322
+ (entry: LogEntry): string;
3323
+ }
3324
+ /**
3325
+ * Global logging configuration
3326
+ */
3327
+ interface LoggingConfig {
3328
+ /** Minimum log level to output (default: 'info') */
3329
+ level: LogLevel;
3330
+ /** Enable/disable logging globally (default: true) */
3331
+ enabled: boolean;
3332
+ /** Output format: 'json' for structured, 'pretty' for human-readable */
3333
+ format: 'json' | 'pretty';
3334
+ /** Custom output sink (default: console) */
3335
+ sink?: LogSink;
3336
+ /** Include timestamps in output (default: true) */
3337
+ timestamps?: boolean;
3338
+ /** Include module name in output (default: true) */
3339
+ includeModule?: boolean;
3340
+ }
3341
+ /**
3342
+ * Logger interface for module-specific logging
3343
+ */
3344
+ interface ILogger {
3345
+ error(message: string, data?: Record<string, unknown>): void;
3346
+ warn(message: string, data?: Record<string, unknown>): void;
3347
+ info(message: string, data?: Record<string, unknown>): void;
3348
+ debug(message: string, data?: Record<string, unknown>): void;
3349
+ trace(message: string, data?: Record<string, unknown>): void;
3350
+ verbose(message: string, data?: Record<string, unknown>): void;
3351
+ /** Create a child logger with a sub-module name */
3352
+ child(subModule: string): ILogger;
3353
+ /** Get the module name for this logger */
3354
+ readonly module: string;
3355
+ }
3356
+ /**
3357
+ * Default configuration
3358
+ */
3359
+ declare const DEFAULT_LOGGING_CONFIG: LoggingConfig;
3360
+
3361
+ /**
3362
+ * Omote SDK Logger
3363
+ *
3364
+ * Unified logging system with:
3365
+ * - 6 log levels (error, warn, info, debug, trace, verbose)
3366
+ * - Structured JSON output for machine parsing
3367
+ * - Pretty output for human readability
3368
+ * - Module-based child loggers
3369
+ * - Runtime configuration
3370
+ * - Browser and Node.js compatible
3371
+ */
3372
+
3373
+ /**
3374
+ * Configure global logging settings
3375
+ */
3376
+ declare function configureLogging(config: Partial<LoggingConfig>): void;
3377
+ /**
3378
+ * Get current logging configuration
3379
+ */
3380
+ declare function getLoggingConfig(): LoggingConfig;
3381
+ /**
3382
+ * Reset logging configuration to defaults
3383
+ */
3384
+ declare function resetLoggingConfig(): void;
3385
+ /**
3386
+ * Set log level at runtime
3387
+ */
3388
+ declare function setLogLevel(level: LogLevel): void;
3389
+ /**
3390
+ * Enable or disable logging
3391
+ */
3392
+ declare function setLoggingEnabled(enabled: boolean): void;
3393
+ /**
3394
+ * Create a logger for a specific module
3395
+ *
3396
+ * @param module - Module name (e.g., 'LocalInference', 'ModelCache')
3397
+ * @returns Logger instance
3398
+ *
3399
+ * @example
3400
+ * ```typescript
3401
+ * const logger = createLogger('LocalInference');
3402
+ * logger.info('Model loaded', { backend: 'webgpu', loadTimeMs: 1234 });
3403
+ * ```
3404
+ */
3405
+ declare function createLogger(module: string): ILogger;
3406
+ /**
3407
+ * No-op logger for when logging is completely disabled
3408
+ */
3409
+ declare const noopLogger: ILogger;
3410
+
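A minimal sketch of routing structured logs through a custom sink and adjusting the level at runtime. The sink here simply re-serializes entries to the console; a real app might ship them to a logging backend. The child-logger naming shown in the comment is an assumption, not documented behavior.

```typescript
import { configureLogging, createLogger, setLogLevel } from '@omote/core';

configureLogging({
  level: 'debug',
  format: 'json',
  sink: (entry) => {
    // Forward the structured entry; a real sink might batch and POST these.
    console.log(JSON.stringify({
      ts: entry.timestamp,
      lvl: entry.level,
      mod: entry.module,
      msg: entry.message,
      ...entry.data,
    }));
  },
});

const log = createLogger('App');
log.info('booting', { build: 'dev' });

const vadLog = log.child('VAD'); // sub-module logger (e.g. "App/VAD"; exact format is an assumption)
vadLog.debug('loading model');

setLogLevel('warn'); // quiet things down at runtime
```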
3411
+ /**
3412
+ * Telemetry Types
3413
+ *
3414
+ * Configuration and type definitions for OpenTelemetry instrumentation.
3415
+ *
3416
+ * @category Telemetry
3417
+ */
3418
+ /**
3419
+ * Supported telemetry exporters
3420
+ */
3421
+ type TelemetryExporter = 'console' | 'otlp' | 'none';
3422
+ /**
3423
+ * Sampling configuration
3424
+ */
3425
+ interface SamplingConfig {
3426
+ /** Sampling ratio (0.0 - 1.0). Default: 1.0 (sample everything) */
3427
+ ratio?: number;
3428
+ /** Always sample errors regardless of ratio */
3429
+ alwaysSampleErrors?: boolean;
3430
+ }
3431
+ /**
3432
+ * OTLP exporter configuration
3433
+ */
3434
+ interface OTLPExporterConfig {
3435
+ /** OTLP endpoint URL (e.g., 'https://tempo.example.com/v1/traces') */
3436
+ endpoint: string;
3437
+ /** Optional headers for authentication */
3438
+ headers?: Record<string, string>;
3439
+ /** Request timeout in ms. Default: 10000 */
3440
+ timeoutMs?: number;
3441
+ }
3442
+ /**
3443
+ * Main telemetry configuration
3444
+ */
3445
+ interface TelemetryConfig {
3446
+ /** Enable/disable telemetry. Default: false */
3447
+ enabled?: boolean;
3448
+ /** Service name for spans. Default: 'omote-sdk' */
3449
+ serviceName?: string;
3450
+ /** Service version. Default: SDK version */
3451
+ serviceVersion?: string;
3452
+ /** Exporter type. Default: 'none' */
3453
+ exporter?: TelemetryExporter;
3454
+ /** OTLP exporter config (required if exporter is 'otlp') */
3455
+ exporterConfig?: OTLPExporterConfig;
3456
+ /** Sampling configuration */
3457
+ sampling?: SamplingConfig;
3458
+ /** Enable metrics collection. Default: true when telemetry enabled */
3459
+ metricsEnabled?: boolean;
3460
+ /** Metrics export interval in ms. Default: 60000 */
3461
+ metricsIntervalMs?: number;
3462
+ }
3463
+ /**
3464
+ * Span attributes for model operations
3465
+ */
3466
+ interface ModelSpanAttributes {
3467
+ /** Model URL or identifier */
3468
+ 'model.url'?: string;
3469
+ /** Model name (e.g., 'whisper', 'lam', 'silero-vad') */
3470
+ 'model.name'?: string;
3471
+ /** Inference backend used */
3472
+ 'model.backend'?: 'webgpu' | 'wasm';
3473
+ /** Whether model was loaded from cache */
3474
+ 'model.cached'?: boolean;
3475
+ /** Model size in bytes */
3476
+ 'model.size_bytes'?: number;
3477
+ }
3478
+ /**
3479
+ * Span attributes for inference operations
3480
+ */
3481
+ interface InferenceSpanAttributes extends ModelSpanAttributes {
3482
+ /** Number of input audio samples */
3483
+ 'inference.input_samples'?: number;
3484
+ /** Input duration in ms */
3485
+ 'inference.input_duration_ms'?: number;
3486
+ /** Number of output frames (for LAM) */
3487
+ 'inference.output_frames'?: number;
3488
+ /** Inference duration in ms */
3489
+ 'inference.duration_ms'?: number;
3490
+ /** Whether inference succeeded */
3491
+ 'inference.success'?: boolean;
3492
+ /** Error type if failed */
3493
+ 'inference.error_type'?: string;
3494
+ }
3495
+ /**
3496
+ * Span attributes for cache operations
3497
+ */
3498
+ interface CacheSpanAttributes {
3499
+ /** Cache key (URL) */
3500
+ 'cache.key'?: string;
3501
+ /** Whether it was a cache hit */
3502
+ 'cache.hit'?: boolean;
3503
+ /** Size of cached item in bytes */
3504
+ 'cache.size_bytes'?: number;
3505
+ /** Cache operation type */
3506
+ 'cache.operation'?: 'get' | 'set' | 'delete';
3507
+ }
3508
+ /**
3509
+ * Combined span attributes type
3510
+ */
3511
+ type SpanAttributes = ModelSpanAttributes | InferenceSpanAttributes | CacheSpanAttributes | Record<string, string | number | boolean | undefined>;
3512
+ /**
3513
+ * Metric names used by the SDK
3514
+ */
3515
+ declare const MetricNames: {
3516
+ /** Histogram: Inference latency in ms */
3517
+ readonly INFERENCE_LATENCY: "omote.inference.latency";
3518
+ /** Histogram: Model load time in ms */
3519
+ readonly MODEL_LOAD_TIME: "omote.model.load_time";
3520
+ /** Counter: Total inference operations */
3521
+ readonly INFERENCE_TOTAL: "omote.inference.total";
3522
+ /** Counter: Total errors */
3523
+ readonly ERRORS_TOTAL: "omote.errors.total";
3524
+ /** Counter: Cache hits */
3525
+ readonly CACHE_HITS: "omote.cache.hits";
3526
+ /** Counter: Cache misses */
3527
+ readonly CACHE_MISSES: "omote.cache.misses";
3528
+ };
3529
+ /**
3530
+ * Histogram buckets for inference latency (ms)
3531
+ */
3532
+ declare const INFERENCE_LATENCY_BUCKETS: number[];
3533
+ /**
3534
+ * Histogram buckets for model load time (ms)
3535
+ */
3536
+ declare const MODEL_LOAD_TIME_BUCKETS: number[];
3537
+
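A brief sketch of recording metrics against these exported names rather than raw strings; the attribute keys and values are illustrative:

```typescript
import { MetricNames, getTelemetry } from '@omote/core';

// getTelemetry() returns null until configureTelemetry() has run.
const telemetry = getTelemetry();
telemetry?.incrementCounter(MetricNames.CACHE_HITS, 1, { model: 'whisper' });
telemetry?.recordHistogram(MetricNames.INFERENCE_LATENCY, 42, { backend: 'wasm' });
```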
3538
+ /**
3539
+ * Omote Telemetry
3540
+ *
3541
+ * Main orchestrator for SDK telemetry. Manages spans, metrics, and exporters.
3542
+ *
3543
+ * @category Telemetry
3544
+ */
3545
+
3546
+ /**
3547
+ * Span context for tracing
3548
+ */
3549
+ interface SpanContext {
3550
+ traceId: string;
3551
+ spanId: string;
3552
+ parentSpanId?: string;
3553
+ }
3554
+ /**
3555
+ * Active span handle returned by startSpan
3556
+ */
3557
+ interface ActiveSpan {
3558
+ /** End the span with success status */
3559
+ end(): void;
3560
+ /** End the span with error status */
3561
+ endWithError(error: Error): void;
3562
+ /** Add attributes to the span */
3563
+ setAttributes(attrs: Partial<SpanAttributes>): void;
3564
+ /** Get the span context */
3565
+ getContext(): SpanContext;
3566
+ }
3567
+ /**
3568
+ * Configure global telemetry
3569
+ *
3570
+ * @example
3571
+ * ```typescript
3572
+ * // Development
3573
+ * configureTelemetry({
3574
+ * enabled: true,
3575
+ * serviceName: 'omote-dev',
3576
+ * exporter: 'console',
3577
+ * });
3578
+ *
3579
+ * // Production
3580
+ * configureTelemetry({
3581
+ * enabled: true,
3582
+ * serviceName: 'omote-prod',
3583
+ * exporter: 'otlp',
3584
+ * exporterConfig: {
3585
+ * endpoint: 'https://tempo.example.com',
3586
+ * },
3587
+ * sampling: { ratio: 0.1 },
3588
+ * });
3589
+ * ```
3590
+ */
3591
+ declare function configureTelemetry(config: TelemetryConfig): OmoteTelemetry;
3592
+ /**
3593
+ * Get the global telemetry instance
3594
+ */
3595
+ declare function getTelemetry(): OmoteTelemetry | null;
3596
+ /**
3597
+ * Main telemetry class
3598
+ *
3599
+ * Manages spans, metrics, and exports to configured backends.
3600
+ */
3601
+ declare class OmoteTelemetry {
3602
+ private config;
3603
+ private exporter;
3604
+ private activeTraceId;
3605
+ private metricsIntervalId;
3606
+ private counters;
3607
+ private histograms;
3608
+ constructor(config: TelemetryConfig);
3609
+ /**
3610
+ * Initialize the configured exporter
3611
+ */
3612
+ private initExporter;
3613
+ /**
3614
+ * Start periodic metrics collection
3615
+ */
3616
+ private startMetricsCollection;
3617
+ /**
3618
+ * Check if this operation should be sampled
3619
+ */
3620
+ private shouldSample;
3621
+ /**
3622
+ * Start a new span
3623
+ *
3624
+ * @example
3625
+ * ```typescript
3626
+ * const span = telemetry.startSpan('Wav2Vec2.infer', {
3627
+ * 'inference.input_samples': samples.length,
3628
+ * 'model.backend': 'webgpu',
3629
+ * });
3630
+ *
3631
+ * try {
3632
+ * const result = await doInference();
3633
+ * span.setAttributes({ 'inference.output_frames': result.frames });
3634
+ * span.end();
3635
+ * } catch (error) {
3636
+ * span.endWithError(error);
3637
+ * }
3638
+ * ```
3639
+ */
3640
+ startSpan(name: string, attributes?: Partial<SpanAttributes>, parentContext?: SpanContext): ActiveSpan;
3641
+ /**
3642
+ * Wrap an async function with a span
3643
+ *
3644
+ * @example
3645
+ * ```typescript
3646
+ * const result = await telemetry.withSpan('Model.load', async (span) => {
3647
+ * const model = await loadModel();
3648
+ * span.setAttributes({ 'model.size_bytes': model.size });
3649
+ * return model;
3650
+ * });
3651
+ * ```
3652
+ */
3653
+ withSpan<T>(name: string, fn: (span: ActiveSpan) => Promise<T>, attributes?: Partial<SpanAttributes>, parentContext?: SpanContext): Promise<T>;
3654
+ /**
3655
+ * Increment a counter metric
3656
+ *
3657
+ * @example
3658
+ * ```typescript
3659
+ * telemetry.incrementCounter('omote.inference.total', 1, {
3660
+ * model: 'wav2vec2',
3661
+ * backend: 'webgpu',
3662
+ * status: 'success',
3663
+ * });
3664
+ * ```
3665
+ */
3666
+ incrementCounter(name: string, value?: number, attributes?: Record<string, string | number | boolean>): void;
3667
+ /**
3668
+ * Record a histogram value
3669
+ *
3670
+ * @example
3671
+ * ```typescript
3672
+ * telemetry.recordHistogram('omote.inference.latency', durationMs, {
3673
+ * model: 'wav2vec2',
3674
+ * backend: 'webgpu',
3675
+ * });
3676
+ * ```
3677
+ */
3678
+ recordHistogram(name: string, value: number, attributes?: Record<string, string | number | boolean>): void;
3679
+ /**
3680
+ * Generate unique key for metric with attributes
3681
+ */
3682
+ private getMetricKey;
3683
+ /**
3684
+ * Flush accumulated metrics to exporter
3685
+ */
3686
+ private flushMetrics;
3687
+ /**
3688
+ * Force flush all pending data
3689
+ */
3690
+ flush(): Promise<void>;
3691
+ /**
3692
+ * Shutdown telemetry
3693
+ */
3694
+ shutdown(): Promise<void>;
3695
+ /**
3696
+ * Check if telemetry is enabled
3697
+ */
3698
+ isEnabled(): boolean;
3699
+ /**
3700
+ * Get current configuration
3701
+ */
3702
+ getConfig(): TelemetryConfig;
3703
+ }
3704
+
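Pulling the pieces together, a minimal lifecycle sketch: configure once, trace an operation with `withSpan`, then flush and shut down before teardown. The span name, attribute value, and URL are placeholders:

```typescript
import { configureTelemetry, getTelemetry } from '@omote/core';

// Enable console telemetry for a development session.
const telemetry = configureTelemetry({ enabled: true, exporter: 'console' });

// Wrap an async operation in a span; the attribute key follows CacheSpanAttributes.
await telemetry.withSpan('ModelCache.fetch', async (span) => {
  span.setAttributes({ 'cache.key': 'https://example.com/model.onnx' });
  return fetch('https://example.com/model.onnx');
});

// Flush buffered spans/metrics before the tab or process goes away.
await getTelemetry()?.flush();
await telemetry.shutdown();
```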
3705
+ /**
3706
+ * Console Exporter
3707
+ *
3708
+ * Exports telemetry data to the browser console for development/debugging.
3709
+ *
3710
+ * @category Telemetry
3711
+ */
3712
+
3713
+ /**
3714
+ * Span data structure for export
3715
+ */
3716
+ interface SpanData {
3717
+ name: string;
3718
+ traceId: string;
3719
+ spanId: string;
3720
+ parentSpanId?: string;
3721
+ startTime: number;
3722
+ endTime: number;
3723
+ durationMs: number;
3724
+ status: 'ok' | 'error';
3725
+ attributes: SpanAttributes;
3726
+ error?: Error;
3727
+ }
3728
+ /**
3729
+ * Metric data structure for export
3730
+ */
3731
+ interface MetricData {
3732
+ name: string;
3733
+ type: 'counter' | 'histogram';
3734
+ value: number;
3735
+ attributes: Record<string, string | number | boolean>;
3736
+ timestamp: number;
3737
+ }
3738
+ /**
3739
+ * Exporter interface that all exporters must implement
3740
+ */
3741
+ interface TelemetryExporterInterface {
3742
+ /** Export a completed span */
3743
+ exportSpan(span: SpanData): void;
3744
+ /** Export a metric */
3745
+ exportMetric(metric: MetricData): void;
3746
+ /** Flush any buffered data */
3747
+ flush(): Promise<void>;
3748
+ /** Shutdown the exporter */
3749
+ shutdown(): Promise<void>;
3750
+ }
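`TelemetryExporterInterface` is exported, but these typings only show the built-in `console` and `otlp` exporters being selected through `TelemetryConfig`; whether a custom exporter can be registered is not visible here. Purely as an illustration of the interface shape, a minimal in-memory implementation could look like this:

```typescript
import type { MetricData, SpanData, TelemetryExporterInterface } from '@omote/core';

// Illustrative only: buffers everything in memory and logs counts on flush.
class InMemoryExporter implements TelemetryExporterInterface {
  private spans: SpanData[] = [];
  private metrics: MetricData[] = [];

  exportSpan(span: SpanData): void {
    this.spans.push(span);
  }

  exportMetric(metric: MetricData): void {
    this.metrics.push(metric);
  }

  async flush(): Promise<void> {
    console.log(`flushing ${this.spans.length} spans, ${this.metrics.length} metrics`);
    this.spans = [];
    this.metrics = [];
  }

  async shutdown(): Promise<void> {
    await this.flush();
  }
}
```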
3751
+ /**
3752
+ * Console exporter for development/debugging
3753
+ *
3754
+ * Outputs spans and metrics to the browser console with formatting.
3755
+ */
3756
+ declare class ConsoleExporter implements TelemetryExporterInterface {
3757
+ private enabled;
3758
+ private prefix;
3759
+ constructor(options?: {
3760
+ enabled?: boolean;
3761
+ prefix?: string;
3762
+ });
3763
+ exportSpan(span: SpanData): void;
3764
+ exportMetric(metric: MetricData): void;
3765
+ flush(): Promise<void>;
3766
+ shutdown(): Promise<void>;
3767
+ }
3768
+
3769
+ /**
3770
+ * OTLP Exporter
3771
+ *
3772
+ * Exports telemetry data to OTLP-compatible backends (Jaeger, Tempo, etc.)
3773
+ * using the OTLP/HTTP JSON protocol.
3774
+ *
3775
+ * @category Telemetry
3776
+ */
3777
+
3778
+ /**
3779
+ * OTLP exporter for production telemetry
3780
+ *
3781
+ * Sends spans and metrics to OTLP-compatible backends like:
3782
+ * - Jaeger
3783
+ * - Grafana Tempo
3784
+ * - Honeycomb
3785
+ * - Datadog
3786
+ * - AWS X-Ray (with collector)
3787
+ */
3788
+ declare class OTLPExporter implements TelemetryExporterInterface {
3789
+ private config;
3790
+ private serviceName;
3791
+ private serviceVersion;
3792
+ private spanBuffer;
3793
+ private metricBuffer;
3794
+ private flushIntervalId;
3795
+ private readonly BUFFER_SIZE;
3796
+ private readonly FLUSH_INTERVAL_MS;
3797
+ private isShutdown;
3798
+ constructor(config: OTLPExporterConfig, serviceName?: string, serviceVersion?: string);
3799
+ exportSpan(span: SpanData): void;
3800
+ exportMetric(metric: MetricData): void;
3801
+ flush(): Promise<void>;
3802
+ shutdown(): Promise<void>;
3803
+ private exportSpans;
3804
+ private exportMetrics;
3805
+ private sendRequest;
3806
+ }
3807
+
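The earlier `configureTelemetry` example omits the optional `OTLPExporterConfig` fields; a fuller sketch with authentication headers, a request timeout, and sampling might look like the following (endpoint and token are placeholders):

```typescript
import { configureTelemetry } from '@omote/core';

configureTelemetry({
  enabled: true,
  serviceName: 'omote-prod',
  exporter: 'otlp',
  exporterConfig: {
    endpoint: 'https://tempo.example.com/v1/traces',
    headers: { Authorization: 'Bearer <token>' },
    timeoutMs: 5000,
  },
  // Sample 10% of traces, but keep every error.
  sampling: { ratio: 0.1, alwaysSampleErrors: true },
});
```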
3808
+ /**
3809
+ * Animation Graph Types
3810
+ *
3811
+ * Renderer-agnostic animation state machine with emotion and audio-driven blending.
3812
+ *
3813
+ * @module animation
3814
+ */
3815
+ /**
3816
+ * Emotion labels for animation blending
3817
+ * Note: These are the 8 emotion categories used for animation, separate from the
3818
+ * internal EmotionName type used by EmotionController.
3819
+ */
3820
+ type EmotionLabel = 'angry' | 'calm' | 'disgust' | 'fearful' | 'happy' | 'neutral' | 'sad' | 'surprised';
3821
+ /**
3822
+ * High-level animation states
3823
+ */
3824
+ type AnimationStateName = 'idle' | 'listening' | 'thinking' | 'speaking';
3825
+ /**
3826
+ * Events that trigger state transitions
3827
+ */
3828
+ type AnimationTrigger = 'user_speech_start' | 'user_speech_end' | 'transcript_ready' | 'ai_response_start' | 'ai_audio_start' | 'ai_response_end' | 'timeout' | 'interrupt';
3829
+ /**
3830
+ * Animation layer types for blending
3831
+ */
3832
+ type AnimationLayer = 'base' | 'emotion' | 'gesture' | 'additive';
3833
+ /**
3834
+ * A single animation clip reference
3835
+ */
3836
+ interface AnimationClip {
3837
+ /** Unique identifier for the clip */
3838
+ name: string;
3839
+ /** Animation layer this clip belongs to */
3840
+ layer: AnimationLayer;
3841
+ /** Whether this clip loops */
3842
+ loop: boolean;
3843
+ /** Default duration in seconds (can be overridden by actual clip) */
3844
+ duration?: number;
3845
+ }
3846
+ /**
3847
+ * Blend weight for an animation clip
3848
+ */
3849
+ interface BlendWeight {
3850
+ /** Clip name */
3851
+ clip: string;
3852
+ /** Weight 0-1 */
3853
+ weight: number;
3854
+ /** Playback speed multiplier */
3855
+ speed: number;
3856
+ /** Current time in the animation (0-1 normalized) */
3857
+ time: number;
3858
+ }
3859
+ /**
3860
+ * Animation state definition
3861
+ */
3862
+ interface AnimationState {
3863
+ /** State name */
3864
+ name: AnimationStateName;
3865
+ /** Base animation clips for this state */
3866
+ baseClips: string[];
3867
+ /** Blend weights for base clips */
3868
+ baseWeights: number[];
3869
+ /** Whether emotion overlay is enabled in this state */
3870
+ emotionBlendEnabled: boolean;
3871
+ /** Whether gesture layer is enabled in this state */
3872
+ gestureBlendEnabled: boolean;
3873
+ /** Timeout in ms to auto-transition (0 = no timeout) */
3874
+ timeout: number;
3875
+ /** State to transition to on timeout */
3876
+ timeoutTarget?: AnimationStateName;
3877
+ }
3878
+ /**
3879
+ * Transition between states
3880
+ */
3881
+ interface Transition {
3882
+ /** Source state */
3883
+ from: AnimationStateName;
3884
+ /** Target state */
3885
+ to: AnimationStateName;
3886
+ /** Event that triggers this transition */
3887
+ trigger: AnimationTrigger;
3888
+ /** Blend duration in ms */
3889
+ duration: number;
3890
+ /** Optional condition function */
3891
+ condition?: () => boolean;
3892
+ }
3893
+ /**
3894
+ * Emotion to animation mapping
3895
+ */
3896
+ interface EmotionAnimationMap {
3897
+ /** Emotion label */
3898
+ emotion: EmotionLabel;
3899
+ /** Animation clip to blend */
3900
+ clip: string;
3901
+ /** Maximum blend weight for this emotion */
3902
+ maxWeight: number;
3903
+ /** Blend speed (weight change per second) */
3904
+ blendSpeed: number;
3905
+ }
3906
+ /**
3907
+ * Configuration for AnimationGraph
3908
+ */
3909
+ interface AnimationGraphConfig {
3910
+ /** Available animation states */
3911
+ states: AnimationState[];
3912
+ /** Transitions between states */
3913
+ transitions: Transition[];
3914
+ /** Emotion to animation mappings */
3915
+ emotionMappings: EmotionAnimationMap[];
3916
+ /** Gesture clips for audio-driven animation */
3917
+ gestureClips: string[];
3918
+ /** Initial state */
3919
+ initialState: AnimationStateName;
3920
+ /** Global blend speed for state transitions (weight/sec) */
3921
+ transitionBlendSpeed: number;
3922
+ /** Minimum audio energy to trigger gestures (0-1) */
3923
+ gestureThreshold: number;
3924
+ /** Gesture intensity multiplier */
3925
+ gestureIntensity: number;
3926
+ }
3927
+ /**
3928
+ * Current output of the animation graph
3929
+ */
3930
+ interface AnimationOutput {
3931
+ /** Current state name */
3932
+ state: AnimationStateName;
3933
+ /** All blend weights to apply */
3934
+ blendWeights: BlendWeight[];
3935
+ /** Active emotion (if any) */
3936
+ activeEmotion: EmotionLabel | null;
3937
+ /** Current gesture intensity (0-1) */
3938
+ gestureIntensity: number;
3939
+ /** Whether currently transitioning between states */
3940
+ isTransitioning: boolean;
3941
+ /** Transition progress (0-1) if transitioning */
3942
+ transitionProgress: number;
3943
+ }
3944
+ /**
3945
+ * Events emitted by AnimationGraph
3946
+ */
3947
+ type AnimationGraphEvents = {
3948
+ /** State changed */
3949
+ 'state.change': {
3950
+ from: AnimationStateName;
3951
+ to: AnimationStateName;
3952
+ trigger: AnimationTrigger;
3953
+ };
3954
+ /** Transition started */
3955
+ 'transition.start': {
3956
+ from: AnimationStateName;
3957
+ to: AnimationStateName;
3958
+ duration: number;
3959
+ };
3960
+ /** Transition completed */
3961
+ 'transition.end': {
3962
+ state: AnimationStateName;
3963
+ };
3964
+ /** Emotion changed */
3965
+ 'emotion.change': {
3966
+ emotion: EmotionLabel | null;
3967
+ confidence: number;
3968
+ };
3969
+ /** Animation output updated (every frame) */
3970
+ 'output.update': AnimationOutput;
3971
+ /** Index signature for EventEmitter compatibility */
3972
+ [key: string]: unknown;
3973
+ };
3974
+ /**
3975
+ * Default animation graph configuration
3976
+ */
3977
+ declare const DEFAULT_ANIMATION_CONFIG: AnimationGraphConfig;
3978
+
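Since the `AnimationGraph` constructor declared below accepts a `Partial<AnimationGraphConfig>`, one way to customise the graph is to spread `DEFAULT_ANIMATION_CONFIG` and override selected fields. A sketch, with hypothetical clip names that would have to exist in the renderer:

```typescript
import { AnimationGraph, DEFAULT_ANIMATION_CONFIG, type EmotionAnimationMap } from '@omote/core';

// Hypothetical clip names; they must match clips loaded by your renderer.
const emotionMappings: EmotionAnimationMap[] = [
  { emotion: 'happy', clip: 'emotion_happy', maxWeight: 0.6, blendSpeed: 2 },
  { emotion: 'sad', clip: 'emotion_sad', maxWeight: 0.5, blendSpeed: 2 },
];

const graph = new AnimationGraph({
  ...DEFAULT_ANIMATION_CONFIG,
  emotionMappings,
  gestureThreshold: 0.2, // ignore low-energy audio
  gestureIntensity: 0.8,
});

// Every clip referenced by the config should be available in the renderer.
console.log(graph.getRequiredClips());
```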
3979
+ /**
3980
+ * Animation Graph
3981
+ *
3982
+ * State machine for character animation with emotion and audio-driven blending.
3983
+ * Renderer-agnostic - outputs blend weights that any 3D engine can consume.
3984
+ *
3985
+ * @example
3986
+ * ```typescript
3987
+ * import { AnimationGraph, DEFAULT_ANIMATION_CONFIG } from '@omote/core';
3988
+ *
3989
+ * const graph = new AnimationGraph(DEFAULT_ANIMATION_CONFIG);
3990
+ *
3991
+ * // Connect to voice pipeline
3992
+ * graph.on('output.update', (output) => {
3993
+ * // Apply blend weights to your 3D character
3994
+ * for (const { clip, weight } of output.blendWeights) {
3995
+ * mixer.getAction(clip).setEffectiveWeight(weight);
3996
+ * }
3997
+ * });
3998
+ *
3999
+ * // Drive from voice state
4000
+ * voiceState.on('listening', () => graph.trigger('user_speech_start'));
4001
+ * voiceState.on('thinking', () => graph.trigger('transcript_ready'));
4002
+ * voiceState.on('speaking', () => graph.trigger('ai_audio_start'));
4003
+ *
4004
+ * // Drive from emotion detection
4005
+ * emotion.on('result', ({ emotion, confidence }) => {
4006
+ * graph.setEmotion(emotion, confidence);
4007
+ * });
4008
+ *
4009
+ * // Update every frame
4010
+ * function animate(deltaTime: number) {
4011
+ * graph.update(deltaTime);
4012
+ * }
4013
+ * ```
4014
+ *
4015
+ * @module animation
4016
+ */
4017
+
4018
+ /**
4019
+ * Animation state machine with smooth blending
4020
+ */
4021
+ declare class AnimationGraph extends EventEmitter<AnimationGraphEvents> {
4022
+ private config;
4023
+ private currentState;
4024
+ private previousState;
4025
+ private isTransitioning;
4026
+ private transitionProgress;
4027
+ private transitionDuration;
4028
+ private transitionStartTime;
4029
+ private currentEmotion;
4030
+ private emotionConfidence;
4031
+ private emotionBlendWeight;
4032
+ private targetEmotionWeight;
4033
+ private audioEnergy;
4034
+ private gestureWeight;
4035
+ private currentGestureClip;
4036
+ private stateEnterTime;
4037
+ private lastUpdateTime;
4038
+ private cachedOutput;
4039
+ constructor(config?: Partial<AnimationGraphConfig>);
4040
+ /**
4041
+ * Get current state name
4042
+ */
4043
+ get state(): AnimationStateName;
4044
+ /**
4045
+ * Get current animation output
4046
+ */
4047
+ get output(): AnimationOutput;
4048
+ /**
4049
+ * Trigger an animation event (may cause state transition)
4050
+ */
4051
+ trigger(event: AnimationTrigger): boolean;
4052
+ /**
4053
+ * Set current emotion (from DistilHuBERT or manual)
4054
+ */
4055
+ setEmotion(emotion: EmotionLabel, confidence: number): void;
4056
+ /**
4057
+ * Clear current emotion
4058
+ */
4059
+ clearEmotion(): void;
4060
+ /**
4061
+ * Set audio energy for gesture animation (0-1)
4062
+ */
4063
+ setAudioEnergy(energy: number): void;
4064
+ /**
4065
+ * Force transition to a specific state
4066
+ */
4067
+ setState(stateName: AnimationStateName, blendDuration?: number): void;
4068
+ /**
4069
+ * Update animation graph (call every frame)
4070
+ * @param deltaMs Time since last update in milliseconds
4071
+ */
4072
+ update(deltaMs?: number): AnimationOutput;
4073
+ /**
4074
+ * Reset to initial state
4075
+ */
4076
+ reset(): void;
4077
+ /**
4078
+ * Get all clip names used by this graph
4079
+ */
4080
+ getRequiredClips(): string[];
4081
+ private startTransition;
4082
+ private updateTransition;
4083
+ private checkTimeout;
4084
+ private updateEmotionBlend;
4085
+ private updateGesture;
4086
+ private computeOutput;
4087
+ }
4088
+
4089
+ /**
4090
+ * Audio Energy Analysis
4091
+ *
4092
+ * Utilities for extracting energy/loudness from audio for gesture animation.
4093
+ *
4094
+ * @module animation
4095
+ */
4096
+ /**
4097
+ * Calculate RMS (Root Mean Square) energy from audio samples
4098
+ * @param samples Audio samples (Float32Array, normalized -1 to 1)
4099
+ * @returns RMS energy value (0 to 1)
4100
+ */
4101
+ declare function calculateRMS(samples: Float32Array): number;
4102
+ /**
4103
+ * Calculate peak amplitude from audio samples
4104
+ * @param samples Audio samples (Float32Array, normalized -1 to 1)
4105
+ * @returns Peak amplitude (0 to 1)
4106
+ */
4107
+ declare function calculatePeak(samples: Float32Array): number;
4108
+ /**
4109
+ * Smoothed energy analyzer for gesture animation
4110
+ */
4111
+ declare class AudioEnergyAnalyzer {
4112
+ private smoothedRMS;
4113
+ private smoothedPeak;
4114
+ private readonly smoothingFactor;
4115
+ private readonly noiseFloor;
4116
+ /**
4117
+ * @param smoothingFactor How much to smooth (0 = no smoothing, 1 = infinite smoothing). Default 0.85
4118
+ * @param noiseFloor Minimum energy threshold to consider as signal. Default 0.01
4119
+ */
4120
+ constructor(smoothingFactor?: number, noiseFloor?: number);
4121
+ /**
4122
+ * Process audio samples and return smoothed energy values
4123
+ * @param samples Audio samples (Float32Array)
4124
+ * @returns Object with rms and peak values
4125
+ */
4126
+ process(samples: Float32Array): {
4127
+ rms: number;
4128
+ peak: number;
4129
+ energy: number;
4130
+ };
4131
+ /**
4132
+ * Reset analyzer state
4133
+ */
4134
+ reset(): void;
4135
+ /**
4136
+ * Get current smoothed RMS value
4137
+ */
4138
+ get rms(): number;
4139
+ /**
4140
+ * Get current smoothed peak value
4141
+ */
4142
+ get peak(): number;
4143
+ }
4144
+ /**
4145
+ * Extract emphasis points from audio (for gesture timing)
4146
+ *
4147
+ * Detects sudden increases in energy that correspond to speech emphasis.
4148
+ */
4149
+ declare class EmphasisDetector {
4150
+ private energyHistory;
4151
+ private readonly historySize;
4152
+ private readonly emphasisThreshold;
4153
+ /**
4154
+ * @param historySize Number of frames to track. Default 10
4155
+ * @param emphasisThreshold Minimum energy increase to count as emphasis. Default 0.15
4156
+ */
4157
+ constructor(historySize?: number, emphasisThreshold?: number);
4158
+ /**
4159
+ * Process energy value and detect emphasis
4160
+ * @param energy Current energy value (0-1)
4161
+ * @returns Object with isEmphasis flag and emphasisStrength
4162
+ */
4163
+ process(energy: number): {
4164
+ isEmphasis: boolean;
4165
+ emphasisStrength: number;
4166
+ };
4167
+ /**
4168
+ * Reset detector state
4169
+ */
4170
+ reset(): void;
4171
+ }
4172
+
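These analysis utilities can feed `AnimationGraph.setAudioEnergy`. A sketch of wiring them to a per-frame audio callback, where `onAudioFrame` is a hypothetical hook invoked with each captured audio buffer:

```typescript
import { AnimationGraph, AudioEnergyAnalyzer, EmphasisDetector } from '@omote/core';

const graph = new AnimationGraph();
const analyzer = new AudioEnergyAnalyzer(0.85, 0.01); // default smoothing and noise floor
const emphasis = new EmphasisDetector();

// Call with each audio frame (e.g. from an AudioWorklet processor).
function onAudioFrame(samples: Float32Array): void {
  const { energy } = analyzer.process(samples);
  graph.setAudioEnergy(energy);

  const { isEmphasis, emphasisStrength } = emphasis.process(energy);
  if (isEmphasis) {
    // Hook for gesture timing, e.g. nudging gesture intensity on stressed syllables.
    console.debug('emphasis detected', emphasisStrength);
  }
}
```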
4173
+ export { type AIAdapter, type AIAdapterEvents, type AISessionState, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, type AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendEvent, type BackendPreference, type BlendWeight, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, DEFAULT_ANIMATION_CONFIG, DEFAULT_LOGGING_CONFIG, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionEvent, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, type GazeEvent, HF_CDN_TEST_URL, type HuggingFaceUrlInfo, type ILogger, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, LOG_LEVEL_PRIORITY, type LogEntry, type LogFormatter, type LogLevel, type LogSink, type LoggingConfig, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, type OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, type QuotaInfo, RingBuffer, type RuntimeBackend, type STTFinalEvent, type STTPartialEvent, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SessionConfig, type SessionOptions, type SessionSnapshot, type SessionStateEvent, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TTSEndEvent, type TTSMarkEvent, type TTSStartEvent, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type TranscriptionResult, type Transition, type VADBackend, type VADModelInfo, type VADResult$1 as VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VisemeEvent, type VoiceConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, type WhisperConfig, WhisperInference, type WhisperModel, blendEmotions, calculatePeak, calculateRMS, clearSpecificCache, clearTransformersCache, configureCacheLimit, configureLogging, configureTelemetry, createEmotionVector, createLogger, createSessionWithFallback, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getLoggingConfig, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isHuggingFaceCDNReachable, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpEmotion, listCaches, noopLogger, nukeBrowserCaches, 
parseHuggingFaceUrl, preloadModels, resetLoggingConfig, resolveBackend, scanForInvalidCaches, setLogLevel, setLoggingEnabled, shouldEnableWasmProxy, shouldUseNativeASR, shouldUseServerLipSync, supportsVADWorker, validateCachedResponse };