@omote/core 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +462 -207
- package/dist/index.d.ts +462 -207
- package/dist/index.js +560 -211
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +552 -202
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.d.mts
CHANGED
@@ -112,6 +112,17 @@ declare class AudioScheduler {
      * to avoid browser autoplay policy issues (requires user gesture).
      */
     initialize(): Promise<void>;
+    /**
+     * Eagerly create and warm up the AudioContext
+     *
+     * Call this when a playback session starts (e.g., when AI response begins).
+     * The AudioContext needs time to initialize the audio hardware — on Windows
+     * this can take 50-100ms. By warming up early (before audio data arrives),
+     * the context is fully ready when schedule() is first called.
+     *
+     * Must be called after a user gesture (click/tap) for autoplay policy.
+     */
+    warmup(): Promise<void>;
     /**
      * Ensure AudioContext is created and ready
      * Called lazily on first schedule() - requires user gesture
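In practice the new `warmup()` hook is meant to be called from a user-gesture handler at the start of a playback session. A minimal sketch (the surrounding wiring is illustrative, not taken from the package docs):

```typescript
import type { AudioScheduler } from '@omote/core';

// Call from inside a click/tap handler: warmup() needs a user gesture
// to satisfy the browser's autoplay policy.
export async function onPlaybackSessionStart(scheduler: AudioScheduler): Promise<void> {
  // Pre-spin the AudioContext so the first schedule() call does not pay
  // the 50-100ms hardware start-up cost described above.
  await scheduler.warmup();
}
```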
@@ -157,6 +168,7 @@ declare class AudioScheduler {
     cancelAll(fadeOutMs?: number): Promise<void>;
     /**
      * Reset scheduler state for new playback session
+     * Stops any orphaned sources that weren't cleaned up by cancelAll()
      */
     reset(): void;
     /**
@@ -342,6 +354,26 @@ declare function getOptimalWasmThreads(): number;
  * @returns true if proxy mode is safe to enable
  */
 declare function shouldEnableWasmProxy(): boolean;
+/**
+ * Detect Safari browser on any platform (macOS + iOS)
+ *
+ * Safari WebKit has bugs with ONNX Runtime's WebGPU multithreaded JSEP build
+ * that crash session creation. Both iOS and macOS Safari are affected.
+ *
+ * @returns true if running in Safari on any platform
+ */
+declare function isSafari(): boolean;
+/**
+ * Recommend using CPU-optimized lip sync model (wav2arkit_cpu)
+ *
+ * Safari (macOS + iOS) has WebGPU JSEP bugs that crash ONNX Runtime,
+ * and the 384MB LAM model stack-overflows in WASM mode.
+ * The wav2arkit_cpu model (1.8MB) provides identical 52 ARKit blendshape
+ * output at 22x real-time on CPU/WASM.
+ *
+ * @returns true if on Safari (should use CPU lip sync model)
+ */
+declare function shouldUseCpuLipSync(): boolean;
 /**
  * Check if Web Speech API is available in the browser
  *
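These two detectors are what the new lip sync factory keys off. A rough sketch of using them directly (model paths reuse the ones from the JSDoc examples later in this diff; the helper itself is illustrative):

```typescript
import { isSafari, shouldUseCpuLipSync } from '@omote/core';

// Model paths taken from the JSDoc examples elsewhere in this release.
const GPU_MODEL_URL = '/models/unified_wav2vec2_asr_a2e.onnx'; // 384MB, WebGPU
const CPU_MODEL_URL = '/models/wav2arkit_cpu.onnx';            // 1.8MB, WASM

// Pick a model URL based on the Safari/WebGPU situation described above.
export function pickLipSyncModelUrl(): string {
  return shouldUseCpuLipSync() ? CPU_MODEL_URL : GPU_MODEL_URL;
}

console.log('Safari:', isSafari(), '-> lip sync model:', pickLipSyncModelUrl());
```

The `createLipSync()` factory added later in this diff wraps the same decision, so most callers will not need to branch by hand.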
@@ -377,200 +409,62 @@ declare function shouldUseNativeASR(): boolean;
 declare function shouldUseServerLipSync(): boolean;
 
 /**
- *
+ * Common interface for lip sync inference backends
  *
- *
- *
+ * Both Wav2Vec2Inference (GPU, 384MB) and Wav2ArkitCpuInference (CPU, 1.8MB)
+ * implement this interface, allowing SyncedAudioPipeline and LAMPipeline to
+ * work with either model transparently.
  *
- *
- * 2. Loading the WebGPU bundle wastes bandwidth and can cause issues
- * 3. WASM-only bundle is smaller and more reliable on iOS
- *
- * Usage:
- * ```typescript
- * const ort = await getOnnxRuntime('wasm'); // Load WASM-only bundle
- * const ort = await getOnnxRuntime('webgpu'); // Load WebGPU bundle (includes WASM)
- * ```
- *
- * @module inference/onnxLoader
+ * @category Inference
  */
 
-type OrtModule = {
-    InferenceSession: typeof InferenceSession;
-    Tensor: typeof Tensor;
-    env: Env;
-};
-type SessionOptions = InferenceSession.SessionOptions;
-
-/**
- * Check if WebGPU is available and likely to work
- *
- * This is more thorough than just checking navigator.gpu exists.
- * It actually requests an adapter to verify the GPU is accessible.
- *
- * @returns true if WebGPU is available and working
- */
-declare function isWebGPUAvailable(): Promise<boolean>;
-/**
- * Load ONNX Runtime with the specified backend
- *
- * This lazily loads the appropriate bundle:
- * - 'wasm': Loads onnxruntime-web (WASM-only, smaller)
- * - 'webgpu': Loads onnxruntime-web/webgpu (includes WebGPU + WASM fallback)
- *
- * Once loaded, the same instance is reused for all subsequent calls.
- * If you need to switch backends, you must reload the page.
- *
- * @param backend The backend to load ('webgpu' or 'wasm')
- * @returns The ONNX Runtime module
- */
-declare function getOnnxRuntime(backend: RuntimeBackend): Promise<OrtModule>;
-/**
- * Get the appropriate ONNX Runtime based on user preference
- *
- * This resolves the user's preference against platform capabilities
- * and loads the appropriate bundle.
- *
- * @param preference User's backend preference
- * @returns The ONNX Runtime module and the resolved backend
- */
-declare function getOnnxRuntimeForPreference(preference?: BackendPreference): Promise<{
-    ort: OrtModule;
-    backend: RuntimeBackend;
-}>;
 /**
- *
- *
- * This returns optimized session options based on the backend and platform.
- *
- * @param backend The backend being used
- * @returns Session options for InferenceSession.create()
+ * Model loading information returned by load()
  */
-
-/**
- * Create an inference session with automatic fallback
- *
- * If WebGPU session creation fails, automatically falls back to WASM.
- *
- * @param modelBuffer The model data as ArrayBuffer
- * @param preferredBackend The preferred backend
- * @returns The created session and the backend used
- */
-declare function createSessionWithFallback(modelBuffer: ArrayBuffer, preferredBackend: RuntimeBackend): Promise<{
-    session: InferenceSession;
+interface LipSyncModelInfo {
     backend: RuntimeBackend;
-}>;
-/**
- * Get the currently loaded backend (if any)
- */
-declare function getLoadedBackend(): RuntimeBackend | null;
-/**
- * Check if ONNX Runtime has been loaded
- */
-declare function isOnnxRuntimeLoaded(): boolean;
-
-/**
- * Unified Wav2Vec2 inference engine for Audio-to-Expression + ASR
- *
- * Runs entirely in the browser using WebGPU or WASM.
- * Takes raw 16kHz audio and outputs:
- * - 52 ARKit blendshapes (lip sync)
- * - 32-token CTC logits (speech recognition)
- *
- * @category Inference
- *
- * @example Basic usage
- * ```typescript
- * import { Wav2Vec2Inference } from '@omote/core';
- *
- * const wav2vec = new Wav2Vec2Inference({ modelUrl: '/models/unified_wav2vec2_asr_a2e.onnx' });
- * await wav2vec.load();
- *
- * // Process 1 second of audio (16kHz = 16000 samples)
- * const result = await wav2vec.infer(audioSamples);
- *
- * console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
- * console.log('ASR text:', result.text); // Decoded transcription
- * ```
- */
-
-type InferenceBackend = BackendPreference;
-interface Wav2Vec2InferenceConfig {
-    /** Path or URL to the ONNX model */
-    modelUrl: string;
-    /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
-    backend?: InferenceBackend;
-    /** Number of identity classes (default: 12 for streaming model) */
-    numIdentityClasses?: number;
-}
-interface ModelInfo {
-    backend: 'webgpu' | 'wasm';
     loadTimeMs: number;
     inputNames: string[];
     outputNames: string[];
 }
 /**
- *
- *
+ * Result from lip sync inference
+ *
+ * All implementations must return blendshapes in LAM_BLENDSHAPES order (alphabetical).
+ * Models with different native orderings must remap internally before returning.
  */
-
-/**
-declare const CTC_VOCAB: string[];
-interface Wav2Vec2Result {
-    /** Blendshape weights [frames, 52] - 30fps */
+interface LipSyncResult {
+    /** Blendshape weights [frames, 52] in LAM_BLENDSHAPES order - 30fps */
     blendshapes: Float32Array[];
-    /**
-
-    /** Decoded text from CTC */
-    text: string;
-    /** Number of A2E frames (30fps) */
-    numA2EFrames: number;
-    /** Number of ASR frames (50fps) */
-    numASRFrames: number;
+    /** Number of blendshape frames */
+    numFrames: number;
     /** Inference time in ms */
     inferenceTimeMs: number;
 }
-
-
-
-
-
-
-
-
-
-
-
-
- */
-    static isWebGPUAvailable: typeof isWebGPUAvailable;
-    get backend(): 'webgpu' | 'wasm' | null;
-    get isLoaded(): boolean;
+/**
+ * Common interface for lip sync inference engines
+ *
+ * Implemented by:
+ * - Wav2Vec2Inference (WebGPU/WASM, 384MB, ASR + lip sync)
+ * - Wav2ArkitCpuInference (WASM-only, 1.8MB, lip sync only)
+ */
+interface LipSyncBackend {
+    /** Current backend type (webgpu, wasm, or null if not loaded) */
+    readonly backend: RuntimeBackend | null;
+    /** Whether the model is loaded and ready for inference */
+    readonly isLoaded: boolean;
     /**
      * Load the ONNX model
+     * @returns Model loading information
      */
-    load(): Promise<
+    load(): Promise<LipSyncModelInfo>;
     /**
      * Run inference on raw audio
-     * @param audioSamples - Float32Array of raw audio at 16kHz
-     * @param identityIndex - Optional identity index (
-     *
-     * Note: Model expects 1-second chunks (16000 samples) for optimal performance.
-     * Audio will be zero-padded or truncated to 16000 samples.
+     * @param audioSamples - Float32Array of raw audio at 16kHz
+     * @param identityIndex - Optional identity index (ignored by CPU model)
+     * @returns Lip sync result with blendshapes in LAM_BLENDSHAPES order
      */
-    infer(audioSamples: Float32Array, identityIndex?: number): Promise<
-    /**
-     * Decode CTC logits to text using greedy decoding
-     */
-    private decodeCTC;
-    /**
-     * Queue inference to serialize ONNX session calls
-     */
-    private queueInference;
-    /**
-     * Get blendshape value by name for a specific frame
-     */
-    getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
+    infer(audioSamples: Float32Array, identityIndex?: number): Promise<LipSyncResult>;
     /**
      * Dispose of the model and free resources
      */
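With `LipSyncBackend` in place, pipeline code can be written once against the interface rather than against a concrete engine. A minimal sketch (the helper is illustrative, not part of the package):

```typescript
import type { LipSyncBackend, LipSyncResult } from '@omote/core';

// Works the same whether `backend` is Wav2Vec2Inference or Wav2ArkitCpuInference.
export async function runLipSync(
  backend: LipSyncBackend,
  audioSamples: Float32Array, // raw 16kHz mono samples
): Promise<LipSyncResult> {
  if (!backend.isLoaded) {
    const info = await backend.load();
    console.log(`lip sync ready on ${info.backend} in ${info.loadTimeMs.toFixed(0)}ms`);
  }
  // Frames come back in LAM_BLENDSHAPES order regardless of the model.
  return backend.infer(audioSamples);
}
```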
@@ -642,7 +536,7 @@ declare class LAMPipeline {
      * @param timestamp - AudioContext time when these samples start playing
      * @param lam - LAM inference engine
      */
-    push(samples: Float32Array, timestamp: number, lam:
+    push(samples: Float32Array, timestamp: number, lam: LipSyncBackend): Promise<void>;
     /**
      * Process accumulated buffer through LAM inference
      */
@@ -693,7 +587,7 @@ declare class LAMPipeline {
      *
      * @param lam - LAM inference engine
      */
-    flush(lam:
+    flush(lam: LipSyncBackend): Promise<void>;
     /**
      * Adjust all queued frame timestamps by an offset
      *
@@ -710,25 +604,25 @@
 }
 
 /**
- * SyncedAudioPipeline -
+ * SyncedAudioPipeline - Audio playback + LAM lip sync coordinator
  *
  * Orchestrates the complete pipeline for synchronized audio playback and lip sync:
  * 1. Network chunks → Coalescer → Optimized buffers
- * 2. Audio buffers → Scheduler → Gapless playback
- * 3. Audio buffers → LAM Pipeline → Blendshape frames
+ * 2. Audio buffers → Scheduler → Gapless playback (immediate, never blocks)
+ * 3. Audio buffers → LAM Pipeline → Blendshape frames (background, fire-and-forget)
  * 4. Frames synchronized to AudioContext clock → Renderer
  *
- * Key Architecture Pattern:
- * -
- * -
- * -
- * -
+ * Key Architecture Pattern: Audio-First, LAM-Background
+ * - Audio chunks are scheduled for playback immediately (never waits for LAM)
+ * - LAM inference runs in background without blocking the audio path
+ * - Lip sync starts ~1 second after audio (LAM needs 16000 samples to infer)
+ * - Once LAM catches up, frames stay synchronized to AudioContext clock
  *
- * This
- *
+ * This decoupled design prevents LAM inference (50-300ms) from blocking audio
+ * scheduling, which caused audible stuttering when audio arrived as a continuous
+ * stream (e.g., single-call TTS from ElevenLabs via AgentCore).
  *
  * @see https://web.dev/articles/audio-scheduling (Web Audio clock patterns)
- * @see https://developer.chrome.com/blog/audio-worklet-design-pattern (Ring buffer patterns)
  * @category Audio
  */
 
@@ -738,14 +632,14 @@ interface SyncedAudioPipelineOptions {
     /** Target chunk duration in ms for coalescing (default: 200) */
     chunkTargetMs?: number;
     /** LAM inference engine */
-    lam:
+    lam: LipSyncBackend;
 }
 interface SyncedAudioPipelineEvents {
     /** New frame ready for display */
     frame_ready: Float32Array;
     /** Playback has completed */
     playback_complete: void;
-    /** First
+    /** First audio chunk scheduled, playback starting */
     playback_start: number;
     /** Error occurred */
     error: Error;
@@ -757,8 +651,7 @@ declare class SyncedAudioPipeline extends EventEmitter<SyncedAudioPipelineEvents
     private scheduler;
     private coalescer;
     private lamPipeline;
-    private
-    private bufferedChunks;
+    private playbackStarted;
     private monitorInterval;
     private frameAnimationId;
     constructor(options: SyncedAudioPipelineOptions);
@@ -770,31 +663,19 @@ declare class SyncedAudioPipeline extends EventEmitter<SyncedAudioPipelineEvents
      * Start a new playback session
      *
      * Resets all state and prepares for incoming audio chunks.
-     *
+     * Audio will be scheduled immediately as chunks arrive (no buffering).
      */
     start(): void;
     /**
      * Receive audio chunk from network
      *
-     *
-     *
-     *
-     * - Audio scheduling waits until first LAM completes
-     * - Then all buffered audio is scheduled together with LAM frames
+     * Audio-first design: schedules audio immediately, LAM runs in background.
+     * This prevents LAM inference (50-300ms) from blocking audio scheduling,
+     * which caused audible stuttering with continuous audio streams.
      *
      * @param chunk - Uint8Array containing Int16 PCM audio
      */
     onAudioChunk(chunk: Uint8Array): Promise<void>;
-    /**
-     * Handle first LAM inference completion
-     *
-     * This is the critical synchronization point:
-     * - LAM frames are now ready in the queue
-     * - Schedule all buffered audio chunks
-     * - Adjust LAM frame timestamps to match actual schedule time
-     * - Audio and LAM start playing together, perfectly synchronized
-     */
-    private onFirstLAMComplete;
     /**
      * End of audio stream
      *
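Put together, the audio-first flow looks roughly like this. The event wiring assumes the package's `EventEmitter` exposes an `on()` method, the chunk source is a placeholder, and signalling end-of-stream is omitted because that method sits outside this hunk:

```typescript
import { SyncedAudioPipeline, type LipSyncBackend } from '@omote/core';

export function playStream(lam: LipSyncBackend, chunks: AsyncIterable<Uint8Array>) {
  const pipeline = new SyncedAudioPipeline({ lam });

  // Blendshape frames arrive already aligned to the AudioContext clock.
  pipeline.on('frame_ready', (frame: Float32Array) => {
    // apply the 52 weights to the avatar here
  });
  pipeline.on('playback_start', (t: number) => console.log('audio starts at', t));

  pipeline.start();
  void (async () => {
    for await (const chunk of chunks) {
      // Int16 PCM bytes: scheduled immediately, LAM inference runs in background.
      await pipeline.onAudioChunk(chunk);
    }
  })();

  return pipeline;
}
```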
@@ -840,8 +721,7 @@ declare class SyncedAudioPipeline extends EventEmitter<SyncedAudioPipelineEvents
      * Get current pipeline state (for debugging/monitoring)
      */
     getState(): {
-
-        bufferedChunks: number;
+        playbackStarted: boolean;
         coalescerFill: number;
         lamFill: number;
         queuedFrames: number;
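While tuning synchronization it can help to poll the reshaped `getState()`; a small illustrative monitor:

```typescript
import type { SyncedAudioPipeline } from '@omote/core';

// Log pipeline health once per second; returns a function that stops the timer.
export function monitorPipeline(pipeline: SyncedAudioPipeline): () => void {
  const id = setInterval(() => {
    const s = pipeline.getState();
    console.log(
      `playbackStarted=${s.playbackStarted} coalescerFill=${s.coalescerFill} ` +
        `lamFill=${s.lamFill} queuedFrames=${s.queuedFrames}`,
    );
  }, 1000);
  return () => clearInterval(id);
}
```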
@@ -854,6 +734,99 @@ declare class SyncedAudioPipeline extends EventEmitter<SyncedAudioPipelineEvents
     dispose(): void;
 }
 
+/**
+ * Lazy ONNX Runtime loader with conditional WebGPU/WASM bundle loading
+ *
+ * This module provides a way to dynamically load the appropriate ONNX Runtime bundle
+ * based on the platform's capabilities. This is critical for iOS support because:
+ *
+ * 1. iOS Safari has WebGPU API but ONNX Runtime's WebGPU backend crashes
+ * 2. Loading the WebGPU bundle wastes bandwidth and can cause issues
+ * 3. WASM-only bundle is smaller and more reliable on iOS
+ *
+ * Usage:
+ * ```typescript
+ * const ort = await getOnnxRuntime('wasm'); // Load WASM-only bundle
+ * const ort = await getOnnxRuntime('webgpu'); // Load WebGPU bundle (includes WASM)
+ * ```
+ *
+ * @module inference/onnxLoader
+ */
+
+type OrtModule = {
+    InferenceSession: typeof InferenceSession;
+    Tensor: typeof Tensor;
+    env: Env;
+};
+type SessionOptions = InferenceSession.SessionOptions;
+
+/**
+ * Check if WebGPU is available and likely to work
+ *
+ * This is more thorough than just checking navigator.gpu exists.
+ * It actually requests an adapter to verify the GPU is accessible.
+ *
+ * @returns true if WebGPU is available and working
+ */
+declare function isWebGPUAvailable(): Promise<boolean>;
+/**
+ * Load ONNX Runtime with the specified backend
+ *
+ * This lazily loads the appropriate bundle:
+ * - 'wasm': Loads onnxruntime-web (WASM-only, smaller)
+ * - 'webgpu': Loads onnxruntime-web/webgpu (includes WebGPU + WASM fallback)
+ *
+ * Once loaded, the same instance is reused for all subsequent calls.
+ * If you need to switch backends, you must reload the page.
+ *
+ * @param backend The backend to load ('webgpu' or 'wasm')
+ * @returns The ONNX Runtime module
+ */
+declare function getOnnxRuntime(backend: RuntimeBackend): Promise<OrtModule>;
+/**
+ * Get the appropriate ONNX Runtime based on user preference
+ *
+ * This resolves the user's preference against platform capabilities
+ * and loads the appropriate bundle.
+ *
+ * @param preference User's backend preference
+ * @returns The ONNX Runtime module and the resolved backend
+ */
+declare function getOnnxRuntimeForPreference(preference?: BackendPreference): Promise<{
+    ort: OrtModule;
+    backend: RuntimeBackend;
+}>;
+/**
+ * Get session options for creating an inference session
+ *
+ * This returns optimized session options based on the backend and platform.
+ *
+ * @param backend The backend being used
+ * @returns Session options for InferenceSession.create()
+ */
+declare function getSessionOptions(backend: RuntimeBackend): SessionOptions;
+/**
+ * Create an inference session with automatic fallback
+ *
+ * If WebGPU session creation fails, automatically falls back to WASM.
+ *
+ * @param modelBuffer The model data as ArrayBuffer
+ * @param preferredBackend The preferred backend
+ * @returns The created session and the backend used
+ */
+declare function createSessionWithFallback(modelBuffer: ArrayBuffer, preferredBackend: RuntimeBackend): Promise<{
+    session: InferenceSession;
+    backend: RuntimeBackend;
+}>;
+/**
+ * Get the currently loaded backend (if any)
+ */
+declare function getLoadedBackend(): RuntimeBackend | null;
+/**
+ * Check if ONNX Runtime has been loaded
+ */
+declare function isOnnxRuntimeLoaded(): boolean;
+
 /**
  * Whisper Automatic Speech Recognition using transformers.js
  * Uses Xenova's proven pipeline API for reliable transcription
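These loader declarations are relocated rather than new; per their JSDoc, typical usage looks roughly like this (the model URL is a placeholder, and `'auto'` is assumed to be a valid `BackendPreference` value):

```typescript
import {
  getOnnxRuntimeForPreference,
  createSessionWithFallback,
  getLoadedBackend,
} from '@omote/core';

async function loadModel(modelUrl: string) {
  // Resolve the preference against platform capabilities and load that bundle.
  const { backend } = await getOnnxRuntimeForPreference('auto');
  console.log('ONNX Runtime bundle loaded for backend:', backend);

  // Create a session, falling back to WASM if WebGPU session creation fails.
  const buffer = await fetch(modelUrl).then((r) => r.arrayBuffer());
  const { session, backend: used } = await createSessionWithFallback(buffer, backend);
  console.log('session created on', used, '(loaded backend:', getLoadedBackend(), ')');
  return session;
}
```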
@@ -982,6 +955,288 @@ declare class WhisperInference {
     private removeNonSpeechTokens;
 }
 
+/**
+ * Shared blendshape constants and utilities for lip sync inference
+ *
+ * Contains LAM_BLENDSHAPES (canonical ordering), symmetrization, and
+ * index remapping used by both Wav2Vec2Inference and Wav2ArkitCpuInference.
+ *
+ * This module is the single source of truth for blendshape ordering to
+ * avoid circular dependencies between inference classes.
+ *
+ * @category Inference
+ */
+/**
+ * LAM model blendshape names in order (52 total)
+ * NOTE: This is alphabetical ordering used by LAM, different from standard ARKit order
+ */
+declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
+/** Alias for backwards compatibility */
+declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
+/**
+ * Symmetrize blendshapes by averaging left/right pairs
+ * From LAM official postprocessing (models/utils.py)
+ * This fixes asymmetric output from the raw model
+ */
+declare function symmetrizeBlendshapes(frame: Float32Array): Float32Array;
+/**
+ * wav2arkit_cpu model blendshape ordering
+ *
+ * Indices 0-24 match LAM_BLENDSHAPES, but 25+ diverge:
+ * - LAM puts jawRight, mouthClose, mouthDimpleLeft, mouthDimpleRight at 25-28
+ * - wav2arkit_cpu puts mouthFrownLeft at 25 and moves those four to 48-51
+ */
+declare const WAV2ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "jawRight"];
+/**
+ * Remap a blendshape frame from wav2arkit_cpu ordering to LAM_BLENDSHAPES ordering
+ *
+ * @param frame - Float32Array of 52 blendshape values in wav2arkit_cpu order
+ * @returns Float32Array of 52 blendshape values in LAM_BLENDSHAPES order
+ */
+declare function remapWav2ArkitToLam(frame: Float32Array): Float32Array;
+
+/**
+ * Unified Wav2Vec2 inference engine for Audio-to-Expression + ASR
+ *
+ * Runs entirely in the browser using WebGPU or WASM.
+ * Takes raw 16kHz audio and outputs:
+ * - 52 ARKit blendshapes (lip sync)
+ * - 32-token CTC logits (speech recognition)
+ *
+ * @category Inference
+ *
+ * @example Basic usage
+ * ```typescript
+ * import { Wav2Vec2Inference } from '@omote/core';
+ *
+ * const wav2vec = new Wav2Vec2Inference({ modelUrl: '/models/unified_wav2vec2_asr_a2e.onnx' });
+ * await wav2vec.load();
+ *
+ * // Process 1 second of audio (16kHz = 16000 samples)
+ * const result = await wav2vec.infer(audioSamples);
+ *
+ * console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
+ * console.log('ASR text:', result.text); // Decoded transcription
+ * ```
+ */
+
+type InferenceBackend = BackendPreference;
+interface Wav2Vec2InferenceConfig {
+    /** Path or URL to the ONNX model */
+    modelUrl: string;
+    /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
+    backend?: InferenceBackend;
+    /** Number of identity classes (default: 12 for streaming model) */
+    numIdentityClasses?: number;
+}
+interface ModelInfo {
+    backend: 'webgpu' | 'wasm';
+    loadTimeMs: number;
+    inputNames: string[];
+    outputNames: string[];
+}
+
+/** CTC vocabulary (32 tokens from wav2vec2-base-960h) */
+declare const CTC_VOCAB: string[];
+interface Wav2Vec2Result {
+    /** Blendshape weights [frames, 52] - 30fps */
+    blendshapes: Float32Array[];
+    /** Raw CTC logits [frames, 32] - 50fps */
+    asrLogits: Float32Array[];
+    /** Decoded text from CTC */
+    text: string;
+    /** Number of blendshape frames (30fps) — alias for numA2EFrames */
+    numFrames: number;
+    /** Number of A2E frames (30fps) */
+    numA2EFrames: number;
+    /** Number of ASR frames (50fps) */
+    numASRFrames: number;
+    /** Inference time in ms */
+    inferenceTimeMs: number;
+}
+declare class Wav2Vec2Inference {
+    private session;
+    private ort;
+    private config;
+    private _backend;
+    private isLoading;
+    private numIdentityClasses;
+    private inferenceQueue;
+    constructor(config: Wav2Vec2InferenceConfig);
+    /**
+     * Check if WebGPU is available and working
+     * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
+     */
+    static isWebGPUAvailable: typeof isWebGPUAvailable;
+    get backend(): 'webgpu' | 'wasm' | null;
+    get isLoaded(): boolean;
+    /**
+     * Load the ONNX model
+     */
+    load(): Promise<ModelInfo>;
+    /**
+     * Run inference on raw audio
+     * @param audioSamples - Float32Array of raw audio at 16kHz (16000 samples = 1 second)
+     * @param identityIndex - Optional identity index (0-11, default 0 = neutral)
+     *
+     * Note: Model expects 1-second chunks (16000 samples) for optimal performance.
+     * Audio will be zero-padded or truncated to 16000 samples.
+     */
+    infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
+    /**
+     * Decode CTC logits to text using greedy decoding
+     */
+    private decodeCTC;
+    /**
+     * Queue inference to serialize ONNX session calls
+     */
+    private queueInference;
+    /**
+     * Get blendshape value by name for a specific frame
+     */
+    getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
+    /**
+     * Dispose of the model and free resources
+     */
+    dispose(): Promise<void>;
+}
+
+/**
+ * CPU-optimized lip sync inference using wav2arkit_cpu model
+ *
+ * A lightweight (1.8MB) alternative to Wav2Vec2Inference (384MB) designed
+ * for Safari/iOS where WebGPU crashes due to ONNX Runtime JSEP bugs.
+ *
+ * Key differences from Wav2Vec2Inference:
+ * - WASM-only backend (CPU-optimized, no WebGPU)
+ * - 1.8MB model vs 384MB
+ * - No identity input (baked to identity 11)
+ * - No ASR output (lip sync only)
+ * - Dynamic input length (not fixed to 16000 samples)
+ * - Different native blendshape ordering (remapped to LAM_BLENDSHAPES)
+ *
+ * @category Inference
+ *
+ * @example
+ * ```typescript
+ * import { Wav2ArkitCpuInference } from '@omote/core';
+ *
+ * const lam = new Wav2ArkitCpuInference({
+ *   modelUrl: '/models/wav2arkit_cpu.onnx',
+ * });
+ * await lam.load();
+ *
+ * const { blendshapes } = await lam.infer(audioSamples);
+ * // blendshapes: Float32Array[] in LAM_BLENDSHAPES order, 30fps
+ * ```
+ */
+
+interface Wav2ArkitCpuConfig {
+    /** Path or URL to the wav2arkit_cpu ONNX model */
+    modelUrl: string;
+    /** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
+    backend?: BackendPreference;
+}
+declare class Wav2ArkitCpuInference implements LipSyncBackend {
+    private session;
+    private ort;
+    private config;
+    private _backend;
+    private isLoading;
+    private inferenceQueue;
+    constructor(config: Wav2ArkitCpuConfig);
+    get backend(): RuntimeBackend | null;
+    get isLoaded(): boolean;
+    /**
+     * Load the ONNX model
+     */
+    load(): Promise<LipSyncModelInfo>;
+    /**
+     * Run inference on raw audio
+     *
+     * Accepts variable-length audio (not fixed to 16000 samples).
+     * Output frames = ceil(30 * numSamples / 16000).
+     *
+     * @param audioSamples - Float32Array of raw audio at 16kHz
+     * @param _identityIndex - Ignored (identity 11 is baked into the model)
+     */
+    infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
+    /**
+     * Queue inference to serialize ONNX session calls
+     */
+    private queueInference;
+    /**
+     * Dispose of the model and free resources
+     */
+    dispose(): Promise<void>;
+}
+
+/**
+ * Factory function for lip sync with automatic GPU/CPU model selection
+ *
+ * Provides a unified API that automatically selects the optimal model:
+ * - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (1.8MB, WASM)
+ * - Chrome/Firefox/Edge: Uses Wav2Vec2Inference (384MB, WebGPU)
+ * - Fallback: Gracefully falls back to CPU model if GPU model fails to load
+ *
+ * @category Inference
+ *
+ * @example Auto-detect (recommended)
+ * ```typescript
+ * import { createLipSync } from '@omote/core';
+ *
+ * const lam = createLipSync({
+ *   gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
+ *   cpuModelUrl: '/models/wav2arkit_cpu.onnx',
+ * });
+ *
+ * await lam.load();
+ * const { blendshapes } = await lam.infer(audioSamples);
+ * ```
+ *
+ * @example Force CPU model
+ * ```typescript
+ * const lam = createLipSync({
+ *   gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
+ *   cpuModelUrl: '/models/wav2arkit_cpu.onnx',
+ *   mode: 'cpu',
+ * });
+ * ```
+ */
+
+/**
+ * Configuration for the lip sync factory
+ */
+interface CreateLipSyncConfig {
+    /** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge) */
+    gpuModelUrl: string;
+    /** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS) */
+    cpuModelUrl: string;
+    /**
+     * Model selection mode:
+     * - 'auto': Safari/iOS → CPU, everything else → GPU (default)
+     * - 'gpu': Force GPU model (Wav2Vec2Inference)
+     * - 'cpu': Force CPU model (Wav2ArkitCpuInference)
+     */
+    mode?: 'auto' | 'gpu' | 'cpu';
+    /** Backend preference for GPU model (default: 'auto') */
+    gpuBackend?: BackendPreference;
+    /** Number of identity classes for GPU model (default: 12) */
+    numIdentityClasses?: number;
+    /**
+     * Fall back to CPU model if GPU model fails to load (default: true)
+     * Only applies when mode is 'auto' or 'gpu'
+     */
+    fallbackOnError?: boolean;
+}
+/**
+ * Create a lip sync instance with automatic GPU/CPU model selection
+ *
+ * @param config - Factory configuration
+ * @returns A LipSyncBackend instance (either GPU or CPU model)
+ */
+declare function createLipSync(config: CreateLipSyncConfig): LipSyncBackend;
+
 /**
  * Silero VAD (Voice Activity Detection) inference
  *
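Because every backend now reports frames in `LAM_BLENDSHAPES` order, a single weight can be read by index; a small illustrative helper (the `rawFrame` declaration is just a stand-in for one frame returned by `infer()`):

```typescript
import { LAM_BLENDSHAPES, symmetrizeBlendshapes } from '@omote/core';

// Look up one weight in a 52-value frame that is in LAM_BLENDSHAPES order.
export function weightOf(
  frame: Float32Array,
  name: (typeof LAM_BLENDSHAPES)[number],
): number {
  return frame[LAM_BLENDSHAPES.indexOf(name)];
}

declare const rawFrame: Float32Array; // one frame from a LipSyncResult
const frame = symmetrizeBlendshapes(rawFrame); // average left/right pairs
console.log('jawOpen =', weightOf(frame, 'jawOpen'));
```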
@@ -3809,4 +4064,4 @@ declare class EmphasisDetector {
     reset(): void;
 }
 
-export { type AIAdapter, type AIAdapterEvents, type AISessionState, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, DEFAULT_ANIMATION_CONFIG, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, HF_CDN_TEST_URL, type HuggingFaceUrlInfo, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, type QuotaInfo, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SessionConfig, type SessionOptions, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type TranscriptionResult, type Transition, type VADBackend, type VADModelInfo, type VADResult$1 as VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, type WhisperConfig, WhisperInference, type WhisperModel, blendEmotions, calculatePeak, calculateRMS, clearSpecificCache, clearTransformersCache, configureCacheLimit, configureTelemetry, createEmotionVector, createSessionWithFallback, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isHuggingFaceCDNReachable, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpEmotion, listCaches, nukeBrowserCaches, parseHuggingFaceUrl, preloadModels, resolveBackend, scanForInvalidCaches, shouldEnableWasmProxy, shouldUseNativeASR, shouldUseServerLipSync, supportsVADWorker, validateCachedResponse };
+export { type AIAdapter, type AIAdapterEvents, type AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, type CreateLipSyncConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, HF_CDN_TEST_URL, type HuggingFaceUrlInfo, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, type LipSyncBackend, type LipSyncModelInfo, type LipSyncResult, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, type QuotaInfo, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SessionConfig, type SessionOptions, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type TranscriptionResult, type Transition, type VADBackend, type VADModelInfo, type VADResult$1 as VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, WAV2ARKIT_BLENDSHAPES, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, type WhisperConfig, WhisperInference, type WhisperModel, blendEmotions, calculatePeak, calculateRMS, clearSpecificCache, clearTransformersCache, configureCacheLimit, configureTelemetry, createEmotionVector, createLipSync, createSessionWithFallback, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isHuggingFaceCDNReachable, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpEmotion, listCaches, nukeBrowserCaches, parseHuggingFaceUrl, preloadModels, remapWav2ArkitToLam, resolveBackend, scanForInvalidCaches, shouldEnableWasmProxy, shouldUseCpuLipSync, shouldUseNativeASR, shouldUseServerLipSync, supportsVADWorker, symmetrizeBlendshapes, validateCachedResponse };