@omote/core 0.1.3 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +463 -207
- package/dist/index.d.ts +463 -207
- package/dist/index.js +542 -186
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +534 -178
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
@@ -112,6 +112,17 @@ declare class AudioScheduler {
      * to avoid browser autoplay policy issues (requires user gesture).
      */
     initialize(): Promise<void>;
+    /**
+     * Eagerly create and warm up the AudioContext
+     *
+     * Call this when a playback session starts (e.g., when AI response begins).
+     * The AudioContext needs time to initialize the audio hardware — on Windows
+     * this can take 50-100ms. By warming up early (before audio data arrives),
+     * the context is fully ready when schedule() is first called.
+     *
+     * Must be called after a user gesture (click/tap) for autoplay policy.
+     */
+    warmup(): Promise<void>;
     /**
      * Ensure AudioContext is created and ready
      * Called lazily on first schedule() - requires user gesture
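
The new `warmup()` method is meant to be called at the start of a playback session, after `initialize()` has satisfied the autoplay policy. A minimal sketch of that flow; the button wiring and constructor options are assumptions, only `initialize()` and `warmup()` come from the declarations above:

```typescript
import { AudioScheduler } from '@omote/core';

// Assumed app wiring: a start button provides the required user gesture.
const startButton = document.querySelector<HTMLButtonElement>('#start')!;
const scheduler = new AudioScheduler(/* AudioSchedulerOptions, if your setup needs them */);

startButton.addEventListener('click', async () => {
  await scheduler.initialize(); // user gesture satisfies the autoplay policy
  // Warm up before any audio data arrives so the context is ready
  // when schedule() is first called (AudioContext startup can take 50-100ms).
  await scheduler.warmup();
});
```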
@@ -157,6 +168,7 @@ declare class AudioScheduler {
     cancelAll(fadeOutMs?: number): Promise<void>;
     /**
      * Reset scheduler state for new playback session
+     * Stops any orphaned sources that weren't cleaned up by cancelAll()
      */
     reset(): void;
     /**
@@ -342,6 +354,27 @@ declare function getOptimalWasmThreads(): number;
  * @returns true if proxy mode is safe to enable
  */
 declare function shouldEnableWasmProxy(): boolean;
+/**
+ * Detect Safari browser on any platform (macOS + iOS)
+ *
+ * Safari WebKit has bugs with ONNX Runtime's WebGPU multithreaded JSEP build
+ * that crash session creation. Both iOS and macOS Safari are affected.
+ *
+ * @returns true if running in Safari on any platform
+ */
+declare function isSafari(): boolean;
+/**
+ * Recommend using CPU-optimized lip sync model (wav2arkit_cpu)
+ *
+ * All WebKit browsers (Safari macOS, Safari iOS, Chrome iOS, Firefox iOS)
+ * have ONNX Runtime WebGPU JSEP bugs that crash session creation, and the
+ * 384MB LAM model stack-overflows in WASM mode.
+ * The wav2arkit_cpu model (1.8MB) provides identical 52 ARKit blendshape
+ * output at 22x real-time on CPU/WASM.
+ *
+ * @returns true if on Safari or any iOS browser (should use CPU lip sync model)
+ */
+declare function shouldUseCpuLipSync(): boolean;
 /**
  * Check if Web Speech API is available in the browser
  *
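
`shouldUseCpuLipSync()` is the decision point for choosing the 1.8MB wav2arkit_cpu model over the 384MB GPU model. A sketch of using it together with the `createLipSync` factory added later in this diff; the model paths are placeholders taken from the @example blocks below:

```typescript
import { shouldUseCpuLipSync, createLipSync } from '@omote/core';

const lam = createLipSync({
  gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx', // placeholder path
  cpuModelUrl: '/models/wav2arkit_cpu.onnx',            // placeholder path
  // 'auto' already routes Safari/iOS to the CPU model; spelling it out here
  // only makes the decision explicit.
  mode: shouldUseCpuLipSync() ? 'cpu' : 'auto',
});
await lam.load();
```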
@@ -377,200 +410,62 @@ declare function shouldUseNativeASR(): boolean;
 declare function shouldUseServerLipSync(): boolean;
 
 /**
- *
+ * Common interface for lip sync inference backends
  *
- *
- *
+ * Both Wav2Vec2Inference (GPU, 384MB) and Wav2ArkitCpuInference (CPU, 1.8MB)
+ * implement this interface, allowing SyncedAudioPipeline and LAMPipeline to
+ * work with either model transparently.
  *
- *
- * 2. Loading the WebGPU bundle wastes bandwidth and can cause issues
- * 3. WASM-only bundle is smaller and more reliable on iOS
- *
- * Usage:
- * ```typescript
- * const ort = await getOnnxRuntime('wasm'); // Load WASM-only bundle
- * const ort = await getOnnxRuntime('webgpu'); // Load WebGPU bundle (includes WASM)
- * ```
- *
- * @module inference/onnxLoader
+ * @category Inference
  */
 
-type OrtModule = {
-    InferenceSession: typeof InferenceSession;
-    Tensor: typeof Tensor;
-    env: Env;
-};
-type SessionOptions = InferenceSession.SessionOptions;
-
-/**
- * Check if WebGPU is available and likely to work
- *
- * This is more thorough than just checking navigator.gpu exists.
- * It actually requests an adapter to verify the GPU is accessible.
- *
- * @returns true if WebGPU is available and working
- */
-declare function isWebGPUAvailable(): Promise<boolean>;
-/**
- * Load ONNX Runtime with the specified backend
- *
- * This lazily loads the appropriate bundle:
- * - 'wasm': Loads onnxruntime-web (WASM-only, smaller)
- * - 'webgpu': Loads onnxruntime-web/webgpu (includes WebGPU + WASM fallback)
- *
- * Once loaded, the same instance is reused for all subsequent calls.
- * If you need to switch backends, you must reload the page.
- *
- * @param backend The backend to load ('webgpu' or 'wasm')
- * @returns The ONNX Runtime module
- */
-declare function getOnnxRuntime(backend: RuntimeBackend): Promise<OrtModule>;
-/**
- * Get the appropriate ONNX Runtime based on user preference
- *
- * This resolves the user's preference against platform capabilities
- * and loads the appropriate bundle.
- *
- * @param preference User's backend preference
- * @returns The ONNX Runtime module and the resolved backend
- */
-declare function getOnnxRuntimeForPreference(preference?: BackendPreference): Promise<{
-    ort: OrtModule;
-    backend: RuntimeBackend;
-}>;
 /**
- *
- *
- * This returns optimized session options based on the backend and platform.
- *
- * @param backend The backend being used
- * @returns Session options for InferenceSession.create()
+ * Model loading information returned by load()
  */
-
-/**
- * Create an inference session with automatic fallback
- *
- * If WebGPU session creation fails, automatically falls back to WASM.
- *
- * @param modelBuffer The model data as ArrayBuffer
- * @param preferredBackend The preferred backend
- * @returns The created session and the backend used
- */
-declare function createSessionWithFallback(modelBuffer: ArrayBuffer, preferredBackend: RuntimeBackend): Promise<{
-    session: InferenceSession;
+interface LipSyncModelInfo {
     backend: RuntimeBackend;
-}>;
-/**
- * Get the currently loaded backend (if any)
- */
-declare function getLoadedBackend(): RuntimeBackend | null;
-/**
- * Check if ONNX Runtime has been loaded
- */
-declare function isOnnxRuntimeLoaded(): boolean;
-
-/**
- * Unified Wav2Vec2 inference engine for Audio-to-Expression + ASR
- *
- * Runs entirely in the browser using WebGPU or WASM.
- * Takes raw 16kHz audio and outputs:
- * - 52 ARKit blendshapes (lip sync)
- * - 32-token CTC logits (speech recognition)
- *
- * @category Inference
- *
- * @example Basic usage
- * ```typescript
- * import { Wav2Vec2Inference } from '@omote/core';
- *
- * const wav2vec = new Wav2Vec2Inference({ modelUrl: '/models/unified_wav2vec2_asr_a2e.onnx' });
- * await wav2vec.load();
- *
- * // Process 1 second of audio (16kHz = 16000 samples)
- * const result = await wav2vec.infer(audioSamples);
- *
- * console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
- * console.log('ASR text:', result.text); // Decoded transcription
- * ```
- */
-
-type InferenceBackend = BackendPreference;
-interface Wav2Vec2InferenceConfig {
-    /** Path or URL to the ONNX model */
-    modelUrl: string;
-    /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
-    backend?: InferenceBackend;
-    /** Number of identity classes (default: 12 for streaming model) */
-    numIdentityClasses?: number;
-}
-interface ModelInfo {
-    backend: 'webgpu' | 'wasm';
     loadTimeMs: number;
     inputNames: string[];
     outputNames: string[];
 }
 /**
- *
- *
+ * Result from lip sync inference
+ *
+ * All implementations must return blendshapes in LAM_BLENDSHAPES order (alphabetical).
+ * Models with different native orderings must remap internally before returning.
  */
-
-/**
-declare const CTC_VOCAB: string[];
-interface Wav2Vec2Result {
-    /** Blendshape weights [frames, 52] - 30fps */
+interface LipSyncResult {
+    /** Blendshape weights [frames, 52] in LAM_BLENDSHAPES order - 30fps */
     blendshapes: Float32Array[];
-    /**
-
-    /** Decoded text from CTC */
-    text: string;
-    /** Number of A2E frames (30fps) */
-    numA2EFrames: number;
-    /** Number of ASR frames (50fps) */
-    numASRFrames: number;
+    /** Number of blendshape frames */
+    numFrames: number;
     /** Inference time in ms */
     inferenceTimeMs: number;
 }
-
-
-
-
-
-
-
-
-
-
-
-
- */
-    static isWebGPUAvailable: typeof isWebGPUAvailable;
-    get backend(): 'webgpu' | 'wasm' | null;
-    get isLoaded(): boolean;
+/**
+ * Common interface for lip sync inference engines
+ *
+ * Implemented by:
+ * - Wav2Vec2Inference (WebGPU/WASM, 384MB, ASR + lip sync)
+ * - Wav2ArkitCpuInference (WASM-only, 1.8MB, lip sync only)
+ */
+interface LipSyncBackend {
+    /** Current backend type (webgpu, wasm, or null if not loaded) */
+    readonly backend: RuntimeBackend | null;
+    /** Whether the model is loaded and ready for inference */
+    readonly isLoaded: boolean;
     /**
      * Load the ONNX model
+     * @returns Model loading information
      */
-    load(): Promise<ModelInfo>;
+    load(): Promise<LipSyncModelInfo>;
     /**
      * Run inference on raw audio
-     * @param audioSamples - Float32Array of raw audio at 16kHz
-     * @param identityIndex - Optional identity index (0-11, default 0 = neutral)
-     *
-     * Note: Model expects 1-second chunks (16000 samples) for optimal performance.
-     * Audio will be zero-padded or truncated to 16000 samples.
+     * @param audioSamples - Float32Array of raw audio at 16kHz
+     * @param identityIndex - Optional identity index (ignored by CPU model)
+     * @returns Lip sync result with blendshapes in LAM_BLENDSHAPES order
      */
-    infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
-    /**
-     * Decode CTC logits to text using greedy decoding
-     */
-    private decodeCTC;
-    /**
-     * Queue inference to serialize ONNX session calls
-     */
-    private queueInference;
-    /**
-     * Get blendshape value by name for a specific frame
-     */
-    getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
+    infer(audioSamples: Float32Array, identityIndex?: number): Promise<LipSyncResult>;
     /**
      * Dispose of the model and free resources
      */
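
Because both engines now implement `LipSyncBackend`, calling code can be written once against the interface. A small sketch; the `driveLipSync` helper is hypothetical, and the members it uses are exactly those declared above:

```typescript
import type { LipSyncBackend, LipSyncResult } from '@omote/core';

// Hypothetical helper: works with Wav2Vec2Inference or Wav2ArkitCpuInference alike.
async function driveLipSync(
  backend: LipSyncBackend,
  audioSamples: Float32Array, // raw 16kHz mono samples
): Promise<LipSyncResult> {
  if (!backend.isLoaded) {
    const info = await backend.load();
    console.log(`lip sync ready on ${info.backend} in ${info.loadTimeMs}ms`);
  }
  // identityIndex is optional and ignored by the CPU model.
  return backend.infer(audioSamples);
}
```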
@@ -642,7 +537,7 @@ declare class LAMPipeline {
      * @param timestamp - AudioContext time when these samples start playing
      * @param lam - LAM inference engine
      */
-    push(samples: Float32Array, timestamp: number, lam:
+    push(samples: Float32Array, timestamp: number, lam: LipSyncBackend): Promise<void>;
     /**
      * Process accumulated buffer through LAM inference
      */
@@ -693,7 +588,7 @@ declare class LAMPipeline {
      *
      * @param lam - LAM inference engine
      */
-    flush(lam:
+    flush(lam: LipSyncBackend): Promise<void>;
     /**
      * Adjust all queued frame timestamps by an offset
      *
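
With `push()` and `flush()` retyped to accept any `LipSyncBackend`, a caller that feeds the pipeline by AudioContext time looks roughly like the sketch below. The chunking loop and the 16 kHz assumption are illustrative; only the two signatures come from this diff:

```typescript
import type { LAMPipeline, LipSyncBackend } from '@omote/core';

async function feedPipeline(
  pipeline: LAMPipeline,
  lam: LipSyncBackend,
  ctx: AudioContext,
  chunks: Float32Array[], // 16 kHz sample chunks in playback order
): Promise<void> {
  let when = ctx.currentTime;
  for (const samples of chunks) {
    // timestamp = AudioContext time at which these samples start playing
    await pipeline.push(samples, when, lam);
    when += samples.length / 16000;
  }
  await pipeline.flush(lam); // run inference on whatever remains buffered
}
```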
@@ -710,25 +605,25 @@ declare class LAMPipeline {
 }
 
 /**
- * SyncedAudioPipeline -
+ * SyncedAudioPipeline - Audio playback + LAM lip sync coordinator
  *
  * Orchestrates the complete pipeline for synchronized audio playback and lip sync:
  * 1. Network chunks → Coalescer → Optimized buffers
- * 2. Audio buffers → Scheduler → Gapless playback
- * 3. Audio buffers → LAM Pipeline → Blendshape frames
+ * 2. Audio buffers → Scheduler → Gapless playback (immediate, never blocks)
+ * 3. Audio buffers → LAM Pipeline → Blendshape frames (background, fire-and-forget)
  * 4. Frames synchronized to AudioContext clock → Renderer
  *
- * Key Architecture Pattern:
- * -
- * -
- * -
- * -
+ * Key Architecture Pattern: Audio-First, LAM-Background
+ * - Audio chunks are scheduled for playback immediately (never waits for LAM)
+ * - LAM inference runs in background without blocking the audio path
+ * - Lip sync starts ~1 second after audio (LAM needs 16000 samples to infer)
+ * - Once LAM catches up, frames stay synchronized to AudioContext clock
  *
- * This
- *
+ * This decoupled design prevents LAM inference (50-300ms) from blocking audio
+ * scheduling, which caused audible stuttering when audio arrived as a continuous
+ * stream (e.g., single-call TTS from ElevenLabs via AgentCore).
  *
  * @see https://web.dev/articles/audio-scheduling (Web Audio clock patterns)
- * @see https://developer.chrome.com/blog/audio-worklet-design-pattern (Ring buffer patterns)
  * @category Audio
  */
 
@@ -738,14 +633,14 @@ interface SyncedAudioPipelineOptions {
     /** Target chunk duration in ms for coalescing (default: 200) */
     chunkTargetMs?: number;
     /** LAM inference engine */
-    lam:
+    lam: LipSyncBackend;
 }
 interface SyncedAudioPipelineEvents {
     /** New frame ready for display */
     frame_ready: Float32Array;
     /** Playback has completed */
     playback_complete: void;
-    /** First
+    /** First audio chunk scheduled, playback starting */
     playback_start: number;
     /** Error occurred */
     error: Error;
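
The retyped `lam` option plus the event map make wiring the pipeline straightforward. A sketch under assumptions: the `on(...)` subscription style is inferred from the `EventEmitter` base class, and the renderer hook is hypothetical:

```typescript
import { SyncedAudioPipeline, createLipSync } from '@omote/core';

// Hypothetical renderer hook (not part of @omote/core).
declare function applyBlendshapes(weights: Float32Array): void;

const lam = createLipSync({
  gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
  cpuModelUrl: '/models/wav2arkit_cpu.onnx',
});
await lam.load();

const pipeline = new SyncedAudioPipeline({ lam, chunkTargetMs: 200 });

pipeline.on('frame_ready', (weights: Float32Array) => applyBlendshapes(weights));
pipeline.on('playback_start', (time: number) => console.log('audio starts at', time));
pipeline.on('playback_complete', () => console.log('playback finished'));
pipeline.on('error', (err: Error) => console.error(err));
```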
@@ -757,8 +652,7 @@ declare class SyncedAudioPipeline extends EventEmitter<SyncedAudioPipelineEvents
     private scheduler;
     private coalescer;
     private lamPipeline;
-    private
-    private bufferedChunks;
+    private playbackStarted;
     private monitorInterval;
     private frameAnimationId;
     constructor(options: SyncedAudioPipelineOptions);
@@ -770,31 +664,19 @@ declare class SyncedAudioPipeline extends EventEmitter<SyncedAudioPipelineEvents
      * Start a new playback session
      *
      * Resets all state and prepares for incoming audio chunks.
-     *
+     * Audio will be scheduled immediately as chunks arrive (no buffering).
      */
     start(): void;
     /**
      * Receive audio chunk from network
      *
-     *
-     *
-     *
-     * - Audio scheduling waits until first LAM completes
-     * - Then all buffered audio is scheduled together with LAM frames
+     * Audio-first design: schedules audio immediately, LAM runs in background.
+     * This prevents LAM inference (50-300ms) from blocking audio scheduling,
+     * which caused audible stuttering with continuous audio streams.
      *
      * @param chunk - Uint8Array containing Int16 PCM audio
      */
     onAudioChunk(chunk: Uint8Array): Promise<void>;
-    /**
-     * Handle first LAM inference completion
-     *
-     * This is the critical synchronization point:
-     * - LAM frames are now ready in the queue
-     * - Schedule all buffered audio chunks
-     * - Adjust LAM frame timestamps to match actual schedule time
-     * - Audio and LAM start playing together, perfectly synchronized
-     */
-    private onFirstLAMComplete;
     /**
      * End of audio stream
      *
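
A sketch of the audio-first session flow that the rewritten `onAudioChunk()` doc describes; the WebSocket transport is an assumption, while `start()` and `onAudioChunk()` are the declared methods:

```typescript
import type { SyncedAudioPipeline } from '@omote/core';

// Assumed transport: a WebSocket delivering Int16 PCM chunks from a TTS stream.
function attachAudioStream(pipeline: SyncedAudioPipeline, socket: WebSocket): void {
  socket.binaryType = 'arraybuffer';
  pipeline.start(); // reset state for a new playback session

  socket.addEventListener('message', (event: MessageEvent<ArrayBuffer>) => {
    // Audio is scheduled immediately; LAM inference catches up in the background.
    void pipeline.onAudioChunk(new Uint8Array(event.data));
  });
}
```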
@@ -840,8 +722,7 @@ declare class SyncedAudioPipeline extends EventEmitter<SyncedAudioPipelineEvents
      * Get current pipeline state (for debugging/monitoring)
      */
     getState(): {
-
-        bufferedChunks: number;
+        playbackStarted: boolean;
         coalescerFill: number;
         lamFill: number;
         queuedFrames: number;
@@ -854,6 +735,99 @@ declare class SyncedAudioPipeline extends EventEmitter<SyncedAudioPipelineEvents
     dispose(): void;
 }
 
+/**
+ * Lazy ONNX Runtime loader with conditional WebGPU/WASM bundle loading
+ *
+ * This module provides a way to dynamically load the appropriate ONNX Runtime bundle
+ * based on the platform's capabilities. This is critical for iOS support because:
+ *
+ * 1. iOS Safari has WebGPU API but ONNX Runtime's WebGPU backend crashes
+ * 2. Loading the WebGPU bundle wastes bandwidth and can cause issues
+ * 3. WASM-only bundle is smaller and more reliable on iOS
+ *
+ * Usage:
+ * ```typescript
+ * const ort = await getOnnxRuntime('wasm'); // Load WASM-only bundle
+ * const ort = await getOnnxRuntime('webgpu'); // Load WebGPU bundle (includes WASM)
+ * ```
+ *
+ * @module inference/onnxLoader
+ */
+
+type OrtModule = {
+    InferenceSession: typeof InferenceSession;
+    Tensor: typeof Tensor;
+    env: Env;
+};
+type SessionOptions = InferenceSession.SessionOptions;
+
+/**
+ * Check if WebGPU is available and likely to work
+ *
+ * This is more thorough than just checking navigator.gpu exists.
+ * It actually requests an adapter to verify the GPU is accessible.
+ *
+ * @returns true if WebGPU is available and working
+ */
+declare function isWebGPUAvailable(): Promise<boolean>;
+/**
+ * Load ONNX Runtime with the specified backend
+ *
+ * This lazily loads the appropriate bundle:
+ * - 'wasm': Loads onnxruntime-web (WASM-only, smaller)
+ * - 'webgpu': Loads onnxruntime-web/webgpu (includes WebGPU + WASM fallback)
+ *
+ * Once loaded, the same instance is reused for all subsequent calls.
+ * If you need to switch backends, you must reload the page.
+ *
+ * @param backend The backend to load ('webgpu' or 'wasm')
+ * @returns The ONNX Runtime module
+ */
+declare function getOnnxRuntime(backend: RuntimeBackend): Promise<OrtModule>;
+/**
+ * Get the appropriate ONNX Runtime based on user preference
+ *
+ * This resolves the user's preference against platform capabilities
+ * and loads the appropriate bundle.
+ *
+ * @param preference User's backend preference
+ * @returns The ONNX Runtime module and the resolved backend
+ */
+declare function getOnnxRuntimeForPreference(preference?: BackendPreference): Promise<{
+    ort: OrtModule;
+    backend: RuntimeBackend;
+}>;
+/**
+ * Get session options for creating an inference session
+ *
+ * This returns optimized session options based on the backend and platform.
+ *
+ * @param backend The backend being used
+ * @returns Session options for InferenceSession.create()
+ */
+declare function getSessionOptions(backend: RuntimeBackend): SessionOptions;
+/**
+ * Create an inference session with automatic fallback
+ *
+ * If WebGPU session creation fails, automatically falls back to WASM.
+ *
+ * @param modelBuffer The model data as ArrayBuffer
+ * @param preferredBackend The preferred backend
+ * @returns The created session and the backend used
+ */
+declare function createSessionWithFallback(modelBuffer: ArrayBuffer, preferredBackend: RuntimeBackend): Promise<{
+    session: InferenceSession;
+    backend: RuntimeBackend;
+}>;
+/**
+ * Get the currently loaded backend (if any)
+ */
+declare function getLoadedBackend(): RuntimeBackend | null;
+/**
+ * Check if ONNX Runtime has been loaded
+ */
+declare function isOnnxRuntimeLoaded(): boolean;
+
 /**
  * Whisper Automatic Speech Recognition using transformers.js
  * Uses Xenova's proven pipeline API for reliable transcription
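
The re-homed loader functions can also be used directly when creating a custom session. A sketch under the assumption that the model is served by your own app (the `/models/...` path and the 'auto' preference value are placeholders):

```typescript
import {
  getOnnxRuntimeForPreference,
  createSessionWithFallback,
  getLoadedBackend,
} from '@omote/core';

const modelBuffer = await (await fetch('/models/wav2arkit_cpu.onnx')).arrayBuffer();

// Resolve the user's preference against the platform, then create a session,
// falling back to WASM automatically if WebGPU session creation fails.
const { backend } = await getOnnxRuntimeForPreference('auto');
const { session, backend: usedBackend } = await createSessionWithFallback(modelBuffer, backend);
console.log(`requested ${backend}, using ${usedBackend}, loaded: ${getLoadedBackend()}`);
console.log('model inputs:', session.inputNames);
```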
@@ -982,6 +956,288 @@ declare class WhisperInference {
     private removeNonSpeechTokens;
 }
 
+/**
+ * Shared blendshape constants and utilities for lip sync inference
+ *
+ * Contains LAM_BLENDSHAPES (canonical ordering), symmetrization, and
+ * index remapping used by both Wav2Vec2Inference and Wav2ArkitCpuInference.
+ *
+ * This module is the single source of truth for blendshape ordering to
+ * avoid circular dependencies between inference classes.
+ *
+ * @category Inference
+ */
+/**
+ * LAM model blendshape names in order (52 total)
+ * NOTE: This is alphabetical ordering used by LAM, different from standard ARKit order
+ */
+declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
+/** Alias for backwards compatibility */
+declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
+/**
+ * Symmetrize blendshapes by averaging left/right pairs
+ * From LAM official postprocessing (models/utils.py)
+ * This fixes asymmetric output from the raw model
+ */
+declare function symmetrizeBlendshapes(frame: Float32Array): Float32Array;
+/**
+ * wav2arkit_cpu model blendshape ordering
+ *
+ * Indices 0-24 match LAM_BLENDSHAPES, but 25+ diverge:
+ * - LAM puts jawRight, mouthClose, mouthDimpleLeft, mouthDimpleRight at 25-28
+ * - wav2arkit_cpu puts mouthFrownLeft at 25 and moves those four to 48-51
+ */
+declare const WAV2ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "jawRight"];
+/**
+ * Remap a blendshape frame from wav2arkit_cpu ordering to LAM_BLENDSHAPES ordering
+ *
+ * @param frame - Float32Array of 52 blendshape values in wav2arkit_cpu order
+ * @returns Float32Array of 52 blendshape values in LAM_BLENDSHAPES order
+ */
+declare function remapWav2ArkitToLam(frame: Float32Array): Float32Array;
+
+/**
+ * Unified Wav2Vec2 inference engine for Audio-to-Expression + ASR
+ *
+ * Runs entirely in the browser using WebGPU or WASM.
+ * Takes raw 16kHz audio and outputs:
+ * - 52 ARKit blendshapes (lip sync)
+ * - 32-token CTC logits (speech recognition)
+ *
+ * @category Inference
+ *
+ * @example Basic usage
+ * ```typescript
+ * import { Wav2Vec2Inference } from '@omote/core';
+ *
+ * const wav2vec = new Wav2Vec2Inference({ modelUrl: '/models/unified_wav2vec2_asr_a2e.onnx' });
+ * await wav2vec.load();
+ *
+ * // Process 1 second of audio (16kHz = 16000 samples)
+ * const result = await wav2vec.infer(audioSamples);
+ *
+ * console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
+ * console.log('ASR text:', result.text); // Decoded transcription
+ * ```
+ */
+
+type InferenceBackend = BackendPreference;
+interface Wav2Vec2InferenceConfig {
+    /** Path or URL to the ONNX model */
+    modelUrl: string;
+    /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
+    backend?: InferenceBackend;
+    /** Number of identity classes (default: 12 for streaming model) */
+    numIdentityClasses?: number;
+}
+interface ModelInfo {
+    backend: 'webgpu' | 'wasm';
+    loadTimeMs: number;
+    inputNames: string[];
+    outputNames: string[];
+}
+
+/** CTC vocabulary (32 tokens from wav2vec2-base-960h) */
+declare const CTC_VOCAB: string[];
+interface Wav2Vec2Result {
+    /** Blendshape weights [frames, 52] - 30fps */
+    blendshapes: Float32Array[];
+    /** Raw CTC logits [frames, 32] - 50fps */
+    asrLogits: Float32Array[];
+    /** Decoded text from CTC */
+    text: string;
+    /** Number of blendshape frames (30fps) — alias for numA2EFrames */
+    numFrames: number;
+    /** Number of A2E frames (30fps) */
+    numA2EFrames: number;
+    /** Number of ASR frames (50fps) */
+    numASRFrames: number;
+    /** Inference time in ms */
+    inferenceTimeMs: number;
+}
+declare class Wav2Vec2Inference {
+    private session;
+    private ort;
+    private config;
+    private _backend;
+    private isLoading;
+    private numIdentityClasses;
+    private inferenceQueue;
+    constructor(config: Wav2Vec2InferenceConfig);
+    /**
+     * Check if WebGPU is available and working
+     * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
+     */
+    static isWebGPUAvailable: typeof isWebGPUAvailable;
+    get backend(): 'webgpu' | 'wasm' | null;
+    get isLoaded(): boolean;
+    /**
+     * Load the ONNX model
+     */
+    load(): Promise<ModelInfo>;
+    /**
+     * Run inference on raw audio
+     * @param audioSamples - Float32Array of raw audio at 16kHz (16000 samples = 1 second)
+     * @param identityIndex - Optional identity index (0-11, default 0 = neutral)
+     *
+     * Note: Model expects 1-second chunks (16000 samples) for optimal performance.
+     * Audio will be zero-padded or truncated to 16000 samples.
+     */
+    infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
+    /**
+     * Decode CTC logits to text using greedy decoding
+     */
+    private decodeCTC;
+    /**
+     * Queue inference to serialize ONNX session calls
+     */
+    private queueInference;
+    /**
+     * Get blendshape value by name for a specific frame
+     */
+    getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
+    /**
+     * Dispose of the model and free resources
+     */
+    dispose(): Promise<void>;
+}
+
+/**
+ * CPU-optimized lip sync inference using wav2arkit_cpu model
+ *
+ * A lightweight (1.8MB) alternative to Wav2Vec2Inference (384MB) designed
+ * for Safari/iOS where WebGPU crashes due to ONNX Runtime JSEP bugs.
+ *
+ * Key differences from Wav2Vec2Inference:
+ * - WASM-only backend (CPU-optimized, no WebGPU)
+ * - 1.8MB model vs 384MB
+ * - No identity input (baked to identity 11)
+ * - No ASR output (lip sync only)
+ * - Dynamic input length (not fixed to 16000 samples)
+ * - Different native blendshape ordering (remapped to LAM_BLENDSHAPES)
+ *
+ * @category Inference
+ *
+ * @example
+ * ```typescript
+ * import { Wav2ArkitCpuInference } from '@omote/core';
+ *
+ * const lam = new Wav2ArkitCpuInference({
+ *   modelUrl: '/models/wav2arkit_cpu.onnx',
+ * });
+ * await lam.load();
+ *
+ * const { blendshapes } = await lam.infer(audioSamples);
+ * // blendshapes: Float32Array[] in LAM_BLENDSHAPES order, 30fps
+ * ```
+ */
+
+interface Wav2ArkitCpuConfig {
+    /** Path or URL to the wav2arkit_cpu ONNX model */
+    modelUrl: string;
+    /** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
+    backend?: BackendPreference;
+}
+declare class Wav2ArkitCpuInference implements LipSyncBackend {
+    private session;
+    private ort;
+    private config;
+    private _backend;
+    private isLoading;
+    private inferenceQueue;
+    constructor(config: Wav2ArkitCpuConfig);
+    get backend(): RuntimeBackend | null;
+    get isLoaded(): boolean;
+    /**
+     * Load the ONNX model
+     */
+    load(): Promise<LipSyncModelInfo>;
+    /**
+     * Run inference on raw audio
+     *
+     * Accepts variable-length audio (not fixed to 16000 samples).
+     * Output frames = ceil(30 * numSamples / 16000).
+     *
+     * @param audioSamples - Float32Array of raw audio at 16kHz
+     * @param _identityIndex - Ignored (identity 11 is baked into the model)
+     */
+    infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
+    /**
+     * Queue inference to serialize ONNX session calls
+     */
+    private queueInference;
+    /**
+     * Dispose of the model and free resources
+     */
+    dispose(): Promise<void>;
+}
+
+/**
+ * Factory function for lip sync with automatic GPU/CPU model selection
+ *
+ * Provides a unified API that automatically selects the optimal model:
+ * - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (1.8MB, WASM)
+ * - Chrome/Firefox/Edge: Uses Wav2Vec2Inference (384MB, WebGPU)
+ * - Fallback: Gracefully falls back to CPU model if GPU model fails to load
+ *
+ * @category Inference
+ *
+ * @example Auto-detect (recommended)
+ * ```typescript
+ * import { createLipSync } from '@omote/core';
+ *
+ * const lam = createLipSync({
+ *   gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
+ *   cpuModelUrl: '/models/wav2arkit_cpu.onnx',
+ * });
+ *
+ * await lam.load();
+ * const { blendshapes } = await lam.infer(audioSamples);
+ * ```
+ *
+ * @example Force CPU model
+ * ```typescript
+ * const lam = createLipSync({
+ *   gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
+ *   cpuModelUrl: '/models/wav2arkit_cpu.onnx',
+ *   mode: 'cpu',
+ * });
+ * ```
+ */
+
+/**
+ * Configuration for the lip sync factory
+ */
+interface CreateLipSyncConfig {
+    /** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge) */
+    gpuModelUrl: string;
+    /** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS) */
+    cpuModelUrl: string;
+    /**
+     * Model selection mode:
+     * - 'auto': Safari/iOS → CPU, everything else → GPU (default)
+     * - 'gpu': Force GPU model (Wav2Vec2Inference)
+     * - 'cpu': Force CPU model (Wav2ArkitCpuInference)
+     */
+    mode?: 'auto' | 'gpu' | 'cpu';
+    /** Backend preference for GPU model (default: 'auto') */
+    gpuBackend?: BackendPreference;
+    /** Number of identity classes for GPU model (default: 12) */
+    numIdentityClasses?: number;
+    /**
+     * Fall back to CPU model if GPU model fails to load (default: true)
+     * Only applies when mode is 'auto' or 'gpu'
+     */
+    fallbackOnError?: boolean;
+}
+/**
+ * Create a lip sync instance with automatic GPU/CPU model selection
+ *
+ * @param config - Factory configuration
+ * @returns A LipSyncBackend instance (either GPU or CPU model)
+ */
+declare function createLipSync(config: CreateLipSyncConfig): LipSyncBackend;
+
 /**
  * Silero VAD (Voice Activity Detection) inference
  *
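
The divergence between the two orderings documented above is mechanical, so the remap can be derived from the exported name arrays. An illustrative sketch of what `remapWav2ArkitToLam` amounts to (the package's own function remains the one to use):

```typescript
import { LAM_BLENDSHAPES, WAV2ARKIT_BLENDSHAPES } from '@omote/core';

// For each slot in the canonical LAM ordering, find where that blendshape
// sits in the CPU model's native output ordering.
const wav2arkitNames: readonly string[] = WAV2ARKIT_BLENDSHAPES;
const srcIndexForLamSlot = (LAM_BLENDSHAPES as readonly string[]).map((name) =>
  wav2arkitNames.indexOf(name),
);

function remapFrame(frame: Float32Array): Float32Array {
  const out = new Float32Array(LAM_BLENDSHAPES.length);
  srcIndexForLamSlot.forEach((srcIdx, dstIdx) => {
    out[dstIdx] = srcIdx >= 0 ? frame[srcIdx] : 0;
  });
  return out;
}
```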
@@ -3809,4 +4065,4 @@ declare class EmphasisDetector {
     reset(): void;
 }
 
-export { type AIAdapter, type AIAdapterEvents, type AISessionState, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, DEFAULT_ANIMATION_CONFIG, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, HF_CDN_TEST_URL, type HuggingFaceUrlInfo, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, type QuotaInfo, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SessionConfig, type SessionOptions, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type TranscriptionResult, type Transition, type VADBackend, type VADModelInfo, type VADResult$1 as VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, type WhisperConfig, WhisperInference, type WhisperModel, blendEmotions, calculatePeak, calculateRMS, clearSpecificCache, clearTransformersCache, configureCacheLimit, configureTelemetry, createEmotionVector, createSessionWithFallback, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isHuggingFaceCDNReachable, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpEmotion, listCaches, nukeBrowserCaches, parseHuggingFaceUrl, preloadModels, resolveBackend, scanForInvalidCaches, shouldEnableWasmProxy, shouldUseNativeASR, shouldUseServerLipSync, supportsVADWorker, validateCachedResponse };
+export { type AIAdapter, type AIAdapterEvents, type AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, type CreateLipSyncConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, HF_CDN_TEST_URL, type HuggingFaceUrlInfo, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, type LipSyncBackend, type LipSyncModelInfo, type LipSyncResult, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, type QuotaInfo, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SessionConfig, type SessionOptions, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type TranscriptionResult, type Transition, type VADBackend, type VADModelInfo, type VADResult$1 as VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, WAV2ARKIT_BLENDSHAPES, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, type WhisperConfig, WhisperInference, type WhisperModel, blendEmotions, calculatePeak, calculateRMS, clearSpecificCache, clearTransformersCache, configureCacheLimit, configureTelemetry, createEmotionVector, createLipSync, createSessionWithFallback, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isHuggingFaceCDNReachable, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpEmotion, listCaches, nukeBrowserCaches, parseHuggingFaceUrl, preloadModels, remapWav2ArkitToLam, resolveBackend, scanForInvalidCaches, shouldEnableWasmProxy, shouldUseCpuLipSync, shouldUseNativeASR, shouldUseServerLipSync, supportsVADWorker, symmetrizeBlendshapes, validateCachedResponse };