@omote/core 0.4.7 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +470 -861
- package/dist/index.d.ts +470 -861
- package/dist/index.js +1383 -1565
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +949 -1131
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.d.mts
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import { EventEmitter, OmoteEvents, AISessionState, AnimationEvent } from './events/index.mjs';
|
|
2
2
|
export { BackendEvent, EmotionEvent, GazeEvent, STTFinalEvent, STTPartialEvent, SessionStateEvent, TTSEndEvent, TTSMarkEvent, TTSStartEvent, VisemeEvent } from './events/index.mjs';
|
|
3
|
-
import { InferenceSession, Tensor, Env } from 'onnxruntime-common';
|
|
4
3
|
export { D as DEFAULT_LOGGING_CONFIG, I as ILogger, e as LOG_LEVEL_PRIORITY, b as LogEntry, L as LogFormatter, a as LogLevel, c as LogSink, d as LoggingConfig, f as configureLogging, i as createLogger, g as getLoggingConfig, n as noopLogger, r as resetLoggingConfig, s as setLogLevel, h as setLoggingEnabled } from './Logger-I_k4sGhM.mjs';
|
|
5
4
|
export { ARKitToFLAMEMapping, ApiError, AudioChunkEvent, AvatarFormat, Character, CharacterAvatar, CharacterMemory, CharacterPersonality, CharacterSpec, CharacterVoice, CreateCharacterRequest, CreateCharacterResponse, CreateLAMJobRequest, CreateLAMJobResponse, CreateSessionRequest, CreateSessionResponse, GSplatConfig, LAMJob, LAMJobStatus, PROTOCOL_VERSION, PaginatedResponse, PlatformSession, ErrorEvent as ProtocolErrorEvent, ProtocolEvent, ResponseChunkEvent, ResponseEndEvent, ResponseStartEvent, SessionMessage, SessionStatus, isProtocolEvent } from '@omote/types';
|
|
6
5
|
|
|
@@ -379,7 +378,7 @@ declare function shouldEnableWasmProxy(): boolean;
|
|
|
379
378
|
*/
|
|
380
379
|
declare function isSafari(): boolean;
|
|
381
380
|
/**
|
|
382
|
-
* Recommend using CPU-optimized
|
|
381
|
+
* Recommend using CPU-optimized A2E model (wav2arkit_cpu)
|
|
383
382
|
*
|
|
384
383
|
* All iOS browsers use WebKit and have tight memory limits — the 384MB
|
|
385
384
|
* LAM model causes silent crashes. wav2arkit_cpu uses URL pass-through
|
|
@@ -390,7 +389,7 @@ declare function isSafari(): boolean;
|
|
|
390
389
|
*
|
|
391
390
|
* @returns true if iOS (any browser) or Safari (any platform)
|
|
392
391
|
*/
|
|
393
|
-
declare function
|
|
392
|
+
declare function shouldUseCpuA2E(): boolean;
|
|
394
393
|
/**
|
|
395
394
|
* Check if Web Speech API is available in the browser
|
|
396
395
|
*
|
|
@@ -415,18 +414,18 @@ declare function shouldUseNativeASR(): boolean;
|
|
|
415
414
|
/**
|
|
416
415
|
* Recommend using server-side LAM over client-side on iOS
|
|
417
416
|
*
|
|
418
|
-
* On iOS, LAM
|
|
417
|
+
* On iOS, LAM A2E via WASM takes ~332ms per second of audio (3.3x over target).
|
|
419
418
|
* Server-side inference with GPU can achieve ~50ms, providing:
|
|
420
|
-
* - Real-time
|
|
419
|
+
* - Real-time A2E (under 100ms target)
|
|
421
420
|
* - Reduced iOS device thermal/battery impact
|
|
422
421
|
* - Better user experience
|
|
423
422
|
*
|
|
424
|
-
* @returns true if on iOS (should use server-side
|
|
423
|
+
* @returns true if on iOS (should use server-side A2E)
|
|
425
424
|
*/
|
|
426
|
-
declare function shouldUseServerLipSync(): boolean;
|
|
425
|
+
declare function shouldUseServerA2E(): boolean;
|
|
427
426
|
|
|
428
427
|
/**
|
|
429
|
-
* Common interface for
|
|
428
|
+
* Common interface for audio-to-expression (A2E) inference backends
|
|
430
429
|
*
|
|
431
430
|
* Both Wav2Vec2Inference (GPU, 384MB) and Wav2ArkitCpuInference (CPU, 404MB)
|
|
432
431
|
* implement this interface, allowing SyncedAudioPipeline and LAMPipeline to
|
|
@@ -438,19 +437,19 @@ declare function shouldUseServerLipSync(): boolean;
|
|
|
438
437
|
/**
|
|
439
438
|
* Model loading information returned by load()
|
|
440
439
|
*/
|
|
441
|
-
interface
|
|
440
|
+
interface A2EModelInfo {
|
|
442
441
|
backend: RuntimeBackend;
|
|
443
442
|
loadTimeMs: number;
|
|
444
443
|
inputNames: string[];
|
|
445
444
|
outputNames: string[];
|
|
446
445
|
}
|
|
447
446
|
/**
|
|
448
|
-
* Result from
|
|
447
|
+
* Result from A2E inference
|
|
449
448
|
*
|
|
450
449
|
* All implementations must return blendshapes in LAM_BLENDSHAPES order (alphabetical).
|
|
451
450
|
* Models with different native orderings must remap internally before returning.
|
|
452
451
|
*/
|
|
453
|
-
interface LipSyncResult {
|
|
452
|
+
interface A2EResult {
|
|
454
453
|
/** Blendshape weights [frames, 52] in LAM_BLENDSHAPES order - 30fps */
|
|
455
454
|
blendshapes: Float32Array[];
|
|
456
455
|
/** Number of blendshape frames */
|
|
@@ -459,31 +458,33 @@ interface LipSyncResult {
|
|
|
459
458
|
inferenceTimeMs: number;
|
|
460
459
|
}
|
|
461
460
|
/**
|
|
462
|
-
* Common interface for
|
|
461
|
+
* Common interface for A2E (audio-to-expression) inference engines
|
|
463
462
|
*
|
|
464
463
|
* Implemented by:
|
|
465
|
-
* - Wav2Vec2Inference (WebGPU/WASM, 384MB, ASR +
|
|
466
|
-
* - Wav2ArkitCpuInference (WASM-only, 404MB,
|
|
464
|
+
* - Wav2Vec2Inference (WebGPU/WASM, 384MB, ASR + A2E)
|
|
465
|
+
* - Wav2ArkitCpuInference (WASM-only, 404MB, A2E only)
|
|
467
466
|
*/
|
|
468
|
-
interface LipSyncBackend {
|
|
467
|
+
interface A2EBackend {
|
|
469
468
|
/** Model identifier for backend-specific tuning (e.g. audio delay) */
|
|
470
469
|
readonly modelId: 'wav2vec2' | 'wav2arkit_cpu';
|
|
471
470
|
/** Current backend type (webgpu, wasm, or null if not loaded) */
|
|
472
471
|
readonly backend: RuntimeBackend | null;
|
|
473
472
|
/** Whether the model is loaded and ready for inference */
|
|
474
473
|
readonly isLoaded: boolean;
|
|
474
|
+
/** Optimal number of audio samples per inference call (e.g. 16000 = 1s at 16kHz) */
|
|
475
|
+
readonly chunkSize: number;
|
|
475
476
|
/**
|
|
476
477
|
* Load the ONNX model
|
|
477
478
|
* @returns Model loading information
|
|
478
479
|
*/
|
|
479
|
-
load(): Promise<
|
|
480
|
+
load(): Promise<A2EModelInfo>;
|
|
480
481
|
/**
|
|
481
482
|
* Run inference on raw audio
|
|
482
483
|
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
483
484
|
* @param identityIndex - Optional identity index (ignored by CPU model)
|
|
484
|
-
* @returns
|
|
485
|
+
* @returns A2E result with blendshapes in LAM_BLENDSHAPES order
|
|
485
486
|
*/
|
|
486
|
-
infer(audioSamples: Float32Array, identityIndex?: number): Promise<LipSyncResult>;
|
|
487
|
+
infer(audioSamples: Float32Array, identityIndex?: number): Promise<A2EResult>;
|
|
487
488
|
/**
|
|
488
489
|
* Dispose of the model and free resources
|
|
489
490
|
*/
|
|
@@ -491,542 +492,16 @@ interface LipSyncBackend {
|
|
|
491
492
|
}
|
|
492
493
|
|
|
493
494
|
/**
|
|
494
|
-
*
|
|
495
|
+
* FullFacePipeline - A2E expression pipeline with ExpressionProfile weight scaling
|
|
495
496
|
*
|
|
496
|
-
*
|
|
497
|
-
* 1.
|
|
498
|
-
* 2.
|
|
499
|
-
* 3.
|
|
500
|
-
* 4. Provides frames synchronized to AudioContext clock
|
|
497
|
+
* Orchestrates full-face animation by:
|
|
498
|
+
* 1. Scheduling audio for playback immediately (audio-first, never waits for A2E)
|
|
499
|
+
* 2. Running A2E inference in background (fire-and-forget via A2EProcessor)
|
|
500
|
+
* 3. Applying per-character ExpressionProfile scaling to raw A2E output
|
|
501
501
|
*
|
|
502
|
-
*
|
|
503
|
-
*
|
|
504
|
-
*
|
|
505
|
-
* - Timestamp-based frame retrieval (not callback) for renderer flexibility
|
|
506
|
-
*
|
|
507
|
-
* Based on patterns from Chrome Audio Worklet design and Web Audio clock management.
|
|
508
|
-
*
|
|
509
|
-
* @see https://developer.chrome.com/blog/audio-worklet-design-pattern
|
|
510
|
-
* @category Audio
|
|
511
|
-
*/
|
|
512
|
-
|
|
513
|
-
interface LAMFrame {
|
|
514
|
-
/** 52 ARKit blendshape weights */
|
|
515
|
-
frame: Float32Array;
|
|
516
|
-
/** AudioContext time when this frame should be displayed */
|
|
517
|
-
timestamp: number;
|
|
518
|
-
}
|
|
519
|
-
interface LAMPipelineOptions {
|
|
520
|
-
/**
|
|
521
|
-
* Sample rate in Hz (must match audio playback)
|
|
522
|
-
* Default: 16000
|
|
523
|
-
*/
|
|
524
|
-
sampleRate?: number;
|
|
525
|
-
/**
|
|
526
|
-
* LAM inference callback
|
|
527
|
-
* Called each time LAM processes a buffer
|
|
528
|
-
*/
|
|
529
|
-
onInference?: (frameCount: number) => void;
|
|
530
|
-
/**
|
|
531
|
-
* Error callback for inference failures
|
|
532
|
-
*/
|
|
533
|
-
onError?: (error: Error) => void;
|
|
534
|
-
}
|
|
535
|
-
declare class LAMPipeline {
|
|
536
|
-
private readonly options;
|
|
537
|
-
private readonly REQUIRED_SAMPLES;
|
|
538
|
-
private readonly FRAME_RATE;
|
|
539
|
-
private buffer;
|
|
540
|
-
private bufferStartTime;
|
|
541
|
-
private frameQueue;
|
|
542
|
-
/**
|
|
543
|
-
* Last successfully retrieved frame
|
|
544
|
-
* Used as fallback when no new frame is available to prevent avatar freezing
|
|
545
|
-
*/
|
|
546
|
-
private lastFrame;
|
|
547
|
-
constructor(options?: LAMPipelineOptions);
|
|
548
|
-
/**
|
|
549
|
-
* Push audio samples into the pipeline
|
|
550
|
-
*
|
|
551
|
-
* Accumulates samples and triggers LAM inference when buffer is full.
|
|
552
|
-
* Multiple calls may be needed to accumulate enough samples.
|
|
553
|
-
*
|
|
554
|
-
* @param samples - Float32Array of audio samples
|
|
555
|
-
* @param timestamp - AudioContext time when these samples start playing
|
|
556
|
-
* @param lam - LAM inference engine
|
|
557
|
-
*/
|
|
558
|
-
push(samples: Float32Array, timestamp: number, lam: LipSyncBackend): Promise<void>;
|
|
559
|
-
/**
|
|
560
|
-
* Process accumulated buffer through LAM inference
|
|
561
|
-
*/
|
|
562
|
-
private processBuffer;
|
|
563
|
-
/**
|
|
564
|
-
* Get the frame that should be displayed at the current time
|
|
565
|
-
*
|
|
566
|
-
* Automatically removes frames that have already been displayed.
|
|
567
|
-
* This prevents memory leaks from accumulating old frames.
|
|
568
|
-
*
|
|
569
|
-
* Discard Window (prevents premature frame discarding):
|
|
570
|
-
* - WebGPU: 0.5s (LAM inference 20-100ms + RAF jitter + React stalls)
|
|
571
|
-
* - WASM: 1.0s (LAM inference 50-500ms + higher variability)
|
|
572
|
-
*
|
|
573
|
-
* Last-Frame-Hold: Returns last valid frame instead of null to prevent
|
|
574
|
-
* avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
|
|
575
|
-
*
|
|
576
|
-
* @param currentTime - Current AudioContext time
|
|
577
|
-
* @param lam - LAM inference engine (optional, for backend detection)
|
|
578
|
-
* @returns Current frame, or last frame as fallback, or null if no frames yet
|
|
579
|
-
*/
|
|
580
|
-
getFrameForTime(currentTime: number, lam?: {
|
|
581
|
-
backend: 'webgpu' | 'wasm' | null;
|
|
582
|
-
}): Float32Array | null;
|
|
583
|
-
/**
|
|
584
|
-
* Get all frames in the queue (for debugging/monitoring)
|
|
585
|
-
*/
|
|
586
|
-
getQueuedFrames(): LAMFrame[];
|
|
587
|
-
/**
|
|
588
|
-
* Get current buffer fill level (0-1)
|
|
589
|
-
*/
|
|
590
|
-
get fillLevel(): number;
|
|
591
|
-
/**
|
|
592
|
-
* Get number of frames queued
|
|
593
|
-
*/
|
|
594
|
-
get queuedFrameCount(): number;
|
|
595
|
-
/**
|
|
596
|
-
* Get buffered audio duration in seconds
|
|
597
|
-
*/
|
|
598
|
-
get bufferedDuration(): number;
|
|
599
|
-
/**
|
|
600
|
-
* Flush remaining buffered audio
|
|
601
|
-
*
|
|
602
|
-
* Processes any remaining audio in the buffer, even if less than REQUIRED_SAMPLES.
|
|
603
|
-
* This ensures the final audio chunk generates blendshape frames.
|
|
604
|
-
*
|
|
605
|
-
* Should be called when audio stream ends to prevent losing the last 0-1 seconds.
|
|
606
|
-
*
|
|
607
|
-
* @param lam - LAM inference engine
|
|
608
|
-
*/
|
|
609
|
-
flush(lam: LipSyncBackend): Promise<void>;
|
|
610
|
-
/**
|
|
611
|
-
* Adjust all queued frame timestamps by an offset
|
|
612
|
-
*
|
|
613
|
-
* Used for synchronization when audio scheduling time differs from
|
|
614
|
-
* the estimated time used during LAM processing.
|
|
615
|
-
*
|
|
616
|
-
* @param offset - Time offset in seconds to add to all timestamps
|
|
617
|
-
*/
|
|
618
|
-
adjustTimestamps(offset: number): void;
|
|
619
|
-
/**
|
|
620
|
-
* Reset the pipeline
|
|
621
|
-
*/
|
|
622
|
-
reset(): void;
|
|
623
|
-
}
|
|
624
|
-
|
|
625
|
-
/**
|
|
626
|
-
* SyncedAudioPipeline - Audio playback + LAM lip sync coordinator
|
|
627
|
-
*
|
|
628
|
-
* Orchestrates the complete pipeline for synchronized audio playback and lip sync:
|
|
629
|
-
* 1. Network chunks → Coalescer → Optimized buffers
|
|
630
|
-
* 2. Audio buffers → Scheduler → Gapless playback (immediate, never blocks)
|
|
631
|
-
* 3. Audio buffers → LAM Pipeline → Blendshape frames (background, fire-and-forget)
|
|
632
|
-
* 4. Frames synchronized to AudioContext clock → Renderer
|
|
633
|
-
*
|
|
634
|
-
* Key Architecture Pattern: Audio-First, LAM-Background
|
|
635
|
-
* - Audio chunks are scheduled for playback immediately (never waits for LAM)
|
|
636
|
-
* - LAM inference runs in background without blocking the audio path
|
|
637
|
-
* - Lip sync starts ~1 second after audio (LAM needs 16000 samples to infer)
|
|
638
|
-
* - Once LAM catches up, frames stay synchronized to AudioContext clock
|
|
639
|
-
*
|
|
640
|
-
* This decoupled design prevents LAM inference (50-300ms) from blocking audio
|
|
641
|
-
* scheduling, which caused audible stuttering when audio arrived as a continuous
|
|
642
|
-
* stream (e.g., single-call TTS from ElevenLabs via AgentCore).
|
|
643
|
-
*
|
|
644
|
-
* @see https://web.dev/articles/audio-scheduling (Web Audio clock patterns)
|
|
645
|
-
* @category Audio
|
|
646
|
-
*/
|
|
647
|
-
|
|
648
|
-
interface SyncedAudioPipelineOptions {
|
|
649
|
-
/** Sample rate in Hz (default: 16000) */
|
|
650
|
-
sampleRate?: number;
|
|
651
|
-
/** Target chunk duration in ms for coalescing (default: 200) */
|
|
652
|
-
chunkTargetMs?: number;
|
|
653
|
-
/** LAM inference engine */
|
|
654
|
-
lam: LipSyncBackend;
|
|
655
|
-
/**
|
|
656
|
-
* Audio playback delay in ms before first audio plays.
|
|
657
|
-
* Gives LAM inference time to pre-compute blendshapes.
|
|
658
|
-
* Default: auto-detected from lam.backend (50ms WebGPU, 350ms WASM).
|
|
659
|
-
*/
|
|
660
|
-
audioDelayMs?: number;
|
|
661
|
-
}
|
|
662
|
-
interface SyncedAudioPipelineEvents {
|
|
663
|
-
/** New frame ready for display */
|
|
664
|
-
frame_ready: Float32Array;
|
|
665
|
-
/** Playback has completed */
|
|
666
|
-
playback_complete: void;
|
|
667
|
-
/** First audio chunk scheduled, playback starting */
|
|
668
|
-
playback_start: number;
|
|
669
|
-
/** Error occurred */
|
|
670
|
-
error: Error;
|
|
671
|
-
/** Index signature for EventEmitter compatibility */
|
|
672
|
-
[key: string]: unknown;
|
|
673
|
-
}
|
|
674
|
-
declare class SyncedAudioPipeline extends EventEmitter<SyncedAudioPipelineEvents> {
|
|
675
|
-
private readonly options;
|
|
676
|
-
private scheduler;
|
|
677
|
-
private coalescer;
|
|
678
|
-
private lamPipeline;
|
|
679
|
-
private playbackStarted;
|
|
680
|
-
private monitorInterval;
|
|
681
|
-
private frameAnimationId;
|
|
682
|
-
constructor(options: SyncedAudioPipelineOptions);
|
|
683
|
-
/**
|
|
684
|
-
* Initialize the pipeline
|
|
685
|
-
*/
|
|
686
|
-
initialize(): Promise<void>;
|
|
687
|
-
/**
|
|
688
|
-
* Start a new playback session
|
|
689
|
-
*
|
|
690
|
-
* Resets all state and prepares for incoming audio chunks.
|
|
691
|
-
* Audio will be scheduled immediately as chunks arrive (no buffering).
|
|
692
|
-
*/
|
|
693
|
-
start(): void;
|
|
694
|
-
/**
|
|
695
|
-
* Receive audio chunk from network
|
|
696
|
-
*
|
|
697
|
-
* Audio-first design: schedules audio immediately, LAM runs in background.
|
|
698
|
-
* This prevents LAM inference (50-300ms) from blocking audio scheduling,
|
|
699
|
-
* which caused audible stuttering with continuous audio streams.
|
|
700
|
-
*
|
|
701
|
-
* @param chunk - Uint8Array containing Int16 PCM audio
|
|
702
|
-
*/
|
|
703
|
-
onAudioChunk(chunk: Uint8Array): Promise<void>;
|
|
704
|
-
/**
|
|
705
|
-
* End of audio stream
|
|
706
|
-
*
|
|
707
|
-
* Flushes any remaining buffered data.
|
|
708
|
-
*/
|
|
709
|
-
end(): Promise<void>;
|
|
710
|
-
/**
|
|
711
|
-
* Stop playback immediately with smooth fade-out
|
|
712
|
-
*
|
|
713
|
-
* Gracefully cancels all audio playback and LAM processing:
|
|
714
|
-
* - Fades out audio over specified duration (default: 50ms)
|
|
715
|
-
* - Cancels pending LAM inferences
|
|
716
|
-
* - Clears all buffers and queues
|
|
717
|
-
* - Emits 'playback_complete' event
|
|
718
|
-
*
|
|
719
|
-
* Use this for interruptions (e.g., user barge-in during AI speech).
|
|
720
|
-
*
|
|
721
|
-
* @param fadeOutMs - Fade-out duration in milliseconds (default: 50ms)
|
|
722
|
-
* @returns Promise that resolves when fade-out completes
|
|
723
|
-
*/
|
|
724
|
-
stop(fadeOutMs?: number): Promise<void>;
|
|
725
|
-
/**
|
|
726
|
-
* Start frame animation loop
|
|
727
|
-
*
|
|
728
|
-
* Uses requestAnimationFrame to check for new LAM frames.
|
|
729
|
-
* Synchronized to AudioContext clock (not visual refresh rate).
|
|
730
|
-
*
|
|
731
|
-
* Frame Emission Strategy:
|
|
732
|
-
* - LAMPipeline uses last-frame-hold to prevent null returns
|
|
733
|
-
* - Always emit frames (even repeated frames) to maintain smooth animation
|
|
734
|
-
* - Renderer is responsible for detecting duplicate frames if needed
|
|
735
|
-
*/
|
|
736
|
-
private startFrameLoop;
|
|
737
|
-
/**
|
|
738
|
-
* Start monitoring for playback completion
|
|
739
|
-
*/
|
|
740
|
-
private startMonitoring;
|
|
741
|
-
/**
|
|
742
|
-
* Stop monitoring
|
|
743
|
-
*/
|
|
744
|
-
private stopMonitoring;
|
|
745
|
-
/**
|
|
746
|
-
* Get current pipeline state (for debugging/monitoring)
|
|
747
|
-
*/
|
|
748
|
-
getState(): {
|
|
749
|
-
playbackStarted: boolean;
|
|
750
|
-
coalescerFill: number;
|
|
751
|
-
lamFill: number;
|
|
752
|
-
queuedFrames: number;
|
|
753
|
-
currentTime: number;
|
|
754
|
-
playbackEndTime: number;
|
|
755
|
-
};
|
|
756
|
-
/**
|
|
757
|
-
* Cleanup resources
|
|
758
|
-
*/
|
|
759
|
-
dispose(): void;
|
|
760
|
-
}
|
|
761
|
-
|
|
762
|
-
/**
|
|
763
|
-
* Emotion to ARKit Blendshape Mapper
|
|
764
|
-
*
|
|
765
|
-
* Converts Emotion2VecInference output to upper face ARKit blendshapes for
|
|
766
|
-
* expressive avatar animation. Maps 4 emotion categories (neutral, happy, angry, sad)
|
|
767
|
-
* to 11 upper face blendshapes (brows, eyes, cheeks).
|
|
768
|
-
*
|
|
769
|
-
* Supports two blend modes:
|
|
770
|
-
* - 'dominant': Uses only the strongest emotion (simpler, more stable)
|
|
771
|
-
* - 'weighted': Blends all emotions by probability (more nuanced, e.g., bittersweet)
|
|
772
|
-
*
|
|
773
|
-
* Also supports energy modulation to scale emotion intensity by audio energy,
|
|
774
|
-
* making expressions stronger during emphasized speech.
|
|
775
|
-
*
|
|
776
|
-
* @example Basic usage
|
|
777
|
-
* ```typescript
|
|
778
|
-
* import { EmotionToBlendshapeMapper } from '@omote/core';
|
|
779
|
-
* import { Emotion2VecInference } from '@omote/core';
|
|
780
|
-
*
|
|
781
|
-
* const emotion = new Emotion2VecInference({ modelUrl: '/models/emotion.onnx' });
|
|
782
|
-
* const mapper = new EmotionToBlendshapeMapper();
|
|
783
|
-
*
|
|
784
|
-
* // Process emotion frame
|
|
785
|
-
* const result = await emotion.infer(audioSamples);
|
|
786
|
-
* const blendshapes = mapper.mapFrame(result.dominant);
|
|
787
|
-
*
|
|
788
|
-
* // Apply to avatar
|
|
789
|
-
* for (const [name, value] of Object.entries(blendshapes)) {
|
|
790
|
-
* avatar.setBlendshape(name, value);
|
|
791
|
-
* }
|
|
792
|
-
* ```
|
|
793
|
-
*
|
|
794
|
-
* @example Weighted blending for nuanced expressions
|
|
795
|
-
* ```typescript
|
|
796
|
-
* const mapper = new EmotionToBlendshapeMapper({
|
|
797
|
-
* blendMode: 'weighted',
|
|
798
|
-
* minBlendProbability: 0.1,
|
|
799
|
-
* });
|
|
800
|
-
*
|
|
801
|
-
* // Frame with mixed emotions: { happy: 0.6, sad: 0.3, neutral: 0.1 }
|
|
802
|
-
* // Result: bittersweet expression (smiling but worried brow)
|
|
803
|
-
* const blendshapes = mapper.mapFrame(emotionFrame);
|
|
804
|
-
* ```
|
|
805
|
-
*
|
|
806
|
-
* @example Energy-modulated emotion
|
|
807
|
-
* ```typescript
|
|
808
|
-
* import { AudioEnergyAnalyzer } from '@omote/core';
|
|
809
|
-
*
|
|
810
|
-
* const energyAnalyzer = new AudioEnergyAnalyzer();
|
|
811
|
-
* const mapper = new EmotionToBlendshapeMapper({ energyModulation: true });
|
|
812
|
-
*
|
|
813
|
-
* // In animation loop
|
|
814
|
-
* function animate(audioChunk: Float32Array, emotionFrame: EmotionFrame) {
|
|
815
|
-
* const { energy } = energyAnalyzer.analyze(audioChunk);
|
|
816
|
-
* mapper.mapFrame(emotionFrame, energy); // Louder = stronger emotion
|
|
817
|
-
* mapper.update(16);
|
|
818
|
-
* applyToAvatar(mapper.getCurrentBlendshapes());
|
|
819
|
-
* }
|
|
820
|
-
* ```
|
|
821
|
-
*
|
|
822
|
-
* @module animation
|
|
823
|
-
*/
|
|
824
|
-
declare const EMOTION2VEC_LABELS: readonly ["neutral", "happy", "angry", "sad"];
|
|
825
|
-
type Emotion2VecLabel = (typeof EMOTION2VEC_LABELS)[number];
|
|
826
|
-
interface EmotionFrame {
|
|
827
|
-
/** Primary emotion label */
|
|
828
|
-
emotion: Emotion2VecLabel;
|
|
829
|
-
/** Confidence for primary emotion (0-1) */
|
|
830
|
-
confidence: number;
|
|
831
|
-
/** All emotion probabilities */
|
|
832
|
-
probabilities: Record<Emotion2VecLabel, number>;
|
|
833
|
-
}
|
|
834
|
-
/**
|
|
835
|
-
* Upper face ARKit blendshape names (11 total)
|
|
836
|
-
*
|
|
837
|
-
* These blendshapes control the upper face (brows, eyes, cheeks) and are
|
|
838
|
-
* driven by emotion detection, complementing the mouth blendshapes from
|
|
839
|
-
* LAM lip sync.
|
|
840
|
-
*/
|
|
841
|
-
declare const UPPER_FACE_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "cheekSquintLeft", "cheekSquintRight"];
|
|
842
|
-
type UpperFaceBlendshapeName = (typeof UPPER_FACE_BLENDSHAPES)[number];
|
|
843
|
-
/**
|
|
844
|
-
* Upper face blendshape values (0-1 for each)
|
|
845
|
-
*/
|
|
846
|
-
type UpperFaceBlendshapes = Record<UpperFaceBlendshapeName, number>;
|
|
847
|
-
/**
|
|
848
|
-
* Blend mode for combining emotions
|
|
849
|
-
* - 'dominant': Use only the strongest emotion (default, more stable)
|
|
850
|
-
* - 'weighted': Blend all emotions by probability (more nuanced)
|
|
851
|
-
*/
|
|
852
|
-
type EmotionBlendMode = 'dominant' | 'weighted';
|
|
853
|
-
/**
|
|
854
|
-
* Emotion to ARKit blendshape mapping
|
|
855
|
-
*
|
|
856
|
-
* Based on Paul Ekman's FACS (Facial Action Coding System) research:
|
|
857
|
-
*
|
|
858
|
-
* - Happy (AU6+AU12): Cheek raise + lip corner pull (Duchenne smile)
|
|
859
|
-
* Upper face: cheekSquint (AU6) + slight eyeSquint from genuine smile
|
|
860
|
-
*
|
|
861
|
-
* - Angry (AU4+AU5+AU7+AU23): Brow lower + eye wide + lid tighten + lip press
|
|
862
|
-
* Upper face: browDown (AU4) + eyeWide (AU5) + eyeSquint (AU7) creates the "glare"
|
|
863
|
-
*
|
|
864
|
-
* - Sad (AU1+AU4+AU15): Inner brow raise + brow furrow + lip corner depress
|
|
865
|
-
* Upper face: browInnerUp (AU1) + browDown (AU4) creates the worried/sad brow
|
|
866
|
-
*
|
|
867
|
-
* - Neutral: All zeros (no expression overlay)
|
|
868
|
-
*
|
|
869
|
-
* @see https://imotions.com/blog/learning/research-fundamentals/facial-action-coding-system/
|
|
870
|
-
* @see https://melindaozel.com/arkit-to-facs-cheat-sheet/
|
|
871
|
-
*/
|
|
872
|
-
declare const EMOTION_ARKIT_MAP: Record<Emotion2VecLabel, Partial<UpperFaceBlendshapes>>;
|
|
873
|
-
/**
|
|
874
|
-
* Configuration for EmotionToBlendshapeMapper
|
|
875
|
-
*/
|
|
876
|
-
interface EmotionBlendshapeConfig {
|
|
877
|
-
/**
|
|
878
|
-
* Smoothing factor for exponential moving average (0-1)
|
|
879
|
-
* Lower = slower, smoother transitions
|
|
880
|
-
* Higher = faster, more responsive
|
|
881
|
-
* @default 0.15
|
|
882
|
-
*/
|
|
883
|
-
smoothingFactor?: number;
|
|
884
|
-
/**
|
|
885
|
-
* Minimum confidence threshold for emotion to take effect
|
|
886
|
-
* Emotions below this confidence are treated as neutral
|
|
887
|
-
* @default 0.3
|
|
888
|
-
*/
|
|
889
|
-
confidenceThreshold?: number;
|
|
890
|
-
/**
|
|
891
|
-
* Global intensity multiplier for all blendshapes (0-2)
|
|
892
|
-
* @default 1.0
|
|
893
|
-
*/
|
|
894
|
-
intensity?: number;
|
|
895
|
-
/**
|
|
896
|
-
* Blend mode for combining emotions
|
|
897
|
-
* - 'dominant': Use only the strongest emotion (default)
|
|
898
|
-
* - 'weighted': Blend all emotions by probability
|
|
899
|
-
* @default 'dominant'
|
|
900
|
-
*/
|
|
901
|
-
blendMode?: EmotionBlendMode;
|
|
902
|
-
/**
|
|
903
|
-
* Minimum probability for an emotion to contribute in weighted blend mode
|
|
904
|
-
* Emotions with probability below this are ignored
|
|
905
|
-
* @default 0.1
|
|
906
|
-
*/
|
|
907
|
-
minBlendProbability?: number;
|
|
908
|
-
/**
|
|
909
|
-
* Enable energy modulation - scale emotion intensity by audio energy
|
|
910
|
-
* When enabled, louder speech produces stronger expressions
|
|
911
|
-
* @default false
|
|
912
|
-
*/
|
|
913
|
-
energyModulation?: boolean;
|
|
914
|
-
/**
|
|
915
|
-
* Minimum energy scale when energy modulation is enabled (0-1)
|
|
916
|
-
* At zero audio energy, emotion intensity is scaled by this factor
|
|
917
|
-
* @default 0.3
|
|
918
|
-
*/
|
|
919
|
-
minEnergyScale?: number;
|
|
920
|
-
/**
|
|
921
|
-
* Maximum energy scale when energy modulation is enabled (0-2)
|
|
922
|
-
* At maximum audio energy, emotion intensity is scaled by this factor
|
|
923
|
-
* @default 1.0
|
|
924
|
-
*/
|
|
925
|
-
maxEnergyScale?: number;
|
|
926
|
-
}
|
|
927
|
-
/**
|
|
928
|
-
* EmotionToBlendshapeMapper
|
|
929
|
-
*
|
|
930
|
-
* Converts emotion detection output to upper face ARKit blendshapes.
|
|
931
|
-
* Provides smooth transitions between emotion states using exponential
|
|
932
|
-
* moving average interpolation.
|
|
933
|
-
*
|
|
934
|
-
* Supports two blend modes:
|
|
935
|
-
* - 'dominant': Uses only the strongest emotion
|
|
936
|
-
* - 'weighted': Blends all emotions by probability for nuanced expressions
|
|
937
|
-
*
|
|
938
|
-
* Also supports energy modulation to scale emotion intensity by audio energy.
|
|
939
|
-
*/
|
|
940
|
-
declare class EmotionToBlendshapeMapper {
|
|
941
|
-
private config;
|
|
942
|
-
private targetBlendshapes;
|
|
943
|
-
private currentBlendshapes;
|
|
944
|
-
private currentEnergy;
|
|
945
|
-
/**
|
|
946
|
-
* Create a new EmotionToBlendshapeMapper
|
|
947
|
-
*
|
|
948
|
-
* @param config - Optional configuration
|
|
949
|
-
*/
|
|
950
|
-
constructor(config?: EmotionBlendshapeConfig);
|
|
951
|
-
/**
|
|
952
|
-
* Map an emotion frame to target blendshapes
|
|
953
|
-
*
|
|
954
|
-
* This sets the target values that the mapper will smoothly interpolate
|
|
955
|
-
* towards. Call update() each frame to apply smoothing.
|
|
956
|
-
*
|
|
957
|
-
* @param frame - Emotion frame from Emotion2VecInference
|
|
958
|
-
* @param audioEnergy - Optional audio energy (0-1) for energy modulation
|
|
959
|
-
* @returns Target upper face blendshapes (before smoothing)
|
|
960
|
-
*/
|
|
961
|
-
mapFrame(frame: EmotionFrame, audioEnergy?: number): UpperFaceBlendshapes;
|
|
962
|
-
/**
|
|
963
|
-
* Map using dominant emotion only (original behavior)
|
|
964
|
-
*/
|
|
965
|
-
private mapFrameDominant;
|
|
966
|
-
/**
|
|
967
|
-
* Map using weighted blend of all emotions by probability
|
|
968
|
-
* Creates more nuanced expressions (e.g., bittersweet = happy + sad)
|
|
969
|
-
*/
|
|
970
|
-
private mapFrameWeighted;
|
|
971
|
-
/**
|
|
972
|
-
* Apply energy modulation to scale emotion intensity by audio energy
|
|
973
|
-
* Louder speech = stronger expressions
|
|
974
|
-
*/
|
|
975
|
-
private applyEnergyModulation;
|
|
976
|
-
/**
|
|
977
|
-
* Apply smoothing to interpolate current values towards target
|
|
978
|
-
*
|
|
979
|
-
* Uses exponential moving average:
|
|
980
|
-
* current = current + smoothingFactor * (target - current)
|
|
981
|
-
*
|
|
982
|
-
* @param _deltaMs - Delta time in milliseconds (reserved for future time-based smoothing)
|
|
983
|
-
*/
|
|
984
|
-
update(_deltaMs: number): void;
|
|
985
|
-
/**
|
|
986
|
-
* Get current smoothed blendshape values
|
|
987
|
-
*
|
|
988
|
-
* @returns Current upper face blendshapes (after smoothing)
|
|
989
|
-
*/
|
|
990
|
-
getCurrentBlendshapes(): UpperFaceBlendshapes;
|
|
991
|
-
/**
|
|
992
|
-
* Reset mapper to neutral state
|
|
993
|
-
*
|
|
994
|
-
* Sets both target and current blendshapes to zero.
|
|
995
|
-
*/
|
|
996
|
-
reset(): void;
|
|
997
|
-
/**
|
|
998
|
-
* Get current configuration
|
|
999
|
-
*/
|
|
1000
|
-
getConfig(): Required<EmotionBlendshapeConfig>;
|
|
1001
|
-
/**
|
|
1002
|
-
* Update configuration
|
|
1003
|
-
*
|
|
1004
|
-
* @param config - Partial configuration to update
|
|
1005
|
-
*/
|
|
1006
|
-
setConfig(config: Partial<EmotionBlendshapeConfig>): void;
|
|
1007
|
-
}
|
|
1008
|
-
|
|
1009
|
-
/**
|
|
1010
|
-
* FullFacePipeline - Combined LAM lip sync + Emotion upper face pipeline
|
|
1011
|
-
*
|
|
1012
|
-
* Orchestrates full-face animation by combining:
|
|
1013
|
-
* 1. LAM lip sync (52 ARKit blendshapes) via audio-first scheduling
|
|
1014
|
-
* 2. Emotion labels (from backend LLM or `setEmotionLabel()`) for upper face
|
|
1015
|
-
* 3. AudioEnergyAnalyzer for prosody-driven fallback when no emotion label is set
|
|
1016
|
-
*
|
|
1017
|
-
* Architecture: Audio-First, LAM-Background (same as SyncedAudioPipeline)
|
|
1018
|
-
* - Audio chunks are scheduled for playback immediately (never waits for LAM)
|
|
1019
|
-
* - LAM inference runs in background without blocking the audio path
|
|
1020
|
-
* - Lip sync starts ~1 second after audio (LAM needs 16000 samples to infer)
|
|
1021
|
-
*
|
|
1022
|
-
* Merge Strategy:
|
|
1023
|
-
* - Lower face (41 blendshapes): 100% from LAM (mouth, jaw, tongue, etc.)
|
|
1024
|
-
* - Upper face (11 blendshapes): Emotion overlay with LAM as subtle fallback
|
|
1025
|
-
* Formula: emotion * emotionBlendFactor + lam * lamBlendFactor
|
|
1026
|
-
*
|
|
1027
|
-
* Emotion Sources (in priority order):
|
|
1028
|
-
* 1. `setEmotionLabel()` — explicit label from backend LLM (recommended)
|
|
1029
|
-
* 2. Prosody fallback — subtle brow movement from audio energy (automatic)
|
|
502
|
+
* The A2E model outputs all 52 ARKit blendshapes from audio — brows, eyes, cheeks,
|
|
503
|
+
* mouth, jaw, everything. ExpressionProfile allows per-character weight scaling
|
|
504
|
+
* by group (eyes, brows, jaw, mouth, cheeks, nose, tongue) with per-blendshape overrides.
|
|
1030
505
|
*
|
|
1031
506
|
* @category Audio
|
|
1032
507
|
*
|
|
@@ -1036,8 +511,7 @@ declare class EmotionToBlendshapeMapper {
|
|
|
1036
511
|
*
|
|
1037
512
|
* const pipeline = new FullFacePipeline({
|
|
1038
513
|
* lam,
|
|
1039
|
-
*
|
|
1040
|
-
* lamBlendFactor: 0.2,
|
|
514
|
+
* profile: { mouth: 1.2, brows: 0.8 },
|
|
1041
515
|
* });
|
|
1042
516
|
* await pipeline.initialize();
|
|
1043
517
|
*
|
|
@@ -1046,11 +520,41 @@ declare class EmotionToBlendshapeMapper {
|
|
|
1046
520
|
* });
|
|
1047
521
|
*
|
|
1048
522
|
* pipeline.start();
|
|
1049
|
-
* pipeline.setEmotionLabel('happy'); // From backend LLM
|
|
1050
523
|
* await pipeline.onAudioChunk(audioData);
|
|
1051
524
|
* ```
|
|
1052
525
|
*/
|
|
1053
526
|
|
|
527
|
+
type BlendshapeGroup = 'eyes' | 'brows' | 'jaw' | 'mouth' | 'cheeks' | 'nose' | 'tongue';
|
|
528
|
+
/**
|
|
529
|
+
* Per-character weight scaling for A2E blendshape output.
|
|
530
|
+
*
|
|
531
|
+
* Group scalers multiply all blendshapes in that group (default 1.0).
|
|
532
|
+
* Per-blendshape overrides take priority over group scalers.
|
|
533
|
+
* Final values are clamped to [0, 1].
|
|
534
|
+
*/
|
|
535
|
+
interface ExpressionProfile {
|
|
536
|
+
/** eyeBlink*, eyeLook*, eyeSquint*, eyeWide* (14 blendshapes) */
|
|
537
|
+
eyes?: number;
|
|
538
|
+
/** browDown*, browInnerUp, browOuterUp* (5 blendshapes) */
|
|
539
|
+
brows?: number;
|
|
540
|
+
/** jawForward, jawLeft, jawRight, jawOpen (4 blendshapes) */
|
|
541
|
+
jaw?: number;
|
|
542
|
+
/** mouth* (23 blendshapes) */
|
|
543
|
+
mouth?: number;
|
|
544
|
+
/** cheekPuff, cheekSquint* (3 blendshapes) */
|
|
545
|
+
cheeks?: number;
|
|
546
|
+
/** noseSneer* (2 blendshapes) */
|
|
547
|
+
nose?: number;
|
|
548
|
+
/** tongueOut (1 blendshape) */
|
|
549
|
+
tongue?: number;
|
|
550
|
+
/** Per-blendshape overrides (0-2). Takes priority over group scalers. */
|
|
551
|
+
overrides?: Partial<Record<string, number>>;
|
|
552
|
+
}
|
|
553
|
+
/**
|
|
554
|
+
* Map each LAM_BLENDSHAPES entry to its BlendshapeGroup.
|
|
555
|
+
* Built once at module load from prefix matching.
|
|
556
|
+
*/
|
|
557
|
+
declare const BLENDSHAPE_TO_GROUP: Map<string, BlendshapeGroup>;
|
|
1054
558
|
/**
|
|
1055
559
|
* Configuration for FullFacePipeline
|
|
1056
560
|
*/
|
|
@@ -1061,37 +565,43 @@ interface FullFacePipelineOptions {
|
|
|
1061
565
|
chunkTargetMs?: number;
|
|
1062
566
|
/**
|
|
1063
567
|
* Audio playback delay in ms before first audio plays.
|
|
1064
|
-
* Gives
|
|
1065
|
-
*
|
|
568
|
+
* Gives A2E inference time to pre-compute blendshapes before audio
|
|
569
|
+
* starts, preventing frame drops/desync. Must be ≥ chunkSize
|
|
570
|
+
* accumulation time + inference latency.
|
|
571
|
+
*
|
|
572
|
+
* Default: auto-calculated from chunkSize and backend type.
|
|
1066
573
|
*/
|
|
1067
574
|
audioDelayMs?: number;
|
|
1068
|
-
/** LAM inference engine */
|
|
1069
|
-
lam: LipSyncBackend;
|
|
1070
575
|
/**
|
|
1071
|
-
*
|
|
1072
|
-
*
|
|
1073
|
-
*
|
|
576
|
+
* A2E inference chunk size in samples.
|
|
577
|
+
* Controls how many samples accumulate before each inference call.
|
|
578
|
+
* Smaller = lower latency (less delay before first frame), more overhead.
|
|
579
|
+
* Larger = higher latency, less overhead.
|
|
580
|
+
*
|
|
581
|
+
* Default: 16000 (1s) — the model's native window size.
|
|
582
|
+
* Smaller chunks get zero-padded, causing near-zero blendshape output.
|
|
1074
583
|
*/
|
|
1075
|
-
|
|
584
|
+
chunkSize?: number;
|
|
585
|
+
/** A2E inference engine */
|
|
586
|
+
lam: A2EBackend;
|
|
587
|
+
/** Per-character expression weight scaling */
|
|
588
|
+
profile?: ExpressionProfile;
|
|
1076
589
|
/**
|
|
1077
|
-
*
|
|
1078
|
-
*
|
|
1079
|
-
*
|
|
590
|
+
* Time in ms with no new inference frames before logging a stale warning.
|
|
591
|
+
*
|
|
592
|
+
* Must be larger than the inter-batch gap (chunkSize/sampleRate + inference time).
|
|
593
|
+
* Default: 2000
|
|
1080
594
|
*/
|
|
1081
|
-
|
|
595
|
+
staleThresholdMs?: number;
|
|
1082
596
|
}
|
|
1083
597
|
/**
|
|
1084
|
-
* Full face frame with
|
|
598
|
+
* Full face frame with scaled blendshapes
|
|
1085
599
|
*/
|
|
1086
600
|
interface FullFaceFrame {
|
|
1087
|
-
/**
|
|
601
|
+
/** Scaled 52 ARKit blendshapes (ExpressionProfile applied) */
|
|
1088
602
|
blendshapes: Float32Array;
|
|
1089
|
-
/**
|
|
1090
|
-
|
|
1091
|
-
/** Emotion-driven upper face blendshapes (11) */
|
|
1092
|
-
emotionBlendshapes: UpperFaceBlendshapes;
|
|
1093
|
-
/** Raw emotion frame data */
|
|
1094
|
-
emotion: EmotionFrame | null;
|
|
603
|
+
/** Raw A2E output (52 blendshapes, before profile scaling) */
|
|
604
|
+
rawBlendshapes: Float32Array;
|
|
1095
605
|
/** AudioContext timestamp for this frame */
|
|
1096
606
|
timestamp: number;
|
|
1097
607
|
}
|
|
@@ -1103,8 +613,6 @@ interface FullFacePipelineEvents {
|
|
|
1103
613
|
full_frame_ready: FullFaceFrame;
|
|
1104
614
|
/** Raw LAM frame ready (for debugging/monitoring) */
|
|
1105
615
|
lam_frame_ready: Float32Array;
|
|
1106
|
-
/** Emotion frame ready (for debugging/monitoring) */
|
|
1107
|
-
emotion_frame_ready: EmotionFrame;
|
|
1108
616
|
/** Playback has completed */
|
|
1109
617
|
playback_complete: void;
|
|
1110
618
|
/** First frame ready, playback starting */
|
|
@@ -1115,53 +623,45 @@ interface FullFacePipelineEvents {
|
|
|
1115
623
|
[key: string]: unknown;
|
|
1116
624
|
}
|
|
1117
625
|
/**
|
|
1118
|
-
* FullFacePipeline -
|
|
626
|
+
* FullFacePipeline - A2E animation pipeline with ExpressionProfile scaling
|
|
1119
627
|
*
|
|
1120
628
|
* Audio-first design matching SyncedAudioPipeline:
|
|
1121
|
-
* - Audio is scheduled immediately (never waits for
|
|
1122
|
-
* -
|
|
1123
|
-
* -
|
|
629
|
+
* - Audio is scheduled immediately (never waits for A2E)
|
|
630
|
+
* - A2E runs in background (fire-and-forget via A2EProcessor)
|
|
631
|
+
* - ExpressionProfile scales raw A2E output per-character
|
|
1124
632
|
*/
|
|
1125
633
|
declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
|
|
1126
634
|
private readonly options;
|
|
1127
635
|
private scheduler;
|
|
1128
636
|
private coalescer;
|
|
1129
|
-
private
|
|
1130
|
-
private emotionMapper;
|
|
1131
|
-
private energyAnalyzer;
|
|
637
|
+
private processor;
|
|
1132
638
|
private playbackStarted;
|
|
1133
639
|
private monitorInterval;
|
|
1134
640
|
private frameAnimationId;
|
|
1135
|
-
private lastEmotionFrame;
|
|
1136
|
-
private currentAudioEnergy;
|
|
1137
641
|
private lastNewFrameTime;
|
|
1138
642
|
private lastKnownLamFrame;
|
|
1139
643
|
private staleWarningEmitted;
|
|
1140
|
-
private
|
|
1141
|
-
private
|
|
1142
|
-
private
|
|
644
|
+
private readonly staleThresholdMs;
|
|
645
|
+
private frameLoopCount;
|
|
646
|
+
private profile;
|
|
1143
647
|
constructor(options: FullFacePipelineOptions);
|
|
1144
648
|
/**
|
|
1145
649
|
* Initialize the pipeline
|
|
1146
650
|
*/
|
|
1147
651
|
initialize(): Promise<void>;
|
|
1148
652
|
/**
|
|
1149
|
-
*
|
|
1150
|
-
*
|
|
1151
|
-
* Converts a natural language emotion label into an EmotionFrame
|
|
1152
|
-
* that drives upper face blendshapes for the duration of the utterance.
|
|
1153
|
-
*
|
|
1154
|
-
* Supported labels: happy, excited, joyful, sad, melancholic, angry,
|
|
1155
|
-
* frustrated, neutral, etc.
|
|
1156
|
-
*
|
|
1157
|
-
* @param label - Emotion label string (case-insensitive)
|
|
653
|
+
* Update the ExpressionProfile at runtime (e.g., character switch).
|
|
1158
654
|
*/
|
|
1159
|
-
|
|
655
|
+
setProfile(profile: ExpressionProfile): void;
|
|
1160
656
|
/**
|
|
1161
|
-
*
|
|
1162
|
-
*
|
|
657
|
+
* Apply ExpressionProfile scaling to raw A2E blendshapes.
|
|
658
|
+
*
|
|
659
|
+
* For each blendshape:
|
|
660
|
+
* 1. If an override exists for the blendshape name, use override as scaler
|
|
661
|
+
* 2. Otherwise, use the group scaler (default 1.0)
|
|
662
|
+
* 3. Clamp result to [0, 1]
|
|
1163
663
|
*/
|
|
1164
|
-
|
|
664
|
+
applyProfile(raw: Float32Array): Float32Array;
|
|
1165
665
|
/**
|
|
1166
666
|
* Start a new playback session
|
|
1167
667
|
*
|
|
@@ -1172,29 +672,18 @@ declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
|
|
|
1172
672
|
/**
|
|
1173
673
|
* Receive audio chunk from network
|
|
1174
674
|
*
|
|
1175
|
-
* Audio-first design: schedules audio immediately,
|
|
1176
|
-
* This prevents
|
|
675
|
+
* Audio-first design: schedules audio immediately, A2E runs in background.
|
|
676
|
+
* This prevents A2E inference (50-300ms) from blocking audio scheduling.
|
|
1177
677
|
*
|
|
1178
678
|
* @param chunk - Uint8Array containing Int16 PCM audio
|
|
1179
679
|
*/
|
|
1180
680
|
onAudioChunk(chunk: Uint8Array): Promise<void>;
|
|
1181
|
-
/**
|
|
1182
|
-
* Get emotion frame for current animation.
|
|
1183
|
-
*
|
|
1184
|
-
* Priority:
|
|
1185
|
-
* 1. Explicit emotion label from setEmotionLabel()
|
|
1186
|
-
* 2. Prosody fallback: subtle brow movement from audio energy
|
|
1187
|
-
*/
|
|
1188
|
-
private getEmotionFrame;
|
|
1189
|
-
/**
|
|
1190
|
-
* Merge LAM blendshapes with emotion upper face blendshapes
|
|
1191
|
-
*/
|
|
1192
|
-
mergeBlendshapes(lamFrame: Float32Array, emotionFrame: EmotionFrame | null, audioEnergy?: number): {
|
|
1193
|
-
merged: Float32Array;
|
|
1194
|
-
emotionBlendshapes: UpperFaceBlendshapes;
|
|
1195
|
-
};
|
|
1196
681
|
/**
|
|
1197
682
|
* Start frame animation loop
|
|
683
|
+
*
|
|
684
|
+
* Polls A2EProcessor at render rate (60fps) for the latest inference frame
|
|
685
|
+
* matching the current AudioContext time. Between inference batches (~30fps
|
|
686
|
+
* bursts), getFrameForTime() holds the last frame.
|
|
1198
687
|
*/
|
|
1199
688
|
private startFrameLoop;
|
|
1200
689
|
/**
|
|
@@ -1219,17 +708,11 @@ declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
|
|
|
1219
708
|
getState(): {
|
|
1220
709
|
playbackStarted: boolean;
|
|
1221
710
|
coalescerFill: number;
|
|
1222
|
-
|
|
1223
|
-
|
|
1224
|
-
emotionLabel: "neutral" | "happy" | "angry" | "sad" | null;
|
|
1225
|
-
currentAudioEnergy: number;
|
|
711
|
+
processorFill: number;
|
|
712
|
+
queuedFrames: number;
|
|
1226
713
|
currentTime: number;
|
|
1227
714
|
playbackEndTime: number;
|
|
1228
715
|
};
|
|
1229
|
-
/**
|
|
1230
|
-
* Check if an explicit emotion label is currently set
|
|
1231
|
-
*/
|
|
1232
|
-
get hasEmotionLabel(): boolean;
|
|
1233
716
|
/**
|
|
1234
717
|
* Cleanup resources
|
|
1235
718
|
*/
|
|
@@ -1255,13 +738,6 @@ declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
|
|
|
1255
738
|
* @module inference/onnxLoader
|
|
1256
739
|
*/
|
|
1257
740
|
|
|
1258
|
-
type OrtModule = {
|
|
1259
|
-
InferenceSession: typeof InferenceSession;
|
|
1260
|
-
Tensor: typeof Tensor;
|
|
1261
|
-
env: Env;
|
|
1262
|
-
};
|
|
1263
|
-
type SessionOptions = InferenceSession.SessionOptions;
|
|
1264
|
-
|
|
1265
741
|
/**
|
|
1266
742
|
* Check if WebGPU is available and likely to work
|
|
1267
743
|
*
|
|
@@ -1271,74 +747,6 @@ type SessionOptions = InferenceSession.SessionOptions;
|
|
|
1271
747
|
* @returns true if WebGPU is available and working
|
|
1272
748
|
*/
|
|
1273
749
|
declare function isWebGPUAvailable(): Promise<boolean>;
|
|
1274
|
-
/**
|
|
1275
|
-
* Load ONNX Runtime with the specified backend
|
|
1276
|
-
*
|
|
1277
|
-
* This lazily loads the appropriate bundle:
|
|
1278
|
-
* - 'wasm': Loads onnxruntime-web (WASM-only, smaller)
|
|
1279
|
-
* - 'webgpu': Loads onnxruntime-web/webgpu (includes WebGPU + WASM fallback)
|
|
1280
|
-
*
|
|
1281
|
-
* Once loaded, the same instance is reused for all subsequent calls.
|
|
1282
|
-
* If you need to switch backends, you must reload the page.
|
|
1283
|
-
*
|
|
1284
|
-
* @param backend The backend to load ('webgpu' or 'wasm')
|
|
1285
|
-
* @returns The ONNX Runtime module
|
|
1286
|
-
*/
|
|
1287
|
-
declare function getOnnxRuntime(backend: RuntimeBackend): Promise<OrtModule>;
|
|
1288
|
-
/**
|
|
1289
|
-
* Get the appropriate ONNX Runtime based on user preference
|
|
1290
|
-
*
|
|
1291
|
-
* This resolves the user's preference against platform capabilities
|
|
1292
|
-
* and loads the appropriate bundle.
|
|
1293
|
-
*
|
|
1294
|
-
* @param preference User's backend preference
|
|
1295
|
-
* @returns The ONNX Runtime module and the resolved backend
|
|
1296
|
-
*/
|
|
1297
|
-
declare function getOnnxRuntimeForPreference(preference?: BackendPreference): Promise<{
|
|
1298
|
-
ort: OrtModule;
|
|
1299
|
-
backend: RuntimeBackend;
|
|
1300
|
-
}>;
|
|
1301
|
-
/**
|
|
1302
|
-
* Get session options for creating an inference session
|
|
1303
|
-
*
|
|
1304
|
-
* This returns optimized session options based on the backend and platform.
|
|
1305
|
-
*
|
|
1306
|
-
* @param backend The backend being used
|
|
1307
|
-
* @returns Session options for InferenceSession.create()
|
|
1308
|
-
*/
|
|
1309
|
-
declare function getSessionOptions(backend: RuntimeBackend): SessionOptions;
|
|
1310
|
-
/**
|
|
1311
|
-
* Create an inference session with automatic fallback
|
|
1312
|
-
*
|
|
1313
|
-
* If WebGPU session creation fails, automatically falls back to WASM.
|
|
1314
|
-
*
|
|
1315
|
-
* @param modelBuffer The model data as ArrayBuffer
|
|
1316
|
-
* @param preferredBackend The preferred backend
|
|
1317
|
-
* @returns The created session and the backend used
|
|
1318
|
-
*/
|
|
1319
|
-
declare function createSessionWithFallback(modelBuffer: ArrayBuffer, preferredBackend: RuntimeBackend): Promise<{
|
|
1320
|
-
session: InferenceSession;
|
|
1321
|
-
backend: RuntimeBackend;
|
|
1322
|
-
}>;
|
|
1323
|
-
/**
|
|
1324
|
-
* Get the currently loaded backend (if any)
|
|
1325
|
-
*/
|
|
1326
|
-
declare function getLoadedBackend(): RuntimeBackend | null;
|
|
1327
|
-
/**
|
|
1328
|
-
* Check if ONNX Runtime has been loaded
|
|
1329
|
-
*/
|
|
1330
|
-
declare function isOnnxRuntimeLoaded(): boolean;
|
|
1331
|
-
/**
|
|
1332
|
-
* Preload ONNX Runtime and compile the WASM binary early
|
|
1333
|
-
*
|
|
1334
|
-
* Call this before loading heavy resources (Three.js, VRM models) to ensure
|
|
1335
|
-
* WASM memory is allocated in a clean JS heap, reducing iOS memory pressure.
|
|
1336
|
-
* Uses the singleton pattern — subsequent model loading reuses this instance.
|
|
1337
|
-
*
|
|
1338
|
-
* @param preference Backend preference (default: 'auto')
|
|
1339
|
-
* @returns The resolved backend that was loaded
|
|
1340
|
-
*/
|
|
1341
|
-
declare function preloadOnnxRuntime(preference?: BackendPreference): Promise<RuntimeBackend>;
|
|
1342
750
|
|
|
1343
751
|
/**
|
|
1344
752
|
* SenseVoice automatic speech recognition using ONNX Runtime Web
|
|
@@ -2094,8 +1502,9 @@ interface Wav2ArkitCpuWorkerConfig {
|
|
|
2094
1502
|
*
|
|
2095
1503
|
* @see Wav2ArkitCpuInference for main-thread version
|
|
2096
1504
|
*/
|
|
2097
|
-
declare class Wav2ArkitCpuWorker implements
|
|
1505
|
+
declare class Wav2ArkitCpuWorker implements A2EBackend {
|
|
2098
1506
|
readonly modelId: "wav2arkit_cpu";
|
|
1507
|
+
readonly chunkSize: number;
|
|
2099
1508
|
private worker;
|
|
2100
1509
|
private config;
|
|
2101
1510
|
private isLoading;
|
|
@@ -2124,7 +1533,7 @@ declare class Wav2ArkitCpuWorker implements LipSyncBackend {
|
|
|
2124
1533
|
/**
|
|
2125
1534
|
* Load the ONNX model in the worker
|
|
2126
1535
|
*/
|
|
2127
|
-
load(): Promise<
|
|
1536
|
+
load(): Promise<A2EModelInfo>;
|
|
2128
1537
|
/**
|
|
2129
1538
|
* Run inference on raw audio
|
|
2130
1539
|
*
|
|
@@ -2134,7 +1543,7 @@ declare class Wav2ArkitCpuWorker implements LipSyncBackend {
|
|
|
2134
1543
|
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
2135
1544
|
* @param _identityIndex - Ignored (identity 11 is baked into the model)
|
|
2136
1545
|
*/
|
|
2137
|
-
infer(audioSamples: Float32Array, _identityIndex?: number): Promise<
|
|
1546
|
+
infer(audioSamples: Float32Array, _identityIndex?: number): Promise<A2EResult>;
|
|
2138
1547
|
/**
|
|
2139
1548
|
* Queue inference to serialize worker calls
|
|
2140
1549
|
*/
|
|
@@ -2166,7 +1575,7 @@ declare class Wav2ArkitCpuWorker implements LipSyncBackend {
|
|
|
2166
1575
|
* await worker.init();
|
|
2167
1576
|
*
|
|
2168
1577
|
* const asr = createSenseVoice({ modelUrl: '...', unifiedWorker: worker });
|
|
2169
|
-
* const lam =
|
|
1578
|
+
* const lam = createA2E({ gpuModelUrl: '...', cpuModelUrl: '...', unifiedWorker: worker });
|
|
2170
1579
|
* const vad = createSileroVAD({ modelUrl: '...', unifiedWorker: worker });
|
|
2171
1580
|
* ```
|
|
2172
1581
|
*
|
|
@@ -2196,17 +1605,17 @@ declare class UnifiedInferenceWorker {
|
|
|
2196
1605
|
}): Promise<SenseVoiceModelInfo>;
|
|
2197
1606
|
transcribe(audio: Float32Array): Promise<SenseVoiceResult>;
|
|
2198
1607
|
disposeSenseVoice(): Promise<void>;
|
|
2199
|
-
|
|
1608
|
+
loadA2E(config: {
|
|
2200
1609
|
modelUrl: string;
|
|
2201
1610
|
externalDataUrl: string | null;
|
|
2202
|
-
}): Promise<
|
|
2203
|
-
|
|
1611
|
+
}): Promise<A2EModelInfo>;
|
|
1612
|
+
inferA2E(audio: Float32Array): Promise<{
|
|
2204
1613
|
blendshapes: Float32Array;
|
|
2205
1614
|
numFrames: number;
|
|
2206
1615
|
numBlendshapes: number;
|
|
2207
1616
|
inferenceTimeMs: number;
|
|
2208
1617
|
}>;
|
|
2209
|
-
|
|
1618
|
+
disposeA2E(): Promise<void>;
|
|
2210
1619
|
loadVAD(config: {
|
|
2211
1620
|
modelUrl: string;
|
|
2212
1621
|
sampleRate: number;
|
|
@@ -2252,10 +1661,11 @@ declare class SenseVoiceUnifiedAdapter implements SenseVoiceBackend {
|
|
|
2252
1661
|
/**
|
|
2253
1662
|
* Wav2ArkitCpu adapter backed by UnifiedInferenceWorker
|
|
2254
1663
|
*
|
|
2255
|
-
* Implements
|
|
1664
|
+
* Implements A2EBackend, delegating all inference to the shared worker.
|
|
2256
1665
|
*/
|
|
2257
|
-
declare class Wav2ArkitCpuUnifiedAdapter implements
|
|
1666
|
+
declare class Wav2ArkitCpuUnifiedAdapter implements A2EBackend {
|
|
2258
1667
|
readonly modelId: "wav2arkit_cpu";
|
|
1668
|
+
readonly chunkSize: number;
|
|
2259
1669
|
private worker;
|
|
2260
1670
|
private config;
|
|
2261
1671
|
private _isLoaded;
|
|
@@ -2263,8 +1673,8 @@ declare class Wav2ArkitCpuUnifiedAdapter implements LipSyncBackend {
|
|
|
2263
1673
|
constructor(worker: UnifiedInferenceWorker, config: Wav2ArkitCpuWorkerConfig);
|
|
2264
1674
|
get isLoaded(): boolean;
|
|
2265
1675
|
get backend(): RuntimeBackend | null;
|
|
2266
|
-
load(): Promise<
|
|
2267
|
-
infer(audioSamples: Float32Array, _identityIndex?: number): Promise<
|
|
1676
|
+
load(): Promise<A2EModelInfo>;
|
|
1677
|
+
infer(audioSamples: Float32Array, _identityIndex?: number): Promise<A2EResult>;
|
|
2268
1678
|
dispose(): Promise<void>;
|
|
2269
1679
|
}
|
|
2270
1680
|
/**
|
|
@@ -2392,116 +1802,6 @@ interface CreateSenseVoiceConfig {
|
|
|
2392
1802
|
*/
|
|
2393
1803
|
declare function createSenseVoice(config: CreateSenseVoiceConfig): SenseVoiceBackend;
|
|
2394
1804
|
|
|
2395
|
-
/**
|
|
2396
|
-
* Kaldi-compatible filterbank (fbank) feature extraction
|
|
2397
|
-
*
|
|
2398
|
-
* Pure TypeScript implementation matching kaldi-native-fbank parameters
|
|
2399
|
-
* used by SenseVoice. No external dependencies.
|
|
2400
|
-
*
|
|
2401
|
-
* Pipeline: audio → framing → windowing → FFT → power spectrum → mel filterbank → log
|
|
2402
|
-
*
|
|
2403
|
-
* @module inference/kaldiFbank
|
|
2404
|
-
*/
|
|
2405
|
-
interface KaldiFbankOptions {
|
|
2406
|
-
/** Frame length in ms (default: 25) */
|
|
2407
|
-
frameLengthMs?: number;
|
|
2408
|
-
/** Frame shift in ms (default: 10) */
|
|
2409
|
-
frameShiftMs?: number;
|
|
2410
|
-
/** Low frequency cutoff in Hz (default: 20) */
|
|
2411
|
-
lowFreq?: number;
|
|
2412
|
-
/** High frequency cutoff in Hz (default: sampleRate / 2) */
|
|
2413
|
-
highFreq?: number;
|
|
2414
|
-
/** Dither amount (default: 0 for deterministic output) */
|
|
2415
|
-
dither?: number;
|
|
2416
|
-
/** Preemphasis coefficient (default: 0.97) */
|
|
2417
|
-
preemphasis?: number;
|
|
2418
|
-
}
|
|
2419
|
-
/**
|
|
2420
|
-
* Compute Kaldi-compatible log mel filterbank features
|
|
2421
|
-
*
|
|
2422
|
-
* @param audio Raw audio samples (float32, [-1, 1] range)
|
|
2423
|
-
* @param sampleRate Sample rate in Hz (must be 16000 for SenseVoice)
|
|
2424
|
-
* @param numMelBins Number of mel bins (80 for SenseVoice)
|
|
2425
|
-
* @param opts Optional parameters
|
|
2426
|
-
* @returns Flattened Float32Array of shape [numFrames, numMelBins]
|
|
2427
|
-
*/
|
|
2428
|
-
declare function computeKaldiFbank(audio: Float32Array, sampleRate: number, numMelBins: number, opts?: KaldiFbankOptions): Float32Array;
|
|
2429
|
-
/**
|
|
2430
|
-
* Apply Low Frame Rate stacking for SenseVoice
|
|
2431
|
-
*
|
|
2432
|
-
* Concatenates lfrM consecutive frames with stride lfrN.
|
|
2433
|
-
* Left-pads with copies of first frame, right-pads last group.
|
|
2434
|
-
*
|
|
2435
|
-
* @param features Flattened [numFrames, featureDim]
|
|
2436
|
-
* @param featureDim Feature dimension per frame (e.g., 80)
|
|
2437
|
-
* @param lfrM Number of frames to stack (default: 7)
|
|
2438
|
-
* @param lfrN Stride (default: 6)
|
|
2439
|
-
* @returns Flattened [numOutputFrames, featureDim * lfrM]
|
|
2440
|
-
*/
|
|
2441
|
-
declare function applyLFR(features: Float32Array, featureDim: number, lfrM?: number, lfrN?: number): Float32Array;
|
|
2442
|
-
/**
|
|
2443
|
-
* Apply CMVN normalization in-place
|
|
2444
|
-
*
|
|
2445
|
-
* Formula: normalized[i] = (features[i] + negMean[i % dim]) * invStddev[i % dim]
|
|
2446
|
-
*
|
|
2447
|
-
* @param features Flattened feature array (modified in-place)
|
|
2448
|
-
* @param dim Feature dimension (560 for SenseVoice after LFR)
|
|
2449
|
-
* @param negMean Negative mean vector (dim-dimensional)
|
|
2450
|
-
* @param invStddev Inverse standard deviation vector (dim-dimensional)
|
|
2451
|
-
* @returns The same features array (for chaining)
|
|
2452
|
-
*/
|
|
2453
|
-
declare function applyCMVN(features: Float32Array, dim: number, negMean: Float32Array, invStddev: Float32Array): Float32Array;
|
|
2454
|
-
/**
|
|
2455
|
-
* Parse CMVN vectors from comma-separated strings (stored in ONNX metadata)
|
|
2456
|
-
*
|
|
2457
|
-
* The sherpa-onnx SenseVoice export stores neg_mean and inv_stddev
|
|
2458
|
-
* as comma-separated float strings in the model's metadata.
|
|
2459
|
-
*/
|
|
2460
|
-
declare function parseCMVNFromMetadata(negMeanStr: string, invStddevStr: string): {
|
|
2461
|
-
negMean: Float32Array;
|
|
2462
|
-
invStddev: Float32Array;
|
|
2463
|
-
};
|
|
2464
|
-
|
|
2465
|
-
/**
|
|
2466
|
-
* CTC greedy decoder for SenseVoice
|
|
2467
|
-
*
|
|
2468
|
-
* Decodes CTC logits into text with structured token parsing
|
|
2469
|
-
* for language, emotion, and audio event detection.
|
|
2470
|
-
*
|
|
2471
|
-
* @module inference/ctcDecoder
|
|
2472
|
-
*/
|
|
2473
|
-
interface CTCDecodeResult {
|
|
2474
|
-
/** Decoded text (speech content only) */
|
|
2475
|
-
text: string;
|
|
2476
|
-
/** Detected language (e.g., 'zh', 'en', 'ja', 'ko', 'yue') */
|
|
2477
|
-
language?: string;
|
|
2478
|
-
/** Detected emotion (e.g., 'HAPPY', 'SAD', 'ANGRY', 'NEUTRAL') */
|
|
2479
|
-
emotion?: string;
|
|
2480
|
-
/** Detected audio event (e.g., 'Speech', 'BGM', 'Laughter') */
|
|
2481
|
-
event?: string;
|
|
2482
|
-
}
|
|
2483
|
-
/** Resolve language string to SenseVoice language ID */
|
|
2484
|
-
declare function resolveLanguageId(language: string): number;
|
|
2485
|
-
/** Resolve text norm string to SenseVoice text norm ID */
|
|
2486
|
-
declare function resolveTextNormId(textNorm: string): number;
|
|
2487
|
-
/**
|
|
2488
|
-
* Parse tokens.txt into a token ID → string map
|
|
2489
|
-
*
|
|
2490
|
-
* Format: each line is "token_string token_id"
|
|
2491
|
-
* e.g., "<unk> 0", "▁the 3", "s 4"
|
|
2492
|
-
*/
|
|
2493
|
-
declare function parseTokensFile(content: string): Map<number, string>;
|
|
2494
|
-
/**
|
|
2495
|
-
* CTC greedy decode
|
|
2496
|
-
*
|
|
2497
|
-
* @param logits Raw logits from model output, flattened [seqLen, vocabSize]
|
|
2498
|
-
* @param seqLen Sequence length (time steps)
|
|
2499
|
-
* @param vocabSize Vocabulary size
|
|
2500
|
-
* @param tokenMap Token ID → string map from tokens.txt
|
|
2501
|
-
* @returns Decoded text and structured metadata
|
|
2502
|
-
*/
|
|
2503
|
-
declare function ctcGreedyDecode(logits: Float32Array, seqLen: number, vocabSize: number, tokenMap: Map<number, string>): CTCDecodeResult;
|
|
2504
|
-
|
|
2505
1805
|
/**
|
|
2506
1806
|
* Shared blendshape constants and utilities for lip sync inference
|
|
2507
1807
|
*
|
|
@@ -2521,26 +1821,18 @@ declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browI
|
|
|
2521
1821
|
/** Alias for backwards compatibility */
|
|
2522
1822
|
declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
|
|
2523
1823
|
/**
|
|
2524
|
-
*
|
|
2525
|
-
* From LAM official postprocessing (models/utils.py)
|
|
2526
|
-
* This fixes asymmetric output from the raw model
|
|
2527
|
-
*/
|
|
2528
|
-
declare function symmetrizeBlendshapes(frame: Float32Array): Float32Array;
|
|
2529
|
-
/**
|
|
2530
|
-
* wav2arkit_cpu model blendshape ordering
|
|
1824
|
+
* Linearly interpolate between two blendshape weight arrays.
|
|
2531
1825
|
*
|
|
2532
|
-
*
|
|
2533
|
-
*
|
|
2534
|
-
*
|
|
2535
|
-
*/
|
|
2536
|
-
declare const WAV2ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "jawRight"];
|
|
2537
|
-
/**
|
|
2538
|
-
* Remap a blendshape frame from wav2arkit_cpu ordering to LAM_BLENDSHAPES ordering
|
|
1826
|
+
* Pure math utility with zero renderer dependency — used by all renderer
|
|
1827
|
+
* adapters (@omote/three, @omote/babylon, @omote/r3f) for smooth frame
|
|
1828
|
+
* transitions.
|
|
2539
1829
|
*
|
|
2540
|
-
* @param
|
|
2541
|
-
* @
|
|
1830
|
+
* @param current - Current blendshape weights
|
|
1831
|
+
* @param target - Target blendshape weights
|
|
1832
|
+
* @param factor - Interpolation factor (0 = no change, 1 = snap to target). Default: 0.3
|
|
1833
|
+
* @returns Interpolated weights as number[]
|
|
2542
1834
|
*/
|
|
2543
|
-
declare function
|
|
1835
|
+
declare function lerpBlendshapes(current: Float32Array | number[], target: Float32Array | number[], factor?: number): number[];
|
|
2544
1836
|
|
|
2545
1837
|
/**
|
|
2546
1838
|
* Unified Wav2Vec2 inference engine for Audio-to-Expression + ASR
|
|
@@ -2582,6 +1874,12 @@ interface Wav2Vec2InferenceConfig {
|
|
|
2582
1874
|
backend?: InferenceBackend;
|
|
2583
1875
|
/** Number of identity classes (default: 12 for streaming model) */
|
|
2584
1876
|
numIdentityClasses?: number;
|
|
1877
|
+
/**
|
|
1878
|
+
* Number of audio samples per inference chunk (default: 16000).
|
|
1879
|
+
* Model supports variable chunk sizes. Smaller chunks = lower latency,
|
|
1880
|
+
* more inference overhead. 8000 (500ms) is recommended for real-time lip sync.
|
|
1881
|
+
*/
|
|
1882
|
+
chunkSize?: number;
|
|
2585
1883
|
}
|
|
2586
1884
|
interface ModelInfo {
|
|
2587
1885
|
backend: 'webgpu' | 'wasm';
|
|
@@ -2608,7 +1906,7 @@ interface Wav2Vec2Result {
|
|
|
2608
1906
|
/** Inference time in ms */
|
|
2609
1907
|
inferenceTimeMs: number;
|
|
2610
1908
|
}
|
|
2611
|
-
declare class Wav2Vec2Inference implements
|
|
1909
|
+
declare class Wav2Vec2Inference implements A2EBackend {
|
|
2612
1910
|
readonly modelId: "wav2vec2";
|
|
2613
1911
|
private session;
|
|
2614
1912
|
private ort;
|
|
@@ -2616,6 +1914,7 @@ declare class Wav2Vec2Inference implements LipSyncBackend {
|
|
|
2616
1914
|
private _backend;
|
|
2617
1915
|
private isLoading;
|
|
2618
1916
|
private numIdentityClasses;
|
|
1917
|
+
readonly chunkSize: number;
|
|
2619
1918
|
private inferenceQueue;
|
|
2620
1919
|
private poisoned;
|
|
2621
1920
|
private static readonly INFERENCE_TIMEOUT_MS;
|
|
@@ -2635,11 +1934,10 @@ declare class Wav2Vec2Inference implements LipSyncBackend {
|
|
|
2635
1934
|
load(): Promise<ModelInfo>;
|
|
2636
1935
|
/**
|
|
2637
1936
|
* Run inference on raw audio
|
|
2638
|
-
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
1937
|
+
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
2639
1938
|
* @param identityIndex - Optional identity index (0-11, default 0 = neutral)
|
|
2640
1939
|
*
|
|
2641
|
-
*
|
|
2642
|
-
* Audio will be zero-padded or truncated to 16000 samples.
|
|
1940
|
+
* Audio will be zero-padded or truncated to chunkSize samples.
|
|
2643
1941
|
*/
|
|
2644
1942
|
infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
|
|
2645
1943
|
/**
|
|
@@ -2707,8 +2005,9 @@ interface Wav2ArkitCpuConfig {
|
|
|
2707
2005
|
/** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
|
|
2708
2006
|
backend?: BackendPreference;
|
|
2709
2007
|
}
|
|
2710
|
-
declare class Wav2ArkitCpuInference implements
|
|
2008
|
+
declare class Wav2ArkitCpuInference implements A2EBackend {
|
|
2711
2009
|
readonly modelId: "wav2arkit_cpu";
|
|
2010
|
+
readonly chunkSize: number;
|
|
2712
2011
|
private session;
|
|
2713
2012
|
private ort;
|
|
2714
2013
|
private config;
|
|
@@ -2723,7 +2022,7 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
|
|
|
2723
2022
|
/**
|
|
2724
2023
|
* Load the ONNX model
|
|
2725
2024
|
*/
|
|
2726
|
-
load(): Promise<
|
|
2025
|
+
load(): Promise<A2EModelInfo>;
|
|
2727
2026
|
/**
|
|
2728
2027
|
* Run inference on raw audio
|
|
2729
2028
|
*
|
|
@@ -2733,7 +2032,7 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
|
|
|
2733
2032
|
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
2734
2033
|
* @param _identityIndex - Ignored (identity 11 is baked into the model)
|
|
2735
2034
|
*/
|
|
2736
|
-
infer(audioSamples: Float32Array, _identityIndex?: number): Promise<
|
|
2035
|
+
infer(audioSamples: Float32Array, _identityIndex?: number): Promise<A2EResult>;
|
|
2737
2036
|
/**
|
|
2738
2037
|
* Queue inference to serialize ONNX session calls
|
|
2739
2038
|
*/
|
|
@@ -2745,7 +2044,7 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
|
|
|
2745
2044
|
}
|
|
2746
2045
|
|
|
2747
2046
|
/**
|
|
2748
|
-
* Factory function for
|
|
2047
|
+
* Factory function for A2E with automatic GPU/CPU model selection
|
|
2749
2048
|
*
|
|
2750
2049
|
* Provides a unified API that automatically selects the optimal model:
|
|
2751
2050
|
* - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (404MB, WASM)
|
|
@@ -2766,20 +2065,20 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
|
|
|
2766
2065
|
*
|
|
2767
2066
|
* @example Auto-detect (recommended)
|
|
2768
2067
|
* ```typescript
|
|
2769
|
-
* import {
|
|
2068
|
+
* import { createA2E } from '@omote/core';
|
|
2770
2069
|
*
|
|
2771
|
-
* const
|
|
2070
|
+
* const a2e = createA2E({
|
|
2772
2071
|
* gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
|
|
2773
2072
|
* cpuModelUrl: '/models/wav2arkit_cpu.onnx',
|
|
2774
2073
|
* });
|
|
2775
2074
|
*
|
|
2776
|
-
* await
|
|
2777
|
-
* const { blendshapes } = await
|
|
2075
|
+
* await a2e.load();
|
|
2076
|
+
* const { blendshapes } = await a2e.infer(audioSamples);
|
|
2778
2077
|
* ```
|
|
2779
2078
|
*
|
|
2780
2079
|
* @example Force CPU model
|
|
2781
2080
|
* ```typescript
|
|
2782
|
-
* const
|
|
2081
|
+
* const a2e = createA2E({
|
|
2783
2082
|
* gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
|
|
2784
2083
|
* cpuModelUrl: '/models/wav2arkit_cpu.onnx',
|
|
2785
2084
|
* mode: 'cpu',
|
|
@@ -2788,9 +2087,9 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
|
|
|
2788
2087
|
*/
|
|
2789
2088
|
|
|
2790
2089
|
/**
|
|
2791
|
-
* Configuration for the
|
|
2090
|
+
* Configuration for the A2E factory
|
|
2792
2091
|
*/
|
|
2793
|
-
interface
|
|
2092
|
+
interface CreateA2EConfig {
|
|
2794
2093
|
/** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge) */
|
|
2795
2094
|
gpuModelUrl: string;
|
|
2796
2095
|
/**
|
|
@@ -2804,7 +2103,7 @@ interface CreateLipSyncConfig {
|
|
|
2804
2103
|
cpuModelUrl: string;
|
|
2805
2104
|
/**
|
|
2806
2105
|
* Model selection mode:
|
|
2807
|
-
* - 'auto': Safari/iOS
|
|
2106
|
+
* - 'auto': Safari/iOS -> CPU, everything else -> GPU (default)
|
|
2808
2107
|
* - 'gpu': Force GPU model (Wav2Vec2Inference)
|
|
2809
2108
|
* - 'cpu': Force CPU model (Wav2ArkitCpuInference)
|
|
2810
2109
|
*/
|
|
@@ -2838,12 +2137,322 @@ interface CreateLipSyncConfig {
|
|
|
2838
2137
|
unifiedWorker?: UnifiedInferenceWorker;
|
|
2839
2138
|
}
|
|
2840
2139
|
/**
|
|
2841
|
-
* Create
|
|
2140
|
+
* Create an A2E instance with automatic GPU/CPU model selection
|
|
2842
2141
|
*
|
|
2843
2142
|
* @param config - Factory configuration
|
|
2844
|
-
* @returns
|
|
2143
|
+
* @returns An A2EBackend instance (either GPU or CPU model)
|
|
2144
|
+
*/
|
|
2145
|
+
declare function createA2E(config: CreateA2EConfig): A2EBackend;
|
|
2146
|
+
|
|
2147
|
+
/**
|
|
2148
|
+
* A2EProcessor — Engine-agnostic audio-to-expression processor
|
|
2149
|
+
*
|
|
2150
|
+
* The core inference primitive: audio samples in → blendshape frames out.
|
|
2151
|
+
* No mic capture, no audio playback, no Web Audio API.
|
|
2152
|
+
*
|
|
2153
|
+
* This is what Unity/Unreal/Godot/any engine would use directly.
|
|
2154
|
+
* Web-specific concerns (mic, AudioContext, scheduling) live in the
|
|
2155
|
+
* orchestrator and pipeline layers above.
|
|
2156
|
+
*
|
|
2157
|
+
* Two output modes:
|
|
2158
|
+
* - **Pull mode**: `pushAudio(samples, timestamp)` + `getFrameForTime(t)`
|
|
2159
|
+
* For TTS playback where frames are synced to AudioContext clock.
|
|
2160
|
+
* - **Push mode**: `pushAudio(samples)` + `startDrip()` + `latestFrame`
|
|
2161
|
+
* For live mic / game loop where frames are consumed at ~30fps.
|
|
2162
|
+
*
|
|
2163
|
+
* @category Inference
|
|
2164
|
+
*
|
|
2165
|
+
* @example Pull mode (TTS playback)
|
|
2166
|
+
* ```typescript
|
|
2167
|
+
* const processor = new A2EProcessor({ backend: a2e });
|
|
2168
|
+
* processor.pushAudio(samples, audioContext.currentTime + delay);
|
|
2169
|
+
* const frame = processor.getFrameForTime(audioContext.currentTime);
|
|
2170
|
+
* ```
|
|
2171
|
+
*
|
|
2172
|
+
* @example Push mode (live mic)
|
|
2173
|
+
* ```typescript
|
|
2174
|
+
* const processor = new A2EProcessor({
|
|
2175
|
+
* backend: a2e,
|
|
2176
|
+
* onFrame: (frame) => applyToAvatar(frame),
|
|
2177
|
+
* });
|
|
2178
|
+
* processor.startDrip();
|
|
2179
|
+
* processor.pushAudio(micSamples); // no timestamp → drip mode
|
|
2180
|
+
* ```
|
|
2181
|
+
*/
|
|
2182
|
+
|
|
2183
|
+
interface A2EProcessorConfig {
|
|
2184
|
+
/** Inference backend */
|
|
2185
|
+
backend: A2EBackend;
|
|
2186
|
+
/** Sample rate (default: 16000) */
|
|
2187
|
+
sampleRate?: number;
|
|
2188
|
+
/** Samples per inference chunk (default: 16000 = 1s) */
|
|
2189
|
+
chunkSize?: number;
|
|
2190
|
+
/** Callback fired with each blendshape frame (push mode) */
|
|
2191
|
+
onFrame?: (frame: Float32Array) => void;
|
|
2192
|
+
/** Error callback */
|
|
2193
|
+
onError?: (error: Error) => void;
|
|
2194
|
+
}
|
|
2195
|
+
declare class A2EProcessor {
|
|
2196
|
+
private readonly backend;
|
|
2197
|
+
private readonly sampleRate;
|
|
2198
|
+
private readonly chunkSize;
|
|
2199
|
+
private readonly onFrame?;
|
|
2200
|
+
private readonly onError?;
|
|
2201
|
+
private bufferCapacity;
|
|
2202
|
+
private buffer;
|
|
2203
|
+
private writeOffset;
|
|
2204
|
+
private bufferStartTime;
|
|
2205
|
+
private timestampedQueue;
|
|
2206
|
+
private plainQueue;
|
|
2207
|
+
private _latestFrame;
|
|
2208
|
+
private dripInterval;
|
|
2209
|
+
private lastPulledFrame;
|
|
2210
|
+
private inferenceRunning;
|
|
2211
|
+
private pendingChunks;
|
|
2212
|
+
private getFrameCallCount;
|
|
2213
|
+
private disposed;
|
|
2214
|
+
constructor(config: A2EProcessorConfig);
|
|
2215
|
+
/**
|
|
2216
|
+
* Push audio samples for inference (any source: mic, TTS, file).
|
|
2217
|
+
*
|
|
2218
|
+
* - With `timestamp`: frames stored with timestamps (pull mode)
|
|
2219
|
+
* - Without `timestamp`: frames stored in plain queue (drip/push mode)
|
|
2220
|
+
*
|
|
2221
|
+
* Fire-and-forget: returns immediately, inference runs async.
|
|
2222
|
+
*/
|
|
2223
|
+
pushAudio(samples: Float32Array, timestamp?: number): void;
|
|
2224
|
+
/**
|
|
2225
|
+
* Flush remaining buffered audio (pads to chunkSize).
|
|
2226
|
+
* Call at end of stream to process final partial chunk.
|
|
2227
|
+
*
|
|
2228
|
+
* Routes through the serialized pendingChunks pipeline to maintain
|
|
2229
|
+
* correct frame ordering. Without this, flush() could push frames
|
|
2230
|
+
* with the latest timestamp to the queue before drainPendingChunks()
|
|
2231
|
+
* finishes pushing frames with earlier timestamps — causing
|
|
2232
|
+
* getFrameForTime() to see out-of-order timestamps and stall.
|
|
2233
|
+
*/
|
|
2234
|
+
flush(): Promise<void>;
|
|
2235
|
+
/**
|
|
2236
|
+
* Reset buffer and frame queues
|
|
2237
|
+
*/
|
|
2238
|
+
reset(): void;
|
|
2239
|
+
/**
|
|
2240
|
+
* Get frame synced to external clock (e.g. AudioContext.currentTime).
|
|
2241
|
+
*
|
|
2242
|
+
* Discards frames that are too old, returns the current frame,
|
|
2243
|
+
* or holds last frame as fallback to prevent avatar freezing.
|
|
2244
|
+
*
|
|
2245
|
+
* @param currentTime - Current playback time (seconds)
|
|
2246
|
+
* @returns Blendshape frame, or null if no frames yet
|
|
2247
|
+
*/
|
|
2248
|
+
getFrameForTime(currentTime: number): Float32Array | null;
|
|
2249
|
+
/** Latest frame from drip-feed (live mic, game loop) */
|
|
2250
|
+
get latestFrame(): Float32Array | null;
|
|
2251
|
+
/** Start 30fps drip-feed timer (push mode) */
|
|
2252
|
+
startDrip(): void;
|
|
2253
|
+
/** Stop drip-feed timer */
|
|
2254
|
+
stopDrip(): void;
|
|
2255
|
+
/** Number of frames waiting in queue (both modes combined) */
|
|
2256
|
+
get queuedFrameCount(): number;
|
|
2257
|
+
/** Buffer fill level as fraction of chunkSize (0-1) */
|
|
2258
|
+
get fillLevel(): number;
|
|
2259
|
+
/** Dispose resources */
|
|
2260
|
+
dispose(): void;
|
|
2261
|
+
/**
|
|
2262
|
+
* Process pending chunks sequentially.
|
|
2263
|
+
* Fire-and-forget — called from pushAudio() without awaiting.
|
|
2264
|
+
*/
|
|
2265
|
+
private drainPendingChunks;
|
|
2266
|
+
private handleError;
|
|
2267
|
+
}
|
|
2268
|
+
|
|
2269
|
+
/**
|
|
2270
|
+
* BlendshapeSmoother — Per-channel critically damped spring for 52 ARKit blendshapes
|
|
2271
|
+
*
|
|
2272
|
+
* Eliminates frame gaps between inference batches by smoothly interpolating
|
|
2273
|
+
* blendshape weights using critically damped springs (the game industry standard).
|
|
2274
|
+
*
|
|
2275
|
+
* Each of the 52 blendshape channels has its own spring with position + velocity
|
|
2276
|
+
* state. When a new inference frame arrives, spring targets are updated. Between
|
|
2277
|
+
* frames, springs continue converging toward the last target — no frozen face.
|
|
2278
|
+
*
|
|
2279
|
+
* When inference stalls, `decayToNeutral()` sets all targets to 0, and the
|
|
2280
|
+
* springs smoothly close the mouth / relax the face over the halflife period.
|
|
2281
|
+
*
|
|
2282
|
+
* Math from Daniel Holden's "Spring-It-On" (Epic Games):
|
|
2283
|
+
* https://theorangeduck.com/page/spring-roll-call
|
|
2284
|
+
*
|
|
2285
|
+
* @category Inference
|
|
2286
|
+
*
|
|
2287
|
+
* @example Basic usage
|
|
2288
|
+
* ```typescript
|
|
2289
|
+
* const smoother = new BlendshapeSmoother({ halflife: 0.06 });
|
|
2290
|
+
*
|
|
2291
|
+
* // In frame loop (60fps):
|
|
2292
|
+
* smoother.setTarget(inferenceFrame); // when new frame arrives
|
|
2293
|
+
* const smoothed = smoother.update(1/60); // every render frame
|
|
2294
|
+
* applyToAvatar(smoothed);
|
|
2295
|
+
* ```
|
|
2845
2296
|
*/
|
|
2846
|
-
|
|
2297
|
+
interface BlendshapeSmootherConfig {
|
|
2298
|
+
/**
|
|
2299
|
+
* Spring halflife in seconds — time for the distance to the target
|
|
2300
|
+
* to reduce by half. Lower = snappier, higher = smoother.
|
|
2301
|
+
*
|
|
2302
|
+
* - 0.04s (40ms): Very snappy, slight jitter on fast transitions
|
|
2303
|
+
* - 0.06s (60ms): Sweet spot for lip sync (default)
|
|
2304
|
+
* - 0.10s (100ms): Very smooth, slight lag on fast consonants
|
|
2305
|
+
* - 0: Bypass mode — passes through raw target values (no smoothing)
|
|
2306
|
+
*
|
|
2307
|
+
* Default: 0.06
|
|
2308
|
+
*/
|
|
2309
|
+
halflife?: number;
|
|
2310
|
+
}
|
|
2311
|
+
declare class BlendshapeSmoother {
|
|
2312
|
+
private readonly halflife;
|
|
2313
|
+
/** Current smoothed blendshape values */
|
|
2314
|
+
private values;
|
|
2315
|
+
/** Per-channel spring velocities */
|
|
2316
|
+
private velocities;
|
|
2317
|
+
/** Current spring targets (from latest inference frame) */
|
|
2318
|
+
private targets;
|
|
2319
|
+
/** Whether any target has been set */
|
|
2320
|
+
private _hasTarget;
|
|
2321
|
+
constructor(config?: BlendshapeSmootherConfig);
|
|
2322
|
+
/** Whether a target frame has been set (false until first setTarget call) */
|
|
2323
|
+
get hasTarget(): boolean;
|
|
2324
|
+
/**
|
|
2325
|
+
* Set new target frame from inference output.
|
|
2326
|
+
* Springs will converge toward these values on subsequent update() calls.
|
|
2327
|
+
*/
|
|
2328
|
+
setTarget(frame: Float32Array): void;
|
|
2329
|
+
/**
|
|
2330
|
+
* Advance all 52 springs by `dt` seconds and return the smoothed frame.
|
|
2331
|
+
*
|
|
2332
|
+
* Call this every render frame (e.g., inside requestAnimationFrame).
|
|
2333
|
+
* Returns the internal values buffer — do NOT mutate the returned array.
|
|
2334
|
+
*
|
|
2335
|
+
* @param dt - Time step in seconds (e.g., 1/60 for 60fps)
|
|
2336
|
+
* @returns Smoothed blendshape values (Float32Array of 52)
|
|
2337
|
+
*/
|
|
2338
|
+
update(dt: number): Float32Array;
|
|
2339
|
+
/**
|
|
2340
|
+
* Decay all spring targets to neutral (0).
|
|
2341
|
+
*
|
|
2342
|
+
* Call when inference stalls (no new frames for threshold duration).
|
|
2343
|
+
* The springs will smoothly close the mouth / relax the face over
|
|
2344
|
+
* the halflife period rather than freezing.
|
|
2345
|
+
*/
|
|
2346
|
+
decayToNeutral(): void;
|
|
2347
|
+
/**
|
|
2348
|
+
* Reset all state (values, velocities, targets).
|
|
2349
|
+
* Call when starting a new playback session.
|
|
2350
|
+
*/
|
|
2351
|
+
reset(): void;
|
|
2352
|
+
}
|
|
2353
|
+
|
|
2354
|
+
/**
|
|
2355
|
+
* Renderer-agnostic A2E (audio-to-expression) orchestrator
|
|
2356
|
+
*
|
|
2357
|
+
* Manages the mic capture + A2E inference loop independently of any
|
|
2358
|
+
* 3D renderer. Adapter packages (@omote/three, @omote/babylon) wrap this
|
|
2359
|
+
* thinly and pipe `latestWeights` into their renderer-specific blendshape
|
|
2360
|
+
* controllers.
|
|
2361
|
+
*
|
|
2362
|
+
* Internally delegates all buffer accumulation, inference, and frame
|
|
2363
|
+
* drip-feeding to {@link A2EProcessor}. This class only handles mic capture
|
|
2364
|
+
* (getUserMedia, ScriptProcessorNode, resampling).
|
|
2365
|
+
*
|
|
2366
|
+
* @category Inference
|
|
2367
|
+
*/
|
|
2368
|
+
|
|
2369
|
+
/**
|
|
2370
|
+
* Progress event emitted during model download / compile
|
|
2371
|
+
*/
|
|
2372
|
+
interface A2EProgressEvent {
|
|
2373
|
+
phase: 'download' | 'compile';
|
|
2374
|
+
progress: number;
|
|
2375
|
+
}
|
|
2376
|
+
/**
|
|
2377
|
+
* Configuration for the A2EOrchestrator
|
|
2378
|
+
*/
|
|
2379
|
+
interface A2EOrchestratorConfig {
|
|
2380
|
+
/** URL for the GPU model (Wav2Vec2, Chrome/Firefox/Edge) */
|
|
2381
|
+
gpuModelUrl: string;
|
|
2382
|
+
/** URL for GPU model external data file */
|
|
2383
|
+
gpuExternalDataUrl?: string | false;
|
|
2384
|
+
/** URL for the CPU model (wav2arkit_cpu, Safari/iOS) */
|
|
2385
|
+
cpuModelUrl?: string;
|
|
2386
|
+
/** Sample rate for mic capture (default: 16000) */
|
|
2387
|
+
sampleRate?: number;
|
|
2388
|
+
/** Chunk size in samples for mic capture (default: 16000 = 1s at 16kHz) */
|
|
2389
|
+
chunkSize?: number;
|
|
2390
|
+
/** Callback fired with new blendshape weights after each inference */
|
|
2391
|
+
onFrame?: (weights: Float32Array) => void;
|
|
2392
|
+
/** Callback fired during model loading progress */
|
|
2393
|
+
onProgress?: (event: A2EProgressEvent) => void;
|
|
2394
|
+
/** Callback fired on error */
|
|
2395
|
+
onError?: (error: Error) => void;
|
|
2396
|
+
/** Callback fired when model is loaded and ready */
|
|
2397
|
+
onReady?: () => void;
|
|
2398
|
+
/** Additional createA2E config options */
|
|
2399
|
+
a2eConfig?: Partial<CreateA2EConfig>;
|
|
2400
|
+
}
|
|
2401
|
+
/**
|
|
2402
|
+
* Renderer-agnostic A2E orchestrator.
|
|
2403
|
+
*
|
|
2404
|
+
* Manages mic capture + delegates inference to {@link A2EProcessor}.
|
|
2405
|
+
* Adapters read `latestWeights` each frame to apply to their meshes.
|
|
2406
|
+
*
|
|
2407
|
+
* @example Quick start (used by @omote/three and @omote/babylon adapters)
|
|
2408
|
+
* ```typescript
|
|
2409
|
+
* const orchestrator = new A2EOrchestrator({
|
|
2410
|
+
* gpuModelUrl: '/models/wav2vec2.onnx',
|
|
2411
|
+
* cpuModelUrl: '/models/wav2arkit_cpu.onnx',
|
|
2412
|
+
* onFrame: (weights) => controller.update(weights),
|
|
2413
|
+
* });
|
|
2414
|
+
* await orchestrator.load();
|
|
2415
|
+
* await orchestrator.start();
|
|
2416
|
+
* ```
|
|
2417
|
+
*/
|
|
2418
|
+
declare class A2EOrchestrator {
|
|
2419
|
+
private config;
|
|
2420
|
+
private a2e;
|
|
2421
|
+
private processor;
|
|
2422
|
+
private stream;
|
|
2423
|
+
private audioContext;
|
|
2424
|
+
private scriptProcessor;
|
|
2425
|
+
private nativeSampleRate;
|
|
2426
|
+
private _isReady;
|
|
2427
|
+
private _isStreaming;
|
|
2428
|
+
private _backend;
|
|
2429
|
+
private disposed;
|
|
2430
|
+
constructor(config: A2EOrchestratorConfig);
|
|
2431
|
+
/** Latest blendshape weights from inference (null if none yet) */
|
|
2432
|
+
get latestWeights(): Float32Array | null;
|
|
2433
|
+
/** Whether the model is loaded and ready for inference */
|
|
2434
|
+
get isReady(): boolean;
|
|
2435
|
+
/** Whether mic is active and inference loop is running */
|
|
2436
|
+
get isStreaming(): boolean;
|
|
2437
|
+
/** Current backend type (webgpu, wasm, or null) */
|
|
2438
|
+
get backend(): string | null;
|
|
2439
|
+
/**
|
|
2440
|
+
* Load the A2E model and create the processor
|
|
2441
|
+
*/
|
|
2442
|
+
load(): Promise<void>;
|
|
2443
|
+
/**
|
|
2444
|
+
* Start mic capture and inference loop
|
|
2445
|
+
*/
|
|
2446
|
+
start(): Promise<void>;
|
|
2447
|
+
/**
|
|
2448
|
+
* Stop mic capture and inference loop
|
|
2449
|
+
*/
|
|
2450
|
+
stop(): void;
|
|
2451
|
+
/**
|
|
2452
|
+
* Dispose of all resources
|
|
2453
|
+
*/
|
|
2454
|
+
dispose(): Promise<void>;
|
|
2455
|
+
}
|
|
2847
2456
|
|
|
2848
2457
|
/**
|
|
2849
2458
|
* Safari Web Speech API wrapper for iOS speech recognition
|
|
@@ -5190,4 +4799,4 @@ declare class ProceduralLifeLayer {
|
|
|
5190
4799
|
private updateBrowNoise;
|
|
5191
4800
|
}
|
|
5192
4801
|
|
|
5193
|
-
export { type AIAdapter, type AIAdapterEvents, AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, type
|
|
4802
|
+
export { type A2EBackend, type A2EModelInfo, A2EOrchestrator, type A2EOrchestratorConfig, A2EProcessor, type A2EProcessorConfig, type A2EProgressEvent, type A2EResult, type AIAdapter, type AIAdapterEvents, AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, type CreateA2EConfig, type CreateSenseVoiceConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, type EmotionWeights, EmphasisDetector, EventEmitter, type ExpressionProfile, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, ProceduralLifeLayer, type QuotaInfo, RingBuffer, type RuntimeBackend, type 
SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SessionConfig, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2ArkitCpuUnifiedAdapter, Wav2ArkitCpuWorker, type Wav2ArkitCpuWorkerConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureTelemetry, createA2E, createEmotionVector, createSenseVoice, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getRecommendedBackend, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, preloadModels, resolveBackend, shouldEnableWasmProxy, shouldUseCpuA2E, shouldUseNativeASR, shouldUseServerA2E, supportsVADWorker };
|