@omote/core 0.4.6 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +484 -867
- package/dist/index.d.ts +484 -867
- package/dist/index.js +1419 -1598
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +972 -1151
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.d.mts
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import { EventEmitter, OmoteEvents, AISessionState, AnimationEvent } from './events/index.mjs';
|
|
2
2
|
export { BackendEvent, EmotionEvent, GazeEvent, STTFinalEvent, STTPartialEvent, SessionStateEvent, TTSEndEvent, TTSMarkEvent, TTSStartEvent, VisemeEvent } from './events/index.mjs';
|
|
3
|
-
import { InferenceSession, Tensor, Env } from 'onnxruntime-common';
|
|
4
3
|
export { D as DEFAULT_LOGGING_CONFIG, I as ILogger, e as LOG_LEVEL_PRIORITY, b as LogEntry, L as LogFormatter, a as LogLevel, c as LogSink, d as LoggingConfig, f as configureLogging, i as createLogger, g as getLoggingConfig, n as noopLogger, r as resetLoggingConfig, s as setLogLevel, h as setLoggingEnabled } from './Logger-I_k4sGhM.mjs';
|
|
5
4
|
export { ARKitToFLAMEMapping, ApiError, AudioChunkEvent, AvatarFormat, Character, CharacterAvatar, CharacterMemory, CharacterPersonality, CharacterSpec, CharacterVoice, CreateCharacterRequest, CreateCharacterResponse, CreateLAMJobRequest, CreateLAMJobResponse, CreateSessionRequest, CreateSessionResponse, GSplatConfig, LAMJob, LAMJobStatus, PROTOCOL_VERSION, PaginatedResponse, PlatformSession, ErrorEvent as ProtocolErrorEvent, ProtocolEvent, ResponseChunkEvent, ResponseEndEvent, ResponseStartEvent, SessionMessage, SessionStatus, isProtocolEvent } from '@omote/types';
|
|
6
5
|
|
|
@@ -379,7 +378,7 @@ declare function shouldEnableWasmProxy(): boolean;
|
|
|
379
378
|
*/
|
|
380
379
|
declare function isSafari(): boolean;
|
|
381
380
|
/**
|
|
382
|
-
* Recommend using CPU-optimized
|
|
381
|
+
* Recommend using CPU-optimized A2E model (wav2arkit_cpu)
|
|
383
382
|
*
|
|
384
383
|
* All iOS browsers use WebKit and have tight memory limits — the 384MB
|
|
385
384
|
* LAM model causes silent crashes. wav2arkit_cpu uses URL pass-through
|
|
@@ -390,7 +389,7 @@ declare function isSafari(): boolean;
|
|
|
390
389
|
*
|
|
391
390
|
* @returns true if iOS (any browser) or Safari (any platform)
|
|
392
391
|
*/
|
|
393
|
-
declare function
|
|
392
|
+
declare function shouldUseCpuA2E(): boolean;
|
|
394
393
|
/**
|
|
395
394
|
* Check if Web Speech API is available in the browser
|
|
396
395
|
*
|
|
@@ -415,18 +414,18 @@ declare function shouldUseNativeASR(): boolean;
|
|
|
415
414
|
/**
|
|
416
415
|
* Recommend using server-side LAM over client-side on iOS
|
|
417
416
|
*
|
|
418
|
-
* On iOS, LAM
|
|
417
|
+
* On iOS, LAM A2E via WASM takes ~332ms per second of audio (3.3x over target).
|
|
419
418
|
* Server-side inference with GPU can achieve ~50ms, providing:
|
|
420
|
-
* - Real-time
|
|
419
|
+
* - Real-time A2E (under 100ms target)
|
|
421
420
|
* - Reduced iOS device thermal/battery impact
|
|
422
421
|
* - Better user experience
|
|
423
422
|
*
|
|
424
|
-
* @returns true if on iOS (should use server-side
|
|
423
|
+
* @returns true if on iOS (should use server-side A2E)
|
|
425
424
|
*/
|
|
426
|
-
declare function
|
|
425
|
+
declare function shouldUseServerA2E(): boolean;
|
|
427
426
|
|
|
428
427
|
/**
|
|
429
|
-
* Common interface for
|
|
428
|
+
* Common interface for audio-to-expression (A2E) inference backends
|
|
430
429
|
*
|
|
431
430
|
* Both Wav2Vec2Inference (GPU, 384MB) and Wav2ArkitCpuInference (CPU, 404MB)
|
|
432
431
|
* implement this interface, allowing SyncedAudioPipeline and LAMPipeline to
|
|
@@ -438,19 +437,19 @@ declare function shouldUseServerLipSync(): boolean;
|
|
|
438
437
|
/**
|
|
439
438
|
* Model loading information returned by load()
|
|
440
439
|
*/
|
|
441
|
-
interface
|
|
440
|
+
interface A2EModelInfo {
|
|
442
441
|
backend: RuntimeBackend;
|
|
443
442
|
loadTimeMs: number;
|
|
444
443
|
inputNames: string[];
|
|
445
444
|
outputNames: string[];
|
|
446
445
|
}
|
|
447
446
|
/**
|
|
448
|
-
* Result from
|
|
447
|
+
* Result from A2E inference
|
|
449
448
|
*
|
|
450
449
|
* All implementations must return blendshapes in LAM_BLENDSHAPES order (alphabetical).
|
|
451
450
|
* Models with different native orderings must remap internally before returning.
|
|
452
451
|
*/
|
|
453
|
-
interface
|
|
452
|
+
interface A2EResult {
|
|
454
453
|
/** Blendshape weights [frames, 52] in LAM_BLENDSHAPES order - 30fps */
|
|
455
454
|
blendshapes: Float32Array[];
|
|
456
455
|
/** Number of blendshape frames */
|
|
@@ -459,31 +458,33 @@ interface LipSyncResult {
|
|
|
459
458
|
inferenceTimeMs: number;
|
|
460
459
|
}
|
|
461
460
|
/**
|
|
462
|
-
* Common interface for
|
|
461
|
+
* Common interface for A2E (audio-to-expression) inference engines
|
|
463
462
|
*
|
|
464
463
|
* Implemented by:
|
|
465
|
-
* - Wav2Vec2Inference (WebGPU/WASM, 384MB, ASR +
|
|
466
|
-
* - Wav2ArkitCpuInference (WASM-only, 404MB,
|
|
464
|
+
* - Wav2Vec2Inference (WebGPU/WASM, 384MB, ASR + A2E)
|
|
465
|
+
* - Wav2ArkitCpuInference (WASM-only, 404MB, A2E only)
|
|
467
466
|
*/
|
|
468
|
-
interface
|
|
467
|
+
interface A2EBackend {
|
|
469
468
|
/** Model identifier for backend-specific tuning (e.g. audio delay) */
|
|
470
469
|
readonly modelId: 'wav2vec2' | 'wav2arkit_cpu';
|
|
471
470
|
/** Current backend type (webgpu, wasm, or null if not loaded) */
|
|
472
471
|
readonly backend: RuntimeBackend | null;
|
|
473
472
|
/** Whether the model is loaded and ready for inference */
|
|
474
473
|
readonly isLoaded: boolean;
|
|
474
|
+
/** Optimal number of audio samples per inference call (e.g. 16000 = 1s at 16kHz) */
|
|
475
|
+
readonly chunkSize: number;
|
|
475
476
|
/**
|
|
476
477
|
* Load the ONNX model
|
|
477
478
|
* @returns Model loading information
|
|
478
479
|
*/
|
|
479
|
-
load(): Promise<
|
|
480
|
+
load(): Promise<A2EModelInfo>;
|
|
480
481
|
/**
|
|
481
482
|
* Run inference on raw audio
|
|
482
483
|
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
483
484
|
* @param identityIndex - Optional identity index (ignored by CPU model)
|
|
484
|
-
* @returns
|
|
485
|
+
* @returns A2E result with blendshapes in LAM_BLENDSHAPES order
|
|
485
486
|
*/
|
|
486
|
-
infer(audioSamples: Float32Array, identityIndex?: number): Promise<
|
|
487
|
+
infer(audioSamples: Float32Array, identityIndex?: number): Promise<A2EResult>;
|
|
487
488
|
/**
|
|
488
489
|
* Dispose of the model and free resources
|
|
489
490
|
*/
|
|
@@ -491,542 +492,16 @@ interface LipSyncBackend {
|
|
|
491
492
|
}
|
|
492
493
|
|
|
493
494
|
/**
|
|
494
|
-
*
|
|
495
|
+
* FullFacePipeline - A2E expression pipeline with ExpressionProfile weight scaling
|
|
495
496
|
*
|
|
496
|
-
*
|
|
497
|
-
* 1.
|
|
498
|
-
* 2.
|
|
499
|
-
* 3.
|
|
500
|
-
* 4. Provides frames synchronized to AudioContext clock
|
|
497
|
+
* Orchestrates full-face animation by:
|
|
498
|
+
* 1. Scheduling audio for playback immediately (audio-first, never waits for A2E)
|
|
499
|
+
* 2. Running A2E inference in background (fire-and-forget via A2EProcessor)
|
|
500
|
+
* 3. Applying per-character ExpressionProfile scaling to raw A2E output
|
|
501
501
|
*
|
|
502
|
-
*
|
|
503
|
-
*
|
|
504
|
-
*
|
|
505
|
-
* - Timestamp-based frame retrieval (not callback) for renderer flexibility
|
|
506
|
-
*
|
|
507
|
-
* Based on patterns from Chrome Audio Worklet design and Web Audio clock management.
|
|
508
|
-
*
|
|
509
|
-
* @see https://developer.chrome.com/blog/audio-worklet-design-pattern
|
|
510
|
-
* @category Audio
|
|
511
|
-
*/
|
|
512
|
-
|
|
513
|
-
interface LAMFrame {
|
|
514
|
-
/** 52 ARKit blendshape weights */
|
|
515
|
-
frame: Float32Array;
|
|
516
|
-
/** AudioContext time when this frame should be displayed */
|
|
517
|
-
timestamp: number;
|
|
518
|
-
}
|
|
519
|
-
interface LAMPipelineOptions {
|
|
520
|
-
/**
|
|
521
|
-
* Sample rate in Hz (must match audio playback)
|
|
522
|
-
* Default: 16000
|
|
523
|
-
*/
|
|
524
|
-
sampleRate?: number;
|
|
525
|
-
/**
|
|
526
|
-
* LAM inference callback
|
|
527
|
-
* Called each time LAM processes a buffer
|
|
528
|
-
*/
|
|
529
|
-
onInference?: (frameCount: number) => void;
|
|
530
|
-
/**
|
|
531
|
-
* Error callback for inference failures
|
|
532
|
-
*/
|
|
533
|
-
onError?: (error: Error) => void;
|
|
534
|
-
}
|
|
535
|
-
declare class LAMPipeline {
|
|
536
|
-
private readonly options;
|
|
537
|
-
private readonly REQUIRED_SAMPLES;
|
|
538
|
-
private readonly FRAME_RATE;
|
|
539
|
-
private buffer;
|
|
540
|
-
private bufferStartTime;
|
|
541
|
-
private frameQueue;
|
|
542
|
-
/**
|
|
543
|
-
* Last successfully retrieved frame
|
|
544
|
-
* Used as fallback when no new frame is available to prevent avatar freezing
|
|
545
|
-
*/
|
|
546
|
-
private lastFrame;
|
|
547
|
-
constructor(options?: LAMPipelineOptions);
|
|
548
|
-
/**
|
|
549
|
-
* Push audio samples into the pipeline
|
|
550
|
-
*
|
|
551
|
-
* Accumulates samples and triggers LAM inference when buffer is full.
|
|
552
|
-
* Multiple calls may be needed to accumulate enough samples.
|
|
553
|
-
*
|
|
554
|
-
* @param samples - Float32Array of audio samples
|
|
555
|
-
* @param timestamp - AudioContext time when these samples start playing
|
|
556
|
-
* @param lam - LAM inference engine
|
|
557
|
-
*/
|
|
558
|
-
push(samples: Float32Array, timestamp: number, lam: LipSyncBackend): Promise<void>;
|
|
559
|
-
/**
|
|
560
|
-
* Process accumulated buffer through LAM inference
|
|
561
|
-
*/
|
|
562
|
-
private processBuffer;
|
|
563
|
-
/**
|
|
564
|
-
* Get the frame that should be displayed at the current time
|
|
565
|
-
*
|
|
566
|
-
* Automatically removes frames that have already been displayed.
|
|
567
|
-
* This prevents memory leaks from accumulating old frames.
|
|
568
|
-
*
|
|
569
|
-
* Discard Window (prevents premature frame discarding):
|
|
570
|
-
* - WebGPU: 0.5s (LAM inference 20-100ms + RAF jitter + React stalls)
|
|
571
|
-
* - WASM: 1.0s (LAM inference 50-500ms + higher variability)
|
|
572
|
-
*
|
|
573
|
-
* Last-Frame-Hold: Returns last valid frame instead of null to prevent
|
|
574
|
-
* avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
|
|
575
|
-
*
|
|
576
|
-
* @param currentTime - Current AudioContext time
|
|
577
|
-
* @param lam - LAM inference engine (optional, for backend detection)
|
|
578
|
-
* @returns Current frame, or last frame as fallback, or null if no frames yet
|
|
579
|
-
*/
|
|
580
|
-
getFrameForTime(currentTime: number, lam?: {
|
|
581
|
-
backend: 'webgpu' | 'wasm' | null;
|
|
582
|
-
}): Float32Array | null;
|
|
583
|
-
/**
|
|
584
|
-
* Get all frames in the queue (for debugging/monitoring)
|
|
585
|
-
*/
|
|
586
|
-
getQueuedFrames(): LAMFrame[];
|
|
587
|
-
/**
|
|
588
|
-
* Get current buffer fill level (0-1)
|
|
589
|
-
*/
|
|
590
|
-
get fillLevel(): number;
|
|
591
|
-
/**
|
|
592
|
-
* Get number of frames queued
|
|
593
|
-
*/
|
|
594
|
-
get queuedFrameCount(): number;
|
|
595
|
-
/**
|
|
596
|
-
* Get buffered audio duration in seconds
|
|
597
|
-
*/
|
|
598
|
-
get bufferedDuration(): number;
|
|
599
|
-
/**
|
|
600
|
-
* Flush remaining buffered audio
|
|
601
|
-
*
|
|
602
|
-
* Processes any remaining audio in the buffer, even if less than REQUIRED_SAMPLES.
|
|
603
|
-
* This ensures the final audio chunk generates blendshape frames.
|
|
604
|
-
*
|
|
605
|
-
* Should be called when audio stream ends to prevent losing the last 0-1 seconds.
|
|
606
|
-
*
|
|
607
|
-
* @param lam - LAM inference engine
|
|
608
|
-
*/
|
|
609
|
-
flush(lam: LipSyncBackend): Promise<void>;
|
|
610
|
-
/**
|
|
611
|
-
* Adjust all queued frame timestamps by an offset
|
|
612
|
-
*
|
|
613
|
-
* Used for synchronization when audio scheduling time differs from
|
|
614
|
-
* the estimated time used during LAM processing.
|
|
615
|
-
*
|
|
616
|
-
* @param offset - Time offset in seconds to add to all timestamps
|
|
617
|
-
*/
|
|
618
|
-
adjustTimestamps(offset: number): void;
|
|
619
|
-
/**
|
|
620
|
-
* Reset the pipeline
|
|
621
|
-
*/
|
|
622
|
-
reset(): void;
|
|
623
|
-
}
|
|
624
|
-
|
|
625
|
-
/**
|
|
626
|
-
* SyncedAudioPipeline - Audio playback + LAM lip sync coordinator
|
|
627
|
-
*
|
|
628
|
-
* Orchestrates the complete pipeline for synchronized audio playback and lip sync:
|
|
629
|
-
* 1. Network chunks → Coalescer → Optimized buffers
|
|
630
|
-
* 2. Audio buffers → Scheduler → Gapless playback (immediate, never blocks)
|
|
631
|
-
* 3. Audio buffers → LAM Pipeline → Blendshape frames (background, fire-and-forget)
|
|
632
|
-
* 4. Frames synchronized to AudioContext clock → Renderer
|
|
633
|
-
*
|
|
634
|
-
* Key Architecture Pattern: Audio-First, LAM-Background
|
|
635
|
-
* - Audio chunks are scheduled for playback immediately (never waits for LAM)
|
|
636
|
-
* - LAM inference runs in background without blocking the audio path
|
|
637
|
-
* - Lip sync starts ~1 second after audio (LAM needs 16000 samples to infer)
|
|
638
|
-
* - Once LAM catches up, frames stay synchronized to AudioContext clock
|
|
639
|
-
*
|
|
640
|
-
* This decoupled design prevents LAM inference (50-300ms) from blocking audio
|
|
641
|
-
* scheduling, which caused audible stuttering when audio arrived as a continuous
|
|
642
|
-
* stream (e.g., single-call TTS from ElevenLabs via AgentCore).
|
|
643
|
-
*
|
|
644
|
-
* @see https://web.dev/articles/audio-scheduling (Web Audio clock patterns)
|
|
645
|
-
* @category Audio
|
|
646
|
-
*/
|
|
647
|
-
|
|
648
|
-
interface SyncedAudioPipelineOptions {
|
|
649
|
-
/** Sample rate in Hz (default: 16000) */
|
|
650
|
-
sampleRate?: number;
|
|
651
|
-
/** Target chunk duration in ms for coalescing (default: 200) */
|
|
652
|
-
chunkTargetMs?: number;
|
|
653
|
-
/** LAM inference engine */
|
|
654
|
-
lam: LipSyncBackend;
|
|
655
|
-
/**
|
|
656
|
-
* Audio playback delay in ms before first audio plays.
|
|
657
|
-
* Gives LAM inference time to pre-compute blendshapes.
|
|
658
|
-
* Default: auto-detected from lam.backend (50ms WebGPU, 350ms WASM).
|
|
659
|
-
*/
|
|
660
|
-
audioDelayMs?: number;
|
|
661
|
-
}
|
|
662
|
-
interface SyncedAudioPipelineEvents {
|
|
663
|
-
/** New frame ready for display */
|
|
664
|
-
frame_ready: Float32Array;
|
|
665
|
-
/** Playback has completed */
|
|
666
|
-
playback_complete: void;
|
|
667
|
-
/** First audio chunk scheduled, playback starting */
|
|
668
|
-
playback_start: number;
|
|
669
|
-
/** Error occurred */
|
|
670
|
-
error: Error;
|
|
671
|
-
/** Index signature for EventEmitter compatibility */
|
|
672
|
-
[key: string]: unknown;
|
|
673
|
-
}
|
|
674
|
-
declare class SyncedAudioPipeline extends EventEmitter<SyncedAudioPipelineEvents> {
|
|
675
|
-
private readonly options;
|
|
676
|
-
private scheduler;
|
|
677
|
-
private coalescer;
|
|
678
|
-
private lamPipeline;
|
|
679
|
-
private playbackStarted;
|
|
680
|
-
private monitorInterval;
|
|
681
|
-
private frameAnimationId;
|
|
682
|
-
constructor(options: SyncedAudioPipelineOptions);
|
|
683
|
-
/**
|
|
684
|
-
* Initialize the pipeline
|
|
685
|
-
*/
|
|
686
|
-
initialize(): Promise<void>;
|
|
687
|
-
/**
|
|
688
|
-
* Start a new playback session
|
|
689
|
-
*
|
|
690
|
-
* Resets all state and prepares for incoming audio chunks.
|
|
691
|
-
* Audio will be scheduled immediately as chunks arrive (no buffering).
|
|
692
|
-
*/
|
|
693
|
-
start(): void;
|
|
694
|
-
/**
|
|
695
|
-
* Receive audio chunk from network
|
|
696
|
-
*
|
|
697
|
-
* Audio-first design: schedules audio immediately, LAM runs in background.
|
|
698
|
-
* This prevents LAM inference (50-300ms) from blocking audio scheduling,
|
|
699
|
-
* which caused audible stuttering with continuous audio streams.
|
|
700
|
-
*
|
|
701
|
-
* @param chunk - Uint8Array containing Int16 PCM audio
|
|
702
|
-
*/
|
|
703
|
-
onAudioChunk(chunk: Uint8Array): Promise<void>;
|
|
704
|
-
/**
|
|
705
|
-
* End of audio stream
|
|
706
|
-
*
|
|
707
|
-
* Flushes any remaining buffered data.
|
|
708
|
-
*/
|
|
709
|
-
end(): Promise<void>;
|
|
710
|
-
/**
|
|
711
|
-
* Stop playback immediately with smooth fade-out
|
|
712
|
-
*
|
|
713
|
-
* Gracefully cancels all audio playback and LAM processing:
|
|
714
|
-
* - Fades out audio over specified duration (default: 50ms)
|
|
715
|
-
* - Cancels pending LAM inferences
|
|
716
|
-
* - Clears all buffers and queues
|
|
717
|
-
* - Emits 'playback_complete' event
|
|
718
|
-
*
|
|
719
|
-
* Use this for interruptions (e.g., user barge-in during AI speech).
|
|
720
|
-
*
|
|
721
|
-
* @param fadeOutMs - Fade-out duration in milliseconds (default: 50ms)
|
|
722
|
-
* @returns Promise that resolves when fade-out completes
|
|
723
|
-
*/
|
|
724
|
-
stop(fadeOutMs?: number): Promise<void>;
|
|
725
|
-
/**
|
|
726
|
-
* Start frame animation loop
|
|
727
|
-
*
|
|
728
|
-
* Uses requestAnimationFrame to check for new LAM frames.
|
|
729
|
-
* Synchronized to AudioContext clock (not visual refresh rate).
|
|
730
|
-
*
|
|
731
|
-
* Frame Emission Strategy:
|
|
732
|
-
* - LAMPipeline uses last-frame-hold to prevent null returns
|
|
733
|
-
* - Always emit frames (even repeated frames) to maintain smooth animation
|
|
734
|
-
* - Renderer is responsible for detecting duplicate frames if needed
|
|
735
|
-
*/
|
|
736
|
-
private startFrameLoop;
|
|
737
|
-
/**
|
|
738
|
-
* Start monitoring for playback completion
|
|
739
|
-
*/
|
|
740
|
-
private startMonitoring;
|
|
741
|
-
/**
|
|
742
|
-
* Stop monitoring
|
|
743
|
-
*/
|
|
744
|
-
private stopMonitoring;
|
|
745
|
-
/**
|
|
746
|
-
* Get current pipeline state (for debugging/monitoring)
|
|
747
|
-
*/
|
|
748
|
-
getState(): {
|
|
749
|
-
playbackStarted: boolean;
|
|
750
|
-
coalescerFill: number;
|
|
751
|
-
lamFill: number;
|
|
752
|
-
queuedFrames: number;
|
|
753
|
-
currentTime: number;
|
|
754
|
-
playbackEndTime: number;
|
|
755
|
-
};
|
|
756
|
-
/**
|
|
757
|
-
* Cleanup resources
|
|
758
|
-
*/
|
|
759
|
-
dispose(): void;
|
|
760
|
-
}
|
|
761
|
-
|
|
762
|
-
/**
|
|
763
|
-
* Emotion to ARKit Blendshape Mapper
|
|
764
|
-
*
|
|
765
|
-
* Converts Emotion2VecInference output to upper face ARKit blendshapes for
|
|
766
|
-
* expressive avatar animation. Maps 4 emotion categories (neutral, happy, angry, sad)
|
|
767
|
-
* to 11 upper face blendshapes (brows, eyes, cheeks).
|
|
768
|
-
*
|
|
769
|
-
* Supports two blend modes:
|
|
770
|
-
* - 'dominant': Uses only the strongest emotion (simpler, more stable)
|
|
771
|
-
* - 'weighted': Blends all emotions by probability (more nuanced, e.g., bittersweet)
|
|
772
|
-
*
|
|
773
|
-
* Also supports energy modulation to scale emotion intensity by audio energy,
|
|
774
|
-
* making expressions stronger during emphasized speech.
|
|
775
|
-
*
|
|
776
|
-
* @example Basic usage
|
|
777
|
-
* ```typescript
|
|
778
|
-
* import { EmotionToBlendshapeMapper } from '@omote/core';
|
|
779
|
-
* import { Emotion2VecInference } from '@omote/core';
|
|
780
|
-
*
|
|
781
|
-
* const emotion = new Emotion2VecInference({ modelUrl: '/models/emotion.onnx' });
|
|
782
|
-
* const mapper = new EmotionToBlendshapeMapper();
|
|
783
|
-
*
|
|
784
|
-
* // Process emotion frame
|
|
785
|
-
* const result = await emotion.infer(audioSamples);
|
|
786
|
-
* const blendshapes = mapper.mapFrame(result.dominant);
|
|
787
|
-
*
|
|
788
|
-
* // Apply to avatar
|
|
789
|
-
* for (const [name, value] of Object.entries(blendshapes)) {
|
|
790
|
-
* avatar.setBlendshape(name, value);
|
|
791
|
-
* }
|
|
792
|
-
* ```
|
|
793
|
-
*
|
|
794
|
-
* @example Weighted blending for nuanced expressions
|
|
795
|
-
* ```typescript
|
|
796
|
-
* const mapper = new EmotionToBlendshapeMapper({
|
|
797
|
-
* blendMode: 'weighted',
|
|
798
|
-
* minBlendProbability: 0.1,
|
|
799
|
-
* });
|
|
800
|
-
*
|
|
801
|
-
* // Frame with mixed emotions: { happy: 0.6, sad: 0.3, neutral: 0.1 }
|
|
802
|
-
* // Result: bittersweet expression (smiling but worried brow)
|
|
803
|
-
* const blendshapes = mapper.mapFrame(emotionFrame);
|
|
804
|
-
* ```
|
|
805
|
-
*
|
|
806
|
-
* @example Energy-modulated emotion
|
|
807
|
-
* ```typescript
|
|
808
|
-
* import { AudioEnergyAnalyzer } from '@omote/core';
|
|
809
|
-
*
|
|
810
|
-
* const energyAnalyzer = new AudioEnergyAnalyzer();
|
|
811
|
-
* const mapper = new EmotionToBlendshapeMapper({ energyModulation: true });
|
|
812
|
-
*
|
|
813
|
-
* // In animation loop
|
|
814
|
-
* function animate(audioChunk: Float32Array, emotionFrame: EmotionFrame) {
|
|
815
|
-
* const { energy } = energyAnalyzer.analyze(audioChunk);
|
|
816
|
-
* mapper.mapFrame(emotionFrame, energy); // Louder = stronger emotion
|
|
817
|
-
* mapper.update(16);
|
|
818
|
-
* applyToAvatar(mapper.getCurrentBlendshapes());
|
|
819
|
-
* }
|
|
820
|
-
* ```
|
|
821
|
-
*
|
|
822
|
-
* @module animation
|
|
823
|
-
*/
|
|
824
|
-
declare const EMOTION2VEC_LABELS: readonly ["neutral", "happy", "angry", "sad"];
|
|
825
|
-
type Emotion2VecLabel = (typeof EMOTION2VEC_LABELS)[number];
|
|
826
|
-
interface EmotionFrame {
|
|
827
|
-
/** Primary emotion label */
|
|
828
|
-
emotion: Emotion2VecLabel;
|
|
829
|
-
/** Confidence for primary emotion (0-1) */
|
|
830
|
-
confidence: number;
|
|
831
|
-
/** All emotion probabilities */
|
|
832
|
-
probabilities: Record<Emotion2VecLabel, number>;
|
|
833
|
-
}
|
|
834
|
-
/**
|
|
835
|
-
* Upper face ARKit blendshape names (11 total)
|
|
836
|
-
*
|
|
837
|
-
* These blendshapes control the upper face (brows, eyes, cheeks) and are
|
|
838
|
-
* driven by emotion detection, complementing the mouth blendshapes from
|
|
839
|
-
* LAM lip sync.
|
|
840
|
-
*/
|
|
841
|
-
declare const UPPER_FACE_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "cheekSquintLeft", "cheekSquintRight"];
|
|
842
|
-
type UpperFaceBlendshapeName = (typeof UPPER_FACE_BLENDSHAPES)[number];
|
|
843
|
-
/**
|
|
844
|
-
* Upper face blendshape values (0-1 for each)
|
|
845
|
-
*/
|
|
846
|
-
type UpperFaceBlendshapes = Record<UpperFaceBlendshapeName, number>;
|
|
847
|
-
/**
|
|
848
|
-
* Blend mode for combining emotions
|
|
849
|
-
* - 'dominant': Use only the strongest emotion (default, more stable)
|
|
850
|
-
* - 'weighted': Blend all emotions by probability (more nuanced)
|
|
851
|
-
*/
|
|
852
|
-
type EmotionBlendMode = 'dominant' | 'weighted';
|
|
853
|
-
/**
|
|
854
|
-
* Emotion to ARKit blendshape mapping
|
|
855
|
-
*
|
|
856
|
-
* Based on Paul Ekman's FACS (Facial Action Coding System) research:
|
|
857
|
-
*
|
|
858
|
-
* - Happy (AU6+AU12): Cheek raise + lip corner pull (Duchenne smile)
|
|
859
|
-
* Upper face: cheekSquint (AU6) + slight eyeSquint from genuine smile
|
|
860
|
-
*
|
|
861
|
-
* - Angry (AU4+AU5+AU7+AU23): Brow lower + eye wide + lid tighten + lip press
|
|
862
|
-
* Upper face: browDown (AU4) + eyeWide (AU5) + eyeSquint (AU7) creates the "glare"
|
|
863
|
-
*
|
|
864
|
-
* - Sad (AU1+AU4+AU15): Inner brow raise + brow furrow + lip corner depress
|
|
865
|
-
* Upper face: browInnerUp (AU1) + browDown (AU4) creates the worried/sad brow
|
|
866
|
-
*
|
|
867
|
-
* - Neutral: All zeros (no expression overlay)
|
|
868
|
-
*
|
|
869
|
-
* @see https://imotions.com/blog/learning/research-fundamentals/facial-action-coding-system/
|
|
870
|
-
* @see https://melindaozel.com/arkit-to-facs-cheat-sheet/
|
|
871
|
-
*/
|
|
872
|
-
declare const EMOTION_ARKIT_MAP: Record<Emotion2VecLabel, Partial<UpperFaceBlendshapes>>;
|
|
873
|
-
/**
|
|
874
|
-
* Configuration for EmotionToBlendshapeMapper
|
|
875
|
-
*/
|
|
876
|
-
interface EmotionBlendshapeConfig {
|
|
877
|
-
/**
|
|
878
|
-
* Smoothing factor for exponential moving average (0-1)
|
|
879
|
-
* Lower = slower, smoother transitions
|
|
880
|
-
* Higher = faster, more responsive
|
|
881
|
-
* @default 0.15
|
|
882
|
-
*/
|
|
883
|
-
smoothingFactor?: number;
|
|
884
|
-
/**
|
|
885
|
-
* Minimum confidence threshold for emotion to take effect
|
|
886
|
-
* Emotions below this confidence are treated as neutral
|
|
887
|
-
* @default 0.3
|
|
888
|
-
*/
|
|
889
|
-
confidenceThreshold?: number;
|
|
890
|
-
/**
|
|
891
|
-
* Global intensity multiplier for all blendshapes (0-2)
|
|
892
|
-
* @default 1.0
|
|
893
|
-
*/
|
|
894
|
-
intensity?: number;
|
|
895
|
-
/**
|
|
896
|
-
* Blend mode for combining emotions
|
|
897
|
-
* - 'dominant': Use only the strongest emotion (default)
|
|
898
|
-
* - 'weighted': Blend all emotions by probability
|
|
899
|
-
* @default 'dominant'
|
|
900
|
-
*/
|
|
901
|
-
blendMode?: EmotionBlendMode;
|
|
902
|
-
/**
|
|
903
|
-
* Minimum probability for an emotion to contribute in weighted blend mode
|
|
904
|
-
* Emotions with probability below this are ignored
|
|
905
|
-
* @default 0.1
|
|
906
|
-
*/
|
|
907
|
-
minBlendProbability?: number;
|
|
908
|
-
/**
|
|
909
|
-
* Enable energy modulation - scale emotion intensity by audio energy
|
|
910
|
-
* When enabled, louder speech produces stronger expressions
|
|
911
|
-
* @default false
|
|
912
|
-
*/
|
|
913
|
-
energyModulation?: boolean;
|
|
914
|
-
/**
|
|
915
|
-
* Minimum energy scale when energy modulation is enabled (0-1)
|
|
916
|
-
* At zero audio energy, emotion intensity is scaled by this factor
|
|
917
|
-
* @default 0.3
|
|
918
|
-
*/
|
|
919
|
-
minEnergyScale?: number;
|
|
920
|
-
/**
|
|
921
|
-
* Maximum energy scale when energy modulation is enabled (0-2)
|
|
922
|
-
* At maximum audio energy, emotion intensity is scaled by this factor
|
|
923
|
-
* @default 1.0
|
|
924
|
-
*/
|
|
925
|
-
maxEnergyScale?: number;
|
|
926
|
-
}
|
|
927
|
-
/**
|
|
928
|
-
* EmotionToBlendshapeMapper
|
|
929
|
-
*
|
|
930
|
-
* Converts emotion detection output to upper face ARKit blendshapes.
|
|
931
|
-
* Provides smooth transitions between emotion states using exponential
|
|
932
|
-
* moving average interpolation.
|
|
933
|
-
*
|
|
934
|
-
* Supports two blend modes:
|
|
935
|
-
* - 'dominant': Uses only the strongest emotion
|
|
936
|
-
* - 'weighted': Blends all emotions by probability for nuanced expressions
|
|
937
|
-
*
|
|
938
|
-
* Also supports energy modulation to scale emotion intensity by audio energy.
|
|
939
|
-
*/
|
|
940
|
-
declare class EmotionToBlendshapeMapper {
|
|
941
|
-
private config;
|
|
942
|
-
private targetBlendshapes;
|
|
943
|
-
private currentBlendshapes;
|
|
944
|
-
private currentEnergy;
|
|
945
|
-
/**
|
|
946
|
-
* Create a new EmotionToBlendshapeMapper
|
|
947
|
-
*
|
|
948
|
-
* @param config - Optional configuration
|
|
949
|
-
*/
|
|
950
|
-
constructor(config?: EmotionBlendshapeConfig);
|
|
951
|
-
/**
|
|
952
|
-
* Map an emotion frame to target blendshapes
|
|
953
|
-
*
|
|
954
|
-
* This sets the target values that the mapper will smoothly interpolate
|
|
955
|
-
* towards. Call update() each frame to apply smoothing.
|
|
956
|
-
*
|
|
957
|
-
* @param frame - Emotion frame from Emotion2VecInference
|
|
958
|
-
* @param audioEnergy - Optional audio energy (0-1) for energy modulation
|
|
959
|
-
* @returns Target upper face blendshapes (before smoothing)
|
|
960
|
-
*/
|
|
961
|
-
mapFrame(frame: EmotionFrame, audioEnergy?: number): UpperFaceBlendshapes;
|
|
962
|
-
/**
|
|
963
|
-
* Map using dominant emotion only (original behavior)
|
|
964
|
-
*/
|
|
965
|
-
private mapFrameDominant;
|
|
966
|
-
/**
|
|
967
|
-
* Map using weighted blend of all emotions by probability
|
|
968
|
-
* Creates more nuanced expressions (e.g., bittersweet = happy + sad)
|
|
969
|
-
*/
|
|
970
|
-
private mapFrameWeighted;
|
|
971
|
-
/**
|
|
972
|
-
* Apply energy modulation to scale emotion intensity by audio energy
|
|
973
|
-
* Louder speech = stronger expressions
|
|
974
|
-
*/
|
|
975
|
-
private applyEnergyModulation;
|
|
976
|
-
/**
|
|
977
|
-
* Apply smoothing to interpolate current values towards target
|
|
978
|
-
*
|
|
979
|
-
* Uses exponential moving average:
|
|
980
|
-
* current = current + smoothingFactor * (target - current)
|
|
981
|
-
*
|
|
982
|
-
* @param _deltaMs - Delta time in milliseconds (reserved for future time-based smoothing)
|
|
983
|
-
*/
|
|
984
|
-
update(_deltaMs: number): void;
|
|
985
|
-
/**
|
|
986
|
-
* Get current smoothed blendshape values
|
|
987
|
-
*
|
|
988
|
-
* @returns Current upper face blendshapes (after smoothing)
|
|
989
|
-
*/
|
|
990
|
-
getCurrentBlendshapes(): UpperFaceBlendshapes;
|
|
991
|
-
/**
|
|
992
|
-
* Reset mapper to neutral state
|
|
993
|
-
*
|
|
994
|
-
* Sets both target and current blendshapes to zero.
|
|
995
|
-
*/
|
|
996
|
-
reset(): void;
|
|
997
|
-
/**
|
|
998
|
-
* Get current configuration
|
|
999
|
-
*/
|
|
1000
|
-
getConfig(): Required<EmotionBlendshapeConfig>;
|
|
1001
|
-
/**
|
|
1002
|
-
* Update configuration
|
|
1003
|
-
*
|
|
1004
|
-
* @param config - Partial configuration to update
|
|
1005
|
-
*/
|
|
1006
|
-
setConfig(config: Partial<EmotionBlendshapeConfig>): void;
|
|
1007
|
-
}
|
|
1008
|
-
|
|
1009
|
-
/**
|
|
1010
|
-
* FullFacePipeline - Combined LAM lip sync + Emotion upper face pipeline
|
|
1011
|
-
*
|
|
1012
|
-
* Orchestrates full-face animation by combining:
|
|
1013
|
-
* 1. LAM lip sync (52 ARKit blendshapes) via audio-first scheduling
|
|
1014
|
-
* 2. Emotion labels (from backend LLM or `setEmotionLabel()`) for upper face
|
|
1015
|
-
* 3. AudioEnergyAnalyzer for prosody-driven fallback when no emotion label is set
|
|
1016
|
-
*
|
|
1017
|
-
* Architecture: Audio-First, LAM-Background (same as SyncedAudioPipeline)
|
|
1018
|
-
* - Audio chunks are scheduled for playback immediately (never waits for LAM)
|
|
1019
|
-
* - LAM inference runs in background without blocking the audio path
|
|
1020
|
-
* - Lip sync starts ~1 second after audio (LAM needs 16000 samples to infer)
|
|
1021
|
-
*
|
|
1022
|
-
* Merge Strategy:
|
|
1023
|
-
* - Lower face (41 blendshapes): 100% from LAM (mouth, jaw, tongue, etc.)
|
|
1024
|
-
* - Upper face (11 blendshapes): Emotion overlay with LAM as subtle fallback
|
|
1025
|
-
* Formula: emotion * emotionBlendFactor + lam * lamBlendFactor
|
|
1026
|
-
*
|
|
1027
|
-
* Emotion Sources (in priority order):
|
|
1028
|
-
* 1. `setEmotionLabel()` — explicit label from backend LLM (recommended)
|
|
1029
|
-
* 2. Prosody fallback — subtle brow movement from audio energy (automatic)
|
|
502
|
+
* The A2E model outputs all 52 ARKit blendshapes from audio — brows, eyes, cheeks,
|
|
503
|
+
* mouth, jaw, everything. ExpressionProfile allows per-character weight scaling
|
|
504
|
+
* by group (eyes, brows, jaw, mouth, cheeks, nose, tongue) with per-blendshape overrides.
|
|
1030
505
|
*
|
|
1031
506
|
* @category Audio
|
|
1032
507
|
*
|
|
@@ -1036,8 +511,7 @@ declare class EmotionToBlendshapeMapper {
|
|
|
1036
511
|
*
|
|
1037
512
|
* const pipeline = new FullFacePipeline({
|
|
1038
513
|
* lam,
|
|
1039
|
-
*
|
|
1040
|
-
* lamBlendFactor: 0.2,
|
|
514
|
+
* profile: { mouth: 1.2, brows: 0.8 },
|
|
1041
515
|
* });
|
|
1042
516
|
* await pipeline.initialize();
|
|
1043
517
|
*
|
|
@@ -1046,11 +520,41 @@ declare class EmotionToBlendshapeMapper {
|
|
|
1046
520
|
* });
|
|
1047
521
|
*
|
|
1048
522
|
* pipeline.start();
|
|
1049
|
-
* pipeline.setEmotionLabel('happy'); // From backend LLM
|
|
1050
523
|
* await pipeline.onAudioChunk(audioData);
|
|
1051
524
|
* ```
|
|
1052
525
|
*/
|
|
1053
526
|
|
|
527
|
+
type BlendshapeGroup = 'eyes' | 'brows' | 'jaw' | 'mouth' | 'cheeks' | 'nose' | 'tongue';
|
|
528
|
+
/**
|
|
529
|
+
* Per-character weight scaling for A2E blendshape output.
|
|
530
|
+
*
|
|
531
|
+
* Group scalers multiply all blendshapes in that group (default 1.0).
|
|
532
|
+
* Per-blendshape overrides take priority over group scalers.
|
|
533
|
+
* Final values are clamped to [0, 1].
|
|
534
|
+
*/
|
|
535
|
+
interface ExpressionProfile {
|
|
536
|
+
/** eyeBlink*, eyeLook*, eyeSquint*, eyeWide* (14 blendshapes) */
|
|
537
|
+
eyes?: number;
|
|
538
|
+
/** browDown*, browInnerUp, browOuterUp* (5 blendshapes) */
|
|
539
|
+
brows?: number;
|
|
540
|
+
/** jawForward, jawLeft, jawRight, jawOpen (4 blendshapes) */
|
|
541
|
+
jaw?: number;
|
|
542
|
+
/** mouth* (23 blendshapes) */
|
|
543
|
+
mouth?: number;
|
|
544
|
+
/** cheekPuff, cheekSquint* (3 blendshapes) */
|
|
545
|
+
cheeks?: number;
|
|
546
|
+
/** noseSneer* (2 blendshapes) */
|
|
547
|
+
nose?: number;
|
|
548
|
+
/** tongueOut (1 blendshape) */
|
|
549
|
+
tongue?: number;
|
|
550
|
+
/** Per-blendshape overrides (0-2). Takes priority over group scalers. */
|
|
551
|
+
overrides?: Partial<Record<string, number>>;
|
|
552
|
+
}
|
|
553
|
+
/**
|
|
554
|
+
* Map each LAM_BLENDSHAPES entry to its BlendshapeGroup.
|
|
555
|
+
* Built once at module load from prefix matching.
|
|
556
|
+
*/
|
|
557
|
+
declare const BLENDSHAPE_TO_GROUP: Map<string, BlendshapeGroup>;
|
|
1054
558
|
/**
|
|
1055
559
|
* Configuration for FullFacePipeline
|
|
1056
560
|
*/
|
|
@@ -1061,37 +565,54 @@ interface FullFacePipelineOptions {
|
|
|
1061
565
|
chunkTargetMs?: number;
|
|
1062
566
|
/**
|
|
1063
567
|
* Audio playback delay in ms before first audio plays.
|
|
1064
|
-
* Gives
|
|
1065
|
-
*
|
|
568
|
+
* Gives A2E inference time to pre-compute blendshapes before audio
|
|
569
|
+
* starts, preventing frame drops/desync. Must be ≥ chunkSize
|
|
570
|
+
* accumulation time + inference latency.
|
|
571
|
+
*
|
|
572
|
+
* Default: auto-calculated from chunkSize and backend type.
|
|
1066
573
|
*/
|
|
1067
574
|
audioDelayMs?: number;
|
|
1068
|
-
/** LAM inference engine */
|
|
1069
|
-
lam: LipSyncBackend;
|
|
1070
575
|
/**
|
|
1071
|
-
*
|
|
1072
|
-
*
|
|
1073
|
-
*
|
|
576
|
+
* A2E inference chunk size in samples.
|
|
577
|
+
* Controls how many samples accumulate before each inference call.
|
|
578
|
+
* Smaller = lower latency (less delay before first frame), more overhead.
|
|
579
|
+
* Larger = higher latency, less overhead.
|
|
580
|
+
*
|
|
581
|
+
* Default: 16000 (1s) — the model's native window size.
|
|
582
|
+
* Smaller chunks get zero-padded, causing near-zero blendshape output.
|
|
1074
583
|
*/
|
|
1075
|
-
|
|
584
|
+
chunkSize?: number;
|
|
585
|
+
/** A2E inference engine */
|
|
586
|
+
lam: A2EBackend;
|
|
587
|
+
/** Per-character expression weight scaling */
|
|
588
|
+
profile?: ExpressionProfile;
|
|
589
|
+
/**
|
|
590
|
+
* Spring smoothing halflife in seconds.
|
|
591
|
+
* Controls how quickly blendshapes converge to inference targets.
|
|
592
|
+
* Lower = snappier but more jittery. Higher = smoother but laggy.
|
|
593
|
+
* Set to 0 to disable smoothing (raw frame pass-through).
|
|
594
|
+
*
|
|
595
|
+
* Default: 0.06 (60ms)
|
|
596
|
+
*/
|
|
597
|
+
smoothingHalflife?: number;
|
|
1076
598
|
/**
|
|
1077
|
-
*
|
|
1078
|
-
*
|
|
1079
|
-
*
|
|
599
|
+
* Time in ms with no new inference frames before decaying to neutral.
|
|
600
|
+
* When exceeded, spring targets are set to 0 and the face smoothly
|
|
601
|
+
* relaxes rather than freezing on the last frame.
|
|
602
|
+
*
|
|
603
|
+
* Must be larger than the inter-batch gap (chunkSize/sampleRate + inference time).
|
|
604
|
+
* Default: 2000
|
|
1080
605
|
*/
|
|
1081
|
-
|
|
606
|
+
staleThresholdMs?: number;
|
|
1082
607
|
}
|
|
1083
608
|
/**
|
|
1084
|
-
* Full face frame with
|
|
609
|
+
* Full face frame with scaled blendshapes
|
|
1085
610
|
*/
|
|
1086
611
|
interface FullFaceFrame {
|
|
1087
|
-
/**
|
|
612
|
+
/** Scaled 52 ARKit blendshapes (ExpressionProfile applied) */
|
|
1088
613
|
blendshapes: Float32Array;
|
|
1089
|
-
/**
|
|
1090
|
-
|
|
1091
|
-
/** Emotion-driven upper face blendshapes (11) */
|
|
1092
|
-
emotionBlendshapes: UpperFaceBlendshapes;
|
|
1093
|
-
/** Raw emotion frame data */
|
|
1094
|
-
emotion: EmotionFrame | null;
|
|
614
|
+
/** Raw A2E output (52 blendshapes, before profile scaling) */
|
|
615
|
+
rawBlendshapes: Float32Array;
|
|
1095
616
|
/** AudioContext timestamp for this frame */
|
|
1096
617
|
timestamp: number;
|
|
1097
618
|
}
|
|
@@ -1103,8 +624,6 @@ interface FullFacePipelineEvents {
|
|
|
1103
624
|
full_frame_ready: FullFaceFrame;
|
|
1104
625
|
/** Raw LAM frame ready (for debugging/monitoring) */
|
|
1105
626
|
lam_frame_ready: Float32Array;
|
|
1106
|
-
/** Emotion frame ready (for debugging/monitoring) */
|
|
1107
|
-
emotion_frame_ready: EmotionFrame;
|
|
1108
627
|
/** Playback has completed */
|
|
1109
628
|
playback_complete: void;
|
|
1110
629
|
/** First frame ready, playback starting */
|
|
@@ -1115,53 +634,47 @@ interface FullFacePipelineEvents {
|
|
|
1115
634
|
[key: string]: unknown;
|
|
1116
635
|
}
|
|
1117
636
|
/**
|
|
1118
|
-
* FullFacePipeline -
|
|
637
|
+
* FullFacePipeline - A2E animation pipeline with ExpressionProfile scaling
|
|
1119
638
|
*
|
|
1120
639
|
* Audio-first design matching SyncedAudioPipeline:
|
|
1121
|
-
* - Audio is scheduled immediately (never waits for
|
|
1122
|
-
* -
|
|
1123
|
-
* -
|
|
640
|
+
* - Audio is scheduled immediately (never waits for A2E)
|
|
641
|
+
* - A2E runs in background (fire-and-forget via A2EProcessor)
|
|
642
|
+
* - ExpressionProfile scales raw A2E output per-character
|
|
1124
643
|
*/
|
|
1125
644
|
declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
|
|
1126
645
|
private readonly options;
|
|
1127
646
|
private scheduler;
|
|
1128
647
|
private coalescer;
|
|
1129
|
-
private
|
|
1130
|
-
private
|
|
1131
|
-
private energyAnalyzer;
|
|
648
|
+
private processor;
|
|
649
|
+
private smoother;
|
|
1132
650
|
private playbackStarted;
|
|
1133
651
|
private monitorInterval;
|
|
1134
652
|
private frameAnimationId;
|
|
1135
|
-
private lastEmotionFrame;
|
|
1136
|
-
private currentAudioEnergy;
|
|
1137
653
|
private lastNewFrameTime;
|
|
1138
654
|
private lastKnownLamFrame;
|
|
1139
655
|
private staleWarningEmitted;
|
|
1140
|
-
private
|
|
1141
|
-
private
|
|
1142
|
-
private
|
|
656
|
+
private readonly staleThresholdMs;
|
|
657
|
+
private lastFrameLoopTime;
|
|
658
|
+
private frameLoopCount;
|
|
659
|
+
private profile;
|
|
1143
660
|
constructor(options: FullFacePipelineOptions);
|
|
1144
661
|
/**
|
|
1145
662
|
* Initialize the pipeline
|
|
1146
663
|
*/
|
|
1147
664
|
initialize(): Promise<void>;
|
|
1148
665
|
/**
|
|
1149
|
-
*
|
|
1150
|
-
*
|
|
1151
|
-
* Converts a natural language emotion label into an EmotionFrame
|
|
1152
|
-
* that drives upper face blendshapes for the duration of the utterance.
|
|
1153
|
-
*
|
|
1154
|
-
* Supported labels: happy, excited, joyful, sad, melancholic, angry,
|
|
1155
|
-
* frustrated, neutral, etc.
|
|
1156
|
-
*
|
|
1157
|
-
* @param label - Emotion label string (case-insensitive)
|
|
666
|
+
* Update the ExpressionProfile at runtime (e.g., character switch).
|
|
1158
667
|
*/
|
|
1159
|
-
|
|
668
|
+
setProfile(profile: ExpressionProfile): void;
|
|
1160
669
|
/**
|
|
1161
|
-
*
|
|
1162
|
-
*
|
|
670
|
+
* Apply ExpressionProfile scaling to raw A2E blendshapes.
|
|
671
|
+
*
|
|
672
|
+
* For each blendshape:
|
|
673
|
+
* 1. If an override exists for the blendshape name, use override as scaler
|
|
674
|
+
* 2. Otherwise, use the group scaler (default 1.0)
|
|
675
|
+
* 3. Clamp result to [0, 1]
|
|
1163
676
|
*/
|
|
1164
|
-
|
|
677
|
+
applyProfile(raw: Float32Array): Float32Array;
|
|
1165
678
|
/**
|
|
1166
679
|
* Start a new playback session
|
|
1167
680
|
*
|
|
@@ -1172,29 +685,19 @@ declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
|
|
|
1172
685
|
/**
|
|
1173
686
|
* Receive audio chunk from network
|
|
1174
687
|
*
|
|
1175
|
-
* Audio-first design: schedules audio immediately,
|
|
1176
|
-
* This prevents
|
|
688
|
+
* Audio-first design: schedules audio immediately, A2E runs in background.
|
|
689
|
+
* This prevents A2E inference (50-300ms) from blocking audio scheduling.
|
|
1177
690
|
*
|
|
1178
691
|
* @param chunk - Uint8Array containing Int16 PCM audio
|
|
1179
692
|
*/
|
|
1180
693
|
onAudioChunk(chunk: Uint8Array): Promise<void>;
|
|
1181
|
-
/**
|
|
1182
|
-
* Get emotion frame for current animation.
|
|
1183
|
-
*
|
|
1184
|
-
* Priority:
|
|
1185
|
-
* 1. Explicit emotion label from setEmotionLabel()
|
|
1186
|
-
* 2. Prosody fallback: subtle brow movement from audio energy
|
|
1187
|
-
*/
|
|
1188
|
-
private getEmotionFrame;
|
|
1189
|
-
/**
|
|
1190
|
-
* Merge LAM blendshapes with emotion upper face blendshapes
|
|
1191
|
-
*/
|
|
1192
|
-
mergeBlendshapes(lamFrame: Float32Array, emotionFrame: EmotionFrame | null, audioEnergy?: number): {
|
|
1193
|
-
merged: Float32Array;
|
|
1194
|
-
emotionBlendshapes: UpperFaceBlendshapes;
|
|
1195
|
-
};
|
|
1196
694
|
/**
|
|
1197
695
|
* Start frame animation loop
|
|
696
|
+
*
|
|
697
|
+
* Uses critically damped spring smoother to produce continuous output
|
|
698
|
+
* at render rate (60fps), even between inference batches (~30fps bursts).
|
|
699
|
+
* Springs interpolate toward the latest inference target, and decay
|
|
700
|
+
* to neutral when inference stalls.
|
|
1198
701
|
*/
|
|
1199
702
|
private startFrameLoop;
|
|
1200
703
|
/**
|
|
@@ -1219,17 +722,11 @@ declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
|
|
|
1219
722
|
getState(): {
|
|
1220
723
|
playbackStarted: boolean;
|
|
1221
724
|
coalescerFill: number;
|
|
1222
|
-
|
|
1223
|
-
|
|
1224
|
-
emotionLabel: "neutral" | "happy" | "angry" | "sad" | null;
|
|
1225
|
-
currentAudioEnergy: number;
|
|
725
|
+
processorFill: number;
|
|
726
|
+
queuedFrames: number;
|
|
1226
727
|
currentTime: number;
|
|
1227
728
|
playbackEndTime: number;
|
|
1228
729
|
};
|
|
1229
|
-
/**
|
|
1230
|
-
* Check if an explicit emotion label is currently set
|
|
1231
|
-
*/
|
|
1232
|
-
get hasEmotionLabel(): boolean;
|
|
1233
730
|
/**
|
|
1234
731
|
* Cleanup resources
|
|
1235
732
|
*/
|
|
@@ -1255,13 +752,6 @@ declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
|
|
|
1255
752
|
* @module inference/onnxLoader
|
|
1256
753
|
*/
|
|
1257
754
|
|
|
1258
|
-
type OrtModule = {
|
|
1259
|
-
InferenceSession: typeof InferenceSession;
|
|
1260
|
-
Tensor: typeof Tensor;
|
|
1261
|
-
env: Env;
|
|
1262
|
-
};
|
|
1263
|
-
type SessionOptions = InferenceSession.SessionOptions;
|
|
1264
|
-
|
|
1265
755
|
/**
|
|
1266
756
|
* Check if WebGPU is available and likely to work
|
|
1267
757
|
*
|
|
@@ -1271,74 +761,6 @@ type SessionOptions = InferenceSession.SessionOptions;
|
|
|
1271
761
|
* @returns true if WebGPU is available and working
|
|
1272
762
|
*/
|
|
1273
763
|
declare function isWebGPUAvailable(): Promise<boolean>;
|
|
1274
|
-
/**
|
|
1275
|
-
* Load ONNX Runtime with the specified backend
|
|
1276
|
-
*
|
|
1277
|
-
* This lazily loads the appropriate bundle:
|
|
1278
|
-
* - 'wasm': Loads onnxruntime-web (WASM-only, smaller)
|
|
1279
|
-
* - 'webgpu': Loads onnxruntime-web/webgpu (includes WebGPU + WASM fallback)
|
|
1280
|
-
*
|
|
1281
|
-
* Once loaded, the same instance is reused for all subsequent calls.
|
|
1282
|
-
* If you need to switch backends, you must reload the page.
|
|
1283
|
-
*
|
|
1284
|
-
* @param backend The backend to load ('webgpu' or 'wasm')
|
|
1285
|
-
* @returns The ONNX Runtime module
|
|
1286
|
-
*/
|
|
1287
|
-
declare function getOnnxRuntime(backend: RuntimeBackend): Promise<OrtModule>;
|
|
1288
|
-
/**
|
|
1289
|
-
* Get the appropriate ONNX Runtime based on user preference
|
|
1290
|
-
*
|
|
1291
|
-
* This resolves the user's preference against platform capabilities
|
|
1292
|
-
* and loads the appropriate bundle.
|
|
1293
|
-
*
|
|
1294
|
-
* @param preference User's backend preference
|
|
1295
|
-
* @returns The ONNX Runtime module and the resolved backend
|
|
1296
|
-
*/
|
|
1297
|
-
declare function getOnnxRuntimeForPreference(preference?: BackendPreference): Promise<{
|
|
1298
|
-
ort: OrtModule;
|
|
1299
|
-
backend: RuntimeBackend;
|
|
1300
|
-
}>;
|
|
1301
|
-
/**
|
|
1302
|
-
* Get session options for creating an inference session
|
|
1303
|
-
*
|
|
1304
|
-
* This returns optimized session options based on the backend and platform.
|
|
1305
|
-
*
|
|
1306
|
-
* @param backend The backend being used
|
|
1307
|
-
* @returns Session options for InferenceSession.create()
|
|
1308
|
-
*/
|
|
1309
|
-
declare function getSessionOptions(backend: RuntimeBackend): SessionOptions;
|
|
1310
|
-
/**
|
|
1311
|
-
* Create an inference session with automatic fallback
|
|
1312
|
-
*
|
|
1313
|
-
* If WebGPU session creation fails, automatically falls back to WASM.
|
|
1314
|
-
*
|
|
1315
|
-
* @param modelBuffer The model data as ArrayBuffer
|
|
1316
|
-
* @param preferredBackend The preferred backend
|
|
1317
|
-
* @returns The created session and the backend used
|
|
1318
|
-
*/
|
|
1319
|
-
declare function createSessionWithFallback(modelBuffer: ArrayBuffer, preferredBackend: RuntimeBackend): Promise<{
|
|
1320
|
-
session: InferenceSession;
|
|
1321
|
-
backend: RuntimeBackend;
|
|
1322
|
-
}>;
|
|
1323
|
-
/**
|
|
1324
|
-
* Get the currently loaded backend (if any)
|
|
1325
|
-
*/
|
|
1326
|
-
declare function getLoadedBackend(): RuntimeBackend | null;
|
|
1327
|
-
/**
|
|
1328
|
-
* Check if ONNX Runtime has been loaded
|
|
1329
|
-
*/
|
|
1330
|
-
declare function isOnnxRuntimeLoaded(): boolean;
|
|
1331
|
-
/**
|
|
1332
|
-
* Preload ONNX Runtime and compile the WASM binary early
|
|
1333
|
-
*
|
|
1334
|
-
* Call this before loading heavy resources (Three.js, VRM models) to ensure
|
|
1335
|
-
* WASM memory is allocated in a clean JS heap, reducing iOS memory pressure.
|
|
1336
|
-
* Uses the singleton pattern — subsequent model loading reuses this instance.
|
|
1337
|
-
*
|
|
1338
|
-
* @param preference Backend preference (default: 'auto')
|
|
1339
|
-
* @returns The resolved backend that was loaded
|
|
1340
|
-
*/
|
|
1341
|
-
declare function preloadOnnxRuntime(preference?: BackendPreference): Promise<RuntimeBackend>;
|
|
1342
764
|
|
|
1343
765
|
/**
|
|
1344
766
|
* SenseVoice automatic speech recognition using ONNX Runtime Web
|
|
@@ -2094,8 +1516,9 @@ interface Wav2ArkitCpuWorkerConfig {
|
|
|
2094
1516
|
*
|
|
2095
1517
|
* @see Wav2ArkitCpuInference for main-thread version
|
|
2096
1518
|
*/
|
|
2097
|
-
declare class Wav2ArkitCpuWorker implements
|
|
1519
|
+
declare class Wav2ArkitCpuWorker implements A2EBackend {
|
|
2098
1520
|
readonly modelId: "wav2arkit_cpu";
|
|
1521
|
+
readonly chunkSize: number;
|
|
2099
1522
|
private worker;
|
|
2100
1523
|
private config;
|
|
2101
1524
|
private isLoading;
|
|
@@ -2124,7 +1547,7 @@ declare class Wav2ArkitCpuWorker implements LipSyncBackend {
|
|
|
2124
1547
|
/**
|
|
2125
1548
|
* Load the ONNX model in the worker
|
|
2126
1549
|
*/
|
|
2127
|
-
load(): Promise<
|
|
1550
|
+
load(): Promise<A2EModelInfo>;
|
|
2128
1551
|
/**
|
|
2129
1552
|
* Run inference on raw audio
|
|
2130
1553
|
*
|
|
@@ -2134,7 +1557,7 @@ declare class Wav2ArkitCpuWorker implements LipSyncBackend {
|
|
|
2134
1557
|
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
2135
1558
|
* @param _identityIndex - Ignored (identity 11 is baked into the model)
|
|
2136
1559
|
*/
|
|
2137
|
-
infer(audioSamples: Float32Array, _identityIndex?: number): Promise<
|
|
1560
|
+
infer(audioSamples: Float32Array, _identityIndex?: number): Promise<A2EResult>;
|
|
2138
1561
|
/**
|
|
2139
1562
|
* Queue inference to serialize worker calls
|
|
2140
1563
|
*/
|
|
@@ -2166,7 +1589,7 @@ declare class Wav2ArkitCpuWorker implements LipSyncBackend {
|
|
|
2166
1589
|
* await worker.init();
|
|
2167
1590
|
*
|
|
2168
1591
|
* const asr = createSenseVoice({ modelUrl: '...', unifiedWorker: worker });
|
|
2169
|
-
* const lam =
|
|
1592
|
+
* const lam = createA2E({ gpuModelUrl: '...', cpuModelUrl: '...', unifiedWorker: worker });
|
|
2170
1593
|
* const vad = createSileroVAD({ modelUrl: '...', unifiedWorker: worker });
|
|
2171
1594
|
* ```
|
|
2172
1595
|
*
|
|
@@ -2196,17 +1619,17 @@ declare class UnifiedInferenceWorker {
|
|
|
2196
1619
|
}): Promise<SenseVoiceModelInfo>;
|
|
2197
1620
|
transcribe(audio: Float32Array): Promise<SenseVoiceResult>;
|
|
2198
1621
|
disposeSenseVoice(): Promise<void>;
|
|
2199
|
-
|
|
1622
|
+
loadA2E(config: {
|
|
2200
1623
|
modelUrl: string;
|
|
2201
1624
|
externalDataUrl: string | null;
|
|
2202
|
-
}): Promise<
|
|
2203
|
-
|
|
1625
|
+
}): Promise<A2EModelInfo>;
|
|
1626
|
+
inferA2E(audio: Float32Array): Promise<{
|
|
2204
1627
|
blendshapes: Float32Array;
|
|
2205
1628
|
numFrames: number;
|
|
2206
1629
|
numBlendshapes: number;
|
|
2207
1630
|
inferenceTimeMs: number;
|
|
2208
1631
|
}>;
|
|
2209
|
-
|
|
1632
|
+
disposeA2E(): Promise<void>;
|
|
2210
1633
|
loadVAD(config: {
|
|
2211
1634
|
modelUrl: string;
|
|
2212
1635
|
sampleRate: number;
|
|
@@ -2252,10 +1675,11 @@ declare class SenseVoiceUnifiedAdapter implements SenseVoiceBackend {
|
|
|
2252
1675
|
/**
|
|
2253
1676
|
* Wav2ArkitCpu adapter backed by UnifiedInferenceWorker
|
|
2254
1677
|
*
|
|
2255
|
-
* Implements
|
|
1678
|
+
* Implements A2EBackend, delegating all inference to the shared worker.
|
|
2256
1679
|
*/
|
|
2257
|
-
declare class Wav2ArkitCpuUnifiedAdapter implements
|
|
1680
|
+
declare class Wav2ArkitCpuUnifiedAdapter implements A2EBackend {
|
|
2258
1681
|
readonly modelId: "wav2arkit_cpu";
|
|
1682
|
+
readonly chunkSize: number;
|
|
2259
1683
|
private worker;
|
|
2260
1684
|
private config;
|
|
2261
1685
|
private _isLoaded;
|
|
@@ -2263,8 +1687,8 @@ declare class Wav2ArkitCpuUnifiedAdapter implements LipSyncBackend {
|
|
|
2263
1687
|
constructor(worker: UnifiedInferenceWorker, config: Wav2ArkitCpuWorkerConfig);
|
|
2264
1688
|
get isLoaded(): boolean;
|
|
2265
1689
|
get backend(): RuntimeBackend | null;
|
|
2266
|
-
load(): Promise<
|
|
2267
|
-
infer(audioSamples: Float32Array, _identityIndex?: number): Promise<
|
|
1690
|
+
load(): Promise<A2EModelInfo>;
|
|
1691
|
+
infer(audioSamples: Float32Array, _identityIndex?: number): Promise<A2EResult>;
|
|
2268
1692
|
dispose(): Promise<void>;
|
|
2269
1693
|
}
|
|
2270
1694
|
/**
|
|
@@ -2392,116 +1816,6 @@ interface CreateSenseVoiceConfig {
|
|
|
2392
1816
|
*/
|
|
2393
1817
|
declare function createSenseVoice(config: CreateSenseVoiceConfig): SenseVoiceBackend;
|
|
2394
1818
|
|
|
2395
|
-
/**
|
|
2396
|
-
* Kaldi-compatible filterbank (fbank) feature extraction
|
|
2397
|
-
*
|
|
2398
|
-
* Pure TypeScript implementation matching kaldi-native-fbank parameters
|
|
2399
|
-
* used by SenseVoice. No external dependencies.
|
|
2400
|
-
*
|
|
2401
|
-
* Pipeline: audio → framing → windowing → FFT → power spectrum → mel filterbank → log
|
|
2402
|
-
*
|
|
2403
|
-
* @module inference/kaldiFbank
|
|
2404
|
-
*/
|
|
2405
|
-
interface KaldiFbankOptions {
|
|
2406
|
-
/** Frame length in ms (default: 25) */
|
|
2407
|
-
frameLengthMs?: number;
|
|
2408
|
-
/** Frame shift in ms (default: 10) */
|
|
2409
|
-
frameShiftMs?: number;
|
|
2410
|
-
/** Low frequency cutoff in Hz (default: 20) */
|
|
2411
|
-
lowFreq?: number;
|
|
2412
|
-
/** High frequency cutoff in Hz (default: sampleRate / 2) */
|
|
2413
|
-
highFreq?: number;
|
|
2414
|
-
/** Dither amount (default: 0 for deterministic output) */
|
|
2415
|
-
dither?: number;
|
|
2416
|
-
/** Preemphasis coefficient (default: 0.97) */
|
|
2417
|
-
preemphasis?: number;
|
|
2418
|
-
}
|
|
2419
|
-
/**
|
|
2420
|
-
* Compute Kaldi-compatible log mel filterbank features
|
|
2421
|
-
*
|
|
2422
|
-
* @param audio Raw audio samples (float32, [-1, 1] range)
|
|
2423
|
-
* @param sampleRate Sample rate in Hz (must be 16000 for SenseVoice)
|
|
2424
|
-
* @param numMelBins Number of mel bins (80 for SenseVoice)
|
|
2425
|
-
* @param opts Optional parameters
|
|
2426
|
-
* @returns Flattened Float32Array of shape [numFrames, numMelBins]
|
|
2427
|
-
*/
|
|
2428
|
-
declare function computeKaldiFbank(audio: Float32Array, sampleRate: number, numMelBins: number, opts?: KaldiFbankOptions): Float32Array;
|
|
2429
|
-
/**
|
|
2430
|
-
* Apply Low Frame Rate stacking for SenseVoice
|
|
2431
|
-
*
|
|
2432
|
-
* Concatenates lfrM consecutive frames with stride lfrN.
|
|
2433
|
-
* Left-pads with copies of first frame, right-pads last group.
|
|
2434
|
-
*
|
|
2435
|
-
* @param features Flattened [numFrames, featureDim]
|
|
2436
|
-
* @param featureDim Feature dimension per frame (e.g., 80)
|
|
2437
|
-
* @param lfrM Number of frames to stack (default: 7)
|
|
2438
|
-
* @param lfrN Stride (default: 6)
|
|
2439
|
-
* @returns Flattened [numOutputFrames, featureDim * lfrM]
|
|
2440
|
-
*/
|
|
2441
|
-
declare function applyLFR(features: Float32Array, featureDim: number, lfrM?: number, lfrN?: number): Float32Array;
|
|
2442
|
-
/**
|
|
2443
|
-
* Apply CMVN normalization in-place
|
|
2444
|
-
*
|
|
2445
|
-
* Formula: normalized[i] = (features[i] + negMean[i % dim]) * invStddev[i % dim]
|
|
2446
|
-
*
|
|
2447
|
-
* @param features Flattened feature array (modified in-place)
|
|
2448
|
-
* @param dim Feature dimension (560 for SenseVoice after LFR)
|
|
2449
|
-
* @param negMean Negative mean vector (dim-dimensional)
|
|
2450
|
-
* @param invStddev Inverse standard deviation vector (dim-dimensional)
|
|
2451
|
-
* @returns The same features array (for chaining)
|
|
2452
|
-
*/
|
|
2453
|
-
declare function applyCMVN(features: Float32Array, dim: number, negMean: Float32Array, invStddev: Float32Array): Float32Array;
|
|
2454
|
-
/**
|
|
2455
|
-
* Parse CMVN vectors from comma-separated strings (stored in ONNX metadata)
|
|
2456
|
-
*
|
|
2457
|
-
* The sherpa-onnx SenseVoice export stores neg_mean and inv_stddev
|
|
2458
|
-
* as comma-separated float strings in the model's metadata.
|
|
2459
|
-
*/
|
|
2460
|
-
declare function parseCMVNFromMetadata(negMeanStr: string, invStddevStr: string): {
|
|
2461
|
-
negMean: Float32Array;
|
|
2462
|
-
invStddev: Float32Array;
|
|
2463
|
-
};
|
|
2464
|
-
|
|
2465
|
-
/**
|
|
2466
|
-
* CTC greedy decoder for SenseVoice
|
|
2467
|
-
*
|
|
2468
|
-
* Decodes CTC logits into text with structured token parsing
|
|
2469
|
-
* for language, emotion, and audio event detection.
|
|
2470
|
-
*
|
|
2471
|
-
* @module inference/ctcDecoder
|
|
2472
|
-
*/
|
|
2473
|
-
interface CTCDecodeResult {
|
|
2474
|
-
/** Decoded text (speech content only) */
|
|
2475
|
-
text: string;
|
|
2476
|
-
/** Detected language (e.g., 'zh', 'en', 'ja', 'ko', 'yue') */
|
|
2477
|
-
language?: string;
|
|
2478
|
-
/** Detected emotion (e.g., 'HAPPY', 'SAD', 'ANGRY', 'NEUTRAL') */
|
|
2479
|
-
emotion?: string;
|
|
2480
|
-
/** Detected audio event (e.g., 'Speech', 'BGM', 'Laughter') */
|
|
2481
|
-
event?: string;
|
|
2482
|
-
}
|
|
2483
|
-
/** Resolve language string to SenseVoice language ID */
|
|
2484
|
-
declare function resolveLanguageId(language: string): number;
|
|
2485
|
-
/** Resolve text norm string to SenseVoice text norm ID */
|
|
2486
|
-
declare function resolveTextNormId(textNorm: string): number;
|
|
2487
|
-
/**
|
|
2488
|
-
* Parse tokens.txt into a token ID → string map
|
|
2489
|
-
*
|
|
2490
|
-
* Format: each line is "token_string token_id"
|
|
2491
|
-
* e.g., "<unk> 0", "▁the 3", "s 4"
|
|
2492
|
-
*/
|
|
2493
|
-
declare function parseTokensFile(content: string): Map<number, string>;
|
|
2494
|
-
/**
|
|
2495
|
-
* CTC greedy decode
|
|
2496
|
-
*
|
|
2497
|
-
* @param logits Raw logits from model output, flattened [seqLen, vocabSize]
|
|
2498
|
-
* @param seqLen Sequence length (time steps)
|
|
2499
|
-
* @param vocabSize Vocabulary size
|
|
2500
|
-
* @param tokenMap Token ID → string map from tokens.txt
|
|
2501
|
-
* @returns Decoded text and structured metadata
|
|
2502
|
-
*/
|
|
2503
|
-
declare function ctcGreedyDecode(logits: Float32Array, seqLen: number, vocabSize: number, tokenMap: Map<number, string>): CTCDecodeResult;
|
|
2504
|
-
|
|
2505
1819
|
/**
|
|
2506
1820
|
* Shared blendshape constants and utilities for lip sync inference
|
|
2507
1821
|
*
|
|
@@ -2521,26 +1835,18 @@ declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browI
|
|
|
2521
1835
|
/** Alias for backwards compatibility */
|
|
2522
1836
|
declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
|
|
2523
1837
|
/**
|
|
2524
|
-
*
|
|
2525
|
-
* From LAM official postprocessing (models/utils.py)
|
|
2526
|
-
* This fixes asymmetric output from the raw model
|
|
2527
|
-
*/
|
|
2528
|
-
declare function symmetrizeBlendshapes(frame: Float32Array): Float32Array;
|
|
2529
|
-
/**
|
|
2530
|
-
* wav2arkit_cpu model blendshape ordering
|
|
1838
|
+
* Linearly interpolate between two blendshape weight arrays.
|
|
2531
1839
|
*
|
|
2532
|
-
*
|
|
2533
|
-
*
|
|
2534
|
-
*
|
|
2535
|
-
*/
|
|
2536
|
-
declare const WAV2ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "jawRight"];
|
|
2537
|
-
/**
|
|
2538
|
-
* Remap a blendshape frame from wav2arkit_cpu ordering to LAM_BLENDSHAPES ordering
|
|
1840
|
+
* Pure math utility with zero renderer dependency — used by all renderer
|
|
1841
|
+
* adapters (@omote/three, @omote/babylon, @omote/r3f) for smooth frame
|
|
1842
|
+
* transitions.
|
|
2539
1843
|
*
|
|
2540
|
-
* @param
|
|
2541
|
-
* @
|
|
1844
|
+
* @param current - Current blendshape weights
|
|
1845
|
+
* @param target - Target blendshape weights
|
|
1846
|
+
* @param factor - Interpolation factor (0 = no change, 1 = snap to target). Default: 0.3
|
|
1847
|
+
* @returns Interpolated weights as number[]
|
|
2542
1848
|
*/
|
|
2543
|
-
declare function
|
|
1849
|
+
declare function lerpBlendshapes(current: Float32Array | number[], target: Float32Array | number[], factor?: number): number[];
|
|
2544
1850
|
|
|
2545
1851
|
/**
|
|
2546
1852
|
* Unified Wav2Vec2 inference engine for Audio-to-Expression + ASR
|
|
@@ -2582,6 +1888,12 @@ interface Wav2Vec2InferenceConfig {
|
|
|
2582
1888
|
backend?: InferenceBackend;
|
|
2583
1889
|
/** Number of identity classes (default: 12 for streaming model) */
|
|
2584
1890
|
numIdentityClasses?: number;
|
|
1891
|
+
/**
|
|
1892
|
+
* Number of audio samples per inference chunk (default: 16000).
|
|
1893
|
+
* Model supports variable chunk sizes. Smaller chunks = lower latency,
|
|
1894
|
+
* more inference overhead. 8000 (500ms) is recommended for real-time lip sync.
|
|
1895
|
+
*/
|
|
1896
|
+
chunkSize?: number;
|
|
2585
1897
|
}
|
|
2586
1898
|
interface ModelInfo {
|
|
2587
1899
|
backend: 'webgpu' | 'wasm';
|
|
@@ -2608,7 +1920,7 @@ interface Wav2Vec2Result {
|
|
|
2608
1920
|
/** Inference time in ms */
|
|
2609
1921
|
inferenceTimeMs: number;
|
|
2610
1922
|
}
|
|
2611
|
-
declare class Wav2Vec2Inference implements
|
|
1923
|
+
declare class Wav2Vec2Inference implements A2EBackend {
|
|
2612
1924
|
readonly modelId: "wav2vec2";
|
|
2613
1925
|
private session;
|
|
2614
1926
|
private ort;
|
|
@@ -2616,6 +1928,7 @@ declare class Wav2Vec2Inference implements LipSyncBackend {
|
|
|
2616
1928
|
private _backend;
|
|
2617
1929
|
private isLoading;
|
|
2618
1930
|
private numIdentityClasses;
|
|
1931
|
+
readonly chunkSize: number;
|
|
2619
1932
|
private inferenceQueue;
|
|
2620
1933
|
private poisoned;
|
|
2621
1934
|
private static readonly INFERENCE_TIMEOUT_MS;
|
|
@@ -2635,11 +1948,10 @@ declare class Wav2Vec2Inference implements LipSyncBackend {
|
|
|
2635
1948
|
load(): Promise<ModelInfo>;
|
|
2636
1949
|
/**
|
|
2637
1950
|
* Run inference on raw audio
|
|
2638
|
-
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
1951
|
+
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
2639
1952
|
* @param identityIndex - Optional identity index (0-11, default 0 = neutral)
|
|
2640
1953
|
*
|
|
2641
|
-
*
|
|
2642
|
-
* Audio will be zero-padded or truncated to 16000 samples.
|
|
1954
|
+
* Audio will be zero-padded or truncated to chunkSize samples.
|
|
2643
1955
|
*/
|
|
2644
1956
|
infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
|
|
2645
1957
|
/**
|
|
@@ -2707,8 +2019,9 @@ interface Wav2ArkitCpuConfig {
|
|
|
2707
2019
|
/** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
|
|
2708
2020
|
backend?: BackendPreference;
|
|
2709
2021
|
}
|
|
2710
|
-
declare class Wav2ArkitCpuInference implements
|
|
2022
|
+
declare class Wav2ArkitCpuInference implements A2EBackend {
|
|
2711
2023
|
readonly modelId: "wav2arkit_cpu";
|
|
2024
|
+
readonly chunkSize: number;
|
|
2712
2025
|
private session;
|
|
2713
2026
|
private ort;
|
|
2714
2027
|
private config;
|
|
@@ -2723,7 +2036,7 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
|
|
|
2723
2036
|
/**
|
|
2724
2037
|
* Load the ONNX model
|
|
2725
2038
|
*/
|
|
2726
|
-
load(): Promise<
|
|
2039
|
+
load(): Promise<A2EModelInfo>;
|
|
2727
2040
|
/**
|
|
2728
2041
|
* Run inference on raw audio
|
|
2729
2042
|
*
|
|
@@ -2733,7 +2046,7 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
|
|
|
2733
2046
|
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
2734
2047
|
* @param _identityIndex - Ignored (identity 11 is baked into the model)
|
|
2735
2048
|
*/
|
|
2736
|
-
infer(audioSamples: Float32Array, _identityIndex?: number): Promise<
|
|
2049
|
+
infer(audioSamples: Float32Array, _identityIndex?: number): Promise<A2EResult>;
|
|
2737
2050
|
/**
|
|
2738
2051
|
* Queue inference to serialize ONNX session calls
|
|
2739
2052
|
*/
|
|
@@ -2745,7 +2058,7 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
|
|
|
2745
2058
|
}
|
|
2746
2059
|
|
|
2747
2060
|
/**
|
|
2748
|
-
* Factory function for
|
|
2061
|
+
* Factory function for A2E with automatic GPU/CPU model selection
|
|
2749
2062
|
*
|
|
2750
2063
|
* Provides a unified API that automatically selects the optimal model:
|
|
2751
2064
|
* - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (404MB, WASM)
|
|
@@ -2766,20 +2079,20 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
|
|
|
2766
2079
|
*
|
|
2767
2080
|
* @example Auto-detect (recommended)
|
|
2768
2081
|
* ```typescript
|
|
2769
|
-
* import {
|
|
2082
|
+
* import { createA2E } from '@omote/core';
|
|
2770
2083
|
*
|
|
2771
|
-
* const
|
|
2084
|
+
* const a2e = createA2E({
|
|
2772
2085
|
* gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
|
|
2773
2086
|
* cpuModelUrl: '/models/wav2arkit_cpu.onnx',
|
|
2774
2087
|
* });
|
|
2775
2088
|
*
|
|
2776
|
-
* await
|
|
2777
|
-
* const { blendshapes } = await
|
|
2089
|
+
* await a2e.load();
|
|
2090
|
+
* const { blendshapes } = await a2e.infer(audioSamples);
|
|
2778
2091
|
* ```
|
|
2779
2092
|
*
|
|
2780
2093
|
* @example Force CPU model
|
|
2781
2094
|
* ```typescript
|
|
2782
|
-
* const
|
|
2095
|
+
* const a2e = createA2E({
|
|
2783
2096
|
* gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
|
|
2784
2097
|
* cpuModelUrl: '/models/wav2arkit_cpu.onnx',
|
|
2785
2098
|
* mode: 'cpu',
|
|
@@ -2788,9 +2101,9 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
|
|
|
2788
2101
|
*/
|
|
2789
2102
|
|
|
2790
2103
|
/**
|
|
2791
|
-
* Configuration for the
|
|
2104
|
+
* Configuration for the A2E factory
|
|
2792
2105
|
*/
|
|
2793
|
-
interface
|
|
2106
|
+
interface CreateA2EConfig {
|
|
2794
2107
|
/** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge) */
|
|
2795
2108
|
gpuModelUrl: string;
|
|
2796
2109
|
/**
|
|
@@ -2804,7 +2117,7 @@ interface CreateLipSyncConfig {
|
|
|
2804
2117
|
cpuModelUrl: string;
|
|
2805
2118
|
/**
|
|
2806
2119
|
* Model selection mode:
|
|
2807
|
-
* - 'auto': Safari/iOS
|
|
2120
|
+
* - 'auto': Safari/iOS -> CPU, everything else -> GPU (default)
|
|
2808
2121
|
* - 'gpu': Force GPU model (Wav2Vec2Inference)
|
|
2809
2122
|
* - 'cpu': Force CPU model (Wav2ArkitCpuInference)
|
|
2810
2123
|
*/
|
|
@@ -2838,12 +2151,322 @@ interface CreateLipSyncConfig {
|
|
|
2838
2151
|
unifiedWorker?: UnifiedInferenceWorker;
|
|
2839
2152
|
}
|
|
2840
2153
|
/**
|
|
2841
|
-
* Create
|
|
2154
|
+
* Create an A2E instance with automatic GPU/CPU model selection
|
|
2842
2155
|
*
|
|
2843
2156
|
* @param config - Factory configuration
|
|
2844
|
-
* @returns
|
|
2157
|
+
* @returns An A2EBackend instance (either GPU or CPU model)
|
|
2158
|
+
*/
|
|
2159
|
+
declare function createA2E(config: CreateA2EConfig): A2EBackend;
|
|
2160
|
+
|
|
2161
|
+
/**
|
|
2162
|
+
* A2EProcessor — Engine-agnostic audio-to-expression processor
|
|
2163
|
+
*
|
|
2164
|
+
* The core inference primitive: audio samples in → blendshape frames out.
|
|
2165
|
+
* No mic capture, no audio playback, no Web Audio API.
|
|
2166
|
+
*
|
|
2167
|
+
* This is what Unity/Unreal/Godot/any engine would use directly.
|
|
2168
|
+
* Web-specific concerns (mic, AudioContext, scheduling) live in the
|
|
2169
|
+
* orchestrator and pipeline layers above.
|
|
2170
|
+
*
|
|
2171
|
+
* Two output modes:
|
|
2172
|
+
* - **Pull mode**: `pushAudio(samples, timestamp)` + `getFrameForTime(t)`
|
|
2173
|
+
* For TTS playback where frames are synced to AudioContext clock.
|
|
2174
|
+
* - **Push mode**: `pushAudio(samples)` + `startDrip()` + `latestFrame`
|
|
2175
|
+
* For live mic / game loop where frames are consumed at ~30fps.
|
|
2176
|
+
*
|
|
2177
|
+
* @category Inference
|
|
2178
|
+
*
|
|
2179
|
+
* @example Pull mode (TTS playback)
|
|
2180
|
+
* ```typescript
|
|
2181
|
+
* const processor = new A2EProcessor({ backend: a2e });
|
|
2182
|
+
* processor.pushAudio(samples, audioContext.currentTime + delay);
|
|
2183
|
+
* const frame = processor.getFrameForTime(audioContext.currentTime);
|
|
2184
|
+
* ```
|
|
2185
|
+
*
|
|
2186
|
+
* @example Push mode (live mic)
|
|
2187
|
+
* ```typescript
|
|
2188
|
+
* const processor = new A2EProcessor({
|
|
2189
|
+
* backend: a2e,
|
|
2190
|
+
* onFrame: (frame) => applyToAvatar(frame),
|
|
2191
|
+
* });
|
|
2192
|
+
* processor.startDrip();
|
|
2193
|
+
* processor.pushAudio(micSamples); // no timestamp → drip mode
|
|
2194
|
+
* ```
|
|
2195
|
+
*/
|
|
2196
|
+
|
|
2197
|
+
interface A2EProcessorConfig {
|
|
2198
|
+
/** Inference backend */
|
|
2199
|
+
backend: A2EBackend;
|
|
2200
|
+
/** Sample rate (default: 16000) */
|
|
2201
|
+
sampleRate?: number;
|
|
2202
|
+
/** Samples per inference chunk (default: 16000 = 1s) */
|
|
2203
|
+
chunkSize?: number;
|
|
2204
|
+
/** Callback fired with each blendshape frame (push mode) */
|
|
2205
|
+
onFrame?: (frame: Float32Array) => void;
|
|
2206
|
+
/** Error callback */
|
|
2207
|
+
onError?: (error: Error) => void;
|
|
2208
|
+
}
|
|
2209
|
+
declare class A2EProcessor {
|
|
2210
|
+
private readonly backend;
|
|
2211
|
+
private readonly sampleRate;
|
|
2212
|
+
private readonly chunkSize;
|
|
2213
|
+
private readonly onFrame?;
|
|
2214
|
+
private readonly onError?;
|
|
2215
|
+
private bufferCapacity;
|
|
2216
|
+
private buffer;
|
|
2217
|
+
private writeOffset;
|
|
2218
|
+
private bufferStartTime;
|
|
2219
|
+
private timestampedQueue;
|
|
2220
|
+
private plainQueue;
|
|
2221
|
+
private _latestFrame;
|
|
2222
|
+
private dripInterval;
|
|
2223
|
+
private lastPulledFrame;
|
|
2224
|
+
private inferenceRunning;
|
|
2225
|
+
private pendingChunks;
|
|
2226
|
+
private getFrameCallCount;
|
|
2227
|
+
private disposed;
|
|
2228
|
+
constructor(config: A2EProcessorConfig);
|
|
2229
|
+
/**
|
|
2230
|
+
* Push audio samples for inference (any source: mic, TTS, file).
|
|
2231
|
+
*
|
|
2232
|
+
* - With `timestamp`: frames stored with timestamps (pull mode)
|
|
2233
|
+
* - Without `timestamp`: frames stored in plain queue (drip/push mode)
|
|
2234
|
+
*
|
|
2235
|
+
* Fire-and-forget: returns immediately, inference runs async.
|
|
2236
|
+
*/
|
|
2237
|
+
pushAudio(samples: Float32Array, timestamp?: number): void;
|
|
2238
|
+
/**
|
|
2239
|
+
* Flush remaining buffered audio (pads to chunkSize).
|
|
2240
|
+
* Call at end of stream to process final partial chunk.
|
|
2241
|
+
*
|
|
2242
|
+
* Routes through the serialized pendingChunks pipeline to maintain
|
|
2243
|
+
* correct frame ordering. Without this, flush() could push frames
|
|
2244
|
+
* with the latest timestamp to the queue before drainPendingChunks()
|
|
2245
|
+
* finishes pushing frames with earlier timestamps — causing
|
|
2246
|
+
* getFrameForTime() to see out-of-order timestamps and stall.
|
|
2247
|
+
*/
|
|
2248
|
+
flush(): Promise<void>;
|
|
2249
|
+
/**
|
|
2250
|
+
* Reset buffer and frame queues
|
|
2251
|
+
*/
|
|
2252
|
+
reset(): void;
|
|
2253
|
+
/**
|
|
2254
|
+
* Get frame synced to external clock (e.g. AudioContext.currentTime).
|
|
2255
|
+
*
|
|
2256
|
+
* Discards frames that are too old, returns the current frame,
|
|
2257
|
+
* or holds last frame as fallback to prevent avatar freezing.
|
|
2258
|
+
*
|
|
2259
|
+
* @param currentTime - Current playback time (seconds)
|
|
2260
|
+
* @returns Blendshape frame, or null if no frames yet
|
|
2261
|
+
*/
|
|
2262
|
+
getFrameForTime(currentTime: number): Float32Array | null;
|
|
2263
|
+
/** Latest frame from drip-feed (live mic, game loop) */
|
|
2264
|
+
get latestFrame(): Float32Array | null;
|
|
2265
|
+
/** Start 30fps drip-feed timer (push mode) */
|
|
2266
|
+
startDrip(): void;
|
|
2267
|
+
/** Stop drip-feed timer */
|
|
2268
|
+
stopDrip(): void;
|
|
2269
|
+
/** Number of frames waiting in queue (both modes combined) */
|
|
2270
|
+
get queuedFrameCount(): number;
|
|
2271
|
+
/** Buffer fill level as fraction of chunkSize (0-1) */
|
|
2272
|
+
get fillLevel(): number;
|
|
2273
|
+
/** Dispose resources */
|
|
2274
|
+
dispose(): void;
|
|
2275
|
+
/**
|
|
2276
|
+
* Process pending chunks sequentially.
|
|
2277
|
+
* Fire-and-forget — called from pushAudio() without awaiting.
|
|
2278
|
+
*/
|
|
2279
|
+
private drainPendingChunks;
|
|
2280
|
+
private handleError;
|
|
2281
|
+
}
|
|
2282
|
+
|
|
2283
|
+
/**
|
|
2284
|
+
* BlendshapeSmoother — Per-channel critically damped spring for 52 ARKit blendshapes
|
|
2285
|
+
*
|
|
2286
|
+
* Eliminates frame gaps between inference batches by smoothly interpolating
|
|
2287
|
+
* blendshape weights using critically damped springs (the game industry standard).
|
|
2288
|
+
*
|
|
2289
|
+
* Each of the 52 blendshape channels has its own spring with position + velocity
|
|
2290
|
+
* state. When a new inference frame arrives, spring targets are updated. Between
|
|
2291
|
+
* frames, springs continue converging toward the last target — no frozen face.
|
|
2292
|
+
*
|
|
2293
|
+
* When inference stalls, `decayToNeutral()` sets all targets to 0, and the
|
|
2294
|
+
* springs smoothly close the mouth / relax the face over the halflife period.
|
|
2295
|
+
*
|
|
2296
|
+
* Math from Daniel Holden's "Spring-It-On" (Epic Games):
|
|
2297
|
+
* https://theorangeduck.com/page/spring-roll-call
|
|
2298
|
+
*
|
|
2299
|
+
* @category Inference
|
|
2300
|
+
*
|
|
2301
|
+
* @example Basic usage
|
|
2302
|
+
* ```typescript
|
|
2303
|
+
* const smoother = new BlendshapeSmoother({ halflife: 0.06 });
|
|
2304
|
+
*
|
|
2305
|
+
* // In frame loop (60fps):
|
|
2306
|
+
* smoother.setTarget(inferenceFrame); // when new frame arrives
|
|
2307
|
+
* const smoothed = smoother.update(1/60); // every render frame
|
|
2308
|
+
* applyToAvatar(smoothed);
|
|
2309
|
+
* ```
|
|
2310
|
+
*/
|
|
2311
|
+
interface BlendshapeSmootherConfig {
|
|
2312
|
+
/**
|
|
2313
|
+
* Spring halflife in seconds — time for the distance to the target
|
|
2314
|
+
* to reduce by half. Lower = snappier, higher = smoother.
|
|
2315
|
+
*
|
|
2316
|
+
* - 0.04s (40ms): Very snappy, slight jitter on fast transitions
|
|
2317
|
+
* - 0.06s (60ms): Sweet spot for lip sync (default)
|
|
2318
|
+
* - 0.10s (100ms): Very smooth, slight lag on fast consonants
|
|
2319
|
+
* - 0: Bypass mode — passes through raw target values (no smoothing)
|
|
2320
|
+
*
|
|
2321
|
+
* Default: 0.06
|
|
2322
|
+
*/
|
|
2323
|
+
halflife?: number;
|
|
2324
|
+
}
|
|
2325
|
+
declare class BlendshapeSmoother {
|
|
2326
|
+
private readonly halflife;
|
|
2327
|
+
/** Current smoothed blendshape values */
|
|
2328
|
+
private values;
|
|
2329
|
+
/** Per-channel spring velocities */
|
|
2330
|
+
private velocities;
|
|
2331
|
+
/** Current spring targets (from latest inference frame) */
|
|
2332
|
+
private targets;
|
|
2333
|
+
/** Whether any target has been set */
|
|
2334
|
+
private _hasTarget;
|
|
2335
|
+
constructor(config?: BlendshapeSmootherConfig);
|
|
2336
|
+
/** Whether a target frame has been set (false until first setTarget call) */
|
|
2337
|
+
get hasTarget(): boolean;
|
|
2338
|
+
/**
|
|
2339
|
+
* Set new target frame from inference output.
|
|
2340
|
+
* Springs will converge toward these values on subsequent update() calls.
|
|
2341
|
+
*/
|
|
2342
|
+
setTarget(frame: Float32Array): void;
|
|
2343
|
+
/**
|
|
2344
|
+
* Advance all 52 springs by `dt` seconds and return the smoothed frame.
|
|
2345
|
+
*
|
|
2346
|
+
* Call this every render frame (e.g., inside requestAnimationFrame).
|
|
2347
|
+
* Returns the internal values buffer — do NOT mutate the returned array.
|
|
2348
|
+
*
|
|
2349
|
+
* @param dt - Time step in seconds (e.g., 1/60 for 60fps)
|
|
2350
|
+
* @returns Smoothed blendshape values (Float32Array of 52)
|
|
2351
|
+
*/
|
|
2352
|
+
update(dt: number): Float32Array;
|
|
2353
|
+
/**
|
|
2354
|
+
* Decay all spring targets to neutral (0).
|
|
2355
|
+
*
|
|
2356
|
+
* Call when inference stalls (no new frames for threshold duration).
|
|
2357
|
+
* The springs will smoothly close the mouth / relax the face over
|
|
2358
|
+
* the halflife period rather than freezing.
|
|
2359
|
+
*/
|
|
2360
|
+
decayToNeutral(): void;
|
|
2361
|
+
/**
|
|
2362
|
+
* Reset all state (values, velocities, targets).
|
|
2363
|
+
* Call when starting a new playback session.
|
|
2364
|
+
*/
|
|
2365
|
+
reset(): void;
|
|
2366
|
+
}
|
|
2367
|
+
|
|
2368
|
+
/**
|
|
2369
|
+
* Renderer-agnostic A2E (audio-to-expression) orchestrator
|
|
2370
|
+
*
|
|
2371
|
+
* Manages the mic capture + A2E inference loop independently of any
|
|
2372
|
+
* 3D renderer. Adapter packages (@omote/three, @omote/babylon) wrap this
|
|
2373
|
+
* thinly and pipe `latestWeights` into their renderer-specific blendshape
|
|
2374
|
+
* controllers.
|
|
2375
|
+
*
|
|
2376
|
+
* Internally delegates all buffer accumulation, inference, and frame
|
|
2377
|
+
* drip-feeding to {@link A2EProcessor}. This class only handles mic capture
|
|
2378
|
+
* (getUserMedia, ScriptProcessorNode, resampling).
|
|
2379
|
+
*
|
|
2380
|
+
* @category Inference
|
|
2381
|
+
*/
|
|
2382
|
+
|
|
2383
|
+
/**
|
|
2384
|
+
* Progress event emitted during model download / compile
|
|
2385
|
+
*/
|
|
2386
|
+
interface A2EProgressEvent {
|
|
2387
|
+
phase: 'download' | 'compile';
|
|
2388
|
+
progress: number;
|
|
2389
|
+
}
|
|
2390
|
+
/**
|
|
2391
|
+
* Configuration for the A2EOrchestrator
|
|
2392
|
+
*/
|
|
2393
|
+
interface A2EOrchestratorConfig {
|
|
2394
|
+
/** URL for the GPU model (Wav2Vec2, Chrome/Firefox/Edge) */
|
|
2395
|
+
gpuModelUrl: string;
|
|
2396
|
+
/** URL for GPU model external data file */
|
|
2397
|
+
gpuExternalDataUrl?: string | false;
|
|
2398
|
+
/** URL for the CPU model (wav2arkit_cpu, Safari/iOS) */
|
|
2399
|
+
cpuModelUrl?: string;
|
|
2400
|
+
/** Sample rate for mic capture (default: 16000) */
|
|
2401
|
+
sampleRate?: number;
|
|
2402
|
+
/** Chunk size in samples for mic capture (default: 16000 = 1s at 16kHz) */
|
|
2403
|
+
chunkSize?: number;
|
|
2404
|
+
/** Callback fired with new blendshape weights after each inference */
|
|
2405
|
+
onFrame?: (weights: Float32Array) => void;
|
|
2406
|
+
/** Callback fired during model loading progress */
|
|
2407
|
+
onProgress?: (event: A2EProgressEvent) => void;
|
|
2408
|
+
/** Callback fired on error */
|
|
2409
|
+
onError?: (error: Error) => void;
|
|
2410
|
+
/** Callback fired when model is loaded and ready */
|
|
2411
|
+
onReady?: () => void;
|
|
2412
|
+
/** Additional createA2E config options */
|
|
2413
|
+
a2eConfig?: Partial<CreateA2EConfig>;
|
|
2414
|
+
}
|
|
2415
|
+
/**
|
|
2416
|
+
* Renderer-agnostic A2E orchestrator.
|
|
2417
|
+
*
|
|
2418
|
+
* Manages mic capture + delegates inference to {@link A2EProcessor}.
|
|
2419
|
+
* Adapters read `latestWeights` each frame to apply to their meshes.
|
|
2420
|
+
*
|
|
2421
|
+
* @example Quick start (used by @omote/three and @omote/babylon adapters)
|
|
2422
|
+
* ```typescript
|
|
2423
|
+
* const orchestrator = new A2EOrchestrator({
|
|
2424
|
+
* gpuModelUrl: '/models/wav2vec2.onnx',
|
|
2425
|
+
* cpuModelUrl: '/models/wav2arkit_cpu.onnx',
|
|
2426
|
+
* onFrame: (weights) => controller.update(weights),
|
|
2427
|
+
* });
|
|
2428
|
+
* await orchestrator.load();
|
|
2429
|
+
* await orchestrator.start();
|
|
2430
|
+
* ```
|
|
2845
2431
|
*/
|
|
2846
|
-
declare
|
|
2432
|
+
declare class A2EOrchestrator {
|
|
2433
|
+
private config;
|
|
2434
|
+
private a2e;
|
|
2435
|
+
private processor;
|
|
2436
|
+
private stream;
|
|
2437
|
+
private audioContext;
|
|
2438
|
+
private scriptProcessor;
|
|
2439
|
+
private nativeSampleRate;
|
|
2440
|
+
private _isReady;
|
|
2441
|
+
private _isStreaming;
|
|
2442
|
+
private _backend;
|
|
2443
|
+
private disposed;
|
|
2444
|
+
constructor(config: A2EOrchestratorConfig);
|
|
2445
|
+
/** Latest blendshape weights from inference (null if none yet) */
|
|
2446
|
+
get latestWeights(): Float32Array | null;
|
|
2447
|
+
/** Whether the model is loaded and ready for inference */
|
|
2448
|
+
get isReady(): boolean;
|
|
2449
|
+
/** Whether mic is active and inference loop is running */
|
|
2450
|
+
get isStreaming(): boolean;
|
|
2451
|
+
/** Current backend type (webgpu, wasm, or null) */
|
|
2452
|
+
get backend(): string | null;
|
|
2453
|
+
/**
|
|
2454
|
+
* Load the A2E model and create the processor
|
|
2455
|
+
*/
|
|
2456
|
+
load(): Promise<void>;
|
|
2457
|
+
/**
|
|
2458
|
+
* Start mic capture and inference loop
|
|
2459
|
+
*/
|
|
2460
|
+
start(): Promise<void>;
|
|
2461
|
+
/**
|
|
2462
|
+
* Stop mic capture and inference loop
|
|
2463
|
+
*/
|
|
2464
|
+
stop(): void;
|
|
2465
|
+
/**
|
|
2466
|
+
* Dispose of all resources
|
|
2467
|
+
*/
|
|
2468
|
+
dispose(): Promise<void>;
|
|
2469
|
+
}
|
|
2847
2470
|
|
|
2848
2471
|
/**
|
|
2849
2472
|
* Safari Web Speech API wrapper for iOS speech recognition
|
|
@@ -3992,11 +3615,6 @@ declare class InterruptionHandler extends EventEmitter<InterruptionEvents> {
|
|
|
3992
3615
|
* @param audioEnergy - Optional RMS energy for logging (default: 0)
|
|
3993
3616
|
*/
|
|
3994
3617
|
processVADResult(vadProbability: number, audioEnergy?: number): void;
|
|
3995
|
-
/**
|
|
3996
|
-
* @deprecated Use processVADResult() instead. This method uses naive RMS detection.
|
|
3997
|
-
* Process audio samples for VAD (legacy - uses simple RMS)
|
|
3998
|
-
*/
|
|
3999
|
-
processAudio(samples: Float32Array | Int16Array): void;
|
|
4000
3618
|
/**
|
|
4001
3619
|
* Notify that AI started speaking
|
|
4002
3620
|
*/
|
|
@@ -4020,7 +3638,6 @@ declare class InterruptionHandler extends EventEmitter<InterruptionEvents> {
|
|
|
4020
3638
|
isSpeaking: boolean;
|
|
4021
3639
|
speechDurationMs: number;
|
|
4022
3640
|
};
|
|
4023
|
-
private calculateRMS;
|
|
4024
3641
|
private onSpeechDetected;
|
|
4025
3642
|
private onSilenceDetected;
|
|
4026
3643
|
}
|
|
@@ -5196,4 +4813,4 @@ declare class ProceduralLifeLayer {
|
|
|
5196
4813
|
private updateBrowNoise;
|
|
5197
4814
|
}
|
|
5198
4815
|
|
|
5199
|
-
export { type AIAdapter, type AIAdapterEvents, AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, type
|
|
4816
|
+
export { type A2EBackend, type A2EModelInfo, A2EOrchestrator, type A2EOrchestratorConfig, A2EProcessor, type A2EProcessorConfig, type A2EProgressEvent, type A2EResult, type AIAdapter, type AIAdapterEvents, AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, type CreateA2EConfig, type CreateSenseVoiceConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, type EmotionWeights, EmphasisDetector, EventEmitter, type ExpressionProfile, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, ProceduralLifeLayer, type QuotaInfo, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SessionConfig, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2ArkitCpuUnifiedAdapter, Wav2ArkitCpuWorker, type Wav2ArkitCpuWorkerConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureTelemetry, createA2E, createEmotionVector, createSenseVoice, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getRecommendedBackend, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, preloadModels, resolveBackend, shouldEnableWasmProxy, shouldUseCpuA2E, shouldUseNativeASR, shouldUseServerA2E, supportsVADWorker };
|