@omote/core 0.5.7 → 0.6.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -13
- package/dist/index.d.mts +813 -86
- package/dist/index.d.ts +813 -86
- package/dist/index.js +1653 -563
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +1648 -558
- package/dist/index.mjs.map +1 -1
- package/package.json +3 -2
package/dist/index.d.ts
CHANGED
|
@@ -380,7 +380,7 @@ declare function isSafari(): boolean;
|
|
|
380
380
|
/**
|
|
381
381
|
* Recommend using CPU-optimized A2E model (wav2arkit_cpu)
|
|
382
382
|
*
|
|
383
|
-
* All iOS browsers use WebKit and have tight memory limits — the
|
|
383
|
+
* All iOS browsers use WebKit and have tight memory limits — the 192MB fp16
|
|
384
384
|
* LAM model causes silent crashes. wav2arkit_cpu uses URL pass-through
|
|
385
385
|
* (ORT fetches the 402MB weights directly into WASM, no JS heap copy).
|
|
386
386
|
*
|
|
@@ -427,8 +427,8 @@ declare function shouldUseServerA2E(): boolean;
|
|
|
427
427
|
/**
|
|
428
428
|
* Common interface for audio-to-expression (A2E) inference backends
|
|
429
429
|
*
|
|
430
|
-
* Both Wav2Vec2Inference (GPU,
|
|
431
|
-
* implement this interface, allowing
|
|
430
|
+
* Both Wav2Vec2Inference (GPU, 192MB fp16) and Wav2ArkitCpuInference (CPU, 404MB)
|
|
431
|
+
* implement this interface, allowing FullFacePipeline and A2EProcessor to
|
|
432
432
|
* work with either model transparently.
|
|
433
433
|
*
|
|
434
434
|
* @category Inference
|
|
@@ -461,7 +461,7 @@ interface A2EResult {
|
|
|
461
461
|
* Common interface for A2E (audio-to-expression) inference engines
|
|
462
462
|
*
|
|
463
463
|
* Implemented by:
|
|
464
|
-
* - Wav2Vec2Inference (WebGPU/WASM,
|
|
464
|
+
* - Wav2Vec2Inference (WebGPU/WASM, 192MB fp16, A2E)
|
|
465
465
|
* - Wav2ArkitCpuInference (WASM-only, 404MB, A2E only)
|
|
466
466
|
*/
|
|
467
467
|
interface A2EBackend {
|
|
@@ -492,38 +492,13 @@ interface A2EBackend {
|
|
|
492
492
|
}
|
|
493
493
|
|
|
494
494
|
/**
|
|
495
|
-
*
|
|
496
|
-
*
|
|
497
|
-
* Orchestrates full-face animation by:
|
|
498
|
-
* 1. Scheduling audio for playback immediately (audio-first, never waits for A2E)
|
|
499
|
-
* 2. Running A2E inference in background (fire-and-forget via A2EProcessor)
|
|
500
|
-
* 3. Applying per-character ExpressionProfile scaling to raw A2E output
|
|
495
|
+
* ExpressionProfile - Per-character weight scaling for A2E blendshape output
|
|
501
496
|
*
|
|
502
|
-
*
|
|
503
|
-
*
|
|
504
|
-
* by group (eyes, brows, jaw, mouth, cheeks, nose, tongue) with per-blendshape overrides.
|
|
497
|
+
* Maps blendshape groups (eyes, brows, jaw, mouth, cheeks, nose, tongue)
|
|
498
|
+
* to weight scalers. Used by PlaybackPipeline, MicLipSync, and VoicePipeline.
|
|
505
499
|
*
|
|
506
500
|
* @category Audio
|
|
507
|
-
*
|
|
508
|
-
* @example Basic usage
|
|
509
|
-
* ```typescript
|
|
510
|
-
* import { FullFacePipeline } from '@omote/core';
|
|
511
|
-
*
|
|
512
|
-
* const pipeline = new FullFacePipeline({
|
|
513
|
-
* lam,
|
|
514
|
-
* profile: { mouth: 1.2, brows: 0.8 },
|
|
515
|
-
* });
|
|
516
|
-
* await pipeline.initialize();
|
|
517
|
-
*
|
|
518
|
-
* pipeline.on('full_frame_ready', (frame) => {
|
|
519
|
-
* applyToAvatar(frame.blendshapes);
|
|
520
|
-
* });
|
|
521
|
-
*
|
|
522
|
-
* pipeline.start();
|
|
523
|
-
* await pipeline.onAudioChunk(audioData);
|
|
524
|
-
* ```
|
|
525
501
|
*/
|
|
526
|
-
|
|
527
502
|
type BlendshapeGroup = 'eyes' | 'brows' | 'jaw' | 'mouth' | 'cheeks' | 'nose' | 'tongue';
|
|
528
503
|
/**
|
|
529
504
|
* Per-character weight scaling for A2E blendshape output.
|
|
@@ -555,6 +530,53 @@ interface ExpressionProfile {
|
|
|
555
530
|
* Built once at module load from prefix matching.
|
|
556
531
|
*/
|
|
557
532
|
declare const BLENDSHAPE_TO_GROUP: Map<string, BlendshapeGroup>;
|
|
533
|
+
/**
|
|
534
|
+
* Apply ExpressionProfile scaling to raw A2E blendshapes.
|
|
535
|
+
*
|
|
536
|
+
* For each blendshape:
|
|
537
|
+
* 1. If an override exists for the blendshape name, use override as scaler
|
|
538
|
+
* 2. Otherwise, use the group scaler (default 1.0)
|
|
539
|
+
* 3. Clamp result to [0, 1]
|
|
540
|
+
*/
|
|
541
|
+
declare function applyProfile(raw: Float32Array, profile: ExpressionProfile): Float32Array;
|
|
542
|
+
|
|
543
|
+
/**
|
|
544
|
+
* FullFacePipeline - A2E expression pipeline with ExpressionProfile weight scaling
|
|
545
|
+
*
|
|
546
|
+
* Orchestrates full-face animation by:
|
|
547
|
+
* 1. Scheduling audio for playback immediately (audio-first, never waits for A2E)
|
|
548
|
+
* 2. Running A2E inference in background (fire-and-forget via A2EProcessor)
|
|
549
|
+
* 3. Applying per-character ExpressionProfile scaling to raw A2E output
|
|
550
|
+
*
|
|
551
|
+
* The A2E model outputs all 52 ARKit blendshapes from audio — brows, eyes, cheeks,
|
|
552
|
+
* mouth, jaw, everything. ExpressionProfile allows per-character weight scaling
|
|
553
|
+
* by group (eyes, brows, jaw, mouth, cheeks, nose, tongue) with per-blendshape overrides.
|
|
554
|
+
*
|
|
555
|
+
* @deprecated Use {@link PlaybackPipeline} from `@omote/core` instead. PlaybackPipeline
|
|
556
|
+
* is a superset with sync mode (`feedBuffer`), state tracking, and opt-in neutral transition.
|
|
557
|
+
* FullFacePipeline will continue to work but is no longer actively developed.
|
|
558
|
+
*
|
|
559
|
+
* @category Audio
|
|
560
|
+
*
|
|
561
|
+
* @example Basic usage
|
|
562
|
+
* ```typescript
|
|
563
|
+
* import { FullFacePipeline } from '@omote/core';
|
|
564
|
+
*
|
|
565
|
+
* const pipeline = new FullFacePipeline({
|
|
566
|
+
* lam,
|
|
567
|
+
* profile: { mouth: 1.2, brows: 0.8 },
|
|
568
|
+
* });
|
|
569
|
+
* await pipeline.initialize();
|
|
570
|
+
*
|
|
571
|
+
* pipeline.on('full_frame_ready', (frame) => {
|
|
572
|
+
* applyToAvatar(frame.blendshapes);
|
|
573
|
+
* });
|
|
574
|
+
*
|
|
575
|
+
* pipeline.start();
|
|
576
|
+
* await pipeline.onAudioChunk(audioData);
|
|
577
|
+
* ```
|
|
578
|
+
*/
|
|
579
|
+
|
|
558
580
|
/**
|
|
559
581
|
* Configuration for FullFacePipeline
|
|
560
582
|
*/
|
|
@@ -605,7 +627,7 @@ interface FullFacePipelineOptions {
|
|
|
605
627
|
/**
|
|
606
628
|
* Full face frame with scaled blendshapes
|
|
607
629
|
*/
|
|
608
|
-
interface FullFaceFrame {
|
|
630
|
+
interface FullFaceFrame$1 {
|
|
609
631
|
/** Scaled 52 ARKit blendshapes (ExpressionProfile applied) */
|
|
610
632
|
blendshapes: Float32Array;
|
|
611
633
|
/** Raw A2E output (52 blendshapes, before profile scaling) */
|
|
@@ -618,7 +640,7 @@ interface FullFaceFrame {
|
|
|
618
640
|
*/
|
|
619
641
|
interface FullFacePipelineEvents {
|
|
620
642
|
/** New merged frame ready for display */
|
|
621
|
-
full_frame_ready: FullFaceFrame;
|
|
643
|
+
full_frame_ready: FullFaceFrame$1;
|
|
622
644
|
/** Raw LAM frame ready (for debugging/monitoring) */
|
|
623
645
|
lam_frame_ready: Float32Array;
|
|
624
646
|
/** Playback has completed */
|
|
@@ -664,10 +686,7 @@ declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
|
|
|
664
686
|
/**
|
|
665
687
|
* Apply ExpressionProfile scaling to raw A2E blendshapes.
|
|
666
688
|
*
|
|
667
|
-
*
|
|
668
|
-
* 1. If an override exists for the blendshape name, use override as scaler
|
|
669
|
-
* 2. Otherwise, use the group scaler (default 1.0)
|
|
670
|
-
* 3. Clamp result to [0, 1]
|
|
689
|
+
* Delegates to the standalone applyProfile() utility from expressionProfile.ts.
|
|
671
690
|
*/
|
|
672
691
|
applyProfile(raw: Float32Array): Float32Array;
|
|
673
692
|
/**
|
|
@@ -727,6 +746,147 @@ declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
|
|
|
727
746
|
dispose(): void;
|
|
728
747
|
}
|
|
729
748
|
|
|
749
|
+
/**
|
|
750
|
+
* PlaybackPipeline - Audio playback + A2E lip sync with ExpressionProfile scaling
|
|
751
|
+
*
|
|
752
|
+
* Refactored superset of FullFacePipeline. Adds:
|
|
753
|
+
* - Sync mode (`feedBuffer`) for pre-recorded audio
|
|
754
|
+
* - State tracking (idle → playing → stopping)
|
|
755
|
+
* - Opt-in neutral transition animation on playback complete
|
|
756
|
+
* - Idempotent `start()` (no spurious playback:complete on restart)
|
|
757
|
+
*
|
|
758
|
+
* @category Audio
|
|
759
|
+
*/
|
|
760
|
+
|
|
761
|
+
type PlaybackState = 'idle' | 'playing' | 'stopping';
|
|
762
|
+
interface PlaybackPipelineConfig {
|
|
763
|
+
/** A2E inference backend (from createA2E) */
|
|
764
|
+
lam: A2EBackend;
|
|
765
|
+
/** Sample rate in Hz (default: 16000) */
|
|
766
|
+
sampleRate?: number;
|
|
767
|
+
/** Target chunk duration for coalescing in ms (default: 200) */
|
|
768
|
+
chunkTargetMs?: number;
|
|
769
|
+
/** Audio playback delay in ms (default: auto-detected from backend) */
|
|
770
|
+
audioDelayMs?: number;
|
|
771
|
+
/** A2E inference chunk size in samples (default: 16000) */
|
|
772
|
+
chunkSize?: number;
|
|
773
|
+
/** Identity/style index for Wav2Vec2 (default: 0) */
|
|
774
|
+
identityIndex?: number;
|
|
775
|
+
/** Per-character expression weight scaling */
|
|
776
|
+
profile?: ExpressionProfile;
|
|
777
|
+
/** Enable neutral transition on playback complete (default: false) */
|
|
778
|
+
neutralTransitionEnabled?: boolean;
|
|
779
|
+
/** Duration of neutral fade-out in ms (default: 250). Only applies when neutralTransitionEnabled=true. */
|
|
780
|
+
neutralTransitionMs?: number;
|
|
781
|
+
/** Stale frame warning threshold in ms (default: 2000) */
|
|
782
|
+
staleThresholdMs?: number;
|
|
783
|
+
}
|
|
784
|
+
/**
|
|
785
|
+
* Full face frame with scaled blendshapes
|
|
786
|
+
*/
|
|
787
|
+
interface FullFaceFrame {
|
|
788
|
+
/** Scaled 52 ARKit blendshapes (ExpressionProfile applied) */
|
|
789
|
+
blendshapes: Float32Array;
|
|
790
|
+
/** Raw A2E output (52 blendshapes, before profile scaling) */
|
|
791
|
+
rawBlendshapes: Float32Array;
|
|
792
|
+
/** AudioContext timestamp for this frame */
|
|
793
|
+
timestamp: number;
|
|
794
|
+
}
|
|
795
|
+
interface PlaybackPipelineEvents {
|
|
796
|
+
/** New frame ready for display (scaled by ExpressionProfile) */
|
|
797
|
+
'frame': FullFaceFrame;
|
|
798
|
+
/** Raw A2E frame (before profile scaling) */
|
|
799
|
+
'frame:raw': Float32Array;
|
|
800
|
+
/** Playback started (first audio scheduled) */
|
|
801
|
+
'playback:start': {
|
|
802
|
+
time: number;
|
|
803
|
+
};
|
|
804
|
+
/** Playback completed naturally */
|
|
805
|
+
'playback:complete': void;
|
|
806
|
+
/** Playback stopped (user-initiated) */
|
|
807
|
+
'playback:stop': void;
|
|
808
|
+
/** Error occurred */
|
|
809
|
+
'error': Error;
|
|
810
|
+
/** State changed */
|
|
811
|
+
'state': PlaybackState;
|
|
812
|
+
'full_frame_ready': FullFaceFrame;
|
|
813
|
+
'lam_frame_ready': Float32Array;
|
|
814
|
+
'playback_complete': void;
|
|
815
|
+
'playback_start': number;
|
|
816
|
+
[key: string]: unknown;
|
|
817
|
+
}
|
|
818
|
+
declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
|
|
819
|
+
private readonly config;
|
|
820
|
+
private scheduler;
|
|
821
|
+
private coalescer;
|
|
822
|
+
private processor;
|
|
823
|
+
private readonly sampleRate;
|
|
824
|
+
private _state;
|
|
825
|
+
private playbackStarted;
|
|
826
|
+
private monitorInterval;
|
|
827
|
+
private frameAnimationId;
|
|
828
|
+
private lastNewFrameTime;
|
|
829
|
+
private lastKnownLamFrame;
|
|
830
|
+
private staleWarningEmitted;
|
|
831
|
+
private readonly staleThresholdMs;
|
|
832
|
+
private frameLoopCount;
|
|
833
|
+
private profile;
|
|
834
|
+
private readonly neutralTransitionEnabled;
|
|
835
|
+
private readonly neutralTransitionMs;
|
|
836
|
+
private neutralTransitionFrame;
|
|
837
|
+
private neutralTransitionStart;
|
|
838
|
+
private neutralAnimationId;
|
|
839
|
+
private _currentFrame;
|
|
840
|
+
private _currentRawFrame;
|
|
841
|
+
/** Current pipeline state */
|
|
842
|
+
get state(): PlaybackState;
|
|
843
|
+
/** Current scaled blendshapes (updated in-place for perf) */
|
|
844
|
+
get currentFrame(): Float32Array | null;
|
|
845
|
+
/** Raw A2E blendshapes (before profile scaling) */
|
|
846
|
+
get currentRawFrame(): Float32Array | null;
|
|
847
|
+
constructor(config: PlaybackPipelineConfig);
|
|
848
|
+
/** Initialize AudioContext (lazy, call after user gesture) */
|
|
849
|
+
initialize(): Promise<void>;
|
|
850
|
+
/** Update ExpressionProfile at runtime */
|
|
851
|
+
setProfile(profile: ExpressionProfile): void;
|
|
852
|
+
/**
|
|
853
|
+
* Start a new playback session.
|
|
854
|
+
* Idempotent — calling during playback resets cleanly without emitting
|
|
855
|
+
* spurious playback:complete.
|
|
856
|
+
*/
|
|
857
|
+
start(): void;
|
|
858
|
+
/** Feed a streaming audio chunk (PCM16 Uint8Array) */
|
|
859
|
+
onAudioChunk(chunk: Uint8Array): Promise<void>;
|
|
860
|
+
/** Signal end of audio stream (flushes remaining audio) */
|
|
861
|
+
end(): Promise<void>;
|
|
862
|
+
/**
|
|
863
|
+
* Feed a complete audio buffer. Chunks into 200ms pieces, schedules each
|
|
864
|
+
* for playback, runs A2E inference, then waits for completion.
|
|
865
|
+
*/
|
|
866
|
+
feedBuffer(audio: ArrayBuffer | Float32Array): Promise<void>;
|
|
867
|
+
/** Stop playback immediately with fade-out */
|
|
868
|
+
stop(fadeOutMs?: number): Promise<void>;
|
|
869
|
+
/** Cleanup all resources */
|
|
870
|
+
dispose(): void;
|
|
871
|
+
/** Get pipeline debug state */
|
|
872
|
+
getDebugState(): {
|
|
873
|
+
state: PlaybackState;
|
|
874
|
+
playbackStarted: boolean;
|
|
875
|
+
coalescerFill: number;
|
|
876
|
+
processorFill: number;
|
|
877
|
+
queuedFrames: number;
|
|
878
|
+
currentTime: number;
|
|
879
|
+
playbackEndTime: number;
|
|
880
|
+
};
|
|
881
|
+
private startFrameLoop;
|
|
882
|
+
private startMonitoring;
|
|
883
|
+
private onPlaybackComplete;
|
|
884
|
+
private startNeutralTransition;
|
|
885
|
+
private cancelNeutralTransition;
|
|
886
|
+
private stopInternal;
|
|
887
|
+
private setState;
|
|
888
|
+
}
|
|
889
|
+
|
|
730
890
|
/**
|
|
731
891
|
* Interruption Handler
|
|
732
892
|
*
|
|
@@ -1456,7 +1616,9 @@ interface SileroVADBackend {
|
|
|
1456
1616
|
*
|
|
1457
1617
|
* Extends SileroVADConfig with worker-specific options.
|
|
1458
1618
|
*/
|
|
1459
|
-
interface SileroVADFactoryConfig extends SileroVADConfig {
|
|
1619
|
+
interface SileroVADFactoryConfig extends Omit<SileroVADConfig, 'modelUrl'> {
|
|
1620
|
+
/** Path or URL to the ONNX model. Default: HuggingFace CDN */
|
|
1621
|
+
modelUrl?: string;
|
|
1460
1622
|
/**
|
|
1461
1623
|
* Force worker usage (true), main thread (false), or auto-detect (undefined).
|
|
1462
1624
|
*
|
|
@@ -1529,7 +1691,7 @@ declare function supportsVADWorker(): boolean;
|
|
|
1529
1691
|
* const vadMain = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: false });
|
|
1530
1692
|
* ```
|
|
1531
1693
|
*/
|
|
1532
|
-
declare function createSileroVAD(config: SileroVADFactoryConfig): SileroVADBackend;
|
|
1694
|
+
declare function createSileroVAD(config?: SileroVADFactoryConfig): SileroVADBackend;
|
|
1533
1695
|
|
|
1534
1696
|
/**
|
|
1535
1697
|
* Web Worker-based wav2arkit_cpu lip sync inference
|
|
@@ -1852,8 +2014,8 @@ interface SenseVoiceBackend {
|
|
|
1852
2014
|
* Configuration for the SenseVoice factory
|
|
1853
2015
|
*/
|
|
1854
2016
|
interface CreateSenseVoiceConfig {
|
|
1855
|
-
/** Path or URL to model.int8.onnx (239MB) */
|
|
1856
|
-
modelUrl: string;
|
|
2017
|
+
/** Path or URL to model.int8.onnx (239MB). Default: HuggingFace CDN */
|
|
2018
|
+
modelUrl?: string;
|
|
1857
2019
|
/** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
|
|
1858
2020
|
tokensUrl?: string;
|
|
1859
2021
|
/** Language hint (default: 'auto') */
|
|
@@ -1880,7 +2042,7 @@ interface CreateSenseVoiceConfig {
|
|
|
1880
2042
|
* @param config - Factory configuration
|
|
1881
2043
|
* @returns A SenseVoiceBackend instance (either Worker or main thread)
|
|
1882
2044
|
*/
|
|
1883
|
-
declare function createSenseVoice(config: CreateSenseVoiceConfig): SenseVoiceBackend;
|
|
2045
|
+
declare function createSenseVoice(config?: CreateSenseVoiceConfig): SenseVoiceBackend;
|
|
1884
2046
|
|
|
1885
2047
|
/**
|
|
1886
2048
|
* Shared blendshape constants and utilities for lip sync inference
|
|
@@ -1915,12 +2077,10 @@ declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "bro
|
|
|
1915
2077
|
declare function lerpBlendshapes(current: Float32Array | number[], target: Float32Array | number[], factor?: number): number[];
|
|
1916
2078
|
|
|
1917
2079
|
/**
|
|
1918
|
-
*
|
|
2080
|
+
* Wav2Vec2 inference engine for Audio-to-Expression (A2E)
|
|
1919
2081
|
*
|
|
1920
2082
|
* Runs entirely in the browser using WebGPU or WASM.
|
|
1921
|
-
* Takes raw 16kHz audio and outputs
|
|
1922
|
-
* - 52 ARKit blendshapes (lip sync)
|
|
1923
|
-
* - 32-token CTC logits (speech recognition)
|
|
2083
|
+
* Takes raw 16kHz audio and outputs 52 ARKit blendshapes for lip sync.
|
|
1924
2084
|
*
|
|
1925
2085
|
* @category Inference
|
|
1926
2086
|
*
|
|
@@ -1928,14 +2088,12 @@ declare function lerpBlendshapes(current: Float32Array | number[], target: Float
|
|
|
1928
2088
|
* ```typescript
|
|
1929
2089
|
* import { Wav2Vec2Inference } from '@omote/core';
|
|
1930
2090
|
*
|
|
1931
|
-
* const wav2vec = new Wav2Vec2Inference({ modelUrl: '/models/
|
|
2091
|
+
* const wav2vec = new Wav2Vec2Inference({ modelUrl: '/models/model.onnx' });
|
|
1932
2092
|
* await wav2vec.load();
|
|
1933
2093
|
*
|
|
1934
2094
|
* // Process 1 second of audio (16kHz = 16000 samples)
|
|
1935
2095
|
* const result = await wav2vec.infer(audioSamples);
|
|
1936
|
-
*
|
|
1937
2096
|
* console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
|
|
1938
|
-
* console.log('ASR text:', result.text); // Decoded transcription
|
|
1939
2097
|
* ```
|
|
1940
2098
|
*/
|
|
1941
2099
|
|
|
@@ -1968,21 +2126,16 @@ interface ModelInfo {
|
|
|
1968
2126
|
outputNames: string[];
|
|
1969
2127
|
}
|
|
1970
2128
|
|
|
1971
|
-
/**
|
|
2129
|
+
/**
|
|
2130
|
+
* CTC vocabulary (32 tokens from wav2vec2-base-960h)
|
|
2131
|
+
* @deprecated ASR is handled by SenseVoice. This will be removed in a future release.
|
|
2132
|
+
*/
|
|
1972
2133
|
declare const CTC_VOCAB: string[];
|
|
1973
2134
|
interface Wav2Vec2Result {
|
|
1974
2135
|
/** Blendshape weights [frames, 52] - 30fps */
|
|
1975
2136
|
blendshapes: Float32Array[];
|
|
1976
|
-
/**
|
|
1977
|
-
asrLogits: Float32Array[];
|
|
1978
|
-
/** Decoded text from CTC */
|
|
1979
|
-
text: string;
|
|
1980
|
-
/** Number of blendshape frames (30fps) — alias for numA2EFrames */
|
|
2137
|
+
/** Number of blendshape frames (30fps) */
|
|
1981
2138
|
numFrames: number;
|
|
1982
|
-
/** Number of A2E frames (30fps) */
|
|
1983
|
-
numA2EFrames: number;
|
|
1984
|
-
/** Number of ASR frames (50fps) */
|
|
1985
|
-
numASRFrames: number;
|
|
1986
2139
|
/** Inference time in ms */
|
|
1987
2140
|
inferenceTimeMs: number;
|
|
1988
2141
|
}
|
|
@@ -2020,10 +2173,6 @@ declare class Wav2Vec2Inference implements A2EBackend {
|
|
|
2020
2173
|
* Audio will be zero-padded or truncated to chunkSize samples.
|
|
2021
2174
|
*/
|
|
2022
2175
|
infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
|
|
2023
|
-
/**
|
|
2024
|
-
* Decode CTC logits to text using greedy decoding
|
|
2025
|
-
*/
|
|
2026
|
-
private decodeCTC;
|
|
2027
2176
|
/**
|
|
2028
2177
|
* Queue inference to serialize ONNX session calls
|
|
2029
2178
|
*/
|
|
@@ -2038,10 +2187,85 @@ declare class Wav2Vec2Inference implements A2EBackend {
|
|
|
2038
2187
|
dispose(): Promise<void>;
|
|
2039
2188
|
}
|
|
2040
2189
|
|
|
2190
|
+
/**
|
|
2191
|
+
* Default and user-configurable model URLs for all ONNX models
|
|
2192
|
+
*
|
|
2193
|
+
* Out of the box, models are served from HuggingFace CDN (`/resolve/main/`
|
|
2194
|
+
* endpoint with `Access-Control-Allow-Origin: *`). For production apps that
|
|
2195
|
+
* need faster or more reliable delivery, call {@link configureModelUrls} once
|
|
2196
|
+
* at startup to point any or all models at your own CDN.
|
|
2197
|
+
*
|
|
2198
|
+
* @category Inference
|
|
2199
|
+
*
|
|
2200
|
+
* @example Use HuggingFace defaults (zero-config)
|
|
2201
|
+
* ```typescript
|
|
2202
|
+
* import { createA2E } from '@omote/core';
|
|
2203
|
+
* const a2e = createA2E(); // fetches from HuggingFace CDN
|
|
2204
|
+
* ```
|
|
2205
|
+
*
|
|
2206
|
+
* @example Self-host on your own CDN
|
|
2207
|
+
* ```typescript
|
|
2208
|
+
* import { configureModelUrls, createA2E } from '@omote/core';
|
|
2209
|
+
*
|
|
2210
|
+
* configureModelUrls({
|
|
2211
|
+
* lam: 'https://cdn.example.com/models/model_fp16.onnx',
|
|
2212
|
+
* senseVoice: 'https://cdn.example.com/models/sensevoice.int8.onnx',
|
|
2213
|
+
* // omitted keys keep HuggingFace defaults
|
|
2214
|
+
* });
|
|
2215
|
+
*
|
|
2216
|
+
* const a2e = createA2E(); // now fetches from your CDN
|
|
2217
|
+
* ```
|
|
2218
|
+
*/
|
|
2219
|
+
/** Model URL keys that can be configured */
|
|
2220
|
+
type ModelUrlKey = 'lam' | 'wav2arkitCpu' | 'senseVoice' | 'sileroVad';
|
|
2221
|
+
/**
|
|
2222
|
+
* Resolved model URLs — user overrides take priority, HuggingFace CDN is fallback.
|
|
2223
|
+
*
|
|
2224
|
+
* All SDK factories (`createA2E`, `createSenseVoice`, `createSileroVAD`) and
|
|
2225
|
+
* orchestrators (`VoicePipeline`) read from this object. Call
|
|
2226
|
+
* {@link configureModelUrls} before constructing any pipelines to point
|
|
2227
|
+
* models at your own CDN.
|
|
2228
|
+
*/
|
|
2229
|
+
declare const DEFAULT_MODEL_URLS: Readonly<Record<ModelUrlKey, string>>;
|
|
2230
|
+
/**
|
|
2231
|
+
* Configure custom model URLs. Overrides persist for the lifetime of the page.
|
|
2232
|
+
* Omitted keys keep their HuggingFace CDN defaults.
|
|
2233
|
+
*
|
|
2234
|
+
* Call this **once** at app startup, before constructing any pipelines.
|
|
2235
|
+
*
|
|
2236
|
+
* @example Self-host all models
|
|
2237
|
+
* ```typescript
|
|
2238
|
+
* configureModelUrls({
|
|
2239
|
+
* lam: 'https://cdn.example.com/models/model_fp16.onnx',
|
|
2240
|
+
* wav2arkitCpu: 'https://cdn.example.com/models/wav2arkit_cpu.onnx',
|
|
2241
|
+
* senseVoice: 'https://cdn.example.com/models/sensevoice.int8.onnx',
|
|
2242
|
+
* sileroVad: 'https://cdn.example.com/models/silero-vad.onnx',
|
|
2243
|
+
* });
|
|
2244
|
+
* ```
|
|
2245
|
+
*
|
|
2246
|
+
* @example Override only one model
|
|
2247
|
+
* ```typescript
|
|
2248
|
+
* configureModelUrls({
|
|
2249
|
+
* lam: '/models/model_fp16.onnx', // self-hosted, same origin
|
|
2250
|
+
* });
|
|
2251
|
+
* ```
|
|
2252
|
+
*/
|
|
2253
|
+
declare function configureModelUrls(urls: Partial<Record<ModelUrlKey, string>>): void;
|
|
2254
|
+
/**
|
|
2255
|
+
* Reset all model URL overrides back to HuggingFace CDN defaults.
|
|
2256
|
+
* Mainly useful for testing.
|
|
2257
|
+
*/
|
|
2258
|
+
declare function resetModelUrls(): void;
|
|
2259
|
+
/**
|
|
2260
|
+
* Get the immutable HuggingFace CDN URLs (ignoring any overrides).
|
|
2261
|
+
* Useful for documentation or fallback logic.
|
|
2262
|
+
*/
|
|
2263
|
+
declare const HF_CDN_URLS: Readonly<Record<ModelUrlKey, string>>;
|
|
2264
|
+
|
|
2041
2265
|
/**
|
|
2042
2266
|
* CPU-optimized lip sync inference using wav2arkit_cpu model
|
|
2043
2267
|
*
|
|
2044
|
-
* A Safari/iOS-compatible alternative to Wav2Vec2Inference (
|
|
2268
|
+
* A Safari/iOS-compatible alternative to Wav2Vec2Inference (192MB fp16) designed
|
|
2045
2269
|
* for platforms where WebGPU crashes due to ONNX Runtime JSEP bugs.
|
|
2046
2270
|
*
|
|
2047
2271
|
* The model uses ONNX external data format:
|
|
@@ -2128,41 +2352,33 @@ declare class Wav2ArkitCpuInference implements A2EBackend {
|
|
|
2128
2352
|
*
|
|
2129
2353
|
* Provides a unified API that automatically selects the optimal model:
|
|
2130
2354
|
* - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (404MB, WASM)
|
|
2131
|
-
* - Chrome/Firefox/Edge: Uses Wav2Vec2Inference (
|
|
2355
|
+
* - Chrome/Firefox/Edge: Uses Wav2Vec2Inference (192MB fp16, WebGPU)
|
|
2132
2356
|
* - Fallback: Gracefully falls back to CPU model if GPU model fails to load
|
|
2133
2357
|
*
|
|
2134
2358
|
* Why two separate models?
|
|
2135
2359
|
* Wav2Vec2 (LAM) cannot run on Safari/iOS for two reasons:
|
|
2136
2360
|
* 1. Its dual-head transformer graph needs ~750-950MB peak during ORT session
|
|
2137
2361
|
* creation (graph optimization), exceeding iOS WebKit's ~1-1.5GB tab limit.
|
|
2138
|
-
* 2. It ships as a single
|
|
2139
|
-
* ORT can consume it. iOS WebKit OOMs on this allocation.
|
|
2362
|
+
* 2. It ships as a single 192MB .onnx file (fp16) that must load into JS heap
|
|
2363
|
+
* before ORT can consume it. iOS WebKit OOMs on this allocation.
|
|
2140
2364
|
* wav2arkit_cpu solves both: external data format (1.86MB graph + 402MB weights)
|
|
2141
2365
|
* lets ORT load only the tiny graph, then stream weights via URL pass-through
|
|
2142
2366
|
* directly into WASM memory. JS heap stays at ~2MB.
|
|
2143
2367
|
*
|
|
2144
2368
|
* @category Inference
|
|
2145
2369
|
*
|
|
2146
|
-
* @example Auto-detect (recommended)
|
|
2370
|
+
* @example Auto-detect (recommended, zero-config)
|
|
2147
2371
|
* ```typescript
|
|
2148
2372
|
* import { createA2E } from '@omote/core';
|
|
2149
2373
|
*
|
|
2150
|
-
* const a2e = createA2E({
|
|
2151
|
-
* gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
|
|
2152
|
-
* cpuModelUrl: '/models/wav2arkit_cpu.onnx',
|
|
2153
|
-
* });
|
|
2154
|
-
*
|
|
2374
|
+
* const a2e = createA2E(); // uses HF CDN defaults (192MB fp16 GPU, 404MB CPU fallback)
|
|
2155
2375
|
* await a2e.load();
|
|
2156
2376
|
* const { blendshapes } = await a2e.infer(audioSamples);
|
|
2157
2377
|
* ```
|
|
2158
2378
|
*
|
|
2159
2379
|
* @example Force CPU model
|
|
2160
2380
|
* ```typescript
|
|
2161
|
-
* const a2e = createA2E({
|
|
2162
|
-
* gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
|
|
2163
|
-
* cpuModelUrl: '/models/wav2arkit_cpu.onnx',
|
|
2164
|
-
* mode: 'cpu',
|
|
2165
|
-
* });
|
|
2381
|
+
* const a2e = createA2E({ mode: 'cpu' });
|
|
2166
2382
|
* ```
|
|
2167
2383
|
*/
|
|
2168
2384
|
|
|
@@ -2170,8 +2386,8 @@ declare class Wav2ArkitCpuInference implements A2EBackend {
|
|
|
2170
2386
|
* Configuration for the A2E factory
|
|
2171
2387
|
*/
|
|
2172
2388
|
interface CreateA2EConfig {
|
|
2173
|
-
/** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge) */
|
|
2174
|
-
gpuModelUrl: string;
|
|
2389
|
+
/** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge). Default: HuggingFace CDN */
|
|
2390
|
+
gpuModelUrl?: string;
|
|
2175
2391
|
/**
|
|
2176
2392
|
* URL for GPU model external data file (.onnx.data weights).
|
|
2177
2393
|
* Default: `${gpuModelUrl}.data`
|
|
@@ -2179,8 +2395,8 @@ interface CreateA2EConfig {
|
|
|
2179
2395
|
* Set to `false` to skip external data loading (single-file models only).
|
|
2180
2396
|
*/
|
|
2181
2397
|
gpuExternalDataUrl?: string | false;
|
|
2182
|
-
/** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS) */
|
|
2183
|
-
cpuModelUrl: string;
|
|
2398
|
+
/** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS). Default: HuggingFace CDN */
|
|
2399
|
+
cpuModelUrl?: string;
|
|
2184
2400
|
/**
|
|
2185
2401
|
* Model selection mode:
|
|
2186
2402
|
* - 'auto': Safari/iOS -> CPU, everything else -> GPU (default)
|
|
@@ -2222,7 +2438,7 @@ interface CreateA2EConfig {
|
|
|
2222
2438
|
* @param config - Factory configuration
|
|
2223
2439
|
* @returns An A2EBackend instance (either GPU or CPU model)
|
|
2224
2440
|
*/
|
|
2225
|
-
declare function createA2E(config: CreateA2EConfig): A2EBackend;
|
|
2441
|
+
declare function createA2E(config?: CreateA2EConfig): A2EBackend;
|
|
2226
2442
|
|
|
2227
2443
|
/**
|
|
2228
2444
|
* A2EProcessor — Engine-agnostic audio-to-expression processor
|
|
@@ -2455,6 +2671,10 @@ declare class BlendshapeSmoother {
|
|
|
2455
2671
|
* drip-feeding to {@link A2EProcessor}. This class only handles mic capture
|
|
2456
2672
|
* (getUserMedia, ScriptProcessorNode, resampling).
|
|
2457
2673
|
*
|
|
2674
|
+
* @deprecated Use {@link MicLipSync} from `@omote/core` instead. MicLipSync provides
|
|
2675
|
+
* the same mic → A2E composition with proper MicrophoneCapture integration, VAD support,
|
|
2676
|
+
* ExpressionProfile scaling, and pause/resume. This class will be removed in a future version.
|
|
2677
|
+
*
|
|
2458
2678
|
* @category Inference
|
|
2459
2679
|
*/
|
|
2460
2680
|
|
|
@@ -3947,10 +4167,12 @@ declare class EmphasisDetector {
|
|
|
3947
4167
|
* breathing/postural sway, and simplex noise-driven brow drift.
|
|
3948
4168
|
*
|
|
3949
4169
|
* Research sources:
|
|
3950
|
-
* - Blink frequency:
|
|
4170
|
+
* - Blink frequency: log-normal IBI (mean=5.97s, SD(log)=0.89), PMC3565584
|
|
4171
|
+
* - Blink shape: asymmetric (92ms close, 242ms open, 3:1 ratio), PMC4043155
|
|
3951
4172
|
* - Saccade latency: ~200ms, duration 20-200ms
|
|
3952
4173
|
* - Microsaccades: ~1/second, amplitude 0.02-0.05, Scholarpedia
|
|
3953
4174
|
* - Fixation duration: 200-350ms, Nature Scientific Reports
|
|
4175
|
+
* - Conversational gaze: Kendon (1967), Argyle & Cook (1976)
|
|
3954
4176
|
* - Brow noise: NVIDIA Audio2Face, Unreal MetaHuman layered procedural animation
|
|
3955
4177
|
*
|
|
3956
4178
|
* @category Animation
|
|
@@ -3967,6 +4189,7 @@ declare class EmphasisDetector {
|
|
|
3967
4189
|
* eyeTargetY: normalizedY,
|
|
3968
4190
|
* audioEnergy: energy, // 0-1 from AudioEnergyAnalyzer
|
|
3969
4191
|
* isSpeaking: true,
|
|
4192
|
+
* state: 'speaking', // conversational state for gaze behavior
|
|
3970
4193
|
* });
|
|
3971
4194
|
*
|
|
3972
4195
|
* // Apply blendshapes to mesh
|
|
@@ -4005,6 +4228,8 @@ interface LifeLayerConfig {
|
|
|
4005
4228
|
/** Eye smoothing factor (higher = faster response). Default: 15 */
|
|
4006
4229
|
eyeSmoothing?: number;
|
|
4007
4230
|
}
|
|
4231
|
+
/** Conversational state for state-dependent gaze behavior */
|
|
4232
|
+
type ConversationalState = 'idle' | 'listening' | 'thinking' | 'speaking';
|
|
4008
4233
|
/**
|
|
4009
4234
|
* Per-frame input to the life layer
|
|
4010
4235
|
*/
|
|
@@ -4017,6 +4242,8 @@ interface LifeLayerInput {
|
|
|
4017
4242
|
audioEnergy?: number;
|
|
4018
4243
|
/** Whether avatar is speaking. Multiplies brow noise amplitude. */
|
|
4019
4244
|
isSpeaking?: boolean;
|
|
4245
|
+
/** Conversational state for gaze behavior (idle/listening/thinking/speaking) */
|
|
4246
|
+
state?: ConversationalState;
|
|
4020
4247
|
}
|
|
4021
4248
|
/**
|
|
4022
4249
|
* Per-frame output from the life layer
|
|
@@ -4038,6 +4265,7 @@ interface LifeLayerOutput {
|
|
|
4038
4265
|
*/
|
|
4039
4266
|
declare class ProceduralLifeLayer {
|
|
4040
4267
|
private blinkIntervalRange;
|
|
4268
|
+
private useLogNormalBlinks;
|
|
4041
4269
|
private gazeBreakIntervalRange;
|
|
4042
4270
|
private gazeBreakAmplitudeRange;
|
|
4043
4271
|
private eyeNoiseAmplitude;
|
|
@@ -4065,6 +4293,7 @@ declare class ProceduralLifeLayer {
|
|
|
4065
4293
|
private gazeBreakTargetY;
|
|
4066
4294
|
private gazeBreakCurrentX;
|
|
4067
4295
|
private gazeBreakCurrentY;
|
|
4296
|
+
private currentState;
|
|
4068
4297
|
private microMotionTime;
|
|
4069
4298
|
private breathingPhase;
|
|
4070
4299
|
private noiseTime;
|
|
@@ -4079,15 +4308,513 @@ declare class ProceduralLifeLayer {
|
|
|
4079
4308
|
* @returns Blendshape values and head rotation deltas
|
|
4080
4309
|
*/
|
|
4081
4310
|
update(delta: number, input?: LifeLayerInput): LifeLayerOutput;
|
|
4311
|
+
/**
|
|
4312
|
+
* Write life layer output directly to a Float32Array[52] in LAM_BLENDSHAPES order.
|
|
4313
|
+
*
|
|
4314
|
+
* Includes micro-jitter (0.4% amplitude simplex noise on all channels) to
|
|
4315
|
+
* break uncanny stillness on undriven channels.
|
|
4316
|
+
*
|
|
4317
|
+
* @param delta - Time since last frame in seconds
|
|
4318
|
+
* @param input - Per-frame input
|
|
4319
|
+
* @param out - Pre-allocated Float32Array(52) to write into
|
|
4320
|
+
*/
|
|
4321
|
+
updateToArray(delta: number, input: LifeLayerInput, out: Float32Array): void;
|
|
4082
4322
|
/**
|
|
4083
4323
|
* Reset all internal state to initial values.
|
|
4084
4324
|
*/
|
|
4085
4325
|
reset(): void;
|
|
4326
|
+
/**
|
|
4327
|
+
* Sample next blink interval.
|
|
4328
|
+
* Uses log-normal distribution (PMC3565584) when using default config,
|
|
4329
|
+
* or uniform random when custom blinkIntervalRange is provided.
|
|
4330
|
+
*/
|
|
4331
|
+
private nextBlinkInterval;
|
|
4086
4332
|
private updateBlinks;
|
|
4087
4333
|
private getBlinkValues;
|
|
4088
4334
|
private getEyeMicroMotion;
|
|
4335
|
+
/**
|
|
4336
|
+
* Get active gaze parameters — uses state-dependent params when
|
|
4337
|
+
* conversational state is provided, otherwise falls back to config ranges.
|
|
4338
|
+
*/
|
|
4339
|
+
private getActiveGazeParams;
|
|
4089
4340
|
private updateGazeBreaks;
|
|
4090
4341
|
private updateBrowNoise;
|
|
4091
4342
|
}
|
|
4092
4343
|
|
|
4093
|
-
|
|
4344
|
+
/**
|
|
4345
|
+
* FACS (Facial Action Coding System) to ARKit Blendshape Mapping
|
|
4346
|
+
*
|
|
4347
|
+
* Two static lookup tables that decompose emotions into FACS Action Units,
|
|
4348
|
+
* then map AUs to ARKit blendshapes. Based on Ekman's FACS research.
|
|
4349
|
+
*
|
|
4350
|
+
* @category Face
|
|
4351
|
+
*/
|
|
4352
|
+
|
|
4353
|
+
/**
|
|
4354
|
+
* A single FACS Action Unit activation within an emotion
|
|
4355
|
+
*/
|
|
4356
|
+
interface AUActivation {
|
|
4357
|
+
/** FACS Action Unit identifier (e.g. 'AU6', 'AU12') */
|
|
4358
|
+
au: string;
|
|
4359
|
+
/** Activation intensity 0-1 */
|
|
4360
|
+
intensity: number;
|
|
4361
|
+
/** Facial region: upper (brows/eyes/cheeks) or lower (mouth/jaw) */
|
|
4362
|
+
region: 'upper' | 'lower';
|
|
4363
|
+
}
|
|
4364
|
+
/**
|
|
4365
|
+
* Table 1: Emotion → FACS Action Units
|
|
4366
|
+
*
|
|
4367
|
+
* Maps each of the 10 SDK emotion channels to their FACS AU combinations
|
|
4368
|
+
* with intensity and upper/lower face region tags.
|
|
4369
|
+
*
|
|
4370
|
+
* Sources:
|
|
4371
|
+
* - Ekman & Friesen (1978) FACS Manual
|
|
4372
|
+
* - Ekman (2003) Emotions Revealed
|
|
4373
|
+
* - Lucey et al. (2010) Extended Cohn-Kanade dataset
|
|
4374
|
+
*/
|
|
4375
|
+
declare const EMOTION_TO_AU: Record<EmotionName, AUActivation[]>;
|
|
4376
|
+
/**
|
|
4377
|
+
* Table 2: FACS Action Unit → ARKit Blendshapes
|
|
4378
|
+
*
|
|
4379
|
+
* Maps each AU to one or more ARKit blendshape channels with weight.
|
|
4380
|
+
*
|
|
4381
|
+
* Sources:
|
|
4382
|
+
* - Apple ARKit face tracking documentation
|
|
4383
|
+
* - Melinda Ozel's ARKit-to-FACS cheat sheet
|
|
4384
|
+
*/
|
|
4385
|
+
declare const AU_TO_ARKIT: Record<string, {
|
|
4386
|
+
blendshape: string;
|
|
4387
|
+
weight: number;
|
|
4388
|
+
}[]>;
|
|
4389
|
+
/**
|
|
4390
|
+
* All AU identifiers referenced by EMOTION_TO_AU (for validation)
|
|
4391
|
+
*/
|
|
4392
|
+
declare const ALL_AUS: string[];
|
|
4393
|
+
|
|
4394
|
+
/**
|
|
4395
|
+
* EmotionResolver — Resolves EmotionWeights → split upper/lower face Float32Array[52]
|
|
4396
|
+
*
|
|
4397
|
+
* Uses FACS decomposition (EMOTION_TO_AU → AU_TO_ARKIT) to produce
|
|
4398
|
+
* anatomically correct blendshape contributions, split by facial region
|
|
4399
|
+
* for the FaceCompositor's modulation strategy:
|
|
4400
|
+
* - Upper face: additive overlay (independent of speech)
|
|
4401
|
+
* - Lower face: modulates speech output
|
|
4402
|
+
*
|
|
4403
|
+
* @category Face
|
|
4404
|
+
*/
|
|
4405
|
+
|
|
4406
|
+
/**
|
|
4407
|
+
* Resolved emotion split into upper and lower face contributions
|
|
4408
|
+
*/
|
|
4409
|
+
interface ResolvedEmotion {
|
|
4410
|
+
/** 52 channels — only upper face (brows, eyes, cheeks, nose) non-zero */
|
|
4411
|
+
upper: Float32Array;
|
|
4412
|
+
/** 52 channels — only lower face (mouth, jaw) non-zero */
|
|
4413
|
+
lower: Float32Array;
|
|
4414
|
+
}
|
|
4415
|
+
/**
|
|
4416
|
+
* Resolves EmotionWeights into upper/lower face blendshape arrays
|
|
4417
|
+
* using FACS Action Unit decomposition.
|
|
4418
|
+
*/
|
|
4419
|
+
declare class EmotionResolver {
|
|
4420
|
+
private readonly upperBuffer;
|
|
4421
|
+
private readonly lowerBuffer;
|
|
4422
|
+
/**
|
|
4423
|
+
* Resolve emotion weights to upper/lower face blendshape contributions.
|
|
4424
|
+
*
|
|
4425
|
+
* @param weights - Emotion channel weights from EmotionController
|
|
4426
|
+
* @param intensity - Global intensity multiplier (0-2). Default: 1.0
|
|
4427
|
+
* @returns Upper and lower face blendshape arrays (52 channels each)
|
|
4428
|
+
*/
|
|
4429
|
+
resolve(weights: EmotionWeights, intensity?: number): ResolvedEmotion;
|
|
4430
|
+
}
|
|
4431
|
+
|
|
4432
|
+
/**
|
|
4433
|
+
* FaceCompositor — 5-stage signal processing chain for facial animation
|
|
4434
|
+
*
|
|
4435
|
+
* Composes A2E lip sync, emotion modulation, procedural life, and character
|
|
4436
|
+
* profile into a single Float32Array[52] per frame.
|
|
4437
|
+
*
|
|
4438
|
+
* ```
|
|
4439
|
+
* BASE (A2E) → EMOTION MODULATION → PROCEDURAL LIFE → CHARACTER PROFILE → OUTPUT [0,1]
|
|
4440
|
+
* ```
|
|
4441
|
+
*
|
|
4442
|
+
* Replaces manual blendshape merging in consumer code with a single `compose()` call.
|
|
4443
|
+
*
|
|
4444
|
+
* @category Face
|
|
4445
|
+
*/
|
|
4446
|
+
|
|
4447
|
+
/**
|
|
4448
|
+
* Per-blendshape character profile (multiplier + offset)
|
|
4449
|
+
*
|
|
4450
|
+
* Superset of ExpressionProfile — gives per-channel control instead of per-group.
|
|
4451
|
+
*/
|
|
4452
|
+
interface CharacterProfile {
|
|
4453
|
+
/** Per-blendshape multiplier (default: all 1.0) */
|
|
4454
|
+
multiplier?: Partial<Record<string, number>>;
|
|
4455
|
+
/** Per-blendshape offset (default: all 0.0) */
|
|
4456
|
+
offset?: Partial<Record<string, number>>;
|
|
4457
|
+
}
|
|
4458
|
+
/**
|
|
4459
|
+
* Configuration for FaceCompositor
|
|
4460
|
+
*/
|
|
4461
|
+
interface FaceCompositorConfig {
|
|
4462
|
+
/** ProceduralLifeLayer instance (compositor creates default if omitted) */
|
|
4463
|
+
lifeLayer?: ProceduralLifeLayer;
|
|
4464
|
+
/** Character profile: per-BS multiplier + offset */
|
|
4465
|
+
profile?: CharacterProfile;
|
|
4466
|
+
/** Emotion smoothing factor per frame (0-1). Default: 0.12 */
|
|
4467
|
+
emotionSmoothing?: number;
|
|
4468
|
+
}
|
|
4469
|
+
/**
|
|
4470
|
+
* Per-frame input to the compositor
|
|
4471
|
+
*/
|
|
4472
|
+
interface FaceCompositorInput extends LifeLayerInput {
|
|
4473
|
+
/** Delta time in seconds */
|
|
4474
|
+
deltaTime: number;
|
|
4475
|
+
/** Current emotion weights (from EmotionController.emotion or manual) */
|
|
4476
|
+
emotion?: EmotionWeights;
|
|
4477
|
+
/** Emotion intensity multiplier (0-2). Default: 1.0 */
|
|
4478
|
+
emotionIntensity?: number;
|
|
4479
|
+
}
|
|
4480
|
+
/**
|
|
4481
|
+
* FaceCompositor — 5-stage facial animation signal chain.
|
|
4482
|
+
*
|
|
4483
|
+
* @example
|
|
4484
|
+
* ```typescript
|
|
4485
|
+
* import { FaceCompositor, createA2E } from '@omote/core';
|
|
4486
|
+
*
|
|
4487
|
+
* const compositor = new FaceCompositor();
|
|
4488
|
+
*
|
|
4489
|
+
* // In animation loop:
|
|
4490
|
+
* const output = compositor.compose(a2eFrame, {
|
|
4491
|
+
* deltaTime: 0.016,
|
|
4492
|
+
* emotion: { joy: 0.8 },
|
|
4493
|
+
* isSpeaking: true,
|
|
4494
|
+
* audioEnergy: 0.5,
|
|
4495
|
+
* });
|
|
4496
|
+
*
|
|
4497
|
+
* // Apply output[0..51] to avatar morphTargetInfluences
|
|
4498
|
+
* ```
|
|
4499
|
+
*/
|
|
4500
|
+
declare class FaceCompositor {
|
|
4501
|
+
private readonly emotionResolver;
|
|
4502
|
+
private readonly lifeLayer;
|
|
4503
|
+
private readonly emotionSmoothing;
|
|
4504
|
+
private readonly smoothedUpper;
|
|
4505
|
+
private readonly smoothedLower;
|
|
4506
|
+
private readonly lifeBuffer;
|
|
4507
|
+
private readonly multiplier;
|
|
4508
|
+
private readonly offset;
|
|
4509
|
+
private stickyEmotion;
|
|
4510
|
+
constructor(config?: FaceCompositorConfig);
|
|
4511
|
+
/**
|
|
4512
|
+
* Compose a single output frame from the 5-stage signal chain.
|
|
4513
|
+
*
|
|
4514
|
+
* @param base - A2E raw output (Float32Array[52], LAM_BLENDSHAPES order)
|
|
4515
|
+
* @param input - Per-frame input (deltaTime, emotion, life layer params)
|
|
4516
|
+
* @returns Float32Array[52] with all values clamped to [0, 1]
|
|
4517
|
+
*/
|
|
4518
|
+
compose(base: Float32Array, input: FaceCompositorInput): Float32Array;
|
|
4519
|
+
/**
|
|
4520
|
+
* Set sticky emotion (used when input.emotion is not provided).
|
|
4521
|
+
*/
|
|
4522
|
+
setEmotion(weights: EmotionWeights): void;
|
|
4523
|
+
/**
|
|
4524
|
+
* Update character profile at runtime.
|
|
4525
|
+
*/
|
|
4526
|
+
setProfile(profile: CharacterProfile): void;
|
|
4527
|
+
/**
|
|
4528
|
+
* Reset all smoothing state and life layer.
|
|
4529
|
+
*/
|
|
4530
|
+
reset(): void;
|
|
4531
|
+
/** Expand partial profile maps into dense Float32Arrays */
|
|
4532
|
+
private applyProfileArrays;
|
|
4533
|
+
}
|
|
4534
|
+
|
|
4535
|
+
/**
|
|
4536
|
+
* MicLipSync - Microphone → VAD → A2E → blendshapes
|
|
4537
|
+
*
|
|
4538
|
+
* Simple composition class for live mic lip sync ("mirror mode").
|
|
4539
|
+
* Replaces A2EOrchestrator with proper MicrophoneCapture integration.
|
|
4540
|
+
*
|
|
4541
|
+
* @category Orchestration
|
|
4542
|
+
*/
|
|
4543
|
+
|
|
4544
|
+
type MicLipSyncState = 'idle' | 'active' | 'paused';
|
|
4545
|
+
interface MicLipSyncConfig {
|
|
4546
|
+
/** A2E inference backend (from createA2E) — required */
|
|
4547
|
+
lam: A2EBackend;
|
|
4548
|
+
/** VAD backend for speech boundary detection (optional) */
|
|
4549
|
+
vad?: SileroVADBackend;
|
|
4550
|
+
/** Mic sample rate (default: 16000) */
|
|
4551
|
+
sampleRate?: number;
|
|
4552
|
+
/** Mic chunk size in samples (default: 512, required by Silero VAD) */
|
|
4553
|
+
micChunkSize?: number;
|
|
4554
|
+
/** Per-character expression weight scaling */
|
|
4555
|
+
profile?: ExpressionProfile;
|
|
4556
|
+
/** Identity/style index for Wav2Vec2 (default: 0) */
|
|
4557
|
+
identityIndex?: number;
|
|
4558
|
+
}
|
|
4559
|
+
interface MicLipSyncFrame {
|
|
4560
|
+
blendshapes: Float32Array;
|
|
4561
|
+
rawBlendshapes: Float32Array;
|
|
4562
|
+
}
|
|
4563
|
+
interface MicLipSyncEvents {
|
|
4564
|
+
/** New blendshape frame ready */
|
|
4565
|
+
'frame': MicLipSyncFrame;
|
|
4566
|
+
/** Speech started (VAD) */
|
|
4567
|
+
'speech:start': void;
|
|
4568
|
+
/** Speech ended (VAD) */
|
|
4569
|
+
'speech:end': {
|
|
4570
|
+
durationMs: number;
|
|
4571
|
+
};
|
|
4572
|
+
/** Microphone started */
|
|
4573
|
+
'mic:start': void;
|
|
4574
|
+
/** Microphone stopped */
|
|
4575
|
+
'mic:stop': void;
|
|
4576
|
+
/** Audio level update */
|
|
4577
|
+
'audio:level': {
|
|
4578
|
+
rms: number;
|
|
4579
|
+
peak: number;
|
|
4580
|
+
};
|
|
4581
|
+
/** State changed */
|
|
4582
|
+
'state': MicLipSyncState;
|
|
4583
|
+
/** Error occurred */
|
|
4584
|
+
'error': Error;
|
|
4585
|
+
[key: string]: unknown;
|
|
4586
|
+
}
|
|
4587
|
+
declare class MicLipSync extends EventEmitter<MicLipSyncEvents> {
|
|
4588
|
+
private omoteEvents;
|
|
4589
|
+
private mic;
|
|
4590
|
+
private processor;
|
|
4591
|
+
private vad?;
|
|
4592
|
+
private _state;
|
|
4593
|
+
private _isSpeaking;
|
|
4594
|
+
private _currentFrame;
|
|
4595
|
+
private _currentRawFrame;
|
|
4596
|
+
private profile;
|
|
4597
|
+
private speechStartTime;
|
|
4598
|
+
private vadChunkSize;
|
|
4599
|
+
private vadBuffer;
|
|
4600
|
+
private vadBufferOffset;
|
|
4601
|
+
/** Current state */
|
|
4602
|
+
get state(): MicLipSyncState;
|
|
4603
|
+
/** Latest blendshape frame (null before first inference) */
|
|
4604
|
+
get currentFrame(): Float32Array | null;
|
|
4605
|
+
/** Whether speech is currently detected (requires VAD) */
|
|
4606
|
+
get isSpeaking(): boolean;
|
|
4607
|
+
/** Current backend type */
|
|
4608
|
+
get backend(): string | null;
|
|
4609
|
+
constructor(config: MicLipSyncConfig);
|
|
4610
|
+
/** Start microphone capture and inference loop */
|
|
4611
|
+
start(): Promise<void>;
|
|
4612
|
+
/** Stop microphone and inference */
|
|
4613
|
+
stop(): void;
|
|
4614
|
+
/** Pause inference (mic stays open for faster resume) */
|
|
4615
|
+
pause(): void;
|
|
4616
|
+
/** Resume inference after pause */
|
|
4617
|
+
resume(): void;
|
|
4618
|
+
/** Update ExpressionProfile at runtime */
|
|
4619
|
+
setProfile(profile: ExpressionProfile): void;
|
|
4620
|
+
/** Dispose of all resources */
|
|
4621
|
+
dispose(): Promise<void>;
|
|
4622
|
+
private processVAD;
|
|
4623
|
+
private setState;
|
|
4624
|
+
}
|
|
4625
|
+
|
|
4626
|
+
/**
|
|
4627
|
+
* Shared types for orchestration layer
|
|
4628
|
+
*
|
|
4629
|
+
* @category Orchestration
|
|
4630
|
+
*/
|
|
4631
|
+
|
|
4632
|
+
type VoicePipelineState = 'idle' | 'loading' | 'ready' | 'listening' | 'thinking' | 'speaking' | 'error';
|
|
4633
|
+
interface LoadingProgress {
|
|
4634
|
+
currentModel: string;
|
|
4635
|
+
progress: number;
|
|
4636
|
+
totalModels: number;
|
|
4637
|
+
modelsLoaded: number;
|
|
4638
|
+
}
|
|
4639
|
+
interface TranscriptResult {
|
|
4640
|
+
text: string;
|
|
4641
|
+
emotion?: string;
|
|
4642
|
+
language?: string;
|
|
4643
|
+
event?: string;
|
|
4644
|
+
isFinal: boolean;
|
|
4645
|
+
inferenceTimeMs?: number;
|
|
4646
|
+
}
|
|
4647
|
+
/**
|
|
4648
|
+
* Consumer's response handler. VoicePipeline calls this with transcribed text.
|
|
4649
|
+
* Consumer must stream audio back for playback + lip sync.
|
|
4650
|
+
*/
|
|
4651
|
+
interface ResponseHandler {
|
|
4652
|
+
(params: {
|
|
4653
|
+
text: string;
|
|
4654
|
+
emotion?: string;
|
|
4655
|
+
event?: string;
|
|
4656
|
+
/** Stream audio chunks to pipeline for playback + lip sync */
|
|
4657
|
+
send: (chunk: Uint8Array) => Promise<void>;
|
|
4658
|
+
/** Call when all audio has been sent */
|
|
4659
|
+
done: () => Promise<void>;
|
|
4660
|
+
/** Aborted on interruption or stop() */
|
|
4661
|
+
signal: AbortSignal;
|
|
4662
|
+
/** Session ID for backend correlation */
|
|
4663
|
+
sessionId: string;
|
|
4664
|
+
}): Promise<void>;
|
|
4665
|
+
}
|
|
4666
|
+
|
|
4667
|
+
/**
|
|
4668
|
+
* VoicePipeline - Full conversational agent loop
|
|
4669
|
+
*
|
|
4670
|
+
* Composes: MicrophoneCapture → SileroVAD → SenseVoice ASR → PlaybackPipeline (A2E)
|
|
4671
|
+
*
|
|
4672
|
+
* State machine: idle → loading → ready → listening → thinking → speaking → listening → ...
|
|
4673
|
+
*
|
|
4674
|
+
* The consumer provides an `onResponse` callback that receives transcribed text
|
|
4675
|
+
* and streams audio back for playback + lip sync. VoicePipeline is backend-agnostic.
|
|
4676
|
+
*
|
|
4677
|
+
* @category Orchestration
|
|
4678
|
+
*/
|
|
4679
|
+
|
|
4680
|
+
interface VoicePipelineConfig {
|
|
4681
|
+
/** URLs and options for model loading */
|
|
4682
|
+
models: {
|
|
4683
|
+
senseVoice: {
|
|
4684
|
+
modelUrl: string;
|
|
4685
|
+
tokensUrl?: string;
|
|
4686
|
+
language?: string;
|
|
4687
|
+
};
|
|
4688
|
+
lam: {
|
|
4689
|
+
gpuModelUrl: string;
|
|
4690
|
+
gpuExternalDataUrl?: string | false;
|
|
4691
|
+
cpuModelUrl: string;
|
|
4692
|
+
mode?: 'auto' | 'gpu' | 'cpu';
|
|
4693
|
+
};
|
|
4694
|
+
vad: {
|
|
4695
|
+
modelUrl: string;
|
|
4696
|
+
threshold?: number;
|
|
4697
|
+
preSpeechBufferChunks?: number;
|
|
4698
|
+
};
|
|
4699
|
+
};
|
|
4700
|
+
/** Consumer's response handler */
|
|
4701
|
+
onResponse: ResponseHandler;
|
|
4702
|
+
/** Per-character expression weight scaling */
|
|
4703
|
+
profile?: ExpressionProfile;
|
|
4704
|
+
/** Identity/style index for Wav2Vec2 (default: 0) */
|
|
4705
|
+
identityIndex?: number;
|
|
4706
|
+
/** LAM load timeout in ms — CPU fallback on timeout (default: 30000) */
|
|
4707
|
+
lamLoadTimeoutMs?: number;
|
|
4708
|
+
/** Base silence timeout in ms (default: 500) */
|
|
4709
|
+
silenceTimeoutMs?: number;
|
|
4710
|
+
/** Extended silence timeout for long utterances (default: 700) */
|
|
4711
|
+
silenceTimeoutExtendedMs?: number;
|
|
4712
|
+
/** Enable adaptive timeout based on speech duration (default: true) */
|
|
4713
|
+
adaptiveTimeout?: boolean;
|
|
4714
|
+
/** Minimum audio duration in seconds (default: 0.3) */
|
|
4715
|
+
minAudioDurationSec?: number;
|
|
4716
|
+
/** Minimum audio energy (default: 0.02) */
|
|
4717
|
+
minAudioEnergy?: number;
|
|
4718
|
+
/** Enable audio normalization for quiet audio (default: true) */
|
|
4719
|
+
normalizeAudio?: boolean;
|
|
4720
|
+
/** Progressive transcription interval — desktop (default: 500ms) */
|
|
4721
|
+
progressiveIntervalMs?: number;
|
|
4722
|
+
/** Progressive transcription interval — iOS (default: 800ms) */
|
|
4723
|
+
progressiveIntervalIosMs?: number;
|
|
4724
|
+
/** Coverage threshold to use progressive result (default: 0.8) */
|
|
4725
|
+
progressiveCoverageThreshold?: number;
|
|
4726
|
+
/** Minimum samples before progressive transcription starts (default: 8000) */
|
|
4727
|
+
progressiveMinSamples?: number;
|
|
4728
|
+
/** Timeout for individual SenseVoice.transcribe() calls (default: 10000ms) */
|
|
4729
|
+
transcriptionTimeoutMs?: number;
|
|
4730
|
+
/** Enable barge-in detection (default: true) */
|
|
4731
|
+
interruptionEnabled?: boolean;
|
|
4732
|
+
/** Minimum speech duration for interruption (default: 200ms) */
|
|
4733
|
+
interruptionMinSpeechMs?: number;
|
|
4734
|
+
/** Audio playback delay (default: auto-detected) */
|
|
4735
|
+
audioDelayMs?: number;
|
|
4736
|
+
/** Coalescer target duration (default: 200ms) */
|
|
4737
|
+
chunkTargetMs?: number;
|
|
4738
|
+
/** Enable neutral transition on playback complete (default: true) */
|
|
4739
|
+
neutralTransitionEnabled?: boolean;
|
|
4740
|
+
/** Duration of neutral fade-out (default: 250ms) */
|
|
4741
|
+
neutralTransitionMs?: number;
|
|
4742
|
+
}
|
|
4743
|
+
interface VoicePipelineEvents {
|
|
4744
|
+
'state': VoicePipelineState;
|
|
4745
|
+
'loading:progress': LoadingProgress;
|
|
4746
|
+
'transcript': TranscriptResult;
|
|
4747
|
+
'frame': FullFaceFrame;
|
|
4748
|
+
'frame:raw': Float32Array;
|
|
4749
|
+
'speech:start': void;
|
|
4750
|
+
'speech:end': {
|
|
4751
|
+
durationMs: number;
|
|
4752
|
+
};
|
|
4753
|
+
'playback:start': {
|
|
4754
|
+
time: number;
|
|
4755
|
+
};
|
|
4756
|
+
'playback:complete': void;
|
|
4757
|
+
'interruption': void;
|
|
4758
|
+
'audio:level': {
|
|
4759
|
+
rms: number;
|
|
4760
|
+
peak: number;
|
|
4761
|
+
};
|
|
4762
|
+
'error': Error;
|
|
4763
|
+
[key: string]: unknown;
|
|
4764
|
+
}
|
|
4765
|
+
declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
|
|
4766
|
+
private readonly config;
|
|
4767
|
+
private _state;
|
|
4768
|
+
private stopped;
|
|
4769
|
+
private epoch;
|
|
4770
|
+
private _sessionId;
|
|
4771
|
+
private asr;
|
|
4772
|
+
private lam;
|
|
4773
|
+
private vad;
|
|
4774
|
+
private unifiedWorker;
|
|
4775
|
+
private playback;
|
|
4776
|
+
private interruption;
|
|
4777
|
+
private omoteEvents;
|
|
4778
|
+
private mic;
|
|
4779
|
+
private audioBuffer;
|
|
4780
|
+
private audioBufferSamples;
|
|
4781
|
+
private speechStartTime;
|
|
4782
|
+
private silenceTimer;
|
|
4783
|
+
private isSpeaking;
|
|
4784
|
+
private progressiveTimer;
|
|
4785
|
+
private progressivePromise;
|
|
4786
|
+
private lastProgressiveResult;
|
|
4787
|
+
private lastProgressiveSamples;
|
|
4788
|
+
private asrErrorCount;
|
|
4789
|
+
private responseAbortController;
|
|
4790
|
+
private _currentFrame;
|
|
4791
|
+
/** Current pipeline state */
|
|
4792
|
+
get state(): VoicePipelineState;
|
|
4793
|
+
/** Latest blendshape frame */
|
|
4794
|
+
get currentFrame(): Float32Array | null;
|
|
4795
|
+
/** Whether user is currently speaking */
|
|
4796
|
+
get isSpeechActive(): boolean;
|
|
4797
|
+
/** Session ID (generated on start(), null before) */
|
|
4798
|
+
get sessionId(): string | null;
|
|
4799
|
+
constructor(config: VoicePipelineConfig);
|
|
4800
|
+
loadModels(): Promise<void>;
|
|
4801
|
+
start(): Promise<void>;
|
|
4802
|
+
stop(): void;
|
|
4803
|
+
setProfile(profile: ExpressionProfile): void;
|
|
4804
|
+
dispose(): Promise<void>;
|
|
4805
|
+
private processAudioChunk;
|
|
4806
|
+
private getSilenceTimeout;
|
|
4807
|
+
private onSilenceDetected;
|
|
4808
|
+
private processEndOfSpeech;
|
|
4809
|
+
private callResponseHandler;
|
|
4810
|
+
private handleInterruption;
|
|
4811
|
+
private startProgressiveTranscription;
|
|
4812
|
+
private stopProgressiveTranscription;
|
|
4813
|
+
private transcribeWithTimeout;
|
|
4814
|
+
private normalizeAudio;
|
|
4815
|
+
private setState;
|
|
4816
|
+
private emitProgress;
|
|
4817
|
+
private clearSilenceTimer;
|
|
4818
|
+
}
|
|
4819
|
+
|
|
4820
|
+
export { type A2EBackend, type A2EModelInfo, A2EOrchestrator, type A2EOrchestratorConfig, A2EProcessor, type A2EProcessorConfig, type A2EProgressEvent, type A2EResult, ALL_AUS, ARKIT_BLENDSHAPES, type AUActivation, AU_TO_ARKIT, type ActiveSpan, type AnimationClip, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, type CharacterProfile, ConsoleExporter, type ConversationalState, type CreateA2EConfig, type CreateSenseVoiceConfig, DEFAULT_ANIMATION_CONFIG, DEFAULT_MODEL_URLS, EMOTION_NAMES, EMOTION_TO_AU, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionResolver, type EmotionWeights, EmphasisDetector, EventEmitter, type ExpressionProfile, FaceCompositor, type FaceCompositorConfig, type FaceCompositorInput, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, HF_CDN_URLS, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LoadingProgress, MODEL_LOAD_TIME_BUCKETS, type MetricData, MetricNames, MicLipSync, type MicLipSyncConfig, type MicLipSyncEvents, type MicLipSyncFrame, type MicLipSyncState, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, type ModelUrlKey, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, PlaybackPipeline, type 
PlaybackPipelineConfig, type PlaybackPipelineEvents, type PlaybackState, ProceduralLifeLayer, type QuotaInfo, type ResolvedEmotion, type ResponseHandler, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TranscriptResult, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, VoicePipeline, type VoicePipelineConfig, type VoicePipelineEvents, type VoicePipelineState, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2ArkitCpuUnifiedAdapter, Wav2ArkitCpuWorker, type Wav2ArkitCpuWorkerConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, applyProfile, blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureModelUrls, configureTelemetry, createA2E, createEmotionVector, createSenseVoice, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getRecommendedBackend, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, preloadModels, resetModelUrls, resolveBackend, shouldEnableWasmProxy, shouldUseCpuA2E, shouldUseNativeASR, shouldUseServerA2E, supportsVADWorker };
|