@omote/core 0.6.4 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,8 +1,60 @@
1
1
  import { EventEmitter, OmoteEvents } from './events/index.js';
2
2
  export { AnimationEvent, BackendEvent, EmotionEvent, GazeEvent, STTFinalEvent, STTPartialEvent, SessionStateEvent, TTSEndEvent, TTSMarkEvent, TTSStartEvent, VisemeEvent } from './events/index.js';
3
- export { D as DEFAULT_LOGGING_CONFIG, I as ILogger, e as LOG_LEVEL_PRIORITY, b as LogEntry, L as LogFormatter, a as LogLevel, c as LogSink, d as LoggingConfig, f as configureLogging, i as createLogger, g as getLoggingConfig, n as noopLogger, r as resetLoggingConfig, s as setLogLevel, h as setLoggingEnabled } from './Logger-I_k4sGhM.js';
3
+ export { D as DEFAULT_LOGGING_CONFIG, I as ILogger, a as LOG_LEVEL_PRIORITY, b as LogEntry, L as LogFormatter, c as LogLevel, d as LogSink, e as LoggingConfig, g as configureLogging, h as createLogger, i as getLoggingConfig, n as noopLogger, r as resetLoggingConfig, s as setLogLevel, k as setLoggingEnabled } from './Logger-DSoGAYJu.js';
4
4
  export { ARKitToFLAMEMapping, ApiError, AudioChunkEvent, AvatarFormat, Character, CharacterAvatar, CharacterMemory, CharacterPersonality, CharacterSpec, CharacterVoice, CreateCharacterRequest, CreateCharacterResponse, CreateLAMJobRequest, CreateLAMJobResponse, CreateSessionRequest, CreateSessionResponse, GSplatConfig, LAMJob, LAMJobStatus, PROTOCOL_VERSION, PaginatedResponse, PlatformSession, ErrorEvent as ProtocolErrorEvent, ProtocolEvent, ResponseChunkEvent, ResponseEndEvent, ResponseStartEvent, SessionMessage, SessionStatus, isProtocolEvent } from '@omote/types';
5
5
 
6
+ /**
7
+ * Audio format conversion utilities
8
+ *
9
+ * Bridges the gap between TTS engines (Float32 at various sample rates)
10
+ * and playback pipelines (Uint8Array PCM16 at 16kHz).
11
+ *
12
+ * @module audio/audioConvert
13
+ */
14
+ /**
15
+ * Convert Float32 [-1,1] samples to PCM16 Uint8Array (little-endian).
16
+ *
17
+ * @param samples - Float32Array of normalized audio samples
18
+ * @returns Uint8Array of PCM16 bytes (2 bytes per sample, little-endian)
19
+ */
20
+ declare function float32ToPcm16(samples: Float32Array): Uint8Array;
21
+ /**
22
+ * Linear interpolation resampler.
23
+ * Good enough for speech (no sinc filtering needed).
24
+ *
25
+ * @param samples - Input audio samples
26
+ * @param fromRate - Source sample rate (e.g., 24000)
27
+ * @param toRate - Target sample rate (e.g., 16000)
28
+ * @returns Resampled Float32Array
29
+ */
30
+ declare function resampleLinear(samples: Float32Array, fromRate: number, toRate: number): Float32Array;
31
+ /**
32
+ * Convenience: resample + encode in one call.
33
+ * Converts TTS output (Float32 at TTS rate) to pipeline format (PCM16 Uint8Array at 16kHz).
34
+ *
35
+ * @param audio - Float32Array from TTS engine
36
+ * @param sourceRate - TTS engine's output sample rate (default: 24000)
37
+ * @param targetRate - Pipeline's expected sample rate (default: 16000)
38
+ * @returns Uint8Array PCM16 at target rate
39
+ */
40
+ declare function ttsToPlaybackFormat(audio: Float32Array, sourceRate?: number, targetRate?: number): Uint8Array;
41
+
42
+ /**
43
+ * Shared audio utility functions
44
+ *
45
+ * @module audio
46
+ */
47
+ /**
48
+ * Safely convert an ArrayBuffer of PCM16 bytes to Float32 samples.
49
+ * Handles odd-length buffers by truncating to the nearest even byte boundary.
50
+ */
51
+ declare function pcm16ToFloat32(buffer: ArrayBuffer): Float32Array;
52
+ /**
53
+ * Convert Int16Array samples to Float32Array.
54
+ * Each sample is divided by 32768 to normalize to [-1, 1] range.
55
+ */
56
+ declare function int16ToFloat32(int16: Int16Array): Float32Array;
57
+
6
58
  /**
7
59
  * Microphone capture - renderer-agnostic audio input
8
60
  *
@@ -540,6 +592,147 @@ declare const BLENDSHAPE_TO_GROUP: Map<string, BlendshapeGroup>;
540
592
  */
541
593
  declare function applyProfile(raw: Float32Array, profile: ExpressionProfile): Float32Array;
542
594
 
595
+ /**
596
+ * PlaybackPipeline - Audio playback + A2E lip sync with ExpressionProfile scaling
597
+ *
598
+ * Refactored superset of FullFacePipeline. Adds:
599
+ * - Sync mode (`feedBuffer`) for pre-recorded audio
600
+ * - State tracking (idle → playing → stopping)
601
+ * - Opt-in neutral transition animation on playback complete
602
+ * - Idempotent `start()` (no spurious playback:complete on restart)
603
+ *
604
+ * @category Audio
605
+ */
606
+
607
+ type PlaybackState = 'idle' | 'playing' | 'stopping';
608
+ interface PlaybackPipelineConfig {
609
+ /** A2E inference backend (from createA2E) */
610
+ lam: A2EBackend;
611
+ /** Sample rate in Hz (default: 16000) */
612
+ sampleRate?: number;
613
+ /** Target chunk duration for coalescing in ms (default: 200) */
614
+ chunkTargetMs?: number;
615
+ /** Audio playback delay in ms (default: auto-detected from backend) */
616
+ audioDelayMs?: number;
617
+ /** A2E inference chunk size in samples (default: 16000) */
618
+ chunkSize?: number;
619
+ /** Identity/style index for Wav2Vec2 (default: 0) */
620
+ identityIndex?: number;
621
+ /** Per-character expression weight scaling */
622
+ profile?: ExpressionProfile;
623
+ /** Enable neutral transition on playback complete (default: false) */
624
+ neutralTransitionEnabled?: boolean;
625
+ /** Duration of neutral fade-out in ms (default: 250). Only applies when neutralTransitionEnabled=true. */
626
+ neutralTransitionMs?: number;
627
+ /** Stale frame warning threshold in ms (default: 2000) */
628
+ staleThresholdMs?: number;
629
+ }
630
+ /**
631
+ * Full face frame with scaled blendshapes
632
+ */
633
+ interface FullFaceFrame {
634
+ /** Scaled 52 ARKit blendshapes (ExpressionProfile applied) */
635
+ blendshapes: Float32Array;
636
+ /** Raw A2E output (52 blendshapes, before profile scaling) */
637
+ rawBlendshapes: Float32Array;
638
+ /** AudioContext timestamp for this frame */
639
+ timestamp: number;
640
+ }
641
+ interface PlaybackPipelineEvents {
642
+ /** New frame ready for display (scaled by ExpressionProfile) */
643
+ 'frame': FullFaceFrame;
644
+ /** Raw A2E frame (before profile scaling) */
645
+ 'frame:raw': Float32Array;
646
+ /** Playback started (first audio scheduled) */
647
+ 'playback:start': {
648
+ time: number;
649
+ };
650
+ /** Playback completed naturally */
651
+ 'playback:complete': void;
652
+ /** Playback stopped (user-initiated) */
653
+ 'playback:stop': void;
654
+ /** Error occurred */
655
+ 'error': Error;
656
+ /** State changed */
657
+ 'state': PlaybackState;
658
+ 'full_frame_ready': FullFaceFrame;
659
+ 'lam_frame_ready': Float32Array;
660
+ 'playback_complete': void;
661
+ 'playback_start': number;
662
+ [key: string]: unknown;
663
+ }
664
+ declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
665
+ private readonly config;
666
+ private scheduler;
667
+ private coalescer;
668
+ private processor;
669
+ private readonly sampleRate;
670
+ private _state;
671
+ private playbackStarted;
672
+ private monitorInterval;
673
+ private frameAnimationId;
674
+ private lastNewFrameTime;
675
+ private lastKnownLamFrame;
676
+ private staleWarningEmitted;
677
+ private readonly staleThresholdMs;
678
+ private frameLoopCount;
679
+ private profile;
680
+ private readonly neutralTransitionEnabled;
681
+ private readonly neutralTransitionMs;
682
+ private neutralTransitionFrame;
683
+ private neutralTransitionStart;
684
+ private neutralAnimationId;
685
+ private _currentFrame;
686
+ private _currentRawFrame;
687
+ /** Current pipeline state */
688
+ get state(): PlaybackState;
689
+ /** Current scaled blendshapes (updated in-place for perf) */
690
+ get currentFrame(): Float32Array | null;
691
+ /** Raw A2E blendshapes (before profile scaling) */
692
+ get currentRawFrame(): Float32Array | null;
693
+ constructor(config: PlaybackPipelineConfig);
694
+ /** Initialize AudioContext (lazy, call after user gesture) */
695
+ initialize(): Promise<void>;
696
+ /** Update ExpressionProfile at runtime */
697
+ setProfile(profile: ExpressionProfile): void;
698
+ /**
699
+ * Start a new playback session.
700
+ * Idempotent — calling during playback resets cleanly without emitting
701
+ * spurious playback:complete.
702
+ */
703
+ start(): void;
704
+ /** Feed a streaming audio chunk (PCM16 Uint8Array) */
705
+ onAudioChunk(chunk: Uint8Array): Promise<void>;
706
+ /** Signal end of audio stream (flushes remaining audio) */
707
+ end(): Promise<void>;
708
+ /**
709
+ * Feed a complete audio buffer. Chunks into 200ms pieces, schedules each
710
+ * for playback, runs A2E inference, then waits for completion.
711
+ */
712
+ feedBuffer(audio: ArrayBuffer | Float32Array): Promise<void>;
713
+ /** Stop playback immediately with fade-out */
714
+ stop(fadeOutMs?: number): Promise<void>;
715
+ /** Cleanup all resources */
716
+ dispose(): void;
717
+ /** Get pipeline debug state */
718
+ getDebugState(): {
719
+ state: PlaybackState;
720
+ playbackStarted: boolean;
721
+ coalescerFill: number;
722
+ processorFill: number;
723
+ queuedFrames: number;
724
+ currentTime: number;
725
+ playbackEndTime: number;
726
+ };
727
+ private startFrameLoop;
728
+ private startMonitoring;
729
+ private onPlaybackComplete;
730
+ private startNeutralTransition;
731
+ private cancelNeutralTransition;
732
+ private stopInternal;
733
+ private setState;
734
+ }
735
+
543
736
  /**
544
737
  * FullFacePipeline - A2E expression pipeline with ExpressionProfile weight scaling
545
738
  *
@@ -624,23 +817,12 @@ interface FullFacePipelineOptions {
624
817
  */
625
818
  staleThresholdMs?: number;
626
819
  }
627
- /**
628
- * Full face frame with scaled blendshapes
629
- */
630
- interface FullFaceFrame$1 {
631
- /** Scaled 52 ARKit blendshapes (ExpressionProfile applied) */
632
- blendshapes: Float32Array;
633
- /** Raw A2E output (52 blendshapes, before profile scaling) */
634
- rawBlendshapes: Float32Array;
635
- /** AudioContext timestamp for this frame */
636
- timestamp: number;
637
- }
638
820
  /**
639
821
  * Events emitted by FullFacePipeline
640
822
  */
641
823
  interface FullFacePipelineEvents {
642
824
  /** New merged frame ready for display */
643
- full_frame_ready: FullFaceFrame$1;
825
+ full_frame_ready: FullFaceFrame;
644
826
  /** Raw LAM frame ready (for debugging/monitoring) */
645
827
  lam_frame_ready: Float32Array;
646
828
  /** Playback has completed */
@@ -747,144 +929,131 @@ declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
747
929
  }
748
930
 
749
931
  /**
750
- * PlaybackPipeline - Audio playback + A2E lip sync with ExpressionProfile scaling
932
+ * TTSBackend — Streaming text-to-speech backend interface.
751
933
  *
752
- * Refactored superset of FullFacePipeline. Adds:
753
- * - Sync mode (`feedBuffer`) for pre-recorded audio
754
- * - State tracking (idle → playing → stopping)
755
- * - Opt-in neutral transition animation on playback complete
756
- * - Idempotent `start()` (no spurious playback:complete on restart)
934
+ * Any TTS engine (Kokoro, ElevenLabs, etc.) can implement this contract
935
+ * to integrate with TTSPlayback and VoicePipeline.
936
+ *
937
+ * @category Inference
938
+ */
939
+ /**
940
+ * Streaming TTS backend interface.
941
+ *
942
+ * Implementations must provide:
943
+ * - `stream()` for sentence-by-sentence audio generation
944
+ * - `sampleRate` for format conversion
945
+ * - `load()` for model initialization
946
+ *
947
+ * @example
948
+ * ```typescript
949
+ * const kokoro: TTSBackend = new KokoroTTSInference({ defaultVoice: 'af_heart' });
950
+ * await kokoro.load();
951
+ *
952
+ * for await (const chunk of kokoro.stream("Hello world!", { voice: 'af_heart' })) {
953
+ * // chunk.audio is Float32Array at kokoro.sampleRate
954
+ * }
955
+ * ```
956
+ */
957
+ interface TTSBackend {
958
+ /** Stream audio chunks for given text. Each chunk: Float32Array at engine's native rate. */
959
+ stream(text: string, options?: TTSStreamOptions): AsyncGenerator<TTSChunk>;
960
+ /** Engine's native output sample rate (e.g., 24000 for Kokoro). */
961
+ readonly sampleRate: number;
962
+ /** Load model if not already loaded. */
963
+ load(): Promise<unknown>;
964
+ /** Whether model is loaded and ready. */
965
+ readonly isLoaded: boolean;
966
+ /** Release resources. */
967
+ dispose(): Promise<void>;
968
+ }
969
+ /**
970
+ * Options for TTSBackend.stream()
971
+ */
972
+ interface TTSStreamOptions {
973
+ /** Abort signal for cancellation */
974
+ signal?: AbortSignal;
975
+ /** Voice override per-call */
976
+ voice?: string;
977
+ /** Speed multiplier override per-call */
978
+ speed?: number;
979
+ }
980
+ /**
981
+ * A single chunk of TTS audio output
982
+ */
983
+ interface TTSChunk {
984
+ /** Audio samples at engine's native sample rate */
985
+ audio: Float32Array;
986
+ /** Duration in seconds */
987
+ duration: number;
988
+ /** Sentence/segment text that produced this audio */
989
+ text?: string;
990
+ }
991
+
992
+ /**
993
+ * TTSPlayback — Composes TTSBackend + PlaybackPipeline for text → lip sync.
994
+ *
995
+ * Handles format conversion (Float32 @ TTS rate → PCM16 @ 16kHz)
996
+ * and sentence prefetch for gapless playback.
757
997
  *
758
998
  * @category Audio
759
999
  */
760
1000
 
761
- type PlaybackState = 'idle' | 'playing' | 'stopping';
762
- interface PlaybackPipelineConfig {
1001
+ interface TTSPlaybackConfig {
1002
+ /** TTS backend (e.g., KokoroTTSInference) */
1003
+ tts: TTSBackend;
763
1004
  /** A2E inference backend (from createA2E) */
764
1005
  lam: A2EBackend;
765
- /** Sample rate in Hz (default: 16000) */
766
- sampleRate?: number;
767
- /** Target chunk duration for coalescing in ms (default: 200) */
768
- chunkTargetMs?: number;
769
- /** Audio playback delay in ms (default: auto-detected from backend) */
770
- audioDelayMs?: number;
771
- /** A2E inference chunk size in samples (default: 16000) */
772
- chunkSize?: number;
773
- /** Identity/style index for Wav2Vec2 (default: 0) */
774
- identityIndex?: number;
775
1006
  /** Per-character expression weight scaling */
776
1007
  profile?: ExpressionProfile;
777
- /** Enable neutral transition on playback complete (default: false) */
1008
+ /** Prefetch next sentence while current plays. Default: true */
1009
+ prefetch?: boolean;
1010
+ /** Identity/style index for Wav2Vec2 (default: 0) */
1011
+ identityIndex?: number;
1012
+ /** Audio playback delay in ms */
1013
+ audioDelayMs?: number;
1014
+ /** Enable neutral transition on playback complete */
778
1015
  neutralTransitionEnabled?: boolean;
779
- /** Duration of neutral fade-out in ms (default: 250). Only applies when neutralTransitionEnabled=true. */
1016
+ /** Duration of neutral fade-out in ms */
780
1017
  neutralTransitionMs?: number;
781
- /** Stale frame warning threshold in ms (default: 2000) */
782
- staleThresholdMs?: number;
783
- }
784
- /**
785
- * Full face frame with scaled blendshapes
786
- */
787
- interface FullFaceFrame {
788
- /** Scaled 52 ARKit blendshapes (ExpressionProfile applied) */
789
- blendshapes: Float32Array;
790
- /** Raw A2E output (52 blendshapes, before profile scaling) */
791
- rawBlendshapes: Float32Array;
792
- /** AudioContext timestamp for this frame */
793
- timestamp: number;
794
1018
  }
795
- interface PlaybackPipelineEvents {
796
- /** New frame ready for display (scaled by ExpressionProfile) */
1019
+ interface TTSPlaybackEvents {
1020
+ /** New frame ready for display */
797
1021
  'frame': FullFaceFrame;
798
- /** Raw A2E frame (before profile scaling) */
1022
+ /** Raw A2E frame */
799
1023
  'frame:raw': Float32Array;
800
- /** Playback started (first audio scheduled) */
1024
+ /** Playback started */
801
1025
  'playback:start': {
802
1026
  time: number;
803
1027
  };
804
- /** Playback completed naturally */
1028
+ /** Playback completed */
805
1029
  'playback:complete': void;
806
- /** Playback stopped (user-initiated) */
807
- 'playback:stop': void;
808
- /** Error occurred */
1030
+ /** Error */
809
1031
  'error': Error;
810
- /** State changed */
811
- 'state': PlaybackState;
812
- 'full_frame_ready': FullFaceFrame;
813
- 'lam_frame_ready': Float32Array;
814
- 'playback_complete': void;
815
- 'playback_start': number;
816
1032
  [key: string]: unknown;
817
1033
  }
818
- declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
1034
+ declare class TTSPlayback extends EventEmitter<TTSPlaybackEvents> {
819
1035
  private readonly config;
820
- private scheduler;
821
- private coalescer;
822
- private processor;
823
- private readonly sampleRate;
824
- private _state;
825
- private playbackStarted;
826
- private monitorInterval;
827
- private frameAnimationId;
828
- private lastNewFrameTime;
829
- private lastKnownLamFrame;
830
- private staleWarningEmitted;
831
- private readonly staleThresholdMs;
832
- private frameLoopCount;
833
- private profile;
834
- private readonly neutralTransitionEnabled;
835
- private readonly neutralTransitionMs;
836
- private neutralTransitionFrame;
837
- private neutralTransitionStart;
838
- private neutralAnimationId;
839
- private _currentFrame;
840
- private _currentRawFrame;
841
- /** Current pipeline state */
842
- get state(): PlaybackState;
843
- /** Current scaled blendshapes (updated in-place for perf) */
844
- get currentFrame(): Float32Array | null;
845
- /** Raw A2E blendshapes (before profile scaling) */
846
- get currentRawFrame(): Float32Array | null;
847
- constructor(config: PlaybackPipelineConfig);
848
- /** Initialize AudioContext (lazy, call after user gesture) */
1036
+ private _pipeline;
1037
+ private initialized;
1038
+ constructor(config: TTSPlaybackConfig);
1039
+ /** Access underlying PlaybackPipeline for event subscriptions. */
1040
+ get pipeline(): PlaybackPipeline | null;
1041
+ /** Load TTS model + initialize PlaybackPipeline. */
849
1042
  initialize(): Promise<void>;
850
- /** Update ExpressionProfile at runtime */
851
- setProfile(profile: ExpressionProfile): void;
852
- /**
853
- * Start a new playback session.
854
- * Idempotent — calling during playback resets cleanly without emitting
855
- * spurious playback:complete.
856
- */
857
- start(): void;
858
- /** Feed a streaming audio chunk (PCM16 Uint8Array) */
859
- onAudioChunk(chunk: Uint8Array): Promise<void>;
860
- /** Signal end of audio stream (flushes remaining audio) */
861
- end(): Promise<void>;
862
1043
  /**
863
- * Feed a complete audio buffer. Chunks into 200ms pieces, schedules each
864
- * for playback, runs A2E inference, then waits for completion.
1044
+ * Synthesize text and play with lip sync.
1045
+ * Streams sentences with prefetch for minimal gaps.
1046
+ *
1047
+ * @returns Resolves when playback completes
865
1048
  */
866
- feedBuffer(audio: ArrayBuffer | Float32Array): Promise<void>;
867
- /** Stop playback immediately with fade-out */
868
- stop(fadeOutMs?: number): Promise<void>;
869
- /** Cleanup all resources */
870
- dispose(): void;
871
- /** Get pipeline debug state */
872
- getDebugState(): {
873
- state: PlaybackState;
874
- playbackStarted: boolean;
875
- coalescerFill: number;
876
- processorFill: number;
877
- queuedFrames: number;
878
- currentTime: number;
879
- playbackEndTime: number;
880
- };
881
- private startFrameLoop;
882
- private startMonitoring;
883
- private onPlaybackComplete;
884
- private startNeutralTransition;
885
- private cancelNeutralTransition;
886
- private stopInternal;
887
- private setState;
1049
+ speak(text: string, options?: {
1050
+ signal?: AbortSignal;
1051
+ voice?: string;
1052
+ }): Promise<void>;
1053
+ /** Dispose of all resources. */
1054
+ dispose(): Promise<void>;
1055
+ private speakWithPrefetch;
1056
+ private speakSequential;
888
1057
  }
889
1058
 
890
1059
  /**
@@ -936,6 +1105,15 @@ declare class InterruptionHandler extends EventEmitter<InterruptionEvents> {
936
1105
  private aiIsSpeaking;
937
1106
  private interruptionTriggeredThisSession;
938
1107
  constructor(config?: InterruptionConfig);
1108
+ /**
1109
+ * Process raw audio energy for interruption detection (no VAD required).
1110
+ * Used during speaking state when the unified worker is busy with TTS.
1111
+ * Echo-cancelled mic input means energy above threshold = user speech.
1112
+ *
1113
+ * @param rms - RMS energy of audio chunk (0-1)
1114
+ * @param energyThreshold - Minimum energy to consider speech (default: 0.02)
1115
+ */
1116
+ processAudioEnergy(rms: number, energyThreshold?: number): void;
939
1117
  /**
940
1118
  * Process VAD result for interruption detection
941
1119
  * @param vadProbability - Speech probability from VAD (0-1)
@@ -1459,6 +1637,7 @@ declare class SileroVADWorker {
1459
1637
  private config;
1460
1638
  private isLoading;
1461
1639
  private _isLoaded;
1640
+ private poisoned;
1462
1641
  private state;
1463
1642
  private context;
1464
1643
  private readonly chunkSize;
@@ -1526,1137 +1705,1376 @@ declare class SileroVADWorker {
1526
1705
  }
1527
1706
 
1528
1707
  /**
1529
- * Factory function for Silero VAD with automatic Worker vs main thread selection
1708
+ * Unified Inference Worker — single Web Worker hosting all WASM models
1530
1709
  *
1531
- * Provides a unified API that automatically selects the optimal implementation:
1532
- * - Desktop browsers: Uses SileroVADWorker (off-main-thread inference)
1533
- * - Mobile devices: Uses SileroVADInference (main thread, avoids memory overhead)
1534
- * - Fallback: Gracefully falls back to main thread if Worker fails
1710
+ * Solves the multi-worker ORT problem: three per-model workers each load their
1711
+ * own ORT WASM instance (~40MB each). On iOS this exceeds the ~1-1.5GB tab
1712
+ * limit, forcing main-thread fallback which blocks the render loop.
1535
1713
  *
1536
- * @category Inference
1714
+ * This worker hosts SenseVoice + Wav2ArkitCpu + Silero VAD in a single
1715
+ * ORT WASM instance. Same total model memory (~643MB), but inference runs
1716
+ * off-main-thread. Works on iOS because there's only one ORT instance.
1537
1717
  *
1538
- * @example Basic usage (auto-detect)
1718
+ * Consumer usage:
1539
1719
  * ```typescript
1540
- * import { createSileroVAD } from '@omote/core';
1541
- *
1542
- * const vad = createSileroVAD({
1543
- * modelUrl: '/models/silero-vad.onnx',
1544
- * threshold: 0.5,
1545
- * });
1546
- *
1547
- * await vad.load();
1548
- * const result = await vad.process(audioChunk);
1549
- * if (result.isSpeech) {
1550
- * console.log('Speech detected!', result.probability);
1551
- * }
1552
- * ```
1720
+ * const worker = new UnifiedInferenceWorker();
1721
+ * await worker.init();
1553
1722
  *
1554
- * @example Force worker usage
1555
- * ```typescript
1556
- * const vad = createSileroVAD({
1557
- * modelUrl: '/models/silero-vad.onnx',
1558
- * useWorker: true, // Force Worker even on mobile
1559
- * });
1723
+ * const asr = createSenseVoice({ modelUrl: '...', unifiedWorker: worker });
1724
+ * const lam = createA2E({ gpuModelUrl: '...', cpuModelUrl: '...', unifiedWorker: worker });
1725
+ * const vad = createSileroVAD({ modelUrl: '...', unifiedWorker: worker });
1560
1726
  * ```
1561
1727
  *
1562
- * @example Force main thread
1563
- * ```typescript
1564
- * const vad = createSileroVAD({
1565
- * modelUrl: '/models/silero-vad.onnx',
1566
- * useWorker: false, // Force main thread
1567
- * });
1568
- * ```
1728
+ * @category Inference
1569
1729
  */
1570
1730
 
1731
+ /** Health state of the unified worker */
1732
+ type WorkerHealthState = 'healthy' | 'unhealthy' | 'recovering';
1571
1733
  /**
1572
- * Common interface for both SileroVADInference and SileroVADWorker
1734
+ * Unified Inference Worker — single Web Worker for all WASM models
1573
1735
  *
1574
- * This interface defines the shared API that both implementations provide,
1575
- * allowing consumers to use either interchangeably.
1736
+ * Hosts SenseVoice, Wav2ArkitCpu, and Silero VAD in one ORT instance.
1737
+ * Eliminates the multi-worker memory problem on iOS.
1576
1738
  */
1577
- interface SileroVADBackend {
1578
- /** Current backend type (webgpu, wasm, or null if not loaded) */
1579
- readonly backend: RuntimeBackend | null;
1580
- /** Whether the model is loaded and ready for inference */
1581
- readonly isLoaded: boolean;
1582
- /** Audio sample rate (8000 or 16000 Hz) */
1583
- readonly sampleRate: number;
1584
- /** Speech detection threshold (0-1) */
1585
- readonly threshold: number;
1586
- /**
1587
- * Load the ONNX model
1588
- * @returns Model loading information
1589
- */
1590
- load(): Promise<VADModelInfo | VADWorkerModelInfo>;
1591
- /**
1592
- * Process a single audio chunk
1593
- * @param audioChunk - Float32Array of exactly chunkSize samples
1594
- * @returns VAD result with speech probability
1595
- */
1596
- process(audioChunk: Float32Array): Promise<VADResult>;
1597
- /**
1598
- * Reset state for new audio stream
1599
- */
1600
- reset(): void | Promise<void>;
1739
+ declare class UnifiedInferenceWorker {
1740
+ private worker;
1741
+ private pendingRequests;
1742
+ private initialized;
1743
+ private healthState;
1744
+ private consecutiveFailures;
1745
+ private _generation;
1746
+ private recovering;
1601
1747
  /**
1602
- * Dispose of the model and free resources
1748
+ * Initialize the worker (load ORT WASM from CDN)
1603
1749
  */
1750
+ init(): Promise<void>;
1751
+ loadSenseVoice(config: {
1752
+ modelUrl: string;
1753
+ tokensUrl: string;
1754
+ language: number;
1755
+ textNorm: number;
1756
+ }): Promise<SenseVoiceModelInfo>;
1757
+ transcribe(audio: Float32Array): Promise<SenseVoiceResult>;
1758
+ disposeSenseVoice(): Promise<void>;
1759
+ loadA2E(config: {
1760
+ modelUrl: string;
1761
+ externalDataUrl: string | null;
1762
+ }): Promise<A2EModelInfo>;
1763
+ inferA2E(audio: Float32Array): Promise<{
1764
+ blendshapes: Float32Array;
1765
+ numFrames: number;
1766
+ numBlendshapes: number;
1767
+ inferenceTimeMs: number;
1768
+ }>;
1769
+ disposeA2E(): Promise<void>;
1770
+ loadLAM(config: {
1771
+ modelUrl: string;
1772
+ externalDataUrl: string | null;
1773
+ numIdentityClasses?: number;
1774
+ }): Promise<A2EModelInfo>;
1775
+ inferLAM(audio: Float32Array, identityIndex?: number): Promise<{
1776
+ blendshapes: Float32Array;
1777
+ numFrames: number;
1778
+ numBlendshapes: number;
1779
+ inferenceTimeMs: number;
1780
+ }>;
1781
+ disposeLAM(): Promise<void>;
1782
+ loadKokoro(config: {
1783
+ modelUrl: string;
1784
+ }): Promise<{
1785
+ loadTimeMs: number;
1786
+ }>;
1787
+ inferKokoro(tokens: number[], style: Float32Array, speed: number): Promise<{
1788
+ audio: Float32Array;
1789
+ inferenceTimeMs: number;
1790
+ }>;
1791
+ disposeKokoro(): Promise<void>;
1792
+ loadVAD(config: {
1793
+ modelUrl: string;
1794
+ sampleRate: number;
1795
+ }): Promise<VADWorkerModelInfo>;
1796
+ processVAD(audio: Float32Array, state: Float32Array, context: Float32Array): Promise<{
1797
+ probability: number;
1798
+ state: Float32Array;
1799
+ inferenceTimeMs: number;
1800
+ }>;
1801
+ resetVAD(): Promise<Float32Array>;
1802
+ disposeVAD(): Promise<void>;
1604
1803
  dispose(): Promise<void>;
1804
+ /** Check if the worker is initialized and healthy */
1805
+ get isReady(): boolean;
1806
+ /** Current health state of the worker */
1807
+ get health(): WorkerHealthState;
1808
+ /** Generation counter — increments on worker recovery. Adapters compare to detect stale sessions. */
1809
+ get workerGeneration(): number;
1810
+ /** Check if Web Workers are supported */
1811
+ static isSupported(): boolean;
1812
+ private assertReady;
1813
+ private createWorker;
1814
+ private handleWorkerMessage;
1815
+ private sendMessage;
1605
1816
  /**
1606
- * Get required chunk size in samples
1817
+ * Ping the worker to check if it's alive. If ping succeeds, worker was just
1818
+ * busy with long inference. If ping fails, worker is truly stuck — recover.
1607
1819
  */
1608
- getChunkSize(): number;
1820
+ private runHealthCheck;
1609
1821
  /**
1610
- * Get chunk duration in milliseconds
1822
+ * Terminate the stuck worker, create a new one, and re-initialize ORT.
1823
+ * Model sessions are lost — adapters must reload via generation check.
1611
1824
  */
1612
- getChunkDurationMs(): number;
1825
+ private recoverWorker;
1826
+ private rejectAllPending;
1827
+ private cleanup;
1613
1828
  }
1829
+
1614
1830
  /**
1615
- * Configuration for the Silero VAD factory
1831
+ * Shared base config for all inference factory functions.
1616
1832
  *
1617
- * Extends SileroVADConfig with worker-specific options.
1833
+ * @category Inference
1618
1834
  */
1619
- interface SileroVADFactoryConfig extends Omit<SileroVADConfig, 'modelUrl'> {
1620
- /** Path or URL to the ONNX model. Default: HuggingFace CDN */
1621
- modelUrl?: string;
1622
- /**
1623
- * Force worker usage (true), main thread (false), or auto-detect (undefined).
1624
- *
1625
- * Auto-detection behavior:
1626
- * - Desktop: Uses Worker (better responsiveness, off-main-thread)
1627
- * - Mobile: Uses main thread (avoids 5MB memory overhead)
1628
- *
1629
- * You can override this to:
1630
- * - `true`: Force Worker even on mobile (if you have memory headroom)
1631
- * - `false`: Force main thread even on desktop (for debugging)
1632
- *
1633
- * Default: undefined (auto-detect)
1634
- */
1635
- useWorker?: boolean;
1835
+
1836
+ /** Base config shared across all inference factory functions */
1837
+ interface InferenceFactoryConfig {
1636
1838
  /**
1637
- * Fallback to main thread on worker errors.
1638
- *
1639
- * When true (default), if the Worker fails to load or encounters an error,
1640
- * the factory will automatically create a main thread instance instead.
1641
- *
1642
- * When false, worker errors will propagate as exceptions.
1643
- *
1644
- * Default: true
1839
+ * Worker mode:
1840
+ * - 'auto' (default): Use Worker if supported, else main thread
1841
+ * - true: Force Worker (throws if unsupported)
1842
+ * - false: Force main thread
1645
1843
  */
1646
- fallbackOnError?: boolean;
1844
+ useWorker?: boolean | 'auto';
1647
1845
  /**
1648
1846
  * Unified inference worker instance.
1649
- * When provided, uses SileroVADUnifiedAdapter (shared single-ORT worker).
1847
+ * When provided, routes inference through the shared worker,
1848
+ * keeping all inference off the main thread.
1650
1849
  * Takes precedence over useWorker setting.
1651
1850
  */
1652
1851
  unifiedWorker?: UnifiedInferenceWorker;
1653
1852
  }
1853
+
1654
1854
  /**
1655
- * Check if the current environment supports VAD Web Workers
1855
+ * Factory function for SenseVoice ASR with automatic Worker vs main thread selection
1656
1856
  *
1657
- * Requirements:
1658
- * - Worker constructor must exist
1659
- * - Blob URL support (for inline worker script)
1857
+ * Provides a unified API that automatically selects the optimal implementation:
1858
+ * - Worker supported: Uses SenseVoiceWorker (off-main-thread inference)
1859
+ * - Worker unsupported: Uses SenseVoiceInference (main thread)
1660
1860
  *
1661
- * @returns true if VAD Worker is supported
1662
- */
1663
- declare function supportsVADWorker(): boolean;
1664
- /**
1665
- * Create a Silero VAD instance with automatic implementation selection
1861
+ * @category Inference
1666
1862
  *
1667
- * This factory function automatically selects between:
1668
- * - **SileroVADWorker**: Off-main-thread inference (better for desktop)
1669
- * - **SileroVADInference**: Main thread inference (better for mobile)
1863
+ * @example Auto-detect (recommended)
1864
+ * ```typescript
1865
+ * import { createSenseVoice } from '@omote/core';
1670
1866
  *
1671
- * The selection is based on:
1672
- * 1. Explicit `useWorker` config (if provided)
1673
- * 2. Platform detection (mobile vs desktop)
1674
- * 3. Worker API availability
1867
+ * const asr = createSenseVoice({
1868
+ * modelUrl: '/models/sensevoice/model.int8.onnx',
1869
+ * });
1870
+ * await asr.load();
1871
+ * const { text, emotion } = await asr.transcribe(audioSamples);
1872
+ * ```
1675
1873
  *
1676
- * Both implementations share the same interface (SileroVADBackend),
1677
- * so consumers can use either interchangeably.
1874
+ * @example Force worker
1875
+ * ```typescript
1876
+ * const asr = createSenseVoice({
1877
+ * modelUrl: '/models/sensevoice/model.int8.onnx',
1878
+ * useWorker: true,
1879
+ * });
1880
+ * ```
1881
+ *
1882
+ * @example Force main thread
1883
+ * ```typescript
1884
+ * const asr = createSenseVoice({
1885
+ * modelUrl: '/models/sensevoice/model.int8.onnx',
1886
+ * useWorker: false,
1887
+ * });
1888
+ * ```
1889
+ */
1890
+
1891
+ /**
1892
+ * Common interface for both SenseVoiceInference and SenseVoiceWorker
1893
+ */
1894
+ interface SenseVoiceBackend {
1895
+ /** Whether the model is loaded and ready for inference */
1896
+ readonly isLoaded: boolean;
1897
+ /** Current backend type ('wasm' for worker, full backend for main thread, null if not loaded) */
1898
+ readonly backend: 'wasm' | 'webgpu' | null;
1899
+ /**
1900
+ * Load the ONNX model
1901
+ * @param onProgress - Optional progress callback (fires once at 100% for worker)
1902
+ * @returns Model loading information
1903
+ */
1904
+ load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
1905
+ /**
1906
+ * Transcribe audio samples to text
1907
+ * @param audioSamples - Float32Array of audio samples at 16kHz
1908
+ * @returns Transcription result
1909
+ */
1910
+ transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
1911
+ /**
1912
+ * Dispose of the model and free resources
1913
+ */
1914
+ dispose(): Promise<void>;
1915
+ }
1916
+ /**
1917
+ * Configuration for the SenseVoice factory
1918
+ */
1919
+ interface CreateSenseVoiceConfig extends InferenceFactoryConfig {
1920
+ /** Path or URL to model.int8.onnx (239MB). Default: HuggingFace CDN */
1921
+ modelUrl?: string;
1922
+ /** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
1923
+ tokensUrl?: string;
1924
+ /** Language hint (default: 'auto') */
1925
+ language?: SenseVoiceLanguage;
1926
+ /** Text normalization (default: 'with_itn') */
1927
+ textNorm?: 'with_itn' | 'without_itn';
1928
+ }
1929
+ /**
1930
+ * Create a SenseVoice ASR instance with automatic implementation selection
1678
1931
  *
1679
1932
  * @param config - Factory configuration
1680
- * @returns A SileroVAD instance (either Worker or main thread)
1933
+ * @returns A SenseVoiceBackend instance (either Worker or main thread)
1934
+ */
1935
+ declare function createSenseVoice(config?: CreateSenseVoiceConfig): SenseVoiceBackend;
1936
+
1937
+ /**
1938
+ * Shared blendshape constants and utilities for lip sync inference
1681
1939
  *
1682
- * @example
1683
- * ```typescript
1684
- * // Auto-detect (recommended)
1685
- * const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
1940
+ * Contains LAM_BLENDSHAPES (canonical ordering), symmetrization, and
1941
+ * index remapping used by both Wav2Vec2Inference and Wav2ArkitCpuInference.
1686
1942
  *
1687
- * // Force Worker
1688
- * const vadWorker = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: true });
1943
+ * This module is the single source of truth for blendshape ordering to
1944
+ * avoid circular dependencies between inference classes.
1689
1945
  *
1690
- * // Force main thread
1691
- * const vadMain = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: false });
1692
- * ```
1946
+ * @category Inference
1693
1947
  */
1694
- declare function createSileroVAD(config?: SileroVADFactoryConfig): SileroVADBackend;
1695
-
1696
1948
  /**
1697
- * Web Worker-based wav2arkit_cpu lip sync inference
1949
+ * LAM model blendshape names in order (52 total)
1950
+ * NOTE: This is alphabetical ordering used by LAM, different from standard ARKit order
1951
+ */
1952
+ declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
1953
+ /** Alias for backwards compatibility */
1954
+ declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
1955
+ /**
1956
+ * Linearly interpolate between two blendshape weight arrays.
1698
1957
  *
1699
- * Runs wav2arkit_cpu inference in a dedicated Web Worker to prevent main thread blocking.
1700
- * Uses inline worker script (Blob URL pattern) to avoid separate file deployment.
1958
+ * Pure math utility with zero renderer dependency — used by all renderer
1959
+ * adapters (@omote/three, @omote/babylon, @omote/r3f) for smooth frame
1960
+ * transitions.
1701
1961
  *
1702
- * Key design decisions:
1703
- * - WASM backend only (WebGPU doesn't work in Workers)
1704
- * - Audio copied (not transferred) to retain main thread access
1705
- * - ONNX Runtime loaded from CDN in worker (no bundler complications)
1706
- * - Blendshape symmetrization inlined in worker (no module imports)
1707
- * - iOS: passes model URLs as strings directly to ORT (avoids 400MB+ JS heap)
1962
+ * @param current - Current blendshape weights
1963
+ * @param target - Target blendshape weights
1964
+ * @param factor - Interpolation factor (0 = no change, 1 = snap to target). Default: 0.3
1965
+ * @returns Interpolated weights as number[]
1966
+ */
1967
+ declare function lerpBlendshapes(current: Float32Array | number[], target: Float32Array | number[], factor?: number): number[];
1968
+
1969
+ /**
1970
+ * Wav2Vec2 inference engine for Audio-to-Expression (A2E)
1971
+ *
1972
+ * Runs entirely in the browser using WebGPU or WASM.
1973
+ * Takes raw 16kHz audio and outputs 52 ARKit blendshapes for lip sync.
1708
1974
  *
1709
1975
  * @category Inference
1710
1976
  *
1711
- * @example
1977
+ * @example Basic usage
1712
1978
  * ```typescript
1713
- * import { Wav2ArkitCpuWorker } from '@omote/core';
1979
+ * import { Wav2Vec2Inference } from '@omote/core';
1714
1980
  *
1715
- * const lam = new Wav2ArkitCpuWorker({
1716
- * modelUrl: '/models/wav2arkit_cpu.onnx',
1717
- * });
1718
- * await lam.load();
1981
+ * const wav2vec = new Wav2Vec2Inference({ modelUrl: '/models/model.onnx' });
1982
+ * await wav2vec.load();
1719
1983
  *
1720
- * const { blendshapes } = await lam.infer(audioSamples);
1721
- * // blendshapes: Float32Array[] in LAM_BLENDSHAPES order, 30fps
1984
+ * // Process 1 second of audio (16kHz = 16000 samples)
1985
+ * const result = await wav2vec.infer(audioSamples);
1986
+ * console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
1722
1987
  * ```
1723
1988
  */
1724
1989
 
1725
- /**
1726
- * Configuration for Wav2ArkitCpu Worker
1727
- */
1728
- interface Wav2ArkitCpuWorkerConfig {
1729
- /** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
1990
+ type InferenceBackend = BackendPreference;
1991
+ interface Wav2Vec2InferenceConfig {
1992
+ /** Path or URL to the ONNX model */
1730
1993
  modelUrl: string;
1731
1994
  /**
1732
1995
  * Path or URL to external model data file (.onnx.data weights).
1733
- * Default: `${modelUrl}.data` (e.g., /models/wav2arkit_cpu.onnx.data)
1996
+ * Default: `${modelUrl}.data` (e.g., /models/model.onnx.data)
1734
1997
  *
1735
1998
  * Set to `false` to skip external data loading (single-file models only).
1736
1999
  */
1737
2000
  externalDataUrl?: string | false;
2001
+ /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
2002
+ backend?: InferenceBackend;
2003
+ /** Number of identity classes (default: 12 for streaming model) */
2004
+ numIdentityClasses?: number;
2005
+ /**
2006
+ * Number of audio samples per inference chunk (default: 16000).
2007
+ * Model supports variable chunk sizes. Smaller chunks = lower latency,
2008
+ * more inference overhead. 8000 (500ms) is recommended for real-time lip sync.
2009
+ */
2010
+ chunkSize?: number;
2011
+ }
2012
+ interface ModelInfo {
2013
+ backend: 'webgpu' | 'wasm';
2014
+ loadTimeMs: number;
2015
+ inputNames: string[];
2016
+ outputNames: string[];
1738
2017
  }
2018
+
1739
2019
  /**
1740
- * Wav2ArkitCpu Worker - Lip sync inference in a Web Worker
1741
- *
1742
- * Runs wav2arkit_cpu inference off the main thread to prevent UI blocking.
1743
- * Feature parity with Wav2ArkitCpuInference but runs in dedicated worker.
1744
- *
1745
- * @see Wav2ArkitCpuInference for main-thread version
2020
+ * CTC vocabulary (32 tokens from wav2vec2-base-960h)
2021
+ * @deprecated ASR is handled by SenseVoice. This will be removed in a future release.
1746
2022
  */
1747
- declare class Wav2ArkitCpuWorker implements A2EBackend {
1748
- readonly modelId: "wav2arkit_cpu";
1749
- readonly chunkSize: number;
1750
- private worker;
2023
+ declare const CTC_VOCAB: string[];
2024
+ interface Wav2Vec2Result {
2025
+ /** Blendshape weights [frames, 52] - 30fps */
2026
+ blendshapes: Float32Array[];
2027
+ /** Number of blendshape frames (30fps) */
2028
+ numFrames: number;
2029
+ /** Inference time in ms */
2030
+ inferenceTimeMs: number;
2031
+ }
2032
+ declare class Wav2Vec2Inference implements A2EBackend {
2033
+ readonly modelId: "wav2vec2";
2034
+ private session;
2035
+ private ort;
1751
2036
  private config;
2037
+ private _backend;
1752
2038
  private isLoading;
1753
- private _isLoaded;
2039
+ private numIdentityClasses;
2040
+ readonly chunkSize: number;
1754
2041
  private inferenceQueue;
1755
2042
  private poisoned;
1756
- private pendingResolvers;
1757
- constructor(config: Wav2ArkitCpuWorkerConfig);
1758
- get isLoaded(): boolean;
1759
- /**
1760
- * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
1761
- */
1762
- get backend(): 'wasm' | null;
1763
- /**
1764
- * Create the worker from inline script
1765
- */
1766
- private createWorker;
1767
- /**
1768
- * Handle messages from worker
1769
- */
1770
- private handleWorkerMessage;
2043
+ private static readonly INFERENCE_TIMEOUT_MS;
2044
+ constructor(config: Wav2Vec2InferenceConfig);
1771
2045
  /**
1772
- * Send message to worker and wait for response
2046
+ * Check if WebGPU is available and working
2047
+ * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
1773
2048
  */
1774
- private sendMessage;
2049
+ static isWebGPUAvailable: typeof isWebGPUAvailable;
2050
+ get backend(): 'webgpu' | 'wasm' | null;
2051
+ get isLoaded(): boolean;
2052
+ /** True if inference timed out and the session is permanently unusable */
2053
+ get isSessionPoisoned(): boolean;
1775
2054
  /**
1776
- * Load the ONNX model in the worker
2055
+ * Load the ONNX model
1777
2056
  */
1778
- load(): Promise<A2EModelInfo>;
2057
+ load(): Promise<ModelInfo>;
1779
2058
  /**
1780
2059
  * Run inference on raw audio
1781
- *
1782
- * Accepts variable-length audio (not fixed to 16000 samples).
1783
- * Output frames = ceil(30 * numSamples / 16000).
1784
- *
1785
2060
  * @param audioSamples - Float32Array of raw audio at 16kHz
1786
- * @param _identityIndex - Ignored (identity 11 is baked into the model)
2061
+ * @param identityIndex - Optional identity index (0-11, default 0 = neutral)
2062
+ *
2063
+ * Audio will be zero-padded or truncated to chunkSize samples.
1787
2064
  */
1788
- infer(audioSamples: Float32Array, _identityIndex?: number): Promise<A2EResult>;
2065
+ infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
1789
2066
  /**
1790
- * Queue inference to serialize worker calls
2067
+ * Queue inference to serialize ONNX session calls
1791
2068
  */
1792
2069
  private queueInference;
1793
2070
  /**
1794
- * Dispose of the worker and free resources
2071
+ * Get blendshape value by name for a specific frame
1795
2072
  */
1796
- dispose(): Promise<void>;
2073
+ getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
1797
2074
  /**
1798
- * Check if Web Workers are supported
2075
+ * Dispose of the model and free resources
1799
2076
  */
1800
- static isSupported(): boolean;
2077
+ dispose(): Promise<void>;
1801
2078
  }
1802
2079
 
1803
2080
  /**
1804
- * Unified Inference Worker — single Web Worker hosting all WASM models
2081
+ * Default and user-configurable model URLs for all ONNX models
1805
2082
  *
1806
- * Solves the multi-worker ORT problem: three per-model workers each load their
1807
- * own ORT WASM instance (~40MB each). On iOS this exceeds the ~1-1.5GB tab
1808
- * limit, forcing main-thread fallback which blocks the render loop.
2083
+ * Out of the box, models are served from HuggingFace CDN (`/resolve/main/`
2084
+ * endpoint with `Access-Control-Allow-Origin: *`). For production apps that
2085
+ * need faster or more reliable delivery, call {@link configureModelUrls} once
2086
+ * at startup to point any or all models at your own CDN.
1809
2087
  *
1810
- * This worker hosts SenseVoice + Wav2ArkitCpu + Silero VAD in a single
1811
- * ORT WASM instance. Same total model memory (~643MB), but inference runs
1812
- * off-main-thread. Works on iOS because there's only one ORT instance.
2088
+ * @category Inference
1813
2089
  *
1814
- * Consumer usage:
2090
+ * @example Use HuggingFace defaults (zero-config)
1815
2091
  * ```typescript
1816
- * const worker = new UnifiedInferenceWorker();
1817
- * await worker.init();
1818
- *
1819
- * const asr = createSenseVoice({ modelUrl: '...', unifiedWorker: worker });
1820
- * const lam = createA2E({ gpuModelUrl: '...', cpuModelUrl: '...', unifiedWorker: worker });
1821
- * const vad = createSileroVAD({ modelUrl: '...', unifiedWorker: worker });
2092
+ * import { createA2E } from '@omote/core';
2093
+ * const a2e = createA2E(); // fetches from HuggingFace CDN
1822
2094
  * ```
1823
2095
  *
1824
- * @category Inference
2096
+ * @example Self-host on your own CDN
2097
+ * ```typescript
2098
+ * import { configureModelUrls, createA2E } from '@omote/core';
2099
+ *
2100
+ * configureModelUrls({
2101
+ * lam: 'https://cdn.example.com/models/model_fp16.onnx',
2102
+ * senseVoice: 'https://cdn.example.com/models/sensevoice.int8.onnx',
2103
+ * // omitted keys keep HuggingFace defaults
2104
+ * });
2105
+ *
2106
+ * const a2e = createA2E(); // now fetches from your CDN
2107
+ * ```
1825
2108
  */
1826
-
2109
+ /** Model URL keys that can be configured */
2110
+ type ModelUrlKey = 'lam' | 'lamIos' | 'wav2arkitCpu' | 'senseVoice' | 'sileroVad' | 'kokoroTTS' | 'kokoroVoices';
1827
2111
  /**
1828
- * Unified Inference Worker — single Web Worker for all WASM models
2112
+ * Resolved model URLs — user overrides take priority, HuggingFace CDN is fallback.
1829
2113
  *
1830
- * Hosts SenseVoice, Wav2ArkitCpu, and Silero VAD in one ORT instance.
1831
- * Eliminates the multi-worker memory problem on iOS.
2114
+ * All SDK factories (`createA2E`, `createSenseVoice`, `createSileroVAD`) and
2115
+ * orchestrators (`VoicePipeline`) read from this object. Call
2116
+ * {@link configureModelUrls} before constructing any pipelines to point
2117
+ * models at your own CDN.
1832
2118
  */
1833
- declare class UnifiedInferenceWorker {
1834
- private worker;
1835
- private pendingRequests;
1836
- private initialized;
1837
- private poisoned;
1838
- /**
1839
- * Initialize the worker (load ORT WASM from CDN)
1840
- */
1841
- init(): Promise<void>;
1842
- loadSenseVoice(config: {
1843
- modelUrl: string;
1844
- tokensUrl: string;
1845
- language: number;
1846
- textNorm: number;
1847
- }): Promise<SenseVoiceModelInfo>;
1848
- transcribe(audio: Float32Array): Promise<SenseVoiceResult>;
1849
- disposeSenseVoice(): Promise<void>;
1850
- loadA2E(config: {
1851
- modelUrl: string;
1852
- externalDataUrl: string | null;
1853
- }): Promise<A2EModelInfo>;
1854
- inferA2E(audio: Float32Array): Promise<{
1855
- blendshapes: Float32Array;
1856
- numFrames: number;
1857
- numBlendshapes: number;
1858
- inferenceTimeMs: number;
1859
- }>;
1860
- disposeA2E(): Promise<void>;
1861
- loadVAD(config: {
1862
- modelUrl: string;
1863
- sampleRate: number;
1864
- }): Promise<VADWorkerModelInfo>;
1865
- processVAD(audio: Float32Array, state: Float32Array, context: Float32Array): Promise<{
1866
- probability: number;
1867
- state: Float32Array;
1868
- inferenceTimeMs: number;
1869
- }>;
1870
- resetVAD(): Promise<Float32Array>;
1871
- disposeVAD(): Promise<void>;
1872
- dispose(): Promise<void>;
1873
- /** Check if the worker is initialized and not poisoned */
1874
- get isReady(): boolean;
1875
- /** Check if Web Workers are supported */
1876
- static isSupported(): boolean;
1877
- private assertReady;
1878
- private createWorker;
1879
- private handleWorkerMessage;
1880
- private sendMessage;
1881
- private rejectAllPending;
1882
- private cleanup;
1883
- }
2119
+ declare const DEFAULT_MODEL_URLS: Readonly<Record<ModelUrlKey, string>>;
1884
2120
  /**
1885
- * SenseVoice adapter backed by UnifiedInferenceWorker
2121
+ * Configure custom model URLs. Overrides persist for the lifetime of the page.
2122
+ * Omitted keys keep their HuggingFace CDN defaults.
1886
2123
  *
1887
- * Implements SenseVoiceBackend, delegating all inference to the shared worker.
2124
+ * Call this **once** at app startup, before constructing any pipelines.
2125
+ *
2126
+ * @example Self-host all models
2127
+ * ```typescript
2128
+ * configureModelUrls({
2129
+ * lam: 'https://cdn.example.com/models/model_fp16.onnx',
2130
+ * wav2arkitCpu: 'https://cdn.example.com/models/wav2arkit_cpu.onnx',
2131
+ * senseVoice: 'https://cdn.example.com/models/sensevoice.int8.onnx',
2132
+ * sileroVad: 'https://cdn.example.com/models/silero-vad.onnx',
2133
+ * });
2134
+ * ```
2135
+ *
2136
+ * @example Override only one model
2137
+ * ```typescript
2138
+ * configureModelUrls({
2139
+ * lam: '/models/model_fp16.onnx', // self-hosted, same origin
2140
+ * });
2141
+ * ```
1888
2142
  */
1889
- declare class SenseVoiceUnifiedAdapter implements SenseVoiceBackend {
1890
- private worker;
1891
- private config;
1892
- private _isLoaded;
1893
- private languageId;
1894
- private textNormId;
1895
- private inferenceQueue;
1896
- constructor(worker: UnifiedInferenceWorker, config: SenseVoiceWorkerConfig);
1897
- get isLoaded(): boolean;
1898
- get backend(): 'wasm' | null;
1899
- load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
1900
- transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
1901
- dispose(): Promise<void>;
1902
- }
2143
+ declare function configureModelUrls(urls: Partial<Record<ModelUrlKey, string>>): void;
1903
2144
  /**
1904
- * Wav2ArkitCpu adapter backed by UnifiedInferenceWorker
1905
- *
1906
- * Implements A2EBackend, delegating all inference to the shared worker.
2145
+ * Reset all model URL overrides back to HuggingFace CDN defaults.
2146
+ * Mainly useful for testing.
1907
2147
  */
1908
- declare class Wav2ArkitCpuUnifiedAdapter implements A2EBackend {
1909
- readonly modelId: "wav2arkit_cpu";
1910
- readonly chunkSize: number;
1911
- private worker;
1912
- private config;
1913
- private _isLoaded;
1914
- private inferenceQueue;
1915
- constructor(worker: UnifiedInferenceWorker, config: Wav2ArkitCpuWorkerConfig);
1916
- get isLoaded(): boolean;
1917
- get backend(): RuntimeBackend | null;
1918
- load(): Promise<A2EModelInfo>;
1919
- infer(audioSamples: Float32Array, _identityIndex?: number): Promise<A2EResult>;
1920
- dispose(): Promise<void>;
1921
- }
2148
+ declare function resetModelUrls(): void;
1922
2149
  /**
1923
- * Silero VAD adapter backed by UnifiedInferenceWorker
1924
- *
1925
- * Implements SileroVADBackend, delegating all inference to the shared worker.
2150
+ * Get the immutable HuggingFace CDN URLs (ignoring any overrides).
2151
+ * Useful for documentation or fallback logic.
1926
2152
  */
1927
- declare class SileroVADUnifiedAdapter implements SileroVADBackend {
1928
- private worker;
1929
- private config;
1930
- private _isLoaded;
1931
- private state;
1932
- private context;
1933
- private readonly chunkSize;
1934
- private readonly contextSize;
1935
- private inferenceQueue;
1936
- private preSpeechBuffer;
1937
- private wasSpeaking;
1938
- constructor(worker: UnifiedInferenceWorker, config: SileroVADConfig);
1939
- get isLoaded(): boolean;
1940
- get backend(): RuntimeBackend | null;
1941
- get sampleRate(): number;
1942
- get threshold(): number;
1943
- getChunkSize(): number;
1944
- getChunkDurationMs(): number;
1945
- load(): Promise<VADWorkerModelInfo>;
1946
- process(audioChunk: Float32Array): Promise<VADResult>;
1947
- reset(): Promise<void>;
1948
- dispose(): Promise<void>;
1949
- }
2153
+ declare const HF_CDN_URLS: Readonly<Record<ModelUrlKey, string>>;
1950
2154
 
1951
2155
  /**
1952
- * Factory function for SenseVoice ASR with automatic Worker vs main thread selection
2156
+ * CPU-optimized lip sync inference using wav2arkit_cpu model
1953
2157
  *
1954
- * Provides a unified API that automatically selects the optimal implementation:
1955
- * - Worker supported: Uses SenseVoiceWorker (off-main-thread inference)
1956
- * - Worker unsupported: Uses SenseVoiceInference (main thread)
2158
+ * A Safari/iOS-compatible alternative to Wav2Vec2Inference (192MB fp16) designed
2159
+ * for platforms where WebGPU crashes due to ONNX Runtime JSEP bugs.
1957
2160
  *
1958
- * @category Inference
2161
+ * The model uses ONNX external data format:
2162
+ * - wav2arkit_cpu.onnx (1.86MB graph structure)
2163
+ * - wav2arkit_cpu.onnx.data (402MB weights)
2164
+ * Both files are fetched and cached automatically.
1959
2165
  *
1960
- * @example Auto-detect (recommended)
1961
- * ```typescript
1962
- * import { createSenseVoice } from '@omote/core';
2166
+ * Key differences from Wav2Vec2Inference:
2167
+ * - WASM-only backend (CPU-optimized, single-threaded, no WebGPU)
2168
+ * - No identity input (baked to identity 11)
2169
+ * - No ASR output (lip sync only)
2170
+ * - Dynamic input length (not fixed to 16000 samples)
2171
+ * - Different native blendshape ordering (remapped to LAM_BLENDSHAPES)
1963
2172
  *
1964
- * const asr = createSenseVoice({
1965
- * modelUrl: '/models/sensevoice/model.int8.onnx',
1966
- * });
1967
- * await asr.load();
1968
- * const { text, emotion } = await asr.transcribe(audioSamples);
1969
- * ```
2173
+ * @category Inference
1970
2174
  *
1971
- * @example Force worker
2175
+ * @example
1972
2176
  * ```typescript
1973
- * const asr = createSenseVoice({
1974
- * modelUrl: '/models/sensevoice/model.int8.onnx',
1975
- * useWorker: true,
1976
- * });
1977
- * ```
2177
+ * import { Wav2ArkitCpuInference } from '@omote/core';
1978
2178
  *
1979
- * @example Force main thread
1980
- * ```typescript
1981
- * const asr = createSenseVoice({
1982
- * modelUrl: '/models/sensevoice/model.int8.onnx',
1983
- * useWorker: false,
2179
+ * const lam = new Wav2ArkitCpuInference({
2180
+ * modelUrl: '/models/wav2arkit_cpu.onnx',
1984
2181
  * });
2182
+ * await lam.load();
2183
+ *
2184
+ * const { blendshapes } = await lam.infer(audioSamples);
2185
+ * // blendshapes: Float32Array[] in LAM_BLENDSHAPES order, 30fps
1985
2186
  * ```
1986
2187
  */
1987
2188
 
1988
- /**
1989
- * Common interface for both SenseVoiceInference and SenseVoiceWorker
1990
- */
1991
- interface SenseVoiceBackend {
1992
- /** Whether the model is loaded and ready for inference */
1993
- readonly isLoaded: boolean;
1994
- /** Current backend type ('wasm' for worker, full backend for main thread, null if not loaded) */
1995
- readonly backend: 'wasm' | 'webgpu' | null;
2189
+ interface Wav2ArkitCpuConfig {
2190
+ /** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
2191
+ modelUrl: string;
1996
2192
  /**
1997
- * Load the ONNX model
1998
- * @param onProgress - Optional progress callback (fires once at 100% for worker)
1999
- * @returns Model loading information
2193
+ * Path or URL to external model data file (.onnx.data weights).
2194
+ * Default: `${modelUrl}.data` (e.g., /models/wav2arkit_cpu.onnx.data)
2195
+ *
2196
+ * Set to `false` to skip external data loading (single-file models only).
2000
2197
  */
2001
- load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
2198
+ externalDataUrl?: string | false;
2199
+ /** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
2200
+ backend?: BackendPreference;
2201
+ }
2202
+ declare class Wav2ArkitCpuInference implements A2EBackend {
2203
+ readonly modelId: "wav2arkit_cpu";
2204
+ readonly chunkSize: number;
2205
+ private session;
2206
+ private ort;
2207
+ private config;
2208
+ private _backend;
2209
+ private isLoading;
2210
+ private inferenceQueue;
2211
+ private poisoned;
2212
+ private static readonly INFERENCE_TIMEOUT_MS;
2213
+ constructor(config: Wav2ArkitCpuConfig);
2214
+ get backend(): RuntimeBackend | null;
2215
+ get isLoaded(): boolean;
2002
2216
  /**
2003
- * Transcribe audio samples to text
2004
- * @param audioSamples - Float32Array of audio samples at 16kHz
2005
- * @returns Transcription result
2217
+ * Load the ONNX model
2006
2218
  */
2007
- transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
2219
+ load(): Promise<A2EModelInfo>;
2008
2220
  /**
2009
- * Dispose of the model and free resources
2221
+ * Run inference on raw audio
2222
+ *
2223
+ * Accepts variable-length audio (not fixed to 16000 samples).
2224
+ * Output frames = ceil(30 * numSamples / 16000).
2225
+ *
2226
+ * @param audioSamples - Float32Array of raw audio at 16kHz
2227
+ * @param _identityIndex - Ignored (identity 11 is baked into the model)
2010
2228
  */
2011
- dispose(): Promise<void>;
2012
- }
2013
- /**
2014
- * Configuration for the SenseVoice factory
2015
- */
2016
- interface CreateSenseVoiceConfig {
2017
- /** Path or URL to model.int8.onnx (239MB). Default: HuggingFace CDN */
2018
- modelUrl?: string;
2019
- /** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
2020
- tokensUrl?: string;
2021
- /** Language hint (default: 'auto') */
2022
- language?: SenseVoiceLanguage;
2023
- /** Text normalization (default: 'with_itn') */
2024
- textNorm?: 'with_itn' | 'without_itn';
2229
+ infer(audioSamples: Float32Array, _identityIndex?: number): Promise<A2EResult>;
2025
2230
  /**
2026
- * Worker mode:
2027
- * - 'auto' (default): Use Worker if supported, else main thread
2028
- * - true: Force Worker (throws if unsupported)
2029
- * - false: Force main thread
2231
+ * Queue inference to serialize ONNX session calls
2030
2232
  */
2031
- useWorker?: boolean | 'auto';
2233
+ private queueInference;
2032
2234
  /**
2033
- * Unified inference worker instance.
2034
- * When provided, uses SenseVoiceUnifiedAdapter (shared single-ORT worker).
2035
- * Takes precedence over useWorker setting.
2235
+ * Dispose of the model and free resources
2036
2236
  */
2037
- unifiedWorker?: UnifiedInferenceWorker;
2237
+ dispose(): Promise<void>;
2038
2238
  }
2039
- /**
2040
- * Create a SenseVoice ASR instance with automatic implementation selection
2041
- *
2042
- * @param config - Factory configuration
2043
- * @returns A SenseVoiceBackend instance (either Worker or main thread)
2044
- */
2045
- declare function createSenseVoice(config?: CreateSenseVoiceConfig): SenseVoiceBackend;
2046
2239
 
2047
2240
  /**
2048
- * Shared blendshape constants and utilities for lip sync inference
2049
- *
2050
- * Contains LAM_BLENDSHAPES (canonical ordering), symmetrization, and
2051
- * index remapping used by both Wav2Vec2Inference and Wav2ArkitCpuInference.
2052
- *
2053
- * This module is the single source of truth for blendshape ordering to
2054
- * avoid circular dependencies between inference classes.
2055
- *
2056
- * @category Inference
2057
- */
2058
- /**
2059
- * LAM model blendshape names in order (52 total)
2060
- * NOTE: This is alphabetical ordering used by LAM, different from standard ARKit order
2061
- */
2062
- declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
2063
- /** Alias for backwards compatibility */
2064
- declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
2065
- /**
2066
- * Linearly interpolate between two blendshape weight arrays.
2067
- *
2068
- * Pure math utility with zero renderer dependency — used by all renderer
2069
- * adapters (@omote/three, @omote/babylon, @omote/r3f) for smooth frame
2070
- * transitions.
2241
+ * Web Worker-based wav2arkit_cpu lip sync inference
2071
2242
  *
2072
- * @param current - Current blendshape weights
2073
- * @param target - Target blendshape weights
2074
- * @param factor - Interpolation factor (0 = no change, 1 = snap to target). Default: 0.3
2075
- * @returns Interpolated weights as number[]
2076
- */
2077
- declare function lerpBlendshapes(current: Float32Array | number[], target: Float32Array | number[], factor?: number): number[];
2078
-
2079
- /**
2080
- * Wav2Vec2 inference engine for Audio-to-Expression (A2E)
2243
+ * Runs wav2arkit_cpu inference in a dedicated Web Worker to prevent main thread blocking.
2244
+ * Uses inline worker script (Blob URL pattern) to avoid separate file deployment.
2081
2245
  *
2082
- * Runs entirely in the browser using WebGPU or WASM.
2083
- * Takes raw 16kHz audio and outputs 52 ARKit blendshapes for lip sync.
2246
+ * Key design decisions:
2247
+ * - WASM backend only (WebGPU doesn't work in Workers)
2248
+ * - Audio copied (not transferred) to retain main thread access
2249
+ * - ONNX Runtime loaded from CDN in worker (no bundler complications)
2250
+ * - Blendshape symmetrization inlined in worker (no module imports)
2251
+ * - iOS: passes model URLs as strings directly to ORT (avoids 400MB+ JS heap)
2084
2252
  *
2085
2253
  * @category Inference
2086
2254
  *
2087
- * @example Basic usage
2255
+ * @example
2088
2256
  * ```typescript
2089
- * import { Wav2Vec2Inference } from '@omote/core';
2257
+ * import { Wav2ArkitCpuWorker } from '@omote/core';
2090
2258
  *
2091
- * const wav2vec = new Wav2Vec2Inference({ modelUrl: '/models/model.onnx' });
2092
- * await wav2vec.load();
2259
+ * const lam = new Wav2ArkitCpuWorker({
2260
+ * modelUrl: '/models/wav2arkit_cpu.onnx',
2261
+ * });
2262
+ * await lam.load();
2093
2263
  *
2094
- * // Process 1 second of audio (16kHz = 16000 samples)
2095
- * const result = await wav2vec.infer(audioSamples);
2096
- * console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
2264
+ * const { blendshapes } = await lam.infer(audioSamples);
2265
+ * // blendshapes: Float32Array[] in LAM_BLENDSHAPES order, 30fps
2097
2266
  * ```
2098
2267
  */
2099
2268
 
2100
- type InferenceBackend = BackendPreference;
2101
- interface Wav2Vec2InferenceConfig {
2102
- /** Path or URL to the ONNX model */
2103
- modelUrl: string;
2104
- /**
2105
- * Path or URL to external model data file (.onnx.data weights).
2106
- * Default: `${modelUrl}.data` (e.g., /models/model.onnx.data)
2107
- *
2108
- * Set to `false` to skip external data loading (single-file models only).
2109
- */
2110
- externalDataUrl?: string | false;
2111
- /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
2112
- backend?: InferenceBackend;
2113
- /** Number of identity classes (default: 12 for streaming model) */
2114
- numIdentityClasses?: number;
2115
- /**
2116
- * Number of audio samples per inference chunk (default: 16000).
2117
- * Model supports variable chunk sizes. Smaller chunks = lower latency,
2118
- * more inference overhead. 8000 (500ms) is recommended for real-time lip sync.
2119
- */
2120
- chunkSize?: number;
2121
- }
2122
- interface ModelInfo {
2123
- backend: 'webgpu' | 'wasm';
2124
- loadTimeMs: number;
2125
- inputNames: string[];
2126
- outputNames: string[];
2127
- }
2128
-
2129
2269
  /**
2130
- * CTC vocabulary (32 tokens from wav2vec2-base-960h)
2131
- * @deprecated ASR is handled by SenseVoice. This will be removed in a future release.
2270
+ * Configuration for Wav2ArkitCpu Worker
2132
2271
  */
2133
- declare const CTC_VOCAB: string[];
2134
- interface Wav2Vec2Result {
2135
- /** Blendshape weights [frames, 52] - 30fps */
2136
- blendshapes: Float32Array[];
2137
- /** Number of blendshape frames (30fps) */
2138
- numFrames: number;
2139
- /** Inference time in ms */
2140
- inferenceTimeMs: number;
2272
+ interface Wav2ArkitCpuWorkerConfig {
2273
+ /** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
2274
+ modelUrl: string;
2275
+ /**
2276
+ * Path or URL to external model data file (.onnx.data weights).
2277
+ * Default: `${modelUrl}.data` (e.g., /models/wav2arkit_cpu.onnx.data)
2278
+ *
2279
+ * Set to `false` to skip external data loading (single-file models only).
2280
+ */
2281
+ externalDataUrl?: string | false;
2141
2282
  }
2142
- declare class Wav2Vec2Inference implements A2EBackend {
2143
- readonly modelId: "wav2vec2";
2144
- private session;
2145
- private ort;
2283
+ /**
2284
+ * Wav2ArkitCpu Worker - Lip sync inference in a Web Worker
2285
+ *
2286
+ * Runs wav2arkit_cpu inference off the main thread to prevent UI blocking.
2287
+ * Feature parity with Wav2ArkitCpuInference but runs in dedicated worker.
2288
+ *
2289
+ * @see Wav2ArkitCpuInference for main-thread version
2290
+ */
2291
+ declare class Wav2ArkitCpuWorker implements A2EBackend {
2292
+ readonly modelId: "wav2arkit_cpu";
2293
+ readonly chunkSize: number;
2294
+ private worker;
2146
2295
  private config;
2147
- private _backend;
2148
2296
  private isLoading;
2149
- private numIdentityClasses;
2150
- readonly chunkSize: number;
2297
+ private _isLoaded;
2151
2298
  private inferenceQueue;
2152
2299
  private poisoned;
2153
- private static readonly INFERENCE_TIMEOUT_MS;
2154
- constructor(config: Wav2Vec2InferenceConfig);
2300
+ private pendingResolvers;
2301
+ constructor(config: Wav2ArkitCpuWorkerConfig);
2302
+ get isLoaded(): boolean;
2155
2303
  /**
2156
- * Check if WebGPU is available and working
2157
- * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
2304
+ * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
2158
2305
  */
2159
- static isWebGPUAvailable: typeof isWebGPUAvailable;
2160
- get backend(): 'webgpu' | 'wasm' | null;
2161
- get isLoaded(): boolean;
2162
- /** True if inference timed out and the session is permanently unusable */
2163
- get isSessionPoisoned(): boolean;
2306
+ get backend(): 'wasm' | null;
2164
2307
  /**
2165
- * Load the ONNX model
2308
+ * Create the worker from inline script
2166
2309
  */
2167
- load(): Promise<ModelInfo>;
2310
+ private createWorker;
2311
+ /**
2312
+ * Handle messages from worker
2313
+ */
2314
+ private handleWorkerMessage;
2315
+ /**
2316
+ * Send message to worker and wait for response
2317
+ */
2318
+ private sendMessage;
2319
+ /**
2320
+ * Load the ONNX model in the worker
2321
+ */
2322
+ load(): Promise<A2EModelInfo>;
2168
2323
  /**
2169
2324
  * Run inference on raw audio
2170
- * @param audioSamples - Float32Array of raw audio at 16kHz
2171
- * @param identityIndex - Optional identity index (0-11, default 0 = neutral)
2172
2325
  *
2173
- * Audio will be zero-padded or truncated to chunkSize samples.
2326
+ * Accepts variable-length audio (not fixed to 16000 samples).
2327
+ * Output frames = ceil(30 * numSamples / 16000).
2328
+ *
2329
+ * @param audioSamples - Float32Array of raw audio at 16kHz
2330
+ * @param _identityIndex - Ignored (identity 11 is baked into the model)
2174
2331
  */
2175
- infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
2332
+ infer(audioSamples: Float32Array, _identityIndex?: number): Promise<A2EResult>;
2176
2333
  /**
2177
- * Queue inference to serialize ONNX session calls
2334
+ * Queue inference to serialize worker calls
2178
2335
  */
2179
2336
  private queueInference;
2180
2337
  /**
2181
- * Get blendshape value by name for a specific frame
2338
+ * Dispose of the worker and free resources
2182
2339
  */
2183
- getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
2340
+ dispose(): Promise<void>;
2184
2341
  /**
2185
- * Dispose of the model and free resources
2342
+ * Check if Web Workers are supported
2186
2343
  */
2187
- dispose(): Promise<void>;
2344
+ static isSupported(): boolean;
2188
2345
  }
2189
2346
 
2190
2347
  /**
2191
- * Default and user-configurable model URLs for all ONNX models
2348
+ * Factory function for A2E with automatic GPU/CPU model selection
2192
2349
  *
2193
- * Out of the box, models are served from HuggingFace CDN (`/resolve/main/`
2194
- * endpoint with `Access-Control-Allow-Origin: *`). For production apps that
2195
- * need faster or more reliable delivery, call {@link configureModelUrls} once
2196
- * at startup to point any or all models at your own CDN.
2350
+ * Provides a unified API with platform-aware model selection:
2351
+ *
2352
+ * **Desktop (Chrome/Edge/Android):**
2353
+ * Wav2Vec2 (WebGPU, 192MB fp16) → wav2arkit_cpu fallback
2354
+ *
2355
+ * **iOS/Safari:**
2356
+ * LAM iOS (WASM, opset 18, ~192MB fp16, native LayerNorm) → wav2arkit_cpu fallback
2357
+ *
2358
+ * The iOS variant is the same LAM model re-exported at opset 18 with native
2359
+ * LayerNormalization ops (~256 fewer graph nodes than desktop's opset 14
2360
+ * decomposed LayerNorm). This dramatically reduces peak memory during ORT
2361
+ * graph parsing/optimization, fitting within iOS's ~1-1.5GB tab limit.
2362
+ *
2363
+ * Both variants use fp16 external data format (small graph + ~192MB weights).
2364
+ * On iOS, ORT streams weights directly into WASM memory via URL pass-through
2365
+ * (~2MB JS heap). If the model still OOMs, A2EWithFallback falls back to
2366
+ * wav2arkit_cpu (404MB fp32, lower quality).
2197
2367
  *
2198
2368
  * @category Inference
2199
2369
  *
2200
- * @example Use HuggingFace defaults (zero-config)
2370
+ * @example Auto-detect (recommended, zero-config)
2201
2371
  * ```typescript
2202
2372
  * import { createA2E } from '@omote/core';
2203
- * const a2e = createA2E(); // fetches from HuggingFace CDN
2373
+ *
2374
+ * const a2e = createA2E(); // uses HF CDN defaults (192MB fp16 GPU, 404MB CPU fallback)
2375
+ * await a2e.load();
2376
+ * const { blendshapes } = await a2e.infer(audioSamples);
2204
2377
  * ```
2205
2378
  *
2206
- * @example Self-host on your own CDN
2379
+ * @example Force CPU model
2207
2380
  * ```typescript
2208
- * import { configureModelUrls, createA2E } from '@omote/core';
2209
- *
2210
- * configureModelUrls({
2211
- * lam: 'https://cdn.example.com/models/model_fp16.onnx',
2212
- * senseVoice: 'https://cdn.example.com/models/sensevoice.int8.onnx',
2213
- * // omitted keys keep HuggingFace defaults
2214
- * });
2215
- *
2216
- * const a2e = createA2E(); // now fetches from your CDN
2381
+ * const a2e = createA2E({ mode: 'cpu' });
2217
2382
  * ```
2218
2383
  */
2219
- /** Model URL keys that can be configured */
2220
- type ModelUrlKey = 'lam' | 'wav2arkitCpu' | 'senseVoice' | 'sileroVad';
2384
+
2221
2385
  /**
2222
- * Resolved model URLs — user overrides take priority, HuggingFace CDN is fallback.
2386
+ * Configuration for the A2E factory
2387
+ */
2388
+ interface CreateA2EConfig extends InferenceFactoryConfig {
2389
+ /** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge). Default: HuggingFace CDN */
2390
+ gpuModelUrl?: string;
2391
+ /**
2392
+ * URL for GPU model external data file (.onnx.data weights).
2393
+ * Default: `${gpuModelUrl}.data`
2394
+ *
2395
+ * Set to `false` to skip external data loading (single-file models only).
2396
+ */
2397
+ gpuExternalDataUrl?: string | false;
2398
+ /** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS). Default: HuggingFace CDN */
2399
+ cpuModelUrl?: string;
2400
+ /**
2401
+ * Model selection mode:
2402
+ * - 'auto': Safari/iOS -> CPU, everything else -> GPU (default)
2403
+ * - 'gpu': Force GPU model (Wav2Vec2Inference)
2404
+ * - 'cpu': Force CPU model (Wav2ArkitCpuInference)
2405
+ */
2406
+ mode?: 'auto' | 'gpu' | 'cpu';
2407
+ /** Backend preference for GPU model (default: 'auto') */
2408
+ gpuBackend?: BackendPreference;
2409
+ /** Number of identity classes for GPU model (default: 12) */
2410
+ numIdentityClasses?: number;
2411
+ /**
2412
+ * Fall back to CPU model if GPU model fails to load (default: true)
2413
+ * Only applies when mode is 'auto' or 'gpu'
2414
+ */
2415
+ fallbackOnError?: boolean;
2416
+ }
2417
+ /**
2418
+ * Create an A2E instance with automatic GPU/CPU model selection
2223
2419
  *
2224
- * All SDK factories (`createA2E`, `createSenseVoice`, `createSileroVAD`) and
2225
- * orchestrators (`VoicePipeline`) read from this object. Call
2226
- * {@link configureModelUrls} before constructing any pipelines to point
2227
- * models at your own CDN.
2420
+ * @param config - Factory configuration
2421
+ * @returns An A2EBackend instance (either GPU or CPU model)
2228
2422
  */
2229
- declare const DEFAULT_MODEL_URLS: Readonly<Record<ModelUrlKey, string>>;
2423
+ declare function createA2E(config?: CreateA2EConfig): A2EBackend;
2424
+
2230
2425
  /**
2231
- * Configure custom model URLs. Overrides persist for the lifetime of the page.
2232
- * Omitted keys keep their HuggingFace CDN defaults.
2426
+ * A2EProcessor — Engine-agnostic audio-to-expression processor
2233
2427
  *
2234
- * Call this **once** at app startup, before constructing any pipelines.
2428
+ * The core inference primitive: audio samples in → blendshape frames out.
2429
+ * No mic capture, no audio playback, no Web Audio API.
2235
2430
  *
2236
- * @example Self-host all models
2431
+ * This is what Unity/Unreal/Godot/any engine would use directly.
2432
+ * Web-specific concerns (mic, AudioContext, scheduling) live in the
2433
+ * orchestrator and pipeline layers above.
2434
+ *
2435
+ * Two output modes:
2436
+ * - **Pull mode**: `pushAudio(samples, timestamp)` + `getFrameForTime(t)`
2437
+ * For TTS playback where frames are synced to AudioContext clock.
2438
+ * - **Push mode**: `pushAudio(samples)` + `startDrip()` + `latestFrame`
2439
+ * For live mic / game loop where frames are consumed at ~30fps.
2440
+ *
2441
+ * @category Inference
2442
+ *
2443
+ * @example Pull mode (TTS playback)
2237
2444
  * ```typescript
2238
- * configureModelUrls({
2239
- * lam: 'https://cdn.example.com/models/model_fp16.onnx',
2240
- * wav2arkitCpu: 'https://cdn.example.com/models/wav2arkit_cpu.onnx',
2241
- * senseVoice: 'https://cdn.example.com/models/sensevoice.int8.onnx',
2242
- * sileroVad: 'https://cdn.example.com/models/silero-vad.onnx',
2243
- * });
2445
+ * const processor = new A2EProcessor({ backend: a2e });
2446
+ * processor.pushAudio(samples, audioContext.currentTime + delay);
2447
+ * const frame = processor.getFrameForTime(audioContext.currentTime);
2244
2448
  * ```
2245
2449
  *
2246
- * @example Override only one model
2450
+ * @example Push mode (live mic)
2247
2451
  * ```typescript
2248
- * configureModelUrls({
2249
- * lam: '/models/model_fp16.onnx', // self-hosted, same origin
2452
+ * const processor = new A2EProcessor({
2453
+ * backend: a2e,
2454
+ * onFrame: (frame) => applyToAvatar(frame),
2250
2455
  * });
2456
+ * processor.startDrip();
2457
+ * processor.pushAudio(micSamples); // no timestamp → drip mode
2251
2458
  * ```
2252
2459
  */
2253
- declare function configureModelUrls(urls: Partial<Record<ModelUrlKey, string>>): void;
2254
- /**
2255
- * Reset all model URL overrides back to HuggingFace CDN defaults.
2256
- * Mainly useful for testing.
2257
- */
2258
- declare function resetModelUrls(): void;
2259
- /**
2260
- * Get the immutable HuggingFace CDN URLs (ignoring any overrides).
2261
- * Useful for documentation or fallback logic.
2262
- */
2263
- declare const HF_CDN_URLS: Readonly<Record<ModelUrlKey, string>>;
2460
+
2461
+ interface A2EProcessorConfig {
2462
+ /** Inference backend */
2463
+ backend: A2EBackend;
2464
+ /** Sample rate (default: 16000) */
2465
+ sampleRate?: number;
2466
+ /** Samples per inference chunk (default: 16000 = 1s) */
2467
+ chunkSize?: number;
2468
+ /**
2469
+ * Identity/style index for the A2E model (default: 0).
2470
+ *
2471
+ * The LAM model uses a one-hot identity vector (12 classes, indices 0-11) as
2472
+ * style conditioning alongside audio features. Different indices produce
2473
+ * different expression intensity across face regions (brows, eyes, cheeks).
2474
+ *
2475
+ * Only affects Wav2Vec2Inference (GPU model). Wav2ArkitCpuInference has
2476
+ * identity 11 baked into the model weights.
2477
+ */
2478
+ identityIndex?: number;
2479
+ /** Callback fired with each blendshape frame (push mode) */
2480
+ onFrame?: (frame: Float32Array) => void;
2481
+ /** Error callback */
2482
+ onError?: (error: Error) => void;
2483
+ }
2484
+ declare class A2EProcessor {
2485
+ private readonly backend;
2486
+ private readonly sampleRate;
2487
+ private readonly chunkSize;
2488
+ private readonly identityIndex;
2489
+ private readonly onFrame?;
2490
+ private readonly onError?;
2491
+ private bufferCapacity;
2492
+ private buffer;
2493
+ private writeOffset;
2494
+ private bufferStartTime;
2495
+ private timestampedQueue;
2496
+ private plainQueue;
2497
+ private _latestFrame;
2498
+ private dripInterval;
2499
+ private lastPulledFrame;
2500
+ private inferenceRunning;
2501
+ private pendingChunks;
2502
+ private getFrameCallCount;
2503
+ private disposed;
2504
+ constructor(config: A2EProcessorConfig);
2505
+ /**
2506
+ * Push audio samples for inference (any source: mic, TTS, file).
2507
+ *
2508
+ * - With `timestamp`: frames stored with timestamps (pull mode)
2509
+ * - Without `timestamp`: frames stored in plain queue (drip/push mode)
2510
+ *
2511
+ * Fire-and-forget: returns immediately, inference runs async.
2512
+ */
2513
+ pushAudio(samples: Float32Array, timestamp?: number): void;
2514
+ /**
2515
+ * Flush remaining buffered audio (pads to chunkSize).
2516
+ * Call at end of stream to process final partial chunk.
2517
+ *
2518
+ * Routes through the serialized pendingChunks pipeline to maintain
2519
+ * correct frame ordering. Without this, flush() could push frames
2520
+ * with the latest timestamp to the queue before drainPendingChunks()
2521
+ * finishes pushing frames with earlier timestamps — causing
2522
+ * getFrameForTime() to see out-of-order timestamps and stall.
2523
+ */
2524
+ flush(): Promise<void>;
2525
+ /**
2526
+ * Reset buffer and frame queues
2527
+ */
2528
+ reset(): void;
2529
+ /**
2530
+ * Get frame synced to external clock (e.g. AudioContext.currentTime).
2531
+ *
2532
+ * Discards frames that are too old, returns the current frame,
2533
+ * or holds last frame as fallback to prevent avatar freezing.
2534
+ *
2535
+ * @param currentTime - Current playback time (seconds)
2536
+ * @returns Blendshape frame, or null if no frames yet
2537
+ */
2538
+ getFrameForTime(currentTime: number): Float32Array | null;
2539
+ /** Latest frame from drip-feed (live mic, game loop) */
2540
+ get latestFrame(): Float32Array | null;
2541
+ /** Start 30fps drip-feed timer (push mode) */
2542
+ startDrip(): void;
2543
+ /** Stop drip-feed timer */
2544
+ stopDrip(): void;
2545
+ /** Number of frames waiting in queue (both modes combined) */
2546
+ get queuedFrameCount(): number;
2547
+ /** Buffer fill level as fraction of chunkSize (0-1) */
2548
+ get fillLevel(): number;
2549
+ /** Dispose resources */
2550
+ dispose(): void;
2551
+ /**
2552
+ * Process pending chunks sequentially.
2553
+ * Fire-and-forget — called from pushAudio() without awaiting.
2554
+ */
2555
+ private drainPendingChunks;
2556
+ private handleError;
2557
+ }
2264
2558
 
2265
2559
  /**
2266
- * CPU-optimized lip sync inference using wav2arkit_cpu model
2560
+ * BlendshapeSmoother — Per-channel critically damped spring for 52 ARKit blendshapes
2267
2561
  *
2268
- * A Safari/iOS-compatible alternative to Wav2Vec2Inference (192MB fp16) designed
2269
- * for platforms where WebGPU crashes due to ONNX Runtime JSEP bugs.
2562
+ * Eliminates frame gaps between inference batches by smoothly interpolating
2563
+ * blendshape weights using critically damped springs (the game industry standard).
2270
2564
  *
2271
- * The model uses ONNX external data format:
2272
- * - wav2arkit_cpu.onnx (1.86MB graph structure)
2273
- * - wav2arkit_cpu.onnx.data (402MB weights)
2274
- * Both files are fetched and cached automatically.
2565
+ * Each of the 52 blendshape channels has its own spring with position + velocity
2566
+ * state. When a new inference frame arrives, spring targets are updated. Between
2567
+ * frames, springs continue converging toward the last target — no frozen face.
2275
2568
  *
2276
- * Key differences from Wav2Vec2Inference:
2277
- * - WASM-only backend (CPU-optimized, single-threaded, no WebGPU)
2278
- * - No identity input (baked to identity 11)
2279
- * - No ASR output (lip sync only)
2280
- * - Dynamic input length (not fixed to 16000 samples)
2281
- * - Different native blendshape ordering (remapped to LAM_BLENDSHAPES)
2569
+ * When inference stalls, `decayToNeutral()` sets all targets to 0, and the
2570
+ * springs smoothly close the mouth / relax the face over the halflife period.
2571
+ *
2572
+ * Math from Daniel Holden's "Spring-It-On" (Epic Games):
2573
+ * https://theorangeduck.com/page/spring-roll-call
2282
2574
  *
2283
2575
  * @category Inference
2284
2576
  *
2285
- * @example
2577
+ * @example Basic usage
2286
2578
  * ```typescript
2287
- * import { Wav2ArkitCpuInference } from '@omote/core';
2288
- *
2289
- * const lam = new Wav2ArkitCpuInference({
2290
- * modelUrl: '/models/wav2arkit_cpu.onnx',
2291
- * });
2292
- * await lam.load();
2579
+ * const smoother = new BlendshapeSmoother({ halflife: 0.06 });
2293
2580
  *
2294
- * const { blendshapes } = await lam.infer(audioSamples);
2295
- * // blendshapes: Float32Array[] in LAM_BLENDSHAPES order, 30fps
2581
+ * // In frame loop (60fps):
2582
+ * smoother.setTarget(inferenceFrame); // when new frame arrives
2583
+ * const smoothed = smoother.update(1/60); // every render frame
2584
+ * applyToAvatar(smoothed);
2296
2585
  * ```
2297
2586
  */
2298
-
2299
- interface Wav2ArkitCpuConfig {
2300
- /** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
2301
- modelUrl: string;
2587
+ interface BlendshapeSmootherConfig {
2302
2588
  /**
2303
- * Path or URL to external model data file (.onnx.data weights).
2304
- * Default: `${modelUrl}.data` (e.g., /models/wav2arkit_cpu.onnx.data)
2589
+ * Spring halflife in seconds — time for the distance to the target
2590
+ * to reduce by half. Lower = snappier, higher = smoother.
2305
2591
  *
2306
- * Set to `false` to skip external data loading (single-file models only).
2592
+ * - 0.04s (40ms): Very snappy, slight jitter on fast transitions
2593
+ * - 0.06s (60ms): Sweet spot for lip sync (default)
2594
+ * - 0.10s (100ms): Very smooth, slight lag on fast consonants
2595
+ * - 0: Bypass mode — passes through raw target values (no smoothing)
2596
+ *
2597
+ * Default: 0.06
2307
2598
  */
2308
- externalDataUrl?: string | false;
2309
- /** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
2310
- backend?: BackendPreference;
2599
+ halflife?: number;
2311
2600
  }
2312
- declare class Wav2ArkitCpuInference implements A2EBackend {
2313
- readonly modelId: "wav2arkit_cpu";
2314
- readonly chunkSize: number;
2315
- private session;
2316
- private ort;
2317
- private config;
2318
- private _backend;
2319
- private isLoading;
2320
- private inferenceQueue;
2321
- private poisoned;
2322
- private static readonly INFERENCE_TIMEOUT_MS;
2323
- constructor(config: Wav2ArkitCpuConfig);
2324
- get backend(): RuntimeBackend | null;
2325
- get isLoaded(): boolean;
2601
+ declare class BlendshapeSmoother {
2602
+ private readonly halflife;
2603
+ /** Current smoothed blendshape values */
2604
+ private values;
2605
+ /** Per-channel spring velocities */
2606
+ private velocities;
2607
+ /** Current spring targets (from latest inference frame) */
2608
+ private targets;
2609
+ /** Whether any target has been set */
2610
+ private _hasTarget;
2611
+ constructor(config?: BlendshapeSmootherConfig);
2612
+ /** Whether a target frame has been set (false until first setTarget call) */
2613
+ get hasTarget(): boolean;
2326
2614
  /**
2327
- * Load the ONNX model
2615
+ * Set new target frame from inference output.
2616
+ * Springs will converge toward these values on subsequent update() calls.
2328
2617
  */
2329
- load(): Promise<A2EModelInfo>;
2618
+ setTarget(frame: Float32Array): void;
2330
2619
  /**
2331
- * Run inference on raw audio
2620
+ * Advance all 52 springs by `dt` seconds and return the smoothed frame.
2332
2621
  *
2333
- * Accepts variable-length audio (not fixed to 16000 samples).
2334
- * Output frames = ceil(30 * numSamples / 16000).
2622
+ * Call this every render frame (e.g., inside requestAnimationFrame).
2623
+ * Returns the internal values buffer — do NOT mutate the returned array.
2335
2624
  *
2336
- * @param audioSamples - Float32Array of raw audio at 16kHz
2337
- * @param _identityIndex - Ignored (identity 11 is baked into the model)
2625
+ * @param dt - Time step in seconds (e.g., 1/60 for 60fps)
2626
+ * @returns Smoothed blendshape values (Float32Array of 52)
2338
2627
  */
2339
- infer(audioSamples: Float32Array, _identityIndex?: number): Promise<A2EResult>;
2628
+ update(dt: number): Float32Array;
2340
2629
  /**
2341
- * Queue inference to serialize ONNX session calls
2630
+ * Decay all spring targets to neutral (0).
2631
+ *
2632
+ * Call when inference stalls (no new frames for threshold duration).
2633
+ * The springs will smoothly close the mouth / relax the face over
2634
+ * the halflife period rather than freezing.
2342
2635
  */
2343
- private queueInference;
2636
+ decayToNeutral(): void;
2344
2637
  /**
2345
- * Dispose of the model and free resources
2638
+ * Reset all state (values, velocities, targets).
2639
+ * Call when starting a new playback session.
2346
2640
  */
2347
- dispose(): Promise<void>;
2641
+ reset(): void;
2348
2642
  }
2349
2643
 
2350
2644
  /**
2351
- * Factory function for A2E with automatic GPU/CPU model selection
2645
+ * Factory function for Silero VAD with automatic Worker vs main thread selection
2352
2646
  *
2353
- * Provides a unified API that automatically selects the optimal model:
2354
- * - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (404MB, WASM)
2355
- * - Chrome/Firefox/Edge: Uses Wav2Vec2Inference (192MB fp16, WebGPU)
2356
- * - Fallback: Gracefully falls back to CPU model if GPU model fails to load
2357
- *
2358
- * Why two separate models?
2359
- * Wav2Vec2 (LAM) cannot run on Safari/iOS for two reasons:
2360
- * 1. Its dual-head transformer graph needs ~750-950MB peak during ORT session
2361
- * creation (graph optimization), exceeding iOS WebKit's ~1-1.5GB tab limit.
2362
- * 2. It ships as a single 192MB .onnx file (fp16) that must load into JS heap
2363
- * before ORT can consume it. iOS WebKit OOMs on this allocation.
2364
- * wav2arkit_cpu solves both: external data format (1.86MB graph + 402MB weights)
2365
- * lets ORT load only the tiny graph, then stream weights via URL pass-through
2366
- * directly into WASM memory. JS heap stays at ~2MB.
2647
+ * Provides a unified API that automatically selects the optimal implementation:
2648
+ * - Desktop browsers: Uses SileroVADWorker (off-main-thread inference)
2649
+ * - Mobile devices: Uses SileroVADInference (main thread, avoids memory overhead)
2650
+ * - Fallback: Gracefully falls back to main thread if Worker fails
2367
2651
  *
2368
2652
  * @category Inference
2369
2653
  *
2370
- * @example Auto-detect (recommended, zero-config)
2654
+ * @example Basic usage (auto-detect)
2371
2655
  * ```typescript
2372
- * import { createA2E } from '@omote/core';
2656
+ * import { createSileroVAD } from '@omote/core';
2373
2657
  *
2374
- * const a2e = createA2E(); // uses HF CDN defaults (192MB fp16 GPU, 404MB CPU fallback)
2375
- * await a2e.load();
2376
- * const { blendshapes } = await a2e.infer(audioSamples);
2658
+ * const vad = createSileroVAD({
2659
+ * modelUrl: '/models/silero-vad.onnx',
2660
+ * threshold: 0.5,
2661
+ * });
2662
+ *
2663
+ * await vad.load();
2664
+ * const result = await vad.process(audioChunk);
2665
+ * if (result.isSpeech) {
2666
+ * console.log('Speech detected!', result.probability);
2667
+ * }
2377
2668
  * ```
2378
2669
  *
2379
- * @example Force CPU model
2670
+ * @example Force worker usage
2380
2671
  * ```typescript
2381
- * const a2e = createA2E({ mode: 'cpu' });
2672
+ * const vad = createSileroVAD({
2673
+ * modelUrl: '/models/silero-vad.onnx',
2674
+ * useWorker: true, // Force Worker even on mobile
2675
+ * });
2676
+ * ```
2677
+ *
2678
+ * @example Force main thread
2679
+ * ```typescript
2680
+ * const vad = createSileroVAD({
2681
+ * modelUrl: '/models/silero-vad.onnx',
2682
+ * useWorker: false, // Force main thread
2683
+ * });
2382
2684
  * ```
2383
2685
  */
2384
2686
 
2385
2687
  /**
2386
- * Configuration for the A2E factory
2688
+ * Common interface for both SileroVADInference and SileroVADWorker
2689
+ *
2690
+ * This interface defines the shared API that both implementations provide,
2691
+ * allowing consumers to use either interchangeably.
2387
2692
  */
2388
- interface CreateA2EConfig {
2389
- /** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge). Default: HuggingFace CDN */
2390
- gpuModelUrl?: string;
2693
+ interface SileroVADBackend {
2694
+ /** Current backend type (webgpu, wasm, or null if not loaded) */
2695
+ readonly backend: RuntimeBackend | null;
2696
+ /** Whether the model is loaded and ready for inference */
2697
+ readonly isLoaded: boolean;
2698
+ /** Audio sample rate (8000 or 16000 Hz) */
2699
+ readonly sampleRate: number;
2700
+ /** Speech detection threshold (0-1) */
2701
+ readonly threshold: number;
2391
2702
  /**
2392
- * URL for GPU model external data file (.onnx.data weights).
2393
- * Default: `${gpuModelUrl}.data`
2394
- *
2395
- * Set to `false` to skip external data loading (single-file models only).
2703
+ * Load the ONNX model
2704
+ * @returns Model loading information
2396
2705
  */
2397
- gpuExternalDataUrl?: string | false;
2398
- /** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS). Default: HuggingFace CDN */
2399
- cpuModelUrl?: string;
2706
+ load(): Promise<VADModelInfo | VADWorkerModelInfo>;
2400
2707
  /**
2401
- * Model selection mode:
2402
- * - 'auto': Safari/iOS -> CPU, everything else -> GPU (default)
2403
- * - 'gpu': Force GPU model (Wav2Vec2Inference)
2404
- * - 'cpu': Force CPU model (Wav2ArkitCpuInference)
2708
+ * Process a single audio chunk
2709
+ * @param audioChunk - Float32Array of exactly chunkSize samples
2710
+ * @returns VAD result with speech probability
2405
2711
  */
2406
- mode?: 'auto' | 'gpu' | 'cpu';
2407
- /** Backend preference for GPU model (default: 'auto') */
2408
- gpuBackend?: BackendPreference;
2409
- /** Number of identity classes for GPU model (default: 12) */
2410
- numIdentityClasses?: number;
2712
+ process(audioChunk: Float32Array): Promise<VADResult>;
2411
2713
  /**
2412
- * Fall back to CPU model if GPU model fails to load (default: true)
2413
- * Only applies when mode is 'auto' or 'gpu'
2714
+ * Reset state for new audio stream
2414
2715
  */
2415
- fallbackOnError?: boolean;
2716
+ reset(): void | Promise<void>;
2717
+ /**
2718
+ * Dispose of the model and free resources
2719
+ */
2720
+ dispose(): Promise<void>;
2721
+ /**
2722
+ * Get required chunk size in samples
2723
+ */
2724
+ getChunkSize(): number;
2725
+ /**
2726
+ * Get chunk duration in milliseconds
2727
+ */
2728
+ getChunkDurationMs(): number;
2729
+ }
2730
+ /**
2731
+ * Configuration for the Silero VAD factory
2732
+ *
2733
+ * Extends SileroVADConfig with worker-specific options.
2734
+ */
2735
+ interface SileroVADFactoryConfig extends Omit<SileroVADConfig, 'modelUrl'>, InferenceFactoryConfig {
2736
+ /** Path or URL to the ONNX model. Default: HuggingFace CDN */
2737
+ modelUrl?: string;
2416
2738
  /**
2417
- * Use Web Worker for CPU model inference (default: false)
2739
+ * Fallback to main thread on worker errors.
2418
2740
  *
2419
- * When true, Wav2ArkitCpuWorker is used instead of Wav2ArkitCpuInference,
2420
- * running inference off the main thread to prevent UI blocking during
2421
- * model loading and inference.
2741
+ * When true (default), if the Worker fails to load or encounters an error,
2742
+ * the factory will automatically create a main thread instance instead.
2422
2743
  *
2423
- * Only applies when the CPU model is selected (mode: 'cpu', auto on Safari/iOS,
2424
- * or fallback from GPU).
2425
- */
2426
- useWorker?: boolean;
2427
- /**
2428
- * Unified inference worker instance.
2429
- * When provided and CPU model is selected, uses Wav2ArkitCpuUnifiedAdapter.
2430
- * Takes precedence over useWorker setting for the CPU model path.
2431
- * GPU model (Wav2Vec2) always stays on main thread (WebGPU).
2744
+ * When false, worker errors will propagate as exceptions.
2745
+ *
2746
+ * Default: true
2432
2747
  */
2433
- unifiedWorker?: UnifiedInferenceWorker;
2748
+ fallbackOnError?: boolean;
2434
2749
  }
2435
2750
  /**
2436
- * Create an A2E instance with automatic GPU/CPU model selection
2751
+ * Check if the current environment supports VAD Web Workers
2437
2752
  *
2438
- * @param config - Factory configuration
2439
- * @returns An A2EBackend instance (either GPU or CPU model)
2753
+ * Requirements:
2754
+ * - Worker constructor must exist
2755
+ * - Blob URL support (for inline worker script)
2756
+ *
2757
+ * @returns true if VAD Worker is supported
2440
2758
  */
2441
- declare function createA2E(config?: CreateA2EConfig): A2EBackend;
2442
-
2759
+ declare function supportsVADWorker(): boolean;
2443
2760
  /**
2444
- * A2EProcessor — Engine-agnostic audio-to-expression processor
2761
+ * Create a Silero VAD instance with automatic implementation selection
2445
2762
  *
2446
- * The core inference primitive: audio samples in → blendshape frames out.
2447
- * No mic capture, no audio playback, no Web Audio API.
2763
+ * This factory function automatically selects between:
2764
+ * - **SileroVADWorker**: Off-main-thread inference (better for desktop)
2765
+ * - **SileroVADInference**: Main thread inference (better for mobile)
2448
2766
  *
2449
- * This is what Unity/Unreal/Godot/any engine would use directly.
2450
- * Web-specific concerns (mic, AudioContext, scheduling) live in the
2451
- * orchestrator and pipeline layers above.
2767
+ * The selection is based on:
2768
+ * 1. Explicit `useWorker` config (if provided)
2769
+ * 2. Platform detection (mobile vs desktop)
2770
+ * 3. Worker API availability
2452
2771
  *
2453
- * Two output modes:
2454
- * - **Pull mode**: `pushAudio(samples, timestamp)` + `getFrameForTime(t)`
2455
- * For TTS playback where frames are synced to AudioContext clock.
2456
- * - **Push mode**: `pushAudio(samples)` + `startDrip()` + `latestFrame`
2457
- * For live mic / game loop where frames are consumed at ~30fps.
2772
+ * Both implementations share the same interface (SileroVADBackend),
2773
+ * so consumers can use either interchangeably.
2458
2774
  *
2459
- * @category Inference
2775
+ * @param config - Factory configuration
2776
+ * @returns A SileroVAD instance (either Worker or main thread)
2460
2777
  *
2461
- * @example Pull mode (TTS playback)
2778
+ * @example
2462
2779
  * ```typescript
2463
- * const processor = new A2EProcessor({ backend: a2e });
2464
- * processor.pushAudio(samples, audioContext.currentTime + delay);
2465
- * const frame = processor.getFrameForTime(audioContext.currentTime);
2466
- * ```
2780
+ * // Auto-detect (recommended)
2781
+ * const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
2467
2782
  *
2468
- * @example Push mode (live mic)
2469
- * ```typescript
2470
- * const processor = new A2EProcessor({
2471
- * backend: a2e,
2472
- * onFrame: (frame) => applyToAvatar(frame),
2473
- * });
2474
- * processor.startDrip();
2475
- * processor.pushAudio(micSamples); // no timestamp → drip mode
2783
+ * // Force Worker
2784
+ * const vadWorker = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: true });
2785
+ *
2786
+ * // Force main thread
2787
+ * const vadMain = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: false });
2476
2788
  * ```
2477
2789
  */
2790
+ declare function createSileroVAD(config?: SileroVADFactoryConfig): SileroVADBackend;
2478
2791
 
2479
- interface A2EProcessorConfig {
2480
- /** Inference backend */
2481
- backend: A2EBackend;
2482
- /** Sample rate (default: 16000) */
2483
- sampleRate?: number;
2484
- /** Samples per inference chunk (default: 16000 = 1s) */
2485
- chunkSize?: number;
2486
- /**
2487
- * Identity/style index for the A2E model (default: 0).
2488
- *
2489
- * The LAM model uses a one-hot identity vector (12 classes, indices 0-11) as
2490
- * style conditioning alongside audio features. Different indices produce
2491
- * different expression intensity across face regions (brows, eyes, cheeks).
2492
- *
2493
- * Only affects Wav2Vec2Inference (GPU model). Wav2ArkitCpuInference has
2494
- * identity 11 baked into the model weights.
2495
- */
2496
- identityIndex?: number;
2497
- /** Callback fired with each blendshape frame (push mode) */
2498
- onFrame?: (frame: Float32Array) => void;
2499
- /** Error callback */
2500
- onError?: (error: Error) => void;
2501
- }
2502
- declare class A2EProcessor {
2503
- private readonly backend;
2504
- private readonly sampleRate;
2505
- private readonly chunkSize;
2506
- private readonly identityIndex;
2507
- private readonly onFrame?;
2508
- private readonly onError?;
2509
- private bufferCapacity;
2510
- private buffer;
2511
- private writeOffset;
2512
- private bufferStartTime;
2513
- private timestampedQueue;
2514
- private plainQueue;
2515
- private _latestFrame;
2516
- private dripInterval;
2517
- private lastPulledFrame;
2518
- private inferenceRunning;
2519
- private pendingChunks;
2520
- private getFrameCallCount;
2521
- private disposed;
2522
- constructor(config: A2EProcessorConfig);
2523
- /**
2524
- * Push audio samples for inference (any source: mic, TTS, file).
2525
- *
2526
- * - With `timestamp`: frames stored with timestamps (pull mode)
2527
- * - Without `timestamp`: frames stored in plain queue (drip/push mode)
2528
- *
2529
- * Fire-and-forget: returns immediately, inference runs async.
2530
- */
2531
- pushAudio(samples: Float32Array, timestamp?: number): void;
2532
- /**
2533
- * Flush remaining buffered audio (pads to chunkSize).
2534
- * Call at end of stream to process final partial chunk.
2535
- *
2536
- * Routes through the serialized pendingChunks pipeline to maintain
2537
- * correct frame ordering. Without this, flush() could push frames
2538
- * with the latest timestamp to the queue before drainPendingChunks()
2539
- * finishes pushing frames with earlier timestamps — causing
2540
- * getFrameForTime() to see out-of-order timestamps and stall.
2541
- */
2542
- flush(): Promise<void>;
2543
- /**
2544
- * Reset buffer and frame queues
2545
- */
2546
- reset(): void;
2547
- /**
2548
- * Get frame synced to external clock (e.g. AudioContext.currentTime).
2549
- *
2550
- * Discards frames that are too old, returns the current frame,
2551
- * or holds last frame as fallback to prevent avatar freezing.
2552
- *
2553
- * @param currentTime - Current playback time (seconds)
2554
- * @returns Blendshape frame, or null if no frames yet
2555
- */
2556
- getFrameForTime(currentTime: number): Float32Array | null;
2557
- /** Latest frame from drip-feed (live mic, game loop) */
2558
- get latestFrame(): Float32Array | null;
2559
- /** Start 30fps drip-feed timer (push mode) */
2560
- startDrip(): void;
2561
- /** Stop drip-feed timer */
2562
- stopDrip(): void;
2563
- /** Number of frames waiting in queue (both modes combined) */
2564
- get queuedFrameCount(): number;
2565
- /** Buffer fill level as fraction of chunkSize (0-1) */
2566
- get fillLevel(): number;
2567
- /** Dispose resources */
2568
- dispose(): void;
2569
- /**
2570
- * Process pending chunks sequentially.
2571
- * Fire-and-forget — called from pushAudio() without awaiting.
2572
- */
2573
- private drainPendingChunks;
2574
- private handleError;
2792
+ /**
2793
+ * SenseVoice adapter backed by UnifiedInferenceWorker
2794
+ *
2795
+ * Implements SenseVoiceBackend, delegating all inference to the shared worker.
2796
+ */
2797
+
2798
+ declare class SenseVoiceUnifiedAdapter implements SenseVoiceBackend {
2799
+ private worker;
2800
+ private config;
2801
+ private _isLoaded;
2802
+ private loadedGeneration;
2803
+ private languageId;
2804
+ private textNormId;
2805
+ /** Per-adapter inference queue — ensures sequential state updates. */
2806
+ private inferenceQueue;
2807
+ constructor(worker: UnifiedInferenceWorker, config: SenseVoiceWorkerConfig);
2808
+ get isLoaded(): boolean;
2809
+ get backend(): 'wasm' | null;
2810
+ load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
2811
+ transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
2812
+ dispose(): Promise<void>;
2813
+ private assertLoaded;
2575
2814
  }
2576
2815
 
2577
2816
  /**
2578
- * BlendshapeSmoother — Per-channel critically damped spring for 52 ARKit blendshapes
2817
+ * Wav2ArkitCpu adapter backed by UnifiedInferenceWorker
2579
2818
  *
2580
- * Eliminates frame gaps between inference batches by smoothly interpolating
2581
- * blendshape weights using critically damped springs (the game industry standard).
2819
+ * Implements A2EBackend, delegating all inference to the shared worker.
2820
+ */
2821
+
2822
+ declare class Wav2ArkitCpuUnifiedAdapter implements A2EBackend {
2823
+ readonly modelId: "wav2arkit_cpu";
2824
+ readonly chunkSize: number;
2825
+ private worker;
2826
+ private config;
2827
+ private _isLoaded;
2828
+ private loadedGeneration;
2829
+ /** Per-adapter inference queue — ensures sequential state updates. */
2830
+ private inferenceQueue;
2831
+ constructor(worker: UnifiedInferenceWorker, config: Wav2ArkitCpuWorkerConfig);
2832
+ get isLoaded(): boolean;
2833
+ get backend(): RuntimeBackend | null;
2834
+ load(): Promise<A2EModelInfo>;
2835
+ infer(audioSamples: Float32Array, _identityIndex?: number): Promise<A2EResult>;
2836
+ dispose(): Promise<void>;
2837
+ private assertLoaded;
2838
+ }
2839
+
2840
+ /**
2841
+ * Wav2Vec2 (LAM) adapter backed by UnifiedInferenceWorker
2582
2842
  *
2583
- * Each of the 52 blendshape channels has its own spring with position + velocity
2584
- * state. When a new inference frame arrives, spring targets are updated. Between
2585
- * frames, springs continue converging toward the last target — no frozen face.
2843
+ * Implements A2EBackend, delegating all inference to the shared worker.
2844
+ * Used on iOS to run LAM inference off the main thread via the unified worker.
2845
+ */
2846
+
2847
+ declare class Wav2Vec2UnifiedAdapter implements A2EBackend {
2848
+ readonly modelId: "wav2vec2";
2849
+ readonly chunkSize: number;
2850
+ private worker;
2851
+ private modelUrl;
2852
+ private externalDataUrl;
2853
+ private numIdentityClasses;
2854
+ private _isLoaded;
2855
+ private loadedGeneration;
2856
+ /** Per-adapter inference queue — ensures sequential state updates. */
2857
+ private inferenceQueue;
2858
+ constructor(worker: UnifiedInferenceWorker, config: {
2859
+ modelUrl: string;
2860
+ externalDataUrl?: string | false;
2861
+ numIdentityClasses?: number;
2862
+ chunkSize?: number;
2863
+ });
2864
+ get isLoaded(): boolean;
2865
+ get backend(): RuntimeBackend | null;
2866
+ load(): Promise<A2EModelInfo>;
2867
+ infer(audioSamples: Float32Array, identityIndex?: number): Promise<A2EResult>;
2868
+ dispose(): Promise<void>;
2869
+ private assertLoaded;
2870
+ }
2871
+
2872
+ /**
2873
+ * Kokoro TTS inference using ONNX Runtime Web
2586
2874
  *
2587
- * When inference stalls, `decayToNeutral()` sets all targets to 0, and the
2588
- * springs smoothly close the mouth / relax the face over the halflife period.
2875
+ * Pure ONNX pipeline for browser-based text-to-speech. No transformers.js dependency.
2876
+ * Uses eSpeak-NG WASM for phonemization and Kokoro-82M (q8, 92MB) for synthesis.
2589
2877
  *
2590
- * Math from Daniel Holden's "Spring-It-On" (Epic Games):
2591
- * https://theorangeduck.com/page/spring-roll-call
2878
+ * Pipeline: Text → Normalize → Phonemize (eSpeak WASM) → Tokenize → Voice Style → ONNX → Audio
2592
2879
  *
2593
2880
  * @category Inference
2594
2881
  *
2595
2882
  * @example Basic usage
2596
2883
  * ```typescript
2597
- * const smoother = new BlendshapeSmoother({ halflife: 0.06 });
2884
+ * import { KokoroTTSInference } from '@omote/core';
2598
2885
  *
2599
- * // In frame loop (60fps):
2600
- * smoother.setTarget(inferenceFrame); // when new frame arrives
2601
- * const smoothed = smoother.update(1/60); // every render frame
2602
- * applyToAvatar(smoothed);
2886
+ * const tts = new KokoroTTSInference({ defaultVoice: 'af_heart' });
2887
+ * await tts.load();
2888
+ *
2889
+ * const { audio, duration } = await tts.synthesize("Hello world");
2890
+ * // audio: Float32Array @ 24kHz
2891
+ * ```
2892
+ *
2893
+ * @example Streaming (sentence-by-sentence)
2894
+ * ```typescript
2895
+ * for await (const chunk of tts.stream("First sentence. Second sentence.")) {
2896
+ * playbackPipeline.feedBuffer(chunk.audio);
2897
+ * }
2603
2898
  * ```
2899
+ *
2900
+ * @module inference/KokoroTTSInference
2604
2901
  */
2605
- interface BlendshapeSmootherConfig {
2902
+
2903
+ interface KokoroTTSConfig {
2904
+ /** ONNX model URL (default: HF CDN q8, 92MB) */
2905
+ modelUrl?: string;
2906
+ /** Voice files base URL (default: HF CDN voices directory) */
2907
+ voiceBaseUrl?: string;
2908
+ /** Default voice (default: 'af_heart') */
2909
+ defaultVoice?: string;
2910
+ /** Backend preference (default: 'wasm' — WebGPU crashes on int64 input_ids) */
2911
+ backend?: BackendPreference;
2912
+ /** Speech speed multiplier (default: 1.0) */
2913
+ speed?: number;
2914
+ }
2915
+ interface KokoroTTSResult {
2916
+ /** Audio samples at 24kHz */
2917
+ audio: Float32Array;
2918
+ /** Duration in seconds */
2919
+ duration: number;
2920
+ /** Inference time in ms */
2921
+ inferenceTimeMs: number;
2922
+ }
2923
+ interface KokoroStreamChunk {
2924
+ /** Audio for this sentence */
2925
+ audio: Float32Array;
2926
+ /** Original text segment */
2927
+ text: string;
2928
+ /** Phonemes for this segment */
2929
+ phonemes: string;
2930
+ /** Duration in seconds */
2931
+ duration: number;
2932
+ }
2933
+ interface KokoroTTSModelInfo {
2934
+ /** Resolved backend */
2935
+ backend: string;
2936
+ /** Model load time in ms */
2937
+ loadTimeMs: number;
2938
+ /** Default voice */
2939
+ defaultVoice: string;
2940
+ }
2941
+ interface SynthesizeOptions {
2942
+ /** Voice to use (overrides defaultVoice) */
2943
+ voice?: string;
2944
+ /** Speed multiplier (overrides config speed) */
2945
+ speed?: number;
2946
+ }
2947
+ declare class KokoroTTSInference implements TTSBackend {
2948
+ private readonly config;
2949
+ private readonly modelUrl;
2950
+ private readonly voiceBaseUrl;
2951
+ private ort;
2952
+ private session;
2953
+ private _backend;
2954
+ private isLoading;
2955
+ private poisoned;
2956
+ private inferenceQueue;
2957
+ private phonemizerReady;
2958
+ private defaultVoiceLoaded;
2959
+ /** Cached voice data (voice name → Float32Array) */
2960
+ private loadedVoices;
2961
+ constructor(config?: KokoroTTSConfig);
2962
+ get isLoaded(): boolean;
2963
+ get sampleRate(): number;
2606
2964
  /**
2607
- * Spring halflife in seconds — time for the distance to the target
2608
- * to reduce by half. Lower = snappier, higher = smoother.
2609
- *
2610
- * - 0.04s (40ms): Very snappy, slight jitter on fast transitions
2611
- * - 0.06s (60ms): Sweet spot for lip sync (default)
2612
- * - 0.10s (100ms): Very smooth, slight lag on fast consonants
2613
- * - 0: Bypass mode — passes through raw target values (no smoothing)
2614
- *
2615
- * Default: 0.06
2965
+ * Load the ONNX model, phonemizer WASM, and default voice.
2966
+ * Safe to call multiple times (no-ops after first successful load).
2616
2967
  */
2617
- halflife?: number;
2618
- }
2619
- declare class BlendshapeSmoother {
2620
- private readonly halflife;
2621
- /** Current smoothed blendshape values */
2622
- private values;
2623
- /** Per-channel spring velocities */
2624
- private velocities;
2625
- /** Current spring targets (from latest inference frame) */
2626
- private targets;
2627
- /** Whether any target has been set */
2628
- private _hasTarget;
2629
- constructor(config?: BlendshapeSmootherConfig);
2630
- /** Whether a target frame has been set (false until first setTarget call) */
2631
- get hasTarget(): boolean;
2968
+ load(): Promise<KokoroTTSModelInfo>;
2632
2969
  /**
2633
- * Set new target frame from inference output.
2634
- * Springs will converge toward these values on subsequent update() calls.
2970
+ * Lazily initialize phonemizer and default voice on first use.
2971
+ * Moves 100-200ms of main-thread blocking out of load() into first synthesis.
2635
2972
  */
2636
- setTarget(frame: Float32Array): void;
2973
+ private ensureReady;
2637
2974
  /**
2638
- * Advance all 52 springs by `dt` seconds and return the smoothed frame.
2639
- *
2640
- * Call this every render frame (e.g., inside requestAnimationFrame).
2641
- * Returns the internal values buffer — do NOT mutate the returned array.
2975
+ * Synthesize speech from text (one-shot, full audio output).
2642
2976
  *
2643
- * @param dt - Time step in seconds (e.g., 1/60 for 60fps)
2644
- * @returns Smoothed blendshape values (Float32Array of 52)
2977
+ * @param text - Input text to synthesize
2978
+ * @param options - Voice and speed overrides
2979
+ * @returns Audio Float32Array at 24kHz with duration
2645
2980
  */
2646
- update(dt: number): Float32Array;
2981
+ synthesize(text: string, options?: SynthesizeOptions): Promise<KokoroTTSResult>;
2647
2982
  /**
2648
- * Decay all spring targets to neutral (0).
2983
+ * Stream synthesis sentence-by-sentence (async generator).
2984
+ * Splits text on sentence boundaries and yields audio for each.
2649
2985
  *
2650
- * Call when inference stalls (no new frames for threshold duration).
2651
- * The springs will smoothly close the mouth / relax the face over
2652
- * the halflife period rather than freezing.
2986
+ * Compatible with both `SynthesizeOptions` (legacy) and `TTSStreamOptions` (TTSBackend).
2987
+ *
2988
+ * @param text - Input text (can be multiple sentences)
2989
+ * @param options - Voice, speed, and abort signal overrides
2653
2990
  */
2654
- decayToNeutral(): void;
2991
+ stream(text: string, options?: SynthesizeOptions & TTSStreamOptions): AsyncGenerator<KokoroStreamChunk & TTSChunk>;
2655
2992
  /**
2656
- * Reset all state (values, velocities, targets).
2657
- * Call when starting a new playback session.
2993
+ * Preload a voice (fetches and caches the .bin file).
2658
2994
  */
2659
- reset(): void;
2995
+ preloadVoice(voiceName: string): Promise<void>;
2996
+ /**
2997
+ * List available voice names.
2998
+ */
2999
+ listVoices(): string[];
3000
+ /**
3001
+ * Release the ONNX session and clear cached voices.
3002
+ */
3003
+ dispose(): Promise<void>;
3004
+ private ensureVoice;
3005
+ private queueInference;
3006
+ private runInference;
3007
+ }
3008
+
3009
+ /**
3010
+ * Kokoro TTS adapter backed by UnifiedInferenceWorker
3011
+ *
3012
+ * Implements TTSBackend, delegating ONNX inference to the shared worker.
3013
+ * Phonemization, tokenization, and voice loading stay on main thread (fast, <10ms).
3014
+ * Only the heavy `session.run()` (~1-2s per sentence) goes to the worker.
3015
+ */
3016
+
3017
+ declare class KokoroTTSUnifiedAdapter implements TTSBackend {
3018
+ private worker;
3019
+ private readonly config;
3020
+ private readonly modelUrl;
3021
+ private readonly voiceBaseUrl;
3022
+ private _isLoaded;
3023
+ private loadedGeneration;
3024
+ /** Per-adapter inference queue — ensures sequential state updates. */
3025
+ private inferenceQueue;
3026
+ private loadedVoices;
3027
+ private phonemizerReady;
3028
+ private defaultVoiceLoaded;
3029
+ constructor(worker: UnifiedInferenceWorker, config?: KokoroTTSConfig);
3030
+ get isLoaded(): boolean;
3031
+ get sampleRate(): number;
3032
+ load(): Promise<KokoroTTSModelInfo>;
3033
+ stream(text: string, options?: SynthesizeOptions & TTSStreamOptions): AsyncGenerator<KokoroStreamChunk & TTSChunk>;
3034
+ dispose(): Promise<void>;
3035
+ private ensureVoice;
3036
+ private assertLoaded;
3037
+ private runWorkerInference;
3038
+ }
3039
+
3040
+ /**
3041
+ * Silero VAD adapter backed by UnifiedInferenceWorker
3042
+ *
3043
+ * Implements SileroVADBackend, delegating all inference to the shared worker.
3044
+ */
3045
+
3046
+ declare class SileroVADUnifiedAdapter implements SileroVADBackend {
3047
+ private worker;
3048
+ private config;
3049
+ private _isLoaded;
3050
+ private loadedGeneration;
3051
+ private state;
3052
+ private context;
3053
+ private readonly chunkSize;
3054
+ private readonly contextSize;
3055
+ /**
3056
+ * Per-adapter inference queue — ensures sequential state updates.
3057
+ *
3058
+ * The unified worker processes messages serially (single thread), but this queue
3059
+ * guarantees per-adapter state consistency. Example: VAD LSTM state from call N
3060
+ * must be applied before call N+1 starts. Without the queue, two rapid process()
3061
+ * calls could both read the same stale state.
3062
+ */
3063
+ private inferenceQueue;
3064
+ private preSpeechBuffer;
3065
+ private wasSpeaking;
3066
+ constructor(worker: UnifiedInferenceWorker, config: SileroVADConfig);
3067
+ get isLoaded(): boolean;
3068
+ get backend(): RuntimeBackend | null;
3069
+ get sampleRate(): number;
3070
+ get threshold(): number;
3071
+ getChunkSize(): number;
3072
+ getChunkDurationMs(): number;
3073
+ load(): Promise<VADWorkerModelInfo>;
3074
+ process(audioChunk: Float32Array): Promise<VADResult>;
3075
+ reset(): Promise<void>;
3076
+ dispose(): Promise<void>;
3077
+ private assertLoaded;
2660
3078
  }
2661
3079
 
2662
3080
  /**
@@ -2975,6 +3393,175 @@ declare class SafariSpeechRecognition {
2975
3393
  private emitError;
2976
3394
  }
2977
3395
 
3396
+ /**
3397
+ * Kokoro TTS Web Worker implementation
3398
+ *
3399
+ * Moves the heavy ONNX `session.run()` to a dedicated Web Worker to prevent
3400
+ * main thread blocking (~1-2s per sentence on WASM). Phonemizer, tokenizer,
3401
+ * and voice logic stay on the main thread (fast, <10ms combined).
3402
+ *
3403
+ * Architecture:
3404
+ * ```
3405
+ * Main Thread (KokoroTTSWorker): Worker (WORKER_SCRIPT):
3406
+ * stream(text) →
3407
+ * splitSentences(text)
3408
+ * for each sentence:
3409
+ * phonemize(sentence) → phonemes
3410
+ * tokenize(phonemes) → tokens
3411
+ * ensureVoice() → style
3412
+ * postMessage(tokens, style, speed) ──→ session.run(feeds)
3413
+ * await result ←── postMessage(audio)
3414
+ * yield {audio, text, phonemes, duration}
3415
+ * ```
3416
+ *
3417
+ * @category Inference
3418
+ *
3419
+ * @example Basic usage
3420
+ * ```typescript
3421
+ * import { KokoroTTSWorker } from '@omote/core';
3422
+ *
3423
+ * const tts = new KokoroTTSWorker({ defaultVoice: 'af_heart' });
3424
+ * await tts.load();
3425
+ *
3426
+ * for await (const chunk of tts.stream("Hello world!")) {
3427
+ * playbackPipeline.feedBuffer(chunk.audio);
3428
+ * }
3429
+ * ```
3430
+ *
3431
+ * @module inference/KokoroTTSWorker
3432
+ */
3433
+
3434
+ /**
3435
+ * Kokoro TTS Worker — off-main-thread ONNX inference for non-blocking TTS.
3436
+ *
3437
+ * Phonemizer/tokenizer/voice logic run on the main thread (fast, <10ms).
3438
+ * Only the heavy ONNX `session.run()` is delegated to the worker.
3439
+ *
3440
+ * Implements the same TTSBackend interface as KokoroTTSInference.
3441
+ *
3442
+ * @see KokoroTTSInference for main-thread version
3443
+ */
3444
+ declare class KokoroTTSWorker implements TTSBackend {
3445
+ private readonly config;
3446
+ private readonly modelUrl;
3447
+ private readonly voiceBaseUrl;
3448
+ private worker;
3449
+ private _isLoaded;
3450
+ private isLoading;
3451
+ private poisoned;
3452
+ /** Serializes all worker calls (stream sentence chunks + synthesize) */
3453
+ private inferenceQueue;
3454
+ /** Cached voice data (voice name → Float32Array) */
3455
+ private loadedVoices;
3456
+ /** Pending message handlers */
3457
+ private pendingResolvers;
3458
+ constructor(config?: KokoroTTSConfig);
3459
+ get isLoaded(): boolean;
3460
+ get sampleRate(): number;
3461
+ load(): Promise<KokoroTTSModelInfo>;
3462
+ synthesize(text: string, options?: SynthesizeOptions): Promise<KokoroTTSResult>;
3463
+ stream(text: string, options?: SynthesizeOptions & TTSStreamOptions): AsyncGenerator<KokoroStreamChunk & TTSChunk>;
3464
+ preloadVoice(voiceName: string): Promise<void>;
3465
+ listVoices(): string[];
3466
+ dispose(): Promise<void>;
3467
+ static isSupported(): boolean;
3468
+ private ensureVoice;
3469
+ private createWorker;
3470
+ private handleWorkerMessage;
3471
+ private sendMessage;
3472
+ /**
3473
+ * Queue worker inference through the serialization queue.
3474
+ * Sends pre-computed tokens + style to worker, returns audio.
3475
+ */
3476
+ private runWorkerInference;
3477
+ /**
3478
+ * One-shot synthesis (phonemize + tokenize + worker inference).
3479
+ */
3480
+ private queueInference;
3481
+ }
3482
+
3483
+ /**
3484
+ * Factory function for Kokoro TTS with automatic Worker vs main thread selection
3485
+ *
3486
+ * Provides a unified API that automatically selects the optimal implementation:
3487
+ * - Desktop: Uses KokoroTTSWorker (off-main-thread inference, no render hitching)
3488
+ * - iOS: Uses KokoroTTSInference (main thread, shared ORT instance to avoid OOM)
3489
+ *
3490
+ * @category Inference
3491
+ *
3492
+ * @example Auto-detect (recommended)
3493
+ * ```typescript
3494
+ * import { createKokoroTTS } from '@omote/core';
3495
+ *
3496
+ * const tts = createKokoroTTS({ defaultVoice: 'af_heart' });
3497
+ * await tts.load();
3498
+ *
3499
+ * for await (const chunk of tts.stream("Hello world!")) {
3500
+ * playbackPipeline.feedBuffer(chunk.audio);
3501
+ * }
3502
+ * ```
3503
+ *
3504
+ * @example Force worker
3505
+ * ```typescript
3506
+ * const tts = createKokoroTTS({ defaultVoice: 'af_heart', useWorker: true });
3507
+ * ```
3508
+ *
3509
+ * @example Force main thread
3510
+ * ```typescript
3511
+ * const tts = createKokoroTTS({ defaultVoice: 'af_heart', useWorker: false });
3512
+ * ```
3513
+ */
3514
+
3515
+ /**
3516
+ * Configuration for the Kokoro TTS factory
3517
+ */
3518
+ interface CreateKokoroTTSConfig extends KokoroTTSConfig, InferenceFactoryConfig {
3519
+ }
3520
+ /**
3521
+ * Create a Kokoro TTS instance with automatic implementation selection.
3522
+ *
3523
+ * @param config - Factory configuration
3524
+ * @returns A TTSBackend instance (either Worker or main thread)
3525
+ */
3526
+ declare function createKokoroTTS(config?: CreateKokoroTTSConfig): TTSBackend;
3527
+
3528
+ /** Available Kokoro v1.0 voices */
3529
+ declare const KOKORO_VOICES: {
3530
+ readonly af_heart: "af_heart";
3531
+ readonly af_alloy: "af_alloy";
3532
+ readonly af_aoede: "af_aoede";
3533
+ readonly af_bella: "af_bella";
3534
+ readonly af_jessica: "af_jessica";
3535
+ readonly af_kore: "af_kore";
3536
+ readonly af_nicole: "af_nicole";
3537
+ readonly af_nova: "af_nova";
3538
+ readonly af_river: "af_river";
3539
+ readonly af_sarah: "af_sarah";
3540
+ readonly af_sky: "af_sky";
3541
+ readonly am_adam: "am_adam";
3542
+ readonly am_echo: "am_echo";
3543
+ readonly am_eric: "am_eric";
3544
+ readonly am_fenrir: "am_fenrir";
3545
+ readonly am_liam: "am_liam";
3546
+ readonly am_michael: "am_michael";
3547
+ readonly am_onyx: "am_onyx";
3548
+ readonly am_puck: "am_puck";
3549
+ readonly am_santa: "am_santa";
3550
+ readonly bf_alice: "bf_alice";
3551
+ readonly bf_emma: "bf_emma";
3552
+ readonly bf_isabella: "bf_isabella";
3553
+ readonly bf_lily: "bf_lily";
3554
+ readonly bm_daniel: "bm_daniel";
3555
+ readonly bm_fable: "bm_fable";
3556
+ readonly bm_george: "bm_george";
3557
+ readonly bm_lewis: "bm_lewis";
3558
+ };
3559
+ type KokoroVoiceName = keyof typeof KOKORO_VOICES;
3560
+ /**
3561
+ * List all available voice names.
3562
+ */
3563
+ declare function listVoices(): string[];
3564
+
2978
3565
  /**
2979
3566
  * Emotion - Helper for creating emotion vectors for avatar animation
2980
3567
  *
@@ -4404,12 +4991,15 @@ declare const ALL_AUS: string[];
4404
4991
  */
4405
4992
 
4406
4993
  /**
4407
- * Resolved emotion split into upper and lower face contributions
4994
+ * Resolved emotion split into upper and lower face contributions.
4995
+ *
4996
+ * WARNING: Buffers are owned by EmotionResolver and are overwritten
4997
+ * on the next resolve() call. Copy if you need to retain values.
4408
4998
  */
4409
4999
  interface ResolvedEmotion {
4410
- /** 52 channels — only upper face (brows, eyes, cheeks, nose) non-zero */
5000
+ /** 52 channels — only upper face non-zero. Valid until next resolve() call. */
4411
5001
  upper: Float32Array;
4412
- /** 52 channels — only lower face (mouth, jaw) non-zero */
5002
+ /** 52 channels — only lower face non-zero. Valid until next resolve() call. */
4413
5003
  lower: Float32Array;
4414
5004
  }
4415
5005
  /**
@@ -4444,6 +5034,28 @@ declare class EmotionResolver {
4444
5034
  * @category Face
4445
5035
  */
4446
5036
 
5037
+ /**
5038
+ * Output of FaceCompositor.compose()
5039
+ *
5040
+ * WARNING: When using the internal output buffer (no `target` param),
5041
+ * `blendshapes` is a shared reference that is overwritten on the next
5042
+ * compose() call. Copy with `new Float32Array(output.blendshapes)` if
5043
+ * you need to retain values across frames.
5044
+ */
5045
+ interface FaceCompositorOutput {
5046
+ /**
5047
+ * 52 ARKit blendshape values, clamped [0,1].
5048
+ *
5049
+ * This buffer is reused across calls when no `target` parameter is
5050
+ * provided to compose(). Valid until the next compose() call.
5051
+ */
5052
+ blendshapes: Float32Array;
5053
+ /** Head rotation deltas in radians (from ProceduralLifeLayer) */
5054
+ headDelta: {
5055
+ yaw: number;
5056
+ pitch: number;
5057
+ };
5058
+ }
4447
5059
  /**
4448
5060
  * Per-blendshape character profile (multiplier + offset)
4449
5061
  *
@@ -4494,13 +5106,14 @@ interface FaceCompositorInput extends LifeLayerInput {
4494
5106
  * audioEnergy: 0.5,
4495
5107
  * });
4496
5108
  *
4497
- * // Apply output[0..51] to avatar morphTargetInfluences
5109
+ * // Apply output.blendshapes[0..51] to avatar morphTargetInfluences
4498
5110
  * ```
4499
5111
  */
4500
5112
  declare class FaceCompositor {
4501
5113
  private readonly emotionResolver;
4502
5114
  private readonly lifeLayer;
4503
5115
  private readonly emotionSmoothing;
5116
+ private readonly outputBuffer;
4504
5117
  private readonly smoothedUpper;
4505
5118
  private readonly smoothedLower;
4506
5119
  private readonly lifeBuffer;
@@ -4513,9 +5126,11 @@ declare class FaceCompositor {
4513
5126
  *
4514
5127
  * @param base - A2E raw output (Float32Array[52], LAM_BLENDSHAPES order)
4515
5128
  * @param input - Per-frame input (deltaTime, emotion, life layer params)
4516
- * @returns Float32Array[52] with all values clamped to [0, 1]
5129
+ * @param target - Optional pre-allocated output buffer (avoids per-frame allocation).
5130
+ * When omitted, an internal buffer is used (valid until next compose() call).
5131
+ * @returns Blendshapes (Float32Array[52] clamped [0,1]) and head rotation deltas
4517
5132
  */
4518
- compose(base: Float32Array, input: FaceCompositorInput): Float32Array;
5133
+ compose(base: Float32Array, input: FaceCompositorInput, target?: Float32Array): FaceCompositorOutput;
4519
5134
  /**
4520
5135
  * Set sticky emotion (used when input.emotion is not provided).
4521
5136
  */
@@ -4532,6 +5147,123 @@ declare class FaceCompositor {
4532
5147
  private applyProfileArrays;
4533
5148
  }
4534
5149
 
5150
+ /**
5151
+ * CharacterController — Renderer-agnostic avatar composition loop
5152
+ *
5153
+ * Extracted from r3f's useOmoteAvatar + useGazeTracking.
5154
+ * Owns FaceCompositor, emotion resolution, eye angle math, head smoothing.
5155
+ * Renderer-agnostic: input → output with no renderer side effects (internal smoothing state persists between frames; see reset()).
5156
+ *
5157
+ * @category Character
5158
+ */
5159
+
5160
+ /**
5161
+ * Convert an emotion label string or EmotionWeights object to EmotionWeights.
5162
+ * Cached to avoid per-frame string allocation.
5163
+ */
5164
+ declare function resolveEmotion(emotion: string | EmotionWeights | null | undefined): EmotionWeights | undefined;
5165
+ /** Simple 3D vector (renderer-agnostic) */
5166
+ interface Vec3 {
5167
+ x: number;
5168
+ y: number;
5169
+ z: number;
5170
+ }
5171
+ /** Quaternion (renderer-agnostic, for head rotation) */
5172
+ interface Quat {
5173
+ x: number;
5174
+ y: number;
5175
+ z: number;
5176
+ w: number;
5177
+ }
5178
+ interface CharacterControllerConfig {
5179
+ /** FaceCompositor configuration */
5180
+ compositor?: FaceCompositorConfig;
5181
+ /** Gaze tracking config */
5182
+ gaze?: {
5183
+ enabled?: boolean;
5184
+ yawInfluence?: number;
5185
+ pitchInfluence?: number;
5186
+ smoothing?: number;
5187
+ };
5188
+ }
5189
+ interface CharacterUpdateInput {
5190
+ /** Time since last frame in seconds */
5191
+ deltaTime: number;
5192
+ /** Scaled blendshapes from pipeline frame (or null when no frame) */
5193
+ baseBlendshapes: Float32Array | null;
5194
+ /** Raw blendshapes before profile scaling (optional) */
5195
+ rawBlendshapes?: Float32Array | null;
5196
+ /** Current emotion (string preset or weights object) */
5197
+ emotion?: string | EmotionWeights | null;
5198
+ /** Whether the avatar is currently speaking */
5199
+ isSpeaking: boolean;
5200
+ /** Current conversational state */
5201
+ state: ConversationalState;
5202
+ /** Audio energy level (0-1, drives emphasis/gesture intensity) */
5203
+ audioEnergy?: number;
5204
+ /** Camera world position (renderer provides in its own coords) */
5205
+ cameraWorldPos?: Vec3;
5206
+ /** Head bone world position (renderer provides in its own coords) */
5207
+ headWorldPos?: Vec3;
5208
+ /** Head bone world quaternion (for eye gaze local-space transform) */
5209
+ headWorldQuat?: Quat;
5210
+ /** Current avatar Y rotation in radians (for gaze compensation) */
5211
+ avatarRotationY?: number;
5212
+ }
5213
+ interface CharacterUpdateOutput {
5214
+ /** 52 ARKit blendshape values, clamped [0,1] — apply to morph targets */
5215
+ blendshapes: Float32Array;
5216
+ /** Head rotation delta (radians) — apply to head bone */
5217
+ headDelta: {
5218
+ yaw: number;
5219
+ pitch: number;
5220
+ };
5221
+ /** Normalized eye targets for eye blendshapes */
5222
+ eyeTargets: {
5223
+ x: number;
5224
+ y: number;
5225
+ };
5226
+ }
5227
+ declare class CharacterController {
5228
+ private readonly _compositor;
5229
+ private readonly gazeEnabled;
5230
+ private readonly gazeYawInfluence;
5231
+ private readonly gazePitchInfluence;
5232
+ private readonly gazeSmoothing;
5233
+ private readonly zeroBase;
5234
+ private readonly outputBuffer;
5235
+ private readonly compositorInput;
5236
+ private gazeHeadYaw;
5237
+ private gazeHeadPitch;
5238
+ constructor(config?: CharacterControllerConfig);
5239
+ /**
5240
+ * Call each frame. Maps input → output with no renderer side effects; not strictly pure, because smoothing state carries over between calls (see reset()).
5241
+ *
5242
+ * Composes A2E blendshapes, emotion, procedural life, gaze tracking
5243
+ * into a single output frame.
5244
+ */
5245
+ update(input: CharacterUpdateInput): CharacterUpdateOutput;
5246
+ /** Set emotion (string preset or weights object). */
5247
+ setEmotion(emotion: string | EmotionWeights): void;
5248
+ /** Update character profile at runtime. */
5249
+ setProfile(profile: CharacterProfile): void;
5250
+ /** Access underlying FaceCompositor for advanced use. */
5251
+ get compositor(): FaceCompositor;
5252
+ /** Reset all state (smoothing, life layer, emotions). */
5253
+ reset(): void;
5254
+ dispose(): void;
5255
+ /**
5256
+ * Compute normalized eye targets from camera and head positions.
5257
+ * Pure atan2/asin math — no renderer dependency.
5258
+ */
5259
+ private computeEyeTargets;
5260
+ /**
5261
+ * Compute smoothed head rotation. Returns target yaw/pitch values.
5262
+ * Renderer is responsible for applying these to the head bone.
5263
+ */
5264
+ private computeHeadGaze;
5265
+ }
5266
+
4535
5267
  /**
4536
5268
  * MicLipSync - Microphone → VAD → A2E → blendshapes
4537
5269
  *
@@ -4594,6 +5326,7 @@ declare class MicLipSync extends EventEmitter<MicLipSyncEvents> {
4594
5326
  private _currentFrame;
4595
5327
  private _currentRawFrame;
4596
5328
  private profile;
5329
+ private _firstFrameEmitted;
4597
5330
  private speechStartTime;
4598
5331
  private vadChunkSize;
4599
5332
  private vadBuffer;
@@ -4677,9 +5410,19 @@ interface ResponseHandler {
4677
5410
  * @category Orchestration
4678
5411
  */
4679
5412
 
4680
- interface VoicePipelineConfig {
4681
- /** URLs and options for model loading */
4682
- models: {
5413
+ /** Shared config options for all VoicePipeline modes */
5414
+ interface VoicePipelineBaseConfig {
5415
+ /** Pre-built backends — skip internal factory creation. Takes precedence over `models`. */
5416
+ backends?: {
5417
+ asr: SenseVoiceBackend;
5418
+ lam: A2EBackend;
5419
+ vad: SileroVADBackend;
5420
+ tts?: TTSBackend;
5421
+ };
5422
+ /** External unified worker (reuse across pipelines). Takes precedence over internal creation. */
5423
+ unifiedWorker?: UnifiedInferenceWorker;
5424
+ /** URLs and options for model loading. Required when `backends` is not provided. */
5425
+ models?: {
4683
5426
  senseVoice: {
4684
5427
  modelUrl: string;
4685
5428
  tokensUrl?: string;
@@ -4697,8 +5440,6 @@ interface VoicePipelineConfig {
4697
5440
  preSpeechBufferChunks?: number;
4698
5441
  };
4699
5442
  };
4700
- /** Consumer's response handler */
4701
- onResponse: ResponseHandler;
4702
5443
  /** Per-character expression weight scaling */
4703
5444
  profile?: ExpressionProfile;
4704
5445
  /** Identity/style index for Wav2Vec2 (default: 0) */
@@ -4740,6 +5481,46 @@ interface VoicePipelineConfig {
4740
5481
  /** Duration of neutral fade-out (default: 250ms) */
4741
5482
  neutralTransitionMs?: number;
4742
5483
  }
5484
+ /** Cloud TTS mode: consumer handles response + audio streaming */
5485
+ interface VoicePipelineCloudConfig extends VoicePipelineBaseConfig {
5486
+ mode: 'cloud';
5487
+ /** Consumer's response handler (streams audio back) */
5488
+ onResponse: ResponseHandler;
5489
+ }
5490
+ /** Local TTS mode: SDK handles synthesis internally via TTSBackend */
5491
+ interface VoicePipelineLocalConfig extends VoicePipelineBaseConfig {
5492
+ mode: 'local';
5493
+ /**
5494
+ * TTS backend (e.g., KokoroTTSInference). Provide either `tts` or `ttsConfig`.
5495
+ *
5496
+ * When `tts` is provided, VoicePipeline uses it as-is. On iOS, this means
5497
+ * inference runs on the main thread (may cause UI freezes).
5498
+ *
5499
+ * Prefer `ttsConfig` for automatic unified worker integration on iOS.
5500
+ */
5501
+ tts?: TTSBackend;
5502
+ /**
5503
+ * Kokoro TTS configuration. When provided, VoicePipeline creates the TTS
5504
+ * internally and passes the unified worker on iOS for off-main-thread inference.
5505
+ *
5506
+ * Takes precedence over `tts` if both are provided.
5507
+ */
5508
+ ttsConfig?: {
5509
+ defaultVoice?: string;
5510
+ speed?: number;
5511
+ modelUrl?: string;
5512
+ voiceBaseUrl?: string;
5513
+ };
5514
+ /** Optional text transform (e.g., LLM call). Receives transcript, returns response text. */
5515
+ onTranscript?: (text: string) => string | Promise<string>;
5516
+ }
5517
+ /** @deprecated Use mode: 'cloud' explicitly. Legacy config (no mode field) — treated as cloud mode. */
5518
+ interface VoicePipelineLegacyConfig extends VoicePipelineBaseConfig {
5519
+ mode?: undefined;
5520
+ /** Consumer's response handler */
5521
+ onResponse: ResponseHandler;
5522
+ }
5523
+ type VoicePipelineConfig = VoicePipelineCloudConfig | VoicePipelineLocalConfig | VoicePipelineLegacyConfig;
4743
5524
  interface VoicePipelineEvents {
4744
5525
  'state': VoicePipelineState;
4745
5526
  'loading:progress': LoadingProgress;
@@ -4764,6 +5545,7 @@ interface VoicePipelineEvents {
4764
5545
  }
4765
5546
  declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
4766
5547
  private readonly config;
5548
+ private readonly isLocalMode;
4767
5549
  private _state;
4768
5550
  private stopped;
4769
5551
  private epoch;
@@ -4798,6 +5580,15 @@ declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
4798
5580
  get sessionId(): string | null;
4799
5581
  constructor(config: VoicePipelineConfig);
4800
5582
  loadModels(): Promise<void>;
5583
+ /**
5584
+ * Load from pre-built backends (dependency injection path).
5585
+ * Loads any backends that aren't loaded yet.
5586
+ */
5587
+ private loadFromBackends;
5588
+ /**
5589
+ * Load from factories (original path). Now loads SenseVoice, LAM, and VAD in parallel.
5590
+ */
5591
+ private loadFromFactories;
4801
5592
  start(): Promise<void>;
4802
5593
  stop(): void;
4803
5594
  setProfile(profile: ExpressionProfile): void;
@@ -4807,6 +5598,10 @@ declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
4807
5598
  private onSilenceDetected;
4808
5599
  private processEndOfSpeech;
4809
5600
  private callResponseHandler;
5601
+ /** Cloud mode: delegate to consumer's onResponse handler */
5602
+ private handleCloudResponse;
5603
+ /** Local mode: synthesize text with TTSBackend, stream to PlaybackPipeline */
5604
+ private handleLocalResponse;
4810
5605
  private handleInterruption;
4811
5606
  private startProgressiveTranscription;
4812
5607
  private stopProgressiveTranscription;
@@ -4817,4 +5612,4 @@ declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
4817
5612
  private clearSilenceTimer;
4818
5613
  }
4819
5614
 
4820
- export { type A2EBackend, type A2EModelInfo, A2EOrchestrator, type A2EOrchestratorConfig, A2EProcessor, type A2EProcessorConfig, type A2EProgressEvent, type A2EResult, ALL_AUS, ARKIT_BLENDSHAPES, type AUActivation, AU_TO_ARKIT, type ActiveSpan, type AnimationClip, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, type CharacterProfile, ConsoleExporter, type ConversationalState, type CreateA2EConfig, type CreateSenseVoiceConfig, DEFAULT_ANIMATION_CONFIG, DEFAULT_MODEL_URLS, EMOTION_NAMES, EMOTION_TO_AU, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionResolver, type EmotionWeights, EmphasisDetector, EventEmitter, type ExpressionProfile, FaceCompositor, type FaceCompositorConfig, type FaceCompositorInput, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, HF_CDN_URLS, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LoadingProgress, MODEL_LOAD_TIME_BUCKETS, type MetricData, MetricNames, MicLipSync, type MicLipSyncConfig, type MicLipSyncEvents, type MicLipSyncFrame, type MicLipSyncState, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, type ModelUrlKey, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, PlaybackPipeline, type 
PlaybackPipelineConfig, type PlaybackPipelineEvents, type PlaybackState, ProceduralLifeLayer, type QuotaInfo, type ResolvedEmotion, type ResponseHandler, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TranscriptResult, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, VoicePipeline, type VoicePipelineConfig, type VoicePipelineEvents, type VoicePipelineState, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2ArkitCpuUnifiedAdapter, Wav2ArkitCpuWorker, type Wav2ArkitCpuWorkerConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, applyProfile, blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureModelUrls, configureTelemetry, createA2E, createEmotionVector, createSenseVoice, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getRecommendedBackend, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, preloadModels, resetModelUrls, resolveBackend, shouldEnableWasmProxy, shouldUseCpuA2E, shouldUseNativeASR, shouldUseServerA2E, supportsVADWorker };
5615
+ export { type A2EBackend, type A2EModelInfo, A2EOrchestrator, type A2EOrchestratorConfig, A2EProcessor, type A2EProcessorConfig, type A2EProgressEvent, type A2EResult, ALL_AUS, ARKIT_BLENDSHAPES, type AUActivation, AU_TO_ARKIT, type ActiveSpan, type AnimationClip, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, CharacterController, type CharacterControllerConfig, type CharacterProfile, type CharacterUpdateInput, type CharacterUpdateOutput, ConsoleExporter, type ConversationalState, type CreateA2EConfig, type CreateKokoroTTSConfig, type CreateSenseVoiceConfig, DEFAULT_ANIMATION_CONFIG, DEFAULT_MODEL_URLS, EMOTION_NAMES, EMOTION_TO_AU, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionResolver, type EmotionWeights, EmphasisDetector, EventEmitter, type ExpressionProfile, FaceCompositor, type FaceCompositorConfig, type FaceCompositorInput, type FaceCompositorOutput, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, HF_CDN_URLS, INFERENCE_LATENCY_BUCKETS, type InferenceFactoryConfig, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, KOKORO_VOICES, type KokoroStreamChunk, type KokoroTTSConfig, KokoroTTSInference, type KokoroTTSModelInfo, type KokoroTTSResult, KokoroTTSUnifiedAdapter, KokoroTTSWorker, type KokoroVoiceName, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type 
LoadingProgress, MODEL_LOAD_TIME_BUCKETS, type MetricData, MetricNames, MicLipSync, type MicLipSyncConfig, type MicLipSyncEvents, type MicLipSyncFrame, type MicLipSyncState, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, type ModelUrlKey, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, PlaybackPipeline, type PlaybackPipelineConfig, type PlaybackPipelineEvents, type PlaybackState, ProceduralLifeLayer, type Quat, type QuotaInfo, type ResolvedEmotion, type ResponseHandler, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type SynthesizeOptions, type TTSBackend, type TTSChunk, TTSPlayback, type TTSPlaybackConfig, type TTSPlaybackEvents, type TTSStreamOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TranscriptResult, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type Vec3, VoicePipeline, type VoicePipelineCloudConfig, type VoicePipelineConfig, type VoicePipelineEvents, type VoicePipelineLocalConfig, type VoicePipelineState, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2ArkitCpuUnifiedAdapter, Wav2ArkitCpuWorker, type Wav2ArkitCpuWorkerConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, Wav2Vec2UnifiedAdapter, type WorkerHealthState, applyProfile, blendEmotions, calculatePeak, 
calculateRMS, configureCacheLimit, configureModelUrls, configureTelemetry, createA2E, createEmotionVector, createKokoroTTS, createSenseVoice, createSileroVAD, fetchWithCache, float32ToPcm16, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getRecommendedBackend, getTelemetry, hasWebGPUApi, int16ToFloat32, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, listVoices as listKokoroVoices, pcm16ToFloat32, preloadModels, resampleLinear, resetModelUrls, resolveBackend, resolveEmotion, shouldEnableWasmProxy, shouldUseCpuA2E, shouldUseNativeASR, shouldUseServerA2E, supportsVADWorker, ttsToPlaybackFormat };