@omote/core 0.3.1 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -27,11 +27,19 @@ declare class MicrophoneCapture {
27
27
  private buffer;
28
28
  private _isRecording;
29
29
  private _loggedFirstChunk;
30
+ /** Actual AudioContext sample rate (may differ from target on Firefox) */
31
+ private _nativeSampleRate;
30
32
  constructor(events: EventEmitter<OmoteEvents>, config?: MicrophoneCaptureConfig);
31
33
  get isRecording(): boolean;
32
34
  get isSupported(): boolean;
33
35
  start(): Promise<void>;
34
36
  stop(): void;
37
+ /**
38
+ * Resample audio using linear interpolation.
39
+ * Used when the AudioContext runs at the device's native rate (e.g. 48kHz)
40
+ * and we need to downsample to the target rate (e.g. 16kHz).
41
+ */
42
+ private resample;
35
43
  private floatToPCM16;
36
44
  }
37
45
 
@@ -98,12 +106,11 @@ interface AudioSchedulerOptions {
98
106
  /** Number of audio channels (default: 1 for mono) */
99
107
  channels?: number;
100
108
  /**
101
- * Delay before first audio chunk plays (seconds).
102
- * Gives slow inference backends (WASM) a head start so lip sync
103
- * frames are ready by the time audio reaches the listener.
104
- * Default: 0.05 (50ms — just enough to enqueue the first node)
109
+ * Initial lookahead delay in seconds before first audio plays.
110
+ * Gives LAM inference time to compute blendshapes before audio starts.
111
+ * Default: 0.05 (50ms) for WebGPU, increase to 0.3-0.5 for WASM on iOS.
105
112
  */
106
- initialDelayS?: number;
113
+ initialLookaheadSec?: number;
107
114
  }
108
115
  declare class AudioScheduler {
109
116
  private readonly options;
@@ -373,13 +380,14 @@ declare function isSafari(): boolean;
373
380
  /**
374
381
  * Recommend using CPU-optimized lip sync model (wav2arkit_cpu)
375
382
  *
376
- * All WebKit browsers (Safari macOS, Safari iOS, Chrome iOS, Firefox iOS)
377
- * have ONNX Runtime WebGPU JSEP bugs that crash session creation, and the
378
- * 384MB LAM model stack-overflows in WASM mode.
379
- * The wav2arkit_cpu model (1.8MB) provides identical 52 ARKit blendshape
380
- * output at 22x real-time on CPU/WASM.
383
+ * All iOS browsers use WebKit and have tight memory limits — the 384MB
384
+ * LAM model causes silent crashes. wav2arkit_cpu uses URL pass-through
385
+ * (ORT fetches the 402MB weights directly into WASM, no JS heap copy).
386
+ *
387
+ * macOS Safari also needs this due to ONNX Runtime JSEP/ASYNCIFY bugs
388
+ * that crash WebKit's JIT compiler.
381
389
  *
382
- * @returns true if on Safari or any iOS browser (should use CPU lip sync model)
390
+ * @returns true if iOS (any browser) or Safari (any platform)
383
391
  */
384
392
  declare function shouldUseCpuLipSync(): boolean;
385
393
  /**
@@ -400,7 +408,7 @@ declare function isSpeechRecognitionAvailable(): boolean;
400
408
  * - Battery-efficient (no WASM overhead)
401
409
  * - No model download needed (saves 30-150MB)
402
410
  *
403
- * @returns true if on iOS with Speech API available
411
+ * @returns true if on iOS or Safari with Speech API available
404
412
  */
405
413
  declare function shouldUseNativeASR(): boolean;
406
414
  /**
@@ -419,7 +427,7 @@ declare function shouldUseServerLipSync(): boolean;
419
427
  /**
420
428
  * Common interface for lip sync inference backends
421
429
  *
422
- * Both Wav2Vec2Inference (GPU, 384MB) and Wav2ArkitCpuInference (CPU, 1.8MB)
430
+ * Both Wav2Vec2Inference (GPU, 384MB) and Wav2ArkitCpuInference (CPU, 404MB)
423
431
  * implement this interface, allowing SyncedAudioPipeline and LAMPipeline to
424
432
  * work with either model transparently.
425
433
  *
@@ -454,19 +462,15 @@ interface LipSyncResult {
454
462
  *
455
463
  * Implemented by:
456
464
  * - Wav2Vec2Inference (WebGPU/WASM, 384MB, ASR + lip sync)
457
- * - Wav2ArkitCpuInference (WASM-only, 1.8MB, lip sync only)
465
+ * - Wav2ArkitCpuInference (WASM-only, 404MB, lip sync only)
458
466
  */
459
467
  interface LipSyncBackend {
468
+ /** Model identifier for backend-specific tuning (e.g. audio delay) */
469
+ readonly modelId: 'wav2vec2' | 'wav2arkit_cpu';
460
470
  /** Current backend type (webgpu, wasm, or null if not loaded) */
461
471
  readonly backend: RuntimeBackend | null;
462
472
  /** Whether the model is loaded and ready for inference */
463
473
  readonly isLoaded: boolean;
464
- /**
465
- * Preferred number of audio samples per inference chunk.
466
- * Models with variable-length input can use smaller values for lower latency.
467
- * Default (if undefined): 16000 (1.0s at 16kHz, required by Wav2Vec2).
468
- */
469
- readonly chunkSamples?: number;
470
474
  /**
471
475
  * Load the ONNX model
472
476
  * @returns Model loading information
@@ -529,7 +533,7 @@ interface LAMPipelineOptions {
529
533
  }
530
534
  declare class LAMPipeline {
531
535
  private readonly options;
532
- private readonly DEFAULT_CHUNK_SAMPLES;
536
+ private readonly REQUIRED_SAMPLES;
533
537
  private readonly FRAME_RATE;
534
538
  private buffer;
535
539
  private bufferStartTime;
@@ -558,13 +562,15 @@ declare class LAMPipeline {
558
562
  /**
559
563
  * Get the frame that should be displayed at the current time
560
564
  *
561
- * Timestamp-synced playback for all backends. Audio playback is delayed
562
- * for slow backends (WASM gets 1s head start via AudioScheduler) so
563
- * frames are ready by the time their corresponding audio plays.
565
+ * Automatically removes frames that have already been displayed.
566
+ * This prevents memory leaks from accumulating old frames.
567
+ *
568
+ * Discard Window (prevents premature frame discarding):
569
+ * - WebGPU: 0.5s (LAM inference 20-100ms + RAF jitter + React stalls)
570
+ * - WASM: 1.0s (LAM inference 50-500ms + higher variability)
564
571
  *
565
- * Discard window is generous for WASM to handle inference jitter.
566
- * Late frames play at RAF rate (~60fps) until caught up, then settle
567
- * to natural 30fps pacing via timestamp gating.
572
+ * Last-Frame-Hold: Returns last valid frame instead of null to prevent
573
+ * avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
568
574
  *
569
575
  * @param currentTime - Current AudioContext time
570
576
  * @param lam - LAM inference engine (optional, for backend detection)
@@ -592,7 +598,7 @@ declare class LAMPipeline {
592
598
  /**
593
599
  * Flush remaining buffered audio
594
600
  *
595
- * Processes any remaining audio in the buffer, even if less than the chunk size.
601
+ * Processes any remaining audio in the buffer, even if less than REQUIRED_SAMPLES.
596
602
  * This ensures the final audio chunk generates blendshape frames.
597
603
  *
598
604
  * Should be called when audio stream ends to prevent losing the last 0-1 seconds.
@@ -645,6 +651,12 @@ interface SyncedAudioPipelineOptions {
645
651
  chunkTargetMs?: number;
646
652
  /** LAM inference engine */
647
653
  lam: LipSyncBackend;
654
+ /**
655
+ * Audio playback delay in ms before first audio plays.
656
+ * Gives LAM inference time to pre-compute blendshapes.
657
+ * Default: auto-detected from lam.backend (50ms WebGPU, 350ms WASM).
658
+ */
659
+ audioDelayMs?: number;
648
660
  }
649
661
  interface SyncedAudioPipelineEvents {
650
662
  /** New frame ready for display */
@@ -838,134 +850,219 @@ declare function getLoadedBackend(): RuntimeBackend | null;
838
850
  * Check if ONNX Runtime has been loaded
839
851
  */
840
852
  declare function isOnnxRuntimeLoaded(): boolean;
853
+ /**
854
+ * Preload ONNX Runtime and compile the WASM binary early
855
+ *
856
+ * Call this before loading heavy resources (Three.js, VRM models) to ensure
857
+ * WASM memory is allocated in a clean JS heap, reducing iOS memory pressure.
858
+ * Uses the singleton pattern — subsequent model loading reuses this instance.
859
+ *
860
+ * @param preference Backend preference (default: 'auto')
861
+ * @returns The resolved backend that was loaded
862
+ */
863
+ declare function preloadOnnxRuntime(preference?: BackendPreference): Promise<RuntimeBackend>;
841
864
 
842
865
  /**
843
- * Whisper Automatic Speech Recognition using transformers.js
844
- * Uses Xenova's proven pipeline API for reliable transcription
845
- */
846
- type WhisperModel = 'tiny' | 'base' | 'small' | 'medium';
847
- type WhisperDtype = 'fp32' | 'fp16' | 'q8' | 'int8' | 'uint8' | 'q4' | 'q4f16' | 'bnb4';
848
- interface WhisperConfig {
849
- /** Model size: tiny (~75MB), base (~150MB), small (~500MB), medium (~1.5GB) */
850
- model?: WhisperModel;
851
- /** Use multilingual model (default: false, uses .en models) */
852
- multilingual?: boolean;
853
- /** Language code (e.g., 'en', 'es', 'fr') - for multilingual models */
854
- language?: string;
855
- /** Task: transcribe or translate (default: transcribe) */
856
- task?: 'transcribe' | 'translate';
857
- /** Model quantization format (default: 'q8' for balance of speed/quality) */
858
- dtype?: WhisperDtype;
859
- /** Use WebGPU acceleration if available (default: auto-detect) */
860
- device?: 'auto' | 'webgpu' | 'wasm';
861
- /** Local model path (e.g., '/models/whisper-tiny.en') - overrides HuggingFace CDN */
862
- localModelPath?: string;
863
- /** HuggingFace API token to bypass rate limits (get from https://huggingface.co/settings/tokens) */
864
- token?: string;
865
- /** Suppress non-speech tokens like [LAUGHTER], [CLICKING], etc. (default: true) */
866
- suppressNonSpeech?: boolean;
867
- }
868
- interface TranscriptionResult {
866
+ * SenseVoice automatic speech recognition using ONNX Runtime Web
867
+ *
868
+ * Non-autoregressive CTC-based ASR that is 5x faster than Whisper-Small.
869
+ * Runs entirely in browser via WebGPU or WASM. No transformers.js dependency.
870
+ *
871
+ * Uses the sherpa-onnx SenseVoice export (model.int8.onnx, 239MB int8 quantized).
872
+ * Also provides emotion detection, language identification, and audio event detection
873
+ * from the same forward pass.
874
+ *
875
+ * @category Inference
876
+ *
877
+ * @example Basic usage
878
+ * ```typescript
879
+ * import { SenseVoiceInference } from '@omote/core';
880
+ *
881
+ * const asr = new SenseVoiceInference({
882
+ * modelUrl: '/models/sensevoice/model.int8.onnx',
883
+ * tokensUrl: '/models/sensevoice/tokens.txt',
884
+ * });
885
+ * await asr.load();
886
+ *
887
+ * const { text, emotion, language } = await asr.transcribe(audioSamples);
888
+ * console.log(text); // "Hello world"
889
+ * console.log(emotion); // "NEUTRAL"
890
+ * console.log(language); // "en"
891
+ * ```
892
+ *
893
+ * @module inference/SenseVoiceInference
894
+ */
895
+
896
+ type SenseVoiceLanguage = 'auto' | 'zh' | 'en' | 'ja' | 'ko' | 'yue';
897
+ interface SenseVoiceConfig {
898
+ /** Path or URL to model.int8.onnx (239MB) */
899
+ modelUrl: string;
900
+ /** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
901
+ tokensUrl?: string;
902
+ /** Language hint (default: 'auto' for auto-detection) */
903
+ language?: SenseVoiceLanguage;
904
+ /** Text normalization: 'with_itn' applies inverse text normalization (default: 'with_itn') */
905
+ textNorm?: 'with_itn' | 'without_itn';
906
+ /** Preferred backend (default: 'auto') */
907
+ backend?: BackendPreference;
908
+ }
909
+ interface SenseVoiceResult {
869
910
  /** Transcribed text */
870
911
  text: string;
871
- /** Detected/used language */
872
- language: string;
873
- /** Inference time in ms */
912
+ /** Detected language (e.g., 'zh', 'en', 'ja', 'ko', 'yue') */
913
+ language?: string;
914
+ /** Detected emotion (e.g., 'HAPPY', 'SAD', 'ANGRY', 'NEUTRAL') */
915
+ emotion?: string;
916
+ /** Detected audio event (e.g., 'Speech', 'BGM', 'Laughter') */
917
+ event?: string;
918
+ /** Inference time in milliseconds (preprocessing + model + decode) */
874
919
  inferenceTimeMs: number;
875
- /** Full chunks with timestamps (if requested) */
876
- chunks?: Array<{
877
- text: string;
878
- timestamp: [number, number | null];
879
- }>;
920
+ /** Preprocessing time in milliseconds (fbank + LFR + CMVN) */
921
+ preprocessTimeMs: number;
880
922
  }
881
- /**
882
- * Whisper ASR inference using transformers.js pipeline API
883
- *
884
- * Features:
885
- * - Automatic WebGPU/WASM backend selection
886
- * - Streaming support with chunk callbacks
887
- * - Proven implementation from Xenova's demo
888
- * - Handles all audio preprocessing automatically
889
- */
890
- declare class WhisperInference {
923
+ interface SenseVoiceModelInfo {
924
+ backend: RuntimeBackend;
925
+ loadTimeMs: number;
926
+ inputNames: string[];
927
+ outputNames: string[];
928
+ vocabSize: number;
929
+ }
930
+ declare class SenseVoiceInference {
931
+ private session;
932
+ private ort;
891
933
  private config;
892
- private pipeline;
893
- private currentModel;
934
+ private _backend;
894
935
  private isLoading;
895
- private actualBackend;
896
- constructor(config?: WhisperConfig);
897
- /**
898
- * Check if WebGPU is available in this browser
899
- */
900
- static isWebGPUAvailable(): Promise<boolean>;
901
- /**
902
- * Load the Whisper model pipeline
903
- */
904
- load(onProgress?: (progress: {
905
- status: string;
906
- progress?: number;
907
- file?: string;
908
- }) => void): Promise<void>;
909
- /**
910
- * Transcribe audio to text
911
- *
912
- * @param audio Audio samples (Float32Array, 16kHz mono)
913
- * @param options Transcription options
914
- */
915
- transcribe(audio: Float32Array, options?: {
916
- /** Return timestamps for each chunk */
917
- returnTimestamps?: boolean;
918
- /** Chunk length in seconds (default: 30) */
919
- chunkLengthS?: number;
920
- /** Stride length in seconds for overlapping chunks (default: 5) */
921
- strideLengthS?: number;
922
- /** Language override */
923
- language?: string;
924
- /** Task override */
925
- task?: 'transcribe' | 'translate';
926
- }): Promise<TranscriptionResult>;
927
- /**
928
- * Transcribe with streaming chunks (progressive results)
929
- *
930
- * @param audio Audio samples
931
- * @param onChunk Called when each chunk is finalized
932
- * @param onUpdate Called after each generation step (optional)
933
- */
934
- transcribeStreaming(audio: Float32Array, onChunk: (chunk: {
935
- text: string;
936
- timestamp: [number, number | null];
937
- }) => void, onUpdate?: (text: string) => void, options?: {
938
- chunkLengthS?: number;
939
- strideLengthS?: number;
940
- language?: string;
941
- task?: 'transcribe' | 'translate';
942
- }): Promise<TranscriptionResult>;
943
- /**
944
- * Dispose of the model and free resources
945
- */
946
- dispose(): Promise<void>;
947
- /**
948
- * Check if model is loaded
949
- */
936
+ private inferenceQueue;
937
+ private tokenMap;
938
+ private negMean;
939
+ private invStddev;
940
+ private languageId;
941
+ private textNormId;
942
+ constructor(config: SenseVoiceConfig);
943
+ get backend(): RuntimeBackend | null;
950
944
  get isLoaded(): boolean;
945
+ load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
951
946
  /**
952
- * Get the backend being used (webgpu or wasm)
953
- */
954
- get backend(): string;
955
- /**
956
- * Get the full model name used by transformers.js
957
- */
958
- private getModelName;
959
- /**
960
- * Remove non-speech event tokens from transcription
961
- *
962
- * Whisper outputs special tokens for non-speech events like:
963
- * [LAUGHTER], [APPLAUSE], [MUSIC], [BLANK_AUDIO], [CLICKING], etc.
947
+ * Transcribe audio samples to text
964
948
  *
965
- * This method strips these tokens and cleans up extra whitespace.
949
+ * @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
950
+ * @returns Transcription result with text, emotion, language, and event
966
951
  */
967
- private removeNonSpeechTokens;
952
+ transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
953
+ private queueInference;
954
+ dispose(): Promise<void>;
955
+ }
956
+
957
+ /**
958
+ * Kaldi-compatible filterbank (fbank) feature extraction
959
+ *
960
+ * Pure TypeScript implementation matching kaldi-native-fbank parameters
961
+ * used by SenseVoice. No external dependencies.
962
+ *
963
+ * Pipeline: audio → framing → windowing → FFT → power spectrum → mel filterbank → log
964
+ *
965
+ * @module inference/kaldiFbank
966
+ */
967
+ interface KaldiFbankOptions {
968
+ /** Frame length in ms (default: 25) */
969
+ frameLengthMs?: number;
970
+ /** Frame shift in ms (default: 10) */
971
+ frameShiftMs?: number;
972
+ /** Low frequency cutoff in Hz (default: 20) */
973
+ lowFreq?: number;
974
+ /** High frequency cutoff in Hz (default: sampleRate / 2) */
975
+ highFreq?: number;
976
+ /** Dither amount (default: 0 for deterministic output) */
977
+ dither?: number;
978
+ /** Preemphasis coefficient (default: 0.97) */
979
+ preemphasis?: number;
980
+ }
981
+ /**
982
+ * Compute Kaldi-compatible log mel filterbank features
983
+ *
984
+ * @param audio Raw audio samples (float32, [-1, 1] range)
985
+ * @param sampleRate Sample rate in Hz (must be 16000 for SenseVoice)
986
+ * @param numMelBins Number of mel bins (80 for SenseVoice)
987
+ * @param opts Optional parameters
988
+ * @returns Flattened Float32Array of shape [numFrames, numMelBins]
989
+ */
990
+ declare function computeKaldiFbank(audio: Float32Array, sampleRate: number, numMelBins: number, opts?: KaldiFbankOptions): Float32Array;
991
+ /**
992
+ * Apply Low Frame Rate stacking for SenseVoice
993
+ *
994
+ * Concatenates lfrM consecutive frames with stride lfrN.
995
+ * Left-pads with copies of first frame, right-pads last group.
996
+ *
997
+ * @param features Flattened [numFrames, featureDim]
998
+ * @param featureDim Feature dimension per frame (e.g., 80)
999
+ * @param lfrM Number of frames to stack (default: 7)
1000
+ * @param lfrN Stride (default: 6)
1001
+ * @returns Flattened [numOutputFrames, featureDim * lfrM]
1002
+ */
1003
+ declare function applyLFR(features: Float32Array, featureDim: number, lfrM?: number, lfrN?: number): Float32Array;
1004
+ /**
1005
+ * Apply CMVN normalization in-place
1006
+ *
1007
+ * Formula: normalized[i] = (features[i] + negMean[i % dim]) * invStddev[i % dim]
1008
+ *
1009
+ * @param features Flattened feature array (modified in-place)
1010
+ * @param dim Feature dimension (560 for SenseVoice after LFR)
1011
+ * @param negMean Negative mean vector (dim-dimensional)
1012
+ * @param invStddev Inverse standard deviation vector (dim-dimensional)
1013
+ * @returns The same features array (for chaining)
1014
+ */
1015
+ declare function applyCMVN(features: Float32Array, dim: number, negMean: Float32Array, invStddev: Float32Array): Float32Array;
1016
+ /**
1017
+ * Parse CMVN vectors from comma-separated strings (stored in ONNX metadata)
1018
+ *
1019
+ * The sherpa-onnx SenseVoice export stores neg_mean and inv_stddev
1020
+ * as comma-separated float strings in the model's metadata.
1021
+ */
1022
+ declare function parseCMVNFromMetadata(negMeanStr: string, invStddevStr: string): {
1023
+ negMean: Float32Array;
1024
+ invStddev: Float32Array;
1025
+ };
1026
+
1027
+ /**
1028
+ * CTC greedy decoder for SenseVoice
1029
+ *
1030
+ * Decodes CTC logits into text with structured token parsing
1031
+ * for language, emotion, and audio event detection.
1032
+ *
1033
+ * @module inference/ctcDecoder
1034
+ */
1035
+ interface CTCDecodeResult {
1036
+ /** Decoded text (speech content only) */
1037
+ text: string;
1038
+ /** Detected language (e.g., 'zh', 'en', 'ja', 'ko', 'yue') */
1039
+ language?: string;
1040
+ /** Detected emotion (e.g., 'HAPPY', 'SAD', 'ANGRY', 'NEUTRAL') */
1041
+ emotion?: string;
1042
+ /** Detected audio event (e.g., 'Speech', 'BGM', 'Laughter') */
1043
+ event?: string;
968
1044
  }
1045
+ /** Resolve language string to SenseVoice language ID */
1046
+ declare function resolveLanguageId(language: string): number;
1047
+ /** Resolve text norm string to SenseVoice text norm ID */
1048
+ declare function resolveTextNormId(textNorm: string): number;
1049
+ /**
1050
+ * Parse tokens.txt into a token ID → string map
1051
+ *
1052
+ * Format: each line is "token_string token_id"
1053
+ * e.g., "<unk> 0", "▁the 3", "s 4"
1054
+ */
1055
+ declare function parseTokensFile(content: string): Map<number, string>;
1056
+ /**
1057
+ * CTC greedy decode
1058
+ *
1059
+ * @param logits Raw logits from model output, flattened [seqLen, vocabSize]
1060
+ * @param seqLen Sequence length (time steps)
1061
+ * @param vocabSize Vocabulary size
1062
+ * @param tokenMap Token ID → string map from tokens.txt
1063
+ * @returns Decoded text and structured metadata
1064
+ */
1065
+ declare function ctcGreedyDecode(logits: Float32Array, seqLen: number, vocabSize: number, tokenMap: Map<number, string>): CTCDecodeResult;
969
1066
 
970
1067
  /**
971
1068
  * Shared blendshape constants and utilities for lip sync inference
@@ -1036,6 +1133,13 @@ type InferenceBackend = BackendPreference;
1036
1133
  interface Wav2Vec2InferenceConfig {
1037
1134
  /** Path or URL to the ONNX model */
1038
1135
  modelUrl: string;
1136
+ /**
1137
+ * Path or URL to external model data file (.onnx.data weights).
1138
+ * Default: `${modelUrl}.data` (e.g., /models/model.onnx.data)
1139
+ *
1140
+ * Set to `false` to skip external data loading (single-file models only).
1141
+ */
1142
+ externalDataUrl?: string | false;
1039
1143
  /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
1040
1144
  backend?: InferenceBackend;
1041
1145
  /** Number of identity classes (default: 12 for streaming model) */
@@ -1066,7 +1170,8 @@ interface Wav2Vec2Result {
1066
1170
  /** Inference time in ms */
1067
1171
  inferenceTimeMs: number;
1068
1172
  }
1069
- declare class Wav2Vec2Inference {
1173
+ declare class Wav2Vec2Inference implements LipSyncBackend {
1174
+ readonly modelId: "wav2vec2";
1070
1175
  private session;
1071
1176
  private ort;
1072
1177
  private config;
@@ -1116,12 +1221,16 @@ declare class Wav2Vec2Inference {
1116
1221
  /**
1117
1222
  * CPU-optimized lip sync inference using wav2arkit_cpu model
1118
1223
  *
1119
- * A lightweight (1.8MB) alternative to Wav2Vec2Inference (384MB) designed
1120
- * for Safari/iOS where WebGPU crashes due to ONNX Runtime JSEP bugs.
1224
+ * A Safari/iOS-compatible alternative to Wav2Vec2Inference (384MB) designed
1225
+ * for platforms where WebGPU crashes due to ONNX Runtime JSEP bugs.
1226
+ *
1227
+ * The model uses ONNX external data format:
1228
+ * - wav2arkit_cpu.onnx (1.86MB graph structure)
1229
+ * - wav2arkit_cpu.onnx.data (402MB weights)
1230
+ * Both files are fetched and cached automatically.
1121
1231
  *
1122
1232
  * Key differences from Wav2Vec2Inference:
1123
- * - WASM-only backend (CPU-optimized, no WebGPU)
1124
- * - 1.8MB model vs 384MB
1233
+ * - WASM-only backend (CPU-optimized, single-threaded, no WebGPU)
1125
1234
  * - No identity input (baked to identity 11)
1126
1235
  * - No ASR output (lip sync only)
1127
1236
  * - Dynamic input length (not fixed to 16000 samples)
@@ -1146,12 +1255,18 @@ declare class Wav2Vec2Inference {
1146
1255
  interface Wav2ArkitCpuConfig {
1147
1256
  /** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
1148
1257
  modelUrl: string;
1149
- /** Path or URL to the external data file (.onnx.data weights file) */
1150
- modelDataUrl?: string;
1258
+ /**
1259
+ * Path or URL to external model data file (.onnx.data weights).
1260
+ * Default: `${modelUrl}.data` (e.g., /models/wav2arkit_cpu.onnx.data)
1261
+ *
1262
+ * Set to `false` to skip external data loading (single-file models only).
1263
+ */
1264
+ externalDataUrl?: string | false;
1151
1265
  /** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
1152
1266
  backend?: BackendPreference;
1153
1267
  }
1154
1268
  declare class Wav2ArkitCpuInference implements LipSyncBackend {
1269
+ readonly modelId: "wav2arkit_cpu";
1155
1270
  private session;
1156
1271
  private ort;
1157
1272
  private config;
@@ -1161,12 +1276,6 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
1161
1276
  constructor(config: Wav2ArkitCpuConfig);
1162
1277
  get backend(): RuntimeBackend | null;
1163
1278
  get isLoaded(): boolean;
1164
- /**
1165
- * Preferred chunk size: 4000 samples (250ms at 16kHz).
1166
- * wav2arkit_cpu accepts variable-length input, so we use smaller chunks
1167
- * for lower latency on WASM (vs 16000 for Wav2Vec2's fixed requirement).
1168
- */
1169
- readonly chunkSamples = 4000;
1170
1279
  /**
1171
1280
  * Load the ONNX model
1172
1281
  */
@@ -1195,10 +1304,20 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
1195
1304
  * Factory function for lip sync with automatic GPU/CPU model selection
1196
1305
  *
1197
1306
  * Provides a unified API that automatically selects the optimal model:
1198
- * - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (1.8MB, WASM)
1307
+ * - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (404MB, WASM)
1199
1308
  * - Chrome/Firefox/Edge: Uses Wav2Vec2Inference (384MB, WebGPU)
1200
1309
  * - Fallback: Gracefully falls back to CPU model if GPU model fails to load
1201
1310
  *
1311
+ * Why two separate models?
1312
+ * Wav2Vec2 (LAM) cannot run on Safari/iOS for two reasons:
1313
+ * 1. Its dual-head transformer graph needs ~750-950MB peak during ORT session
1314
+ * creation (graph optimization), exceeding iOS WebKit's ~1-1.5GB tab limit.
1315
+ * 2. It ships as a single 384MB .onnx file that must load into JS heap before
1316
+ * ORT can consume it. iOS WebKit OOMs on this allocation.
1317
+ * wav2arkit_cpu solves both: external data format (1.86MB graph + 402MB weights)
1318
+ * lets ORT load only the tiny graph, then stream weights via URL pass-through
1319
+ * directly into WASM memory. JS heap stays at ~2MB.
1320
+ *
1202
1321
  * @category Inference
1203
1322
  *
1204
1323
  * @example Auto-detect (recommended)
@@ -1230,10 +1349,15 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
1230
1349
  interface CreateLipSyncConfig {
1231
1350
  /** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge) */
1232
1351
  gpuModelUrl: string;
1352
+ /**
1353
+ * URL for GPU model external data file (.onnx.data weights).
1354
+ * Default: `${gpuModelUrl}.data`
1355
+ *
1356
+ * Set to `false` to skip external data loading (single-file models only).
1357
+ */
1358
+ gpuExternalDataUrl?: string | false;
1233
1359
  /** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS) */
1234
1360
  cpuModelUrl: string;
1235
- /** URL for the CPU model's external data file (.onnx.data weights) */
1236
- cpuModelDataUrl?: string;
1237
1361
  /**
1238
1362
  * Model selection mode:
1239
1363
  * - 'auto': Safari/iOS → CPU, everything else → GPU (default)
@@ -1388,6 +1512,7 @@ declare class SileroVADInference {
1388
1512
  private inferenceQueue;
1389
1513
  private preSpeechBuffer;
1390
1514
  private wasSpeaking;
1515
+ private srTensor;
1391
1516
  constructor(config: SileroVADConfig);
1392
1517
  get backend(): RuntimeBackend | null;
1393
1518
  get isLoaded(): boolean;
@@ -2444,7 +2569,7 @@ declare class AgentCoreAdapter extends EventEmitter<AIAdapterEvents> implements
2444
2569
  private _state;
2445
2570
  private _sessionId;
2446
2571
  private _isConnected;
2447
- private whisper;
2572
+ private asr;
2448
2573
  private vad;
2449
2574
  private lam;
2450
2575
  private emotionController;
@@ -2488,7 +2613,7 @@ declare class AgentCoreAdapter extends EventEmitter<AIAdapterEvents> implements
2488
2613
  healthCheck(): Promise<boolean>;
2489
2614
  private setState;
2490
2615
  private getAuthToken;
2491
- private initWhisper;
2616
+ private initASR;
2492
2617
  private initLAM;
2493
2618
  private initPipeline;
2494
2619
  private connectWebSocket;
@@ -3182,148 +3307,6 @@ declare function preloadModels(urls: string[], onProgress?: (current: number, to
3182
3307
  */
3183
3308
  declare function formatBytes(bytes: number): string;
3184
3309
 
3185
- /**
3186
- * HuggingFace CDN Utilities
3187
- *
3188
- * Helper functions for working with HuggingFace CDN URLs.
3189
- * Used by transformers.js models (Whisper, etc.) for model downloads.
3190
- *
3191
- * @category Cache
3192
- */
3193
- /**
3194
- * Test URL for HuggingFace CDN reachability check.
3195
- * Uses a small, stable file from a well-known public model.
3196
- */
3197
- declare const HF_CDN_TEST_URL = "https://huggingface.co/Xenova/whisper-tiny/resolve/main/config.json";
3198
- /**
3199
- * Parsed HuggingFace URL components
3200
- */
3201
- interface HuggingFaceUrlInfo {
3202
- /** Organization or username */
3203
- org: string;
3204
- /** Model name */
3205
- model: string;
3206
- /** Branch, tag, or commit */
3207
- branch: string;
3208
- /** File path within the repository */
3209
- file: string;
3210
- }
3211
- /**
3212
- * Parse a HuggingFace CDN URL into its components
3213
- *
3214
- * @param url - The HuggingFace URL to parse
3215
- * @returns Parsed URL info or null if not a valid HF URL
3216
- *
3217
- * @example
3218
- * ```typescript
3219
- * const info = parseHuggingFaceUrl(
3220
- * 'https://huggingface.co/openai/whisper-tiny/resolve/main/model.onnx'
3221
- * );
3222
- * // Returns: { org: 'openai', model: 'whisper-tiny', branch: 'main', file: 'model.onnx' }
3223
- * ```
3224
- */
3225
- declare function parseHuggingFaceUrl(url: string): HuggingFaceUrlInfo | null;
3226
- /**
3227
- * Check if HuggingFace CDN is reachable
3228
- *
3229
- * Performs a HEAD request to a known HuggingFace model file to verify
3230
- * connectivity. Useful for offline detection or network diagnostics.
3231
- *
3232
- * @param testUrl - Optional custom URL to test (defaults to HF_CDN_TEST_URL)
3233
- * @returns True if CDN is reachable, false otherwise
3234
- *
3235
- * @example
3236
- * ```typescript
3237
- * import { isHuggingFaceCDNReachable } from '@omote/core';
3238
- *
3239
- * const reachable = await isHuggingFaceCDNReachable();
3240
- * if (!reachable) {
3241
- * console.log('HuggingFace CDN unreachable - running offline?');
3242
- * // Fall back to cached models or show error
3243
- * }
3244
- * ```
3245
- */
3246
- declare function isHuggingFaceCDNReachable(testUrl?: string): Promise<boolean>;
3247
-
3248
- /**
3249
- * Utility to clear transformers.js Cache API storage
3250
- *
3251
- * Problem: transformers.js v4 uses Browser Cache API which persists across hard refreshes.
3252
- * If an HTML error page gets cached (due to network errors, CDN issues, or dev server restarts),
3253
- * it will be served instead of JSON files, causing JSON.parse() errors.
3254
- *
3255
- * Solution: Manually clear Cache API storage before loading models.
3256
- *
3257
- * @module utils/transformersCacheClear
3258
- */
3259
- /**
3260
- * Clear all transformers.js and HuggingFace caches from Browser Cache API
3261
- *
3262
- * This clears:
3263
- * - transformers-cache (default cache key)
3264
- * - Any caches with 'transformers' or 'huggingface' in the name
3265
- *
3266
- * @param options Configuration options
3267
- * @returns Promise resolving to array of deleted cache names
3268
- */
3269
- declare function clearTransformersCache(options?: {
3270
- /** Whether to log deletion details (default: true) */
3271
- verbose?: boolean;
3272
- /** Additional cache name patterns to clear (e.g., ['my-custom-cache']) */
3273
- additionalPatterns?: string[];
3274
- }): Promise<string[]>;
3275
- /**
3276
- * Clear a specific cache by exact name
3277
- *
3278
- * @param cacheName Exact cache name to delete
3279
- * @returns Promise resolving to true if deleted, false otherwise
3280
- */
3281
- declare function clearSpecificCache(cacheName: string): Promise<boolean>;
3282
- /**
3283
- * List all cache names currently stored
3284
- *
3285
- * @returns Promise resolving to array of cache names
3286
- */
3287
- declare function listCaches(): Promise<string[]>;
3288
- /**
3289
- * Check if a specific cached response is valid JSON/binary (not HTML error page)
3290
- *
3291
- * @param cacheName Cache name to check
3292
- * @param requestUrl URL/key to check
3293
- * @returns Promise resolving to validation result
3294
- */
3295
- declare function validateCachedResponse(cacheName: string, requestUrl: string): Promise<{
3296
- exists: boolean;
3297
- valid: boolean;
3298
- contentType: string | null;
3299
- isHtml: boolean;
3300
- reason?: string;
3301
- }>;
3302
- /**
3303
- * Scan all caches for potentially invalid cached responses
3304
- *
3305
- * @returns Promise resolving to report of invalid entries
3306
- */
3307
- declare function scanForInvalidCaches(): Promise<{
3308
- totalCaches: number;
3309
- scannedEntries: number;
3310
- invalidEntries: Array<{
3311
- cacheName: string;
3312
- url: string;
3313
- reason: string;
3314
- }>;
3315
- }>;
3316
- /**
3317
- * Clear all caches and optionally prevent re-creation (development mode)
3318
- *
3319
- * WARNING: This is aggressive and should only be used in development.
3320
- * It clears ALL browser caches, not just transformers.js.
3321
- *
3322
- * @param preventRecreation If true, sets env.useBrowserCache = false
3323
- * @returns Promise resolving to number of deleted caches
3324
- */
3325
- declare function nukeBrowserCaches(preventRecreation?: boolean): Promise<number>;
3326
-
3327
3310
  /**
3328
3311
  * Telemetry Types
3329
3312
  *
@@ -4086,4 +4069,4 @@ declare class EmphasisDetector {
4086
4069
  reset(): void;
4087
4070
  }
4088
4071
 
4089
- export { type AIAdapter, type AIAdapterEvents, type AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, type CreateLipSyncConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, HF_CDN_TEST_URL, type HuggingFaceUrlInfo, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, type LipSyncBackend, type LipSyncModelInfo, type LipSyncResult, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, type QuotaInfo, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SessionConfig, type SessionOptions, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type TranscriptionResult, type Transition, type VADBackend, type VADModelInfo, type VADResult$1 as VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, WAV2ARKIT_BLENDSHAPES, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, type WhisperConfig, WhisperInference, type WhisperModel, blendEmotions, calculatePeak, calculateRMS, clearSpecificCache, clearTransformersCache, configureCacheLimit, configureTelemetry, createEmotionVector, createLipSync, createSessionWithFallback, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isHuggingFaceCDNReachable, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpEmotion, listCaches, nukeBrowserCaches, parseHuggingFaceUrl, preloadModels, remapWav2ArkitToLam, resolveBackend, scanForInvalidCaches, shouldEnableWasmProxy, shouldUseCpuLipSync, shouldUseNativeASR, shouldUseServerLipSync, supportsVADWorker, symmetrizeBlendshapes, validateCachedResponse };
4072
+ export { type AIAdapter, type AIAdapterEvents, type AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, type CTCDecodeResult, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, type CreateLipSyncConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type KaldiFbankOptions, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, type LipSyncBackend, type LipSyncModelInfo, type LipSyncResult, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, type QuotaInfo, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, type SessionConfig, type SessionOptions, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type Transition, type VADBackend, type VADModelInfo, type VADResult$1 as VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, WAV2ARKIT_BLENDSHAPES, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, applyCMVN, applyLFR, blendEmotions, calculatePeak, calculateRMS, computeKaldiFbank, configureCacheLimit, configureTelemetry, createEmotionVector, createLipSync, createSessionWithFallback, createSileroVAD, ctcGreedyDecode, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpEmotion, parseCMVNFromMetadata, parseTokensFile, preloadModels, preloadOnnxRuntime, remapWav2ArkitToLam, resolveBackend, resolveLanguageId, resolveTextNormId, shouldEnableWasmProxy, shouldUseCpuLipSync, shouldUseNativeASR, shouldUseServerLipSync, supportsVADWorker, symmetrizeBlendshapes };