@omote/core 0.4.4 → 0.4.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,4 +1,4 @@
1
- import { EventEmitter, OmoteEvents, AnimationEvent } from './events/index.js';
1
+ import { EventEmitter, OmoteEvents, AISessionState, AnimationEvent } from './events/index.js';
2
2
  export { BackendEvent, EmotionEvent, GazeEvent, STTFinalEvent, STTPartialEvent, SessionStateEvent, TTSEndEvent, TTSMarkEvent, TTSStartEvent, VisemeEvent } from './events/index.js';
3
3
  import { InferenceSession, Tensor, Env } from 'onnxruntime-common';
4
4
  export { D as DEFAULT_LOGGING_CONFIG, I as ILogger, e as LOG_LEVEL_PRIORITY, b as LogEntry, L as LogFormatter, a as LogLevel, c as LogSink, d as LoggingConfig, f as configureLogging, i as createLogger, g as getLoggingConfig, n as noopLogger, r as resetLoggingConfig, s as setLogLevel, h as setLoggingEnabled } from './Logger-I_k4sGhM.js';
@@ -1412,6 +1412,8 @@ declare class SenseVoiceInference {
1412
1412
  private _backend;
1413
1413
  private isLoading;
1414
1414
  private inferenceQueue;
1415
+ private poisoned;
1416
+ private static readonly INFERENCE_TIMEOUT_MS;
1415
1417
  private tokenMap;
1416
1418
  private negMean;
1417
1419
  private invStddev;
@@ -1433,267 +1435,297 @@ declare class SenseVoiceInference {
1433
1435
  }
1434
1436
 
1435
1437
  /**
1436
- * Kaldi-compatible filterbank (fbank) feature extraction
1437
- *
1438
- * Pure TypeScript implementation matching kaldi-native-fbank parameters
1439
- * used by SenseVoice. No external dependencies.
1440
- *
1441
- * Pipeline: audio → framing → windowing → FFT → power spectrum → mel filterbank → log
1442
- *
1443
- * @module inference/kaldiFbank
1444
- */
1445
- interface KaldiFbankOptions {
1446
- /** Frame length in ms (default: 25) */
1447
- frameLengthMs?: number;
1448
- /** Frame shift in ms (default: 10) */
1449
- frameShiftMs?: number;
1450
- /** Low frequency cutoff in Hz (default: 20) */
1451
- lowFreq?: number;
1452
- /** High frequency cutoff in Hz (default: sampleRate / 2) */
1453
- highFreq?: number;
1454
- /** Dither amount (default: 0 for deterministic output) */
1455
- dither?: number;
1456
- /** Preemphasis coefficient (default: 0.97) */
1457
- preemphasis?: number;
1458
- }
1459
- /**
1460
- * Compute Kaldi-compatible log mel filterbank features
1438
+ * SenseVoice ASR Web Worker implementation
1461
1439
  *
1462
- * @param audio Raw audio samples (float32, [-1, 1] range)
1463
- * @param sampleRate Sample rate in Hz (must be 16000 for SenseVoice)
1464
- * @param numMelBins Number of mel bins (80 for SenseVoice)
1465
- * @param opts Optional parameters
1466
- * @returns Flattened Float32Array of shape [numFrames, numMelBins]
1467
- */
1468
- declare function computeKaldiFbank(audio: Float32Array, sampleRate: number, numMelBins: number, opts?: KaldiFbankOptions): Float32Array;
1469
- /**
1470
- * Apply Low Frame Rate stacking for SenseVoice
1440
+ * Runs SenseVoice speech recognition in a dedicated Web Worker to prevent
1441
+ * main thread blocking. Uses inline worker script (Blob URL pattern) to
1442
+ * avoid separate file deployment.
1471
1443
  *
1472
- * Concatenates lfrM consecutive frames with stride lfrN.
1473
- * Left-pads with copies of first frame, right-pads last group.
1444
+ * Key design decisions:
1445
+ * - WASM backend only (WebGPU doesn't work in Workers)
1446
+ * - All preprocessing (fbank, LFR, CMVN) and CTC decoding inlined in worker
1447
+ * - Audio copied (not transferred) to retain main thread access
1448
+ * - ONNX Runtime loaded from CDN in worker (no bundler complications)
1449
+ * - iOS: model URL passed as string to ORT (avoids 239MB JS heap allocation)
1474
1450
  *
1475
- * @param features Flattened [numFrames, featureDim]
1476
- * @param featureDim Feature dimension per frame (e.g., 80)
1477
- * @param lfrM Number of frames to stack (default: 7)
1478
- * @param lfrN Stride (default: 6)
1479
- * @returns Flattened [numOutputFrames, featureDim * lfrM]
1480
- */
1481
- declare function applyLFR(features: Float32Array, featureDim: number, lfrM?: number, lfrN?: number): Float32Array;
1482
- /**
1483
- * Apply CMVN normalization in-place
1451
+ * @category Inference
1484
1452
  *
1485
- * Formula: normalized[i] = (features[i] + negMean[i % dim]) * invStddev[i % dim]
1453
+ * @example Basic usage
1454
+ * ```typescript
1455
+ * import { SenseVoiceWorker } from '@omote/core';
1486
1456
  *
1487
- * @param features Flattened feature array (modified in-place)
1488
- * @param dim Feature dimension (560 for SenseVoice after LFR)
1489
- * @param negMean Negative mean vector (dim-dimensional)
1490
- * @param invStddev Inverse standard deviation vector (dim-dimensional)
1491
- * @returns The same features array (for chaining)
1492
- */
1493
- declare function applyCMVN(features: Float32Array, dim: number, negMean: Float32Array, invStddev: Float32Array): Float32Array;
1494
- /**
1495
- * Parse CMVN vectors from comma-separated strings (stored in ONNX metadata)
1457
+ * const asr = new SenseVoiceWorker({
1458
+ * modelUrl: '/models/sensevoice/model.int8.onnx',
1459
+ * tokensUrl: '/models/sensevoice/tokens.txt',
1460
+ * });
1461
+ * await asr.load();
1496
1462
  *
1497
- * The sherpa-onnx SenseVoice export stores neg_mean and inv_stddev
1498
- * as comma-separated float strings in the model's metadata.
1463
+ * const { text, emotion, language } = await asr.transcribe(audioSamples);
1464
+ * console.log(text); // "Hello world"
1465
+ * console.log(emotion); // "NEUTRAL"
1466
+ * console.log(language); // "en"
1467
+ * ```
1499
1468
  */
1500
- declare function parseCMVNFromMetadata(negMeanStr: string, invStddevStr: string): {
1501
- negMean: Float32Array;
1502
- invStddev: Float32Array;
1503
- };
1504
1469
 
1505
1470
  /**
1506
- * CTC greedy decoder for SenseVoice
1507
- *
1508
- * Decodes CTC logits into text with structured token parsing
1509
- * for language, emotion, and audio event detection.
1510
- *
1511
- * @module inference/ctcDecoder
1471
+ * Configuration for SenseVoice Worker
1512
1472
  */
1513
- interface CTCDecodeResult {
1514
- /** Decoded text (speech content only) */
1515
- text: string;
1516
- /** Detected language (e.g., 'zh', 'en', 'ja', 'ko', 'yue') */
1517
- language?: string;
1518
- /** Detected emotion (e.g., 'HAPPY', 'SAD', 'ANGRY', 'NEUTRAL') */
1519
- emotion?: string;
1520
- /** Detected audio event (e.g., 'Speech', 'BGM', 'Laughter') */
1521
- event?: string;
1473
+ interface SenseVoiceWorkerConfig {
1474
+ /** Path or URL to model.int8.onnx (239MB) */
1475
+ modelUrl: string;
1476
+ /** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
1477
+ tokensUrl?: string;
1478
+ /** Language hint (default: 'auto' for auto-detection) */
1479
+ language?: SenseVoiceLanguage;
1480
+ /** Text normalization: 'with_itn' applies inverse text normalization (default: 'with_itn') */
1481
+ textNorm?: 'with_itn' | 'without_itn';
1522
1482
  }
1523
- /** Resolve language string to SenseVoice language ID */
1524
- declare function resolveLanguageId(language: string): number;
1525
- /** Resolve text norm string to SenseVoice text norm ID */
1526
- declare function resolveTextNormId(textNorm: string): number;
1527
1483
  /**
1528
- * Parse tokens.txt into a token ID string map
1484
+ * SenseVoice ASR Worker - Speech Recognition in a Web Worker
1529
1485
  *
1530
- * Format: each line is "token_string token_id"
1531
- * e.g., "<unk> 0", "▁the 3", "s 4"
1532
- */
1533
- declare function parseTokensFile(content: string): Map<number, string>;
1534
- /**
1535
- * CTC greedy decode
1486
+ * Runs SenseVoice inference off the main thread to prevent UI blocking.
1487
+ * All preprocessing (fbank, LFR, CMVN) and CTC decoding run in the worker.
1536
1488
  *
1537
- * @param logits Raw logits from model output, flattened [seqLen, vocabSize]
1538
- * @param seqLen Sequence length (time steps)
1539
- * @param vocabSize Vocabulary size
1540
- * @param tokenMap Token ID → string map from tokens.txt
1541
- * @returns Decoded text and structured metadata
1489
+ * @see SenseVoiceInference for main-thread version
1542
1490
  */
1543
- declare function ctcGreedyDecode(logits: Float32Array, seqLen: number, vocabSize: number, tokenMap: Map<number, string>): CTCDecodeResult;
1491
+ declare class SenseVoiceWorker {
1492
+ private worker;
1493
+ private config;
1494
+ private isLoading;
1495
+ private _isLoaded;
1496
+ private inferenceQueue;
1497
+ private poisoned;
1498
+ private pendingResolvers;
1499
+ private languageId;
1500
+ private textNormId;
1501
+ constructor(config: SenseVoiceWorkerConfig);
1502
+ get isLoaded(): boolean;
1503
+ /**
1504
+ * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
1505
+ */
1506
+ get backend(): 'wasm' | null;
1507
+ /**
1508
+ * Create the worker from inline script
1509
+ */
1510
+ private createWorker;
1511
+ /**
1512
+ * Handle messages from worker
1513
+ */
1514
+ private handleWorkerMessage;
1515
+ /**
1516
+ * Send message to worker and wait for response
1517
+ */
1518
+ private sendMessage;
1519
+ /**
1520
+ * Load the ONNX model in the worker
1521
+ *
1522
+ * @param onProgress - Optional progress callback. Fires once at 100% when load completes
1523
+ * (the worker downloads and loads the model internally, so granular progress is not available).
1524
+ */
1525
+ load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
1526
+ /**
1527
+ * Transcribe audio samples to text
1528
+ *
1529
+ * @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
1530
+ * @returns Transcription result with text, emotion, language, and event
1531
+ */
1532
+ transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
1533
+ /**
1534
+ * Queue inference to serialize worker calls
1535
+ */
1536
+ private queueInference;
1537
+ /**
1538
+ * Dispose of the worker and free resources
1539
+ */
1540
+ dispose(): Promise<void>;
1541
+ /**
1542
+ * Check if Web Workers are supported
1543
+ */
1544
+ static isSupported(): boolean;
1545
+ }
1544
1546
 
1545
1547
  /**
1546
- * Shared blendshape constants and utilities for lip sync inference
1547
- *
1548
- * Contains LAM_BLENDSHAPES (canonical ordering), symmetrization, and
1549
- * index remapping used by both Wav2Vec2Inference and Wav2ArkitCpuInference.
1550
- *
1551
- * This module is the single source of truth for blendshape ordering to
1552
- * avoid circular dependencies between inference classes.
1553
- *
1554
- * @category Inference
1555
- */
1556
- /**
1557
- * LAM model blendshape names in order (52 total)
1558
- * NOTE: This is alphabetical ordering used by LAM, different from standard ARKit order
1559
- */
1560
- declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
1561
- /** Alias for backwards compatibility */
1562
- declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
1563
- /**
1564
- * Symmetrize blendshapes by averaging left/right pairs
1565
- * From LAM official postprocessing (models/utils.py)
1566
- * This fixes asymmetric output from the raw model
1567
- */
1568
- declare function symmetrizeBlendshapes(frame: Float32Array): Float32Array;
1569
- /**
1570
- * wav2arkit_cpu model blendshape ordering
1571
- *
1572
- * Indices 0-24 match LAM_BLENDSHAPES, but 25+ diverge:
1573
- * - LAM puts jawRight, mouthClose, mouthDimpleLeft, mouthDimpleRight at 25-28
1574
- * - wav2arkit_cpu puts mouthFrownLeft at 25 and moves those four to 48-51
1575
- */
1576
- declare const WAV2ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "jawRight"];
1577
- /**
1578
- * Remap a blendshape frame from wav2arkit_cpu ordering to LAM_BLENDSHAPES ordering
1548
+ * Silero VAD (Voice Activity Detection) inference
1579
1549
  *
1580
- * @param frame - Float32Array of 52 blendshape values in wav2arkit_cpu order
1581
- * @returns Float32Array of 52 blendshape values in LAM_BLENDSHAPES order
1582
- */
1583
- declare function remapWav2ArkitToLam(frame: Float32Array): Float32Array;
1584
-
1585
- /**
1586
- * Unified Wav2Vec2 inference engine for Audio-to-Expression + ASR
1550
+ * Neural network-based VAD running in browser via ONNX Runtime Web.
1551
+ * Much more accurate than RMS-based energy detection.
1587
1552
  *
1588
- * Runs entirely in the browser using WebGPU or WASM.
1589
- * Takes raw 16kHz audio and outputs:
1590
- * - 52 ARKit blendshapes (lip sync)
1591
- * - 32-token CTC logits (speech recognition)
1553
+ * Uses lazy loading to conditionally load WebGPU or WASM-only bundle:
1554
+ * - iOS: Loads WASM-only bundle (WebGPU crashes due to Safari bugs)
1555
+ * - Android/Desktop: Loads WebGPU bundle (with WASM fallback)
1592
1556
  *
1593
1557
  * @category Inference
1594
1558
  *
1595
1559
  * @example Basic usage
1596
1560
  * ```typescript
1597
- * import { Wav2Vec2Inference } from '@omote/core';
1561
+ * import { SileroVADInference } from '@omote/core';
1598
1562
  *
1599
- * const wav2vec = new Wav2Vec2Inference({ modelUrl: '/models/unified_wav2vec2_asr_a2e.onnx' });
1600
- * await wav2vec.load();
1563
+ * const vad = new SileroVADInference({
1564
+ * modelUrl: '/models/silero-vad.onnx'
1565
+ * });
1566
+ * await vad.load();
1601
1567
  *
1602
- * // Process 1 second of audio (16kHz = 16000 samples)
1603
- * const result = await wav2vec.infer(audioSamples);
1568
+ * // Process 32ms chunks (512 samples at 16kHz)
1569
+ * const { probability } = await vad.process(audioChunk);
1570
+ * if (probability > 0.5) {
1571
+ * console.log('Speech detected!');
1572
+ * }
1573
+ * ```
1604
1574
  *
1605
- * console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
1606
- * console.log('ASR text:', result.text); // Decoded transcription
1575
+ * @example Streaming with state management
1576
+ * ```typescript
1577
+ * // State is automatically maintained between process() calls
1578
+ * // Call reset() when starting a new audio stream
1579
+ * vad.reset();
1580
+ *
1581
+ * for (const chunk of audioChunks) {
1582
+ * const { probability: prob } = await vad.process(chunk);
1583
+ * // prob is speech probability [0, 1]
1584
+ * }
1607
1585
  * ```
1608
1586
  */
1609
1587
 
1610
- type InferenceBackend = BackendPreference;
1611
- interface Wav2Vec2InferenceConfig {
1588
+ type VADBackend = BackendPreference;
1589
+ /**
1590
+ * Configuration for Silero VAD
1591
+ */
1592
+ interface SileroVADConfig {
1612
1593
  /** Path or URL to the ONNX model */
1613
1594
  modelUrl: string;
1614
- /**
1615
- * Path or URL to external model data file (.onnx.data weights).
1616
- * Default: `${modelUrl}.data` (e.g., /models/model.onnx.data)
1617
- *
1618
- * Set to `false` to skip external data loading (single-file models only).
1619
- */
1620
- externalDataUrl?: string | false;
1621
1595
  /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
1622
- backend?: InferenceBackend;
1623
- /** Number of identity classes (default: 12 for streaming model) */
1624
- numIdentityClasses?: number;
1596
+ backend?: VADBackend;
1597
+ /** Sample rate (8000 or 16000, default: 16000) */
1598
+ sampleRate?: 8000 | 16000;
1599
+ /** Speech probability threshold (default: 0.5) */
1600
+ threshold?: number;
1601
+ /**
1602
+ * Number of audio chunks to keep in pre-speech buffer.
1603
+ * When VAD triggers, these chunks are prepended to the speech buffer
1604
+ * to capture the beginning of speech that occurred before detection.
1605
+ *
1606
+ * At 512 samples/chunk and 16kHz:
1607
+ * - 10 chunks = 320ms of pre-speech audio
1608
+ * - 15 chunks = 480ms of pre-speech audio
1609
+ *
1610
+ * Default: 10 chunks (320ms)
1611
+ */
1612
+ preSpeechBufferChunks?: number;
1625
1613
  }
1626
- interface ModelInfo {
1614
+ /**
1615
+ * VAD model loading information
1616
+ */
1617
+ interface VADModelInfo {
1627
1618
  backend: 'webgpu' | 'wasm';
1628
1619
  loadTimeMs: number;
1629
1620
  inputNames: string[];
1630
1621
  outputNames: string[];
1622
+ sampleRate: number;
1623
+ chunkSize: number;
1631
1624
  }
1632
-
1633
- /** CTC vocabulary (32 tokens from wav2vec2-base-960h) */
1634
- declare const CTC_VOCAB: string[];
1635
- interface Wav2Vec2Result {
1636
- /** Blendshape weights [frames, 52] - 30fps */
1637
- blendshapes: Float32Array[];
1638
- /** Raw CTC logits [frames, 32] - 50fps */
1639
- asrLogits: Float32Array[];
1640
- /** Decoded text from CTC */
1641
- text: string;
1642
- /** Number of blendshape frames (30fps) — alias for numA2EFrames */
1643
- numFrames: number;
1644
- /** Number of A2E frames (30fps) */
1645
- numA2EFrames: number;
1646
- /** Number of ASR frames (50fps) */
1647
- numASRFrames: number;
1648
- /** Inference time in ms */
1625
+ /**
1626
+ * Result from a single VAD inference
1627
+ */
1628
+ interface VADResult {
1629
+ /** Speech probability (0-1) */
1630
+ probability: number;
1631
+ /** Whether speech is detected (probability > threshold) */
1632
+ isSpeech: boolean;
1633
+ /** Inference time in milliseconds */
1649
1634
  inferenceTimeMs: number;
1635
+ /**
1636
+ * Pre-speech audio chunks (only present on first speech detection).
1637
+ * These are the N chunks immediately before VAD triggered, useful for
1638
+ * capturing the beginning of speech that occurred before detection.
1639
+ *
1640
+ * Only populated when transitioning from silence to speech.
1641
+ */
1642
+ preSpeechChunks?: Float32Array[];
1650
1643
  }
1651
- declare class Wav2Vec2Inference implements LipSyncBackend {
1652
- readonly modelId: "wav2vec2";
1644
+ /**
1645
+ * Speech segment detected by VAD
1646
+ */
1647
+ interface SpeechSegment {
1648
+ /** Start time in seconds */
1649
+ start: number;
1650
+ /** End time in seconds */
1651
+ end: number;
1652
+ /** Average probability during segment */
1653
+ avgProbability: number;
1654
+ }
1655
+ /**
1656
+ * Silero VAD - Neural network voice activity detection
1657
+ *
1658
+ * Based on snakers4/silero-vad ONNX model.
1659
+ * Processes 32ms chunks (512 samples at 16kHz) with LSTM state.
1660
+ *
1661
+ * @see https://github.com/snakers4/silero-vad
1662
+ */
1663
+ declare class SileroVADInference {
1653
1664
  private session;
1654
1665
  private ort;
1655
1666
  private config;
1656
1667
  private _backend;
1657
1668
  private isLoading;
1658
- private numIdentityClasses;
1669
+ private state;
1670
+ private context;
1671
+ private readonly chunkSize;
1672
+ private readonly contextSize;
1659
1673
  private inferenceQueue;
1660
- private poisoned;
1661
- private static readonly INFERENCE_TIMEOUT_MS;
1662
- constructor(config: Wav2Vec2InferenceConfig);
1674
+ private preSpeechBuffer;
1675
+ private wasSpeaking;
1676
+ private srTensor;
1677
+ constructor(config: SileroVADConfig);
1678
+ get backend(): RuntimeBackend | null;
1679
+ get isLoaded(): boolean;
1680
+ get sampleRate(): number;
1681
+ get threshold(): number;
1682
+ /**
1683
+ * Get required chunk size in samples
1684
+ */
1685
+ getChunkSize(): number;
1686
+ /**
1687
+ * Get chunk duration in milliseconds
1688
+ */
1689
+ getChunkDurationMs(): number;
1663
1690
  /**
1664
1691
  * Check if WebGPU is available and working
1665
1692
  * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
1666
1693
  */
1667
1694
  static isWebGPUAvailable: typeof isWebGPUAvailable;
1668
- get backend(): 'webgpu' | 'wasm' | null;
1669
- get isLoaded(): boolean;
1670
- /** True if inference timed out and the session is permanently unusable */
1671
- get isSessionPoisoned(): boolean;
1672
1695
  /**
1673
1696
  * Load the ONNX model
1674
1697
  */
1675
- load(): Promise<ModelInfo>;
1698
+ load(): Promise<VADModelInfo>;
1676
1699
  /**
1677
- * Run inference on raw audio
1678
- * @param audioSamples - Float32Array of raw audio at 16kHz (16000 samples = 1 second)
1679
- * @param identityIndex - Optional identity index (0-11, default 0 = neutral)
1700
+ * Reset state for new audio stream
1701
+ */
1702
+ reset(): void;
1703
+ /**
1704
+ * Process a single audio chunk
1680
1705
  *
1681
- * Note: Model expects 1-second chunks (16000 samples) for optimal performance.
1682
- * Audio will be zero-padded or truncated to 16000 samples.
1706
+ * @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
1707
+ * @returns VAD result with speech probability
1683
1708
  */
1684
- infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
1709
+ process(audioChunk: Float32Array): Promise<VADResult>;
1685
1710
  /**
1686
- * Decode CTC logits to text using greedy decoding
1711
+ * Process audio and detect speech segments
1712
+ *
1713
+ * @param audio - Complete audio buffer
1714
+ * @param options - Detection options
1715
+ * @returns Array of speech segments
1687
1716
  */
1688
- private decodeCTC;
1717
+ detectSpeech(audio: Float32Array, options?: {
1718
+ /** Minimum speech duration in ms (default: 250) */
1719
+ minSpeechDurationMs?: number;
1720
+ /** Minimum silence duration to end segment in ms (default: 300) */
1721
+ minSilenceDurationMs?: number;
1722
+ /** Padding to add before/after speech in ms (default: 30) */
1723
+ speechPadMs?: number;
1724
+ }): Promise<SpeechSegment[]>;
1689
1725
  /**
1690
1726
  * Queue inference to serialize ONNX session calls
1691
1727
  */
1692
1728
  private queueInference;
1693
- /**
1694
- * Get blendshape value by name for a specific frame
1695
- */
1696
- getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
1697
1729
  /**
1698
1730
  * Dispose of the model and free resources
1699
1731
  */
@@ -1701,356 +1733,927 @@ declare class Wav2Vec2Inference implements LipSyncBackend {
1701
1733
  }
1702
1734
 
1703
1735
  /**
1704
- * CPU-optimized lip sync inference using wav2arkit_cpu model
1705
- *
1706
- * A Safari/iOS-compatible alternative to Wav2Vec2Inference (384MB) designed
1707
- * for platforms where WebGPU crashes due to ONNX Runtime JSEP bugs.
1736
+ * Silero VAD Web Worker implementation
1708
1737
  *
1709
- * The model uses ONNX external data format:
1710
- * - wav2arkit_cpu.onnx (1.86MB graph structure)
1711
- * - wav2arkit_cpu.onnx.data (402MB weights)
1712
- * Both files are fetched and cached automatically.
1738
+ * Runs Silero VAD inference in a dedicated Web Worker to prevent main thread blocking.
1739
+ * Uses inline worker script (Blob URL pattern) to avoid separate file deployment.
1713
1740
  *
1714
- * Key differences from Wav2Vec2Inference:
1715
- * - WASM-only backend (CPU-optimized, single-threaded, no WebGPU)
1716
- * - No identity input (baked to identity 11)
1717
- * - No ASR output (lip sync only)
1718
- * - Dynamic input length (not fixed to 16000 samples)
1719
- * - Different native blendshape ordering (remapped to LAM_BLENDSHAPES)
1741
+ * Key design decisions:
1742
+ * - WASM backend only (WebGPU doesn't work in Workers)
1743
+ * - LSTM state serialized as Float32Array (Tensors can't cross worker boundary)
1744
+ * - Audio copied (not transferred) to retain main thread access for pre-speech buffer
1745
+ * - ONNX Runtime loaded from CDN in worker (no bundler complications)
1720
1746
  *
1721
1747
  * @category Inference
1722
1748
  *
1723
- * @example
1749
+ * @example Basic usage
1724
1750
  * ```typescript
1725
- * import { Wav2ArkitCpuInference } from '@omote/core';
1751
+ * import { SileroVADWorker } from '@omote/core';
1726
1752
  *
1727
- * const lam = new Wav2ArkitCpuInference({
1728
- * modelUrl: '/models/wav2arkit_cpu.onnx',
1753
+ * const vad = new SileroVADWorker({
1754
+ * modelUrl: '/models/silero-vad.onnx'
1729
1755
  * });
1730
- * await lam.load();
1756
+ * await vad.load();
1731
1757
  *
1732
- * const { blendshapes } = await lam.infer(audioSamples);
1733
- * // blendshapes: Float32Array[] in LAM_BLENDSHAPES order, 30fps
1758
+ * // Process 32ms chunks (512 samples at 16kHz)
1759
+ * const result = await vad.process(audioChunk);
1760
+ * if (result.isSpeech) {
1761
+ * console.log('Speech detected!', result.probability);
1762
+ * }
1734
1763
  * ```
1735
1764
  */
1736
1765
 
1737
- interface Wav2ArkitCpuConfig {
1738
- /** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
1766
+ /**
1767
+ * Configuration for Silero VAD Worker
1768
+ */
1769
+ interface VADWorkerConfig {
1770
+ /** Path or URL to the ONNX model */
1739
1771
  modelUrl: string;
1772
+ /** Sample rate (8000 or 16000, default: 16000) */
1773
+ sampleRate?: 8000 | 16000;
1774
+ /** Speech probability threshold (default: 0.5) */
1775
+ threshold?: number;
1740
1776
  /**
1741
- * Path or URL to external model data file (.onnx.data weights).
1742
- * Default: `${modelUrl}.data` (e.g., /models/wav2arkit_cpu.onnx.data)
1777
+ * Number of audio chunks to keep in pre-speech buffer.
1778
+ * When VAD triggers, these chunks are prepended to the speech buffer
1779
+ * to capture the beginning of speech that occurred before detection.
1743
1780
  *
1744
- * Set to `false` to skip external data loading (single-file models only).
1781
+ * At 512 samples/chunk and 16kHz:
1782
+ * - 10 chunks = 320ms of pre-speech audio
1783
+ * - 15 chunks = 480ms of pre-speech audio
1784
+ *
1785
+ * Default: 10 chunks (320ms)
1745
1786
  */
1746
- externalDataUrl?: string | false;
1747
- /** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
1748
- backend?: BackendPreference;
1787
+ preSpeechBufferChunks?: number;
1749
1788
  }
1750
- declare class Wav2ArkitCpuInference implements LipSyncBackend {
1751
- readonly modelId: "wav2arkit_cpu";
1752
- private session;
1753
- private ort;
1789
+ /**
1790
+ * VAD model loading information from worker
1791
+ */
1792
+ interface VADWorkerModelInfo {
1793
+ backend: 'wasm';
1794
+ loadTimeMs: number;
1795
+ inputNames: string[];
1796
+ outputNames: string[];
1797
+ sampleRate: number;
1798
+ chunkSize: number;
1799
+ }
1800
+
1801
+ /**
1802
+ * Silero VAD Worker - Voice Activity Detection in a Web Worker
1803
+ *
1804
+ * Runs Silero VAD inference off the main thread to prevent UI blocking.
1805
+ * Feature parity with SileroVADInference but runs in dedicated worker.
1806
+ *
1807
+ * @see SileroVADInference for main-thread version
1808
+ */
1809
+ declare class SileroVADWorker {
1810
+ private worker;
1754
1811
  private config;
1755
- private _backend;
1756
1812
  private isLoading;
1813
+ private _isLoaded;
1814
+ private state;
1815
+ private context;
1816
+ private readonly chunkSize;
1817
+ private readonly contextSize;
1757
1818
  private inferenceQueue;
1758
- constructor(config: Wav2ArkitCpuConfig);
1759
- get backend(): RuntimeBackend | null;
1819
+ private preSpeechBuffer;
1820
+ private wasSpeaking;
1821
+ private pendingResolvers;
1822
+ private messageId;
1823
+ constructor(config: VADWorkerConfig);
1760
1824
  get isLoaded(): boolean;
1761
1825
  /**
1762
- * Load the ONNX model
1826
+ * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
1763
1827
  */
1764
- load(): Promise<LipSyncModelInfo>;
1828
+ get backend(): 'wasm' | null;
1829
+ get sampleRate(): number;
1830
+ get threshold(): number;
1765
1831
  /**
1766
- * Run inference on raw audio
1767
- *
1768
- * Accepts variable-length audio (not fixed to 16000 samples).
1769
- * Output frames = ceil(30 * numSamples / 16000).
1770
- *
1771
- * @param audioSamples - Float32Array of raw audio at 16kHz
1772
- * @param _identityIndex - Ignored (identity 11 is baked into the model)
1832
+ * Get required chunk size in samples
1773
1833
  */
1774
- infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
1834
+ getChunkSize(): number;
1775
1835
  /**
1776
- * Queue inference to serialize ONNX session calls
1836
+ * Get chunk duration in milliseconds
1837
+ */
1838
+ getChunkDurationMs(): number;
1839
+ /**
1840
+ * Create the worker from inline script
1841
+ */
1842
+ private createWorker;
1843
+ /**
1844
+ * Handle messages from worker
1845
+ */
1846
+ private handleWorkerMessage;
1847
+ /**
1848
+ * Send message to worker and wait for response
1849
+ */
1850
+ private sendMessage;
1851
+ /**
1852
+ * Load the ONNX model in the worker
1853
+ */
1854
+ load(): Promise<VADWorkerModelInfo>;
1855
+ /**
1856
+ * Reset state for new audio stream
1857
+ */
1858
+ reset(): Promise<void>;
1859
+ /**
1860
+ * Process a single audio chunk
1861
+ *
1862
+ * @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
1863
+ * @returns VAD result with speech probability
1864
+ */
1865
+ process(audioChunk: Float32Array): Promise<VADResult>;
1866
+ /**
1867
+ * Queue inference to serialize worker calls
1777
1868
  */
1778
1869
  private queueInference;
1779
1870
  /**
1780
- * Dispose of the model and free resources
1871
+ * Dispose of the worker and free resources
1781
1872
  */
1782
1873
  dispose(): Promise<void>;
1874
+ /**
1875
+ * Check if Web Workers are supported
1876
+ */
1877
+ static isSupported(): boolean;
1783
1878
  }
1784
1879
 
1785
1880
  /**
1786
- * Factory function for lip sync with automatic GPU/CPU model selection
1787
- *
1788
- * Provides a unified API that automatically selects the optimal model:
1789
- * - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (404MB, WASM)
1790
- * - Chrome/Firefox/Edge: Uses Wav2Vec2Inference (384MB, WebGPU)
1791
- * - Fallback: Gracefully falls back to CPU model if GPU model fails to load
1881
+ * Factory function for Silero VAD with automatic Worker vs main thread selection
1792
1882
  *
1793
- * Why two separate models?
1794
- * Wav2Vec2 (LAM) cannot run on Safari/iOS for two reasons:
1795
- * 1. Its dual-head transformer graph needs ~750-950MB peak during ORT session
1796
- * creation (graph optimization), exceeding iOS WebKit's ~1-1.5GB tab limit.
1797
- * 2. It ships as a single 384MB .onnx file that must load into JS heap before
1798
- * ORT can consume it. iOS WebKit OOMs on this allocation.
1799
- * wav2arkit_cpu solves both: external data format (1.86MB graph + 402MB weights)
1800
- * lets ORT load only the tiny graph, then stream weights via URL pass-through
1801
- * directly into WASM memory. JS heap stays at ~2MB.
1883
+ * Provides a unified API that automatically selects the optimal implementation:
1884
+ * - Desktop browsers: Uses SileroVADWorker (off-main-thread inference)
1885
+ * - Mobile devices: Uses SileroVADInference (main thread, avoids memory overhead)
1886
+ * - Fallback: Gracefully falls back to main thread if Worker fails
1802
1887
  *
1803
1888
  * @category Inference
1804
1889
  *
1805
- * @example Auto-detect (recommended)
1890
+ * @example Basic usage (auto-detect)
1806
1891
  * ```typescript
1807
- * import { createLipSync } from '@omote/core';
1892
+ * import { createSileroVAD } from '@omote/core';
1808
1893
  *
1809
- * const lam = createLipSync({
1810
- * gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
1811
- * cpuModelUrl: '/models/wav2arkit_cpu.onnx',
1894
+ * const vad = createSileroVAD({
1895
+ * modelUrl: '/models/silero-vad.onnx',
1896
+ * threshold: 0.5,
1812
1897
  * });
1813
1898
  *
1814
- * await lam.load();
1815
- * const { blendshapes } = await lam.infer(audioSamples);
1899
+ * await vad.load();
1900
+ * const result = await vad.process(audioChunk);
1901
+ * if (result.isSpeech) {
1902
+ * console.log('Speech detected!', result.probability);
1903
+ * }
1816
1904
  * ```
1817
1905
  *
1818
- * @example Force CPU model
1906
+ * @example Force worker usage
1819
1907
  * ```typescript
1820
- * const lam = createLipSync({
1821
- * gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
1822
- * cpuModelUrl: '/models/wav2arkit_cpu.onnx',
1823
- * mode: 'cpu',
1908
+ * const vad = createSileroVAD({
1909
+ * modelUrl: '/models/silero-vad.onnx',
1910
+ * useWorker: true, // Force Worker even on mobile
1911
+ * });
1912
+ * ```
1913
+ *
1914
+ * @example Force main thread
1915
+ * ```typescript
1916
+ * const vad = createSileroVAD({
1917
+ * modelUrl: '/models/silero-vad.onnx',
1918
+ * useWorker: false, // Force main thread
1824
1919
  * });
1825
1920
  * ```
1826
1921
  */
1827
1922
 
1828
1923
  /**
1829
- * Configuration for the lip sync factory
1924
+ * Common interface for SileroVADInference, SileroVADWorker, and SileroVADUnifiedAdapter
1925
+ *
1926
+ * This interface defines the shared API that all implementations provide,
1927
+ * allowing consumers to use any of them interchangeably.
1830
1928
  */
1831
- interface CreateLipSyncConfig {
1832
- /** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge) */
1833
- gpuModelUrl: string;
1929
+ interface SileroVADBackend {
1930
+ /** Current backend type (webgpu, wasm, or null if not loaded) */
1931
+ readonly backend: RuntimeBackend | null;
1932
+ /** Whether the model is loaded and ready for inference */
1933
+ readonly isLoaded: boolean;
1934
+ /** Audio sample rate (8000 or 16000 Hz) */
1935
+ readonly sampleRate: number;
1936
+ /** Speech detection threshold (0-1) */
1937
+ readonly threshold: number;
1834
1938
  /**
1835
- * URL for GPU model external data file (.onnx.data weights).
1836
- * Default: `${gpuModelUrl}.data`
1837
- *
1838
- * Set to `false` to skip external data loading (single-file models only).
1939
+ * Load the ONNX model
1940
+ * @returns Model loading information
1839
1941
  */
1840
- gpuExternalDataUrl?: string | false;
1841
- /** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS) */
1842
- cpuModelUrl: string;
1942
+ load(): Promise<VADModelInfo | VADWorkerModelInfo>;
1843
1943
  /**
1844
- * Model selection mode:
1845
- * - 'auto': Safari/iOS CPU, everything else → GPU (default)
1846
- * - 'gpu': Force GPU model (Wav2Vec2Inference)
1847
- * - 'cpu': Force CPU model (Wav2ArkitCpuInference)
1944
+ * Process a single audio chunk
1945
+ * @param audioChunk - Float32Array of exactly chunkSize samples
1946
+ * @returns VAD result with speech probability
1848
1947
  */
1849
- mode?: 'auto' | 'gpu' | 'cpu';
1850
- /** Backend preference for GPU model (default: 'auto') */
1851
- gpuBackend?: BackendPreference;
1852
- /** Number of identity classes for GPU model (default: 12) */
1853
- numIdentityClasses?: number;
1948
+ process(audioChunk: Float32Array): Promise<VADResult>;
1854
1949
  /**
1855
- * Fall back to CPU model if GPU model fails to load (default: true)
1856
- * Only applies when mode is 'auto' or 'gpu'
1950
+ * Reset state for new audio stream
1951
+ */
1952
+ reset(): void | Promise<void>;
1953
+ /**
1954
+ * Dispose of the model and free resources
1955
+ */
1956
+ dispose(): Promise<void>;
1957
+ /**
1958
+ * Get required chunk size in samples
1959
+ */
1960
+ getChunkSize(): number;
1961
+ /**
1962
+ * Get chunk duration in milliseconds
1963
+ */
1964
+ getChunkDurationMs(): number;
1965
+ }
1966
+ /**
1967
+ * Configuration for the Silero VAD factory
1968
+ *
1969
+ * Extends SileroVADConfig with worker-specific options.
1970
+ */
1971
+ interface SileroVADFactoryConfig extends SileroVADConfig {
1972
+ /**
1973
+ * Force worker usage (true), main thread (false), or auto-detect (undefined).
1974
+ *
1975
+ * Auto-detection behavior:
1976
+ * - Desktop: Uses Worker (better responsiveness, off-main-thread)
1977
+ * - Mobile: Uses main thread (avoids 5MB memory overhead)
1978
+ *
1979
+ * You can override this to:
1980
+ * - `true`: Force Worker even on mobile (if you have memory headroom)
1981
+ * - `false`: Force main thread even on desktop (for debugging)
1982
+ *
1983
+ * Default: undefined (auto-detect)
1984
+ */
1985
+ useWorker?: boolean;
1986
+ /**
1987
+ * Fall back to main thread on worker errors.
1988
+ *
1989
+ * When true (default), if the Worker fails to load or encounters an error,
1990
+ * the factory will automatically create a main thread instance instead.
1991
+ *
1992
+ * When false, worker errors will propagate as exceptions.
1993
+ *
1994
+ * Default: true
1857
1995
  */
1858
1996
  fallbackOnError?: boolean;
1997
+ /**
1998
+ * Unified inference worker instance.
1999
+ * When provided, uses SileroVADUnifiedAdapter (shared single-ORT worker).
2000
+ * Takes precedence over useWorker setting.
2001
+ */
2002
+ unifiedWorker?: UnifiedInferenceWorker;
1859
2003
  }
1860
2004
  /**
1861
- * Create a lip sync instance with automatic GPU/CPU model selection
2005
+ * Check if the current environment supports VAD Web Workers
2006
+ *
2007
+ * Requirements:
2008
+ * - Worker constructor must exist
2009
+ * - Blob URL support (for inline worker script)
2010
+ *
2011
+ * @returns true if VAD Worker is supported
2012
+ */
2013
+ declare function supportsVADWorker(): boolean;
2014
+ /**
2015
+ * Create a Silero VAD instance with automatic implementation selection
2016
+ *
2017
+ * This factory function automatically selects between:
2018
+ * - **SileroVADWorker**: Off-main-thread inference (better for desktop)
2019
+ * - **SileroVADInference**: Main thread inference (better for mobile)
2020
+ *
2021
+ * The selection is based on:
2022
+ * 1. Explicit `useWorker` config (if provided)
2023
+ * 2. Platform detection (mobile vs desktop)
2024
+ * 3. Worker API availability
2025
+ *
2026
+ * Both implementations share the same interface (SileroVADBackend),
2027
+ * so consumers can use either interchangeably.
1862
2028
  *
1863
2029
  * @param config - Factory configuration
1864
- * @returns A LipSyncBackend instance (either GPU or CPU model)
2030
+ * @returns A SileroVADBackend instance (either Worker or main thread)
2031
+ *
2032
+ * @example
2033
+ * ```typescript
2034
+ * // Auto-detect (recommended)
2035
+ * const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
2036
+ *
2037
+ * // Force Worker
2038
+ * const vadWorker = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: true });
2039
+ *
2040
+ * // Force main thread
2041
+ * const vadMain = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: false });
2042
+ * ```
1865
2043
  */
1866
- declare function createLipSync(config: CreateLipSyncConfig): LipSyncBackend;
2044
+ declare function createSileroVAD(config: SileroVADFactoryConfig): SileroVADBackend;
1867
2045
 
1868
2046
  /**
1869
- * Silero VAD (Voice Activity Detection) inference
2047
+ * Web Worker-based wav2arkit_cpu lip sync inference
1870
2048
  *
1871
- * Neural network-based VAD running in browser via ONNX Runtime Web.
1872
- * Much more accurate than RMS-based energy detection.
2049
+ * Runs wav2arkit_cpu inference in a dedicated Web Worker to prevent main thread blocking.
2050
+ * Uses inline worker script (Blob URL pattern) to avoid separate file deployment.
1873
2051
  *
1874
- * Uses lazy loading to conditionally load WebGPU or WASM-only bundle:
1875
- * - iOS: Loads WASM-only bundle (WebGPU crashes due to Safari bugs)
1876
- * - Android/Desktop: Loads WebGPU bundle (with WASM fallback)
2052
+ * Key design decisions:
2053
+ * - WASM backend only (WebGPU doesn't work in Workers)
2054
+ * - Audio copied (not transferred) to retain main thread access
2055
+ * - ONNX Runtime loaded from CDN in worker (no bundler complications)
2056
+ * - Blendshape symmetrization inlined in worker (no module imports)
2057
+ * - iOS: passes model URLs as strings directly to ORT (avoids 400MB+ JS heap)
1877
2058
  *
1878
2059
  * @category Inference
1879
2060
  *
1880
- * @example Basic usage
2061
+ * @example
1881
2062
  * ```typescript
1882
- * import { SileroVADInference } from '@omote/core';
2063
+ * import { Wav2ArkitCpuWorker } from '@omote/core';
1883
2064
  *
1884
- * const vad = new SileroVADInference({
1885
- * modelUrl: '/models/silero-vad.onnx'
2065
+ * const lam = new Wav2ArkitCpuWorker({
2066
+ * modelUrl: '/models/wav2arkit_cpu.onnx',
1886
2067
  * });
1887
- * await vad.load();
2068
+ * await lam.load();
1888
2069
  *
1889
- * // Process 32ms chunks (512 samples at 16kHz)
1890
- * const probability = await vad.process(audioChunk);
1891
- * if (probability > 0.5) {
1892
- * console.log('Speech detected!');
1893
- * }
2070
+ * const { blendshapes } = await lam.infer(audioSamples);
2071
+ * // blendshapes: Float32Array[] in LAM_BLENDSHAPES order, 30fps
1894
2072
  * ```
1895
- *
1896
- * @example Streaming with state management
1897
- * ```typescript
1898
- * // State is automatically maintained between process() calls
1899
- * // Call reset() when starting a new audio stream
1900
- * vad.reset();
1901
- *
1902
- * for (const chunk of audioChunks) {
1903
- * const prob = await vad.process(chunk);
1904
- * // prob is speech probability [0, 1]
1905
- * }
2073
+ */
2074
+
2075
+ /**
2076
+ * Configuration for Wav2ArkitCpu Worker
2077
+ */
2078
+ interface Wav2ArkitCpuWorkerConfig {
2079
+ /** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
2080
+ modelUrl: string;
2081
+ /**
2082
+ * Path or URL to external model data file (.onnx.data weights).
2083
+ * Default: `${modelUrl}.data` (e.g., /models/wav2arkit_cpu.onnx.data)
2084
+ *
2085
+ * Set to `false` to skip external data loading (single-file models only).
2086
+ */
2087
+ externalDataUrl?: string | false;
2088
+ }
2089
+ /**
2090
+ * Wav2ArkitCpu Worker - Lip sync inference in a Web Worker
2091
+ *
2092
+ * Runs wav2arkit_cpu inference off the main thread to prevent UI blocking.
2093
+ * Feature parity with Wav2ArkitCpuInference but runs in dedicated worker.
2094
+ *
2095
+ * @see Wav2ArkitCpuInference for main-thread version
2096
+ */
2097
+ declare class Wav2ArkitCpuWorker implements LipSyncBackend {
2098
+ readonly modelId: "wav2arkit_cpu";
2099
+ private worker;
2100
+ private config;
2101
+ private isLoading;
2102
+ private _isLoaded;
2103
+ private inferenceQueue;
2104
+ private poisoned;
2105
+ private pendingResolvers;
2106
+ constructor(config: Wav2ArkitCpuWorkerConfig);
2107
+ get isLoaded(): boolean;
2108
+ /**
2109
+ * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
2110
+ */
2111
+ get backend(): 'wasm' | null;
2112
+ /**
2113
+ * Create the worker from inline script
2114
+ */
2115
+ private createWorker;
2116
+ /**
2117
+ * Handle messages from worker
2118
+ */
2119
+ private handleWorkerMessage;
2120
+ /**
2121
+ * Send message to worker and wait for response
2122
+ */
2123
+ private sendMessage;
2124
+ /**
2125
+ * Load the ONNX model in the worker
2126
+ */
2127
+ load(): Promise<LipSyncModelInfo>;
2128
+ /**
2129
+ * Run inference on raw audio
2130
+ *
2131
+ * Accepts variable-length audio (not fixed to 16000 samples).
2132
+ * Output frames = ceil(30 * numSamples / 16000).
2133
+ *
2134
+ * @param audioSamples - Float32Array of raw audio at 16kHz
2135
+ * @param _identityIndex - Ignored (identity 11 is baked into the model)
2136
+ */
2137
+ infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
2138
+ /**
2139
+ * Queue inference to serialize worker calls
2140
+ */
2141
+ private queueInference;
2142
+ /**
2143
+ * Dispose of the worker and free resources
2144
+ */
2145
+ dispose(): Promise<void>;
2146
+ /**
2147
+ * Check if Web Workers are supported
2148
+ */
2149
+ static isSupported(): boolean;
2150
+ }
2151
+
2152
+ /**
2153
+ * Unified Inference Worker — single Web Worker hosting all WASM models
2154
+ *
2155
+ * Solves the multi-worker ORT problem: three per-model workers each load their
2156
+ * own ORT WASM instance (~40MB each). On iOS this exceeds the ~1-1.5GB tab
2157
+ * limit, forcing main-thread fallback which blocks the render loop.
2158
+ *
2159
+ * This worker hosts SenseVoice + Wav2ArkitCpu + Silero VAD in a single
2160
+ * ORT WASM instance. Same total model memory (~643MB), but inference runs
2161
+ * off-main-thread. Works on iOS because there's only one ORT instance.
2162
+ *
2163
+ * Consumer usage:
2164
+ * ```typescript
2165
+ * const worker = new UnifiedInferenceWorker();
2166
+ * await worker.init();
2167
+ *
2168
+ * const asr = createSenseVoice({ modelUrl: '...', unifiedWorker: worker });
2169
+ * const lam = createLipSync({ gpuModelUrl: '...', cpuModelUrl: '...', unifiedWorker: worker });
2170
+ * const vad = createSileroVAD({ modelUrl: '...', unifiedWorker: worker });
2171
+ * ```
2172
+ *
2173
+ * @category Inference
2174
+ */
2175
+
2176
+ /**
2177
+ * Unified Inference Worker — single Web Worker for all WASM models
2178
+ *
2179
+ * Hosts SenseVoice, Wav2ArkitCpu, and Silero VAD in one ORT instance.
2180
+ * Eliminates the multi-worker memory problem on iOS.
2181
+ */
2182
+ declare class UnifiedInferenceWorker {
2183
+ private worker;
2184
+ private pendingRequests;
2185
+ private initialized;
2186
+ private poisoned;
2187
+ /**
2188
+ * Initialize the worker (load ORT WASM from CDN)
2189
+ */
2190
+ init(): Promise<void>;
2191
+ loadSenseVoice(config: {
2192
+ modelUrl: string;
2193
+ tokensUrl: string;
2194
+ language: number;
2195
+ textNorm: number;
2196
+ }): Promise<SenseVoiceModelInfo>;
2197
+ transcribe(audio: Float32Array): Promise<SenseVoiceResult>;
2198
+ disposeSenseVoice(): Promise<void>;
2199
+ loadLipSync(config: {
2200
+ modelUrl: string;
2201
+ externalDataUrl: string | null;
2202
+ }): Promise<LipSyncModelInfo>;
2203
+ inferLipSync(audio: Float32Array): Promise<{
2204
+ blendshapes: Float32Array;
2205
+ numFrames: number;
2206
+ numBlendshapes: number;
2207
+ inferenceTimeMs: number;
2208
+ }>;
2209
+ disposeLipSync(): Promise<void>;
2210
+ loadVAD(config: {
2211
+ modelUrl: string;
2212
+ sampleRate: number;
2213
+ }): Promise<VADWorkerModelInfo>;
2214
+ processVAD(audio: Float32Array, state: Float32Array, context: Float32Array): Promise<{
2215
+ probability: number;
2216
+ state: Float32Array;
2217
+ inferenceTimeMs: number;
2218
+ }>;
2219
+ resetVAD(): Promise<Float32Array>;
2220
+ disposeVAD(): Promise<void>;
2221
+ dispose(): Promise<void>;
2222
+ /** Check if the worker is initialized and not poisoned */
2223
+ get isReady(): boolean;
2224
+ /** Check if Web Workers are supported */
2225
+ static isSupported(): boolean;
2226
+ private assertReady;
2227
+ private createWorker;
2228
+ private handleWorkerMessage;
2229
+ private sendMessage;
2230
+ private rejectAllPending;
2231
+ private cleanup;
2232
+ }
2233
+ /**
2234
+ * SenseVoice adapter backed by UnifiedInferenceWorker
2235
+ *
2236
+ * Implements SenseVoiceBackend, delegating all inference to the shared worker.
2237
+ */
2238
+ declare class SenseVoiceUnifiedAdapter implements SenseVoiceBackend {
2239
+ private worker;
2240
+ private config;
2241
+ private _isLoaded;
2242
+ private languageId;
2243
+ private textNormId;
2244
+ private inferenceQueue;
2245
+ constructor(worker: UnifiedInferenceWorker, config: SenseVoiceWorkerConfig);
2246
+ get isLoaded(): boolean;
2247
+ get backend(): 'wasm' | null;
2248
+ load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
2249
+ transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
2250
+ dispose(): Promise<void>;
2251
+ }
2252
+ /**
2253
+ * Wav2ArkitCpu adapter backed by UnifiedInferenceWorker
2254
+ *
2255
+ * Implements LipSyncBackend, delegating all inference to the shared worker.
2256
+ */
2257
+ declare class Wav2ArkitCpuUnifiedAdapter implements LipSyncBackend {
2258
+ readonly modelId: "wav2arkit_cpu";
2259
+ private worker;
2260
+ private config;
2261
+ private _isLoaded;
2262
+ private inferenceQueue;
2263
+ constructor(worker: UnifiedInferenceWorker, config: Wav2ArkitCpuWorkerConfig);
2264
+ get isLoaded(): boolean;
2265
+ get backend(): RuntimeBackend | null;
2266
+ load(): Promise<LipSyncModelInfo>;
2267
+ infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
2268
+ dispose(): Promise<void>;
2269
+ }
2270
+ /**
2271
+ * Silero VAD adapter backed by UnifiedInferenceWorker
2272
+ *
2273
+ * Implements SileroVADBackend, delegating all inference to the shared worker.
2274
+ */
2275
+ declare class SileroVADUnifiedAdapter implements SileroVADBackend {
2276
+ private worker;
2277
+ private config;
2278
+ private _isLoaded;
2279
+ private state;
2280
+ private context;
2281
+ private readonly chunkSize;
2282
+ private readonly contextSize;
2283
+ private inferenceQueue;
2284
+ private preSpeechBuffer;
2285
+ private wasSpeaking;
2286
+ constructor(worker: UnifiedInferenceWorker, config: SileroVADConfig);
2287
+ get isLoaded(): boolean;
2288
+ get backend(): RuntimeBackend | null;
2289
+ get sampleRate(): number;
2290
+ get threshold(): number;
2291
+ getChunkSize(): number;
2292
+ getChunkDurationMs(): number;
2293
+ load(): Promise<VADWorkerModelInfo>;
2294
+ process(audioChunk: Float32Array): Promise<VADResult>;
2295
+ reset(): Promise<void>;
2296
+ dispose(): Promise<void>;
2297
+ }
2298
+
2299
+ /**
2300
+ * Factory function for SenseVoice ASR with automatic Worker vs main thread selection
2301
+ *
2302
+ * Provides a unified API that automatically selects the optimal implementation:
2303
+ * - Worker supported: Uses SenseVoiceWorker (off-main-thread inference)
2304
+ * - Worker unsupported: Uses SenseVoiceInference (main thread)
2305
+ *
2306
+ * @category Inference
2307
+ *
2308
+ * @example Auto-detect (recommended)
2309
+ * ```typescript
2310
+ * import { createSenseVoice } from '@omote/core';
2311
+ *
2312
+ * const asr = createSenseVoice({
2313
+ * modelUrl: '/models/sensevoice/model.int8.onnx',
2314
+ * });
2315
+ * await asr.load();
2316
+ * const { text, emotion } = await asr.transcribe(audioSamples);
2317
+ * ```
2318
+ *
2319
+ * @example Force worker
2320
+ * ```typescript
2321
+ * const asr = createSenseVoice({
2322
+ * modelUrl: '/models/sensevoice/model.int8.onnx',
2323
+ * useWorker: true,
2324
+ * });
2325
+ * ```
2326
+ *
2327
+ * @example Force main thread
2328
+ * ```typescript
2329
+ * const asr = createSenseVoice({
2330
+ * modelUrl: '/models/sensevoice/model.int8.onnx',
2331
+ * useWorker: false,
2332
+ * });
2333
+ * ```
2334
+ */
2335
+
2336
+ /**
2337
+ * Common interface for SenseVoiceInference, SenseVoiceWorker, and SenseVoiceUnifiedAdapter
2338
+ */
2339
+ interface SenseVoiceBackend {
2340
+ /** Whether the model is loaded and ready for inference */
2341
+ readonly isLoaded: boolean;
2342
+ /** Current backend type ('wasm' for worker, 'wasm' or 'webgpu' for main thread, null if not loaded) */
2343
+ readonly backend: 'wasm' | 'webgpu' | null;
2344
+ /**
2345
+ * Load the ONNX model
2346
+ * @param onProgress - Optional progress callback (fires once at 100% for worker)
2347
+ * @returns Model loading information
2348
+ */
2349
+ load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
2350
+ /**
2351
+ * Transcribe audio samples to text
2352
+ * @param audioSamples - Float32Array of audio samples at 16kHz
2353
+ * @returns Transcription result
2354
+ */
2355
+ transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
2356
+ /**
2357
+ * Dispose of the model and free resources
2358
+ */
2359
+ dispose(): Promise<void>;
2360
+ }
2361
+ /**
2362
+ * Configuration for the SenseVoice factory
2363
+ */
2364
+ interface CreateSenseVoiceConfig {
2365
+ /** Path or URL to model.int8.onnx (239MB) */
2366
+ modelUrl: string;
2367
+ /** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
2368
+ tokensUrl?: string;
2369
+ /** Language hint (default: 'auto') */
2370
+ language?: SenseVoiceLanguage;
2371
+ /** Text normalization (default: 'with_itn') */
2372
+ textNorm?: 'with_itn' | 'without_itn';
2373
+ /**
2374
+ * Worker mode:
2375
+ * - 'auto' (default): Use Worker if supported, else main thread
2376
+ * - true: Force Worker (throws if unsupported)
2377
+ * - false: Force main thread
2378
+ */
2379
+ useWorker?: boolean | 'auto';
2380
+ /**
2381
+ * Unified inference worker instance.
2382
+ * When provided, uses SenseVoiceUnifiedAdapter (shared single-ORT worker).
2383
+ * Takes precedence over useWorker setting.
2384
+ */
2385
+ unifiedWorker?: UnifiedInferenceWorker;
2386
+ }
2387
+ /**
2388
+ * Create a SenseVoice ASR instance with automatic implementation selection
2389
+ *
2390
+ * @param config - Factory configuration
2391
+ * @returns A SenseVoiceBackend instance (either Worker or main thread)
2392
+ */
2393
+ declare function createSenseVoice(config: CreateSenseVoiceConfig): SenseVoiceBackend;
2394
+
2395
+ /**
2396
+ * Kaldi-compatible filterbank (fbank) feature extraction
2397
+ *
2398
+ * Pure TypeScript implementation matching kaldi-native-fbank parameters
2399
+ * used by SenseVoice. No external dependencies.
2400
+ *
2401
+ * Pipeline: audio → framing → windowing → FFT → power spectrum → mel filterbank → log
2402
+ *
2403
+ * @module inference/kaldiFbank
2404
+ */
2405
+ interface KaldiFbankOptions {
2406
+ /** Frame length in ms (default: 25) */
2407
+ frameLengthMs?: number;
2408
+ /** Frame shift in ms (default: 10) */
2409
+ frameShiftMs?: number;
2410
+ /** Low frequency cutoff in Hz (default: 20) */
2411
+ lowFreq?: number;
2412
+ /** High frequency cutoff in Hz (default: sampleRate / 2) */
2413
+ highFreq?: number;
2414
+ /** Dither amount (default: 0 for deterministic output) */
2415
+ dither?: number;
2416
+ /** Preemphasis coefficient (default: 0.97) */
2417
+ preemphasis?: number;
2418
+ }
2419
+ /**
2420
+ * Compute Kaldi-compatible log mel filterbank features
2421
+ *
2422
+ * @param audio Raw audio samples (float32, [-1, 1] range)
2423
+ * @param sampleRate Sample rate in Hz (must be 16000 for SenseVoice)
2424
+ * @param numMelBins Number of mel bins (80 for SenseVoice)
2425
+ * @param opts Optional parameters
2426
+ * @returns Flattened Float32Array of shape [numFrames, numMelBins]
2427
+ */
2428
+ declare function computeKaldiFbank(audio: Float32Array, sampleRate: number, numMelBins: number, opts?: KaldiFbankOptions): Float32Array;
2429
+ /**
2430
+ * Apply Low Frame Rate stacking for SenseVoice
2431
+ *
2432
+ * Concatenates lfrM consecutive frames with stride lfrN.
2433
+ * Left-pads with copies of first frame, right-pads last group.
2434
+ *
2435
+ * @param features Flattened [numFrames, featureDim]
2436
+ * @param featureDim Feature dimension per frame (e.g., 80)
2437
+ * @param lfrM Number of frames to stack (default: 7)
2438
+ * @param lfrN Stride (default: 6)
2439
+ * @returns Flattened [numOutputFrames, featureDim * lfrM]
2440
+ */
2441
+ declare function applyLFR(features: Float32Array, featureDim: number, lfrM?: number, lfrN?: number): Float32Array;
2442
+ /**
2443
+ * Apply CMVN normalization in-place
2444
+ *
2445
+ * Formula: normalized[i] = (features[i] + negMean[i % dim]) * invStddev[i % dim]
2446
+ *
2447
+ * @param features Flattened feature array (modified in-place)
2448
+ * @param dim Feature dimension (560 for SenseVoice after LFR)
2449
+ * @param negMean Negative mean vector (dim-dimensional)
2450
+ * @param invStddev Inverse standard deviation vector (dim-dimensional)
2451
+ * @returns The same features array (for chaining)
2452
+ */
2453
+ declare function applyCMVN(features: Float32Array, dim: number, negMean: Float32Array, invStddev: Float32Array): Float32Array;
2454
+ /**
2455
+ * Parse CMVN vectors from comma-separated strings (stored in ONNX metadata)
2456
+ *
2457
+ * The sherpa-onnx SenseVoice export stores neg_mean and inv_stddev
2458
+ * as comma-separated float strings in the model's metadata.
2459
+ */
2460
+ declare function parseCMVNFromMetadata(negMeanStr: string, invStddevStr: string): {
2461
+ negMean: Float32Array;
2462
+ invStddev: Float32Array;
2463
+ };
2464
+
2465
+ /**
2466
+ * CTC greedy decoder for SenseVoice
2467
+ *
2468
+ * Decodes CTC logits into text with structured token parsing
2469
+ * for language, emotion, and audio event detection.
2470
+ *
2471
+ * @module inference/ctcDecoder
2472
+ */
2473
+ interface CTCDecodeResult {
2474
+ /** Decoded text (speech content only) */
2475
+ text: string;
2476
+ /** Detected language (e.g., 'zh', 'en', 'ja', 'ko', 'yue') */
2477
+ language?: string;
2478
+ /** Detected emotion (e.g., 'HAPPY', 'SAD', 'ANGRY', 'NEUTRAL') */
2479
+ emotion?: string;
2480
+ /** Detected audio event (e.g., 'Speech', 'BGM', 'Laughter') */
2481
+ event?: string;
2482
+ }
2483
+ /** Resolve language string to SenseVoice language ID */
2484
+ declare function resolveLanguageId(language: string): number;
2485
+ /** Resolve text norm string to SenseVoice text norm ID */
2486
+ declare function resolveTextNormId(textNorm: string): number;
2487
+ /**
2488
+ * Parse tokens.txt into a token ID → string map
2489
+ *
2490
+ * Format: each line is "token_string token_id"
2491
+ * e.g., "<unk> 0", "▁the 3", "s 4"
2492
+ */
2493
+ declare function parseTokensFile(content: string): Map<number, string>;
2494
+ /**
2495
+ * CTC greedy decode
2496
+ *
2497
+ * @param logits Raw logits from model output, flattened [seqLen, vocabSize]
2498
+ * @param seqLen Sequence length (time steps)
2499
+ * @param vocabSize Vocabulary size
2500
+ * @param tokenMap Token ID → string map from tokens.txt
2501
+ * @returns Decoded text and structured metadata
2502
+ */
2503
+ declare function ctcGreedyDecode(logits: Float32Array, seqLen: number, vocabSize: number, tokenMap: Map<number, string>): CTCDecodeResult;
2504
+
2505
+ /**
2506
+ * Shared blendshape constants and utilities for lip sync inference
2507
+ *
2508
+ * Contains LAM_BLENDSHAPES (canonical ordering), symmetrization, and
2509
+ * index remapping used by both Wav2Vec2Inference and Wav2ArkitCpuInference.
2510
+ *
2511
+ * This module is the single source of truth for blendshape ordering to
2512
+ * avoid circular dependencies between inference classes.
2513
+ *
2514
+ * @category Inference
2515
+ */
2516
+ /**
2517
+ * LAM model blendshape names in order (52 total)
2518
+ * NOTE: This is alphabetical ordering used by LAM, different from standard ARKit order
2519
+ */
2520
+ declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
2521
+ /** Alias for backwards compatibility */
2522
+ declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
2523
+ /**
2524
+ * Symmetrize blendshapes by averaging left/right pairs
2525
+ * From LAM official postprocessing (models/utils.py)
2526
+ * This fixes asymmetric output from the raw model
2527
+ */
2528
+ declare function symmetrizeBlendshapes(frame: Float32Array): Float32Array;
2529
+ /**
2530
+ * wav2arkit_cpu model blendshape ordering
2531
+ *
2532
+ * Indices 0-24 match LAM_BLENDSHAPES, but 25+ diverge:
2533
+ * - LAM puts jawRight, mouthClose, mouthDimpleLeft, mouthDimpleRight at 25-28
2534
+ * - wav2arkit_cpu puts mouthFrownLeft at 25 and moves those four to 48-51
2535
+ */
2536
+ declare const WAV2ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "jawRight"];
2537
+ /**
2538
+ * Remap a blendshape frame from wav2arkit_cpu ordering to LAM_BLENDSHAPES ordering
2539
+ *
2540
+ * @param frame - Float32Array of 52 blendshape values in wav2arkit_cpu order
2541
+ * @returns Float32Array of 52 blendshape values in LAM_BLENDSHAPES order
2542
+ */
2543
+ declare function remapWav2ArkitToLam(frame: Float32Array): Float32Array;
2544
+
2545
+ /**
2546
+ * Unified Wav2Vec2 inference engine for Audio-to-Expression + ASR
2547
+ *
2548
+ * Runs entirely in the browser using WebGPU or WASM.
2549
+ * Takes raw 16kHz audio and outputs:
2550
+ * - 52 ARKit blendshapes (lip sync)
2551
+ * - 32-token CTC logits (speech recognition)
2552
+ *
2553
+ * @category Inference
2554
+ *
2555
+ * @example Basic usage
2556
+ * ```typescript
2557
+ * import { Wav2Vec2Inference } from '@omote/core';
2558
+ *
2559
+ * const wav2vec = new Wav2Vec2Inference({ modelUrl: '/models/unified_wav2vec2_asr_a2e.onnx' });
2560
+ * await wav2vec.load();
2561
+ *
2562
+ * // Process 1 second of audio (16kHz = 16000 samples)
2563
+ * const result = await wav2vec.infer(audioSamples);
2564
+ *
2565
+ * console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
2566
+ * console.log('ASR text:', result.text); // Decoded transcription
1906
2567
  * ```
1907
2568
  */
1908
2569
 
1909
- type VADBackend = BackendPreference;
1910
- /**
1911
- * Configuration for Silero VAD
1912
- */
1913
- interface SileroVADConfig {
2570
+ type InferenceBackend = BackendPreference;
2571
+ interface Wav2Vec2InferenceConfig {
1914
2572
  /** Path or URL to the ONNX model */
1915
2573
  modelUrl: string;
1916
- /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
1917
- backend?: VADBackend;
1918
- /** Sample rate (8000 or 16000, default: 16000) */
1919
- sampleRate?: 8000 | 16000;
1920
- /** Speech probability threshold (default: 0.5) */
1921
- threshold?: number;
1922
2574
  /**
1923
- * Number of audio chunks to keep in pre-speech buffer.
1924
- * When VAD triggers, these chunks are prepended to the speech buffer
1925
- * to capture the beginning of speech that occurred before detection.
1926
- *
1927
- * At 512 samples/chunk and 16kHz:
1928
- * - 10 chunks = 320ms of pre-speech audio
1929
- * - 15 chunks = 480ms of pre-speech audio
2575
+ * Path or URL to external model data file (.onnx.data weights).
2576
+ * Default: `${modelUrl}.data` (e.g., /models/model.onnx.data)
1930
2577
  *
1931
- * Default: 10 chunks (320ms)
2578
+ * Set to `false` to skip external data loading (single-file models only).
1932
2579
  */
1933
- preSpeechBufferChunks?: number;
2580
+ externalDataUrl?: string | false;
2581
+ /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
2582
+ backend?: InferenceBackend;
2583
+ /** Number of identity classes (default: 12 for streaming model) */
2584
+ numIdentityClasses?: number;
1934
2585
  }
1935
- /**
1936
- * VAD model loading information
1937
- */
1938
- interface VADModelInfo {
2586
+ interface ModelInfo {
1939
2587
  backend: 'webgpu' | 'wasm';
1940
2588
  loadTimeMs: number;
1941
2589
  inputNames: string[];
1942
2590
  outputNames: string[];
1943
- sampleRate: number;
1944
- chunkSize: number;
1945
2591
  }
1946
- /**
1947
- * Result from a single VAD inference
1948
- */
1949
- interface VADResult$1 {
1950
- /** Speech probability (0-1) */
1951
- probability: number;
1952
- /** Whether speech is detected (probability > threshold) */
1953
- isSpeech: boolean;
1954
- /** Inference time in milliseconds */
2592
+
2593
+ /** CTC vocabulary (32 tokens from wav2vec2-base-960h) */
2594
+ declare const CTC_VOCAB: string[];
2595
+ interface Wav2Vec2Result {
2596
+ /** Blendshape weights [frames, 52] - 30fps */
2597
+ blendshapes: Float32Array[];
2598
+ /** Raw CTC logits [frames, 32] - 50fps */
2599
+ asrLogits: Float32Array[];
2600
+ /** Decoded text from CTC */
2601
+ text: string;
2602
+ /** Number of blendshape frames (30fps) — alias for numA2EFrames */
2603
+ numFrames: number;
2604
+ /** Number of A2E frames (30fps) */
2605
+ numA2EFrames: number;
2606
+ /** Number of ASR frames (50fps) */
2607
+ numASRFrames: number;
2608
+ /** Inference time in ms */
1955
2609
  inferenceTimeMs: number;
1956
- /**
1957
- * Pre-speech audio chunks (only present on first speech detection).
1958
- * These are the N chunks immediately before VAD triggered, useful for
1959
- * capturing the beginning of speech that occurred before detection.
1960
- *
1961
- * Only populated when transitioning from silence to speech.
1962
- */
1963
- preSpeechChunks?: Float32Array[];
1964
- }
1965
- /**
1966
- * Speech segment detected by VAD
1967
- */
1968
- interface SpeechSegment {
1969
- /** Start time in seconds */
1970
- start: number;
1971
- /** End time in seconds */
1972
- end: number;
1973
- /** Average probability during segment */
1974
- avgProbability: number;
1975
2610
  }
1976
- /**
1977
- * Silero VAD - Neural network voice activity detection
1978
- *
1979
- * Based on snakers4/silero-vad ONNX model.
1980
- * Processes 32ms chunks (512 samples at 16kHz) with LSTM state.
1981
- *
1982
- * @see https://github.com/snakers4/silero-vad
1983
- */
1984
- declare class SileroVADInference {
2611
+ declare class Wav2Vec2Inference implements LipSyncBackend {
2612
+ readonly modelId: "wav2vec2";
1985
2613
  private session;
1986
2614
  private ort;
1987
2615
  private config;
1988
2616
  private _backend;
1989
2617
  private isLoading;
1990
- private state;
1991
- private context;
1992
- private readonly chunkSize;
1993
- private readonly contextSize;
2618
+ private numIdentityClasses;
1994
2619
  private inferenceQueue;
1995
- private preSpeechBuffer;
1996
- private wasSpeaking;
1997
- private srTensor;
1998
- constructor(config: SileroVADConfig);
1999
- get backend(): RuntimeBackend | null;
2000
- get isLoaded(): boolean;
2001
- get sampleRate(): number;
2002
- get threshold(): number;
2003
- /**
2004
- * Get required chunk size in samples
2005
- */
2006
- getChunkSize(): number;
2007
- /**
2008
- * Get chunk duration in milliseconds
2009
- */
2010
- getChunkDurationMs(): number;
2620
+ private poisoned;
2621
+ private static readonly INFERENCE_TIMEOUT_MS;
2622
+ constructor(config: Wav2Vec2InferenceConfig);
2011
2623
  /**
2012
2624
  * Check if WebGPU is available and working
2013
2625
  * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
2014
2626
  */
2015
2627
  static isWebGPUAvailable: typeof isWebGPUAvailable;
2628
+ get backend(): 'webgpu' | 'wasm' | null;
2629
+ get isLoaded(): boolean;
2630
+ /** True if inference timed out and the session is permanently unusable */
2631
+ get isSessionPoisoned(): boolean;
2016
2632
  /**
2017
2633
  * Load the ONNX model
2018
2634
  */
2019
- load(): Promise<VADModelInfo>;
2020
- /**
2021
- * Reset state for new audio stream
2022
- */
2023
- reset(): void;
2024
- /**
2025
- * Process a single audio chunk
2026
- *
2027
- * @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
2028
- * @returns VAD result with speech probability
2029
- */
2030
- process(audioChunk: Float32Array): Promise<VADResult$1>;
2635
+ load(): Promise<ModelInfo>;
2031
2636
  /**
2032
- * Process audio and detect speech segments
2637
+ * Run inference on raw audio
2638
+ * @param audioSamples - Float32Array of raw audio at 16kHz (16000 samples = 1 second)
2639
+ * @param identityIndex - Optional identity index (0-11, default 0 = neutral)
2033
2640
  *
2034
- * @param audio - Complete audio buffer
2035
- * @param options - Detection options
2036
- * @returns Array of speech segments
2641
+ * Note: Model expects 1-second chunks (16000 samples) for optimal performance.
2642
+ * Audio will be zero-padded or truncated to 16000 samples.
2037
2643
  */
2038
- detectSpeech(audio: Float32Array, options?: {
2039
- /** Minimum speech duration in ms (default: 250) */
2040
- minSpeechDurationMs?: number;
2041
- /** Minimum silence duration to end segment in ms (default: 300) */
2042
- minSilenceDurationMs?: number;
2043
- /** Padding to add before/after speech in ms (default: 30) */
2044
- speechPadMs?: number;
2045
- }): Promise<SpeechSegment[]>;
2644
+ infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
2046
2645
  /**
2047
- * Calculate RMS energy of audio chunk
2646
+ * Decode CTC logits to text using greedy decoding
2048
2647
  */
2049
- private calculateRMS;
2648
+ private decodeCTC;
2050
2649
  /**
2051
2650
  * Queue inference to serialize ONNX session calls
2052
2651
  */
2053
2652
  private queueInference;
2653
+ /**
2654
+ * Get blendshape value by name for a specific frame
2655
+ */
2656
+ getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
2054
2657
  /**
2055
2658
  * Dispose of the model and free resources
2056
2659
  */
@@ -2058,296 +2661,189 @@ declare class SileroVADInference {
2058
2661
  }
2059
2662
 
2060
2663
  /**
2061
- * Configuration for Silero VAD Worker
2664
+ * CPU-optimized lip sync inference using wav2arkit_cpu model
2665
+ *
2666
+ * A Safari/iOS-compatible alternative to Wav2Vec2Inference (384MB) designed
2667
+ * for platforms where WebGPU crashes due to ONNX Runtime JSEP bugs.
2668
+ *
2669
+ * The model uses ONNX external data format:
2670
+ * - wav2arkit_cpu.onnx (1.86MB graph structure)
2671
+ * - wav2arkit_cpu.onnx.data (402MB weights)
2672
+ * Both files are fetched and cached automatically.
2673
+ *
2674
+ * Key differences from Wav2Vec2Inference:
2675
+ * - WASM-only backend (CPU-optimized, single-threaded, no WebGPU)
2676
+ * - No identity input (baked to identity 11)
2677
+ * - No ASR output (lip sync only)
2678
+ * - Dynamic input length (not fixed to 16000 samples)
2679
+ * - Different native blendshape ordering (remapped to LAM_BLENDSHAPES)
2680
+ *
2681
+ * @category Inference
2682
+ *
2683
+ * @example
2684
+ * ```typescript
2685
+ * import { Wav2ArkitCpuInference } from '@omote/core';
2686
+ *
2687
+ * const lam = new Wav2ArkitCpuInference({
2688
+ * modelUrl: '/models/wav2arkit_cpu.onnx',
2689
+ * });
2690
+ * await lam.load();
2691
+ *
2692
+ * const { blendshapes } = await lam.infer(audioSamples);
2693
+ * // blendshapes: Float32Array[] in LAM_BLENDSHAPES order, 30fps
2694
+ * ```
2062
2695
  */
2063
- interface VADWorkerConfig {
2064
- /** Path or URL to the ONNX model */
2696
+
2697
+ interface Wav2ArkitCpuConfig {
2698
+ /** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
2065
2699
  modelUrl: string;
2066
- /** Sample rate (8000 or 16000, default: 16000) */
2067
- sampleRate?: 8000 | 16000;
2068
- /** Speech probability threshold (default: 0.5) */
2069
- threshold?: number;
2070
- /**
2071
- * Number of audio chunks to keep in pre-speech buffer.
2072
- * When VAD triggers, these chunks are prepended to the speech buffer
2073
- * to capture the beginning of speech that occurred before detection.
2074
- *
2075
- * At 512 samples/chunk and 16kHz:
2076
- * - 10 chunks = 320ms of pre-speech audio
2077
- * - 15 chunks = 480ms of pre-speech audio
2078
- *
2079
- * Default: 10 chunks (320ms)
2080
- */
2081
- preSpeechBufferChunks?: number;
2082
- }
2083
- /**
2084
- * VAD model loading information from worker
2085
- */
2086
- interface VADWorkerModelInfo {
2087
- backend: 'wasm';
2088
- loadTimeMs: number;
2089
- inputNames: string[];
2090
- outputNames: string[];
2091
- sampleRate: number;
2092
- chunkSize: number;
2093
- }
2094
- /**
2095
- * Result from a single VAD inference
2096
- */
2097
- interface VADResult {
2098
- /** Speech probability (0-1) */
2099
- probability: number;
2100
- /** Whether speech is detected (probability > threshold) */
2101
- isSpeech: boolean;
2102
- /** Inference time in milliseconds */
2103
- inferenceTimeMs: number;
2104
2700
  /**
2105
- * Pre-speech audio chunks (only present on first speech detection).
2106
- * These are the N chunks immediately before VAD triggered, useful for
2107
- * capturing the beginning of speech that occurred before detection.
2701
+ * Path or URL to external model data file (.onnx.data weights).
2702
+ * Default: `${modelUrl}.data` (e.g., /models/wav2arkit_cpu.onnx.data)
2108
2703
  *
2109
- * Only populated when transitioning from silence to speech.
2110
- */
2111
- preSpeechChunks?: Float32Array[];
2112
- }
2113
- /**
2114
- * Silero VAD Worker - Voice Activity Detection in a Web Worker
2115
- *
2116
- * Runs Silero VAD inference off the main thread to prevent UI blocking.
2117
- * Feature parity with SileroVADInference but runs in dedicated worker.
2118
- *
2119
- * @see SileroVADInference for main-thread version
2120
- */
2121
- declare class SileroVADWorker {
2122
- private worker;
2704
+ * Set to `false` to skip external data loading (single-file models only).
2705
+ */
2706
+ externalDataUrl?: string | false;
2707
+ /** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
2708
+ backend?: BackendPreference;
2709
+ }
2710
+ declare class Wav2ArkitCpuInference implements LipSyncBackend {
2711
+ readonly modelId: "wav2arkit_cpu";
2712
+ private session;
2713
+ private ort;
2123
2714
  private config;
2715
+ private _backend;
2124
2716
  private isLoading;
2125
- private _isLoaded;
2126
- private state;
2127
- private context;
2128
- private readonly chunkSize;
2129
- private readonly contextSize;
2130
2717
  private inferenceQueue;
2131
- private preSpeechBuffer;
2132
- private wasSpeaking;
2133
- private pendingResolvers;
2134
- private messageId;
2135
- constructor(config: VADWorkerConfig);
2718
+ private poisoned;
2719
+ private static readonly INFERENCE_TIMEOUT_MS;
2720
+ constructor(config: Wav2ArkitCpuConfig);
2721
+ get backend(): RuntimeBackend | null;
2136
2722
  get isLoaded(): boolean;
2137
2723
  /**
2138
- * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
2139
- */
2140
- get backend(): 'wasm' | null;
2141
- get sampleRate(): number;
2142
- get threshold(): number;
2143
- /**
2144
- * Get required chunk size in samples
2145
- */
2146
- getChunkSize(): number;
2147
- /**
2148
- * Get chunk duration in milliseconds
2149
- */
2150
- getChunkDurationMs(): number;
2151
- /**
2152
- * Create the worker from inline script
2153
- */
2154
- private createWorker;
2155
- /**
2156
- * Handle messages from worker
2157
- */
2158
- private handleWorkerMessage;
2159
- /**
2160
- * Send message to worker and wait for response
2161
- */
2162
- private sendMessage;
2163
- /**
2164
- * Load the ONNX model in the worker
2165
- */
2166
- load(): Promise<VADWorkerModelInfo>;
2167
- /**
2168
- * Reset state for new audio stream
2724
+ * Load the ONNX model
2169
2725
  */
2170
- reset(): Promise<void>;
2726
+ load(): Promise<LipSyncModelInfo>;
2171
2727
  /**
2172
- * Process a single audio chunk
2728
+ * Run inference on raw audio
2173
2729
  *
2174
- * @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
2175
- * @returns VAD result with speech probability
2730
+ * Accepts variable-length audio (not fixed to 16000 samples).
2731
+ * Output frames = ceil(30 * numSamples / 16000).
2732
+ *
2733
+ * @param audioSamples - Float32Array of raw audio at 16kHz
2734
+ * @param _identityIndex - Ignored (identity 11 is baked into the model)
2176
2735
  */
2177
- process(audioChunk: Float32Array): Promise<VADResult>;
2736
+ infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
2178
2737
  /**
2179
- * Queue inference to serialize worker calls
2738
+ * Queue inference to serialize ONNX session calls
2180
2739
  */
2181
2740
  private queueInference;
2182
2741
  /**
2183
- * Dispose of the worker and free resources
2742
+ * Dispose of the model and free resources
2184
2743
  */
2185
2744
  dispose(): Promise<void>;
2186
- /**
2187
- * Check if Web Workers are supported
2188
- */
2189
- static isSupported(): boolean;
2190
2745
  }
2191
2746
 
2192
2747
  /**
2193
- * Factory function for Silero VAD with automatic Worker vs main thread selection
2748
+ * Factory function for lip sync with automatic GPU/CPU model selection
2194
2749
  *
2195
- * Provides a unified API that automatically selects the optimal implementation:
2196
- * - Desktop browsers: Uses SileroVADWorker (off-main-thread inference)
2197
- * - Mobile devices: Uses SileroVADInference (main thread, avoids memory overhead)
2198
- * - Fallback: Gracefully falls back to main thread if Worker fails
2750
+ * Provides a unified API that automatically selects the optimal model:
2751
+ * - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (404MB, WASM)
2752
+ * - Chrome/Firefox/Edge: Uses Wav2Vec2Inference (384MB, WebGPU)
2753
+ * - Fallback: Gracefully falls back to CPU model if GPU model fails to load
2754
+ *
2755
+ * Why two separate models?
2756
+ * Wav2Vec2 (LAM) cannot run on Safari/iOS for two reasons:
2757
+ * 1. Its dual-head transformer graph needs ~750-950MB peak during ORT session
2758
+ * creation (graph optimization), exceeding iOS WebKit's ~1-1.5GB tab limit.
2759
+ * 2. It ships as a single 384MB .onnx file that must load into JS heap before
2760
+ * ORT can consume it. iOS WebKit OOMs on this allocation.
2761
+ * wav2arkit_cpu solves both: external data format (1.86MB graph + 402MB weights)
2762
+ * lets ORT load only the tiny graph, then stream weights via URL pass-through
2763
+ * directly into WASM memory. JS heap stays at ~2MB.
2199
2764
  *
2200
2765
  * @category Inference
2201
2766
  *
2202
- * @example Basic usage (auto-detect)
2767
+ * @example Auto-detect (recommended)
2203
2768
  * ```typescript
2204
- * import { createSileroVAD } from '@omote/core';
2769
+ * import { createLipSync } from '@omote/core';
2205
2770
  *
2206
- * const vad = createSileroVAD({
2207
- * modelUrl: '/models/silero-vad.onnx',
2208
- * threshold: 0.5,
2771
+ * const lam = createLipSync({
2772
+ * gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
2773
+ * cpuModelUrl: '/models/wav2arkit_cpu.onnx',
2209
2774
  * });
2210
2775
  *
2211
- * await vad.load();
2212
- * const result = await vad.process(audioChunk);
2213
- * if (result.isSpeech) {
2214
- * console.log('Speech detected!', result.probability);
2215
- * }
2216
- * ```
2217
- *
2218
- * @example Force worker usage
2219
- * ```typescript
2220
- * const vad = createSileroVAD({
2221
- * modelUrl: '/models/silero-vad.onnx',
2222
- * useWorker: true, // Force Worker even on mobile
2223
- * });
2776
+ * await lam.load();
2777
+ * const { blendshapes } = await lam.infer(audioSamples);
2224
2778
  * ```
2225
2779
  *
2226
- * @example Force main thread
2780
+ * @example Force CPU model
2227
2781
  * ```typescript
2228
- * const vad = createSileroVAD({
2229
- * modelUrl: '/models/silero-vad.onnx',
2230
- * useWorker: false, // Force main thread
2782
+ * const lam = createLipSync({
2783
+ * gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
2784
+ * cpuModelUrl: '/models/wav2arkit_cpu.onnx',
2785
+ * mode: 'cpu',
2231
2786
  * });
2232
2787
  * ```
2233
2788
  */
2234
2789
 
2235
2790
  /**
2236
- * Common interface for both SileroVADInference and SileroVADWorker
2237
- *
2238
- * This interface defines the shared API that both implementations provide,
2239
- * allowing consumers to use either interchangeably.
2791
+ * Configuration for the lip sync factory
2240
2792
  */
2241
- interface SileroVADBackend {
2242
- /** Current backend type (webgpu, wasm, or null if not loaded) */
2243
- readonly backend: RuntimeBackend | null;
2244
- /** Whether the model is loaded and ready for inference */
2245
- readonly isLoaded: boolean;
2246
- /** Audio sample rate (8000 or 16000 Hz) */
2247
- readonly sampleRate: number;
2248
- /** Speech detection threshold (0-1) */
2249
- readonly threshold: number;
2250
- /**
2251
- * Load the ONNX model
2252
- * @returns Model loading information
2253
- */
2254
- load(): Promise<VADModelInfo | VADWorkerModelInfo>;
2255
- /**
2256
- * Process a single audio chunk
2257
- * @param audioChunk - Float32Array of exactly chunkSize samples
2258
- * @returns VAD result with speech probability
2259
- */
2260
- process(audioChunk: Float32Array): Promise<VADResult$1>;
2261
- /**
2262
- * Reset state for new audio stream
2263
- */
2264
- reset(): void | Promise<void>;
2793
+ interface CreateLipSyncConfig {
2794
+ /** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge) */
2795
+ gpuModelUrl: string;
2265
2796
  /**
2266
- * Dispose of the model and free resources
2797
+ * URL for GPU model external data file (.onnx.data weights).
2798
+ * Default: `${gpuModelUrl}.data`
2799
+ *
2800
+ * Set to `false` to skip external data loading (single-file models only).
2267
2801
  */
2268
- dispose(): Promise<void>;
2802
+ gpuExternalDataUrl?: string | false;
2803
+ /** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS) */
2804
+ cpuModelUrl: string;
2269
2805
  /**
2270
- * Get required chunk size in samples
2806
+ * Model selection mode:
2807
+ * - 'auto': Safari/iOS → CPU, everything else → GPU (default)
2808
+ * - 'gpu': Force GPU model (Wav2Vec2Inference)
2809
+ * - 'cpu': Force CPU model (Wav2ArkitCpuInference)
2271
2810
  */
2272
- getChunkSize(): number;
2811
+ mode?: 'auto' | 'gpu' | 'cpu';
2812
+ /** Backend preference for GPU model (default: 'auto') */
2813
+ gpuBackend?: BackendPreference;
2814
+ /** Number of identity classes for GPU model (default: 12) */
2815
+ numIdentityClasses?: number;
2273
2816
  /**
2274
- * Get chunk duration in milliseconds
2817
+ * Fall back to CPU model if GPU model fails to load (default: true)
2818
+ * Only applies when mode is 'auto' or 'gpu'
2275
2819
  */
2276
- getChunkDurationMs(): number;
2277
- }
2278
- /**
2279
- * Configuration for the Silero VAD factory
2280
- *
2281
- * Extends SileroVADConfig with worker-specific options.
2282
- */
2283
- interface SileroVADFactoryConfig extends SileroVADConfig {
2820
+ fallbackOnError?: boolean;
2284
2821
  /**
2285
- * Force worker usage (true), main thread (false), or auto-detect (undefined).
2286
- *
2287
- * Auto-detection behavior:
2288
- * - Desktop: Uses Worker (better responsiveness, off-main-thread)
2289
- * - Mobile: Uses main thread (avoids 5MB memory overhead)
2822
+ * Use Web Worker for CPU model inference (default: false)
2290
2823
  *
2291
- * You can override this to:
2292
- * - `true`: Force Worker even on mobile (if you have memory headroom)
2293
- * - `false`: Force main thread even on desktop (for debugging)
2824
+ * When true, Wav2ArkitCpuWorker is used instead of Wav2ArkitCpuInference,
2825
+ * running inference off the main thread to prevent UI blocking during
2826
+ * model loading and inference.
2294
2827
  *
2295
- * Default: undefined (auto-detect)
2828
+ * Only applies when the CPU model is selected (mode: 'cpu', auto on Safari/iOS,
2829
+ * or fallback from GPU).
2296
2830
  */
2297
2831
  useWorker?: boolean;
2298
2832
  /**
2299
- * Fallback to main thread on worker errors.
2300
- *
2301
- * When true (default), if the Worker fails to load or encounters an error,
2302
- * the factory will automatically create a main thread instance instead.
2303
- *
2304
- * When false, worker errors will propagate as exceptions.
2305
- *
2306
- * Default: true
2833
+ * Unified inference worker instance.
2834
+ * When provided and CPU model is selected, uses Wav2ArkitCpuUnifiedAdapter.
2835
+ * Takes precedence over useWorker setting for the CPU model path.
2836
+ * GPU model (Wav2Vec2) always stays on main thread (WebGPU).
2307
2837
  */
2308
- fallbackOnError?: boolean;
2838
+ unifiedWorker?: UnifiedInferenceWorker;
2309
2839
  }
2310
2840
  /**
2311
- * Check if the current environment supports VAD Web Workers
2312
- *
2313
- * Requirements:
2314
- * - Worker constructor must exist
2315
- * - Blob URL support (for inline worker script)
2316
- *
2317
- * @returns true if VAD Worker is supported
2318
- */
2319
- declare function supportsVADWorker(): boolean;
2320
- /**
2321
- * Create a Silero VAD instance with automatic implementation selection
2322
- *
2323
- * This factory function automatically selects between:
2324
- * - **SileroVADWorker**: Off-main-thread inference (better for desktop)
2325
- * - **SileroVADInference**: Main thread inference (better for mobile)
2326
- *
2327
- * The selection is based on:
2328
- * 1. Explicit `useWorker` config (if provided)
2329
- * 2. Platform detection (mobile vs desktop)
2330
- * 3. Worker API availability
2331
- *
2332
- * Both implementations share the same interface (SileroVADBackend),
2333
- * so consumers can use either interchangeably.
2841
+ * Create a lip sync instance with automatic GPU/CPU model selection
2334
2842
  *
2335
2843
  * @param config - Factory configuration
2336
- * @returns A SileroVAD instance (either Worker or main thread)
2337
- *
2338
- * @example
2339
- * ```typescript
2340
- * // Auto-detect (recommended)
2341
- * const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
2342
- *
2343
- * // Force Worker
2344
- * const vadWorker = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: true });
2345
- *
2346
- * // Force main thread
2347
- * const vadMain = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: false });
2348
- * ```
2844
+ * @returns A LipSyncBackend instance (either GPU or CPU model)
2349
2845
  */
2350
- declare function createSileroVAD(config: SileroVADFactoryConfig): SileroVADBackend;
2846
+ declare function createLipSync(config: CreateLipSyncConfig): LipSyncBackend;
2351
2847
 
2352
2848
  /**
2353
2849
  * Safari Web Speech API wrapper for iOS speech recognition
@@ -2791,10 +3287,7 @@ interface ConversationMessage {
2791
3287
  /** Audio duration if applicable (ms) */
2792
3288
  audioDurationMs?: number;
2793
3289
  }
2794
- /**
2795
- * Session state
2796
- */
2797
- type AISessionState = 'idle' | 'listening' | 'thinking' | 'speaking' | 'interrupted' | 'error' | 'disconnected';
3290
+
2798
3291
  /**
2799
3292
  * Events emitted by AI adapters
2800
3293
  */
@@ -3106,7 +3599,6 @@ declare class AgentCoreAdapter extends EventEmitter<AIAdapterEvents> implements
3106
3599
  * Falls back to simple RMS if VAD not available
3107
3600
  */
3108
3601
  private detectVoiceActivity;
3109
- private int16ToFloat32;
3110
3602
  private base64ToArrayBuffer;
3111
3603
  private addToHistory;
3112
3604
  private handleDisconnect;
@@ -4704,4 +5196,4 @@ declare class ProceduralLifeLayer {
4704
5196
  private updateBrowNoise;
4705
5197
  }
4706
5198
 
4707
- export { type AIAdapter, type AIAdapterEvents, type AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, type CTCDecodeResult, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, type CreateLipSyncConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_ARKIT_MAP, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type Emotion2VecLabel, type EmotionAnimationMap, type EmotionBlendMode, type EmotionBlendshapeConfig, EmotionController, type EmotionFrame, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionToBlendshapeMapper, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type KaldiFbankOptions, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LipSyncBackend, type LipSyncModelInfo, type LipSyncResult, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, ProceduralLifeLayer, type QuotaInfo, RingBuffer, type RuntimeBackend, type 
SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, type SessionConfig, type SessionOptions, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type Transition, UPPER_FACE_BLENDSHAPES, type UpperFaceBlendshapeName, type UpperFaceBlendshapes, type VADBackend, type VADModelInfo, type VADResult$1 as VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, WAV2ARKIT_BLENDSHAPES, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, applyCMVN, applyLFR, blendEmotions, calculatePeak, calculateRMS, computeKaldiFbank, configureCacheLimit, configureTelemetry, createEmotionVector, createLipSync, createSessionWithFallback, createSileroVAD, ctcGreedyDecode, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpEmotion, parseCMVNFromMetadata, parseTokensFile, preloadModels, preloadOnnxRuntime, remapWav2ArkitToLam, resolveBackend, resolveLanguageId, resolveTextNormId, shouldEnableWasmProxy, shouldUseCpuLipSync, shouldUseNativeASR, shouldUseServerLipSync, 
supportsVADWorker, symmetrizeBlendshapes };
5199
+ export { type AIAdapter, type AIAdapterEvents, AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, type CTCDecodeResult, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, type CreateLipSyncConfig, type CreateSenseVoiceConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_ARKIT_MAP, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type Emotion2VecLabel, type EmotionAnimationMap, type EmotionBlendMode, type EmotionBlendshapeConfig, EmotionController, type EmotionFrame, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionToBlendshapeMapper, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type KaldiFbankOptions, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LipSyncBackend, type LipSyncModelInfo, type LipSyncResult, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, ProceduralLifeLayer, type QuotaInfo, RingBuffer, type 
RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SessionConfig, type SessionOptions, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type Transition, UPPER_FACE_BLENDSHAPES, UnifiedInferenceWorker, type UpperFaceBlendshapeName, type UpperFaceBlendshapes, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, WAV2ARKIT_BLENDSHAPES, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2ArkitCpuUnifiedAdapter, Wav2ArkitCpuWorker, type Wav2ArkitCpuWorkerConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, applyCMVN, applyLFR, blendEmotions, calculatePeak, calculateRMS, computeKaldiFbank, configureCacheLimit, configureTelemetry, createEmotionVector, createLipSync, createSenseVoice, createSessionWithFallback, createSileroVAD, ctcGreedyDecode, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, 
lerpEmotion, parseCMVNFromMetadata, parseTokensFile, preloadModels, preloadOnnxRuntime, remapWav2ArkitToLam, resolveBackend, resolveLanguageId, resolveTextNormId, shouldEnableWasmProxy, shouldUseCpuLipSync, shouldUseNativeASR, shouldUseServerLipSync, supportsVADWorker, symmetrizeBlendshapes };