@omote/core 0.4.5 → 0.4.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -1435,298 +1435,634 @@ declare class SenseVoiceInference {
1435
1435
  }
1436
1436
 
1437
1437
  /**
1438
- * Kaldi-compatible filterbank (fbank) feature extraction
1439
- *
1440
- * Pure TypeScript implementation matching kaldi-native-fbank parameters
1441
- * used by SenseVoice. No external dependencies.
1442
- *
1443
- * Pipeline: audio → framing → windowing → FFT → power spectrum → mel filterbank → log
1444
- *
1445
- * @module inference/kaldiFbank
1446
- */
1447
- interface KaldiFbankOptions {
1448
- /** Frame length in ms (default: 25) */
1449
- frameLengthMs?: number;
1450
- /** Frame shift in ms (default: 10) */
1451
- frameShiftMs?: number;
1452
- /** Low frequency cutoff in Hz (default: 20) */
1453
- lowFreq?: number;
1454
- /** High frequency cutoff in Hz (default: sampleRate / 2) */
1455
- highFreq?: number;
1456
- /** Dither amount (default: 0 for deterministic output) */
1457
- dither?: number;
1458
- /** Preemphasis coefficient (default: 0.97) */
1459
- preemphasis?: number;
1460
- }
1461
- /**
1462
- * Compute Kaldi-compatible log mel filterbank features
1438
+ * SenseVoice ASR Web Worker implementation
1463
1439
  *
1464
- * @param audio Raw audio samples (float32, [-1, 1] range)
1465
- * @param sampleRate Sample rate in Hz (must be 16000 for SenseVoice)
1466
- * @param numMelBins Number of mel bins (80 for SenseVoice)
1467
- * @param opts Optional parameters
1468
- * @returns Flattened Float32Array of shape [numFrames, numMelBins]
1469
- */
1470
- declare function computeKaldiFbank(audio: Float32Array, sampleRate: number, numMelBins: number, opts?: KaldiFbankOptions): Float32Array;
1471
- /**
1472
- * Apply Low Frame Rate stacking for SenseVoice
1440
+ * Runs SenseVoice speech recognition in a dedicated Web Worker to prevent
1441
+ * main thread blocking. Uses inline worker script (Blob URL pattern) to
1442
+ * avoid separate file deployment.
1473
1443
  *
1474
- * Concatenates lfrM consecutive frames with stride lfrN.
1475
- * Left-pads with copies of first frame, right-pads last group.
1444
+ * Key design decisions:
1445
+ * - WASM backend only (WebGPU doesn't work in Workers)
1446
+ * - All preprocessing (fbank, LFR, CMVN) and CTC decoding inlined in worker
1447
+ * - Audio copied (not transferred) to retain main thread access
1448
+ * - ONNX Runtime loaded from CDN in worker (no bundler complications)
1449
+ * - iOS: model URL passed as string to ORT (avoids 239MB JS heap allocation)
1476
1450
  *
1477
- * @param features Flattened [numFrames, featureDim]
1478
- * @param featureDim Feature dimension per frame (e.g., 80)
1479
- * @param lfrM Number of frames to stack (default: 7)
1480
- * @param lfrN Stride (default: 6)
1481
- * @returns Flattened [numOutputFrames, featureDim * lfrM]
1482
- */
1483
- declare function applyLFR(features: Float32Array, featureDim: number, lfrM?: number, lfrN?: number): Float32Array;
1484
- /**
1485
- * Apply CMVN normalization in-place
1451
+ * @category Inference
1486
1452
  *
1487
- * Formula: normalized[i] = (features[i] + negMean[i % dim]) * invStddev[i % dim]
1453
+ * @example Basic usage
1454
+ * ```typescript
1455
+ * import { SenseVoiceWorker } from '@omote/core';
1488
1456
  *
1489
- * @param features Flattened feature array (modified in-place)
1490
- * @param dim Feature dimension (560 for SenseVoice after LFR)
1491
- * @param negMean Negative mean vector (dim-dimensional)
1492
- * @param invStddev Inverse standard deviation vector (dim-dimensional)
1493
- * @returns The same features array (for chaining)
1494
- */
1495
- declare function applyCMVN(features: Float32Array, dim: number, negMean: Float32Array, invStddev: Float32Array): Float32Array;
1496
- /**
1497
- * Parse CMVN vectors from comma-separated strings (stored in ONNX metadata)
1457
+ * const asr = new SenseVoiceWorker({
1458
+ * modelUrl: '/models/sensevoice/model.int8.onnx',
1459
+ * tokensUrl: '/models/sensevoice/tokens.txt',
1460
+ * });
1461
+ * await asr.load();
1498
1462
  *
1499
- * The sherpa-onnx SenseVoice export stores neg_mean and inv_stddev
1500
- * as comma-separated float strings in the model's metadata.
1463
+ * const { text, emotion, language } = await asr.transcribe(audioSamples);
1464
+ * console.log(text); // "Hello world"
1465
+ * console.log(emotion); // "NEUTRAL"
1466
+ * console.log(language); // "en"
1467
+ * ```
1501
1468
  */
1502
- declare function parseCMVNFromMetadata(negMeanStr: string, invStddevStr: string): {
1503
- negMean: Float32Array;
1504
- invStddev: Float32Array;
1505
- };
1506
1469
 
1507
1470
  /**
1508
- * CTC greedy decoder for SenseVoice
1509
- *
1510
- * Decodes CTC logits into text with structured token parsing
1511
- * for language, emotion, and audio event detection.
1512
- *
1513
- * @module inference/ctcDecoder
1471
+ * Configuration for SenseVoice Worker
1514
1472
  */
1515
- interface CTCDecodeResult {
1516
- /** Decoded text (speech content only) */
1517
- text: string;
1518
- /** Detected language (e.g., 'zh', 'en', 'ja', 'ko', 'yue') */
1519
- language?: string;
1520
- /** Detected emotion (e.g., 'HAPPY', 'SAD', 'ANGRY', 'NEUTRAL') */
1521
- emotion?: string;
1522
- /** Detected audio event (e.g., 'Speech', 'BGM', 'Laughter') */
1523
- event?: string;
1473
+ interface SenseVoiceWorkerConfig {
1474
+ /** Path or URL to model.int8.onnx (239MB) */
1475
+ modelUrl: string;
1476
+ /** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
1477
+ tokensUrl?: string;
1478
+ /** Language hint (default: 'auto' for auto-detection) */
1479
+ language?: SenseVoiceLanguage;
1480
+ /** Text normalization: 'with_itn' applies inverse text normalization (default: 'with_itn') */
1481
+ textNorm?: 'with_itn' | 'without_itn';
1524
1482
  }
1525
- /** Resolve language string to SenseVoice language ID */
1526
- declare function resolveLanguageId(language: string): number;
1527
- /** Resolve text norm string to SenseVoice text norm ID */
1528
- declare function resolveTextNormId(textNorm: string): number;
1529
1483
  /**
1530
- * Parse tokens.txt into a token ID string map
1484
+ * SenseVoice ASR Worker - Speech Recognition in a Web Worker
1531
1485
  *
1532
- * Format: each line is "token_string token_id"
1533
- * e.g., "<unk> 0", "▁the 3", "s 4"
1534
- */
1535
- declare function parseTokensFile(content: string): Map<number, string>;
1536
- /**
1537
- * CTC greedy decode
1486
+ * Runs SenseVoice inference off the main thread to prevent UI blocking.
1487
+ * All preprocessing (fbank, LFR, CMVN) and CTC decoding run in the worker.
1538
1488
  *
1539
- * @param logits Raw logits from model output, flattened [seqLen, vocabSize]
1540
- * @param seqLen Sequence length (time steps)
1541
- * @param vocabSize Vocabulary size
1542
- * @param tokenMap Token ID → string map from tokens.txt
1543
- * @returns Decoded text and structured metadata
1489
+ * @see SenseVoiceInference for main-thread version
1544
1490
  */
1545
- declare function ctcGreedyDecode(logits: Float32Array, seqLen: number, vocabSize: number, tokenMap: Map<number, string>): CTCDecodeResult;
1491
+ declare class SenseVoiceWorker {
1492
+ private worker;
1493
+ private config;
1494
+ private isLoading;
1495
+ private _isLoaded;
1496
+ private inferenceQueue;
1497
+ private poisoned;
1498
+ private pendingResolvers;
1499
+ private languageId;
1500
+ private textNormId;
1501
+ constructor(config: SenseVoiceWorkerConfig);
1502
+ get isLoaded(): boolean;
1503
+ /**
1504
+ * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
1505
+ */
1506
+ get backend(): 'wasm' | null;
1507
+ /**
1508
+ * Create the worker from inline script
1509
+ */
1510
+ private createWorker;
1511
+ /**
1512
+ * Handle messages from worker
1513
+ */
1514
+ private handleWorkerMessage;
1515
+ /**
1516
+ * Send message to worker and wait for response
1517
+ */
1518
+ private sendMessage;
1519
+ /**
1520
+ * Load the ONNX model in the worker
1521
+ *
1522
+ * @param onProgress - Optional progress callback. Fires once at 100% when load completes
1523
+ * (the worker downloads and loads the model internally, so granular progress is not available).
1524
+ */
1525
+ load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
1526
+ /**
1527
+ * Transcribe audio samples to text
1528
+ *
1529
+ * @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
1530
+ * @returns Transcription result with text, emotion, language, and event
1531
+ */
1532
+ transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
1533
+ /**
1534
+ * Queue inference to serialize worker calls
1535
+ */
1536
+ private queueInference;
1537
+ /**
1538
+ * Dispose of the worker and free resources
1539
+ */
1540
+ dispose(): Promise<void>;
1541
+ /**
1542
+ * Check if Web Workers are supported
1543
+ */
1544
+ static isSupported(): boolean;
1545
+ }
1546
1546
 
1547
1547
  /**
1548
- * Shared blendshape constants and utilities for lip sync inference
1549
- *
1550
- * Contains LAM_BLENDSHAPES (canonical ordering), symmetrization, and
1551
- * index remapping used by both Wav2Vec2Inference and Wav2ArkitCpuInference.
1552
- *
1553
- * This module is the single source of truth for blendshape ordering to
1554
- * avoid circular dependencies between inference classes.
1555
- *
1556
- * @category Inference
1557
- */
1558
- /**
1559
- * LAM model blendshape names in order (52 total)
1560
- * NOTE: This is alphabetical ordering used by LAM, different from standard ARKit order
1561
- */
1562
- declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
1563
- /** Alias for backwards compatibility */
1564
- declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
1565
- /**
1566
- * Symmetrize blendshapes by averaging left/right pairs
1567
- * From LAM official postprocessing (models/utils.py)
1568
- * This fixes asymmetric output from the raw model
1569
- */
1570
- declare function symmetrizeBlendshapes(frame: Float32Array): Float32Array;
1571
- /**
1572
- * wav2arkit_cpu model blendshape ordering
1573
- *
1574
- * Indices 0-24 match LAM_BLENDSHAPES, but 25+ diverge:
1575
- * - LAM puts jawRight, mouthClose, mouthDimpleLeft, mouthDimpleRight at 25-28
1576
- * - wav2arkit_cpu puts mouthFrownLeft at 25 and moves those four to 48-51
1577
- */
1578
- declare const WAV2ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "jawRight"];
1579
- /**
1580
- * Remap a blendshape frame from wav2arkit_cpu ordering to LAM_BLENDSHAPES ordering
1548
+ * Silero VAD (Voice Activity Detection) inference
1581
1549
  *
1582
- * @param frame - Float32Array of 52 blendshape values in wav2arkit_cpu order
1583
- * @returns Float32Array of 52 blendshape values in LAM_BLENDSHAPES order
1584
- */
1585
- declare function remapWav2ArkitToLam(frame: Float32Array): Float32Array;
1586
-
1587
- /**
1588
- * Unified Wav2Vec2 inference engine for Audio-to-Expression + ASR
1550
+ * Neural network-based VAD running in browser via ONNX Runtime Web.
1551
+ * Much more accurate than RMS-based energy detection.
1589
1552
  *
1590
- * Runs entirely in the browser using WebGPU or WASM.
1591
- * Takes raw 16kHz audio and outputs:
1592
- * - 52 ARKit blendshapes (lip sync)
1593
- * - 32-token CTC logits (speech recognition)
1553
+ * Uses lazy loading to conditionally load WebGPU or WASM-only bundle:
1554
+ * - iOS: Loads WASM-only bundle (WebGPU crashes due to Safari bugs)
1555
+ * - Android/Desktop: Loads WebGPU bundle (with WASM fallback)
1594
1556
  *
1595
1557
  * @category Inference
1596
1558
  *
1597
1559
  * @example Basic usage
1598
1560
  * ```typescript
1599
- * import { Wav2Vec2Inference } from '@omote/core';
1561
+ * import { SileroVADInference } from '@omote/core';
1600
1562
  *
1601
- * const wav2vec = new Wav2Vec2Inference({ modelUrl: '/models/unified_wav2vec2_asr_a2e.onnx' });
1602
- * await wav2vec.load();
1563
+ * const vad = new SileroVADInference({
1564
+ * modelUrl: '/models/silero-vad.onnx'
1565
+ * });
1566
+ * await vad.load();
1603
1567
  *
1604
- * // Process 1 second of audio (16kHz = 16000 samples)
1605
- * const result = await wav2vec.infer(audioSamples);
1568
+ * // Process 32ms chunks (512 samples at 16kHz)
1569
+ * const probability = await vad.process(audioChunk);
1570
+ * if (probability > 0.5) {
1571
+ * console.log('Speech detected!');
1572
+ * }
1573
+ * ```
1606
1574
  *
1607
- * console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
1608
- * console.log('ASR text:', result.text); // Decoded transcription
1575
+ * @example Streaming with state management
1576
+ * ```typescript
1577
+ * // State is automatically maintained between process() calls
1578
+ * // Call reset() when starting a new audio stream
1579
+ * vad.reset();
1580
+ *
1581
+ * for (const chunk of audioChunks) {
1582
+ * const prob = await vad.process(chunk);
1583
+ * // prob is speech probability [0, 1]
1584
+ * }
1609
1585
  * ```
1610
1586
  */
1611
1587
 
1612
- type InferenceBackend = BackendPreference;
1613
- interface Wav2Vec2InferenceConfig {
1588
+ type VADBackend = BackendPreference;
1589
+ /**
1590
+ * Configuration for Silero VAD
1591
+ */
1592
+ interface SileroVADConfig {
1614
1593
  /** Path or URL to the ONNX model */
1615
1594
  modelUrl: string;
1616
- /**
1617
- * Path or URL to external model data file (.onnx.data weights).
1618
- * Default: `${modelUrl}.data` (e.g., /models/model.onnx.data)
1619
- *
1620
- * Set to `false` to skip external data loading (single-file models only).
1621
- */
1622
- externalDataUrl?: string | false;
1623
1595
  /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
1624
- backend?: InferenceBackend;
1625
- /** Number of identity classes (default: 12 for streaming model) */
1626
- numIdentityClasses?: number;
1596
+ backend?: VADBackend;
1597
+ /** Sample rate (8000 or 16000, default: 16000) */
1598
+ sampleRate?: 8000 | 16000;
1599
+ /** Speech probability threshold (default: 0.5) */
1600
+ threshold?: number;
1601
+ /**
1602
+ * Number of audio chunks to keep in pre-speech buffer.
1603
+ * When VAD triggers, these chunks are prepended to the speech buffer
1604
+ * to capture the beginning of speech that occurred before detection.
1605
+ *
1606
+ * At 512 samples/chunk and 16kHz:
1607
+ * - 10 chunks = 320ms of pre-speech audio
1608
+ * - 15 chunks = 480ms of pre-speech audio
1609
+ *
1610
+ * Default: 10 chunks (320ms)
1611
+ */
1612
+ preSpeechBufferChunks?: number;
1627
1613
  }
1628
- interface ModelInfo {
1614
+ /**
1615
+ * VAD model loading information
1616
+ */
1617
+ interface VADModelInfo {
1629
1618
  backend: 'webgpu' | 'wasm';
1630
1619
  loadTimeMs: number;
1631
1620
  inputNames: string[];
1632
1621
  outputNames: string[];
1622
+ sampleRate: number;
1623
+ chunkSize: number;
1633
1624
  }
1634
-
1635
- /** CTC vocabulary (32 tokens from wav2vec2-base-960h) */
1636
- declare const CTC_VOCAB: string[];
1637
- interface Wav2Vec2Result {
1638
- /** Blendshape weights [frames, 52] - 30fps */
1639
- blendshapes: Float32Array[];
1640
- /** Raw CTC logits [frames, 32] - 50fps */
1641
- asrLogits: Float32Array[];
1642
- /** Decoded text from CTC */
1643
- text: string;
1644
- /** Number of blendshape frames (30fps) — alias for numA2EFrames */
1645
- numFrames: number;
1646
- /** Number of A2E frames (30fps) */
1647
- numA2EFrames: number;
1648
- /** Number of ASR frames (50fps) */
1649
- numASRFrames: number;
1650
- /** Inference time in ms */
1625
+ /**
1626
+ * Result from a single VAD inference
1627
+ */
1628
+ interface VADResult {
1629
+ /** Speech probability (0-1) */
1630
+ probability: number;
1631
+ /** Whether speech is detected (probability > threshold) */
1632
+ isSpeech: boolean;
1633
+ /** Inference time in milliseconds */
1651
1634
  inferenceTimeMs: number;
1635
+ /**
1636
+ * Pre-speech audio chunks (only present on first speech detection).
1637
+ * These are the N chunks immediately before VAD triggered, useful for
1638
+ * capturing the beginning of speech that occurred before detection.
1639
+ *
1640
+ * Only populated when transitioning from silence to speech.
1641
+ */
1642
+ preSpeechChunks?: Float32Array[];
1652
1643
  }
1653
- declare class Wav2Vec2Inference implements LipSyncBackend {
1654
- readonly modelId: "wav2vec2";
1644
+ /**
1645
+ * Speech segment detected by VAD
1646
+ */
1647
+ interface SpeechSegment {
1648
+ /** Start time in seconds */
1649
+ start: number;
1650
+ /** End time in seconds */
1651
+ end: number;
1652
+ /** Average probability during segment */
1653
+ avgProbability: number;
1654
+ }
1655
+ /**
1656
+ * Silero VAD - Neural network voice activity detection
1657
+ *
1658
+ * Based on snakers4/silero-vad ONNX model.
1659
+ * Processes 32ms chunks (512 samples at 16kHz) with LSTM state.
1660
+ *
1661
+ * @see https://github.com/snakers4/silero-vad
1662
+ */
1663
+ declare class SileroVADInference {
1655
1664
  private session;
1656
1665
  private ort;
1657
1666
  private config;
1658
1667
  private _backend;
1659
1668
  private isLoading;
1660
- private numIdentityClasses;
1669
+ private state;
1670
+ private context;
1671
+ private readonly chunkSize;
1672
+ private readonly contextSize;
1661
1673
  private inferenceQueue;
1662
- private poisoned;
1663
- private static readonly INFERENCE_TIMEOUT_MS;
1664
- constructor(config: Wav2Vec2InferenceConfig);
1674
+ private preSpeechBuffer;
1675
+ private wasSpeaking;
1676
+ private srTensor;
1677
+ constructor(config: SileroVADConfig);
1678
+ get backend(): RuntimeBackend | null;
1679
+ get isLoaded(): boolean;
1680
+ get sampleRate(): number;
1681
+ get threshold(): number;
1682
+ /**
1683
+ * Get required chunk size in samples
1684
+ */
1685
+ getChunkSize(): number;
1686
+ /**
1687
+ * Get chunk duration in milliseconds
1688
+ */
1689
+ getChunkDurationMs(): number;
1690
+ /**
1691
+ * Check if WebGPU is available and working
1692
+ * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
1693
+ */
1694
+ static isWebGPUAvailable: typeof isWebGPUAvailable;
1695
+ /**
1696
+ * Load the ONNX model
1697
+ */
1698
+ load(): Promise<VADModelInfo>;
1699
+ /**
1700
+ * Reset state for new audio stream
1701
+ */
1702
+ reset(): void;
1703
+ /**
1704
+ * Process a single audio chunk
1705
+ *
1706
+ * @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
1707
+ * @returns VAD result with speech probability
1708
+ */
1709
+ process(audioChunk: Float32Array): Promise<VADResult>;
1710
+ /**
1711
+ * Process audio and detect speech segments
1712
+ *
1713
+ * @param audio - Complete audio buffer
1714
+ * @param options - Detection options
1715
+ * @returns Array of speech segments
1716
+ */
1717
+ detectSpeech(audio: Float32Array, options?: {
1718
+ /** Minimum speech duration in ms (default: 250) */
1719
+ minSpeechDurationMs?: number;
1720
+ /** Minimum silence duration to end segment in ms (default: 300) */
1721
+ minSilenceDurationMs?: number;
1722
+ /** Padding to add before/after speech in ms (default: 30) */
1723
+ speechPadMs?: number;
1724
+ }): Promise<SpeechSegment[]>;
1725
+ /**
1726
+ * Queue inference to serialize ONNX session calls
1727
+ */
1728
+ private queueInference;
1729
+ /**
1730
+ * Dispose of the model and free resources
1731
+ */
1732
+ dispose(): Promise<void>;
1733
+ }
1734
+
1735
+ /**
1736
+ * Silero VAD Web Worker implementation
1737
+ *
1738
+ * Runs Silero VAD inference in a dedicated Web Worker to prevent main thread blocking.
1739
+ * Uses inline worker script (Blob URL pattern) to avoid separate file deployment.
1740
+ *
1741
+ * Key design decisions:
1742
+ * - WASM backend only (WebGPU doesn't work in Workers)
1743
+ * - LSTM state serialized as Float32Array (Tensors can't cross worker boundary)
1744
+ * - Audio copied (not transferred) to retain main thread access for pre-speech buffer
1745
+ * - ONNX Runtime loaded from CDN in worker (no bundler complications)
1746
+ *
1747
+ * @category Inference
1748
+ *
1749
+ * @example Basic usage
1750
+ * ```typescript
1751
+ * import { SileroVADWorker } from '@omote/core';
1752
+ *
1753
+ * const vad = new SileroVADWorker({
1754
+ * modelUrl: '/models/silero-vad.onnx'
1755
+ * });
1756
+ * await vad.load();
1757
+ *
1758
+ * // Process 32ms chunks (512 samples at 16kHz)
1759
+ * const result = await vad.process(audioChunk);
1760
+ * if (result.isSpeech) {
1761
+ * console.log('Speech detected!', result.probability);
1762
+ * }
1763
+ * ```
1764
+ */
1765
+
1766
+ /**
1767
+ * Configuration for Silero VAD Worker
1768
+ */
1769
+ interface VADWorkerConfig {
1770
+ /** Path or URL to the ONNX model */
1771
+ modelUrl: string;
1772
+ /** Sample rate (8000 or 16000, default: 16000) */
1773
+ sampleRate?: 8000 | 16000;
1774
+ /** Speech probability threshold (default: 0.5) */
1775
+ threshold?: number;
1776
+ /**
1777
+ * Number of audio chunks to keep in pre-speech buffer.
1778
+ * When VAD triggers, these chunks are prepended to the speech buffer
1779
+ * to capture the beginning of speech that occurred before detection.
1780
+ *
1781
+ * At 512 samples/chunk and 16kHz:
1782
+ * - 10 chunks = 320ms of pre-speech audio
1783
+ * - 15 chunks = 480ms of pre-speech audio
1784
+ *
1785
+ * Default: 10 chunks (320ms)
1786
+ */
1787
+ preSpeechBufferChunks?: number;
1788
+ }
1789
+ /**
1790
+ * VAD model loading information from worker
1791
+ */
1792
+ interface VADWorkerModelInfo {
1793
+ backend: 'wasm';
1794
+ loadTimeMs: number;
1795
+ inputNames: string[];
1796
+ outputNames: string[];
1797
+ sampleRate: number;
1798
+ chunkSize: number;
1799
+ }
1800
+
1801
+ /**
1802
+ * Silero VAD Worker - Voice Activity Detection in a Web Worker
1803
+ *
1804
+ * Runs Silero VAD inference off the main thread to prevent UI blocking.
1805
+ * Feature parity with SileroVADInference but runs in dedicated worker.
1806
+ *
1807
+ * @see SileroVADInference for main-thread version
1808
+ */
1809
+ declare class SileroVADWorker {
1810
+ private worker;
1811
+ private config;
1812
+ private isLoading;
1813
+ private _isLoaded;
1814
+ private state;
1815
+ private context;
1816
+ private readonly chunkSize;
1817
+ private readonly contextSize;
1818
+ private inferenceQueue;
1819
+ private preSpeechBuffer;
1820
+ private wasSpeaking;
1821
+ private pendingResolvers;
1822
+ private messageId;
1823
+ constructor(config: VADWorkerConfig);
1824
+ get isLoaded(): boolean;
1825
+ /**
1826
+ * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
1827
+ */
1828
+ get backend(): 'wasm' | null;
1829
+ get sampleRate(): number;
1830
+ get threshold(): number;
1831
+ /**
1832
+ * Get required chunk size in samples
1833
+ */
1834
+ getChunkSize(): number;
1835
+ /**
1836
+ * Get chunk duration in milliseconds
1837
+ */
1838
+ getChunkDurationMs(): number;
1839
+ /**
1840
+ * Create the worker from inline script
1841
+ */
1842
+ private createWorker;
1843
+ /**
1844
+ * Handle messages from worker
1845
+ */
1846
+ private handleWorkerMessage;
1847
+ /**
1848
+ * Send message to worker and wait for response
1849
+ */
1850
+ private sendMessage;
1851
+ /**
1852
+ * Load the ONNX model in the worker
1853
+ */
1854
+ load(): Promise<VADWorkerModelInfo>;
1855
+ /**
1856
+ * Reset state for new audio stream
1857
+ */
1858
+ reset(): Promise<void>;
1859
+ /**
1860
+ * Process a single audio chunk
1861
+ *
1862
+ * @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
1863
+ * @returns VAD result with speech probability
1864
+ */
1865
+ process(audioChunk: Float32Array): Promise<VADResult>;
1866
+ /**
1867
+ * Queue inference to serialize worker calls
1868
+ */
1869
+ private queueInference;
1870
+ /**
1871
+ * Dispose of the worker and free resources
1872
+ */
1873
+ dispose(): Promise<void>;
1874
+ /**
1875
+ * Check if Web Workers are supported
1876
+ */
1877
+ static isSupported(): boolean;
1878
+ }
1879
+
1880
+ /**
1881
+ * Factory function for Silero VAD with automatic Worker vs main thread selection
1882
+ *
1883
+ * Provides a unified API that automatically selects the optimal implementation:
1884
+ * - Desktop browsers: Uses SileroVADWorker (off-main-thread inference)
1885
+ * - Mobile devices: Uses SileroVADInference (main thread, avoids memory overhead)
1886
+ * - Fallback: Gracefully falls back to main thread if Worker fails
1887
+ *
1888
+ * @category Inference
1889
+ *
1890
+ * @example Basic usage (auto-detect)
1891
+ * ```typescript
1892
+ * import { createSileroVAD } from '@omote/core';
1893
+ *
1894
+ * const vad = createSileroVAD({
1895
+ * modelUrl: '/models/silero-vad.onnx',
1896
+ * threshold: 0.5,
1897
+ * });
1898
+ *
1899
+ * await vad.load();
1900
+ * const result = await vad.process(audioChunk);
1901
+ * if (result.isSpeech) {
1902
+ * console.log('Speech detected!', result.probability);
1903
+ * }
1904
+ * ```
1905
+ *
1906
+ * @example Force worker usage
1907
+ * ```typescript
1908
+ * const vad = createSileroVAD({
1909
+ * modelUrl: '/models/silero-vad.onnx',
1910
+ * useWorker: true, // Force Worker even on mobile
1911
+ * });
1912
+ * ```
1913
+ *
1914
+ * @example Force main thread
1915
+ * ```typescript
1916
+ * const vad = createSileroVAD({
1917
+ * modelUrl: '/models/silero-vad.onnx',
1918
+ * useWorker: false, // Force main thread
1919
+ * });
1920
+ * ```
1921
+ */
1922
+
1923
+ /**
1924
+ * Common interface for both SileroVADInference and SileroVADWorker
1925
+ *
1926
+ * This interface defines the shared API that both implementations provide,
1927
+ * allowing consumers to use either interchangeably.
1928
+ */
1929
+ interface SileroVADBackend {
1930
+ /** Current backend type (webgpu, wasm, or null if not loaded) */
1931
+ readonly backend: RuntimeBackend | null;
1932
+ /** Whether the model is loaded and ready for inference */
1933
+ readonly isLoaded: boolean;
1934
+ /** Audio sample rate (8000 or 16000 Hz) */
1935
+ readonly sampleRate: number;
1936
+ /** Speech detection threshold (0-1) */
1937
+ readonly threshold: number;
1665
1938
  /**
1666
- * Check if WebGPU is available and working
1667
- * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
1939
+ * Load the ONNX model
1940
+ * @returns Model loading information
1668
1941
  */
1669
- static isWebGPUAvailable: typeof isWebGPUAvailable;
1670
- get backend(): 'webgpu' | 'wasm' | null;
1671
- get isLoaded(): boolean;
1672
- /** True if inference timed out and the session is permanently unusable */
1673
- get isSessionPoisoned(): boolean;
1942
+ load(): Promise<VADModelInfo | VADWorkerModelInfo>;
1674
1943
  /**
1675
- * Load the ONNX model
1944
+ * Process a single audio chunk
1945
+ * @param audioChunk - Float32Array of exactly chunkSize samples
1946
+ * @returns VAD result with speech probability
1676
1947
  */
1677
- load(): Promise<ModelInfo>;
1948
+ process(audioChunk: Float32Array): Promise<VADResult>;
1678
1949
  /**
1679
- * Run inference on raw audio
1680
- * @param audioSamples - Float32Array of raw audio at 16kHz (16000 samples = 1 second)
1681
- * @param identityIndex - Optional identity index (0-11, default 0 = neutral)
1682
- *
1683
- * Note: Model expects 1-second chunks (16000 samples) for optimal performance.
1684
- * Audio will be zero-padded or truncated to 16000 samples.
1950
+ * Reset state for new audio stream
1685
1951
  */
1686
- infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
1952
+ reset(): void | Promise<void>;
1687
1953
  /**
1688
- * Decode CTC logits to text using greedy decoding
1954
+ * Dispose of the model and free resources
1689
1955
  */
1690
- private decodeCTC;
1956
+ dispose(): Promise<void>;
1691
1957
  /**
1692
- * Queue inference to serialize ONNX session calls
1958
+ * Get required chunk size in samples
1693
1959
  */
1694
- private queueInference;
1960
+ getChunkSize(): number;
1695
1961
  /**
1696
- * Get blendshape value by name for a specific frame
1962
+ * Get chunk duration in milliseconds
1697
1963
  */
1698
- getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
1964
+ getChunkDurationMs(): number;
1965
+ }
1966
+ /**
1967
+ * Configuration for the Silero VAD factory
1968
+ *
1969
+ * Extends SileroVADConfig with worker-specific options.
1970
+ */
1971
+ interface SileroVADFactoryConfig extends SileroVADConfig {
1699
1972
  /**
1700
- * Dispose of the model and free resources
1973
+ * Force worker usage (true), main thread (false), or auto-detect (undefined).
1974
+ *
1975
+ * Auto-detection behavior:
1976
+ * - Desktop: Uses Worker (better responsiveness, off-main-thread)
1977
+ * - Mobile: Uses main thread (avoids 5MB memory overhead)
1978
+ *
1979
+ * You can override this to:
1980
+ * - `true`: Force Worker even on mobile (if you have memory headroom)
1981
+ * - `false`: Force main thread even on desktop (for debugging)
1982
+ *
1983
+ * Default: undefined (auto-detect)
1701
1984
  */
1702
- dispose(): Promise<void>;
1985
+ useWorker?: boolean;
1986
+ /**
1987
+ * Fallback to main thread on worker errors.
1988
+ *
1989
+ * When true (default), if the Worker fails to load or encounters an error,
1990
+ * the factory will automatically create a main thread instance instead.
1991
+ *
1992
+ * When false, worker errors will propagate as exceptions.
1993
+ *
1994
+ * Default: true
1995
+ */
1996
+ fallbackOnError?: boolean;
1997
+ /**
1998
+ * Unified inference worker instance.
1999
+ * When provided, uses SileroVADUnifiedAdapter (shared single-ORT worker).
2000
+ * Takes precedence over useWorker setting.
2001
+ */
2002
+ unifiedWorker?: UnifiedInferenceWorker;
1703
2003
  }
1704
-
1705
2004
  /**
1706
- * CPU-optimized lip sync inference using wav2arkit_cpu model
2005
+ * Check if the current environment supports VAD Web Workers
1707
2006
  *
1708
- * A Safari/iOS-compatible alternative to Wav2Vec2Inference (384MB) designed
1709
- * for platforms where WebGPU crashes due to ONNX Runtime JSEP bugs.
2007
+ * Requirements:
2008
+ * - Worker constructor must exist
2009
+ * - Blob URL support (for inline worker script)
1710
2010
  *
1711
- * The model uses ONNX external data format:
1712
- * - wav2arkit_cpu.onnx (1.86MB graph structure)
1713
- * - wav2arkit_cpu.onnx.data (402MB weights)
1714
- * Both files are fetched and cached automatically.
2011
+ * @returns true if VAD Worker is supported
2012
+ */
2013
+ declare function supportsVADWorker(): boolean;
2014
+ /**
2015
+ * Create a Silero VAD instance with automatic implementation selection
1715
2016
  *
1716
- * Key differences from Wav2Vec2Inference:
1717
- * - WASM-only backend (CPU-optimized, single-threaded, no WebGPU)
1718
- * - No identity input (baked to identity 11)
1719
- * - No ASR output (lip sync only)
1720
- * - Dynamic input length (not fixed to 16000 samples)
1721
- * - Different native blendshape ordering (remapped to LAM_BLENDSHAPES)
2017
+ * This factory function automatically selects between:
2018
+ * - **SileroVADWorker**: Off-main-thread inference (better for desktop)
2019
+ * - **SileroVADInference**: Main thread inference (better for mobile)
2020
+ *
2021
+ * The selection is based on:
2022
+ * 1. Explicit `useWorker` config (if provided)
2023
+ * 2. Platform detection (mobile vs desktop)
2024
+ * 3. Worker API availability
2025
+ *
2026
+ * Both implementations share the same interface (SileroVADBackend),
2027
+ * so consumers can use either interchangeably.
2028
+ *
2029
+ * @param config - Factory configuration
2030
+ * @returns A SileroVAD instance (either Worker or main thread)
2031
+ *
2032
+ * @example
2033
+ * ```typescript
2034
+ * // Auto-detect (recommended)
2035
+ * const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
2036
+ *
2037
+ * // Force Worker
2038
+ * const vadWorker = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: true });
2039
+ *
2040
+ * // Force main thread
2041
+ * const vadMain = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: false });
2042
+ * ```
2043
+ */
2044
+ declare function createSileroVAD(config: SileroVADFactoryConfig): SileroVADBackend;
2045
+
2046
+ /**
2047
+ * Web Worker-based wav2arkit_cpu lip sync inference
2048
+ *
2049
+ * Runs wav2arkit_cpu inference in a dedicated Web Worker to prevent main thread blocking.
2050
+ * Uses inline worker script (Blob URL pattern) to avoid separate file deployment.
2051
+ *
2052
+ * Key design decisions:
2053
+ * - WASM backend only (WebGPU doesn't work in Workers)
2054
+ * - Audio copied (not transferred) to retain main thread access
2055
+ * - ONNX Runtime loaded from CDN in worker (no bundler complications)
2056
+ * - Blendshape symmetrization inlined in worker (no module imports)
2057
+ * - iOS: passes model URLs as strings directly to ORT (avoids 400MB+ JS heap)
1722
2058
  *
1723
2059
  * @category Inference
1724
2060
  *
1725
2061
  * @example
1726
2062
  * ```typescript
1727
- * import { Wav2ArkitCpuInference } from '@omote/core';
2063
+ * import { Wav2ArkitCpuWorker } from '@omote/core';
1728
2064
  *
1729
- * const lam = new Wav2ArkitCpuInference({
2065
+ * const lam = new Wav2ArkitCpuWorker({
1730
2066
  * modelUrl: '/models/wav2arkit_cpu.onnx',
1731
2067
  * });
1732
2068
  * await lam.load();
@@ -1736,7 +2072,10 @@ declare class Wav2Vec2Inference implements LipSyncBackend {
1736
2072
  * ```
1737
2073
  */
1738
2074
 
1739
- interface Wav2ArkitCpuConfig {
2075
+ /**
2076
+ * Configuration for Wav2ArkitCpu Worker
2077
+ */
2078
+ interface Wav2ArkitCpuWorkerConfig {
1740
2079
  /** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
1741
2080
  modelUrl: string;
1742
2081
  /**
@@ -1746,24 +2085,44 @@ interface Wav2ArkitCpuConfig {
1746
2085
  * Set to `false` to skip external data loading (single-file models only).
1747
2086
  */
1748
2087
  externalDataUrl?: string | false;
1749
- /** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
1750
- backend?: BackendPreference;
1751
2088
  }
1752
- declare class Wav2ArkitCpuInference implements LipSyncBackend {
2089
+ /**
2090
+ * Wav2ArkitCpu Worker - Lip sync inference in a Web Worker
2091
+ *
2092
+ * Runs wav2arkit_cpu inference off the main thread to prevent UI blocking.
2093
+ * Feature parity with Wav2ArkitCpuInference but runs in dedicated worker.
2094
+ *
2095
+ * @see Wav2ArkitCpuInference for main-thread version
2096
+ */
2097
+ declare class Wav2ArkitCpuWorker implements LipSyncBackend {
1753
2098
  readonly modelId: "wav2arkit_cpu";
1754
- private session;
1755
- private ort;
2099
+ private worker;
1756
2100
  private config;
1757
- private _backend;
1758
2101
  private isLoading;
2102
+ private _isLoaded;
1759
2103
  private inferenceQueue;
1760
2104
  private poisoned;
1761
- private static readonly INFERENCE_TIMEOUT_MS;
1762
- constructor(config: Wav2ArkitCpuConfig);
1763
- get backend(): RuntimeBackend | null;
2105
+ private pendingResolvers;
2106
+ constructor(config: Wav2ArkitCpuWorkerConfig);
1764
2107
  get isLoaded(): boolean;
1765
2108
  /**
1766
- * Load the ONNX model
2109
+ * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
2110
+ */
2111
+ get backend(): 'wasm' | null;
2112
+ /**
2113
+ * Create the worker from inline script
2114
+ */
2115
+ private createWorker;
2116
+ /**
2117
+ * Handle messages from worker
2118
+ */
2119
+ private handleWorkerMessage;
2120
+ /**
2121
+ * Send message to worker and wait for response
2122
+ */
2123
+ private sendMessage;
2124
+ /**
2125
+ * Load the ONNX model in the worker
1767
2126
  */
1768
2127
  load(): Promise<LipSyncModelInfo>;
1769
2128
  /**
@@ -1777,280 +2136,524 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
1777
2136
  */
1778
2137
  infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
1779
2138
  /**
1780
- * Queue inference to serialize ONNX session calls
2139
+ * Queue inference to serialize worker calls
1781
2140
  */
1782
2141
  private queueInference;
1783
2142
  /**
1784
- * Dispose of the model and free resources
2143
+ * Dispose of the worker and free resources
2144
+ */
2145
+ dispose(): Promise<void>;
2146
+ /**
2147
+ * Check if Web Workers are supported
1785
2148
  */
2149
+ static isSupported(): boolean;
2150
+ }
2151
+
2152
+ /**
2153
+ * Unified Inference Worker — single Web Worker hosting all WASM models
2154
+ *
2155
+ * Solves the multi-worker ORT problem: three per-model workers each load their
2156
+ * own ORT WASM instance (~40MB each). On iOS this exceeds the ~1-1.5GB tab
2157
+ * limit, forcing main-thread fallback which blocks the render loop.
2158
+ *
2159
+ * This worker hosts SenseVoice + Wav2ArkitCpu + Silero VAD in a single
2160
+ * ORT WASM instance. Same total model memory (~643MB), but inference runs
2161
+ * off-main-thread. Works on iOS because there's only one ORT instance.
2162
+ *
2163
+ * Consumer usage:
2164
+ * ```typescript
2165
+ * const worker = new UnifiedInferenceWorker();
2166
+ * await worker.init();
2167
+ *
2168
+ * const asr = createSenseVoice({ modelUrl: '...', unifiedWorker: worker });
2169
+ * const lam = createLipSync({ gpuModelUrl: '...', cpuModelUrl: '...', unifiedWorker: worker });
2170
+ * const vad = createSileroVAD({ modelUrl: '...', unifiedWorker: worker });
2171
+ * ```
2172
+ *
2173
+ * @category Inference
2174
+ */
2175
+
2176
+ /**
2177
+ * Unified Inference Worker — single Web Worker for all WASM models
2178
+ *
2179
+ * Hosts SenseVoice, Wav2ArkitCpu, and Silero VAD in one ORT instance.
2180
+ * Eliminates the multi-worker memory problem on iOS.
2181
+ */
2182
+ declare class UnifiedInferenceWorker {
2183
+ private worker;
2184
+ private pendingRequests;
2185
+ private initialized;
2186
+ private poisoned;
2187
+ /**
2188
+ * Initialize the worker (load ORT WASM from CDN)
2189
+ */
2190
+ init(): Promise<void>;
2191
+ loadSenseVoice(config: {
2192
+ modelUrl: string;
2193
+ tokensUrl: string;
2194
+ language: number;
2195
+ textNorm: number;
2196
+ }): Promise<SenseVoiceModelInfo>;
2197
+ transcribe(audio: Float32Array): Promise<SenseVoiceResult>;
2198
+ disposeSenseVoice(): Promise<void>;
2199
+ loadLipSync(config: {
2200
+ modelUrl: string;
2201
+ externalDataUrl: string | null;
2202
+ }): Promise<LipSyncModelInfo>;
2203
+ inferLipSync(audio: Float32Array): Promise<{
2204
+ blendshapes: Float32Array;
2205
+ numFrames: number;
2206
+ numBlendshapes: number;
2207
+ inferenceTimeMs: number;
2208
+ }>;
2209
+ disposeLipSync(): Promise<void>;
2210
+ loadVAD(config: {
2211
+ modelUrl: string;
2212
+ sampleRate: number;
2213
+ }): Promise<VADWorkerModelInfo>;
2214
+ processVAD(audio: Float32Array, state: Float32Array, context: Float32Array): Promise<{
2215
+ probability: number;
2216
+ state: Float32Array;
2217
+ inferenceTimeMs: number;
2218
+ }>;
2219
+ resetVAD(): Promise<Float32Array>;
2220
+ disposeVAD(): Promise<void>;
2221
+ dispose(): Promise<void>;
2222
+ /** Check if the worker is initialized and not poisoned */
2223
+ get isReady(): boolean;
2224
+ /** Check if Web Workers are supported */
2225
+ static isSupported(): boolean;
2226
+ private assertReady;
2227
+ private createWorker;
2228
+ private handleWorkerMessage;
2229
+ private sendMessage;
2230
+ private rejectAllPending;
2231
+ private cleanup;
2232
+ }
2233
+ /**
2234
+ * SenseVoice adapter backed by UnifiedInferenceWorker
2235
+ *
2236
+ * Implements SenseVoiceBackend, delegating all inference to the shared worker.
2237
+ */
2238
+ declare class SenseVoiceUnifiedAdapter implements SenseVoiceBackend {
2239
+ private worker;
2240
+ private config;
2241
+ private _isLoaded;
2242
+ private languageId;
2243
+ private textNormId;
2244
+ private inferenceQueue;
2245
+ constructor(worker: UnifiedInferenceWorker, config: SenseVoiceWorkerConfig);
2246
+ get isLoaded(): boolean;
2247
+ get backend(): 'wasm' | null;
2248
+ load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
2249
+ transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
2250
+ dispose(): Promise<void>;
2251
+ }
2252
+ /**
2253
+ * Wav2ArkitCpu adapter backed by UnifiedInferenceWorker
2254
+ *
2255
+ * Implements LipSyncBackend, delegating all inference to the shared worker.
2256
+ */
2257
+ declare class Wav2ArkitCpuUnifiedAdapter implements LipSyncBackend {
2258
+ readonly modelId: "wav2arkit_cpu";
2259
+ private worker;
2260
+ private config;
2261
+ private _isLoaded;
2262
+ private inferenceQueue;
2263
+ constructor(worker: UnifiedInferenceWorker, config: Wav2ArkitCpuWorkerConfig);
2264
+ get isLoaded(): boolean;
2265
+ get backend(): RuntimeBackend | null;
2266
+ load(): Promise<LipSyncModelInfo>;
2267
+ infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
2268
+ dispose(): Promise<void>;
2269
+ }
2270
+ /**
2271
+ * Silero VAD adapter backed by UnifiedInferenceWorker
2272
+ *
2273
+ * Implements SileroVADBackend, delegating all inference to the shared worker.
2274
+ */
2275
+ declare class SileroVADUnifiedAdapter implements SileroVADBackend {
2276
+ private worker;
2277
+ private config;
2278
+ private _isLoaded;
2279
+ private state;
2280
+ private context;
2281
+ private readonly chunkSize;
2282
+ private readonly contextSize;
2283
+ private inferenceQueue;
2284
+ private preSpeechBuffer;
2285
+ private wasSpeaking;
2286
+ constructor(worker: UnifiedInferenceWorker, config: SileroVADConfig);
2287
+ get isLoaded(): boolean;
2288
+ get backend(): RuntimeBackend | null;
2289
+ get sampleRate(): number;
2290
+ get threshold(): number;
2291
+ getChunkSize(): number;
2292
+ getChunkDurationMs(): number;
2293
+ load(): Promise<VADWorkerModelInfo>;
2294
+ process(audioChunk: Float32Array): Promise<VADResult>;
2295
+ reset(): Promise<void>;
1786
2296
  dispose(): Promise<void>;
1787
2297
  }
1788
2298
 
1789
2299
  /**
1790
- * Factory function for lip sync with automatic GPU/CPU model selection
1791
- *
1792
- * Provides a unified API that automatically selects the optimal model:
1793
- * - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (404MB, WASM)
1794
- * - Chrome/Firefox/Edge: Uses Wav2Vec2Inference (384MB, WebGPU)
1795
- * - Fallback: Gracefully falls back to CPU model if GPU model fails to load
2300
+ * Factory function for SenseVoice ASR with automatic Worker vs main thread selection
1796
2301
  *
1797
- * Why two separate models?
1798
- * Wav2Vec2 (LAM) cannot run on Safari/iOS for two reasons:
1799
- * 1. Its dual-head transformer graph needs ~750-950MB peak during ORT session
1800
- * creation (graph optimization), exceeding iOS WebKit's ~1-1.5GB tab limit.
1801
- * 2. It ships as a single 384MB .onnx file that must load into JS heap before
1802
- * ORT can consume it. iOS WebKit OOMs on this allocation.
1803
- * wav2arkit_cpu solves both: external data format (1.86MB graph + 402MB weights)
1804
- * lets ORT load only the tiny graph, then stream weights via URL pass-through
1805
- * directly into WASM memory. JS heap stays at ~2MB.
2302
+ * Provides a unified API that automatically selects the optimal implementation:
2303
+ * - Worker supported: Uses SenseVoiceWorker (off-main-thread inference)
2304
+ * - Worker unsupported: Uses SenseVoiceInference (main thread)
1806
2305
  *
1807
2306
  * @category Inference
1808
2307
  *
1809
2308
  * @example Auto-detect (recommended)
1810
2309
  * ```typescript
1811
- * import { createLipSync } from '@omote/core';
2310
+ * import { createSenseVoice } from '@omote/core';
1812
2311
  *
1813
- * const lam = createLipSync({
1814
- * gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
1815
- * cpuModelUrl: '/models/wav2arkit_cpu.onnx',
2312
+ * const asr = createSenseVoice({
2313
+ * modelUrl: '/models/sensevoice/model.int8.onnx',
1816
2314
  * });
2315
+ * await asr.load();
2316
+ * const { text, emotion } = await asr.transcribe(audioSamples);
2317
+ * ```
1817
2318
  *
1818
- * await lam.load();
1819
- * const { blendshapes } = await lam.infer(audioSamples);
2319
+ * @example Force worker
2320
+ * ```typescript
2321
+ * const asr = createSenseVoice({
2322
+ * modelUrl: '/models/sensevoice/model.int8.onnx',
2323
+ * useWorker: true,
2324
+ * });
1820
2325
  * ```
1821
2326
  *
1822
- * @example Force CPU model
2327
+ * @example Force main thread
1823
2328
  * ```typescript
1824
- * const lam = createLipSync({
1825
- * gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
1826
- * cpuModelUrl: '/models/wav2arkit_cpu.onnx',
1827
- * mode: 'cpu',
2329
+ * const asr = createSenseVoice({
2330
+ * modelUrl: '/models/sensevoice/model.int8.onnx',
2331
+ * useWorker: false,
1828
2332
  * });
1829
2333
  * ```
1830
2334
  */
1831
2335
 
1832
2336
  /**
1833
- * Configuration for the lip sync factory
2337
+ * Common interface for both SenseVoiceInference and SenseVoiceWorker
1834
2338
  */
1835
- interface CreateLipSyncConfig {
1836
- /** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge) */
1837
- gpuModelUrl: string;
2339
+ interface SenseVoiceBackend {
2340
+ /** Whether the model is loaded and ready for inference */
2341
+ readonly isLoaded: boolean;
2342
+ /** Current backend type ('wasm' for worker, full backend for main thread, null if not loaded) */
2343
+ readonly backend: 'wasm' | 'webgpu' | null;
1838
2344
  /**
1839
- * URL for GPU model external data file (.onnx.data weights).
1840
- * Default: `${gpuModelUrl}.data`
1841
- *
1842
- * Set to `false` to skip external data loading (single-file models only).
2345
+ * Load the ONNX model
2346
+ * @param onProgress - Optional progress callback (fires once at 100% for worker)
2347
+ * @returns Model loading information
1843
2348
  */
1844
- gpuExternalDataUrl?: string | false;
1845
- /** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS) */
1846
- cpuModelUrl: string;
2349
+ load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
1847
2350
  /**
1848
- * Model selection mode:
1849
- * - 'auto': Safari/iOS CPU, everything else GPU (default)
1850
- * - 'gpu': Force GPU model (Wav2Vec2Inference)
1851
- * - 'cpu': Force CPU model (Wav2ArkitCpuInference)
2351
+ * Transcribe audio samples to text
2352
+ * @param audioSamples - Float32Array of audio samples at 16kHz
2353
+ * @returns Transcription result
1852
2354
  */
1853
- mode?: 'auto' | 'gpu' | 'cpu';
1854
- /** Backend preference for GPU model (default: 'auto') */
1855
- gpuBackend?: BackendPreference;
1856
- /** Number of identity classes for GPU model (default: 12) */
1857
- numIdentityClasses?: number;
2355
+ transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
1858
2356
  /**
1859
- * Fall back to CPU model if GPU model fails to load (default: true)
1860
- * Only applies when mode is 'auto' or 'gpu'
2357
+ * Dispose of the model and free resources
1861
2358
  */
1862
- fallbackOnError?: boolean;
2359
+ dispose(): Promise<void>;
1863
2360
  }
1864
2361
  /**
1865
- * Create a lip sync instance with automatic GPU/CPU model selection
2362
+ * Configuration for the SenseVoice factory
2363
+ */
2364
+ interface CreateSenseVoiceConfig {
2365
+ /** Path or URL to model.int8.onnx (239MB) */
2366
+ modelUrl: string;
2367
+ /** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
2368
+ tokensUrl?: string;
2369
+ /** Language hint (default: 'auto') */
2370
+ language?: SenseVoiceLanguage;
2371
+ /** Text normalization (default: 'with_itn') */
2372
+ textNorm?: 'with_itn' | 'without_itn';
2373
+ /**
2374
+ * Worker mode:
2375
+ * - 'auto' (default): Use Worker if supported, else main thread
2376
+ * - true: Force Worker (throws if unsupported)
2377
+ * - false: Force main thread
2378
+ */
2379
+ useWorker?: boolean | 'auto';
2380
+ /**
2381
+ * Unified inference worker instance.
2382
+ * When provided, uses SenseVoiceUnifiedAdapter (shared single-ORT worker).
2383
+ * Takes precedence over useWorker setting.
2384
+ */
2385
+ unifiedWorker?: UnifiedInferenceWorker;
2386
+ }
2387
+ /**
2388
+ * Create a SenseVoice ASR instance with automatic implementation selection
1866
2389
  *
1867
2390
  * @param config - Factory configuration
1868
- * @returns A LipSyncBackend instance (either GPU or CPU model)
2391
+ * @returns A SenseVoiceBackend instance (either Worker or main thread)
1869
2392
  */
1870
- declare function createLipSync(config: CreateLipSyncConfig): LipSyncBackend;
2393
+ declare function createSenseVoice(config: CreateSenseVoiceConfig): SenseVoiceBackend;
1871
2394
 
1872
2395
  /**
1873
- * Silero VAD (Voice Activity Detection) inference
2396
+ * Kaldi-compatible filterbank (fbank) feature extraction
1874
2397
  *
1875
- * Neural network-based VAD running in browser via ONNX Runtime Web.
1876
- * Much more accurate than RMS-based energy detection.
2398
+ * Pure TypeScript implementation matching kaldi-native-fbank parameters
2399
+ * used by SenseVoice. No external dependencies.
1877
2400
  *
1878
- * Uses lazy loading to conditionally load WebGPU or WASM-only bundle:
1879
- * - iOS: Loads WASM-only bundle (WebGPU crashes due to Safari bugs)
1880
- * - Android/Desktop: Loads WebGPU bundle (with WASM fallback)
2401
+ * Pipeline: audio → framing → windowing → FFT → power spectrum → mel filterbank → log
2402
+ *
2403
+ * @module inference/kaldiFbank
2404
+ */
2405
+ interface KaldiFbankOptions {
2406
+ /** Frame length in ms (default: 25) */
2407
+ frameLengthMs?: number;
2408
+ /** Frame shift in ms (default: 10) */
2409
+ frameShiftMs?: number;
2410
+ /** Low frequency cutoff in Hz (default: 20) */
2411
+ lowFreq?: number;
2412
+ /** High frequency cutoff in Hz (default: sampleRate / 2) */
2413
+ highFreq?: number;
2414
+ /** Dither amount (default: 0 for deterministic output) */
2415
+ dither?: number;
2416
+ /** Preemphasis coefficient (default: 0.97) */
2417
+ preemphasis?: number;
2418
+ }
2419
+ /**
2420
+ * Compute Kaldi-compatible log mel filterbank features
2421
+ *
2422
+ * @param audio Raw audio samples (float32, [-1, 1] range)
2423
+ * @param sampleRate Sample rate in Hz (must be 16000 for SenseVoice)
2424
+ * @param numMelBins Number of mel bins (80 for SenseVoice)
2425
+ * @param opts Optional parameters
2426
+ * @returns Flattened Float32Array of shape [numFrames, numMelBins]
2427
+ */
2428
+ declare function computeKaldiFbank(audio: Float32Array, sampleRate: number, numMelBins: number, opts?: KaldiFbankOptions): Float32Array;
2429
+ /**
2430
+ * Apply Low Frame Rate stacking for SenseVoice
2431
+ *
2432
+ * Concatenates lfrM consecutive frames with stride lfrN.
2433
+ * Left-pads with copies of first frame, right-pads last group.
2434
+ *
2435
+ * @param features Flattened [numFrames, featureDim]
2436
+ * @param featureDim Feature dimension per frame (e.g., 80)
2437
+ * @param lfrM Number of frames to stack (default: 7)
2438
+ * @param lfrN Stride (default: 6)
2439
+ * @returns Flattened [numOutputFrames, featureDim * lfrM]
2440
+ */
2441
+ declare function applyLFR(features: Float32Array, featureDim: number, lfrM?: number, lfrN?: number): Float32Array;
2442
+ /**
2443
+ * Apply CMVN normalization in-place
2444
+ *
2445
+ * Formula: normalized[i] = (features[i] + negMean[i % dim]) * invStddev[i % dim]
2446
+ *
2447
+ * @param features Flattened feature array (modified in-place)
2448
+ * @param dim Feature dimension (560 for SenseVoice after LFR)
2449
+ * @param negMean Negative mean vector (dim-dimensional)
2450
+ * @param invStddev Inverse standard deviation vector (dim-dimensional)
2451
+ * @returns The same features array (for chaining)
2452
+ */
2453
+ declare function applyCMVN(features: Float32Array, dim: number, negMean: Float32Array, invStddev: Float32Array): Float32Array;
2454
+ /**
2455
+ * Parse CMVN vectors from comma-separated strings (stored in ONNX metadata)
2456
+ *
2457
+ * The sherpa-onnx SenseVoice export stores neg_mean and inv_stddev
2458
+ * as comma-separated float strings in the model's metadata.
2459
+ */
2460
+ declare function parseCMVNFromMetadata(negMeanStr: string, invStddevStr: string): {
2461
+ negMean: Float32Array;
2462
+ invStddev: Float32Array;
2463
+ };
2464
+
2465
+ /**
2466
+ * CTC greedy decoder for SenseVoice
2467
+ *
2468
+ * Decodes CTC logits into text with structured token parsing
2469
+ * for language, emotion, and audio event detection.
2470
+ *
2471
+ * @module inference/ctcDecoder
2472
+ */
2473
+ interface CTCDecodeResult {
2474
+ /** Decoded text (speech content only) */
2475
+ text: string;
2476
+ /** Detected language (e.g., 'zh', 'en', 'ja', 'ko', 'yue') */
2477
+ language?: string;
2478
+ /** Detected emotion (e.g., 'HAPPY', 'SAD', 'ANGRY', 'NEUTRAL') */
2479
+ emotion?: string;
2480
+ /** Detected audio event (e.g., 'Speech', 'BGM', 'Laughter') */
2481
+ event?: string;
2482
+ }
2483
+ /** Resolve language string to SenseVoice language ID */
2484
+ declare function resolveLanguageId(language: string): number;
2485
+ /** Resolve text norm string to SenseVoice text norm ID */
2486
+ declare function resolveTextNormId(textNorm: string): number;
2487
+ /**
2488
+ * Parse tokens.txt into a token ID → string map
2489
+ *
2490
+ * Format: each line is "token_string token_id"
2491
+ * e.g., "<unk> 0", "▁the 3", "s 4"
2492
+ */
2493
+ declare function parseTokensFile(content: string): Map<number, string>;
2494
+ /**
2495
+ * CTC greedy decode
2496
+ *
2497
+ * @param logits Raw logits from model output, flattened [seqLen, vocabSize]
2498
+ * @param seqLen Sequence length (time steps)
2499
+ * @param vocabSize Vocabulary size
2500
+ * @param tokenMap Token ID → string map from tokens.txt
2501
+ * @returns Decoded text and structured metadata
2502
+ */
2503
+ declare function ctcGreedyDecode(logits: Float32Array, seqLen: number, vocabSize: number, tokenMap: Map<number, string>): CTCDecodeResult;
2504
+
2505
+ /**
2506
+ * Shared blendshape constants and utilities for lip sync inference
2507
+ *
2508
+ * Contains LAM_BLENDSHAPES (canonical ordering), symmetrization, and
2509
+ * index remapping used by both Wav2Vec2Inference and Wav2ArkitCpuInference.
2510
+ *
2511
+ * This module is the single source of truth for blendshape ordering to
2512
+ * avoid circular dependencies between inference classes.
2513
+ *
2514
+ * @category Inference
2515
+ */
2516
+ /**
2517
+ * LAM model blendshape names in order (52 total)
2518
+ * NOTE: This is alphabetical ordering used by LAM, different from standard ARKit order
2519
+ */
2520
+ declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
2521
+ /** Alias for backwards compatibility */
2522
+ declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
2523
+ /**
2524
+ * Symmetrize blendshapes by averaging left/right pairs
2525
+ * From LAM official postprocessing (models/utils.py)
2526
+ * This fixes asymmetric output from the raw model
2527
+ */
2528
+ declare function symmetrizeBlendshapes(frame: Float32Array): Float32Array;
2529
+ /**
2530
+ * wav2arkit_cpu model blendshape ordering
2531
+ *
2532
+ * Indices 0-24 match LAM_BLENDSHAPES, but 25+ diverge:
2533
+ * - LAM puts jawRight, mouthClose, mouthDimpleLeft, mouthDimpleRight at 25-28
2534
+ * - wav2arkit_cpu puts mouthFrownLeft at 25 and moves those four to 48-51
2535
+ */
2536
+ declare const WAV2ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "jawRight"];
2537
+ /**
2538
+ * Remap a blendshape frame from wav2arkit_cpu ordering to LAM_BLENDSHAPES ordering
2539
+ *
2540
+ * @param frame - Float32Array of 52 blendshape values in wav2arkit_cpu order
2541
+ * @returns Float32Array of 52 blendshape values in LAM_BLENDSHAPES order
2542
+ */
2543
+ declare function remapWav2ArkitToLam(frame: Float32Array): Float32Array;
2544
+
2545
+ /**
2546
+ * Unified Wav2Vec2 inference engine for Audio-to-Expression + ASR
2547
+ *
2548
+ * Runs entirely in the browser using WebGPU or WASM.
2549
+ * Takes raw 16kHz audio and outputs:
2550
+ * - 52 ARKit blendshapes (lip sync)
2551
+ * - 32-token CTC logits (speech recognition)
1881
2552
  *
1882
2553
  * @category Inference
1883
2554
  *
1884
2555
  * @example Basic usage
1885
2556
  * ```typescript
1886
- * import { SileroVADInference } from '@omote/core';
1887
- *
1888
- * const vad = new SileroVADInference({
1889
- * modelUrl: '/models/silero-vad.onnx'
1890
- * });
1891
- * await vad.load();
2557
+ * import { Wav2Vec2Inference } from '@omote/core';
1892
2558
  *
1893
- * // Process 32ms chunks (512 samples at 16kHz)
1894
- * const probability = await vad.process(audioChunk);
1895
- * if (probability > 0.5) {
1896
- * console.log('Speech detected!');
1897
- * }
1898
- * ```
2559
+ * const wav2vec = new Wav2Vec2Inference({ modelUrl: '/models/unified_wav2vec2_asr_a2e.onnx' });
2560
+ * await wav2vec.load();
1899
2561
  *
1900
- * @example Streaming with state management
1901
- * ```typescript
1902
- * // State is automatically maintained between process() calls
1903
- * // Call reset() when starting a new audio stream
1904
- * vad.reset();
2562
+ * // Process 1 second of audio (16kHz = 16000 samples)
2563
+ * const result = await wav2vec.infer(audioSamples);
1905
2564
  *
1906
- * for (const chunk of audioChunks) {
1907
- * const prob = await vad.process(chunk);
1908
- * // prob is speech probability [0, 1]
1909
- * }
2565
+ * console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
2566
+ * console.log('ASR text:', result.text); // Decoded transcription
1910
2567
  * ```
1911
2568
  */
1912
2569
 
1913
- type VADBackend = BackendPreference;
1914
- /**
1915
- * Configuration for Silero VAD
1916
- */
1917
- interface SileroVADConfig {
2570
+ type InferenceBackend = BackendPreference;
2571
+ interface Wav2Vec2InferenceConfig {
1918
2572
  /** Path or URL to the ONNX model */
1919
2573
  modelUrl: string;
1920
- /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
1921
- backend?: VADBackend;
1922
- /** Sample rate (8000 or 16000, default: 16000) */
1923
- sampleRate?: 8000 | 16000;
1924
- /** Speech probability threshold (default: 0.5) */
1925
- threshold?: number;
1926
2574
  /**
1927
- * Number of audio chunks to keep in pre-speech buffer.
1928
- * When VAD triggers, these chunks are prepended to the speech buffer
1929
- * to capture the beginning of speech that occurred before detection.
1930
- *
1931
- * At 512 samples/chunk and 16kHz:
1932
- * - 10 chunks = 320ms of pre-speech audio
1933
- * - 15 chunks = 480ms of pre-speech audio
2575
+ * Path or URL to external model data file (.onnx.data weights).
2576
+ * Default: `${modelUrl}.data` (e.g., /models/model.onnx.data)
1934
2577
  *
1935
- * Default: 10 chunks (320ms)
2578
+ * Set to `false` to skip external data loading (single-file models only).
1936
2579
  */
1937
- preSpeechBufferChunks?: number;
2580
+ externalDataUrl?: string | false;
2581
+ /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
2582
+ backend?: InferenceBackend;
2583
+ /** Number of identity classes (default: 12 for streaming model) */
2584
+ numIdentityClasses?: number;
1938
2585
  }
1939
- /**
1940
- * VAD model loading information
1941
- */
1942
- interface VADModelInfo {
2586
+ interface ModelInfo {
1943
2587
  backend: 'webgpu' | 'wasm';
1944
2588
  loadTimeMs: number;
1945
2589
  inputNames: string[];
1946
2590
  outputNames: string[];
1947
- sampleRate: number;
1948
- chunkSize: number;
1949
2591
  }
1950
- /**
1951
- * Result from a single VAD inference
1952
- */
1953
- interface VADResult {
1954
- /** Speech probability (0-1) */
1955
- probability: number;
1956
- /** Whether speech is detected (probability > threshold) */
1957
- isSpeech: boolean;
1958
- /** Inference time in milliseconds */
2592
+
2593
+ /** CTC vocabulary (32 tokens from wav2vec2-base-960h) */
2594
+ declare const CTC_VOCAB: string[];
2595
+ interface Wav2Vec2Result {
2596
+ /** Blendshape weights [frames, 52] - 30fps */
2597
+ blendshapes: Float32Array[];
2598
+ /** Raw CTC logits [frames, 32] - 50fps */
2599
+ asrLogits: Float32Array[];
2600
+ /** Decoded text from CTC */
2601
+ text: string;
2602
+ /** Number of blendshape frames (30fps) — alias for numA2EFrames */
2603
+ numFrames: number;
2604
+ /** Number of A2E frames (30fps) */
2605
+ numA2EFrames: number;
2606
+ /** Number of ASR frames (50fps) */
2607
+ numASRFrames: number;
2608
+ /** Inference time in ms */
1959
2609
  inferenceTimeMs: number;
1960
- /**
1961
- * Pre-speech audio chunks (only present on first speech detection).
1962
- * These are the N chunks immediately before VAD triggered, useful for
1963
- * capturing the beginning of speech that occurred before detection.
1964
- *
1965
- * Only populated when transitioning from silence to speech.
1966
- */
1967
- preSpeechChunks?: Float32Array[];
1968
- }
1969
- /**
1970
- * Speech segment detected by VAD
1971
- */
1972
- interface SpeechSegment {
1973
- /** Start time in seconds */
1974
- start: number;
1975
- /** End time in seconds */
1976
- end: number;
1977
- /** Average probability during segment */
1978
- avgProbability: number;
1979
2610
  }
1980
- /**
1981
- * Silero VAD - Neural network voice activity detection
1982
- *
1983
- * Based on snakers4/silero-vad ONNX model.
1984
- * Processes 32ms chunks (512 samples at 16kHz) with LSTM state.
1985
- *
1986
- * @see https://github.com/snakers4/silero-vad
1987
- */
1988
- declare class SileroVADInference {
2611
+ declare class Wav2Vec2Inference implements LipSyncBackend {
2612
+ readonly modelId: "wav2vec2";
1989
2613
  private session;
1990
2614
  private ort;
1991
2615
  private config;
1992
2616
  private _backend;
1993
2617
  private isLoading;
1994
- private state;
1995
- private context;
1996
- private readonly chunkSize;
1997
- private readonly contextSize;
2618
+ private numIdentityClasses;
1998
2619
  private inferenceQueue;
1999
- private preSpeechBuffer;
2000
- private wasSpeaking;
2001
- private srTensor;
2002
- constructor(config: SileroVADConfig);
2003
- get backend(): RuntimeBackend | null;
2004
- get isLoaded(): boolean;
2005
- get sampleRate(): number;
2006
- get threshold(): number;
2007
- /**
2008
- * Get required chunk size in samples
2009
- */
2010
- getChunkSize(): number;
2011
- /**
2012
- * Get chunk duration in milliseconds
2013
- */
2014
- getChunkDurationMs(): number;
2620
+ private poisoned;
2621
+ private static readonly INFERENCE_TIMEOUT_MS;
2622
+ constructor(config: Wav2Vec2InferenceConfig);
2015
2623
  /**
2016
2624
  * Check if WebGPU is available and working
2017
2625
  * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
2018
2626
  */
2019
2627
  static isWebGPUAvailable: typeof isWebGPUAvailable;
2628
+ get backend(): 'webgpu' | 'wasm' | null;
2629
+ get isLoaded(): boolean;
2630
+ /** True if inference timed out and the session is permanently unusable */
2631
+ get isSessionPoisoned(): boolean;
2020
2632
  /**
2021
2633
  * Load the ONNX model
2022
2634
  */
2023
- load(): Promise<VADModelInfo>;
2024
- /**
2025
- * Reset state for new audio stream
2026
- */
2027
- reset(): void;
2635
+ load(): Promise<ModelInfo>;
2028
2636
  /**
2029
- * Process a single audio chunk
2637
+ * Run inference on raw audio
2638
+ * @param audioSamples - Float32Array of raw audio at 16kHz (16000 samples = 1 second)
2639
+ * @param identityIndex - Optional identity index (0-11, default 0 = neutral)
2030
2640
  *
2031
- * @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
2032
- * @returns VAD result with speech probability
2641
+ * Note: Model expects 1-second chunks (16000 samples) for optimal performance.
2642
+ * Audio will be zero-padded or truncated to 16000 samples.
2033
2643
  */
2034
- process(audioChunk: Float32Array): Promise<VADResult>;
2644
+ infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
2035
2645
  /**
2036
- * Process audio and detect speech segments
2037
- *
2038
- * @param audio - Complete audio buffer
2039
- * @param options - Detection options
2040
- * @returns Array of speech segments
2646
+ * Decode CTC logits to text using greedy decoding
2041
2647
  */
2042
- detectSpeech(audio: Float32Array, options?: {
2043
- /** Minimum speech duration in ms (default: 250) */
2044
- minSpeechDurationMs?: number;
2045
- /** Minimum silence duration to end segment in ms (default: 300) */
2046
- minSilenceDurationMs?: number;
2047
- /** Padding to add before/after speech in ms (default: 30) */
2048
- speechPadMs?: number;
2049
- }): Promise<SpeechSegment[]>;
2648
+ private decodeCTC;
2050
2649
  /**
2051
2650
  * Queue inference to serialize ONNX session calls
2052
2651
  */
2053
2652
  private queueInference;
2653
+ /**
2654
+ * Get blendshape value by name for a specific frame
2655
+ */
2656
+ getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
2054
2657
  /**
2055
2658
  * Dispose of the model and free resources
2056
2659
  */
@@ -2058,309 +2661,189 @@ declare class SileroVADInference {
2058
2661
  }
2059
2662
 
2060
2663
  /**
2061
- * Silero VAD Web Worker implementation
2664
+ * CPU-optimized lip sync inference using wav2arkit_cpu model
2062
2665
  *
2063
- * Runs Silero VAD inference in a dedicated Web Worker to prevent main thread blocking.
2064
- * Uses inline worker script (Blob URL pattern) to avoid separate file deployment.
2666
+ * A Safari/iOS-compatible alternative to Wav2Vec2Inference (384MB) designed
2667
+ * for platforms where WebGPU crashes due to ONNX Runtime JSEP bugs.
2065
2668
  *
2066
- * Key design decisions:
2067
- * - WASM backend only (WebGPU doesn't work in Workers)
2068
- * - LSTM state serialized as Float32Array (Tensors can't cross worker boundary)
2069
- * - Audio copied (not transferred) to retain main thread access for pre-speech buffer
2070
- * - ONNX Runtime loaded from CDN in worker (no bundler complications)
2669
+ * The model uses ONNX external data format:
2670
+ * - wav2arkit_cpu.onnx (1.86MB graph structure)
2671
+ * - wav2arkit_cpu.onnx.data (402MB weights)
2672
+ * Both files are fetched and cached automatically.
2673
+ *
2674
+ * Key differences from Wav2Vec2Inference:
2675
+ * - WASM-only backend (CPU-optimized, single-threaded, no WebGPU)
2676
+ * - No identity input (baked to identity 11)
2677
+ * - No ASR output (lip sync only)
2678
+ * - Dynamic input length (not fixed to 16000 samples)
2679
+ * - Different native blendshape ordering (remapped to LAM_BLENDSHAPES)
2071
2680
  *
2072
2681
  * @category Inference
2073
2682
  *
2074
- * @example Basic usage
2683
+ * @example
2075
2684
  * ```typescript
2076
- * import { SileroVADWorker } from '@omote/core';
2685
+ * import { Wav2ArkitCpuInference } from '@omote/core';
2077
2686
  *
2078
- * const vad = new SileroVADWorker({
2079
- * modelUrl: '/models/silero-vad.onnx'
2687
+ * const lam = new Wav2ArkitCpuInference({
2688
+ * modelUrl: '/models/wav2arkit_cpu.onnx',
2080
2689
  * });
2081
- * await vad.load();
2690
+ * await lam.load();
2082
2691
  *
2083
- * // Process 32ms chunks (512 samples at 16kHz)
2084
- * const result = await vad.process(audioChunk);
2085
- * if (result.isSpeech) {
2086
- * console.log('Speech detected!', result.probability);
2087
- * }
2692
+ * const { blendshapes } = await lam.infer(audioSamples);
2693
+ * // blendshapes: Float32Array[] in LAM_BLENDSHAPES order, 30fps
2088
2694
  * ```
2089
2695
  */
2090
2696
 
2091
- /**
2092
- * Configuration for Silero VAD Worker
2093
- */
2094
- interface VADWorkerConfig {
2095
- /** Path or URL to the ONNX model */
2697
+ interface Wav2ArkitCpuConfig {
2698
+ /** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
2096
2699
  modelUrl: string;
2097
- /** Sample rate (8000 or 16000, default: 16000) */
2098
- sampleRate?: 8000 | 16000;
2099
- /** Speech probability threshold (default: 0.5) */
2100
- threshold?: number;
2101
2700
  /**
2102
- * Number of audio chunks to keep in pre-speech buffer.
2103
- * When VAD triggers, these chunks are prepended to the speech buffer
2104
- * to capture the beginning of speech that occurred before detection.
2105
- *
2106
- * At 512 samples/chunk and 16kHz:
2107
- * - 10 chunks = 320ms of pre-speech audio
2108
- * - 15 chunks = 480ms of pre-speech audio
2701
+ * Path or URL to external model data file (.onnx.data weights).
2702
+ * Default: `${modelUrl}.data` (e.g., /models/wav2arkit_cpu.onnx.data)
2109
2703
  *
2110
- * Default: 10 chunks (320ms)
2111
- */
2112
- preSpeechBufferChunks?: number;
2113
- }
2114
- /**
2115
- * VAD model loading information from worker
2116
- */
2117
- interface VADWorkerModelInfo {
2118
- backend: 'wasm';
2119
- loadTimeMs: number;
2120
- inputNames: string[];
2121
- outputNames: string[];
2122
- sampleRate: number;
2123
- chunkSize: number;
2124
- }
2125
-
2126
- /**
2127
- * Silero VAD Worker - Voice Activity Detection in a Web Worker
2128
- *
2129
- * Runs Silero VAD inference off the main thread to prevent UI blocking.
2130
- * Feature parity with SileroVADInference but runs in dedicated worker.
2131
- *
2132
- * @see SileroVADInference for main-thread version
2133
- */
2134
- declare class SileroVADWorker {
2135
- private worker;
2704
+ * Set to `false` to skip external data loading (single-file models only).
2705
+ */
2706
+ externalDataUrl?: string | false;
2707
+ /** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
2708
+ backend?: BackendPreference;
2709
+ }
2710
+ declare class Wav2ArkitCpuInference implements LipSyncBackend {
2711
+ readonly modelId: "wav2arkit_cpu";
2712
+ private session;
2713
+ private ort;
2136
2714
  private config;
2715
+ private _backend;
2137
2716
  private isLoading;
2138
- private _isLoaded;
2139
- private state;
2140
- private context;
2141
- private readonly chunkSize;
2142
- private readonly contextSize;
2143
2717
  private inferenceQueue;
2144
- private preSpeechBuffer;
2145
- private wasSpeaking;
2146
- private pendingResolvers;
2147
- private messageId;
2148
- constructor(config: VADWorkerConfig);
2718
+ private poisoned;
2719
+ private static readonly INFERENCE_TIMEOUT_MS;
2720
+ constructor(config: Wav2ArkitCpuConfig);
2721
+ get backend(): RuntimeBackend | null;
2149
2722
  get isLoaded(): boolean;
2150
2723
  /**
2151
- * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
2152
- */
2153
- get backend(): 'wasm' | null;
2154
- get sampleRate(): number;
2155
- get threshold(): number;
2156
- /**
2157
- * Get required chunk size in samples
2158
- */
2159
- getChunkSize(): number;
2160
- /**
2161
- * Get chunk duration in milliseconds
2162
- */
2163
- getChunkDurationMs(): number;
2164
- /**
2165
- * Create the worker from inline script
2166
- */
2167
- private createWorker;
2168
- /**
2169
- * Handle messages from worker
2170
- */
2171
- private handleWorkerMessage;
2172
- /**
2173
- * Send message to worker and wait for response
2174
- */
2175
- private sendMessage;
2176
- /**
2177
- * Load the ONNX model in the worker
2178
- */
2179
- load(): Promise<VADWorkerModelInfo>;
2180
- /**
2181
- * Reset state for new audio stream
2724
+ * Load the ONNX model
2182
2725
  */
2183
- reset(): Promise<void>;
2726
+ load(): Promise<LipSyncModelInfo>;
2184
2727
  /**
2185
- * Process a single audio chunk
2728
+ * Run inference on raw audio
2186
2729
  *
2187
- * @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
2188
- * @returns VAD result with speech probability
2730
+ * Accepts variable-length audio (not fixed to 16000 samples).
2731
+ * Output frames = ceil(30 * numSamples / 16000).
2732
+ *
2733
+ * @param audioSamples - Float32Array of raw audio at 16kHz
2734
+ * @param _identityIndex - Ignored (identity 11 is baked into the model)
2189
2735
  */
2190
- process(audioChunk: Float32Array): Promise<VADResult>;
2736
+ infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
2191
2737
  /**
2192
- * Queue inference to serialize worker calls
2738
+ * Queue inference to serialize ONNX session calls
2193
2739
  */
2194
2740
  private queueInference;
2195
2741
  /**
2196
- * Dispose of the worker and free resources
2742
+ * Dispose of the model and free resources
2197
2743
  */
2198
2744
  dispose(): Promise<void>;
2199
- /**
2200
- * Check if Web Workers are supported
2201
- */
2202
- static isSupported(): boolean;
2203
2745
  }
2204
2746
 
2205
2747
  /**
2206
- * Factory function for Silero VAD with automatic Worker vs main thread selection
2748
+ * Factory function for lip sync with automatic GPU/CPU model selection
2207
2749
  *
2208
- * Provides a unified API that automatically selects the optimal implementation:
2209
- * - Desktop browsers: Uses SileroVADWorker (off-main-thread inference)
2210
- * - Mobile devices: Uses SileroVADInference (main thread, avoids memory overhead)
2211
- * - Fallback: Gracefully falls back to main thread if Worker fails
2750
+ * Provides a unified API that automatically selects the optimal model:
2751
+ * - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (404MB, WASM)
2752
+ * - Chrome/Firefox/Edge: Uses Wav2Vec2Inference (384MB, WebGPU)
2753
+ * - Fallback: Gracefully falls back to CPU model if GPU model fails to load
2754
+ *
2755
+ * Why two separate models?
2756
+ * Wav2Vec2 (LAM) cannot run on Safari/iOS for two reasons:
2757
+ * 1. Its dual-head transformer graph needs ~750-950MB peak during ORT session
2758
+ * creation (graph optimization), exceeding iOS WebKit's ~1-1.5GB tab limit.
2759
+ * 2. It ships as a single 384MB .onnx file that must load into JS heap before
2760
+ * ORT can consume it. iOS WebKit OOMs on this allocation.
2761
+ * wav2arkit_cpu solves both: external data format (1.86MB graph + 402MB weights)
2762
+ * lets ORT load only the tiny graph, then stream weights via URL pass-through
2763
+ * directly into WASM memory. JS heap stays at ~2MB.
2212
2764
  *
2213
2765
  * @category Inference
2214
2766
  *
2215
- * @example Basic usage (auto-detect)
2767
+ * @example Auto-detect (recommended)
2216
2768
  * ```typescript
2217
- * import { createSileroVAD } from '@omote/core';
2769
+ * import { createLipSync } from '@omote/core';
2218
2770
  *
2219
- * const vad = createSileroVAD({
2220
- * modelUrl: '/models/silero-vad.onnx',
2221
- * threshold: 0.5,
2771
+ * const lam = createLipSync({
2772
+ * gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
2773
+ * cpuModelUrl: '/models/wav2arkit_cpu.onnx',
2222
2774
  * });
2223
2775
  *
2224
- * await vad.load();
2225
- * const result = await vad.process(audioChunk);
2226
- * if (result.isSpeech) {
2227
- * console.log('Speech detected!', result.probability);
2228
- * }
2229
- * ```
2230
- *
2231
- * @example Force worker usage
2232
- * ```typescript
2233
- * const vad = createSileroVAD({
2234
- * modelUrl: '/models/silero-vad.onnx',
2235
- * useWorker: true, // Force Worker even on mobile
2236
- * });
2776
+ * await lam.load();
2777
+ * const { blendshapes } = await lam.infer(audioSamples);
2237
2778
  * ```
2238
2779
  *
2239
- * @example Force main thread
2780
+ * @example Force CPU model
2240
2781
  * ```typescript
2241
- * const vad = createSileroVAD({
2242
- * modelUrl: '/models/silero-vad.onnx',
2243
- * useWorker: false, // Force main thread
2782
+ * const lam = createLipSync({
2783
+ * gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
2784
+ * cpuModelUrl: '/models/wav2arkit_cpu.onnx',
2785
+ * mode: 'cpu',
2244
2786
  * });
2245
2787
  * ```
2246
2788
  */
2247
2789
 
2248
2790
  /**
2249
- * Common interface for both SileroVADInference and SileroVADWorker
2250
- *
2251
- * This interface defines the shared API that both implementations provide,
2252
- * allowing consumers to use either interchangeably.
2791
+ * Configuration for the lip sync factory
2253
2792
  */
2254
- interface SileroVADBackend {
2255
- /** Current backend type (webgpu, wasm, or null if not loaded) */
2256
- readonly backend: RuntimeBackend | null;
2257
- /** Whether the model is loaded and ready for inference */
2258
- readonly isLoaded: boolean;
2259
- /** Audio sample rate (8000 or 16000 Hz) */
2260
- readonly sampleRate: number;
2261
- /** Speech detection threshold (0-1) */
2262
- readonly threshold: number;
2263
- /**
2264
- * Load the ONNX model
2265
- * @returns Model loading information
2266
- */
2267
- load(): Promise<VADModelInfo | VADWorkerModelInfo>;
2268
- /**
2269
- * Process a single audio chunk
2270
- * @param audioChunk - Float32Array of exactly chunkSize samples
2271
- * @returns VAD result with speech probability
2272
- */
2273
- process(audioChunk: Float32Array): Promise<VADResult>;
2274
- /**
2275
- * Reset state for new audio stream
2276
- */
2277
- reset(): void | Promise<void>;
2793
+ interface CreateLipSyncConfig {
2794
+ /** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge) */
2795
+ gpuModelUrl: string;
2278
2796
  /**
2279
- * Dispose of the model and free resources
2797
+ * URL for GPU model external data file (.onnx.data weights).
2798
+ * Default: `${gpuModelUrl}.data`
2799
+ *
2800
+ * Set to `false` to skip external data loading (single-file models only).
2280
2801
  */
2281
- dispose(): Promise<void>;
2802
+ gpuExternalDataUrl?: string | false;
2803
+ /** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS) */
2804
+ cpuModelUrl: string;
2282
2805
  /**
2283
- * Get required chunk size in samples
2806
+ * Model selection mode:
2807
+ * - 'auto': Safari/iOS → CPU, everything else → GPU (default)
2808
+ * - 'gpu': Force GPU model (Wav2Vec2Inference)
2809
+ * - 'cpu': Force CPU model (Wav2ArkitCpuInference)
2284
2810
  */
2285
- getChunkSize(): number;
2811
+ mode?: 'auto' | 'gpu' | 'cpu';
2812
+ /** Backend preference for GPU model (default: 'auto') */
2813
+ gpuBackend?: BackendPreference;
2814
+ /** Number of identity classes for GPU model (default: 12) */
2815
+ numIdentityClasses?: number;
2286
2816
  /**
2287
- * Get chunk duration in milliseconds
2817
+ * Fall back to CPU model if GPU model fails to load (default: true)
2818
+ * Only applies when mode is 'auto' or 'gpu'
2288
2819
  */
2289
- getChunkDurationMs(): number;
2290
- }
2291
- /**
2292
- * Configuration for the Silero VAD factory
2293
- *
2294
- * Extends SileroVADConfig with worker-specific options.
2295
- */
2296
- interface SileroVADFactoryConfig extends SileroVADConfig {
2820
+ fallbackOnError?: boolean;
2297
2821
  /**
2298
- * Force worker usage (true), main thread (false), or auto-detect (undefined).
2299
- *
2300
- * Auto-detection behavior:
2301
- * - Desktop: Uses Worker (better responsiveness, off-main-thread)
2302
- * - Mobile: Uses main thread (avoids 5MB memory overhead)
2822
+ * Use Web Worker for CPU model inference (default: false)
2303
2823
  *
2304
- * You can override this to:
2305
- * - `true`: Force Worker even on mobile (if you have memory headroom)
2306
- * - `false`: Force main thread even on desktop (for debugging)
2824
+ * When true, Wav2ArkitCpuWorker is used instead of Wav2ArkitCpuInference,
2825
+ * running inference off the main thread to prevent UI blocking during
2826
+ * model loading and inference.
2307
2827
  *
2308
- * Default: undefined (auto-detect)
2828
+ * Only applies when the CPU model is selected (mode: 'cpu', auto on Safari/iOS,
2829
+ * or fallback from GPU).
2309
2830
  */
2310
2831
  useWorker?: boolean;
2311
2832
  /**
2312
- * Fallback to main thread on worker errors.
2313
- *
2314
- * When true (default), if the Worker fails to load or encounters an error,
2315
- * the factory will automatically create a main thread instance instead.
2316
- *
2317
- * When false, worker errors will propagate as exceptions.
2318
- *
2319
- * Default: true
2833
+ * Unified inference worker instance.
2834
+ * When provided and CPU model is selected, uses Wav2ArkitCpuUnifiedAdapter.
2835
+ * Takes precedence over useWorker setting for the CPU model path.
2836
+ * GPU model (Wav2Vec2) always stays on main thread (WebGPU).
2320
2837
  */
2321
- fallbackOnError?: boolean;
2838
+ unifiedWorker?: UnifiedInferenceWorker;
2322
2839
  }
2323
2840
  /**
2324
- * Check if the current environment supports VAD Web Workers
2325
- *
2326
- * Requirements:
2327
- * - Worker constructor must exist
2328
- * - Blob URL support (for inline worker script)
2329
- *
2330
- * @returns true if VAD Worker is supported
2331
- */
2332
- declare function supportsVADWorker(): boolean;
2333
- /**
2334
- * Create a Silero VAD instance with automatic implementation selection
2335
- *
2336
- * This factory function automatically selects between:
2337
- * - **SileroVADWorker**: Off-main-thread inference (better for desktop)
2338
- * - **SileroVADInference**: Main thread inference (better for mobile)
2339
- *
2340
- * The selection is based on:
2341
- * 1. Explicit `useWorker` config (if provided)
2342
- * 2. Platform detection (mobile vs desktop)
2343
- * 3. Worker API availability
2344
- *
2345
- * Both implementations share the same interface (SileroVADBackend),
2346
- * so consumers can use either interchangeably.
2841
+ * Create a lip sync instance with automatic GPU/CPU model selection
2347
2842
  *
2348
2843
  * @param config - Factory configuration
2349
- * @returns A SileroVAD instance (either Worker or main thread)
2350
- *
2351
- * @example
2352
- * ```typescript
2353
- * // Auto-detect (recommended)
2354
- * const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
2355
- *
2356
- * // Force Worker
2357
- * const vadWorker = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: true });
2358
- *
2359
- * // Force main thread
2360
- * const vadMain = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: false });
2361
- * ```
2844
+ * @returns A LipSyncBackend instance (either GPU or CPU model)
2362
2845
  */
2363
- declare function createSileroVAD(config: SileroVADFactoryConfig): SileroVADBackend;
2846
+ declare function createLipSync(config: CreateLipSyncConfig): LipSyncBackend;
2364
2847
 
2365
2848
  /**
2366
2849
  * Safari Web Speech API wrapper for iOS speech recognition
@@ -3509,11 +3992,6 @@ declare class InterruptionHandler extends EventEmitter<InterruptionEvents> {
3509
3992
  * @param audioEnergy - Optional RMS energy for logging (default: 0)
3510
3993
  */
3511
3994
  processVADResult(vadProbability: number, audioEnergy?: number): void;
3512
- /**
3513
- * @deprecated Use processVADResult() instead. This method uses naive RMS detection.
3514
- * Process audio samples for VAD (legacy - uses simple RMS)
3515
- */
3516
- processAudio(samples: Float32Array | Int16Array): void;
3517
3995
  /**
3518
3996
  * Notify that AI started speaking
3519
3997
  */
@@ -3537,7 +4015,6 @@ declare class InterruptionHandler extends EventEmitter<InterruptionEvents> {
3537
4015
  isSpeaking: boolean;
3538
4016
  speechDurationMs: number;
3539
4017
  };
3540
- private calculateRMS;
3541
4018
  private onSpeechDetected;
3542
4019
  private onSilenceDetected;
3543
4020
  }
@@ -4713,4 +5190,4 @@ declare class ProceduralLifeLayer {
4713
5190
  private updateBrowNoise;
4714
5191
  }
4715
5192
 
4716
- export { type AIAdapter, type AIAdapterEvents, AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, type CTCDecodeResult, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, type CreateLipSyncConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_ARKIT_MAP, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type Emotion2VecLabel, type EmotionAnimationMap, type EmotionBlendMode, type EmotionBlendshapeConfig, EmotionController, type EmotionFrame, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionToBlendshapeMapper, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type KaldiFbankOptions, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LipSyncBackend, type LipSyncModelInfo, type LipSyncResult, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, ProceduralLifeLayer, type QuotaInfo, RingBuffer, type RuntimeBackend, type 
SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, type SessionConfig, type SessionOptions, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type Transition, UPPER_FACE_BLENDSHAPES, type UpperFaceBlendshapeName, type UpperFaceBlendshapes, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, WAV2ARKIT_BLENDSHAPES, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, applyCMVN, applyLFR, blendEmotions, calculatePeak, calculateRMS, computeKaldiFbank, configureCacheLimit, configureTelemetry, createEmotionVector, createLipSync, createSessionWithFallback, createSileroVAD, ctcGreedyDecode, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpEmotion, parseCMVNFromMetadata, parseTokensFile, preloadModels, preloadOnnxRuntime, remapWav2ArkitToLam, resolveBackend, resolveLanguageId, resolveTextNormId, shouldEnableWasmProxy, shouldUseCpuLipSync, shouldUseNativeASR, shouldUseServerLipSync, supportsVADWorker, 
symmetrizeBlendshapes };
5193
+ export { type AIAdapter, type AIAdapterEvents, AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, type CTCDecodeResult, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, type CreateLipSyncConfig, type CreateSenseVoiceConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_ARKIT_MAP, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type Emotion2VecLabel, type EmotionAnimationMap, type EmotionBlendMode, type EmotionBlendshapeConfig, EmotionController, type EmotionFrame, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionToBlendshapeMapper, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type KaldiFbankOptions, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LipSyncBackend, type LipSyncModelInfo, type LipSyncResult, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, ProceduralLifeLayer, type QuotaInfo, RingBuffer, type 
RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SessionConfig, type SessionOptions, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type Transition, UPPER_FACE_BLENDSHAPES, UnifiedInferenceWorker, type UpperFaceBlendshapeName, type UpperFaceBlendshapes, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, WAV2ARKIT_BLENDSHAPES, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2ArkitCpuUnifiedAdapter, Wav2ArkitCpuWorker, type Wav2ArkitCpuWorkerConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, applyCMVN, applyLFR, blendEmotions, calculatePeak, calculateRMS, computeKaldiFbank, configureCacheLimit, configureTelemetry, createEmotionVector, createLipSync, createSenseVoice, createSessionWithFallback, createSileroVAD, ctcGreedyDecode, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, 
lerpEmotion, parseCMVNFromMetadata, parseTokensFile, preloadModels, preloadOnnxRuntime, remapWav2ArkitToLam, resolveBackend, resolveLanguageId, resolveTextNormId, shouldEnableWasmProxy, shouldUseCpuLipSync, shouldUseNativeASR, shouldUseServerLipSync, supportsVADWorker, symmetrizeBlendshapes };