@omote/core 0.4.5 → 0.4.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +1137 -654
- package/dist/index.d.ts +1137 -654
- package/dist/index.js +3189 -325
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +3184 -320
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.d.mts
CHANGED
|
@@ -1435,298 +1435,634 @@ declare class SenseVoiceInference {
|
|
|
1435
1435
|
}
|
|
1436
1436
|
|
|
1437
1437
|
/**
|
|
1438
|
-
*
|
|
1439
|
-
*
|
|
1440
|
-
* Pure TypeScript implementation matching kaldi-native-fbank parameters
|
|
1441
|
-
* used by SenseVoice. No external dependencies.
|
|
1442
|
-
*
|
|
1443
|
-
* Pipeline: audio → framing → windowing → FFT → power spectrum → mel filterbank → log
|
|
1444
|
-
*
|
|
1445
|
-
* @module inference/kaldiFbank
|
|
1446
|
-
*/
|
|
1447
|
-
interface KaldiFbankOptions {
|
|
1448
|
-
/** Frame length in ms (default: 25) */
|
|
1449
|
-
frameLengthMs?: number;
|
|
1450
|
-
/** Frame shift in ms (default: 10) */
|
|
1451
|
-
frameShiftMs?: number;
|
|
1452
|
-
/** Low frequency cutoff in Hz (default: 20) */
|
|
1453
|
-
lowFreq?: number;
|
|
1454
|
-
/** High frequency cutoff in Hz (default: sampleRate / 2) */
|
|
1455
|
-
highFreq?: number;
|
|
1456
|
-
/** Dither amount (default: 0 for deterministic output) */
|
|
1457
|
-
dither?: number;
|
|
1458
|
-
/** Preemphasis coefficient (default: 0.97) */
|
|
1459
|
-
preemphasis?: number;
|
|
1460
|
-
}
|
|
1461
|
-
/**
|
|
1462
|
-
* Compute Kaldi-compatible log mel filterbank features
|
|
1438
|
+
* SenseVoice ASR Web Worker implementation
|
|
1463
1439
|
*
|
|
1464
|
-
*
|
|
1465
|
-
*
|
|
1466
|
-
*
|
|
1467
|
-
* @param opts Optional parameters
|
|
1468
|
-
* @returns Flattened Float32Array of shape [numFrames, numMelBins]
|
|
1469
|
-
*/
|
|
1470
|
-
declare function computeKaldiFbank(audio: Float32Array, sampleRate: number, numMelBins: number, opts?: KaldiFbankOptions): Float32Array;
|
|
1471
|
-
/**
|
|
1472
|
-
* Apply Low Frame Rate stacking for SenseVoice
|
|
1440
|
+
* Runs SenseVoice speech recognition in a dedicated Web Worker to prevent
|
|
1441
|
+
* main thread blocking. Uses inline worker script (Blob URL pattern) to
|
|
1442
|
+
* avoid separate file deployment.
|
|
1473
1443
|
*
|
|
1474
|
-
*
|
|
1475
|
-
*
|
|
1444
|
+
* Key design decisions:
|
|
1445
|
+
* - WASM backend only (WebGPU doesn't work in Workers)
|
|
1446
|
+
* - All preprocessing (fbank, LFR, CMVN) and CTC decoding inlined in worker
|
|
1447
|
+
* - Audio copied (not transferred) to retain main thread access
|
|
1448
|
+
* - ONNX Runtime loaded from CDN in worker (no bundler complications)
|
|
1449
|
+
* - iOS: model URL passed as string to ORT (avoids 239MB JS heap allocation)
|
|
1476
1450
|
*
|
|
1477
|
-
* @
|
|
1478
|
-
* @param featureDim Feature dimension per frame (e.g., 80)
|
|
1479
|
-
* @param lfrM Number of frames to stack (default: 7)
|
|
1480
|
-
* @param lfrN Stride (default: 6)
|
|
1481
|
-
* @returns Flattened [numOutputFrames, featureDim * lfrM]
|
|
1482
|
-
*/
|
|
1483
|
-
declare function applyLFR(features: Float32Array, featureDim: number, lfrM?: number, lfrN?: number): Float32Array;
|
|
1484
|
-
/**
|
|
1485
|
-
* Apply CMVN normalization in-place
|
|
1451
|
+
* @category Inference
|
|
1486
1452
|
*
|
|
1487
|
-
*
|
|
1453
|
+
* @example Basic usage
|
|
1454
|
+
* ```typescript
|
|
1455
|
+
* import { SenseVoiceWorker } from '@omote/core';
|
|
1488
1456
|
*
|
|
1489
|
-
*
|
|
1490
|
-
*
|
|
1491
|
-
*
|
|
1492
|
-
*
|
|
1493
|
-
*
|
|
1494
|
-
*/
|
|
1495
|
-
declare function applyCMVN(features: Float32Array, dim: number, negMean: Float32Array, invStddev: Float32Array): Float32Array;
|
|
1496
|
-
/**
|
|
1497
|
-
* Parse CMVN vectors from comma-separated strings (stored in ONNX metadata)
|
|
1457
|
+
* const asr = new SenseVoiceWorker({
|
|
1458
|
+
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
1459
|
+
* tokensUrl: '/models/sensevoice/tokens.txt',
|
|
1460
|
+
* });
|
|
1461
|
+
* await asr.load();
|
|
1498
1462
|
*
|
|
1499
|
-
*
|
|
1500
|
-
*
|
|
1463
|
+
* const { text, emotion, language } = await asr.transcribe(audioSamples);
|
|
1464
|
+
* console.log(text); // "Hello world"
|
|
1465
|
+
* console.log(emotion); // "NEUTRAL"
|
|
1466
|
+
* console.log(language); // "en"
|
|
1467
|
+
* ```
|
|
1501
1468
|
*/
|
|
1502
|
-
declare function parseCMVNFromMetadata(negMeanStr: string, invStddevStr: string): {
|
|
1503
|
-
negMean: Float32Array;
|
|
1504
|
-
invStddev: Float32Array;
|
|
1505
|
-
};
|
|
1506
1469
|
|
|
1507
1470
|
/**
|
|
1508
|
-
*
|
|
1509
|
-
*
|
|
1510
|
-
* Decodes CTC logits into text with structured token parsing
|
|
1511
|
-
* for language, emotion, and audio event detection.
|
|
1512
|
-
*
|
|
1513
|
-
* @module inference/ctcDecoder
|
|
1471
|
+
* Configuration for SenseVoice Worker
|
|
1514
1472
|
*/
|
|
1515
|
-
interface
|
|
1516
|
-
/**
|
|
1517
|
-
|
|
1518
|
-
/**
|
|
1519
|
-
|
|
1520
|
-
/**
|
|
1521
|
-
|
|
1522
|
-
/**
|
|
1523
|
-
|
|
1473
|
+
interface SenseVoiceWorkerConfig {
|
|
1474
|
+
/** Path or URL to model.int8.onnx (239MB) */
|
|
1475
|
+
modelUrl: string;
|
|
1476
|
+
/** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
|
|
1477
|
+
tokensUrl?: string;
|
|
1478
|
+
/** Language hint (default: 'auto' for auto-detection) */
|
|
1479
|
+
language?: SenseVoiceLanguage;
|
|
1480
|
+
/** Text normalization: 'with_itn' applies inverse text normalization (default: 'with_itn') */
|
|
1481
|
+
textNorm?: 'with_itn' | 'without_itn';
|
|
1524
1482
|
}
|
|
1525
|
-
/** Resolve language string to SenseVoice language ID */
|
|
1526
|
-
declare function resolveLanguageId(language: string): number;
|
|
1527
|
-
/** Resolve text norm string to SenseVoice text norm ID */
|
|
1528
|
-
declare function resolveTextNormId(textNorm: string): number;
|
|
1529
1483
|
/**
|
|
1530
|
-
*
|
|
1484
|
+
* SenseVoice ASR Worker - Speech Recognition in a Web Worker
|
|
1531
1485
|
*
|
|
1532
|
-
*
|
|
1533
|
-
*
|
|
1534
|
-
*/
|
|
1535
|
-
declare function parseTokensFile(content: string): Map<number, string>;
|
|
1536
|
-
/**
|
|
1537
|
-
* CTC greedy decode
|
|
1486
|
+
* Runs SenseVoice inference off the main thread to prevent UI blocking.
|
|
1487
|
+
* All preprocessing (fbank, LFR, CMVN) and CTC decoding run in the worker.
|
|
1538
1488
|
*
|
|
1539
|
-
* @
|
|
1540
|
-
* @param seqLen Sequence length (time steps)
|
|
1541
|
-
* @param vocabSize Vocabulary size
|
|
1542
|
-
* @param tokenMap Token ID → string map from tokens.txt
|
|
1543
|
-
* @returns Decoded text and structured metadata
|
|
1489
|
+
* @see SenseVoiceInference for main-thread version
|
|
1544
1490
|
*/
|
|
1545
|
-
declare
|
|
1491
|
+
declare class SenseVoiceWorker {
|
|
1492
|
+
private worker;
|
|
1493
|
+
private config;
|
|
1494
|
+
private isLoading;
|
|
1495
|
+
private _isLoaded;
|
|
1496
|
+
private inferenceQueue;
|
|
1497
|
+
private poisoned;
|
|
1498
|
+
private pendingResolvers;
|
|
1499
|
+
private languageId;
|
|
1500
|
+
private textNormId;
|
|
1501
|
+
constructor(config: SenseVoiceWorkerConfig);
|
|
1502
|
+
get isLoaded(): boolean;
|
|
1503
|
+
/**
|
|
1504
|
+
* Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
|
|
1505
|
+
*/
|
|
1506
|
+
get backend(): 'wasm' | null;
|
|
1507
|
+
/**
|
|
1508
|
+
* Create the worker from inline script
|
|
1509
|
+
*/
|
|
1510
|
+
private createWorker;
|
|
1511
|
+
/**
|
|
1512
|
+
* Handle messages from worker
|
|
1513
|
+
*/
|
|
1514
|
+
private handleWorkerMessage;
|
|
1515
|
+
/**
|
|
1516
|
+
* Send message to worker and wait for response
|
|
1517
|
+
*/
|
|
1518
|
+
private sendMessage;
|
|
1519
|
+
/**
|
|
1520
|
+
* Load the ONNX model in the worker
|
|
1521
|
+
*
|
|
1522
|
+
* @param onProgress - Optional progress callback. Fires once at 100% when load completes
|
|
1523
|
+
* (the worker downloads and loads the model internally, so granular progress is not available).
|
|
1524
|
+
*/
|
|
1525
|
+
load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
|
|
1526
|
+
/**
|
|
1527
|
+
* Transcribe audio samples to text
|
|
1528
|
+
*
|
|
1529
|
+
* @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
|
|
1530
|
+
* @returns Transcription result with text, emotion, language, and event
|
|
1531
|
+
*/
|
|
1532
|
+
transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
|
|
1533
|
+
/**
|
|
1534
|
+
* Queue inference to serialize worker calls
|
|
1535
|
+
*/
|
|
1536
|
+
private queueInference;
|
|
1537
|
+
/**
|
|
1538
|
+
* Dispose of the worker and free resources
|
|
1539
|
+
*/
|
|
1540
|
+
dispose(): Promise<void>;
|
|
1541
|
+
/**
|
|
1542
|
+
* Check if Web Workers are supported
|
|
1543
|
+
*/
|
|
1544
|
+
static isSupported(): boolean;
|
|
1545
|
+
}
|
|
1546
1546
|
|
|
1547
1547
|
/**
|
|
1548
|
-
*
|
|
1549
|
-
*
|
|
1550
|
-
* Contains LAM_BLENDSHAPES (canonical ordering), symmetrization, and
|
|
1551
|
-
* index remapping used by both Wav2Vec2Inference and Wav2ArkitCpuInference.
|
|
1552
|
-
*
|
|
1553
|
-
* This module is the single source of truth for blendshape ordering to
|
|
1554
|
-
* avoid circular dependencies between inference classes.
|
|
1555
|
-
*
|
|
1556
|
-
* @category Inference
|
|
1557
|
-
*/
|
|
1558
|
-
/**
|
|
1559
|
-
* LAM model blendshape names in order (52 total)
|
|
1560
|
-
* NOTE: This is alphabetical ordering used by LAM, different from standard ARKit order
|
|
1561
|
-
*/
|
|
1562
|
-
declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
|
|
1563
|
-
/** Alias for backwards compatibility */
|
|
1564
|
-
declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
|
|
1565
|
-
/**
|
|
1566
|
-
* Symmetrize blendshapes by averaging left/right pairs
|
|
1567
|
-
* From LAM official postprocessing (models/utils.py)
|
|
1568
|
-
* This fixes asymmetric output from the raw model
|
|
1569
|
-
*/
|
|
1570
|
-
declare function symmetrizeBlendshapes(frame: Float32Array): Float32Array;
|
|
1571
|
-
/**
|
|
1572
|
-
* wav2arkit_cpu model blendshape ordering
|
|
1573
|
-
*
|
|
1574
|
-
* Indices 0-24 match LAM_BLENDSHAPES, but 25+ diverge:
|
|
1575
|
-
* - LAM puts jawRight, mouthClose, mouthDimpleLeft, mouthDimpleRight at 25-28
|
|
1576
|
-
* - wav2arkit_cpu puts mouthFrownLeft at 25 and moves those four to 48-51
|
|
1577
|
-
*/
|
|
1578
|
-
declare const WAV2ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "jawRight"];
|
|
1579
|
-
/**
|
|
1580
|
-
* Remap a blendshape frame from wav2arkit_cpu ordering to LAM_BLENDSHAPES ordering
|
|
1548
|
+
* Silero VAD (Voice Activity Detection) inference
|
|
1581
1549
|
*
|
|
1582
|
-
*
|
|
1583
|
-
*
|
|
1584
|
-
*/
|
|
1585
|
-
declare function remapWav2ArkitToLam(frame: Float32Array): Float32Array;
|
|
1586
|
-
|
|
1587
|
-
/**
|
|
1588
|
-
* Unified Wav2Vec2 inference engine for Audio-to-Expression + ASR
|
|
1550
|
+
* Neural network-based VAD running in browser via ONNX Runtime Web.
|
|
1551
|
+
* Much more accurate than RMS-based energy detection.
|
|
1589
1552
|
*
|
|
1590
|
-
*
|
|
1591
|
-
*
|
|
1592
|
-
* -
|
|
1593
|
-
* - 32-token CTC logits (speech recognition)
|
|
1553
|
+
* Uses lazy loading to conditionally load WebGPU or WASM-only bundle:
|
|
1554
|
+
* - iOS: Loads WASM-only bundle (WebGPU crashes due to Safari bugs)
|
|
1555
|
+
* - Android/Desktop: Loads WebGPU bundle (with WASM fallback)
|
|
1594
1556
|
*
|
|
1595
1557
|
* @category Inference
|
|
1596
1558
|
*
|
|
1597
1559
|
* @example Basic usage
|
|
1598
1560
|
* ```typescript
|
|
1599
|
-
* import {
|
|
1561
|
+
* import { SileroVADInference } from '@omote/core';
|
|
1600
1562
|
*
|
|
1601
|
-
* const
|
|
1602
|
-
*
|
|
1563
|
+
* const vad = new SileroVADInference({
|
|
1564
|
+
* modelUrl: '/models/silero-vad.onnx'
|
|
1565
|
+
* });
|
|
1566
|
+
* await vad.load();
|
|
1603
1567
|
*
|
|
1604
|
-
* // Process
|
|
1605
|
-
* const
|
|
1568
|
+
* // Process 32ms chunks (512 samples at 16kHz)
|
|
1569
|
+
* const { probability } = await vad.process(audioChunk);
|
|
1570
|
+
* if (probability > 0.5) {
|
|
1571
|
+
* console.log('Speech detected!');
|
|
1572
|
+
* }
|
|
1573
|
+
* ```
|
|
1606
1574
|
*
|
|
1607
|
-
*
|
|
1608
|
-
*
|
|
1575
|
+
* @example Streaming with state management
|
|
1576
|
+
* ```typescript
|
|
1577
|
+
* // State is automatically maintained between process() calls
|
|
1578
|
+
* // Call reset() when starting a new audio stream
|
|
1579
|
+
* vad.reset();
|
|
1580
|
+
*
|
|
1581
|
+
* for (const chunk of audioChunks) {
|
|
1582
|
+
* const { probability: prob } = await vad.process(chunk);
|
|
1583
|
+
* // prob is speech probability [0, 1]
|
|
1584
|
+
* }
|
|
1609
1585
|
* ```
|
|
1610
1586
|
*/
|
|
1611
1587
|
|
|
1612
|
-
type
|
|
1613
|
-
|
|
1588
|
+
type VADBackend = BackendPreference;
|
|
1589
|
+
/**
|
|
1590
|
+
* Configuration for Silero VAD
|
|
1591
|
+
*/
|
|
1592
|
+
interface SileroVADConfig {
|
|
1614
1593
|
/** Path or URL to the ONNX model */
|
|
1615
1594
|
modelUrl: string;
|
|
1616
|
-
/**
|
|
1617
|
-
* Path or URL to external model data file (.onnx.data weights).
|
|
1618
|
-
* Default: `${modelUrl}.data` (e.g., /models/model.onnx.data)
|
|
1619
|
-
*
|
|
1620
|
-
* Set to `false` to skip external data loading (single-file models only).
|
|
1621
|
-
*/
|
|
1622
|
-
externalDataUrl?: string | false;
|
|
1623
1595
|
/** Preferred backend (auto will try WebGPU first, fallback to WASM) */
|
|
1624
|
-
backend?:
|
|
1625
|
-
/**
|
|
1626
|
-
|
|
1596
|
+
backend?: VADBackend;
|
|
1597
|
+
/** Sample rate (8000 or 16000, default: 16000) */
|
|
1598
|
+
sampleRate?: 8000 | 16000;
|
|
1599
|
+
/** Speech probability threshold (default: 0.5) */
|
|
1600
|
+
threshold?: number;
|
|
1601
|
+
/**
|
|
1602
|
+
* Number of audio chunks to keep in pre-speech buffer.
|
|
1603
|
+
* When VAD triggers, these chunks are prepended to the speech buffer
|
|
1604
|
+
* to capture the beginning of speech that occurred before detection.
|
|
1605
|
+
*
|
|
1606
|
+
* At 512 samples/chunk and 16kHz:
|
|
1607
|
+
* - 10 chunks = 320ms of pre-speech audio
|
|
1608
|
+
* - 15 chunks = 480ms of pre-speech audio
|
|
1609
|
+
*
|
|
1610
|
+
* Default: 10 chunks (320ms)
|
|
1611
|
+
*/
|
|
1612
|
+
preSpeechBufferChunks?: number;
|
|
1627
1613
|
}
|
|
1628
|
-
|
|
1614
|
+
/**
|
|
1615
|
+
* VAD model loading information
|
|
1616
|
+
*/
|
|
1617
|
+
interface VADModelInfo {
|
|
1629
1618
|
backend: 'webgpu' | 'wasm';
|
|
1630
1619
|
loadTimeMs: number;
|
|
1631
1620
|
inputNames: string[];
|
|
1632
1621
|
outputNames: string[];
|
|
1622
|
+
sampleRate: number;
|
|
1623
|
+
chunkSize: number;
|
|
1633
1624
|
}
|
|
1634
|
-
|
|
1635
|
-
|
|
1636
|
-
|
|
1637
|
-
interface
|
|
1638
|
-
/**
|
|
1639
|
-
|
|
1640
|
-
/**
|
|
1641
|
-
|
|
1642
|
-
/**
|
|
1643
|
-
text: string;
|
|
1644
|
-
/** Number of blendshape frames (30fps) — alias for numA2EFrames */
|
|
1645
|
-
numFrames: number;
|
|
1646
|
-
/** Number of A2E frames (30fps) */
|
|
1647
|
-
numA2EFrames: number;
|
|
1648
|
-
/** Number of ASR frames (50fps) */
|
|
1649
|
-
numASRFrames: number;
|
|
1650
|
-
/** Inference time in ms */
|
|
1625
|
+
/**
|
|
1626
|
+
* Result from a single VAD inference
|
|
1627
|
+
*/
|
|
1628
|
+
interface VADResult {
|
|
1629
|
+
/** Speech probability (0-1) */
|
|
1630
|
+
probability: number;
|
|
1631
|
+
/** Whether speech is detected (probability > threshold) */
|
|
1632
|
+
isSpeech: boolean;
|
|
1633
|
+
/** Inference time in milliseconds */
|
|
1651
1634
|
inferenceTimeMs: number;
|
|
1635
|
+
/**
|
|
1636
|
+
* Pre-speech audio chunks (only present on first speech detection).
|
|
1637
|
+
* These are the N chunks immediately before VAD triggered, useful for
|
|
1638
|
+
* capturing the beginning of speech that occurred before detection.
|
|
1639
|
+
*
|
|
1640
|
+
* Only populated when transitioning from silence to speech.
|
|
1641
|
+
*/
|
|
1642
|
+
preSpeechChunks?: Float32Array[];
|
|
1652
1643
|
}
|
|
1653
|
-
|
|
1654
|
-
|
|
1644
|
+
/**
|
|
1645
|
+
* Speech segment detected by VAD
|
|
1646
|
+
*/
|
|
1647
|
+
interface SpeechSegment {
|
|
1648
|
+
/** Start time in seconds */
|
|
1649
|
+
start: number;
|
|
1650
|
+
/** End time in seconds */
|
|
1651
|
+
end: number;
|
|
1652
|
+
/** Average probability during segment */
|
|
1653
|
+
avgProbability: number;
|
|
1654
|
+
}
|
|
1655
|
+
/**
|
|
1656
|
+
* Silero VAD - Neural network voice activity detection
|
|
1657
|
+
*
|
|
1658
|
+
* Based on snakers4/silero-vad ONNX model.
|
|
1659
|
+
* Processes 32ms chunks (512 samples at 16kHz) with LSTM state.
|
|
1660
|
+
*
|
|
1661
|
+
* @see https://github.com/snakers4/silero-vad
|
|
1662
|
+
*/
|
|
1663
|
+
declare class SileroVADInference {
|
|
1655
1664
|
private session;
|
|
1656
1665
|
private ort;
|
|
1657
1666
|
private config;
|
|
1658
1667
|
private _backend;
|
|
1659
1668
|
private isLoading;
|
|
1660
|
-
private
|
|
1669
|
+
private state;
|
|
1670
|
+
private context;
|
|
1671
|
+
private readonly chunkSize;
|
|
1672
|
+
private readonly contextSize;
|
|
1661
1673
|
private inferenceQueue;
|
|
1662
|
-
private
|
|
1663
|
-
private
|
|
1664
|
-
|
|
1674
|
+
private preSpeechBuffer;
|
|
1675
|
+
private wasSpeaking;
|
|
1676
|
+
private srTensor;
|
|
1677
|
+
constructor(config: SileroVADConfig);
|
|
1678
|
+
get backend(): RuntimeBackend | null;
|
|
1679
|
+
get isLoaded(): boolean;
|
|
1680
|
+
get sampleRate(): number;
|
|
1681
|
+
get threshold(): number;
|
|
1682
|
+
/**
|
|
1683
|
+
* Get required chunk size in samples
|
|
1684
|
+
*/
|
|
1685
|
+
getChunkSize(): number;
|
|
1686
|
+
/**
|
|
1687
|
+
* Get chunk duration in milliseconds
|
|
1688
|
+
*/
|
|
1689
|
+
getChunkDurationMs(): number;
|
|
1690
|
+
/**
|
|
1691
|
+
* Check if WebGPU is available and working
|
|
1692
|
+
* (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
|
|
1693
|
+
*/
|
|
1694
|
+
static isWebGPUAvailable: typeof isWebGPUAvailable;
|
|
1695
|
+
/**
|
|
1696
|
+
* Load the ONNX model
|
|
1697
|
+
*/
|
|
1698
|
+
load(): Promise<VADModelInfo>;
|
|
1699
|
+
/**
|
|
1700
|
+
* Reset state for new audio stream
|
|
1701
|
+
*/
|
|
1702
|
+
reset(): void;
|
|
1703
|
+
/**
|
|
1704
|
+
* Process a single audio chunk
|
|
1705
|
+
*
|
|
1706
|
+
* @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
|
|
1707
|
+
* @returns VAD result with speech probability
|
|
1708
|
+
*/
|
|
1709
|
+
process(audioChunk: Float32Array): Promise<VADResult>;
|
|
1710
|
+
/**
|
|
1711
|
+
* Process audio and detect speech segments
|
|
1712
|
+
*
|
|
1713
|
+
* @param audio - Complete audio buffer
|
|
1714
|
+
* @param options - Detection options
|
|
1715
|
+
* @returns Array of speech segments
|
|
1716
|
+
*/
|
|
1717
|
+
detectSpeech(audio: Float32Array, options?: {
|
|
1718
|
+
/** Minimum speech duration in ms (default: 250) */
|
|
1719
|
+
minSpeechDurationMs?: number;
|
|
1720
|
+
/** Minimum silence duration to end segment in ms (default: 300) */
|
|
1721
|
+
minSilenceDurationMs?: number;
|
|
1722
|
+
/** Padding to add before/after speech in ms (default: 30) */
|
|
1723
|
+
speechPadMs?: number;
|
|
1724
|
+
}): Promise<SpeechSegment[]>;
|
|
1725
|
+
/**
|
|
1726
|
+
* Queue inference to serialize ONNX session calls
|
|
1727
|
+
*/
|
|
1728
|
+
private queueInference;
|
|
1729
|
+
/**
|
|
1730
|
+
* Dispose of the model and free resources
|
|
1731
|
+
*/
|
|
1732
|
+
dispose(): Promise<void>;
|
|
1733
|
+
}
|
|
1734
|
+
|
|
1735
|
+
/**
|
|
1736
|
+
* Silero VAD Web Worker implementation
|
|
1737
|
+
*
|
|
1738
|
+
* Runs Silero VAD inference in a dedicated Web Worker to prevent main thread blocking.
|
|
1739
|
+
* Uses inline worker script (Blob URL pattern) to avoid separate file deployment.
|
|
1740
|
+
*
|
|
1741
|
+
* Key design decisions:
|
|
1742
|
+
* - WASM backend only (WebGPU doesn't work in Workers)
|
|
1743
|
+
* - LSTM state serialized as Float32Array (Tensors can't cross worker boundary)
|
|
1744
|
+
* - Audio copied (not transferred) to retain main thread access for pre-speech buffer
|
|
1745
|
+
* - ONNX Runtime loaded from CDN in worker (no bundler complications)
|
|
1746
|
+
*
|
|
1747
|
+
* @category Inference
|
|
1748
|
+
*
|
|
1749
|
+
* @example Basic usage
|
|
1750
|
+
* ```typescript
|
|
1751
|
+
* import { SileroVADWorker } from '@omote/core';
|
|
1752
|
+
*
|
|
1753
|
+
* const vad = new SileroVADWorker({
|
|
1754
|
+
* modelUrl: '/models/silero-vad.onnx'
|
|
1755
|
+
* });
|
|
1756
|
+
* await vad.load();
|
|
1757
|
+
*
|
|
1758
|
+
* // Process 32ms chunks (512 samples at 16kHz)
|
|
1759
|
+
* const result = await vad.process(audioChunk);
|
|
1760
|
+
* if (result.isSpeech) {
|
|
1761
|
+
* console.log('Speech detected!', result.probability);
|
|
1762
|
+
* }
|
|
1763
|
+
* ```
|
|
1764
|
+
*/
|
|
1765
|
+
|
|
1766
|
+
/**
|
|
1767
|
+
* Configuration for Silero VAD Worker
|
|
1768
|
+
*/
|
|
1769
|
+
interface VADWorkerConfig {
|
|
1770
|
+
/** Path or URL to the ONNX model */
|
|
1771
|
+
modelUrl: string;
|
|
1772
|
+
/** Sample rate (8000 or 16000, default: 16000) */
|
|
1773
|
+
sampleRate?: 8000 | 16000;
|
|
1774
|
+
/** Speech probability threshold (default: 0.5) */
|
|
1775
|
+
threshold?: number;
|
|
1776
|
+
/**
|
|
1777
|
+
* Number of audio chunks to keep in pre-speech buffer.
|
|
1778
|
+
* When VAD triggers, these chunks are prepended to the speech buffer
|
|
1779
|
+
* to capture the beginning of speech that occurred before detection.
|
|
1780
|
+
*
|
|
1781
|
+
* At 512 samples/chunk and 16kHz:
|
|
1782
|
+
* - 10 chunks = 320ms of pre-speech audio
|
|
1783
|
+
* - 15 chunks = 480ms of pre-speech audio
|
|
1784
|
+
*
|
|
1785
|
+
* Default: 10 chunks (320ms)
|
|
1786
|
+
*/
|
|
1787
|
+
preSpeechBufferChunks?: number;
|
|
1788
|
+
}
|
|
1789
|
+
/**
|
|
1790
|
+
* VAD model loading information from worker
|
|
1791
|
+
*/
|
|
1792
|
+
interface VADWorkerModelInfo {
|
|
1793
|
+
backend: 'wasm';
|
|
1794
|
+
loadTimeMs: number;
|
|
1795
|
+
inputNames: string[];
|
|
1796
|
+
outputNames: string[];
|
|
1797
|
+
sampleRate: number;
|
|
1798
|
+
chunkSize: number;
|
|
1799
|
+
}
|
|
1800
|
+
|
|
1801
|
+
/**
|
|
1802
|
+
* Silero VAD Worker - Voice Activity Detection in a Web Worker
|
|
1803
|
+
*
|
|
1804
|
+
* Runs Silero VAD inference off the main thread to prevent UI blocking.
|
|
1805
|
+
* Feature parity with SileroVADInference but runs in dedicated worker.
|
|
1806
|
+
*
|
|
1807
|
+
* @see SileroVADInference for main-thread version
|
|
1808
|
+
*/
|
|
1809
|
+
declare class SileroVADWorker {
|
|
1810
|
+
private worker;
|
|
1811
|
+
private config;
|
|
1812
|
+
private isLoading;
|
|
1813
|
+
private _isLoaded;
|
|
1814
|
+
private state;
|
|
1815
|
+
private context;
|
|
1816
|
+
private readonly chunkSize;
|
|
1817
|
+
private readonly contextSize;
|
|
1818
|
+
private inferenceQueue;
|
|
1819
|
+
private preSpeechBuffer;
|
|
1820
|
+
private wasSpeaking;
|
|
1821
|
+
private pendingResolvers;
|
|
1822
|
+
private messageId;
|
|
1823
|
+
constructor(config: VADWorkerConfig);
|
|
1824
|
+
get isLoaded(): boolean;
|
|
1825
|
+
/**
|
|
1826
|
+
* Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
|
|
1827
|
+
*/
|
|
1828
|
+
get backend(): 'wasm' | null;
|
|
1829
|
+
get sampleRate(): number;
|
|
1830
|
+
get threshold(): number;
|
|
1831
|
+
/**
|
|
1832
|
+
* Get required chunk size in samples
|
|
1833
|
+
*/
|
|
1834
|
+
getChunkSize(): number;
|
|
1835
|
+
/**
|
|
1836
|
+
* Get chunk duration in milliseconds
|
|
1837
|
+
*/
|
|
1838
|
+
getChunkDurationMs(): number;
|
|
1839
|
+
/**
|
|
1840
|
+
* Create the worker from inline script
|
|
1841
|
+
*/
|
|
1842
|
+
private createWorker;
|
|
1843
|
+
/**
|
|
1844
|
+
* Handle messages from worker
|
|
1845
|
+
*/
|
|
1846
|
+
private handleWorkerMessage;
|
|
1847
|
+
/**
|
|
1848
|
+
* Send message to worker and wait for response
|
|
1849
|
+
*/
|
|
1850
|
+
private sendMessage;
|
|
1851
|
+
/**
|
|
1852
|
+
* Load the ONNX model in the worker
|
|
1853
|
+
*/
|
|
1854
|
+
load(): Promise<VADWorkerModelInfo>;
|
|
1855
|
+
/**
|
|
1856
|
+
* Reset state for new audio stream
|
|
1857
|
+
*/
|
|
1858
|
+
reset(): Promise<void>;
|
|
1859
|
+
/**
|
|
1860
|
+
* Process a single audio chunk
|
|
1861
|
+
*
|
|
1862
|
+
* @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
|
|
1863
|
+
* @returns VAD result with speech probability
|
|
1864
|
+
*/
|
|
1865
|
+
process(audioChunk: Float32Array): Promise<VADResult>;
|
|
1866
|
+
/**
|
|
1867
|
+
* Queue inference to serialize worker calls
|
|
1868
|
+
*/
|
|
1869
|
+
private queueInference;
|
|
1870
|
+
/**
|
|
1871
|
+
* Dispose of the worker and free resources
|
|
1872
|
+
*/
|
|
1873
|
+
dispose(): Promise<void>;
|
|
1874
|
+
/**
|
|
1875
|
+
* Check if Web Workers are supported
|
|
1876
|
+
*/
|
|
1877
|
+
static isSupported(): boolean;
|
|
1878
|
+
}
|
|
1879
|
+
|
|
1880
|
+
/**
|
|
1881
|
+
* Factory function for Silero VAD with automatic Worker vs main thread selection
|
|
1882
|
+
*
|
|
1883
|
+
* Provides a unified API that automatically selects the optimal implementation:
|
|
1884
|
+
* - Desktop browsers: Uses SileroVADWorker (off-main-thread inference)
|
|
1885
|
+
* - Mobile devices: Uses SileroVADInference (main thread, avoids memory overhead)
|
|
1886
|
+
* - Fallback: Gracefully falls back to main thread if Worker fails
|
|
1887
|
+
*
|
|
1888
|
+
* @category Inference
|
|
1889
|
+
*
|
|
1890
|
+
* @example Basic usage (auto-detect)
|
|
1891
|
+
* ```typescript
|
|
1892
|
+
* import { createSileroVAD } from '@omote/core';
|
|
1893
|
+
*
|
|
1894
|
+
* const vad = createSileroVAD({
|
|
1895
|
+
* modelUrl: '/models/silero-vad.onnx',
|
|
1896
|
+
* threshold: 0.5,
|
|
1897
|
+
* });
|
|
1898
|
+
*
|
|
1899
|
+
* await vad.load();
|
|
1900
|
+
* const result = await vad.process(audioChunk);
|
|
1901
|
+
* if (result.isSpeech) {
|
|
1902
|
+
* console.log('Speech detected!', result.probability);
|
|
1903
|
+
* }
|
|
1904
|
+
* ```
|
|
1905
|
+
*
|
|
1906
|
+
* @example Force worker usage
|
|
1907
|
+
* ```typescript
|
|
1908
|
+
* const vad = createSileroVAD({
|
|
1909
|
+
* modelUrl: '/models/silero-vad.onnx',
|
|
1910
|
+
* useWorker: true, // Force Worker even on mobile
|
|
1911
|
+
* });
|
|
1912
|
+
* ```
|
|
1913
|
+
*
|
|
1914
|
+
* @example Force main thread
|
|
1915
|
+
* ```typescript
|
|
1916
|
+
* const vad = createSileroVAD({
|
|
1917
|
+
* modelUrl: '/models/silero-vad.onnx',
|
|
1918
|
+
* useWorker: false, // Force main thread
|
|
1919
|
+
* });
|
|
1920
|
+
* ```
|
|
1921
|
+
*/
|
|
1922
|
+
|
|
1923
|
+
/**
|
|
1924
|
+
* Common interface for both SileroVADInference and SileroVADWorker
|
|
1925
|
+
*
|
|
1926
|
+
* This interface defines the shared API that both implementations provide,
|
|
1927
|
+
* allowing consumers to use either interchangeably.
|
|
1928
|
+
*/
|
|
1929
|
+
interface SileroVADBackend {
|
|
1930
|
+
/** Current backend type (webgpu, wasm, or null if not loaded) */
|
|
1931
|
+
readonly backend: RuntimeBackend | null;
|
|
1932
|
+
/** Whether the model is loaded and ready for inference */
|
|
1933
|
+
readonly isLoaded: boolean;
|
|
1934
|
+
/** Audio sample rate (8000 or 16000 Hz) */
|
|
1935
|
+
readonly sampleRate: number;
|
|
1936
|
+
/** Speech detection threshold (0-1) */
|
|
1937
|
+
readonly threshold: number;
|
|
1665
1938
|
/**
|
|
1666
|
-
*
|
|
1667
|
-
*
|
|
1939
|
+
* Load the ONNX model
|
|
1940
|
+
* @returns Model loading information
|
|
1668
1941
|
*/
|
|
1669
|
-
|
|
1670
|
-
get backend(): 'webgpu' | 'wasm' | null;
|
|
1671
|
-
get isLoaded(): boolean;
|
|
1672
|
-
/** True if inference timed out and the session is permanently unusable */
|
|
1673
|
-
get isSessionPoisoned(): boolean;
|
|
1942
|
+
load(): Promise<VADModelInfo | VADWorkerModelInfo>;
|
|
1674
1943
|
/**
|
|
1675
|
-
*
|
|
1944
|
+
* Process a single audio chunk
|
|
1945
|
+
* @param audioChunk - Float32Array of exactly chunkSize samples
|
|
1946
|
+
* @returns VAD result with speech probability
|
|
1676
1947
|
*/
|
|
1677
|
-
|
|
1948
|
+
process(audioChunk: Float32Array): Promise<VADResult>;
|
|
1678
1949
|
/**
|
|
1679
|
-
*
|
|
1680
|
-
* @param audioSamples - Float32Array of raw audio at 16kHz (16000 samples = 1 second)
|
|
1681
|
-
* @param identityIndex - Optional identity index (0-11, default 0 = neutral)
|
|
1682
|
-
*
|
|
1683
|
-
* Note: Model expects 1-second chunks (16000 samples) for optimal performance.
|
|
1684
|
-
* Audio will be zero-padded or truncated to 16000 samples.
|
|
1950
|
+
* Reset state for new audio stream
|
|
1685
1951
|
*/
|
|
1686
|
-
|
|
1952
|
+
reset(): void | Promise<void>;
|
|
1687
1953
|
/**
|
|
1688
|
-
*
|
|
1954
|
+
* Dispose of the model and free resources
|
|
1689
1955
|
*/
|
|
1690
|
-
|
|
1956
|
+
dispose(): Promise<void>;
|
|
1691
1957
|
/**
|
|
1692
|
-
*
|
|
1958
|
+
* Get required chunk size in samples
|
|
1693
1959
|
*/
|
|
1694
|
-
|
|
1960
|
+
getChunkSize(): number;
|
|
1695
1961
|
/**
|
|
1696
|
-
* Get
|
|
1962
|
+
* Get chunk duration in milliseconds
|
|
1697
1963
|
*/
|
|
1698
|
-
|
|
1964
|
+
getChunkDurationMs(): number;
|
|
1965
|
+
}
|
|
1966
|
+
/**
|
|
1967
|
+
* Configuration for the Silero VAD factory
|
|
1968
|
+
*
|
|
1969
|
+
* Extends SileroVADConfig with worker-specific options.
|
|
1970
|
+
*/
|
|
1971
|
+
interface SileroVADFactoryConfig extends SileroVADConfig {
|
|
1699
1972
|
/**
|
|
1700
|
-
*
|
|
1973
|
+
* Force worker usage (true), main thread (false), or auto-detect (undefined).
|
|
1974
|
+
*
|
|
1975
|
+
* Auto-detection behavior:
|
|
1976
|
+
* - Desktop: Uses Worker (better responsiveness, off-main-thread)
|
|
1977
|
+
* - Mobile: Uses main thread (avoids 5MB memory overhead)
|
|
1978
|
+
*
|
|
1979
|
+
* You can override this to:
|
|
1980
|
+
* - `true`: Force Worker even on mobile (if you have memory headroom)
|
|
1981
|
+
* - `false`: Force main thread even on desktop (for debugging)
|
|
1982
|
+
*
|
|
1983
|
+
* Default: undefined (auto-detect)
|
|
1701
1984
|
*/
|
|
1702
|
-
|
|
1985
|
+
useWorker?: boolean;
|
|
1986
|
+
/**
|
|
1987
|
+
* Fallback to main thread on worker errors.
|
|
1988
|
+
*
|
|
1989
|
+
* When true (default), if the Worker fails to load or encounters an error,
|
|
1990
|
+
* the factory will automatically create a main thread instance instead.
|
|
1991
|
+
*
|
|
1992
|
+
* When false, worker errors will propagate as exceptions.
|
|
1993
|
+
*
|
|
1994
|
+
* Default: true
|
|
1995
|
+
*/
|
|
1996
|
+
fallbackOnError?: boolean;
|
|
1997
|
+
/**
|
|
1998
|
+
* Unified inference worker instance.
|
|
1999
|
+
* When provided, uses SileroVADUnifiedAdapter (shared single-ORT worker).
|
|
2000
|
+
* Takes precedence over useWorker setting.
|
|
2001
|
+
*/
|
|
2002
|
+
unifiedWorker?: UnifiedInferenceWorker;
|
|
1703
2003
|
}
|
|
1704
|
-
|
|
1705
2004
|
/**
|
|
1706
|
-
*
|
|
2005
|
+
* Check if the current environment supports VAD Web Workers
|
|
1707
2006
|
*
|
|
1708
|
-
*
|
|
1709
|
-
*
|
|
2007
|
+
* Requirements:
|
|
2008
|
+
* - Worker constructor must exist
|
|
2009
|
+
* - Blob URL support (for inline worker script)
|
|
1710
2010
|
*
|
|
1711
|
-
*
|
|
1712
|
-
|
|
1713
|
-
|
|
1714
|
-
|
|
2011
|
+
* @returns true if VAD Worker is supported
|
|
2012
|
+
*/
|
|
2013
|
+
declare function supportsVADWorker(): boolean;
|
|
2014
|
+
/**
|
|
2015
|
+
* Create a Silero VAD instance with automatic implementation selection
|
|
1715
2016
|
*
|
|
1716
|
-
*
|
|
1717
|
-
* -
|
|
1718
|
-
* -
|
|
1719
|
-
*
|
|
1720
|
-
*
|
|
1721
|
-
*
|
|
2017
|
+
* This factory function automatically selects between:
|
|
2018
|
+
* - **SileroVADWorker**: Off-main-thread inference (better for desktop)
|
|
2019
|
+
* - **SileroVADInference**: Main thread inference (better for mobile)
|
|
2020
|
+
*
|
|
2021
|
+
* The selection is based on:
|
|
2022
|
+
* 1. Explicit `useWorker` config (if provided)
|
|
2023
|
+
* 2. Platform detection (mobile vs desktop)
|
|
2024
|
+
* 3. Worker API availability
|
|
2025
|
+
*
|
|
2026
|
+
* Both implementations share the same interface (SileroVADBackend),
|
|
2027
|
+
* so consumers can use either interchangeably.
|
|
2028
|
+
*
|
|
2029
|
+
* @param config - Factory configuration
|
|
2030
|
+
* @returns A SileroVAD instance (either Worker or main thread)
|
|
2031
|
+
*
|
|
2032
|
+
* @example
|
|
2033
|
+
* ```typescript
|
|
2034
|
+
* // Auto-detect (recommended)
|
|
2035
|
+
* const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
|
|
2036
|
+
*
|
|
2037
|
+
* // Force Worker
|
|
2038
|
+
* const vadWorker = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: true });
|
|
2039
|
+
*
|
|
2040
|
+
* // Force main thread
|
|
2041
|
+
* const vadMain = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: false });
|
|
2042
|
+
* ```
|
|
2043
|
+
*/
|
|
2044
|
+
declare function createSileroVAD(config: SileroVADFactoryConfig): SileroVADBackend;
|
|
2045
|
+
|
|
2046
|
+
/**
|
|
2047
|
+
* Web Worker-based wav2arkit_cpu lip sync inference
|
|
2048
|
+
*
|
|
2049
|
+
* Runs wav2arkit_cpu inference in a dedicated Web Worker to prevent main thread blocking.
|
|
2050
|
+
* Uses inline worker script (Blob URL pattern) to avoid separate file deployment.
|
|
2051
|
+
*
|
|
2052
|
+
* Key design decisions:
|
|
2053
|
+
* - WASM backend only (WebGPU doesn't work in Workers)
|
|
2054
|
+
* - Audio copied (not transferred) to retain main thread access
|
|
2055
|
+
* - ONNX Runtime loaded from CDN in worker (no bundler complications)
|
|
2056
|
+
* - Blendshape symmetrization inlined in worker (no module imports)
|
|
2057
|
+
* - iOS: passes model URLs as strings directly to ORT (avoids 400MB+ JS heap)
|
|
1722
2058
|
*
|
|
1723
2059
|
* @category Inference
|
|
1724
2060
|
*
|
|
1725
2061
|
* @example
|
|
1726
2062
|
* ```typescript
|
|
1727
|
-
* import {
|
|
2063
|
+
* import { Wav2ArkitCpuWorker } from '@omote/core';
|
|
1728
2064
|
*
|
|
1729
|
-
* const lam = new
|
|
2065
|
+
* const lam = new Wav2ArkitCpuWorker({
|
|
1730
2066
|
* modelUrl: '/models/wav2arkit_cpu.onnx',
|
|
1731
2067
|
* });
|
|
1732
2068
|
* await lam.load();
|
|
@@ -1736,7 +2072,10 @@ declare class Wav2Vec2Inference implements LipSyncBackend {
|
|
|
1736
2072
|
* ```
|
|
1737
2073
|
*/
|
|
1738
2074
|
|
|
1739
|
-
|
|
2075
|
+
/**
|
|
2076
|
+
* Configuration for Wav2ArkitCpu Worker
|
|
2077
|
+
*/
|
|
2078
|
+
interface Wav2ArkitCpuWorkerConfig {
|
|
1740
2079
|
/** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
|
|
1741
2080
|
modelUrl: string;
|
|
1742
2081
|
/**
|
|
@@ -1746,24 +2085,44 @@ interface Wav2ArkitCpuConfig {
|
|
|
1746
2085
|
* Set to `false` to skip external data loading (single-file models only).
|
|
1747
2086
|
*/
|
|
1748
2087
|
externalDataUrl?: string | false;
|
|
1749
|
-
/** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
|
|
1750
|
-
backend?: BackendPreference;
|
|
1751
2088
|
}
|
|
1752
|
-
|
|
2089
|
+
/**
|
|
2090
|
+
* Wav2ArkitCpu Worker - Lip sync inference in a Web Worker
|
|
2091
|
+
*
|
|
2092
|
+
* Runs wav2arkit_cpu inference off the main thread to prevent UI blocking.
|
|
2093
|
+
* Feature parity with Wav2ArkitCpuInference but runs in dedicated worker.
|
|
2094
|
+
*
|
|
2095
|
+
* @see Wav2ArkitCpuInference for main-thread version
|
|
2096
|
+
*/
|
|
2097
|
+
declare class Wav2ArkitCpuWorker implements LipSyncBackend {
|
|
1753
2098
|
readonly modelId: "wav2arkit_cpu";
|
|
1754
|
-
private
|
|
1755
|
-
private ort;
|
|
2099
|
+
private worker;
|
|
1756
2100
|
private config;
|
|
1757
|
-
private _backend;
|
|
1758
2101
|
private isLoading;
|
|
2102
|
+
private _isLoaded;
|
|
1759
2103
|
private inferenceQueue;
|
|
1760
2104
|
private poisoned;
|
|
1761
|
-
private
|
|
1762
|
-
constructor(config:
|
|
1763
|
-
get backend(): RuntimeBackend | null;
|
|
2105
|
+
private pendingResolvers;
|
|
2106
|
+
constructor(config: Wav2ArkitCpuWorkerConfig);
|
|
1764
2107
|
get isLoaded(): boolean;
|
|
1765
2108
|
/**
|
|
1766
|
-
*
|
|
2109
|
+
* Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
|
|
2110
|
+
*/
|
|
2111
|
+
get backend(): 'wasm' | null;
|
|
2112
|
+
/**
|
|
2113
|
+
* Create the worker from inline script
|
|
2114
|
+
*/
|
|
2115
|
+
private createWorker;
|
|
2116
|
+
/**
|
|
2117
|
+
* Handle messages from worker
|
|
2118
|
+
*/
|
|
2119
|
+
private handleWorkerMessage;
|
|
2120
|
+
/**
|
|
2121
|
+
* Send message to worker and wait for response
|
|
2122
|
+
*/
|
|
2123
|
+
private sendMessage;
|
|
2124
|
+
/**
|
|
2125
|
+
* Load the ONNX model in the worker
|
|
1767
2126
|
*/
|
|
1768
2127
|
load(): Promise<LipSyncModelInfo>;
|
|
1769
2128
|
/**
|
|
@@ -1777,280 +2136,524 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
|
|
|
1777
2136
|
*/
|
|
1778
2137
|
infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
|
|
1779
2138
|
/**
|
|
1780
|
-
* Queue inference to serialize
|
|
2139
|
+
* Queue inference to serialize worker calls
|
|
1781
2140
|
*/
|
|
1782
2141
|
private queueInference;
|
|
1783
2142
|
/**
|
|
1784
|
-
* Dispose of the
|
|
2143
|
+
* Dispose of the worker and free resources
|
|
2144
|
+
*/
|
|
2145
|
+
dispose(): Promise<void>;
|
|
2146
|
+
/**
|
|
2147
|
+
* Check if Web Workers are supported
|
|
1785
2148
|
*/
|
|
2149
|
+
static isSupported(): boolean;
|
|
2150
|
+
}
|
|
2151
|
+
|
|
2152
|
+
/**
|
|
2153
|
+
* Unified Inference Worker — single Web Worker hosting all WASM models
|
|
2154
|
+
*
|
|
2155
|
+
* Solves the multi-worker ORT problem: three per-model workers each load their
|
|
2156
|
+
* own ORT WASM instance (~40MB each). On iOS this exceeds the ~1-1.5GB tab
|
|
2157
|
+
* limit, forcing main-thread fallback which blocks the render loop.
|
|
2158
|
+
*
|
|
2159
|
+
* This worker hosts SenseVoice + Wav2ArkitCpu + Silero VAD in a single
|
|
2160
|
+
* ORT WASM instance. Same total model memory (~643MB), but inference runs
|
|
2161
|
+
* off-main-thread. Works on iOS because there's only one ORT instance.
|
|
2162
|
+
*
|
|
2163
|
+
* Consumer usage:
|
|
2164
|
+
* ```typescript
|
|
2165
|
+
* const worker = new UnifiedInferenceWorker();
|
|
2166
|
+
* await worker.init();
|
|
2167
|
+
*
|
|
2168
|
+
* const asr = createSenseVoice({ modelUrl: '...', unifiedWorker: worker });
|
|
2169
|
+
* const lam = createLipSync({ gpuModelUrl: '...', cpuModelUrl: '...', unifiedWorker: worker });
|
|
2170
|
+
* const vad = createSileroVAD({ modelUrl: '...', unifiedWorker: worker });
|
|
2171
|
+
* ```
|
|
2172
|
+
*
|
|
2173
|
+
* @category Inference
|
|
2174
|
+
*/
|
|
2175
|
+
|
|
2176
|
+
/**
|
|
2177
|
+
* Unified Inference Worker — single Web Worker for all WASM models
|
|
2178
|
+
*
|
|
2179
|
+
* Hosts SenseVoice, Wav2ArkitCpu, and Silero VAD in one ORT instance.
|
|
2180
|
+
* Eliminates the multi-worker memory problem on iOS.
|
|
2181
|
+
*/
|
|
2182
|
+
declare class UnifiedInferenceWorker {
|
|
2183
|
+
private worker;
|
|
2184
|
+
private pendingRequests;
|
|
2185
|
+
private initialized;
|
|
2186
|
+
private poisoned;
|
|
2187
|
+
/**
|
|
2188
|
+
* Initialize the worker (load ORT WASM from CDN)
|
|
2189
|
+
*/
|
|
2190
|
+
init(): Promise<void>;
|
|
2191
|
+
loadSenseVoice(config: {
|
|
2192
|
+
modelUrl: string;
|
|
2193
|
+
tokensUrl: string;
|
|
2194
|
+
language: number;
|
|
2195
|
+
textNorm: number;
|
|
2196
|
+
}): Promise<SenseVoiceModelInfo>;
|
|
2197
|
+
transcribe(audio: Float32Array): Promise<SenseVoiceResult>;
|
|
2198
|
+
disposeSenseVoice(): Promise<void>;
|
|
2199
|
+
loadLipSync(config: {
|
|
2200
|
+
modelUrl: string;
|
|
2201
|
+
externalDataUrl: string | null;
|
|
2202
|
+
}): Promise<LipSyncModelInfo>;
|
|
2203
|
+
inferLipSync(audio: Float32Array): Promise<{
|
|
2204
|
+
blendshapes: Float32Array;
|
|
2205
|
+
numFrames: number;
|
|
2206
|
+
numBlendshapes: number;
|
|
2207
|
+
inferenceTimeMs: number;
|
|
2208
|
+
}>;
|
|
2209
|
+
disposeLipSync(): Promise<void>;
|
|
2210
|
+
loadVAD(config: {
|
|
2211
|
+
modelUrl: string;
|
|
2212
|
+
sampleRate: number;
|
|
2213
|
+
}): Promise<VADWorkerModelInfo>;
|
|
2214
|
+
processVAD(audio: Float32Array, state: Float32Array, context: Float32Array): Promise<{
|
|
2215
|
+
probability: number;
|
|
2216
|
+
state: Float32Array;
|
|
2217
|
+
inferenceTimeMs: number;
|
|
2218
|
+
}>;
|
|
2219
|
+
resetVAD(): Promise<Float32Array>;
|
|
2220
|
+
disposeVAD(): Promise<void>;
|
|
2221
|
+
dispose(): Promise<void>;
|
|
2222
|
+
/** Check if the worker is initialized and not poisoned */
|
|
2223
|
+
get isReady(): boolean;
|
|
2224
|
+
/** Check if Web Workers are supported */
|
|
2225
|
+
static isSupported(): boolean;
|
|
2226
|
+
private assertReady;
|
|
2227
|
+
private createWorker;
|
|
2228
|
+
private handleWorkerMessage;
|
|
2229
|
+
private sendMessage;
|
|
2230
|
+
private rejectAllPending;
|
|
2231
|
+
private cleanup;
|
|
2232
|
+
}
|
|
2233
|
+
/**
|
|
2234
|
+
* SenseVoice adapter backed by UnifiedInferenceWorker
|
|
2235
|
+
*
|
|
2236
|
+
* Implements SenseVoiceBackend, delegating all inference to the shared worker.
|
|
2237
|
+
*/
|
|
2238
|
+
declare class SenseVoiceUnifiedAdapter implements SenseVoiceBackend {
|
|
2239
|
+
private worker;
|
|
2240
|
+
private config;
|
|
2241
|
+
private _isLoaded;
|
|
2242
|
+
private languageId;
|
|
2243
|
+
private textNormId;
|
|
2244
|
+
private inferenceQueue;
|
|
2245
|
+
constructor(worker: UnifiedInferenceWorker, config: SenseVoiceWorkerConfig);
|
|
2246
|
+
get isLoaded(): boolean;
|
|
2247
|
+
get backend(): 'wasm' | null;
|
|
2248
|
+
load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
|
|
2249
|
+
transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
|
|
2250
|
+
dispose(): Promise<void>;
|
|
2251
|
+
}
|
|
2252
|
+
/**
|
|
2253
|
+
* Wav2ArkitCpu adapter backed by UnifiedInferenceWorker
|
|
2254
|
+
*
|
|
2255
|
+
* Implements LipSyncBackend, delegating all inference to the shared worker.
|
|
2256
|
+
*/
|
|
2257
|
+
declare class Wav2ArkitCpuUnifiedAdapter implements LipSyncBackend {
|
|
2258
|
+
readonly modelId: "wav2arkit_cpu";
|
|
2259
|
+
private worker;
|
|
2260
|
+
private config;
|
|
2261
|
+
private _isLoaded;
|
|
2262
|
+
private inferenceQueue;
|
|
2263
|
+
constructor(worker: UnifiedInferenceWorker, config: Wav2ArkitCpuWorkerConfig);
|
|
2264
|
+
get isLoaded(): boolean;
|
|
2265
|
+
get backend(): RuntimeBackend | null;
|
|
2266
|
+
load(): Promise<LipSyncModelInfo>;
|
|
2267
|
+
infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
|
|
2268
|
+
dispose(): Promise<void>;
|
|
2269
|
+
}
|
|
2270
|
+
/**
|
|
2271
|
+
* Silero VAD adapter backed by UnifiedInferenceWorker
|
|
2272
|
+
*
|
|
2273
|
+
* Implements SileroVADBackend, delegating all inference to the shared worker.
|
|
2274
|
+
*/
|
|
2275
|
+
declare class SileroVADUnifiedAdapter implements SileroVADBackend {
|
|
2276
|
+
private worker;
|
|
2277
|
+
private config;
|
|
2278
|
+
private _isLoaded;
|
|
2279
|
+
private state;
|
|
2280
|
+
private context;
|
|
2281
|
+
private readonly chunkSize;
|
|
2282
|
+
private readonly contextSize;
|
|
2283
|
+
private inferenceQueue;
|
|
2284
|
+
private preSpeechBuffer;
|
|
2285
|
+
private wasSpeaking;
|
|
2286
|
+
constructor(worker: UnifiedInferenceWorker, config: SileroVADConfig);
|
|
2287
|
+
get isLoaded(): boolean;
|
|
2288
|
+
get backend(): RuntimeBackend | null;
|
|
2289
|
+
get sampleRate(): number;
|
|
2290
|
+
get threshold(): number;
|
|
2291
|
+
getChunkSize(): number;
|
|
2292
|
+
getChunkDurationMs(): number;
|
|
2293
|
+
load(): Promise<VADWorkerModelInfo>;
|
|
2294
|
+
process(audioChunk: Float32Array): Promise<VADResult>;
|
|
2295
|
+
reset(): Promise<void>;
|
|
1786
2296
|
dispose(): Promise<void>;
|
|
1787
2297
|
}
|
|
1788
2298
|
|
|
1789
2299
|
/**
|
|
1790
|
-
* Factory function for
|
|
1791
|
-
*
|
|
1792
|
-
* Provides a unified API that automatically selects the optimal model:
|
|
1793
|
-
* - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (404MB, WASM)
|
|
1794
|
-
* - Chrome/Firefox/Edge: Uses Wav2Vec2Inference (384MB, WebGPU)
|
|
1795
|
-
* - Fallback: Gracefully falls back to CPU model if GPU model fails to load
|
|
2300
|
+
* Factory function for SenseVoice ASR with automatic Worker vs main thread selection
|
|
1796
2301
|
*
|
|
1797
|
-
*
|
|
1798
|
-
*
|
|
1799
|
-
*
|
|
1800
|
-
* creation (graph optimization), exceeding iOS WebKit's ~1-1.5GB tab limit.
|
|
1801
|
-
* 2. It ships as a single 384MB .onnx file that must load into JS heap before
|
|
1802
|
-
* ORT can consume it. iOS WebKit OOMs on this allocation.
|
|
1803
|
-
* wav2arkit_cpu solves both: external data format (1.86MB graph + 402MB weights)
|
|
1804
|
-
* lets ORT load only the tiny graph, then stream weights via URL pass-through
|
|
1805
|
-
* directly into WASM memory. JS heap stays at ~2MB.
|
|
2302
|
+
* Provides a unified API that automatically selects the optimal implementation:
|
|
2303
|
+
* - Worker supported: Uses SenseVoiceWorker (off-main-thread inference)
|
|
2304
|
+
* - Worker unsupported: Uses SenseVoiceInference (main thread)
|
|
1806
2305
|
*
|
|
1807
2306
|
* @category Inference
|
|
1808
2307
|
*
|
|
1809
2308
|
* @example Auto-detect (recommended)
|
|
1810
2309
|
* ```typescript
|
|
1811
|
-
* import {
|
|
2310
|
+
* import { createSenseVoice } from '@omote/core';
|
|
1812
2311
|
*
|
|
1813
|
-
* const
|
|
1814
|
-
*
|
|
1815
|
-
* cpuModelUrl: '/models/wav2arkit_cpu.onnx',
|
|
2312
|
+
* const asr = createSenseVoice({
|
|
2313
|
+
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
1816
2314
|
* });
|
|
2315
|
+
* await asr.load();
|
|
2316
|
+
* const { text, emotion } = await asr.transcribe(audioSamples);
|
|
2317
|
+
* ```
|
|
1817
2318
|
*
|
|
1818
|
-
*
|
|
1819
|
-
*
|
|
2319
|
+
* @example Force worker
|
|
2320
|
+
* ```typescript
|
|
2321
|
+
* const asr = createSenseVoice({
|
|
2322
|
+
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
2323
|
+
* useWorker: true,
|
|
2324
|
+
* });
|
|
1820
2325
|
* ```
|
|
1821
2326
|
*
|
|
1822
|
-
* @example Force
|
|
2327
|
+
* @example Force main thread
|
|
1823
2328
|
* ```typescript
|
|
1824
|
-
* const
|
|
1825
|
-
*
|
|
1826
|
-
*
|
|
1827
|
-
* mode: 'cpu',
|
|
2329
|
+
* const asr = createSenseVoice({
|
|
2330
|
+
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
2331
|
+
* useWorker: false,
|
|
1828
2332
|
* });
|
|
1829
2333
|
* ```
|
|
1830
2334
|
*/
|
|
1831
2335
|
|
|
1832
2336
|
/**
|
|
1833
|
-
*
|
|
2337
|
+
* Common interface for both SenseVoiceInference and SenseVoiceWorker
|
|
1834
2338
|
*/
|
|
1835
|
-
interface
|
|
1836
|
-
/**
|
|
1837
|
-
|
|
2339
|
+
interface SenseVoiceBackend {
|
|
2340
|
+
/** Whether the model is loaded and ready for inference */
|
|
2341
|
+
readonly isLoaded: boolean;
|
|
2342
|
+
/** Current backend type ('wasm' for worker, full backend for main thread, null if not loaded) */
|
|
2343
|
+
readonly backend: 'wasm' | 'webgpu' | null;
|
|
1838
2344
|
/**
|
|
1839
|
-
*
|
|
1840
|
-
*
|
|
1841
|
-
*
|
|
1842
|
-
* Set to `false` to skip external data loading (single-file models only).
|
|
2345
|
+
* Load the ONNX model
|
|
2346
|
+
* @param onProgress - Optional progress callback (fires once at 100% for worker)
|
|
2347
|
+
* @returns Model loading information
|
|
1843
2348
|
*/
|
|
1844
|
-
|
|
1845
|
-
/** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS) */
|
|
1846
|
-
cpuModelUrl: string;
|
|
2349
|
+
load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
|
|
1847
2350
|
/**
|
|
1848
|
-
*
|
|
1849
|
-
*
|
|
1850
|
-
*
|
|
1851
|
-
* - 'cpu': Force CPU model (Wav2ArkitCpuInference)
|
|
2351
|
+
* Transcribe audio samples to text
|
|
2352
|
+
* @param audioSamples - Float32Array of audio samples at 16kHz
|
|
2353
|
+
* @returns Transcription result
|
|
1852
2354
|
*/
|
|
1853
|
-
|
|
1854
|
-
/** Backend preference for GPU model (default: 'auto') */
|
|
1855
|
-
gpuBackend?: BackendPreference;
|
|
1856
|
-
/** Number of identity classes for GPU model (default: 12) */
|
|
1857
|
-
numIdentityClasses?: number;
|
|
2355
|
+
transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
|
|
1858
2356
|
/**
|
|
1859
|
-
*
|
|
1860
|
-
* Only applies when mode is 'auto' or 'gpu'
|
|
2357
|
+
* Dispose of the model and free resources
|
|
1861
2358
|
*/
|
|
1862
|
-
|
|
2359
|
+
dispose(): Promise<void>;
|
|
1863
2360
|
}
|
|
1864
2361
|
/**
|
|
1865
|
-
*
|
|
2362
|
+
* Configuration for the SenseVoice factory
|
|
2363
|
+
*/
|
|
2364
|
+
interface CreateSenseVoiceConfig {
|
|
2365
|
+
/** Path or URL to model.int8.onnx (239MB) */
|
|
2366
|
+
modelUrl: string;
|
|
2367
|
+
/** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
|
|
2368
|
+
tokensUrl?: string;
|
|
2369
|
+
/** Language hint (default: 'auto') */
|
|
2370
|
+
language?: SenseVoiceLanguage;
|
|
2371
|
+
/** Text normalization (default: 'with_itn') */
|
|
2372
|
+
textNorm?: 'with_itn' | 'without_itn';
|
|
2373
|
+
/**
|
|
2374
|
+
* Worker mode:
|
|
2375
|
+
* - 'auto' (default): Use Worker if supported, else main thread
|
|
2376
|
+
* - true: Force Worker (throws if unsupported)
|
|
2377
|
+
* - false: Force main thread
|
|
2378
|
+
*/
|
|
2379
|
+
useWorker?: boolean | 'auto';
|
|
2380
|
+
/**
|
|
2381
|
+
* Unified inference worker instance.
|
|
2382
|
+
* When provided, uses SenseVoiceUnifiedAdapter (shared single-ORT worker).
|
|
2383
|
+
* Takes precedence over useWorker setting.
|
|
2384
|
+
*/
|
|
2385
|
+
unifiedWorker?: UnifiedInferenceWorker;
|
|
2386
|
+
}
|
|
2387
|
+
/**
|
|
2388
|
+
* Create a SenseVoice ASR instance with automatic implementation selection
|
|
1866
2389
|
*
|
|
1867
2390
|
* @param config - Factory configuration
|
|
1868
|
-
* @returns A
|
|
2391
|
+
* @returns A SenseVoiceBackend instance (either Worker or main thread)
|
|
1869
2392
|
*/
|
|
1870
|
-
declare function
|
|
2393
|
+
declare function createSenseVoice(config: CreateSenseVoiceConfig): SenseVoiceBackend;
|
|
1871
2394
|
|
|
1872
2395
|
/**
|
|
1873
|
-
*
|
|
2396
|
+
* Kaldi-compatible filterbank (fbank) feature extraction
|
|
1874
2397
|
*
|
|
1875
|
-
*
|
|
1876
|
-
*
|
|
2398
|
+
* Pure TypeScript implementation matching kaldi-native-fbank parameters
|
|
2399
|
+
* used by SenseVoice. No external dependencies.
|
|
1877
2400
|
*
|
|
1878
|
-
*
|
|
1879
|
-
*
|
|
1880
|
-
*
|
|
2401
|
+
* Pipeline: audio → framing → windowing → FFT → power spectrum → mel filterbank → log
|
|
2402
|
+
*
|
|
2403
|
+
* @module inference/kaldiFbank
|
|
2404
|
+
*/
|
|
2405
|
+
interface KaldiFbankOptions {
|
|
2406
|
+
/** Frame length in ms (default: 25) */
|
|
2407
|
+
frameLengthMs?: number;
|
|
2408
|
+
/** Frame shift in ms (default: 10) */
|
|
2409
|
+
frameShiftMs?: number;
|
|
2410
|
+
/** Low frequency cutoff in Hz (default: 20) */
|
|
2411
|
+
lowFreq?: number;
|
|
2412
|
+
/** High frequency cutoff in Hz (default: sampleRate / 2) */
|
|
2413
|
+
highFreq?: number;
|
|
2414
|
+
/** Dither amount (default: 0 for deterministic output) */
|
|
2415
|
+
dither?: number;
|
|
2416
|
+
/** Preemphasis coefficient (default: 0.97) */
|
|
2417
|
+
preemphasis?: number;
|
|
2418
|
+
}
|
|
2419
|
+
/**
|
|
2420
|
+
* Compute Kaldi-compatible log mel filterbank features
|
|
2421
|
+
*
|
|
2422
|
+
* @param audio Raw audio samples (float32, [-1, 1] range)
|
|
2423
|
+
* @param sampleRate Sample rate in Hz (must be 16000 for SenseVoice)
|
|
2424
|
+
* @param numMelBins Number of mel bins (80 for SenseVoice)
|
|
2425
|
+
* @param opts Optional parameters
|
|
2426
|
+
* @returns Flattened Float32Array of shape [numFrames, numMelBins]
|
|
2427
|
+
*/
|
|
2428
|
+
declare function computeKaldiFbank(audio: Float32Array, sampleRate: number, numMelBins: number, opts?: KaldiFbankOptions): Float32Array;
|
|
2429
|
+
/**
|
|
2430
|
+
* Apply Low Frame Rate stacking for SenseVoice
|
|
2431
|
+
*
|
|
2432
|
+
* Concatenates lfrM consecutive frames with stride lfrN.
|
|
2433
|
+
* Left-pads with copies of first frame, right-pads last group.
|
|
2434
|
+
*
|
|
2435
|
+
* @param features Flattened [numFrames, featureDim]
|
|
2436
|
+
* @param featureDim Feature dimension per frame (e.g., 80)
|
|
2437
|
+
* @param lfrM Number of frames to stack (default: 7)
|
|
2438
|
+
* @param lfrN Stride (default: 6)
|
|
2439
|
+
* @returns Flattened [numOutputFrames, featureDim * lfrM]
|
|
2440
|
+
*/
|
|
2441
|
+
declare function applyLFR(features: Float32Array, featureDim: number, lfrM?: number, lfrN?: number): Float32Array;
|
|
2442
|
+
/**
|
|
2443
|
+
* Apply CMVN normalization in-place
|
|
2444
|
+
*
|
|
2445
|
+
* Formula: normalized[i] = (features[i] + negMean[i % dim]) * invStddev[i % dim]
|
|
2446
|
+
*
|
|
2447
|
+
* @param features Flattened feature array (modified in-place)
|
|
2448
|
+
* @param dim Feature dimension (560 for SenseVoice after LFR)
|
|
2449
|
+
* @param negMean Negative mean vector (dim-dimensional)
|
|
2450
|
+
* @param invStddev Inverse standard deviation vector (dim-dimensional)
|
|
2451
|
+
* @returns The same features array (for chaining)
|
|
2452
|
+
*/
|
|
2453
|
+
declare function applyCMVN(features: Float32Array, dim: number, negMean: Float32Array, invStddev: Float32Array): Float32Array;
|
|
2454
|
+
/**
|
|
2455
|
+
* Parse CMVN vectors from comma-separated strings (stored in ONNX metadata)
|
|
2456
|
+
*
|
|
2457
|
+
* The sherpa-onnx SenseVoice export stores neg_mean and inv_stddev
|
|
2458
|
+
* as comma-separated float strings in the model's metadata.
|
|
2459
|
+
*/
|
|
2460
|
+
declare function parseCMVNFromMetadata(negMeanStr: string, invStddevStr: string): {
|
|
2461
|
+
negMean: Float32Array;
|
|
2462
|
+
invStddev: Float32Array;
|
|
2463
|
+
};
|
|
2464
|
+
|
|
2465
|
+
/**
|
|
2466
|
+
* CTC greedy decoder for SenseVoice
|
|
2467
|
+
*
|
|
2468
|
+
* Decodes CTC logits into text with structured token parsing
|
|
2469
|
+
* for language, emotion, and audio event detection.
|
|
2470
|
+
*
|
|
2471
|
+
* @module inference/ctcDecoder
|
|
2472
|
+
*/
|
|
2473
|
+
interface CTCDecodeResult {
|
|
2474
|
+
/** Decoded text (speech content only) */
|
|
2475
|
+
text: string;
|
|
2476
|
+
/** Detected language (e.g., 'zh', 'en', 'ja', 'ko', 'yue') */
|
|
2477
|
+
language?: string;
|
|
2478
|
+
/** Detected emotion (e.g., 'HAPPY', 'SAD', 'ANGRY', 'NEUTRAL') */
|
|
2479
|
+
emotion?: string;
|
|
2480
|
+
/** Detected audio event (e.g., 'Speech', 'BGM', 'Laughter') */
|
|
2481
|
+
event?: string;
|
|
2482
|
+
}
|
|
2483
|
+
/** Resolve language string to SenseVoice language ID */
|
|
2484
|
+
declare function resolveLanguageId(language: string): number;
|
|
2485
|
+
/** Resolve text norm string to SenseVoice text norm ID */
|
|
2486
|
+
declare function resolveTextNormId(textNorm: string): number;
|
|
2487
|
+
/**
|
|
2488
|
+
* Parse tokens.txt into a token ID → string map
|
|
2489
|
+
*
|
|
2490
|
+
* Format: each line is "token_string token_id"
|
|
2491
|
+
* e.g., "<unk> 0", "▁the 3", "s 4"
|
|
2492
|
+
*/
|
|
2493
|
+
declare function parseTokensFile(content: string): Map<number, string>;
|
|
2494
|
+
/**
|
|
2495
|
+
* CTC greedy decode
|
|
2496
|
+
*
|
|
2497
|
+
* @param logits Raw logits from model output, flattened [seqLen, vocabSize]
|
|
2498
|
+
* @param seqLen Sequence length (time steps)
|
|
2499
|
+
* @param vocabSize Vocabulary size
|
|
2500
|
+
* @param tokenMap Token ID → string map from tokens.txt
|
|
2501
|
+
* @returns Decoded text and structured metadata
|
|
2502
|
+
*/
|
|
2503
|
+
declare function ctcGreedyDecode(logits: Float32Array, seqLen: number, vocabSize: number, tokenMap: Map<number, string>): CTCDecodeResult;
|
|
2504
|
+
|
|
2505
|
+
/**
|
|
2506
|
+
* Shared blendshape constants and utilities for lip sync inference
|
|
2507
|
+
*
|
|
2508
|
+
* Contains LAM_BLENDSHAPES (canonical ordering), symmetrization, and
|
|
2509
|
+
* index remapping used by both Wav2Vec2Inference and Wav2ArkitCpuInference.
|
|
2510
|
+
*
|
|
2511
|
+
* This module is the single source of truth for blendshape ordering to
|
|
2512
|
+
* avoid circular dependencies between inference classes.
|
|
2513
|
+
*
|
|
2514
|
+
* @category Inference
|
|
2515
|
+
*/
|
|
2516
|
+
/**
|
|
2517
|
+
* LAM model blendshape names in order (52 total)
|
|
2518
|
+
* NOTE: This is alphabetical ordering used by LAM, different from standard ARKit order
|
|
2519
|
+
*/
|
|
2520
|
+
declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
|
|
2521
|
+
/** Alias for backwards compatibility */
|
|
2522
|
+
declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
|
|
2523
|
+
/**
|
|
2524
|
+
* Symmetrize blendshapes by averaging left/right pairs
|
|
2525
|
+
* From LAM official postprocessing (models/utils.py)
|
|
2526
|
+
* This fixes asymmetric output from the raw model
|
|
2527
|
+
*/
|
|
2528
|
+
declare function symmetrizeBlendshapes(frame: Float32Array): Float32Array;
|
|
2529
|
+
/**
|
|
2530
|
+
* wav2arkit_cpu model blendshape ordering
|
|
2531
|
+
*
|
|
2532
|
+
* Indices 0-24 match LAM_BLENDSHAPES, but 25+ diverge:
|
|
2533
|
+
* - LAM puts jawRight, mouthClose, mouthDimpleLeft, mouthDimpleRight at 25-28
|
|
2534
|
+
* - wav2arkit_cpu puts mouthFrownLeft at 25 and moves those four to 48-51
|
|
2535
|
+
*/
|
|
2536
|
+
declare const WAV2ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "jawRight"];
|
|
2537
|
+
/**
|
|
2538
|
+
* Remap a blendshape frame from wav2arkit_cpu ordering to LAM_BLENDSHAPES ordering
|
|
2539
|
+
*
|
|
2540
|
+
* @param frame - Float32Array of 52 blendshape values in wav2arkit_cpu order
|
|
2541
|
+
* @returns Float32Array of 52 blendshape values in LAM_BLENDSHAPES order
|
|
2542
|
+
*/
|
|
2543
|
+
declare function remapWav2ArkitToLam(frame: Float32Array): Float32Array;
|
|
2544
|
+
|
|
2545
|
+
/**
|
|
2546
|
+
* Unified Wav2Vec2 inference engine for Audio-to-Expression + ASR
|
|
2547
|
+
*
|
|
2548
|
+
* Runs entirely in the browser using WebGPU or WASM.
|
|
2549
|
+
* Takes raw 16kHz audio and outputs:
|
|
2550
|
+
* - 52 ARKit blendshapes (lip sync)
|
|
2551
|
+
* - 32-token CTC logits (speech recognition)
|
|
1881
2552
|
*
|
|
1882
2553
|
* @category Inference
|
|
1883
2554
|
*
|
|
1884
2555
|
* @example Basic usage
|
|
1885
2556
|
* ```typescript
|
|
1886
|
-
* import {
|
|
1887
|
-
*
|
|
1888
|
-
* const vad = new SileroVADInference({
|
|
1889
|
-
* modelUrl: '/models/silero-vad.onnx'
|
|
1890
|
-
* });
|
|
1891
|
-
* await vad.load();
|
|
2557
|
+
* import { Wav2Vec2Inference } from '@omote/core';
|
|
1892
2558
|
*
|
|
1893
|
-
*
|
|
1894
|
-
*
|
|
1895
|
-
* if (probability > 0.5) {
|
|
1896
|
-
* console.log('Speech detected!');
|
|
1897
|
-
* }
|
|
1898
|
-
* ```
|
|
2559
|
+
* const wav2vec = new Wav2Vec2Inference({ modelUrl: '/models/unified_wav2vec2_asr_a2e.onnx' });
|
|
2560
|
+
* await wav2vec.load();
|
|
1899
2561
|
*
|
|
1900
|
-
*
|
|
1901
|
-
*
|
|
1902
|
-
* // State is automatically maintained between process() calls
|
|
1903
|
-
* // Call reset() when starting a new audio stream
|
|
1904
|
-
* vad.reset();
|
|
2562
|
+
* // Process 1 second of audio (16kHz = 16000 samples)
|
|
2563
|
+
* const result = await wav2vec.infer(audioSamples);
|
|
1905
2564
|
*
|
|
1906
|
-
*
|
|
1907
|
-
*
|
|
1908
|
-
* // prob is speech probability [0, 1]
|
|
1909
|
-
* }
|
|
2565
|
+
* console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
|
|
2566
|
+
* console.log('ASR text:', result.text); // Decoded transcription
|
|
1910
2567
|
* ```
|
|
1911
2568
|
*/
|
|
1912
2569
|
|
|
1913
|
-
type
|
|
1914
|
-
|
|
1915
|
-
* Configuration for Silero VAD
|
|
1916
|
-
*/
|
|
1917
|
-
interface SileroVADConfig {
|
|
2570
|
+
type InferenceBackend = BackendPreference;
|
|
2571
|
+
interface Wav2Vec2InferenceConfig {
|
|
1918
2572
|
/** Path or URL to the ONNX model */
|
|
1919
2573
|
modelUrl: string;
|
|
1920
|
-
/** Preferred backend (auto will try WebGPU first, fallback to WASM) */
|
|
1921
|
-
backend?: VADBackend;
|
|
1922
|
-
/** Sample rate (8000 or 16000, default: 16000) */
|
|
1923
|
-
sampleRate?: 8000 | 16000;
|
|
1924
|
-
/** Speech probability threshold (default: 0.5) */
|
|
1925
|
-
threshold?: number;
|
|
1926
2574
|
/**
|
|
1927
|
-
*
|
|
1928
|
-
*
|
|
1929
|
-
* to capture the beginning of speech that occurred before detection.
|
|
1930
|
-
*
|
|
1931
|
-
* At 512 samples/chunk and 16kHz:
|
|
1932
|
-
* - 10 chunks = 320ms of pre-speech audio
|
|
1933
|
-
* - 15 chunks = 480ms of pre-speech audio
|
|
2575
|
+
* Path or URL to external model data file (.onnx.data weights).
|
|
2576
|
+
* Default: `${modelUrl}.data` (e.g., /models/model.onnx.data)
|
|
1934
2577
|
*
|
|
1935
|
-
*
|
|
2578
|
+
* Set to `false` to skip external data loading (single-file models only).
|
|
1936
2579
|
*/
|
|
1937
|
-
|
|
2580
|
+
externalDataUrl?: string | false;
|
|
2581
|
+
/** Preferred backend (auto will try WebGPU first, fallback to WASM) */
|
|
2582
|
+
backend?: InferenceBackend;
|
|
2583
|
+
/** Number of identity classes (default: 12 for streaming model) */
|
|
2584
|
+
numIdentityClasses?: number;
|
|
1938
2585
|
}
|
|
1939
|
-
|
|
1940
|
-
* VAD model loading information
|
|
1941
|
-
*/
|
|
1942
|
-
interface VADModelInfo {
|
|
2586
|
+
interface ModelInfo {
|
|
1943
2587
|
backend: 'webgpu' | 'wasm';
|
|
1944
2588
|
loadTimeMs: number;
|
|
1945
2589
|
inputNames: string[];
|
|
1946
2590
|
outputNames: string[];
|
|
1947
|
-
sampleRate: number;
|
|
1948
|
-
chunkSize: number;
|
|
1949
2591
|
}
|
|
1950
|
-
|
|
1951
|
-
|
|
1952
|
-
|
|
1953
|
-
interface
|
|
1954
|
-
/**
|
|
1955
|
-
|
|
1956
|
-
/**
|
|
1957
|
-
|
|
1958
|
-
/**
|
|
2592
|
+
|
|
2593
|
+
/** CTC vocabulary (32 tokens from wav2vec2-base-960h) */
|
|
2594
|
+
declare const CTC_VOCAB: string[];
|
|
2595
|
+
interface Wav2Vec2Result {
|
|
2596
|
+
/** Blendshape weights [frames, 52] - 30fps */
|
|
2597
|
+
blendshapes: Float32Array[];
|
|
2598
|
+
/** Raw CTC logits [frames, 32] - 50fps */
|
|
2599
|
+
asrLogits: Float32Array[];
|
|
2600
|
+
/** Decoded text from CTC */
|
|
2601
|
+
text: string;
|
|
2602
|
+
/** Number of blendshape frames (30fps) — alias for numA2EFrames */
|
|
2603
|
+
numFrames: number;
|
|
2604
|
+
/** Number of A2E frames (30fps) */
|
|
2605
|
+
numA2EFrames: number;
|
|
2606
|
+
/** Number of ASR frames (50fps) */
|
|
2607
|
+
numASRFrames: number;
|
|
2608
|
+
/** Inference time in ms */
|
|
1959
2609
|
inferenceTimeMs: number;
|
|
1960
|
-
/**
|
|
1961
|
-
* Pre-speech audio chunks (only present on first speech detection).
|
|
1962
|
-
* These are the N chunks immediately before VAD triggered, useful for
|
|
1963
|
-
* capturing the beginning of speech that occurred before detection.
|
|
1964
|
-
*
|
|
1965
|
-
* Only populated when transitioning from silence to speech.
|
|
1966
|
-
*/
|
|
1967
|
-
preSpeechChunks?: Float32Array[];
|
|
1968
|
-
}
|
|
1969
|
-
/**
|
|
1970
|
-
* Speech segment detected by VAD
|
|
1971
|
-
*/
|
|
1972
|
-
interface SpeechSegment {
|
|
1973
|
-
/** Start time in seconds */
|
|
1974
|
-
start: number;
|
|
1975
|
-
/** End time in seconds */
|
|
1976
|
-
end: number;
|
|
1977
|
-
/** Average probability during segment */
|
|
1978
|
-
avgProbability: number;
|
|
1979
2610
|
}
|
|
1980
|
-
|
|
1981
|
-
|
|
1982
|
-
*
|
|
1983
|
-
* Based on snakers4/silero-vad ONNX model.
|
|
1984
|
-
* Processes 32ms chunks (512 samples at 16kHz) with LSTM state.
|
|
1985
|
-
*
|
|
1986
|
-
* @see https://github.com/snakers4/silero-vad
|
|
1987
|
-
*/
|
|
1988
|
-
declare class SileroVADInference {
|
|
2611
|
+
declare class Wav2Vec2Inference implements LipSyncBackend {
|
|
2612
|
+
readonly modelId: "wav2vec2";
|
|
1989
2613
|
private session;
|
|
1990
2614
|
private ort;
|
|
1991
2615
|
private config;
|
|
1992
2616
|
private _backend;
|
|
1993
2617
|
private isLoading;
|
|
1994
|
-
private
|
|
1995
|
-
private context;
|
|
1996
|
-
private readonly chunkSize;
|
|
1997
|
-
private readonly contextSize;
|
|
2618
|
+
private numIdentityClasses;
|
|
1998
2619
|
private inferenceQueue;
|
|
1999
|
-
private
|
|
2000
|
-
private
|
|
2001
|
-
|
|
2002
|
-
constructor(config: SileroVADConfig);
|
|
2003
|
-
get backend(): RuntimeBackend | null;
|
|
2004
|
-
get isLoaded(): boolean;
|
|
2005
|
-
get sampleRate(): number;
|
|
2006
|
-
get threshold(): number;
|
|
2007
|
-
/**
|
|
2008
|
-
* Get required chunk size in samples
|
|
2009
|
-
*/
|
|
2010
|
-
getChunkSize(): number;
|
|
2011
|
-
/**
|
|
2012
|
-
* Get chunk duration in milliseconds
|
|
2013
|
-
*/
|
|
2014
|
-
getChunkDurationMs(): number;
|
|
2620
|
+
private poisoned;
|
|
2621
|
+
private static readonly INFERENCE_TIMEOUT_MS;
|
|
2622
|
+
constructor(config: Wav2Vec2InferenceConfig);
|
|
2015
2623
|
/**
|
|
2016
2624
|
* Check if WebGPU is available and working
|
|
2017
2625
|
* (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
|
|
2018
2626
|
*/
|
|
2019
2627
|
static isWebGPUAvailable: typeof isWebGPUAvailable;
|
|
2628
|
+
get backend(): 'webgpu' | 'wasm' | null;
|
|
2629
|
+
get isLoaded(): boolean;
|
|
2630
|
+
/** True if inference timed out and the session is permanently unusable */
|
|
2631
|
+
get isSessionPoisoned(): boolean;
|
|
2020
2632
|
/**
|
|
2021
2633
|
* Load the ONNX model
|
|
2022
2634
|
*/
|
|
2023
|
-
load(): Promise<
|
|
2024
|
-
/**
|
|
2025
|
-
* Reset state for new audio stream
|
|
2026
|
-
*/
|
|
2027
|
-
reset(): void;
|
|
2635
|
+
load(): Promise<ModelInfo>;
|
|
2028
2636
|
/**
|
|
2029
|
-
*
|
|
2637
|
+
* Run inference on raw audio
|
|
2638
|
+
* @param audioSamples - Float32Array of raw audio at 16kHz (16000 samples = 1 second)
|
|
2639
|
+
* @param identityIndex - Optional identity index (0-11, default 0 = neutral)
|
|
2030
2640
|
*
|
|
2031
|
-
*
|
|
2032
|
-
*
|
|
2641
|
+
* Note: Model expects 1-second chunks (16000 samples) for optimal performance.
|
|
2642
|
+
* Audio will be zero-padded or truncated to 16000 samples.
|
|
2033
2643
|
*/
|
|
2034
|
-
|
|
2644
|
+
infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
|
|
2035
2645
|
/**
|
|
2036
|
-
*
|
|
2037
|
-
*
|
|
2038
|
-
* @param audio - Complete audio buffer
|
|
2039
|
-
* @param options - Detection options
|
|
2040
|
-
* @returns Array of speech segments
|
|
2646
|
+
* Decode CTC logits to text using greedy decoding
|
|
2041
2647
|
*/
|
|
2042
|
-
|
|
2043
|
-
/** Minimum speech duration in ms (default: 250) */
|
|
2044
|
-
minSpeechDurationMs?: number;
|
|
2045
|
-
/** Minimum silence duration to end segment in ms (default: 300) */
|
|
2046
|
-
minSilenceDurationMs?: number;
|
|
2047
|
-
/** Padding to add before/after speech in ms (default: 30) */
|
|
2048
|
-
speechPadMs?: number;
|
|
2049
|
-
}): Promise<SpeechSegment[]>;
|
|
2648
|
+
private decodeCTC;
|
|
2050
2649
|
/**
|
|
2051
2650
|
* Queue inference to serialize ONNX session calls
|
|
2052
2651
|
*/
|
|
2053
2652
|
private queueInference;
|
|
2653
|
+
/**
|
|
2654
|
+
* Get blendshape value by name for a specific frame
|
|
2655
|
+
*/
|
|
2656
|
+
getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
|
|
2054
2657
|
/**
|
|
2055
2658
|
* Dispose of the model and free resources
|
|
2056
2659
|
*/
|
|
@@ -2058,309 +2661,189 @@ declare class SileroVADInference {
|
|
|
2058
2661
|
}
|
|
2059
2662
|
|
|
2060
2663
|
/**
|
|
2061
|
-
*
|
|
2664
|
+
* CPU-optimized lip sync inference using wav2arkit_cpu model
|
|
2062
2665
|
*
|
|
2063
|
-
*
|
|
2064
|
-
*
|
|
2666
|
+
* A Safari/iOS-compatible alternative to Wav2Vec2Inference (384MB) designed
|
|
2667
|
+
* for platforms where WebGPU crashes due to ONNX Runtime JSEP bugs.
|
|
2065
2668
|
*
|
|
2066
|
-
*
|
|
2067
|
-
* -
|
|
2068
|
-
* -
|
|
2069
|
-
*
|
|
2070
|
-
*
|
|
2669
|
+
* The model uses ONNX external data format:
|
|
2670
|
+
* - wav2arkit_cpu.onnx (1.86MB graph structure)
|
|
2671
|
+
* - wav2arkit_cpu.onnx.data (402MB weights)
|
|
2672
|
+
* Both files are fetched and cached automatically.
|
|
2673
|
+
*
|
|
2674
|
+
* Key differences from Wav2Vec2Inference:
|
|
2675
|
+
* - WASM-only backend (CPU-optimized, single-threaded, no WebGPU)
|
|
2676
|
+
* - No identity input (baked to identity 11)
|
|
2677
|
+
* - No ASR output (lip sync only)
|
|
2678
|
+
* - Dynamic input length (not fixed to 16000 samples)
|
|
2679
|
+
* - Different native blendshape ordering (remapped to LAM_BLENDSHAPES)
|
|
2071
2680
|
*
|
|
2072
2681
|
* @category Inference
|
|
2073
2682
|
*
|
|
2074
|
-
* @example
|
|
2683
|
+
* @example
|
|
2075
2684
|
* ```typescript
|
|
2076
|
-
* import {
|
|
2685
|
+
* import { Wav2ArkitCpuInference } from '@omote/core';
|
|
2077
2686
|
*
|
|
2078
|
-
* const
|
|
2079
|
-
* modelUrl: '/models/
|
|
2687
|
+
* const lam = new Wav2ArkitCpuInference({
|
|
2688
|
+
* modelUrl: '/models/wav2arkit_cpu.onnx',
|
|
2080
2689
|
* });
|
|
2081
|
-
* await
|
|
2690
|
+
* await lam.load();
|
|
2082
2691
|
*
|
|
2083
|
-
*
|
|
2084
|
-
*
|
|
2085
|
-
* if (result.isSpeech) {
|
|
2086
|
-
* console.log('Speech detected!', result.probability);
|
|
2087
|
-
* }
|
|
2692
|
+
* const { blendshapes } = await lam.infer(audioSamples);
|
|
2693
|
+
* // blendshapes: Float32Array[] in LAM_BLENDSHAPES order, 30fps
|
|
2088
2694
|
* ```
|
|
2089
2695
|
*/
|
|
2090
2696
|
|
|
2091
|
-
|
|
2092
|
-
|
|
2093
|
-
*/
|
|
2094
|
-
interface VADWorkerConfig {
|
|
2095
|
-
/** Path or URL to the ONNX model */
|
|
2697
|
+
interface Wav2ArkitCpuConfig {
|
|
2698
|
+
/** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
|
|
2096
2699
|
modelUrl: string;
|
|
2097
|
-
/** Sample rate (8000 or 16000, default: 16000) */
|
|
2098
|
-
sampleRate?: 8000 | 16000;
|
|
2099
|
-
/** Speech probability threshold (default: 0.5) */
|
|
2100
|
-
threshold?: number;
|
|
2101
2700
|
/**
|
|
2102
|
-
*
|
|
2103
|
-
*
|
|
2104
|
-
*
|
|
2105
|
-
*
|
|
2106
|
-
|
|
2107
|
-
|
|
2108
|
-
|
|
2109
|
-
|
|
2110
|
-
|
|
2111
|
-
|
|
2112
|
-
|
|
2113
|
-
|
|
2114
|
-
|
|
2115
|
-
* VAD model loading information from worker
|
|
2116
|
-
*/
|
|
2117
|
-
interface VADWorkerModelInfo {
|
|
2118
|
-
backend: 'wasm';
|
|
2119
|
-
loadTimeMs: number;
|
|
2120
|
-
inputNames: string[];
|
|
2121
|
-
outputNames: string[];
|
|
2122
|
-
sampleRate: number;
|
|
2123
|
-
chunkSize: number;
|
|
2124
|
-
}
|
|
2125
|
-
|
|
2126
|
-
/**
|
|
2127
|
-
* Silero VAD Worker - Voice Activity Detection in a Web Worker
|
|
2128
|
-
*
|
|
2129
|
-
* Runs Silero VAD inference off the main thread to prevent UI blocking.
|
|
2130
|
-
* Feature parity with SileroVADInference but runs in dedicated worker.
|
|
2131
|
-
*
|
|
2132
|
-
* @see SileroVADInference for main-thread version
|
|
2133
|
-
*/
|
|
2134
|
-
declare class SileroVADWorker {
|
|
2135
|
-
private worker;
|
|
2701
|
+
* Path or URL to external model data file (.onnx.data weights).
|
|
2702
|
+
* Default: `${modelUrl}.data` (e.g., /models/wav2arkit_cpu.onnx.data)
|
|
2703
|
+
*
|
|
2704
|
+
* Set to `false` to skip external data loading (single-file models only).
|
|
2705
|
+
*/
|
|
2706
|
+
externalDataUrl?: string | false;
|
|
2707
|
+
/** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
|
|
2708
|
+
backend?: BackendPreference;
|
|
2709
|
+
}
|
|
2710
|
+
declare class Wav2ArkitCpuInference implements LipSyncBackend {
|
|
2711
|
+
readonly modelId: "wav2arkit_cpu";
|
|
2712
|
+
private session;
|
|
2713
|
+
private ort;
|
|
2136
2714
|
private config;
|
|
2715
|
+
private _backend;
|
|
2137
2716
|
private isLoading;
|
|
2138
|
-
private _isLoaded;
|
|
2139
|
-
private state;
|
|
2140
|
-
private context;
|
|
2141
|
-
private readonly chunkSize;
|
|
2142
|
-
private readonly contextSize;
|
|
2143
2717
|
private inferenceQueue;
|
|
2144
|
-
private
|
|
2145
|
-
private
|
|
2146
|
-
|
|
2147
|
-
|
|
2148
|
-
constructor(config: VADWorkerConfig);
|
|
2718
|
+
private poisoned;
|
|
2719
|
+
private static readonly INFERENCE_TIMEOUT_MS;
|
|
2720
|
+
constructor(config: Wav2ArkitCpuConfig);
|
|
2721
|
+
get backend(): RuntimeBackend | null;
|
|
2149
2722
|
get isLoaded(): boolean;
|
|
2150
2723
|
/**
|
|
2151
|
-
*
|
|
2152
|
-
*/
|
|
2153
|
-
get backend(): 'wasm' | null;
|
|
2154
|
-
get sampleRate(): number;
|
|
2155
|
-
get threshold(): number;
|
|
2156
|
-
/**
|
|
2157
|
-
* Get required chunk size in samples
|
|
2158
|
-
*/
|
|
2159
|
-
getChunkSize(): number;
|
|
2160
|
-
/**
|
|
2161
|
-
* Get chunk duration in milliseconds
|
|
2162
|
-
*/
|
|
2163
|
-
getChunkDurationMs(): number;
|
|
2164
|
-
/**
|
|
2165
|
-
* Create the worker from inline script
|
|
2166
|
-
*/
|
|
2167
|
-
private createWorker;
|
|
2168
|
-
/**
|
|
2169
|
-
* Handle messages from worker
|
|
2170
|
-
*/
|
|
2171
|
-
private handleWorkerMessage;
|
|
2172
|
-
/**
|
|
2173
|
-
* Send message to worker and wait for response
|
|
2174
|
-
*/
|
|
2175
|
-
private sendMessage;
|
|
2176
|
-
/**
|
|
2177
|
-
* Load the ONNX model in the worker
|
|
2178
|
-
*/
|
|
2179
|
-
load(): Promise<VADWorkerModelInfo>;
|
|
2180
|
-
/**
|
|
2181
|
-
* Reset state for new audio stream
|
|
2724
|
+
* Load the ONNX model
|
|
2182
2725
|
*/
|
|
2183
|
-
|
|
2726
|
+
load(): Promise<LipSyncModelInfo>;
|
|
2184
2727
|
/**
|
|
2185
|
-
*
|
|
2728
|
+
* Run inference on raw audio
|
|
2186
2729
|
*
|
|
2187
|
-
*
|
|
2188
|
-
*
|
|
2730
|
+
* Accepts variable-length audio (not fixed to 16000 samples).
|
|
2731
|
+
* Output frames = ceil(30 * numSamples / 16000).
|
|
2732
|
+
*
|
|
2733
|
+
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
2734
|
+
* @param _identityIndex - Ignored (identity 11 is baked into the model)
|
|
2189
2735
|
*/
|
|
2190
|
-
|
|
2736
|
+
infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
|
|
2191
2737
|
/**
|
|
2192
|
-
* Queue inference to serialize
|
|
2738
|
+
* Queue inference to serialize ONNX session calls
|
|
2193
2739
|
*/
|
|
2194
2740
|
private queueInference;
|
|
2195
2741
|
/**
|
|
2196
|
-
* Dispose of the
|
|
2742
|
+
* Dispose of the model and free resources
|
|
2197
2743
|
*/
|
|
2198
2744
|
dispose(): Promise<void>;
|
|
2199
|
-
/**
|
|
2200
|
-
* Check if Web Workers are supported
|
|
2201
|
-
*/
|
|
2202
|
-
static isSupported(): boolean;
|
|
2203
2745
|
}
|
|
2204
2746
|
|
|
2205
2747
|
/**
|
|
2206
|
-
* Factory function for
|
|
2748
|
+
* Factory function for lip sync with automatic GPU/CPU model selection
|
|
2207
2749
|
*
|
|
2208
|
-
* Provides a unified API that automatically selects the optimal
|
|
2209
|
-
* -
|
|
2210
|
-
* -
|
|
2211
|
-
* - Fallback: Gracefully falls back to
|
|
2750
|
+
* Provides a unified API that automatically selects the optimal model:
|
|
2751
|
+
* - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (404MB, WASM)
|
|
2752
|
+
* - Chrome/Firefox/Edge: Uses Wav2Vec2Inference (384MB, WebGPU)
|
|
2753
|
+
* - Fallback: Gracefully falls back to CPU model if GPU model fails to load
|
|
2754
|
+
*
|
|
2755
|
+
* Why two separate models?
|
|
2756
|
+
* Wav2Vec2 (LAM) cannot run on Safari/iOS for two reasons:
|
|
2757
|
+
* 1. Its dual-head transformer graph needs ~750-950MB peak during ORT session
|
|
2758
|
+
* creation (graph optimization), exceeding iOS WebKit's ~1-1.5GB tab limit.
|
|
2759
|
+
* 2. It ships as a single 384MB .onnx file that must load into JS heap before
|
|
2760
|
+
* ORT can consume it. iOS WebKit OOMs on this allocation.
|
|
2761
|
+
* wav2arkit_cpu solves both: external data format (1.86MB graph + 402MB weights)
|
|
2762
|
+
* lets ORT load only the tiny graph, then stream weights via URL pass-through
|
|
2763
|
+
* directly into WASM memory. JS heap stays at ~2MB.
|
|
2212
2764
|
*
|
|
2213
2765
|
* @category Inference
|
|
2214
2766
|
*
|
|
2215
|
-
* @example
|
|
2767
|
+
* @example Auto-detect (recommended)
|
|
2216
2768
|
* ```typescript
|
|
2217
|
-
* import {
|
|
2769
|
+
* import { createLipSync } from '@omote/core';
|
|
2218
2770
|
*
|
|
2219
|
-
* const
|
|
2220
|
-
*
|
|
2221
|
-
*
|
|
2771
|
+
* const lam = createLipSync({
|
|
2772
|
+
* gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
|
|
2773
|
+
* cpuModelUrl: '/models/wav2arkit_cpu.onnx',
|
|
2222
2774
|
* });
|
|
2223
2775
|
*
|
|
2224
|
-
* await
|
|
2225
|
-
* const
|
|
2226
|
-
* if (result.isSpeech) {
|
|
2227
|
-
* console.log('Speech detected!', result.probability);
|
|
2228
|
-
* }
|
|
2229
|
-
* ```
|
|
2230
|
-
*
|
|
2231
|
-
* @example Force worker usage
|
|
2232
|
-
* ```typescript
|
|
2233
|
-
* const vad = createSileroVAD({
|
|
2234
|
-
* modelUrl: '/models/silero-vad.onnx',
|
|
2235
|
-
* useWorker: true, // Force Worker even on mobile
|
|
2236
|
-
* });
|
|
2776
|
+
* await lam.load();
|
|
2777
|
+
* const { blendshapes } = await lam.infer(audioSamples);
|
|
2237
2778
|
* ```
|
|
2238
2779
|
*
|
|
2239
|
-
* @example Force
|
|
2780
|
+
* @example Force CPU model
|
|
2240
2781
|
* ```typescript
|
|
2241
|
-
* const
|
|
2242
|
-
*
|
|
2243
|
-
*
|
|
2782
|
+
* const lam = createLipSync({
|
|
2783
|
+
* gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
|
|
2784
|
+
* cpuModelUrl: '/models/wav2arkit_cpu.onnx',
|
|
2785
|
+
* mode: 'cpu',
|
|
2244
2786
|
* });
|
|
2245
2787
|
* ```
|
|
2246
2788
|
*/
|
|
2247
2789
|
|
|
2248
2790
|
/**
|
|
2249
|
-
*
|
|
2250
|
-
*
|
|
2251
|
-
* This interface defines the shared API that both implementations provide,
|
|
2252
|
-
* allowing consumers to use either interchangeably.
|
|
2791
|
+
* Configuration for the lip sync factory
|
|
2253
2792
|
*/
|
|
2254
|
-
interface
|
|
2255
|
-
/**
|
|
2256
|
-
|
|
2257
|
-
/** Whether the model is loaded and ready for inference */
|
|
2258
|
-
readonly isLoaded: boolean;
|
|
2259
|
-
/** Audio sample rate (8000 or 16000 Hz) */
|
|
2260
|
-
readonly sampleRate: number;
|
|
2261
|
-
/** Speech detection threshold (0-1) */
|
|
2262
|
-
readonly threshold: number;
|
|
2263
|
-
/**
|
|
2264
|
-
* Load the ONNX model
|
|
2265
|
-
* @returns Model loading information
|
|
2266
|
-
*/
|
|
2267
|
-
load(): Promise<VADModelInfo | VADWorkerModelInfo>;
|
|
2268
|
-
/**
|
|
2269
|
-
* Process a single audio chunk
|
|
2270
|
-
* @param audioChunk - Float32Array of exactly chunkSize samples
|
|
2271
|
-
* @returns VAD result with speech probability
|
|
2272
|
-
*/
|
|
2273
|
-
process(audioChunk: Float32Array): Promise<VADResult>;
|
|
2274
|
-
/**
|
|
2275
|
-
* Reset state for new audio stream
|
|
2276
|
-
*/
|
|
2277
|
-
reset(): void | Promise<void>;
|
|
2793
|
+
interface CreateLipSyncConfig {
|
|
2794
|
+
/** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge) */
|
|
2795
|
+
gpuModelUrl: string;
|
|
2278
2796
|
/**
|
|
2279
|
-
*
|
|
2797
|
+
* URL for GPU model external data file (.onnx.data weights).
|
|
2798
|
+
* Default: `${gpuModelUrl}.data`
|
|
2799
|
+
*
|
|
2800
|
+
* Set to `false` to skip external data loading (single-file models only).
|
|
2280
2801
|
*/
|
|
2281
|
-
|
|
2802
|
+
gpuExternalDataUrl?: string | false;
|
|
2803
|
+
/** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS) */
|
|
2804
|
+
cpuModelUrl: string;
|
|
2282
2805
|
/**
|
|
2283
|
-
*
|
|
2806
|
+
* Model selection mode:
|
|
2807
|
+
* - 'auto': Safari/iOS → CPU, everything else → GPU (default)
|
|
2808
|
+
* - 'gpu': Force GPU model (Wav2Vec2Inference)
|
|
2809
|
+
* - 'cpu': Force CPU model (Wav2ArkitCpuInference)
|
|
2284
2810
|
*/
|
|
2285
|
-
|
|
2811
|
+
mode?: 'auto' | 'gpu' | 'cpu';
|
|
2812
|
+
/** Backend preference for GPU model (default: 'auto') */
|
|
2813
|
+
gpuBackend?: BackendPreference;
|
|
2814
|
+
/** Number of identity classes for GPU model (default: 12) */
|
|
2815
|
+
numIdentityClasses?: number;
|
|
2286
2816
|
/**
|
|
2287
|
-
*
|
|
2817
|
+
* Fall back to CPU model if GPU model fails to load (default: true)
|
|
2818
|
+
* Only applies when mode is 'auto' or 'gpu'
|
|
2288
2819
|
*/
|
|
2289
|
-
|
|
2290
|
-
}
|
|
2291
|
-
/**
|
|
2292
|
-
* Configuration for the Silero VAD factory
|
|
2293
|
-
*
|
|
2294
|
-
* Extends SileroVADConfig with worker-specific options.
|
|
2295
|
-
*/
|
|
2296
|
-
interface SileroVADFactoryConfig extends SileroVADConfig {
|
|
2820
|
+
fallbackOnError?: boolean;
|
|
2297
2821
|
/**
|
|
2298
|
-
*
|
|
2299
|
-
*
|
|
2300
|
-
* Auto-detection behavior:
|
|
2301
|
-
* - Desktop: Uses Worker (better responsiveness, off-main-thread)
|
|
2302
|
-
* - Mobile: Uses main thread (avoids 5MB memory overhead)
|
|
2822
|
+
* Use Web Worker for CPU model inference (default: false)
|
|
2303
2823
|
*
|
|
2304
|
-
*
|
|
2305
|
-
*
|
|
2306
|
-
*
|
|
2824
|
+
* When true, Wav2ArkitCpuWorker is used instead of Wav2ArkitCpuInference,
|
|
2825
|
+
* running inference off the main thread to prevent UI blocking during
|
|
2826
|
+
* model loading and inference.
|
|
2307
2827
|
*
|
|
2308
|
-
*
|
|
2828
|
+
* Only applies when the CPU model is selected (mode: 'cpu', auto on Safari/iOS,
|
|
2829
|
+
* or fallback from GPU).
|
|
2309
2830
|
*/
|
|
2310
2831
|
useWorker?: boolean;
|
|
2311
2832
|
/**
|
|
2312
|
-
*
|
|
2313
|
-
*
|
|
2314
|
-
*
|
|
2315
|
-
*
|
|
2316
|
-
*
|
|
2317
|
-
* When false, worker errors will propagate as exceptions.
|
|
2318
|
-
*
|
|
2319
|
-
* Default: true
|
|
2833
|
+
* Unified inference worker instance.
|
|
2834
|
+
* When provided and CPU model is selected, uses Wav2ArkitCpuUnifiedAdapter.
|
|
2835
|
+
* Takes precedence over useWorker setting for the CPU model path.
|
|
2836
|
+
* GPU model (Wav2Vec2) always stays on main thread (WebGPU).
|
|
2320
2837
|
*/
|
|
2321
|
-
|
|
2838
|
+
unifiedWorker?: UnifiedInferenceWorker;
|
|
2322
2839
|
}
|
|
2323
2840
|
/**
|
|
2324
|
-
*
|
|
2325
|
-
*
|
|
2326
|
-
* Requirements:
|
|
2327
|
-
* - Worker constructor must exist
|
|
2328
|
-
* - Blob URL support (for inline worker script)
|
|
2329
|
-
*
|
|
2330
|
-
* @returns true if VAD Worker is supported
|
|
2331
|
-
*/
|
|
2332
|
-
declare function supportsVADWorker(): boolean;
|
|
2333
|
-
/**
|
|
2334
|
-
* Create a Silero VAD instance with automatic implementation selection
|
|
2335
|
-
*
|
|
2336
|
-
* This factory function automatically selects between:
|
|
2337
|
-
* - **SileroVADWorker**: Off-main-thread inference (better for desktop)
|
|
2338
|
-
* - **SileroVADInference**: Main thread inference (better for mobile)
|
|
2339
|
-
*
|
|
2340
|
-
* The selection is based on:
|
|
2341
|
-
* 1. Explicit `useWorker` config (if provided)
|
|
2342
|
-
* 2. Platform detection (mobile vs desktop)
|
|
2343
|
-
* 3. Worker API availability
|
|
2344
|
-
*
|
|
2345
|
-
* Both implementations share the same interface (SileroVADBackend),
|
|
2346
|
-
* so consumers can use either interchangeably.
|
|
2841
|
+
* Create a lip sync instance with automatic GPU/CPU model selection
|
|
2347
2842
|
*
|
|
2348
2843
|
* @param config - Factory configuration
|
|
2349
|
-
* @returns A
|
|
2350
|
-
*
|
|
2351
|
-
* @example
|
|
2352
|
-
* ```typescript
|
|
2353
|
-
* // Auto-detect (recommended)
|
|
2354
|
-
* const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
|
|
2355
|
-
*
|
|
2356
|
-
* // Force Worker
|
|
2357
|
-
* const vadWorker = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: true });
|
|
2358
|
-
*
|
|
2359
|
-
* // Force main thread
|
|
2360
|
-
* const vadMain = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: false });
|
|
2361
|
-
* ```
|
|
2844
|
+
* @returns A LipSyncBackend instance (either GPU or CPU model)
|
|
2362
2845
|
*/
|
|
2363
|
-
declare function
|
|
2846
|
+
declare function createLipSync(config: CreateLipSyncConfig): LipSyncBackend;
|
|
2364
2847
|
|
|
2365
2848
|
/**
|
|
2366
2849
|
* Safari Web Speech API wrapper for iOS speech recognition
|
|
@@ -4713,4 +5196,4 @@ declare class ProceduralLifeLayer {
|
|
|
4713
5196
|
private updateBrowNoise;
|
|
4714
5197
|
}
|
|
4715
5198
|
|
|
4716
|
-
export { type AIAdapter, type AIAdapterEvents, AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, type CTCDecodeResult, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, type CreateLipSyncConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_ARKIT_MAP, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type Emotion2VecLabel, type EmotionAnimationMap, type EmotionBlendMode, type EmotionBlendshapeConfig, EmotionController, type EmotionFrame, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionToBlendshapeMapper, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type KaldiFbankOptions, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LipSyncBackend, type LipSyncModelInfo, type LipSyncResult, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, ProceduralLifeLayer, type QuotaInfo, RingBuffer, type RuntimeBackend, type 
SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, type SessionConfig, type SessionOptions, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type Transition, UPPER_FACE_BLENDSHAPES, type UpperFaceBlendshapeName, type UpperFaceBlendshapes, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, WAV2ARKIT_BLENDSHAPES, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, applyCMVN, applyLFR, blendEmotions, calculatePeak, calculateRMS, computeKaldiFbank, configureCacheLimit, configureTelemetry, createEmotionVector, createLipSync, createSessionWithFallback, createSileroVAD, ctcGreedyDecode, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpEmotion, parseCMVNFromMetadata, parseTokensFile, preloadModels, preloadOnnxRuntime, remapWav2ArkitToLam, resolveBackend, resolveLanguageId, resolveTextNormId, shouldEnableWasmProxy, shouldUseCpuLipSync, shouldUseNativeASR, shouldUseServerLipSync, supportsVADWorker, 
symmetrizeBlendshapes };
|
|
5199
|
+
/**
 * Export list of this declaration bundle.
 *
 * Specifiers carrying the inline `type` modifier are type-only exports; the
 * remaining specifiers are exported without that modifier. The specifiers are
 * listed one per line, in their original order.
 */
export {
  type AIAdapter,
  type AIAdapterEvents,
  AISessionState,
  ARKIT_BLENDSHAPES,
  type ActiveSpan,
  AgentCoreAdapter,
  type AgentCoreConfig,
  type AnimationClip,
  AnimationEvent,
  AnimationGraph,
  type AnimationGraphConfig,
  type AnimationGraphEvents,
  type AnimationLayer,
  type AnimationOutput,
  type AnimationState,
  type AnimationStateName,
  type AnimationTrigger,
  AudioChunkCoalescer,
  type AudioChunkCoalescerOptions,
  AudioEnergyAnalyzer,
  AudioScheduler,
  type AudioSchedulerOptions,
  type AudioSyncConfig,
  type AudioSyncEvents,
  AudioSyncManager,
  type BackendPreference,
  type BlendWeight,
  type CTCDecodeResult,
  CTC_VOCAB,
  type CacheConfig,
  type CacheSpanAttributes,
  ConsoleExporter,
  type ConversationMessage,
  ConversationOrchestrator,
  type ConversationSession,
  type CreateLipSyncConfig,
  type CreateSenseVoiceConfig,
  DEFAULT_ANIMATION_CONFIG,
  EMOTION_ARKIT_MAP,
  EMOTION_NAMES,
  EMOTION_VECTOR_SIZE,
  type Emotion2VecLabel,
  type EmotionAnimationMap,
  type EmotionBlendMode,
  type EmotionBlendshapeConfig,
  EmotionController,
  type EmotionFrame,
  type EmotionLabel,
  type EmotionName,
  type EmotionPresetName,
  EmotionPresets,
  EmotionToBlendshapeMapper,
  type EmotionWeights,
  EmphasisDetector,
  EventEmitter,
  type FetchWithCacheOptions,
  type FullFaceFrame,
  FullFacePipeline,
  type FullFacePipelineEvents,
  type FullFacePipelineOptions,
  INFERENCE_LATENCY_BUCKETS,
  type InferenceSpanAttributes,
  type InterruptionConfig,
  type InterruptionEvents,
  InterruptionHandler,
  type KaldiFbankOptions,
  type LAMFrame,
  LAMPipeline,
  type LAMPipelineOptions,
  LAM_BLENDSHAPES,
  type LifeLayerConfig,
  type LifeLayerInput,
  type LifeLayerOutput,
  type LipSyncBackend,
  type LipSyncModelInfo,
  type LipSyncResult,
  MODEL_LOAD_TIME_BUCKETS,
  type MessageRole,
  type MetricData,
  MetricNames,
  MicrophoneCapture,
  type MicrophoneCaptureConfig,
  ModelCache,
  type ModelSpanAttributes,
  OTLPExporter,
  type OTLPExporterConfig,
  OmoteEvents,
  OmoteTelemetry,
  type OrchestratorConfig,
  type OrchestratorEvents,
  ProceduralLifeLayer,
  type QuotaInfo,
  RingBuffer,
  type RuntimeBackend,
  type SafariSpeechConfig,
  SafariSpeechRecognition,
  type SamplingConfig,
  type SenseVoiceBackend,
  type SenseVoiceConfig,
  SenseVoiceInference,
  type SenseVoiceLanguage,
  type SenseVoiceModelInfo,
  type SenseVoiceResult,
  SenseVoiceUnifiedAdapter,
  SenseVoiceWorker,
  type SenseVoiceWorkerConfig,
  type SessionConfig,
  type SessionOptions,
  type SessionSnapshot,
  type SileroVADBackend,
  type SileroVADConfig,
  type SileroVADFactoryConfig,
  SileroVADInference,
  SileroVADUnifiedAdapter,
  SileroVADWorker,
  type SpanAttributes,
  type SpanData,
  type SpeechErrorCallback,
  type SpeechRecognitionResult,
  type SpeechResultCallback,
  type SpeechSegment,
  SyncedAudioPipeline,
  type SyncedAudioPipelineEvents,
  type SyncedAudioPipelineOptions,
  type TelemetryConfig,
  type TelemetryExporter,
  type TelemetryExporterInterface,
  type TenantConfig,
  TenantManager,
  type TenantQuota,
  type TenantUsage,
  type TokenRefreshCallback,
  type Transition,
  UPPER_FACE_BLENDSHAPES,
  UnifiedInferenceWorker,
  type UpperFaceBlendshapeName,
  type UpperFaceBlendshapes,
  type VADBackend,
  type VADModelInfo,
  type VADResult,
  type VADWorkerConfig,
  type VADWorkerModelInfo,
  type ValidationResult,
  type VoiceConfig,
  WAV2ARKIT_BLENDSHAPES,
  type Wav2ArkitCpuConfig,
  Wav2ArkitCpuInference,
  Wav2ArkitCpuUnifiedAdapter,
  Wav2ArkitCpuWorker,
  type Wav2ArkitCpuWorkerConfig,
  Wav2Vec2Inference,
  type Wav2Vec2InferenceConfig,
  type Wav2Vec2Result,
  applyCMVN,
  applyLFR,
  blendEmotions,
  calculatePeak,
  calculateRMS,
  computeKaldiFbank,
  configureCacheLimit,
  configureTelemetry,
  createEmotionVector,
  createLipSync,
  createSenseVoice,
  createSessionWithFallback,
  createSileroVAD,
  ctcGreedyDecode,
  fetchWithCache,
  formatBytes,
  getCacheConfig,
  getCacheKey,
  getEmotionPreset,
  getLoadedBackend,
  getModelCache,
  getOnnxRuntime,
  getOnnxRuntimeForPreference,
  getOptimalWasmThreads,
  getRecommendedBackend,
  getSessionOptions,
  getTelemetry,
  hasWebGPUApi,
  isAndroid,
  isIOS,
  isIOSSafari,
  isMobile,
  isOnnxRuntimeLoaded,
  isSafari,
  isSpeechRecognitionAvailable,
  isWebGPUAvailable,
  lerpEmotion,
  parseCMVNFromMetadata,
  parseTokensFile,
  preloadModels,
  preloadOnnxRuntime,
  remapWav2ArkitToLam,
  resolveBackend,
  resolveLanguageId,
  resolveTextNormId,
  shouldEnableWasmProxy,
  shouldUseCpuLipSync,
  shouldUseNativeASR,
  shouldUseServerLipSync,
  supportsVADWorker,
  symmetrizeBlendshapes
};
|