@omote/core 0.4.4 → 0.4.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +1165 -673
- package/dist/index.d.ts +1165 -673
- package/dist/index.js +3307 -337
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +3302 -332
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { EventEmitter, OmoteEvents, AnimationEvent } from './events/index.js';
|
|
1
|
+
import { EventEmitter, OmoteEvents, AISessionState, AnimationEvent } from './events/index.js';
|
|
2
2
|
export { BackendEvent, EmotionEvent, GazeEvent, STTFinalEvent, STTPartialEvent, SessionStateEvent, TTSEndEvent, TTSMarkEvent, TTSStartEvent, VisemeEvent } from './events/index.js';
|
|
3
3
|
import { InferenceSession, Tensor, Env } from 'onnxruntime-common';
|
|
4
4
|
export { D as DEFAULT_LOGGING_CONFIG, I as ILogger, e as LOG_LEVEL_PRIORITY, b as LogEntry, L as LogFormatter, a as LogLevel, c as LogSink, d as LoggingConfig, f as configureLogging, i as createLogger, g as getLoggingConfig, n as noopLogger, r as resetLoggingConfig, s as setLogLevel, h as setLoggingEnabled } from './Logger-I_k4sGhM.js';
|
|
@@ -1412,6 +1412,8 @@ declare class SenseVoiceInference {
|
|
|
1412
1412
|
private _backend;
|
|
1413
1413
|
private isLoading;
|
|
1414
1414
|
private inferenceQueue;
|
|
1415
|
+
private poisoned;
|
|
1416
|
+
private static readonly INFERENCE_TIMEOUT_MS;
|
|
1415
1417
|
private tokenMap;
|
|
1416
1418
|
private negMean;
|
|
1417
1419
|
private invStddev;
|
|
@@ -1433,267 +1435,297 @@ declare class SenseVoiceInference {
|
|
|
1433
1435
|
}
|
|
1434
1436
|
|
|
1435
1437
|
/**
|
|
1436
|
-
*
|
|
1437
|
-
*
|
|
1438
|
-
* Pure TypeScript implementation matching kaldi-native-fbank parameters
|
|
1439
|
-
* used by SenseVoice. No external dependencies.
|
|
1440
|
-
*
|
|
1441
|
-
* Pipeline: audio → framing → windowing → FFT → power spectrum → mel filterbank → log
|
|
1442
|
-
*
|
|
1443
|
-
* @module inference/kaldiFbank
|
|
1444
|
-
*/
|
|
1445
|
-
interface KaldiFbankOptions {
|
|
1446
|
-
/** Frame length in ms (default: 25) */
|
|
1447
|
-
frameLengthMs?: number;
|
|
1448
|
-
/** Frame shift in ms (default: 10) */
|
|
1449
|
-
frameShiftMs?: number;
|
|
1450
|
-
/** Low frequency cutoff in Hz (default: 20) */
|
|
1451
|
-
lowFreq?: number;
|
|
1452
|
-
/** High frequency cutoff in Hz (default: sampleRate / 2) */
|
|
1453
|
-
highFreq?: number;
|
|
1454
|
-
/** Dither amount (default: 0 for deterministic output) */
|
|
1455
|
-
dither?: number;
|
|
1456
|
-
/** Preemphasis coefficient (default: 0.97) */
|
|
1457
|
-
preemphasis?: number;
|
|
1458
|
-
}
|
|
1459
|
-
/**
|
|
1460
|
-
* Compute Kaldi-compatible log mel filterbank features
|
|
1438
|
+
* SenseVoice ASR Web Worker implementation
|
|
1461
1439
|
*
|
|
1462
|
-
*
|
|
1463
|
-
*
|
|
1464
|
-
*
|
|
1465
|
-
* @param opts Optional parameters
|
|
1466
|
-
* @returns Flattened Float32Array of shape [numFrames, numMelBins]
|
|
1467
|
-
*/
|
|
1468
|
-
declare function computeKaldiFbank(audio: Float32Array, sampleRate: number, numMelBins: number, opts?: KaldiFbankOptions): Float32Array;
|
|
1469
|
-
/**
|
|
1470
|
-
* Apply Low Frame Rate stacking for SenseVoice
|
|
1440
|
+
* Runs SenseVoice speech recognition in a dedicated Web Worker to prevent
|
|
1441
|
+
* main thread blocking. Uses inline worker script (Blob URL pattern) to
|
|
1442
|
+
* avoid separate file deployment.
|
|
1471
1443
|
*
|
|
1472
|
-
*
|
|
1473
|
-
*
|
|
1444
|
+
* Key design decisions:
|
|
1445
|
+
* - WASM backend only (WebGPU doesn't work in Workers)
|
|
1446
|
+
* - All preprocessing (fbank, LFR, CMVN) and CTC decoding inlined in worker
|
|
1447
|
+
* - Audio copied (not transferred) to retain main thread access
|
|
1448
|
+
* - ONNX Runtime loaded from CDN in worker (no bundler complications)
|
|
1449
|
+
* - iOS: model URL passed as string to ORT (avoids 239MB JS heap allocation)
|
|
1474
1450
|
*
|
|
1475
|
-
* @
|
|
1476
|
-
* @param featureDim Feature dimension per frame (e.g., 80)
|
|
1477
|
-
* @param lfrM Number of frames to stack (default: 7)
|
|
1478
|
-
* @param lfrN Stride (default: 6)
|
|
1479
|
-
* @returns Flattened [numOutputFrames, featureDim * lfrM]
|
|
1480
|
-
*/
|
|
1481
|
-
declare function applyLFR(features: Float32Array, featureDim: number, lfrM?: number, lfrN?: number): Float32Array;
|
|
1482
|
-
/**
|
|
1483
|
-
* Apply CMVN normalization in-place
|
|
1451
|
+
* @category Inference
|
|
1484
1452
|
*
|
|
1485
|
-
*
|
|
1453
|
+
* @example Basic usage
|
|
1454
|
+
* ```typescript
|
|
1455
|
+
* import { SenseVoiceWorker } from '@omote/core';
|
|
1486
1456
|
*
|
|
1487
|
-
*
|
|
1488
|
-
*
|
|
1489
|
-
*
|
|
1490
|
-
*
|
|
1491
|
-
*
|
|
1492
|
-
*/
|
|
1493
|
-
declare function applyCMVN(features: Float32Array, dim: number, negMean: Float32Array, invStddev: Float32Array): Float32Array;
|
|
1494
|
-
/**
|
|
1495
|
-
* Parse CMVN vectors from comma-separated strings (stored in ONNX metadata)
|
|
1457
|
+
* const asr = new SenseVoiceWorker({
|
|
1458
|
+
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
1459
|
+
* tokensUrl: '/models/sensevoice/tokens.txt',
|
|
1460
|
+
* });
|
|
1461
|
+
* await asr.load();
|
|
1496
1462
|
*
|
|
1497
|
-
*
|
|
1498
|
-
*
|
|
1463
|
+
* const { text, emotion, language } = await asr.transcribe(audioSamples);
|
|
1464
|
+
* console.log(text); // "Hello world"
|
|
1465
|
+
* console.log(emotion); // "NEUTRAL"
|
|
1466
|
+
* console.log(language); // "en"
|
|
1467
|
+
* ```
|
|
1499
1468
|
*/
|
|
1500
|
-
declare function parseCMVNFromMetadata(negMeanStr: string, invStddevStr: string): {
|
|
1501
|
-
negMean: Float32Array;
|
|
1502
|
-
invStddev: Float32Array;
|
|
1503
|
-
};
|
|
1504
1469
|
|
|
1505
1470
|
/**
|
|
1506
|
-
*
|
|
1507
|
-
*
|
|
1508
|
-
* Decodes CTC logits into text with structured token parsing
|
|
1509
|
-
* for language, emotion, and audio event detection.
|
|
1510
|
-
*
|
|
1511
|
-
* @module inference/ctcDecoder
|
|
1471
|
+
* Configuration for SenseVoice Worker
|
|
1512
1472
|
*/
|
|
1513
|
-
interface
|
|
1514
|
-
/**
|
|
1515
|
-
|
|
1516
|
-
/**
|
|
1517
|
-
|
|
1518
|
-
/**
|
|
1519
|
-
|
|
1520
|
-
/**
|
|
1521
|
-
|
|
1473
|
+
interface SenseVoiceWorkerConfig {
|
|
1474
|
+
/** Path or URL to model.int8.onnx (239MB) */
|
|
1475
|
+
modelUrl: string;
|
|
1476
|
+
/** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
|
|
1477
|
+
tokensUrl?: string;
|
|
1478
|
+
/** Language hint (default: 'auto' for auto-detection) */
|
|
1479
|
+
language?: SenseVoiceLanguage;
|
|
1480
|
+
/** Text normalization: 'with_itn' applies inverse text normalization (default: 'with_itn') */
|
|
1481
|
+
textNorm?: 'with_itn' | 'without_itn';
|
|
1522
1482
|
}
|
|
1523
|
-
/** Resolve language string to SenseVoice language ID */
|
|
1524
|
-
declare function resolveLanguageId(language: string): number;
|
|
1525
|
-
/** Resolve text norm string to SenseVoice text norm ID */
|
|
1526
|
-
declare function resolveTextNormId(textNorm: string): number;
|
|
1527
1483
|
/**
|
|
1528
|
-
*
|
|
1484
|
+
* SenseVoice ASR Worker - Speech Recognition in a Web Worker
|
|
1529
1485
|
*
|
|
1530
|
-
*
|
|
1531
|
-
*
|
|
1532
|
-
*/
|
|
1533
|
-
declare function parseTokensFile(content: string): Map<number, string>;
|
|
1534
|
-
/**
|
|
1535
|
-
* CTC greedy decode
|
|
1486
|
+
* Runs SenseVoice inference off the main thread to prevent UI blocking.
|
|
1487
|
+
* All preprocessing (fbank, LFR, CMVN) and CTC decoding run in the worker.
|
|
1536
1488
|
*
|
|
1537
|
-
* @
|
|
1538
|
-
* @param seqLen Sequence length (time steps)
|
|
1539
|
-
* @param vocabSize Vocabulary size
|
|
1540
|
-
* @param tokenMap Token ID → string map from tokens.txt
|
|
1541
|
-
* @returns Decoded text and structured metadata
|
|
1489
|
+
* @see SenseVoiceInference for main-thread version
|
|
1542
1490
|
*/
|
|
1543
|
-
declare
|
|
1491
|
+
declare class SenseVoiceWorker {
|
|
1492
|
+
private worker;
|
|
1493
|
+
private config;
|
|
1494
|
+
private isLoading;
|
|
1495
|
+
private _isLoaded;
|
|
1496
|
+
private inferenceQueue;
|
|
1497
|
+
private poisoned;
|
|
1498
|
+
private pendingResolvers;
|
|
1499
|
+
private languageId;
|
|
1500
|
+
private textNormId;
|
|
1501
|
+
constructor(config: SenseVoiceWorkerConfig);
|
|
1502
|
+
get isLoaded(): boolean;
|
|
1503
|
+
/**
|
|
1504
|
+
* Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
|
|
1505
|
+
*/
|
|
1506
|
+
get backend(): 'wasm' | null;
|
|
1507
|
+
/**
|
|
1508
|
+
* Create the worker from inline script
|
|
1509
|
+
*/
|
|
1510
|
+
private createWorker;
|
|
1511
|
+
/**
|
|
1512
|
+
* Handle messages from worker
|
|
1513
|
+
*/
|
|
1514
|
+
private handleWorkerMessage;
|
|
1515
|
+
/**
|
|
1516
|
+
* Send message to worker and wait for response
|
|
1517
|
+
*/
|
|
1518
|
+
private sendMessage;
|
|
1519
|
+
/**
|
|
1520
|
+
* Load the ONNX model in the worker
|
|
1521
|
+
*
|
|
1522
|
+
* @param onProgress - Optional progress callback. Fires once at 100% when load completes
|
|
1523
|
+
* (the worker downloads and loads the model internally, so granular progress is not available).
|
|
1524
|
+
*/
|
|
1525
|
+
load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
|
|
1526
|
+
/**
|
|
1527
|
+
* Transcribe audio samples to text
|
|
1528
|
+
*
|
|
1529
|
+
* @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
|
|
1530
|
+
* @returns Transcription result with text, emotion, language, and event
|
|
1531
|
+
*/
|
|
1532
|
+
transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
|
|
1533
|
+
/**
|
|
1534
|
+
* Queue inference to serialize worker calls
|
|
1535
|
+
*/
|
|
1536
|
+
private queueInference;
|
|
1537
|
+
/**
|
|
1538
|
+
* Dispose of the worker and free resources
|
|
1539
|
+
*/
|
|
1540
|
+
dispose(): Promise<void>;
|
|
1541
|
+
/**
|
|
1542
|
+
* Check if Web Workers are supported
|
|
1543
|
+
*/
|
|
1544
|
+
static isSupported(): boolean;
|
|
1545
|
+
}
|
|
1544
1546
|
|
|
1545
1547
|
/**
|
|
1546
|
-
*
|
|
1547
|
-
*
|
|
1548
|
-
* Contains LAM_BLENDSHAPES (canonical ordering), symmetrization, and
|
|
1549
|
-
* index remapping used by both Wav2Vec2Inference and Wav2ArkitCpuInference.
|
|
1550
|
-
*
|
|
1551
|
-
* This module is the single source of truth for blendshape ordering to
|
|
1552
|
-
* avoid circular dependencies between inference classes.
|
|
1553
|
-
*
|
|
1554
|
-
* @category Inference
|
|
1555
|
-
*/
|
|
1556
|
-
/**
|
|
1557
|
-
* LAM model blendshape names in order (52 total)
|
|
1558
|
-
* NOTE: This is alphabetical ordering used by LAM, different from standard ARKit order
|
|
1559
|
-
*/
|
|
1560
|
-
declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
|
|
1561
|
-
/** Alias for backwards compatibility */
|
|
1562
|
-
declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
|
|
1563
|
-
/**
|
|
1564
|
-
* Symmetrize blendshapes by averaging left/right pairs
|
|
1565
|
-
* From LAM official postprocessing (models/utils.py)
|
|
1566
|
-
* This fixes asymmetric output from the raw model
|
|
1567
|
-
*/
|
|
1568
|
-
declare function symmetrizeBlendshapes(frame: Float32Array): Float32Array;
|
|
1569
|
-
/**
|
|
1570
|
-
* wav2arkit_cpu model blendshape ordering
|
|
1571
|
-
*
|
|
1572
|
-
* Indices 0-24 match LAM_BLENDSHAPES, but 25+ diverge:
|
|
1573
|
-
* - LAM puts jawRight, mouthClose, mouthDimpleLeft, mouthDimpleRight at 25-28
|
|
1574
|
-
* - wav2arkit_cpu puts mouthFrownLeft at 25 and moves those four to 48-51
|
|
1575
|
-
*/
|
|
1576
|
-
declare const WAV2ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "jawRight"];
|
|
1577
|
-
/**
|
|
1578
|
-
* Remap a blendshape frame from wav2arkit_cpu ordering to LAM_BLENDSHAPES ordering
|
|
1548
|
+
* Silero VAD (Voice Activity Detection) inference
|
|
1579
1549
|
*
|
|
1580
|
-
*
|
|
1581
|
-
*
|
|
1582
|
-
*/
|
|
1583
|
-
declare function remapWav2ArkitToLam(frame: Float32Array): Float32Array;
|
|
1584
|
-
|
|
1585
|
-
/**
|
|
1586
|
-
* Unified Wav2Vec2 inference engine for Audio-to-Expression + ASR
|
|
1550
|
+
* Neural network-based VAD running in browser via ONNX Runtime Web.
|
|
1551
|
+
* Much more accurate than RMS-based energy detection.
|
|
1587
1552
|
*
|
|
1588
|
-
*
|
|
1589
|
-
*
|
|
1590
|
-
* -
|
|
1591
|
-
* - 32-token CTC logits (speech recognition)
|
|
1553
|
+
* Uses lazy loading to conditionally load WebGPU or WASM-only bundle:
|
|
1554
|
+
* - iOS: Loads WASM-only bundle (WebGPU crashes due to Safari bugs)
|
|
1555
|
+
* - Android/Desktop: Loads WebGPU bundle (with WASM fallback)
|
|
1592
1556
|
*
|
|
1593
1557
|
* @category Inference
|
|
1594
1558
|
*
|
|
1595
1559
|
* @example Basic usage
|
|
1596
1560
|
* ```typescript
|
|
1597
|
-
* import {
|
|
1561
|
+
* import { SileroVADInference } from '@omote/core';
|
|
1598
1562
|
*
|
|
1599
|
-
* const
|
|
1600
|
-
*
|
|
1563
|
+
* const vad = new SileroVADInference({
|
|
1564
|
+
* modelUrl: '/models/silero-vad.onnx'
|
|
1565
|
+
* });
|
|
1566
|
+
* await vad.load();
|
|
1601
1567
|
*
|
|
1602
|
-
* // Process
|
|
1603
|
-
* const
|
|
1568
|
+
* // Process 32ms chunks (512 samples at 16kHz)
|
|
1569
|
+
* const { probability } = await vad.process(audioChunk);
|
|
1570
|
+
* if (probability > 0.5) {
|
|
1571
|
+
* console.log('Speech detected!');
|
|
1572
|
+
* }
|
|
1573
|
+
* ```
|
|
1604
1574
|
*
|
|
1605
|
-
*
|
|
1606
|
-
*
|
|
1575
|
+
* @example Streaming with state management
|
|
1576
|
+
* ```typescript
|
|
1577
|
+
* // State is automatically maintained between process() calls
|
|
1578
|
+
* // Call reset() when starting a new audio stream
|
|
1579
|
+
* vad.reset();
|
|
1580
|
+
*
|
|
1581
|
+
* for (const chunk of audioChunks) {
|
|
1582
|
+
* const { probability: prob } = await vad.process(chunk);
|
|
1583
|
+
* // prob is speech probability [0, 1]
|
|
1584
|
+
* }
|
|
1607
1585
|
* ```
|
|
1608
1586
|
*/
|
|
1609
1587
|
|
|
1610
|
-
type
|
|
1611
|
-
|
|
1588
|
+
type VADBackend = BackendPreference;
|
|
1589
|
+
/**
|
|
1590
|
+
* Configuration for Silero VAD
|
|
1591
|
+
*/
|
|
1592
|
+
interface SileroVADConfig {
|
|
1612
1593
|
/** Path or URL to the ONNX model */
|
|
1613
1594
|
modelUrl: string;
|
|
1614
|
-
/**
|
|
1615
|
-
* Path or URL to external model data file (.onnx.data weights).
|
|
1616
|
-
* Default: `${modelUrl}.data` (e.g., /models/model.onnx.data)
|
|
1617
|
-
*
|
|
1618
|
-
* Set to `false` to skip external data loading (single-file models only).
|
|
1619
|
-
*/
|
|
1620
|
-
externalDataUrl?: string | false;
|
|
1621
1595
|
/** Preferred backend (auto will try WebGPU first, fallback to WASM) */
|
|
1622
|
-
backend?:
|
|
1623
|
-
/**
|
|
1624
|
-
|
|
1596
|
+
backend?: VADBackend;
|
|
1597
|
+
/** Sample rate (8000 or 16000, default: 16000) */
|
|
1598
|
+
sampleRate?: 8000 | 16000;
|
|
1599
|
+
/** Speech probability threshold (default: 0.5) */
|
|
1600
|
+
threshold?: number;
|
|
1601
|
+
/**
|
|
1602
|
+
* Number of audio chunks to keep in pre-speech buffer.
|
|
1603
|
+
* When VAD triggers, these chunks are prepended to the speech buffer
|
|
1604
|
+
* to capture the beginning of speech that occurred before detection.
|
|
1605
|
+
*
|
|
1606
|
+
* At 512 samples/chunk and 16kHz:
|
|
1607
|
+
* - 10 chunks = 320ms of pre-speech audio
|
|
1608
|
+
* - 15 chunks = 480ms of pre-speech audio
|
|
1609
|
+
*
|
|
1610
|
+
* Default: 10 chunks (320ms)
|
|
1611
|
+
*/
|
|
1612
|
+
preSpeechBufferChunks?: number;
|
|
1625
1613
|
}
|
|
1626
|
-
|
|
1614
|
+
/**
|
|
1615
|
+
* VAD model loading information
|
|
1616
|
+
*/
|
|
1617
|
+
interface VADModelInfo {
|
|
1627
1618
|
backend: 'webgpu' | 'wasm';
|
|
1628
1619
|
loadTimeMs: number;
|
|
1629
1620
|
inputNames: string[];
|
|
1630
1621
|
outputNames: string[];
|
|
1622
|
+
sampleRate: number;
|
|
1623
|
+
chunkSize: number;
|
|
1631
1624
|
}
|
|
1632
|
-
|
|
1633
|
-
|
|
1634
|
-
|
|
1635
|
-
interface
|
|
1636
|
-
/**
|
|
1637
|
-
|
|
1638
|
-
/**
|
|
1639
|
-
|
|
1640
|
-
/**
|
|
1641
|
-
text: string;
|
|
1642
|
-
/** Number of blendshape frames (30fps) — alias for numA2EFrames */
|
|
1643
|
-
numFrames: number;
|
|
1644
|
-
/** Number of A2E frames (30fps) */
|
|
1645
|
-
numA2EFrames: number;
|
|
1646
|
-
/** Number of ASR frames (50fps) */
|
|
1647
|
-
numASRFrames: number;
|
|
1648
|
-
/** Inference time in ms */
|
|
1625
|
+
/**
|
|
1626
|
+
* Result from a single VAD inference
|
|
1627
|
+
*/
|
|
1628
|
+
interface VADResult {
|
|
1629
|
+
/** Speech probability (0-1) */
|
|
1630
|
+
probability: number;
|
|
1631
|
+
/** Whether speech is detected (probability > threshold) */
|
|
1632
|
+
isSpeech: boolean;
|
|
1633
|
+
/** Inference time in milliseconds */
|
|
1649
1634
|
inferenceTimeMs: number;
|
|
1635
|
+
/**
|
|
1636
|
+
* Pre-speech audio chunks (only present on first speech detection).
|
|
1637
|
+
* These are the N chunks immediately before VAD triggered, useful for
|
|
1638
|
+
* capturing the beginning of speech that occurred before detection.
|
|
1639
|
+
*
|
|
1640
|
+
* Only populated when transitioning from silence to speech.
|
|
1641
|
+
*/
|
|
1642
|
+
preSpeechChunks?: Float32Array[];
|
|
1650
1643
|
}
|
|
1651
|
-
|
|
1652
|
-
|
|
1644
|
+
/**
|
|
1645
|
+
* Speech segment detected by VAD
|
|
1646
|
+
*/
|
|
1647
|
+
interface SpeechSegment {
|
|
1648
|
+
/** Start time in seconds */
|
|
1649
|
+
start: number;
|
|
1650
|
+
/** End time in seconds */
|
|
1651
|
+
end: number;
|
|
1652
|
+
/** Average probability during segment */
|
|
1653
|
+
avgProbability: number;
|
|
1654
|
+
}
|
|
1655
|
+
/**
|
|
1656
|
+
* Silero VAD - Neural network voice activity detection
|
|
1657
|
+
*
|
|
1658
|
+
* Based on snakers4/silero-vad ONNX model.
|
|
1659
|
+
* Processes 32ms chunks (512 samples at 16kHz) with LSTM state.
|
|
1660
|
+
*
|
|
1661
|
+
* @see https://github.com/snakers4/silero-vad
|
|
1662
|
+
*/
|
|
1663
|
+
declare class SileroVADInference {
|
|
1653
1664
|
private session;
|
|
1654
1665
|
private ort;
|
|
1655
1666
|
private config;
|
|
1656
1667
|
private _backend;
|
|
1657
1668
|
private isLoading;
|
|
1658
|
-
private
|
|
1669
|
+
private state;
|
|
1670
|
+
private context;
|
|
1671
|
+
private readonly chunkSize;
|
|
1672
|
+
private readonly contextSize;
|
|
1659
1673
|
private inferenceQueue;
|
|
1660
|
-
private
|
|
1661
|
-
private
|
|
1662
|
-
|
|
1674
|
+
private preSpeechBuffer;
|
|
1675
|
+
private wasSpeaking;
|
|
1676
|
+
private srTensor;
|
|
1677
|
+
constructor(config: SileroVADConfig);
|
|
1678
|
+
get backend(): RuntimeBackend | null;
|
|
1679
|
+
get isLoaded(): boolean;
|
|
1680
|
+
get sampleRate(): number;
|
|
1681
|
+
get threshold(): number;
|
|
1682
|
+
/**
|
|
1683
|
+
* Get required chunk size in samples
|
|
1684
|
+
*/
|
|
1685
|
+
getChunkSize(): number;
|
|
1686
|
+
/**
|
|
1687
|
+
* Get chunk duration in milliseconds
|
|
1688
|
+
*/
|
|
1689
|
+
getChunkDurationMs(): number;
|
|
1663
1690
|
/**
|
|
1664
1691
|
* Check if WebGPU is available and working
|
|
1665
1692
|
* (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
|
|
1666
1693
|
*/
|
|
1667
1694
|
static isWebGPUAvailable: typeof isWebGPUAvailable;
|
|
1668
|
-
get backend(): 'webgpu' | 'wasm' | null;
|
|
1669
|
-
get isLoaded(): boolean;
|
|
1670
|
-
/** True if inference timed out and the session is permanently unusable */
|
|
1671
|
-
get isSessionPoisoned(): boolean;
|
|
1672
1695
|
/**
|
|
1673
1696
|
* Load the ONNX model
|
|
1674
1697
|
*/
|
|
1675
|
-
load(): Promise<
|
|
1698
|
+
load(): Promise<VADModelInfo>;
|
|
1676
1699
|
/**
|
|
1677
|
-
*
|
|
1678
|
-
|
|
1679
|
-
|
|
1700
|
+
* Reset state for new audio stream
|
|
1701
|
+
*/
|
|
1702
|
+
reset(): void;
|
|
1703
|
+
/**
|
|
1704
|
+
* Process a single audio chunk
|
|
1680
1705
|
*
|
|
1681
|
-
*
|
|
1682
|
-
*
|
|
1706
|
+
* @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
|
|
1707
|
+
* @returns VAD result with speech probability
|
|
1683
1708
|
*/
|
|
1684
|
-
|
|
1709
|
+
process(audioChunk: Float32Array): Promise<VADResult>;
|
|
1685
1710
|
/**
|
|
1686
|
-
*
|
|
1711
|
+
* Process audio and detect speech segments
|
|
1712
|
+
*
|
|
1713
|
+
* @param audio - Complete audio buffer
|
|
1714
|
+
* @param options - Detection options
|
|
1715
|
+
* @returns Array of speech segments
|
|
1687
1716
|
*/
|
|
1688
|
-
|
|
1717
|
+
detectSpeech(audio: Float32Array, options?: {
|
|
1718
|
+
/** Minimum speech duration in ms (default: 250) */
|
|
1719
|
+
minSpeechDurationMs?: number;
|
|
1720
|
+
/** Minimum silence duration to end segment in ms (default: 300) */
|
|
1721
|
+
minSilenceDurationMs?: number;
|
|
1722
|
+
/** Padding to add before/after speech in ms (default: 30) */
|
|
1723
|
+
speechPadMs?: number;
|
|
1724
|
+
}): Promise<SpeechSegment[]>;
|
|
1689
1725
|
/**
|
|
1690
1726
|
* Queue inference to serialize ONNX session calls
|
|
1691
1727
|
*/
|
|
1692
1728
|
private queueInference;
|
|
1693
|
-
/**
|
|
1694
|
-
* Get blendshape value by name for a specific frame
|
|
1695
|
-
*/
|
|
1696
|
-
getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
|
|
1697
1729
|
/**
|
|
1698
1730
|
* Dispose of the model and free resources
|
|
1699
1731
|
*/
|
|
@@ -1701,356 +1733,927 @@ declare class Wav2Vec2Inference implements LipSyncBackend {
|
|
|
1701
1733
|
}
|
|
1702
1734
|
|
|
1703
1735
|
/**
|
|
1704
|
-
*
|
|
1705
|
-
*
|
|
1706
|
-
* A Safari/iOS-compatible alternative to Wav2Vec2Inference (384MB) designed
|
|
1707
|
-
* for platforms where WebGPU crashes due to ONNX Runtime JSEP bugs.
|
|
1736
|
+
* Silero VAD Web Worker implementation
|
|
1708
1737
|
*
|
|
1709
|
-
*
|
|
1710
|
-
*
|
|
1711
|
-
* - wav2arkit_cpu.onnx.data (402MB weights)
|
|
1712
|
-
* Both files are fetched and cached automatically.
|
|
1738
|
+
* Runs Silero VAD inference in a dedicated Web Worker to prevent main thread blocking.
|
|
1739
|
+
* Uses inline worker script (Blob URL pattern) to avoid separate file deployment.
|
|
1713
1740
|
*
|
|
1714
|
-
* Key
|
|
1715
|
-
* - WASM
|
|
1716
|
-
* -
|
|
1717
|
-
* -
|
|
1718
|
-
* -
|
|
1719
|
-
* - Different native blendshape ordering (remapped to LAM_BLENDSHAPES)
|
|
1741
|
+
* Key design decisions:
|
|
1742
|
+
* - WASM backend only (WebGPU doesn't work in Workers)
|
|
1743
|
+
* - LSTM state serialized as Float32Array (Tensors can't cross worker boundary)
|
|
1744
|
+
* - Audio copied (not transferred) to retain main thread access for pre-speech buffer
|
|
1745
|
+
* - ONNX Runtime loaded from CDN in worker (no bundler complications)
|
|
1720
1746
|
*
|
|
1721
1747
|
* @category Inference
|
|
1722
1748
|
*
|
|
1723
|
-
* @example
|
|
1749
|
+
* @example Basic usage
|
|
1724
1750
|
* ```typescript
|
|
1725
|
-
* import {
|
|
1751
|
+
* import { SileroVADWorker } from '@omote/core';
|
|
1726
1752
|
*
|
|
1727
|
-
* const
|
|
1728
|
-
* modelUrl: '/models/
|
|
1753
|
+
* const vad = new SileroVADWorker({
|
|
1754
|
+
* modelUrl: '/models/silero-vad.onnx'
|
|
1729
1755
|
* });
|
|
1730
|
-
* await
|
|
1756
|
+
* await vad.load();
|
|
1731
1757
|
*
|
|
1732
|
-
*
|
|
1733
|
-
*
|
|
1758
|
+
* // Process 32ms chunks (512 samples at 16kHz)
|
|
1759
|
+
* const result = await vad.process(audioChunk);
|
|
1760
|
+
* if (result.isSpeech) {
|
|
1761
|
+
* console.log('Speech detected!', result.probability);
|
|
1762
|
+
* }
|
|
1734
1763
|
* ```
|
|
1735
1764
|
*/
|
|
1736
1765
|
|
|
1737
|
-
|
|
1738
|
-
|
|
1766
|
+
/**
|
|
1767
|
+
* Configuration for Silero VAD Worker
|
|
1768
|
+
*/
|
|
1769
|
+
interface VADWorkerConfig {
|
|
1770
|
+
/** Path or URL to the ONNX model */
|
|
1739
1771
|
modelUrl: string;
|
|
1772
|
+
/** Sample rate (8000 or 16000, default: 16000) */
|
|
1773
|
+
sampleRate?: 8000 | 16000;
|
|
1774
|
+
/** Speech probability threshold (default: 0.5) */
|
|
1775
|
+
threshold?: number;
|
|
1740
1776
|
/**
|
|
1741
|
-
*
|
|
1742
|
-
*
|
|
1777
|
+
* Number of audio chunks to keep in pre-speech buffer.
|
|
1778
|
+
* When VAD triggers, these chunks are prepended to the speech buffer
|
|
1779
|
+
* to capture the beginning of speech that occurred before detection.
|
|
1743
1780
|
*
|
|
1744
|
-
*
|
|
1781
|
+
* At 512 samples/chunk and 16kHz:
|
|
1782
|
+
* - 10 chunks = 320ms of pre-speech audio
|
|
1783
|
+
* - 15 chunks = 480ms of pre-speech audio
|
|
1784
|
+
*
|
|
1785
|
+
* Default: 10 chunks (320ms)
|
|
1745
1786
|
*/
|
|
1746
|
-
|
|
1747
|
-
/** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
|
|
1748
|
-
backend?: BackendPreference;
|
|
1787
|
+
preSpeechBufferChunks?: number;
|
|
1749
1788
|
}
|
|
1750
|
-
|
|
1751
|
-
|
|
1752
|
-
|
|
1753
|
-
|
|
1789
|
+
/**
|
|
1790
|
+
* VAD model loading information from worker
|
|
1791
|
+
*/
|
|
1792
|
+
interface VADWorkerModelInfo {
|
|
1793
|
+
backend: 'wasm';
|
|
1794
|
+
loadTimeMs: number;
|
|
1795
|
+
inputNames: string[];
|
|
1796
|
+
outputNames: string[];
|
|
1797
|
+
sampleRate: number;
|
|
1798
|
+
chunkSize: number;
|
|
1799
|
+
}
|
|
1800
|
+
|
|
1801
|
+
/**
|
|
1802
|
+
* Silero VAD Worker - Voice Activity Detection in a Web Worker
|
|
1803
|
+
*
|
|
1804
|
+
* Runs Silero VAD inference off the main thread to prevent UI blocking.
|
|
1805
|
+
* Feature parity with SileroVADInference but runs in dedicated worker.
|
|
1806
|
+
*
|
|
1807
|
+
* @see SileroVADInference for main-thread version
|
|
1808
|
+
*/
|
|
1809
|
+
declare class SileroVADWorker {
|
|
1810
|
+
private worker;
|
|
1754
1811
|
private config;
|
|
1755
|
-
private _backend;
|
|
1756
1812
|
private isLoading;
|
|
1813
|
+
private _isLoaded;
|
|
1814
|
+
private state;
|
|
1815
|
+
private context;
|
|
1816
|
+
private readonly chunkSize;
|
|
1817
|
+
private readonly contextSize;
|
|
1757
1818
|
private inferenceQueue;
|
|
1758
|
-
|
|
1759
|
-
|
|
1819
|
+
private preSpeechBuffer;
|
|
1820
|
+
private wasSpeaking;
|
|
1821
|
+
private pendingResolvers;
|
|
1822
|
+
private messageId;
|
|
1823
|
+
constructor(config: VADWorkerConfig);
|
|
1760
1824
|
get isLoaded(): boolean;
|
|
1761
1825
|
/**
|
|
1762
|
-
*
|
|
1826
|
+
* Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
|
|
1763
1827
|
*/
|
|
1764
|
-
|
|
1828
|
+
get backend(): 'wasm' | null;
|
|
1829
|
+
get sampleRate(): number;
|
|
1830
|
+
get threshold(): number;
|
|
1765
1831
|
/**
|
|
1766
|
-
*
|
|
1767
|
-
*
|
|
1768
|
-
* Accepts variable-length audio (not fixed to 16000 samples).
|
|
1769
|
-
* Output frames = ceil(30 * numSamples / 16000).
|
|
1770
|
-
*
|
|
1771
|
-
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
1772
|
-
* @param _identityIndex - Ignored (identity 11 is baked into the model)
|
|
1832
|
+
* Get required chunk size in samples
|
|
1773
1833
|
*/
|
|
1774
|
-
|
|
1834
|
+
getChunkSize(): number;
|
|
1775
1835
|
/**
|
|
1776
|
-
*
|
|
1836
|
+
* Get chunk duration in milliseconds
|
|
1837
|
+
*/
|
|
1838
|
+
getChunkDurationMs(): number;
|
|
1839
|
+
/**
|
|
1840
|
+
* Create the worker from inline script
|
|
1841
|
+
*/
|
|
1842
|
+
private createWorker;
|
|
1843
|
+
/**
|
|
1844
|
+
* Handle messages from worker
|
|
1845
|
+
*/
|
|
1846
|
+
private handleWorkerMessage;
|
|
1847
|
+
/**
|
|
1848
|
+
* Send message to worker and wait for response
|
|
1849
|
+
*/
|
|
1850
|
+
private sendMessage;
|
|
1851
|
+
/**
|
|
1852
|
+
* Load the ONNX model in the worker
|
|
1853
|
+
*/
|
|
1854
|
+
load(): Promise<VADWorkerModelInfo>;
|
|
1855
|
+
/**
|
|
1856
|
+
* Reset state for new audio stream
|
|
1857
|
+
*/
|
|
1858
|
+
reset(): Promise<void>;
|
|
1859
|
+
/**
|
|
1860
|
+
* Process a single audio chunk
|
|
1861
|
+
*
|
|
1862
|
+
* @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
|
|
1863
|
+
* @returns VAD result with speech probability
|
|
1864
|
+
*/
|
|
1865
|
+
process(audioChunk: Float32Array): Promise<VADResult>;
|
|
1866
|
+
/**
|
|
1867
|
+
* Queue inference to serialize worker calls
|
|
1777
1868
|
*/
|
|
1778
1869
|
private queueInference;
|
|
1779
1870
|
/**
|
|
1780
|
-
* Dispose of the
|
|
1871
|
+
* Dispose of the worker and free resources
|
|
1781
1872
|
*/
|
|
1782
1873
|
dispose(): Promise<void>;
|
|
1874
|
+
/**
|
|
1875
|
+
* Check if Web Workers are supported
|
|
1876
|
+
*/
|
|
1877
|
+
static isSupported(): boolean;
|
|
1783
1878
|
}
|
|
1784
1879
|
|
|
1785
1880
|
/**
|
|
1786
|
-
* Factory function for
|
|
1787
|
-
*
|
|
1788
|
-
* Provides a unified API that automatically selects the optimal model:
|
|
1789
|
-
* - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (404MB, WASM)
|
|
1790
|
-
* - Chrome/Firefox/Edge: Uses Wav2Vec2Inference (384MB, WebGPU)
|
|
1791
|
-
* - Fallback: Gracefully falls back to CPU model if GPU model fails to load
|
|
1881
|
+
* Factory function for Silero VAD with automatic Worker vs main thread selection
|
|
1792
1882
|
*
|
|
1793
|
-
*
|
|
1794
|
-
*
|
|
1795
|
-
*
|
|
1796
|
-
*
|
|
1797
|
-
* 2. It ships as a single 384MB .onnx file that must load into JS heap before
|
|
1798
|
-
* ORT can consume it. iOS WebKit OOMs on this allocation.
|
|
1799
|
-
* wav2arkit_cpu solves both: external data format (1.86MB graph + 402MB weights)
|
|
1800
|
-
* lets ORT load only the tiny graph, then stream weights via URL pass-through
|
|
1801
|
-
* directly into WASM memory. JS heap stays at ~2MB.
|
|
1883
|
+
* Provides a unified API that automatically selects the optimal implementation:
|
|
1884
|
+
* - Desktop browsers: Uses SileroVADWorker (off-main-thread inference)
|
|
1885
|
+
* - Mobile devices: Uses SileroVADInference (main thread, avoids memory overhead)
|
|
1886
|
+
* - Fallback: Gracefully falls back to main thread if Worker fails
|
|
1802
1887
|
*
|
|
1803
1888
|
* @category Inference
|
|
1804
1889
|
*
|
|
1805
|
-
* @example
|
|
1890
|
+
* @example Basic usage (auto-detect)
|
|
1806
1891
|
* ```typescript
|
|
1807
|
-
* import {
|
|
1892
|
+
* import { createSileroVAD } from '@omote/core';
|
|
1808
1893
|
*
|
|
1809
|
-
* const
|
|
1810
|
-
*
|
|
1811
|
-
*
|
|
1894
|
+
* const vad = createSileroVAD({
|
|
1895
|
+
* modelUrl: '/models/silero-vad.onnx',
|
|
1896
|
+
* threshold: 0.5,
|
|
1812
1897
|
* });
|
|
1813
1898
|
*
|
|
1814
|
-
* await
|
|
1815
|
-
* const
|
|
1899
|
+
* await vad.load();
|
|
1900
|
+
* const result = await vad.process(audioChunk);
|
|
1901
|
+
* if (result.isSpeech) {
|
|
1902
|
+
* console.log('Speech detected!', result.probability);
|
|
1903
|
+
* }
|
|
1816
1904
|
* ```
|
|
1817
1905
|
*
|
|
1818
|
-
* @example Force
|
|
1906
|
+
* @example Force worker usage
|
|
1819
1907
|
* ```typescript
|
|
1820
|
-
* const
|
|
1821
|
-
*
|
|
1822
|
-
*
|
|
1823
|
-
*
|
|
1908
|
+
* const vad = createSileroVAD({
|
|
1909
|
+
* modelUrl: '/models/silero-vad.onnx',
|
|
1910
|
+
* useWorker: true, // Force Worker even on mobile
|
|
1911
|
+
* });
|
|
1912
|
+
* ```
|
|
1913
|
+
*
|
|
1914
|
+
* @example Force main thread
|
|
1915
|
+
* ```typescript
|
|
1916
|
+
* const vad = createSileroVAD({
|
|
1917
|
+
* modelUrl: '/models/silero-vad.onnx',
|
|
1918
|
+
* useWorker: false, // Force main thread
|
|
1824
1919
|
* });
|
|
1825
1920
|
* ```
|
|
1826
1921
|
*/
|
|
1827
1922
|
|
|
1828
1923
|
/**
|
|
1829
|
-
*
|
|
1924
|
+
* Common interface for both SileroVADInference and SileroVADWorker
|
|
1925
|
+
*
|
|
1926
|
+
* This interface defines the shared API that both implementations provide,
|
|
1927
|
+
* allowing consumers to use either interchangeably.
|
|
1830
1928
|
*/
|
|
1831
|
-
interface
|
|
1832
|
-
/**
|
|
1833
|
-
|
|
1929
|
+
interface SileroVADBackend {
|
|
1930
|
+
/** Current backend type (webgpu, wasm, or null if not loaded) */
|
|
1931
|
+
readonly backend: RuntimeBackend | null;
|
|
1932
|
+
/** Whether the model is loaded and ready for inference */
|
|
1933
|
+
readonly isLoaded: boolean;
|
|
1934
|
+
/** Audio sample rate (8000 or 16000 Hz) */
|
|
1935
|
+
readonly sampleRate: number;
|
|
1936
|
+
/** Speech detection threshold (0-1) */
|
|
1937
|
+
readonly threshold: number;
|
|
1834
1938
|
/**
|
|
1835
|
-
*
|
|
1836
|
-
*
|
|
1837
|
-
*
|
|
1838
|
-
* Set to `false` to skip external data loading (single-file models only).
|
|
1939
|
+
* Load the ONNX model
|
|
1940
|
+
* @returns Model loading information
|
|
1839
1941
|
*/
|
|
1840
|
-
|
|
1841
|
-
/** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS) */
|
|
1842
|
-
cpuModelUrl: string;
|
|
1942
|
+
load(): Promise<VADModelInfo | VADWorkerModelInfo>;
|
|
1843
1943
|
/**
|
|
1844
|
-
*
|
|
1845
|
-
*
|
|
1846
|
-
*
|
|
1847
|
-
* - 'cpu': Force CPU model (Wav2ArkitCpuInference)
|
|
1944
|
+
* Process a single audio chunk
|
|
1945
|
+
* @param audioChunk - Float32Array of exactly chunkSize samples
|
|
1946
|
+
* @returns VAD result with speech probability
|
|
1848
1947
|
*/
|
|
1849
|
-
|
|
1850
|
-
/** Backend preference for GPU model (default: 'auto') */
|
|
1851
|
-
gpuBackend?: BackendPreference;
|
|
1852
|
-
/** Number of identity classes for GPU model (default: 12) */
|
|
1853
|
-
numIdentityClasses?: number;
|
|
1948
|
+
process(audioChunk: Float32Array): Promise<VADResult>;
|
|
1854
1949
|
/**
|
|
1855
|
-
*
|
|
1856
|
-
|
|
1950
|
+
* Reset state for new audio stream
|
|
1951
|
+
*/
|
|
1952
|
+
reset(): void | Promise<void>;
|
|
1953
|
+
/**
|
|
1954
|
+
* Dispose of the model and free resources
|
|
1955
|
+
*/
|
|
1956
|
+
dispose(): Promise<void>;
|
|
1957
|
+
/**
|
|
1958
|
+
* Get required chunk size in samples
|
|
1959
|
+
*/
|
|
1960
|
+
getChunkSize(): number;
|
|
1961
|
+
/**
|
|
1962
|
+
* Get chunk duration in milliseconds
|
|
1963
|
+
*/
|
|
1964
|
+
getChunkDurationMs(): number;
|
|
1965
|
+
}
|
|
1966
|
+
/**
|
|
1967
|
+
* Configuration for the Silero VAD factory
|
|
1968
|
+
*
|
|
1969
|
+
* Extends SileroVADConfig with worker-specific options.
|
|
1970
|
+
*/
|
|
1971
|
+
interface SileroVADFactoryConfig extends SileroVADConfig {
|
|
1972
|
+
/**
|
|
1973
|
+
* Force worker usage (true), main thread (false), or auto-detect (undefined).
|
|
1974
|
+
*
|
|
1975
|
+
* Auto-detection behavior:
|
|
1976
|
+
* - Desktop: Uses Worker (better responsiveness, off-main-thread)
|
|
1977
|
+
* - Mobile: Uses main thread (avoids 5MB memory overhead)
|
|
1978
|
+
*
|
|
1979
|
+
* You can override this to:
|
|
1980
|
+
* - `true`: Force Worker even on mobile (if you have memory headroom)
|
|
1981
|
+
* - `false`: Force main thread even on desktop (for debugging)
|
|
1982
|
+
*
|
|
1983
|
+
* Default: undefined (auto-detect)
|
|
1984
|
+
*/
|
|
1985
|
+
useWorker?: boolean;
|
|
1986
|
+
/**
|
|
1987
|
+
* Fallback to main thread on worker errors.
|
|
1988
|
+
*
|
|
1989
|
+
* When true (default), if the Worker fails to load or encounters an error,
|
|
1990
|
+
* the factory will automatically create a main thread instance instead.
|
|
1991
|
+
*
|
|
1992
|
+
* When false, worker errors will propagate as exceptions.
|
|
1993
|
+
*
|
|
1994
|
+
* Default: true
|
|
1857
1995
|
*/
|
|
1858
1996
|
fallbackOnError?: boolean;
|
|
1997
|
+
/**
|
|
1998
|
+
* Unified inference worker instance.
|
|
1999
|
+
* When provided, uses SileroVADUnifiedAdapter (shared single-ORT worker).
|
|
2000
|
+
* Takes precedence over useWorker setting.
|
|
2001
|
+
*/
|
|
2002
|
+
unifiedWorker?: UnifiedInferenceWorker;
|
|
1859
2003
|
}
|
|
1860
2004
|
/**
|
|
1861
|
-
*
|
|
2005
|
+
* Check if the current environment supports VAD Web Workers
|
|
2006
|
+
*
|
|
2007
|
+
* Requirements:
|
|
2008
|
+
* - Worker constructor must exist
|
|
2009
|
+
* - Blob URL support (for inline worker script)
|
|
2010
|
+
*
|
|
2011
|
+
* @returns true if VAD Worker is supported
|
|
2012
|
+
*/
|
|
2013
|
+
declare function supportsVADWorker(): boolean;
|
|
2014
|
+
/**
|
|
2015
|
+
* Create a Silero VAD instance with automatic implementation selection
|
|
2016
|
+
*
|
|
2017
|
+
* This factory function automatically selects between:
|
|
2018
|
+
* - **SileroVADWorker**: Off-main-thread inference (better for desktop)
|
|
2019
|
+
* - **SileroVADInference**: Main thread inference (better for mobile)
|
|
2020
|
+
*
|
|
2021
|
+
* The selection is based on:
|
|
2022
|
+
* 1. Explicit `useWorker` config (if provided)
|
|
2023
|
+
* 2. Platform detection (mobile vs desktop)
|
|
2024
|
+
* 3. Worker API availability
|
|
2025
|
+
*
|
|
2026
|
+
* Both implementations share the same interface (SileroVADBackend),
|
|
2027
|
+
* so consumers can use either interchangeably.
|
|
1862
2028
|
*
|
|
1863
2029
|
* @param config - Factory configuration
|
|
1864
|
-
* @returns A
|
|
2030
|
+
* @returns A SileroVAD instance (either Worker or main thread)
|
|
2031
|
+
*
|
|
2032
|
+
* @example
|
|
2033
|
+
* ```typescript
|
|
2034
|
+
* // Auto-detect (recommended)
|
|
2035
|
+
* const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
|
|
2036
|
+
*
|
|
2037
|
+
* // Force Worker
|
|
2038
|
+
* const vadWorker = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: true });
|
|
2039
|
+
*
|
|
2040
|
+
* // Force main thread
|
|
2041
|
+
* const vadMain = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: false });
|
|
2042
|
+
* ```
|
|
1865
2043
|
*/
|
|
1866
|
-
declare function
|
|
2044
|
+
declare function createSileroVAD(config: SileroVADFactoryConfig): SileroVADBackend;
|
|
1867
2045
|
|
|
1868
2046
|
/**
|
|
1869
|
-
*
|
|
2047
|
+
* Web Worker-based wav2arkit_cpu lip sync inference
|
|
1870
2048
|
*
|
|
1871
|
-
*
|
|
1872
|
-
*
|
|
2049
|
+
* Runs wav2arkit_cpu inference in a dedicated Web Worker to prevent main thread blocking.
|
|
2050
|
+
* Uses inline worker script (Blob URL pattern) to avoid separate file deployment.
|
|
1873
2051
|
*
|
|
1874
|
-
*
|
|
1875
|
-
* -
|
|
1876
|
-
* -
|
|
2052
|
+
* Key design decisions:
|
|
2053
|
+
* - WASM backend only (WebGPU doesn't work in Workers)
|
|
2054
|
+
* - Audio copied (not transferred) to retain main thread access
|
|
2055
|
+
* - ONNX Runtime loaded from CDN in worker (no bundler complications)
|
|
2056
|
+
* - Blendshape symmetrization inlined in worker (no module imports)
|
|
2057
|
+
* - iOS: passes model URLs as strings directly to ORT (avoids 400MB+ JS heap)
|
|
1877
2058
|
*
|
|
1878
2059
|
* @category Inference
|
|
1879
2060
|
*
|
|
1880
|
-
* @example
|
|
2061
|
+
* @example
|
|
1881
2062
|
* ```typescript
|
|
1882
|
-
* import {
|
|
2063
|
+
* import { Wav2ArkitCpuWorker } from '@omote/core';
|
|
1883
2064
|
*
|
|
1884
|
-
* const
|
|
1885
|
-
* modelUrl: '/models/
|
|
2065
|
+
* const lam = new Wav2ArkitCpuWorker({
|
|
2066
|
+
* modelUrl: '/models/wav2arkit_cpu.onnx',
|
|
1886
2067
|
* });
|
|
1887
|
-
* await
|
|
2068
|
+
* await lam.load();
|
|
1888
2069
|
*
|
|
1889
|
-
*
|
|
1890
|
-
*
|
|
1891
|
-
* if (probability > 0.5) {
|
|
1892
|
-
* console.log('Speech detected!');
|
|
1893
|
-
* }
|
|
2070
|
+
* const { blendshapes } = await lam.infer(audioSamples);
|
|
2071
|
+
* // blendshapes: Float32Array[] in LAM_BLENDSHAPES order, 30fps
|
|
1894
2072
|
* ```
|
|
1895
|
-
|
|
1896
|
-
|
|
1897
|
-
|
|
1898
|
-
*
|
|
1899
|
-
|
|
1900
|
-
|
|
1901
|
-
|
|
1902
|
-
|
|
1903
|
-
|
|
1904
|
-
|
|
1905
|
-
|
|
2073
|
+
*/
|
|
2074
|
+
|
|
2075
|
+
/**
|
|
2076
|
+
* Configuration for Wav2ArkitCpu Worker
|
|
2077
|
+
*/
|
|
2078
|
+
interface Wav2ArkitCpuWorkerConfig {
|
|
2079
|
+
/** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
|
|
2080
|
+
modelUrl: string;
|
|
2081
|
+
/**
|
|
2082
|
+
* Path or URL to external model data file (.onnx.data weights).
|
|
2083
|
+
* Default: `${modelUrl}.data` (e.g., /models/wav2arkit_cpu.onnx.data)
|
|
2084
|
+
*
|
|
2085
|
+
* Set to `false` to skip external data loading (single-file models only).
|
|
2086
|
+
*/
|
|
2087
|
+
externalDataUrl?: string | false;
|
|
2088
|
+
}
|
|
2089
|
+
/**
|
|
2090
|
+
* Wav2ArkitCpu Worker - Lip sync inference in a Web Worker
|
|
2091
|
+
*
|
|
2092
|
+
* Runs wav2arkit_cpu inference off the main thread to prevent UI blocking.
|
|
2093
|
+
* Feature parity with Wav2ArkitCpuInference but runs in dedicated worker.
|
|
2094
|
+
*
|
|
2095
|
+
* @see Wav2ArkitCpuInference for main-thread version
|
|
2096
|
+
*/
|
|
2097
|
+
declare class Wav2ArkitCpuWorker implements LipSyncBackend {
|
|
2098
|
+
readonly modelId: "wav2arkit_cpu";
|
|
2099
|
+
private worker;
|
|
2100
|
+
private config;
|
|
2101
|
+
private isLoading;
|
|
2102
|
+
private _isLoaded;
|
|
2103
|
+
private inferenceQueue;
|
|
2104
|
+
private poisoned;
|
|
2105
|
+
private pendingResolvers;
|
|
2106
|
+
constructor(config: Wav2ArkitCpuWorkerConfig);
|
|
2107
|
+
get isLoaded(): boolean;
|
|
2108
|
+
/**
|
|
2109
|
+
* Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
|
|
2110
|
+
*/
|
|
2111
|
+
get backend(): 'wasm' | null;
|
|
2112
|
+
/**
|
|
2113
|
+
* Create the worker from inline script
|
|
2114
|
+
*/
|
|
2115
|
+
private createWorker;
|
|
2116
|
+
/**
|
|
2117
|
+
* Handle messages from worker
|
|
2118
|
+
*/
|
|
2119
|
+
private handleWorkerMessage;
|
|
2120
|
+
/**
|
|
2121
|
+
* Send message to worker and wait for response
|
|
2122
|
+
*/
|
|
2123
|
+
private sendMessage;
|
|
2124
|
+
/**
|
|
2125
|
+
* Load the ONNX model in the worker
|
|
2126
|
+
*/
|
|
2127
|
+
load(): Promise<LipSyncModelInfo>;
|
|
2128
|
+
/**
|
|
2129
|
+
* Run inference on raw audio
|
|
2130
|
+
*
|
|
2131
|
+
* Accepts variable-length audio (not fixed to 16000 samples).
|
|
2132
|
+
* Output frames = ceil(30 * numSamples / 16000).
|
|
2133
|
+
*
|
|
2134
|
+
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
2135
|
+
* @param _identityIndex - Ignored (identity 11 is baked into the model)
|
|
2136
|
+
*/
|
|
2137
|
+
infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
|
|
2138
|
+
/**
|
|
2139
|
+
* Queue inference to serialize worker calls
|
|
2140
|
+
*/
|
|
2141
|
+
private queueInference;
|
|
2142
|
+
/**
|
|
2143
|
+
* Dispose of the worker and free resources
|
|
2144
|
+
*/
|
|
2145
|
+
dispose(): Promise<void>;
|
|
2146
|
+
/**
|
|
2147
|
+
* Check if Web Workers are supported
|
|
2148
|
+
*/
|
|
2149
|
+
static isSupported(): boolean;
|
|
2150
|
+
}
|
|
2151
|
+
|
|
2152
|
+
/**
|
|
2153
|
+
* Unified Inference Worker — single Web Worker hosting all WASM models
|
|
2154
|
+
*
|
|
2155
|
+
* Solves the multi-worker ORT problem: three per-model workers each load their
|
|
2156
|
+
* own ORT WASM instance (~40MB each). On iOS this exceeds the ~1-1.5GB tab
|
|
2157
|
+
* limit, forcing main-thread fallback which blocks the render loop.
|
|
2158
|
+
*
|
|
2159
|
+
* This worker hosts SenseVoice + Wav2ArkitCpu + Silero VAD in a single
|
|
2160
|
+
* ORT WASM instance. Same total model memory (~643MB), but inference runs
|
|
2161
|
+
* off-main-thread. Works on iOS because there's only one ORT instance.
|
|
2162
|
+
*
|
|
2163
|
+
* Consumer usage:
|
|
2164
|
+
* ```typescript
|
|
2165
|
+
* const worker = new UnifiedInferenceWorker();
|
|
2166
|
+
* await worker.init();
|
|
2167
|
+
*
|
|
2168
|
+
* const asr = createSenseVoice({ modelUrl: '...', unifiedWorker: worker });
|
|
2169
|
+
* const lam = createLipSync({ gpuModelUrl: '...', cpuModelUrl: '...', unifiedWorker: worker });
|
|
2170
|
+
* const vad = createSileroVAD({ modelUrl: '...', unifiedWorker: worker });
|
|
2171
|
+
* ```
|
|
2172
|
+
*
|
|
2173
|
+
* @category Inference
|
|
2174
|
+
*/
|
|
2175
|
+
|
|
2176
|
+
/**
|
|
2177
|
+
* Unified Inference Worker — single Web Worker for all WASM models
|
|
2178
|
+
*
|
|
2179
|
+
* Hosts SenseVoice, Wav2ArkitCpu, and Silero VAD in one ORT instance.
|
|
2180
|
+
* Eliminates the multi-worker memory problem on iOS.
|
|
2181
|
+
*/
|
|
2182
|
+
declare class UnifiedInferenceWorker {
|
|
2183
|
+
private worker;
|
|
2184
|
+
private pendingRequests;
|
|
2185
|
+
private initialized;
|
|
2186
|
+
private poisoned;
|
|
2187
|
+
/**
|
|
2188
|
+
* Initialize the worker (load ORT WASM from CDN)
|
|
2189
|
+
*/
|
|
2190
|
+
init(): Promise<void>;
|
|
2191
|
+
loadSenseVoice(config: {
|
|
2192
|
+
modelUrl: string;
|
|
2193
|
+
tokensUrl: string;
|
|
2194
|
+
language: number;
|
|
2195
|
+
textNorm: number;
|
|
2196
|
+
}): Promise<SenseVoiceModelInfo>;
|
|
2197
|
+
transcribe(audio: Float32Array): Promise<SenseVoiceResult>;
|
|
2198
|
+
disposeSenseVoice(): Promise<void>;
|
|
2199
|
+
loadLipSync(config: {
|
|
2200
|
+
modelUrl: string;
|
|
2201
|
+
externalDataUrl: string | null;
|
|
2202
|
+
}): Promise<LipSyncModelInfo>;
|
|
2203
|
+
inferLipSync(audio: Float32Array): Promise<{
|
|
2204
|
+
blendshapes: Float32Array;
|
|
2205
|
+
numFrames: number;
|
|
2206
|
+
numBlendshapes: number;
|
|
2207
|
+
inferenceTimeMs: number;
|
|
2208
|
+
}>;
|
|
2209
|
+
disposeLipSync(): Promise<void>;
|
|
2210
|
+
loadVAD(config: {
|
|
2211
|
+
modelUrl: string;
|
|
2212
|
+
sampleRate: number;
|
|
2213
|
+
}): Promise<VADWorkerModelInfo>;
|
|
2214
|
+
processVAD(audio: Float32Array, state: Float32Array, context: Float32Array): Promise<{
|
|
2215
|
+
probability: number;
|
|
2216
|
+
state: Float32Array;
|
|
2217
|
+
inferenceTimeMs: number;
|
|
2218
|
+
}>;
|
|
2219
|
+
resetVAD(): Promise<Float32Array>;
|
|
2220
|
+
disposeVAD(): Promise<void>;
|
|
2221
|
+
dispose(): Promise<void>;
|
|
2222
|
+
/** Check if the worker is initialized and not poisoned */
|
|
2223
|
+
get isReady(): boolean;
|
|
2224
|
+
/** Check if Web Workers are supported */
|
|
2225
|
+
static isSupported(): boolean;
|
|
2226
|
+
private assertReady;
|
|
2227
|
+
private createWorker;
|
|
2228
|
+
private handleWorkerMessage;
|
|
2229
|
+
private sendMessage;
|
|
2230
|
+
private rejectAllPending;
|
|
2231
|
+
private cleanup;
|
|
2232
|
+
}
|
|
2233
|
+
/**
|
|
2234
|
+
* SenseVoice adapter backed by UnifiedInferenceWorker
|
|
2235
|
+
*
|
|
2236
|
+
* Implements SenseVoiceBackend, delegating all inference to the shared worker.
|
|
2237
|
+
*/
|
|
2238
|
+
declare class SenseVoiceUnifiedAdapter implements SenseVoiceBackend {
|
|
2239
|
+
private worker;
|
|
2240
|
+
private config;
|
|
2241
|
+
private _isLoaded;
|
|
2242
|
+
private languageId;
|
|
2243
|
+
private textNormId;
|
|
2244
|
+
private inferenceQueue;
|
|
2245
|
+
constructor(worker: UnifiedInferenceWorker, config: SenseVoiceWorkerConfig);
|
|
2246
|
+
get isLoaded(): boolean;
|
|
2247
|
+
get backend(): 'wasm' | null;
|
|
2248
|
+
load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
|
|
2249
|
+
transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
|
|
2250
|
+
dispose(): Promise<void>;
|
|
2251
|
+
}
|
|
2252
|
+
/**
|
|
2253
|
+
* Wav2ArkitCpu adapter backed by UnifiedInferenceWorker
|
|
2254
|
+
*
|
|
2255
|
+
* Implements LipSyncBackend, delegating all inference to the shared worker.
|
|
2256
|
+
*/
|
|
2257
|
+
declare class Wav2ArkitCpuUnifiedAdapter implements LipSyncBackend {
|
|
2258
|
+
readonly modelId: "wav2arkit_cpu";
|
|
2259
|
+
private worker;
|
|
2260
|
+
private config;
|
|
2261
|
+
private _isLoaded;
|
|
2262
|
+
private inferenceQueue;
|
|
2263
|
+
constructor(worker: UnifiedInferenceWorker, config: Wav2ArkitCpuWorkerConfig);
|
|
2264
|
+
get isLoaded(): boolean;
|
|
2265
|
+
get backend(): RuntimeBackend | null;
|
|
2266
|
+
load(): Promise<LipSyncModelInfo>;
|
|
2267
|
+
infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
|
|
2268
|
+
dispose(): Promise<void>;
|
|
2269
|
+
}
|
|
2270
|
+
/**
|
|
2271
|
+
* Silero VAD adapter backed by UnifiedInferenceWorker
|
|
2272
|
+
*
|
|
2273
|
+
* Implements SileroVADBackend, delegating all inference to the shared worker.
|
|
2274
|
+
*/
|
|
2275
|
+
declare class SileroVADUnifiedAdapter implements SileroVADBackend {
|
|
2276
|
+
private worker;
|
|
2277
|
+
private config;
|
|
2278
|
+
private _isLoaded;
|
|
2279
|
+
private state;
|
|
2280
|
+
private context;
|
|
2281
|
+
private readonly chunkSize;
|
|
2282
|
+
private readonly contextSize;
|
|
2283
|
+
private inferenceQueue;
|
|
2284
|
+
private preSpeechBuffer;
|
|
2285
|
+
private wasSpeaking;
|
|
2286
|
+
constructor(worker: UnifiedInferenceWorker, config: SileroVADConfig);
|
|
2287
|
+
get isLoaded(): boolean;
|
|
2288
|
+
get backend(): RuntimeBackend | null;
|
|
2289
|
+
get sampleRate(): number;
|
|
2290
|
+
get threshold(): number;
|
|
2291
|
+
getChunkSize(): number;
|
|
2292
|
+
getChunkDurationMs(): number;
|
|
2293
|
+
load(): Promise<VADWorkerModelInfo>;
|
|
2294
|
+
process(audioChunk: Float32Array): Promise<VADResult>;
|
|
2295
|
+
reset(): Promise<void>;
|
|
2296
|
+
dispose(): Promise<void>;
|
|
2297
|
+
}
|
|
2298
|
+
|
|
2299
|
+
/**
|
|
2300
|
+
* Factory function for SenseVoice ASR with automatic Worker vs main thread selection
|
|
2301
|
+
*
|
|
2302
|
+
* Provides a unified API that automatically selects the optimal implementation:
|
|
2303
|
+
* - Worker supported: Uses SenseVoiceWorker (off-main-thread inference)
|
|
2304
|
+
* - Worker unsupported: Uses SenseVoiceInference (main thread)
|
|
2305
|
+
*
|
|
2306
|
+
* @category Inference
|
|
2307
|
+
*
|
|
2308
|
+
* @example Auto-detect (recommended)
|
|
2309
|
+
* ```typescript
|
|
2310
|
+
* import { createSenseVoice } from '@omote/core';
|
|
2311
|
+
*
|
|
2312
|
+
* const asr = createSenseVoice({
|
|
2313
|
+
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
2314
|
+
* });
|
|
2315
|
+
* await asr.load();
|
|
2316
|
+
* const { text, emotion } = await asr.transcribe(audioSamples);
|
|
2317
|
+
* ```
|
|
2318
|
+
*
|
|
2319
|
+
* @example Force worker
|
|
2320
|
+
* ```typescript
|
|
2321
|
+
* const asr = createSenseVoice({
|
|
2322
|
+
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
2323
|
+
* useWorker: true,
|
|
2324
|
+
* });
|
|
2325
|
+
* ```
|
|
2326
|
+
*
|
|
2327
|
+
* @example Force main thread
|
|
2328
|
+
* ```typescript
|
|
2329
|
+
* const asr = createSenseVoice({
|
|
2330
|
+
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
2331
|
+
* useWorker: false,
|
|
2332
|
+
* });
|
|
2333
|
+
* ```
|
|
2334
|
+
*/
|
|
2335
|
+
|
|
2336
|
+
/**
|
|
2337
|
+
* Common interface for both SenseVoiceInference and SenseVoiceWorker
|
|
2338
|
+
*/
|
|
2339
|
+
interface SenseVoiceBackend {
|
|
2340
|
+
/** Whether the model is loaded and ready for inference */
|
|
2341
|
+
readonly isLoaded: boolean;
|
|
2342
|
+
/** Current backend type ('wasm' for worker, full backend for main thread, null if not loaded) */
|
|
2343
|
+
readonly backend: 'wasm' | 'webgpu' | null;
|
|
2344
|
+
/**
|
|
2345
|
+
* Load the ONNX model
|
|
2346
|
+
* @param onProgress - Optional progress callback (fires once at 100% for worker)
|
|
2347
|
+
* @returns Model loading information
|
|
2348
|
+
*/
|
|
2349
|
+
load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
|
|
2350
|
+
/**
|
|
2351
|
+
* Transcribe audio samples to text
|
|
2352
|
+
* @param audioSamples - Float32Array of audio samples at 16kHz
|
|
2353
|
+
* @returns Transcription result
|
|
2354
|
+
*/
|
|
2355
|
+
transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
|
|
2356
|
+
/**
|
|
2357
|
+
* Dispose of the model and free resources
|
|
2358
|
+
*/
|
|
2359
|
+
dispose(): Promise<void>;
|
|
2360
|
+
}
|
|
2361
|
+
/**
|
|
2362
|
+
* Configuration for the SenseVoice factory
|
|
2363
|
+
*/
|
|
2364
|
+
interface CreateSenseVoiceConfig {
|
|
2365
|
+
/** Path or URL to model.int8.onnx (239MB) */
|
|
2366
|
+
modelUrl: string;
|
|
2367
|
+
/** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
|
|
2368
|
+
tokensUrl?: string;
|
|
2369
|
+
/** Language hint (default: 'auto') */
|
|
2370
|
+
language?: SenseVoiceLanguage;
|
|
2371
|
+
/** Text normalization (default: 'with_itn') */
|
|
2372
|
+
textNorm?: 'with_itn' | 'without_itn';
|
|
2373
|
+
/**
|
|
2374
|
+
* Worker mode:
|
|
2375
|
+
* - 'auto' (default): Use Worker if supported, else main thread
|
|
2376
|
+
* - true: Force Worker (throws if unsupported)
|
|
2377
|
+
* - false: Force main thread
|
|
2378
|
+
*/
|
|
2379
|
+
useWorker?: boolean | 'auto';
|
|
2380
|
+
/**
|
|
2381
|
+
* Unified inference worker instance.
|
|
2382
|
+
* When provided, uses SenseVoiceUnifiedAdapter (shared single-ORT worker).
|
|
2383
|
+
* Takes precedence over useWorker setting.
|
|
2384
|
+
*/
|
|
2385
|
+
unifiedWorker?: UnifiedInferenceWorker;
|
|
2386
|
+
}
|
|
2387
|
+
/**
|
|
2388
|
+
* Create a SenseVoice ASR instance with automatic implementation selection
|
|
2389
|
+
*
|
|
2390
|
+
* @param config - Factory configuration
|
|
2391
|
+
* @returns A SenseVoiceBackend instance (either Worker or main thread)
|
|
2392
|
+
*/
|
|
2393
|
+
declare function createSenseVoice(config: CreateSenseVoiceConfig): SenseVoiceBackend;
|
|
2394
|
+
|
|
2395
|
+
/**
|
|
2396
|
+
* Kaldi-compatible filterbank (fbank) feature extraction
|
|
2397
|
+
*
|
|
2398
|
+
* Pure TypeScript implementation matching kaldi-native-fbank parameters
|
|
2399
|
+
* used by SenseVoice. No external dependencies.
|
|
2400
|
+
*
|
|
2401
|
+
* Pipeline: audio → framing → windowing → FFT → power spectrum → mel filterbank → log
|
|
2402
|
+
*
|
|
2403
|
+
* @module inference/kaldiFbank
|
|
2404
|
+
*/
|
|
2405
|
+
interface KaldiFbankOptions {
|
|
2406
|
+
/** Frame length in ms (default: 25) */
|
|
2407
|
+
frameLengthMs?: number;
|
|
2408
|
+
/** Frame shift in ms (default: 10) */
|
|
2409
|
+
frameShiftMs?: number;
|
|
2410
|
+
/** Low frequency cutoff in Hz (default: 20) */
|
|
2411
|
+
lowFreq?: number;
|
|
2412
|
+
/** High frequency cutoff in Hz (default: sampleRate / 2) */
|
|
2413
|
+
highFreq?: number;
|
|
2414
|
+
/** Dither amount (default: 0 for deterministic output) */
|
|
2415
|
+
dither?: number;
|
|
2416
|
+
/** Preemphasis coefficient (default: 0.97) */
|
|
2417
|
+
preemphasis?: number;
|
|
2418
|
+
}
|
|
2419
|
+
/**
|
|
2420
|
+
* Compute Kaldi-compatible log mel filterbank features
|
|
2421
|
+
*
|
|
2422
|
+
* @param audio Raw audio samples (float32, [-1, 1] range)
|
|
2423
|
+
* @param sampleRate Sample rate in Hz (must be 16000 for SenseVoice)
|
|
2424
|
+
* @param numMelBins Number of mel bins (80 for SenseVoice)
|
|
2425
|
+
* @param opts Optional parameters
|
|
2426
|
+
* @returns Flattened Float32Array of shape [numFrames, numMelBins]
|
|
2427
|
+
*/
|
|
2428
|
+
declare function computeKaldiFbank(audio: Float32Array, sampleRate: number, numMelBins: number, opts?: KaldiFbankOptions): Float32Array;
|
|
2429
|
+
/**
|
|
2430
|
+
* Apply Low Frame Rate stacking for SenseVoice
|
|
2431
|
+
*
|
|
2432
|
+
* Concatenates lfrM consecutive frames with stride lfrN.
|
|
2433
|
+
* Left-pads with copies of first frame, right-pads last group.
|
|
2434
|
+
*
|
|
2435
|
+
* @param features Flattened [numFrames, featureDim]
|
|
2436
|
+
* @param featureDim Feature dimension per frame (e.g., 80)
|
|
2437
|
+
* @param lfrM Number of frames to stack (default: 7)
|
|
2438
|
+
* @param lfrN Stride (default: 6)
|
|
2439
|
+
* @returns Flattened [numOutputFrames, featureDim * lfrM]
|
|
2440
|
+
*/
|
|
2441
|
+
declare function applyLFR(features: Float32Array, featureDim: number, lfrM?: number, lfrN?: number): Float32Array;
|
|
2442
|
+
/**
|
|
2443
|
+
* Apply CMVN normalization in-place
|
|
2444
|
+
*
|
|
2445
|
+
* Formula: normalized[i] = (features[i] + negMean[i % dim]) * invStddev[i % dim]
|
|
2446
|
+
*
|
|
2447
|
+
* @param features Flattened feature array (modified in-place)
|
|
2448
|
+
* @param dim Feature dimension (560 for SenseVoice after LFR)
|
|
2449
|
+
* @param negMean Negative mean vector (dim-dimensional)
|
|
2450
|
+
* @param invStddev Inverse standard deviation vector (dim-dimensional)
|
|
2451
|
+
* @returns The same features array (for chaining)
|
|
2452
|
+
*/
|
|
2453
|
+
declare function applyCMVN(features: Float32Array, dim: number, negMean: Float32Array, invStddev: Float32Array): Float32Array;
|
|
2454
|
+
/**
|
|
2455
|
+
* Parse CMVN vectors from comma-separated strings (stored in ONNX metadata)
|
|
2456
|
+
*
|
|
2457
|
+
* The sherpa-onnx SenseVoice export stores neg_mean and inv_stddev
|
|
2458
|
+
* as comma-separated float strings in the model's metadata.
|
|
2459
|
+
*/
|
|
2460
|
+
declare function parseCMVNFromMetadata(negMeanStr: string, invStddevStr: string): {
|
|
2461
|
+
negMean: Float32Array;
|
|
2462
|
+
invStddev: Float32Array;
|
|
2463
|
+
};
|
|
2464
|
+
|
|
2465
|
+
/**
|
|
2466
|
+
* CTC greedy decoder for SenseVoice
|
|
2467
|
+
*
|
|
2468
|
+
* Decodes CTC logits into text with structured token parsing
|
|
2469
|
+
* for language, emotion, and audio event detection.
|
|
2470
|
+
*
|
|
2471
|
+
* @module inference/ctcDecoder
|
|
2472
|
+
*/
|
|
2473
|
+
interface CTCDecodeResult {
|
|
2474
|
+
/** Decoded text (speech content only) */
|
|
2475
|
+
text: string;
|
|
2476
|
+
/** Detected language (e.g., 'zh', 'en', 'ja', 'ko', 'yue') */
|
|
2477
|
+
language?: string;
|
|
2478
|
+
/** Detected emotion (e.g., 'HAPPY', 'SAD', 'ANGRY', 'NEUTRAL') */
|
|
2479
|
+
emotion?: string;
|
|
2480
|
+
/** Detected audio event (e.g., 'Speech', 'BGM', 'Laughter') */
|
|
2481
|
+
event?: string;
|
|
2482
|
+
}
|
|
2483
|
+
/** Resolve language string to SenseVoice language ID */
|
|
2484
|
+
declare function resolveLanguageId(language: string): number;
|
|
2485
|
+
/** Resolve text norm string to SenseVoice text norm ID */
|
|
2486
|
+
declare function resolveTextNormId(textNorm: string): number;
|
|
2487
|
+
/**
|
|
2488
|
+
* Parse tokens.txt into a token ID → string map
|
|
2489
|
+
*
|
|
2490
|
+
* Format: each line is "token_string token_id"
|
|
2491
|
+
* e.g., "<unk> 0", "▁the 3", "s 4"
|
|
2492
|
+
*/
|
|
2493
|
+
declare function parseTokensFile(content: string): Map<number, string>;
|
|
2494
|
+
/**
|
|
2495
|
+
* CTC greedy decode
|
|
2496
|
+
*
|
|
2497
|
+
* @param logits Raw logits from model output, flattened [seqLen, vocabSize]
|
|
2498
|
+
* @param seqLen Sequence length (time steps)
|
|
2499
|
+
* @param vocabSize Vocabulary size
|
|
2500
|
+
* @param tokenMap Token ID → string map from tokens.txt
|
|
2501
|
+
* @returns Decoded text and structured metadata
|
|
2502
|
+
*/
|
|
2503
|
+
declare function ctcGreedyDecode(logits: Float32Array, seqLen: number, vocabSize: number, tokenMap: Map<number, string>): CTCDecodeResult;
|
|
2504
|
+
|
|
2505
|
+
/**
|
|
2506
|
+
* Shared blendshape constants and utilities for lip sync inference
|
|
2507
|
+
*
|
|
2508
|
+
* Contains LAM_BLENDSHAPES (canonical ordering), symmetrization, and
|
|
2509
|
+
* index remapping used by both Wav2Vec2Inference and Wav2ArkitCpuInference.
|
|
2510
|
+
*
|
|
2511
|
+
* This module is the single source of truth for blendshape ordering to
|
|
2512
|
+
* avoid circular dependencies between inference classes.
|
|
2513
|
+
*
|
|
2514
|
+
* @category Inference
|
|
2515
|
+
*/
|
|
2516
|
+
/**
|
|
2517
|
+
* LAM model blendshape names in order (52 total)
|
|
2518
|
+
* NOTE: This is alphabetical ordering used by LAM, different from standard ARKit order
|
|
2519
|
+
*/
|
|
2520
|
+
declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
|
|
2521
|
+
/** Alias for backwards compatibility */
|
|
2522
|
+
declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
|
|
2523
|
+
/**
|
|
2524
|
+
* Symmetrize blendshapes by averaging left/right pairs
|
|
2525
|
+
* From LAM official postprocessing (models/utils.py)
|
|
2526
|
+
* This fixes asymmetric output from the raw model
|
|
2527
|
+
*/
|
|
2528
|
+
declare function symmetrizeBlendshapes(frame: Float32Array): Float32Array;
|
|
2529
|
+
/**
|
|
2530
|
+
* wav2arkit_cpu model blendshape ordering
|
|
2531
|
+
*
|
|
2532
|
+
* Indices 0-24 match LAM_BLENDSHAPES, but 25+ diverge:
|
|
2533
|
+
* - LAM puts jawRight, mouthClose, mouthDimpleLeft, mouthDimpleRight at 25-28
|
|
2534
|
+
* - wav2arkit_cpu puts mouthFrownLeft at 25 and moves those four to 48-51
|
|
2535
|
+
*/
|
|
2536
|
+
declare const WAV2ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "jawRight"];
|
|
2537
|
+
/**
|
|
2538
|
+
* Remap a blendshape frame from wav2arkit_cpu ordering to LAM_BLENDSHAPES ordering
|
|
2539
|
+
*
|
|
2540
|
+
* @param frame - Float32Array of 52 blendshape values in wav2arkit_cpu order
|
|
2541
|
+
* @returns Float32Array of 52 blendshape values in LAM_BLENDSHAPES order
|
|
2542
|
+
*/
|
|
2543
|
+
declare function remapWav2ArkitToLam(frame: Float32Array): Float32Array;
|
|
2544
|
+
|
|
2545
|
+
/**
|
|
2546
|
+
* Unified Wav2Vec2 inference engine for Audio-to-Expression + ASR
|
|
2547
|
+
*
|
|
2548
|
+
* Runs entirely in the browser using WebGPU or WASM.
|
|
2549
|
+
* Takes raw 16kHz audio and outputs:
|
|
2550
|
+
* - 52 ARKit blendshapes (lip sync)
|
|
2551
|
+
* - 32-token CTC logits (speech recognition)
|
|
2552
|
+
*
|
|
2553
|
+
* @category Inference
|
|
2554
|
+
*
|
|
2555
|
+
* @example Basic usage
|
|
2556
|
+
* ```typescript
|
|
2557
|
+
* import { Wav2Vec2Inference } from '@omote/core';
|
|
2558
|
+
*
|
|
2559
|
+
* const wav2vec = new Wav2Vec2Inference({ modelUrl: '/models/unified_wav2vec2_asr_a2e.onnx' });
|
|
2560
|
+
* await wav2vec.load();
|
|
2561
|
+
*
|
|
2562
|
+
* // Process 1 second of audio (16kHz = 16000 samples)
|
|
2563
|
+
* const result = await wav2vec.infer(audioSamples);
|
|
2564
|
+
*
|
|
2565
|
+
* console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
|
|
2566
|
+
* console.log('ASR text:', result.text); // Decoded transcription
|
|
1906
2567
|
* ```
|
|
1907
2568
|
*/
|
|
1908
2569
|
|
|
1909
|
-
type
|
|
1910
|
-
|
|
1911
|
-
* Configuration for Silero VAD
|
|
1912
|
-
*/
|
|
1913
|
-
interface SileroVADConfig {
|
|
2570
|
+
type InferenceBackend = BackendPreference;
|
|
2571
|
+
interface Wav2Vec2InferenceConfig {
|
|
1914
2572
|
/** Path or URL to the ONNX model */
|
|
1915
2573
|
modelUrl: string;
|
|
1916
|
-
/** Preferred backend (auto will try WebGPU first, fallback to WASM) */
|
|
1917
|
-
backend?: VADBackend;
|
|
1918
|
-
/** Sample rate (8000 or 16000, default: 16000) */
|
|
1919
|
-
sampleRate?: 8000 | 16000;
|
|
1920
|
-
/** Speech probability threshold (default: 0.5) */
|
|
1921
|
-
threshold?: number;
|
|
1922
2574
|
/**
|
|
1923
|
-
*
|
|
1924
|
-
*
|
|
1925
|
-
* to capture the beginning of speech that occurred before detection.
|
|
1926
|
-
*
|
|
1927
|
-
* At 512 samples/chunk and 16kHz:
|
|
1928
|
-
* - 10 chunks = 320ms of pre-speech audio
|
|
1929
|
-
* - 15 chunks = 480ms of pre-speech audio
|
|
2575
|
+
* Path or URL to external model data file (.onnx.data weights).
|
|
2576
|
+
* Default: `${modelUrl}.data` (e.g., /models/model.onnx.data)
|
|
1930
2577
|
*
|
|
1931
|
-
*
|
|
2578
|
+
* Set to `false` to skip external data loading (single-file models only).
|
|
1932
2579
|
*/
|
|
1933
|
-
|
|
2580
|
+
externalDataUrl?: string | false;
|
|
2581
|
+
/** Preferred backend (auto will try WebGPU first, fallback to WASM) */
|
|
2582
|
+
backend?: InferenceBackend;
|
|
2583
|
+
/** Number of identity classes (default: 12 for streaming model) */
|
|
2584
|
+
numIdentityClasses?: number;
|
|
1934
2585
|
}
|
|
1935
|
-
|
|
1936
|
-
* VAD model loading information
|
|
1937
|
-
*/
|
|
1938
|
-
interface VADModelInfo {
|
|
2586
|
+
interface ModelInfo {
|
|
1939
2587
|
backend: 'webgpu' | 'wasm';
|
|
1940
2588
|
loadTimeMs: number;
|
|
1941
2589
|
inputNames: string[];
|
|
1942
2590
|
outputNames: string[];
|
|
1943
|
-
sampleRate: number;
|
|
1944
|
-
chunkSize: number;
|
|
1945
2591
|
}
|
|
1946
|
-
|
|
1947
|
-
|
|
1948
|
-
|
|
1949
|
-
interface
|
|
1950
|
-
/**
|
|
1951
|
-
|
|
1952
|
-
/**
|
|
1953
|
-
|
|
1954
|
-
/**
|
|
2592
|
+
|
|
2593
|
+
/** CTC vocabulary (32 tokens from wav2vec2-base-960h) */
|
|
2594
|
+
declare const CTC_VOCAB: string[];
|
|
2595
|
+
interface Wav2Vec2Result {
|
|
2596
|
+
/** Blendshape weights [frames, 52] - 30fps */
|
|
2597
|
+
blendshapes: Float32Array[];
|
|
2598
|
+
/** Raw CTC logits [frames, 32] - 50fps */
|
|
2599
|
+
asrLogits: Float32Array[];
|
|
2600
|
+
/** Decoded text from CTC */
|
|
2601
|
+
text: string;
|
|
2602
|
+
/** Number of blendshape frames (30fps) — alias for numA2EFrames */
|
|
2603
|
+
numFrames: number;
|
|
2604
|
+
/** Number of A2E frames (30fps) */
|
|
2605
|
+
numA2EFrames: number;
|
|
2606
|
+
/** Number of ASR frames (50fps) */
|
|
2607
|
+
numASRFrames: number;
|
|
2608
|
+
/** Inference time in ms */
|
|
1955
2609
|
inferenceTimeMs: number;
|
|
1956
|
-
/**
|
|
1957
|
-
* Pre-speech audio chunks (only present on first speech detection).
|
|
1958
|
-
* These are the N chunks immediately before VAD triggered, useful for
|
|
1959
|
-
* capturing the beginning of speech that occurred before detection.
|
|
1960
|
-
*
|
|
1961
|
-
* Only populated when transitioning from silence to speech.
|
|
1962
|
-
*/
|
|
1963
|
-
preSpeechChunks?: Float32Array[];
|
|
1964
|
-
}
|
|
1965
|
-
/**
|
|
1966
|
-
* Speech segment detected by VAD
|
|
1967
|
-
*/
|
|
1968
|
-
interface SpeechSegment {
|
|
1969
|
-
/** Start time in seconds */
|
|
1970
|
-
start: number;
|
|
1971
|
-
/** End time in seconds */
|
|
1972
|
-
end: number;
|
|
1973
|
-
/** Average probability during segment */
|
|
1974
|
-
avgProbability: number;
|
|
1975
2610
|
}
|
|
1976
|
-
|
|
1977
|
-
|
|
1978
|
-
*
|
|
1979
|
-
* Based on snakers4/silero-vad ONNX model.
|
|
1980
|
-
* Processes 32ms chunks (512 samples at 16kHz) with LSTM state.
|
|
1981
|
-
*
|
|
1982
|
-
* @see https://github.com/snakers4/silero-vad
|
|
1983
|
-
*/
|
|
1984
|
-
declare class SileroVADInference {
|
|
2611
|
+
declare class Wav2Vec2Inference implements LipSyncBackend {
|
|
2612
|
+
readonly modelId: "wav2vec2";
|
|
1985
2613
|
private session;
|
|
1986
2614
|
private ort;
|
|
1987
2615
|
private config;
|
|
1988
2616
|
private _backend;
|
|
1989
2617
|
private isLoading;
|
|
1990
|
-
private
|
|
1991
|
-
private context;
|
|
1992
|
-
private readonly chunkSize;
|
|
1993
|
-
private readonly contextSize;
|
|
2618
|
+
private numIdentityClasses;
|
|
1994
2619
|
private inferenceQueue;
|
|
1995
|
-
private
|
|
1996
|
-
private
|
|
1997
|
-
|
|
1998
|
-
constructor(config: SileroVADConfig);
|
|
1999
|
-
get backend(): RuntimeBackend | null;
|
|
2000
|
-
get isLoaded(): boolean;
|
|
2001
|
-
get sampleRate(): number;
|
|
2002
|
-
get threshold(): number;
|
|
2003
|
-
/**
|
|
2004
|
-
* Get required chunk size in samples
|
|
2005
|
-
*/
|
|
2006
|
-
getChunkSize(): number;
|
|
2007
|
-
/**
|
|
2008
|
-
* Get chunk duration in milliseconds
|
|
2009
|
-
*/
|
|
2010
|
-
getChunkDurationMs(): number;
|
|
2620
|
+
private poisoned;
|
|
2621
|
+
private static readonly INFERENCE_TIMEOUT_MS;
|
|
2622
|
+
constructor(config: Wav2Vec2InferenceConfig);
|
|
2011
2623
|
/**
|
|
2012
2624
|
* Check if WebGPU is available and working
|
|
2013
2625
|
* (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
|
|
2014
2626
|
*/
|
|
2015
2627
|
static isWebGPUAvailable: typeof isWebGPUAvailable;
|
|
2628
|
+
get backend(): 'webgpu' | 'wasm' | null;
|
|
2629
|
+
get isLoaded(): boolean;
|
|
2630
|
+
/** True if inference timed out and the session is permanently unusable */
|
|
2631
|
+
get isSessionPoisoned(): boolean;
|
|
2016
2632
|
/**
|
|
2017
2633
|
* Load the ONNX model
|
|
2018
2634
|
*/
|
|
2019
|
-
load(): Promise<
|
|
2020
|
-
/**
|
|
2021
|
-
* Reset state for new audio stream
|
|
2022
|
-
*/
|
|
2023
|
-
reset(): void;
|
|
2024
|
-
/**
|
|
2025
|
-
* Process a single audio chunk
|
|
2026
|
-
*
|
|
2027
|
-
* @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
|
|
2028
|
-
* @returns VAD result with speech probability
|
|
2029
|
-
*/
|
|
2030
|
-
process(audioChunk: Float32Array): Promise<VADResult$1>;
|
|
2635
|
+
load(): Promise<ModelInfo>;
|
|
2031
2636
|
/**
|
|
2032
|
-
*
|
|
2637
|
+
* Run inference on raw audio
|
|
2638
|
+
* @param audioSamples - Float32Array of raw audio at 16kHz (16000 samples = 1 second)
|
|
2639
|
+
* @param identityIndex - Optional identity index (0-11, default 0 = neutral)
|
|
2033
2640
|
*
|
|
2034
|
-
*
|
|
2035
|
-
*
|
|
2036
|
-
* @returns Array of speech segments
|
|
2641
|
+
* Note: Model expects 1-second chunks (16000 samples) for optimal performance.
|
|
2642
|
+
* Audio will be zero-padded or truncated to 16000 samples.
|
|
2037
2643
|
*/
|
|
2038
|
-
|
|
2039
|
-
/** Minimum speech duration in ms (default: 250) */
|
|
2040
|
-
minSpeechDurationMs?: number;
|
|
2041
|
-
/** Minimum silence duration to end segment in ms (default: 300) */
|
|
2042
|
-
minSilenceDurationMs?: number;
|
|
2043
|
-
/** Padding to add before/after speech in ms (default: 30) */
|
|
2044
|
-
speechPadMs?: number;
|
|
2045
|
-
}): Promise<SpeechSegment[]>;
|
|
2644
|
+
infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
|
|
2046
2645
|
/**
|
|
2047
|
-
*
|
|
2646
|
+
* Decode CTC logits to text using greedy decoding
|
|
2048
2647
|
*/
|
|
2049
|
-
private
|
|
2648
|
+
private decodeCTC;
|
|
2050
2649
|
/**
|
|
2051
2650
|
* Queue inference to serialize ONNX session calls
|
|
2052
2651
|
*/
|
|
2053
2652
|
private queueInference;
|
|
2653
|
+
/**
|
|
2654
|
+
* Get blendshape value by name for a specific frame
|
|
2655
|
+
*/
|
|
2656
|
+
getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
|
|
2054
2657
|
/**
|
|
2055
2658
|
* Dispose of the model and free resources
|
|
2056
2659
|
*/
|
|
@@ -2058,296 +2661,189 @@ declare class SileroVADInference {
|
|
|
2058
2661
|
}
|
|
2059
2662
|
|
|
2060
2663
|
/**
|
|
2061
|
-
*
|
|
2664
|
+
* CPU-optimized lip sync inference using wav2arkit_cpu model
|
|
2665
|
+
*
|
|
2666
|
+
* A Safari/iOS-compatible alternative to Wav2Vec2Inference (384MB) designed
|
|
2667
|
+
* for platforms where WebGPU crashes due to ONNX Runtime JSEP bugs.
|
|
2668
|
+
*
|
|
2669
|
+
* The model uses ONNX external data format:
|
|
2670
|
+
* - wav2arkit_cpu.onnx (1.86MB graph structure)
|
|
2671
|
+
* - wav2arkit_cpu.onnx.data (402MB weights)
|
|
2672
|
+
* Both files are fetched and cached automatically.
|
|
2673
|
+
*
|
|
2674
|
+
* Key differences from Wav2Vec2Inference:
|
|
2675
|
+
* - WASM-only backend (CPU-optimized, single-threaded, no WebGPU)
|
|
2676
|
+
* - No identity input (baked to identity 11)
|
|
2677
|
+
* - No ASR output (lip sync only)
|
|
2678
|
+
* - Dynamic input length (not fixed to 16000 samples)
|
|
2679
|
+
* - Different native blendshape ordering (remapped to LAM_BLENDSHAPES)
|
|
2680
|
+
*
|
|
2681
|
+
* @category Inference
|
|
2682
|
+
*
|
|
2683
|
+
* @example
|
|
2684
|
+
* ```typescript
|
|
2685
|
+
* import { Wav2ArkitCpuInference } from '@omote/core';
|
|
2686
|
+
*
|
|
2687
|
+
* const lam = new Wav2ArkitCpuInference({
|
|
2688
|
+
* modelUrl: '/models/wav2arkit_cpu.onnx',
|
|
2689
|
+
* });
|
|
2690
|
+
* await lam.load();
|
|
2691
|
+
*
|
|
2692
|
+
* const { blendshapes } = await lam.infer(audioSamples);
|
|
2693
|
+
* // blendshapes: Float32Array[] in LAM_BLENDSHAPES order, 30fps
|
|
2694
|
+
* ```
|
|
2062
2695
|
*/
|
|
2063
|
-
|
|
2064
|
-
|
|
2696
|
+
|
|
2697
|
+
interface Wav2ArkitCpuConfig {
|
|
2698
|
+
/** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
|
|
2065
2699
|
modelUrl: string;
|
|
2066
|
-
/** Sample rate (8000 or 16000, default: 16000) */
|
|
2067
|
-
sampleRate?: 8000 | 16000;
|
|
2068
|
-
/** Speech probability threshold (default: 0.5) */
|
|
2069
|
-
threshold?: number;
|
|
2070
|
-
/**
|
|
2071
|
-
* Number of audio chunks to keep in pre-speech buffer.
|
|
2072
|
-
* When VAD triggers, these chunks are prepended to the speech buffer
|
|
2073
|
-
* to capture the beginning of speech that occurred before detection.
|
|
2074
|
-
*
|
|
2075
|
-
* At 512 samples/chunk and 16kHz:
|
|
2076
|
-
* - 10 chunks = 320ms of pre-speech audio
|
|
2077
|
-
* - 15 chunks = 480ms of pre-speech audio
|
|
2078
|
-
*
|
|
2079
|
-
* Default: 10 chunks (320ms)
|
|
2080
|
-
*/
|
|
2081
|
-
preSpeechBufferChunks?: number;
|
|
2082
|
-
}
|
|
2083
|
-
/**
|
|
2084
|
-
* VAD model loading information from worker
|
|
2085
|
-
*/
|
|
2086
|
-
interface VADWorkerModelInfo {
|
|
2087
|
-
backend: 'wasm';
|
|
2088
|
-
loadTimeMs: number;
|
|
2089
|
-
inputNames: string[];
|
|
2090
|
-
outputNames: string[];
|
|
2091
|
-
sampleRate: number;
|
|
2092
|
-
chunkSize: number;
|
|
2093
|
-
}
|
|
2094
|
-
/**
|
|
2095
|
-
* Result from a single VAD inference
|
|
2096
|
-
*/
|
|
2097
|
-
interface VADResult {
|
|
2098
|
-
/** Speech probability (0-1) */
|
|
2099
|
-
probability: number;
|
|
2100
|
-
/** Whether speech is detected (probability > threshold) */
|
|
2101
|
-
isSpeech: boolean;
|
|
2102
|
-
/** Inference time in milliseconds */
|
|
2103
|
-
inferenceTimeMs: number;
|
|
2104
2700
|
/**
|
|
2105
|
-
*
|
|
2106
|
-
*
|
|
2107
|
-
* capturing the beginning of speech that occurred before detection.
|
|
2701
|
+
* Path or URL to external model data file (.onnx.data weights).
|
|
2702
|
+
* Default: `${modelUrl}.data` (e.g., /models/wav2arkit_cpu.onnx.data)
|
|
2108
2703
|
*
|
|
2109
|
-
*
|
|
2110
|
-
*/
|
|
2111
|
-
|
|
2112
|
-
|
|
2113
|
-
|
|
2114
|
-
|
|
2115
|
-
|
|
2116
|
-
|
|
2117
|
-
|
|
2118
|
-
|
|
2119
|
-
* @see SileroVADInference for main-thread version
|
|
2120
|
-
*/
|
|
2121
|
-
declare class SileroVADWorker {
|
|
2122
|
-
private worker;
|
|
2704
|
+
* Set to `false` to skip external data loading (single-file models only).
|
|
2705
|
+
*/
|
|
2706
|
+
externalDataUrl?: string | false;
|
|
2707
|
+
/** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
|
|
2708
|
+
backend?: BackendPreference;
|
|
2709
|
+
}
|
|
2710
|
+
declare class Wav2ArkitCpuInference implements LipSyncBackend {
|
|
2711
|
+
readonly modelId: "wav2arkit_cpu";
|
|
2712
|
+
private session;
|
|
2713
|
+
private ort;
|
|
2123
2714
|
private config;
|
|
2715
|
+
private _backend;
|
|
2124
2716
|
private isLoading;
|
|
2125
|
-
private _isLoaded;
|
|
2126
|
-
private state;
|
|
2127
|
-
private context;
|
|
2128
|
-
private readonly chunkSize;
|
|
2129
|
-
private readonly contextSize;
|
|
2130
2717
|
private inferenceQueue;
|
|
2131
|
-
private
|
|
2132
|
-
private
|
|
2133
|
-
|
|
2134
|
-
|
|
2135
|
-
constructor(config: VADWorkerConfig);
|
|
2718
|
+
private poisoned;
|
|
2719
|
+
private static readonly INFERENCE_TIMEOUT_MS;
|
|
2720
|
+
constructor(config: Wav2ArkitCpuConfig);
|
|
2721
|
+
get backend(): RuntimeBackend | null;
|
|
2136
2722
|
get isLoaded(): boolean;
|
|
2137
2723
|
/**
|
|
2138
|
-
*
|
|
2139
|
-
*/
|
|
2140
|
-
get backend(): 'wasm' | null;
|
|
2141
|
-
get sampleRate(): number;
|
|
2142
|
-
get threshold(): number;
|
|
2143
|
-
/**
|
|
2144
|
-
* Get required chunk size in samples
|
|
2145
|
-
*/
|
|
2146
|
-
getChunkSize(): number;
|
|
2147
|
-
/**
|
|
2148
|
-
* Get chunk duration in milliseconds
|
|
2149
|
-
*/
|
|
2150
|
-
getChunkDurationMs(): number;
|
|
2151
|
-
/**
|
|
2152
|
-
* Create the worker from inline script
|
|
2153
|
-
*/
|
|
2154
|
-
private createWorker;
|
|
2155
|
-
/**
|
|
2156
|
-
* Handle messages from worker
|
|
2157
|
-
*/
|
|
2158
|
-
private handleWorkerMessage;
|
|
2159
|
-
/**
|
|
2160
|
-
* Send message to worker and wait for response
|
|
2161
|
-
*/
|
|
2162
|
-
private sendMessage;
|
|
2163
|
-
/**
|
|
2164
|
-
* Load the ONNX model in the worker
|
|
2165
|
-
*/
|
|
2166
|
-
load(): Promise<VADWorkerModelInfo>;
|
|
2167
|
-
/**
|
|
2168
|
-
* Reset state for new audio stream
|
|
2724
|
+
* Load the ONNX model
|
|
2169
2725
|
*/
|
|
2170
|
-
|
|
2726
|
+
load(): Promise<LipSyncModelInfo>;
|
|
2171
2727
|
/**
|
|
2172
|
-
*
|
|
2728
|
+
* Run inference on raw audio
|
|
2173
2729
|
*
|
|
2174
|
-
*
|
|
2175
|
-
*
|
|
2730
|
+
* Accepts variable-length audio (not fixed to 16000 samples).
|
|
2731
|
+
* Output frames = ceil(30 * numSamples / 16000).
|
|
2732
|
+
*
|
|
2733
|
+
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
2734
|
+
* @param _identityIndex - Ignored (identity 11 is baked into the model)
|
|
2176
2735
|
*/
|
|
2177
|
-
|
|
2736
|
+
infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
|
|
2178
2737
|
/**
|
|
2179
|
-
* Queue inference to serialize
|
|
2738
|
+
* Queue inference to serialize ONNX session calls
|
|
2180
2739
|
*/
|
|
2181
2740
|
private queueInference;
|
|
2182
2741
|
/**
|
|
2183
|
-
* Dispose of the
|
|
2742
|
+
* Dispose of the model and free resources
|
|
2184
2743
|
*/
|
|
2185
2744
|
dispose(): Promise<void>;
|
|
2186
|
-
/**
|
|
2187
|
-
* Check if Web Workers are supported
|
|
2188
|
-
*/
|
|
2189
|
-
static isSupported(): boolean;
|
|
2190
2745
|
}
|
|
2191
2746
|
|
|
2192
2747
|
/**
|
|
2193
|
-
* Factory function for
|
|
2748
|
+
* Factory function for lip sync with automatic GPU/CPU model selection
|
|
2194
2749
|
*
|
|
2195
|
-
* Provides a unified API that automatically selects the optimal
|
|
2196
|
-
* -
|
|
2197
|
-
* -
|
|
2198
|
-
* - Fallback: Gracefully falls back to
|
|
2750
|
+
* Provides a unified API that automatically selects the optimal model:
|
|
2751
|
+
* - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (404MB, WASM)
|
|
2752
|
+
* - Chrome/Firefox/Edge: Uses Wav2Vec2Inference (384MB, WebGPU)
|
|
2753
|
+
* - Fallback: Gracefully falls back to CPU model if GPU model fails to load
|
|
2754
|
+
*
|
|
2755
|
+
* Why two separate models?
|
|
2756
|
+
* Wav2Vec2 (LAM) cannot run on Safari/iOS for two reasons:
|
|
2757
|
+
* 1. Its dual-head transformer graph needs ~750-950MB peak during ORT session
|
|
2758
|
+
* creation (graph optimization), exceeding iOS WebKit's ~1-1.5GB tab limit.
|
|
2759
|
+
* 2. It ships as a single 384MB .onnx file that must load into JS heap before
|
|
2760
|
+
* ORT can consume it. iOS WebKit OOMs on this allocation.
|
|
2761
|
+
* wav2arkit_cpu solves both: external data format (1.86MB graph + 402MB weights)
|
|
2762
|
+
* lets ORT load only the tiny graph, then stream weights via URL pass-through
|
|
2763
|
+
* directly into WASM memory. JS heap stays at ~2MB.
|
|
2199
2764
|
*
|
|
2200
2765
|
* @category Inference
|
|
2201
2766
|
*
|
|
2202
|
-
* @example
|
|
2767
|
+
* @example Auto-detect (recommended)
|
|
2203
2768
|
* ```typescript
|
|
2204
|
-
* import {
|
|
2769
|
+
* import { createLipSync } from '@omote/core';
|
|
2205
2770
|
*
|
|
2206
|
-
* const
|
|
2207
|
-
*
|
|
2208
|
-
*
|
|
2771
|
+
* const lam = createLipSync({
|
|
2772
|
+
* gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
|
|
2773
|
+
* cpuModelUrl: '/models/wav2arkit_cpu.onnx',
|
|
2209
2774
|
* });
|
|
2210
2775
|
*
|
|
2211
|
-
* await
|
|
2212
|
-
* const
|
|
2213
|
-
* if (result.isSpeech) {
|
|
2214
|
-
* console.log('Speech detected!', result.probability);
|
|
2215
|
-
* }
|
|
2216
|
-
* ```
|
|
2217
|
-
*
|
|
2218
|
-
* @example Force worker usage
|
|
2219
|
-
* ```typescript
|
|
2220
|
-
* const vad = createSileroVAD({
|
|
2221
|
-
* modelUrl: '/models/silero-vad.onnx',
|
|
2222
|
-
* useWorker: true, // Force Worker even on mobile
|
|
2223
|
-
* });
|
|
2776
|
+
* await lam.load();
|
|
2777
|
+
* const { blendshapes } = await lam.infer(audioSamples);
|
|
2224
2778
|
* ```
|
|
2225
2779
|
*
|
|
2226
|
-
* @example Force
|
|
2780
|
+
* @example Force CPU model
|
|
2227
2781
|
* ```typescript
|
|
2228
|
-
* const
|
|
2229
|
-
*
|
|
2230
|
-
*
|
|
2782
|
+
* const lam = createLipSync({
|
|
2783
|
+
* gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
|
|
2784
|
+
* cpuModelUrl: '/models/wav2arkit_cpu.onnx',
|
|
2785
|
+
* mode: 'cpu',
|
|
2231
2786
|
* });
|
|
2232
2787
|
* ```
|
|
2233
2788
|
*/
|
|
2234
2789
|
|
|
2235
2790
|
/**
|
|
2236
|
-
*
|
|
2237
|
-
*
|
|
2238
|
-
* This interface defines the shared API that both implementations provide,
|
|
2239
|
-
* allowing consumers to use either interchangeably.
|
|
2791
|
+
* Configuration for the lip sync factory
|
|
2240
2792
|
*/
|
|
2241
|
-
interface
|
|
2242
|
-
/**
|
|
2243
|
-
|
|
2244
|
-
/** Whether the model is loaded and ready for inference */
|
|
2245
|
-
readonly isLoaded: boolean;
|
|
2246
|
-
/** Audio sample rate (8000 or 16000 Hz) */
|
|
2247
|
-
readonly sampleRate: number;
|
|
2248
|
-
/** Speech detection threshold (0-1) */
|
|
2249
|
-
readonly threshold: number;
|
|
2250
|
-
/**
|
|
2251
|
-
* Load the ONNX model
|
|
2252
|
-
* @returns Model loading information
|
|
2253
|
-
*/
|
|
2254
|
-
load(): Promise<VADModelInfo | VADWorkerModelInfo>;
|
|
2255
|
-
/**
|
|
2256
|
-
* Process a single audio chunk
|
|
2257
|
-
* @param audioChunk - Float32Array of exactly chunkSize samples
|
|
2258
|
-
* @returns VAD result with speech probability
|
|
2259
|
-
*/
|
|
2260
|
-
process(audioChunk: Float32Array): Promise<VADResult$1>;
|
|
2261
|
-
/**
|
|
2262
|
-
* Reset state for new audio stream
|
|
2263
|
-
*/
|
|
2264
|
-
reset(): void | Promise<void>;
|
|
2793
|
+
interface CreateLipSyncConfig {
|
|
2794
|
+
/** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge) */
|
|
2795
|
+
gpuModelUrl: string;
|
|
2265
2796
|
/**
|
|
2266
|
-
*
|
|
2797
|
+
* URL for GPU model external data file (.onnx.data weights).
|
|
2798
|
+
* Default: `${gpuModelUrl}.data`
|
|
2799
|
+
*
|
|
2800
|
+
* Set to `false` to skip external data loading (single-file models only).
|
|
2267
2801
|
*/
|
|
2268
|
-
|
|
2802
|
+
gpuExternalDataUrl?: string | false;
|
|
2803
|
+
/** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS) */
|
|
2804
|
+
cpuModelUrl: string;
|
|
2269
2805
|
/**
|
|
2270
|
-
*
|
|
2806
|
+
* Model selection mode:
|
|
2807
|
+
* - 'auto': Safari/iOS → CPU, everything else → GPU (default)
|
|
2808
|
+
* - 'gpu': Force GPU model (Wav2Vec2Inference)
|
|
2809
|
+
* - 'cpu': Force CPU model (Wav2ArkitCpuInference)
|
|
2271
2810
|
*/
|
|
2272
|
-
|
|
2811
|
+
mode?: 'auto' | 'gpu' | 'cpu';
|
|
2812
|
+
/** Backend preference for GPU model (default: 'auto') */
|
|
2813
|
+
gpuBackend?: BackendPreference;
|
|
2814
|
+
/** Number of identity classes for GPU model (default: 12) */
|
|
2815
|
+
numIdentityClasses?: number;
|
|
2273
2816
|
/**
|
|
2274
|
-
*
|
|
2817
|
+
* Fall back to CPU model if GPU model fails to load (default: true)
|
|
2818
|
+
* Only applies when mode is 'auto' or 'gpu'
|
|
2275
2819
|
*/
|
|
2276
|
-
|
|
2277
|
-
}
|
|
2278
|
-
/**
|
|
2279
|
-
* Configuration for the Silero VAD factory
|
|
2280
|
-
*
|
|
2281
|
-
* Extends SileroVADConfig with worker-specific options.
|
|
2282
|
-
*/
|
|
2283
|
-
interface SileroVADFactoryConfig extends SileroVADConfig {
|
|
2820
|
+
fallbackOnError?: boolean;
|
|
2284
2821
|
/**
|
|
2285
|
-
*
|
|
2286
|
-
*
|
|
2287
|
-
* Auto-detection behavior:
|
|
2288
|
-
* - Desktop: Uses Worker (better responsiveness, off-main-thread)
|
|
2289
|
-
* - Mobile: Uses main thread (avoids 5MB memory overhead)
|
|
2822
|
+
* Use Web Worker for CPU model inference (default: false)
|
|
2290
2823
|
*
|
|
2291
|
-
*
|
|
2292
|
-
*
|
|
2293
|
-
*
|
|
2824
|
+
* When true, Wav2ArkitCpuWorker is used instead of Wav2ArkitCpuInference,
|
|
2825
|
+
* running inference off the main thread to prevent UI blocking during
|
|
2826
|
+
* model loading and inference.
|
|
2294
2827
|
*
|
|
2295
|
-
*
|
|
2828
|
+
* Only applies when the CPU model is selected (mode: 'cpu', auto on Safari/iOS,
|
|
2829
|
+
* or fallback from GPU).
|
|
2296
2830
|
*/
|
|
2297
2831
|
useWorker?: boolean;
|
|
2298
2832
|
/**
|
|
2299
|
-
*
|
|
2300
|
-
*
|
|
2301
|
-
*
|
|
2302
|
-
*
|
|
2303
|
-
*
|
|
2304
|
-
* When false, worker errors will propagate as exceptions.
|
|
2305
|
-
*
|
|
2306
|
-
* Default: true
|
|
2833
|
+
* Unified inference worker instance.
|
|
2834
|
+
* When provided and CPU model is selected, uses Wav2ArkitCpuUnifiedAdapter.
|
|
2835
|
+
* Takes precedence over useWorker setting for the CPU model path.
|
|
2836
|
+
* GPU model (Wav2Vec2) always stays on main thread (WebGPU).
|
|
2307
2837
|
*/
|
|
2308
|
-
|
|
2838
|
+
unifiedWorker?: UnifiedInferenceWorker;
|
|
2309
2839
|
}
|
|
2310
2840
|
/**
|
|
2311
|
-
*
|
|
2312
|
-
*
|
|
2313
|
-
* Requirements:
|
|
2314
|
-
* - Worker constructor must exist
|
|
2315
|
-
* - Blob URL support (for inline worker script)
|
|
2316
|
-
*
|
|
2317
|
-
* @returns true if VAD Worker is supported
|
|
2318
|
-
*/
|
|
2319
|
-
declare function supportsVADWorker(): boolean;
|
|
2320
|
-
/**
|
|
2321
|
-
* Create a Silero VAD instance with automatic implementation selection
|
|
2322
|
-
*
|
|
2323
|
-
* This factory function automatically selects between:
|
|
2324
|
-
* - **SileroVADWorker**: Off-main-thread inference (better for desktop)
|
|
2325
|
-
* - **SileroVADInference**: Main thread inference (better for mobile)
|
|
2326
|
-
*
|
|
2327
|
-
* The selection is based on:
|
|
2328
|
-
* 1. Explicit `useWorker` config (if provided)
|
|
2329
|
-
* 2. Platform detection (mobile vs desktop)
|
|
2330
|
-
* 3. Worker API availability
|
|
2331
|
-
*
|
|
2332
|
-
* Both implementations share the same interface (SileroVADBackend),
|
|
2333
|
-
* so consumers can use either interchangeably.
|
|
2841
|
+
* Create a lip sync instance with automatic GPU/CPU model selection
|
|
2334
2842
|
*
|
|
2335
2843
|
* @param config - Factory configuration
|
|
2336
|
-
* @returns A
|
|
2337
|
-
*
|
|
2338
|
-
* @example
|
|
2339
|
-
* ```typescript
|
|
2340
|
-
* // Auto-detect (recommended)
|
|
2341
|
-
* const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
|
|
2342
|
-
*
|
|
2343
|
-
* // Force Worker
|
|
2344
|
-
* const vadWorker = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: true });
|
|
2345
|
-
*
|
|
2346
|
-
* // Force main thread
|
|
2347
|
-
* const vadMain = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: false });
|
|
2348
|
-
* ```
|
|
2844
|
+
* @returns A LipSyncBackend instance (either GPU or CPU model)
|
|
2349
2845
|
*/
|
|
2350
|
-
declare function
|
|
2846
|
+
declare function createLipSync(config: CreateLipSyncConfig): LipSyncBackend;
|
|
2351
2847
|
|
|
2352
2848
|
/**
|
|
2353
2849
|
* Safari Web Speech API wrapper for iOS speech recognition
|
|
@@ -2791,10 +3287,7 @@ interface ConversationMessage {
|
|
|
2791
3287
|
/** Audio duration if applicable (ms) */
|
|
2792
3288
|
audioDurationMs?: number;
|
|
2793
3289
|
}
|
|
2794
|
-
|
|
2795
|
-
* Session state
|
|
2796
|
-
*/
|
|
2797
|
-
type AISessionState = 'idle' | 'listening' | 'thinking' | 'speaking' | 'interrupted' | 'error' | 'disconnected';
|
|
3290
|
+
|
|
2798
3291
|
/**
|
|
2799
3292
|
* Events emitted by AI adapters
|
|
2800
3293
|
*/
|
|
@@ -3106,7 +3599,6 @@ declare class AgentCoreAdapter extends EventEmitter<AIAdapterEvents> implements
|
|
|
3106
3599
|
* Falls back to simple RMS if VAD not available
|
|
3107
3600
|
*/
|
|
3108
3601
|
private detectVoiceActivity;
|
|
3109
|
-
private int16ToFloat32;
|
|
3110
3602
|
private base64ToArrayBuffer;
|
|
3111
3603
|
private addToHistory;
|
|
3112
3604
|
private handleDisconnect;
|
|
@@ -4704,4 +5196,4 @@ declare class ProceduralLifeLayer {
|
|
|
4704
5196
|
private updateBrowNoise;
|
|
4705
5197
|
}
|
|
4706
5198
|
|
|
4707
|
-
export { type AIAdapter, type AIAdapterEvents,
|
|
5199
|
+
export { type AIAdapter, type AIAdapterEvents, AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, type CTCDecodeResult, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, type CreateLipSyncConfig, type CreateSenseVoiceConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_ARKIT_MAP, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type Emotion2VecLabel, type EmotionAnimationMap, type EmotionBlendMode, type EmotionBlendshapeConfig, EmotionController, type EmotionFrame, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionToBlendshapeMapper, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type KaldiFbankOptions, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LipSyncBackend, type LipSyncModelInfo, type LipSyncResult, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, ProceduralLifeLayer, type QuotaInfo, RingBuffer, type 
RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SessionConfig, type SessionOptions, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type Transition, UPPER_FACE_BLENDSHAPES, UnifiedInferenceWorker, type UpperFaceBlendshapeName, type UpperFaceBlendshapes, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, WAV2ARKIT_BLENDSHAPES, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2ArkitCpuUnifiedAdapter, Wav2ArkitCpuWorker, type Wav2ArkitCpuWorkerConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, applyCMVN, applyLFR, blendEmotions, calculatePeak, calculateRMS, computeKaldiFbank, configureCacheLimit, configureTelemetry, createEmotionVector, createLipSync, createSenseVoice, createSessionWithFallback, createSileroVAD, ctcGreedyDecode, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, 
lerpEmotion, parseCMVNFromMetadata, parseTokensFile, preloadModels, preloadOnnxRuntime, remapWav2ArkitToLam, resolveBackend, resolveLanguageId, resolveTextNormId, shouldEnableWasmProxy, shouldUseCpuLipSync, shouldUseNativeASR, shouldUseServerLipSync, supportsVADWorker, symmetrizeBlendshapes };
|