omnivad 0.2.8 → 0.2.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,194 @@
1
+ # omnivad
2
+
3
+ [![npm](https://img.shields.io/npm/v/omnivad)](https://www.npmjs.com/package/omnivad)
4
+ [![npm bundle size](https://img.shields.io/bundlephobia/min/omnivad)](https://bundlephobia.com/package/omnivad)
5
+ [![license](https://img.shields.io/npm/l/omnivad)](https://github.com/lifeiteng/OmniVAD-Kit/blob/main/LICENSE)
6
+
7
+ Cross-platform Voice Activity Detection and Audio Event Detection via WebAssembly.
8
+ Runs in **browsers, Web Workers, and Node.js** with a single API. Zero runtime
9
+ dependencies. Built on [FireRedVAD](https://github.com/FireRedTeam/FireRedVAD)
10
+ from Xiaohongshu (DFSMN architecture, ~2.2 MB per model).
11
+
12
+ ## What's in the box
13
+
14
+ | API | Use case | Output |
15
+ |-------|----------|--------|
16
+ | **`OmniVAD`** | Whole-audio voice activity detection | `[start, end]` timestamps |
17
+ | **`OmniStreamVAD`** | Real-time, frame-by-frame VAD with segment-boundary events | per-frame probability + start/end events |
18
+ | **`OmniAED`** | Audio event detection (3-class) | `speech` / `singing` / `music` timestamps |
19
+ | **`mergeChunks`** | Pack VAD output into Whisper-style 30 s chunks | `{ start, end, segStartIdx, segCount }[]` |
20
+
21
+ All four share one WASM module (~2.2 MB SIMD-enabled), one C implementation,
22
+ and a single bundle (~24 KB JS, ESM + CJS + types).
23
+
24
+ ## Install
25
+
26
+ ```bash
27
+ pnpm add omnivad # or: npm install omnivad / yarn add omnivad
28
+ ```
29
+
30
+ Models are served from jsDelivr by default (zero config). For air-gapped or
31
+ custom deployments, pass `modelUrl` or pre-loaded `modelData`.
32
+
33
+ ## Quickstart — whole-audio VAD
34
+
35
+ ```ts
36
+ import { OmniVAD } from "omnivad";
37
+
38
+ const vad = await OmniVAD.create();
39
+
40
+ // Float32Array in [-1, 1] (Web Audio, decodeAudioData) or Int16Array (raw PCM)
41
+ const result = vad.detect(audioFloat32);
42
+ // { duration: 12.4, timestamps: [[0.35, 4.8], [5.1, 12.4]] }
43
+ ```
44
+
45
+ ## Streaming VAD — real-time, frame-by-frame
46
+
47
+ `OmniStreamVAD` processes 10 ms frames (160 samples @ 16 kHz) and emits
48
+ segment-boundary events on the same call that confirms the boundary —
49
+ bit-identical to upstream FireRedVAD's `FireRedStreamVad`.
50
+
51
+ `processFrame()` accepts `Float32Array` in `[-1, 1]` (Web Audio,
52
+ `AudioWorkletProcessor`, decoded WebRTC tracks) or `Int16Array` PCM
53
+ (WAV / microphone). Dispatch is by dtype — no scaling in JS.
54
+
55
+ ```ts
56
+ import { OmniStreamVAD } from "omnivad";
57
+
58
+ const vad = await OmniStreamVAD.create();
59
+
60
+ // Float32Array [-1, 1] from Web Audio:
61
+ for (let i = 0; i + 160 <= floatPcm.length; i += 160) {
62
+ const r = vad.processFrame(floatPcm.subarray(i, i + 160));
63
+ if (!r) continue;
64
+ if (r.isSpeechStart) console.log(`START @ ${(r.speechStartFrame * 0.01).toFixed(2)}s`);
65
+ if (r.isSpeechEnd) console.log(`END @ ${(r.speechEndFrame * 0.01).toFixed(2)}s`);
66
+ }
67
+
68
+ // Or Int16Array PCM from a WAV file — same call, same result:
69
+ for (let i = 0; i + 160 <= int16Pcm.length; i += 160) {
70
+ vad.processFrame(int16Pcm.subarray(i, i + 160));
71
+ }
72
+ ```
73
+
74
+ `processFrame()` returns `{ confidence, smoothedProb, isSpeech, isSpeechStart,
75
+ isSpeechEnd, frameIdx, speechStartFrame, speechEndFrame }` — every field comes
76
+ straight from the C state machine.
77
+
78
+ ## Audio Event Detection — speech / singing / music
79
+
80
+ ```ts
81
+ import { OmniAED } from "omnivad";
82
+
83
+ const aed = await OmniAED.create();
84
+ const events = aed.detect(audioFloat32);
85
+ // { duration: 22.0,
86
+ // events: { speech: [[...]], singing: [[...]], music: [[...]] },
87
+ // ratios: { speech: 0.41, singing: 0.0, music: 0.59 } }
88
+ ```
89
+
90
+ ## Whisper / WhisperX-style chunking
91
+
92
+ `OmniVAD` + `mergeChunks(mode: "greedy")` is the 1:1 equivalent of WhisperX's
93
+ `Binarize(max_duration=chunk_size)` + greedy packing. Use this recipe when
94
+ feeding chunks into Whisper-family ASR models that expect a fixed 30 s window:
95
+
96
+ ```ts
97
+ import { OmniVAD, mergeChunks } from "omnivad";
98
+
99
+ const vad = await OmniVAD.create(); // threshold=0.4 default — safer for Whisper
100
+ const result = vad.detect(audioFloat32);
101
+
102
+ const chunks = await mergeChunks(result.timestamps, {
103
+ maxChunkSecs: 30.0, // Whisper input window
104
+ mode: "greedy", // WhisperX behavior
105
+ padOnsetSecs: 0.04,
106
+ padOffsetSecs: 0.04,
107
+ minSilenceSecs: 0.20,
108
+ });
109
+ // Slice the audio at [chunk.start, chunk.end] and feed each slice to Whisper.
110
+ ```
111
+
112
+ A second mode `"longest_gap"` exists for variable-length-input models
113
+ (forced alignment, TTS) — see the GitHub README for the comparison table.
114
+
115
+ ## Multi-stream concurrency
116
+
117
+ `OmniStreamVAD` instances have mutable per-stream state and **must not** be
118
+ shared across concurrent streams. Use `clone()` to spin up a fresh instance
119
+ that shares the underlying model weights but has its own state — instant,
120
+ near-zero memory overhead per stream.
121
+
122
+ ```ts
123
+ const base = await OmniStreamVAD.create();
124
+ const streamA = base.clone();
125
+ const streamB = base.clone();
126
+ // Process two independent audio sessions in parallel.
127
+ ```
128
+
129
+ ## Models and CDN
130
+
131
+ By default, models are fetched from jsDelivr:
132
+
133
+ ```
134
+ https://cdn.jsdelivr.net/npm/omnivad@<version>/models/{vad,stream-vad,aed}.omnivad
135
+ ```
136
+
137
+ Override per call when you need to host them yourself or pre-bundle:
138
+
139
+ ```ts
140
+ const vad = await OmniVAD.create({
141
+ modelUrl: "https://your-cdn/vad.omnivad", // or
142
+ modelData: arrayBufferYouAlreadyHave,
143
+ });
144
+ ```
145
+
146
+ In Node.js, models are read from the installed package (`omnivad/models/`) — no
147
+ network access required at runtime.
148
+
149
+ ## Performance
150
+
151
+ Real-Time Factor (lower = faster) on Apple M-series:
152
+
153
+ | Model | RTF | Speed |
154
+ |-------|-----|-------|
155
+ | VAD | ~0.003 | ~330× real-time |
156
+ | Streaming VAD | ~0.002 | ~500× real-time |
157
+ | AED | ~0.002 | ~500× real-time |
158
+
159
+ WASM is built with SIMD enabled and ncnn fp16 weights.
160
+
161
+ ## Accuracy
162
+
163
+ Verified bit-identical to upstream PyTorch reference on 5 audio files × 3
164
+ models — see the [accuracy table](https://github.com/lifeiteng/OmniVAD-Kit#testing)
165
+ in the main repo.
166
+
167
+ ## Browser, Worker, Node — same API
168
+
169
+ The package detects its runtime and loads the right glue:
170
+
171
+ - **Browsers (main thread)** — classic-script injection of the Emscripten glue
172
+ (works around `MODULARIZE=1` IIFE issues with `import()`).
173
+ - **Web Workers / ServiceWorkers** — same path via `importScripts`.
174
+ - **Node.js (≥ 18)** — `createRequire` + local CJS resolution. No bundler
175
+ config needed.
176
+
177
+ ## See also
178
+
179
+ - Full documentation, accuracy tables, C/C++ API, Python package, native build:
180
+ [GitHub repository](https://github.com/lifeiteng/OmniVAD-Kit)
181
+ - [中文 README](https://github.com/lifeiteng/OmniVAD-Kit/blob/main/README.zh.md)
182
+ - [Local development guide](https://github.com/lifeiteng/OmniVAD-Kit#local-development)
183
+
184
+ ## Credits
185
+
186
+ - [**FireRedVAD**](https://github.com/FireRedTeam/FireRedVAD) — Kaituo Xu,
187
+ Wenpeng Li, Kai Huang, Kun Liu (Xiaohongshu). Source models, DFSMN
188
+ architecture, training pipeline.
189
+ - [ncnn](https://github.com/Tencent/ncnn) — Tencent. Inference backend.
190
+ - [Emscripten](https://emscripten.org/) — WebAssembly toolchain.
191
+
192
+ ## License
193
+
194
+ Apache-2.0 — same as upstream FireRedVAD.
package/dist/index.cjs CHANGED
@@ -38,7 +38,7 @@ var SIZEOF_AED_SEGMENT = 16;
38
38
  var SIZEOF_CHUNK_CONFIG = 28;
39
39
  var SIZEOF_CHUNK = 16;
40
40
  var OMNI_ERR_NO_FRAMES = -7;
41
- var VERSION = "0.2.8";
41
+ var VERSION = "0.2.10";
42
42
  var DEFAULT_CDN_BASE = `https://cdn.jsdelivr.net/npm/omnivad@${VERSION}/models`;
43
43
  var MODEL_FILES = {
44
44
  vad: "vad.omnivad",
@@ -163,6 +163,23 @@ function copyAudioToHeap(M, audio) {
163
163
  heap.set(audio);
164
164
  return ptr;
165
165
  }
166
+ function copyInt16ToHeap(M, audio) {
167
+ const ptr = M._malloc(audio.length * 2);
168
+ const heap = new Int16Array(M.HEAPU8.buffer, ptr, audio.length);
169
+ heap.set(audio);
170
+ return ptr;
171
+ }
172
+ function dispatchAudio(M, audio) {
173
+ if (audio instanceof Float32Array) {
174
+ return { ptr: copyAudioToHeap(M, audio), length: audio.length, format: "f32" };
175
+ }
176
+ if (audio instanceof Int16Array) {
177
+ return { ptr: copyInt16ToHeap(M, audio), length: audio.length, format: "int16" };
178
+ }
179
+ throw new TypeError(
180
+ `unsupported audio dtype; expected Float32Array in [-1, 1] or Int16Array`
181
+ );
182
+ }
166
183
  function writePostConfig(M, ptr, cfg) {
167
184
  M.setValue(ptr + 0, cfg.threshold, "float");
168
185
  M.setValue(ptr + 4, cfg.smoothWindowSize, "i32");
@@ -402,14 +419,15 @@ function streamVadCreate(M, modelBuffer, config = {}) {
402
419
  }
403
420
  }
404
421
  var SIZEOF_STREAM_VAD_RESULT = 24;
405
- function streamVadProcess(M, handle, pcm16Ptr, numSamples) {
422
+ function streamVadProcess(M, handle, audioPtr, numSamples, format = "f32") {
406
423
  const resultPtr = M._malloc(SIZEOF_STREAM_VAD_RESULT);
424
+ const fn = format === "int16" ? "omni_stream_vad_process_int16" : "omni_stream_vad_process";
407
425
  try {
408
426
  const ret = M.ccall(
409
- "omni_stream_vad_process",
427
+ fn,
410
428
  "number",
411
429
  ["number", "number", "number", "number"],
412
- [handle, pcm16Ptr, numSamples, resultPtr]
430
+ [handle, audioPtr, numSamples, resultPtr]
413
431
  );
414
432
  if (ret === OMNI_ERR_NO_FRAMES) return null;
415
433
  if (ret !== 0) throw new Error(`StreamVAD process failed: ${ret}`);
@@ -427,6 +445,28 @@ function streamVadProcess(M, handle, pcm16Ptr, numSamples) {
427
445
  M._free(resultPtr);
428
446
  }
429
447
  }
448
+ function streamVadDetectFull(M, handle, audioPtr, numSamples, format = "f32") {
449
+ const probsPtrPtr = M._malloc(4);
450
+ const framesPtr = M._malloc(4);
451
+ const fn = format === "int16" ? "omni_stream_vad_detect_full_int16" : "omni_stream_vad_detect_full";
452
+ try {
453
+ const ret = M.ccall(
454
+ fn,
455
+ "number",
456
+ ["number", "number", "number", "number", "number"],
457
+ [handle, audioPtr, numSamples, probsPtrPtr, framesPtr]
458
+ );
459
+ if (ret !== 0) throw new Error(`StreamVAD detectFull failed: ${ret}`);
460
+ const numFrames = M.getValue(framesPtr, "i32");
461
+ const probsPtr = M.getValue(probsPtrPtr, "i32");
462
+ const probabilities = probsPtr ? new Float32Array(new Float32Array(M.HEAPU8.buffer, probsPtr, numFrames)) : new Float32Array(0);
463
+ if (probsPtr) M._free(probsPtr);
464
+ return { probabilities, numFrames };
465
+ } finally {
466
+ M._free(probsPtrPtr);
467
+ M._free(framesPtr);
468
+ }
469
+ }
430
470
  function streamVadClone(M, handle) {
431
471
  const errPtr = M._malloc(4);
432
472
  try {
@@ -486,7 +526,7 @@ var OmniVAD = class _OmniVAD {
486
526
  */
487
527
  detect(audio) {
488
528
  const M = getModule();
489
- const { ptr, length, format } = prepareAudio(M, audio);
529
+ const { ptr, length, format } = dispatchAudio(M, audio);
490
530
  try {
491
531
  const timestamps = vadDetect(M, this.handle, ptr, length, this.config, format);
492
532
  return {
@@ -505,16 +545,6 @@ var OmniVAD = class _OmniVAD {
505
545
  }
506
546
  }
507
547
  };
508
- function prepareAudio(M, audio) {
509
- const f32 = audio instanceof Int16Array ? int16ToNormalizedFloat32(audio) : audio;
510
- const ptr = copyAudioToHeap(M, f32);
511
- return { ptr, length: f32.length, format: "f32" };
512
- }
513
- function int16ToNormalizedFloat32(i16) {
514
- const f32 = new Float32Array(i16.length);
515
- for (let i = 0; i < i16.length; i++) f32[i] = i16[i] / 32768;
516
- return f32;
517
- }
518
548
 
519
549
  // src/stream-vad.ts
520
550
  var SAMPLE_RATE2 = 16e3;
@@ -553,20 +583,23 @@ var OmniStreamVAD = class _OmniStreamVAD {
553
583
  return new _OmniStreamVAD(newHandle);
554
584
  }
555
585
  /**
556
- * Process one frame of audio (160 int16 samples = 10ms @ 16kHz).
586
+ * Process one frame of audio (160 samples = 10ms @ 16kHz).
587
+ *
588
+ * Accepts Float32Array in [-1, 1] (Web Audio, soundfile, torch) or
589
+ * Int16Array PCM (WAV, microphone). Dispatches by dtype to the matching
590
+ * C entry — no scaling in JS.
591
+ *
557
592
  * Returns null until enough audio is accumulated.
558
593
  *
559
594
  * Segment-boundary events (isSpeechStart / isSpeechEnd and the matching
560
595
  * speech_*_frame indices) come straight from the C-layer state machine
561
596
  * (bit-identical to upstream FireRedVAD) — the wrapper is just a marshaller.
562
597
  */
563
- processFrame(pcm160) {
598
+ processFrame(audio) {
564
599
  const M = getModule();
565
- const ptr = M._malloc(pcm160.length * 2);
566
- const heap16 = new Int16Array(M.HEAPU8.buffer, ptr, pcm160.length);
567
- heap16.set(pcm160);
600
+ const { ptr, length, format } = dispatchAudio(M, audio);
568
601
  try {
569
- const result = streamVadProcess(M, this.handle, ptr, pcm160.length);
602
+ const result = streamVadProcess(M, this.handle, ptr, length, format);
570
603
  if (!result) return null;
571
604
  return {
572
605
  confidence: result.confidence,
@@ -588,31 +621,22 @@ var OmniStreamVAD = class _OmniStreamVAD {
588
621
  */
589
622
  detectFull(audio) {
590
623
  const M = getModule();
591
- const f32 = prepareDetectFullAudio(audio);
592
- const audioPtr = copyAudioToHeap(M, f32);
593
- const probsPtrPtr = M._malloc(4);
594
- const framesPtr = M._malloc(4);
624
+ const { ptr, length, format } = dispatchAudio(M, audio);
595
625
  try {
596
- const ret = M.ccall(
597
- "omni_stream_vad_detect_full",
598
- "number",
599
- ["number", "number", "number", "number", "number"],
600
- [this.handle, audioPtr, f32.length, probsPtrPtr, framesPtr]
626
+ const { probabilities, numFrames } = streamVadDetectFull(
627
+ M,
628
+ this.handle,
629
+ ptr,
630
+ length,
631
+ format
601
632
  );
602
- if (ret !== 0) throw new Error(`StreamVAD detectFull failed: ${ret}`);
603
- const numFrames = M.getValue(framesPtr, "i32");
604
- const probsPtr = M.getValue(probsPtrPtr, "i32");
605
- const probabilities = probsPtr ? new Float32Array(new Float32Array(M.HEAPU8.buffer, probsPtr, numFrames)) : new Float32Array(0);
606
- if (probsPtr) M._free(probsPtr);
607
633
  return {
608
634
  probabilities,
609
635
  numFrames,
610
- duration: Math.round(f32.length / SAMPLE_RATE2 * 1e3) / 1e3
636
+ duration: Math.round(length / SAMPLE_RATE2 * 1e3) / 1e3
611
637
  };
612
638
  } finally {
613
- M._free(audioPtr);
614
- M._free(probsPtrPtr);
615
- M._free(framesPtr);
639
+ M._free(ptr);
616
640
  }
617
641
  }
618
642
  /** Reset all internal state (model cache, audio buffer, postprocessor). */
@@ -627,31 +651,6 @@ var OmniStreamVAD = class _OmniStreamVAD {
627
651
  }
628
652
  }
629
653
  };
630
- function int16ToFloat32(i16) {
631
- const f32 = new Float32Array(i16.length);
632
- for (let i = 0; i < i16.length; i++) f32[i] = i16[i];
633
- return f32;
634
- }
635
- function prepareDetectFullAudio(audio) {
636
- if (audio instanceof Int16Array) {
637
- return int16ToFloat32(audio);
638
- }
639
- if (isNormalizedFloat(audio)) {
640
- const scaled = new Float32Array(audio.length);
641
- for (let i = 0; i < audio.length; i++) scaled[i] = audio[i] * 32768;
642
- return scaled;
643
- }
644
- return audio;
645
- }
646
- function isNormalizedFloat(audio) {
647
- const step = Math.max(1, Math.floor(audio.length / 1e3));
648
- let maxAbs = 0;
649
- for (let i = 0; i < audio.length; i += step) {
650
- const v = Math.abs(audio[i]);
651
- if (v > maxAbs) maxAbs = v;
652
- }
653
- return maxAbs <= 1;
654
- }
655
654
 
656
655
  // src/aed.ts
657
656
  var SAMPLE_RATE3 = 16e3;
@@ -691,7 +690,7 @@ var OmniAED = class _OmniAED {
691
690
  */
692
691
  detect(audio) {
693
692
  const M = getModule();
694
- const { ptr, length, format } = prepareAudio2(M, audio);
693
+ const { ptr, length, format } = dispatchAudio(M, audio);
695
694
  const duration = Math.round(length / SAMPLE_RATE3 * 1e3) / 1e3;
696
695
  try {
697
696
  const events = aedDetect(M, this.handle, ptr, length, this.config, format);
@@ -712,18 +711,6 @@ var OmniAED = class _OmniAED {
712
711
  }
713
712
  }
714
713
  };
715
- function prepareAudio2(M, audio) {
716
- const f32 = audio instanceof Int16Array ? int16ToNormalizedFloat322(audio) : audio;
717
- const ptr = M._malloc(f32.length * 4);
718
- const heap = new Float32Array(M.HEAPU8.buffer, ptr, f32.length);
719
- heap.set(f32);
720
- return { ptr, length: f32.length, format: "f32" };
721
- }
722
- function int16ToNormalizedFloat322(i16) {
723
- const f32 = new Float32Array(i16.length);
724
- for (let i = 0; i < i16.length; i++) f32[i] = i16[i] / 32768;
725
- return f32;
726
- }
727
714
  function computeCoverageRatios(events, duration) {
728
715
  const ratios = {
729
716
  speech: 0,