omnivad 0.2.8 → 0.2.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +194 -0
- package/dist/index.cjs +64 -77
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +18 -8
- package/dist/index.d.ts +18 -8
- package/dist/index.js +64 -77
- package/dist/index.js.map +1 -1
- package/dist/wasm/omnivad.cjs +1 -1
- package/dist/wasm/omnivad.js +1 -1
- package/dist/wasm/omnivad.wasm +0 -0
- package/package.json +34 -6
package/README.md
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
# omnivad
|
|
2
|
+
|
|
3
|
+
[npm package](https://www.npmjs.com/package/omnivad)
|
|
4
|
+
[bundle size](https://bundlephobia.com/package/omnivad)
|
|
5
|
+
[License: Apache-2.0](https://github.com/lifeiteng/OmniVAD-Kit/blob/main/LICENSE)
|
|
6
|
+
|
|
7
|
+
Cross-platform Voice Activity Detection and Audio Event Detection via WebAssembly.
|
|
8
|
+
Runs in **browsers, Web Workers, and Node.js** with a single API. Zero runtime
|
|
9
|
+
dependencies. Built on [FireRedVAD](https://github.com/FireRedTeam/FireRedVAD)
|
|
10
|
+
from Xiaohongshu (DFSMN architecture, ~2.2 MB per model).
|
|
11
|
+
|
|
12
|
+
## What's in the box
|
|
13
|
+
|
|
14
|
+
| Class | Use case | Output |
|
|
15
|
+
|-------|----------|--------|
|
|
16
|
+
| **`OmniVAD`** | Whole-audio voice activity detection | `[start, end]` timestamps |
|
|
17
|
+
| **`OmniStreamVAD`** | Real-time, frame-by-frame VAD with segment-boundary events | per-frame probability + start/end events |
|
|
18
|
+
| **`OmniAED`** | Audio event detection (3-class) | `speech` / `singing` / `music` timestamps |
|
|
19
|
+
| **`mergeChunks`** | Pack VAD output into Whisper-style 30 s chunks | `{ start, end, segStartIdx, segCount }[]` |
|
|
20
|
+
|
|
21
|
+
All four share one WASM module (~2.2 MB SIMD-enabled), one C implementation,
|
|
22
|
+
and a single bundle (~24 KB JS, ESM + CJS + types).
|
|
23
|
+
|
|
24
|
+
## Install
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pnpm add omnivad # or: npm install omnivad / yarn add omnivad
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
Models are served from jsDelivr by default (zero config). For air-gapped or
|
|
31
|
+
custom deployments, pass `modelUrl` or pre-loaded `modelData`.
|
|
32
|
+
|
|
33
|
+
## Quickstart — whole-audio VAD
|
|
34
|
+
|
|
35
|
+
```ts
|
|
36
|
+
import { OmniVAD } from "omnivad";
|
|
37
|
+
|
|
38
|
+
const vad = await OmniVAD.create();
|
|
39
|
+
|
|
40
|
+
// Float32Array in [-1, 1] (Web Audio, decodeAudioData) or Int16Array (raw PCM)
|
|
41
|
+
const result = vad.detect(audioFloat32);
|
|
42
|
+
// { duration: 12.4, timestamps: [[0.35, 4.8], [5.1, 12.4]] }
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Streaming VAD — real-time, frame-by-frame
|
|
46
|
+
|
|
47
|
+
`OmniStreamVAD` processes 10 ms frames (160 samples @ 16 kHz) and emits
|
|
48
|
+
segment-boundary events on the same call that confirms the boundary —
|
|
49
|
+
bit-identical to upstream FireRedVAD's `FireRedStreamVad`.
|
|
50
|
+
|
|
51
|
+
`processFrame()` accepts `Float32Array` in `[-1, 1]` (Web Audio,
|
|
52
|
+
`AudioWorkletProcessor`, decoded WebRTC tracks) or `Int16Array` PCM
|
|
53
|
+
(WAV / microphone). Dispatch is by dtype — no scaling in JS.
|
|
54
|
+
|
|
55
|
+
```ts
|
|
56
|
+
import { OmniStreamVAD } from "omnivad";
|
|
57
|
+
|
|
58
|
+
const vad = await OmniStreamVAD.create();
|
|
59
|
+
|
|
60
|
+
// Float32Array [-1, 1] from Web Audio:
|
|
61
|
+
for (let i = 0; i + 160 <= floatPcm.length; i += 160) {
|
|
62
|
+
const r = vad.processFrame(floatPcm.subarray(i, i + 160));
|
|
63
|
+
if (!r) continue;
|
|
64
|
+
if (r.isSpeechStart) console.log(`START @ ${(r.speechStartFrame * 0.01).toFixed(2)}s`);
|
|
65
|
+
if (r.isSpeechEnd) console.log(`END @ ${(r.speechEndFrame * 0.01).toFixed(2)}s`);
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
// Or Int16Array PCM from a WAV file — same call, same result:
|
|
69
|
+
for (let i = 0; i + 160 <= int16Pcm.length; i += 160) {
|
|
70
|
+
vad.processFrame(int16Pcm.subarray(i, i + 160));
|
|
71
|
+
}
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
`processFrame()` returns `{ confidence, smoothedProb, isSpeech, isSpeechStart,
|
|
75
|
+
isSpeechEnd, frameIdx, speechStartFrame, speechEndFrame }` — every field comes
|
|
76
|
+
straight from the C state machine.
|
|
77
|
+
|
|
78
|
+
## Audio Event Detection — speech / singing / music
|
|
79
|
+
|
|
80
|
+
```ts
|
|
81
|
+
import { OmniAED } from "omnivad";
|
|
82
|
+
|
|
83
|
+
const aed = await OmniAED.create();
|
|
84
|
+
const events = aed.detect(audioFloat32);
|
|
85
|
+
// { duration: 22.0,
|
|
86
|
+
// events: { speech: [[...]], singing: [[...]], music: [[...]] },
|
|
87
|
+
// ratios: { speech: 0.41, singing: 0.0, music: 0.59 } }
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Whisper / WhisperX-style chunking
|
|
91
|
+
|
|
92
|
+
`OmniVAD` + `mergeChunks(mode: "greedy")` is the 1:1 equivalent of WhisperX's
|
|
93
|
+
`Binarize(max_duration=chunk_size)` + greedy packing. Use this recipe when
|
|
94
|
+
feeding chunks into Whisper-family ASR models that expect a fixed 30 s window:
|
|
95
|
+
|
|
96
|
+
```ts
|
|
97
|
+
import { OmniVAD, mergeChunks } from "omnivad";
|
|
98
|
+
|
|
99
|
+
const vad = await OmniVAD.create(); // threshold=0.4 default — safer for Whisper
|
|
100
|
+
const result = vad.detect(audioFloat32);
|
|
101
|
+
|
|
102
|
+
const chunks = await mergeChunks(result.timestamps, {
|
|
103
|
+
maxChunkSecs: 30.0, // Whisper input window
|
|
104
|
+
mode: "greedy", // WhisperX behavior
|
|
105
|
+
padOnsetSecs: 0.04,
|
|
106
|
+
padOffsetSecs: 0.04,
|
|
107
|
+
minSilenceSecs: 0.20,
|
|
108
|
+
});
|
|
109
|
+
// Slice the audio at [chunk.start, chunk.end] and feed each slice to Whisper.
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
A second mode `"longest_gap"` exists for variable-length-input models
|
|
113
|
+
(forced alignment, TTS) — see the GitHub README for the comparison table.
|
|
114
|
+
|
|
115
|
+
## Multi-stream concurrency
|
|
116
|
+
|
|
117
|
+
`OmniStreamVAD` instances have mutable per-stream state and **must not** be
|
|
118
|
+
shared across concurrent streams. Use `clone()` to spin up a fresh instance
|
|
119
|
+
that shares the underlying model weights but has its own state — instant,
|
|
120
|
+
near-zero memory overhead per stream.
|
|
121
|
+
|
|
122
|
+
```ts
|
|
123
|
+
const base = await OmniStreamVAD.create();
|
|
124
|
+
const streamA = base.clone();
|
|
125
|
+
const streamB = base.clone();
|
|
126
|
+
// Process two independent audio sessions in parallel.
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## Models and CDN
|
|
130
|
+
|
|
131
|
+
By default, models are fetched from jsDelivr:
|
|
132
|
+
|
|
133
|
+
```
|
|
134
|
+
https://cdn.jsdelivr.net/npm/omnivad@<version>/models/{vad,stream-vad,aed}.omnivad
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
Override per call when you need to host them yourself or pre-bundle:
|
|
138
|
+
|
|
139
|
+
```ts
|
|
140
|
+
const vad = await OmniVAD.create({
|
|
141
|
+
modelUrl: "https://your-cdn/vad.omnivad", // or
|
|
142
|
+
modelData: arrayBufferYouAlreadyHave,
|
|
143
|
+
});
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
In Node.js, models are read from the installed package (`omnivad/models/`) — no
|
|
147
|
+
network access required at runtime.
|
|
148
|
+
|
|
149
|
+
## Performance
|
|
150
|
+
|
|
151
|
+
Real-Time Factor (lower = faster) on Apple M-series:
|
|
152
|
+
|
|
153
|
+
| Model | RTF | Speed |
|
|
154
|
+
|-------|-----|-------|
|
|
155
|
+
| VAD | ~0.003 | ~330× real-time |
|
|
156
|
+
| Streaming VAD | ~0.002 | ~500× real-time |
|
|
157
|
+
| AED | ~0.002 | ~500× real-time |
|
|
158
|
+
|
|
159
|
+
WASM is built with SIMD enabled and ncnn fp16 weights.
|
|
160
|
+
|
|
161
|
+
## Accuracy
|
|
162
|
+
|
|
163
|
+
Verified bit-identical to upstream PyTorch reference on 5 audio files × 3
|
|
164
|
+
models — see the [accuracy table](https://github.com/lifeiteng/OmniVAD-Kit#testing)
|
|
165
|
+
in the main repo.
|
|
166
|
+
|
|
167
|
+
## Browser, Worker, Node — same API
|
|
168
|
+
|
|
169
|
+
The package detects its runtime and loads the right glue:
|
|
170
|
+
|
|
171
|
+
- **Browsers (main thread)** — classic-script injection of the Emscripten glue
|
|
172
|
+
(works around `MODULARIZE=1` IIFE issues with `import()`).
|
|
173
|
+
- **Web Workers / ServiceWorkers** — same path via `importScripts`.
|
|
174
|
+
- **Node.js (≥ 18)** — `createRequire` + local CJS resolution. No bundler
|
|
175
|
+
config needed.
|
|
176
|
+
|
|
177
|
+
## See also
|
|
178
|
+
|
|
179
|
+
- Full documentation, accuracy tables, C/C++ API, Python package, native build:
|
|
180
|
+
[GitHub repository](https://github.com/lifeiteng/OmniVAD-Kit)
|
|
181
|
+
- [中文 README](https://github.com/lifeiteng/OmniVAD-Kit/blob/main/README.zh.md)
|
|
182
|
+
- [Local development guide](https://github.com/lifeiteng/OmniVAD-Kit#local-development)
|
|
183
|
+
|
|
184
|
+
## Credits
|
|
185
|
+
|
|
186
|
+
- [**FireRedVAD**](https://github.com/FireRedTeam/FireRedVAD) — Kaituo Xu,
|
|
187
|
+
Wenpeng Li, Kai Huang, Kun Liu (Xiaohongshu). Source models, DFSMN
|
|
188
|
+
architecture, training pipeline.
|
|
189
|
+
- [ncnn](https://github.com/Tencent/ncnn) — Tencent. Inference backend.
|
|
190
|
+
- [Emscripten](https://emscripten.org/) — WebAssembly toolchain.
|
|
191
|
+
|
|
192
|
+
## License
|
|
193
|
+
|
|
194
|
+
Apache-2.0 — same as upstream FireRedVAD.
|
package/dist/index.cjs
CHANGED
|
@@ -38,7 +38,7 @@ var SIZEOF_AED_SEGMENT = 16;
|
|
|
38
38
|
var SIZEOF_CHUNK_CONFIG = 28;
|
|
39
39
|
var SIZEOF_CHUNK = 16;
|
|
40
40
|
var OMNI_ERR_NO_FRAMES = -7;
|
|
41
|
-
var VERSION = "0.2.8";
|
|
41
|
+
var VERSION = "0.2.10";
|
|
42
42
|
var DEFAULT_CDN_BASE = `https://cdn.jsdelivr.net/npm/omnivad@${VERSION}/models`;
|
|
43
43
|
var MODEL_FILES = {
|
|
44
44
|
vad: "vad.omnivad",
|
|
@@ -163,6 +163,23 @@ function copyAudioToHeap(M, audio) {
|
|
|
163
163
|
heap.set(audio);
|
|
164
164
|
return ptr;
|
|
165
165
|
}
|
|
166
|
+
function copyInt16ToHeap(M, audio) {
|
|
167
|
+
const ptr = M._malloc(audio.length * 2);
|
|
168
|
+
const heap = new Int16Array(M.HEAPU8.buffer, ptr, audio.length);
|
|
169
|
+
heap.set(audio);
|
|
170
|
+
return ptr;
|
|
171
|
+
}
|
|
172
|
+
function dispatchAudio(M, audio) {
|
|
173
|
+
if (audio instanceof Float32Array) {
|
|
174
|
+
return { ptr: copyAudioToHeap(M, audio), length: audio.length, format: "f32" };
|
|
175
|
+
}
|
|
176
|
+
if (audio instanceof Int16Array) {
|
|
177
|
+
return { ptr: copyInt16ToHeap(M, audio), length: audio.length, format: "int16" };
|
|
178
|
+
}
|
|
179
|
+
throw new TypeError(
|
|
180
|
+
`unsupported audio dtype; expected Float32Array in [-1, 1] or Int16Array`
|
|
181
|
+
);
|
|
182
|
+
}
|
|
166
183
|
function writePostConfig(M, ptr, cfg) {
|
|
167
184
|
M.setValue(ptr + 0, cfg.threshold, "float");
|
|
168
185
|
M.setValue(ptr + 4, cfg.smoothWindowSize, "i32");
|
|
@@ -402,14 +419,15 @@ function streamVadCreate(M, modelBuffer, config = {}) {
|
|
|
402
419
|
}
|
|
403
420
|
}
|
|
404
421
|
var SIZEOF_STREAM_VAD_RESULT = 24;
|
|
405
|
-
function streamVadProcess(M, handle, pcm16Ptr, numSamples) {
|
|
422
|
+
function streamVadProcess(M, handle, audioPtr, numSamples, format = "f32") {
|
|
406
423
|
const resultPtr = M._malloc(SIZEOF_STREAM_VAD_RESULT);
|
|
424
|
+
const fn = format === "int16" ? "omni_stream_vad_process_int16" : "omni_stream_vad_process";
|
|
407
425
|
try {
|
|
408
426
|
const ret = M.ccall(
|
|
409
|
-
|
|
427
|
+
fn,
|
|
410
428
|
"number",
|
|
411
429
|
["number", "number", "number", "number"],
|
|
412
|
-
[handle, pcm16Ptr, numSamples, resultPtr]
|
|
430
|
+
[handle, audioPtr, numSamples, resultPtr]
|
|
413
431
|
);
|
|
414
432
|
if (ret === OMNI_ERR_NO_FRAMES) return null;
|
|
415
433
|
if (ret !== 0) throw new Error(`StreamVAD process failed: ${ret}`);
|
|
@@ -427,6 +445,28 @@ function streamVadProcess(M, handle, pcm16Ptr, numSamples) {
|
|
|
427
445
|
M._free(resultPtr);
|
|
428
446
|
}
|
|
429
447
|
}
|
|
448
|
+
function streamVadDetectFull(M, handle, audioPtr, numSamples, format = "f32") {
|
|
449
|
+
const probsPtrPtr = M._malloc(4);
|
|
450
|
+
const framesPtr = M._malloc(4);
|
|
451
|
+
const fn = format === "int16" ? "omni_stream_vad_detect_full_int16" : "omni_stream_vad_detect_full";
|
|
452
|
+
try {
|
|
453
|
+
const ret = M.ccall(
|
|
454
|
+
fn,
|
|
455
|
+
"number",
|
|
456
|
+
["number", "number", "number", "number", "number"],
|
|
457
|
+
[handle, audioPtr, numSamples, probsPtrPtr, framesPtr]
|
|
458
|
+
);
|
|
459
|
+
if (ret !== 0) throw new Error(`StreamVAD detectFull failed: ${ret}`);
|
|
460
|
+
const numFrames = M.getValue(framesPtr, "i32");
|
|
461
|
+
const probsPtr = M.getValue(probsPtrPtr, "i32");
|
|
462
|
+
const probabilities = probsPtr ? new Float32Array(new Float32Array(M.HEAPU8.buffer, probsPtr, numFrames)) : new Float32Array(0);
|
|
463
|
+
if (probsPtr) M._free(probsPtr);
|
|
464
|
+
return { probabilities, numFrames };
|
|
465
|
+
} finally {
|
|
466
|
+
M._free(probsPtrPtr);
|
|
467
|
+
M._free(framesPtr);
|
|
468
|
+
}
|
|
469
|
+
}
|
|
430
470
|
function streamVadClone(M, handle) {
|
|
431
471
|
const errPtr = M._malloc(4);
|
|
432
472
|
try {
|
|
@@ -486,7 +526,7 @@ var OmniVAD = class _OmniVAD {
|
|
|
486
526
|
*/
|
|
487
527
|
detect(audio) {
|
|
488
528
|
const M = getModule();
|
|
489
|
-
const { ptr, length, format } = prepareAudio(M, audio);
|
|
529
|
+
const { ptr, length, format } = dispatchAudio(M, audio);
|
|
490
530
|
try {
|
|
491
531
|
const timestamps = vadDetect(M, this.handle, ptr, length, this.config, format);
|
|
492
532
|
return {
|
|
@@ -505,16 +545,6 @@ var OmniVAD = class _OmniVAD {
|
|
|
505
545
|
}
|
|
506
546
|
}
|
|
507
547
|
};
|
|
508
|
-
function prepareAudio(M, audio) {
|
|
509
|
-
const f32 = audio instanceof Int16Array ? int16ToNormalizedFloat32(audio) : audio;
|
|
510
|
-
const ptr = copyAudioToHeap(M, f32);
|
|
511
|
-
return { ptr, length: f32.length, format: "f32" };
|
|
512
|
-
}
|
|
513
|
-
function int16ToNormalizedFloat32(i16) {
|
|
514
|
-
const f32 = new Float32Array(i16.length);
|
|
515
|
-
for (let i = 0; i < i16.length; i++) f32[i] = i16[i] / 32768;
|
|
516
|
-
return f32;
|
|
517
|
-
}
|
|
518
548
|
|
|
519
549
|
// src/stream-vad.ts
|
|
520
550
|
var SAMPLE_RATE2 = 16e3;
|
|
@@ -553,20 +583,23 @@ var OmniStreamVAD = class _OmniStreamVAD {
|
|
|
553
583
|
return new _OmniStreamVAD(newHandle);
|
|
554
584
|
}
|
|
555
585
|
/**
|
|
556
|
-
* Process one frame of audio (160
|
|
586
|
+
* Process one frame of audio (160 samples = 10ms @ 16kHz).
|
|
587
|
+
*
|
|
588
|
+
* Accepts Float32Array in [-1, 1] (Web Audio, soundfile, torch) or
|
|
589
|
+
* Int16Array PCM (WAV, microphone). Dispatches by dtype to the matching
|
|
590
|
+
* C entry — no scaling in JS.
|
|
591
|
+
*
|
|
557
592
|
* Returns null until enough audio is accumulated.
|
|
558
593
|
*
|
|
559
594
|
* Segment-boundary events (isSpeechStart / isSpeechEnd and the matching
|
|
560
595
|
* speech_*_frame indices) come straight from the C-layer state machine
|
|
561
596
|
* (bit-identical to upstream FireRedVAD) — the wrapper is just a marshaller.
|
|
562
597
|
*/
|
|
563
|
-
processFrame(pcm160) {
|
|
598
|
+
processFrame(audio) {
|
|
564
599
|
const M = getModule();
|
|
565
|
-
const ptr = M
|
|
566
|
-
const heap16 = new Int16Array(M.HEAPU8.buffer, ptr, pcm160.length);
|
|
567
|
-
heap16.set(pcm160);
|
|
600
|
+
const { ptr, length, format } = dispatchAudio(M, audio);
|
|
568
601
|
try {
|
|
569
|
-
const result = streamVadProcess(M, this.handle, ptr, pcm160.length);
|
|
602
|
+
const result = streamVadProcess(M, this.handle, ptr, length, format);
|
|
570
603
|
if (!result) return null;
|
|
571
604
|
return {
|
|
572
605
|
confidence: result.confidence,
|
|
@@ -588,31 +621,22 @@ var OmniStreamVAD = class _OmniStreamVAD {
|
|
|
588
621
|
*/
|
|
589
622
|
detectFull(audio) {
|
|
590
623
|
const M = getModule();
|
|
591
|
-
const f32 = prepareDetectFullAudio(audio);
|
|
592
|
-
const audioPtr = copyAudioToHeap(M, f32);
|
|
593
|
-
const probsPtrPtr = M._malloc(4);
|
|
594
|
-
const framesPtr = M._malloc(4);
|
|
624
|
+
const { ptr, length, format } = dispatchAudio(M, audio);
|
|
595
625
|
try {
|
|
596
|
-
const
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
626
|
+
const { probabilities, numFrames } = streamVadDetectFull(
|
|
627
|
+
M,
|
|
628
|
+
this.handle,
|
|
629
|
+
ptr,
|
|
630
|
+
length,
|
|
631
|
+
format
|
|
601
632
|
);
|
|
602
|
-
if (ret !== 0) throw new Error(`StreamVAD detectFull failed: ${ret}`);
|
|
603
|
-
const numFrames = M.getValue(framesPtr, "i32");
|
|
604
|
-
const probsPtr = M.getValue(probsPtrPtr, "i32");
|
|
605
|
-
const probabilities = probsPtr ? new Float32Array(new Float32Array(M.HEAPU8.buffer, probsPtr, numFrames)) : new Float32Array(0);
|
|
606
|
-
if (probsPtr) M._free(probsPtr);
|
|
607
633
|
return {
|
|
608
634
|
probabilities,
|
|
609
635
|
numFrames,
|
|
610
|
-
duration: Math.round(
|
|
636
|
+
duration: Math.round(length / SAMPLE_RATE2 * 1e3) / 1e3
|
|
611
637
|
};
|
|
612
638
|
} finally {
|
|
613
|
-
M._free(audioPtr);
|
|
614
|
-
M._free(probsPtrPtr);
|
|
615
|
-
M._free(framesPtr);
|
|
639
|
+
M._free(ptr);
|
|
616
640
|
}
|
|
617
641
|
}
|
|
618
642
|
/** Reset all internal state (model cache, audio buffer, postprocessor). */
|
|
@@ -627,31 +651,6 @@ var OmniStreamVAD = class _OmniStreamVAD {
|
|
|
627
651
|
}
|
|
628
652
|
}
|
|
629
653
|
};
|
|
630
|
-
function int16ToFloat32(i16) {
|
|
631
|
-
const f32 = new Float32Array(i16.length);
|
|
632
|
-
for (let i = 0; i < i16.length; i++) f32[i] = i16[i];
|
|
633
|
-
return f32;
|
|
634
|
-
}
|
|
635
|
-
function prepareDetectFullAudio(audio) {
|
|
636
|
-
if (audio instanceof Int16Array) {
|
|
637
|
-
return int16ToFloat32(audio);
|
|
638
|
-
}
|
|
639
|
-
if (isNormalizedFloat(audio)) {
|
|
640
|
-
const scaled = new Float32Array(audio.length);
|
|
641
|
-
for (let i = 0; i < audio.length; i++) scaled[i] = audio[i] * 32768;
|
|
642
|
-
return scaled;
|
|
643
|
-
}
|
|
644
|
-
return audio;
|
|
645
|
-
}
|
|
646
|
-
function isNormalizedFloat(audio) {
|
|
647
|
-
const step = Math.max(1, Math.floor(audio.length / 1e3));
|
|
648
|
-
let maxAbs = 0;
|
|
649
|
-
for (let i = 0; i < audio.length; i += step) {
|
|
650
|
-
const v = Math.abs(audio[i]);
|
|
651
|
-
if (v > maxAbs) maxAbs = v;
|
|
652
|
-
}
|
|
653
|
-
return maxAbs <= 1;
|
|
654
|
-
}
|
|
655
654
|
|
|
656
655
|
// src/aed.ts
|
|
657
656
|
var SAMPLE_RATE3 = 16e3;
|
|
@@ -691,7 +690,7 @@ var OmniAED = class _OmniAED {
|
|
|
691
690
|
*/
|
|
692
691
|
detect(audio) {
|
|
693
692
|
const M = getModule();
|
|
694
|
-
const { ptr, length, format } = prepareAudio2(M, audio);
|
|
693
|
+
const { ptr, length, format } = dispatchAudio(M, audio);
|
|
695
694
|
const duration = Math.round(length / SAMPLE_RATE3 * 1e3) / 1e3;
|
|
696
695
|
try {
|
|
697
696
|
const events = aedDetect(M, this.handle, ptr, length, this.config, format);
|
|
@@ -712,18 +711,6 @@ var OmniAED = class _OmniAED {
|
|
|
712
711
|
}
|
|
713
712
|
}
|
|
714
713
|
};
|
|
715
|
-
function prepareAudio2(M, audio) {
|
|
716
|
-
const f32 = audio instanceof Int16Array ? int16ToNormalizedFloat322(audio) : audio;
|
|
717
|
-
const ptr = M._malloc(f32.length * 4);
|
|
718
|
-
const heap = new Float32Array(M.HEAPU8.buffer, ptr, f32.length);
|
|
719
|
-
heap.set(f32);
|
|
720
|
-
return { ptr, length: f32.length, format: "f32" };
|
|
721
|
-
}
|
|
722
|
-
function int16ToNormalizedFloat322(i16) {
|
|
723
|
-
const f32 = new Float32Array(i16.length);
|
|
724
|
-
for (let i = 0; i < i16.length; i++) f32[i] = i16[i] / 32768;
|
|
725
|
-
return f32;
|
|
726
|
-
}
|
|
727
714
|
function computeCoverageRatios(events, duration) {
|
|
728
715
|
const ratios = {
|
|
729
716
|
speech: 0,
|