omnivad 0.2.9 → 0.2.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -3
- package/dist/index.cjs +64 -77
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +18 -8
- package/dist/index.d.ts +18 -8
- package/dist/index.js +64 -77
- package/dist/index.js.map +1 -1
- package/dist/wasm/omnivad.cjs +1 -1
- package/dist/wasm/omnivad.js +1 -1
- package/dist/wasm/omnivad.wasm +0 -0
- package/package.json +3 -2
package/dist/index.d.cts
CHANGED
|
@@ -155,9 +155,10 @@ interface ChunkResult {
|
|
|
155
155
|
/**
|
|
156
156
|
* Non-streaming Voice Activity Detection (WASM/ncnn backend).
|
|
157
157
|
*
|
|
158
|
-
* Audio format:
|
|
159
|
-
*
|
|
160
|
-
* - Float32Array in [-1.0, 1.0]
|
|
158
|
+
* Audio format: two types only. Wrappers dispatch by dtype to the matching
|
|
159
|
+
* C entry — never scale or cast in JS.
|
|
160
|
+
* - Float32Array in [-1.0, 1.0] (Web Audio, soundfile, torch)
|
|
161
|
+
* - Int16Array (raw 16-bit PCM from WAV / microphone)
|
|
161
162
|
*/
|
|
162
163
|
|
|
163
164
|
declare class OmniVAD {
|
|
@@ -182,6 +183,9 @@ declare class OmniVAD {
|
|
|
182
183
|
/**
|
|
183
184
|
* Streaming Voice Activity Detection (WASM/ncnn backend).
|
|
184
185
|
* Processes audio frame-by-frame (10ms chunks of 160 samples @ 16kHz).
|
|
186
|
+
*
|
|
187
|
+
* Audio format: Float32Array in [-1, 1] or Int16Array PCM. Wrappers
|
|
188
|
+
* dispatch by dtype; all scaling lives in the C entries.
|
|
185
189
|
*/
|
|
186
190
|
|
|
187
191
|
declare class OmniStreamVAD {
|
|
@@ -200,14 +204,19 @@ declare class OmniStreamVAD {
|
|
|
200
204
|
*/
|
|
201
205
|
clone(): OmniStreamVAD;
|
|
202
206
|
/**
|
|
203
|
-
* Process one frame of audio (160
|
|
207
|
+
* Process one frame of audio (160 samples = 10ms @ 16kHz).
|
|
208
|
+
*
|
|
209
|
+
* Accepts Float32Array in [-1, 1] (Web Audio, soundfile, torch) or
|
|
210
|
+
* Int16Array PCM (WAV, microphone). Dispatches by dtype to the matching
|
|
211
|
+
* C entry — no scaling in JS.
|
|
212
|
+
*
|
|
204
213
|
* Returns null until enough audio is accumulated.
|
|
205
214
|
*
|
|
206
215
|
* Segment-boundary events (isSpeechStart / isSpeechEnd and the matching
|
|
207
216
|
* speech_*_frame indices) come straight from the C-layer state machine
|
|
208
217
|
* (bit-identical to upstream FireRedVAD) — the wrapper is just a marshaller.
|
|
209
218
|
*/
|
|
210
|
-
processFrame(
|
|
219
|
+
processFrame(audio: Float32Array | Int16Array): StreamVADFrameResult | null;
|
|
211
220
|
/**
|
|
212
221
|
* Process entire audio at once and return per-frame probabilities.
|
|
213
222
|
* @param audio - Float32Array in [-1, 1] or Int16Array of 16kHz mono PCM
|
|
@@ -222,7 +231,8 @@ declare class OmniStreamVAD {
|
|
|
222
231
|
/**
|
|
223
232
|
* Audio Event Detection: speech, singing, music (WASM/ncnn backend).
|
|
224
233
|
*
|
|
225
|
-
* Audio format: same as OmniVAD —
|
|
234
|
+
* Audio format: same as OmniVAD — Float32Array in [-1, 1] or Int16Array PCM.
|
|
235
|
+
* Wrappers dispatch by dtype; all scaling lives in the C entries.
|
|
226
236
|
*/
|
|
227
237
|
|
|
228
238
|
declare class OmniAED {
|
|
@@ -250,9 +260,9 @@ declare class OmniAED {
|
|
|
250
260
|
*/
|
|
251
261
|
type EmscriptenModule = any;
|
|
252
262
|
/** Package version — used to construct default CDN URLs. */
|
|
253
|
-
declare const VERSION = "0.2.
|
|
263
|
+
declare const VERSION = "0.2.10";
|
|
254
264
|
/** Default CDN base for model files (jsDelivr serves npm package contents). */
|
|
255
|
-
declare const DEFAULT_CDN_BASE = "https://cdn.jsdelivr.net/npm/omnivad@0.2.
|
|
265
|
+
declare const DEFAULT_CDN_BASE = "https://cdn.jsdelivr.net/npm/omnivad@0.2.10/models";
|
|
256
266
|
/** Model filenames keyed by type. */
|
|
257
267
|
declare const MODEL_FILES: {
|
|
258
268
|
readonly vad: "vad.omnivad";
|
package/dist/index.d.ts
CHANGED
|
@@ -155,9 +155,10 @@ interface ChunkResult {
|
|
|
155
155
|
/**
|
|
156
156
|
* Non-streaming Voice Activity Detection (WASM/ncnn backend).
|
|
157
157
|
*
|
|
158
|
-
* Audio format:
|
|
159
|
-
*
|
|
160
|
-
* - Float32Array in [-1.0, 1.0]
|
|
158
|
+
* Audio format: two types only. Wrappers dispatch by dtype to the matching
|
|
159
|
+
* C entry — never scale or cast in JS.
|
|
160
|
+
* - Float32Array in [-1.0, 1.0] (Web Audio, soundfile, torch)
|
|
161
|
+
* - Int16Array (raw 16-bit PCM from WAV / microphone)
|
|
161
162
|
*/
|
|
162
163
|
|
|
163
164
|
declare class OmniVAD {
|
|
@@ -182,6 +183,9 @@ declare class OmniVAD {
|
|
|
182
183
|
/**
|
|
183
184
|
* Streaming Voice Activity Detection (WASM/ncnn backend).
|
|
184
185
|
* Processes audio frame-by-frame (10ms chunks of 160 samples @ 16kHz).
|
|
186
|
+
*
|
|
187
|
+
* Audio format: Float32Array in [-1, 1] or Int16Array PCM. Wrappers
|
|
188
|
+
* dispatch by dtype; all scaling lives in the C entries.
|
|
185
189
|
*/
|
|
186
190
|
|
|
187
191
|
declare class OmniStreamVAD {
|
|
@@ -200,14 +204,19 @@ declare class OmniStreamVAD {
|
|
|
200
204
|
*/
|
|
201
205
|
clone(): OmniStreamVAD;
|
|
202
206
|
/**
|
|
203
|
-
* Process one frame of audio (160
|
|
207
|
+
* Process one frame of audio (160 samples = 10ms @ 16kHz).
|
|
208
|
+
*
|
|
209
|
+
* Accepts Float32Array in [-1, 1] (Web Audio, soundfile, torch) or
|
|
210
|
+
* Int16Array PCM (WAV, microphone). Dispatches by dtype to the matching
|
|
211
|
+
* C entry — no scaling in JS.
|
|
212
|
+
*
|
|
204
213
|
* Returns null until enough audio is accumulated.
|
|
205
214
|
*
|
|
206
215
|
* Segment-boundary events (isSpeechStart / isSpeechEnd and the matching
|
|
207
216
|
* speech_*_frame indices) come straight from the C-layer state machine
|
|
208
217
|
* (bit-identical to upstream FireRedVAD) — the wrapper is just a marshaller.
|
|
209
218
|
*/
|
|
210
|
-
processFrame(
|
|
219
|
+
processFrame(audio: Float32Array | Int16Array): StreamVADFrameResult | null;
|
|
211
220
|
/**
|
|
212
221
|
* Process entire audio at once and return per-frame probabilities.
|
|
213
222
|
* @param audio - Float32Array in [-1, 1] or Int16Array of 16kHz mono PCM
|
|
@@ -222,7 +231,8 @@ declare class OmniStreamVAD {
|
|
|
222
231
|
/**
|
|
223
232
|
* Audio Event Detection: speech, singing, music (WASM/ncnn backend).
|
|
224
233
|
*
|
|
225
|
-
* Audio format: same as OmniVAD —
|
|
234
|
+
* Audio format: same as OmniVAD — Float32Array in [-1, 1] or Int16Array PCM.
|
|
235
|
+
* Wrappers dispatch by dtype; all scaling lives in the C entries.
|
|
226
236
|
*/
|
|
227
237
|
|
|
228
238
|
declare class OmniAED {
|
|
@@ -250,9 +260,9 @@ declare class OmniAED {
|
|
|
250
260
|
*/
|
|
251
261
|
type EmscriptenModule = any;
|
|
252
262
|
/** Package version — used to construct default CDN URLs. */
|
|
253
|
-
declare const VERSION = "0.2.
|
|
263
|
+
declare const VERSION = "0.2.10";
|
|
254
264
|
/** Default CDN base for model files (jsDelivr serves npm package contents). */
|
|
255
|
-
declare const DEFAULT_CDN_BASE = "https://cdn.jsdelivr.net/npm/omnivad@0.2.
|
|
265
|
+
declare const DEFAULT_CDN_BASE = "https://cdn.jsdelivr.net/npm/omnivad@0.2.10/models";
|
|
256
266
|
/** Model filenames keyed by type. */
|
|
257
267
|
declare const MODEL_FILES: {
|
|
258
268
|
readonly vad: "vad.omnivad";
|
package/dist/index.js
CHANGED
|
@@ -35,7 +35,7 @@ var SIZEOF_AED_SEGMENT = 16;
|
|
|
35
35
|
var SIZEOF_CHUNK_CONFIG = 28;
|
|
36
36
|
var SIZEOF_CHUNK = 16;
|
|
37
37
|
var OMNI_ERR_NO_FRAMES = -7;
|
|
38
|
-
var VERSION = "0.2.
|
|
38
|
+
var VERSION = "0.2.10";
|
|
39
39
|
var DEFAULT_CDN_BASE = `https://cdn.jsdelivr.net/npm/omnivad@${VERSION}/models`;
|
|
40
40
|
var MODEL_FILES = {
|
|
41
41
|
vad: "vad.omnivad",
|
|
@@ -160,6 +160,23 @@ function copyAudioToHeap(M, audio) {
|
|
|
160
160
|
heap.set(audio);
|
|
161
161
|
return ptr;
|
|
162
162
|
}
|
|
163
|
+
function copyInt16ToHeap(M, audio) {
|
|
164
|
+
const ptr = M._malloc(audio.length * 2);
|
|
165
|
+
const heap = new Int16Array(M.HEAPU8.buffer, ptr, audio.length);
|
|
166
|
+
heap.set(audio);
|
|
167
|
+
return ptr;
|
|
168
|
+
}
|
|
169
|
+
function dispatchAudio(M, audio) {
|
|
170
|
+
if (audio instanceof Float32Array) {
|
|
171
|
+
return { ptr: copyAudioToHeap(M, audio), length: audio.length, format: "f32" };
|
|
172
|
+
}
|
|
173
|
+
if (audio instanceof Int16Array) {
|
|
174
|
+
return { ptr: copyInt16ToHeap(M, audio), length: audio.length, format: "int16" };
|
|
175
|
+
}
|
|
176
|
+
throw new TypeError(
|
|
177
|
+
`unsupported audio dtype; expected Float32Array in [-1, 1] or Int16Array`
|
|
178
|
+
);
|
|
179
|
+
}
|
|
163
180
|
function writePostConfig(M, ptr, cfg) {
|
|
164
181
|
M.setValue(ptr + 0, cfg.threshold, "float");
|
|
165
182
|
M.setValue(ptr + 4, cfg.smoothWindowSize, "i32");
|
|
@@ -399,14 +416,15 @@ function streamVadCreate(M, modelBuffer, config = {}) {
|
|
|
399
416
|
}
|
|
400
417
|
}
|
|
401
418
|
var SIZEOF_STREAM_VAD_RESULT = 24;
|
|
402
|
-
function streamVadProcess(M, handle,
|
|
419
|
+
function streamVadProcess(M, handle, audioPtr, numSamples, format = "f32") {
|
|
403
420
|
const resultPtr = M._malloc(SIZEOF_STREAM_VAD_RESULT);
|
|
421
|
+
const fn = format === "int16" ? "omni_stream_vad_process_int16" : "omni_stream_vad_process";
|
|
404
422
|
try {
|
|
405
423
|
const ret = M.ccall(
|
|
406
|
-
|
|
424
|
+
fn,
|
|
407
425
|
"number",
|
|
408
426
|
["number", "number", "number", "number"],
|
|
409
|
-
[handle,
|
|
427
|
+
[handle, audioPtr, numSamples, resultPtr]
|
|
410
428
|
);
|
|
411
429
|
if (ret === OMNI_ERR_NO_FRAMES) return null;
|
|
412
430
|
if (ret !== 0) throw new Error(`StreamVAD process failed: ${ret}`);
|
|
@@ -424,6 +442,28 @@ function streamVadProcess(M, handle, pcm16Ptr, numSamples) {
|
|
|
424
442
|
M._free(resultPtr);
|
|
425
443
|
}
|
|
426
444
|
}
|
|
445
|
+
function streamVadDetectFull(M, handle, audioPtr, numSamples, format = "f32") {
|
|
446
|
+
const probsPtrPtr = M._malloc(4);
|
|
447
|
+
const framesPtr = M._malloc(4);
|
|
448
|
+
const fn = format === "int16" ? "omni_stream_vad_detect_full_int16" : "omni_stream_vad_detect_full";
|
|
449
|
+
try {
|
|
450
|
+
const ret = M.ccall(
|
|
451
|
+
fn,
|
|
452
|
+
"number",
|
|
453
|
+
["number", "number", "number", "number", "number"],
|
|
454
|
+
[handle, audioPtr, numSamples, probsPtrPtr, framesPtr]
|
|
455
|
+
);
|
|
456
|
+
if (ret !== 0) throw new Error(`StreamVAD detectFull failed: ${ret}`);
|
|
457
|
+
const numFrames = M.getValue(framesPtr, "i32");
|
|
458
|
+
const probsPtr = M.getValue(probsPtrPtr, "i32");
|
|
459
|
+
const probabilities = probsPtr ? new Float32Array(new Float32Array(M.HEAPU8.buffer, probsPtr, numFrames)) : new Float32Array(0);
|
|
460
|
+
if (probsPtr) M._free(probsPtr);
|
|
461
|
+
return { probabilities, numFrames };
|
|
462
|
+
} finally {
|
|
463
|
+
M._free(probsPtrPtr);
|
|
464
|
+
M._free(framesPtr);
|
|
465
|
+
}
|
|
466
|
+
}
|
|
427
467
|
function streamVadClone(M, handle) {
|
|
428
468
|
const errPtr = M._malloc(4);
|
|
429
469
|
try {
|
|
@@ -483,7 +523,7 @@ var OmniVAD = class _OmniVAD {
|
|
|
483
523
|
*/
|
|
484
524
|
detect(audio) {
|
|
485
525
|
const M = getModule();
|
|
486
|
-
const { ptr, length, format } =
|
|
526
|
+
const { ptr, length, format } = dispatchAudio(M, audio);
|
|
487
527
|
try {
|
|
488
528
|
const timestamps = vadDetect(M, this.handle, ptr, length, this.config, format);
|
|
489
529
|
return {
|
|
@@ -502,16 +542,6 @@ var OmniVAD = class _OmniVAD {
|
|
|
502
542
|
}
|
|
503
543
|
}
|
|
504
544
|
};
|
|
505
|
-
function prepareAudio(M, audio) {
|
|
506
|
-
const f32 = audio instanceof Int16Array ? int16ToNormalizedFloat32(audio) : audio;
|
|
507
|
-
const ptr = copyAudioToHeap(M, f32);
|
|
508
|
-
return { ptr, length: f32.length, format: "f32" };
|
|
509
|
-
}
|
|
510
|
-
function int16ToNormalizedFloat32(i16) {
|
|
511
|
-
const f32 = new Float32Array(i16.length);
|
|
512
|
-
for (let i = 0; i < i16.length; i++) f32[i] = i16[i] / 32768;
|
|
513
|
-
return f32;
|
|
514
|
-
}
|
|
515
545
|
|
|
516
546
|
// src/stream-vad.ts
|
|
517
547
|
var SAMPLE_RATE2 = 16e3;
|
|
@@ -550,20 +580,23 @@ var OmniStreamVAD = class _OmniStreamVAD {
|
|
|
550
580
|
return new _OmniStreamVAD(newHandle);
|
|
551
581
|
}
|
|
552
582
|
/**
|
|
553
|
-
* Process one frame of audio (160
|
|
583
|
+
* Process one frame of audio (160 samples = 10ms @ 16kHz).
|
|
584
|
+
*
|
|
585
|
+
* Accepts Float32Array in [-1, 1] (Web Audio, soundfile, torch) or
|
|
586
|
+
* Int16Array PCM (WAV, microphone). Dispatches by dtype to the matching
|
|
587
|
+
* C entry — no scaling in JS.
|
|
588
|
+
*
|
|
554
589
|
* Returns null until enough audio is accumulated.
|
|
555
590
|
*
|
|
556
591
|
* Segment-boundary events (isSpeechStart / isSpeechEnd and the matching
|
|
557
592
|
* speech_*_frame indices) come straight from the C-layer state machine
|
|
558
593
|
* (bit-identical to upstream FireRedVAD) — the wrapper is just a marshaller.
|
|
559
594
|
*/
|
|
560
|
-
processFrame(
|
|
595
|
+
processFrame(audio) {
|
|
561
596
|
const M = getModule();
|
|
562
|
-
const ptr = M
|
|
563
|
-
const heap16 = new Int16Array(M.HEAPU8.buffer, ptr, pcm160.length);
|
|
564
|
-
heap16.set(pcm160);
|
|
597
|
+
const { ptr, length, format } = dispatchAudio(M, audio);
|
|
565
598
|
try {
|
|
566
|
-
const result = streamVadProcess(M, this.handle, ptr,
|
|
599
|
+
const result = streamVadProcess(M, this.handle, ptr, length, format);
|
|
567
600
|
if (!result) return null;
|
|
568
601
|
return {
|
|
569
602
|
confidence: result.confidence,
|
|
@@ -585,31 +618,22 @@ var OmniStreamVAD = class _OmniStreamVAD {
|
|
|
585
618
|
*/
|
|
586
619
|
detectFull(audio) {
|
|
587
620
|
const M = getModule();
|
|
588
|
-
const
|
|
589
|
-
const audioPtr = copyAudioToHeap(M, f32);
|
|
590
|
-
const probsPtrPtr = M._malloc(4);
|
|
591
|
-
const framesPtr = M._malloc(4);
|
|
621
|
+
const { ptr, length, format } = dispatchAudio(M, audio);
|
|
592
622
|
try {
|
|
593
|
-
const
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
623
|
+
const { probabilities, numFrames } = streamVadDetectFull(
|
|
624
|
+
M,
|
|
625
|
+
this.handle,
|
|
626
|
+
ptr,
|
|
627
|
+
length,
|
|
628
|
+
format
|
|
598
629
|
);
|
|
599
|
-
if (ret !== 0) throw new Error(`StreamVAD detectFull failed: ${ret}`);
|
|
600
|
-
const numFrames = M.getValue(framesPtr, "i32");
|
|
601
|
-
const probsPtr = M.getValue(probsPtrPtr, "i32");
|
|
602
|
-
const probabilities = probsPtr ? new Float32Array(new Float32Array(M.HEAPU8.buffer, probsPtr, numFrames)) : new Float32Array(0);
|
|
603
|
-
if (probsPtr) M._free(probsPtr);
|
|
604
630
|
return {
|
|
605
631
|
probabilities,
|
|
606
632
|
numFrames,
|
|
607
|
-
duration: Math.round(
|
|
633
|
+
duration: Math.round(length / SAMPLE_RATE2 * 1e3) / 1e3
|
|
608
634
|
};
|
|
609
635
|
} finally {
|
|
610
|
-
M._free(
|
|
611
|
-
M._free(probsPtrPtr);
|
|
612
|
-
M._free(framesPtr);
|
|
636
|
+
M._free(ptr);
|
|
613
637
|
}
|
|
614
638
|
}
|
|
615
639
|
/** Reset all internal state (model cache, audio buffer, postprocessor). */
|
|
@@ -624,31 +648,6 @@ var OmniStreamVAD = class _OmniStreamVAD {
|
|
|
624
648
|
}
|
|
625
649
|
}
|
|
626
650
|
};
|
|
627
|
-
function int16ToFloat32(i16) {
|
|
628
|
-
const f32 = new Float32Array(i16.length);
|
|
629
|
-
for (let i = 0; i < i16.length; i++) f32[i] = i16[i];
|
|
630
|
-
return f32;
|
|
631
|
-
}
|
|
632
|
-
function prepareDetectFullAudio(audio) {
|
|
633
|
-
if (audio instanceof Int16Array) {
|
|
634
|
-
return int16ToFloat32(audio);
|
|
635
|
-
}
|
|
636
|
-
if (isNormalizedFloat(audio)) {
|
|
637
|
-
const scaled = new Float32Array(audio.length);
|
|
638
|
-
for (let i = 0; i < audio.length; i++) scaled[i] = audio[i] * 32768;
|
|
639
|
-
return scaled;
|
|
640
|
-
}
|
|
641
|
-
return audio;
|
|
642
|
-
}
|
|
643
|
-
function isNormalizedFloat(audio) {
|
|
644
|
-
const step = Math.max(1, Math.floor(audio.length / 1e3));
|
|
645
|
-
let maxAbs = 0;
|
|
646
|
-
for (let i = 0; i < audio.length; i += step) {
|
|
647
|
-
const v = Math.abs(audio[i]);
|
|
648
|
-
if (v > maxAbs) maxAbs = v;
|
|
649
|
-
}
|
|
650
|
-
return maxAbs <= 1;
|
|
651
|
-
}
|
|
652
651
|
|
|
653
652
|
// src/aed.ts
|
|
654
653
|
var SAMPLE_RATE3 = 16e3;
|
|
@@ -688,7 +687,7 @@ var OmniAED = class _OmniAED {
|
|
|
688
687
|
*/
|
|
689
688
|
detect(audio) {
|
|
690
689
|
const M = getModule();
|
|
691
|
-
const { ptr, length, format } =
|
|
690
|
+
const { ptr, length, format } = dispatchAudio(M, audio);
|
|
692
691
|
const duration = Math.round(length / SAMPLE_RATE3 * 1e3) / 1e3;
|
|
693
692
|
try {
|
|
694
693
|
const events = aedDetect(M, this.handle, ptr, length, this.config, format);
|
|
@@ -709,18 +708,6 @@ var OmniAED = class _OmniAED {
|
|
|
709
708
|
}
|
|
710
709
|
}
|
|
711
710
|
};
|
|
712
|
-
function prepareAudio2(M, audio) {
|
|
713
|
-
const f32 = audio instanceof Int16Array ? int16ToNormalizedFloat322(audio) : audio;
|
|
714
|
-
const ptr = M._malloc(f32.length * 4);
|
|
715
|
-
const heap = new Float32Array(M.HEAPU8.buffer, ptr, f32.length);
|
|
716
|
-
heap.set(f32);
|
|
717
|
-
return { ptr, length: f32.length, format: "f32" };
|
|
718
|
-
}
|
|
719
|
-
function int16ToNormalizedFloat322(i16) {
|
|
720
|
-
const f32 = new Float32Array(i16.length);
|
|
721
|
-
for (let i = 0; i < i16.length; i++) f32[i] = i16[i] / 32768;
|
|
722
|
-
return f32;
|
|
723
|
-
}
|
|
724
711
|
function computeCoverageRatios(events, duration) {
|
|
725
712
|
const ratios = {
|
|
726
713
|
speech: 0,
|