omnivad 0.2.9 → 0.2.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -155,9 +155,10 @@ interface ChunkResult {
155
155
  /**
156
156
  * Non-streaming Voice Activity Detection (WASM/ncnn backend).
157
157
  *
158
- * Audio format:
159
- * - Int16Array: raw 16-bit PCM, converted to normalized float internally
160
- * - Float32Array in [-1.0, 1.0]: normalized audio (Web Audio API format)
158
+ * Audio format: two types only. Wrappers dispatch by dtype to the matching
159
+ * C entry — never scale or cast in JS.
160
+ * - Float32Array in [-1.0, 1.0] (Web Audio, soundfile, torch)
161
+ * - Int16Array (raw 16-bit PCM from WAV / microphone)
161
162
  */
162
163
 
163
164
  declare class OmniVAD {
@@ -182,6 +183,9 @@ declare class OmniVAD {
182
183
  /**
183
184
  * Streaming Voice Activity Detection (WASM/ncnn backend).
184
185
  * Processes audio frame-by-frame (10ms chunks of 160 samples @ 16kHz).
186
+ *
187
+ * Audio format: Float32Array in [-1, 1] or Int16Array PCM. Wrappers
188
+ * dispatch by dtype; all scaling lives in the C entries.
185
189
  */
186
190
 
187
191
  declare class OmniStreamVAD {
@@ -200,14 +204,19 @@ declare class OmniStreamVAD {
200
204
  */
201
205
  clone(): OmniStreamVAD;
202
206
  /**
203
- * Process one frame of audio (160 int16 samples = 10ms @ 16kHz).
207
+ * Process one frame of audio (160 samples = 10ms @ 16kHz).
208
+ *
209
+ * Accepts Float32Array in [-1, 1] (Web Audio, soundfile, torch) or
210
+ * Int16Array PCM (WAV, microphone). Dispatches by dtype to the matching
211
+ * C entry — no scaling in JS.
212
+ *
204
213
  * Returns null until enough audio is accumulated.
205
214
  *
206
215
  * Segment-boundary events (isSpeechStart / isSpeechEnd and the matching
207
216
  * speech_*_frame indices) come straight from the C-layer state machine
208
217
  * (bit-identical to upstream FireRedVAD) — the wrapper is just a marshaller.
209
218
  */
210
- processFrame(pcm160: Int16Array): StreamVADFrameResult | null;
219
+ processFrame(audio: Float32Array | Int16Array): StreamVADFrameResult | null;
211
220
  /**
212
221
  * Process entire audio at once and return per-frame probabilities.
213
222
  * @param audio - Float32Array in [-1, 1] or Int16Array of 16kHz mono PCM
@@ -222,7 +231,8 @@ declare class OmniStreamVAD {
222
231
  /**
223
232
  * Audio Event Detection: speech, singing, music (WASM/ncnn backend).
224
233
  *
225
- * Audio format: same as OmniVAD — Int16Array or normalized Float32Array [-1, 1].
234
+ * Audio format: same as OmniVAD — Float32Array in [-1, 1] or Int16Array PCM.
235
+ * Wrappers dispatch by dtype; all scaling lives in the C entries.
226
236
  */
227
237
 
228
238
  declare class OmniAED {
@@ -250,9 +260,9 @@ declare class OmniAED {
250
260
  */
251
261
  type EmscriptenModule = any;
252
262
  /** Package version — used to construct default CDN URLs. */
253
- declare const VERSION = "0.2.9";
263
+ declare const VERSION = "0.2.10";
254
264
  /** Default CDN base for model files (jsDelivr serves npm package contents). */
255
- declare const DEFAULT_CDN_BASE = "https://cdn.jsdelivr.net/npm/omnivad@0.2.9/models";
265
+ declare const DEFAULT_CDN_BASE = "https://cdn.jsdelivr.net/npm/omnivad@0.2.10/models";
256
266
  /** Model filenames keyed by type. */
257
267
  declare const MODEL_FILES: {
258
268
  readonly vad: "vad.omnivad";
package/dist/index.d.ts CHANGED
@@ -155,9 +155,10 @@ interface ChunkResult {
155
155
  /**
156
156
  * Non-streaming Voice Activity Detection (WASM/ncnn backend).
157
157
  *
158
- * Audio format:
159
- * - Int16Array: raw 16-bit PCM, converted to normalized float internally
160
- * - Float32Array in [-1.0, 1.0]: normalized audio (Web Audio API format)
158
+ * Audio format: two types only. Wrappers dispatch by dtype to the matching
159
+ * C entry — never scale or cast in JS.
160
+ * - Float32Array in [-1.0, 1.0] (Web Audio, soundfile, torch)
161
+ * - Int16Array (raw 16-bit PCM from WAV / microphone)
161
162
  */
162
163
 
163
164
  declare class OmniVAD {
@@ -182,6 +183,9 @@ declare class OmniVAD {
182
183
  /**
183
184
  * Streaming Voice Activity Detection (WASM/ncnn backend).
184
185
  * Processes audio frame-by-frame (10ms chunks of 160 samples @ 16kHz).
186
+ *
187
+ * Audio format: Float32Array in [-1, 1] or Int16Array PCM. Wrappers
188
+ * dispatch by dtype; all scaling lives in the C entries.
185
189
  */
186
190
 
187
191
  declare class OmniStreamVAD {
@@ -200,14 +204,19 @@ declare class OmniStreamVAD {
200
204
  */
201
205
  clone(): OmniStreamVAD;
202
206
  /**
203
- * Process one frame of audio (160 int16 samples = 10ms @ 16kHz).
207
+ * Process one frame of audio (160 samples = 10ms @ 16kHz).
208
+ *
209
+ * Accepts Float32Array in [-1, 1] (Web Audio, soundfile, torch) or
210
+ * Int16Array PCM (WAV, microphone). Dispatches by dtype to the matching
211
+ * C entry — no scaling in JS.
212
+ *
204
213
  * Returns null until enough audio is accumulated.
205
214
  *
206
215
  * Segment-boundary events (isSpeechStart / isSpeechEnd and the matching
207
216
  * speech_*_frame indices) come straight from the C-layer state machine
208
217
  * (bit-identical to upstream FireRedVAD) — the wrapper is just a marshaller.
209
218
  */
210
- processFrame(pcm160: Int16Array): StreamVADFrameResult | null;
219
+ processFrame(audio: Float32Array | Int16Array): StreamVADFrameResult | null;
211
220
  /**
212
221
  * Process entire audio at once and return per-frame probabilities.
213
222
  * @param audio - Float32Array in [-1, 1] or Int16Array of 16kHz mono PCM
@@ -222,7 +231,8 @@ declare class OmniStreamVAD {
222
231
  /**
223
232
  * Audio Event Detection: speech, singing, music (WASM/ncnn backend).
224
233
  *
225
- * Audio format: same as OmniVAD — Int16Array or normalized Float32Array [-1, 1].
234
+ * Audio format: same as OmniVAD — Float32Array in [-1, 1] or Int16Array PCM.
235
+ * Wrappers dispatch by dtype; all scaling lives in the C entries.
226
236
  */
227
237
 
228
238
  declare class OmniAED {
@@ -250,9 +260,9 @@ declare class OmniAED {
250
260
  */
251
261
  type EmscriptenModule = any;
252
262
  /** Package version — used to construct default CDN URLs. */
253
- declare const VERSION = "0.2.9";
263
+ declare const VERSION = "0.2.10";
254
264
  /** Default CDN base for model files (jsDelivr serves npm package contents). */
255
- declare const DEFAULT_CDN_BASE = "https://cdn.jsdelivr.net/npm/omnivad@0.2.9/models";
265
+ declare const DEFAULT_CDN_BASE = "https://cdn.jsdelivr.net/npm/omnivad@0.2.10/models";
256
266
  /** Model filenames keyed by type. */
257
267
  declare const MODEL_FILES: {
258
268
  readonly vad: "vad.omnivad";
package/dist/index.js CHANGED
@@ -35,7 +35,7 @@ var SIZEOF_AED_SEGMENT = 16;
35
35
  var SIZEOF_CHUNK_CONFIG = 28;
36
36
  var SIZEOF_CHUNK = 16;
37
37
  var OMNI_ERR_NO_FRAMES = -7;
38
- var VERSION = "0.2.9";
38
+ var VERSION = "0.2.10";
39
39
  var DEFAULT_CDN_BASE = `https://cdn.jsdelivr.net/npm/omnivad@${VERSION}/models`;
40
40
  var MODEL_FILES = {
41
41
  vad: "vad.omnivad",
@@ -160,6 +160,23 @@ function copyAudioToHeap(M, audio) {
160
160
  heap.set(audio);
161
161
  return ptr;
162
162
  }
163
+ function copyInt16ToHeap(M, audio) {
164
+ const ptr = M._malloc(audio.length * 2);
165
+ const heap = new Int16Array(M.HEAPU8.buffer, ptr, audio.length);
166
+ heap.set(audio);
167
+ return ptr;
168
+ }
169
+ function dispatchAudio(M, audio) {
170
+ if (audio instanceof Float32Array) {
171
+ return { ptr: copyAudioToHeap(M, audio), length: audio.length, format: "f32" };
172
+ }
173
+ if (audio instanceof Int16Array) {
174
+ return { ptr: copyInt16ToHeap(M, audio), length: audio.length, format: "int16" };
175
+ }
176
+ throw new TypeError(
177
+ `unsupported audio dtype; expected Float32Array in [-1, 1] or Int16Array`
178
+ );
179
+ }
163
180
  function writePostConfig(M, ptr, cfg) {
164
181
  M.setValue(ptr + 0, cfg.threshold, "float");
165
182
  M.setValue(ptr + 4, cfg.smoothWindowSize, "i32");
@@ -399,14 +416,15 @@ function streamVadCreate(M, modelBuffer, config = {}) {
399
416
  }
400
417
  }
401
418
  var SIZEOF_STREAM_VAD_RESULT = 24;
402
- function streamVadProcess(M, handle, pcm16Ptr, numSamples) {
419
+ function streamVadProcess(M, handle, audioPtr, numSamples, format = "f32") {
403
420
  const resultPtr = M._malloc(SIZEOF_STREAM_VAD_RESULT);
421
+ const fn = format === "int16" ? "omni_stream_vad_process_int16" : "omni_stream_vad_process";
404
422
  try {
405
423
  const ret = M.ccall(
406
- "omni_stream_vad_process",
424
+ fn,
407
425
  "number",
408
426
  ["number", "number", "number", "number"],
409
- [handle, pcm16Ptr, numSamples, resultPtr]
427
+ [handle, audioPtr, numSamples, resultPtr]
410
428
  );
411
429
  if (ret === OMNI_ERR_NO_FRAMES) return null;
412
430
  if (ret !== 0) throw new Error(`StreamVAD process failed: ${ret}`);
@@ -424,6 +442,28 @@ function streamVadProcess(M, handle, pcm16Ptr, numSamples) {
424
442
  M._free(resultPtr);
425
443
  }
426
444
  }
445
+ function streamVadDetectFull(M, handle, audioPtr, numSamples, format = "f32") {
446
+ const probsPtrPtr = M._malloc(4);
447
+ const framesPtr = M._malloc(4);
448
+ const fn = format === "int16" ? "omni_stream_vad_detect_full_int16" : "omni_stream_vad_detect_full";
449
+ try {
450
+ const ret = M.ccall(
451
+ fn,
452
+ "number",
453
+ ["number", "number", "number", "number", "number"],
454
+ [handle, audioPtr, numSamples, probsPtrPtr, framesPtr]
455
+ );
456
+ if (ret !== 0) throw new Error(`StreamVAD detectFull failed: ${ret}`);
457
+ const numFrames = M.getValue(framesPtr, "i32");
458
+ const probsPtr = M.getValue(probsPtrPtr, "i32");
459
+ const probabilities = probsPtr ? new Float32Array(new Float32Array(M.HEAPU8.buffer, probsPtr, numFrames)) : new Float32Array(0);
460
+ if (probsPtr) M._free(probsPtr);
461
+ return { probabilities, numFrames };
462
+ } finally {
463
+ M._free(probsPtrPtr);
464
+ M._free(framesPtr);
465
+ }
466
+ }
427
467
  function streamVadClone(M, handle) {
428
468
  const errPtr = M._malloc(4);
429
469
  try {
@@ -483,7 +523,7 @@ var OmniVAD = class _OmniVAD {
483
523
  */
484
524
  detect(audio) {
485
525
  const M = getModule();
486
- const { ptr, length, format } = prepareAudio(M, audio);
526
+ const { ptr, length, format } = dispatchAudio(M, audio);
487
527
  try {
488
528
  const timestamps = vadDetect(M, this.handle, ptr, length, this.config, format);
489
529
  return {
@@ -502,16 +542,6 @@ var OmniVAD = class _OmniVAD {
502
542
  }
503
543
  }
504
544
  };
505
- function prepareAudio(M, audio) {
506
- const f32 = audio instanceof Int16Array ? int16ToNormalizedFloat32(audio) : audio;
507
- const ptr = copyAudioToHeap(M, f32);
508
- return { ptr, length: f32.length, format: "f32" };
509
- }
510
- function int16ToNormalizedFloat32(i16) {
511
- const f32 = new Float32Array(i16.length);
512
- for (let i = 0; i < i16.length; i++) f32[i] = i16[i] / 32768;
513
- return f32;
514
- }
515
545
 
516
546
  // src/stream-vad.ts
517
547
  var SAMPLE_RATE2 = 16e3;
@@ -550,20 +580,23 @@ var OmniStreamVAD = class _OmniStreamVAD {
550
580
  return new _OmniStreamVAD(newHandle);
551
581
  }
552
582
  /**
553
- * Process one frame of audio (160 int16 samples = 10ms @ 16kHz).
583
+ * Process one frame of audio (160 samples = 10ms @ 16kHz).
584
+ *
585
+ * Accepts Float32Array in [-1, 1] (Web Audio, soundfile, torch) or
586
+ * Int16Array PCM (WAV, microphone). Dispatches by dtype to the matching
587
+ * C entry — no scaling in JS.
588
+ *
554
589
  * Returns null until enough audio is accumulated.
555
590
  *
556
591
  * Segment-boundary events (isSpeechStart / isSpeechEnd and the matching
557
592
  * speech_*_frame indices) come straight from the C-layer state machine
558
593
  * (bit-identical to upstream FireRedVAD) — the wrapper is just a marshaller.
559
594
  */
560
- processFrame(pcm160) {
595
+ processFrame(audio) {
561
596
  const M = getModule();
562
- const ptr = M._malloc(pcm160.length * 2);
563
- const heap16 = new Int16Array(M.HEAPU8.buffer, ptr, pcm160.length);
564
- heap16.set(pcm160);
597
+ const { ptr, length, format } = dispatchAudio(M, audio);
565
598
  try {
566
- const result = streamVadProcess(M, this.handle, ptr, pcm160.length);
599
+ const result = streamVadProcess(M, this.handle, ptr, length, format);
567
600
  if (!result) return null;
568
601
  return {
569
602
  confidence: result.confidence,
@@ -585,31 +618,22 @@ var OmniStreamVAD = class _OmniStreamVAD {
585
618
  */
586
619
  detectFull(audio) {
587
620
  const M = getModule();
588
- const f32 = prepareDetectFullAudio(audio);
589
- const audioPtr = copyAudioToHeap(M, f32);
590
- const probsPtrPtr = M._malloc(4);
591
- const framesPtr = M._malloc(4);
621
+ const { ptr, length, format } = dispatchAudio(M, audio);
592
622
  try {
593
- const ret = M.ccall(
594
- "omni_stream_vad_detect_full",
595
- "number",
596
- ["number", "number", "number", "number", "number"],
597
- [this.handle, audioPtr, f32.length, probsPtrPtr, framesPtr]
623
+ const { probabilities, numFrames } = streamVadDetectFull(
624
+ M,
625
+ this.handle,
626
+ ptr,
627
+ length,
628
+ format
598
629
  );
599
- if (ret !== 0) throw new Error(`StreamVAD detectFull failed: ${ret}`);
600
- const numFrames = M.getValue(framesPtr, "i32");
601
- const probsPtr = M.getValue(probsPtrPtr, "i32");
602
- const probabilities = probsPtr ? new Float32Array(new Float32Array(M.HEAPU8.buffer, probsPtr, numFrames)) : new Float32Array(0);
603
- if (probsPtr) M._free(probsPtr);
604
630
  return {
605
631
  probabilities,
606
632
  numFrames,
607
- duration: Math.round(f32.length / SAMPLE_RATE2 * 1e3) / 1e3
633
+ duration: Math.round(length / SAMPLE_RATE2 * 1e3) / 1e3
608
634
  };
609
635
  } finally {
610
- M._free(audioPtr);
611
- M._free(probsPtrPtr);
612
- M._free(framesPtr);
636
+ M._free(ptr);
613
637
  }
614
638
  }
615
639
  /** Reset all internal state (model cache, audio buffer, postprocessor). */
@@ -624,31 +648,6 @@ var OmniStreamVAD = class _OmniStreamVAD {
624
648
  }
625
649
  }
626
650
  };
627
- function int16ToFloat32(i16) {
628
- const f32 = new Float32Array(i16.length);
629
- for (let i = 0; i < i16.length; i++) f32[i] = i16[i];
630
- return f32;
631
- }
632
- function prepareDetectFullAudio(audio) {
633
- if (audio instanceof Int16Array) {
634
- return int16ToFloat32(audio);
635
- }
636
- if (isNormalizedFloat(audio)) {
637
- const scaled = new Float32Array(audio.length);
638
- for (let i = 0; i < audio.length; i++) scaled[i] = audio[i] * 32768;
639
- return scaled;
640
- }
641
- return audio;
642
- }
643
- function isNormalizedFloat(audio) {
644
- const step = Math.max(1, Math.floor(audio.length / 1e3));
645
- let maxAbs = 0;
646
- for (let i = 0; i < audio.length; i += step) {
647
- const v = Math.abs(audio[i]);
648
- if (v > maxAbs) maxAbs = v;
649
- }
650
- return maxAbs <= 1;
651
- }
652
651
 
653
652
  // src/aed.ts
654
653
  var SAMPLE_RATE3 = 16e3;
@@ -688,7 +687,7 @@ var OmniAED = class _OmniAED {
688
687
  */
689
688
  detect(audio) {
690
689
  const M = getModule();
691
- const { ptr, length, format } = prepareAudio2(M, audio);
690
+ const { ptr, length, format } = dispatchAudio(M, audio);
692
691
  const duration = Math.round(length / SAMPLE_RATE3 * 1e3) / 1e3;
693
692
  try {
694
693
  const events = aedDetect(M, this.handle, ptr, length, this.config, format);
@@ -709,18 +708,6 @@ var OmniAED = class _OmniAED {
709
708
  }
710
709
  }
711
710
  };
712
- function prepareAudio2(M, audio) {
713
- const f32 = audio instanceof Int16Array ? int16ToNormalizedFloat322(audio) : audio;
714
- const ptr = M._malloc(f32.length * 4);
715
- const heap = new Float32Array(M.HEAPU8.buffer, ptr, f32.length);
716
- heap.set(f32);
717
- return { ptr, length: f32.length, format: "f32" };
718
- }
719
- function int16ToNormalizedFloat322(i16) {
720
- const f32 = new Float32Array(i16.length);
721
- for (let i = 0; i < i16.length; i++) f32[i] = i16[i] / 32768;
722
- return f32;
723
- }
724
711
  function computeCoverageRatios(events, duration) {
725
712
  const ratios = {
726
713
  speech: 0,