npm - omnivad - Versions diffs - 0.2.9 → 0.2.11 - Mend

omnivad 0.2.9 → 0.2.11

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.

Files changed (11)

package/dist/index.d.cts CHANGED Viewed

@@ -155,9 +155,10 @@ interface ChunkResult {
 /**
  * Non-streaming Voice Activity Detection (WASM/ncnn backend).
  *
- * Audio format:
- *   - Int16Array: raw 16-bit PCM, converted to normalized float internally
- *   - Float32Array in [-1.0, 1.0]: normalized audio (Web Audio API format)
+ * Audio format: two types only. Wrappers dispatch by dtype to the matching
+ * C entry — never scale or cast in JS.
+ *   - Float32Array in [-1.0, 1.0] (Web Audio, soundfile, torch)
+ *   - Int16Array (raw 16-bit PCM from WAV / microphone)
  */
 declare class OmniVAD {
@@ -182,6 +183,9 @@ declare class OmniVAD {
 /**
  * Streaming Voice Activity Detection (WASM/ncnn backend).
  * Processes audio frame-by-frame (10ms chunks of 160 samples @ 16kHz).
+ *
+ * Audio format: Float32Array in [-1, 1] or Int16Array PCM. Wrappers
+ * dispatch by dtype; all scaling lives in the C entries.
  */
 declare class OmniStreamVAD {
@@ -200,14 +204,19 @@ declare class OmniStreamVAD {
      */
     clone(): OmniStreamVAD;
     /**
-     * Process one frame of audio (160 int16 samples = 10ms @ 16kHz).
+     * Process one frame of audio (160 samples = 10ms @ 16kHz).
+     *
+     * Accepts Float32Array in [-1, 1] (Web Audio, soundfile, torch) or
+     * Int16Array PCM (WAV, microphone). Dispatches by dtype to the matching
+     * C entry — no scaling in JS.
+     *
      * Returns null until enough audio is accumulated.
      *
      * Segment-boundary events (isSpeechStart / isSpeechEnd and the matching
      * speech_*_frame indices) come straight from the C-layer state machine
      * (bit-identical to upstream FireRedVAD) — the wrapper is just a marshaller.
      */
-    processFrame(pcm160: Int16Array): StreamVADFrameResult | null;
+    processFrame(audio: Float32Array | Int16Array): StreamVADFrameResult | null;
     /**
      * Process entire audio at once and return per-frame probabilities.
      * @param audio - Float32Array in [-1, 1] or Int16Array of 16kHz mono PCM
@@ -222,7 +231,8 @@ declare class OmniStreamVAD {
 /**
  * Audio Event Detection: speech, singing, music (WASM/ncnn backend).
  *
- * Audio format: same as OmniVAD — Int16Array or normalized Float32Array [-1, 1].
+ * Audio format: same as OmniVAD — Float32Array in [-1, 1] or Int16Array PCM.
+ * Wrappers dispatch by dtype; all scaling lives in the C entries.
  */
 declare class OmniAED {
@@ -250,9 +260,9 @@ declare class OmniAED {
  */
 type EmscriptenModule = any;
 /** Package version — used to construct default CDN URLs. */
-declare const VERSION = "0.2.9";
+declare const VERSION = "0.2.10";
 /** Default CDN base for model files (jsDelivr serves npm package contents). */
-declare const DEFAULT_CDN_BASE = "https://cdn.jsdelivr.net/npm/omnivad@0.2.9/models";
+declare const DEFAULT_CDN_BASE = "https://cdn.jsdelivr.net/npm/omnivad@0.2.10/models";
 /** Model filenames keyed by type. */
 declare const MODEL_FILES: {
     readonly vad: "vad.omnivad";

package/dist/index.d.ts CHANGED Viewed

@@ -155,9 +155,10 @@ interface ChunkResult {
 /**
  * Non-streaming Voice Activity Detection (WASM/ncnn backend).
  *
- * Audio format:
- *   - Int16Array: raw 16-bit PCM, converted to normalized float internally
- *   - Float32Array in [-1.0, 1.0]: normalized audio (Web Audio API format)
+ * Audio format: two types only. Wrappers dispatch by dtype to the matching
+ * C entry — never scale or cast in JS.
+ *   - Float32Array in [-1.0, 1.0] (Web Audio, soundfile, torch)
+ *   - Int16Array (raw 16-bit PCM from WAV / microphone)
  */
 declare class OmniVAD {
@@ -182,6 +183,9 @@ declare class OmniVAD {
 /**
  * Streaming Voice Activity Detection (WASM/ncnn backend).
  * Processes audio frame-by-frame (10ms chunks of 160 samples @ 16kHz).
+ *
+ * Audio format: Float32Array in [-1, 1] or Int16Array PCM. Wrappers
+ * dispatch by dtype; all scaling lives in the C entries.
  */
 declare class OmniStreamVAD {
@@ -200,14 +204,19 @@ declare class OmniStreamVAD {
      */
     clone(): OmniStreamVAD;
     /**
-     * Process one frame of audio (160 int16 samples = 10ms @ 16kHz).
+     * Process one frame of audio (160 samples = 10ms @ 16kHz).
+     *
+     * Accepts Float32Array in [-1, 1] (Web Audio, soundfile, torch) or
+     * Int16Array PCM (WAV, microphone). Dispatches by dtype to the matching
+     * C entry — no scaling in JS.
+     *
      * Returns null until enough audio is accumulated.
      *
      * Segment-boundary events (isSpeechStart / isSpeechEnd and the matching
      * speech_*_frame indices) come straight from the C-layer state machine
      * (bit-identical to upstream FireRedVAD) — the wrapper is just a marshaller.
      */
-    processFrame(pcm160: Int16Array): StreamVADFrameResult | null;
+    processFrame(audio: Float32Array | Int16Array): StreamVADFrameResult | null;
     /**
      * Process entire audio at once and return per-frame probabilities.
      * @param audio - Float32Array in [-1, 1] or Int16Array of 16kHz mono PCM
@@ -222,7 +231,8 @@ declare class OmniStreamVAD {
 /**
  * Audio Event Detection: speech, singing, music (WASM/ncnn backend).
  *
- * Audio format: same as OmniVAD — Int16Array or normalized Float32Array [-1, 1].
+ * Audio format: same as OmniVAD — Float32Array in [-1, 1] or Int16Array PCM.
+ * Wrappers dispatch by dtype; all scaling lives in the C entries.
  */
 declare class OmniAED {
@@ -250,9 +260,9 @@ declare class OmniAED {
  */
 type EmscriptenModule = any;
 /** Package version — used to construct default CDN URLs. */
-declare const VERSION = "0.2.9";
+declare const VERSION = "0.2.10";
 /** Default CDN base for model files (jsDelivr serves npm package contents). */
-declare const DEFAULT_CDN_BASE = "https://cdn.jsdelivr.net/npm/omnivad@0.2.9/models";
+declare const DEFAULT_CDN_BASE = "https://cdn.jsdelivr.net/npm/omnivad@0.2.10/models";
 /** Model filenames keyed by type. */
 declare const MODEL_FILES: {
     readonly vad: "vad.omnivad";

package/dist/index.js CHANGED Viewed

@@ -35,7 +35,7 @@ var SIZEOF_AED_SEGMENT = 16;
 var SIZEOF_CHUNK_CONFIG = 28;
 var SIZEOF_CHUNK = 16;
 var OMNI_ERR_NO_FRAMES = -7;
-var VERSION = "0.2.9";
+var VERSION = "0.2.10";
 var DEFAULT_CDN_BASE = `https://cdn.jsdelivr.net/npm/omnivad@${VERSION}/models`;
 var MODEL_FILES = {
   vad: "vad.omnivad",
@@ -160,6 +160,23 @@ function copyAudioToHeap(M, audio) {
   heap.set(audio);
   return ptr;
 }
+function copyInt16ToHeap(M, audio) {
+  const ptr = M._malloc(audio.length * 2);
+  const heap = new Int16Array(M.HEAPU8.buffer, ptr, audio.length);
+  heap.set(audio);
+  return ptr;
+}
+function dispatchAudio(M, audio) {
+  if (audio instanceof Float32Array) {
+    return { ptr: copyAudioToHeap(M, audio), length: audio.length, format: "f32" };
+  }
+  if (audio instanceof Int16Array) {
+    return { ptr: copyInt16ToHeap(M, audio), length: audio.length, format: "int16" };
+  }
+  throw new TypeError(
+    `unsupported audio dtype; expected Float32Array in [-1, 1] or Int16Array`
+  );
+}
 function writePostConfig(M, ptr, cfg) {
   M.setValue(ptr + 0, cfg.threshold, "float");
   M.setValue(ptr + 4, cfg.smoothWindowSize, "i32");
@@ -399,14 +416,15 @@ function streamVadCreate(M, modelBuffer, config = {}) {
   }
 }
 var SIZEOF_STREAM_VAD_RESULT = 24;
-function streamVadProcess(M, handle, pcm16Ptr, numSamples) {
+function streamVadProcess(M, handle, audioPtr, numSamples, format = "f32") {
   const resultPtr = M._malloc(SIZEOF_STREAM_VAD_RESULT);
+  const fn = format === "int16" ? "omni_stream_vad_process_int16" : "omni_stream_vad_process";
   try {
     const ret = M.ccall(
-      "omni_stream_vad_process",
+      fn,
       "number",
       ["number", "number", "number", "number"],
-      [handle, pcm16Ptr, numSamples, resultPtr]
+      [handle, audioPtr, numSamples, resultPtr]
     );
     if (ret === OMNI_ERR_NO_FRAMES) return null;
     if (ret !== 0) throw new Error(`StreamVAD process failed: ${ret}`);
@@ -424,6 +442,28 @@ function streamVadProcess(M, handle, pcm16Ptr, numSamples) {
     M._free(resultPtr);
   }
 }
+function streamVadDetectFull(M, handle, audioPtr, numSamples, format = "f32") {
+  const probsPtrPtr = M._malloc(4);
+  const framesPtr = M._malloc(4);
+  const fn = format === "int16" ? "omni_stream_vad_detect_full_int16" : "omni_stream_vad_detect_full";
+  try {
+    const ret = M.ccall(
+      fn,
+      "number",
+      ["number", "number", "number", "number", "number"],
+      [handle, audioPtr, numSamples, probsPtrPtr, framesPtr]
+    );
+    if (ret !== 0) throw new Error(`StreamVAD detectFull failed: ${ret}`);
+    const numFrames = M.getValue(framesPtr, "i32");
+    const probsPtr = M.getValue(probsPtrPtr, "i32");
+    const probabilities = probsPtr ? new Float32Array(new Float32Array(M.HEAPU8.buffer, probsPtr, numFrames)) : new Float32Array(0);
+    if (probsPtr) M._free(probsPtr);
+    return { probabilities, numFrames };
+  } finally {
+    M._free(probsPtrPtr);
+    M._free(framesPtr);
+  }
+}
 function streamVadClone(M, handle) {
   const errPtr = M._malloc(4);
   try {
@@ -483,7 +523,7 @@ var OmniVAD = class _OmniVAD {
    */
   detect(audio) {
     const M = getModule();
-    const { ptr, length, format } = prepareAudio(M, audio);
+    const { ptr, length, format } = dispatchAudio(M, audio);
     try {
       const timestamps = vadDetect(M, this.handle, ptr, length, this.config, format);
       return {
@@ -502,16 +542,6 @@ var OmniVAD = class _OmniVAD {
     }
   }
 };
-function prepareAudio(M, audio) {
-  const f32 = audio instanceof Int16Array ? int16ToNormalizedFloat32(audio) : audio;
-  const ptr = copyAudioToHeap(M, f32);
-  return { ptr, length: f32.length, format: "f32" };
-}
-function int16ToNormalizedFloat32(i16) {
-  const f32 = new Float32Array(i16.length);
-  for (let i = 0; i < i16.length; i++) f32[i] = i16[i] / 32768;
-  return f32;
-}
 // src/stream-vad.ts
 var SAMPLE_RATE2 = 16e3;
@@ -550,20 +580,23 @@ var OmniStreamVAD = class _OmniStreamVAD {
     return new _OmniStreamVAD(newHandle);
   }
   /**
-   * Process one frame of audio (160 int16 samples = 10ms @ 16kHz).
+   * Process one frame of audio (160 samples = 10ms @ 16kHz).
+   *
+   * Accepts Float32Array in [-1, 1] (Web Audio, soundfile, torch) or
+   * Int16Array PCM (WAV, microphone). Dispatches by dtype to the matching
+   * C entry — no scaling in JS.
+   *
    * Returns null until enough audio is accumulated.
    *
    * Segment-boundary events (isSpeechStart / isSpeechEnd and the matching
    * speech_*_frame indices) come straight from the C-layer state machine
    * (bit-identical to upstream FireRedVAD) — the wrapper is just a marshaller.
    */
-  processFrame(pcm160) {
+  processFrame(audio) {
     const M = getModule();
-    const ptr = M._malloc(pcm160.length * 2);
-    const heap16 = new Int16Array(M.HEAPU8.buffer, ptr, pcm160.length);
-    heap16.set(pcm160);
+    const { ptr, length, format } = dispatchAudio(M, audio);
     try {
-      const result = streamVadProcess(M, this.handle, ptr, pcm160.length);
+      const result = streamVadProcess(M, this.handle, ptr, length, format);
       if (!result) return null;
       return {
         confidence: result.confidence,
@@ -585,31 +618,22 @@ var OmniStreamVAD = class _OmniStreamVAD {
    */
   detectFull(audio) {
     const M = getModule();
-    const f32 = prepareDetectFullAudio(audio);
-    const audioPtr = copyAudioToHeap(M, f32);
-    const probsPtrPtr = M._malloc(4);
-    const framesPtr = M._malloc(4);
+    const { ptr, length, format } = dispatchAudio(M, audio);
     try {
-      const ret = M.ccall(
-        "omni_stream_vad_detect_full",
-        "number",
-        ["number", "number", "number", "number", "number"],
-        [this.handle, audioPtr, f32.length, probsPtrPtr, framesPtr]
+      const { probabilities, numFrames } = streamVadDetectFull(
+        M,
+        this.handle,
+        ptr,
+        length,
+        format
       );
-      if (ret !== 0) throw new Error(`StreamVAD detectFull failed: ${ret}`);
-      const numFrames = M.getValue(framesPtr, "i32");
-      const probsPtr = M.getValue(probsPtrPtr, "i32");
-      const probabilities = probsPtr ? new Float32Array(new Float32Array(M.HEAPU8.buffer, probsPtr, numFrames)) : new Float32Array(0);
-      if (probsPtr) M._free(probsPtr);
       return {
         probabilities,
         numFrames,
-        duration: Math.round(f32.length / SAMPLE_RATE2 * 1e3) / 1e3
+        duration: Math.round(length / SAMPLE_RATE2 * 1e3) / 1e3
       };
     } finally {
-      M._free(audioPtr);
-      M._free(probsPtrPtr);
-      M._free(framesPtr);
+      M._free(ptr);
     }
   }
   /** Reset all internal state (model cache, audio buffer, postprocessor). */
@@ -624,31 +648,6 @@ var OmniStreamVAD = class _OmniStreamVAD {
     }
   }
 };
-function int16ToFloat32(i16) {
-  const f32 = new Float32Array(i16.length);
-  for (let i = 0; i < i16.length; i++) f32[i] = i16[i];
-  return f32;
-}
-function prepareDetectFullAudio(audio) {
-  if (audio instanceof Int16Array) {
-    return int16ToFloat32(audio);
-  }
-  if (isNormalizedFloat(audio)) {
-    const scaled = new Float32Array(audio.length);
-    for (let i = 0; i < audio.length; i++) scaled[i] = audio[i] * 32768;
-    return scaled;
-  }
-  return audio;
-}
-function isNormalizedFloat(audio) {
-  const step = Math.max(1, Math.floor(audio.length / 1e3));
-  let maxAbs = 0;
-  for (let i = 0; i < audio.length; i += step) {
-    const v = Math.abs(audio[i]);
-    if (v > maxAbs) maxAbs = v;
-  }
-  return maxAbs <= 1;
-}
 // src/aed.ts
 var SAMPLE_RATE3 = 16e3;
@@ -688,7 +687,7 @@ var OmniAED = class _OmniAED {
    */
   detect(audio) {
     const M = getModule();
-    const { ptr, length, format } = prepareAudio2(M, audio);
+    const { ptr, length, format } = dispatchAudio(M, audio);
     const duration = Math.round(length / SAMPLE_RATE3 * 1e3) / 1e3;
     try {
       const events = aedDetect(M, this.handle, ptr, length, this.config, format);
@@ -709,18 +708,6 @@ var OmniAED = class _OmniAED {
     }
   }
 };
-function prepareAudio2(M, audio) {
-  const f32 = audio instanceof Int16Array ? int16ToNormalizedFloat322(audio) : audio;
-  const ptr = M._malloc(f32.length * 4);
-  const heap = new Float32Array(M.HEAPU8.buffer, ptr, f32.length);
-  heap.set(f32);
-  return { ptr, length: f32.length, format: "f32" };
-}
-function int16ToNormalizedFloat322(i16) {
-  const f32 = new Float32Array(i16.length);
-  for (let i = 0; i < i16.length; i++) f32[i] = i16[i] / 32768;
-  return f32;
-}
 function computeCoverageRatios(events, duration) {
   const ratios = {
     speech: 0,