npm - omnivad - Versions diffs - 0.2.4 → 0.2.8 - Mend

omnivad 0.2.4 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

package/dist/index.d.ts CHANGED Viewed

@@ -14,23 +14,27 @@ interface AEDResult {
     /** Detected duration coverage ratio for each event type */
     ratios: Record<string, number>;
 }
-/** Per-frame result from streaming VAD */
+/** Per-frame result from streaming VAD.
+ *
+ *  Bit-identical to upstream FireRedVAD's StreamVadFrameResult: every
+ *  successful processFrame() call carries both per-frame probabilities
+ *  AND segment-boundary events (no external segmenter needed). */
 interface StreamVADFrameResult {
-    /** Raw probability from model output */
+    /** Raw probability from model output [0, 1] */
     confidence: number;
-    /** Currently identical to confidence; reserved for future smoothing */
-    smoothedConfidence: number;
-    /** Whether current frame is classified as speech */
+    /** Causal moving-average of confidence (window = smoothWindowSize) */
+    smoothedProb: number;
+    /** smoothedProb >= threshold */
     isSpeech: boolean;
     /** 1-based frame index of the emitted frame */
     frameIndex: number;
-    /** True when speech becomes active at this frame */
+    /** True on the frame that confirms a new SPEECH segment */
     isSpeechStart: boolean;
-    /** True when speech ends on the previous frame */
+    /** True on the frame that confirms a SPEECH segment end */
     isSpeechEnd: boolean;
-    /** Start frame of the active or just-finished speech segment */
+    /** 1-based start frame of the segment when isSpeechStart, else -1 */
     speechStartFrame: number;
-    /** End frame of the just-finished speech segment, or 0 if not ending */
+    /** 1-based end frame of the segment when isSpeechEnd, else -1 */
     speechEndFrame: number;
 }
 /** Full-audio streaming-model output */
@@ -57,7 +61,7 @@ interface VADConfig extends ModelSource {
     smoothWindowSize?: number;
     /** Minimum speech segment length in frames (default: 20) */
     minSpeechFrames?: number;
-    /** Maximum speech segment length in frames before splitting (default: 2000 = 20s) */
+    /** Maximum speech segment length in frames before splitting (default: 3000 = 30s; matches Whisper) */
     maxSpeechFrames?: number;
     /** Minimum silence segment length in frames for state machine (default: 20) */
     minSilenceFrames?: number;
@@ -73,10 +77,79 @@ interface AEDConfig extends VADConfig {
     /** Music probability threshold (default: 0.5) */
     musicThreshold?: number;
 }
-/** Configuration for streaming VAD */
+/** Configuration for streaming VAD.
+ *
+ *  Bit-identical to upstream FireRedStreamVadConfig — every parameter
+ *  has the same name (without the speech_ prefix) and the same default. */
 interface StreamVADConfig extends ModelSource {
-    /** Speech probability threshold (default: 0.5) */
-    speechThreshold?: number;
+    /** Speech activation threshold [0, 1] (default: 0.5). */
+    threshold?: number;
+    /** Causal moving-average window in frames (default: 5). */
+    smoothWindowSize?: number;
+    /** Extend confirmed segment START backward by N frames (default: 5;
+     *  clamped to >= smoothWindowSize internally). */
+    padStartFrame?: number;
+    /** Min continuous speech frames to confirm START (default: 8 = 80ms). */
+    minSpeechFrame?: number;
+    /** Force-split when SPEECH-state count hits this (default: 2000 = 20s). */
+    maxSpeechFrame?: number;
+    /** Min continuous silence frames to confirm END (default: 20 = 200ms). */
+    minSilenceFrame?: number;
+}
+/**
+ * Chunk packing strategy. Both modes honor `maxChunkSecs` and `maxGapSecs` as
+ * hard constraints — they only differ in WHERE the cut lands.
+ *
+ * - `"greedy"` — sequential append; cuts at the first point that violates
+ *   a constraint. Recommended for **fixed-length-input ASR** like Whisper /
+ *   whisperX (which pad to 30s anyway).
+ * - `"longest_gap"` — recursive split at the longest internal pause until
+ *   every chunk satisfies both constraints. Falls back to equal hard-split
+ *   when a single segment exceeds `maxChunkSecs`. Recommended for
+ *   **variable-length-input models** (forced alignment, TTS, encoder-style
+ *   ASR) — splits at natural pauses, no fixed-length padding required.
+ *   **NOTE: This is NOT how WhisperX packs chunks** — WhisperX uses greedy
+ *   packing (`Binarize(max_duration=...)` + sequential append). For
+ *   WhisperX-equivalent behavior pass `mode: "greedy"` (the default).
+ */
+type ChunkMode$1 = "greedy" | "longest_gap";
+/**
+ * Configuration for {@link mergeChunks}. Mirrors C struct OmniChunkConfig.
+ * All fields are optional in the public API; defaults match
+ * {@link DEFAULT_CHUNK_CONFIG}.
+ */
+interface ChunkOptions {
+    /** Hard upper bound on chunk duration in seconds. Must be > 0. Default: 30. */
+    maxChunkSecs?: number;
+    /** Split if the gap between adjacent segments exceeds this. Pass `Infinity`
+     *  to disable. Default: `Infinity`. Honored by both modes. */
+    maxGapSecs?: number;
+    /** Extend each chunk start backward by this many seconds (clamped to >= 0).
+     *  Default: 0.04. */
+    padOnsetSecs?: number;
+    /** Extend each chunk end forward by this many seconds. Default: 0.04. */
+    padOffsetSecs?: number;
+    /** Drop input segments shorter than this many seconds. Default: 0.0.
+     *  Pairs with VAD `minSpeechFrames` (frame-domain equivalent). */
+    minSpeechSecs?: number;
+    /** Pre-merge consecutive segments whose silence gap is shorter than this.
+     *  Default: 0.20 (matches VAD `minSilenceFrames=20` @ 10ms frame shift). */
+    minSilenceSecs?: number;
+    /** Packing strategy. Default: `"greedy"`. */
+    mode?: ChunkMode$1;
+}
+/** A single chunk emitted by {@link mergeChunks}. */
+interface ChunkResult {
+    /** Chunk start time (seconds), with `padOnsetSecs` applied (clamped to >= 0). */
+    start: number;
+    /** Chunk end time (seconds), with `padOffsetSecs` applied. */
+    end: number;
+    /** Index of the first input segment included in this chunk. Refers to the
+     *  *post-filter* segment list — segments dropped by `minSpeechSecs` and
+     *  pre-merged by `minSilenceSecs` are not counted. */
+    segStartIdx: number;
+    /** Number of input segments included in this chunk. */
+    segCount: number;
 }
 /**
@@ -113,17 +186,26 @@ declare class OmniVAD {
 declare class OmniStreamVAD {
     private handle;
-    private inSpeech;
-    private speechStartFrame;
     private constructor();
     /**
      * Create a new OmniStreamVAD instance.
      * Loads model from CDN (browser), local package (Node.js), or custom source.
      */
     static create(options?: StreamVADConfig): Promise<OmniStreamVAD>;
+    /**
+     * Create a lightweight clone sharing the same underlying model weights.
+     * The clone has fresh per-instance state (empty audio buffer, zeroed cache).
+     * This is synchronous and extremely fast — ideal for multi-stream scenarios
+     * (e.g., handling multiple WebRTC tracks or concurrent audio sessions).
+     */
+    clone(): OmniStreamVAD;
     /**
      * Process one frame of audio (160 int16 samples = 10ms @ 16kHz).
      * Returns null until enough audio is accumulated.
+     *
+     * Segment-boundary events (isSpeechStart / isSpeechEnd and the matching
+     * speech_*_frame indices) come straight from the C-layer state machine
+     * (bit-identical to upstream FireRedVAD) — the wrapper is just a marshaller.
      */
     processFrame(pcm160: Int16Array): StreamVADFrameResult | null;
     /**
@@ -131,7 +213,7 @@ declare class OmniStreamVAD {
      * @param audio - Float32Array in [-1, 1] or Int16Array of 16kHz mono PCM
      */
     detectFull(audio: Float32Array | Int16Array): StreamVADFullResult;
-    /** Reset all internal state. */
+    /** Reset all internal state (model cache, audio buffer, postprocessor). */
     reset(): void;
     /** Release native resources. */
     dispose(): void;
@@ -168,9 +250,9 @@ declare class OmniAED {
  */
 type EmscriptenModule = any;
 /** Package version — used to construct default CDN URLs. */
-declare const VERSION = "0.2.1";
+declare const VERSION = "0.2.8";
 /** Default CDN base for model files (jsDelivr serves npm package contents). */
-declare const DEFAULT_CDN_BASE = "https://cdn.jsdelivr.net/npm/omnivad@0.2.1/models";
+declare const DEFAULT_CDN_BASE = "https://cdn.jsdelivr.net/npm/omnivad@0.2.8/models";
 /** Model filenames keyed by type. */
 declare const MODEL_FILES: {
     readonly vad: "vad.omnivad";
@@ -193,5 +275,63 @@ declare function initWasm(wasmLocator?: (filename: string) => string): Promise<E
  *   4. Browser — fetch from jsDelivr CDN
  */
 declare function loadModel(modelType: ModelType, modelUrl?: string | URL, modelData?: ArrayBuffer): Promise<ArrayBuffer>;
+/**
+ * Chunking strategy:
+ * - "greedy" — sequential append. Recommended for fixed-length-input ASR
+ *              (Whisper / whisperX, which pad to 30s anyway).
+ * - "longest_gap" — recursive split at longest pause; falls back to hard-split
+ *                   when a single segment exceeds maxChunkSecs. Recommended for
+ *                   variable-length-input models (forced alignment, TTS,
+ *                   encoder-style ASR); no fixed-length padding required.
+ */
+type ChunkMode = "greedy" | "longest_gap";
+/** Configuration for omni_merge_chunks (matches C struct OmniChunkConfig, 28 bytes) */
+interface ChunkConfig {
+    maxChunkSecs: number;
+    maxGapSecs: number;
+    padOnsetSecs: number;
+    padOffsetSecs: number;
+    minSpeechSecs: number;
+    minSilenceSecs: number;
+    mode: ChunkMode;
+}
+/**
+ * Default chunk config. Mirrors C-side omni_chunk_config_default(); kept in
+ * TS so callers don't need a roundtrip into WASM just to read defaults.
+ *
+ * Defaults: max_chunk_secs matches Whisper's 30s input window.
+ */
+declare const DEFAULT_CHUNK_CONFIG: ChunkConfig;
+/**
+ * Pure-algorithm chunking utility — wraps the C function omni_merge_chunks
+ * compiled into the WASM module.
+ *
+ * WhisperX-style binarize+merge, minus the binarize half because OmniVAD
+ * already returns binarized timestamps.
+ *
+ * Usage:
+ *
+ *   import { mergeChunks } from "omnivad";
+ *
+ *   const chunks = await mergeChunks(
+ *     [[0.0, 5.0], [6.0, 10.0]],
+ *     { maxChunkSecs: 30.0, maxGapSecs: 2.0 }
+ *   );
+ *   // [{ start: 0, end: 10, segStartIdx: 0, segCount: 2 }]
+ */
+/**
+ * Merge a sorted array of [start, end] speech segments into duration-bounded
+ * chunks.
+ *
+ * Lazily initializes the WASM module on first call (so the caller doesn't have
+ * to await `initWasm()` separately). Subsequent calls reuse the cached module.
+ *
+ * @param segments  array of [start, end] pairs in seconds, sorted by start
+ * @param options   chunking configuration; missing fields fall back to
+ *                  {@link DEFAULT_CHUNK_CONFIG}
+ */
+declare function mergeChunks(segments: Array<[number, number]>, options?: ChunkOptions): Promise<ChunkResult[]>;
-export { type AEDConfig, type AEDResult, DEFAULT_CDN_BASE, OmniAED as FireRedAED, OmniStreamVAD as FireRedStreamVAD, OmniVAD as FireRedVAD, MODEL_FILES, type ModelSource, OmniAED, OmniStreamVAD, OmniVAD, type StreamVADConfig, type StreamVADFrameResult, type StreamVADFullResult, type VADConfig, type VADResult, VERSION, initWasm, loadModel };
+export { type AEDConfig, type AEDResult, type ChunkOptions, type ChunkResult, DEFAULT_CDN_BASE, DEFAULT_CHUNK_CONFIG, OmniAED as FireRedAED, OmniStreamVAD as FireRedStreamVAD, OmniVAD as FireRedVAD, MODEL_FILES, type ModelSource, OmniAED, OmniStreamVAD, OmniVAD, type StreamVADConfig, type StreamVADFrameResult, type StreamVADFullResult, type VADConfig, type VADResult, VERSION, initWasm, loadModel, mergeChunks };

package/dist/index.js CHANGED Viewed

@@ -1,12 +1,41 @@
 // src/wasm-binding.ts
 var _module = null;
 var _loading = null;
+function loadScript(url) {
+  if (typeof globalThis.document === "undefined") {
+    return new Promise((resolve, reject) => {
+      try {
+        const importScripts = globalThis.importScripts;
+        if (typeof importScripts !== "function") {
+          throw new Error(
+            "omnivad: cannot load glue script \u2014 no document and no importScripts"
+          );
+        }
+        importScripts(url);
+        resolve();
+      } catch (err) {
+        reject(err instanceof Error ? err : new Error(String(err)));
+      }
+    });
+  }
+  return new Promise((resolve, reject) => {
+    const s = globalThis.document.createElement("script");
+    s.src = url;
+    s.async = true;
+    s.crossOrigin = "anonymous";
+    s.onload = () => resolve();
+    s.onerror = () => reject(new Error(`Failed to load omnivad glue script: ${url}`));
+    globalThis.document.head.appendChild(s);
+  });
+}
 var SIZEOF_POST_CONFIG = 28;
 var SIZEOF_AED_POST_CONFIG = 3 * SIZEOF_POST_CONFIG;
 var SIZEOF_SEGMENT = 8;
 var SIZEOF_AED_SEGMENT = 16;
+var SIZEOF_CHUNK_CONFIG = 28;
+var SIZEOF_CHUNK = 16;
 var OMNI_ERR_NO_FRAMES = -7;
-var VERSION = "0.2.1";
+var VERSION = "0.2.8";
 var DEFAULT_CDN_BASE = `https://cdn.jsdelivr.net/npm/omnivad@${VERSION}/models`;
 var MODEL_FILES = {
   vad: "vad.omnivad",
@@ -22,22 +51,41 @@ async function initWasm(wasmLocator) {
     if (typeof globalThis.process?.versions?.node === "string") {
       const { createRequire } = await import(
         /* webpackIgnore: true */
+        /* turbopackIgnore: true */
         'module'
       );
-      const { dirname, join } = await import('path');
+      const { dirname, join } = await import(
+        /* webpackIgnore: true */
+        /* turbopackIgnore: true */
+        'path'
+      );
       const req = createRequire(import.meta.url);
       const gluePath = req.resolve("../dist/wasm/omnivad.cjs");
       const wasmDir = dirname(gluePath);
       createOmniVAD = req(gluePath);
       defaultLocateFile = (filename) => join(wasmDir, filename);
     } else {
-      const glueUrl = new URL("../dist/wasm/omnivad.js", import.meta.url);
-      const mod = await import(
-        /* webpackIgnore: true */
-        glueUrl.href
-      );
-      createOmniVAD = mod.default || mod;
-      const wasmBaseUrl = new URL("./", glueUrl);
+      let glueUrlStr;
+      if (wasmLocator) {
+        glueUrlStr = wasmLocator("omnivad.js");
+      } else {
+        glueUrlStr = new URL("../dist/wasm/omnivad.js", import.meta.url).href;
+      }
+      const g = globalThis;
+      let factory = g.createOmniVAD;
+      if (typeof factory !== "function") {
+        await loadScript(glueUrlStr);
+        factory = g.createOmniVAD;
+      }
+      if (typeof factory !== "function") {
+        throw new Error(
+          `omnivad.js loaded from ${glueUrlStr} but globalThis.createOmniVAD is missing`
+        );
+      }
+      createOmniVAD = factory;
+      const baseHref = typeof globalThis.location !== "undefined" ? globalThis.location.href : "file:///";
+      const absGlue = new URL(glueUrlStr, baseHref);
+      const wasmBaseUrl = new URL("./", absGlue);
       defaultLocateFile = (filename) => new URL(filename, wasmBaseUrl).toString();
     }
     const opts = {};
@@ -61,10 +109,19 @@ async function loadModel(modelType, modelUrl, modelData) {
   if (typeof globalThis.process?.versions?.node === "string") {
     const { createRequire } = await import(
       /* webpackIgnore: true */
+      /* turbopackIgnore: true */
       'module'
     );
-    const { dirname, join } = await import('path');
-    const { readFile } = await import('fs/promises');
+    const { dirname, join } = await import(
+      /* webpackIgnore: true */
+      /* turbopackIgnore: true */
+      'path'
+    );
+    const { readFile } = await import(
+      /* webpackIgnore: true */
+      /* turbopackIgnore: true */
+      'fs/promises'
+    );
     const req = createRequire(import.meta.url);
     const pkgDir = dirname(req.resolve("../package.json"));
     const modelPath = join(pkgDir, "models", filename);
@@ -117,10 +174,86 @@ var DEFAULT_VAD_CONFIG = {
   smoothWindowSize: 5,
   minSpeechFrames: 20,
   minSilenceFrames: 20,
-  maxSpeechFrames: 2e3,
+  maxSpeechFrames: 3e3,
   mergeSilenceFrames: 0,
   extendSpeechFrames: 0
 };
+var OMNI_CHUNK_GREEDY = 0;
+var OMNI_CHUNK_LONGEST_GAP = 1;
+var DEFAULT_CHUNK_CONFIG = {
+  maxChunkSecs: 30,
+  maxGapSecs: Infinity,
+  padOnsetSecs: 0.04,
+  padOffsetSecs: 0.04,
+  minSpeechSecs: 0,
+  minSilenceSecs: 0.2,
+  // matches VAD minSilenceFrames=20 @ 10ms shift
+  mode: "greedy"
+};
+function modeToInt(m) {
+  switch (m) {
+    case "greedy":
+      return OMNI_CHUNK_GREEDY;
+    case "longest_gap":
+      return OMNI_CHUNK_LONGEST_GAP;
+    default:
+      throw new Error(`Unknown chunking mode: ${String(m)}`);
+  }
+}
+function writeChunkConfig(M, ptr, cfg) {
+  M.setValue(ptr + 0, cfg.maxChunkSecs, "float");
+  M.setValue(ptr + 4, cfg.maxGapSecs, "float");
+  M.setValue(ptr + 8, cfg.padOnsetSecs, "float");
+  M.setValue(ptr + 12, cfg.padOffsetSecs, "float");
+  M.setValue(ptr + 16, cfg.minSpeechSecs, "float");
+  M.setValue(ptr + 20, cfg.minSilenceSecs, "float");
+  M.setValue(ptr + 24, modeToInt(cfg.mode), "i32");
+}
+function chunkMerge(M, segments, config) {
+  const numSegments = segments.length;
+  const segPtr = numSegments > 0 ? M._malloc(numSegments * SIZEOF_SEGMENT) : 0;
+  const cfgPtr = M._malloc(SIZEOF_CHUNK_CONFIG);
+  const outPtrPtr = M._malloc(4);
+  const outCountPtr = M._malloc(4);
+  try {
+    for (let i = 0; i < numSegments; i++) {
+      const base = segPtr + i * SIZEOF_SEGMENT;
+      M.setValue(base + 0, segments[i][0], "float");
+      M.setValue(base + 4, segments[i][1], "float");
+    }
+    writeChunkConfig(M, cfgPtr, config);
+    M.setValue(outPtrPtr, 0, "i32");
+    M.setValue(outCountPtr, 0, "i32");
+    const rc = M.ccall(
+      "omni_merge_chunks",
+      "number",
+      ["number", "number", "number", "number", "number"],
+      [segPtr, numSegments, cfgPtr, outPtrPtr, outCountPtr]
+    );
+    if (rc !== 0) {
+      throw new Error(`omni_merge_chunks failed: ${readNativeError(M, rc)}`);
+    }
+    const count = M.getValue(outCountPtr, "i32");
+    const chunkPtr = M.getValue(outPtrPtr, "i32");
+    const chunks = [];
+    for (let i = 0; i < count; i++) {
+      const base = chunkPtr + i * SIZEOF_CHUNK;
+      chunks.push({
+        start: M.getValue(base + 0, "float"),
+        end: M.getValue(base + 4, "float"),
+        segStartIdx: M.getValue(base + 8, "i32"),
+        segCount: M.getValue(base + 12, "i32")
+      });
+    }
+    if (chunkPtr) M._free(chunkPtr);
+    return chunks;
+  } finally {
+    if (segPtr) M._free(segPtr);
+    M._free(cfgPtr);
+    M._free(outPtrPtr);
+    M._free(outCountPtr);
+  }
+}
 function vadCreate(M, modelBuffer) {
   const bytes = new Uint8Array(modelBuffer);
   const ptr = M._malloc(bytes.length);
@@ -225,24 +358,49 @@ function aedDetect(M, handle, audioPtr, numSamples, cfg, format = "f32") {
 function aedDestroy(M, handle) {
   M.ccall("omni_aed_destroy", null, ["number"], [handle]);
 }
-function streamVadCreate(M, modelBuffer, threshold = 0.5) {
+var DEFAULT_STREAM_VAD_CONFIG = {
+  threshold: 0.5,
+  smoothWindowSize: 5,
+  padStartFrame: 5,
+  minSpeechFrame: 8,
+  maxSpeechFrame: 2e3,
+  minSilenceFrame: 20
+};
+var SIZEOF_STREAM_VAD_CONFIG = 24;
+function writeStreamVadConfig(M, ptr, cfg) {
+  M.setValue(ptr + 0, cfg.threshold, "float");
+  M.setValue(ptr + 4, cfg.smoothWindowSize, "i32");
+  M.setValue(ptr + 8, cfg.padStartFrame, "i32");
+  M.setValue(ptr + 12, cfg.minSpeechFrame, "i32");
+  M.setValue(ptr + 16, cfg.maxSpeechFrame, "i32");
+  M.setValue(ptr + 20, cfg.minSilenceFrame, "i32");
+}
+function streamVadCreate(M, modelBuffer, config = {}) {
+  const overrides = Object.fromEntries(
+    Object.entries(config).filter(([, v]) => v !== void 0)
+  );
+  const cfg = { ...DEFAULT_STREAM_VAD_CONFIG, ...overrides };
   const bytes = new Uint8Array(modelBuffer);
-  const ptr = M._malloc(bytes.length);
-  M.HEAPU8.set(bytes, ptr);
+  const dataPtr = M._malloc(bytes.length);
+  M.HEAPU8.set(bytes, dataPtr);
+  const cfgPtr = M._malloc(SIZEOF_STREAM_VAD_CONFIG);
   try {
+    writeStreamVadConfig(M, cfgPtr, cfg);
     return createModel(
       M,
       "omni_stream_vad_create_from_buffer",
       ["number", "number", "number"],
-      [ptr, bytes.length, threshold],
+      [dataPtr, bytes.length, cfgPtr],
       "StreamVAD"
     );
   } finally {
-    M._free(ptr);
+    M._free(dataPtr);
+    M._free(cfgPtr);
   }
 }
+var SIZEOF_STREAM_VAD_RESULT = 24;
 function streamVadProcess(M, handle, pcm16Ptr, numSamples) {
-  const resultPtr = M._malloc(12);
+  const resultPtr = M._malloc(SIZEOF_STREAM_VAD_RESULT);
   try {
     const ret = M.ccall(
       "omni_stream_vad_process",
@@ -253,14 +411,37 @@ function streamVadProcess(M, handle, pcm16Ptr, numSamples) {
     if (ret === OMNI_ERR_NO_FRAMES) return null;
     if (ret !== 0) throw new Error(`StreamVAD process failed: ${ret}`);
     return {
-      confidence: M.getValue(resultPtr, "float"),
-      isSpeech: M.getValue(resultPtr + 4, "i8") !== 0,
-      frameOffset: M.getValue(resultPtr + 8, "i32")
+      confidence: M.getValue(resultPtr + 0, "float"),
+      smoothedProb: M.getValue(resultPtr + 4, "float"),
+      isSpeech: M.getValue(resultPtr + 8, "i8") !== 0,
+      isSpeechStart: M.getValue(resultPtr + 9, "i8") !== 0,
+      isSpeechEnd: M.getValue(resultPtr + 10, "i8") !== 0,
+      frameIdx: M.getValue(resultPtr + 12, "i32"),
+      speechStartFrame: M.getValue(resultPtr + 16, "i32"),
+      speechEndFrame: M.getValue(resultPtr + 20, "i32")
     };
   } finally {
     M._free(resultPtr);
   }
 }
+function streamVadClone(M, handle) {
+  const errPtr = M._malloc(4);
+  try {
+    const newHandle = M.ccall(
+      "omni_stream_vad_clone",
+      "number",
+      ["number", "number"],
+      [handle, errPtr]
+    );
+    if (!newHandle) {
+      const err = M.getValue(errPtr, "i32");
+      throw new Error(`StreamVAD clone failed: ${readNativeError(M, err)}`);
+    }
+    return newHandle;
+  } finally {
+    M._free(errPtr);
+  }
+}
 function streamVadReset(M, handle) {
   M.ccall("omni_stream_vad_reset", null, ["number"], [handle]);
 }
@@ -336,8 +517,6 @@ function int16ToNormalizedFloat32(i16) {
 var SAMPLE_RATE2 = 16e3;
 var OmniStreamVAD = class _OmniStreamVAD {
   constructor(handle) {
-    this.inSpeech = false;
-    this.speechStartFrame = 0;
     this.handle = handle;
   }
   /**
@@ -348,13 +527,35 @@ var OmniStreamVAD = class _OmniStreamVAD {
     await initWasm();
     const M = getModule();
     const modelBuffer = await loadModel("stream-vad", options.modelUrl, options.modelData);
-    const threshold = options.speechThreshold ?? 0.5;
-    const handle = streamVadCreate(M, modelBuffer, threshold);
+    const handle = streamVadCreate(M, modelBuffer, {
+      threshold: options.threshold,
+      smoothWindowSize: options.smoothWindowSize,
+      padStartFrame: options.padStartFrame,
+      minSpeechFrame: options.minSpeechFrame,
+      maxSpeechFrame: options.maxSpeechFrame,
+      minSilenceFrame: options.minSilenceFrame
+    });
     return new _OmniStreamVAD(handle);
   }
+  /**
+   * Create a lightweight clone sharing the same underlying model weights.
+   * The clone has fresh per-instance state (empty audio buffer, zeroed cache).
+   * This is synchronous and extremely fast — ideal for multi-stream scenarios
+   * (e.g., handling multiple WebRTC tracks or concurrent audio sessions).
+   */
+  clone() {
+    if (!this.handle) throw new Error("Cannot clone a disposed instance.");
+    const M = getModule();
+    const newHandle = streamVadClone(M, this.handle);
+    return new _OmniStreamVAD(newHandle);
+  }
   /**
    * Process one frame of audio (160 int16 samples = 10ms @ 16kHz).
    * Returns null until enough audio is accumulated.
+   *
+   * Segment-boundary events (isSpeechStart / isSpeechEnd and the matching
+   * speech_*_frame indices) come straight from the C-layer state machine
+   * (bit-identical to upstream FireRedVAD) — the wrapper is just a marshaller.
    */
   processFrame(pcm160) {
     const M = getModule();
@@ -363,28 +564,16 @@ var OmniStreamVAD = class _OmniStreamVAD {
     heap16.set(pcm160);
     try {
       const result = streamVadProcess(M, this.handle, ptr, pcm160.length);
-      if (!result || result.frameOffset === 0) return null;
-      const frameIndex = result.frameOffset;
-      const isSpeechStart = result.isSpeech && !this.inSpeech;
-      const isSpeechEnd = !result.isSpeech && this.inSpeech;
-      if (isSpeechStart) {
-        this.speechStartFrame = frameIndex;
-      }
-      const activeSpeechStartFrame = isSpeechEnd ? this.speechStartFrame : result.isSpeech ? this.speechStartFrame : 0;
-      const speechEndFrame = isSpeechEnd ? Math.max(1, frameIndex - 1) : 0;
-      this.inSpeech = result.isSpeech;
-      if (isSpeechEnd) {
-        this.speechStartFrame = 0;
-      }
+      if (!result) return null;
       return {
         confidence: result.confidence,
-        smoothedConfidence: result.confidence,
+        smoothedProb: result.smoothedProb,
         isSpeech: result.isSpeech,
-        frameIndex,
-        isSpeechStart,
-        isSpeechEnd,
-        speechStartFrame: activeSpeechStartFrame,
-        speechEndFrame
+        frameIndex: result.frameIdx,
+        isSpeechStart: result.isSpeechStart,
+        isSpeechEnd: result.isSpeechEnd,
+        speechStartFrame: result.speechStartFrame,
+        speechEndFrame: result.speechEndFrame
       };
     } finally {
       M._free(ptr);
@@ -423,11 +612,9 @@ var OmniStreamVAD = class _OmniStreamVAD {
       M._free(framesPtr);
     }
   }
-  /** Reset all internal state. */
+  /** Reset all internal state (model cache, audio buffer, postprocessor). */
   reset() {
     streamVadReset(getModule(), this.handle);
-    this.inSpeech = false;
-    this.speechStartFrame = 0;
   }
   /** Release native resources. */
   dispose() {
@@ -435,8 +622,6 @@ var OmniStreamVAD = class _OmniStreamVAD {
       streamVadDestroy(getModule(), this.handle);
       this.handle = 0;
     }
-    this.inSpeech = false;
-    this.speechStartFrame = 0;
   }
 };
 function int16ToFloat32(i16) {
@@ -550,6 +735,28 @@ function computeCoverageRatios(events, duration) {
   return ratios;
 }
-export { DEFAULT_CDN_BASE, OmniAED as FireRedAED, OmniStreamVAD as FireRedStreamVAD, OmniVAD as FireRedVAD, MODEL_FILES, OmniAED, OmniStreamVAD, OmniVAD, VERSION, initWasm, loadModel };
+// src/chunking.ts
+async function mergeChunks(segments, options = {}) {
+  await initWasm();
+  const M = getModule();
+  const cfg = {
+    maxChunkSecs: options.maxChunkSecs ?? DEFAULT_CHUNK_CONFIG.maxChunkSecs,
+    maxGapSecs: options.maxGapSecs ?? DEFAULT_CHUNK_CONFIG.maxGapSecs,
+    padOnsetSecs: options.padOnsetSecs ?? DEFAULT_CHUNK_CONFIG.padOnsetSecs,
+    padOffsetSecs: options.padOffsetSecs ?? DEFAULT_CHUNK_CONFIG.padOffsetSecs,
+    minSpeechSecs: options.minSpeechSecs ?? DEFAULT_CHUNK_CONFIG.minSpeechSecs,
+    minSilenceSecs: options.minSilenceSecs ?? DEFAULT_CHUNK_CONFIG.minSilenceSecs,
+    mode: options.mode ?? DEFAULT_CHUNK_CONFIG.mode
+  };
+  const records = chunkMerge(M, segments, cfg);
+  return records.map((r) => ({
+    start: r.start,
+    end: r.end,
+    segStartIdx: r.segStartIdx,
+    segCount: r.segCount
+  }));
+}
+export { DEFAULT_CDN_BASE, DEFAULT_CHUNK_CONFIG, OmniAED as FireRedAED, OmniStreamVAD as FireRedStreamVAD, OmniVAD as FireRedVAD, MODEL_FILES, OmniAED, OmniStreamVAD, OmniVAD, VERSION, initWasm, loadModel, mergeChunks };
 //# sourceMappingURL=index.js.map
 //# sourceMappingURL=index.js.map