omnivad 0.2.5 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -14,23 +14,27 @@ interface AEDResult {
14
14
  /** Detected duration coverage ratio for each event type */
15
15
  ratios: Record<string, number>;
16
16
  }
17
- /** Per-frame result from streaming VAD */
17
+ /** Per-frame result from streaming VAD.
18
+ *
19
+ * Bit-identical to upstream FireRedVAD's StreamVadFrameResult: every
20
+ * successful processFrame() call carries both per-frame probabilities
21
+ * AND segment-boundary events (no external segmenter needed). */
18
22
  interface StreamVADFrameResult {
19
- /** Raw probability from model output */
23
+ /** Raw probability from model output [0, 1] */
20
24
  confidence: number;
21
- /** Currently identical to confidence; reserved for future smoothing */
22
- smoothedConfidence: number;
23
- /** Whether current frame is classified as speech */
25
+ /** Causal moving-average of confidence (window = smoothWindowSize) */
26
+ smoothedProb: number;
27
+ /** smoothedProb >= threshold */
24
28
  isSpeech: boolean;
25
29
  /** 1-based frame index of the emitted frame */
26
30
  frameIndex: number;
27
- /** True when speech becomes active at this frame */
31
+ /** True on the frame that confirms a new SPEECH segment */
28
32
  isSpeechStart: boolean;
29
- /** True when speech ends on the previous frame */
33
+ /** True on the frame that confirms a SPEECH segment end */
30
34
  isSpeechEnd: boolean;
31
- /** Start frame of the active or just-finished speech segment */
35
+ /** 1-based start frame of the segment when isSpeechStart, else -1 */
32
36
  speechStartFrame: number;
33
- /** End frame of the just-finished speech segment, or 0 if not ending */
37
+ /** 1-based end frame of the segment when isSpeechEnd, else -1 */
34
38
  speechEndFrame: number;
35
39
  }
36
40
  /** Full-audio streaming-model output */
@@ -57,7 +61,7 @@ interface VADConfig extends ModelSource {
57
61
  smoothWindowSize?: number;
58
62
  /** Minimum speech segment length in frames (default: 20) */
59
63
  minSpeechFrames?: number;
60
- /** Maximum speech segment length in frames before splitting (default: 2000 = 20s) */
64
+ /** Maximum speech segment length in frames before splitting (default: 3000 = 30s; matches Whisper) */
61
65
  maxSpeechFrames?: number;
62
66
  /** Minimum silence segment length in frames for state machine (default: 20) */
63
67
  minSilenceFrames?: number;
@@ -73,10 +77,79 @@ interface AEDConfig extends VADConfig {
73
77
  /** Music probability threshold (default: 0.5) */
74
78
  musicThreshold?: number;
75
79
  }
76
- /** Configuration for streaming VAD */
80
+ /** Configuration for streaming VAD.
81
+ *
82
+ * Bit-identical to upstream FireRedStreamVadConfig — every parameter
83
+ * has the same name (without the speech_ prefix) and the same default. */
77
84
  interface StreamVADConfig extends ModelSource {
78
- /** Speech probability threshold (default: 0.5) */
79
- speechThreshold?: number;
85
+ /** Speech activation threshold [0, 1] (default: 0.5). */
86
+ threshold?: number;
87
+ /** Causal moving-average window in frames (default: 5). */
88
+ smoothWindowSize?: number;
89
+ /** Extend confirmed segment START backward by N frames (default: 5;
90
+ * clamped to >= smoothWindowSize internally). */
91
+ padStartFrame?: number;
92
+ /** Min continuous speech frames to confirm START (default: 8 = 80ms). */
93
+ minSpeechFrame?: number;
94
+ /** Force-split when SPEECH-state count hits this (default: 2000 = 20s). */
95
+ maxSpeechFrame?: number;
96
+ /** Min continuous silence frames to confirm END (default: 20 = 200ms). */
97
+ minSilenceFrame?: number;
98
+ }
99
+ /**
100
+ * Chunk packing strategy. Both modes honor `maxChunkSecs` and `maxGapSecs` as
101
+ * hard constraints — they only differ in WHERE the cut lands.
102
+ *
103
+ * - `"greedy"` — sequential append; cuts at the first point that violates
104
+ * a constraint. Recommended for **fixed-length-input ASR** like Whisper /
105
+ * whisperX (which pad to 30s anyway).
106
+ * - `"longest_gap"` — recursive split at the longest internal pause until
107
+ * every chunk satisfies both constraints. Falls back to equal hard-split
108
+ * when a single segment exceeds `maxChunkSecs`. Recommended for
109
+ * **variable-length-input models** (forced alignment, TTS, encoder-style
110
+ * ASR) — splits at natural pauses, no fixed-length padding required.
111
+ * **NOTE: This is NOT how WhisperX packs chunks** — WhisperX uses greedy
112
+ * packing (`Binarize(max_duration=...)` + sequential append). For
113
+ * WhisperX-equivalent behavior pass `mode: "greedy"` (the default).
114
+ */
115
+ type ChunkMode$1 = "greedy" | "longest_gap";
116
+ /**
117
+ * Configuration for {@link mergeChunks}. Mirrors C struct OmniChunkConfig.
118
+ * All fields are optional in the public API; defaults match
119
+ * {@link DEFAULT_CHUNK_CONFIG}.
120
+ */
121
+ interface ChunkOptions {
122
+ /** Hard upper bound on chunk duration in seconds. Must be > 0. Default: 30. */
123
+ maxChunkSecs?: number;
124
+ /** Split if the gap between adjacent segments exceeds this. Pass `Infinity`
125
+ * to disable. Default: `Infinity`. Honored by both modes. */
126
+ maxGapSecs?: number;
127
+ /** Extend each chunk start backward by this many seconds (clamped to >= 0).
128
+ * Default: 0.04. */
129
+ padOnsetSecs?: number;
130
+ /** Extend each chunk end forward by this many seconds. Default: 0.04. */
131
+ padOffsetSecs?: number;
132
+ /** Drop input segments shorter than this many seconds. Default: 0.0.
133
+ * Pairs with VAD `minSpeechFrames` (frame-domain equivalent). */
134
+ minSpeechSecs?: number;
135
+ /** Pre-merge consecutive segments whose silence gap is shorter than this.
136
+ * Default: 0.20 (matches VAD `minSilenceFrames=20` @ 10ms frame shift). */
137
+ minSilenceSecs?: number;
138
+ /** Packing strategy. Default: `"greedy"`. */
139
+ mode?: ChunkMode$1;
140
+ }
141
+ /** A single chunk emitted by {@link mergeChunks}. */
142
+ interface ChunkResult {
143
+ /** Chunk start time (seconds), with `padOnsetSecs` applied (clamped to >= 0). */
144
+ start: number;
145
+ /** Chunk end time (seconds), with `padOffsetSecs` applied. */
146
+ end: number;
147
+ /** Index of the first input segment included in this chunk. Refers to the
148
+ * *post-filter* segment list — segments dropped by `minSpeechSecs` and
149
+ * pre-merged by `minSilenceSecs` are not counted. */
150
+ segStartIdx: number;
151
+ /** Number of input segments included in this chunk. */
152
+ segCount: number;
80
153
  }
81
154
 
82
155
  /**
@@ -113,8 +186,6 @@ declare class OmniVAD {
113
186
 
114
187
  declare class OmniStreamVAD {
115
188
  private handle;
116
- private inSpeech;
117
- private speechStartFrame;
118
189
  private constructor();
119
190
  /**
120
191
  * Create a new OmniStreamVAD instance.
@@ -131,6 +202,10 @@ declare class OmniStreamVAD {
131
202
  /**
132
203
  * Process one frame of audio (160 int16 samples = 10ms @ 16kHz).
133
204
  * Returns null until enough audio is accumulated.
205
+ *
206
+ * Segment-boundary events (isSpeechStart / isSpeechEnd and the matching
207
+ * speechStartFrame / speechEndFrame indices) come straight from the C-layer state machine
208
+ * (bit-identical to upstream FireRedVAD) — the wrapper is just a marshaller.
134
209
  */
135
210
  processFrame(pcm160: Int16Array): StreamVADFrameResult | null;
136
211
  /**
@@ -138,7 +213,7 @@ declare class OmniStreamVAD {
138
213
  * @param audio - Float32Array in [-1, 1] or Int16Array of 16kHz mono PCM
139
214
  */
140
215
  detectFull(audio: Float32Array | Int16Array): StreamVADFullResult;
141
- /** Reset all internal state. */
216
+ /** Reset all internal state (model cache, audio buffer, postprocessor). */
142
217
  reset(): void;
143
218
  /** Release native resources. */
144
219
  dispose(): void;
@@ -175,9 +250,9 @@ declare class OmniAED {
175
250
  */
176
251
  type EmscriptenModule = any;
177
252
  /** Package version — used to construct default CDN URLs. */
178
- declare const VERSION = "0.2.5";
253
+ declare const VERSION = "0.2.8";
179
254
  /** Default CDN base for model files (jsDelivr serves npm package contents). */
180
- declare const DEFAULT_CDN_BASE = "https://cdn.jsdelivr.net/npm/omnivad@0.2.5/models";
255
+ declare const DEFAULT_CDN_BASE = "https://cdn.jsdelivr.net/npm/omnivad@0.2.8/models";
181
256
  /** Model filenames keyed by type. */
182
257
  declare const MODEL_FILES: {
183
258
  readonly vad: "vad.omnivad";
@@ -200,5 +275,63 @@ declare function initWasm(wasmLocator?: (filename: string) => string): Promise<E
200
275
  * 4. Browser — fetch from jsDelivr CDN
201
276
  */
202
277
  declare function loadModel(modelType: ModelType, modelUrl?: string | URL, modelData?: ArrayBuffer): Promise<ArrayBuffer>;
278
+ /**
279
+ * Chunking strategy:
280
+ * - "greedy" — sequential append. Recommended for fixed-length-input ASR
281
+ * (Whisper / whisperX, which pad to 30s anyway).
282
+ * - "longest_gap" — recursive split at longest pause; falls back to hard-split
283
+ * when a single segment exceeds maxChunkSecs. Recommended for
284
+ * variable-length-input models (forced alignment, TTS,
285
+ * encoder-style ASR); no fixed-length padding required.
286
+ */
287
+ type ChunkMode = "greedy" | "longest_gap";
288
+ /** Configuration for omni_merge_chunks (matches C struct OmniChunkConfig, 28 bytes) */
289
+ interface ChunkConfig {
290
+ maxChunkSecs: number;
291
+ maxGapSecs: number;
292
+ padOnsetSecs: number;
293
+ padOffsetSecs: number;
294
+ minSpeechSecs: number;
295
+ minSilenceSecs: number;
296
+ mode: ChunkMode;
297
+ }
298
+ /**
299
+ * Default chunk config. Mirrors C-side omni_chunk_config_default(); kept in
300
+ * TS so callers don't need a roundtrip into WASM just to read defaults.
301
+ *
302
+ * Defaults: max_chunk_secs matches Whisper's 30s input window.
303
+ */
304
+ declare const DEFAULT_CHUNK_CONFIG: ChunkConfig;
305
+
306
+ /**
307
+ * Pure-algorithm chunking utility — wraps the C function omni_merge_chunks
308
+ * compiled into the WASM module.
309
+ *
310
+ * WhisperX-style binarize+merge, minus the binarize half because OmniVAD
311
+ * already returns binarized timestamps.
312
+ *
313
+ * Usage:
314
+ *
315
+ * import { mergeChunks } from "omnivad";
316
+ *
317
+ * const chunks = await mergeChunks(
318
+ * [[0.0, 5.0], [6.0, 10.0]],
319
+ * { maxChunkSecs: 30.0, maxGapSecs: 2.0 }
320
+ * );
321
+ * // ≈ [{ start: 0, end: 10.04, segStartIdx: 0, segCount: 2 }]  (end includes the default padOffsetSecs of 0.04 s; start stays clamped at 0)
322
+ */
323
+
324
+ /**
325
+ * Merge a sorted array of [start, end] speech segments into duration-bounded
326
+ * chunks.
327
+ *
328
+ * Lazily initializes the WASM module on first call (so the caller doesn't have
329
+ * to await `initWasm()` separately). Subsequent calls reuse the cached module.
330
+ *
331
+ * @param segments array of [start, end] pairs in seconds, sorted by start
332
+ * @param options chunking configuration; missing fields fall back to
333
+ * {@link DEFAULT_CHUNK_CONFIG}
334
+ */
335
+ declare function mergeChunks(segments: Array<[number, number]>, options?: ChunkOptions): Promise<ChunkResult[]>;
203
336
 
204
- export { type AEDConfig, type AEDResult, DEFAULT_CDN_BASE, OmniAED as FireRedAED, OmniStreamVAD as FireRedStreamVAD, OmniVAD as FireRedVAD, MODEL_FILES, type ModelSource, OmniAED, OmniStreamVAD, OmniVAD, type StreamVADConfig, type StreamVADFrameResult, type StreamVADFullResult, type VADConfig, type VADResult, VERSION, initWasm, loadModel };
337
+ export { type AEDConfig, type AEDResult, type ChunkOptions, type ChunkResult, DEFAULT_CDN_BASE, DEFAULT_CHUNK_CONFIG, OmniAED as FireRedAED, OmniStreamVAD as FireRedStreamVAD, OmniVAD as FireRedVAD, MODEL_FILES, type ModelSource, OmniAED, OmniStreamVAD, OmniVAD, type StreamVADConfig, type StreamVADFrameResult, type StreamVADFullResult, type VADConfig, type VADResult, VERSION, initWasm, loadModel, mergeChunks };
package/dist/index.js CHANGED
@@ -1,12 +1,41 @@
1
1
  // src/wasm-binding.ts
2
2
  var _module = null;
3
3
  var _loading = null;
4
+ function loadScript(url) {
5
+ if (typeof globalThis.document === "undefined") {
6
+ return new Promise((resolve, reject) => {
7
+ try {
8
+ const importScripts = globalThis.importScripts;
9
+ if (typeof importScripts !== "function") {
10
+ throw new Error(
11
+ "omnivad: cannot load glue script \u2014 no document and no importScripts"
12
+ );
13
+ }
14
+ importScripts(url);
15
+ resolve();
16
+ } catch (err) {
17
+ reject(err instanceof Error ? err : new Error(String(err)));
18
+ }
19
+ });
20
+ }
21
+ return new Promise((resolve, reject) => {
22
+ const s = globalThis.document.createElement("script");
23
+ s.src = url;
24
+ s.async = true;
25
+ s.crossOrigin = "anonymous";
26
+ s.onload = () => resolve();
27
+ s.onerror = () => reject(new Error(`Failed to load omnivad glue script: ${url}`));
28
+ globalThis.document.head.appendChild(s);
29
+ });
30
+ }
4
31
  var SIZEOF_POST_CONFIG = 28;
5
32
  var SIZEOF_AED_POST_CONFIG = 3 * SIZEOF_POST_CONFIG;
6
33
  var SIZEOF_SEGMENT = 8;
7
34
  var SIZEOF_AED_SEGMENT = 16;
35
+ var SIZEOF_CHUNK_CONFIG = 28;
36
+ var SIZEOF_CHUNK = 16;
8
37
  var OMNI_ERR_NO_FRAMES = -7;
9
- var VERSION = "0.2.5";
38
+ var VERSION = "0.2.8";
10
39
  var DEFAULT_CDN_BASE = `https://cdn.jsdelivr.net/npm/omnivad@${VERSION}/models`;
11
40
  var MODEL_FILES = {
12
41
  vad: "vad.omnivad",
@@ -22,22 +51,41 @@ async function initWasm(wasmLocator) {
22
51
  if (typeof globalThis.process?.versions?.node === "string") {
23
52
  const { createRequire } = await import(
24
53
  /* webpackIgnore: true */
54
+ /* turbopackIgnore: true */
25
55
  'module'
26
56
  );
27
- const { dirname, join } = await import('path');
57
+ const { dirname, join } = await import(
58
+ /* webpackIgnore: true */
59
+ /* turbopackIgnore: true */
60
+ 'path'
61
+ );
28
62
  const req = createRequire(import.meta.url);
29
63
  const gluePath = req.resolve("../dist/wasm/omnivad.cjs");
30
64
  const wasmDir = dirname(gluePath);
31
65
  createOmniVAD = req(gluePath);
32
66
  defaultLocateFile = (filename) => join(wasmDir, filename);
33
67
  } else {
34
- const glueUrl = new URL("../dist/wasm/omnivad.js", import.meta.url);
35
- const mod = await import(
36
- /* webpackIgnore: true */
37
- glueUrl.href
38
- );
39
- createOmniVAD = mod.default || mod;
40
- const wasmBaseUrl = new URL("./", glueUrl);
68
+ let glueUrlStr;
69
+ if (wasmLocator) {
70
+ glueUrlStr = wasmLocator("omnivad.js");
71
+ } else {
72
+ glueUrlStr = new URL("../dist/wasm/omnivad.js", import.meta.url).href;
73
+ }
74
+ const g = globalThis;
75
+ let factory = g.createOmniVAD;
76
+ if (typeof factory !== "function") {
77
+ await loadScript(glueUrlStr);
78
+ factory = g.createOmniVAD;
79
+ }
80
+ if (typeof factory !== "function") {
81
+ throw new Error(
82
+ `omnivad.js loaded from ${glueUrlStr} but globalThis.createOmniVAD is missing`
83
+ );
84
+ }
85
+ createOmniVAD = factory;
86
+ const baseHref = typeof globalThis.location !== "undefined" ? globalThis.location.href : "file:///";
87
+ const absGlue = new URL(glueUrlStr, baseHref);
88
+ const wasmBaseUrl = new URL("./", absGlue);
41
89
  defaultLocateFile = (filename) => new URL(filename, wasmBaseUrl).toString();
42
90
  }
43
91
  const opts = {};
@@ -61,10 +109,19 @@ async function loadModel(modelType, modelUrl, modelData) {
61
109
  if (typeof globalThis.process?.versions?.node === "string") {
62
110
  const { createRequire } = await import(
63
111
  /* webpackIgnore: true */
112
+ /* turbopackIgnore: true */
64
113
  'module'
65
114
  );
66
- const { dirname, join } = await import('path');
67
- const { readFile } = await import('fs/promises');
115
+ const { dirname, join } = await import(
116
+ /* webpackIgnore: true */
117
+ /* turbopackIgnore: true */
118
+ 'path'
119
+ );
120
+ const { readFile } = await import(
121
+ /* webpackIgnore: true */
122
+ /* turbopackIgnore: true */
123
+ 'fs/promises'
124
+ );
68
125
  const req = createRequire(import.meta.url);
69
126
  const pkgDir = dirname(req.resolve("../package.json"));
70
127
  const modelPath = join(pkgDir, "models", filename);
@@ -117,10 +174,86 @@ var DEFAULT_VAD_CONFIG = {
117
174
  smoothWindowSize: 5,
118
175
  minSpeechFrames: 20,
119
176
  minSilenceFrames: 20,
120
- maxSpeechFrames: 2e3,
177
+ maxSpeechFrames: 3e3,
121
178
  mergeSilenceFrames: 0,
122
179
  extendSpeechFrames: 0
123
180
  };
181
+ var OMNI_CHUNK_GREEDY = 0;
182
+ var OMNI_CHUNK_LONGEST_GAP = 1;
183
+ var DEFAULT_CHUNK_CONFIG = {
184
+ maxChunkSecs: 30,
185
+ maxGapSecs: Infinity,
186
+ padOnsetSecs: 0.04,
187
+ padOffsetSecs: 0.04,
188
+ minSpeechSecs: 0,
189
+ minSilenceSecs: 0.2,
190
+ // matches VAD minSilenceFrames=20 @ 10ms shift
191
+ mode: "greedy"
192
+ };
193
+ function modeToInt(m) {
194
+ switch (m) {
195
+ case "greedy":
196
+ return OMNI_CHUNK_GREEDY;
197
+ case "longest_gap":
198
+ return OMNI_CHUNK_LONGEST_GAP;
199
+ default:
200
+ throw new Error(`Unknown chunking mode: ${String(m)}`);
201
+ }
202
+ }
203
+ function writeChunkConfig(M, ptr, cfg) {
204
+ M.setValue(ptr + 0, cfg.maxChunkSecs, "float");
205
+ M.setValue(ptr + 4, cfg.maxGapSecs, "float");
206
+ M.setValue(ptr + 8, cfg.padOnsetSecs, "float");
207
+ M.setValue(ptr + 12, cfg.padOffsetSecs, "float");
208
+ M.setValue(ptr + 16, cfg.minSpeechSecs, "float");
209
+ M.setValue(ptr + 20, cfg.minSilenceSecs, "float");
210
+ M.setValue(ptr + 24, modeToInt(cfg.mode), "i32");
211
+ }
212
+ function chunkMerge(M, segments, config) {
213
+ const numSegments = segments.length;
214
+ const segPtr = numSegments > 0 ? M._malloc(numSegments * SIZEOF_SEGMENT) : 0;
215
+ const cfgPtr = M._malloc(SIZEOF_CHUNK_CONFIG);
216
+ const outPtrPtr = M._malloc(4);
217
+ const outCountPtr = M._malloc(4);
218
+ try {
219
+ for (let i = 0; i < numSegments; i++) {
220
+ const base = segPtr + i * SIZEOF_SEGMENT;
221
+ M.setValue(base + 0, segments[i][0], "float");
222
+ M.setValue(base + 4, segments[i][1], "float");
223
+ }
224
+ writeChunkConfig(M, cfgPtr, config);
225
+ M.setValue(outPtrPtr, 0, "i32");
226
+ M.setValue(outCountPtr, 0, "i32");
227
+ const rc = M.ccall(
228
+ "omni_merge_chunks",
229
+ "number",
230
+ ["number", "number", "number", "number", "number"],
231
+ [segPtr, numSegments, cfgPtr, outPtrPtr, outCountPtr]
232
+ );
233
+ if (rc !== 0) {
234
+ throw new Error(`omni_merge_chunks failed: ${readNativeError(M, rc)}`);
235
+ }
236
+ const count = M.getValue(outCountPtr, "i32");
237
+ const chunkPtr = M.getValue(outPtrPtr, "i32");
238
+ const chunks = [];
239
+ for (let i = 0; i < count; i++) {
240
+ const base = chunkPtr + i * SIZEOF_CHUNK;
241
+ chunks.push({
242
+ start: M.getValue(base + 0, "float"),
243
+ end: M.getValue(base + 4, "float"),
244
+ segStartIdx: M.getValue(base + 8, "i32"),
245
+ segCount: M.getValue(base + 12, "i32")
246
+ });
247
+ }
248
+ if (chunkPtr) M._free(chunkPtr);
249
+ return chunks;
250
+ } finally {
251
+ if (segPtr) M._free(segPtr);
252
+ M._free(cfgPtr);
253
+ M._free(outPtrPtr);
254
+ M._free(outCountPtr);
255
+ }
256
+ }
124
257
  function vadCreate(M, modelBuffer) {
125
258
  const bytes = new Uint8Array(modelBuffer);
126
259
  const ptr = M._malloc(bytes.length);
@@ -225,24 +358,49 @@ function aedDetect(M, handle, audioPtr, numSamples, cfg, format = "f32") {
225
358
  function aedDestroy(M, handle) {
226
359
  M.ccall("omni_aed_destroy", null, ["number"], [handle]);
227
360
  }
228
- function streamVadCreate(M, modelBuffer, threshold = 0.5) {
361
+ var DEFAULT_STREAM_VAD_CONFIG = {
362
+ threshold: 0.5,
363
+ smoothWindowSize: 5,
364
+ padStartFrame: 5,
365
+ minSpeechFrame: 8,
366
+ maxSpeechFrame: 2e3,
367
+ minSilenceFrame: 20
368
+ };
369
+ var SIZEOF_STREAM_VAD_CONFIG = 24;
370
+ function writeStreamVadConfig(M, ptr, cfg) {
371
+ M.setValue(ptr + 0, cfg.threshold, "float");
372
+ M.setValue(ptr + 4, cfg.smoothWindowSize, "i32");
373
+ M.setValue(ptr + 8, cfg.padStartFrame, "i32");
374
+ M.setValue(ptr + 12, cfg.minSpeechFrame, "i32");
375
+ M.setValue(ptr + 16, cfg.maxSpeechFrame, "i32");
376
+ M.setValue(ptr + 20, cfg.minSilenceFrame, "i32");
377
+ }
378
+ function streamVadCreate(M, modelBuffer, config = {}) {
379
+ const overrides = Object.fromEntries(
380
+ Object.entries(config).filter(([, v]) => v !== void 0)
381
+ );
382
+ const cfg = { ...DEFAULT_STREAM_VAD_CONFIG, ...overrides };
229
383
  const bytes = new Uint8Array(modelBuffer);
230
- const ptr = M._malloc(bytes.length);
231
- M.HEAPU8.set(bytes, ptr);
384
+ const dataPtr = M._malloc(bytes.length);
385
+ M.HEAPU8.set(bytes, dataPtr);
386
+ const cfgPtr = M._malloc(SIZEOF_STREAM_VAD_CONFIG);
232
387
  try {
388
+ writeStreamVadConfig(M, cfgPtr, cfg);
233
389
  return createModel(
234
390
  M,
235
391
  "omni_stream_vad_create_from_buffer",
236
392
  ["number", "number", "number"],
237
- [ptr, bytes.length, threshold],
393
+ [dataPtr, bytes.length, cfgPtr],
238
394
  "StreamVAD"
239
395
  );
240
396
  } finally {
241
- M._free(ptr);
397
+ M._free(dataPtr);
398
+ M._free(cfgPtr);
242
399
  }
243
400
  }
401
+ var SIZEOF_STREAM_VAD_RESULT = 24;
244
402
  function streamVadProcess(M, handle, pcm16Ptr, numSamples) {
245
- const resultPtr = M._malloc(12);
403
+ const resultPtr = M._malloc(SIZEOF_STREAM_VAD_RESULT);
246
404
  try {
247
405
  const ret = M.ccall(
248
406
  "omni_stream_vad_process",
@@ -253,9 +411,14 @@ function streamVadProcess(M, handle, pcm16Ptr, numSamples) {
253
411
  if (ret === OMNI_ERR_NO_FRAMES) return null;
254
412
  if (ret !== 0) throw new Error(`StreamVAD process failed: ${ret}`);
255
413
  return {
256
- confidence: M.getValue(resultPtr, "float"),
257
- isSpeech: M.getValue(resultPtr + 4, "i8") !== 0,
258
- frameOffset: M.getValue(resultPtr + 8, "i32")
414
+ confidence: M.getValue(resultPtr + 0, "float"),
415
+ smoothedProb: M.getValue(resultPtr + 4, "float"),
416
+ isSpeech: M.getValue(resultPtr + 8, "i8") !== 0,
417
+ isSpeechStart: M.getValue(resultPtr + 9, "i8") !== 0,
418
+ isSpeechEnd: M.getValue(resultPtr + 10, "i8") !== 0,
419
+ frameIdx: M.getValue(resultPtr + 12, "i32"),
420
+ speechStartFrame: M.getValue(resultPtr + 16, "i32"),
421
+ speechEndFrame: M.getValue(resultPtr + 20, "i32")
259
422
  };
260
423
  } finally {
261
424
  M._free(resultPtr);
@@ -354,8 +517,6 @@ function int16ToNormalizedFloat32(i16) {
354
517
  var SAMPLE_RATE2 = 16e3;
355
518
  var OmniStreamVAD = class _OmniStreamVAD {
356
519
  constructor(handle) {
357
- this.inSpeech = false;
358
- this.speechStartFrame = 0;
359
520
  this.handle = handle;
360
521
  }
361
522
  /**
@@ -366,8 +527,14 @@ var OmniStreamVAD = class _OmniStreamVAD {
366
527
  await initWasm();
367
528
  const M = getModule();
368
529
  const modelBuffer = await loadModel("stream-vad", options.modelUrl, options.modelData);
369
- const threshold = options.speechThreshold ?? 0.5;
370
- const handle = streamVadCreate(M, modelBuffer, threshold);
530
+ const handle = streamVadCreate(M, modelBuffer, {
531
+ threshold: options.threshold,
532
+ smoothWindowSize: options.smoothWindowSize,
533
+ padStartFrame: options.padStartFrame,
534
+ minSpeechFrame: options.minSpeechFrame,
535
+ maxSpeechFrame: options.maxSpeechFrame,
536
+ minSilenceFrame: options.minSilenceFrame
537
+ });
371
538
  return new _OmniStreamVAD(handle);
372
539
  }
373
540
  /**
@@ -385,6 +552,10 @@ var OmniStreamVAD = class _OmniStreamVAD {
385
552
  /**
386
553
  * Process one frame of audio (160 int16 samples = 10ms @ 16kHz).
387
554
  * Returns null until enough audio is accumulated.
555
+ *
556
+ * Segment-boundary events (isSpeechStart / isSpeechEnd and the matching
557
+ * speechStartFrame / speechEndFrame indices) come straight from the C-layer state machine
558
+ * (bit-identical to upstream FireRedVAD) — the wrapper is just a marshaller.
388
559
  */
389
560
  processFrame(pcm160) {
390
561
  const M = getModule();
@@ -393,28 +564,16 @@ var OmniStreamVAD = class _OmniStreamVAD {
393
564
  heap16.set(pcm160);
394
565
  try {
395
566
  const result = streamVadProcess(M, this.handle, ptr, pcm160.length);
396
- if (!result || result.frameOffset === 0) return null;
397
- const frameIndex = result.frameOffset;
398
- const isSpeechStart = result.isSpeech && !this.inSpeech;
399
- const isSpeechEnd = !result.isSpeech && this.inSpeech;
400
- if (isSpeechStart) {
401
- this.speechStartFrame = frameIndex;
402
- }
403
- const activeSpeechStartFrame = isSpeechEnd ? this.speechStartFrame : result.isSpeech ? this.speechStartFrame : 0;
404
- const speechEndFrame = isSpeechEnd ? Math.max(1, frameIndex - 1) : 0;
405
- this.inSpeech = result.isSpeech;
406
- if (isSpeechEnd) {
407
- this.speechStartFrame = 0;
408
- }
567
+ if (!result) return null;
409
568
  return {
410
569
  confidence: result.confidence,
411
- smoothedConfidence: result.confidence,
570
+ smoothedProb: result.smoothedProb,
412
571
  isSpeech: result.isSpeech,
413
- frameIndex,
414
- isSpeechStart,
415
- isSpeechEnd,
416
- speechStartFrame: activeSpeechStartFrame,
417
- speechEndFrame
572
+ frameIndex: result.frameIdx,
573
+ isSpeechStart: result.isSpeechStart,
574
+ isSpeechEnd: result.isSpeechEnd,
575
+ speechStartFrame: result.speechStartFrame,
576
+ speechEndFrame: result.speechEndFrame
418
577
  };
419
578
  } finally {
420
579
  M._free(ptr);
@@ -453,11 +612,9 @@ var OmniStreamVAD = class _OmniStreamVAD {
453
612
  M._free(framesPtr);
454
613
  }
455
614
  }
456
- /** Reset all internal state. */
615
+ /** Reset all internal state (model cache, audio buffer, postprocessor). */
457
616
  reset() {
458
617
  streamVadReset(getModule(), this.handle);
459
- this.inSpeech = false;
460
- this.speechStartFrame = 0;
461
618
  }
462
619
  /** Release native resources. */
463
620
  dispose() {
@@ -465,8 +622,6 @@ var OmniStreamVAD = class _OmniStreamVAD {
465
622
  streamVadDestroy(getModule(), this.handle);
466
623
  this.handle = 0;
467
624
  }
468
- this.inSpeech = false;
469
- this.speechStartFrame = 0;
470
625
  }
471
626
  };
472
627
  function int16ToFloat32(i16) {
@@ -580,6 +735,28 @@ function computeCoverageRatios(events, duration) {
580
735
  return ratios;
581
736
  }
582
737
 
583
- export { DEFAULT_CDN_BASE, OmniAED as FireRedAED, OmniStreamVAD as FireRedStreamVAD, OmniVAD as FireRedVAD, MODEL_FILES, OmniAED, OmniStreamVAD, OmniVAD, VERSION, initWasm, loadModel };
738
+ // src/chunking.ts
739
+ async function mergeChunks(segments, options = {}) {
740
+ await initWasm();
741
+ const M = getModule();
742
+ const cfg = {
743
+ maxChunkSecs: options.maxChunkSecs ?? DEFAULT_CHUNK_CONFIG.maxChunkSecs,
744
+ maxGapSecs: options.maxGapSecs ?? DEFAULT_CHUNK_CONFIG.maxGapSecs,
745
+ padOnsetSecs: options.padOnsetSecs ?? DEFAULT_CHUNK_CONFIG.padOnsetSecs,
746
+ padOffsetSecs: options.padOffsetSecs ?? DEFAULT_CHUNK_CONFIG.padOffsetSecs,
747
+ minSpeechSecs: options.minSpeechSecs ?? DEFAULT_CHUNK_CONFIG.minSpeechSecs,
748
+ minSilenceSecs: options.minSilenceSecs ?? DEFAULT_CHUNK_CONFIG.minSilenceSecs,
749
+ mode: options.mode ?? DEFAULT_CHUNK_CONFIG.mode
750
+ };
751
+ const records = chunkMerge(M, segments, cfg);
752
+ return records.map((r) => ({
753
+ start: r.start,
754
+ end: r.end,
755
+ segStartIdx: r.segStartIdx,
756
+ segCount: r.segCount
757
+ }));
758
+ }
759
+
760
+ export { DEFAULT_CDN_BASE, DEFAULT_CHUNK_CONFIG, OmniAED as FireRedAED, OmniStreamVAD as FireRedStreamVAD, OmniVAD as FireRedVAD, MODEL_FILES, OmniAED, OmniStreamVAD, OmniVAD, VERSION, initWasm, loadModel, mergeChunks };
584
761
  //# sourceMappingURL=index.js.map
585
762
  //# sourceMappingURL=index.js.map