omnivad 0.2.4 → 0.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +258 -49
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +159 -19
- package/dist/index.d.ts +159 -19
- package/dist/index.js +257 -50
- package/dist/index.js.map +1 -1
- package/dist/wasm/omnivad.cjs +1 -1
- package/dist/wasm/omnivad.js +1 -1
- package/dist/wasm/omnivad.wasm +0 -0
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -14,23 +14,27 @@ interface AEDResult {
|
|
|
14
14
|
/** Detected duration coverage ratio for each event type */
|
|
15
15
|
ratios: Record<string, number>;
|
|
16
16
|
}
|
|
17
|
-
/** Per-frame result from streaming VAD
|
|
17
|
+
/** Per-frame result from streaming VAD.
|
|
18
|
+
*
|
|
19
|
+
* Bit-identical to upstream FireRedVAD's StreamVadFrameResult: every
|
|
20
|
+
* successful processFrame() call carries both per-frame probabilities
|
|
21
|
+
* AND segment-boundary events (no external segmenter needed). */
|
|
18
22
|
interface StreamVADFrameResult {
|
|
19
|
-
/** Raw probability from model output */
|
|
23
|
+
/** Raw probability from model output [0, 1] */
|
|
20
24
|
confidence: number;
|
|
21
|
-
/**
|
|
22
|
-
|
|
23
|
-
/**
|
|
25
|
+
/** Causal moving-average of confidence (window = smoothWindowSize) */
|
|
26
|
+
smoothedProb: number;
|
|
27
|
+
/** smoothedProb >= threshold */
|
|
24
28
|
isSpeech: boolean;
|
|
25
29
|
/** 1-based frame index of the emitted frame */
|
|
26
30
|
frameIndex: number;
|
|
27
|
-
/** True
|
|
31
|
+
/** True on the frame that confirms a new SPEECH segment */
|
|
28
32
|
isSpeechStart: boolean;
|
|
29
|
-
/** True
|
|
33
|
+
/** True on the frame that confirms a SPEECH segment end */
|
|
30
34
|
isSpeechEnd: boolean;
|
|
31
|
-
/**
|
|
35
|
+
/** 1-based start frame of the segment when isSpeechStart, else -1 */
|
|
32
36
|
speechStartFrame: number;
|
|
33
|
-
/**
|
|
37
|
+
/** 1-based end frame of the segment when isSpeechEnd, else -1 */
|
|
34
38
|
speechEndFrame: number;
|
|
35
39
|
}
|
|
36
40
|
/** Full-audio streaming-model output */
|
|
@@ -57,7 +61,7 @@ interface VADConfig extends ModelSource {
|
|
|
57
61
|
smoothWindowSize?: number;
|
|
58
62
|
/** Minimum speech segment length in frames (default: 20) */
|
|
59
63
|
minSpeechFrames?: number;
|
|
60
|
-
/** Maximum speech segment length in frames before splitting (default:
|
|
64
|
+
/** Maximum speech segment length in frames before splitting (default: 3000 = 30s; matches Whisper) */
|
|
61
65
|
maxSpeechFrames?: number;
|
|
62
66
|
/** Minimum silence segment length in frames for state machine (default: 20) */
|
|
63
67
|
minSilenceFrames?: number;
|
|
@@ -73,10 +77,79 @@ interface AEDConfig extends VADConfig {
|
|
|
73
77
|
/** Music probability threshold (default: 0.5) */
|
|
74
78
|
musicThreshold?: number;
|
|
75
79
|
}
|
|
76
|
-
/** Configuration for streaming VAD
|
|
80
|
+
/** Configuration for streaming VAD.
|
|
81
|
+
*
|
|
82
|
+
* Bit-identical to upstream FireRedStreamVadConfig — every parameter
|
|
83
|
+
* has the same name (without the speech_ prefix) and the same default. */
|
|
77
84
|
interface StreamVADConfig extends ModelSource {
|
|
78
|
-
/** Speech
|
|
79
|
-
|
|
85
|
+
/** Speech activation threshold [0, 1] (default: 0.5). */
|
|
86
|
+
threshold?: number;
|
|
87
|
+
/** Causal moving-average window in frames (default: 5). */
|
|
88
|
+
smoothWindowSize?: number;
|
|
89
|
+
/** Extend confirmed segment START backward by N frames (default: 5;
|
|
90
|
+
* clamped to >= smoothWindowSize internally). */
|
|
91
|
+
padStartFrame?: number;
|
|
92
|
+
/** Min continuous speech frames to confirm START (default: 8 = 80ms). */
|
|
93
|
+
minSpeechFrame?: number;
|
|
94
|
+
/** Force-split when SPEECH-state count hits this (default: 2000 = 20s). */
|
|
95
|
+
maxSpeechFrame?: number;
|
|
96
|
+
/** Min continuous silence frames to confirm END (default: 20 = 200ms). */
|
|
97
|
+
minSilenceFrame?: number;
|
|
98
|
+
}
|
|
99
|
+
/**
|
|
100
|
+
* Chunk packing strategy. Both modes honor `maxChunkSecs` and `maxGapSecs` as
|
|
101
|
+
* hard constraints — they only differ in WHERE the cut lands.
|
|
102
|
+
*
|
|
103
|
+
* - `"greedy"` — sequential append; cuts at the first point that violates
|
|
104
|
+
* a constraint. Recommended for **fixed-length-input ASR** like Whisper /
|
|
105
|
+
* WhisperX (which pad to 30s anyway).
|
|
106
|
+
* - `"longest_gap"` — recursive split at the longest internal pause until
|
|
107
|
+
* every chunk satisfies both constraints. Falls back to equal hard-split
|
|
108
|
+
* when a single segment exceeds `maxChunkSecs`. Recommended for
|
|
109
|
+
* **variable-length-input models** (forced alignment, TTS, encoder-style
|
|
110
|
+
* ASR) — splits at natural pauses, no fixed-length padding required.
|
|
111
|
+
* **NOTE: This is NOT how WhisperX packs chunks** — WhisperX uses greedy
|
|
112
|
+
* packing (`Binarize(max_duration=...)` + sequential append). For
|
|
113
|
+
* WhisperX-equivalent behavior pass `mode: "greedy"` (the default).
|
|
114
|
+
*/
|
|
115
|
+
type ChunkMode$1 = "greedy" | "longest_gap";
|
|
116
|
+
/**
|
|
117
|
+
* Configuration for {@link mergeChunks}. Mirrors C struct OmniChunkConfig.
|
|
118
|
+
* All fields are optional in the public API; defaults match
|
|
119
|
+
* {@link DEFAULT_CHUNK_CONFIG}.
|
|
120
|
+
*/
|
|
121
|
+
interface ChunkOptions {
|
|
122
|
+
/** Hard upper bound on chunk duration in seconds. Must be > 0. Default: 30. */
|
|
123
|
+
maxChunkSecs?: number;
|
|
124
|
+
/** Split if the gap between adjacent segments exceeds this. Pass `Infinity`
|
|
125
|
+
* to disable. Default: `Infinity`. Honored by both modes. */
|
|
126
|
+
maxGapSecs?: number;
|
|
127
|
+
/** Extend each chunk start backward by this many seconds (the resulting start time is clamped to >= 0).
|
|
128
|
+
* Default: 0.04. */
|
|
129
|
+
padOnsetSecs?: number;
|
|
130
|
+
/** Extend each chunk end forward by this many seconds. Default: 0.04. */
|
|
131
|
+
padOffsetSecs?: number;
|
|
132
|
+
/** Drop input segments shorter than this many seconds. Default: 0.0.
|
|
133
|
+
* Pairs with VAD `minSpeechFrames` (frame-domain equivalent). */
|
|
134
|
+
minSpeechSecs?: number;
|
|
135
|
+
/** Pre-merge consecutive segments whose silence gap is shorter than this.
|
|
136
|
+
* Default: 0.20 (matches VAD `minSilenceFrames=20` @ 10ms frame shift). */
|
|
137
|
+
minSilenceSecs?: number;
|
|
138
|
+
/** Packing strategy. Default: `"greedy"`. */
|
|
139
|
+
mode?: ChunkMode$1;
|
|
140
|
+
}
|
|
141
|
+
/** A single chunk emitted by {@link mergeChunks}. */
|
|
142
|
+
interface ChunkResult {
|
|
143
|
+
/** Chunk start time (seconds), with `padOnsetSecs` applied (clamped to >= 0). */
|
|
144
|
+
start: number;
|
|
145
|
+
/** Chunk end time (seconds), with `padOffsetSecs` applied. */
|
|
146
|
+
end: number;
|
|
147
|
+
/** Index of the first input segment included in this chunk. Refers to the
|
|
148
|
+
* *post-filter* segment list — segments dropped by `minSpeechSecs` and
|
|
149
|
+
* pre-merged by `minSilenceSecs` are not counted. */
|
|
150
|
+
segStartIdx: number;
|
|
151
|
+
/** Number of input segments included in this chunk. */
|
|
152
|
+
segCount: number;
|
|
80
153
|
}
|
|
81
154
|
|
|
82
155
|
/**
|
|
@@ -113,17 +186,26 @@ declare class OmniVAD {
|
|
|
113
186
|
|
|
114
187
|
declare class OmniStreamVAD {
|
|
115
188
|
private handle;
|
|
116
|
-
private inSpeech;
|
|
117
|
-
private speechStartFrame;
|
|
118
189
|
private constructor();
|
|
119
190
|
/**
|
|
120
191
|
* Create a new OmniStreamVAD instance.
|
|
121
192
|
* Loads model from CDN (browser), local package (Node.js), or custom source.
|
|
122
193
|
*/
|
|
123
194
|
static create(options?: StreamVADConfig): Promise<OmniStreamVAD>;
|
|
195
|
+
/**
|
|
196
|
+
* Create a lightweight clone sharing the same underlying model weights.
|
|
197
|
+
* The clone has fresh per-instance state (empty audio buffer, zeroed cache).
|
|
198
|
+
* This is synchronous and extremely fast — ideal for multi-stream scenarios
|
|
199
|
+
* (e.g., handling multiple WebRTC tracks or concurrent audio sessions).
|
|
200
|
+
*/
|
|
201
|
+
clone(): OmniStreamVAD;
|
|
124
202
|
/**
|
|
125
203
|
* Process one frame of audio (160 int16 samples = 10ms @ 16kHz).
|
|
126
204
|
* Returns null until enough audio is accumulated.
|
|
205
|
+
*
|
|
206
|
+
* Segment-boundary events (isSpeechStart / isSpeechEnd and the matching
|
|
207
|
+
* speechStartFrame / speechEndFrame indices) come straight from the C-layer state machine
|
|
208
|
+
* (bit-identical to upstream FireRedVAD) — the wrapper is just a marshaller.
|
|
127
209
|
*/
|
|
128
210
|
processFrame(pcm160: Int16Array): StreamVADFrameResult | null;
|
|
129
211
|
/**
|
|
@@ -131,7 +213,7 @@ declare class OmniStreamVAD {
|
|
|
131
213
|
* @param audio - Float32Array in [-1, 1] or Int16Array of 16kHz mono PCM
|
|
132
214
|
*/
|
|
133
215
|
detectFull(audio: Float32Array | Int16Array): StreamVADFullResult;
|
|
134
|
-
/** Reset all internal state. */
|
|
216
|
+
/** Reset all internal state (model cache, audio buffer, postprocessor). */
|
|
135
217
|
reset(): void;
|
|
136
218
|
/** Release native resources. */
|
|
137
219
|
dispose(): void;
|
|
@@ -168,9 +250,9 @@ declare class OmniAED {
|
|
|
168
250
|
*/
|
|
169
251
|
type EmscriptenModule = any;
|
|
170
252
|
/** Package version — used to construct default CDN URLs. */
|
|
171
|
-
declare const VERSION = "0.2.
|
|
253
|
+
declare const VERSION = "0.2.8";
|
|
172
254
|
/** Default CDN base for model files (jsDelivr serves npm package contents). */
|
|
173
|
-
declare const DEFAULT_CDN_BASE = "https://cdn.jsdelivr.net/npm/omnivad@0.2.
|
|
255
|
+
declare const DEFAULT_CDN_BASE = "https://cdn.jsdelivr.net/npm/omnivad@0.2.8/models";
|
|
174
256
|
/** Model filenames keyed by type. */
|
|
175
257
|
declare const MODEL_FILES: {
|
|
176
258
|
readonly vad: "vad.omnivad";
|
|
@@ -193,5 +275,63 @@ declare function initWasm(wasmLocator?: (filename: string) => string): Promise<E
|
|
|
193
275
|
* 4. Browser — fetch from jsDelivr CDN
|
|
194
276
|
*/
|
|
195
277
|
declare function loadModel(modelType: ModelType, modelUrl?: string | URL, modelData?: ArrayBuffer): Promise<ArrayBuffer>;
|
|
278
|
+
/**
|
|
279
|
+
* Chunking strategy:
|
|
280
|
+
* - "greedy" — sequential append. Recommended for fixed-length-input ASR
|
|
281
|
+
* (Whisper / WhisperX, which pad to 30s anyway).
|
|
282
|
+
* - "longest_gap" — recursive split at longest pause; falls back to hard-split
|
|
283
|
+
* when a single segment exceeds maxChunkSecs. Recommended for
|
|
284
|
+
* variable-length-input models (forced alignment, TTS,
|
|
285
|
+
* encoder-style ASR); no fixed-length padding required.
|
|
286
|
+
*/
|
|
287
|
+
type ChunkMode = "greedy" | "longest_gap";
|
|
288
|
+
/** Configuration for omni_merge_chunks (matches C struct OmniChunkConfig, 28 bytes) */
|
|
289
|
+
interface ChunkConfig {
|
|
290
|
+
maxChunkSecs: number;
|
|
291
|
+
maxGapSecs: number;
|
|
292
|
+
padOnsetSecs: number;
|
|
293
|
+
padOffsetSecs: number;
|
|
294
|
+
minSpeechSecs: number;
|
|
295
|
+
minSilenceSecs: number;
|
|
296
|
+
mode: ChunkMode;
|
|
297
|
+
}
|
|
298
|
+
/**
|
|
299
|
+
* Default chunk config. Mirrors C-side omni_chunk_config_default(); kept in
|
|
300
|
+
* TS so callers don't need a roundtrip into WASM just to read defaults.
|
|
301
|
+
*
|
|
302
|
+
* Defaults: maxChunkSecs (30) matches Whisper's 30s input window.
|
|
303
|
+
*/
|
|
304
|
+
declare const DEFAULT_CHUNK_CONFIG: ChunkConfig;
|
|
305
|
+
|
|
306
|
+
/**
|
|
307
|
+
* Pure-algorithm chunking utility — wraps the C function omni_merge_chunks
|
|
308
|
+
* compiled into the WASM module.
|
|
309
|
+
*
|
|
310
|
+
* WhisperX-style binarize+merge, minus the binarize half because OmniVAD
|
|
311
|
+
* already returns binarized timestamps.
|
|
312
|
+
*
|
|
313
|
+
* Usage:
|
|
314
|
+
*
|
|
315
|
+
* import { mergeChunks } from "omnivad";
|
|
316
|
+
*
|
|
317
|
+
* const chunks = await mergeChunks(
|
|
318
|
+
* [[0.0, 5.0], [6.0, 10.0]],
|
|
319
|
+
* { maxChunkSecs: 30.0, maxGapSecs: 2.0 }
|
|
320
|
+
* );
|
|
321
|
+
* // ≈ [{ start: 0, end: 10.04, segStartIdx: 0, segCount: 2 }] (default 0.04 s padding: start clamps to 0, end grows by padOffsetSecs; values are float32)
|
|
322
|
+
*/
|
|
323
|
+
|
|
324
|
+
/**
|
|
325
|
+
* Merge a sorted array of [start, end] speech segments into duration-bounded
|
|
326
|
+
* chunks.
|
|
327
|
+
*
|
|
328
|
+
* Lazily initializes the WASM module on first call (so the caller doesn't have
|
|
329
|
+
* to await `initWasm()` separately). Subsequent calls reuse the cached module.
|
|
330
|
+
*
|
|
331
|
+
* @param segments array of [start, end] pairs in seconds, sorted by start
|
|
332
|
+
* @param options chunking configuration; missing fields fall back to
|
|
333
|
+
* {@link DEFAULT_CHUNK_CONFIG}
|
|
334
|
+
*/
|
|
335
|
+
declare function mergeChunks(segments: Array<[number, number]>, options?: ChunkOptions): Promise<ChunkResult[]>;
|
|
196
336
|
|
|
197
|
-
export { type AEDConfig, type AEDResult, DEFAULT_CDN_BASE, OmniAED as FireRedAED, OmniStreamVAD as FireRedStreamVAD, OmniVAD as FireRedVAD, MODEL_FILES, type ModelSource, OmniAED, OmniStreamVAD, OmniVAD, type StreamVADConfig, type StreamVADFrameResult, type StreamVADFullResult, type VADConfig, type VADResult, VERSION, initWasm, loadModel };
|
|
337
|
+
export { type AEDConfig, type AEDResult, type ChunkOptions, type ChunkResult, DEFAULT_CDN_BASE, DEFAULT_CHUNK_CONFIG, OmniAED as FireRedAED, OmniStreamVAD as FireRedStreamVAD, OmniVAD as FireRedVAD, MODEL_FILES, type ModelSource, OmniAED, OmniStreamVAD, OmniVAD, type StreamVADConfig, type StreamVADFrameResult, type StreamVADFullResult, type VADConfig, type VADResult, VERSION, initWasm, loadModel, mergeChunks };
|
package/dist/index.js
CHANGED
|
@@ -1,12 +1,41 @@
|
|
|
1
1
|
// src/wasm-binding.ts
|
|
2
2
|
var _module = null;
|
|
3
3
|
var _loading = null;
|
|
4
|
+
function loadScript(url) {
|
|
5
|
+
if (typeof globalThis.document === "undefined") {
|
|
6
|
+
return new Promise((resolve, reject) => {
|
|
7
|
+
try {
|
|
8
|
+
const importScripts = globalThis.importScripts;
|
|
9
|
+
if (typeof importScripts !== "function") {
|
|
10
|
+
throw new Error(
|
|
11
|
+
"omnivad: cannot load glue script \u2014 no document and no importScripts"
|
|
12
|
+
);
|
|
13
|
+
}
|
|
14
|
+
importScripts(url);
|
|
15
|
+
resolve();
|
|
16
|
+
} catch (err) {
|
|
17
|
+
reject(err instanceof Error ? err : new Error(String(err)));
|
|
18
|
+
}
|
|
19
|
+
});
|
|
20
|
+
}
|
|
21
|
+
return new Promise((resolve, reject) => {
|
|
22
|
+
const s = globalThis.document.createElement("script");
|
|
23
|
+
s.src = url;
|
|
24
|
+
s.async = true;
|
|
25
|
+
s.crossOrigin = "anonymous";
|
|
26
|
+
s.onload = () => resolve();
|
|
27
|
+
s.onerror = () => reject(new Error(`Failed to load omnivad glue script: ${url}`));
|
|
28
|
+
globalThis.document.head.appendChild(s);
|
|
29
|
+
});
|
|
30
|
+
}
|
|
4
31
|
var SIZEOF_POST_CONFIG = 28;
|
|
5
32
|
var SIZEOF_AED_POST_CONFIG = 3 * SIZEOF_POST_CONFIG;
|
|
6
33
|
var SIZEOF_SEGMENT = 8;
|
|
7
34
|
var SIZEOF_AED_SEGMENT = 16;
|
|
35
|
+
var SIZEOF_CHUNK_CONFIG = 28;
|
|
36
|
+
var SIZEOF_CHUNK = 16;
|
|
8
37
|
var OMNI_ERR_NO_FRAMES = -7;
|
|
9
|
-
var VERSION = "0.2.
|
|
38
|
+
var VERSION = "0.2.8";
|
|
10
39
|
var DEFAULT_CDN_BASE = `https://cdn.jsdelivr.net/npm/omnivad@${VERSION}/models`;
|
|
11
40
|
var MODEL_FILES = {
|
|
12
41
|
vad: "vad.omnivad",
|
|
@@ -22,22 +51,41 @@ async function initWasm(wasmLocator) {
|
|
|
22
51
|
if (typeof globalThis.process?.versions?.node === "string") {
|
|
23
52
|
const { createRequire } = await import(
|
|
24
53
|
/* webpackIgnore: true */
|
|
54
|
+
/* turbopackIgnore: true */
|
|
25
55
|
'module'
|
|
26
56
|
);
|
|
27
|
-
const { dirname, join } = await import(
|
|
57
|
+
const { dirname, join } = await import(
|
|
58
|
+
/* webpackIgnore: true */
|
|
59
|
+
/* turbopackIgnore: true */
|
|
60
|
+
'path'
|
|
61
|
+
);
|
|
28
62
|
const req = createRequire(import.meta.url);
|
|
29
63
|
const gluePath = req.resolve("../dist/wasm/omnivad.cjs");
|
|
30
64
|
const wasmDir = dirname(gluePath);
|
|
31
65
|
createOmniVAD = req(gluePath);
|
|
32
66
|
defaultLocateFile = (filename) => join(wasmDir, filename);
|
|
33
67
|
} else {
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
const
|
|
68
|
+
let glueUrlStr;
|
|
69
|
+
if (wasmLocator) {
|
|
70
|
+
glueUrlStr = wasmLocator("omnivad.js");
|
|
71
|
+
} else {
|
|
72
|
+
glueUrlStr = new URL("../dist/wasm/omnivad.js", import.meta.url).href;
|
|
73
|
+
}
|
|
74
|
+
const g = globalThis;
|
|
75
|
+
let factory = g.createOmniVAD;
|
|
76
|
+
if (typeof factory !== "function") {
|
|
77
|
+
await loadScript(glueUrlStr);
|
|
78
|
+
factory = g.createOmniVAD;
|
|
79
|
+
}
|
|
80
|
+
if (typeof factory !== "function") {
|
|
81
|
+
throw new Error(
|
|
82
|
+
`omnivad.js loaded from ${glueUrlStr} but globalThis.createOmniVAD is missing`
|
|
83
|
+
);
|
|
84
|
+
}
|
|
85
|
+
createOmniVAD = factory;
|
|
86
|
+
const baseHref = typeof globalThis.location !== "undefined" ? globalThis.location.href : "file:///";
|
|
87
|
+
const absGlue = new URL(glueUrlStr, baseHref);
|
|
88
|
+
const wasmBaseUrl = new URL("./", absGlue);
|
|
41
89
|
defaultLocateFile = (filename) => new URL(filename, wasmBaseUrl).toString();
|
|
42
90
|
}
|
|
43
91
|
const opts = {};
|
|
@@ -61,10 +109,19 @@ async function loadModel(modelType, modelUrl, modelData) {
|
|
|
61
109
|
if (typeof globalThis.process?.versions?.node === "string") {
|
|
62
110
|
const { createRequire } = await import(
|
|
63
111
|
/* webpackIgnore: true */
|
|
112
|
+
/* turbopackIgnore: true */
|
|
64
113
|
'module'
|
|
65
114
|
);
|
|
66
|
-
const { dirname, join } = await import(
|
|
67
|
-
|
|
115
|
+
const { dirname, join } = await import(
|
|
116
|
+
/* webpackIgnore: true */
|
|
117
|
+
/* turbopackIgnore: true */
|
|
118
|
+
'path'
|
|
119
|
+
);
|
|
120
|
+
const { readFile } = await import(
|
|
121
|
+
/* webpackIgnore: true */
|
|
122
|
+
/* turbopackIgnore: true */
|
|
123
|
+
'fs/promises'
|
|
124
|
+
);
|
|
68
125
|
const req = createRequire(import.meta.url);
|
|
69
126
|
const pkgDir = dirname(req.resolve("../package.json"));
|
|
70
127
|
const modelPath = join(pkgDir, "models", filename);
|
|
@@ -117,10 +174,86 @@ var DEFAULT_VAD_CONFIG = {
|
|
|
117
174
|
smoothWindowSize: 5,
|
|
118
175
|
minSpeechFrames: 20,
|
|
119
176
|
minSilenceFrames: 20,
|
|
120
|
-
maxSpeechFrames:
|
|
177
|
+
maxSpeechFrames: 3e3,
|
|
121
178
|
mergeSilenceFrames: 0,
|
|
122
179
|
extendSpeechFrames: 0
|
|
123
180
|
};
|
|
181
|
+
var OMNI_CHUNK_GREEDY = 0;
|
|
182
|
+
var OMNI_CHUNK_LONGEST_GAP = 1;
|
|
183
|
+
var DEFAULT_CHUNK_CONFIG = {
|
|
184
|
+
maxChunkSecs: 30,
|
|
185
|
+
maxGapSecs: Infinity,
|
|
186
|
+
padOnsetSecs: 0.04,
|
|
187
|
+
padOffsetSecs: 0.04,
|
|
188
|
+
minSpeechSecs: 0,
|
|
189
|
+
minSilenceSecs: 0.2,
|
|
190
|
+
// matches VAD minSilenceFrames=20 @ 10ms shift
|
|
191
|
+
mode: "greedy"
|
|
192
|
+
};
|
|
193
|
+
function modeToInt(m) {
|
|
194
|
+
switch (m) {
|
|
195
|
+
case "greedy":
|
|
196
|
+
return OMNI_CHUNK_GREEDY;
|
|
197
|
+
case "longest_gap":
|
|
198
|
+
return OMNI_CHUNK_LONGEST_GAP;
|
|
199
|
+
default:
|
|
200
|
+
throw new Error(`Unknown chunking mode: ${String(m)}`);
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
function writeChunkConfig(M, ptr, cfg) {
|
|
204
|
+
M.setValue(ptr + 0, cfg.maxChunkSecs, "float");
|
|
205
|
+
M.setValue(ptr + 4, cfg.maxGapSecs, "float");
|
|
206
|
+
M.setValue(ptr + 8, cfg.padOnsetSecs, "float");
|
|
207
|
+
M.setValue(ptr + 12, cfg.padOffsetSecs, "float");
|
|
208
|
+
M.setValue(ptr + 16, cfg.minSpeechSecs, "float");
|
|
209
|
+
M.setValue(ptr + 20, cfg.minSilenceSecs, "float");
|
|
210
|
+
M.setValue(ptr + 24, modeToInt(cfg.mode), "i32");
|
|
211
|
+
}
|
|
212
|
+
function chunkMerge(M, segments, config) {
|
|
213
|
+
const numSegments = segments.length;
|
|
214
|
+
const segPtr = numSegments > 0 ? M._malloc(numSegments * SIZEOF_SEGMENT) : 0;
|
|
215
|
+
const cfgPtr = M._malloc(SIZEOF_CHUNK_CONFIG);
|
|
216
|
+
const outPtrPtr = M._malloc(4);
|
|
217
|
+
const outCountPtr = M._malloc(4);
|
|
218
|
+
try {
|
|
219
|
+
for (let i = 0; i < numSegments; i++) {
|
|
220
|
+
const base = segPtr + i * SIZEOF_SEGMENT;
|
|
221
|
+
M.setValue(base + 0, segments[i][0], "float");
|
|
222
|
+
M.setValue(base + 4, segments[i][1], "float");
|
|
223
|
+
}
|
|
224
|
+
writeChunkConfig(M, cfgPtr, config);
|
|
225
|
+
M.setValue(outPtrPtr, 0, "i32");
|
|
226
|
+
M.setValue(outCountPtr, 0, "i32");
|
|
227
|
+
const rc = M.ccall(
|
|
228
|
+
"omni_merge_chunks",
|
|
229
|
+
"number",
|
|
230
|
+
["number", "number", "number", "number", "number"],
|
|
231
|
+
[segPtr, numSegments, cfgPtr, outPtrPtr, outCountPtr]
|
|
232
|
+
);
|
|
233
|
+
if (rc !== 0) {
|
|
234
|
+
throw new Error(`omni_merge_chunks failed: ${readNativeError(M, rc)}`);
|
|
235
|
+
}
|
|
236
|
+
const count = M.getValue(outCountPtr, "i32");
|
|
237
|
+
const chunkPtr = M.getValue(outPtrPtr, "i32");
|
|
238
|
+
const chunks = [];
|
|
239
|
+
for (let i = 0; i < count; i++) {
|
|
240
|
+
const base = chunkPtr + i * SIZEOF_CHUNK;
|
|
241
|
+
chunks.push({
|
|
242
|
+
start: M.getValue(base + 0, "float"),
|
|
243
|
+
end: M.getValue(base + 4, "float"),
|
|
244
|
+
segStartIdx: M.getValue(base + 8, "i32"),
|
|
245
|
+
segCount: M.getValue(base + 12, "i32")
|
|
246
|
+
});
|
|
247
|
+
}
|
|
248
|
+
if (chunkPtr) M._free(chunkPtr);
|
|
249
|
+
return chunks;
|
|
250
|
+
} finally {
|
|
251
|
+
if (segPtr) M._free(segPtr);
|
|
252
|
+
M._free(cfgPtr);
|
|
253
|
+
M._free(outPtrPtr);
|
|
254
|
+
M._free(outCountPtr);
|
|
255
|
+
}
|
|
256
|
+
}
|
|
124
257
|
function vadCreate(M, modelBuffer) {
|
|
125
258
|
const bytes = new Uint8Array(modelBuffer);
|
|
126
259
|
const ptr = M._malloc(bytes.length);
|
|
@@ -225,24 +358,49 @@ function aedDetect(M, handle, audioPtr, numSamples, cfg, format = "f32") {
|
|
|
225
358
|
function aedDestroy(M, handle) {
|
|
226
359
|
M.ccall("omni_aed_destroy", null, ["number"], [handle]);
|
|
227
360
|
}
|
|
228
|
-
|
|
361
|
+
var DEFAULT_STREAM_VAD_CONFIG = {
|
|
362
|
+
threshold: 0.5,
|
|
363
|
+
smoothWindowSize: 5,
|
|
364
|
+
padStartFrame: 5,
|
|
365
|
+
minSpeechFrame: 8,
|
|
366
|
+
maxSpeechFrame: 2e3,
|
|
367
|
+
minSilenceFrame: 20
|
|
368
|
+
};
|
|
369
|
+
var SIZEOF_STREAM_VAD_CONFIG = 24;
|
|
370
|
+
function writeStreamVadConfig(M, ptr, cfg) {
|
|
371
|
+
M.setValue(ptr + 0, cfg.threshold, "float");
|
|
372
|
+
M.setValue(ptr + 4, cfg.smoothWindowSize, "i32");
|
|
373
|
+
M.setValue(ptr + 8, cfg.padStartFrame, "i32");
|
|
374
|
+
M.setValue(ptr + 12, cfg.minSpeechFrame, "i32");
|
|
375
|
+
M.setValue(ptr + 16, cfg.maxSpeechFrame, "i32");
|
|
376
|
+
M.setValue(ptr + 20, cfg.minSilenceFrame, "i32");
|
|
377
|
+
}
|
|
378
|
+
function streamVadCreate(M, modelBuffer, config = {}) {
|
|
379
|
+
const overrides = Object.fromEntries(
|
|
380
|
+
Object.entries(config).filter(([, v]) => v !== void 0)
|
|
381
|
+
);
|
|
382
|
+
const cfg = { ...DEFAULT_STREAM_VAD_CONFIG, ...overrides };
|
|
229
383
|
const bytes = new Uint8Array(modelBuffer);
|
|
230
|
-
const
|
|
231
|
-
M.HEAPU8.set(bytes,
|
|
384
|
+
const dataPtr = M._malloc(bytes.length);
|
|
385
|
+
M.HEAPU8.set(bytes, dataPtr);
|
|
386
|
+
const cfgPtr = M._malloc(SIZEOF_STREAM_VAD_CONFIG);
|
|
232
387
|
try {
|
|
388
|
+
writeStreamVadConfig(M, cfgPtr, cfg);
|
|
233
389
|
return createModel(
|
|
234
390
|
M,
|
|
235
391
|
"omni_stream_vad_create_from_buffer",
|
|
236
392
|
["number", "number", "number"],
|
|
237
|
-
[
|
|
393
|
+
[dataPtr, bytes.length, cfgPtr],
|
|
238
394
|
"StreamVAD"
|
|
239
395
|
);
|
|
240
396
|
} finally {
|
|
241
|
-
M._free(
|
|
397
|
+
M._free(dataPtr);
|
|
398
|
+
M._free(cfgPtr);
|
|
242
399
|
}
|
|
243
400
|
}
|
|
401
|
+
var SIZEOF_STREAM_VAD_RESULT = 24;
|
|
244
402
|
function streamVadProcess(M, handle, pcm16Ptr, numSamples) {
|
|
245
|
-
const resultPtr = M._malloc(
|
|
403
|
+
const resultPtr = M._malloc(SIZEOF_STREAM_VAD_RESULT);
|
|
246
404
|
try {
|
|
247
405
|
const ret = M.ccall(
|
|
248
406
|
"omni_stream_vad_process",
|
|
@@ -253,14 +411,37 @@ function streamVadProcess(M, handle, pcm16Ptr, numSamples) {
|
|
|
253
411
|
if (ret === OMNI_ERR_NO_FRAMES) return null;
|
|
254
412
|
if (ret !== 0) throw new Error(`StreamVAD process failed: ${ret}`);
|
|
255
413
|
return {
|
|
256
|
-
confidence: M.getValue(resultPtr, "float"),
|
|
257
|
-
|
|
258
|
-
|
|
414
|
+
confidence: M.getValue(resultPtr + 0, "float"),
|
|
415
|
+
smoothedProb: M.getValue(resultPtr + 4, "float"),
|
|
416
|
+
isSpeech: M.getValue(resultPtr + 8, "i8") !== 0,
|
|
417
|
+
isSpeechStart: M.getValue(resultPtr + 9, "i8") !== 0,
|
|
418
|
+
isSpeechEnd: M.getValue(resultPtr + 10, "i8") !== 0,
|
|
419
|
+
frameIdx: M.getValue(resultPtr + 12, "i32"),
|
|
420
|
+
speechStartFrame: M.getValue(resultPtr + 16, "i32"),
|
|
421
|
+
speechEndFrame: M.getValue(resultPtr + 20, "i32")
|
|
259
422
|
};
|
|
260
423
|
} finally {
|
|
261
424
|
M._free(resultPtr);
|
|
262
425
|
}
|
|
263
426
|
}
|
|
427
|
+
function streamVadClone(M, handle) {
|
|
428
|
+
const errPtr = M._malloc(4);
|
|
429
|
+
try {
|
|
430
|
+
const newHandle = M.ccall(
|
|
431
|
+
"omni_stream_vad_clone",
|
|
432
|
+
"number",
|
|
433
|
+
["number", "number"],
|
|
434
|
+
[handle, errPtr]
|
|
435
|
+
);
|
|
436
|
+
if (!newHandle) {
|
|
437
|
+
const err = M.getValue(errPtr, "i32");
|
|
438
|
+
throw new Error(`StreamVAD clone failed: ${readNativeError(M, err)}`);
|
|
439
|
+
}
|
|
440
|
+
return newHandle;
|
|
441
|
+
} finally {
|
|
442
|
+
M._free(errPtr);
|
|
443
|
+
}
|
|
444
|
+
}
|
|
264
445
|
function streamVadReset(M, handle) {
|
|
265
446
|
M.ccall("omni_stream_vad_reset", null, ["number"], [handle]);
|
|
266
447
|
}
|
|
@@ -336,8 +517,6 @@ function int16ToNormalizedFloat32(i16) {
|
|
|
336
517
|
var SAMPLE_RATE2 = 16e3;
|
|
337
518
|
var OmniStreamVAD = class _OmniStreamVAD {
|
|
338
519
|
constructor(handle) {
|
|
339
|
-
this.inSpeech = false;
|
|
340
|
-
this.speechStartFrame = 0;
|
|
341
520
|
this.handle = handle;
|
|
342
521
|
}
|
|
343
522
|
/**
|
|
@@ -348,13 +527,35 @@ var OmniStreamVAD = class _OmniStreamVAD {
|
|
|
348
527
|
await initWasm();
|
|
349
528
|
const M = getModule();
|
|
350
529
|
const modelBuffer = await loadModel("stream-vad", options.modelUrl, options.modelData);
|
|
351
|
-
const
|
|
352
|
-
|
|
530
|
+
const handle = streamVadCreate(M, modelBuffer, {
|
|
531
|
+
threshold: options.threshold,
|
|
532
|
+
smoothWindowSize: options.smoothWindowSize,
|
|
533
|
+
padStartFrame: options.padStartFrame,
|
|
534
|
+
minSpeechFrame: options.minSpeechFrame,
|
|
535
|
+
maxSpeechFrame: options.maxSpeechFrame,
|
|
536
|
+
minSilenceFrame: options.minSilenceFrame
|
|
537
|
+
});
|
|
353
538
|
return new _OmniStreamVAD(handle);
|
|
354
539
|
}
|
|
540
|
+
/**
|
|
541
|
+
* Create a lightweight clone sharing the same underlying model weights.
|
|
542
|
+
* The clone has fresh per-instance state (empty audio buffer, zeroed cache).
|
|
543
|
+
* This is synchronous and extremely fast — ideal for multi-stream scenarios
|
|
544
|
+
* (e.g., handling multiple WebRTC tracks or concurrent audio sessions).
|
|
545
|
+
*/
|
|
546
|
+
clone() {
|
|
547
|
+
if (!this.handle) throw new Error("Cannot clone a disposed instance.");
|
|
548
|
+
const M = getModule();
|
|
549
|
+
const newHandle = streamVadClone(M, this.handle);
|
|
550
|
+
return new _OmniStreamVAD(newHandle);
|
|
551
|
+
}
|
|
355
552
|
/**
|
|
356
553
|
* Process one frame of audio (160 int16 samples = 10ms @ 16kHz).
|
|
357
554
|
* Returns null until enough audio is accumulated.
|
|
555
|
+
*
|
|
556
|
+
* Segment-boundary events (isSpeechStart / isSpeechEnd and the matching
|
|
557
|
+
* speechStartFrame / speechEndFrame indices) come straight from the C-layer state machine
|
|
558
|
+
* (bit-identical to upstream FireRedVAD) — the wrapper is just a marshaller.
|
|
358
559
|
*/
|
|
359
560
|
processFrame(pcm160) {
|
|
360
561
|
const M = getModule();
|
|
@@ -363,28 +564,16 @@ var OmniStreamVAD = class _OmniStreamVAD {
|
|
|
363
564
|
heap16.set(pcm160);
|
|
364
565
|
try {
|
|
365
566
|
const result = streamVadProcess(M, this.handle, ptr, pcm160.length);
|
|
366
|
-
if (!result
|
|
367
|
-
const frameIndex = result.frameOffset;
|
|
368
|
-
const isSpeechStart = result.isSpeech && !this.inSpeech;
|
|
369
|
-
const isSpeechEnd = !result.isSpeech && this.inSpeech;
|
|
370
|
-
if (isSpeechStart) {
|
|
371
|
-
this.speechStartFrame = frameIndex;
|
|
372
|
-
}
|
|
373
|
-
const activeSpeechStartFrame = isSpeechEnd ? this.speechStartFrame : result.isSpeech ? this.speechStartFrame : 0;
|
|
374
|
-
const speechEndFrame = isSpeechEnd ? Math.max(1, frameIndex - 1) : 0;
|
|
375
|
-
this.inSpeech = result.isSpeech;
|
|
376
|
-
if (isSpeechEnd) {
|
|
377
|
-
this.speechStartFrame = 0;
|
|
378
|
-
}
|
|
567
|
+
if (!result) return null;
|
|
379
568
|
return {
|
|
380
569
|
confidence: result.confidence,
|
|
381
|
-
|
|
570
|
+
smoothedProb: result.smoothedProb,
|
|
382
571
|
isSpeech: result.isSpeech,
|
|
383
|
-
frameIndex,
|
|
384
|
-
isSpeechStart,
|
|
385
|
-
isSpeechEnd,
|
|
386
|
-
speechStartFrame:
|
|
387
|
-
speechEndFrame
|
|
572
|
+
frameIndex: result.frameIdx,
|
|
573
|
+
isSpeechStart: result.isSpeechStart,
|
|
574
|
+
isSpeechEnd: result.isSpeechEnd,
|
|
575
|
+
speechStartFrame: result.speechStartFrame,
|
|
576
|
+
speechEndFrame: result.speechEndFrame
|
|
388
577
|
};
|
|
389
578
|
} finally {
|
|
390
579
|
M._free(ptr);
|
|
@@ -423,11 +612,9 @@ var OmniStreamVAD = class _OmniStreamVAD {
|
|
|
423
612
|
M._free(framesPtr);
|
|
424
613
|
}
|
|
425
614
|
}
|
|
426
|
-
/** Reset all internal state. */
|
|
615
|
+
/** Reset all internal state (model cache, audio buffer, postprocessor). */
|
|
427
616
|
reset() {
|
|
428
617
|
streamVadReset(getModule(), this.handle);
|
|
429
|
-
this.inSpeech = false;
|
|
430
|
-
this.speechStartFrame = 0;
|
|
431
618
|
}
|
|
432
619
|
/** Release native resources. */
|
|
433
620
|
dispose() {
|
|
@@ -435,8 +622,6 @@ var OmniStreamVAD = class _OmniStreamVAD {
|
|
|
435
622
|
streamVadDestroy(getModule(), this.handle);
|
|
436
623
|
this.handle = 0;
|
|
437
624
|
}
|
|
438
|
-
this.inSpeech = false;
|
|
439
|
-
this.speechStartFrame = 0;
|
|
440
625
|
}
|
|
441
626
|
};
|
|
442
627
|
function int16ToFloat32(i16) {
|
|
@@ -550,6 +735,28 @@ function computeCoverageRatios(events, duration) {
|
|
|
550
735
|
return ratios;
|
|
551
736
|
}
|
|
552
737
|
|
|
553
|
-
|
|
738
|
+
// src/chunking.ts
|
|
739
|
+
async function mergeChunks(segments, options = {}) {
|
|
740
|
+
await initWasm();
|
|
741
|
+
const M = getModule();
|
|
742
|
+
const cfg = {
|
|
743
|
+
maxChunkSecs: options.maxChunkSecs ?? DEFAULT_CHUNK_CONFIG.maxChunkSecs,
|
|
744
|
+
maxGapSecs: options.maxGapSecs ?? DEFAULT_CHUNK_CONFIG.maxGapSecs,
|
|
745
|
+
padOnsetSecs: options.padOnsetSecs ?? DEFAULT_CHUNK_CONFIG.padOnsetSecs,
|
|
746
|
+
padOffsetSecs: options.padOffsetSecs ?? DEFAULT_CHUNK_CONFIG.padOffsetSecs,
|
|
747
|
+
minSpeechSecs: options.minSpeechSecs ?? DEFAULT_CHUNK_CONFIG.minSpeechSecs,
|
|
748
|
+
minSilenceSecs: options.minSilenceSecs ?? DEFAULT_CHUNK_CONFIG.minSilenceSecs,
|
|
749
|
+
mode: options.mode ?? DEFAULT_CHUNK_CONFIG.mode
|
|
750
|
+
};
|
|
751
|
+
const records = chunkMerge(M, segments, cfg);
|
|
752
|
+
return records.map((r) => ({
|
|
753
|
+
start: r.start,
|
|
754
|
+
end: r.end,
|
|
755
|
+
segStartIdx: r.segStartIdx,
|
|
756
|
+
segCount: r.segCount
|
|
757
|
+
}));
|
|
758
|
+
}
|
|
759
|
+
|
|
760
|
+
export { DEFAULT_CDN_BASE, DEFAULT_CHUNK_CONFIG, OmniAED as FireRedAED, OmniStreamVAD as FireRedStreamVAD, OmniVAD as FireRedVAD, MODEL_FILES, OmniAED, OmniStreamVAD, OmniVAD, VERSION, initWasm, loadModel, mergeChunks };
|
|
554
761
|
//# sourceMappingURL=index.js.map
|
|
555
762
|
//# sourceMappingURL=index.js.map
|