omnivad 0.2.5 → 0.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +228 -49
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +152 -19
- package/dist/index.d.ts +152 -19
- package/dist/index.js +227 -50
- package/dist/index.js.map +1 -1
- package/dist/wasm/omnivad.cjs +1 -1
- package/dist/wasm/omnivad.js +1 -1
- package/dist/wasm/omnivad.wasm +0 -0
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -14,23 +14,27 @@ interface AEDResult {
|
|
|
14
14
|
/** Detected duration coverage ratio for each event type */
|
|
15
15
|
ratios: Record<string, number>;
|
|
16
16
|
}
|
|
17
|
-
/** Per-frame result from streaming VAD */
|
|
17
|
+
/** Per-frame result from streaming VAD.
|
|
18
|
+
*
|
|
19
|
+
* Bit-identical to upstream FireRedVAD's StreamVadFrameResult: every
|
|
20
|
+
* successful processFrame() call carries both per-frame probabilities
|
|
21
|
+
* AND segment-boundary events (no external segmenter needed). */
|
|
18
22
|
interface StreamVADFrameResult {
|
|
19
|
-
/** Raw probability from model output */
|
|
23
|
+
/** Raw probability from model output [0, 1] */
|
|
20
24
|
confidence: number;
|
|
21
|
-
/**
|
|
22
|
-
|
|
23
|
-
/**
|
|
25
|
+
/** Causal moving-average of confidence (window = smoothWindowSize) */
|
|
26
|
+
smoothedProb: number;
|
|
27
|
+
/** smoothedProb >= threshold */
|
|
24
28
|
isSpeech: boolean;
|
|
25
29
|
/** 1-based frame index of the emitted frame */
|
|
26
30
|
frameIndex: number;
|
|
27
|
-
/** True
|
|
31
|
+
/** True on the frame that confirms a new SPEECH segment */
|
|
28
32
|
isSpeechStart: boolean;
|
|
29
|
-
/** True
|
|
33
|
+
/** True on the frame that confirms a SPEECH segment end */
|
|
30
34
|
isSpeechEnd: boolean;
|
|
31
|
-
/**
|
|
35
|
+
/** 1-based start frame of the segment when isSpeechStart, else -1 */
|
|
32
36
|
speechStartFrame: number;
|
|
33
|
-
/**
|
|
37
|
+
/** 1-based end frame of the segment when isSpeechEnd, else -1 */
|
|
34
38
|
speechEndFrame: number;
|
|
35
39
|
}
|
|
36
40
|
/** Full-audio streaming-model output */
|
|
@@ -57,7 +61,7 @@ interface VADConfig extends ModelSource {
|
|
|
57
61
|
smoothWindowSize?: number;
|
|
58
62
|
/** Minimum speech segment length in frames (default: 20) */
|
|
59
63
|
minSpeechFrames?: number;
|
|
60
|
-
/** Maximum speech segment length in frames before splitting (default:
|
|
64
|
+
/** Maximum speech segment length in frames before splitting (default: 3000 = 30s; matches Whisper) */
|
|
61
65
|
maxSpeechFrames?: number;
|
|
62
66
|
/** Minimum silence segment length in frames for state machine (default: 20) */
|
|
63
67
|
minSilenceFrames?: number;
|
|
@@ -73,10 +77,79 @@ interface AEDConfig extends VADConfig {
|
|
|
73
77
|
/** Music probability threshold (default: 0.5) */
|
|
74
78
|
musicThreshold?: number;
|
|
75
79
|
}
|
|
76
|
-
/** Configuration for streaming VAD */
|
|
80
|
+
/** Configuration for streaming VAD.
|
|
81
|
+
*
|
|
82
|
+
* Bit-identical to upstream FireRedStreamVadConfig — every parameter
|
|
83
|
+
* has the same name (without the speech_ prefix) and the same default. */
|
|
77
84
|
interface StreamVADConfig extends ModelSource {
|
|
78
|
-
/** Speech
|
|
79
|
-
|
|
85
|
+
/** Speech activation threshold [0, 1] (default: 0.5). */
|
|
86
|
+
threshold?: number;
|
|
87
|
+
/** Causal moving-average window in frames (default: 5). */
|
|
88
|
+
smoothWindowSize?: number;
|
|
89
|
+
/** Extend confirmed segment START backward by N frames (default: 5;
|
|
90
|
+
* clamped to >= smoothWindowSize internally). */
|
|
91
|
+
padStartFrame?: number;
|
|
92
|
+
/** Min continuous speech frames to confirm START (default: 8 = 80ms). */
|
|
93
|
+
minSpeechFrame?: number;
|
|
94
|
+
/** Force-split when SPEECH-state count hits this (default: 2000 = 20s). */
|
|
95
|
+
maxSpeechFrame?: number;
|
|
96
|
+
/** Min continuous silence frames to confirm END (default: 20 = 200ms). */
|
|
97
|
+
minSilenceFrame?: number;
|
|
98
|
+
}
|
|
99
|
+
/**
|
|
100
|
+
* Chunk packing strategy. Both modes honor `maxChunkSecs` and `maxGapSecs` as
|
|
101
|
+
* hard constraints — they only differ in WHERE the cut lands.
|
|
102
|
+
*
|
|
103
|
+
* - `"greedy"` — sequential append; cuts at the first point that violates
|
|
104
|
+
* a constraint. Recommended for **fixed-length-input ASR** like Whisper /
|
|
105
|
+
* whisperX (which pad to 30s anyway).
|
|
106
|
+
* - `"longest_gap"` — recursive split at the longest internal pause until
|
|
107
|
+
* every chunk satisfies both constraints. Falls back to equal hard-split
|
|
108
|
+
* when a single segment exceeds `maxChunkSecs`. Recommended for
|
|
109
|
+
* **variable-length-input models** (forced alignment, TTS, encoder-style
|
|
110
|
+
* ASR) — splits at natural pauses, no fixed-length padding required.
|
|
111
|
+
* **NOTE: This is NOT how WhisperX packs chunks** — WhisperX uses greedy
|
|
112
|
+
* packing (`Binarize(max_duration=...)` + sequential append). For
|
|
113
|
+
* WhisperX-equivalent behavior pass `mode: "greedy"` (the default).
|
|
114
|
+
*/
|
|
115
|
+
type ChunkMode$1 = "greedy" | "longest_gap";
|
|
116
|
+
/**
|
|
117
|
+
* Configuration for {@link mergeChunks}. Mirrors C struct OmniChunkConfig.
|
|
118
|
+
* All fields are optional in the public API; defaults match
|
|
119
|
+
* {@link DEFAULT_CHUNK_CONFIG}.
|
|
120
|
+
*/
|
|
121
|
+
interface ChunkOptions {
|
|
122
|
+
/** Hard upper bound on chunk duration in seconds. Must be > 0. Default: 30. */
|
|
123
|
+
maxChunkSecs?: number;
|
|
124
|
+
/** Split if the gap between adjacent segments exceeds this. Pass `Infinity`
|
|
125
|
+
* to disable. Default: `Infinity`. Honored by both modes. */
|
|
126
|
+
maxGapSecs?: number;
|
|
127
|
+
/** Extend each chunk start backward by this many seconds (clamped to >= 0).
|
|
128
|
+
* Default: 0.04. */
|
|
129
|
+
padOnsetSecs?: number;
|
|
130
|
+
/** Extend each chunk end forward by this many seconds. Default: 0.04. */
|
|
131
|
+
padOffsetSecs?: number;
|
|
132
|
+
/** Drop input segments shorter than this many seconds. Default: 0.0.
|
|
133
|
+
* Pairs with VAD `minSpeechFrames` (frame-domain equivalent). */
|
|
134
|
+
minSpeechSecs?: number;
|
|
135
|
+
/** Pre-merge consecutive segments whose silence gap is shorter than this.
|
|
136
|
+
* Default: 0.20 (matches VAD `minSilenceFrames=20` @ 10ms frame shift). */
|
|
137
|
+
minSilenceSecs?: number;
|
|
138
|
+
/** Packing strategy. Default: `"greedy"`. */
|
|
139
|
+
mode?: ChunkMode$1;
|
|
140
|
+
}
|
|
141
|
+
/** A single chunk emitted by {@link mergeChunks}. */
|
|
142
|
+
interface ChunkResult {
|
|
143
|
+
/** Chunk start time (seconds), with `padOnsetSecs` applied (clamped to >= 0). */
|
|
144
|
+
start: number;
|
|
145
|
+
/** Chunk end time (seconds), with `padOffsetSecs` applied. */
|
|
146
|
+
end: number;
|
|
147
|
+
/** Index of the first input segment included in this chunk. Refers to the
|
|
148
|
+
* *post-filter* segment list — segments dropped by `minSpeechSecs` and
|
|
149
|
+
* pre-merged by `minSilenceSecs` are not counted. */
|
|
150
|
+
segStartIdx: number;
|
|
151
|
+
/** Number of input segments included in this chunk. */
|
|
152
|
+
segCount: number;
|
|
80
153
|
}
|
|
81
154
|
|
|
82
155
|
/**
|
|
@@ -113,8 +186,6 @@ declare class OmniVAD {
|
|
|
113
186
|
|
|
114
187
|
declare class OmniStreamVAD {
|
|
115
188
|
private handle;
|
|
116
|
-
private inSpeech;
|
|
117
|
-
private speechStartFrame;
|
|
118
189
|
private constructor();
|
|
119
190
|
/**
|
|
120
191
|
* Create a new OmniStreamVAD instance.
|
|
@@ -131,6 +202,10 @@ declare class OmniStreamVAD {
|
|
|
131
202
|
/**
|
|
132
203
|
* Process one frame of audio (160 int16 samples = 10ms @ 16kHz).
|
|
133
204
|
* Returns null until enough audio is accumulated.
|
|
205
|
+
*
|
|
206
|
+
* Segment-boundary events (isSpeechStart / isSpeechEnd and the matching
|
|
207
|
+
* speech_*_frame indices) come straight from the C-layer state machine
|
|
208
|
+
* (bit-identical to upstream FireRedVAD) — the wrapper is just a marshaller.
|
|
134
209
|
*/
|
|
135
210
|
processFrame(pcm160: Int16Array): StreamVADFrameResult | null;
|
|
136
211
|
/**
|
|
@@ -138,7 +213,7 @@ declare class OmniStreamVAD {
|
|
|
138
213
|
* @param audio - Float32Array in [-1, 1] or Int16Array of 16kHz mono PCM
|
|
139
214
|
*/
|
|
140
215
|
detectFull(audio: Float32Array | Int16Array): StreamVADFullResult;
|
|
141
|
-
/** Reset all internal state. */
|
|
216
|
+
/** Reset all internal state (model cache, audio buffer, postprocessor). */
|
|
142
217
|
reset(): void;
|
|
143
218
|
/** Release native resources. */
|
|
144
219
|
dispose(): void;
|
|
@@ -175,9 +250,9 @@ declare class OmniAED {
|
|
|
175
250
|
*/
|
|
176
251
|
type EmscriptenModule = any;
|
|
177
252
|
/** Package version — used to construct default CDN URLs. */
|
|
178
|
-
declare const VERSION = "0.2.5";
|
|
253
|
+
declare const VERSION = "0.2.8";
|
|
179
254
|
/** Default CDN base for model files (jsDelivr serves npm package contents). */
|
|
180
|
-
declare const DEFAULT_CDN_BASE = "https://cdn.jsdelivr.net/npm/omnivad@0.2.5/models";
|
|
255
|
+
declare const DEFAULT_CDN_BASE = "https://cdn.jsdelivr.net/npm/omnivad@0.2.8/models";
|
|
181
256
|
/** Model filenames keyed by type. */
|
|
182
257
|
declare const MODEL_FILES: {
|
|
183
258
|
readonly vad: "vad.omnivad";
|
|
@@ -200,5 +275,63 @@ declare function initWasm(wasmLocator?: (filename: string) => string): Promise<E
|
|
|
200
275
|
* 4. Browser — fetch from jsDelivr CDN
|
|
201
276
|
*/
|
|
202
277
|
declare function loadModel(modelType: ModelType, modelUrl?: string | URL, modelData?: ArrayBuffer): Promise<ArrayBuffer>;
|
|
278
|
+
/**
|
|
279
|
+
* Chunking strategy:
|
|
280
|
+
* - "greedy" — sequential append. Recommended for fixed-length-input ASR
|
|
281
|
+
* (Whisper / whisperX, which pad to 30s anyway).
|
|
282
|
+
* - "longest_gap" — recursive split at longest pause; falls back to hard-split
|
|
283
|
+
* when a single segment exceeds maxChunkSecs. Recommended for
|
|
284
|
+
* variable-length-input models (forced alignment, TTS,
|
|
285
|
+
* encoder-style ASR); no fixed-length padding required.
|
|
286
|
+
*/
|
|
287
|
+
type ChunkMode = "greedy" | "longest_gap";
|
|
288
|
+
/** Configuration for omni_merge_chunks (matches C struct OmniChunkConfig, 28 bytes) */
|
|
289
|
+
interface ChunkConfig {
|
|
290
|
+
maxChunkSecs: number;
|
|
291
|
+
maxGapSecs: number;
|
|
292
|
+
padOnsetSecs: number;
|
|
293
|
+
padOffsetSecs: number;
|
|
294
|
+
minSpeechSecs: number;
|
|
295
|
+
minSilenceSecs: number;
|
|
296
|
+
mode: ChunkMode;
|
|
297
|
+
}
|
|
298
|
+
/**
|
|
299
|
+
* Default chunk config. Mirrors C-side omni_chunk_config_default(); kept in
|
|
300
|
+
* TS so callers don't need a roundtrip into WASM just to read defaults.
|
|
301
|
+
*
|
|
302
|
+
* Defaults: max_chunk_secs matches Whisper's 30s input window.
|
|
303
|
+
*/
|
|
304
|
+
declare const DEFAULT_CHUNK_CONFIG: ChunkConfig;
|
|
305
|
+
|
|
306
|
+
/**
|
|
307
|
+
* Pure-algorithm chunking utility — wraps the C function omni_merge_chunks
|
|
308
|
+
* compiled into the WASM module.
|
|
309
|
+
*
|
|
310
|
+
* WhisperX-style binarize+merge, minus the binarize half because OmniVAD
|
|
311
|
+
* already returns binarized timestamps.
|
|
312
|
+
*
|
|
313
|
+
* Usage:
|
|
314
|
+
*
|
|
315
|
+
* import { mergeChunks } from "omnivad";
|
|
316
|
+
*
|
|
317
|
+
* const chunks = await mergeChunks(
|
|
318
|
+
* [[0.0, 5.0], [6.0, 10.0]],
|
|
319
|
+
* { maxChunkSecs: 30.0, maxGapSecs: 2.0 }
|
|
320
|
+
* );
|
|
321
|
+
* // [{ start: 0, end: 10, segStartIdx: 0, segCount: 2 }]
|
|
322
|
+
*/
|
|
323
|
+
|
|
324
|
+
/**
|
|
325
|
+
* Merge a sorted array of [start, end] speech segments into duration-bounded
|
|
326
|
+
* chunks.
|
|
327
|
+
*
|
|
328
|
+
* Lazily initializes the WASM module on first call (so the caller doesn't have
|
|
329
|
+
* to await `initWasm()` separately). Subsequent calls reuse the cached module.
|
|
330
|
+
*
|
|
331
|
+
* @param segments array of [start, end] pairs in seconds, sorted by start
|
|
332
|
+
* @param options chunking configuration; missing fields fall back to
|
|
333
|
+
* {@link DEFAULT_CHUNK_CONFIG}
|
|
334
|
+
*/
|
|
335
|
+
declare function mergeChunks(segments: Array<[number, number]>, options?: ChunkOptions): Promise<ChunkResult[]>;
|
|
203
336
|
|
|
204
|
-
export { type AEDConfig, type AEDResult, DEFAULT_CDN_BASE, OmniAED as FireRedAED, OmniStreamVAD as FireRedStreamVAD, OmniVAD as FireRedVAD, MODEL_FILES, type ModelSource, OmniAED, OmniStreamVAD, OmniVAD, type StreamVADConfig, type StreamVADFrameResult, type StreamVADFullResult, type VADConfig, type VADResult, VERSION, initWasm, loadModel };
|
|
337
|
+
export { type AEDConfig, type AEDResult, type ChunkOptions, type ChunkResult, DEFAULT_CDN_BASE, DEFAULT_CHUNK_CONFIG, OmniAED as FireRedAED, OmniStreamVAD as FireRedStreamVAD, OmniVAD as FireRedVAD, MODEL_FILES, type ModelSource, OmniAED, OmniStreamVAD, OmniVAD, type StreamVADConfig, type StreamVADFrameResult, type StreamVADFullResult, type VADConfig, type VADResult, VERSION, initWasm, loadModel, mergeChunks };
|
package/dist/index.js
CHANGED
|
@@ -1,12 +1,41 @@
|
|
|
1
1
|
// src/wasm-binding.ts
|
|
2
2
|
var _module = null;
|
|
3
3
|
var _loading = null;
|
|
4
|
+
function loadScript(url) {
|
|
5
|
+
if (typeof globalThis.document === "undefined") {
|
|
6
|
+
return new Promise((resolve, reject) => {
|
|
7
|
+
try {
|
|
8
|
+
const importScripts = globalThis.importScripts;
|
|
9
|
+
if (typeof importScripts !== "function") {
|
|
10
|
+
throw new Error(
|
|
11
|
+
"omnivad: cannot load glue script \u2014 no document and no importScripts"
|
|
12
|
+
);
|
|
13
|
+
}
|
|
14
|
+
importScripts(url);
|
|
15
|
+
resolve();
|
|
16
|
+
} catch (err) {
|
|
17
|
+
reject(err instanceof Error ? err : new Error(String(err)));
|
|
18
|
+
}
|
|
19
|
+
});
|
|
20
|
+
}
|
|
21
|
+
return new Promise((resolve, reject) => {
|
|
22
|
+
const s = globalThis.document.createElement("script");
|
|
23
|
+
s.src = url;
|
|
24
|
+
s.async = true;
|
|
25
|
+
s.crossOrigin = "anonymous";
|
|
26
|
+
s.onload = () => resolve();
|
|
27
|
+
s.onerror = () => reject(new Error(`Failed to load omnivad glue script: ${url}`));
|
|
28
|
+
globalThis.document.head.appendChild(s);
|
|
29
|
+
});
|
|
30
|
+
}
|
|
4
31
|
var SIZEOF_POST_CONFIG = 28;
|
|
5
32
|
var SIZEOF_AED_POST_CONFIG = 3 * SIZEOF_POST_CONFIG;
|
|
6
33
|
var SIZEOF_SEGMENT = 8;
|
|
7
34
|
var SIZEOF_AED_SEGMENT = 16;
|
|
35
|
+
var SIZEOF_CHUNK_CONFIG = 28;
|
|
36
|
+
var SIZEOF_CHUNK = 16;
|
|
8
37
|
var OMNI_ERR_NO_FRAMES = -7;
|
|
9
|
-
var VERSION = "0.2.5";
|
|
38
|
+
var VERSION = "0.2.8";
|
|
10
39
|
var DEFAULT_CDN_BASE = `https://cdn.jsdelivr.net/npm/omnivad@${VERSION}/models`;
|
|
11
40
|
var MODEL_FILES = {
|
|
12
41
|
vad: "vad.omnivad",
|
|
@@ -22,22 +51,41 @@ async function initWasm(wasmLocator) {
|
|
|
22
51
|
if (typeof globalThis.process?.versions?.node === "string") {
|
|
23
52
|
const { createRequire } = await import(
|
|
24
53
|
/* webpackIgnore: true */
|
|
54
|
+
/* turbopackIgnore: true */
|
|
25
55
|
'module'
|
|
26
56
|
);
|
|
27
|
-
const { dirname, join } = await import(
|
|
57
|
+
const { dirname, join } = await import(
|
|
58
|
+
/* webpackIgnore: true */
|
|
59
|
+
/* turbopackIgnore: true */
|
|
60
|
+
'path'
|
|
61
|
+
);
|
|
28
62
|
const req = createRequire(import.meta.url);
|
|
29
63
|
const gluePath = req.resolve("../dist/wasm/omnivad.cjs");
|
|
30
64
|
const wasmDir = dirname(gluePath);
|
|
31
65
|
createOmniVAD = req(gluePath);
|
|
32
66
|
defaultLocateFile = (filename) => join(wasmDir, filename);
|
|
33
67
|
} else {
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
const
|
|
68
|
+
let glueUrlStr;
|
|
69
|
+
if (wasmLocator) {
|
|
70
|
+
glueUrlStr = wasmLocator("omnivad.js");
|
|
71
|
+
} else {
|
|
72
|
+
glueUrlStr = new URL("../dist/wasm/omnivad.js", import.meta.url).href;
|
|
73
|
+
}
|
|
74
|
+
const g = globalThis;
|
|
75
|
+
let factory = g.createOmniVAD;
|
|
76
|
+
if (typeof factory !== "function") {
|
|
77
|
+
await loadScript(glueUrlStr);
|
|
78
|
+
factory = g.createOmniVAD;
|
|
79
|
+
}
|
|
80
|
+
if (typeof factory !== "function") {
|
|
81
|
+
throw new Error(
|
|
82
|
+
`omnivad.js loaded from ${glueUrlStr} but globalThis.createOmniVAD is missing`
|
|
83
|
+
);
|
|
84
|
+
}
|
|
85
|
+
createOmniVAD = factory;
|
|
86
|
+
const baseHref = typeof globalThis.location !== "undefined" ? globalThis.location.href : "file:///";
|
|
87
|
+
const absGlue = new URL(glueUrlStr, baseHref);
|
|
88
|
+
const wasmBaseUrl = new URL("./", absGlue);
|
|
41
89
|
defaultLocateFile = (filename) => new URL(filename, wasmBaseUrl).toString();
|
|
42
90
|
}
|
|
43
91
|
const opts = {};
|
|
@@ -61,10 +109,19 @@ async function loadModel(modelType, modelUrl, modelData) {
|
|
|
61
109
|
if (typeof globalThis.process?.versions?.node === "string") {
|
|
62
110
|
const { createRequire } = await import(
|
|
63
111
|
/* webpackIgnore: true */
|
|
112
|
+
/* turbopackIgnore: true */
|
|
64
113
|
'module'
|
|
65
114
|
);
|
|
66
|
-
const { dirname, join } = await import(
|
|
67
|
-
|
|
115
|
+
const { dirname, join } = await import(
|
|
116
|
+
/* webpackIgnore: true */
|
|
117
|
+
/* turbopackIgnore: true */
|
|
118
|
+
'path'
|
|
119
|
+
);
|
|
120
|
+
const { readFile } = await import(
|
|
121
|
+
/* webpackIgnore: true */
|
|
122
|
+
/* turbopackIgnore: true */
|
|
123
|
+
'fs/promises'
|
|
124
|
+
);
|
|
68
125
|
const req = createRequire(import.meta.url);
|
|
69
126
|
const pkgDir = dirname(req.resolve("../package.json"));
|
|
70
127
|
const modelPath = join(pkgDir, "models", filename);
|
|
@@ -117,10 +174,86 @@ var DEFAULT_VAD_CONFIG = {
|
|
|
117
174
|
smoothWindowSize: 5,
|
|
118
175
|
minSpeechFrames: 20,
|
|
119
176
|
minSilenceFrames: 20,
|
|
120
|
-
maxSpeechFrames:
|
|
177
|
+
maxSpeechFrames: 3e3,
|
|
121
178
|
mergeSilenceFrames: 0,
|
|
122
179
|
extendSpeechFrames: 0
|
|
123
180
|
};
|
|
181
|
+
var OMNI_CHUNK_GREEDY = 0;
|
|
182
|
+
var OMNI_CHUNK_LONGEST_GAP = 1;
|
|
183
|
+
var DEFAULT_CHUNK_CONFIG = {
|
|
184
|
+
maxChunkSecs: 30,
|
|
185
|
+
maxGapSecs: Infinity,
|
|
186
|
+
padOnsetSecs: 0.04,
|
|
187
|
+
padOffsetSecs: 0.04,
|
|
188
|
+
minSpeechSecs: 0,
|
|
189
|
+
minSilenceSecs: 0.2,
|
|
190
|
+
// matches VAD minSilenceFrames=20 @ 10ms shift
|
|
191
|
+
mode: "greedy"
|
|
192
|
+
};
|
|
193
|
+
function modeToInt(m) {
|
|
194
|
+
switch (m) {
|
|
195
|
+
case "greedy":
|
|
196
|
+
return OMNI_CHUNK_GREEDY;
|
|
197
|
+
case "longest_gap":
|
|
198
|
+
return OMNI_CHUNK_LONGEST_GAP;
|
|
199
|
+
default:
|
|
200
|
+
throw new Error(`Unknown chunking mode: ${String(m)}`);
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
function writeChunkConfig(M, ptr, cfg) {
|
|
204
|
+
M.setValue(ptr + 0, cfg.maxChunkSecs, "float");
|
|
205
|
+
M.setValue(ptr + 4, cfg.maxGapSecs, "float");
|
|
206
|
+
M.setValue(ptr + 8, cfg.padOnsetSecs, "float");
|
|
207
|
+
M.setValue(ptr + 12, cfg.padOffsetSecs, "float");
|
|
208
|
+
M.setValue(ptr + 16, cfg.minSpeechSecs, "float");
|
|
209
|
+
M.setValue(ptr + 20, cfg.minSilenceSecs, "float");
|
|
210
|
+
M.setValue(ptr + 24, modeToInt(cfg.mode), "i32");
|
|
211
|
+
}
|
|
212
|
+
function chunkMerge(M, segments, config) {
|
|
213
|
+
const numSegments = segments.length;
|
|
214
|
+
const segPtr = numSegments > 0 ? M._malloc(numSegments * SIZEOF_SEGMENT) : 0;
|
|
215
|
+
const cfgPtr = M._malloc(SIZEOF_CHUNK_CONFIG);
|
|
216
|
+
const outPtrPtr = M._malloc(4);
|
|
217
|
+
const outCountPtr = M._malloc(4);
|
|
218
|
+
try {
|
|
219
|
+
for (let i = 0; i < numSegments; i++) {
|
|
220
|
+
const base = segPtr + i * SIZEOF_SEGMENT;
|
|
221
|
+
M.setValue(base + 0, segments[i][0], "float");
|
|
222
|
+
M.setValue(base + 4, segments[i][1], "float");
|
|
223
|
+
}
|
|
224
|
+
writeChunkConfig(M, cfgPtr, config);
|
|
225
|
+
M.setValue(outPtrPtr, 0, "i32");
|
|
226
|
+
M.setValue(outCountPtr, 0, "i32");
|
|
227
|
+
const rc = M.ccall(
|
|
228
|
+
"omni_merge_chunks",
|
|
229
|
+
"number",
|
|
230
|
+
["number", "number", "number", "number", "number"],
|
|
231
|
+
[segPtr, numSegments, cfgPtr, outPtrPtr, outCountPtr]
|
|
232
|
+
);
|
|
233
|
+
if (rc !== 0) {
|
|
234
|
+
throw new Error(`omni_merge_chunks failed: ${readNativeError(M, rc)}`);
|
|
235
|
+
}
|
|
236
|
+
const count = M.getValue(outCountPtr, "i32");
|
|
237
|
+
const chunkPtr = M.getValue(outPtrPtr, "i32");
|
|
238
|
+
const chunks = [];
|
|
239
|
+
for (let i = 0; i < count; i++) {
|
|
240
|
+
const base = chunkPtr + i * SIZEOF_CHUNK;
|
|
241
|
+
chunks.push({
|
|
242
|
+
start: M.getValue(base + 0, "float"),
|
|
243
|
+
end: M.getValue(base + 4, "float"),
|
|
244
|
+
segStartIdx: M.getValue(base + 8, "i32"),
|
|
245
|
+
segCount: M.getValue(base + 12, "i32")
|
|
246
|
+
});
|
|
247
|
+
}
|
|
248
|
+
if (chunkPtr) M._free(chunkPtr);
|
|
249
|
+
return chunks;
|
|
250
|
+
} finally {
|
|
251
|
+
if (segPtr) M._free(segPtr);
|
|
252
|
+
M._free(cfgPtr);
|
|
253
|
+
M._free(outPtrPtr);
|
|
254
|
+
M._free(outCountPtr);
|
|
255
|
+
}
|
|
256
|
+
}
|
|
124
257
|
function vadCreate(M, modelBuffer) {
|
|
125
258
|
const bytes = new Uint8Array(modelBuffer);
|
|
126
259
|
const ptr = M._malloc(bytes.length);
|
|
@@ -225,24 +358,49 @@ function aedDetect(M, handle, audioPtr, numSamples, cfg, format = "f32") {
|
|
|
225
358
|
function aedDestroy(M, handle) {
|
|
226
359
|
M.ccall("omni_aed_destroy", null, ["number"], [handle]);
|
|
227
360
|
}
|
|
228
|
-
|
|
361
|
+
var DEFAULT_STREAM_VAD_CONFIG = {
|
|
362
|
+
threshold: 0.5,
|
|
363
|
+
smoothWindowSize: 5,
|
|
364
|
+
padStartFrame: 5,
|
|
365
|
+
minSpeechFrame: 8,
|
|
366
|
+
maxSpeechFrame: 2e3,
|
|
367
|
+
minSilenceFrame: 20
|
|
368
|
+
};
|
|
369
|
+
var SIZEOF_STREAM_VAD_CONFIG = 24;
|
|
370
|
+
function writeStreamVadConfig(M, ptr, cfg) {
|
|
371
|
+
M.setValue(ptr + 0, cfg.threshold, "float");
|
|
372
|
+
M.setValue(ptr + 4, cfg.smoothWindowSize, "i32");
|
|
373
|
+
M.setValue(ptr + 8, cfg.padStartFrame, "i32");
|
|
374
|
+
M.setValue(ptr + 12, cfg.minSpeechFrame, "i32");
|
|
375
|
+
M.setValue(ptr + 16, cfg.maxSpeechFrame, "i32");
|
|
376
|
+
M.setValue(ptr + 20, cfg.minSilenceFrame, "i32");
|
|
377
|
+
}
|
|
378
|
+
function streamVadCreate(M, modelBuffer, config = {}) {
|
|
379
|
+
const overrides = Object.fromEntries(
|
|
380
|
+
Object.entries(config).filter(([, v]) => v !== void 0)
|
|
381
|
+
);
|
|
382
|
+
const cfg = { ...DEFAULT_STREAM_VAD_CONFIG, ...overrides };
|
|
229
383
|
const bytes = new Uint8Array(modelBuffer);
|
|
230
|
-
const
|
|
231
|
-
M.HEAPU8.set(bytes,
|
|
384
|
+
const dataPtr = M._malloc(bytes.length);
|
|
385
|
+
M.HEAPU8.set(bytes, dataPtr);
|
|
386
|
+
const cfgPtr = M._malloc(SIZEOF_STREAM_VAD_CONFIG);
|
|
232
387
|
try {
|
|
388
|
+
writeStreamVadConfig(M, cfgPtr, cfg);
|
|
233
389
|
return createModel(
|
|
234
390
|
M,
|
|
235
391
|
"omni_stream_vad_create_from_buffer",
|
|
236
392
|
["number", "number", "number"],
|
|
237
|
-
[
|
|
393
|
+
[dataPtr, bytes.length, cfgPtr],
|
|
238
394
|
"StreamVAD"
|
|
239
395
|
);
|
|
240
396
|
} finally {
|
|
241
|
-
M._free(
|
|
397
|
+
M._free(dataPtr);
|
|
398
|
+
M._free(cfgPtr);
|
|
242
399
|
}
|
|
243
400
|
}
|
|
401
|
+
var SIZEOF_STREAM_VAD_RESULT = 24;
|
|
244
402
|
function streamVadProcess(M, handle, pcm16Ptr, numSamples) {
|
|
245
|
-
const resultPtr = M._malloc(
|
|
403
|
+
const resultPtr = M._malloc(SIZEOF_STREAM_VAD_RESULT);
|
|
246
404
|
try {
|
|
247
405
|
const ret = M.ccall(
|
|
248
406
|
"omni_stream_vad_process",
|
|
@@ -253,9 +411,14 @@ function streamVadProcess(M, handle, pcm16Ptr, numSamples) {
|
|
|
253
411
|
if (ret === OMNI_ERR_NO_FRAMES) return null;
|
|
254
412
|
if (ret !== 0) throw new Error(`StreamVAD process failed: ${ret}`);
|
|
255
413
|
return {
|
|
256
|
-
confidence: M.getValue(resultPtr, "float"),
|
|
257
|
-
|
|
258
|
-
|
|
414
|
+
confidence: M.getValue(resultPtr + 0, "float"),
|
|
415
|
+
smoothedProb: M.getValue(resultPtr + 4, "float"),
|
|
416
|
+
isSpeech: M.getValue(resultPtr + 8, "i8") !== 0,
|
|
417
|
+
isSpeechStart: M.getValue(resultPtr + 9, "i8") !== 0,
|
|
418
|
+
isSpeechEnd: M.getValue(resultPtr + 10, "i8") !== 0,
|
|
419
|
+
frameIdx: M.getValue(resultPtr + 12, "i32"),
|
|
420
|
+
speechStartFrame: M.getValue(resultPtr + 16, "i32"),
|
|
421
|
+
speechEndFrame: M.getValue(resultPtr + 20, "i32")
|
|
259
422
|
};
|
|
260
423
|
} finally {
|
|
261
424
|
M._free(resultPtr);
|
|
@@ -354,8 +517,6 @@ function int16ToNormalizedFloat32(i16) {
|
|
|
354
517
|
var SAMPLE_RATE2 = 16e3;
|
|
355
518
|
var OmniStreamVAD = class _OmniStreamVAD {
|
|
356
519
|
constructor(handle) {
|
|
357
|
-
this.inSpeech = false;
|
|
358
|
-
this.speechStartFrame = 0;
|
|
359
520
|
this.handle = handle;
|
|
360
521
|
}
|
|
361
522
|
/**
|
|
@@ -366,8 +527,14 @@ var OmniStreamVAD = class _OmniStreamVAD {
|
|
|
366
527
|
await initWasm();
|
|
367
528
|
const M = getModule();
|
|
368
529
|
const modelBuffer = await loadModel("stream-vad", options.modelUrl, options.modelData);
|
|
369
|
-
const
|
|
370
|
-
|
|
530
|
+
const handle = streamVadCreate(M, modelBuffer, {
|
|
531
|
+
threshold: options.threshold,
|
|
532
|
+
smoothWindowSize: options.smoothWindowSize,
|
|
533
|
+
padStartFrame: options.padStartFrame,
|
|
534
|
+
minSpeechFrame: options.minSpeechFrame,
|
|
535
|
+
maxSpeechFrame: options.maxSpeechFrame,
|
|
536
|
+
minSilenceFrame: options.minSilenceFrame
|
|
537
|
+
});
|
|
371
538
|
return new _OmniStreamVAD(handle);
|
|
372
539
|
}
|
|
373
540
|
/**
|
|
@@ -385,6 +552,10 @@ var OmniStreamVAD = class _OmniStreamVAD {
|
|
|
385
552
|
/**
|
|
386
553
|
* Process one frame of audio (160 int16 samples = 10ms @ 16kHz).
|
|
387
554
|
* Returns null until enough audio is accumulated.
|
|
555
|
+
*
|
|
556
|
+
* Segment-boundary events (isSpeechStart / isSpeechEnd and the matching
|
|
557
|
+
* speech_*_frame indices) come straight from the C-layer state machine
|
|
558
|
+
* (bit-identical to upstream FireRedVAD) — the wrapper is just a marshaller.
|
|
388
559
|
*/
|
|
389
560
|
processFrame(pcm160) {
|
|
390
561
|
const M = getModule();
|
|
@@ -393,28 +564,16 @@ var OmniStreamVAD = class _OmniStreamVAD {
|
|
|
393
564
|
heap16.set(pcm160);
|
|
394
565
|
try {
|
|
395
566
|
const result = streamVadProcess(M, this.handle, ptr, pcm160.length);
|
|
396
|
-
if (!result
|
|
397
|
-
const frameIndex = result.frameOffset;
|
|
398
|
-
const isSpeechStart = result.isSpeech && !this.inSpeech;
|
|
399
|
-
const isSpeechEnd = !result.isSpeech && this.inSpeech;
|
|
400
|
-
if (isSpeechStart) {
|
|
401
|
-
this.speechStartFrame = frameIndex;
|
|
402
|
-
}
|
|
403
|
-
const activeSpeechStartFrame = isSpeechEnd ? this.speechStartFrame : result.isSpeech ? this.speechStartFrame : 0;
|
|
404
|
-
const speechEndFrame = isSpeechEnd ? Math.max(1, frameIndex - 1) : 0;
|
|
405
|
-
this.inSpeech = result.isSpeech;
|
|
406
|
-
if (isSpeechEnd) {
|
|
407
|
-
this.speechStartFrame = 0;
|
|
408
|
-
}
|
|
567
|
+
if (!result) return null;
|
|
409
568
|
return {
|
|
410
569
|
confidence: result.confidence,
|
|
411
|
-
|
|
570
|
+
smoothedProb: result.smoothedProb,
|
|
412
571
|
isSpeech: result.isSpeech,
|
|
413
|
-
frameIndex,
|
|
414
|
-
isSpeechStart,
|
|
415
|
-
isSpeechEnd,
|
|
416
|
-
speechStartFrame:
|
|
417
|
-
speechEndFrame
|
|
572
|
+
frameIndex: result.frameIdx,
|
|
573
|
+
isSpeechStart: result.isSpeechStart,
|
|
574
|
+
isSpeechEnd: result.isSpeechEnd,
|
|
575
|
+
speechStartFrame: result.speechStartFrame,
|
|
576
|
+
speechEndFrame: result.speechEndFrame
|
|
418
577
|
};
|
|
419
578
|
} finally {
|
|
420
579
|
M._free(ptr);
|
|
@@ -453,11 +612,9 @@ var OmniStreamVAD = class _OmniStreamVAD {
|
|
|
453
612
|
M._free(framesPtr);
|
|
454
613
|
}
|
|
455
614
|
}
|
|
456
|
-
/** Reset all internal state. */
|
|
615
|
+
/** Reset all internal state (model cache, audio buffer, postprocessor). */
|
|
457
616
|
reset() {
|
|
458
617
|
streamVadReset(getModule(), this.handle);
|
|
459
|
-
this.inSpeech = false;
|
|
460
|
-
this.speechStartFrame = 0;
|
|
461
618
|
}
|
|
462
619
|
/** Release native resources. */
|
|
463
620
|
dispose() {
|
|
@@ -465,8 +622,6 @@ var OmniStreamVAD = class _OmniStreamVAD {
|
|
|
465
622
|
streamVadDestroy(getModule(), this.handle);
|
|
466
623
|
this.handle = 0;
|
|
467
624
|
}
|
|
468
|
-
this.inSpeech = false;
|
|
469
|
-
this.speechStartFrame = 0;
|
|
470
625
|
}
|
|
471
626
|
};
|
|
472
627
|
function int16ToFloat32(i16) {
|
|
@@ -580,6 +735,28 @@ function computeCoverageRatios(events, duration) {
|
|
|
580
735
|
return ratios;
|
|
581
736
|
}
|
|
582
737
|
|
|
583
|
-
|
|
738
|
+
// src/chunking.ts
|
|
739
|
+
async function mergeChunks(segments, options = {}) {
|
|
740
|
+
await initWasm();
|
|
741
|
+
const M = getModule();
|
|
742
|
+
const cfg = {
|
|
743
|
+
maxChunkSecs: options.maxChunkSecs ?? DEFAULT_CHUNK_CONFIG.maxChunkSecs,
|
|
744
|
+
maxGapSecs: options.maxGapSecs ?? DEFAULT_CHUNK_CONFIG.maxGapSecs,
|
|
745
|
+
padOnsetSecs: options.padOnsetSecs ?? DEFAULT_CHUNK_CONFIG.padOnsetSecs,
|
|
746
|
+
padOffsetSecs: options.padOffsetSecs ?? DEFAULT_CHUNK_CONFIG.padOffsetSecs,
|
|
747
|
+
minSpeechSecs: options.minSpeechSecs ?? DEFAULT_CHUNK_CONFIG.minSpeechSecs,
|
|
748
|
+
minSilenceSecs: options.minSilenceSecs ?? DEFAULT_CHUNK_CONFIG.minSilenceSecs,
|
|
749
|
+
mode: options.mode ?? DEFAULT_CHUNK_CONFIG.mode
|
|
750
|
+
};
|
|
751
|
+
const records = chunkMerge(M, segments, cfg);
|
|
752
|
+
return records.map((r) => ({
|
|
753
|
+
start: r.start,
|
|
754
|
+
end: r.end,
|
|
755
|
+
segStartIdx: r.segStartIdx,
|
|
756
|
+
segCount: r.segCount
|
|
757
|
+
}));
|
|
758
|
+
}
|
|
759
|
+
|
|
760
|
+
export { DEFAULT_CDN_BASE, DEFAULT_CHUNK_CONFIG, OmniAED as FireRedAED, OmniStreamVAD as FireRedStreamVAD, OmniVAD as FireRedVAD, MODEL_FILES, OmniAED, OmniStreamVAD, OmniVAD, VERSION, initWasm, loadModel, mergeChunks };
|
|
584
761
|
//# sourceMappingURL=index.js.map
|
|
585
762
|
//# sourceMappingURL=index.js.map
|