omnivad 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +504 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +169 -0
- package/dist/index.d.ts +169 -0
- package/dist/index.js +495 -0
- package/dist/index.js.map +1 -0
- package/dist/wasm/omnivad.cjs +22 -0
- package/dist/wasm/omnivad.data +0 -0
- package/dist/wasm/omnivad.js +22 -0
- package/dist/wasm/omnivad.wasm +0 -0
- package/package.json +52 -0
package/dist/index.d.cts
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
/**
 * Result from non-streaming VAD detection.
 * This is the return type of `OmniVAD.detect()`.
 */
|
|
2
|
+
interface VADResult {
|
|
3
|
+
/** Duration of the input audio, in seconds */
|
|
4
|
+
duration: number;
|
|
5
|
+
/** Detected segments as [start, end] timestamp pairs, both values in seconds */
|
|
6
|
+
timestamps: [number, number][];
|
|
7
|
+
}
|
|
8
|
+
/**
 * Result from Audio Event Detection (3-class).
 * This is the return type of `OmniAED.detect()`.
 */
|
|
9
|
+
interface AEDResult {
|
|
10
|
+
/** Duration of the input audio, in seconds */
|
|
11
|
+
duration: number;
|
|
12
|
+
/**
 * Events keyed by type ("speech", "singing", "music") with timestamp pairs.
 * Keys are typed as plain `string`, so the type itself does not restrict them.
 * NOTE(review): pairs are presumably [start, end] in seconds, as in VADResult — confirm.
 */
|
|
13
|
+
events: Record<string, [number, number][]>;
|
|
14
|
+
/** Detected duration coverage ratio for each event type, keyed by event type */
|
|
15
|
+
ratios: Record<string, number>;
|
|
16
|
+
}
|
|
17
|
+
/**
 * Per-frame result from streaming VAD.
 * `OmniStreamVAD.processFrame()` returns this type, or null until enough audio is accumulated.
 */
|
|
18
|
+
interface StreamVADFrameResult {
|
|
19
|
+
/** Raw probability from model output */
|
|
20
|
+
confidence: number;
|
|
21
|
+
/** Currently identical to `confidence`; reserved for future smoothing */
|
|
22
|
+
smoothedConfidence: number;
|
|
23
|
+
/** Whether the current frame is classified as speech */
|
|
24
|
+
isSpeech: boolean;
|
|
25
|
+
/** 1-based frame index of the emitted frame */
|
|
26
|
+
frameIndex: number;
|
|
27
|
+
/** True when speech becomes active at this frame */
|
|
28
|
+
isSpeechStart: boolean;
|
|
29
|
+
/** True when speech ended on the previous frame, i.e. this result reports the end of a segment */
|
|
30
|
+
isSpeechEnd: boolean;
|
|
31
|
+
/** Start frame of the active or just-finished speech segment */
|
|
32
|
+
speechStartFrame: number;
|
|
33
|
+
/** End frame of the just-finished speech segment, or 0 if no segment is ending */
|
|
34
|
+
speechEndFrame: number;
|
|
35
|
+
}
|
|
36
|
+
/**
 * Full-audio streaming-model output.
 * This is the return type of `OmniStreamVAD.detectFull()`.
 */
|
|
37
|
+
interface StreamVADFullResult {
|
|
38
|
+
/** Speech probability for each emitted frame */
|
|
39
|
+
probabilities: Float32Array;
|
|
40
|
+
/** Number of emitted frames (NOTE(review): presumably equals `probabilities.length` — confirm) */
|
|
41
|
+
numFrames: number;
|
|
42
|
+
/** Duration of the input audio, in seconds */
|
|
43
|
+
duration: number;
|
|
44
|
+
}
|
|
45
|
+
/**
 * Configuration for non-streaming VAD, accepted by `OmniVAD.create()`.
 * Every field is optional. Lengths are counted in frames; the `maxSpeechFrames`
 * note (2000 frames = 20s) implies 10ms per frame.
 */
|
|
46
|
+
interface VADConfig {
|
|
47
|
+
/** Speech probability threshold (default: 0.4) */
|
|
48
|
+
speechThreshold?: number;
|
|
49
|
+
/** Smoothing window size, in frames (default: 5) */
|
|
50
|
+
smoothWindowSize?: number;
|
|
51
|
+
/** Minimum speech segment length, in frames (default: 20) */
|
|
52
|
+
minSpeechFrames?: number;
|
|
53
|
+
/** Maximum speech segment length, in frames, before the segment is split (default: 2000 = 20s) */
|
|
54
|
+
maxSpeechFrames?: number;
|
|
55
|
+
/** Minimum silence segment length, in frames, used by the state machine (default: 20) */
|
|
56
|
+
minSilenceFrames?: number;
|
|
57
|
+
/** Merge silence segments shorter than this many frames (default: 0 = disabled) */
|
|
58
|
+
mergeSilenceFrames?: number;
|
|
59
|
+
/** Extend speech segments by this many frames on each side (default: 0) */
|
|
60
|
+
extendSpeechFrames?: number;
|
|
61
|
+
}
|
|
62
|
+
/**
 * Configuration for Audio Event Detection, accepted by `OmniAED.create()`.
 * Extends VADConfig, so every VADConfig field is available here as well.
 */
|
|
63
|
+
interface AEDConfig extends VADConfig {
|
|
64
|
+
/** Singing probability threshold (default: 0.5) */
|
|
65
|
+
singingThreshold?: number;
|
|
66
|
+
/** Music probability threshold (default: 0.5) */
|
|
67
|
+
musicThreshold?: number;
|
|
68
|
+
}
|
|
69
|
+
/** Configuration for streaming VAD, accepted by `OmniStreamVAD.create()` */
|
|
70
|
+
interface StreamVADConfig {
|
|
71
|
+
/** Speech probability threshold (default: 0.5; VADConfig documents 0.4 for its own threshold) */
|
|
72
|
+
speechThreshold?: number;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
/**
|
|
76
|
+
* Non-streaming Voice Activity Detection (WASM/ncnn backend).
|
|
77
|
+
*
|
|
78
|
+
* Audio format:
|
|
79
|
+
* - Int16Array: raw 16-bit PCM, converted to normalized float internally
|
|
80
|
+
* - Float32Array in [-1.0, 1.0]: normalized audio (Web Audio API format)
*
* The constructor is private; obtain instances through the async `create()` factory.
* The class is also exported under the alias `FireRedVAD`.
|
|
81
|
+
*/
|
|
82
|
+
|
|
83
|
+
declare class OmniVAD {
|
|
84
|
+
/** Internal state; NOTE(review): presumably the native-side handle released by dispose() — confirm */
private handle;
|
|
85
|
+
/** Internal state; NOTE(review): presumably the options passed to create() — confirm */
private config;
|
|
86
|
+
private constructor();
|
|
87
|
+
/**
|
|
88
|
+
* Create a new OmniVAD instance.
|
|
89
|
+
* Initializes WASM and loads the bundled ncnn model.
*
* @param options - Optional VADConfig; defaults are documented on each VADConfig field.
* @returns Promise resolving to the new instance.
|
|
90
|
+
*/
|
|
91
|
+
static create(options?: VADConfig): Promise<OmniVAD>;
|
|
92
|
+
/**
|
|
93
|
+
* Detect speech segments in audio.
|
|
94
|
+
*
|
|
95
|
+
* Accepts Int16Array (PCM) or normalized Float32Array in [-1, 1].
*
* @param audio - Samples in one of the two accepted formats.
* @returns The audio duration and the detected [start, end] timestamps (see VADResult).
|
|
96
|
+
*/
|
|
97
|
+
detect(audio: Float32Array | Int16Array): VADResult;
|
|
98
|
+
/** Release native resources. */
|
|
99
|
+
dispose(): void;
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
/**
|
|
103
|
+
* Streaming Voice Activity Detection (WASM/ncnn backend).
|
|
104
|
+
* Processes audio frame-by-frame (10ms chunks of 160 samples @ 16kHz).
*
* The constructor is private; obtain instances through the async `create()` factory.
* The class is also exported under the alias `FireRedStreamVAD`.
|
|
105
|
+
*/
|
|
106
|
+
|
|
107
|
+
declare class OmniStreamVAD {
|
|
108
|
+
/** Internal state; NOTE(review): presumably the native-side handle released by dispose() — confirm */
private handle;
|
|
109
|
+
/** Internal state; NOTE(review): presumably tracks whether a speech segment is currently open — confirm */
private inSpeech;
|
|
110
|
+
/** Internal state; NOTE(review): presumably the start frame of the open speech segment — confirm */
private speechStartFrame;
|
|
111
|
+
private constructor();
|
|
112
|
+
/**
|
|
113
|
+
* Create a new OmniStreamVAD instance.
|
|
114
|
+
* Initializes WASM and loads the bundled ncnn model.
*
* @param options - Optional StreamVADConfig (speech threshold).
* @returns Promise resolving to the new instance.
|
|
115
|
+
*/
|
|
116
|
+
static create(options?: StreamVADConfig): Promise<OmniStreamVAD>;
|
|
117
|
+
/**
|
|
118
|
+
* Process one frame of audio (160 int16 samples = 10ms @ 16kHz).
|
|
119
|
+
* Returns null until enough audio is accumulated.
*
* @param pcm160 - One frame of 160 int16 PCM samples.
* @returns The per-frame result, or null while no frame can be emitted yet.
|
|
120
|
+
*/
|
|
121
|
+
processFrame(pcm160: Int16Array): StreamVADFrameResult | null;
|
|
122
|
+
/**
|
|
123
|
+
* Process entire audio at once and return per-frame probabilities.
|
|
124
|
+
* @param audio - Float32Array in [-1, 1] or Int16Array of 16kHz mono PCM
* @returns Per-frame probabilities, the frame count and the audio duration (see StreamVADFullResult).
|
|
125
|
+
*/
|
|
126
|
+
detectFull(audio: Float32Array | Int16Array): StreamVADFullResult;
|
|
127
|
+
/** Reset all internal state. */
|
|
128
|
+
reset(): void;
|
|
129
|
+
/** Release native resources. */
|
|
130
|
+
dispose(): void;
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
/**
|
|
134
|
+
* Audio Event Detection: speech, singing, music (WASM/ncnn backend).
|
|
135
|
+
*
|
|
136
|
+
* Audio format: same as OmniVAD — Int16Array or normalized Float32Array [-1, 1].
*
* The constructor is private; obtain instances through the async `create()` factory.
* The class is also exported under the alias `FireRedAED`.
|
|
137
|
+
*/
|
|
138
|
+
|
|
139
|
+
declare class OmniAED {
|
|
140
|
+
/** Internal state; NOTE(review): presumably the native-side handle released by dispose() — confirm */
private handle;
|
|
141
|
+
/** Internal state; NOTE(review): presumably the options passed to create() — confirm */
private config;
|
|
142
|
+
private constructor();
|
|
143
|
+
/**
|
|
144
|
+
* Create a new OmniAED instance.
|
|
145
|
+
* Initializes WASM and loads the bundled ncnn model.
*
* @param options - Optional AEDConfig (VADConfig fields plus singing/music thresholds).
* @returns Promise resolving to the new instance.
|
|
146
|
+
*/
|
|
147
|
+
static create(options?: AEDConfig): Promise<OmniAED>;
|
|
148
|
+
/**
|
|
149
|
+
* Detect audio events (speech, singing, music).
|
|
150
|
+
*
|
|
151
|
+
* Accepts Int16Array (PCM) or normalized Float32Array in [-1, 1].
*
* @param audio - Samples in one of the two accepted formats.
* @returns The audio duration plus per-type events and coverage ratios (see AEDResult).
|
|
152
|
+
*/
|
|
153
|
+
detect(audio: Float32Array | Int16Array): AEDResult;
|
|
154
|
+
/** Release native resources. */
|
|
155
|
+
dispose(): void;
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
/**
|
|
159
|
+
* Low-level WASM binding for omnivad C API.
|
|
160
|
+
* Loads the Emscripten module and provides typed wrappers.
*
* NOTE(review): this text reads like a module-level description rather than a
* description of the alias below — confirm. The alias itself is `any`, so the
* module object returned by `initWasm()` is not type-checked.
|
|
161
|
+
*/
|
|
162
|
+
type EmscriptenModule = any;
|
|
163
|
+
/**
|
|
164
|
+
* Initialize the WASM module. Call once before using any other functions.
|
|
165
|
+
* Safe to call multiple times (returns cached module).
*
* @param wasmLocator - Optional callback that receives a filename and returns a string.
*   NOTE(review): presumably the URL or path the named WASM asset is loaded from — confirm.
* @returns Promise resolving to the Emscripten module (typed as `any`).
|
|
166
|
+
*/
|
|
167
|
+
declare function initWasm(wasmLocator?: (filename: string) => string): Promise<EmscriptenModule>;
|
|
168
|
+
|
|
169
|
+
/** Public API. FireRedAED, FireRedStreamVAD and FireRedVAD are aliases of OmniAED, OmniStreamVAD and OmniVAD. */
export { type AEDConfig, type AEDResult, OmniAED as FireRedAED, OmniStreamVAD as FireRedStreamVAD, OmniVAD as FireRedVAD, OmniAED, OmniStreamVAD, OmniVAD, type StreamVADConfig, type StreamVADFrameResult, type StreamVADFullResult, type VADConfig, type VADResult, initWasm };
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
/**
 * Result from non-streaming VAD detection.
 * This is the return type of `OmniVAD.detect()`.
 */
|
|
2
|
+
interface VADResult {
|
|
3
|
+
/** Duration of the input audio, in seconds */
|
|
4
|
+
duration: number;
|
|
5
|
+
/** Detected segments as [start, end] timestamp pairs, both values in seconds */
|
|
6
|
+
timestamps: [number, number][];
|
|
7
|
+
}
|
|
8
|
+
/**
 * Result from Audio Event Detection (3-class).
 * This is the return type of `OmniAED.detect()`.
 */
|
|
9
|
+
interface AEDResult {
|
|
10
|
+
/** Duration of the input audio, in seconds */
|
|
11
|
+
duration: number;
|
|
12
|
+
/**
 * Events keyed by type ("speech", "singing", "music") with timestamp pairs.
 * Keys are typed as plain `string`, so the type itself does not restrict them.
 * NOTE(review): pairs are presumably [start, end] in seconds, as in VADResult — confirm.
 */
|
|
13
|
+
events: Record<string, [number, number][]>;
|
|
14
|
+
/** Detected duration coverage ratio for each event type, keyed by event type */
|
|
15
|
+
ratios: Record<string, number>;
|
|
16
|
+
}
|
|
17
|
+
/**
 * Per-frame result from streaming VAD.
 * `OmniStreamVAD.processFrame()` returns this type, or null until enough audio is accumulated.
 */
|
|
18
|
+
interface StreamVADFrameResult {
|
|
19
|
+
/** Raw probability from model output */
|
|
20
|
+
confidence: number;
|
|
21
|
+
/** Currently identical to `confidence`; reserved for future smoothing */
|
|
22
|
+
smoothedConfidence: number;
|
|
23
|
+
/** Whether the current frame is classified as speech */
|
|
24
|
+
isSpeech: boolean;
|
|
25
|
+
/** 1-based frame index of the emitted frame */
|
|
26
|
+
frameIndex: number;
|
|
27
|
+
/** True when speech becomes active at this frame */
|
|
28
|
+
isSpeechStart: boolean;
|
|
29
|
+
/** True when speech ended on the previous frame, i.e. this result reports the end of a segment */
|
|
30
|
+
isSpeechEnd: boolean;
|
|
31
|
+
/** Start frame of the active or just-finished speech segment */
|
|
32
|
+
speechStartFrame: number;
|
|
33
|
+
/** End frame of the just-finished speech segment, or 0 if no segment is ending */
|
|
34
|
+
speechEndFrame: number;
|
|
35
|
+
}
|
|
36
|
+
/**
 * Full-audio streaming-model output.
 * This is the return type of `OmniStreamVAD.detectFull()`.
 */
|
|
37
|
+
interface StreamVADFullResult {
|
|
38
|
+
/** Speech probability for each emitted frame */
|
|
39
|
+
probabilities: Float32Array;
|
|
40
|
+
/** Number of emitted frames (NOTE(review): presumably equals `probabilities.length` — confirm) */
|
|
41
|
+
numFrames: number;
|
|
42
|
+
/** Duration of the input audio, in seconds */
|
|
43
|
+
duration: number;
|
|
44
|
+
}
|
|
45
|
+
/**
 * Configuration for non-streaming VAD, accepted by `OmniVAD.create()`.
 * Every field is optional. Lengths are counted in frames; the `maxSpeechFrames`
 * note (2000 frames = 20s) implies 10ms per frame.
 */
|
|
46
|
+
interface VADConfig {
|
|
47
|
+
/** Speech probability threshold (default: 0.4) */
|
|
48
|
+
speechThreshold?: number;
|
|
49
|
+
/** Smoothing window size, in frames (default: 5) */
|
|
50
|
+
smoothWindowSize?: number;
|
|
51
|
+
/** Minimum speech segment length, in frames (default: 20) */
|
|
52
|
+
minSpeechFrames?: number;
|
|
53
|
+
/** Maximum speech segment length, in frames, before the segment is split (default: 2000 = 20s) */
|
|
54
|
+
maxSpeechFrames?: number;
|
|
55
|
+
/** Minimum silence segment length, in frames, used by the state machine (default: 20) */
|
|
56
|
+
minSilenceFrames?: number;
|
|
57
|
+
/** Merge silence segments shorter than this many frames (default: 0 = disabled) */
|
|
58
|
+
mergeSilenceFrames?: number;
|
|
59
|
+
/** Extend speech segments by this many frames on each side (default: 0) */
|
|
60
|
+
extendSpeechFrames?: number;
|
|
61
|
+
}
|
|
62
|
+
/**
 * Configuration for Audio Event Detection, accepted by `OmniAED.create()`.
 * Extends VADConfig, so every VADConfig field is available here as well.
 */
|
|
63
|
+
interface AEDConfig extends VADConfig {
|
|
64
|
+
/** Singing probability threshold (default: 0.5) */
|
|
65
|
+
singingThreshold?: number;
|
|
66
|
+
/** Music probability threshold (default: 0.5) */
|
|
67
|
+
musicThreshold?: number;
|
|
68
|
+
}
|
|
69
|
+
/** Configuration for streaming VAD, accepted by `OmniStreamVAD.create()` */
|
|
70
|
+
interface StreamVADConfig {
|
|
71
|
+
/** Speech probability threshold (default: 0.5; VADConfig documents 0.4 for its own threshold) */
|
|
72
|
+
speechThreshold?: number;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
/**
|
|
76
|
+
* Non-streaming Voice Activity Detection (WASM/ncnn backend).
|
|
77
|
+
*
|
|
78
|
+
* Audio format:
|
|
79
|
+
* - Int16Array: raw 16-bit PCM, converted to normalized float internally
|
|
80
|
+
* - Float32Array in [-1.0, 1.0]: normalized audio (Web Audio API format)
*
* The constructor is private; obtain instances through the async `create()` factory.
* The class is also exported under the alias `FireRedVAD`.
|
|
81
|
+
*/
|
|
82
|
+
|
|
83
|
+
declare class OmniVAD {
|
|
84
|
+
/** Internal state; NOTE(review): presumably the native-side handle released by dispose() — confirm */
private handle;
|
|
85
|
+
/** Internal state; NOTE(review): presumably the options passed to create() — confirm */
private config;
|
|
86
|
+
private constructor();
|
|
87
|
+
/**
|
|
88
|
+
* Create a new OmniVAD instance.
|
|
89
|
+
* Initializes WASM and loads the bundled ncnn model.
*
* @param options - Optional VADConfig; defaults are documented on each VADConfig field.
* @returns Promise resolving to the new instance.
|
|
90
|
+
*/
|
|
91
|
+
static create(options?: VADConfig): Promise<OmniVAD>;
|
|
92
|
+
/**
|
|
93
|
+
* Detect speech segments in audio.
|
|
94
|
+
*
|
|
95
|
+
* Accepts Int16Array (PCM) or normalized Float32Array in [-1, 1].
*
* @param audio - Samples in one of the two accepted formats.
* @returns The audio duration and the detected [start, end] timestamps (see VADResult).
|
|
96
|
+
*/
|
|
97
|
+
detect(audio: Float32Array | Int16Array): VADResult;
|
|
98
|
+
/** Release native resources. */
|
|
99
|
+
dispose(): void;
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
/**
|
|
103
|
+
* Streaming Voice Activity Detection (WASM/ncnn backend).
|
|
104
|
+
* Processes audio frame-by-frame (10ms chunks of 160 samples @ 16kHz).
*
* The constructor is private; obtain instances through the async `create()` factory.
* The class is also exported under the alias `FireRedStreamVAD`.
|
|
105
|
+
*/
|
|
106
|
+
|
|
107
|
+
declare class OmniStreamVAD {
|
|
108
|
+
/** Internal state; NOTE(review): presumably the native-side handle released by dispose() — confirm */
private handle;
|
|
109
|
+
/** Internal state; NOTE(review): presumably tracks whether a speech segment is currently open — confirm */
private inSpeech;
|
|
110
|
+
/** Internal state; NOTE(review): presumably the start frame of the open speech segment — confirm */
private speechStartFrame;
|
|
111
|
+
private constructor();
|
|
112
|
+
/**
|
|
113
|
+
* Create a new OmniStreamVAD instance.
|
|
114
|
+
* Initializes WASM and loads the bundled ncnn model.
*
* @param options - Optional StreamVADConfig (speech threshold).
* @returns Promise resolving to the new instance.
|
|
115
|
+
*/
|
|
116
|
+
static create(options?: StreamVADConfig): Promise<OmniStreamVAD>;
|
|
117
|
+
/**
|
|
118
|
+
* Process one frame of audio (160 int16 samples = 10ms @ 16kHz).
|
|
119
|
+
* Returns null until enough audio is accumulated.
*
* @param pcm160 - One frame of 160 int16 PCM samples.
* @returns The per-frame result, or null while no frame can be emitted yet.
|
|
120
|
+
*/
|
|
121
|
+
processFrame(pcm160: Int16Array): StreamVADFrameResult | null;
|
|
122
|
+
/**
|
|
123
|
+
* Process entire audio at once and return per-frame probabilities.
|
|
124
|
+
* @param audio - Float32Array in [-1, 1] or Int16Array of 16kHz mono PCM
* @returns Per-frame probabilities, the frame count and the audio duration (see StreamVADFullResult).
|
|
125
|
+
*/
|
|
126
|
+
detectFull(audio: Float32Array | Int16Array): StreamVADFullResult;
|
|
127
|
+
/** Reset all internal state. */
|
|
128
|
+
reset(): void;
|
|
129
|
+
/** Release native resources. */
|
|
130
|
+
dispose(): void;
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
/**
|
|
134
|
+
* Audio Event Detection: speech, singing, music (WASM/ncnn backend).
|
|
135
|
+
*
|
|
136
|
+
* Audio format: same as OmniVAD — Int16Array or normalized Float32Array [-1, 1].
*
* The constructor is private; obtain instances through the async `create()` factory.
* The class is also exported under the alias `FireRedAED`.
|
|
137
|
+
*/
|
|
138
|
+
|
|
139
|
+
declare class OmniAED {
|
|
140
|
+
/** Internal state; NOTE(review): presumably the native-side handle released by dispose() — confirm */
private handle;
|
|
141
|
+
/** Internal state; NOTE(review): presumably the options passed to create() — confirm */
private config;
|
|
142
|
+
private constructor();
|
|
143
|
+
/**
|
|
144
|
+
* Create a new OmniAED instance.
|
|
145
|
+
* Initializes WASM and loads the bundled ncnn model.
*
* @param options - Optional AEDConfig (VADConfig fields plus singing/music thresholds).
* @returns Promise resolving to the new instance.
|
|
146
|
+
*/
|
|
147
|
+
static create(options?: AEDConfig): Promise<OmniAED>;
|
|
148
|
+
/**
|
|
149
|
+
* Detect audio events (speech, singing, music).
|
|
150
|
+
*
|
|
151
|
+
* Accepts Int16Array (PCM) or normalized Float32Array in [-1, 1].
*
* @param audio - Samples in one of the two accepted formats.
* @returns The audio duration plus per-type events and coverage ratios (see AEDResult).
|
|
152
|
+
*/
|
|
153
|
+
detect(audio: Float32Array | Int16Array): AEDResult;
|
|
154
|
+
/** Release native resources. */
|
|
155
|
+
dispose(): void;
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
/**
|
|
159
|
+
* Low-level WASM binding for omnivad C API.
|
|
160
|
+
* Loads the Emscripten module and provides typed wrappers.
*
* NOTE(review): this text reads like a module-level description rather than a
* description of the alias below — confirm. The alias itself is `any`, so the
* module object returned by `initWasm()` is not type-checked.
|
|
161
|
+
*/
|
|
162
|
+
type EmscriptenModule = any;
|
|
163
|
+
/**
|
|
164
|
+
* Initialize the WASM module. Call once before using any other functions.
|
|
165
|
+
* Safe to call multiple times (returns cached module).
*
* @param wasmLocator - Optional callback that receives a filename and returns a string.
*   NOTE(review): presumably the URL or path the named WASM asset is loaded from — confirm.
* @returns Promise resolving to the Emscripten module (typed as `any`).
|
|
166
|
+
*/
|
|
167
|
+
declare function initWasm(wasmLocator?: (filename: string) => string): Promise<EmscriptenModule>;
|
|
168
|
+
|
|
169
|
+
/** Public API. FireRedAED, FireRedStreamVAD and FireRedVAD are aliases of OmniAED, OmniStreamVAD and OmniVAD. */
export { type AEDConfig, type AEDResult, OmniAED as FireRedAED, OmniStreamVAD as FireRedStreamVAD, OmniVAD as FireRedVAD, OmniAED, OmniStreamVAD, OmniVAD, type StreamVADConfig, type StreamVADFrameResult, type StreamVADFullResult, type VADConfig, type VADResult, initWasm };
|