@guidekit/vad 0.1.0-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,122 @@
1
+ declare const VAD_VERSION = "0.1.0";
2
+ /** Silero VAD frame size: 512 samples at 16 kHz = 32 ms per frame. */
3
+ declare const FRAME_SIZE = 512;
4
+ /** Target sample rate for VAD processing. */
5
+ declare const TARGET_SAMPLE_RATE = 16000;
6
+ interface VADOptions {
7
+ /** Speech probability threshold (0-1). Default: 0.5 */
8
+ threshold?: number;
9
+ /** Minimum speech duration in ms to trigger start. Default: 300 */
10
+ minSpeechDurationMs?: number;
11
+ /** Silence duration in ms after speech to trigger end. Default: 500 */
12
+ silenceDurationMs?: number;
13
+ /** Sample rate in Hz. Default: 16000 */
14
+ sampleRate?: number;
15
+ /** Enable debug logging. Default: false */
16
+ debug?: boolean;
17
+ /** Custom URL for the Silero VAD ONNX model file. */
18
+ modelUrl?: string;
19
+ }
20
+ interface VADEvent {
21
+ type: 'speech-start' | 'speech-end' | 'vad-ready';
22
+ timestamp: number;
23
+ /** Speech probability (0-1) at the moment of the event. */
24
+ probability?: number;
25
+ }
26
+ type VADCallback = (event: VADEvent) => void;
27
+ declare class SileroVAD {
28
+ private readonly _threshold;
29
+ private readonly _minSpeechDurationMs;
30
+ private readonly _silenceDurationMs;
31
+ private readonly _sampleRate;
32
+ private readonly _debug;
33
+ private readonly _modelUrl;
34
+ private _session;
35
+ private _h;
36
+ private _c;
37
+ private _audioContext;
38
+ private _ownsAudioContext;
39
+ private _sourceNode;
40
+ private _workletNode;
41
+ private _stream;
42
+ private _frameBuffer;
43
+ private _frameBufferOffset;
44
+ private _isReady;
45
+ private _isSpeaking;
46
+ private _isStarted;
47
+ private _isDestroyed;
48
+ private _consecutiveSpeechFrames;
49
+ private _consecutiveSilenceFrames;
50
+ private _frameDurationMs;
51
+ private _minSpeechFrames;
52
+ private _silenceFrames;
53
+ private _isCalibrating;
54
+ private _calibrationSamples;
55
+ private _calibrationFramesNeeded;
56
+ private _calibratedThreshold;
57
+ private _listeners;
58
+ private _processingPromise;
59
+ constructor(options?: VADOptions);
60
+ /** Load the ONNX model. Uses Cache API for persistence across sessions. */
61
+ init(): Promise<void>;
62
+ /**
63
+ * Process a single audio frame (512 samples at 16 kHz).
64
+ * Returns the speech probability (0-1).
65
+ */
66
+ processFrame(audioData: Float32Array): Promise<number>;
67
+ /** Start VAD processing on a MediaStream (typically from getUserMedia). */
68
+ start(stream: MediaStream): void;
69
+ /** Stop VAD processing and release audio resources (but keep the model). */
70
+ stop(): void;
71
+ /** Register a callback for speech-start events. Returns an unsubscribe function. */
72
+ onSpeechStart(callback: VADCallback): () => void;
73
+ /** Register a callback for speech-end events. Returns an unsubscribe function. */
74
+ onSpeechEnd(callback: VADCallback): () => void;
75
+ /** Register a callback for vad-ready events. Returns an unsubscribe function. */
76
+ onReady(callback: VADCallback): () => void;
77
+ /** Whether the ONNX model is loaded and ready. */
78
+ get isReady(): boolean;
79
+ /** Whether speech is currently detected. */
80
+ get isSpeaking(): boolean;
81
+ /** The MediaStream currently being processed, or null. */
82
+ get stream(): MediaStream | null;
83
+ /** Release ONNX model session and all audio resources. */
84
+ destroy(): Promise<void>;
85
+ private _on;
86
+ private _emit;
87
+ private _setupAudioPipeline;
88
+ /**
89
+ * ScriptProcessorNode fallback (works everywhere, including Safari).
90
+ * We use a buffer size of 4096 samples, which gives ~85 ms of audio at 48 kHz.
91
+ */
92
+ private _setupScriptProcessor;
93
+ /**
94
+ * Accumulate resampled audio into FRAME_SIZE chunks and process each full frame.
95
+ */
96
+ private _feedAudio;
97
+ /**
98
+ * Process a single FRAME_SIZE frame: run inference and update speech state.
99
+ */
100
+ private _handleFrame;
101
+ private _finishCalibration;
102
+ private _teardownAudioPipeline;
103
+ /** Reset the LSTM hidden and cell states to zeros. */
104
+ private _resetStates;
105
+ private _log;
106
+ }
107
+ /**
108
+ * Create and initialise a SileroVAD instance in one call.
109
+ *
110
+ * ```ts
111
+ * const vad = await createVAD({ debug: true });
112
+ * vad.onSpeechStart(() => console.log('speaking'));
113
+ * vad.start(stream);
114
+ * ```
115
+ */
116
+ declare function createVAD(options?: VADOptions): Promise<SileroVAD>;
117
+
118
+ declare global {
119
+ var webkitAudioContext: typeof AudioContext | undefined;
120
+ }
121
+
122
+ export { FRAME_SIZE, SileroVAD, TARGET_SAMPLE_RATE, type VADEvent, type VADOptions, VAD_VERSION, createVAD };
@@ -0,0 +1,122 @@
1
+ declare const VAD_VERSION = "0.1.0";
2
+ /** Silero VAD frame size: 512 samples at 16 kHz = 32 ms per frame. */
3
+ declare const FRAME_SIZE = 512;
4
+ /** Target sample rate for VAD processing. */
5
+ declare const TARGET_SAMPLE_RATE = 16000;
6
+ interface VADOptions {
7
+ /** Speech probability threshold (0-1). Default: 0.5 */
8
+ threshold?: number;
9
+ /** Minimum speech duration in ms to trigger start. Default: 300 */
10
+ minSpeechDurationMs?: number;
11
+ /** Silence duration in ms after speech to trigger end. Default: 500 */
12
+ silenceDurationMs?: number;
13
+ /** Sample rate in Hz. Default: 16000 */
14
+ sampleRate?: number;
15
+ /** Enable debug logging. Default: false */
16
+ debug?: boolean;
17
+ /** Custom URL for the Silero VAD ONNX model file. */
18
+ modelUrl?: string;
19
+ }
20
+ interface VADEvent {
21
+ type: 'speech-start' | 'speech-end' | 'vad-ready';
22
+ timestamp: number;
23
+ /** Speech probability (0-1) at the moment of the event. */
24
+ probability?: number;
25
+ }
26
+ type VADCallback = (event: VADEvent) => void;
27
+ declare class SileroVAD {
28
+ private readonly _threshold;
29
+ private readonly _minSpeechDurationMs;
30
+ private readonly _silenceDurationMs;
31
+ private readonly _sampleRate;
32
+ private readonly _debug;
33
+ private readonly _modelUrl;
34
+ private _session;
35
+ private _h;
36
+ private _c;
37
+ private _audioContext;
38
+ private _ownsAudioContext;
39
+ private _sourceNode;
40
+ private _workletNode;
41
+ private _stream;
42
+ private _frameBuffer;
43
+ private _frameBufferOffset;
44
+ private _isReady;
45
+ private _isSpeaking;
46
+ private _isStarted;
47
+ private _isDestroyed;
48
+ private _consecutiveSpeechFrames;
49
+ private _consecutiveSilenceFrames;
50
+ private _frameDurationMs;
51
+ private _minSpeechFrames;
52
+ private _silenceFrames;
53
+ private _isCalibrating;
54
+ private _calibrationSamples;
55
+ private _calibrationFramesNeeded;
56
+ private _calibratedThreshold;
57
+ private _listeners;
58
+ private _processingPromise;
59
+ constructor(options?: VADOptions);
60
+ /** Load the ONNX model. Uses Cache API for persistence across sessions. */
61
+ init(): Promise<void>;
62
+ /**
63
+ * Process a single audio frame (512 samples at 16 kHz).
64
+ * Returns the speech probability (0-1).
65
+ */
66
+ processFrame(audioData: Float32Array): Promise<number>;
67
+ /** Start VAD processing on a MediaStream (typically from getUserMedia). */
68
+ start(stream: MediaStream): void;
69
+ /** Stop VAD processing and release audio resources (but keep the model). */
70
+ stop(): void;
71
+ /** Register a callback for speech-start events. Returns an unsubscribe function. */
72
+ onSpeechStart(callback: VADCallback): () => void;
73
+ /** Register a callback for speech-end events. Returns an unsubscribe function. */
74
+ onSpeechEnd(callback: VADCallback): () => void;
75
+ /** Register a callback for vad-ready events. Returns an unsubscribe function. */
76
+ onReady(callback: VADCallback): () => void;
77
+ /** Whether the ONNX model is loaded and ready. */
78
+ get isReady(): boolean;
79
+ /** Whether speech is currently detected. */
80
+ get isSpeaking(): boolean;
81
+ /** The MediaStream currently being processed, or null. */
82
+ get stream(): MediaStream | null;
83
+ /** Release ONNX model session and all audio resources. */
84
+ destroy(): Promise<void>;
85
+ private _on;
86
+ private _emit;
87
+ private _setupAudioPipeline;
88
+ /**
89
+ * ScriptProcessorNode fallback (works everywhere, including Safari).
90
+ * We use a buffer size of 4096 samples, which gives ~85 ms of audio at 48 kHz.
91
+ */
92
+ private _setupScriptProcessor;
93
+ /**
94
+ * Accumulate resampled audio into FRAME_SIZE chunks and process each full frame.
95
+ */
96
+ private _feedAudio;
97
+ /**
98
+ * Process a single FRAME_SIZE frame: run inference and update speech state.
99
+ */
100
+ private _handleFrame;
101
+ private _finishCalibration;
102
+ private _teardownAudioPipeline;
103
+ /** Reset the LSTM hidden and cell states to zeros. */
104
+ private _resetStates;
105
+ private _log;
106
+ }
107
+ /**
108
+ * Create and initialise a SileroVAD instance in one call.
109
+ *
110
+ * ```ts
111
+ * const vad = await createVAD({ debug: true });
112
+ * vad.onSpeechStart(() => console.log('speaking'));
113
+ * vad.start(stream);
114
+ * ```
115
+ */
116
+ declare function createVAD(options?: VADOptions): Promise<SileroVAD>;
117
+
118
+ declare global {
119
+ var webkitAudioContext: typeof AudioContext | undefined;
120
+ }
121
+
122
+ export { FRAME_SIZE, SileroVAD, TARGET_SAMPLE_RATE, type VADEvent, type VADOptions, VAD_VERSION, createVAD };