streaming-sortformer-node 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,253 @@
1
+ /**
2
+ * TypeScript wrapper for the native SortFormer speaker diarization model
3
+ */
4
+
5
+ import type { LoadOptions, DiarizeOptions, DiarizeResult, StreamingSessionOptions, StreamingPreset } from './types.js';
6
+ import { LATENCY_PRESETS, OFFLINE_PARAMS } from './presets.js';
7
+ import { getBinding } from './binding.js';
8
+ import { StreamingSession } from './StreamingSession.js';
9
+
10
/**
 * SortFormer speaker diarization model wrapper
 *
 * Holds a native `SortformerModel` handle obtained from the platform binding
 * and exposes three operations on it: one-shot diarization of a complete
 * audio buffer (`diarize`), creation of incremental streaming sessions
 * (`createStreamingSession`), and explicit release of the handle (`close`).
 *
 * @example
 * ```typescript
 * const model = await Sortformer.load('./model.gguf', { threads: 4 });
 * const result = await model.diarize(audioData, { mode: 'streaming', latency: '2s' });
 * console.log(result.rttm);
 * model.close();
 * ```
 */
export class Sortformer {
  private native: any;
  private closed: boolean = false;

  /**
   * Private constructor - use the static load() method instead
   * @param native - Native SortformerModel instance from binding
   */
  private constructor(native: any) {
    this.native = native;
  }

  /**
   * True only when `key` is a string naming an OWN property of `table`.
   *
   * Plain `table[key]` also resolves inherited members such as
   * `constructor` or `toString`, which would let an invalid preset name
   * (possible from untyped JavaScript callers) slip past an
   * "is it defined?" check.
   */
  private static hasOwnKey(table: object, key: unknown): boolean {
    return typeof key === 'string' && Object.prototype.hasOwnProperty.call(table, key);
  }

  /**
   * Load a SortFormer model from a GGUF file
   *
   * @param modelPath - Path to the GGUF model file
   * @param options - Optional loading configuration; `threads` defaults to 4
   * @returns Promise resolving to a loaded Sortformer instance
   * @throws TypeError if modelPath is not a non-empty string
   * @throws Error if threads is not a positive integer, or if the native
   *   binding cannot be obtained or fails to construct the model
   *
   * @example
   * ```typescript
   * const model = await Sortformer.load('./model.gguf', { threads: 8 });
   * ```
   */
  static async load(modelPath: string, options?: LoadOptions): Promise<Sortformer> {
    if (!modelPath || typeof modelPath !== 'string') {
      throw new TypeError('modelPath must be a non-empty string');
    }

    const binding = getBinding();

    // Default to 4 threads if not specified
    const threads = options?.threads ?? 4;
    if (threads < 1 || !Number.isInteger(threads)) {
      throw new Error('threads must be a positive integer');
    }

    const native = new binding.SortformerModel(modelPath, threads);
    return new Sortformer(native);
  }

  /**
   * Run diarization inference on audio samples
   *
   * All arguments are validated before the native call is made, and the
   * native result is validated before it is returned.
   *
   * @param audio - Audio samples as Float32Array (16kHz mono)
   * @param options - Optional diarization configuration. Defaults:
   *   mode 'offline', latency '2s' (streaming only), threshold 0.5,
   *   medianFilter 11.
   * @returns Promise resolving to diarization results (RTTM + predictions)
   * @throws TypeError if audio is not a Float32Array
   * @throws Error if the model is closed, audio is empty, an option is out
   *   of range (including a NaN threshold), the mode or latency preset is
   *   unknown, or the native result is malformed
   *
   * @example
   * ```typescript
   * const result = await model.diarize(audioData, {
   *   mode: 'streaming',
   *   latency: '2s',
   *   threshold: 0.5,
   *   medianFilter: 11
   * });
   * ```
   */
  async diarize(audio: Float32Array, options?: DiarizeOptions): Promise<DiarizeResult> {
    if (this.closed) {
      throw new Error('Model is closed. Cannot perform diarization.');
    }

    if (!(audio instanceof Float32Array)) {
      throw new TypeError('audio must be a Float32Array');
    }
    if (audio.length === 0) {
      throw new Error('audio cannot be empty');
    }

    const threshold = options?.threshold;
    if (
      threshold !== undefined &&
      // NaN must be rejected explicitly: both `NaN < 0` and `NaN > 1` are false.
      (typeof threshold !== 'number' || Number.isNaN(threshold) || threshold < 0 || threshold > 1)
    ) {
      throw new Error('threshold must be a number between 0 and 1');
    }

    const medianFilter = options?.medianFilter;
    if (
      medianFilter !== undefined &&
      (!Number.isInteger(medianFilter) || medianFilter < 1 || medianFilter % 2 === 0)
    ) {
      throw new Error('medianFilter must be a positive odd integer');
    }

    // Resolve the chunking parameters for the requested mode
    const mode = options?.mode ?? 'offline';
    let params: typeof OFFLINE_PARAMS;
    if (mode === 'streaming') {
      const latency = options?.latency ?? '2s';
      if (!Sortformer.hasOwnKey(LATENCY_PRESETS, latency)) {
        throw new Error(`Unknown latency preset: ${latency}`);
      }
      params = LATENCY_PRESETS[latency];
    } else if (mode === 'offline') {
      params = OFFLINE_PARAMS;
    } else {
      throw new Error(`Unknown diarization mode: ${mode}`);
    }

    // Map user-friendly options to the native option object
    const nativeOptions = {
      threshold: threshold ?? 0.5,
      medianFilter: medianFilter ?? 11,
      chunkLen: params.chunkLen,
      rightContext: params.rightContext,
      fifoLen: params.fifoLen,
      spkcacheUpdatePeriod: params.spkcacheUpdatePeriod,
    };

    const result = await this.native.diarize(audio, nativeOptions);

    // Validate result structure before handing it to the caller
    if (!result || typeof result !== 'object') {
      throw new Error('Native diarization returned invalid result');
    }
    if (typeof result.rttm !== 'string') {
      throw new Error('Native diarization result missing rttm string');
    }
    if (!(result.predictions instanceof Float32Array)) {
      throw new Error('Native diarization result predictions must be Float32Array');
    }
    if (!Number.isInteger(result.frameCount) || result.frameCount < 0) {
      throw new Error('Native diarization result frameCount must be non-negative integer');
    }
    if (!Number.isInteger(result.speakerCount) || result.speakerCount < 1 || result.speakerCount > 4) {
      throw new Error('Native diarization result speakerCount must be 1-4');
    }

    return result as DiarizeResult;
  }

  /**
   * Close the model and free native resources
   *
   * Calls the native handle's `close()` when it provides one. After calling
   * close(), the model cannot be used for further inference.
   * Calling close() multiple times is safe (idempotent).
   *
   * @example
   * ```typescript
   * model.close();
   * ```
   */
  close(): void {
    if (this.closed) {
      return;
    }
    if (this.native && typeof this.native.close === 'function') {
      this.native.close();
    }
    this.closed = true;
  }

  /**
   * Check if the model is closed
   * @returns true if the model has been closed, false otherwise
   */
  isClosed(): boolean {
    return this.closed;
  }

  /**
   * Create a streaming session for incremental audio processing
   *
   * The streaming session maintains state (speaker cache, FIFO buffer)
   * across feed() calls, enabling true real-time diarization.
   *
   * @param options - Optional streaming configuration; `preset` defaults to '2s'
   * @returns A new StreamingSession instance
   * @throws Error if the model is closed or the preset name is unknown
   *
   * @example
   * ```typescript
   * const session = model.createStreamingSession({ preset: 'low' });
   *
   * // Feed audio chunks as they arrive
   * const result1 = session.feed(chunk1);
   * const result2 = session.feed(chunk2);
   *
   * // Accumulate predictions
   * const allPreds = [...result1.predictions, ...result2.predictions];
   *
   * session.close();
   * ```
   */
  createStreamingSession(options?: StreamingSessionOptions): StreamingSession {
    if (this.closed) {
      throw new Error('Model is closed. Cannot create streaming session.');
    }

    const preset = options?.preset ?? '2s';

    // Map preset string to the numeric enum value passed to the native session
    const presetMap: Record<StreamingPreset, number> = {
      'low': 0, // SORTFORMER_PRESET_LOW_LATENCY
      '2s': 1,  // SORTFORMER_PRESET_2S
      '3s': 2,  // SORTFORMER_PRESET_3S
      '5s': 3,  // SORTFORMER_PRESET_5S
    };

    if (!Sortformer.hasOwnKey(presetMap, preset)) {
      throw new Error(`Unknown preset: ${preset}`);
    }

    const binding = getBinding();
    const nativeSession = new binding.StreamingSession(this.native, presetMap[preset]);

    return new StreamingSession(nativeSession);
  }
}
@@ -0,0 +1,143 @@
1
+ /**
2
+ * TypeScript wrapper for native StreamingSession
3
+ */
4
+
5
+ import type { FeedResult, StreamingPreset } from './types.js';
6
+
7
/**
 * Streaming diarization session
 *
 * Thin wrapper around a native streaming-session object. Audio is pushed in
 * with feed(), any buffered remainder is drained with flush(), and state is
 * kept by the native object between calls. Results coming back from the
 * native layer are validated before being returned.
 *
 * @example
 * ```typescript
 * const session = model.createStreamingSession({ preset: '2s' });
 *
 * // Feed audio chunks as they arrive
 * const result1 = session.feed(chunk1);
 * const result2 = session.feed(chunk2);
 *
 * // Get total frames processed
 * console.log(session.totalFrames);
 *
 * // Reset for new audio stream
 * session.reset();
 *
 * // Clean up
 * session.close();
 * ```
 */
export class StreamingSession {
  private native: any;
  private _closed: boolean = false;

  /**
   * Create a new streaming session
   * @param native - Native StreamingSession instance from binding
   * @internal
   */
  constructor(native: any) {
    this.native = native;
  }

  /**
   * Feed audio samples and get predictions for this chunk
   *
   * @param audio - Audio samples as Float32Array (16kHz mono)
   * @returns Predictions for the new frames in this chunk
   * @throws TypeError if audio is not a Float32Array
   * @throws Error if the session is closed or the native result is malformed
   *
   * @example
   * ```typescript
   * const audio = new Float32Array(48000); // 3 seconds
   * const result = session.feed(audio);
   * console.log(`Got ${result.frameCount} new frames`);
   * ```
   */
  feed(audio: Float32Array): FeedResult {
    this.assertOpen();

    if (!(audio instanceof Float32Array)) {
      throw new TypeError('audio must be a Float32Array');
    }

    return StreamingSession.toFeedResult(this.native.feed(audio), 'feed');
  }

  /**
   * Flush remaining buffered audio at end of stream
   *
   * Call this when the audio stream ends to process any remaining
   * buffered audio that hasn't been output yet due to latency buffering.
   *
   * @returns Final predictions for buffered audio
   * @throws Error if the session is closed or the native result is malformed
   */
  flush(): FeedResult {
    this.assertOpen();
    return StreamingSession.toFeedResult(this.native.flush(), 'flush');
  }

  /**
   * Reset the streaming state for a new audio stream
   *
   * Delegates to the native session's reset(); the session itself stays
   * open and can be fed again afterwards.
   *
   * @throws Error if session is closed
   */
  reset(): void {
    this.assertOpen();
    this.native.reset();
  }

  /**
   * Close the session and free native resources
   *
   * Calls the native object's close() when it provides one.
   * After calling close(), the session cannot be used.
   * Calling close() multiple times is safe (idempotent).
   */
  close(): void {
    if (this._closed) {
      return;
    }
    if (this.native && typeof this.native.close === 'function') {
      this.native.close();
    }
    this._closed = true;
  }

  /**
   * Get total frames output so far
   *
   * Returns 0 once the session has been closed instead of throwing.
   */
  get totalFrames(): number {
    return this._closed ? 0 : this.native.getTotalFrames();
  }

  /**
   * Check if the session is closed
   */
  get isClosed(): boolean {
    return this._closed;
  }

  /** Shared guard for every operation that needs a live native session. */
  private assertOpen(): void {
    if (this._closed) {
      throw new Error('Session is closed');
    }
  }

  /**
   * Validate a raw native result and copy out exactly the FeedResult fields.
   *
   * Mirrors the result checks done for one-shot diarization so a misbehaving
   * native layer surfaces as a descriptive Error rather than as malformed
   * data handed to the caller.
   *
   * @param raw - Value returned by the native feed()/flush() call
   * @param op - Name of the native operation, used in error messages
   */
  private static toFeedResult(raw: unknown, op: string): FeedResult {
    if (!raw || typeof raw !== 'object') {
      throw new Error(`Native ${op} returned invalid result`);
    }

    const { predictions, frameCount } = raw as { predictions?: unknown; frameCount?: unknown };

    if (!(predictions instanceof Float32Array)) {
      throw new Error(`Native ${op} result predictions must be Float32Array`);
    }
    if (typeof frameCount !== 'number' || !Number.isInteger(frameCount) || frameCount < 0) {
      throw new Error(`Native ${op} result frameCount must be non-negative integer`);
    }

    return { predictions, frameCount };
  }
}
package/src/binding.ts ADDED
@@ -0,0 +1,41 @@
1
+ import { createRequire } from 'module';
2
+
3
+ const require = createRequire(import.meta.url);
4
+
5
+ let cachedBinding: any = null;
6
+
7
/**
 * Get the native binding for the current platform
 *
 * Chooses the platform-specific package from `process.platform` and
 * `process.arch`, loads it through the module-level `require`, and memoizes
 * the loaded module in `cachedBinding` so later calls return it directly.
 *
 * @returns The native module binding
 * @throws Error if the platform/architecture pair is not supported, or if
 *   the platform package cannot be loaded. In the latter case the underlying
 *   failure is appended to the message and attached as `cause`, so a missing
 *   package can be told apart from a module that exists but fails to load.
 */
export function getBinding(): any {
  if (cachedBinding) return cachedBinding;

  const platform = process.platform;
  const arch = process.arch;

  let packageName: string;

  if (platform === 'darwin' && arch === 'arm64') {
    packageName = '@streaming-sortformer-node/darwin-arm64';
  } else if (platform === 'darwin' && arch === 'x64') {
    packageName = '@streaming-sortformer-node/darwin-x64';
  } else {
    throw new Error(
      `Unsupported platform: ${platform}-${arch}. ` +
      `streaming-sortformer-node currently supports: darwin-arm64, darwin-x64`
    );
  }

  try {
    cachedBinding = require(packageName);
    return cachedBinding;
  } catch (e) {
    // Keep the original failure visible instead of discarding it.
    const reason = e instanceof Error ? e.message : String(e);
    const err = new Error(
      `Failed to load native binding from ${packageName}. ` +
      `Make sure the package is installed: npm install ${packageName} ` +
      `(cause: ${reason})`
    );
    // Assigned after construction so this compiles without the ES2022
    // `new Error(message, { cause })` overload.
    (err as Error & { cause?: unknown }).cause = e;
    throw err;
  }
}
package/src/index.ts ADDED
@@ -0,0 +1,13 @@
1
+ export { Sortformer } from './Sortformer.js';
2
+ export { StreamingSession } from './StreamingSession.js';
3
+ export type {
4
+ LoadOptions,
5
+ DiarizeOptions,
6
+ DiarizeResult,
7
+ LatencyPreset,
8
+ DiarizeMode,
9
+ StreamingPreset,
10
+ StreamingSessionOptions,
11
+ FeedResult,
12
+ } from './types.js';
13
+ export { LATENCY_PRESETS, OFFLINE_PARAMS } from './presets.js';
package/src/presets.ts ADDED
@@ -0,0 +1,88 @@
1
+ /**
2
+ * Latency presets for streaming diarization
3
+ * Maps preset names to their corresponding parameter configurations
4
+ */
5
+
6
+ import type { LatencyPreset } from './types';
7
+
8
/**
 * Streaming latency preset parameters
 * Each preset controls chunk processing, buffering, and speaker cache update behavior
 *
 * NOTE(review): the original field comment described chunkLen as frames at
 * "16kHz, hop=160" (10 ms). The preset names only add up if a frame is 80 ms:
 * (chunkLen + rightContext) x 80 ms gives 2.0 s, 2.96 s and 4.96 s for
 * '2s', '3s' and '5s'. Confirm the frame duration against the native library.
 */
export interface PresetParams {
  /** Chunk length in frames */
  chunkLen: number;
  /** Right context frames for conformer processing */
  rightContext: number;
  /** FIFO buffer length in frames */
  fifoLen: number;
  /** Speaker cache update period in frames */
  spkcacheUpdatePeriod: number;
}

/**
 * Streaming latency presets
 * - 'low': lowest-latency preset, smallest chunk (6 frames)
 * - '2s': ~2 second latency
 * - '3s': ~3 second latency
 * - '5s': ~5 second latency
 *
 * NOTE(review): 'low' was previously documented as "~188ms latency", but 188
 * is this preset's fifoLen (a frame count), not a duration. Using the same
 * (chunkLen + rightContext) x 80 ms rule that matches the other preset names,
 * 'low' comes out at roughly 1.04 s. Confirm against the native library.
 */
export const LATENCY_PRESETS: Record<LatencyPreset, PresetParams> = {
  'low': {
    chunkLen: 6,
    rightContext: 7,
    fifoLen: 188,
    spkcacheUpdatePeriod: 144,
  },
  '2s': {
    chunkLen: 15,
    rightContext: 10,
    fifoLen: 100,
    spkcacheUpdatePeriod: 144,
  },
  '3s': {
    chunkLen: 30,
    rightContext: 7,
    fifoLen: 100,
    spkcacheUpdatePeriod: 100,
  },
  '5s': {
    chunkLen: 55,
    rightContext: 7,
    fifoLen: 100,
    spkcacheUpdatePeriod: 100,
  },
} as const;

/**
 * Offline mode parameters
 * Used when mode='offline' to process entire audio at once
 */
export const OFFLINE_PARAMS: PresetParams = {
  chunkLen: 188,
  rightContext: 1,
  fifoLen: 0,
  spkcacheUpdatePeriod: 188,
} as const;

/**
 * Get preset parameters by name
 *
 * Only names that are own keys of LATENCY_PRESETS are accepted. A plain
 * `LATENCY_PRESETS[preset]` lookup would also resolve inherited members
 * (`constructor`, `toString`, `__proto__`, ...) and hand back something
 * that is not a PresetParams when called from untyped code.
 *
 * @param preset - Latency preset name
 * @returns The parameter object stored in LATENCY_PRESETS (same reference)
 * @throws Error if preset is not found
 */
export function getPresetParams(preset: LatencyPreset): PresetParams {
  const known =
    typeof preset === 'string' &&
    Object.prototype.hasOwnProperty.call(LATENCY_PRESETS, preset);
  if (!known) {
    throw new Error(`Unknown latency preset: ${preset}`);
  }
  return LATENCY_PRESETS[preset];
}

/**
 * Get default preset parameters for offline mode
 * @returns The OFFLINE_PARAMS object (same reference)
 */
export function getOfflineParams(): PresetParams {
  return OFFLINE_PARAMS;
}
package/src/types.ts ADDED
@@ -0,0 +1,121 @@
1
+ /**
2
+ * TypeScript type definitions for streaming-sortformer-node
3
+ */
4
+
5
/**
 * Diarization mode: offline processes entire audio at once,
 * streaming processes audio in chunks with latency control
 */
export type DiarizeMode = 'offline' | 'streaming';

/**
 * Latency preset for streaming mode
 * - 'low': lowest-latency preset
 * - '2s': ~2 second latency
 * - '3s': ~3 second latency
 * - '5s': ~5 second latency
 *
 * NOTE(review): 'low' was previously documented as "~188ms latency"; 188 is
 * the FIFO length (in frames) of that preset, not a duration. Confirm the
 * actual figure against the native library before quoting one here.
 */
export type LatencyPreset = 'low' | '2s' | '3s' | '5s';

/**
 * Options for loading a SortFormer model
 */
export interface LoadOptions {
  /**
   * Number of CPU threads to use for inference.
   * Must be a positive integer.
   * @default 4
   */
  threads?: number;
}

/**
 * Options for diarization inference
 */
export interface DiarizeOptions {
  /**
   * Diarization mode: 'offline' or 'streaming'
   * @default 'offline'
   */
  mode?: DiarizeMode;

  /**
   * Latency preset for streaming mode
   * Only used when mode='streaming'
   * @default '2s'
   */
  latency?: LatencyPreset;

  /**
   * Speaker activity threshold (0.0 to 1.0)
   * Frames with prediction >= threshold are considered active
   * @default 0.5
   */
  threshold?: number;

  /**
   * Median filter window size for smoothing predictions
   * Must be odd number >= 1
   * @default 11
   */
  medianFilter?: number;
}

/**
 * Result of diarization inference
 */
export interface DiarizeResult {
  /**
   * RTTM format output (speaker diarization segments)
   * Format: SPEAKER <filename> <channel> <start> <duration> <conf> <spk_type> <spk_id> <score>
   */
  rttm: string;

  /**
   * Raw per-frame speaker activity predictions
   * Shape: [frameCount, 4] (4 speakers max)
   * Values: 0.0 to 1.0 (probability of speaker activity)
   */
  predictions: Float32Array;

  /**
   * Number of frames in the output
   */
  frameCount: number;

  /**
   * Number of speakers detected (1-4)
   */
  speakerCount: number;
}

/**
 * Streaming preset type
 *
 * Same set of names as LatencyPreset; declared as an alias so the two
 * cannot drift apart.
 */
export type StreamingPreset = LatencyPreset;

/**
 * Options for creating a streaming session
 */
export interface StreamingSessionOptions {
  /**
   * Latency preset
   * @default '2s'
   */
  preset?: StreamingPreset;
}

/**
 * Result from feeding audio to streaming session
 */
export interface FeedResult {
  /**
   * Per-frame speaker predictions for this chunk
   * Shape: [frameCount, 4]
   */
  predictions: Float32Array;

  /**
   * Number of new frames in this result
   */
  frameCount: number;
}