streaming-sortformer-node 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +369 -0
- package/dist/Sortformer.d.ts +104 -0
- package/dist/Sortformer.d.ts.map +1 -0
- package/dist/Sortformer.js +221 -0
- package/dist/Sortformer.js.map +1 -0
- package/dist/StreamingSession.d.ts +88 -0
- package/dist/StreamingSession.d.ts.map +1 -0
- package/dist/StreamingSession.js +128 -0
- package/dist/StreamingSession.js.map +1 -0
- package/dist/binding.d.ts +8 -0
- package/dist/binding.d.ts.map +1 -0
- package/dist/binding.js +35 -0
- package/dist/binding.js.map +1 -0
- package/dist/index.d.ts +5 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +4 -0
- package/dist/index.js.map +1 -0
- package/dist/presets.d.ts +45 -0
- package/dist/presets.d.ts.map +1 -0
- package/dist/presets.js +68 -0
- package/dist/presets.js.map +1 -0
- package/dist/types.d.ts +107 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +5 -0
- package/dist/types.js.map +1 -0
- package/package.json +31 -0
- package/src/Sortformer.ts +253 -0
- package/src/StreamingSession.ts +143 -0
- package/src/binding.ts +41 -0
- package/src/index.ts +13 -0
- package/src/presets.ts +88 -0
- package/src/types.ts +121 -0
package/README.md
ADDED
|
@@ -0,0 +1,369 @@
|
|
|
1
|
+
# streaming-sortformer-node
|
|
2
|
+
|
|
3
|
+
Node.js bindings for SortFormer streaming speaker diarization.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npm install streaming-sortformer-node
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Supported platforms:
|
|
12
|
+
- macOS Apple Silicon (arm64)
|
|
13
|
+
- macOS Intel (x64)
|
|
14
|
+
|
|
15
|
+
## Quick Start
|
|
16
|
+
|
|
17
|
+
```javascript
|
|
18
|
+
const { Sortformer } = require('streaming-sortformer-node');
|
|
19
|
+
|
|
20
|
+
// Load model
|
|
21
|
+
const model = await Sortformer.load('./model.gguf', { threads: 4 });
|
|
22
|
+
|
|
23
|
+
// Prepare audio (16kHz mono Float32Array)
|
|
24
|
+
const audio = new Float32Array(/* your audio samples */);
|
|
25
|
+
|
|
26
|
+
// Run diarization
|
|
27
|
+
const result = await model.diarize(audio, {
|
|
28
|
+
mode: 'streaming',
|
|
29
|
+
latency: '2s',
|
|
30
|
+
threshold: 0.5
|
|
31
|
+
});
|
|
32
|
+
|
|
33
|
+
// Output RTTM format
|
|
34
|
+
console.log(result.rttm);
|
|
35
|
+
|
|
36
|
+
// Access raw predictions
|
|
37
|
+
console.log(`Detected ${result.speakerCount} speakers`);
|
|
38
|
+
console.log(`Frame count: ${result.frameCount}`);
|
|
39
|
+
|
|
40
|
+
// Clean up
|
|
41
|
+
model.close();
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## API Reference
|
|
45
|
+
|
|
46
|
+
### `Sortformer.load(modelPath, options?)`
|
|
47
|
+
|
|
48
|
+
Load a SortFormer model from a GGUF file.
|
|
49
|
+
|
|
50
|
+
**Parameters:**
|
|
51
|
+
- `modelPath` (string): Path to the GGUF model file
|
|
52
|
+
- `options` (LoadOptions, optional):
|
|
53
|
+
- `threads` (number): Number of CPU threads for inference (default: 4)
|
|
54
|
+
|
|
55
|
+
**Returns:** `Promise<Sortformer>`
|
|
56
|
+
|
|
57
|
+
**Example:**
|
|
58
|
+
```javascript
|
|
59
|
+
const model = await Sortformer.load('./model.gguf', { threads: 8 });
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### `model.diarize(audio, options?)`
|
|
63
|
+
|
|
64
|
+
Run diarization inference on audio samples.
|
|
65
|
+
|
|
66
|
+
**Parameters:**
|
|
67
|
+
- `audio` (Float32Array): Audio samples at 16kHz mono
|
|
68
|
+
- `options` (DiarizeOptions, optional):
|
|
69
|
+
- `mode` ('offline' | 'streaming'): Diarization mode (default: 'offline')
|
|
70
|
+
- `latency` ('low' | '2s' | '3s' | '5s'): Latency preset for streaming mode (default: '2s')
|
|
71
|
+
- `threshold` (number): Speaker activity threshold, 0.0-1.0 (default: 0.5)
|
|
72
|
+
- `medianFilter` (number): Median filter window size, must be odd (default: 11)
|
|
73
|
+
|
|
74
|
+
**Returns:** `Promise<DiarizeResult>`
|
|
75
|
+
|
|
76
|
+
**DiarizeResult:**
|
|
77
|
+
- `rttm` (string): RTTM format output with speaker segments
|
|
78
|
+
- `predictions` (Float32Array): Raw per-frame predictions, shape [frameCount, 4]
|
|
79
|
+
- `frameCount` (number): Number of frames in output
|
|
80
|
+
- `speakerCount` (number): Number of speakers detected (1-4)
|
|
81
|
+
|
|
82
|
+
**Example:**
|
|
83
|
+
```javascript
|
|
84
|
+
const result = await model.diarize(audio, {
|
|
85
|
+
mode: 'streaming',
|
|
86
|
+
latency: '2s',
|
|
87
|
+
threshold: 0.5,
|
|
88
|
+
medianFilter: 11
|
|
89
|
+
});
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### `model.close()`
|
|
93
|
+
|
|
94
|
+
Close the model and free native resources.
|
|
95
|
+
|
|
96
|
+
After calling `close()`, the model cannot be used for further inference. Calling `close()` multiple times is safe (idempotent).
|
|
97
|
+
|
|
98
|
+
**Example:**
|
|
99
|
+
```javascript
|
|
100
|
+
model.close();
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### `model.isClosed()`
|
|
104
|
+
|
|
105
|
+
Check if the model has been closed.
|
|
106
|
+
|
|
107
|
+
**Returns:** `boolean` - true if closed, false otherwise
|
|
108
|
+
|
|
109
|
+
## Types
|
|
110
|
+
|
|
111
|
+
### `LoadOptions`
|
|
112
|
+
|
|
113
|
+
```typescript
|
|
114
|
+
interface LoadOptions {
|
|
115
|
+
threads?: number; // Number of CPU threads (default: 4)
|
|
116
|
+
}
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
### `DiarizeOptions`
|
|
120
|
+
|
|
121
|
+
```typescript
|
|
122
|
+
interface DiarizeOptions {
|
|
123
|
+
mode?: 'offline' | 'streaming'; // Default: 'offline'
|
|
124
|
+
latency?: 'low' | '2s' | '3s' | '5s'; // Default: '2s' (streaming only)
|
|
125
|
+
threshold?: number; // 0.0-1.0, default: 0.5
|
|
126
|
+
medianFilter?: number; // Odd integer >= 1, default: 11
|
|
127
|
+
}
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### `DiarizeResult`
|
|
131
|
+
|
|
132
|
+
```typescript
|
|
133
|
+
interface DiarizeResult {
|
|
134
|
+
rttm: string; // RTTM format output
|
|
135
|
+
predictions: Float32Array; // Shape: [frameCount, 4]
|
|
136
|
+
frameCount: number; // Number of frames
|
|
137
|
+
speakerCount: number; // Detected speakers (1-4)
|
|
138
|
+
}
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
## Latency Presets
|
|
142
|
+
|
|
143
|
+
Streaming mode supports four latency presets that trade off latency for accuracy:
|
|
144
|
+
|
|
145
|
+
| Preset | Latency | Chunk Size | Use Case |
|
|
146
|
+
|--------|---------|------------|----------|
|
|
147
|
+
| `low` | ~188ms | 6 frames | Real-time applications, minimal delay |
|
|
148
|
+
| `2s` | ~2 seconds | 15 frames | Near real-time, balanced accuracy |
|
|
149
|
+
| `3s` | ~3 seconds | 30 frames | Higher accuracy, moderate delay |
|
|
150
|
+
| `5s` | ~5 seconds | 55 frames | Best accuracy, higher latency acceptable |
|
|
151
|
+
|
|
152
|
+
**Offline mode** processes the entire audio at once with no streaming constraints, providing the highest accuracy but requiring the full audio upfront.
|
|
153
|
+
|
|
154
|
+
## Audio Format
|
|
155
|
+
|
|
156
|
+
Input audio must be:
|
|
157
|
+
- **Sample rate:** 16kHz
|
|
158
|
+
- **Channels:** Mono (single channel)
|
|
159
|
+
- **Format:** Float32Array with values in range [-1.0, 1.0]
|
|
160
|
+
|
|
161
|
+
To convert from 16-bit PCM:
|
|
162
|
+
|
|
163
|
+
```javascript
|
|
164
|
+
// From Int16Array
|
|
165
|
+
const float32 = new Float32Array(int16Array.length);
|
|
166
|
+
for (let i = 0; i < int16Array.length; i++) {
|
|
167
|
+
float32[i] = int16Array[i] / 32768.0;
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
// From Buffer (16-bit PCM)
|
|
171
|
+
const int16 = new Int16Array(buffer.buffer, buffer.byteOffset, buffer.length / 2);
|
|
172
|
+
const float32 = new Float32Array(int16.length);
|
|
173
|
+
for (let i = 0; i < int16.length; i++) {
|
|
174
|
+
float32[i] = int16[i] / 32768.0;
|
|
175
|
+
}
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
## CoreML Acceleration
|
|
179
|
+
|
|
180
|
+
On Apple Silicon Macs, the addon automatically uses CoreML/ANE acceleration if a compiled CoreML model is present alongside the GGUF file.
|
|
181
|
+
|
|
182
|
+
**Setup:**
|
|
183
|
+
|
|
184
|
+
1. Convert the model head to CoreML:
|
|
185
|
+
```bash
|
|
186
|
+
python scripts/convert_head_to_coreml.py \
|
|
187
|
+
--model model.nemo \
|
|
188
|
+
--output model-coreml-head.mlpackage \
|
|
189
|
+
--precision fp16
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
2. Compile the CoreML model:
|
|
193
|
+
```bash
|
|
194
|
+
xcrun coremlcompiler compile model-coreml-head.mlpackage .
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
3. Place `model-coreml-head.mlmodelc/` in the same directory as `model.gguf`
|
|
198
|
+
|
|
199
|
+
The addon will automatically detect and use the CoreML model, providing ~110x real-time performance on Apple Silicon (vs ~10x with CPU-only).
|
|
200
|
+
|
|
201
|
+
**Performance comparison (M3 MacBook Pro):**
|
|
202
|
+
|
|
203
|
+
| Backend | Speed | Memory |
|
|
204
|
+
|---------|-------|--------|
|
|
205
|
+
| CoreML/ANE | ~110x real-time | ~300 MB |
|
|
206
|
+
| CPU only | ~10x real-time | ~380 MB |
|
|
207
|
+
|
|
208
|
+
## RTTM Output Format
|
|
209
|
+
|
|
210
|
+
The `rttm` field in `DiarizeResult` follows the standard [RTTM format](https://catalog.ldc.upenn.edu/docs/LDC2004T12/RTTM-format-v13.pdf):
|
|
211
|
+
|
|
212
|
+
```
|
|
213
|
+
SPEAKER <filename> 1 <start> <duration> <NA> <NA> speaker_<id> <NA> <NA>
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
Each line represents a contiguous speech segment for one speaker.
|
|
217
|
+
|
|
218
|
+
**Example:**
|
|
219
|
+
```
|
|
220
|
+
SPEAKER audio 1 0.000 2.560 <NA> <NA> speaker_0 <NA> <NA>
|
|
221
|
+
SPEAKER audio 1 2.560 1.920 <NA> <NA> speaker_1 <NA> <NA>
|
|
222
|
+
SPEAKER audio 1 4.480 3.200 <NA> <NA> speaker_0 <NA> <NA>
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
## Troubleshooting
|
|
226
|
+
|
|
227
|
+
### Model not found
|
|
228
|
+
|
|
229
|
+
**Error:** `Error: Failed to load model: model.gguf`
|
|
230
|
+
|
|
231
|
+
**Solution:** Ensure the model path is correct and the file exists. Use absolute paths if relative paths fail:
|
|
232
|
+
|
|
233
|
+
```javascript
|
|
234
|
+
const path = require('path');
|
|
235
|
+
const modelPath = path.resolve(__dirname, 'model.gguf');
|
|
236
|
+
const model = await Sortformer.load(modelPath);
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
### Unsupported platform
|
|
240
|
+
|
|
241
|
+
**Error:** `Error: Platform not supported`
|
|
242
|
+
|
|
243
|
+
**Solution:** The addon currently only supports macOS (arm64 and x64). Windows and Linux support is planned for future releases.
|
|
244
|
+
|
|
245
|
+
### Out of memory
|
|
246
|
+
|
|
247
|
+
**Error:** `Error: Failed to allocate memory`
|
|
248
|
+
|
|
249
|
+
**Solution:** Reduce the number of threads or process shorter audio segments:
|
|
250
|
+
|
|
251
|
+
```javascript
|
|
252
|
+
// Reduce threads
|
|
253
|
+
const model = await Sortformer.load('./model.gguf', { threads: 2 });
|
|
254
|
+
|
|
255
|
+
// Process in chunks
|
|
256
|
+
const chunkSize = 16000 * 30; // 30 seconds at 16kHz
|
|
257
|
+
for (let i = 0; i < audio.length; i += chunkSize) {
|
|
258
|
+
const chunk = audio.slice(i, i + chunkSize);
|
|
259
|
+
const result = await model.diarize(chunk);
|
|
260
|
+
// Process result...
|
|
261
|
+
}
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
### Invalid audio format
|
|
265
|
+
|
|
266
|
+
**Error:** `TypeError: audio must be a Float32Array`
|
|
267
|
+
|
|
268
|
+
**Solution:** Ensure audio is a Float32Array with 16kHz mono samples:
|
|
269
|
+
|
|
270
|
+
```javascript
|
|
271
|
+
// Check audio format
|
|
272
|
+
if (!(audio instanceof Float32Array)) {
|
|
273
|
+
throw new Error('Audio must be Float32Array');
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
// Check sample rate (if you have metadata)
|
|
277
|
+
if (sampleRate !== 16000) {
|
|
278
|
+
// Resample to 16kHz using a library like 'audio-resampler'
|
|
279
|
+
}
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
### CoreML model not loading
|
|
283
|
+
|
|
284
|
+
**Error:** Model loads but doesn't use CoreML acceleration
|
|
285
|
+
|
|
286
|
+
**Solution:**
|
|
287
|
+
1. Verify `model-coreml-head.mlmodelc/` is in the same directory as `model.gguf`
|
|
288
|
+
2. Ensure the CoreML model was compiled with `xcrun coremlcompiler compile`
|
|
289
|
+
3. Check that you're running on Apple Silicon (the addon only uses CoreML/ANE acceleration on Apple Silicon Macs, not on Intel Macs)
|
|
290
|
+
|
|
291
|
+
## Examples
|
|
292
|
+
|
|
293
|
+
### Real-time streaming from microphone
|
|
294
|
+
|
|
295
|
+
```javascript
|
|
296
|
+
const { Sortformer } = require('streaming-sortformer-node');
|
|
297
|
+
const mic = require('mic');
|
|
298
|
+
|
|
299
|
+
const model = await Sortformer.load('./model.gguf');
|
|
300
|
+
|
|
301
|
+
const micInstance = mic({
|
|
302
|
+
rate: '16000',
|
|
303
|
+
channels: '1',
|
|
304
|
+
encoding: 'signed-integer',
|
|
305
|
+
bitwidth: '16'
|
|
306
|
+
});
|
|
307
|
+
|
|
308
|
+
const micInputStream = micInstance.getAudioStream();
|
|
309
|
+
let buffer = [];
|
|
310
|
+
|
|
311
|
+
micInputStream.on('data', async (chunk) => {
|
|
312
|
+
// Convert to Float32Array
|
|
313
|
+
const int16 = new Int16Array(chunk.buffer, chunk.byteOffset, chunk.length / 2);
|
|
314
|
+
const float32 = new Float32Array(int16.length);
|
|
315
|
+
for (let i = 0; i < int16.length; i++) {
|
|
316
|
+
float32[i] = int16[i] / 32768.0;
|
|
317
|
+
}
|
|
318
|
+
|
|
319
|
+
buffer.push(float32);
|
|
320
|
+
|
|
321
|
+
// Process after every 10 microphone chunks (the elapsed time depends on the chunk size the mic delivers)
|
|
322
|
+
if (buffer.length >= 10) {
|
|
323
|
+
const audio = Float32Array.from(buffer.flatMap((samples) => Array.from(samples)));
|
|
324
|
+
const result = await model.diarize(audio, {
|
|
325
|
+
mode: 'streaming',
|
|
326
|
+
latency: 'low'
|
|
327
|
+
});
|
|
328
|
+
console.log(result.rttm);
|
|
329
|
+
buffer = [];
|
|
330
|
+
}
|
|
331
|
+
});
|
|
332
|
+
|
|
333
|
+
micInstance.start();
|
|
334
|
+
```
|
|
335
|
+
|
|
336
|
+
### Batch processing multiple files
|
|
337
|
+
|
|
338
|
+
```javascript
|
|
339
|
+
const { Sortformer } = require('streaming-sortformer-node');
|
|
340
|
+
const fs = require('fs');
|
|
341
|
+
const path = require('path');
|
|
342
|
+
|
|
343
|
+
const model = await Sortformer.load('./model.gguf', { threads: 8 });
|
|
344
|
+
|
|
345
|
+
const files = fs.readdirSync('./audio').filter(f => f.endsWith('.wav'));
|
|
346
|
+
|
|
347
|
+
for (const file of files) {
|
|
348
|
+
const audio = loadAudioFile(path.join('./audio', file));
|
|
349
|
+
const result = await model.diarize(audio, { mode: 'offline' });
|
|
350
|
+
|
|
351
|
+
const outPath = path.join('./output', file.replace('.wav', '.rttm'));
|
|
352
|
+
fs.writeFileSync(outPath, result.rttm);
|
|
353
|
+
|
|
354
|
+
console.log(`Processed ${file}: ${result.speakerCount} speakers`);
|
|
355
|
+
}
|
|
356
|
+
|
|
357
|
+
model.close();
|
|
358
|
+
```
|
|
359
|
+
|
|
360
|
+
## License
|
|
361
|
+
|
|
362
|
+
This project follows the same license as the parent [streaming-sortformer-ggml](https://github.com/predict-woo/streaming-sortformer-ggml) repository.
|
|
363
|
+
|
|
364
|
+
## Links
|
|
365
|
+
|
|
366
|
+
- [GitHub Repository](https://github.com/predict-woo/streaming-sortformer-ggml)
|
|
367
|
+
- [Full Documentation](https://github.com/predict-woo/streaming-sortformer-ggml/tree/main/bindings/node)
|
|
368
|
+
- [NVIDIA NeMo](https://github.com/NVIDIA/NeMo)
|
|
369
|
+
- [GGML](https://github.com/ggml-org/ggml)
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* TypeScript wrapper for the native SortFormer speaker diarization model
|
|
3
|
+
*/
|
|
4
|
+
import type { LoadOptions, DiarizeOptions, DiarizeResult, StreamingSessionOptions } from './types.js';
|
|
5
|
+
import { StreamingSession } from './StreamingSession.js';
|
|
6
|
+
/**
|
|
7
|
+
* SortFormer speaker diarization model wrapper
|
|
8
|
+
*
|
|
9
|
+
* Provides a high-level TypeScript API for loading and running the native
|
|
10
|
+
* SortFormer model for streaming speaker diarization.
|
|
11
|
+
*
|
|
12
|
+
* @example
|
|
13
|
+
* ```typescript
|
|
14
|
+
* const model = await Sortformer.load('./model.gguf', { threads: 4 });
|
|
15
|
+
* const result = await model.diarize(audioData, { mode: 'streaming', latency: '2s' });
|
|
16
|
+
* console.log(result.rttm);
|
|
17
|
+
* model.close();
|
|
18
|
+
* ```
|
|
19
|
+
*/
|
|
20
|
+
export declare class Sortformer {
    /** Handle to the native SortformerModel object created by the binding. */
    private native;
    /** Set to true once close() has run; guards every later call. */
    private closed;
    /**
     * Not meant to be called directly; obtain instances through the static
     * load() factory.
     */
    private constructor();
    /**
     * Create a Sortformer instance backed by a GGUF model file.
     *
     * @param modelPath - Location of the GGUF model file (non-empty string)
     * @param options - Loading configuration; `threads` falls back to 4 when omitted
     * @returns Promise that resolves with the ready-to-use model
     * @throws TypeError when modelPath is not a non-empty string
     * @throws Error when `threads` is not a positive integer or the native
     *   model cannot be constructed
     *
     * @example
     * ```typescript
     * const model = await Sortformer.load('./model.gguf', { threads: 8 });
     * ```
     */
    static load(modelPath: string, options?: LoadOptions): Promise<Sortformer>;
    /**
     * Diarize a buffer of audio samples in a single call.
     *
     * @param audio - 16kHz mono samples as a non-empty Float32Array
     * @param options - Diarization configuration; defaults are mode 'offline',
     *   latency '2s' (streaming mode only), threshold 0.5 and medianFilter 11
     * @returns Promise that resolves with the RTTM text and raw predictions
     * @throws TypeError when audio is not a Float32Array
     * @throws Error when the model is closed, the audio is empty, an option is
     *   out of range, or the native result is malformed
     *
     * @example
     * ```typescript
     * const result = await model.diarize(audioData, {
     *   mode: 'streaming',
     *   latency: '2s',
     *   threshold: 0.5,
     *   medianFilter: 11
     * });
     * ```
     */
    diarize(audio: Float32Array, options?: DiarizeOptions): Promise<DiarizeResult>;
    /**
     * Release the native model.
     *
     * The instance rejects further work once closed. Repeated calls are
     * harmless: only the first one reaches the native object.
     *
     * @example
     * ```typescript
     * model.close();
     * ```
     */
    close(): void;
    /**
     * Report whether close() has already been called.
     * @returns true after the model has been closed, false before
     */
    isClosed(): boolean;
    /**
     * Open a streaming session for feeding audio incrementally.
     *
     * The session keeps its state (speaker cache, FIFO buffer) between
     * feed() calls, which is what enables true real-time diarization.
     *
     * @param options - Streaming configuration; `preset` falls back to '2s'
     * @returns A freshly created StreamingSession
     * @throws Error when the model is closed or the preset is unknown
     *
     * @example
     * ```typescript
     * const session = model.createStreamingSession({ preset: 'low' });
     *
     * // Feed audio chunks as they arrive
     * const result1 = session.feed(chunk1);
     * const result2 = session.feed(chunk2);
     *
     * // Accumulate predictions
     * const allPreds = [...result1.predictions, ...result2.predictions];
     *
     * session.close();
     * ```
     */
    createStreamingSession(options?: StreamingSessionOptions): StreamingSession;
}
|
|
104
|
+
//# sourceMappingURL=Sortformer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"Sortformer.d.ts","sourceRoot":"","sources":["../src/Sortformer.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,WAAW,EAAE,cAAc,EAAE,aAAa,EAAE,uBAAuB,EAAmB,MAAM,YAAY,CAAC;AAGvH,OAAO,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AAEzD;;;;;;;;;;;;;GAaG;AACH,qBAAa,UAAU;IACrB,OAAO,CAAC,MAAM,CAAM;IACpB,OAAO,CAAC,MAAM,CAAkB;IAEhC;;;OAGG;IACH,OAAO;IAIP;;;;;;;;;;;;OAYG;WACU,IAAI,CAAC,SAAS,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,WAAW,GAAG,OAAO,CAAC,UAAU,CAAC;IAuBhF;;;;;;;;;;;;;;;;;OAiBG;IACG,OAAO,CAAC,KAAK,EAAE,YAAY,EAAE,OAAO,CAAC,EAAE,cAAc,GAAG,OAAO,CAAC,aAAa,CAAC;IAqFpF;;;;;;;;;;OAUG;IACH,KAAK,IAAI,IAAI;IASb;;;OAGG;IACH,QAAQ,IAAI,OAAO;IAInB;;;;;;;;;;;;;;;;;;;;;;;OAuBG;IACH,sBAAsB,CAAC,OAAO,CAAC,EAAE,uBAAuB,GAAG,gBAAgB;CA0B5E"}
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* TypeScript wrapper for the native SortFormer speaker diarization model
|
|
3
|
+
*/
|
|
4
|
+
import { LATENCY_PRESETS, OFFLINE_PARAMS } from './presets.js';
|
|
5
|
+
import { getBinding } from './binding.js';
|
|
6
|
+
import { StreamingSession } from './StreamingSession.js';
|
|
7
|
+
/**
|
|
8
|
+
* SortFormer speaker diarization model wrapper
|
|
9
|
+
*
|
|
10
|
+
* Provides a high-level TypeScript API for loading and running the native
|
|
11
|
+
* SortFormer model for streaming speaker diarization.
|
|
12
|
+
*
|
|
13
|
+
* @example
|
|
14
|
+
* ```typescript
|
|
15
|
+
* const model = await Sortformer.load('./model.gguf', { threads: 4 });
|
|
16
|
+
* const result = await model.diarize(audioData, { mode: 'streaming', latency: '2s' });
|
|
17
|
+
* console.log(result.rttm);
|
|
18
|
+
* model.close();
|
|
19
|
+
* ```
|
|
20
|
+
*/
|
|
21
|
+
export class Sortformer {
    /**
     * Private constructor - use the static load() method instead.
     * @param native - Native SortformerModel instance from the binding
     */
    constructor(native) {
        this.closed = false;
        this.native = native;
    }
    /**
     * Load a SortFormer model from a GGUF file.
     *
     * @param modelPath - Path to the GGUF model file (non-empty string)
     * @param options - Optional loading configuration; `threads` defaults to 4
     * @returns Promise resolving to a loaded Sortformer instance
     * @throws TypeError if modelPath is not a non-empty string
     * @throws Error if `threads` is not a positive integer, or if the native
     *   binding / native model constructor throws
     *
     * @example
     * ```typescript
     * const model = await Sortformer.load('./model.gguf', { threads: 8 });
     * ```
     */
    static async load(modelPath, options) {
        if (!modelPath || typeof modelPath !== 'string') {
            throw new TypeError('modelPath must be a non-empty string');
        }
        const binding = getBinding();
        // Default to 4 threads if not specified
        const threads = options?.threads ?? 4;
        if (threads < 1 || !Number.isInteger(threads)) {
            throw new Error('threads must be a positive integer');
        }
        const native = new binding.SortformerModel(modelPath, threads);
        return new Sortformer(native);
    }
    /**
     * Run diarization inference on audio samples.
     *
     * @param audio - Audio samples as a non-empty Float32Array (16kHz mono)
     * @param options - Optional diarization configuration. Defaults:
     *   mode 'offline', latency '2s' (streaming only), threshold 0.5,
     *   medianFilter 11.
     * @returns Promise resolving to the native result object
     *   (rttm, predictions, frameCount, speakerCount)
     * @throws TypeError if audio is not a Float32Array
     * @throws Error if the model is closed, audio is empty, an option is
     *   invalid, or the native result fails the shape checks below
     *
     * @example
     * ```typescript
     * const result = await model.diarize(audioData, {
     *   mode: 'streaming',
     *   latency: '2s',
     *   threshold: 0.5,
     *   medianFilter: 11
     * });
     * ```
     */
    async diarize(audio, options) {
        if (this.closed) {
            throw new Error('Model is closed. Cannot perform diarization.');
        }
        if (!(audio instanceof Float32Array)) {
            throw new TypeError('audio must be a Float32Array');
        }
        if (audio.length === 0) {
            throw new Error('audio cannot be empty');
        }
        const threshold = options?.threshold;
        if (threshold !== undefined) {
            // Negated range check on purpose: NaN fails both `< 0` and `> 1`,
            // so the positive form is the only one that rejects it.
            if (typeof threshold !== 'number' || !(threshold >= 0 && threshold <= 1)) {
                throw new Error('threshold must be a number between 0 and 1');
            }
        }
        const medianFilter = options?.medianFilter;
        if (medianFilter !== undefined) {
            if (!Number.isInteger(medianFilter) || medianFilter < 1 || medianFilter % 2 === 0) {
                throw new Error('medianFilter must be a positive odd integer');
            }
        }
        // Map user-friendly options to the native option names
        const mode = options?.mode ?? 'offline';
        const nativeOptions = {
            threshold: threshold ?? 0.5,
            medianFilter: medianFilter ?? 11,
        };
        let params;
        if (mode === 'streaming') {
            const latency = options?.latency ?? '2s';
            // Own-property lookup only: a plain `LATENCY_PRESETS[latency]`
            // would accept inherited keys such as 'constructor' or 'toString'.
            params = Object.prototype.hasOwnProperty.call(LATENCY_PRESETS, latency)
                ? LATENCY_PRESETS[latency]
                : undefined;
            if (!params) {
                throw new Error(`Unknown latency preset: ${latency}`);
            }
        }
        else if (mode === 'offline') {
            params = OFFLINE_PARAMS;
        }
        else {
            throw new Error(`Unknown diarization mode: ${mode}`);
        }
        nativeOptions.chunkLen = params.chunkLen;
        nativeOptions.rightContext = params.rightContext;
        nativeOptions.fifoLen = params.fifoLen;
        nativeOptions.spkcacheUpdatePeriod = params.spkcacheUpdatePeriod;
        const result = await this.native.diarize(audio, nativeOptions);
        // Validate the result structure before handing it to the caller
        if (!result || typeof result !== 'object') {
            throw new Error('Native diarization returned invalid result');
        }
        if (typeof result.rttm !== 'string') {
            throw new Error('Native diarization result missing rttm string');
        }
        if (!(result.predictions instanceof Float32Array)) {
            throw new Error('Native diarization result predictions must be Float32Array');
        }
        if (!Number.isInteger(result.frameCount) || result.frameCount < 0) {
            throw new Error('Native diarization result frameCount must be non-negative integer');
        }
        if (!Number.isInteger(result.speakerCount) || result.speakerCount < 1 || result.speakerCount > 4) {
            throw new Error('Native diarization result speakerCount must be 1-4');
        }
        return result;
    }
    /**
     * Close the model and free native resources.
     *
     * After calling close(), the model cannot be used for further inference.
     * Calling close() multiple times is safe (idempotent): the native
     * close() is invoked at most once.
     *
     * @example
     * ```typescript
     * model.close();
     * ```
     */
    close() {
        if (!this.closed) {
            if (this.native && typeof this.native.close === 'function') {
                this.native.close();
            }
            this.closed = true;
        }
    }
    /**
     * Check if the model is closed.
     * @returns true if the model has been closed, false otherwise
     */
    isClosed() {
        return this.closed;
    }
    /**
     * Create a streaming session for incremental audio processing.
     *
     * The streaming session maintains state (speaker cache, FIFO buffer)
     * across feed() calls, enabling true real-time diarization.
     *
     * @param options - Optional streaming configuration; `preset` defaults to '2s'
     * @returns A new StreamingSession instance
     * @throws Error if the model is closed or the preset is unknown
     *
     * @example
     * ```typescript
     * const session = model.createStreamingSession({ preset: 'low' });
     *
     * // Feed audio chunks as they arrive
     * const result1 = session.feed(chunk1);
     * const result2 = session.feed(chunk2);
     *
     * // Accumulate predictions
     * const allPreds = [...result1.predictions, ...result2.predictions];
     *
     * session.close();
     * ```
     */
    createStreamingSession(options) {
        if (this.closed) {
            throw new Error('Model is closed. Cannot create streaming session.');
        }
        const preset = options?.preset ?? '2s';
        // Map preset string to the native enum value
        const presetMap = {
            'low': 0, // SORTFORMER_PRESET_LOW_LATENCY
            '2s': 1, // SORTFORMER_PRESET_2S
            '3s': 2, // SORTFORMER_PRESET_3S
            '5s': 3, // SORTFORMER_PRESET_5S
        };
        // Own-property lookup only, so inherited keys such as 'toString'
        // are rejected instead of passing a function to the native session.
        const presetNum = Object.prototype.hasOwnProperty.call(presetMap, preset)
            ? presetMap[preset]
            : undefined;
        if (presetNum === undefined) {
            throw new Error(`Unknown preset: ${preset}`);
        }
        const binding = getBinding();
        const nativeSession = new binding.StreamingSession(this.native, presetNum);
        return new StreamingSession(nativeSession);
    }
}
|
|
221
|
+
//# sourceMappingURL=Sortformer.js.map
|