pyannote-cpp-node 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +765 -0
- package/dist/Pyannote.d.ts +12 -0
- package/dist/Pyannote.d.ts.map +1 -0
- package/dist/Pyannote.js +50 -0
- package/dist/Pyannote.js.map +1 -0
- package/dist/StreamingSession.d.ts +12 -0
- package/dist/StreamingSession.d.ts.map +1 -0
- package/dist/StreamingSession.js +34 -0
- package/dist/StreamingSession.js.map +1 -0
- package/dist/binding.d.ts +20 -0
- package/dist/binding.d.ts.map +1 -0
- package/dist/binding.js +43 -0
- package/dist/binding.js.map +1 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +3 -0
- package/dist/index.js.map +1 -0
- package/dist/types.d.ts +23 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/package.json +30 -0
package/README.md
ADDED
@@ -0,0 +1,765 @@

# pyannote-cpp-node

Node.js native bindings for real-time speaker diarization

## Overview

`pyannote-cpp-node` provides Node.js bindings to a high-performance C++ port of the [`pyannote/speaker-diarization-community-1`](https://huggingface.co/pyannote/speaker-diarization-community-1) pipeline. It achieves **39x real-time** performance on Apple Silicon by leveraging CoreML acceleration (Neural Engine + GPU) for neural network inference and optimized C++ implementations of clustering algorithms.

The library supports two modes:

- **Offline diarization**: Process an entire audio file at once and receive speaker-labeled segments
- **Streaming diarization**: Process audio incrementally in real time, receive voice activity detection (VAD) as audio arrives, and trigger speaker clustering on demand

All heavy operations are asynchronous and run on libuv worker threads, ensuring the Node.js event loop remains responsive.

## Features

- **Offline diarization** — Process full audio files and get speaker-labeled segments
- **Streaming diarization** — Push audio incrementally, receive real-time VAD, recluster on demand
- **Async/await API** — All heavy operations return Promises and run on worker threads
- **CoreML acceleration** — Neural networks run on Apple's Neural Engine, GPU, and CPU
- **TypeScript-first** — Full type definitions included
- **Zero-copy audio input** — Direct `Float32Array` input for maximum efficiency
- **Byte-identical output** — Streaming finalize produces identical results to the offline pipeline

## Requirements

- **macOS** with Apple Silicon (M1/M2/M3/M4); Intel x64 support is planned (see Supported Platforms below)
- **Node.js** >= 18
- **Model files**:
  - Segmentation GGUF model (`segmentation.gguf`)
  - Embedding GGUF model (`embedding.gguf`)
  - PLDA GGUF model (`plda.gguf`)
  - Segmentation CoreML model package (`segmentation.mlpackage/`)
  - Embedding CoreML model package (`embedding.mlpackage/`)

Model files can be obtained by converting the original PyTorch models using the conversion scripts in the parent repository.

## Installation

```bash
npm install pyannote-cpp-node
```

Or with pnpm:

```bash
pnpm add pyannote-cpp-node
```

The package uses `optionalDependencies` to automatically install the correct platform-specific native addon (`@pyannote-cpp-node/darwin-arm64` or `@pyannote-cpp-node/darwin-x64`).

## Quick Start

```typescript
import { Pyannote } from 'pyannote-cpp-node';
import { readFileSync } from 'node:fs';

// Load model (validates all paths exist)
const model = await Pyannote.load({
  segModelPath: './models/segmentation.gguf',
  embModelPath: './models/embedding.gguf',
  pldaPath: './models/plda.gguf',
  coremlPath: './models/embedding.mlpackage',
  segCoremlPath: './models/segmentation.mlpackage',
});

// Load audio (16kHz mono Float32Array - see "Audio Format Requirements")
const audio = loadWavFile('./audio.wav');

// Run diarization
const result = await model.diarize(audio);

// Print results
for (const segment of result.segments) {
  console.log(
    `[${segment.start.toFixed(2)}s - ${(segment.start + segment.duration).toFixed(2)}s] ${segment.speaker}`
  );
}

// Clean up
model.close();
```

## API Reference

### `Pyannote` Class

The main entry point for loading diarization models.

#### `static async load(config: ModelConfig): Promise<Pyannote>`

Factory method for loading a diarization model. Validates that all model paths exist before initializing. CoreML model compilation happens synchronously during initialization and is typically fast.

**Parameters:**
- `config: ModelConfig` — Configuration object with paths to all required model files

**Returns:** `Promise<Pyannote>` — Initialized model instance

**Throws:**
- `Error` if any model path does not exist or is invalid

**Example:**
```typescript
const model = await Pyannote.load({
  segModelPath: './models/segmentation.gguf',
  embModelPath: './models/embedding.gguf',
  pldaPath: './models/plda.gguf',
  coremlPath: './models/embedding.mlpackage',
  segCoremlPath: './models/segmentation.mlpackage',
});
```

#### `async diarize(audio: Float32Array): Promise<DiarizationResult>`

Performs offline diarization on the entire audio file. Audio must be 16kHz mono in `Float32Array` format with values in the range [-1.0, 1.0].

Internally, this method uses the streaming API: it initializes a streaming session, pushes all audio in 1-second chunks, calls finalize, and cleans up. The operation runs on a worker thread and is non-blocking.

**Parameters:**
- `audio: Float32Array` — Audio samples (16kHz mono, values in [-1.0, 1.0])

**Returns:** `Promise<DiarizationResult>` — Diarization result with speaker-labeled segments sorted by start time

**Throws:**
- `Error` if model is closed
- `TypeError` if audio is not a `Float32Array`
- `Error` if audio is empty

**Example:**
```typescript
const result = await model.diarize(audio);
console.log(`Detected ${result.segments.length} segments`);
```

#### `createStreamingSession(): StreamingSession`

Creates a new independent streaming session. Each session maintains its own internal state and can be used to process audio incrementally.

**Returns:** `StreamingSession` — New streaming session instance

**Throws:**
- `Error` if model is closed

**Example:**
```typescript
const session = model.createStreamingSession();
```

#### `close(): void`

Releases all native resources associated with the model. This method is idempotent and safe to call multiple times.

Once closed, the model cannot be used for diarization or creating new streaming sessions. Existing streaming sessions should be closed before closing the model.

**Example:**
```typescript
model.close();
console.log(model.isClosed); // true
```

#### `get isClosed: boolean`

Indicates whether the model has been closed.

**Returns:** `boolean` — `true` if the model is closed, `false` otherwise

### `StreamingSession` Class

Handles incremental audio processing for real-time diarization.

#### `async push(audio: Float32Array): Promise<VADChunk[]>`

Pushes audio samples to the streaming session. Audio must be 16kHz mono `Float32Array`. Typically, push 1 second of audio (16,000 samples) at a time.

The first chunk requires 10 seconds of accumulated audio to produce output (the segmentation model uses a 10-second window). After that, each subsequent push returns approximately one `VADChunk` (depending on the 1-second hop size).

The returned VAD chunks contain frame-level voice activity (OR of all speakers) for the newly processed 10-second windows.

**Parameters:**
- `audio: Float32Array` — Audio samples (16kHz mono, values in [-1.0, 1.0])

**Returns:** `Promise<VADChunk[]>` — Array of VAD chunks (empty until 10 seconds accumulated)

**Throws:**
- `Error` if session is closed
- `TypeError` if audio is not a `Float32Array`

**Example:**
```typescript
const vadChunks = await session.push(audioChunk);
for (const chunk of vadChunks) {
  console.log(`VAD chunk ${chunk.chunkIndex}: ${chunk.numFrames} frames`);
}
```

#### `async recluster(): Promise<DiarizationResult>`

Triggers full clustering on all accumulated audio data. This runs the complete diarization pipeline (embedding extraction → PLDA scoring → hierarchical clustering → VBx refinement → speaker assignment) and returns speaker-labeled segments with global speaker IDs.

**Warning:** This method mutates the internal session state. Specifically, it replaces the internal embedding and chunk index arrays with filtered versions (excluding silent speakers). Calling `push` after `recluster` may produce unexpected results. Use `recluster` sparingly (e.g., every 30 seconds for live progress updates) or only call `finalize` when the stream ends.

The operation runs on a worker thread and is non-blocking.

**Returns:** `Promise<DiarizationResult>` — Complete diarization result with global speaker labels

**Throws:**
- `Error` if session is closed

**Example:**
```typescript
// Trigger intermediate clustering after accumulating data
const intermediateResult = await session.recluster();
console.log(`Current speaker count: ${new Set(intermediateResult.segments.map(s => s.speaker)).size}`);
```

#### `async finalize(): Promise<DiarizationResult>`

Processes any remaining audio (zero-padding partial chunks to match the offline pipeline's chunk count formula), then performs final clustering. This method produces byte-identical output to the offline `diarize()` method when given the same input audio.

Call this method when the audio stream has ended to get the final diarization result.

The operation runs on a worker thread and is non-blocking.

**Returns:** `Promise<DiarizationResult>` — Final diarization result

**Throws:**
- `Error` if session is closed

**Example:**
```typescript
const finalResult = await session.finalize();
console.log(`Final result: ${finalResult.segments.length} segments`);
```

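Because `finalize()` reproduces the offline pipeline, the two paths can be checked against each other. The helper below is a sketch (the function name `compareOfflineAndStreaming` and the 1-second chunking are our choices, not part of the API): it runs both paths on the same audio and compares the segment lists.

```typescript
import { Pyannote } from 'pyannote-cpp-node';

// Sketch: confirm that streaming finalize() matches offline diarize() for the
// same input. Assumes `model` is a loaded Pyannote instance and `audio` is a
// 16 kHz mono Float32Array (e.g. from the loadWavFile helper in Example 1).
async function compareOfflineAndStreaming(model: Pyannote, audio: Float32Array) {
  const offline = await model.diarize(audio);

  const session = model.createStreamingSession();
  for (let offset = 0; offset < audio.length; offset += 16000) {
    await session.push(audio.slice(offset, offset + 16000));
  }
  const streamed = await session.finalize();
  session.close();

  // Compare segment-by-segment; identical inputs should yield identical output.
  const same =
    offline.segments.length === streamed.segments.length &&
    offline.segments.every((s, i) =>
      s.start === streamed.segments[i].start &&
      s.duration === streamed.segments[i].duration &&
      s.speaker === streamed.segments[i].speaker);
  console.log(same ? 'Offline and streaming results match' : 'Results differ');
}
```
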
#### `close(): void`

Releases all native resources associated with the streaming session. This method is idempotent and safe to call multiple times.

**Example:**
```typescript
session.close();
```

#### `get isClosed: boolean`

Indicates whether the session has been closed.

**Returns:** `boolean` — `true` if the session is closed, `false` otherwise

### Types

#### `ModelConfig`

Configuration object for loading diarization models.

```typescript
interface ModelConfig {
  segModelPath: string;   // Path to segmentation GGUF model file
  embModelPath: string;   // Path to embedding GGUF model file
  pldaPath: string;       // Path to PLDA GGUF model file
  coremlPath: string;     // Path to embedding CoreML .mlpackage directory
  segCoremlPath: string;  // Path to segmentation CoreML .mlpackage directory
}
```

#### `VADChunk`

Voice activity detection result for a single 10-second audio chunk.

```typescript
interface VADChunk {
  chunkIndex: number;  // Zero-based chunk number (increments every 1 second)
  startTime: number;   // Absolute start time in seconds (chunkIndex * 1.0)
  duration: number;    // Always 10.0 (chunk window size)
  numFrames: number;   // Always 589 (segmentation model output frames)
  vad: Float32Array;   // [589] frame-level voice activity: 1.0 if any speaker active, 0.0 otherwise
}
```

The `vad` array contains 589 frames, each representing approximately 17ms of audio. A value of 1.0 indicates speech activity (any speaker), 0.0 indicates silence.

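Frame-level activity can be projected onto absolute timestamps. The sketch below assumes frames are evenly spaced at `duration / numFrames` seconds (~17 ms); the helper `vadToIntervals` is illustrative and not part of the library.

```typescript
import type { VADChunk } from 'pyannote-cpp-node';

// Sketch: turn a VADChunk's frame-level activity into absolute-time speech
// intervals, assuming evenly spaced frames of duration / numFrames seconds.
function vadToIntervals(chunk: VADChunk): Array<{ start: number; end: number }> {
  const frameDur = chunk.duration / chunk.numFrames;
  const intervals: Array<{ start: number; end: number }> = [];
  let speechStart: number | null = null;

  for (let i = 0; i < chunk.numFrames; i++) {
    const active = chunk.vad[i] > 0.5;
    if (active && speechStart === null) {
      speechStart = chunk.startTime + i * frameDur;
    } else if (!active && speechStart !== null) {
      intervals.push({ start: speechStart, end: chunk.startTime + i * frameDur });
      speechStart = null;
    }
  }
  if (speechStart !== null) {
    intervals.push({ start: speechStart, end: chunk.startTime + chunk.duration });
  }
  return intervals;
}
```
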
#### `Segment`

A contiguous speech segment with speaker label.

```typescript
interface Segment {
  start: number;     // Start time in seconds
  duration: number;  // Duration in seconds
  speaker: string;   // Speaker label (e.g., "SPEAKER_00", "SPEAKER_01", ...)
}
```

#### `DiarizationResult`

Complete diarization output with speaker-labeled segments.

```typescript
interface DiarizationResult {
  segments: Segment[];  // Array of segments, sorted by start time
}
```

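The result is plain data, so common summaries are one loop away. The sketch below (our helper, not a library call) totals speaking time per speaker; note that overlapping segments count toward each overlapping speaker.

```typescript
import type { DiarizationResult } from 'pyannote-cpp-node';

// Sketch: aggregate total speaking time per speaker from a DiarizationResult.
function speakingTimeBySpeaker(result: DiarizationResult): Map<string, number> {
  const totals = new Map<string, number>();
  for (const { speaker, duration } of result.segments) {
    totals.set(speaker, (totals.get(speaker) ?? 0) + duration);
  }
  return totals;
}

// Usage: print speakers ordered by total speaking time.
// const ranked = [...speakingTimeBySpeaker(result)].sort((a, b) => b[1] - a[1]);
// for (const [speaker, seconds] of ranked) console.log(`${speaker}: ${seconds.toFixed(1)}s`);
```
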
## Usage Examples

### Example 1: Offline Diarization

Process an entire audio file and print a timeline of speaker segments.

```typescript
import { Pyannote } from 'pyannote-cpp-node';
import { readFileSync } from 'node:fs';

// Helper to load 16-bit PCM WAV and convert to Float32Array
function loadWavFile(filePath: string): Float32Array {
  const buffer = readFileSync(filePath);
  const view = new DataView(buffer.buffer, buffer.byteOffset, buffer.byteLength);

  // Find data chunk
  let offset = 12; // Skip RIFF header
  while (offset < view.byteLength - 8) {
    const chunkId = String.fromCharCode(
      view.getUint8(offset),
      view.getUint8(offset + 1),
      view.getUint8(offset + 2),
      view.getUint8(offset + 3)
    );
    const chunkSize = view.getUint32(offset + 4, true);
    offset += 8;

    if (chunkId === 'data') {
      // Convert Int16 PCM to Float32 by dividing by 32768
      const numSamples = chunkSize / 2;
      const float32 = new Float32Array(numSamples);
      for (let i = 0; i < numSamples; i++) {
        float32[i] = view.getInt16(offset + i * 2, true) / 32768.0;
      }
      return float32;
    }

    offset += chunkSize;
    if (chunkSize % 2 !== 0) offset++; // Align to word boundary
  }

  throw new Error('No data chunk found in WAV file');
}

async function main() {
  // Load model
  const model = await Pyannote.load({
    segModelPath: './models/segmentation.gguf',
    embModelPath: './models/embedding.gguf',
    pldaPath: './models/plda.gguf',
    coremlPath: './models/embedding.mlpackage',
    segCoremlPath: './models/segmentation.mlpackage',
  });

  // Load audio
  const audio = loadWavFile('./audio.wav');
  console.log(`Loaded ${audio.length} samples (${(audio.length / 16000).toFixed(1)}s)`);

  // Diarize
  const result = await model.diarize(audio);

  // Print timeline
  console.log(`\nDetected ${result.segments.length} segments:`);
  for (const segment of result.segments) {
    const startTime = segment.start.toFixed(2);
    const endTime = (segment.start + segment.duration).toFixed(2);
    console.log(`[${startTime}s - ${endTime}s] ${segment.speaker}`);
  }

  // Count speakers
  const speakers = new Set(result.segments.map(s => s.speaker));
  console.log(`\nTotal speakers: ${speakers.size}`);

  model.close();
}

main();
```

### Example 2: Streaming Diarization

Process audio incrementally in 1-second chunks, displaying real-time VAD.

```typescript
import { Pyannote } from 'pyannote-cpp-node';

async function streamingDiarization() {
  const model = await Pyannote.load({
    segModelPath: './models/segmentation.gguf',
    embModelPath: './models/embedding.gguf',
    pldaPath: './models/plda.gguf',
    coremlPath: './models/embedding.mlpackage',
    segCoremlPath: './models/segmentation.mlpackage',
  });

  const session = model.createStreamingSession();

  // Load full audio file
  const audio = loadWavFile('./audio.wav');

  // Push audio in 1-second chunks (16,000 samples)
  const CHUNK_SIZE = 16000;
  let totalChunks = 0;

  for (let offset = 0; offset < audio.length; offset += CHUNK_SIZE) {
    const end = Math.min(offset + CHUNK_SIZE, audio.length);
    const chunk = audio.slice(offset, end);

    const vadChunks = await session.push(chunk);

    // VAD chunks are returned after first 10 seconds
    for (const vad of vadChunks) {
      // Count active frames (speech detected)
      const activeFrames = vad.vad.filter(v => v > 0.5).length;
      const speechRatio = (activeFrames / vad.numFrames * 100).toFixed(1);

      console.log(
        `Chunk ${vad.chunkIndex}: ${vad.startTime.toFixed(1)}s - ${(vad.startTime + vad.duration).toFixed(1)}s | ` +
        `Speech: ${speechRatio}%`
      );
      totalChunks++;
    }
  }

  console.log(`\nProcessed ${totalChunks} chunks`);

  // Get final diarization result
  console.log('\nFinalizing...');
  const result = await session.finalize();

  console.log(`\nFinal result: ${result.segments.length} segments`);
  for (const segment of result.segments) {
    console.log(
      `[${segment.start.toFixed(2)}s - ${(segment.start + segment.duration).toFixed(2)}s] ${segment.speaker}`
    );
  }

  session.close();
  model.close();
}

streamingDiarization();
```

### Example 3: On-Demand Reclustering

Push audio and trigger reclustering every 30 seconds to get intermediate results.

```typescript
import { Pyannote } from 'pyannote-cpp-node';

async function reclusteringExample() {
  const model = await Pyannote.load({
    segModelPath: './models/segmentation.gguf',
    embModelPath: './models/embedding.gguf',
    pldaPath: './models/plda.gguf',
    coremlPath: './models/embedding.mlpackage',
    segCoremlPath: './models/segmentation.mlpackage',
  });

  const session = model.createStreamingSession();
  const audio = loadWavFile('./audio.wav');

  const CHUNK_SIZE = 16000; // 1 second
  const RECLUSTER_INTERVAL = 30; // Recluster every 30 seconds

  let secondsProcessed = 0;

  for (let offset = 0; offset < audio.length; offset += CHUNK_SIZE) {
    const end = Math.min(offset + CHUNK_SIZE, audio.length);
    const chunk = audio.slice(offset, end);

    await session.push(chunk);
    secondsProcessed++;

    // Recluster every 30 seconds
    if (secondsProcessed % RECLUSTER_INTERVAL === 0) {
      console.log(`\n--- Reclustering at ${secondsProcessed}s ---`);
      const intermediateResult = await session.recluster();

      const speakers = new Set(intermediateResult.segments.map(s => s.speaker));
      console.log(`Current speakers detected: ${speakers.size}`);
      console.log(`Current segments: ${intermediateResult.segments.length}`);
    }
  }

  // Final result
  console.log('\n--- Final result ---');
  const finalResult = await session.finalize();
  const speakers = new Set(finalResult.segments.map(s => s.speaker));
  console.log(`Total speakers: ${speakers.size}`);
  console.log(`Total segments: ${finalResult.segments.length}`);

  session.close();
  model.close();
}

reclusteringExample();
```

### Example 4: Generating RTTM Output

Format diarization results into standard RTTM (Rich Transcription Time Marked) format.

```typescript
import { Pyannote, type DiarizationResult } from 'pyannote-cpp-node';
import { writeFileSync } from 'node:fs';

function toRTTM(result: DiarizationResult, filename: string = 'audio'): string {
  const lines = result.segments.map(segment => {
    // RTTM format: SPEAKER <file> <chnl> <tbeg> <tdur> <ortho> <stype> <name> <conf> <slat>
    return [
      'SPEAKER',
      filename,
      '1',
      segment.start.toFixed(3),
      segment.duration.toFixed(3),
      '<NA>',
      '<NA>',
      segment.speaker,
      '<NA>',
      '<NA>',
    ].join(' ');
  });

  return lines.join('\n') + '\n';
}

async function generateRTTM() {
  const model = await Pyannote.load({
    segModelPath: './models/segmentation.gguf',
    embModelPath: './models/embedding.gguf',
    pldaPath: './models/plda.gguf',
    coremlPath: './models/embedding.mlpackage',
    segCoremlPath: './models/segmentation.mlpackage',
  });

  const audio = loadWavFile('./audio.wav');
  const result = await model.diarize(audio);

  // Generate RTTM
  const rttm = toRTTM(result, 'audio');

  // Write to file
  writeFileSync('./output.rttm', rttm);
  console.log('RTTM file written to output.rttm');

  // Also print to console
  console.log('\nRTTM output:');
  console.log(rttm);

  model.close();
}

generateRTTM();
```

## Architecture

The diarization pipeline consists of four main stages:

### 1. Segmentation (SincNet + BiLSTM)

The segmentation model processes 10-second audio windows and outputs 7-class powerset logits for 589 frames (approximately one frame every 17ms). The model architecture:

- **SincNet**: Learnable sinc filter bank for feature extraction
- **4-layer BiLSTM**: Bidirectional long short-term memory layers
- **Linear classifier**: Projects to 7 powerset classes with log-softmax

The 7 powerset classes cover all combinations of the 3 local speakers with at most 2 speaking simultaneously:

- Class 0: silence (no speakers)
- Classes 1-3: single speakers
- Classes 4-6: pairwise speaker overlaps

### 2. Powerset Decoding

Converts the 7-class powerset predictions into binary speaker activity for 3 local speakers per chunk. Each frame is decoded to indicate which of the 3 local speaker "slots" are active.

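The mapping from a class index to a set of active speaker slots can be made concrete with a small decode step. The sketch below is illustrative only: it assumes the class ordering listed above (0 = silence, 1-3 = single speakers, 4-6 = pairwise overlaps); the actual mapping lives inside the native pipeline.

```typescript
// Assumed class -> active-speaker-slot mapping (illustrative ordering).
const POWERSET: ReadonlyArray<ReadonlyArray<number>> = [
  [],       // class 0: silence
  [0],      // class 1: speaker slot 0
  [1],      // class 2: speaker slot 1
  [2],      // class 3: speaker slot 2
  [0, 1],   // class 4: slots 0+1 overlap
  [0, 2],   // class 5: slots 0+2 overlap
  [1, 2],   // class 6: slots 1+2 overlap
];

// Decode one frame of segmentation output (7 logits) into a [3]-element
// binary activity vector by taking the argmax class.
function decodeFrame(logits: Float32Array): number[] {
  let best = 0;
  for (let c = 1; c < logits.length; c++) {
    if (logits[c] > logits[best]) best = c;
  }
  const active = [0, 0, 0];
  for (const slot of POWERSET[best]) active[slot] = 1;
  return active;
}
```
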
### 3. Embedding Extraction (WeSpeaker ResNet34)

For each active speaker in each chunk, the embedding model extracts a 256-dimensional speaker vector:

- **Mel filterbank**: 80-bin log-mel spectrogram features
- **ResNet34**: Deep residual network for speaker representation
- **Output**: 256-dimensional L2-normalized embedding

Silent speakers receive NaN embeddings, which are filtered before clustering.

### 4. Clustering (PLDA + AHC + VBx)

The final stage maps local speaker labels to global speaker identities:

- **PLDA transformation**: Probabilistic Linear Discriminant Analysis projects embeddings from 256 to 128 dimensions
- **Agglomerative Hierarchical Clustering (AHC)**: fastcluster implementation with O(n²) complexity, using centroid linkage and a distance threshold of 0.6
- **VBx refinement**: Variational Bayes diarization with parameters FA=0.07, FB=0.8, maximum 20 iterations

The clustering stage computes speaker centroids and assigns each embedding to the closest centroid while respecting the constraint that two local speakers in the same chunk cannot map to the same global speaker.

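One simple way to picture that constraint is a greedy per-chunk assignment: within a chunk, each local speaker takes its nearest centroid that has not already been claimed by another local speaker of the same chunk. The sketch below only illustrates the constraint (the native implementation scores in PLDA space and may resolve ties differently).

```typescript
// Illustrative greedy assignment honoring the per-chunk constraint.
// `dist` is any embedding-to-centroid distance function.
function assignChunk(
  chunkEmbeddings: Float32Array[],   // up to 3 local-speaker embeddings for one chunk
  centroids: Float32Array[],         // global speaker centroids
  dist: (a: Float32Array, b: Float32Array) => number,
): number[] {
  const taken = new Set<number>();
  return chunkEmbeddings.map(emb => {
    let best = -1;
    let bestDist = Infinity;
    for (let c = 0; c < centroids.length; c++) {
      if (taken.has(c)) continue;    // constraint: centroid already used in this chunk
      const d = dist(emb, centroids[c]);
      if (d < bestDist) {
        bestDist = d;
        best = c;
      }
    }
    taken.add(best);
    return best;
  });
}
```
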
### CoreML Acceleration

Both neural networks run on Apple's CoreML framework, which automatically distributes computation across:

- **Neural Engine**: Dedicated ML accelerator on Apple Silicon
- **GPU**: Metal-accelerated operations
- **CPU**: Fallback for unsupported operations

CoreML models use Float16 computation for optimal performance while maintaining accuracy within acceptable bounds (cosine similarity > 0.999 vs Float32).

### Streaming Architecture

The streaming API uses a sliding 10-second window with a 1-second hop (9 seconds of overlap between consecutive chunks). Three data stores maintain the state:

- **`audio_buffer`**: Sliding window (~10s, ~640 KB even after 1 hour) — old samples are discarded
- **`embeddings`**: Grows for the lifetime of the session (~11 MB for 1 hour) — stores 3 × 256-dim vectors per chunk (NaN for silent speakers)
- **`binarized`**: Grows for the lifetime of the session (~25 MB for 1 hour) — stores 589 × 3 binary activity masks per chunk

During reclustering, all accumulated embeddings are used to compute soft cluster assignments, and all binarized segmentations are used to reconstruct the global timeline. This is why the `embeddings` and `binarized` arrays must persist for the entire session.

### Constants

| Constant | Value | Description |
|----------|-------|-------------|
| SAMPLE_RATE | 16000 Hz | Audio sample rate |
| CHUNK_SAMPLES | 160000 | 10-second window size |
| STEP_SAMPLES | 16000 | 1-second hop between chunks |
| FRAMES_PER_CHUNK | 589 | Segmentation output frames |
| NUM_LOCAL_SPEAKERS | 3 | Maximum speakers per chunk |
| EMBEDDING_DIM | 256 | Speaker embedding dimension |
| FBANK_NUM_BINS | 80 | Mel filterbank bins |

## Audio Format Requirements

The library expects raw PCM audio in a specific format:

- **Sample rate**: 16000 Hz (16 kHz) — **required**
- **Channels**: Mono (single channel) — **required**
- **Format**: `Float32Array` with values in the range **[-1.0, 1.0]**

The library does **not** handle audio decoding. You must provide raw PCM samples.

### Loading Audio Files

For WAV files, you can use the `loadWavFile` function from Example 1, or use third-party libraries:

```bash
npm install node-wav
```

```typescript
import wav from 'node-wav';
import { readFileSync } from 'node:fs';

const buffer = readFileSync('./audio.wav');
const decoded = wav.decode(buffer);

// Convert to mono if stereo
const mono = decoded.channelData.length > 1
  ? decoded.channelData[0].map((v, i) => (v + decoded.channelData[1][i]) / 2)
  : decoded.channelData[0];

// Resample to 16kHz if needed (using a resampling library)
// ...

const audio = new Float32Array(mono);
```

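If the source sample rate is not 16 kHz, the "resample" step above has to fill the gap. A minimal linear-interpolation resampler is sketched below; it is fine for quick experiments but does not low-pass filter before downsampling, so a dedicated resampling library is preferable for quality-sensitive work.

```typescript
// Naive linear-interpolation resampler (sketch, not part of the library).
function resampleLinear(input: Float32Array, fromRate: number, toRate: number): Float32Array {
  if (fromRate === toRate) return input;
  const outLength = Math.round(input.length * toRate / fromRate);
  const output = new Float32Array(outLength);
  for (let i = 0; i < outLength; i++) {
    const pos = i * fromRate / toRate;
    const i0 = Math.floor(pos);
    const i1 = Math.min(i0 + 1, input.length - 1);
    const frac = pos - i0;
    output[i] = input[i0] * (1 - frac) + input[i1] * frac;
  }
  return output;
}

// const audio = resampleLinear(new Float32Array(mono), decoded.sampleRate, 16000);
```
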
For other audio formats (MP3, M4A, etc.), use ffmpeg to decode to raw 16kHz mono float32 PCM and pipe it into your script:

```bash
ffmpeg -i input.mp3 -ar 16000 -ac 1 -f f32le -acodec pcm_f32le - | \
  node process.js
```

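The receiving side of that pipe just has to reassemble the raw float32 stream into `Float32Array` chunks. A sketch for `process.js` is shown below; it assumes a `StreamingSession` named `session` has already been created as in Example 2, and it buffers stdin until a full second of samples is available (any trailing samples under one second are ignored in this sketch).

```typescript
// Sketch: read raw float32 LE PCM from stdin and feed it to a streaming
// session in 1-second chunks.
import process from 'node:process';
import type { StreamingSession } from 'pyannote-cpp-node';

declare const session: StreamingSession; // created elsewhere, as in Example 2

const BYTES_PER_SECOND = 16000 * 4; // 16 kHz mono, 4 bytes per float32 sample
let pending = Buffer.alloc(0);

for await (const data of process.stdin) {
  pending = Buffer.concat([pending, data as Buffer]);
  while (pending.length >= BYTES_PER_SECOND) {
    const chunkBytes = pending.subarray(0, BYTES_PER_SECOND);
    pending = pending.subarray(BYTES_PER_SECOND);
    // Copy into a fresh, aligned buffer before viewing it as Float32Array.
    const samples = new Float32Array(
      chunkBytes.buffer.slice(chunkBytes.byteOffset, chunkBytes.byteOffset + chunkBytes.length)
    );
    await session.push(samples);
  }
}

const result = await session.finalize();
console.log(`Segments: ${result.segments.length}`);
```
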
## Important Notes and Caveats

### Platform Limitations

- **macOS only**: The library requires CoreML for neural network inference. There is currently no fallback implementation for other platforms.
- **No Linux/Windows support**: CoreML is exclusive to Apple platforms.

### `recluster()` Mutates State

The `recluster()` method overwrites the internal session state, specifically replacing the `embeddings` and chunk index arrays with filtered versions (excluding NaN embeddings from silent speakers). This means:

- Calling `push()` after `recluster()` may produce incorrect results
- Subsequent `recluster()` calls may not work as expected
- The data structure assumes the original unfiltered layout (3 embeddings per chunk)

**Best practice**: Use `recluster()` sparingly for live progress updates (e.g., every 30 seconds), or avoid it entirely and only call `finalize()` when the stream ends.

### Operations Are Serialized

Operations on a streaming session are serialized internally. Do not call `push()` while another `push()`, `recluster()`, or `finalize()` is in progress. Wait for the Promise to resolve before making the next call.

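When audio arrives from an event-driven source (for example a microphone callback), the simplest way to respect this rule is to chain every call onto the previous one. The small queue below is our own sketch, not part of the library.

```typescript
// Sketch: serialize session operations by chaining them on a single Promise.
let queue: Promise<unknown> = Promise.resolve();

function enqueue<T>(op: () => Promise<T>): Promise<T> {
  const next = queue.then(op);
  queue = next.catch(() => undefined); // keep the chain alive if an operation rejects
  return next;
}

// Usage (assuming `session` and an event emitter `mic` that yields Float32Array chunks):
// mic.on('data', (chunk: Float32Array) => enqueue(() => session.push(chunk)));
// const result = await enqueue(() => session.finalize());
```
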
### Resource Management

- **Close sessions before models**: Always close streaming sessions before closing the parent model (see the sketch below)
- **Idempotent close**: Both `model.close()` and `session.close()` are safe to call multiple times
- **No reuse after close**: Once closed, models and sessions cannot be reused

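A `try`/`finally` block keeps that ordering intact even when diarization throws. In the sketch below, `config` stands for a `ModelConfig` with valid paths, as shown in the Quick Start.

```typescript
import { Pyannote, type ModelConfig } from 'pyannote-cpp-node';

declare const config: ModelConfig; // model paths, as in the Quick Start

const model = await Pyannote.load(config);
const session = model.createStreamingSession();
try {
  // ... push audio, optionally recluster, then finalize ...
} finally {
  // Session first, then the parent model.
  session.close();
  model.close();
}
```
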
### Model Loading

- **Path validation**: `Pyannote.load()` validates that all paths exist using `fs.accessSync()` before initialization
- **CoreML compilation**: The CoreML framework compiles `.mlpackage` models internally on first load (typically fast, ~100ms)
- **No explicit loading step**: Model weights are loaded synchronously in the constructor

### Threading Model

All heavy operations (`diarize`, `push`, `recluster`, `finalize`) run on libuv worker threads and never block the Node.js event loop. However, the operations do hold native locks internally, so concurrent operations on the same session are serialized.

### Memory Usage

For a 1-hour audio file:

- `audio_buffer`: ~640 KB (sliding window)
- `embeddings`: ~11 MB (grows throughout session)
- `binarized`: ~25 MB (grows throughout session)
- CoreML models: ~50 MB (loaded once per model)

Total memory footprint: approximately 100 MB for a 1-hour streaming session.

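The per-store figures follow directly from the constants above. The back-of-the-envelope check below assumes one chunk per second (≈3600 chunks per hour) and 4-byte float storage for both stores; the native layout may differ slightly.

```typescript
// Rough sanity check of the memory figures for 1 hour of audio.
const chunks = 3600;                               // one chunk per 1-second hop
const embeddingsBytes = chunks * 3 * 256 * 4;      // 3 speakers x 256 dims x 4 B   ~= 11.1 MB
const binarizedBytes = chunks * 589 * 3 * 4;       // 589 frames x 3 speakers x 4 B ~= 25.4 MB
const audioBufferBytes = 10 * 16000 * 4;           // 10-second window at 16 kHz     = 640 KB
console.log({ embeddingsBytes, binarizedBytes, audioBufferBytes });
```
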
## Performance

Measured on Apple M2 Pro with 16 GB RAM:

| Component | Time per Chunk | Notes |
|-----------|----------------|-------|
| Segmentation (CoreML) | ~12ms | 10-second audio window, 589 frames |
| Embedding (CoreML) | ~13ms | Per speaker per chunk (up to 3 speakers) |
| AHC Clustering | ~0.8s | 3000 embeddings (1000 chunks) |
| VBx Refinement | ~1.2s | 20 iterations, 3000 embeddings |
| **Full Pipeline (offline)** | **39x real-time** | 45-minute audio processed in 70 seconds |

### Streaming Performance

- **First chunk latency**: 10 seconds (requires full window)
- **Incremental latency**: ~30ms per 1-second push (after first chunk)
- **Recluster latency**: ~2 seconds for 30 minutes of audio (~1800 embeddings)

Streaming mode has higher per-chunk overhead due to the incremental nature but enables real-time applications.

## Supported Platforms

| Platform | Architecture | Status |
|----------|--------------|--------|
| macOS | arm64 (Apple Silicon) | ✅ Supported |
| macOS | x64 (Intel) | 🔜 Planned |
| Linux | any | ❌ Not supported (CoreML unavailable) |
| Windows | any | ❌ Not supported (CoreML unavailable) |

Intel macOS support is planned but not yet available. The CoreML dependency makes cross-platform support challenging without alternative inference backends.

## License

MIT

---

For issues, feature requests, or contributions, please visit the [GitHub repository](https://github.com/predict-woo/pyannote-ggml).

package/dist/Pyannote.d.ts
ADDED

@@ -0,0 +1,12 @@

import { StreamingSession } from './StreamingSession.js';
import type { DiarizationResult, ModelConfig } from './types.js';
export declare class Pyannote {
    private native;
    private constructor();
    static load(config: ModelConfig): Promise<Pyannote>;
    diarize(audio: Float32Array): Promise<DiarizationResult>;
    createStreamingSession(): StreamingSession;
    close(): void;
    get isClosed(): boolean;
}
//# sourceMappingURL=Pyannote.d.ts.map

package/dist/Pyannote.d.ts.map
ADDED

@@ -0,0 +1 @@

{"version":3,"file":"Pyannote.d.ts","sourceRoot":"","sources":["../src/Pyannote.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AACzD,OAAO,KAAK,EAAE,iBAAiB,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC;AAEjE,qBAAa,QAAQ;IACnB,OAAO,CAAC,MAAM,CAAsB;IAEpC,OAAO;WAIM,IAAI,CAAC,MAAM,EAAE,WAAW,GAAG,OAAO,CAAC,QAAQ,CAAC;IAkBnD,OAAO,CAAC,KAAK,EAAE,YAAY,GAAG,OAAO,CAAC,iBAAiB,CAAC;IAa9D,sBAAsB,IAAI,gBAAgB;IAQ1C,KAAK,IAAI,IAAI;IAIb,IAAI,QAAQ,IAAI,OAAO,CAEtB;CACF"}
package/dist/Pyannote.js
ADDED

@@ -0,0 +1,50 @@

import { accessSync } from 'node:fs';
import { getBinding } from './binding.js';
import { StreamingSession } from './StreamingSession.js';
export class Pyannote {
    native;
    constructor(native) {
        this.native = native;
    }
    static async load(config) {
        const paths = [
            config.segModelPath,
            config.embModelPath,
            config.pldaPath,
            config.coremlPath,
            config.segCoremlPath,
        ];
        for (const path of paths) {
            accessSync(path);
        }
        const binding = getBinding();
        const native = new binding.PyannoteModel(config);
        return new Pyannote(native);
    }
    async diarize(audio) {
        if (this.native.isClosed) {
            throw new Error('Model is closed');
        }
        if (!(audio instanceof Float32Array)) {
            throw new TypeError('Expected Float32Array');
        }
        if (audio.length === 0) {
            throw new Error('Audio must not be empty');
        }
        return this.native.diarize(audio);
    }
    createStreamingSession() {
        if (this.native.isClosed) {
            throw new Error('Model is closed');
        }
        const nativeSession = this.native.createStreamingSession();
        return new StreamingSession(nativeSession);
    }
    close() {
        this.native.close();
    }
    get isClosed() {
        return this.native.isClosed;
    }
}
//# sourceMappingURL=Pyannote.js.map

package/dist/Pyannote.js.map
ADDED

@@ -0,0 +1 @@

{"version":3,"file":"Pyannote.js","sourceRoot":"","sources":["../src/Pyannote.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AAErC,OAAO,EAAE,UAAU,EAA4B,MAAM,cAAc,CAAC;AACpE,OAAO,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AAGzD,MAAM,OAAO,QAAQ;IACX,MAAM,CAAsB;IAEpC,YAAoB,MAA2B;QAC7C,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;IACvB,CAAC;IAED,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,MAAmB;QACnC,MAAM,KAAK,GAAG;YACZ,MAAM,CAAC,YAAY;YACnB,MAAM,CAAC,YAAY;YACnB,MAAM,CAAC,QAAQ;YACf,MAAM,CAAC,UAAU;YACjB,MAAM,CAAC,aAAa;SACrB,CAAC;QAEF,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,UAAU,CAAC,IAAI,CAAC,CAAC;QACnB,CAAC;QAED,MAAM,OAAO,GAAG,UAAU,EAAE,CAAC;QAC7B,MAAM,MAAM,GAAG,IAAI,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;QACjD,OAAO,IAAI,QAAQ,CAAC,MAAM,CAAC,CAAC;IAC9B,CAAC;IAED,KAAK,CAAC,OAAO,CAAC,KAAmB;QAC/B,IAAI,IAAI,CAAC,MAAM,CAAC,QAAQ,EAAE,CAAC;YACzB,MAAM,IAAI,KAAK,CAAC,iBAAiB,CAAC,CAAC;QACrC,CAAC;QACD,IAAI,CAAC,CAAC,KAAK,YAAY,YAAY,CAAC,EAAE,CAAC;YACrC,MAAM,IAAI,SAAS,CAAC,uBAAuB,CAAC,CAAC;QAC/C,CAAC;QACD,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACvB,MAAM,IAAI,KAAK,CAAC,yBAAyB,CAAC,CAAC;QAC7C,CAAC;QACD,OAAO,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;IACpC,CAAC;IAED,sBAAsB;QACpB,IAAI,IAAI,CAAC,MAAM,CAAC,QAAQ,EAAE,CAAC;YACzB,MAAM,IAAI,KAAK,CAAC,iBAAiB,CAAC,CAAC;QACrC,CAAC;QACD,MAAM,aAAa,GAAG,IAAI,CAAC,MAAM,CAAC,sBAAsB,EAAE,CAAC;QAC3D,OAAO,IAAI,gBAAgB,CAAC,aAAa,CAAC,CAAC;IAC7C,CAAC;IAED,KAAK;QACH,IAAI,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC;IACtB,CAAC;IAED,IAAI,QAAQ;QACV,OAAO,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC;IAC9B,CAAC;CACF"}
package/dist/StreamingSession.d.ts
ADDED

@@ -0,0 +1,12 @@

import type { NativeStreamingSession } from './binding.js';
import type { DiarizationResult, VADChunk } from './types.js';
export declare class StreamingSession {
    private native;
    constructor(native: NativeStreamingSession);
    push(audio: Float32Array): Promise<VADChunk[]>;
    recluster(): Promise<DiarizationResult>;
    finalize(): Promise<DiarizationResult>;
    close(): void;
    get isClosed(): boolean;
}
//# sourceMappingURL=StreamingSession.d.ts.map

package/dist/StreamingSession.d.ts.map
ADDED

@@ -0,0 +1 @@

{"version":3,"file":"StreamingSession.d.ts","sourceRoot":"","sources":["../src/StreamingSession.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,sBAAsB,EAAE,MAAM,cAAc,CAAC;AAC3D,OAAO,KAAK,EAAE,iBAAiB,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AAE9D,qBAAa,gBAAgB;IAC3B,OAAO,CAAC,MAAM,CAAyB;gBAE3B,MAAM,EAAE,sBAAsB;IAIpC,IAAI,CAAC,KAAK,EAAE,YAAY,GAAG,OAAO,CAAC,QAAQ,EAAE,CAAC;IAU9C,SAAS,IAAI,OAAO,CAAC,iBAAiB,CAAC;IAOvC,QAAQ,IAAI,OAAO,CAAC,iBAAiB,CAAC;IAO5C,KAAK,IAAI,IAAI;IAIb,IAAI,QAAQ,IAAI,OAAO,CAEtB;CACF"}
package/dist/StreamingSession.js
ADDED

@@ -0,0 +1,34 @@

export class StreamingSession {
    native;
    constructor(native) {
        this.native = native;
    }
    async push(audio) {
        if (this.native.isClosed) {
            throw new Error('Session is closed');
        }
        if (!(audio instanceof Float32Array)) {
            throw new TypeError('Expected Float32Array');
        }
        return this.native.push(audio);
    }
    async recluster() {
        if (this.native.isClosed) {
            throw new Error('Session is closed');
        }
        return this.native.recluster();
    }
    async finalize() {
        if (this.native.isClosed) {
            throw new Error('Session is closed');
        }
        return this.native.finalize();
    }
    close() {
        this.native.close();
    }
    get isClosed() {
        return this.native.isClosed;
    }
}
//# sourceMappingURL=StreamingSession.js.map

package/dist/StreamingSession.js.map
ADDED

@@ -0,0 +1 @@

{"version":3,"file":"StreamingSession.js","sourceRoot":"","sources":["../src/StreamingSession.ts"],"names":[],"mappings":"AAGA,MAAM,OAAO,gBAAgB;IACnB,MAAM,CAAyB;IAEvC,YAAY,MAA8B;QACxC,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;IACvB,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,KAAmB;QAC5B,IAAI,IAAI,CAAC,MAAM,CAAC,QAAQ,EAAE,CAAC;YACzB,MAAM,IAAI,KAAK,CAAC,mBAAmB,CAAC,CAAC;QACvC,CAAC;QACD,IAAI,CAAC,CAAC,KAAK,YAAY,YAAY,CAAC,EAAE,CAAC;YACrC,MAAM,IAAI,SAAS,CAAC,uBAAuB,CAAC,CAAC;QAC/C,CAAC;QACD,OAAO,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IACjC,CAAC;IAED,KAAK,CAAC,SAAS;QACb,IAAI,IAAI,CAAC,MAAM,CAAC,QAAQ,EAAE,CAAC;YACzB,MAAM,IAAI,KAAK,CAAC,mBAAmB,CAAC,CAAC;QACvC,CAAC;QACD,OAAO,IAAI,CAAC,MAAM,CAAC,SAAS,EAAE,CAAC;IACjC,CAAC;IAED,KAAK,CAAC,QAAQ;QACZ,IAAI,IAAI,CAAC,MAAM,CAAC,QAAQ,EAAE,CAAC;YACzB,MAAM,IAAI,KAAK,CAAC,mBAAmB,CAAC,CAAC;QACvC,CAAC;QACD,OAAO,IAAI,CAAC,MAAM,CAAC,QAAQ,EAAE,CAAC;IAChC,CAAC;IAED,KAAK;QACH,IAAI,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC;IACtB,CAAC;IAED,IAAI,QAAQ;QACV,OAAO,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC;IAC9B,CAAC;CACF"}
package/dist/binding.d.ts
ADDED

@@ -0,0 +1,20 @@

import type { DiarizationResult, ModelConfig, VADChunk } from './types.js';
export interface NativePyannoteModel {
    diarize(audio: Float32Array): Promise<DiarizationResult>;
    createStreamingSession(): NativeStreamingSession;
    close(): void;
    isClosed: boolean;
}
export interface NativeStreamingSession {
    push(audio: Float32Array): Promise<VADChunk[]>;
    recluster(): Promise<DiarizationResult>;
    finalize(): Promise<DiarizationResult>;
    close(): void;
    isClosed: boolean;
}
export interface NativeBinding {
    PyannoteModel: new (config: ModelConfig) => NativePyannoteModel;
    StreamingSession: new (...args: unknown[]) => NativeStreamingSession;
}
export declare function getBinding(): NativeBinding;
//# sourceMappingURL=binding.d.ts.map

package/dist/binding.d.ts.map
ADDED

@@ -0,0 +1 @@

{"version":3,"file":"binding.d.ts","sourceRoot":"","sources":["../src/binding.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,iBAAiB,EAAE,WAAW,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AAI3E,MAAM,WAAW,mBAAmB;IAClC,OAAO,CAAC,KAAK,EAAE,YAAY,GAAG,OAAO,CAAC,iBAAiB,CAAC,CAAC;IACzD,sBAAsB,IAAI,sBAAsB,CAAC;IACjD,KAAK,IAAI,IAAI,CAAC;IACd,QAAQ,EAAE,OAAO,CAAC;CACnB;AAED,MAAM,WAAW,sBAAsB;IACrC,IAAI,CAAC,KAAK,EAAE,YAAY,GAAG,OAAO,CAAC,QAAQ,EAAE,CAAC,CAAC;IAC/C,SAAS,IAAI,OAAO,CAAC,iBAAiB,CAAC,CAAC;IACxC,QAAQ,IAAI,OAAO,CAAC,iBAAiB,CAAC,CAAC;IACvC,KAAK,IAAI,IAAI,CAAC;IACd,QAAQ,EAAE,OAAO,CAAC;CACnB;AAED,MAAM,WAAW,aAAa;IAC5B,aAAa,EAAE,KAAK,MAAM,EAAE,WAAW,KAAK,mBAAmB,CAAC;IAChE,gBAAgB,EAAE,KAAK,GAAG,IAAI,EAAE,OAAO,EAAE,KAAK,sBAAsB,CAAC;CACtE;AAoCD,wBAAgB,UAAU,IAAI,aAAa,CAyB1C"}
package/dist/binding.js
ADDED

@@ -0,0 +1,43 @@

import { createRequire } from 'module';
const require = createRequire(import.meta.url);
let cachedBinding = null;
function getPackageName() {
    if (process.platform !== 'darwin') {
        throw new Error(`Unsupported platform: ${process.platform}. pyannote-cpp-node currently supports macOS only.`);
    }
    if (process.arch === 'arm64') {
        return '@pyannote-cpp-node/darwin-arm64';
    }
    if (process.arch === 'x64') {
        return '@pyannote-cpp-node/darwin-x64';
    }
    throw new Error(`Unsupported architecture on macOS: ${process.arch}. Supported architectures are arm64 and x64.`);
}
function isNativeBinding(value) {
    if (typeof value !== 'object' || value === null) {
        return false;
    }
    const candidate = value;
    return (typeof candidate.PyannoteModel === 'function' &&
        typeof candidate.StreamingSession === 'function');
}
export function getBinding() {
    if (cachedBinding !== null) {
        return cachedBinding;
    }
    const packageName = getPackageName();
    let loaded;
    try {
        loaded = require(packageName);
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        throw new Error(`Failed to load native module '${packageName}'. Ensure the platform package is installed. Original error: ${message}`);
    }
    if (!isNativeBinding(loaded)) {
        throw new Error(`Invalid native module export from '${packageName}'. Expected PyannoteModel and StreamingSession constructors.`);
    }
    cachedBinding = loaded;
    return cachedBinding;
}
//# sourceMappingURL=binding.js.map

package/dist/binding.js.map
ADDED

@@ -0,0 +1 @@

{"version":3,"file":"binding.js","sourceRoot":"","sources":["../src/binding.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,aAAa,EAAE,MAAM,QAAQ,CAAC;AAIvC,MAAM,OAAO,GAAG,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AAsB/C,IAAI,aAAa,GAAyB,IAAI,CAAC;AAE/C,SAAS,cAAc;IACrB,IAAI,OAAO,CAAC,QAAQ,KAAK,QAAQ,EAAE,CAAC;QAClC,MAAM,IAAI,KAAK,CACb,yBAAyB,OAAO,CAAC,QAAQ,oDAAoD,CAC9F,CAAC;IACJ,CAAC;IAED,IAAI,OAAO,CAAC,IAAI,KAAK,OAAO,EAAE,CAAC;QAC7B,OAAO,iCAAiC,CAAC;IAC3C,CAAC;IAED,IAAI,OAAO,CAAC,IAAI,KAAK,KAAK,EAAE,CAAC;QAC3B,OAAO,+BAA+B,CAAC;IACzC,CAAC;IAED,MAAM,IAAI,KAAK,CACb,sCAAsC,OAAO,CAAC,IAAI,8CAA8C,CACjG,CAAC;AACJ,CAAC;AAED,SAAS,eAAe,CAAC,KAAc;IACrC,IAAI,OAAO,KAAK,KAAK,QAAQ,IAAI,KAAK,KAAK,IAAI,EAAE,CAAC;QAChD,OAAO,KAAK,CAAC;IACf,CAAC;IAED,MAAM,SAAS,GAAG,KAAgC,CAAC;IACnD,OAAO,CACL,OAAO,SAAS,CAAC,aAAa,KAAK,UAAU;QAC7C,OAAO,SAAS,CAAC,gBAAgB,KAAK,UAAU,CACjD,CAAC;AACJ,CAAC;AAED,MAAM,UAAU,UAAU;IACxB,IAAI,aAAa,KAAK,IAAI,EAAE,CAAC;QAC3B,OAAO,aAAa,CAAC;IACvB,CAAC;IAED,MAAM,WAAW,GAAG,cAAc,EAAE,CAAC;IAErC,IAAI,MAAe,CAAC;IACpB,IAAI,CAAC;QACH,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;IAChC,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,OAAO,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;QACvE,MAAM,IAAI,KAAK,CACb,iCAAiC,WAAW,gEAAgE,OAAO,EAAE,CACtH,CAAC;IACJ,CAAC;IAED,IAAI,CAAC,eAAe,CAAC,MAAM,CAAC,EAAE,CAAC;QAC7B,MAAM,IAAI,KAAK,CACb,sCAAsC,WAAW,8DAA8D,CAChH,CAAC;IACJ,CAAC;IAED,aAAa,GAAG,MAAM,CAAC;IACvB,OAAO,aAAa,CAAC;AACvB,CAAC"}
package/dist/index.d.ts
ADDED

package/dist/index.d.ts.map
ADDED

@@ -0,0 +1 @@

{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AACzC,OAAO,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AACzD,YAAY,EACV,iBAAiB,EACjB,WAAW,EACX,OAAO,EACP,QAAQ,GACT,MAAM,YAAY,CAAC"}
package/dist/index.js
ADDED

package/dist/index.js.map
ADDED

@@ -0,0 +1 @@

{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AACzC,OAAO,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC"}
package/dist/types.d.ts
ADDED

@@ -0,0 +1,23 @@

export interface ModelConfig {
    segModelPath: string;
    embModelPath: string;
    pldaPath: string;
    coremlPath: string;
    segCoremlPath: string;
}
export interface VADChunk {
    chunkIndex: number;
    startTime: number;
    duration: number;
    numFrames: number;
    vad: Float32Array;
}
export interface Segment {
    start: number;
    duration: number;
    speaker: string;
}
export interface DiarizationResult {
    segments: Segment[];
}
//# sourceMappingURL=types.d.ts.map

package/dist/types.d.ts.map
ADDED

@@ -0,0 +1 @@

{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,WAAW;IAC1B,YAAY,EAAE,MAAM,CAAC;IACrB,YAAY,EAAE,MAAM,CAAC;IACrB,QAAQ,EAAE,MAAM,CAAC;IACjB,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;CACvB;AAED,MAAM,WAAW,QAAQ;IACvB,UAAU,EAAE,MAAM,CAAC;IACnB,SAAS,EAAE,MAAM,CAAC;IAClB,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,MAAM,CAAC;IAClB,GAAG,EAAE,YAAY,CAAC;CACnB;AAED,MAAM,WAAW,OAAO;IACtB,KAAK,EAAE,MAAM,CAAC;IACd,QAAQ,EAAE,MAAM,CAAC;IACjB,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,iBAAiB;IAChC,QAAQ,EAAE,OAAO,EAAE,CAAC;CACrB"}
package/dist/types.js
ADDED

package/dist/types.js.map
ADDED

@@ -0,0 +1 @@

{"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":""}
package/package.json
ADDED

@@ -0,0 +1,30 @@

{
  "name": "pyannote-cpp-node",
  "version": "0.1.0",
  "type": "module",
  "main": "./dist/index.js",
  "types": "./dist/index.d.ts",
  "engines": {
    "node": ">=18"
  },
  "files": [
    "dist/**/*.js",
    "dist/**/*.d.ts",
    "dist/**/*.js.map",
    "dist/**/*.d.ts.map"
  ],
  "publishConfig": {
    "access": "public"
  },
  "optionalDependencies": {
    "@pyannote-cpp-node/darwin-arm64": "0.1.0",
    "@pyannote-cpp-node/darwin-x64": "0.1.0"
  },
  "devDependencies": {
    "typescript": "^5.7.0"
  },
  "scripts": {
    "build": "tsc",
    "test": "vitest run"
  }
}