pyannote-cpp-node 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +44 -56
- package/dist/index.d.ts +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js.map +1 -1
- package/dist/types.d.ts +0 -6
- package/dist/types.d.ts.map +1 -1
- package/package.json +3 -3
package/README.md
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|

|
|
4
4
|

|
|
5
5
|
|
|
6
|
-
Node.js native bindings for integrated Whisper transcription + speaker diarization with speaker-labeled
|
|
6
|
+
Node.js native bindings for integrated Whisper transcription + speaker diarization with speaker-labeled segment output.
|
|
7
7
|
|
|
8
8
|
## Overview
|
|
9
9
|
|
|
@@ -14,14 +14,13 @@ Given 16 kHz mono PCM audio (`Float32Array`), it produces cumulative and final t
|
|
|
14
14
|
- speaker label (`SPEAKER_00`, `SPEAKER_01`, ...)
|
|
15
15
|
- segment start/duration in seconds
|
|
16
16
|
- segment text
|
|
17
|
-
- per-word timestamps
|
|
18
17
|
|
|
19
18
|
The API supports both one-shot processing (`transcribe`) and incremental streaming (`createSession` + `push`/`finalize`). All heavy operations are asynchronous and run on libuv worker threads.
|
|
20
19
|
|
|
21
20
|
## Features
|
|
22
21
|
|
|
23
22
|
- Integrated transcription + diarization in one pipeline
|
|
24
|
-
- Speaker-labeled
|
|
23
|
+
- Speaker-labeled transcript segments with sentence-level text
|
|
25
24
|
- One-shot and streaming APIs with the same output schema
|
|
26
25
|
- Incremental `segments` events for live applications
|
|
27
26
|
- Deterministic output for the same audio/models/config
|
|
@@ -162,108 +161,104 @@ session.on('segments', (segments: AlignedSegment[], audio: Float32Array) => {
|
|
|
162
161
|
|
|
163
162
|
```typescript
|
|
164
163
|
export interface ModelConfig {
|
|
165
|
-
|
|
164
|
+
// === Required Model Paths ===
|
|
165
|
+
/** Path to segmentation GGUF model */
|
|
166
166
|
segModelPath: string;
|
|
167
167
|
|
|
168
|
-
/** Path to embedding GGUF model
|
|
168
|
+
/** Path to embedding GGUF model */
|
|
169
169
|
embModelPath: string;
|
|
170
170
|
|
|
171
|
-
/** Path to PLDA GGUF model
|
|
171
|
+
/** Path to PLDA GGUF model */
|
|
172
172
|
pldaPath: string;
|
|
173
173
|
|
|
174
|
-
/** Path to embedding CoreML .mlpackage directory
|
|
174
|
+
/** Path to embedding CoreML .mlpackage directory */
|
|
175
175
|
coremlPath: string;
|
|
176
176
|
|
|
177
|
-
/** Path to segmentation CoreML .mlpackage directory
|
|
177
|
+
/** Path to segmentation CoreML .mlpackage directory */
|
|
178
178
|
segCoremlPath: string;
|
|
179
179
|
|
|
180
|
-
/** Path to Whisper GGUF model
|
|
180
|
+
/** Path to Whisper GGUF model */
|
|
181
181
|
whisperModelPath: string;
|
|
182
182
|
|
|
183
|
-
|
|
183
|
+
// === Optional Model Paths ===
|
|
184
|
+
/** Path to Silero VAD model (optional, enables silence compression) */
|
|
184
185
|
vadModelPath?: string;
|
|
185
186
|
|
|
186
|
-
|
|
187
|
+
// === Whisper Context Options (model loading) ===
|
|
188
|
+
/** Enable GPU acceleration (default: true) */
|
|
187
189
|
useGpu?: boolean;
|
|
188
190
|
|
|
189
|
-
/** Enable
|
|
191
|
+
/** Enable Flash Attention (default: true) */
|
|
190
192
|
flashAttn?: boolean;
|
|
191
193
|
|
|
192
|
-
/** GPU device index
|
|
194
|
+
/** GPU device index (default: 0) */
|
|
193
195
|
gpuDevice?: number;
|
|
194
196
|
|
|
195
197
|
/**
|
|
196
|
-
* Enable Whisper
|
|
197
|
-
*
|
|
198
|
-
*
|
|
198
|
+
* Enable CoreML acceleration for Whisper encoder on macOS (default: false).
|
|
199
|
+
* The CoreML model must be placed next to the GGUF model with naming convention:
|
|
200
|
+
* e.g., ggml-base.en.bin -> ggml-base.en-encoder.mlmodelc/
|
|
199
201
|
*/
|
|
200
202
|
useCoreml?: boolean;
|
|
201
203
|
|
|
202
|
-
/** Suppress
|
|
204
|
+
/** Suppress whisper.cpp log output (default: false) */
|
|
203
205
|
noPrints?: boolean;
|
|
204
206
|
|
|
205
|
-
|
|
207
|
+
// === Whisper Decode Options ===
|
|
208
|
+
/** Number of threads for Whisper inference (default: 4) */
|
|
206
209
|
nThreads?: number;
|
|
207
210
|
|
|
208
|
-
/** Language code
|
|
211
|
+
/** Language code (e.g., 'en', 'zh'). Omit for auto-detect. (default: 'en') */
|
|
209
212
|
language?: string;
|
|
210
213
|
|
|
211
|
-
/** Translate to English
|
|
214
|
+
/** Translate non-English speech to English (default: false) */
|
|
212
215
|
translate?: boolean;
|
|
213
216
|
|
|
214
|
-
/**
|
|
217
|
+
/** Auto-detect spoken language. Overrides 'language' when true. (default: false) */
|
|
215
218
|
detectLanguage?: boolean;
|
|
216
219
|
|
|
217
|
-
|
|
220
|
+
// === Sampling ===
|
|
221
|
+
/** Sampling temperature. 0.0 = greedy deterministic. (default: 0.0) */
|
|
218
222
|
temperature?: number;
|
|
219
223
|
|
|
220
|
-
/** Temperature increment for fallback
|
|
224
|
+
/** Temperature increment for fallback retries (default: 0.2) */
|
|
221
225
|
temperatureInc?: number;
|
|
222
226
|
|
|
223
|
-
/** Disable temperature fallback
|
|
227
|
+
/** Disable temperature fallback. If true, temperatureInc is ignored. (default: false) */
|
|
224
228
|
noFallback?: boolean;
|
|
225
229
|
|
|
226
|
-
/** Beam size.
|
|
230
|
+
/** Beam search size. -1 uses greedy decoding. >1 enables beam search. (default: -1) */
|
|
227
231
|
beamSize?: number;
|
|
228
232
|
|
|
229
|
-
/**
|
|
233
|
+
/** Best-of-N sampling candidates for greedy decoding (default: 5) */
|
|
230
234
|
bestOf?: number;
|
|
231
235
|
|
|
232
|
-
|
|
236
|
+
// === Thresholds ===
|
|
237
|
+
/** Entropy threshold for decoder fallback (default: 2.4) */
|
|
233
238
|
entropyThold?: number;
|
|
234
239
|
|
|
235
|
-
/**
|
|
240
|
+
/** Log probability threshold for decoder fallback (default: -1.0) */
|
|
236
241
|
logprobThold?: number;
|
|
237
242
|
|
|
238
|
-
/** No-speech probability threshold
|
|
243
|
+
/** No-speech probability threshold (default: 0.6) */
|
|
239
244
|
noSpeechThold?: number;
|
|
240
245
|
|
|
241
|
-
|
|
246
|
+
// === Context ===
|
|
247
|
+
/** Initial prompt text to condition the decoder (default: none) */
|
|
242
248
|
prompt?: string;
|
|
243
249
|
|
|
244
|
-
/**
|
|
250
|
+
/** Don't use previous segment as context for next segment (default: true) */
|
|
245
251
|
noContext?: boolean;
|
|
246
252
|
|
|
247
|
-
/** Suppress blank
|
|
253
|
+
/** Suppress blank outputs at the beginning of segments (default: true) */
|
|
248
254
|
suppressBlank?: boolean;
|
|
249
255
|
|
|
250
|
-
/** Suppress non-speech tokens
|
|
256
|
+
/** Suppress non-speech tokens (default: false) */
|
|
251
257
|
suppressNst?: boolean;
|
|
252
258
|
}
|
|
253
259
|
|
|
254
|
-
export interface AlignedWord {
|
|
255
|
-
/** Word text (may include leading space from Whisper tokenization). */
|
|
256
|
-
text: string;
|
|
257
|
-
|
|
258
|
-
/** Word start time in seconds. */
|
|
259
|
-
start: number;
|
|
260
|
-
|
|
261
|
-
/** Word end time in seconds. */
|
|
262
|
-
end: number;
|
|
263
|
-
}
|
|
264
|
-
|
|
265
260
|
export interface AlignedSegment {
|
|
266
|
-
/** Global speaker label (
|
|
261
|
+
/** Global speaker label (e.g., SPEAKER_00). */
|
|
267
262
|
speaker: string;
|
|
268
263
|
|
|
269
264
|
/** Segment start time in seconds. */
|
|
@@ -272,11 +267,8 @@ export interface AlignedSegment {
|
|
|
272
267
|
/** Segment duration in seconds. */
|
|
273
268
|
duration: number;
|
|
274
269
|
|
|
275
|
-
/**
|
|
270
|
+
/** Transcribed text for this segment. */
|
|
276
271
|
text: string;
|
|
277
|
-
|
|
278
|
-
/** Word-level timestamps for the segment. */
|
|
279
|
-
words: AlignedWord[];
|
|
280
272
|
}
|
|
281
273
|
|
|
282
274
|
export interface TranscriptionResult {
|
|
@@ -407,11 +399,7 @@ The pipeline returns this JSON shape:
|
|
|
407
399
|
"speaker": "SPEAKER_00",
|
|
408
400
|
"start": 0.497000,
|
|
409
401
|
"duration": 2.085000,
|
|
410
|
-
"text": "Hello world"
|
|
411
|
-
"words": [
|
|
412
|
-
{"text": " Hello", "start": 0.500000, "end": 0.800000},
|
|
413
|
-
{"text": " world", "start": 0.900000, "end": 1.200000}
|
|
414
|
-
]
|
|
402
|
+
"text": "Hello world"
|
|
415
403
|
}
|
|
416
404
|
]
|
|
417
405
|
}
|
|
@@ -433,7 +421,7 @@ The integrated pipeline runs in 7 stages:
|
|
|
433
421
|
1. VAD silence filter (optional compression of long silence)
|
|
434
422
|
2. Audio buffer (stream-safe FIFO with timestamp tracking)
|
|
435
423
|
3. Segmentation (speech activity over rolling windows)
|
|
436
|
-
4. Transcription (Whisper sentence
|
|
424
|
+
4. Transcription (Whisper sentence-level segments)
|
|
437
425
|
5. Alignment (segment-level speaker assignment by overlap)
|
|
438
426
|
6. Finalize (flush + final recluster + final alignment)
|
|
439
427
|
7. Callback/event emission (`segments` updates)
|
|
@@ -443,7 +431,7 @@ The integrated pipeline runs in 7 stages:
|
|
|
443
431
|
- Diarization only: **39x real-time**
|
|
444
432
|
- Integrated transcription + diarization: **~14.6x real-time**
|
|
445
433
|
- 45-minute Korean meeting test (6 speakers): **2713s audio in 186s**
|
|
446
|
-
-
|
|
434
|
+
- Each Whisper segment maps 1:1 to a speaker-labeled segment (no merging)
|
|
447
435
|
- Speaker confusion rate: **2.55%**
|
|
448
436
|
|
|
449
437
|
## Platform Support
|
package/dist/index.d.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
export { Pipeline } from './Pipeline.js';
|
|
2
2
|
export { PipelineSession } from './PipelineSession.js';
|
|
3
|
-
export type { AlignedSegment,
|
|
3
|
+
export type { AlignedSegment, ModelConfig, TranscriptionResult, } from './types.js';
|
|
4
4
|
export type { NativePipelineModel, NativePipelineSession, NativeBinding, } from './binding.js';
|
|
5
5
|
export { getBinding } from './binding.js';
|
|
6
6
|
//# sourceMappingURL=index.d.ts.map
|
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AACzC,OAAO,EAAE,eAAe,EAAE,MAAM,sBAAsB,CAAC;AACvD,YAAY,EACV,cAAc,EACd,WAAW,EACX,
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AACzC,OAAO,EAAE,eAAe,EAAE,MAAM,sBAAsB,CAAC;AACvD,YAAY,EACV,cAAc,EACd,WAAW,EACX,mBAAmB,GACpB,MAAM,YAAY,CAAC;AACpB,YAAY,EACV,mBAAmB,EACnB,qBAAqB,EACrB,aAAa,GACd,MAAM,cAAc,CAAC;AACtB,OAAO,EAAE,UAAU,EAAE,MAAM,cAAc,CAAC"}
|
package/dist/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AACzC,OAAO,EAAE,eAAe,EAAE,MAAM,sBAAsB,CAAC;
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AACzC,OAAO,EAAE,eAAe,EAAE,MAAM,sBAAsB,CAAC;AAWvD,OAAO,EAAE,UAAU,EAAE,MAAM,cAAc,CAAC"}
|
package/dist/types.d.ts
CHANGED
|
@@ -60,17 +60,11 @@ export interface ModelConfig {
|
|
|
60
60
|
/** Suppress non-speech tokens (default: false) */
|
|
61
61
|
suppressNst?: boolean;
|
|
62
62
|
}
|
|
63
|
-
export interface AlignedWord {
|
|
64
|
-
text: string;
|
|
65
|
-
start: number;
|
|
66
|
-
end: number;
|
|
67
|
-
}
|
|
68
63
|
export interface AlignedSegment {
|
|
69
64
|
speaker: string;
|
|
70
65
|
start: number;
|
|
71
66
|
duration: number;
|
|
72
67
|
text: string;
|
|
73
|
-
words: AlignedWord[];
|
|
74
68
|
}
|
|
75
69
|
export interface TranscriptionResult {
|
|
76
70
|
segments: AlignedSegment[];
|
package/dist/types.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,WAAW;IAE1B,sCAAsC;IACtC,YAAY,EAAE,MAAM,CAAC;IACrB,mCAAmC;IACnC,YAAY,EAAE,MAAM,CAAC;IACrB,8BAA8B;IAC9B,QAAQ,EAAE,MAAM,CAAC;IACjB,oDAAoD;IACpD,UAAU,EAAE,MAAM,CAAC;IACnB,uDAAuD;IACvD,aAAa,EAAE,MAAM,CAAC;IACtB,iCAAiC;IACjC,gBAAgB,EAAE,MAAM,CAAC;IAGzB,uEAAuE;IACvE,YAAY,CAAC,EAAE,MAAM,CAAC;IAGtB,8CAA8C;IAC9C,MAAM,CAAC,EAAE,OAAO,CAAC;IACjB,6CAA6C;IAC7C,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,oCAAoC;IACpC,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB;;;;OAIG;IACH,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,uDAAuD;IACvD,QAAQ,CAAC,EAAE,OAAO,CAAC;IAGnB,2DAA2D;IAC3D,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,8EAA8E;IAC9E,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,+DAA+D;IAC/D,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,oFAAoF;IACpF,cAAc,CAAC,EAAE,OAAO,CAAC;IAGzB,uEAAuE;IACvE,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,gEAAgE;IAChE,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,yFAAyF;IACzF,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,uFAAuF;IACvF,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,qEAAqE;IACrE,MAAM,CAAC,EAAE,MAAM,CAAC;IAGhB,4DAA4D;IAC5D,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,qEAAqE;IACrE,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,qDAAqD;IACrD,aAAa,CAAC,EAAE,MAAM,CAAC;IAGvB,mEAAmE;IACnE,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,6EAA6E;IAC7E,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,0EAA0E;IAC1E,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,kDAAkD;IAClD,WAAW,CAAC,EAAE,OAAO,CAAC;CACvB;AAED,MAAM,WAAW,
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,WAAW;IAE1B,sCAAsC;IACtC,YAAY,EAAE,MAAM,CAAC;IACrB,mCAAmC;IACnC,YAAY,EAAE,MAAM,CAAC;IACrB,8BAA8B;IAC9B,QAAQ,EAAE,MAAM,CAAC;IACjB,oDAAoD;IACpD,UAAU,EAAE,MAAM,CAAC;IACnB,uDAAuD;IACvD,aAAa,EAAE,MAAM,CAAC;IACtB,iCAAiC;IACjC,gBAAgB,EAAE,MAAM,CAAC;IAGzB,uEAAuE;IACvE,YAAY,CAAC,EAAE,MAAM,CAAC;IAGtB,8CAA8C;IAC9C,MAAM,CAAC,EAAE,OAAO,CAAC;IACjB,6CAA6C;IAC7C,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,oCAAoC;IACpC,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB;;;;OAIG;IACH,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,uDAAuD;IACvD,QAAQ,CAAC,EAAE,OAAO,CAAC;IAGnB,2DAA2D;IAC3D,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,8EAA8E;IAC9E,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,+DAA+D;IAC/D,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,oFAAoF;IACpF,cAAc,CAAC,EAAE,OAAO,CAAC;IAGzB,uEAAuE;IACvE,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,gEAAgE;IAChE,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,yFAAyF;IACzF,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,uFAAuF;IACvF,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,qEAAqE;IACrE,MAAM,CAAC,EAAE,MAAM,CAAC;IAGhB,4DAA4D;IAC5D,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,qEAAqE;IACrE,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,qDAAqD;IACrD,aAAa,CAAC,EAAE,MAAM,CAAC;IAGvB,mEAAmE;IACnE,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,6EAA6E;IAC7E,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,0EAA0E;IAC1E,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,kDAAkD;IAClD,WAAW,CAAC,EAAE,OAAO,CAAC;CACvB;AAED,MAAM,WAAW,cAAc;IAC7B,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,MAAM,CAAC;CACd;AAED,MAAM,WAAW,mBAAmB;IAClC,QAAQ,EAAE,cAAc,EAAE,CAAC;CAC5B"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pyannote-cpp-node",
|
|
3
|
-
"version": "0.2.
|
|
3
|
+
"version": "0.2.1",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"main": "./dist/index.js",
|
|
6
6
|
"types": "./dist/index.d.ts",
|
|
@@ -17,8 +17,8 @@
|
|
|
17
17
|
"access": "public"
|
|
18
18
|
},
|
|
19
19
|
"optionalDependencies": {
|
|
20
|
-
"@pyannote-cpp-node/darwin-arm64": "0.2.
|
|
21
|
-
"@pyannote-cpp-node/darwin-x64": "0.2.
|
|
20
|
+
"@pyannote-cpp-node/darwin-arm64": "0.2.1",
|
|
21
|
+
"@pyannote-cpp-node/darwin-x64": "0.2.1"
|
|
22
22
|
},
|
|
23
23
|
"devDependencies": {
|
|
24
24
|
"typescript": "^5.7.0"
|