pyannote-cpp-node 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -3,7 +3,7 @@
3
3
  ![Platform](https://img.shields.io/badge/platform-macOS-lightgrey)
4
4
  ![Node](https://img.shields.io/badge/node-%3E%3D18-brightgreen)
5
5
 
6
- Node.js native bindings for integrated Whisper transcription + speaker diarization with speaker-labeled, word-level output.
6
+ Node.js native bindings for integrated Whisper transcription + speaker diarization with speaker-labeled segment output.
7
7
 
8
8
  ## Overview
9
9
 
@@ -14,14 +14,13 @@ Given 16 kHz mono PCM audio (`Float32Array`), it produces cumulative and final t
14
14
  - speaker label (`SPEAKER_00`, `SPEAKER_01`, ...)
15
15
  - segment start/duration in seconds
16
16
  - segment text
17
- - per-word timestamps
18
17
 
19
18
  The API supports both one-shot processing (`transcribe`) and incremental streaming (`createSession` + `push`/`finalize`). All heavy operations are asynchronous and run on libuv worker threads.
20
19
 
21
20
  ## Features
22
21
 
23
22
  - Integrated transcription + diarization in one pipeline
24
- - Speaker-labeled, word-level transcript output
23
+ - Speaker-labeled transcript segments with sentence-level text
25
24
  - One-shot and streaming APIs with the same output schema
26
25
  - Incremental `segments` events for live applications
27
26
  - Deterministic output for the same audio/models/config
@@ -162,108 +161,104 @@ session.on('segments', (segments: AlignedSegment[], audio: Float32Array) => {
162
161
 
163
162
  ```typescript
164
163
  export interface ModelConfig {
165
- /** Path to segmentation GGUF model file. */
164
+ // === Required Model Paths ===
165
+ /** Path to segmentation GGUF model */
166
166
  segModelPath: string;
167
167
 
168
- /** Path to embedding GGUF model file. */
168
+ /** Path to embedding GGUF model */
169
169
  embModelPath: string;
170
170
 
171
- /** Path to PLDA GGUF model file. */
171
+ /** Path to PLDA GGUF model */
172
172
  pldaPath: string;
173
173
 
174
- /** Path to embedding CoreML .mlpackage directory. */
174
+ /** Path to embedding CoreML .mlpackage directory */
175
175
  coremlPath: string;
176
176
 
177
- /** Path to segmentation CoreML .mlpackage directory. */
177
+ /** Path to segmentation CoreML .mlpackage directory */
178
178
  segCoremlPath: string;
179
179
 
180
- /** Path to Whisper GGUF model file. */
180
+ /** Path to Whisper GGUF model */
181
181
  whisperModelPath: string;
182
182
 
183
- /** Optional path to Silero VAD model file; enables silence compression. */
183
+ // === Optional Model Paths ===
184
+ /** Path to Silero VAD model (optional, enables silence compression) */
184
185
  vadModelPath?: string;
185
186
 
186
- /** Enable GPU for Whisper. Default: true. */
187
+ // === Whisper Context Options (model loading) ===
188
+ /** Enable GPU acceleration (default: true) */
187
189
  useGpu?: boolean;
188
190
 
189
- /** Enable flash attention when supported. Default: true. */
191
+ /** Enable Flash Attention (default: true) */
190
192
  flashAttn?: boolean;
191
193
 
192
- /** GPU device index. Default: 0. */
194
+ /** GPU device index (default: 0) */
193
195
  gpuDevice?: number;
194
196
 
195
197
  /**
196
- * Enable Whisper CoreML encoder.
197
- * Default: false.
198
- * Requires a matching `-encoder.mlmodelc` next to the GGUF model.
198
+ * Enable CoreML acceleration for Whisper encoder on macOS (default: false).
199
+ * The CoreML model must be placed next to the GGUF model and named with an `-encoder.mlmodelc` suffix,
200
+ * e.g., ggml-base.en.bin -> ggml-base.en-encoder.mlmodelc/
199
201
  */
200
202
  useCoreml?: boolean;
201
203
 
202
- /** Suppress Whisper native logs. Default: false. */
204
+ /** Suppress whisper.cpp log output (default: false) */
203
205
  noPrints?: boolean;
204
206
 
205
- /** Number of decode threads. Default: 4. */
207
+ // === Whisper Decode Options ===
208
+ /** Number of threads for Whisper inference (default: 4) */
206
209
  nThreads?: number;
207
210
 
208
- /** Language code for transcription. Default: 'en'. Omit for auto-detect behavior with model settings. */
211
+ /** Language code (e.g., 'en', 'zh'). Falls back to 'en' when omitted; set detectLanguage to auto-detect. (default: 'en') */
209
212
  language?: string;
210
213
 
211
- /** Translate to English. Default: false. */
214
+ /** Translate non-English speech to English (default: false) */
212
215
  translate?: boolean;
213
216
 
214
- /** Force language detection pass. Default: false. */
217
+ /** Auto-detect spoken language. Overrides 'language' when true. (default: false) */
215
218
  detectLanguage?: boolean;
216
219
 
217
- /** Base sampling temperature. Default: 0.0 (greedy). */
220
+ // === Sampling ===
221
+ /** Sampling temperature. 0.0 = greedy deterministic. (default: 0.0) */
218
222
  temperature?: number;
219
223
 
220
- /** Temperature increment for fallback sampling. Default: 0.2. */
224
+ /** Temperature increment for fallback retries (default: 0.2) */
221
225
  temperatureInc?: number;
222
226
 
223
- /** Disable temperature fallback ladder. Default: false. */
227
+ /** Disable temperature fallback. If true, temperatureInc is ignored. (default: false) */
224
228
  noFallback?: boolean;
225
229
 
226
- /** Beam size. Default: -1 (greedy with best_of). */
230
+ /** Beam search size. -1 uses greedy decoding. >1 enables beam search. (default: -1) */
227
231
  beamSize?: number;
228
232
 
229
- /** Number of candidates in best-of sampling. Default: 5. */
233
+ /** Best-of-N sampling candidates for greedy decoding (default: 5) */
230
234
  bestOf?: number;
231
235
 
232
- /** Compression/entropy threshold. Default: 2.4. */
236
+ // === Thresholds ===
237
+ /** Entropy threshold for decoder fallback (default: 2.4) */
233
238
  entropyThold?: number;
234
239
 
235
- /** Average logprob threshold. Default: -1.0. */
240
+ /** Log probability threshold for decoder fallback (default: -1.0) */
236
241
  logprobThold?: number;
237
242
 
238
- /** No-speech probability threshold. Default: 0.6. */
243
+ /** No-speech probability threshold (default: 0.6) */
239
244
  noSpeechThold?: number;
240
245
 
241
- /** Optional initial prompt text. Default: undefined. */
246
+ // === Context ===
247
+ /** Initial prompt text to condition the decoder (default: none) */
242
248
  prompt?: string;
243
249
 
244
- /** Disable context carry-over between decode windows. Default: true. */
250
+ /** Don't use previous segment as context for next segment (default: true) */
245
251
  noContext?: boolean;
246
252
 
247
- /** Suppress blank tokens. Default: true. */
253
+ /** Suppress blank outputs at the beginning of segments (default: true) */
248
254
  suppressBlank?: boolean;
249
255
 
250
- /** Suppress non-speech tokens. Default: false. */
256
+ /** Suppress non-speech tokens (default: false) */
251
257
  suppressNst?: boolean;
252
258
  }
253
259
 
254
- export interface AlignedWord {
255
- /** Word text (may include leading space from Whisper tokenization). */
256
- text: string;
257
-
258
- /** Word start time in seconds. */
259
- start: number;
260
-
261
- /** Word end time in seconds. */
262
- end: number;
263
- }
264
-
265
260
  export interface AlignedSegment {
266
- /** Global speaker label (for example, SPEAKER_00). */
261
+ /** Global speaker label (e.g., SPEAKER_00). */
267
262
  speaker: string;
268
263
 
269
264
  /** Segment start time in seconds. */
@@ -272,11 +267,8 @@ export interface AlignedSegment {
272
267
  /** Segment duration in seconds. */
273
268
  duration: number;
274
269
 
275
- /** Segment text (concatenated from words). */
270
+ /** Transcribed text for this segment. */
276
271
  text: string;
277
-
278
- /** Word-level timestamps for the segment. */
279
- words: AlignedWord[];
280
272
  }
281
273
 
282
274
  export interface TranscriptionResult {
@@ -407,11 +399,7 @@ The pipeline returns this JSON shape:
407
399
  "speaker": "SPEAKER_00",
408
400
  "start": 0.497000,
409
401
  "duration": 2.085000,
410
- "text": "Hello world",
411
- "words": [
412
- {"text": " Hello", "start": 0.500000, "end": 0.800000},
413
- {"text": " world", "start": 0.900000, "end": 1.200000}
414
- ]
402
+ "text": "Hello world"
415
403
  }
416
404
  ]
417
405
  }
@@ -433,7 +421,7 @@ The integrated pipeline runs in 7 stages:
433
421
  1. VAD silence filter (optional compression of long silence)
434
422
  2. Audio buffer (stream-safe FIFO with timestamp tracking)
435
423
  3. Segmentation (speech activity over rolling windows)
436
- 4. Transcription (Whisper sentence + word timestamps)
424
+ 4. Transcription (Whisper sentence-level segments)
437
425
  5. Alignment (segment-level speaker assignment by overlap)
438
426
  6. Finalize (flush + final recluster + final alignment)
439
427
  7. Callback/event emission (`segments` updates)
@@ -443,7 +431,7 @@ The integrated pipeline runs in 7 stages:
443
431
  - Diarization only: **39x real-time**
444
432
  - Integrated transcription + diarization: **~14.6x real-time**
445
433
  - 45-minute Korean meeting test (6 speakers): **2713s audio in 186s**
446
- - Alignment reduction: **701 Whisper segments -> 186 aligned speaker segments**
434
+ - Each Whisper segment maps 1:1 to a speaker-labeled segment (no merging)
447
435
  - Speaker confusion rate: **2.55%**
448
436
 
449
437
  ## Platform Support
package/dist/index.d.ts CHANGED
@@ -1,6 +1,6 @@
1
1
  export { Pipeline } from './Pipeline.js';
2
2
  export { PipelineSession } from './PipelineSession.js';
3
- export type { AlignedSegment, AlignedWord, ModelConfig, TranscriptionResult, } from './types.js';
3
+ export type { AlignedSegment, ModelConfig, TranscriptionResult, } from './types.js';
4
4
  export type { NativePipelineModel, NativePipelineSession, NativeBinding, } from './binding.js';
5
5
  export { getBinding } from './binding.js';
6
6
  //# sourceMappingURL=index.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AACzC,OAAO,EAAE,eAAe,EAAE,MAAM,sBAAsB,CAAC;AACvD,YAAY,EACV,cAAc,EACd,WAAW,EACX,WAAW,EACX,mBAAmB,GACpB,MAAM,YAAY,CAAC;AACpB,YAAY,EACV,mBAAmB,EACnB,qBAAqB,EACrB,aAAa,GACd,MAAM,cAAc,CAAC;AACtB,OAAO,EAAE,UAAU,EAAE,MAAM,cAAc,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AACzC,OAAO,EAAE,eAAe,EAAE,MAAM,sBAAsB,CAAC;AACvD,YAAY,EACV,cAAc,EACd,WAAW,EACX,mBAAmB,GACpB,MAAM,YAAY,CAAC;AACpB,YAAY,EACV,mBAAmB,EACnB,qBAAqB,EACrB,aAAa,GACd,MAAM,cAAc,CAAC;AACtB,OAAO,EAAE,UAAU,EAAE,MAAM,cAAc,CAAC"}
package/dist/index.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AACzC,OAAO,EAAE,eAAe,EAAE,MAAM,sBAAsB,CAAC;AAYvD,OAAO,EAAE,UAAU,EAAE,MAAM,cAAc,CAAC"}
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AACzC,OAAO,EAAE,eAAe,EAAE,MAAM,sBAAsB,CAAC;AAWvD,OAAO,EAAE,UAAU,EAAE,MAAM,cAAc,CAAC"}
package/dist/types.d.ts CHANGED
@@ -60,17 +60,11 @@ export interface ModelConfig {
60
60
  /** Suppress non-speech tokens (default: false) */
61
61
  suppressNst?: boolean;
62
62
  }
63
- export interface AlignedWord {
64
- text: string;
65
- start: number;
66
- end: number;
67
- }
68
63
  export interface AlignedSegment {
69
64
  speaker: string;
70
65
  start: number;
71
66
  duration: number;
72
67
  text: string;
73
- words: AlignedWord[];
74
68
  }
75
69
  export interface TranscriptionResult {
76
70
  segments: AlignedSegment[];
@@ -1 +1 @@
1
- {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,WAAW;IAE1B,sCAAsC;IACtC,YAAY,EAAE,MAAM,CAAC;IACrB,mCAAmC;IACnC,YAAY,EAAE,MAAM,CAAC;IACrB,8BAA8B;IAC9B,QAAQ,EAAE,MAAM,CAAC;IACjB,oDAAoD;IACpD,UAAU,EAAE,MAAM,CAAC;IACnB,uDAAuD;IACvD,aAAa,EAAE,MAAM,CAAC;IACtB,iCAAiC;IACjC,gBAAgB,EAAE,MAAM,CAAC;IAGzB,uEAAuE;IACvE,YAAY,CAAC,EAAE,MAAM,CAAC;IAGtB,8CAA8C;IAC9C,MAAM,CAAC,EAAE,OAAO,CAAC;IACjB,6CAA6C;IAC7C,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,oCAAoC;IACpC,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB;;;;OAIG;IACH,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,uDAAuD;IACvD,QAAQ,CAAC,EAAE,OAAO,CAAC;IAGnB,2DAA2D;IAC3D,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,8EAA8E;IAC9E,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,+DAA+D;IAC/D,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,oFAAoF;IACpF,cAAc,CAAC,EAAE,OAAO,CAAC;IAGzB,uEAAuE;IACvE,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,gEAAgE;IAChE,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,yFAAyF;IACzF,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,uFAAuF;IACvF,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,qEAAqE;IACrE,MAAM,CAAC,EAAE,MAAM,CAAC;IAGhB,4DAA4D;IAC5D,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,qEAAqE;IACrE,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,qDAAqD;IACrD,aAAa,CAAC,EAAE,MAAM,CAAC;IAGvB,mEAAmE;IACnE,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,6EAA6E;IAC7E,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,0EAA0E;IAC1E,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,kDAAkD;IAClD,WAAW,CAAC,EAAE,OAAO,CAAC;CACvB;AAED,MAAM,WAAW,WAAW;IAC1B,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,MAAM,CAAC;IACd,GAAG,EAAE,MAAM,CAAC;CACb;AAED,MAAM,WAAW,cAAc;IAC7B,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,WAAW,EAAE,CAAC;CACtB;AAED,MAAM,WAAW,mBAAmB;IAClC,QAAQ,EAAE,cAAc,EAAE,CAAC;CAC5B"}
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,WAAW;IAE1B,sCAAsC;IACtC,YAAY,EAAE,MAAM,CAAC;IACrB,mCAAmC;IACnC,YAAY,EAAE,MAAM,CAAC;IACrB,8BAA8B;IAC9B,QAAQ,EAAE,MAAM,CAAC;IACjB,oDAAoD;IACpD,UAAU,EAAE,MAAM,CAAC;IACnB,uDAAuD;IACvD,aAAa,EAAE,MAAM,CAAC;IACtB,iCAAiC;IACjC,gBAAgB,EAAE,MAAM,CAAC;IAGzB,uEAAuE;IACvE,YAAY,CAAC,EAAE,MAAM,CAAC;IAGtB,8CAA8C;IAC9C,MAAM,CAAC,EAAE,OAAO,CAAC;IACjB,6CAA6C;IAC7C,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,oCAAoC;IACpC,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB;;;;OAIG;IACH,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,uDAAuD;IACvD,QAAQ,CAAC,EAAE,OAAO,CAAC;IAGnB,2DAA2D;IAC3D,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,8EAA8E;IAC9E,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,+DAA+D;IAC/D,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,oFAAoF;IACpF,cAAc,CAAC,EAAE,OAAO,CAAC;IAGzB,uEAAuE;IACvE,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,gEAAgE;IAChE,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,yFAAyF;IACzF,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,uFAAuF;IACvF,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,qEAAqE;IACrE,MAAM,CAAC,EAAE,MAAM,CAAC;IAGhB,4DAA4D;IAC5D,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,qEAAqE;IACrE,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,qDAAqD;IACrD,aAAa,CAAC,EAAE,MAAM,CAAC;IAGvB,mEAAmE;IACnE,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,6EAA6E;IAC7E,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,0EAA0E;IAC1E,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,kDAAkD;IAClD,WAAW,CAAC,EAAE,OAAO,CAAC;CACvB;AAED,MAAM,WAAW,cAAc;IAC7B,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,MAAM,CAAC;CACd;AAED,MAAM,WAAW,mBAAmB;IAClC,QAAQ,EAAE,cAAc,EAAE,CAAC;CAC5B"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pyannote-cpp-node",
3
- "version": "0.2.0",
3
+ "version": "0.2.1",
4
4
  "type": "module",
5
5
  "main": "./dist/index.js",
6
6
  "types": "./dist/index.d.ts",
@@ -17,8 +17,8 @@
17
17
  "access": "public"
18
18
  },
19
19
  "optionalDependencies": {
20
- "@pyannote-cpp-node/darwin-arm64": "0.2.0",
21
- "@pyannote-cpp-node/darwin-x64": "0.2.0"
20
+ "@pyannote-cpp-node/darwin-arm64": "0.2.1",
21
+ "@pyannote-cpp-node/darwin-x64": "0.2.1"
22
22
  },
23
23
  "devDependencies": {
24
24
  "typescript": "^5.7.0"