pyannote-cpp-node 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/README.md +52 -3
  2. package/package.json +3 -3
package/README.md CHANGED
@@ -247,6 +247,8 @@ Flushes all stages, runs final recluster + alignment, and returns the definitive
247
247
  ```typescript
248
248
  type TranscriptionResult = {
249
249
  segments: AlignedSegment[];
250
+ /** Silence-filtered audio, present when a VAD model is loaded. Timestamps align to this audio. */
251
+ filteredAudio?: Float32Array;
250
252
  };
251
253
  ```
252
254
 
@@ -430,6 +432,14 @@ export interface AlignedSegment {
430
432
  export interface TranscriptionResult {
431
433
  /** Full speaker-labeled transcript segments. */
432
434
  segments: AlignedSegment[];
435
+ /**
436
+ * Silence-filtered audio (16 kHz mono Float32Array).
437
+ * Present when a VAD model is loaded (`vadModelPath` in config).
438
+ * Silence longer than 2 seconds is compressed to 2 seconds.
439
+ * All segment timestamps are aligned to this audio —
440
+ * save it directly and timestamps will sync correctly.
441
+ */
442
+ filteredAudio?: Float32Array;
433
443
  }
434
444
  ```
435
445
 
@@ -462,6 +472,43 @@ async function runOffline(audio: Float32Array) {
462
472
  }
463
473
  ```
464
474
 
475
+ ### Offline transcription with silence filtering
476
+
477
+ When a VAD model is provided, `transcribeOffline` automatically compresses silence longer than 2 seconds down to 2 seconds before running Whisper and diarization. The filtered audio is returned alongside segments so you can save it with correctly aligned timestamps.
478
+
479
+ ```typescript
480
+ import { Pipeline } from 'pyannote-cpp-node';
481
+ import { writeFileSync } from 'node:fs';
482
+
483
+ async function runOfflineWithVAD(audio: Float32Array) {
484
+ const pipeline = await Pipeline.load({
485
+ segModelPath: './models/segmentation.gguf',
486
+ embModelPath: './models/embedding.gguf',
487
+ pldaPath: './models/plda.gguf',
488
+ coremlPath: './models/embedding.mlpackage',
489
+ segCoremlPath: './models/segmentation.mlpackage',
490
+ whisperModelPath: './models/ggml-large-v3-turbo-q5_0.bin',
491
+ vadModelPath: './models/ggml-silero-v6.2.0.bin', // enables silence filtering
492
+ });
493
+
494
+ const result = await pipeline.transcribeOffline(audio);
495
+
496
+ // Save the silence-filtered audio — timestamps in result.segments align to this
497
+ if (result.filteredAudio) {
498
+ // filteredAudio is a 16 kHz mono Float32Array with silence compressed
499
+ writeFileSync('./output-filtered.pcm', Buffer.from(result.filteredAudio.buffer));
500
+ console.log(`Filtered: ${audio.length} -> ${result.filteredAudio.length} samples`);
501
+ }
502
+
503
+ for (const seg of result.segments) {
504
+ const end = seg.start + seg.duration;
505
+ console.log(`[${seg.speaker}] ${seg.start.toFixed(2)}-${end.toFixed(2)} ${seg.text.trim()}`);
506
+ }
507
+
508
+ pipeline.close();
509
+ }
510
+ ```
511
+
465
512
  ### Offline transcription with progress and live transcript preview
466
513
 
467
514
  ```typescript
@@ -722,9 +769,11 @@ All API methods expect decoded PCM samples; file decoding/resampling is handled
722
769
 
723
770
  ### Offline mode (`transcribeOffline`)
724
771
 
725
- 1. Single `whisper_full()` call on entire audio
726
- 2. Offline diarization (segmentation powerset embeddings → PLDA → AHC → VBx)
727
- 3. WhisperX-style alignment (speaker assignment by maximum segment overlap)
772
+ 1. VAD silence filter (optional; compresses silence >2s to 2s when `vadModelPath` is provided)
773
+ 2. Single `whisper_full()` call on filtered audio
774
+ 3. Offline diarization (segmentation powerset embeddings → PLDA → AHC → VBx) on filtered audio
775
+ 4. WhisperX-style alignment (speaker assignment by maximum segment overlap)
776
+ 5. Return segments + filtered audio samples as a `Float32Array` (timestamps aligned to filtered audio)
728
777
 
729
778
  ### Streaming mode (`transcribe` / `createSession`)
730
779
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pyannote-cpp-node",
3
- "version": "0.5.0",
3
+ "version": "0.6.0",
4
4
  "type": "module",
5
5
  "main": "./dist/index.js",
6
6
  "types": "./dist/index.d.ts",
@@ -17,8 +17,8 @@
17
17
  "access": "public"
18
18
  },
19
19
  "optionalDependencies": {
20
- "@pyannote-cpp-node/darwin-arm64": "0.5.0",
21
- "@pyannote-cpp-node/darwin-x64": "0.5.0"
20
+ "@pyannote-cpp-node/darwin-arm64": "0.6.0",
21
+ "@pyannote-cpp-node/darwin-x64": "0.6.0"
22
22
  },
23
23
  "devDependencies": {
24
24
  "typescript": "^5.7.0"