pyannote-cpp-node 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +52 -3
- package/package.json +3 -3
package/README.md
CHANGED
|
@@ -247,6 +247,8 @@ Flushes all stages, runs final recluster + alignment, and returns the definitive
|
|
|
247
247
|
```typescript
|
|
248
248
|
type TranscriptionResult = {
|
|
249
249
|
segments: AlignedSegment[];
|
|
250
|
+
/** Silence-filtered audio when VAD model is loaded. Timestamps align to this audio. */
|
|
251
|
+
filteredAudio?: Float32Array;
|
|
250
252
|
};
|
|
251
253
|
```
|
|
252
254
|
|
|
@@ -430,6 +432,14 @@ export interface AlignedSegment {
|
|
|
430
432
|
export interface TranscriptionResult {
|
|
431
433
|
/** Full speaker-labeled transcript segments. */
|
|
432
434
|
segments: AlignedSegment[];
|
|
435
|
+
/**
|
|
436
|
+
* Silence-filtered audio (16 kHz mono Float32Array).
|
|
437
|
+
* Present when a VAD model is loaded (`vadModelPath` in config).
|
|
438
|
+
* Silence longer than 2 seconds is compressed to 2 seconds.
|
|
439
|
+
* All segment timestamps are aligned to this audio —
|
|
440
|
+
* save it directly and timestamps will sync correctly.
|
|
441
|
+
*/
|
|
442
|
+
filteredAudio?: Float32Array;
|
|
433
443
|
}
|
|
434
444
|
```
|
|
435
445
|
|
|
@@ -462,6 +472,43 @@ async function runOffline(audio: Float32Array) {
|
|
|
462
472
|
}
|
|
463
473
|
```
|
|
464
474
|
|
|
475
|
+
### Offline transcription with silence filtering
|
|
476
|
+
|
|
477
|
+
When a VAD model is provided, `transcribeOffline` automatically compresses silence longer than 2 seconds down to 2 seconds before running Whisper and diarization. The filtered audio is returned alongside segments so you can save it with correctly aligned timestamps.
|
|
478
|
+
|
|
479
|
+
```typescript
|
|
480
|
+
import { Pipeline } from 'pyannote-cpp-node';
|
|
481
|
+
import { writeFileSync } from 'node:fs';
|
|
482
|
+
|
|
483
|
+
async function runOfflineWithVAD(audio: Float32Array) {
|
|
484
|
+
const pipeline = await Pipeline.load({
|
|
485
|
+
segModelPath: './models/segmentation.gguf',
|
|
486
|
+
embModelPath: './models/embedding.gguf',
|
|
487
|
+
pldaPath: './models/plda.gguf',
|
|
488
|
+
coremlPath: './models/embedding.mlpackage',
|
|
489
|
+
segCoremlPath: './models/segmentation.mlpackage',
|
|
490
|
+
whisperModelPath: './models/ggml-large-v3-turbo-q5_0.bin',
|
|
491
|
+
vadModelPath: './models/ggml-silero-v6.2.0.bin', // enables silence filtering
|
|
492
|
+
});
|
|
493
|
+
|
|
494
|
+
const result = await pipeline.transcribeOffline(audio);
|
|
495
|
+
|
|
496
|
+
// Save the silence-filtered audio — timestamps in result.segments align to this
|
|
497
|
+
if (result.filteredAudio) {
|
|
498
|
+
// filteredAudio is 16 kHz mono Float32Array with silence compressed
|
|
499
|
+
writeFileSync('./output-filtered.pcm', Buffer.from(result.filteredAudio.buffer));
|
|
500
|
+
console.log(`Filtered: ${audio.length} -> ${result.filteredAudio.length} samples`);
|
|
501
|
+
}
|
|
502
|
+
|
|
503
|
+
for (const seg of result.segments) {
|
|
504
|
+
const end = seg.start + seg.duration;
|
|
505
|
+
console.log(`[${seg.speaker}] ${seg.start.toFixed(2)}-${end.toFixed(2)} ${seg.text.trim()}`);
|
|
506
|
+
}
|
|
507
|
+
|
|
508
|
+
pipeline.close();
|
|
509
|
+
}
|
|
510
|
+
```
|
|
511
|
+
|
|
465
512
|
### Offline transcription with progress and live transcript preview
|
|
466
513
|
|
|
467
514
|
```typescript
|
|
@@ -722,9 +769,11 @@ All API methods expect decoded PCM samples; file decoding/resampling is handled
|
|
|
722
769
|
|
|
723
770
|
### Offline mode (`transcribeOffline`)
|
|
724
771
|
|
|
725
|
-
1.
|
|
726
|
-
2.
|
|
727
|
-
3.
|
|
772
|
+
1. VAD silence filter (optional — compresses silence >2s to 2s when `vadModelPath` provided)
|
|
773
|
+
2. Single `whisper_full()` call on filtered audio
|
|
774
|
+
3. Offline diarization (segmentation → powerset → embeddings → PLDA → AHC → VBx) on filtered audio
|
|
775
|
+
4. WhisperX-style alignment (speaker assignment by maximum segment overlap)
|
|
776
|
+
5. Return segments + filtered audio bytes (timestamps aligned to filtered audio)
|
|
728
777
|
|
|
729
778
|
### Streaming mode (`transcribe` / `createSession`)
|
|
730
779
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pyannote-cpp-node",
|
|
3
|
-
"version": "0.5.0",
|
|
3
|
+
"version": "0.6.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"main": "./dist/index.js",
|
|
6
6
|
"types": "./dist/index.d.ts",
|
|
@@ -17,8 +17,8 @@
|
|
|
17
17
|
"access": "public"
|
|
18
18
|
},
|
|
19
19
|
"optionalDependencies": {
|
|
20
|
-
"@pyannote-cpp-node/darwin-arm64": "0.5.0",
|
|
21
|
-
"@pyannote-cpp-node/darwin-x64": "0.5.0"
|
|
20
|
+
"@pyannote-cpp-node/darwin-arm64": "0.6.0",
|
|
21
|
+
"@pyannote-cpp-node/darwin-x64": "0.6.0"
|
|
22
22
|
},
|
|
23
23
|
"devDependencies": {
|
|
24
24
|
"typescript": "^5.7.0"
|