dinnote 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dinnote/__init__.py +8 -0
- dinnote/cli.py +80 -0
- dinnote/config.py +64 -0
- dinnote/config.yaml +81 -0
- dinnote/denoise.py +59 -0
- dinnote/diarize.py +145 -0
- dinnote/pipeline.py +137 -0
- dinnote/transcribe.py +201 -0
- dinnote/utils.py +73 -0
- dinnote/vad.py +125 -0
- dinnote/vocab.txt +16 -0
- dinnote-0.1.0.dist-info/METADATA +118 -0
- dinnote-0.1.0.dist-info/RECORD +16 -0
- dinnote-0.1.0.dist-info/WHEEL +5 -0
- dinnote-0.1.0.dist-info/entry_points.txt +2 -0
- dinnote-0.1.0.dist-info/top_level.txt +1 -0
dinnote/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
from . import denoise, vad, diarize, transcribe
|
|
2
|
+
from .pipeline import process_file
|
|
3
|
+
from .config import DenoiseConfig, VadConfig, DiarizeConfig, TranscribeConfig, PipelineConfig
|
|
4
|
+
|
|
5
|
+
__all__ = [
|
|
6
|
+
"denoise", "vad", "diarize", "transcribe", "process_file",
|
|
7
|
+
"DenoiseConfig", "VadConfig", "DiarizeConfig", "TranscribeConfig", "PipelineConfig",
|
|
8
|
+
]
|
dinnote/cli.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import sys
|
|
3
|
+
import time
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from .utils import AUDIO_EXTENSIONS, load_config, warn_if_no_cuda, setup_user_config, fmt_time
|
|
6
|
+
from .pipeline import process_file
|
|
7
|
+
from .config import PipelineConfig
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def main():
    """Command-line entry point: run the pipeline on one file or a directory.

    Parses the arguments, loads the YAML configuration, collects the audio
    files to process and calls ``process_file`` on each of them in sorted
    order.

    Exits with status 1 when the input does not exist, when a single input
    file has an unrecognized extension, when a directory contains no audio
    files, or when at least one file fails to process.
    """
    parser = argparse.ArgumentParser(
        description="Denoise, VAD, diarize, and transcribe audio files"
    )
    parser.add_argument("input")
    parser.add_argument("-o", "--output", default="output")
    parser.add_argument("-c", "--config", default=None,
                        help="Path to config.yaml (default: user config dir)")
    parser.add_argument("-f", "--force", action="store_true")
    args = parser.parse_args()

    config_dir = setup_user_config()
    config_path = args.config or str(config_dir / "config.yaml")
    raw = load_config(config_path)

    # A ``transcribe:`` key with nothing under it parses as None; treat it
    # exactly like a missing section instead of failing on ``None.get``.
    transcribe_raw = raw.get("transcribe") or {}
    raw["transcribe"] = transcribe_raw

    vocab_file = transcribe_raw.get("vocab_file")
    if vocab_file:
        # The bundled config.yaml ships ``vocab_file: vocab.txt``. Taken
        # literally that is relative to the current working directory, where
        # the file normally does not exist, so the vocabulary would be
        # silently dropped. Fall back to the directory of the config file.
        vocab_path = Path(vocab_file)
        if not vocab_path.is_absolute() and not vocab_path.exists():
            beside_config = Path(config_path).resolve().parent / vocab_path
            if beside_config.exists():
                transcribe_raw["vocab_file"] = str(beside_config)
    else:
        # No vocabulary configured: use the one in the user config dir.
        vocab_path = config_dir / "vocab.txt"
        if vocab_path.exists():
            transcribe_raw["vocab_file"] = str(vocab_path)

    config = PipelineConfig.from_dict(raw)
    input_path = Path(args.input)
    output_dir = Path(args.output)

    if input_path.is_file():
        if input_path.suffix.lower() not in AUDIO_EXTENSIONS:
            print(f"Unrecognized audio extension: {input_path.suffix}")
            sys.exit(1)
        files = [input_path]
    elif input_path.is_dir():
        # Non-recursive: only audio files directly inside the directory.
        files = sorted(
            f for f in input_path.iterdir()
            if f.is_file() and f.suffix.lower() in AUDIO_EXTENSIONS
        )
        if not files:
            print(f"No audio files found in {input_path}/")
            sys.exit(1)
    else:
        print(f"Not found: {input_path}")
        sys.exit(1)

    print(f"Files to process: {len(files)}")
    print(f"Output directory: {output_dir}/")
    warn_if_no_cuda()

    success = 0
    batch_start = time.monotonic()

    for i, audio_file in enumerate(files, 1):
        if len(files) > 1:
            print(f"\n{'═' * 60}")
            print(f" File {i}/{len(files)}: {audio_file.name}")

        if process_file(audio_file, output_dir, config, force=args.force):
            success += 1

    failed = len(files) - success

    # The batch summary is only printed when more than one file was processed.
    if len(files) > 1:
        print(f"\n{'─' * 60}")
        print(f" Batch complete: {success} succeeded, {failed} failed")
        print(f" Total time: {fmt_time(time.monotonic() - batch_start)}")
        print(f"\n{'─' * 60}")

    if failed:
        sys.exit(1)


if __name__ == "__main__":
    main()
|
dinnote/config.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass
class DenoiseConfig:
    """Settings for the denoising (vocal isolation) stage."""

    # Name of the Demucs model to run.
    model: str = "htdemucs"
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class VadConfig:
    """Settings for the voice-activity-detection stage."""

    # Detection threshold.
    threshold: float = 0.5

    # Minimum speech duration, in milliseconds.
    min_speech_duration_ms: Optional[int] = 250

    # Minimum silence duration, in milliseconds.
    min_silence_duration_ms: Optional[int] = 100

    # Padding, in milliseconds.
    padding_ms: int = 500

    # Segments longer than this are discarded as likely noise or music.
    # None keeps all.
    max_segment_length_sec: Optional[float] = 30.0

    # Segments with less than this gap between them are merged.
    # None disables merging.
    merge_within_sec: Optional[float] = 1.0
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class DiarizeConfig:
    """Settings for the speaker-diarization stage."""

    # HuggingFace access token.
    hf_token: Optional[str] = None

    # Speaker-count hints; None leaves a hint unset.
    num_speakers: Optional[int] = None
    min_speakers: Optional[int] = None
    max_speakers: Optional[int] = None

    # Turns less than this length (milliseconds) are discarded.
    min_turn_ms: int = 200
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class TranscribeConfig:
    """Settings for the Whisper transcription stage."""

    # Whisper model name.
    model: str = "base"

    # Language code used for transcription.
    language: str = "en"

    # 0 for greedy decoding, None to use Whisper's fallback sequence.
    temperature: Optional[float] = None

    # Whisper decoding thresholds.
    no_speech_threshold: Optional[float] = 0.6
    logprob_threshold: Optional[float] = -1.0
    compression_ratio_threshold: Optional[float] = 2.4

    # Use previous segment output as context. Improves coherence but can
    # propagate errors.
    condition_on_previous_text: bool = False

    # Path to a vocabulary file; None means no file is configured.
    vocab_file: Optional[str] = None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class PipelineConfig:
    """Aggregate configuration: one settings object per pipeline stage."""

    denoise: DenoiseConfig = field(default_factory=DenoiseConfig)
    vad: VadConfig = field(default_factory=VadConfig)
    diarize: DiarizeConfig = field(default_factory=DiarizeConfig)
    transcribe: TranscribeConfig = field(default_factory=TranscribeConfig)

    @staticmethod
    def from_dict(d: dict) -> "PipelineConfig":
        """Build a ``PipelineConfig`` from a nested mapping.

        Args:
            d: Mapping with optional ``denoise``, ``vad``, ``diarize`` and
                ``transcribe`` sub-mappings. Keys that are not fields of the
                corresponding dataclass are ignored. A section that is
                missing or None falls back to that stage's defaults; a
                section with no keys under it parses as None in YAML.

        Returns:
            A fully populated ``PipelineConfig``.
        """
        def _build(cls, section):
            # ``section`` is None for a missing or empty YAML section.
            section = section or {}
            known = cls.__dataclass_fields__
            return cls(**{k: v for k, v in section.items() if k in known})

        d = d or {}
        return PipelineConfig(
            denoise=_build(DenoiseConfig, d.get("denoise")),
            vad=_build(VadConfig, d.get("vad")),
            diarize=_build(DiarizeConfig, d.get("diarize")),
            transcribe=_build(TranscribeConfig, d.get("transcribe")),
        )
|
dinnote/config.yaml
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
denoise:
|
|
2
|
+
# Demucs model used for vocal isolation.
|
|
3
|
+
# Options: htdemucs, htdemucs_ft, mdx, mdx_extra, htdemucs_6s
|
|
4
|
+
# See: https://github.com/facebookresearch/demucs
|
|
5
|
+
model: htdemucs
|
|
6
|
+
|
|
7
|
+
vad:
|
|
8
|
+
# Detection sensitivity (0.0–1.0). Higher values require clearer speech to trigger.
|
|
9
|
+
# See: https://github.com/snakers4/silero-vad
|
|
10
|
+
threshold: 0.5
|
|
11
|
+
|
|
12
|
+
# Minimum duration of a detected speech region to keep (milliseconds).
|
|
13
|
+
# Set to null to use Silero's default (250ms).
|
|
14
|
+
min_speech_duration_ms: 250
|
|
15
|
+
|
|
16
|
+
# Minimum silence gap required before a new segment begins (milliseconds).
|
|
17
|
+
# Set to null to use Silero's default (100ms).
|
|
18
|
+
min_silence_duration_ms: 100
|
|
19
|
+
|
|
20
|
+
# Padding added before and after each detected segment (milliseconds).
|
|
21
|
+
# Set to null to disable padding.
|
|
22
|
+
padding_ms: 500
|
|
23
|
+
|
|
24
|
+
# Segments longer than this are discarded as likely noise or music (seconds).
|
|
25
|
+
# Set to null to keep all segments regardless of length.
|
|
26
|
+
max_segment_length_sec: 30
|
|
27
|
+
|
|
28
|
+
# Segments with less than this gap between them are merged into one (seconds).
|
|
29
|
+
# Set to null to disable merging of neighboring segments.
|
|
30
|
+
merge_within_sec: 1.0
|
|
31
|
+
|
|
32
|
+
diarize:
|
|
33
|
+
# HuggingFace token with access to pyannote/speaker-diarization-3.1.
|
|
34
|
+
# hf_token: hf_...
|
|
35
|
+
|
|
36
|
+
# Fix the number of speakers if known. Set to null to let pyannote estimate.
|
|
37
|
+
num_speakers: null
|
|
38
|
+
|
|
39
|
+
# Alternatively constrain the range and let pyannote pick within it.
|
|
40
|
+
min_speakers: null
|
|
41
|
+
max_speakers: null
|
|
42
|
+
|
|
43
|
+
# Turns shorter than this are discarded (milliseconds).
|
|
44
|
+
min_turn_ms: 200
|
|
45
|
+
|
|
46
|
+
transcribe:
|
|
47
|
+
# Whisper model size. Larger models are slower but more accurate.
|
|
48
|
+
# Options: tiny, base, small, medium, large
|
|
49
|
+
# See: https://github.com/openai/whisper
|
|
50
|
+
model: base
|
|
51
|
+
|
|
52
|
+
# Language code for transcription.
|
|
53
|
+
# Examples: en, fr, de, es, ja. Set to null to auto-detect.
|
|
54
|
+
language: en
|
|
55
|
+
|
|
56
|
+
# Decoding temperature.
|
|
57
|
+
# Set to 0 for single-pass greedy decoding with no fallback.
|
|
58
|
+
# Set to null to use Whisper's default fallback sequence (0.0, 0.2, …, 1.0),
|
|
59
|
+
# retrying at higher temperatures when output quality metrics look poor.
|
|
60
|
+
temperature: null
|
|
61
|
+
|
|
62
|
+
# Segments where Whisper's no-speech probability exceeds this are discarded.
|
|
63
|
+
# Set to null to use Whisper's default (0.6).
|
|
64
|
+
no_speech_threshold: 0.6
|
|
65
|
+
|
|
66
|
+
# Segments with average log-probability below this are discarded as likely low-quality audio.
|
|
67
|
+
# Set to null to use Whisper's default (-1.0).
|
|
68
|
+
logprob_threshold: -1.0
|
|
69
|
+
|
|
70
|
+
# Segments with compression ratio above this are discarded as likely hallucinations.
|
|
71
|
+
# Set to null to use Whisper's default (2.4).
|
|
72
|
+
compression_ratio_threshold: 2.4
|
|
73
|
+
|
|
74
|
+
# Whether Whisper should use the previous segment's output as context for the next.
|
|
75
|
+
# Improves coherence across segments but can propagate errors.
|
|
76
|
+
# Default: false.
|
|
77
|
+
condition_on_previous_text: false
|
|
78
|
+
|
|
79
|
+
# Path to a file containing domain-specific vocabulary to guide transcription.
|
|
80
|
+
# Set to null to disable.
|
|
81
|
+
vocab_file: vocab.txt
|
dinnote/denoise.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Uses Demucs to isolate vocals in audio.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import shutil
|
|
7
|
+
import subprocess
|
|
8
|
+
import sys
|
|
9
|
+
import tempfile
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from .utils import cuda_available
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _ffmpeg_env():
    """Return a copy of the environment for the Demucs subprocess.

    On Windows, the first ``bin`` directory of a WinGet ``Gyan.FFmpeg.Shared``
    package that contains an ``avcodec-*.dll`` is prepended to ``PATH``. On
    every other platform the copy is returned unchanged.
    """
    import glob as g

    env = os.environ.copy()
    if sys.platform == "win32":
        packages_root = os.path.join(
            env.get("LOCALAPPDATA", ""), "Microsoft", "WinGet", "Packages"
        )
        pattern = os.path.join(packages_root, "Gyan.FFmpeg.Shared*", "**", "bin")
        for candidate in g.glob(pattern, recursive=True):
            # Only accept a bin directory that really holds the codec DLLs.
            if g.glob(os.path.join(candidate, "avcodec-*.dll")):
                env["PATH"] = os.pathsep.join((candidate, env.get("PATH", "")))
                break
    return env
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _run_demucs(input_file: Path, output_file: Path, model: str = "htdemucs", device: str = "cpu"):
    """Run Demucs in a subprocess and copy the vocals stem to ``output_file``.

    Demucs writes into a temporary directory; the stem is expected at
    ``<tmp>/<model>/<input stem>/vocals.wav``.

    Raises:
        RuntimeError: Demucs exited with a non-zero status.
        FileNotFoundError: Demucs succeeded but the vocals stem is missing.
    """
    with tempfile.TemporaryDirectory() as scratch:
        scratch_dir = Path(scratch)
        command = [
            sys.executable, "-m", "demucs",
            "-n", model,
            "--two-stems=vocals",
            "--device", device,
            "-o", str(scratch_dir),
            str(input_file),
        ]
        exit_code = subprocess.run(command, env=_ffmpeg_env()).returncode
        if exit_code != 0:
            raise RuntimeError(f"demucs exited with code {exit_code}")

        vocals_path = scratch_dir.joinpath(model, input_file.stem, "vocals.wav")
        if not vocals_path.exists():
            raise FileNotFoundError(f"Demucs output not found: {vocals_path}")

        # Copy before the temporary directory is removed on exit.
        shutil.copy2(vocals_path, output_file)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def run(input_path: Path, output_dir: Path, config: dict, force: bool = False) -> Path:
    """Denoise one audio file with Demucs and return the path to the vocals .wav.

    The result is written to ``<output_dir>/<output_dir.name>_denoised.wav``.
    An existing file at that path is returned as-is unless ``force`` is set.

    Args:
        input_path: Audio file to denoise.
        output_dir: Directory receiving the denoised file (created if needed).
        config: Mapping; only the ``model`` key is read (default "htdemucs").
        force: Re-run Demucs even when the output already exists.
    """
    target = output_dir / f"{output_dir.name}_denoised.wav"
    if force or not target.exists():
        model = config.get("model", "htdemucs")
        device = "cuda" if cuda_available() else "cpu"
        output_dir.mkdir(parents=True, exist_ok=True)
        _run_demucs(input_path, target, model=model, device=device)
    return target
|
dinnote/diarize.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Uses pyannote speaker diarization to split denoised audio into per-speaker turns,
|
|
3
|
+
writing a diarization JSON that replaces VAD segments as the unit fed to Whisper.
|
|
4
|
+
|
|
5
|
+
Each turn is a single-speaker slice with absolute timestamps in the source audio,
|
|
6
|
+
so Whisper gets clean single-speaker audio and speaker attribution is built-in.
|
|
7
|
+
|
|
8
|
+
Output JSON format:
|
|
9
|
+
{
|
|
10
|
+
"metadata": { ... },
|
|
11
|
+
"turns": [
|
|
12
|
+
{"turn_id": 0, "speaker": "SPEAKER_00", "start_ms": 1200, "end_ms": 3450, "duration_ms": 2250},
|
|
13
|
+
...
|
|
14
|
+
]
|
|
15
|
+
}
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import json
|
|
19
|
+
import os
|
|
20
|
+
import sys
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from .utils import cuda_available
|
|
23
|
+
|
|
24
|
+
_pipeline_cache = None
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _load_pipeline(hf_token: str, num_speakers: int | None, device: str):
    """Return the pyannote diarization pipeline, loading it on first use.

    The pipeline is stored in the module-level ``_pipeline_cache``; once set,
    later calls return it regardless of the arguments passed.
    ``num_speakers`` is accepted but not used here.
    """
    global _pipeline_cache
    if _pipeline_cache is not None:
        return _pipeline_cache

    # Heavy imports are deferred until a pipeline actually has to be built.
    from pyannote.audio import Pipeline
    import torch

    loaded = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        token=hf_token,
    )
    loaded.to(torch.device(device))
    _pipeline_cache = loaded
    return _pipeline_cache
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _ensure_ffmpeg_dlls():
    """Register the FFmpeg shared DLL directory on Windows (required by torchcodec).

    Looks under the WinGet packages directory for a ``Gyan.FFmpeg.Shared*``
    package and registers the first ``bin`` directory containing an
    ``avcodec-*.dll``. Does nothing on other platforms, or when no such
    directory is found.
    """
    if sys.platform != "win32":
        return

    import glob as g

    packages_root = os.path.join(
        os.environ.get("LOCALAPPDATA", ""), "Microsoft", "WinGet", "Packages"
    )
    if not os.path.isdir(packages_root):
        return

    pattern = os.path.join(packages_root, "Gyan.FFmpeg.Shared*", "**", "bin")
    dll_dir = next(
        (
            candidate
            for candidate in g.glob(pattern, recursive=True)
            if g.glob(os.path.join(candidate, "avcodec-*.dll"))
        ),
        None,
    )
    if dll_dir is not None:
        os.add_dll_directory(dll_dir)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def run(
    audio_path: Path,
    output_dir: Path,
    config: dict,
    force: bool = False,
) -> Path:
    """Run pyannote diarization on an audio file and write the turns as JSON.

    The result goes to ``<output_dir>/<output_dir.name>_diarization.json``.
    An existing file at that path is returned untouched unless ``force`` is set.

    Args:
        audio_path: Audio file to diarize.
        output_dir: Directory receiving the JSON (created if needed).
        config: Mapping read for ``hf_token`` (required), ``num_speakers``,
            ``min_speakers``, ``max_speakers`` and ``min_turn_ms`` (default 200).
        force: Recompute even when the output file already exists.

    Returns:
        Path to the diarization JSON.

    Raises:
        RuntimeError: ``hf_token`` is missing or empty.
    """
    output_file = output_dir / f"{output_dir.name}_diarization.json"
    if output_file.exists() and not force:
        return output_file

    _ensure_ffmpeg_dlls()

    hf_token = config.get("hf_token")
    if not hf_token:
        raise RuntimeError(
            "HuggingFace token required for pyannote. "
            "Set diarize.hf_token in config.yaml."
        )

    num_speakers = config.get("num_speakers", None)
    min_turn_ms = config.get("min_turn_ms", 200)

    # Forward only the speaker-count hints that were actually set.
    diarize_kwargs = {
        key: config[key]
        for key in ("num_speakers", "min_speakers", "max_speakers")
        if config.get(key) is not None
    }

    device = "cuda" if cuda_available() else "cpu"
    pipeline = _load_pipeline(hf_token, num_speakers, device)

    import torchaudio

    # The pipeline is fed a 16 kHz, single-channel waveform.
    waveform, sr = torchaudio.load(str(audio_path))
    if sr != 16000:
        waveform = torchaudio.functional.resample(waveform, sr, 16000)
        sr = 16000
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    result = pipeline({"waveform": waveform, "sample_rate": sr}, **diarize_kwargs)
    del waveform

    # Extract the Annotation object (handles pyannote API differences).
    annotation = getattr(result, "speaker_diarization", result)

    # Every label pyannote produced, including speakers whose turns are all
    # dropped by the minimum-length filter below.
    speakers = sorted({label for _, _, label in annotation.itertracks(yield_label=True)})

    turns = []
    for segment, _, label in annotation.itertracks(yield_label=True):
        start_ms = round(segment.start * 1000)
        end_ms = round(segment.end * 1000)
        duration_ms = end_ms - start_ms
        if duration_ms >= min_turn_ms:
            turns.append({
                # Ids are consecutive over the turns that are kept.
                "turn_id": len(turns),
                "speaker": label,
                "start_ms": start_ms,
                "end_ms": end_ms,
                "duration_ms": duration_ms,
            })

    from pydub import AudioSegment

    audio_duration_ms = len(AudioSegment.from_file(str(audio_path)))

    output_dir.mkdir(parents=True, exist_ok=True)
    metadata = {
        "source_audio": audio_path.name,
        "audio_duration_ms": audio_duration_ms,
        "speakers_detected": speakers,
        "num_turns": len(turns),
        "total_speech_ms": sum(turn["duration_ms"] for turn in turns),
        "num_speakers_param": num_speakers,
        "min_turn_ms": min_turn_ms,
    }
    output = {"metadata": metadata, "turns": turns}
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    return output_file
|
dinnote/pipeline.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Extends the dinscribe pipeline with a speaker diarization step (pyannote) between VAD and
|
|
3
|
+
transcription. Diarization replaces VAD segments as the unit fed to Whisper, giving each
|
|
4
|
+
transcription entry a speaker label and cleaner single-speaker audio.
|
|
5
|
+
|
|
6
|
+
Pipeline: denoise -> vad -> diarize -> transcribe
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
import time
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
from . import denoise, vad, diarize, transcribe
|
|
14
|
+
from .config import PipelineConfig
|
|
15
|
+
from .utils import fmt_time, progress_bar, cuda_available
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def process_file(input_path: Path, output_dir: Path, config: "PipelineConfig | None" = None, force: bool = False) -> bool:
    """Run the full pipeline for one audio file. Returns True on success.

    Stages run in order: denoise -> vad -> diarize -> transcribe. Every stage
    writes its artifact into ``output_dir / input_path.stem`` and is skipped
    when that artifact already exists, unless ``force`` is set.

    Args:
        input_path: Audio file to process.
        output_dir: Root output directory; a sub-directory named after the
            input file's stem is created inside it.
        config: Pipeline settings. ``None`` (the default) uses a fresh
            ``PipelineConfig()`` with default values.
        force: Re-run every stage even when its output file already exists.

    Returns:
        True when all four stages completed or were cached. False as soon as
        a stage fails or its metadata cannot be read; the failure is printed
        rather than raised.
    """
    import dataclasses

    # Build the default per call instead of sharing one mutable instance
    # created when the module is imported.
    if config is None:
        config = PipelineConfig()

    file_dir = output_dir / input_path.stem
    file_dir.mkdir(parents=True, exist_ok=True)

    total_start = time.monotonic()

    print(f"\n{'─' * 60}")
    print(f" Input: {input_path}")
    print(f" Output: {file_dir}/")
    print('─' * 60)

    print("\n [1/4] Denoising")
    denoised_path = file_dir / f"{input_path.stem}_denoised.wav"
    if not force and denoised_path.exists():
        print(" ✓ Skipped (cached)")
    else:
        print(" Running Demucs vocal isolation...")
        step_start = time.monotonic()
        try:
            denoised_path = denoise.run(input_path, file_dir, dataclasses.asdict(config.denoise), force=force)
        except Exception as e:
            print(f"\n ERROR: Denoising failed: {e}")
            return False
        print(f" ✓ Done ({fmt_time(time.monotonic() - step_start)})")

    print("\n [2/4] Voice Activity Detection")
    vad_path = file_dir / f"{input_path.stem}_vad.json"
    if not force and vad_path.exists():
        # The VAD metadata is only needed for the summary line below, so a
        # cached result is not parsed at all.
        print(" ✓ Skipped (cached)")
    else:
        print(" Detecting speech segments...")
        step_start = time.monotonic()
        try:
            vad_path = vad.run(denoised_path, file_dir, dataclasses.asdict(config.vad), force=force)
            vad_meta = json.loads(vad_path.read_text(encoding="utf-8"))["metadata"]
        except Exception as e:
            print(f"\n ERROR: VAD failed: {e}")
            return False
        speech_s = vad_meta["total_speech_ms"] / 1000
        audio_s = vad_meta["audio_duration_ms"] / 1000
        print(f" ✓ {vad_meta['segment_count']} segments found "
              f"({speech_s:.1f}s speech in {audio_s:.1f}s audio) "
              f"({fmt_time(time.monotonic() - step_start)})")

    print("\n [3/4] Speaker Diarization")
    diarization_path = file_dir / f"{input_path.stem}_diarization.json"
    diar_cached = not force and diarization_path.exists()
    step_start = time.monotonic()
    if diar_cached:
        print(" ✓ Skipped (cached)")
    else:
        diar_config = dataclasses.asdict(config.diarize)
        num_spk = diar_config.get("num_speakers")
        print(f" Running pyannote{f' (num_speakers={num_spk})' if num_spk else ''}...")
        try:
            diarization_path = diarize.run(denoised_path, file_dir, diar_config, force=force)
        except Exception as e:
            print(f"\n ERROR: Diarization failed: {e}")
            return False

    # Needed even for a cached diarization: the transcription summary reports
    # the number of turns. A truncated or hand-edited cache file must fail
    # this one input instead of aborting the whole batch.
    try:
        diar_meta = json.loads(diarization_path.read_text(encoding="utf-8"))["metadata"]
    except (OSError, ValueError, KeyError, TypeError) as e:
        print(f"\n ERROR: Could not read diarization metadata from {diarization_path}: {e}")
        return False
    if not diar_cached:
        speakers = diar_meta["speakers_detected"]
        print(f" ✓ {len(speakers)} speaker(s): {', '.join(speakers)} "
              f"| {diar_meta['num_turns']} turns "
              f"({fmt_time(time.monotonic() - step_start)})")

    print("\n [4/4] Transcribing")
    transcription_path = file_dir / f"{input_path.stem}_transcription.json"
    if not force and transcription_path.exists():
        print(" ✓ Skipped (cached)")
    else:
        trans_config = dataclasses.asdict(config.transcribe)
        model_name = trans_config.get("model", "base")
        device = "cuda" if cuda_available() else "cpu"
        print(f" Loading Whisper '{model_name}' on {device}...")
        step_start = time.monotonic()

        def on_segment(current: int, total: int):
            # Redraw the progress bar in place on a single console line.
            bar = progress_bar(current, total)
            print(f"\r [{bar}] {current}/{total} turns", end="", flush=True)

        try:
            transcription_path = transcribe.run(
                denoised_path,
                file_dir,
                trans_config,
                diarization_path=diarization_path,
                vad_path=vad_path,
                on_segment=on_segment,
                force=force,
            )
        except Exception as e:
            print(f"\n ERROR: Transcription failed: {e}")
            return False

        # Terminate the in-place progress line.
        print()
        try:
            trans_meta = json.loads(transcription_path.read_text(encoding="utf-8"))["metadata"]
        except (OSError, ValueError, KeyError, TypeError) as e:
            print(f"\n ERROR: Could not read transcription metadata from {transcription_path}: {e}")
            return False
        kept = trans_meta["processed_segments"]
        total_turns = diar_meta["num_turns"]
        print(f" ✓ {kept}/{total_turns} turns transcribed "
              f"({fmt_time(time.monotonic() - step_start)})")

    print(f"\n{'─' * 60}")
    print(f" Total time: {fmt_time(time.monotonic() - total_start)}")
    print(" Output:")
    for p in (denoised_path, vad_path, diarization_path, transcription_path):
        print(f" {p.relative_to(output_dir.parent)}")
    print('─' * 60)

    return True
|
dinnote/transcribe.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Transcribes speech segments from denoised audio using Whisper.
|
|
3
|
+
|
|
4
|
+
Accepts either:
|
|
5
|
+
- a diarization JSON (preferred) — per-speaker turns from pyannote
|
|
6
|
+
- a VAD JSON (fallback) — speaker-agnostic segments from Silero
|
|
7
|
+
|
|
8
|
+
When using diarization, each transcription entry includes the speaker label.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import shutil
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Optional, Callable
|
|
15
|
+
from pydub import AudioSegment
|
|
16
|
+
import whisper
|
|
17
|
+
from .utils import cuda_available
|
|
18
|
+
|
|
19
|
+
_whisper_cache: dict = {}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _load_model(model_name: str, device: str):
    """Return the Whisper model for ``(model_name, device)``, loading it once.

    Loaded models are kept in the module-level ``_whisper_cache`` so repeated
    calls with the same pair reuse the same object.
    """
    cache_key = (model_name, device)
    model = _whisper_cache.get(cache_key)
    if model is None:
        model = whisper.load_model(model_name, device=device)
        _whisper_cache[cache_key] = model
    return model
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _load_vocabulary(vocab_file: Optional[str]) -> str:
    """Build Whisper's initial_prompt from a vocabulary file.

    The file holds one term per line. Blank lines and comment lines are
    skipped; a comment is any line whose first non-blank character is ``#``.

    Args:
        vocab_file: Path to the vocabulary file, or None/empty for no file.

    Returns:
        ``"Common terms: a, b, c."`` when at least one term was found,
        otherwise an empty string. A missing path, or a path that is not a
        regular file, also yields an empty string.
    """
    if not vocab_file:
        return ""
    vocab_path = Path(vocab_file)
    if not vocab_path.is_file():
        return ""
    # "utf-8-sig" drops a leading byte-order mark, which would otherwise hide
    # a ``#`` on the first line and end up inside the first term.
    lines = vocab_path.read_text(encoding="utf-8-sig").splitlines()
    # Test the stripped text so indented comments are recognized as comments.
    terms = [
        term
        for line in lines
        if (term := line.strip()) and not term.startswith("#")
    ]
    return f"Common terms: {', '.join(terms)}." if terms else ""
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _load_segments(diarization_path: Optional[Path], vad_path: Optional[Path]):
    """Load the segments to transcribe, preferring diarization over VAD.

    Returns:
        ``(segments, source)``. With ``source == "diarization"`` each segment
        holds ``start_ms``, ``end_ms`` and ``speaker`` taken from the file's
        ``turns`` list. With ``source == "vad"`` each segment holds only
        ``start_ms`` and ``end_ms`` taken from the file's ``segments`` list.

    Raises:
        RuntimeError: Neither path was given or neither file exists.
    """
    def _read_json(path):
        return json.loads(path.read_text(encoding="utf-8"))

    def _pick(entries, keys):
        # Keep only the fields the transcriber needs, in a fixed key order.
        return [{key: entry[key] for key in keys} for entry in entries]

    if diarization_path and diarization_path.exists():
        turns = _read_json(diarization_path).get("turns", [])
        return _pick(turns, ("start_ms", "end_ms", "speaker")), "diarization"

    if vad_path and vad_path.exists():
        spans = _read_json(vad_path).get("segments", [])
        return _pick(spans, ("start_ms", "end_ms")), "vad"

    raise RuntimeError("No diarization or VAD file provided.")
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def run(
    audio_path: Path,
    output_dir: Path,
    config: dict,
    diarization_path: Optional[Path] = None,
    vad_path: Optional[Path] = None,
    on_segment: Optional[Callable] = None,
    force: bool = False,
) -> Path:
    """Transcribe speaker turns from denoised audio. Returns path to transcription JSON.

    The result is written to
    ``<output_dir>/<output_dir.name>_transcription.json`` and rewritten after
    every segment, so the file always reflects the progress made so far. An
    existing file at that path is returned untouched unless ``force`` is set.

    Args:
        audio_path: Audio file the segments refer to.
        output_dir: Directory receiving the JSON (created if needed).
        config: Mapping read for ``model``, ``language``, ``temperature``,
            ``no_speech_threshold``, ``logprob_threshold``,
            ``compression_ratio_threshold``, ``condition_on_previous_text``
            and ``vocab_file``.
        diarization_path: Diarization JSON (preferred segment source).
        vad_path: VAD JSON (fallback segment source).
        on_segment: Optional callback ``on_segment(current, total)`` invoked
            once per segment, whether it succeeded, failed or was dropped.
        force: Recompute even when the output file already exists.

    Returns:
        Path to the transcription JSON. In its metadata,
        ``processed_segments`` counts segments Whisper decoded (including
        those later dropped as empty or non-speech), ``failed_segments``
        counts segments whose decoding raised, and ``segment_errors`` lists
        the 1-based index and message of each failure.

    Raises:
        RuntimeError: No segment file is available, or it holds no segments.
    """
    output_file = output_dir / f"{output_dir.name}_transcription.json"
    if not force and output_file.exists():
        return output_file

    output_dir.mkdir(parents=True, exist_ok=True)
    model_name = config.get("model", "base")
    language = config.get("language", "en")
    temperature = config.get("temperature", None)
    no_speech_threshold = config.get("no_speech_threshold", None)
    logprob_threshold = config.get("logprob_threshold", None)
    compression_ratio_threshold = config.get("compression_ratio_threshold", None)
    condition_on_previous_text = config.get("condition_on_previous_text", False)
    vocab_file = config.get("vocab_file", "vocab.txt")

    device = "cuda" if cuda_available() else "cpu"
    model = _load_model(model_name, device)
    initial_prompt = _load_vocabulary(vocab_file) or None

    segments, source = _load_segments(diarization_path, vad_path)
    if not segments:
        raise RuntimeError("No segments found to transcribe.")

    audio = AudioSegment.from_file(str(audio_path))
    audio_duration_ms = len(audio)

    # Drop segments that start past the end of the audio and clip the rest.
    valid_segments = [seg for seg in segments if seg["start_ms"] < audio_duration_ms]
    for seg in valid_segments:
        seg["end_ms"] = min(seg["end_ms"], audio_duration_ms)

    total = len(valid_segments)

    # Decoding options are identical for every segment, so build them once.
    # Settings left at None are omitted so Whisper applies its own defaults.
    transcribe_kwargs = dict(
        language=language,
        fp16=(device == "cuda"),
        condition_on_previous_text=condition_on_previous_text,
        initial_prompt=initial_prompt,
    )
    optional_settings = {
        "temperature": temperature,
        "no_speech_threshold": no_speech_threshold,
        "logprob_threshold": logprob_threshold,
        "compression_ratio_threshold": compression_ratio_threshold,
    }
    transcribe_kwargs.update(
        {name: value for name, value in optional_settings.items() if value is not None}
    )

    metadata = {
        "source_audio": audio_path.name,
        "model": model_name,
        "language": language,
        "segment_source": source,
        "temperature": temperature,
        "no_speech_threshold": no_speech_threshold,
        "logprob_threshold": logprob_threshold,
        "compression_ratio_threshold": compression_ratio_threshold,
        "total_segments": total,
        "processed_segments": 0,
        "failed_segments": 0,
        "segment_errors": [],
    }
    output = {"metadata": metadata, "transcription": []}
    _write_json(output_file, output)

    # Start from an empty scratch directory for the per-segment WAV files.
    temp_dir = output_dir / "_temp_segments"
    if temp_dir.exists():
        shutil.rmtree(temp_dir)
    temp_dir.mkdir()

    try:
        for i, seg in enumerate(valid_segments, 1):
            start_ms = seg["start_ms"]
            end_ms = seg["end_ms"]
            speaker = seg.get("speaker")

            temp_file = temp_dir / f"seg_{i}.wav"
            audio[start_ms:end_ms].export(str(temp_file), format="wav")

            try:
                result = model.transcribe(str(temp_file), **transcribe_kwargs)
            except Exception as exc:
                # Best effort: one bad segment must not abort the file, but
                # the failure is recorded instead of vanishing silently.
                metadata["failed_segments"] += 1
                metadata["segment_errors"].append({"segment": i, "error": str(exc)})
                _write_json(output_file, output)
                if on_segment:
                    on_segment(i, total)
                continue
            finally:
                temp_file.unlink(missing_ok=True)

            text = result["text"].strip()
            # Highest no-speech probability over Whisper's own sub-segments.
            no_speech_prob = max(
                (s.get("no_speech_prob", 0) for s in result.get("segments", [])),
                default=0,
            )

            metadata["processed_segments"] += 1

            is_silence = no_speech_threshold is not None and no_speech_prob > no_speech_threshold
            if text and not is_silence:
                entry = {
                    "timestamp": {"start": start_ms / 1000, "end": end_ms / 1000},
                    "text": text,
                }
                if speaker is not None:
                    entry["speaker"] = speaker
                output["transcription"].append(entry)

            _write_json(output_file, output)
            if on_segment:
                on_segment(i, total)
    finally:
        if temp_dir.exists():
            shutil.rmtree(temp_dir)

    return output_file
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def _write_json(path: Path, data: dict):
    """Overwrite *path* with *data* rendered as UTF-8 JSON.

    The document is indented by two spaces and non-ASCII characters are
    written verbatim instead of being escaped.
    """
    encoder = json.JSONEncoder(indent=2, ensure_ascii=False)
    with open(path, "w", encoding="utf-8") as handle:
        # Stream the encoder's chunks straight to disk, one write per chunk.
        for chunk in encoder.iterencode(data):
            handle.write(chunk)
|
dinnote/utils.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import shutil
|
|
3
|
+
import sys
|
|
4
|
+
from importlib import resources
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
# Lowercase, dot-prefixed file suffixes. Presumably the set of inputs the CLI
# accepts as audio -- confirm against cli.py, which imports this name.
AUDIO_EXTENSIONS = frozenset({".mp3", ".wav", ".m4a", ".flac", ".ogg", ".aac"})
|
|
8
|
+
|
|
9
|
+
# Names of the packaged default files that setup_user_config() copies into the
# user config directory when they are not already present there.
_CONFIG_FILES = ("config.yaml", "vocab.txt")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def get_config_dir() -> Path:
    """Return the per-user ``dinnote`` configuration directory.

    The base directory depends on ``sys.platform``:

    * ``win32``: ``%APPDATA%``, falling back to the home directory.
    * ``darwin``: ``~/Library/Application Support``.
    * anything else: ``$XDG_CONFIG_HOME``, falling back to ``~/.config``.

    An environment variable that is set but empty is treated the same as an
    unset one.

    Returns:
        ``<base>/dinnote``. The directory is not created here.
    """
    home = Path.home()
    if sys.platform == "win32":
        override, fallback = os.environ.get("APPDATA"), home
    elif sys.platform == "darwin":
        override, fallback = None, home / "Library" / "Application Support"
    else:
        override, fallback = os.environ.get("XDG_CONFIG_HOME"), home / ".config"

    # Path("") is the current working directory, so an empty variable would
    # otherwise scatter config files into whatever directory the tool runs in.
    base = Path(override) if override else fallback
    return base / "dinnote"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def setup_user_config() -> Path:
    """Make sure the user config directory exists and holds the default files.

    Every name in ``_CONFIG_FILES`` that is missing from the directory is
    copied out of the installed ``dinnote`` package. Files that already exist
    are left untouched.

    Returns:
        The user config directory.
    """
    target_dir = get_config_dir()
    target_dir.mkdir(parents=True, exist_ok=True)

    package_root = resources.files("dinnote")
    for filename in _CONFIG_FILES:
        destination = target_dir / filename
        if destination.exists():
            continue
        bundled = package_root.joinpath(filename)
        # as_file() yields a real filesystem path for the packaged resource.
        with resources.as_file(bundled) as bundled_path:
            shutil.copy2(bundled_path, destination)

    return target_dir
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def load_config(config_path: str, section: str = "") -> dict:
    """Load a YAML config file, optionally narrowed to one top-level section.

    Loading is best-effort and never raises: a missing file yields ``{}``
    silently, while any other failure (unreadable file, malformed YAML,
    PyYAML not importable) yields ``{}`` after a warning on stderr so that a
    broken config is not mistaken for "use the defaults".

    Args:
        config_path: Path of the YAML file to read.
        section: Top-level key to return. An empty string returns the whole
            document.

    Returns:
        The requested mapping, or ``{}`` when the file, the document, or the
        section is missing, empty, or not a mapping.
    """
    try:
        import yaml

        with open(config_path, encoding="utf-8") as f:
            cfg = yaml.safe_load(f)
    except FileNotFoundError:
        return {}
    except Exception as exc:
        print(f"WARNING: could not load config {config_path}: {exc}", file=sys.stderr)
        return {}

    # An empty file parses to None and a top-level list/scalar has no sections.
    if not isinstance(cfg, dict):
        return {}
    if not section:
        return cfg

    # A section written as a bare key ("vad:") parses to None; callers expect
    # a mapping, so normalise every non-mapping value to an empty dict.
    value = cfg.get(section)
    return value if isinstance(value, dict) else {}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def progress_bar(current: int, total: int, width: int = 28) -> str:
    """Render a fixed-width text progress bar.

    Args:
        current: Number of completed units.
        total: Number of units overall. A non-positive value renders an
            empty bar.
        width: Total number of characters in the bar.

    Returns:
        A string of ``width`` characters: filled cells followed by empty cells.
    """
    if total <= 0:
        return "░" * width
    # Clamp so that overshoot (current > total) or a negative count cannot
    # produce a bar that is longer than ``width``.
    filled = min(max(int(width * current / total), 0), width)
    return "▓" * filled + "░" * (width - filled)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def fmt_time(seconds: float) -> str:
    """Format a duration as ``"12.3s"`` or ``"2m 5s"``.

    Durations that display as less than a minute keep one decimal place;
    longer ones are rounded to whole seconds and split into minutes and
    seconds.

    Args:
        seconds: Duration in seconds.

    Returns:
        The formatted duration.
    """
    # Compare the value as it will be displayed, so 59.97 becomes "1m 0s"
    # rather than the odd-looking "60.0s".
    if round(seconds, 1) < 60:
        return f"{seconds:.1f}s"
    # Round before splitting; rounding only the remainder turns 119.6 into
    # "1m 60s" instead of "2m 0s".
    minutes, secs = divmod(round(seconds), 60)
    return f"{minutes}m {secs}s"
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def cuda_available() -> bool:
    """Report whether PyTorch can see a usable CUDA device.

    Any failure while importing or querying torch is treated as "no CUDA".
    """
    try:
        import torch

        has_cuda = torch.cuda.is_available()
    except Exception:
        has_cuda = False
    return has_cuda
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def warn_if_no_cuda() -> None:
    """Print a warning to stdout when processing will fall back to the CPU."""
    if cuda_available():
        return
    print("WARNING: CUDA is not available! Audio processing will run on CPU, which will be much slower.")
|
dinnote/vad.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Uses Silero VAD to detect speech segments in denoised audio and writes a JSON file with segment boundaries.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
import torch
|
|
8
|
+
from .utils import cuda_available
|
|
9
|
+
from .config import VadConfig
|
|
10
|
+
|
|
11
|
+
# Process-wide cache filled on the first _load_model() call with the tuple
# (model, get_speech_timestamps, read_audio, device); None until then.
_vad_model_cache = None
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _load_model():
    """Return the Silero VAD bundle, loading it through torch.hub on first use.

    The result is stored in the module-level ``_vad_model_cache`` so later
    calls reuse it.

    Returns:
        A ``(model, get_speech_timestamps, read_audio, device)`` tuple, where
        ``device`` is ``"cuda"`` or ``"cpu"`` and ``model`` has been moved to
        that device.
    """
    global _vad_model_cache
    if _vad_model_cache is not None:
        return _vad_model_cache

    torch.set_num_threads(1)
    silero, helpers = torch.hub.load(
        repo_or_dir="snakers4/silero-vad",
        model="silero_vad",
        force_reload=False,
        onnx=False,
    )
    # Only the first and third helpers are used; the others are ignored.
    get_speech_timestamps, _unused, read_audio, *_rest = helpers

    device = "cpu"
    if cuda_available():
        device = "cuda"
    silero = silero.to(device)

    _vad_model_cache = (silero, get_speech_timestamps, read_audio, device)
    return _vad_model_cache
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _detect_segments(audio_path: Path, config: dict):
    """Detect speech in *audio_path* with Silero VAD.

    Raw detections are converted to milliseconds, widened by ``padding_ms``,
    clipped so they neither overlap the previous segment nor run past the end
    of the audio, optionally merged when close together, and finally filtered
    by maximum length.

    Returns:
        ``(segments, duration_ms)`` where ``segments`` is a list of
        ``[start_ms, end_ms]`` pairs and ``duration_ms`` is the length of the
        loaded audio.
    """
    sample_rate = 16000  # required by Silero VAD
    pad_ms = config.get("padding_ms", 500) or 0
    max_len_sec = config.get("max_segment_length_sec", 30)
    merge_sec = config.get("merge_within_sec", 1.0)

    model, get_speech_timestamps, read_audio, device = _load_model()

    wav = read_audio(str(audio_path), sampling_rate=sample_rate).to(device)
    duration_ms = len(wav) / sample_rate * 1000

    vad_kwargs = {
        "threshold": config.get("threshold", 0.5),
        "sampling_rate": sample_rate,
        "window_size_samples": 512,
        "speech_pad_ms": 30,
    }
    # Optional knobs are forwarded only when configured, so the VAD helper's
    # own defaults apply otherwise.
    for option in ("min_speech_duration_ms", "min_silence_duration_ms"):
        configured = config.get(option, None)
        if configured is not None:
            vad_kwargs[option] = configured
    detections = get_speech_timestamps(wav, model, **vad_kwargs)

    # Convert sample indices to ms and apply padding.
    per_ms = sample_rate / 1000
    segments = []
    for hit in detections:
        begin = max(0, int(hit["start"] / per_ms) - pad_ms)
        finish = min(duration_ms, int(hit["end"] / per_ms) + pad_ms)
        if segments:
            # Padding must not reach back into the previous segment.
            begin = max(begin, segments[-1][1])
        if begin < finish:
            segments.append([begin, finish])

    # Merge segments whose gap is at most merge_within_sec.
    if merge_sec is not None:
        gap_limit_ms = merge_sec * 1000
        merged = []
        for begin, finish in segments:
            joins_previous = bool(merged) and begin - merged[-1][1] <= gap_limit_ms
            if joins_previous:
                merged[-1][1] = finish
            else:
                merged.append([begin, finish])
        segments = merged

    # Discard segments that exceed the maximum length.
    # NOTE(review): this runs after merging, so several short segments that
    # were merged into one over-long span are dropped together -- confirm
    # that this is intended.
    if max_len_sec is not None:
        limit_ms = max_len_sec * 1000
        segments = [[begin, finish] for begin, finish in segments if finish - begin <= limit_ms]

    return segments, duration_ms
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _build_output(audio_path: Path, segments: list, duration_ms: float, config: dict) -> dict:
    """Assemble the dictionary that is serialised as the VAD JSON file.

    Args:
        audio_path: Analysed audio file; only its file name is recorded.
        segments: ``[start_ms, end_ms]`` pairs.
        duration_ms: Length of the audio in milliseconds.
        config: VAD settings; absent keys are reported with their defaults.

    Returns:
        A dict with a ``"metadata"`` summary and a ``"segments"`` list holding
        one record per segment.
    """
    records = []
    speech_total = 0
    for index, (begin, finish) in enumerate(segments):
        span = finish - begin
        speech_total += span
        records.append(
            {"segment_id": index, "start_ms": begin, "end_ms": finish, "duration_ms": span}
        )

    metadata = {
        "source_audio": audio_path.name,
        "audio_duration_ms": duration_ms,
        "total_speech_ms": speech_total,
        "segment_count": len(segments),
        "vad_threshold": config.get("threshold", 0.5),
        "max_segment_length_sec": config.get("max_segment_length_sec", 30),
        "merge_within_sec": config.get("merge_within_sec", 1.0),
    }
    return {"metadata": metadata, "segments": records}
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def run(input_path: Path, output_dir: Path, config: dict, force: bool = False) -> Path:
    """Run Silero VAD on one audio file and write the result as JSON.

    The output is ``<output_dir>/<output_dir name>_vad.json``. When that file
    already exists and ``force`` is false, it is returned as-is and no
    detection is performed.

    Args:
        input_path: Audio file to analyse.
        output_dir: Directory that receives the JSON file; created if needed.
        config: VAD settings passed on to segment detection.
        force: Recompute even when the output file already exists.

    Returns:
        Path to the VAD JSON file.

    Raises:
        RuntimeError: If no speech segments are detected.
    """
    vad_json = output_dir / f"{output_dir.name}_vad.json"
    reuse_existing = (not force) and vad_json.exists()
    if reuse_existing:
        return vad_json

    output_dir.mkdir(parents=True, exist_ok=True)
    detected, total_ms = _detect_segments(input_path, config)
    if not detected:
        raise RuntimeError(f"No speech segments detected in {input_path.name}")

    payload = _build_output(input_path, detected, total_ms, config)
    with open(vad_json, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, indent=2, ensure_ascii=False)

    return vad_json
|
dinnote/vocab.txt
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# You can add words or phrases here that will be included in Whisper's initial prompt,
|
|
2
|
+
# increasing the likelihood those words will be predicted during transcription.
|
|
3
|
+
# Add one word or phrase per line; lines starting with # are ignored.
|
|
4
|
+
#
|
|
5
|
+
# Examples of what to add:
|
|
6
|
+
# - Technical jargon specific to your subject matter
|
|
7
|
+
# - Proper nouns (names of people, places, organizations, products)
|
|
8
|
+
# - Acronyms and their expansions
|
|
9
|
+
# - Any word Whisper is consistently getting wrong
|
|
10
|
+
#
|
|
11
|
+
# Example entries:
|
|
12
|
+
# box box
|
|
13
|
+
# VSC
|
|
14
|
+
# DRS
|
|
15
|
+
# understeer
|
|
16
|
+
# oversteer
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dinnote
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Audio denoising, VAD, speaker diarization, and transcription pipeline using Demucs, Silero VAD, pyannote, and Whisper
|
|
5
|
+
Requires-Python: >=3.10
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: demucs>=4.0.0
|
|
8
|
+
Requires-Dist: torchcodec
|
|
9
|
+
Requires-Dist: torch
|
|
10
|
+
Requires-Dist: torchaudio
|
|
11
|
+
Requires-Dist: pydub
|
|
12
|
+
Requires-Dist: packaging
|
|
13
|
+
Requires-Dist: pyannote.audio>=3.1.0
|
|
14
|
+
Requires-Dist: openai-whisper
|
|
15
|
+
Requires-Dist: pyyaml
|
|
16
|
+
|
|
17
|
+
## dinnote audio transcription
|
|
18
|
+
Processes audio through a four-step pipeline to produce a transcription JSON with per-speaker diarization: denoising (Demucs), voice activity detection (Silero VAD), speaker diarization (pyannote), and transcription (Whisper).
|
|
19
|
+
|
|
20
|
+
### Installation
|
|
21
|
+
```bash
|
|
22
|
+
pip install dinnote
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
On first run, dinnote copies default config files to your platform config directory:
|
|
26
|
+
- **Windows:** `%APPDATA%\dinnote\`
|
|
27
|
+
- **macOS:** `~/Library/Application Support/dinnote/`
|
|
28
|
+
- **Linux:** `~/.config/dinnote/`
|
|
29
|
+
|
|
30
|
+
Edit `config.yaml` and `vocab.txt` to customize settings.
|
|
31
|
+
|
|
32
|
+
Speaker diarization requires a HuggingFace token with access to [pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1).
|
|
33
|
+
Set it via `diarize.hf_token` in `config.yaml`.
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
### CLI usage
|
|
37
|
+
```bash
|
|
38
|
+
dinnote input/audio.mp3 # single file
|
|
39
|
+
dinnote input/ # all audio files in a folder
|
|
40
|
+
dinnote input/audio.mp3 -f # force re-run all steps
|
|
41
|
+
dinnote input/audio.mp3 -c path/to/config.yaml # custom config
|
|
42
|
+
dinnote input/audio.mp3 -o results/ # custom output dir
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Each step is skipped when its output file already exists. Use `-f` to force all steps to re-run.
|
|
46
|
+
|
|
47
|
+
Output is written to `output/<filename>/` and contains:
|
|
48
|
+
- `<filename>_denoised.wav` (vocals isolated from background noise)
|
|
49
|
+
- `<filename>_vad.json` (detected speech segment boundaries)
|
|
50
|
+
- `<filename>_diarization.json` (per-speaker turn boundaries from pyannote)
|
|
51
|
+
- `<filename>_transcription.json` (final transcription with timestamps and speaker labels)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
### Python API
|
|
55
|
+
```python
|
|
56
|
+
from pathlib import Path
|
|
57
|
+
import dinnote
|
|
58
|
+
from dinnote import PipelineConfig, VadConfig, DiarizeConfig, TranscribeConfig
|
|
59
|
+
|
|
60
|
+
# Run the full pipeline with defaults
|
|
61
|
+
dinnote.process_file(
|
|
62
|
+
input_path=Path("recording.wav"),
|
|
63
|
+
output_dir=Path("output"),
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
# Custom config
|
|
67
|
+
config = PipelineConfig(
|
|
68
|
+
vad=VadConfig(threshold=0.4, max_segment_length_sec=20),
|
|
69
|
+
diarize=DiarizeConfig(num_speakers=2),
|
|
70
|
+
transcribe=TranscribeConfig(model="small", language="en"),
|
|
71
|
+
)
|
|
72
|
+
dinnote.process_file(Path("recording.wav"), Path("output"), config=config)
|
|
73
|
+
|
|
74
|
+
# Or use individual stages
|
|
75
|
+
from dinnote import denoise, vad, diarize, transcribe
|
|
76
|
+
|
|
77
|
+
denoised = denoise.run(Path("recording.wav"), Path("output/recording"), config={})
|
|
78
|
+
vad_file = vad.run(denoised, Path("output/recording"), config={})
|
|
79
|
+
diarization = diarize.run(denoised, Path("output/recording"), config={})
|
|
80
|
+
result = transcribe.run(denoised, Path("output/recording"), config={}, diarization_path=diarization)
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
### Configuration
|
|
85
|
+
|
|
86
|
+
```yaml
|
|
87
|
+
denoise:
|
|
88
|
+
model: htdemucs # htdemucs | htdemucs_ft | mdx | mdx_extra | htdemucs_6s
|
|
89
|
+
|
|
90
|
+
vad:
|
|
91
|
+
threshold: 0.5 # 0.0–1.0, higher = requires clearer speech
|
|
92
|
+
min_speech_duration_ms: 250
|
|
93
|
+
min_silence_duration_ms: 100
|
|
94
|
+
padding_ms: 500
|
|
95
|
+
max_segment_length_sec: 30
|
|
96
|
+
merge_within_sec: 1.0
|
|
97
|
+
|
|
98
|
+
diarize:
|
|
99
|
+
# hf_token: hf_...
|
|
100
|
+
num_speakers: null # fix speaker count or leave null to let pyannote estimate
|
|
101
|
+
min_speakers: null
|
|
102
|
+
max_speakers: null
|
|
103
|
+
min_turn_ms: 200 # turns shorter than this are discarded (ms)
|
|
104
|
+
|
|
105
|
+
transcribe:
|
|
106
|
+
model: base # tiny | base | small | medium | large
|
|
107
|
+
language: en # set to null to auto-detect
|
|
108
|
+
temperature: null # null = Whisper fallback sequence, 0 = greedy
|
|
109
|
+
no_speech_threshold: 0.6
|
|
110
|
+
logprob_threshold: -1.0
|
|
111
|
+
compression_ratio_threshold: 2.4
|
|
112
|
+
condition_on_previous_text: false
|
|
113
|
+
vocab_file: null # path to domain-specific vocabulary, defaults to vocab.txt in config dir
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Add domain-specific vocabulary to `vocab.txt` to improve transcription accuracy on unusual words and jargon. For noisy or technical audio, set `temperature: 0` to disable Whisper's fallback to higher-temperature decoding, and consider filtering out common hallucinations specific to your dataset.
|
|
117
|
+
|
|
118
|
+
If `num_speakers` is known in advance, setting it gives more reliable diarization. Otherwise use `min_speakers`/`max_speakers` to constrain the range, or leave both null to let pyannote estimate freely.
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
dinnote/__init__.py,sha256=vc55iHsOM2HYZhpzNLrMVrgMmXZ58FviGlqZpvpQSg8,344
|
|
2
|
+
dinnote/cli.py,sha256=FMPG9xza0fsJJaC6nAKJ5kkJy_tKI6bG3y6hovL89jc,2531
|
|
3
|
+
dinnote/config.py,sha256=S8c9v61az-HfPt839INBkG2prx4qCiYEbH5Y61ex9-k,2225
|
|
4
|
+
dinnote/config.yaml,sha256=m4yCcbeCaMraP35C94lq9hUvTTPxzF1FsuHUCuqj8J4,2919
|
|
5
|
+
dinnote/denoise.py,sha256=x1ifNQH92gTeV_9hyJ6bfSHLxH6XFrcdpLMaK7OusWM,2192
|
|
6
|
+
dinnote/diarize.py,sha256=xfrtYd3dB-GY3SENSwi3Hg_wVmGfQbc566GSZl-mebM,4725
|
|
7
|
+
dinnote/pipeline.py,sha256=VIr7pqlqHN5uYD3AYLcFhpSQyYpqAfaMKvR7OnIBs5Y,5474
|
|
8
|
+
dinnote/transcribe.py,sha256=bzYs5lEJ6nQP8ekqUlkGKlWUecOGXj81fgwtBjXrHH8,7066
|
|
9
|
+
dinnote/utils.py,sha256=1YwSmT0womyKWXb44eOxQcZhvZMkNBQhX2APwjcpkFo,2066
|
|
10
|
+
dinnote/vad.py,sha256=6kVc2OStv004RuYeZacR6_yykwlwqXEOawjOlnsYUbA,4535
|
|
11
|
+
dinnote/vocab.txt,sha256=SGKEzbXHiX9PaxLworj15o7O8rCca-0gM3K8f5G75N0,575
|
|
12
|
+
dinnote-0.1.0.dist-info/METADATA,sha256=aJnkBEwEoknptUZgnTZ88_c_tbVbOhxQK72BhnxgL4M,4663
|
|
13
|
+
dinnote-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
14
|
+
dinnote-0.1.0.dist-info/entry_points.txt,sha256=XV293h8uFditIf7hAHSOnhyMDpXj0hq2uFhmwTkxlb4,45
|
|
15
|
+
dinnote-0.1.0.dist-info/top_level.txt,sha256=r_aQmbXJhOy4NsX6fxjdCyyD_TJvbjUasmbek0lJ7Ew,8
|
|
16
|
+
dinnote-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
dinnote
|