dinnote 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dinnote-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,118 @@
1
+ Metadata-Version: 2.4
2
+ Name: dinnote
3
+ Version: 0.1.0
4
+ Summary: Audio denoising, VAD, speaker diarization, and transcription pipeline using Demucs, Silero VAD, pyannote, and Whisper
5
+ Requires-Python: >=3.10
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: demucs>=4.0.0
8
+ Requires-Dist: torchcodec
9
+ Requires-Dist: torch
10
+ Requires-Dist: torchaudio
11
+ Requires-Dist: pydub
12
+ Requires-Dist: packaging
13
+ Requires-Dist: pyannote.audio>=3.1.0
14
+ Requires-Dist: openai-whisper
15
+ Requires-Dist: pyyaml
16
+
17
+ ## dinnote audio transcription
18
+ Processes audio through a four-step pipeline to produce a transcription JSON with per-speaker diarization: denoising (Demucs), voice activity detection (Silero VAD), speaker diarization (pyannote), and transcription (Whisper).
19
+
20
+ ### Installation
21
+ ```bash
22
+ pip install dinnote
23
+ ```
24
+
25
+ On first run, dinnote copies default config files to your platform config directory:
26
+ - **Windows:** `%APPDATA%\dinnote\`
27
+ - **macOS:** `~/Library/Application Support/dinnote/`
28
+ - **Linux:** `~/.config/dinnote/`
29
+
30
+ Edit `config.yaml` and `vocab.txt` to customize settings.
31
+
32
+ Speaker diarization requires a HuggingFace token with access to [pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1).
33
+ Set it via `diarize.hf_token` in `config.yaml`.
34
+
35
+
36
+ ### CLI usage
37
+ ```bash
38
+ dinnote input/audio.mp3 # single file
39
+ dinnote input/ # all audio files in a folder
40
+ dinnote input/audio.mp3 -f # force re-run all steps
41
+ dinnote input/audio.mp3 -c path/to/config.yaml # custom config
42
+ dinnote input/audio.mp3 -o results/ # custom output dir
43
+ ```
44
+
45
+ Each step checks whether its output already exists and skips it if so. Use `-f` to force all steps to re-run.
46
+
47
+ Output is written to `output/<filename>/` and contains:
48
+ - `<filename>_denoised.wav` (vocals isolated from background noise)
49
+ - `<filename>_vad.json` (detected speech segment boundaries)
50
+ - `<filename>_diarization.json` (per-speaker turn boundaries from pyannote)
51
+ - `<filename>_transcription.json` (final transcription with timestamps and speaker labels)
52
+
53
+
54
+ ### Python API
55
+ ```python
56
+ from pathlib import Path
57
+ import dinnote
58
+ from dinnote import PipelineConfig, VadConfig, DiarizeConfig, TranscribeConfig
59
+
60
+ # Run the full pipeline with defaults
61
+ dinnote.process_file(
62
+ input_path=Path("recording.wav"),
63
+ output_dir=Path("output"),
64
+ )
65
+
66
+ # Custom config
67
+ config = PipelineConfig(
68
+ vad=VadConfig(threshold=0.4, max_segment_length_sec=20),
69
+ diarize=DiarizeConfig(num_speakers=2),
70
+ transcribe=TranscribeConfig(model="small", language="en"),
71
+ )
72
+ dinnote.process_file(Path("recording.wav"), Path("output"), config=config)
73
+
74
+ # Or use individual stages
75
+ from dinnote import denoise, vad, diarize, transcribe
76
+
77
+ denoised = denoise.run(Path("recording.wav"), Path("output/recording"), config={})
78
+ vad_file = vad.run(denoised, Path("output/recording"), config={})
79
+ diarization = diarize.run(denoised, Path("output/recording"), config={})
80
+ result = transcribe.run(denoised, Path("output/recording"), config={}, diarization_path=diarization)
81
+ ```
82
+
83
+
84
+ ### Configuration
85
+
86
+ ```yaml
87
+ denoise:
88
+ model: htdemucs # htdemucs | htdemucs_ft | mdx | mdx_extra | htdemucs_6s
89
+
90
+ vad:
91
+ threshold: 0.5 # 0.0–1.0, higher = requires clearer speech
92
+ min_speech_duration_ms: 250
93
+ min_silence_duration_ms: 100
94
+ padding_ms: 500
95
+ max_segment_length_sec: 30
96
+ merge_within_sec: 1.0
97
+
98
+ diarize:
99
+ # hf_token: hf_...
100
+ num_speakers: null # fix speaker count or leave null to let pyannote estimate
101
+ min_speakers: null
102
+ max_speakers: null
103
+ min_turn_ms: 200 # turns shorter than this are discarded (ms)
104
+
105
+ transcribe:
106
+ model: base # tiny | base | small | medium | large
107
+ language: en # set to null to auto-detect
108
+ temperature: null # null = Whisper fallback sequence, 0 = greedy
109
+ no_speech_threshold: 0.6
110
+ logprob_threshold: -1.0
111
+ compression_ratio_threshold: 2.4
112
+ condition_on_previous_text: false
113
+ vocab_file: null # path to domain-specific vocabulary, defaults to vocab.txt in config dir
114
+ ```
115
+
116
+ Add domain-specific vocabulary to `vocab.txt` to improve transcription accuracy on unusual words and jargon. For noisy or technical audio, set `temperature: 0` to disable Whisper's fallback to higher-temperature decoding, and consider filtering out common hallucinations specific to your dataset.
117
+
118
+ If `num_speakers` is known in advance, setting it gives more reliable diarization. Otherwise use `min_speakers`/`max_speakers` to constrain the range, or leave both null to let pyannote estimate freely.
@@ -0,0 +1,102 @@
1
+ ## dinnote audio transcription
2
+ Processes audio through a four-step pipeline to produce a transcription JSON with per-speaker diarization: denoising (Demucs), voice activity detection (Silero VAD), speaker diarization (pyannote), and transcription (Whisper).
3
+
4
+ ### Installation
5
+ ```bash
6
+ pip install dinnote
7
+ ```
8
+
9
+ On first run, dinnote copies default config files to your platform config directory:
10
+ - **Windows:** `%APPDATA%\dinnote\`
11
+ - **macOS:** `~/Library/Application Support/dinnote/`
12
+ - **Linux:** `~/.config/dinnote/`
13
+
14
+ Edit `config.yaml` and `vocab.txt` to customize settings.
15
+
16
+ Speaker diarization requires a HuggingFace token with access to [pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1).
17
+ Set it via `diarize.hf_token` in `config.yaml`.
18
+
19
+
20
+ ### CLI usage
21
+ ```bash
22
+ dinnote input/audio.mp3 # single file
23
+ dinnote input/ # all audio files in a folder
24
+ dinnote input/audio.mp3 -f # force re-run all steps
25
+ dinnote input/audio.mp3 -c path/to/config.yaml # custom config
26
+ dinnote input/audio.mp3 -o results/ # custom output dir
27
+ ```
28
+
29
+ Each step checks whether its output already exists and skips it if so. Use `-f` to force all steps to re-run.
30
+
31
+ Output is written to `output/<filename>/` and contains:
32
+ - `<filename>_denoised.wav` (vocals isolated from background noise)
33
+ - `<filename>_vad.json` (detected speech segment boundaries)
34
+ - `<filename>_diarization.json` (per-speaker turn boundaries from pyannote)
35
+ - `<filename>_transcription.json` (final transcription with timestamps and speaker labels)
36
+
37
+
38
+ ### Python API
39
+ ```python
40
+ from pathlib import Path
41
+ import dinnote
42
+ from dinnote import PipelineConfig, VadConfig, DiarizeConfig, TranscribeConfig
43
+
44
+ # Run the full pipeline with defaults
45
+ dinnote.process_file(
46
+ input_path=Path("recording.wav"),
47
+ output_dir=Path("output"),
48
+ )
49
+
50
+ # Custom config
51
+ config = PipelineConfig(
52
+ vad=VadConfig(threshold=0.4, max_segment_length_sec=20),
53
+ diarize=DiarizeConfig(num_speakers=2),
54
+ transcribe=TranscribeConfig(model="small", language="en"),
55
+ )
56
+ dinnote.process_file(Path("recording.wav"), Path("output"), config=config)
57
+
58
+ # Or use individual stages
59
+ from dinnote import denoise, vad, diarize, transcribe
60
+
61
+ denoised = denoise.run(Path("recording.wav"), Path("output/recording"), config={})
62
+ vad_file = vad.run(denoised, Path("output/recording"), config={})
63
+ diarization = diarize.run(denoised, Path("output/recording"), config={})
64
+ result = transcribe.run(denoised, Path("output/recording"), config={}, diarization_path=diarization)
65
+ ```
66
+
67
+
68
+ ### Configuration
69
+
70
+ ```yaml
71
+ denoise:
72
+ model: htdemucs # htdemucs | htdemucs_ft | mdx | mdx_extra | htdemucs_6s
73
+
74
+ vad:
75
+ threshold: 0.5 # 0.0–1.0, higher = requires clearer speech
76
+ min_speech_duration_ms: 250
77
+ min_silence_duration_ms: 100
78
+ padding_ms: 500
79
+ max_segment_length_sec: 30
80
+ merge_within_sec: 1.0
81
+
82
+ diarize:
83
+ # hf_token: hf_...
84
+ num_speakers: null # fix speaker count or leave null to let pyannote estimate
85
+ min_speakers: null
86
+ max_speakers: null
87
+ min_turn_ms: 200 # turns shorter than this are discarded (ms)
88
+
89
+ transcribe:
90
+ model: base # tiny | base | small | medium | large
91
+ language: en # set to null to auto-detect
92
+ temperature: null # null = Whisper fallback sequence, 0 = greedy
93
+ no_speech_threshold: 0.6
94
+ logprob_threshold: -1.0
95
+ compression_ratio_threshold: 2.4
96
+ condition_on_previous_text: false
97
+ vocab_file: null # path to domain-specific vocabulary, defaults to vocab.txt in config dir
98
+ ```
99
+
100
+ Add domain-specific vocabulary to `vocab.txt` to improve transcription accuracy on unusual words and jargon. For noisy or technical audio, set `temperature: 0` to disable Whisper's fallback to higher-temperature decoding, and consider filtering out common hallucinations specific to your dataset.
101
+
102
+ If `num_speakers` is known in advance, setting it gives more reliable diarization. Otherwise use `min_speakers`/`max_speakers` to constrain the range, or leave both null to let pyannote estimate freely.
@@ -0,0 +1,30 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "dinnote"
7
+ version = "0.1.0"
8
+ description = "Audio denoising, VAD, speaker diarization, and transcription pipeline using Demucs, Silero VAD, pyannote, and Whisper"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ dependencies = [
12
+ "demucs>=4.0.0",
13
+ "torchcodec",
14
+ "torch",
15
+ "torchaudio",
16
+ "pydub",
17
+ "packaging",
18
+ "pyannote.audio>=3.1.0",
19
+ "openai-whisper",
20
+ "pyyaml",
21
+ ]
22
+
23
+ [project.scripts]
24
+ dinnote = "dinnote.cli:main"
25
+
26
+ [tool.setuptools.packages.find]
27
+ where = ["src"]
28
+
29
+ [tool.setuptools.package-data]
30
+ dinnote = ["config.yaml", "vocab.txt"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,8 @@
1
+ from . import denoise, vad, diarize, transcribe
2
+ from .pipeline import process_file
3
+ from .config import DenoiseConfig, VadConfig, DiarizeConfig, TranscribeConfig, PipelineConfig
4
+
5
+ __all__ = [
6
+ "denoise", "vad", "diarize", "transcribe", "process_file",
7
+ "DenoiseConfig", "VadConfig", "DiarizeConfig", "TranscribeConfig", "PipelineConfig",
8
+ ]
@@ -0,0 +1,80 @@
1
+ import argparse
2
+ import sys
3
+ import time
4
+ from pathlib import Path
5
+ from .utils import AUDIO_EXTENSIONS, load_config, warn_if_no_cuda, setup_user_config, fmt_time
6
+ from .pipeline import process_file
7
+ from .config import PipelineConfig
8
+
9
+
10
def main():
    """Command-line entry point: run the dinnote pipeline on a file or a folder.

    Parses the CLI arguments, loads the YAML configuration (the copy in the
    user config directory unless ``-c`` is given), collects the audio files
    to process and hands each of them to ``process_file``.

    Exits with status 1 when the input path does not exist, a single input
    file has an unrecognized extension, a folder contains no audio files, or
    ``process_file`` returned a falsy value for at least one file.
    """
    parser = argparse.ArgumentParser(
        description="Denoise, VAD, diarize, and transcribe audio files"
    )
    parser.add_argument("input")
    parser.add_argument("-o", "--output", default="output")
    parser.add_argument("-c", "--config", default=None,
                        help="Path to config.yaml (default: user config dir)")
    parser.add_argument("-f", "--force", action="store_true")
    args = parser.parse_args()

    config_dir = setup_user_config()
    config_path = args.config or str(config_dir / "config.yaml")
    # NOTE(review): load_config is defined elsewhere; an empty YAML file
    # presumably loads as None, so normalise to a dict before using it.
    raw = load_config(config_path) or {}

    # A `transcribe:` key whose options are all commented out parses as null.
    # `setdefault` would then return None, so replace it with a real dict.
    transcribe_raw = raw.get("transcribe") or {}
    raw["transcribe"] = transcribe_raw

    vocab_file = transcribe_raw.get("vocab_file")
    if not vocab_file:
        # No vocabulary configured: fall back to the one in the config dir.
        default_vocab = config_dir / "vocab.txt"
        if default_vocab.exists():
            transcribe_raw["vocab_file"] = str(default_vocab)
    else:
        vocab_path = Path(str(vocab_file))
        if not vocab_path.is_absolute() and not vocab_path.exists():
            # A relative name (such as the `vocab.txt` in the default config)
            # that is not found from the current directory is looked up next
            # to the config file, then in the user config directory.
            for base_dir in (Path(config_path).parent, config_dir):
                candidate = base_dir / vocab_path
                if candidate.exists():
                    transcribe_raw["vocab_file"] = str(candidate)
                    break

    config = PipelineConfig.from_dict(raw)
    input_path = Path(args.input)
    output_dir = Path(args.output)

    if input_path.is_file():
        if input_path.suffix.lower() not in AUDIO_EXTENSIONS:
            print(f"Unrecognized audio extension: {input_path.suffix}")
            sys.exit(1)
        files = [input_path]
    elif input_path.is_dir():
        # Only direct children are considered; sub-folders are not walked.
        files = sorted(
            f for f in input_path.iterdir()
            if f.is_file() and f.suffix.lower() in AUDIO_EXTENSIONS
        )
        if not files:
            print(f"No audio files found in {input_path}/")
            sys.exit(1)
    else:
        print(f"Not found: {input_path}")
        sys.exit(1)

    print(f"Files to process: {len(files)}")
    print(f"Output directory: {output_dir}/")
    warn_if_no_cuda()

    success = 0
    batch_start = time.monotonic()

    for i, audio_file in enumerate(files, 1):
        if len(files) > 1:
            print(f"\n{'═' * 60}")
            print(f"  File {i}/{len(files)}: {audio_file.name}")

        if process_file(audio_file, output_dir, config, force=args.force):
            success += 1

    failed = len(files) - success

    # The batch summary is only printed when more than one file was processed.
    if len(files) > 1:
        print(f"\n{'─' * 60}")
        print(f"  Batch complete: {success} succeeded, {failed} failed")
        print(f"  Total time: {fmt_time(time.monotonic() - batch_start)}")
        print(f"\n{'─' * 60}")

    if failed:
        sys.exit(1)
77
+
78
+
79
+ if __name__ == "__main__":
80
+ main()
@@ -0,0 +1,64 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Optional
3
+
4
+
5
@dataclass
class DenoiseConfig:
    """Settings for the denoising (vocal isolation) stage."""

    # Name of the Demucs model to use.
    model: str = "htdemucs"
8
+
9
+
10
@dataclass
class VadConfig:
    """Settings for the voice-activity-detection stage."""

    # Detection sensitivity; higher values require clearer speech to trigger.
    threshold: float = 0.5
    # Minimum duration of a detected speech region to keep (milliseconds).
    min_speech_duration_ms: Optional[int] = 250
    # Minimum silence gap required before a new segment begins (milliseconds).
    min_silence_duration_ms: Optional[int] = 100
    # Padding added before and after each detected segment (milliseconds).
    # Optional because the shipped config.yaml documents null as a valid
    # value meaning "disable padding".
    padding_ms: Optional[int] = 500
    # Segments longer than this are discarded as likely noise or music.
    # None keeps all.
    max_segment_length_sec: Optional[float] = 30.0
    # Segments with less than this gap between them are merged.
    # None disables merging.
    merge_within_sec: Optional[float] = 1.0
20
+
21
+
22
@dataclass
class DiarizeConfig:
    """Settings for the speaker-diarization stage."""

    # HuggingFace access token used for the pyannote diarization model.
    hf_token: Optional[str] = None
    # Fixed number of speakers; None leaves the count unspecified.
    num_speakers: Optional[int] = None
    # Optional lower and upper bounds on the number of speakers.
    min_speakers: Optional[int] = None
    max_speakers: Optional[int] = None
    # Turns less than this length (milliseconds) are discarded.
    min_turn_ms: int = 200
30
+
31
+
32
@dataclass
class TranscribeConfig:
    """Settings for the transcription stage."""

    # Whisper model size.
    model: str = "base"
    # Language code for transcription. Optional because the shipped
    # config.yaml and README document null as "auto-detect".
    language: Optional[str] = "en"
    # 0 for greedy decoding, None to use Whisper's fallback sequence.
    temperature: Optional[float] = None
    # Quality thresholds; the shipped config.yaml documents null as
    # "use Whisper's default" for each of them.
    no_speech_threshold: Optional[float] = 0.6
    logprob_threshold: Optional[float] = -1.0
    compression_ratio_threshold: Optional[float] = 2.4
    # Use previous segment output as context. Improves coherence but can
    # propagate errors.
    condition_on_previous_text: bool = False
    # Path to a file with domain-specific vocabulary, if any.
    vocab_file: Optional[str] = None
44
+
45
+
46
@dataclass
class PipelineConfig:
    """Top-level configuration grouping the settings of every pipeline stage."""

    denoise: DenoiseConfig = field(default_factory=DenoiseConfig)
    vad: VadConfig = field(default_factory=VadConfig)
    diarize: DiarizeConfig = field(default_factory=DiarizeConfig)
    transcribe: TranscribeConfig = field(default_factory=TranscribeConfig)

    @staticmethod
    def from_dict(d: dict) -> "PipelineConfig":
        """Build a PipelineConfig from a nested mapping such as parsed YAML.

        Each of the ``denoise``, ``vad``, ``diarize`` and ``transcribe`` keys
        may hold a mapping of option names to values. Missing or null
        sections fall back to that stage's defaults, and keys that are not
        fields of the stage's dataclass are ignored.

        Args:
            d: Mapping of section name to section options; None is treated
                as an empty mapping.

        Returns:
            A PipelineConfig with one config object per stage.
        """
        def _build(cls, section):
            # `section` is None when the key is absent, or present with no
            # children (e.g. every option under it is commented out in YAML).
            options = section or {}
            known = cls.__dataclass_fields__
            return cls(**{k: v for k, v in options.items() if k in known})

        sections = d or {}
        return PipelineConfig(
            denoise=_build(DenoiseConfig, sections.get("denoise")),
            vad=_build(VadConfig, sections.get("vad")),
            diarize=_build(DiarizeConfig, sections.get("diarize")),
            transcribe=_build(TranscribeConfig, sections.get("transcribe")),
        )
@@ -0,0 +1,81 @@
1
+ denoise:
2
+ # Demucs model used for vocal isolation.
3
+ # Options: htdemucs, htdemucs_ft, mdx, mdx_extra, htdemucs_6s
4
+ # See: https://github.com/facebookresearch/demucs
5
+ model: htdemucs
6
+
7
+ vad:
8
+ # Detection sensitivity (0.0–1.0). Higher values require clearer speech to trigger.
9
+ # See: https://github.com/snakers4/silero-vad
10
+ threshold: 0.5
11
+
12
+ # Minimum duration of a detected speech region to keep (milliseconds).
13
+ # Set to null to use Silero's default (250ms).
14
+ min_speech_duration_ms: 250
15
+
16
+ # Minimum silence gap required before a new segment begins (milliseconds).
17
+ # Set to null to use Silero's default (100ms).
18
+ min_silence_duration_ms: 100
19
+
20
+ # Padding added before and after each detected segment (milliseconds).
21
+ # Set to null to disable padding.
22
+ padding_ms: 500
23
+
24
+ # Segments longer than this are discarded as likely noise or music (seconds).
25
+ # Set to null to keep all segments regardless of length.
26
+ max_segment_length_sec: 30
27
+
28
+ # Segments with less than this gap between them are merged into one (seconds).
29
+ # Set to null to disable merging of neighboring segments.
30
+ merge_within_sec: 1.0
31
+
32
+ diarize:
33
+ # HuggingFace token with access to pyannote/speaker-diarization-3.1.
34
+ # hf_token: hf_...
35
+
36
+ # Fix the number of speakers if known. Set to null to let pyannote estimate.
37
+ num_speakers: null
38
+
39
+ # Alternatively constrain the range and let pyannote pick within it.
40
+ min_speakers: null
41
+ max_speakers: null
42
+
43
+ # Turns shorter than this are discarded (milliseconds).
44
+ min_turn_ms: 200
45
+
46
+ transcribe:
47
+ # Whisper model size. Larger models are slower but more accurate.
48
+ # Options: tiny, base, small, medium, large
49
+ # See: https://github.com/openai/whisper
50
+ model: base
51
+
52
+ # Language code for transcription.
53
+ # Examples: en, fr, de, es, ja. Set to null to auto-detect.
54
+ language: en
55
+
56
+ # Decoding temperature.
57
+ # Set to 0 for single-pass greedy decoding with no fallback.
58
+ # Set to null to use Whisper's default fallback sequence (0.0, 0.2, …, 1.0),
59
+ # retrying at higher temperatures when output quality metrics look poor.
60
+ temperature: null
61
+
62
+ # Segments where Whisper's no-speech probability exceeds this are discarded.
63
+ # Set to null to use Whisper's default (0.6).
64
+ no_speech_threshold: 0.6
65
+
66
+ # Segments with average log-probability below this are discarded as likely low-quality audio.
67
+ # Set to null to use Whisper's default (-1.0).
68
+ logprob_threshold: -1.0
69
+
70
+ # Segments with compression ratio above this are discarded as likely hallucinations.
71
+ # Set to null to use Whisper's default (2.4).
72
+ compression_ratio_threshold: 2.4
73
+
74
+ # Whether Whisper should use the previous segment's output as context for the next.
75
+ # Improves coherence across segments but can propagate errors.
76
+ # Default: false.
77
+ condition_on_previous_text: false
78
+
79
+ # Path to a file containing domain-specific vocabulary to guide transcription.
80
+ # Set to null to leave it unset; the CLI then falls back to vocab.txt in the user config directory if that file exists.
81
+ vocab_file: vocab.txt
@@ -0,0 +1,59 @@
1
+ """
2
+ Uses Demucs to isolate vocals in audio.
3
+ """
4
+
5
+ import os
6
+ import shutil
7
+ import subprocess
8
+ import sys
9
+ import tempfile
10
+ from pathlib import Path
11
+ from .utils import cuda_available
12
+
13
+
14
def _ffmpeg_env():
    """Return env with FFmpeg bin dir on PATH so subprocess can load torchcodec DLLs.

    The result is always a copy of ``os.environ``. On platforms other than
    Windows it is returned unchanged. On Windows, the WinGet package folder
    under ``%LOCALAPPDATA%`` is searched for a ``Gyan.FFmpeg.Shared*``
    install, and the first ``bin`` directory containing an ``avcodec-*.dll``
    is prepended to ``PATH``.
    """
    import glob as g

    env = dict(os.environ)
    if sys.platform != "win32":
        return env

    local_appdata = env.get("LOCALAPPDATA", "")
    if not local_appdata:
        # Without LOCALAPPDATA the search pattern would be relative and be
        # matched against the current working directory instead.
        return env

    pattern = os.path.join(
        local_appdata, "Microsoft", "WinGet", "Packages",
        "Gyan.FFmpeg.Shared*", "**", "bin",
    )
    for bin_dir in g.glob(pattern, recursive=True):
        if g.glob(os.path.join(bin_dir, "avcodec-*.dll")):
            env["PATH"] = bin_dir + os.pathsep + env.get("PATH", "")
            break
    return env
27
+
28
+
29
def _run_demucs(input_file: Path, output_file: Path, model: str = "htdemucs", device: str = "cpu"):
    """Separate vocals with Demucs and copy the vocals stem to ``output_file``.

    Demucs is launched as ``python -m demucs`` in a subprocess and writes
    into a temporary directory that is removed when the function returns.

    Raises:
        RuntimeError: If the Demucs process exits with a non-zero code.
        FileNotFoundError: If the expected ``vocals.wav`` was not produced.
    """
    with tempfile.TemporaryDirectory() as scratch:
        scratch_dir = Path(scratch)

        command = [
            sys.executable, "-m", "demucs",
            "-n", model,
            "--two-stems=vocals",
            "--device", device,
            "-o", str(scratch_dir),
            str(input_file),
        ]
        completed = subprocess.run(command, env=_ffmpeg_env())
        exit_code = completed.returncode
        if exit_code != 0:
            raise RuntimeError(f"demucs exited with code {exit_code}")

        # The stem is looked up at <out>/<model>/<input stem>/vocals.wav.
        vocals_stem = scratch_dir.joinpath(model, input_file.stem, "vocals.wav")
        if not vocals_stem.exists():
            raise FileNotFoundError(f"Demucs output not found: {vocals_stem}")

        # Copy before the temporary directory is cleaned up.
        shutil.copy2(vocals_stem, output_file)
47
+
48
+
49
def run(input_path: Path, output_dir: Path, config: dict, force: bool = False) -> Path:
    """Isolate the vocals of one audio file with Demucs.

    The result is written to ``<output_dir>/<output_dir name>_denoised.wav``.
    An existing result is reused unless ``force`` is set.

    Args:
        input_path: Audio file to denoise.
        output_dir: Directory for the result; created if needed.
        config: Stage options; only the ``model`` key is read.
        force: Re-run even when the output file already exists.

    Returns:
        Path to the denoised .wav file.
    """
    denoised_path = output_dir.joinpath(output_dir.name + "_denoised.wav")

    if force or not denoised_path.exists():
        demucs_model = config.get("model", "htdemucs")
        target_device = "cpu" if not cuda_available() else "cuda"
        output_dir.mkdir(parents=True, exist_ok=True)
        _run_demucs(input_path, denoised_path, model=demucs_model, device=target_device)

    return denoised_path