dinscribe 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,12 @@
1
+ Metadata-Version: 2.4
2
+ Name: dinscribe
3
+ Version: 0.1.0
4
+ Summary: Audio denoising and transcription pipeline using Demucs, Silero VAD, and Whisper
5
+ Requires-Python: >=3.10
6
+ Requires-Dist: demucs>=4.0.0
7
+ Requires-Dist: torchcodec
8
+ Requires-Dist: torch
9
+ Requires-Dist: pydub
10
+ Requires-Dist: packaging
11
+ Requires-Dist: openai-whisper
12
+ Requires-Dist: pyyaml
@@ -0,0 +1,44 @@
1
+ ## dinscribe audio transcription
2
+ Processes audio through a three-step pipeline to produce a transcription JSON: denoise, voice activity detection, transcribe.
3
+
4
+ ### Setup
5
+ ```bash
6
+ python -m venv venv
7
+ source venv/Scripts/activate  # Windows (Git Bash); on Linux/macOS use: source venv/bin/activate
8
+ pip install -r requirements.txt
9
+ ```
10
+
11
+
12
+ ### Run the full pipeline
13
+ ```bash
14
+ python main.py input/audio.mp3 # Run for a single file
15
+ python main.py input/ # Run for all audio files in a folder
16
+ python main.py input/audio.mp3 -f # Force re-run all steps
17
+ ```
18
+ Each step checks whether its output already exists and skips it if so. Use `-f` to force all steps to re-run regardless, `-o <output_dir>` to specify a different output directory, and `-c <config.yaml>` to specify a different config file.
19
+
20
+ Output is written to `output/<filename>/` and contains:
21
+ - `<filename>_denoised.wav` (vocals isolated from background noise)
22
+ - `<filename>_vad.json` (detected speech segment boundaries)
23
+ - `<filename>_transcription.json` (final transcription with timestamps)
24
+
25
+
26
+ ### Configuration
27
+ Edit `config.yaml` to adjust settings for each step. Some important options are:
28
+ - `denoise.model` - Demucs model for vocal isolation (default: `htdemucs`)
29
+ - `vad.threshold` - VAD speech detection sensitivity (default: `0.5`)
30
+ - `transcribe.model` - Whisper model size `tiny` through `large` (default: `base`)
31
+ - `transcribe.language` - Transcription language code (default: `en`)
32
+
33
+
34
+ #### Other tips for best results
35
+ Add domain-specific vocabulary to `vocab.txt` to improve transcription accuracy on unusual words and jargon. For noisy or technical audio, set `temperature: 0` to disable falling back to higher-temperature decoding, and consider filtering out any common hallucinations specific to your dataset.
36
+
37
+
38
+ ### Run individual steps
39
+ Each step can also be run alone:
40
+ ```bash
41
+ python denoise.py audio.mp3
42
+ python vad.py audio_denoised.wav
43
+ python transcribe.py audio_denoised.wav audio_vad.json
44
+ ```
@@ -0,0 +1,27 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "dinscribe"
7
+ version = "0.1.0"
8
+ description = "Audio denoising and transcription pipeline using Demucs, Silero VAD, and Whisper"
9
+ requires-python = ">=3.10"
10
+ dependencies = [
11
+ "demucs>=4.0.0",
12
+ "torchcodec",
13
+ "torch",
14
+ "pydub",
15
+ "packaging",
16
+ "openai-whisper",
17
+ "pyyaml",
18
+ ]
19
+
20
+ [project.scripts]
21
+ dinscribe = "dinscribe.cli:main"
22
+
23
+ [tool.setuptools.packages.find]
24
+ where = ["src"]
25
+
26
+ [tool.setuptools.package-data]
27
+ dinscribe = ["config.yaml", "vocab.txt"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,4 @@
1
+ from . import denoise, vad, transcribe
2
+ from .pipeline import process_file
3
+
4
+ __all__ = ["denoise", "vad", "transcribe", "process_file"]
@@ -0,0 +1,82 @@
1
+ """
2
+ CLI entry point for dinscribe.
3
+ """
4
+
5
+ import argparse
6
+ import sys
7
+ import time
8
+ from pathlib import Path
9
+ from .utils import AUDIO_EXTENSIONS, load_config, fmt_time, warn_if_no_cuda, setup_user_config, get_config_dir
10
+ from .pipeline import process_file
11
+
12
+
13
def main():
    """Parse command-line arguments and run the pipeline on a file or folder.

    Exits with status 1 when the input path does not exist, has an
    unrecognised audio extension, contains no audio files, or when at least
    one file fails to process.
    """
    parser = argparse.ArgumentParser(
        description="Process audio files into transcriptions"
    )
    parser.add_argument("input")
    parser.add_argument("-o", "--output", default="output")
    parser.add_argument("-c", "--config", default=None,
                        help="Path to config.yaml (default: user config dir)")
    parser.add_argument("-f", "--force", action="store_true")
    args = parser.parse_args()

    config_dir = setup_user_config()
    config_path = Path(args.config) if args.config else config_dir / "config.yaml"
    if args.config and not config_path.is_file():
        # load_config() returns {} for a missing file without saying so; an
        # explicitly requested config that is absent deserves a visible notice.
        print(f"Warning: config file not found: {config_path} (using defaults)")
    config = load_config(str(config_path))

    # A bare "transcribe:" key in YAML loads as None, so normalise to a dict.
    transcribe_cfg = config.get("transcribe") or {}
    config["transcribe"] = transcribe_cfg

    vocab_file = transcribe_cfg.get("vocab_file")
    if not vocab_file:
        # Point vocab_file default at the user config dir if not set in config
        default_vocab = config_dir / "vocab.txt"
        if default_vocab.exists():
            transcribe_cfg["vocab_file"] = str(default_vocab)
    elif not Path(str(vocab_file)).exists():
        # The default config ships a relative "vocab.txt"; when it is not found
        # relative to the working directory, look next to the config file.
        beside_config = config_path.parent / str(vocab_file)
        if beside_config.exists():
            transcribe_cfg["vocab_file"] = str(beside_config)

    input_path = Path(args.input)
    output_dir = Path(args.output)

    if input_path.is_file():
        if input_path.suffix.lower() not in AUDIO_EXTENSIONS:
            print(f"Unrecognised audio extension: {input_path.suffix}")
            sys.exit(1)
        files = [input_path]
    elif input_path.is_dir():
        files = sorted(
            f for f in input_path.iterdir()
            if f.is_file() and f.suffix.lower() in AUDIO_EXTENSIONS
        )
        if not files:
            print(f"No audio files found in {input_path}/")
            sys.exit(1)
    else:
        print(f"Not found: {input_path}")
        sys.exit(1)

    print(f"Files to process: {len(files)}")
    print(f"Output directory: {output_dir}/")
    warn_if_no_cuda()

    success = 0
    batch_start = time.monotonic()

    for i, audio_file in enumerate(files, 1):
        if len(files) > 1:
            print(f"\n{'═' * 60}")
            print(f" File {i}/{len(files)}: {audio_file.name}")

        if process_file(audio_file, output_dir, config, force=args.force):
            success += 1

    failed = len(files) - success

    if len(files) > 1:
        print(f"\n{'─' * 60}")
        print(f" Batch complete: {success} succeeded, {failed} failed")
        print(f" Total time: {fmt_time(time.monotonic() - batch_start)}")
        print(f"\n{'─' * 60}")

    if failed:
        sys.exit(1)
79
+
80
+
81
+ if __name__ == "__main__":
82
+ main()
@@ -0,0 +1,67 @@
1
+ denoise:
2
+ # Demucs model used for vocal isolation.
3
+ # Options: htdemucs, htdemucs_ft, mdx, mdx_extra, htdemucs_6s
4
+ # See: https://github.com/facebookresearch/demucs
5
+ model: htdemucs
6
+
7
+ vad:
8
+ # Detection sensitivity (0.0–1.0). Higher values require clearer speech to trigger.
9
+ # See: https://github.com/snakers4/silero-vad
10
+ threshold: 0.5
11
+
12
+ # Minimum duration of a detected speech region to keep (milliseconds).
13
+ # Set to null to use Silero's default (250ms).
14
+ min_speech_duration_ms: 250
15
+
16
+ # Minimum silence gap required before a new segment begins (milliseconds).
17
+ # Set to null to use Silero's default (100ms).
18
+ min_silence_duration_ms: 100
19
+
20
+ # Padding added before and after each detected segment (milliseconds).
21
+ # Set to null to disable padding.
22
+ padding_ms: 500
23
+
24
+ # Segments longer than this are discarded as likely noise or music (seconds).
25
+ # Set to null to keep all segments regardless of length.
26
+ max_segment_length_sec: 30
27
+
28
+ # Segments with less than this gap between them are merged into one (seconds).
29
+ # Set to null to disable merging of neighboring segments.
30
+ merge_within_sec: 1.0
31
+
32
+ transcribe:
33
+ # Whisper model size. Larger models are slower but more accurate.
34
+ # Options: tiny, base, small, medium, large
35
+ # See: https://github.com/openai/whisper
36
+ model: base
37
+
38
+ # Language code for transcription.
39
+ # Examples: en, fr, de, es, ja. Set to null to auto-detect.
40
+ language: en
41
+
42
+ # Decoding temperature.
43
+ # Set to 0 for single-pass greedy decoding with no fallback.
44
+ # Set to null to use Whisper's default fallback sequence (0.0, 0.2, …, 1.0),
45
+ # retrying at higher temperatures when output quality metrics look poor.
46
+ temperature: null
47
+
48
+ # Segments where Whisper's no-speech probability exceeds this are discarded.
49
+ # Set to null to use Whisper's default (0.6).
50
+ no_speech_threshold: 0.6
51
+
52
+ # Segments with average log-probability below this are discarded as likely low-quality audio.
53
+ # Set to null to use Whisper's default (-1.0).
54
+ logprob_threshold: -1.0
55
+
56
+ # Segments with compression ratio above this are discarded as likely hallucinations.
57
+ # Set to null to use Whisper's default (2.4).
58
+ compression_ratio_threshold: 2.4
59
+
60
+ # Whether Whisper should use the previous segment's output as context for the next.
61
+ # Improves coherence across segments but can propagate errors.
62
+ # Default: false.
63
+ condition_on_previous_text: false
64
+
65
+ # Path to a file containing domain-specific vocabulary to guide transcription.
66
+ # Set to null to disable.
67
+ vocab_file: vocab.txt
@@ -0,0 +1,59 @@
1
+ """
2
+ Uses Demucs to isolate vocals in audio.
3
+ """
4
+
5
+ import os
6
+ import shutil
7
+ import subprocess
8
+ import sys
9
+ import tempfile
10
+ from pathlib import Path
11
+ from .utils import cuda_available
12
+
13
+
14
+ def _ffmpeg_env():
15
+ """Return env with FFmpeg bin dir on PATH so subprocess can load torchcodec DLLs."""
16
+ import glob as g
17
+ env = dict(os.environ)
18
+ if sys.platform != "win32":
19
+ return env
20
+ local_appdata = env.get("LOCALAPPDATA", "")
21
+ winget_packages = os.path.join(local_appdata, "Microsoft", "WinGet", "Packages")
22
+ for bin_dir in g.glob(os.path.join(winget_packages, "Gyan.FFmpeg.Shared*", "**", "bin"), recursive=True):
23
+ if any(g.glob(os.path.join(bin_dir, "avcodec-*.dll"))):
24
+ env["PATH"] = bin_dir + os.pathsep + env.get("PATH", "")
25
+ break
26
+ return env
27
+
28
+
29
def _run_demucs(input_file: Path, output_file: Path, model: str = "htdemucs", device: str = "cpu"):
    """Run Demucs in a subprocess and copy the vocals stem to *output_file*.

    Demucs writes into a temporary directory that is removed afterwards.

    Raises:
        RuntimeError: if the demucs process exits with a non-zero code.
        FileNotFoundError: if the expected vocals.wav was not produced.
    """
    with tempfile.TemporaryDirectory() as scratch:
        scratch_dir = Path(scratch)
        command = [
            sys.executable, "-m", "demucs",
            "-n", model,
            "--two-stems=vocals",
            "--device", device,
            "-o", str(scratch_dir),
            str(input_file),
        ]
        completed = subprocess.run(command, env=_ffmpeg_env())
        if completed.returncode != 0:
            raise RuntimeError(f"demucs exited with code {completed.returncode}")

        # Stem location used by this code: <out>/<model>/<input stem>/vocals.wav
        vocals_path = scratch_dir.joinpath(model, input_file.stem, "vocals.wav")
        if not vocals_path.exists():
            raise FileNotFoundError(f"Demucs output not found: {vocals_path}")

        shutil.copy2(vocals_path, output_file)
47
+
48
+
49
def run(input_path: Path, output_dir: Path, config: dict, force: bool = False) -> Path:
    """Isolate vocals from one audio file with Demucs.

    The result is written to ``<output_dir>/<output_dir.name>_denoised.wav``.
    An existing result is reused unless *force* is true.

    Returns:
        Path to the denoised .wav file.
    """
    output_file = output_dir / f"{output_dir.name}_denoised.wav"
    must_run = force or not output_file.exists()
    if must_run:
        model = config.get("model", "htdemucs")
        device = "cuda" if cuda_available() else "cpu"
        output_dir.mkdir(parents=True, exist_ok=True)
        _run_demucs(input_path, output_file, model=model, device=device)
    return output_file
@@ -0,0 +1,103 @@
1
+ """
2
+ Chains full audio pre-processing pipeline with transcription.
3
+ """
4
+
5
+ import json
6
+ import time
7
+ from pathlib import Path
8
+ from . import denoise, vad, transcribe
9
+ from .utils import fmt_time, progress_bar, cuda_available
10
+
11
+
12
def _transcription_complete(path: Path) -> bool:
    """Return False if an existing transcription JSON is unreadable or marked incomplete.

    Files without a "complete" key in their metadata are treated as complete.
    """
    try:
        meta = json.loads(path.read_text(encoding="utf-8"))["metadata"]
    except (OSError, ValueError, KeyError, TypeError):
        return False
    if not isinstance(meta, dict):
        return False
    return bool(meta.get("complete", True))


def process_file(input_path: Path, output_dir: Path, config: dict, force: bool = False) -> bool:
    """Run the full pipeline (denoise, VAD, transcribe) for one audio file.

    Outputs are written under ``<output_dir>/<input stem>/``. A step whose
    output file already exists is skipped unless *force* is true.

    Returns:
        True when all three steps succeeded (or were cached), False when a
        step raised or its output JSON could not be read.
    """
    file_dir = output_dir / input_path.stem
    file_dir.mkdir(parents=True, exist_ok=True)

    total_start = time.monotonic()

    print(f"\n{'─' * 60}")
    print(f" Input: {input_path}")
    print(f" Output: {file_dir}/")
    print('─' * 60)

    # Step 1: Denoise
    print("\n [1/3] Denoising")
    denoised_path = file_dir / f"{input_path.stem}_denoised.wav"
    if not force and denoised_path.exists():
        print(" ✓ Skipped (cached)")
    else:
        print(" Running Demucs vocal isolation...")
        step_start = time.monotonic()
        try:
            denoised_path = denoise.run(input_path, file_dir, config.get("denoise", {}), force=force)
        except Exception as e:
            print(f"\n ERROR: Denoising failed: {e}")
            return False
        print(f" ✓ Done ({fmt_time(time.monotonic() - step_start)})")

    # Step 2: VAD
    print("\n [2/3] Voice Activity Detection")
    vad_path = file_dir / f"{input_path.stem}_vad.json"
    vad_cached = not force and vad_path.exists()
    step_start = time.monotonic()
    if vad_cached:
        print(" ✓ Skipped (cached)")
    else:
        print(" Detecting speech segments...")
        try:
            vad_path = vad.run(denoised_path, file_dir, config.get("vad", {}), force=force)
        except Exception as e:
            print(f"\n ERROR: VAD failed: {e}")
            return False

    # A corrupt or truncated cached file must fail this file only, not the batch.
    try:
        vad_meta = json.loads(vad_path.read_text(encoding="utf-8"))["metadata"]
        segment_count = vad_meta["segment_count"]
        if not vad_cached:
            speech_s = vad_meta["total_speech_ms"] / 1000
            audio_s = vad_meta["audio_duration_ms"] / 1000
            print(f" ✓ {segment_count} segments found "
                  f"({speech_s:.1f}s speech in {audio_s:.1f}s audio) "
                  f"({fmt_time(time.monotonic() - step_start)})")
    except (OSError, ValueError, KeyError, TypeError) as e:
        print(f"\n ERROR: Could not read VAD output ({vad_path.name}): {e!r}")
        return False

    # Step 3: Transcribe
    print("\n [3/3] Transcribing")
    transcription_path = file_dir / f"{input_path.stem}_transcription.json"
    stale_transcription = transcription_path.exists()
    # An interrupted run leaves a partial file behind; do not treat it as a hit.
    trans_cached = (
        not force
        and stale_transcription
        and _transcription_complete(transcription_path)
    )
    if trans_cached:
        print(" ✓ Skipped (cached)")
    else:
        trans_config = config.get("transcribe", {})
        model_name = trans_config.get("model", "base")
        device = "cuda" if cuda_available() else "cpu"
        print(f" Loading Whisper '{model_name}' on {device}...")

        step_start = time.monotonic()

        def on_segment(current: int, total: int):
            bar = progress_bar(current, total)
            print(f"\r [{bar}] {current}/{total} segments", end="", flush=True)

        try:
            transcription_path = transcribe.run(
                denoised_path, vad_path, file_dir, trans_config,
                on_segment=on_segment,
                # Force a rewrite when a rejected (partial/corrupt) file exists.
                force=force or stale_transcription,
            )
        except Exception as e:
            print(f"\n ERROR: Transcription failed: {e}")
            return False

        print()
        try:
            trans_meta = json.loads(transcription_path.read_text(encoding="utf-8"))["metadata"]
            kept = trans_meta["processed_segments"]
        except (OSError, ValueError, KeyError, TypeError) as e:
            print(f"\n ERROR: Could not read transcription output ({transcription_path.name}): {e!r}")
            return False
        print(f" ✓ {kept}/{segment_count} segments processed "
              f"({fmt_time(time.monotonic() - step_start)})")

    # Summary
    print(f"\n{'─' * 60}")
    print(f" Total time: {fmt_time(time.monotonic() - total_start)}")
    print(" Output:")
    for p in (denoised_path, vad_path, transcription_path):
        print(f" {p.relative_to(output_dir.parent)}")
    print('─' * 60)

    return True
@@ -0,0 +1,162 @@
1
+ """
2
+ Transcribes speech segments from denoised audio using Whisper.
3
+ """
4
+
5
+ import json
6
+ import shutil
7
+ from pathlib import Path
8
+ from typing import Optional
9
+ from pydub import AudioSegment
10
+ import whisper
11
+ from .utils import cuda_available
12
+
13
+ # Loaded once per process, keyed by (model_name, device)
14
+ _whisper_cache: dict = {}
15
+
16
+
17
def _load_model(model_name: str, device: str):
    """Return the Whisper model for (model_name, device), loading it on first use."""
    cache_key = (model_name, device)
    model = _whisper_cache.get(cache_key)
    if model is None:
        model = whisper.load_model(model_name, device=device)
        _whisper_cache[cache_key] = model
    return model
22
+
23
+
24
+ def _load_vocabulary(vocab_file: Optional[str]) -> str:
25
+ """Build Whisper's initial_prompt from a vocabulary file."""
26
+ if not vocab_file:
27
+ return ""
28
+ vocab_path = Path(vocab_file)
29
+ if not vocab_path.exists():
30
+ return ""
31
+ terms = [
32
+ line.strip()
33
+ for line in vocab_path.read_text(encoding="utf-8").splitlines()
34
+ if line.strip() and not line.startswith("#")
35
+ ]
36
+ return f"Common terms: {', '.join(terms)}." if terms else ""
37
+
38
+
39
def _cached_transcription_complete(path: Path) -> bool:
    """Return False if the JSON at *path* is unreadable or marked incomplete.

    Files without a "complete" key in their metadata are treated as complete.
    """
    try:
        meta = json.loads(path.read_text(encoding="utf-8"))["metadata"]
    except (OSError, ValueError, KeyError, TypeError):
        return False
    if not isinstance(meta, dict):
        return False
    return bool(meta.get("complete", True))


def run(
    audio_path: Path,
    vad_path: Path,
    output_dir: Path,
    config: dict,
    on_segment=None,
    force: bool = False,
) -> Path:
    """Transcribe the VAD speech segments of a denoised audio file with Whisper.

    The result is written to ``<output_dir>/<output_dir.name>_transcription.json``
    and rewritten after each successfully transcribed segment. The metadata
    carries ``"complete": false`` until the last segment has been handled, so
    an interrupted run is redone next time rather than reused.

    Args:
        audio_path: Denoised audio file.
        vad_path: VAD JSON with a "segments" list of start_ms/end_ms entries.
        output_dir: Directory for the transcription JSON and temporary clips.
        config: The "transcribe" section of the configuration.
        on_segment: Optional callback ``(current, total)`` called per segment.
        force: Re-run even if a complete transcription already exists.

    Returns:
        Path to the transcription JSON.

    Raises:
        RuntimeError: if the VAD file contains no segments.
    """
    output_file = output_dir / f"{output_dir.name}_transcription.json"
    if not force and output_file.exists() and _cached_transcription_complete(output_file):
        return output_file

    output_dir.mkdir(parents=True, exist_ok=True)
    model_name = config.get("model", "base")
    language = config.get("language", "en")
    temperature = config.get("temperature", None)
    no_speech_threshold = config.get("no_speech_threshold", None)
    logprob_threshold = config.get("logprob_threshold", None)
    compression_ratio_threshold = config.get("compression_ratio_threshold", None)
    condition_on_previous_text = config.get("condition_on_previous_text", False)
    vocab_file = config.get("vocab_file", None)

    device = "cuda" if cuda_available() else "cpu"
    model = _load_model(model_name, device)
    initial_prompt = _load_vocabulary(vocab_file) or None

    vad_data = json.loads(vad_path.read_text(encoding="utf-8"))
    segments = vad_data.get("segments", [])
    if not segments:
        raise RuntimeError(f"No segments found in {vad_path.name}")

    audio = AudioSegment.from_file(str(audio_path))
    audio_duration_ms = len(audio)

    # Clamp segment ends to the audio length and drop segments starting past it.
    valid_segments = [
        (seg["start_ms"], min(seg["end_ms"], audio_duration_ms))
        for seg in segments
        if seg["start_ms"] < audio_duration_ms
    ]
    total = len(valid_segments)

    # Decoding options are the same for every segment, so build them once.
    # Options left as null in the config are omitted so Whisper's own defaults apply.
    transcribe_kwargs = dict(
        language=language,
        fp16=(device == "cuda"),
        condition_on_previous_text=condition_on_previous_text,
        initial_prompt=initial_prompt,
    )
    optional_kwargs = {
        "temperature": temperature,
        "no_speech_threshold": no_speech_threshold,
        "logprob_threshold": logprob_threshold,
        "compression_ratio_threshold": compression_ratio_threshold,
    }
    transcribe_kwargs.update(
        {key: value for key, value in optional_kwargs.items() if value is not None}
    )

    output = {
        "metadata": {
            "source_audio": audio_path.name,
            "model": model_name,
            "language": language,
            "temperature": temperature,
            "no_speech_threshold": no_speech_threshold,
            "logprob_threshold": logprob_threshold,
            "compression_ratio_threshold": compression_ratio_threshold,
            "total_segments": total,
            "processed_segments": 0,
            "complete": False,
        },
        "transcription": [],
    }
    _write_json(output_file, output)

    temp_dir = output_dir / "_temp_segments"
    if temp_dir.exists():
        shutil.rmtree(temp_dir)
    temp_dir.mkdir()

    try:
        for i, (start_ms, end_ms) in enumerate(valid_segments, 1):
            temp_file = temp_dir / f"seg_{i}.wav"
            audio[start_ms:end_ms].export(str(temp_file), format="wav")

            try:
                result = model.transcribe(str(temp_file), **transcribe_kwargs)
            except Exception as exc:
                # Best effort: skip the segment, but say so instead of hiding it.
                print(f"\n WARNING: segment {i}/{total} failed to transcribe: {exc}")
                result = None
            finally:
                temp_file.unlink(missing_ok=True)

            if result is not None:
                text = result["text"].strip()
                no_speech_prob = max(
                    (s.get("no_speech_prob", 0) for s in result.get("segments", [])),
                    default=0,
                )

                output["metadata"]["processed_segments"] += 1

                likely_silence = (
                    no_speech_threshold is not None
                    and no_speech_prob > no_speech_threshold
                )
                if text and not likely_silence:
                    output["transcription"].append({
                        "timestamp": {"start": start_ms / 1000, "end": end_ms / 1000},
                        "text": text,
                    })
                _write_json(output_file, output)

            if on_segment:
                on_segment(i, total)

        # Reached only when every segment was attempted.
        output["metadata"]["complete"] = True
        _write_json(output_file, output)
    finally:
        if temp_dir.exists():
            shutil.rmtree(temp_dir)

    return output_file
158
+
159
+
160
+ def _write_json(path: Path, data: dict):
161
+ with open(path, "w", encoding="utf-8") as f:
162
+ json.dump(data, f, indent=2, ensure_ascii=False)
@@ -0,0 +1,84 @@
1
+ import os
2
+ import shutil
3
+ import sys
4
+ from importlib import resources
5
+ from pathlib import Path
6
+
7
+ AUDIO_EXTENSIONS = frozenset({".mp3", ".wav", ".m4a", ".flac", ".ogg", ".aac"})
8
+
9
+ _CONFIG_FILES = ("config.yaml", "vocab.txt")
10
+
11
+
12
def get_config_dir() -> Path:
    """Return the platform-appropriate user config directory for dinscribe.

    Windows uses ``%APPDATA%``, macOS uses ``~/Library/Application Support``,
    and other platforms use ``$XDG_CONFIG_HOME`` (default ``~/.config``).
    An environment variable that is unset or empty falls back to the default,
    so an empty value never yields a path relative to the working directory.
    """
    if sys.platform == "win32":
        base = Path(os.environ.get("APPDATA") or Path.home())
    elif sys.platform == "darwin":
        base = Path.home() / "Library" / "Application Support"
    else:
        base = Path(os.environ.get("XDG_CONFIG_HOME") or (Path.home() / ".config"))
    return base / "dinscribe"
21
+
22
+
23
def setup_user_config() -> Path:
    """Ensure the user config dir exists and holds the default config files.

    Files listed in ``_CONFIG_FILES`` are copied from the installed package
    only when missing, so existing user files are never overwritten.

    Returns:
        The config dir path.
    """
    target_dir = get_config_dir()
    target_dir.mkdir(parents=True, exist_ok=True)

    package_root = resources.files("dinscribe")
    for filename in _CONFIG_FILES:
        destination = target_dir / filename
        if destination.exists():
            continue
        bundled = package_root.joinpath(filename)
        with resources.as_file(bundled) as bundled_path:
            shutil.copy2(bundled_path, destination)
        print(f"Created default config: {destination}")

    return target_dir
41
+
42
+
43
def load_config(config_path: str, section: str = "") -> dict:
    """Load config.yaml and return the full config dict or one named section.

    Args:
        config_path: Path to the YAML file.
        section: Top-level key to return; "" returns the whole config.

    Returns:
        A dict. ``{}`` is returned when the file is missing, cannot be parsed
        (a warning is printed), does not contain a top-level mapping (a
        warning is printed), or when the requested section is absent or null.
    """
    try:
        import yaml
        with open(config_path, encoding="utf-8") as f:
            cfg = yaml.safe_load(f) or {}
    except FileNotFoundError:
        return {}
    except Exception as e:
        print(f"Warning: could not parse config ({config_path}): {e}")
        return {}

    if not isinstance(cfg, dict):
        # e.g. a YAML list or scalar at top level; callers expect a mapping.
        print(f"Warning: could not parse config ({config_path}): "
              f"expected a mapping at top level, got {type(cfg).__name__}")
        return {}
    if not section:
        return cfg
    # "section:" with no value loads as None; normalise to an empty dict.
    return cfg.get(section) or {}
55
+
56
+
57
def progress_bar(current: int, total: int, width: int = 28) -> str:
    """Return a progress bar of exactly *width* block characters.

    Filled cells use "▓" and empty cells use "░". The filled count is clamped
    to ``[0, width]`` so out-of-range progress (``current > total`` or a
    negative ``current``) never changes the bar length. A non-positive
    *total* yields an empty bar.
    """
    if total <= 0:
        return "░" * width
    filled = int(width * current / total)
    filled = max(0, min(width, filled))
    return "▓" * filled + "░" * (width - filled)
63
+
64
+
65
def fmt_time(seconds: float) -> str:
    """Format a duration in seconds as a human-readable string.

    Durations that display below one minute are shown as ``"12.3s"``; longer
    ones as ``"2m 5s"``. Rounding is applied before splitting into minutes
    and seconds, so the output never reads "60.0s" or "1m 60s".
    """
    if round(seconds, 1) < 60:
        return f"{seconds:.1f}s"
    minutes, secs = divmod(round(seconds), 60)
    return f"{minutes}m {secs}s"
70
+
71
+
72
def cuda_available() -> bool:
    """Report whether torch can be imported and says CUDA is available.

    Any exception raised while importing torch or querying it yields False.
    """
    try:
        import torch
    except Exception:
        return False
    try:
        return torch.cuda.is_available()
    except Exception:
        return False
79
+
80
+
81
def warn_if_no_cuda() -> None:
    """Print a slowdown warning when cuda_available() reports no CUDA."""
    if cuda_available():
        return
    print("WARNING: CUDA is not available! Audio processing will run on CPU, which will be much slower.")
@@ -0,0 +1,126 @@
1
+ """
2
+ Uses Silero VAD to detect speech segments in denoised audio and writes a JSON file with segment boundaries.
3
+ """
4
+
5
+ import json
6
+ from pathlib import Path
7
+ import torch
8
+ from .utils import cuda_available
9
+
10
+ # Loaded once per process
11
+ _vad_model_cache = None
12
+
13
+
14
def _load_model():
    """Return the cached (model, get_speech_timestamps, read_audio, device) tuple.

    On first call the Silero VAD model is fetched through torch.hub, moved to
    CUDA when available (otherwise CPU), and stored in a module-level cache.
    Note that loading also sets torch's thread count to 1 for the process.
    """
    global _vad_model_cache
    if _vad_model_cache is not None:
        return _vad_model_cache

    print("Loading VAD model...")
    torch.set_num_threads(1)
    vad_model, hub_utils = torch.hub.load(
        repo_or_dir="snakers4/silero-vad",
        model="silero_vad",
        force_reload=False,
        onnx=False,
    )
    # Only the first and third helpers returned by the hub entry point are used.
    get_speech_timestamps, _unused, read_audio, *_rest = hub_utils
    target_device = "cuda" if cuda_available() else "cpu"
    vad_model = vad_model.to(target_device)
    _vad_model_cache = (vad_model, get_speech_timestamps, read_audio, target_device)
    return _vad_model_cache
30
+
31
+
32
def _detect_segments(audio_path: Path, config: dict):
    """Run Silero VAD and return (segments, duration_ms).

    segments is a list of [start_ms, end_ms] pairs, produced in three passes:

    1. Each detected region is padded by ``padding_ms`` on both sides, clamped
       to the audio bounds and to the end of the previous segment.
    2. Neighbours closer than ``merge_within_sec`` are merged, but only while
       the merged segment stays within ``max_segment_length_sec``. Without
       that cap, continuous speech would merge into one over-long segment and
       then be discarded entirely by pass 3.
    3. Segments longer than ``max_segment_length_sec`` are discarded.

    Args:
        audio_path: Audio file to analyse (read at 16 kHz).
        config: The "vad" section of the configuration.
    """
    threshold = config.get("threshold", 0.5)
    min_speech_ms = config.get("min_speech_duration_ms", None)
    min_silence_ms = config.get("min_silence_duration_ms", None)
    padding_ms = config.get("padding_ms", 500) or 0  # null disables padding
    max_seg_sec = config.get("max_segment_length_sec", 30)
    merge_within_sec = config.get("merge_within_sec", 1.0)
    sample_rate = 16000  # required by Silero VAD

    model, get_speech_timestamps, read_audio, device = _load_model()

    wav = read_audio(str(audio_path), sampling_rate=sample_rate).to(device)
    duration_ms = len(wav) / sample_rate * 1000

    vad_kwargs = dict(
        threshold=threshold,
        sampling_rate=sample_rate,
        window_size_samples=512,
        speech_pad_ms=30,
    )
    # Null config values are omitted so Silero's own defaults apply.
    if min_speech_ms is not None:
        vad_kwargs["min_speech_duration_ms"] = min_speech_ms
    if min_silence_ms is not None:
        vad_kwargs["min_silence_duration_ms"] = min_silence_ms
    speech_timestamps = get_speech_timestamps(wav, model, **vad_kwargs)

    # Convert sample indices to ms and apply padding
    samples_per_ms = sample_rate / 1000
    segments = []
    for ts in speech_timestamps:
        start_ms = max(0, int(ts["start"] / samples_per_ms) - padding_ms)
        end_ms = min(duration_ms, int(ts["end"] / samples_per_ms) + padding_ms)
        # Padding must not make a segment overlap its predecessor.
        if segments and start_ms < segments[-1][1]:
            start_ms = segments[-1][1]
        if start_ms < end_ms:
            segments.append([start_ms, end_ms])

    max_ms = max_seg_sec * 1000 if max_seg_sec is not None else None

    # Merge nearby segments
    if merge_within_sec is not None:
        merge_gap_ms = merge_within_sec * 1000
        merged = []
        for start_ms, end_ms in segments:
            close_enough = bool(merged) and start_ms - merged[-1][1] <= merge_gap_ms
            fits = max_ms is None or not merged or end_ms - merged[-1][0] <= max_ms
            if close_enough and fits:
                merged[-1][1] = end_ms
            else:
                merged.append([start_ms, end_ms])
        segments = merged

    # Discard segments that exceed the maximum length
    if max_ms is not None:
        segments = [[s, e] for s, e in segments if e - s <= max_ms]

    return segments, duration_ms
90
+
91
+
92
+ def _build_output(audio_path: Path, segments: list, duration_ms: float, config: dict) -> dict:
93
+ total_speech_ms = sum(e - s for s, e in segments)
94
+ return {
95
+ "metadata": {
96
+ "source_audio": audio_path.name,
97
+ "audio_duration_ms": duration_ms,
98
+ "total_speech_ms": total_speech_ms,
99
+ "segment_count": len(segments),
100
+ "vad_threshold": config.get("threshold", 0.5),
101
+ "max_segment_length_sec": config.get("max_segment_length_sec", 30),
102
+ "merge_within_sec": config.get("merge_within_sec", 1.0),
103
+ },
104
+ "segments": [
105
+ {"segment_id": i, "start_ms": s, "end_ms": e, "duration_ms": e - s}
106
+ for i, (s, e) in enumerate(segments)
107
+ ],
108
+ }
109
+
110
+
111
def run(input_path: Path, output_dir: Path, config: dict, force: bool = False) -> Path:
    """Detect speech segments in one audio file and write them to JSON.

    The result is written to ``<output_dir>/<output_dir.name>_vad.json``.
    An existing result is reused unless *force* is true.

    Returns:
        Path to the VAD JSON file.

    Raises:
        RuntimeError: if no speech segments are detected.
    """
    output_file = output_dir / f"{output_dir.name}_vad.json"
    reuse_existing = output_file.exists() and not force
    if reuse_existing:
        return output_file

    output_dir.mkdir(parents=True, exist_ok=True)
    segments, duration_ms = _detect_segments(input_path, config)
    if len(segments) == 0:
        raise RuntimeError(f"No speech segments detected in {input_path.name}")

    document = _build_output(input_path, segments, duration_ms, config)
    with open(output_file, "w", encoding="utf-8") as handle:
        json.dump(document, handle, indent=2, ensure_ascii=False)

    return output_file
@@ -0,0 +1,16 @@
1
+ # You can add words or phrases here that will be included in Whisper's initial prompt,
2
+ # increasing the likelihood those words will be predicted during transcription.
3
+ # Add one word or phrase per line and note that lines starting with # are ignored.
4
+ #
5
+ # Examples of what to add:
6
+ # - Technical jargon specific to your subject matter
7
+ # - Proper nouns (names of people, places, organizations, products)
8
+ # - Acronyms and their expansions
9
+ # - Any word Whisper is consistently getting wrong
10
+ #
11
+ # Example entries:
12
+ # box box
13
+ # VSC
14
+ # DRS
15
+ # understeer
16
+ # oversteer
@@ -0,0 +1,12 @@
1
+ Metadata-Version: 2.4
2
+ Name: dinscribe
3
+ Version: 0.1.0
4
+ Summary: Audio denoising and transcription pipeline using Demucs, Silero VAD, and Whisper
5
+ Requires-Python: >=3.10
6
+ Requires-Dist: demucs>=4.0.0
7
+ Requires-Dist: torchcodec
8
+ Requires-Dist: torch
9
+ Requires-Dist: pydub
10
+ Requires-Dist: packaging
11
+ Requires-Dist: openai-whisper
12
+ Requires-Dist: pyyaml
@@ -0,0 +1,17 @@
1
+ README.md
2
+ pyproject.toml
3
+ src/dinscribe/__init__.py
4
+ src/dinscribe/cli.py
5
+ src/dinscribe/config.yaml
6
+ src/dinscribe/denoise.py
7
+ src/dinscribe/pipeline.py
8
+ src/dinscribe/transcribe.py
9
+ src/dinscribe/utils.py
10
+ src/dinscribe/vad.py
11
+ src/dinscribe/vocab.txt
12
+ src/dinscribe.egg-info/PKG-INFO
13
+ src/dinscribe.egg-info/SOURCES.txt
14
+ src/dinscribe.egg-info/dependency_links.txt
15
+ src/dinscribe.egg-info/entry_points.txt
16
+ src/dinscribe.egg-info/requires.txt
17
+ src/dinscribe.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ dinscribe = dinscribe.cli:main
@@ -0,0 +1,7 @@
1
+ demucs>=4.0.0
2
+ torchcodec
3
+ torch
4
+ pydub
5
+ packaging
6
+ openai-whisper
7
+ pyyaml
@@ -0,0 +1 @@
1
+ dinscribe