erm 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- erm/__init__.py +58 -0
- erm/__main__.py +11 -0
- erm/acoustic.py +59 -0
- erm/asr.py +43 -0
- erm/audio.py +59 -0
- erm/cli.py +342 -0
- erm/detect.py +326 -0
- erm/envelope.py +74 -0
- erm/ffmpeg_ops.py +156 -0
- erm/fillers.py +59 -0
- erm/models.py +22 -0
- erm/ranges.py +63 -0
- erm/refine.py +100 -0
- erm/validate.py +73 -0
- erm-0.1.0.dist-info/METADATA +203 -0
- erm-0.1.0.dist-info/RECORD +20 -0
- erm-0.1.0.dist-info/WHEEL +5 -0
- erm-0.1.0.dist-info/entry_points.txt +2 -0
- erm-0.1.0.dist-info/licenses/LICENSE +21 -0
- erm-0.1.0.dist-info/top_level.txt +1 -0
erm/__init__.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""erm: strip disfluencies from spoken audio.
|
|
2
|
+
|
|
3
|
+
The pure-helper modules (`fillers`, `ranges`, `refine`, `envelope`, `models`)
|
|
4
|
+
depend only on numpy + stdlib so the unit tests can run without
|
|
5
|
+
faster-whisper or librosa installed. Heavy deps (`librosa`,
|
|
6
|
+
`faster_whisper`) are imported lazily inside the functions that need them.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from .acoustic import is_sustained_vowel
|
|
10
|
+
from .asr import VERBATIM_PROMPT, transcribe
|
|
11
|
+
from .audio import find_quiet_region, load_audio_mono
|
|
12
|
+
from .cli import main
|
|
13
|
+
from .detect import (
|
|
14
|
+
detect_gap_fillers,
|
|
15
|
+
detect_intraword_fillers,
|
|
16
|
+
detect_overlong_words,
|
|
17
|
+
expected_max_word_duration,
|
|
18
|
+
)
|
|
19
|
+
from .ffmpeg_ops import (
|
|
20
|
+
denoise_to,
|
|
21
|
+
extract_segment,
|
|
22
|
+
ffprobe_duration,
|
|
23
|
+
overlay_room_tone,
|
|
24
|
+
render,
|
|
25
|
+
)
|
|
26
|
+
from .fillers import DEFAULT_FILLERS, find_fillers, is_filler, normalize_word
|
|
27
|
+
from .models import Cut, Word
|
|
28
|
+
from .ranges import invert_to_keep_ranges, merge_close_cuts
|
|
29
|
+
from .refine import refine_boundaries
|
|
30
|
+
from .validate import validate_output
|
|
31
|
+
|
|
32
|
+
# Public API of the package: one entry for every name imported from the
# submodules above. Entries are kept in ASCII sort order, so the
# capitalised names (classes and constants) come before the functions.
__all__ = [
    "Cut",
    "DEFAULT_FILLERS",
    "VERBATIM_PROMPT",
    "Word",
    "denoise_to",
    "detect_gap_fillers",
    "detect_intraword_fillers",
    "detect_overlong_words",
    "expected_max_word_duration",
    "extract_segment",
    "ffprobe_duration",
    "find_fillers",
    "find_quiet_region",
    "invert_to_keep_ranges",
    "is_filler",
    "is_sustained_vowel",
    "load_audio_mono",
    "main",
    "merge_close_cuts",
    "normalize_word",
    "overlay_room_tone",
    "refine_boundaries",
    "render",
    "transcribe",
    "validate_output",
]
|
erm/__main__.py
ADDED
erm/acoustic.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Acoustic feature checks (librosa-based, lazy-imported)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def is_sustained_vowel(
    audio: np.ndarray,
    sr: int,
    start_s: float,
    end_s: float,
    max_centroid_cv: float = 0.18,
    min_voiced_frac: float = 0.50,
) -> bool:
    """Return True if [start_s, end_s] looks acoustically like a sustained
    filler vowel ("uhhh", "ahhh", "ummm").

    Filler vowels have two distinguishing features compared to real word
    content: (a) the spectral energy stays in roughly the same place across
    the region (low spectral-centroid variation), and (b) most frames are
    voiced (ZCR in the voiced range, not silence or fricative noise).

    Args:
        audio: Sample buffer. Arrays with more than one dimension are
            averaged over axis 1 before analysis (assumes a
            (samples, channels) layout -- TODO confirm against callers).
        sr: Sample rate of `audio` in Hz.
        start_s: Region start in seconds; clamped to the start of the buffer.
        end_s: Region end in seconds; clamped to the end of the buffer.
        max_centroid_cv: Upper bound on the std/mean ratio of the spectral
            centroid; lower means more stable.
        min_voiced_frac: Minimum fraction of frames whose zero-crossing rate
            is in the typical voiced-speech range.

    Returns:
        True only when both the centroid-stability and the voiced-fraction
        criteria hold. Regions shorter than 60 ms, regions yielding fewer
        than three analysis frames, and regions with a (near-)zero mean
        centroid are always rejected.
    """
    if audio.ndim > 1:
        audio = audio.mean(axis=1)
    s = max(0, int(start_s * sr))
    e = min(audio.size, int(end_s * sr))
    seg = audio[s:e]
    # Cheap guard first: empty, inverted or very short regions are rejected
    # without paying for (or requiring) the heavy librosa import below.
    if seg.size < int(0.06 * sr):
        return False

    import librosa  # heavy; lazy

    n_fft = 1024
    hop = max(1, int(0.020 * sr))
    if seg.size < n_fft:
        # Zero-pad so at least one full analysis window is available.
        seg = np.pad(seg, (0, n_fft - seg.size), mode="constant")

    centroid = librosa.feature.spectral_centroid(
        y=seg, sr=sr, n_fft=n_fft, hop_length=hop,
    )[0]
    if centroid.size < 3:
        return False
    mean_c = float(centroid.mean())
    if mean_c <= 1e-6:
        # Avoid dividing by a vanishing mean when computing the ratio.
        return False
    cv = float(centroid.std() / mean_c)

    zcr = librosa.feature.zero_crossing_rate(
        y=seg, frame_length=n_fft, hop_length=hop,
    )[0]
    voiced = (zcr > 0.02) & (zcr < 0.20)
    voiced_frac = float(voiced.mean()) if voiced.size else 0.0

    return cv <= max_centroid_cv and voiced_frac >= min_voiced_frac
|
erm/asr.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""faster-whisper transcription (lazy-imported)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from .models import Word
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Text passed as Whisper's `initial_prompt` when `transcribe(...)` is called
# with `verbatim=True`. It opens with example fillers so the model is biased
# toward keeping disfluencies in the transcript.
VERBATIM_PROMPT = (
    "Um, uh, er, erm, ah, hmm. Like, you know, I mean, sort of. "
    "Verbatim transcription including all filler words and disfluencies."
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def transcribe(
    path: str | Path,
    model_name: str = "medium.en",
    verbatim: bool = True,
) -> tuple[list[Word], float]:
    """Run faster-whisper over `path` and collect word-level timestamps.

    Args:
        path: Audio file to transcribe.
        model_name: faster-whisper model identifier.
        verbatim: When true, `VERBATIM_PROMPT` is supplied as the
            `initial_prompt` so that disfluencies, which Whisper normally
            cleans up silently, are more likely to be kept.

    Returns:
        ``(words, duration_seconds)``. Words lacking a start or end
        timestamp are omitted; word text is whitespace-stripped.
    """
    from faster_whisper import WhisperModel  # heavy; lazy

    prompt = VERBATIM_PROMPT if verbatim else None
    whisper = WhisperModel(model_name, device="auto", compute_type="auto")
    segments, info = whisper.transcribe(
        str(path),
        word_timestamps=True,
        initial_prompt=prompt,
        # Conditioning on earlier output would dilute the prompt.
        condition_on_previous_text=False,
    )
    words: list[Word] = [
        Word(text=w.word.strip(), start=float(w.start), end=float(w.end))
        for seg in segments
        for w in (seg.words or ())
        if w.start is not None and w.end is not None
    ]
    return words, float(info.duration)
|
erm/audio.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Audio loading and quiet-region selection."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Sequence
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
|
|
10
|
+
from .models import Word
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def load_audio_mono(path: str | Path, target_sr: int = 16_000) -> tuple[np.ndarray, int]:
    """Decode `path` through librosa as a mono signal resampled to `target_sr`.

    Returns ``(samples, sample_rate)`` with the samples cast to float32 and
    the rate coerced to a plain ``int``.
    """
    import librosa  # heavy; lazy

    samples, rate = librosa.load(str(path), sr=target_sr, mono=True)
    mono = samples.astype(np.float32)
    return mono, int(rate)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def find_quiet_region(
    audio: np.ndarray,
    sr: int,
    words: Sequence[Word],
    min_length_s: float = 0.4,
    max_length_s: float = 1.5,
    win_ms: float = 10.0,
) -> tuple[float, float] | None:
    """Pick a speech-free stretch of the recording to use as a room-tone sample.

    The choice is driven purely by the transcript timing: the gap before the
    first word is tried first (pre-roll with no speaker activity), then the
    gap after the last word. With no words at all the whole recording is the
    single candidate. The audio buffer is only used to derive the total
    duration; no energy measurement is performed here, and `win_ms` is
    currently unused.

    Args:
        audio: Sample buffer; arrays with more than one dimension are
            averaged over axis 1 first.
        sr: Sample rate in Hz.
        words: Transcribed words exposing `.start` / `.end` in seconds.
        min_length_s: Shortest acceptable region.
        max_length_s: Longest region returned.
        win_ms: Accepted for interface compatibility; not read.

    Returns:
        ``(start_s, end_s)`` of the first suitable candidate, or ``None``
        when neither gap is long enough.
    """
    if audio.ndim > 1:
        audio = audio.mean(axis=1)
    audio = np.ascontiguousarray(audio, dtype=np.float32)
    total = float(audio.size) / sr

    by_start = sorted(words, key=lambda w: w.start)
    if by_start:
        gaps: list[tuple[float, float]] = [
            (0.0, by_start[0].start),
            (by_start[-1].end, total),
        ]
    else:
        gaps = [(0.0, total)]

    # 50 ms is shaved off both ends of a gap so the sample neither clips the
    # onset of speech nor catches the tail of the preceding word.
    pad = 0.05
    for gap_start, gap_end in gaps:
        if gap_end - gap_start >= min_length_s + 2 * pad:
            lo = gap_start + pad
            hi = min(gap_end - pad, lo + max_length_s)
            if hi - lo >= min_length_s:
                return (lo, hi)
    return None
|
erm/cli.py
ADDED
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
"""Command-line interface: `erm` and `erm validate`."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import json
|
|
7
|
+
import sys
|
|
8
|
+
import time
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from .asr import transcribe
|
|
12
|
+
from .audio import find_quiet_region, load_audio_mono
|
|
13
|
+
from .detect import (
|
|
14
|
+
detect_gap_fillers,
|
|
15
|
+
detect_intraword_fillers,
|
|
16
|
+
detect_overlong_words,
|
|
17
|
+
)
|
|
18
|
+
from .acoustic import is_sustained_vowel
|
|
19
|
+
from .ffmpeg_ops import denoise_to, extract_segment, overlay_room_tone, render
|
|
20
|
+
from .fillers import DEFAULT_FILLERS, find_fillers
|
|
21
|
+
from .models import Cut
|
|
22
|
+
from .ranges import invert_to_keep_ranges, merge_close_cuts
|
|
23
|
+
from .refine import refine_boundaries
|
|
24
|
+
from .validate import validate_output
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _build_remove_parser() -> argparse.ArgumentParser:
    """Assemble the argument parser for the default `erm` (remove) command.

    Options are registered in the order they should appear in `--help`:
    I/O and model selection, splice/crossfade tuning, denoising, room tone,
    then the detector thresholds and output switches.
    """
    parser = argparse.ArgumentParser(
        prog="erm",
        description="Strip disfluencies from spoken audio.",
    )
    add = parser.add_argument
    on_off = argparse.BooleanOptionalAction

    # --- input / output / model -------------------------------------------
    add("input", help="Input audio file.")
    add("-o", "--output", help="Output audio file (.wav).")
    add(
        "--model",
        default="medium.en",
        help="faster-whisper model (default: medium.en).",
    )
    add(
        "--fillers",
        default=",".join(sorted(DEFAULT_FILLERS)),
        help="Comma-separated filler word list.",
    )

    # --- boundary refinement and crossfades -------------------------------
    add("--search-ms", type=float, default=60.0)
    add(
        "--crossfade-ms",
        type=float,
        default=None,
        help=(
            "Fixed crossfade length for every splice. When omitted "
            "(default), each splice scales with its cut length."
        ),
    )
    add(
        "--min-crossfade-ms",
        type=float,
        default=50.0,
        help="Floor for the per-splice crossfade scaling.",
    )
    add(
        "--max-crossfade-ms",
        type=float,
        default=120.0,
        help="Ceiling for the per-splice crossfade scaling.",
    )
    add(
        "--crossfade-factor",
        type=float,
        default=0.15,
        help=(
            "Per-splice crossfade = cut_length * factor, "
            "clamped to [min, max]. Higher = smoother but blurrier."
        ),
    )
    add(
        "--merge-gap-ms",
        type=float,
        default=120.0,
        help=(
            "Merge two cuts whose surviving fragment is shorter "
            "than this (the fragment would otherwise be eaten "
            "by the surrounding crossfades and audibly blurp)."
        ),
    )

    # --- denoising ----------------------------------------------------------
    add(
        "--denoise",
        choices=("none", "pre", "post", "hybrid"),
        default="hybrid",
        help=(
            "Background-noise handling. "
            "'none': leave audio alone. "
            "'pre': denoise input, then cut. Cleanest splices, "
            "but detection is less sensitive on denoised audio. "
            "'post': cut the original, then denoise the output. "
            "Same detection sensitivity as 'none', but the noise "
            "floor mismatch at each splice is smoothed afterward. "
            "'hybrid' (default): detect on the original (full "
            "sensitivity, all real fillers caught), render cuts "
            "from the denoised copy (clean splices). Best of both."
        ),
    )
    add(
        "--denoise-nr",
        type=float,
        default=12.0,
        help="ffmpeg afftdn noise-reduction strength (dB).",
    )
    add(
        "--denoise-nf",
        type=float,
        default=-25.0,
        help="ffmpeg afftdn noise floor (dB).",
    )

    # --- room tone ----------------------------------------------------------
    add(
        "--room-tone",
        dest="room_tone",
        action=on_off,
        default=True,
        help=(
            "Sample a quiet region of the *original* recording "
            "and lay it under the output as a constant ambient "
            "undertone. Masks splice discontinuities by ensuring "
            "the noise floor is identical everywhere. Especially "
            "useful with --denoise (which strips room tone) — "
            "this puts a bit of natural room tone back, "
            "consistently. Default on."
        ),
    )
    add(
        "--room-tone-level-db",
        type=float,
        default=-12.0,
        help=(
            "Attenuation applied to the looped room-tone sample "
            "before mixing under the speech. Lower = quieter. "
            "Around -12 to -20 dB is usually right."
        ),
    )
    add(
        "--room-tone-source",
        default="auto",
        help=(
            "Either 'auto' (find a quiet stretch automatically) "
            "or 'START-END' in seconds (e.g. '0.05-1.4')."
        ),
    )

    # --- audio-based detectors ---------------------------------------------
    add(
        "--detect-gaps",
        dest="detect_gaps",
        action=on_off,
        default=True,
        help=(
            "Also cut voiced regions in long inter-word gaps "
            "(catches fillers Whisper drops). Default on."
        ),
    )
    add(
        "--gap-min-ms",
        type=float,
        default=350.0,
        help=(
            "Min inter-word gap to scan (ms). Below this, the "
            "pause is too short to plausibly hide a filler."
        ),
    )
    add("--gap-min-voiced-ms", type=float, default=100.0)
    add("--gap-max-voiced-ms", type=float, default=1500.0)
    add(
        "--intraword-min-ms",
        type=float,
        default=550.0,
        help=(
            "Min word duration to scan for hidden trailing "
            "fillers Whisper subsumed into the word's bounds."
        ),
    )
    add(
        "--confirm-pitch",
        dest="confirm_pitch",
        action=on_off,
        default=True,
        help=(
            "Confirm aggressive overlong-word candidates by "
            "checking they look like sustained filler vowels "
            "(stable spectral centroid + voiced ZCR). "
            "Drops cuts that fall on real speech. Default on."
        ),
    )

    # --- output switches ----------------------------------------------------
    add("--dry-run", action="store_true")
    add("--json", dest="json_out", help="Write cut list JSON to this path.")
    return parser
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _build_validate_parser() -> argparse.ArgumentParser:
|
|
110
|
+
p = argparse.ArgumentParser(
|
|
111
|
+
prog="erm validate",
|
|
112
|
+
description="Validate a rendered output against its source.",
|
|
113
|
+
)
|
|
114
|
+
p.add_argument("input")
|
|
115
|
+
p.add_argument("output")
|
|
116
|
+
p.add_argument("--cuts", help="Cut list JSON written by `remove`.")
|
|
117
|
+
p.add_argument("--model", default="medium.en")
|
|
118
|
+
p.add_argument("--report", help="Write report JSON to this path.")
|
|
119
|
+
return p
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _timestamped(input_path: str | Path, suffix: str, ext: str) -> Path:
|
|
123
|
+
"""Build a sibling output path: {stem}-{suffix}-{YYYYMMDD-HHMMSS}.{ext}.
|
|
124
|
+
|
|
125
|
+
Lives next to the input so tooling that pairs source/output (e.g. the
|
|
126
|
+
`validate` subcommand) finds them together.
|
|
127
|
+
"""
|
|
128
|
+
p = Path(input_path)
|
|
129
|
+
stamp = time.strftime("%Y%m%d-%H%M%S")
|
|
130
|
+
return p.with_name(f"{p.stem}-{suffix}-{stamp}.{ext}")
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _cmd_remove(args: argparse.Namespace) -> int:
    """Run the filler-removal command described by `args`.

    Fills in default output paths, validates an explicit room-tone range,
    then hands off to `_remove_pipeline`. Every intermediate file the
    pipeline creates is deleted on the way out, whether the run succeeds,
    returns an error code, or raises.

    Returns:
        0 on success (including `--dry-run`), 1 when the cuts leave no audio
        to render, 2 when `--room-tone-source` is neither 'auto' nor a valid
        'START-END' pair.
    """
    fillers = {f.strip().lower() for f in args.fillers.split(",") if f.strip()}

    if not args.output and not args.dry_run:
        args.output = str(_timestamped(args.input, "cleaned", "wav"))
        print(f" output: {args.output}", file=sys.stderr)
        if not args.json_out:
            args.json_out = str(_timestamped(args.input, "cuts", "json"))
            print(f" cuts: {args.json_out}", file=sys.stderr)

    # Fail fast: a malformed explicit range is rejected before the expensive
    # transcription/render work instead of after it. Only checked when the
    # value will actually be used (room tone on, not a dry run).
    manual_tone: tuple[float, float] | None = None
    if args.room_tone and not args.dry_run and args.room_tone_source != "auto":
        manual_tone = _parse_room_tone_source(args.room_tone_source)
        if manual_tone is None:
            print(f"error: invalid --room-tone-source {args.room_tone_source!r}",
                  file=sys.stderr)
            return 2

    scratch: list[Path] = []
    try:
        return _remove_pipeline(args, fillers, manual_tone, scratch)
    finally:
        # Runs on every exit path, including exceptions from transcription
        # or ffmpeg, so intermediates never pile up next to the input.
        for tmp in scratch:
            if str(tmp) != args.output:
                tmp.unlink(missing_ok=True)


def _parse_room_tone_source(spec: str) -> tuple[float, float] | None:
    """Parse a 'START-END' range in seconds; return None when malformed."""
    try:
        start_s, end_s = (float(part) for part in spec.split("-"))
    except ValueError:
        return None
    return start_s, end_s


def _remove_pipeline(
    args: argparse.Namespace,
    fillers: set[str],
    manual_tone: tuple[float, float] | None,
    scratch: list[Path],
) -> int:
    """Detect fillers, build the cut list and render the cleaned output.

    Every intermediate file created here is appended to `scratch`; the caller
    is responsible for deleting them. `manual_tone` is the pre-parsed
    explicit room-tone range, or None when the range is to be found
    automatically.
    """
    # Denoise stages produce two virtual inputs:
    #   `analysis_input` — what transcribe + audio detectors see
    #   `render_input`   — what ffmpeg cuts from
    # `none`:   both = original
    # `pre`:    both = denoised (cleanest splices, but detection less sensitive
    #           because denoising flattens the energy/pitch
    #           signals our detectors rely on)
    # `post`:   both = original; output is denoised at the end
    # `hybrid`: analysis on original (full detection sensitivity), render from
    #           denoised (clean splices). Best filler coverage AND clean splices.
    analysis_input = args.input
    render_input = args.input
    if args.denoise in ("pre", "hybrid"):
        denoised_path = _timestamped(args.input, "denoised", "wav")
        scratch.append(denoised_path)
        print(f"[0/4] denoising input -> {denoised_path}", file=sys.stderr)
        denoise_to(args.input, denoised_path,
                   nr=args.denoise_nr, nf=args.denoise_nf)
        render_input = str(denoised_path)
        if args.denoise == "pre":
            analysis_input = str(denoised_path)

    print(f"[1/4] transcribing with {args.model}...", file=sys.stderr)
    words, duration = transcribe(analysis_input, model_name=args.model)

    word_cuts = find_fillers(words, fillers)
    print(f"[2/4] found {len(word_cuts)} transcribed filler(s) in {duration:.2f}s",
          file=sys.stderr)

    # The waveform is loaded lazily: only when a detector, the boundary
    # refiner, or the room-tone search needs it.
    audio = None
    sr = 0
    gap_cuts: list[Cut] = []
    intra_cuts: list[Cut] = []
    long_cuts: list[Cut] = []
    if args.detect_gaps:
        audio, sr = load_audio_mono(analysis_input)
        gap_cuts = detect_gap_fillers(
            audio, sr, words, duration,
            min_gap_s=args.gap_min_ms / 1000.0,
            min_voiced_s=args.gap_min_voiced_ms / 1000.0,
            max_voiced_s=args.gap_max_voiced_ms / 1000.0,
        )
        intra_cuts = detect_intraword_fillers(
            audio, sr, words,
            min_word_s=args.intraword_min_ms / 1000.0,
            min_voiced_s=args.gap_min_voiced_ms / 1000.0,
            max_voiced_s=args.gap_max_voiced_ms / 1000.0,
            confirm_pitch=args.confirm_pitch,
        )
        long_cuts = detect_overlong_words(
            audio, sr, words,
            min_voiced_s=args.gap_min_voiced_ms / 1000.0,
            max_voiced_s=args.gap_max_voiced_ms / 1000.0,
        )
        long_cuts_pre = len(long_cuts)
        if args.confirm_pitch:
            # Keep only overlong-word candidates that look like a held vowel.
            long_cuts = [
                c for c in long_cuts
                if is_sustained_vowel(audio, sr, c.start, c.end)
            ]
        print(f" detected {len(gap_cuts)} gap + {len(intra_cuts)} intra "
              f"+ {len(long_cuts)}/{long_cuts_pre} overlong "
              f"(pitch-confirmed) candidate(s)", file=sys.stderr)

    raw_cuts = sorted(word_cuts + gap_cuts + intra_cuts + long_cuts,
                      key=lambda c: c.start)
    if raw_cuts:
        print("[3/4] refining cut boundaries...", file=sys.stderr)
        if audio is None:
            audio, sr = load_audio_mono(analysis_input)
        cuts = refine_boundaries(
            audio, sr, raw_cuts, search_ms=args.search_ms,
            words=words, total_duration=duration,
        )
    else:
        cuts = []

    cuts = merge_close_cuts(cuts, min_gap_s=args.merge_gap_ms / 1000.0)
    keep = invert_to_keep_ranges(cuts, duration)
    saved = sum(c.end - c.start for c in cuts)

    cuts_payload = {
        "input": str(args.input),
        "duration_s": duration,
        "cuts": [c.as_dict() for c in cuts],
        "keep_ranges": [{"start": s, "end": e} for s, e in keep],
        "time_saved_s": saved,
    }

    if args.json_out:
        Path(args.json_out).write_text(json.dumps(cuts_payload, indent=2))
        print(f" wrote cut list to {args.json_out}", file=sys.stderr)

    if args.dry_run:
        print(json.dumps(cuts_payload, indent=2))
        return 0

    if not keep:
        print("error: no audio left after removing fillers", file=sys.stderr)
        return 1

    print(f"[4/4] rendering {args.output} ({saved:.2f}s removed)", file=sys.stderr)
    needs_post_denoise = args.denoise == "post"
    needs_room_tone = args.room_tone

    # Render straight to the final path unless a later stage still has to
    # process the result, in which case render to an intermediate file.
    render_target = args.output
    if needs_post_denoise or needs_room_tone:
        raw_path = _timestamped(args.input, "raw", "wav")
        scratch.append(raw_path)
        render_target = str(raw_path)

    render(render_input, keep, render_target,
           crossfade_ms=args.crossfade_ms,
           min_crossfade_ms=args.min_crossfade_ms,
           max_crossfade_ms=args.max_crossfade_ms,
           crossfade_factor=args.crossfade_factor,
           words=words)

    # `current` always names the most recent stage's output file.
    current = render_target
    if needs_post_denoise:
        print(" denoising output...", file=sys.stderr)
        next_target = args.output
        if needs_room_tone:
            denoised_out = _timestamped(args.input, "denoised-out", "wav")
            scratch.append(denoised_out)
            next_target = str(denoised_out)
        denoise_to(current, next_target,
                   nr=args.denoise_nr, nf=args.denoise_nf)
        current = next_target

    if not needs_room_tone:
        return 0

    # Always sample the room tone from the *original* — that's what has
    # the real ambient character. Denoising would strip it.
    if manual_tone is None:
        # `manual_tone` is None here only when --room-tone-source is 'auto'.
        if audio is None:
            audio, sr = load_audio_mono(args.input)
        region = find_quiet_region(audio, sr, words)
        if region is None:
            print(" room tone: no quiet region found — skipping",
                  file=sys.stderr)
            if current != args.output:
                # Promote the last intermediate to the final output; clear
                # the destination first so the rename cannot collide.
                Path(args.output).unlink(missing_ok=True)
                Path(current).rename(args.output)
            return 0
        tone_start, tone_end = region
    else:
        tone_start, tone_end = manual_tone

    print(f" room tone: {tone_start:.2f}-{tone_end:.2f}s "
          f"({(tone_end-tone_start)*1000:.0f}ms) "
          f"@ {args.room_tone_level_db:.1f}dB", file=sys.stderr)
    tone_path = _timestamped(args.input, "tone", "wav")
    scratch.append(tone_path)
    extract_segment(args.input, tone_start, tone_end, tone_path)
    overlay_room_tone(current, tone_path, args.output,
                      level_db=args.room_tone_level_db)
    return 0
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
def _cmd_validate(args: argparse.Namespace) -> int:
    """Run `erm validate`: check a rendered output against its source.

    When no `--report` path is given, a timestamped one next to the output
    file is chosen and announced on stderr. The report is printed to stdout
    as indented JSON and written to the report path.

    Returns:
        0 when the report's "ok" entry is truthy, otherwise 1.
    """
    if not args.report:
        default_report = _timestamped(args.output, "validate", "json")
        args.report = str(default_report)
        print(f" report: {args.report}", file=sys.stderr)

    report = validate_output(
        args.input,
        args.output,
        args.cuts,
        model_name=args.model,
    )
    rendered = json.dumps(report, indent=2)
    print(rendered)
    Path(args.report).write_text(rendered)

    if report.get("ok"):
        return 0
    return 1
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
def main(argv: list[str] | None = None) -> int:
    """CLI entry point; dispatch to `validate` or the default remove command.

    `argv` defaults to ``sys.argv[1:]``. A leading ``validate`` token selects
    the validate subcommand; a leading ``remove`` token is accepted and
    dropped, so ``erm remove FILE`` and ``erm FILE`` behave the same.

    Returns:
        The exit code of the command that ran.
    """
    tokens = list(argv) if argv is not None else list(sys.argv[1:])
    command = tokens[0] if tokens else None

    if command == "validate":
        validate_args = _build_validate_parser().parse_args(tokens[1:])
        return _cmd_validate(validate_args)

    if command == "remove":
        tokens = tokens[1:]
    remove_args = _build_remove_parser().parse_args(tokens)
    return _cmd_remove(remove_args)
|