erm 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
erm/__init__.py ADDED
@@ -0,0 +1,58 @@
1
+ """erm: strip disfluencies from spoken audio.
2
+
3
+ The pure-helper modules (`fillers`, `ranges`, `refine`, `envelope`, `models`)
4
+ depend only on numpy + stdlib so the unit tests can run without
5
+ faster-whisper or librosa installed. Heavy deps (`librosa`,
6
+ `faster_whisper`) are imported lazily inside the functions that need them.
7
+ """
8
+
9
+ from .acoustic import is_sustained_vowel
10
+ from .asr import VERBATIM_PROMPT, transcribe
11
+ from .audio import find_quiet_region, load_audio_mono
12
+ from .cli import main
13
+ from .detect import (
14
+ detect_gap_fillers,
15
+ detect_intraword_fillers,
16
+ detect_overlong_words,
17
+ expected_max_word_duration,
18
+ )
19
+ from .ffmpeg_ops import (
20
+ denoise_to,
21
+ extract_segment,
22
+ ffprobe_duration,
23
+ overlay_room_tone,
24
+ render,
25
+ )
26
+ from .fillers import DEFAULT_FILLERS, find_fillers, is_filler, normalize_word
27
+ from .models import Cut, Word
28
+ from .ranges import invert_to_keep_ranges, merge_close_cuts
29
+ from .refine import refine_boundaries
30
+ from .validate import validate_output
31
+
32
# Names exported by `from erm import *`. Each one is imported at the top of
# this module; the list is kept in ASCII sort order (capitalised names first).
__all__ = [
    "Cut",
    "DEFAULT_FILLERS",
    "VERBATIM_PROMPT",
    "Word",
    "denoise_to",
    "detect_gap_fillers",
    "detect_intraword_fillers",
    "detect_overlong_words",
    "expected_max_word_duration",
    "extract_segment",
    "ffprobe_duration",
    "find_fillers",
    "find_quiet_region",
    "invert_to_keep_ranges",
    "is_filler",
    "is_sustained_vowel",
    "load_audio_mono",
    "main",
    "merge_close_cuts",
    "normalize_word",
    "overlay_room_tone",
    "refine_boundaries",
    "render",
    "transcribe",
    "validate_output",
]
erm/__main__.py ADDED
@@ -0,0 +1,11 @@
1
+ """`python -m erm` entrypoint."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+
7
+ from .cli import main
8
+
9
+
10
# Only run the CLI when this module is executed (e.g. `python -m erm`), not
# when it is imported. The int returned by main() becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())
erm/acoustic.py ADDED
@@ -0,0 +1,59 @@
1
+ """Acoustic feature checks (librosa-based, lazy-imported)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+
7
+
8
def is_sustained_vowel(
    audio: np.ndarray,
    sr: int,
    start_s: float,
    end_s: float,
    max_centroid_cv: float = 0.18,
    min_voiced_frac: float = 0.50,
) -> bool:
    """Decide whether audio[start_s:end_s] resembles a held filler vowel.

    Two measurements are taken over the selected samples:

    * the coefficient of variation (std / mean) of the spectral centroid,
      which must not exceed `max_centroid_cv` (a steady spectrum), and
    * the share of frames whose zero-crossing rate lies strictly between
      0.02 and 0.20, which must be at least `min_voiced_frac`.

    Returns False outright when the selection is shorter than 60 ms, when
    fewer than three centroid frames come back, or when the mean centroid is
    effectively zero. Arrays with more than one dimension are averaged over
    axis 1 before slicing.
    """
    import librosa  # heavy dependency, pulled in only when this check runs

    mono = audio.mean(axis=1) if audio.ndim > 1 else audio

    # Convert the time window to sample indices, clamped to the array.
    first = max(0, int(start_s * sr))
    last = min(mono.size, int(end_s * sr))
    clip = mono[first:last]
    if clip.size < int(0.06 * sr):
        return False

    frame_len = 1024
    hop_len = max(1, int(0.020 * sr))
    if clip.size < frame_len:
        # Zero-pad on the right so at least one full analysis frame exists.
        clip = np.pad(clip, (0, frame_len - clip.size), mode="constant")

    centroids = librosa.feature.spectral_centroid(
        y=clip,
        sr=sr,
        n_fft=frame_len,
        hop_length=hop_len,
    )[0]
    if centroids.size < 3:
        return False
    centroid_mean = float(centroids.mean())
    if centroid_mean <= 1e-6:
        return False
    centroid_cv = float(centroids.std() / centroid_mean)

    crossing_rate = librosa.feature.zero_crossing_rate(
        y=clip,
        frame_length=frame_len,
        hop_length=hop_len,
    )[0]
    in_voiced_band = (crossing_rate > 0.02) & (crossing_rate < 0.20)
    voiced_share = float(in_voiced_band.mean()) if in_voiced_band.size else 0.0

    return centroid_cv <= max_centroid_cv and voiced_share >= min_voiced_frac
erm/asr.py ADDED
@@ -0,0 +1,43 @@
1
+ """faster-whisper transcription (lazy-imported)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ from .models import Word
8
+
9
+
10
# Text handed to Whisper as `initial_prompt` by `transcribe` when
# `verbatim=True`. It is a runtime string: editing it changes what the model
# is primed with.
VERBATIM_PROMPT = (
    "Um, uh, er, erm, ah, hmm. Like, you know, I mean, sort of. "
    "Verbatim transcription including all filler words and disfluencies."
)
14
+
15
+
16
def transcribe(
    path: str | Path,
    model_name: str = "medium.en",
    verbatim: bool = True,
) -> tuple[list[Word], float]:
    """Run faster-whisper on `path` and collect word-level timings.

    Returns a `(words, duration_seconds)` pair. Words whose start or end
    timestamp is missing are dropped, and segments without word data are
    skipped. Word text is stripped of surrounding whitespace.

    With `verbatim=True` the module's `VERBATIM_PROMPT` is supplied as the
    `initial_prompt`; otherwise no prompt is passed.
    """
    from faster_whisper import WhisperModel  # heavy dependency, loaded on demand

    whisper = WhisperModel(model_name, device="auto", compute_type="auto")
    prompt = VERBATIM_PROMPT if verbatim else None
    segments, info = whisper.transcribe(
        str(path),
        word_timestamps=True,
        initial_prompt=prompt,
        # Conditioning on earlier output would dilute the prompt.
        condition_on_previous_text=False,
    )

    words: list[Word] = [
        Word(text=token.word.strip(), start=float(token.start), end=float(token.end))
        for segment in segments
        for token in (segment.words or ())
        if token.start is not None and token.end is not None
    ]
    return words, float(info.duration)
erm/audio.py ADDED
@@ -0,0 +1,59 @@
1
+ """Audio loading and quiet-region selection."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Sequence
7
+
8
+ import numpy as np
9
+
10
+ from .models import Word
11
+
12
+
13
def load_audio_mono(path: str | Path, target_sr: int = 16_000) -> tuple[np.ndarray, int]:
    """Read `path` through librosa as mono at `target_sr`.

    Returns `(samples, sample_rate)` with the samples cast to float32 and the
    rate as a plain int.
    """
    import librosa  # heavy dependency, loaded on demand

    samples, rate = librosa.load(str(path), sr=target_sr, mono=True)
    samples = samples.astype(np.float32)
    return samples, int(rate)
18
+
19
+
20
def find_quiet_region(
    audio: np.ndarray,
    sr: int,
    words: Sequence[Word],
    min_length_s: float = 0.4,
    max_length_s: float = 1.5,
    win_ms: float = 10.0,
) -> tuple[float, float] | None:
    """Pick a speech-free stretch of the recording to use as a room-tone sample.

    The choice is driven purely by the transcript timings; no energy
    measurement is made on `audio`, which is only used to derive the total
    duration. Candidates are tried in this order:

    1. the lead-in before the earliest word start,
    2. the tail after the latest word end,

    or the whole recording when `words` is empty. A candidate is accepted when,
    after trimming 50 ms from each side, at least `min_length_s` remains; the
    returned region is capped at `max_length_s`.

    Args:
        audio: Sample array. Arrays with more than one dimension are averaged
            over axis 1 before the sample count is taken.
        sr: Sample rate used to convert the sample count to seconds.
        words: Transcribed words; only `.start` and `.end` are read. They do
            not need to be sorted and may overlap.
        min_length_s: Minimum usable length of the returned region.
        max_length_s: Maximum length of the returned region.
        win_ms: Currently unused; kept so existing callers keep working.

    Returns:
        `(start_s, end_s)` of the chosen region, or `None` if no candidate is
        long enough.
    """
    if audio.ndim > 1:
        audio = audio.mean(axis=1)
    audio = np.ascontiguousarray(audio, dtype=np.float32)
    total = float(audio.size) / sr

    spoken = list(words)
    candidates: list[tuple[float, float]]
    if spoken:
        first_start = min(w.start for w in spoken)
        # Use the latest *end*, not the end of the latest-starting word: with
        # overlapping or nested word timings those differ, and the tail gap
        # would otherwise begin while someone is still speaking.
        last_end = max(w.end for w in spoken)
        candidates = [(0.0, first_start), (last_end, total)]
    else:
        candidates = [(0.0, total)]

    # Trim 50 ms off each side so the sample does not clip the onset of
    # speech or the decay of the preceding word.
    pad = 0.05
    for start_s, end_s in candidates:
        if end_s - start_s < min_length_s + 2 * pad:
            continue
        s = start_s + pad
        e = min(end_s - pad, s + max_length_s)
        if e - s >= min_length_s:
            return (s, e)
    return None
erm/cli.py ADDED
@@ -0,0 +1,342 @@
1
+ """Command-line interface: `erm` and `erm validate`."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+ import sys
8
+ import time
9
+ from pathlib import Path
10
+
11
+ from .asr import transcribe
12
+ from .audio import find_quiet_region, load_audio_mono
13
+ from .detect import (
14
+ detect_gap_fillers,
15
+ detect_intraword_fillers,
16
+ detect_overlong_words,
17
+ )
18
+ from .acoustic import is_sustained_vowel
19
+ from .ffmpeg_ops import denoise_to, extract_segment, overlay_room_tone, render
20
+ from .fillers import DEFAULT_FILLERS, find_fillers
21
+ from .models import Cut
22
+ from .ranges import invert_to_keep_ranges, merge_close_cuts
23
+ from .refine import refine_boundaries
24
+ from .validate import validate_output
25
+
26
+
27
def _build_remove_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the default `erm` (filler removal) command.

    Options are registered in the same order as they appear in `--help`:
    input/output, filler list, boundary search and crossfades, denoising,
    room tone, detector tuning, and finally dry-run / JSON output.
    """
    parser = argparse.ArgumentParser(
        prog="erm",
        description="Strip disfluencies from spoken audio.",
    )
    add = parser.add_argument
    on_off = argparse.BooleanOptionalAction  # gives each flag a --no-... twin

    # --- input, output, model, filler vocabulary -------------------------
    add("input", help="Input audio file.")
    add("-o", "--output", help="Output audio file (.wav).")
    add(
        "--model",
        default="medium.en",
        help="faster-whisper model (default: medium.en).",
    )
    add(
        "--fillers",
        default=",".join(sorted(DEFAULT_FILLERS)),
        help="Comma-separated filler word list.",
    )

    # --- boundary search and splice crossfades ---------------------------
    add("--search-ms", type=float, default=60.0)
    add(
        "--crossfade-ms",
        type=float,
        default=None,
        help="Fixed crossfade length for every splice. When omitted "
             "(default), each splice scales with its cut length.",
    )
    add(
        "--min-crossfade-ms",
        type=float,
        default=50.0,
        help="Floor for the per-splice crossfade scaling.",
    )
    add(
        "--max-crossfade-ms",
        type=float,
        default=120.0,
        help="Ceiling for the per-splice crossfade scaling.",
    )
    add(
        "--crossfade-factor",
        type=float,
        default=0.15,
        help="Per-splice crossfade = cut_length * factor, "
             "clamped to [min, max]. Higher = smoother but blurrier.",
    )
    add(
        "--merge-gap-ms",
        type=float,
        default=120.0,
        help="Merge two cuts whose surviving fragment is shorter "
             "than this (the fragment would otherwise be eaten "
             "by the surrounding crossfades and audibly blurp).",
    )

    # --- denoising --------------------------------------------------------
    add(
        "--denoise",
        choices=("none", "pre", "post", "hybrid"),
        default="hybrid",
        help="Background-noise handling. "
             "'none': leave audio alone. "
             "'pre': denoise input, then cut. Cleanest splices, "
             "but detection is less sensitive on denoised audio. "
             "'post': cut the original, then denoise the output. "
             "Same detection sensitivity as 'none', but the noise "
             "floor mismatch at each splice is smoothed afterward. "
             "'hybrid' (default): detect on the original (full "
             "sensitivity, all real fillers caught), render cuts "
             "from the denoised copy (clean splices). Best of both.",
    )
    add(
        "--denoise-nr",
        type=float,
        default=12.0,
        help="ffmpeg afftdn noise-reduction strength (dB).",
    )
    add(
        "--denoise-nf",
        type=float,
        default=-25.0,
        help="ffmpeg afftdn noise floor (dB).",
    )

    # --- room tone --------------------------------------------------------
    add(
        "--room-tone",
        dest="room_tone",
        action=on_off,
        default=True,
        help="Sample a quiet region of the *original* recording "
             "and lay it under the output as a constant ambient "
             "undertone. Masks splice discontinuities by ensuring "
             "the noise floor is identical everywhere. Especially "
             "useful with --denoise (which strips room tone) — "
             "this puts a bit of natural room tone back, "
             "consistently. Default on.",
    )
    add(
        "--room-tone-level-db",
        type=float,
        default=-12.0,
        help="Attenuation applied to the looped room-tone sample "
             "before mixing under the speech. Lower = quieter. "
             "Around -12 to -20 dB is usually right.",
    )
    add(
        "--room-tone-source",
        default="auto",
        help="Either 'auto' (find a quiet stretch automatically) "
             "or 'START-END' in seconds (e.g. '0.05-1.4').",
    )

    # --- audio-based detectors -------------------------------------------
    add(
        "--detect-gaps",
        dest="detect_gaps",
        action=on_off,
        default=True,
        help="Also cut voiced regions in long inter-word gaps "
             "(catches fillers Whisper drops). Default on.",
    )
    add(
        "--gap-min-ms",
        type=float,
        default=350.0,
        help="Min inter-word gap to scan (ms). Below this, the "
             "pause is too short to plausibly hide a filler.",
    )
    add("--gap-min-voiced-ms", type=float, default=100.0)
    add("--gap-max-voiced-ms", type=float, default=1500.0)
    add(
        "--intraword-min-ms",
        type=float,
        default=550.0,
        help="Min word duration to scan for hidden trailing "
             "fillers Whisper subsumed into the word's bounds.",
    )
    add(
        "--confirm-pitch",
        dest="confirm_pitch",
        action=on_off,
        default=True,
        help="Confirm aggressive overlong-word candidates by "
             "checking they look like sustained filler vowels "
             "(stable spectral centroid + voiced ZCR). "
             "Drops cuts that fall on real speech. Default on.",
    )

    # --- run mode / reporting --------------------------------------------
    add("--dry-run", action="store_true")
    add("--json", dest="json_out", help="Write cut list JSON to this path.")
    return parser
107
+
108
+
109
+ def _build_validate_parser() -> argparse.ArgumentParser:
110
+ p = argparse.ArgumentParser(
111
+ prog="erm validate",
112
+ description="Validate a rendered output against its source.",
113
+ )
114
+ p.add_argument("input")
115
+ p.add_argument("output")
116
+ p.add_argument("--cuts", help="Cut list JSON written by `remove`.")
117
+ p.add_argument("--model", default="medium.en")
118
+ p.add_argument("--report", help="Write report JSON to this path.")
119
+ return p
120
+
121
+
122
+ def _timestamped(input_path: str | Path, suffix: str, ext: str) -> Path:
123
+ """Build a sibling output path: {stem}-{suffix}-{YYYYMMDD-HHMMSS}.{ext}.
124
+
125
+ Lives next to the input so tooling that pairs source/output (e.g. the
126
+ `validate` subcommand) finds them together.
127
+ """
128
+ p = Path(input_path)
129
+ stamp = time.strftime("%Y%m%d-%H%M%S")
130
+ return p.with_name(f"{p.stem}-{suffix}-{stamp}.{ext}")
131
+
132
+
133
def _cmd_remove(args: argparse.Namespace) -> int:
    """Run the filler-removal pipeline for parsed CLI arguments.

    Steps, as reported on stderr: optional input denoise ([0/4]), transcription
    ([1/4]), filler detection ([2/4]), boundary refinement ([3/4]) and
    rendering ([4/4]), followed by optional output denoise and room-tone
    overlay.

    Returns:
        0 on success (including `--dry-run` and the case where no quiet region
        is found for room tone), 1 when removing the cuts would leave no
        audio, 2 when `--room-tone-source` cannot be parsed.

    Side effects: may assign `args.output` / `args.json_out`, writes the
    output audio and cut-list JSON, and creates temporary sibling files that
    are deleted on the return paths below.

    NOTE(review): temporary files are removed only on the explicit return
    paths; if any step raises, they are left behind.
    """
    # Normalise the user-supplied filler list: trimmed, lower-cased, no blanks.
    fillers = {f.strip().lower() for f in args.fillers.split(",") if f.strip()}

    # Pick default sibling paths when the user gave none.
    # NOTE(review): indentation was lost in the rendering this was read from;
    # nesting the `json_out` default under the `output` default is inferred
    # from the later `if args.json_out:` guard — confirm against the package.
    if not args.output and not args.dry_run:
        args.output = str(_timestamped(args.input, "cleaned", "wav"))
        print(f" output: {args.output}", file=sys.stderr)
        if not args.json_out:
            args.json_out = str(_timestamped(args.input, "cuts", "json"))
            print(f" cuts: {args.json_out}", file=sys.stderr)

    # Denoise stages produce two virtual inputs:
    #   `analysis_input` — what transcribe + audio detectors see
    #   `render_input`   — what ffmpeg cuts from
    # `none`:   both = original
    # `pre`:    both = denoised (cleanest splices, but detection less sensitive
    #           because denoising flattens the energy/pitch signals our
    #           detectors rely on)
    # `post`:   both = original; output is denoised at the end
    # `hybrid`: analysis on original (full detection sensitivity), render from
    #           denoised (clean splices). Best filler coverage AND clean splices.
    analysis_input = args.input
    render_input = args.input
    denoised_path: Path | None = None
    if args.denoise in ("pre", "hybrid"):
        denoised_path = _timestamped(args.input, "denoised", "wav")
        print(f"[0/4] denoising input -> {denoised_path}", file=sys.stderr)
        denoise_to(args.input, denoised_path,
                   nr=args.denoise_nr, nf=args.denoise_nf)
        if args.denoise == "pre":
            analysis_input = str(denoised_path)
            render_input = str(denoised_path)
        else:  # hybrid
            render_input = str(denoised_path)

    print(f"[1/4] transcribing with {args.model}...", file=sys.stderr)
    words, duration = transcribe(analysis_input, model_name=args.model)

    # Cuts that come straight from the transcript (words in the filler set).
    word_cuts = find_fillers(words, fillers)
    print(f"[2/4] found {len(word_cuts)} transcribed filler(s) in {duration:.2f}s",
          file=sys.stderr)

    # Audio is loaded lazily: only when a detector, the refiner, or the
    # room-tone search actually needs it.
    audio = None
    sr = 0
    gap_cuts: list[Cut] = []
    intra_cuts: list[Cut] = []
    if args.detect_gaps:
        # All three audio-based detectors are gated by --detect-gaps and share
        # the --gap-min/max-voiced-ms limits.
        audio, sr = load_audio_mono(analysis_input)
        gap_cuts = detect_gap_fillers(
            audio, sr, words, duration,
            min_gap_s=args.gap_min_ms / 1000.0,
            min_voiced_s=args.gap_min_voiced_ms / 1000.0,
            max_voiced_s=args.gap_max_voiced_ms / 1000.0,
        )
        intra_cuts = detect_intraword_fillers(
            audio, sr, words,
            min_word_s=args.intraword_min_ms / 1000.0,
            min_voiced_s=args.gap_min_voiced_ms / 1000.0,
            max_voiced_s=args.gap_max_voiced_ms / 1000.0,
            confirm_pitch=args.confirm_pitch,
        )
        long_cuts = detect_overlong_words(
            audio, sr, words,
            min_voiced_s=args.gap_min_voiced_ms / 1000.0,
            max_voiced_s=args.gap_max_voiced_ms / 1000.0,
        )
        long_cuts_pre = len(long_cuts)
        if args.confirm_pitch:
            # Keep only overlong-word candidates that pass the acoustic check.
            long_cuts = [
                c for c in long_cuts
                if is_sustained_vowel(audio, sr, c.start, c.end)
            ]
        print(f" detected {len(gap_cuts)} gap + {len(intra_cuts)} intra "
              f"+ {len(long_cuts)}/{long_cuts_pre} overlong "
              f"(pitch-confirmed) candidate(s)", file=sys.stderr)
    else:
        long_cuts = []

    # Combine every candidate, ordered by start time, before refinement.
    raw_cuts = sorted(word_cuts + gap_cuts + intra_cuts + long_cuts,
                      key=lambda c: c.start)
    if raw_cuts:
        print("[3/4] refining cut boundaries...", file=sys.stderr)
        if audio is None:
            audio, sr = load_audio_mono(analysis_input)
        cuts = refine_boundaries(
            audio, sr, raw_cuts, search_ms=args.search_ms,
            words=words, total_duration=duration,
        )
    else:
        cuts = []

    cuts = merge_close_cuts(cuts, min_gap_s=args.merge_gap_ms / 1000.0)
    keep = invert_to_keep_ranges(cuts, duration)
    saved = sum(c.end - c.start for c in cuts)

    # Machine-readable summary; written to --json and printed on --dry-run.
    cuts_payload = {
        "input": str(args.input),
        "duration_s": duration,
        "cuts": [c.as_dict() for c in cuts],
        "keep_ranges": [{"start": s, "end": e} for s, e in keep],
        "time_saved_s": saved,
    }

    if args.json_out:
        Path(args.json_out).write_text(json.dumps(cuts_payload, indent=2))
        print(f" wrote cut list to {args.json_out}", file=sys.stderr)

    if args.dry_run:
        # Dry run: report the plan on stdout and stop before rendering.
        print(json.dumps(cuts_payload, indent=2))
        if denoised_path is not None:
            Path(denoised_path).unlink(missing_ok=True)
        return 0

    if not keep:
        print("error: no audio left after removing fillers", file=sys.stderr)
        if denoised_path is not None:
            Path(denoised_path).unlink(missing_ok=True)
        return 1

    print(f"[4/4] rendering {args.output} ({saved:.2f}s removed)", file=sys.stderr)
    needs_post_denoise = args.denoise == "post"
    needs_room_tone = args.room_tone

    # When a later stage follows the render, render into a temporary file so
    # that only the last stage writes args.output.
    render_target = args.output
    if needs_post_denoise or needs_room_tone:
        render_target = str(_timestamped(args.input, "raw", "wav"))

    render(render_input, keep, render_target,
           crossfade_ms=args.crossfade_ms,
           min_crossfade_ms=args.min_crossfade_ms,
           max_crossfade_ms=args.max_crossfade_ms,
           crossfade_factor=args.crossfade_factor,
           words=words)

    # `current` tracks the newest intermediate file in the post-processing chain.
    current = render_target
    if needs_post_denoise:
        print(f" denoising output...", file=sys.stderr)
        next_target = (args.output if not needs_room_tone
                       else str(_timestamped(args.input, "denoised-out", "wav")))
        denoise_to(current, next_target,
                   nr=args.denoise_nr, nf=args.denoise_nf)
        if current != args.output:
            Path(current).unlink(missing_ok=True)
        current = next_target

    if needs_room_tone:
        # Always sample the room tone from the *original* — that's what has
        # the real ambient character. Denoising would strip it.
        if args.room_tone_source == "auto":
            if audio is None:
                audio, sr = load_audio_mono(args.input)
            region = find_quiet_region(audio, sr, words)
            if region is None:
                # No usable region: promote the current intermediate to the
                # final output unchanged and finish successfully.
                print(" room tone: no quiet region found — skipping",
                      file=sys.stderr)
                if current != args.output:
                    Path(args.output).unlink(missing_ok=True)
                    Path(current).rename(args.output)
                if denoised_path is not None:
                    Path(denoised_path).unlink(missing_ok=True)
                return 0
            tone_start, tone_end = region
        else:
            # Manual 'START-END' in seconds. A wrong number of parts also
            # raises ValueError (from the tuple unpacking).
            # NOTE(review): this is only validated after rendering has already
            # run, and the ordering of START and END is not checked.
            try:
                ts, te = (float(x) for x in args.room_tone_source.split("-"))
            except ValueError:
                print(f"error: invalid --room-tone-source {args.room_tone_source!r}",
                      file=sys.stderr)
                if current != args.output:
                    Path(current).unlink(missing_ok=True)
                if denoised_path is not None:
                    Path(denoised_path).unlink(missing_ok=True)
                return 2
            tone_start, tone_end = ts, te
        print(f" room tone: {tone_start:.2f}-{tone_end:.2f}s "
              f"({(tone_end-tone_start)*1000:.0f}ms) "
              f"@ {args.room_tone_level_db:.1f}dB", file=sys.stderr)
        # Cut the tone sample out of the original input, mix it under the
        # current intermediate into args.output, then drop the temporaries.
        tone_path = _timestamped(args.input, "tone", "wav")
        extract_segment(args.input, tone_start, tone_end, tone_path)
        overlay_room_tone(current, tone_path, args.output,
                          level_db=args.room_tone_level_db)
        Path(tone_path).unlink(missing_ok=True)
        if current != args.output:
            Path(current).unlink(missing_ok=True)

    if denoised_path is not None:
        Path(denoised_path).unlink(missing_ok=True)

    return 0
321
+
322
+
323
def _cmd_validate(args: argparse.Namespace) -> int:
    """Run `validate_output` for parsed CLI arguments and persist the report.

    When `--report` was not given, a timestamped sibling of the output file is
    chosen (and stored back on `args.report`). The report is printed to stdout
    as indented JSON and written to the report path. Returns 0 when the
    report's "ok" entry is truthy, otherwise 1.
    """
    report_path = args.report
    if not report_path:
        report_path = str(_timestamped(args.output, "validate", "json"))
        args.report = report_path
        print(f" report: {args.report}", file=sys.stderr)

    result = validate_output(
        args.input,
        args.output,
        args.cuts,
        model_name=args.model,
    )
    rendered = json.dumps(result, indent=2)
    print(rendered)
    Path(report_path).write_text(rendered)

    if result.get("ok"):
        return 0
    return 1
334
+
335
+
336
def main(argv: list[str] | None = None) -> int:
    """CLI entry point; dispatch to `validate` or the default remove command.

    `argv` defaults to `sys.argv[1:]`. A leading "validate" selects the
    validation subcommand; a leading "remove" is accepted and dropped, so
    `erm remove FILE` and `erm FILE` behave the same. Returns the exit code of
    the selected command.
    """
    tokens = list(argv) if argv is not None else list(sys.argv[1:])
    command = tokens[0] if tokens else None

    if command == "validate":
        validate_args = _build_validate_parser().parse_args(tokens[1:])
        return _cmd_validate(validate_args)

    if command == "remove":
        tokens = tokens[1:]
    remove_args = _build_remove_parser().parse_args(tokens)
    return _cmd_remove(remove_args)