erm 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
erm-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Doug Calobrisi
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
erm-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,203 @@
1
+ Metadata-Version: 2.4
2
+ Name: erm
3
+ Version: 0.1.0
4
+ Summary: Strip disfluencies (um, uh, er, ah, hmm) from spoken audio.
5
+ Author-email: Doug Calobrisi <doug@calobrisi.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/dougcalobrisi/erm
8
+ Project-URL: Repository, https://github.com/dougcalobrisi/erm
9
+ Project-URL: Issues, https://github.com/dougcalobrisi/erm/issues
10
+ Keywords: audio,speech,transcription,filler-words,disfluency,whisper,ffmpeg
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Environment :: Console
13
+ Classifier: Intended Audience :: End Users/Desktop
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
21
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Editors
22
+ Requires-Python: >=3.11
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE
25
+ Requires-Dist: faster-whisper>=1.0.3
26
+ Requires-Dist: numpy>=1.26
27
+ Requires-Dist: librosa>=0.10
28
+ Requires-Dist: soundfile>=0.12
29
+ Provides-Extra: dev
30
+ Requires-Dist: pytest>=8.0; extra == "dev"
31
+ Dynamic: license-file
32
+
33
+ # erm
34
+
35
+ Local CLI that strips disfluencies (`um`, `uh`, `er`, `erm`, `ah`, `hmm`, `mhm`,
36
+ `mm`, `uh-huh`, plus any-length elongations like `ummmm` / `uhhhhh`) from
37
+ recordings of English speech.
38
+
39
+ It uses [`faster-whisper`](https://github.com/SYSTRAN/faster-whisper) (running
40
+ the `medium.en` Whisper model by default — override with `--model`) for
41
+ word-level timestamps, three audio-domain detectors that catch fillers Whisper
42
+ hides, and ffmpeg for the cuts. Each splice is snapped to a local energy
43
+ minimum and zero-crossing, optionally crossfaded with a length that scales
44
+ with the cut size, and laid over a constant looped sample of the recording's
45
+ own room tone so the noise floor stays uniform across edits.
46
+
47
+ ## Install
48
+
49
+ Requires Python 3.11+ and `ffmpeg` / `ffprobe` on `PATH`.
50
+
51
+ ```sh
52
+ python3.13 -m venv .venv
53
+ source .venv/bin/activate
54
+ pip install -e '.[dev]'
55
+ ```
56
+
57
+ ## Usage
58
+
59
+ ```sh
60
+ # Remove fillers; output and cut-list paths are auto-generated next to the input.
61
+ erm input.wav
62
+
63
+ # Specify output explicitly.
64
+ erm input.wav -o cleaned.wav
65
+
66
+ # Inspect what would be cut without rendering.
67
+ erm input.wav --dry-run
68
+
69
+ # Validate a rendered output against its source.
70
+ erm validate input.wav cleaned.wav --cuts cuts.json
71
+ ```
72
+
73
+ When `-o` / `--json` are omitted, output paths are written next to the input as
74
+ `{stem}-cleaned-{YYYYMMDD-HHMMSS}.wav` and `{stem}-cuts-{YYYYMMDD-HHMMSS}.json`.
75
+
76
+ ## How it works
77
+
78
+ 1. **Transcribe.** `faster-whisper` runs with `word_timestamps=True` and a
79
+ verbatim-bias `initial_prompt` so it emits filler tokens instead of
80
+ silently cleaning them up.
81
+ 2. **Detect.** Four passes produce candidate cut ranges:
82
+ - **Word-list match** — words whose normalized text is in `--fillers`,
83
+ including arbitrary-length elongations (e.g. `ummmm` matches the `um`
84
+ stem).
85
+ - **Gap fillers** — voiced regions in inter-word gaps longer than
86
+ `--gap-min-ms`. Catches fillers Whisper drops entirely.
87
+ - **Intra-word fillers** — long words whose interior splits across a
88
+ silence dip into multiple voiced runs. The non-vowel run whose duration
89
+ best matches the word's expected duration is treated as the real word;
90
+ siblings become cuts. Catches `"in, uhhhhh"` that Whisper rolls into one
91
+ `'in'` token.
92
+ - **Overlong words** — words much longer than `expected_max_word_duration`
93
+ for their text. The trailing portion is scanned for voiced runs.
94
+ Optionally pitch-confirmed (`--confirm-pitch`) by checking the cut
95
+ region looks like a sustained filler vowel (stable spectral centroid,
96
+ voiced ZCR), so we don't trim slow-but-real speech.
97
+ 3. **Refine.** Each cut endpoint snaps to a local RMS-energy minimum within
98
+ ±`--search-ms`, then to the nearest zero-crossing. Refinement is clamped
99
+ so it never crosses a neighboring word's timestamp.
100
+ 4. **Merge.** Cuts whose surviving fragment would be shorter than
101
+ `--merge-gap-ms` are collapsed into one — a 40ms surviving fragment
102
+ between two cuts gets eaten by the surrounding crossfades and would
103
+ otherwise produce an audible blip.
104
+ 5. **Render.** ffmpeg `atrim` + `acrossfade` renders the kept segments. Each
105
+ splice's crossfade length scales with that splice's cut size:
106
+ `clamp(min, cut_ms * factor, max)`. Crossfades are also clamped so they
107
+ never reach back across a real word boundary.
108
+ 6. **Room tone (optional, on by default).** A quiet region of the *original*
109
+ recording is sampled and looped under the output at `--room-tone-level-db`.
110
+ This keeps the noise floor identical everywhere, masking the residual
111
+ noise-floor mismatch at each splice.
112
+
113
+ ## Denoising
114
+
115
+ `--denoise` picks how ffmpeg's `afftdn` denoiser is used:
116
+
117
+ | Mode | Detection sees | ffmpeg cuts from | Notes |
118
+ |----------|----------------|--------------------------|-------|
119
+ | `none` | original | original | No denoising. |
120
+ | `pre` | denoised | denoised | Cleanest splices, but detection less sensitive (denoising flattens energy/pitch signals). |
121
+ | `post` | original | original; output denoised at end | Full detection sensitivity; splice noise-floor mismatch smoothed afterward. |
122
+ | `hybrid` (default) | original | denoised | Full detection sensitivity *and* clean splices. Recommended. |
123
+
124
+ Tune with `--denoise-nr` (reduction strength dB) and `--denoise-nf` (noise
125
+ floor dB).
126
+
127
+ ## Flags
128
+
129
+ ### Detection
130
+
131
+ | Flag | Default | Notes |
132
+ |------|---------|-------|
133
+ | `--model` | `medium.en` | Any faster-whisper model. `small.en` faster; `large-v3` more accurate. |
134
+ | `--fillers` | `ah,er,erm,hmm,mhm,mm,uh,uh-huh,um` | Comma-separated stems. Elongations matched dynamically. |
135
+ | `--detect-gaps` / `--no-detect-gaps` | on | Run gap + intra-word + overlong detectors. |
136
+ | `--gap-min-ms` | `350` | Minimum inter-word gap to scan for fillers. |
137
+ | `--gap-min-voiced-ms` / `--gap-max-voiced-ms` | `100` / `1500` | Voiced-run length bounds. |
138
+ | `--intraword-min-ms` | `550` | Minimum word length to scan internally. |
139
+ | `--confirm-pitch` / `--no-confirm-pitch` | on | Drop overlong/intra candidates that don't look like sustained filler vowels. |
140
+
141
+ ### Cuts and splices
142
+
143
+ | Flag | Default | Notes |
144
+ |------|---------|-------|
145
+ | `--search-ms` | `60` | How far each endpoint may slide to find a local energy minimum. |
146
+ | `--crossfade-ms` | *(unset)* | Force a fixed crossfade length for every splice. When unset, per-splice scaling is used. |
147
+ | `--min-crossfade-ms` / `--max-crossfade-ms` | `50` / `120` | Floor and ceiling for the per-splice crossfade scaling. |
148
+ | `--crossfade-factor` | `0.15` | `cut_ms * factor`, clamped to `[min, max]`. Higher = smoother but blurrier. |
149
+ | `--merge-gap-ms` | `120` | Merge two cuts whose surviving fragment would be shorter than this. |
150
+
151
+ ### Audio cleanup
152
+
153
+ | Flag | Default | Notes |
154
+ |------|---------|-------|
155
+ | `--denoise` | `hybrid` | `none` / `pre` / `post` / `hybrid` (see table above). |
156
+ | `--denoise-nr` | `12.0` | `afftdn` noise reduction (dB). |
157
+ | `--denoise-nf` | `-25.0` | `afftdn` noise floor (dB). |
158
+ | `--room-tone` / `--no-room-tone` | on | Loop a quiet sample of the original under the output. |
159
+ | `--room-tone-level-db` | `-12.0` | Attenuation applied to the looped tone. `-12` to `-20` is usually right. |
160
+ | `--room-tone-source` | `auto` | `auto` finds a quiet region; otherwise `START-END` in seconds (e.g. `0.05-1.4`). |
161
+
162
+ ### Output
163
+
164
+ | Flag | Default | Notes |
165
+ |------|---------|-------|
166
+ | `-o`, `--output` | auto-named next to input | Output `.wav` path. |
167
+ | `--json PATH` | auto-named next to input | Cut list JSON. |
168
+ | `--dry-run` | off | Print the cut list and exit; no audio rendered. |
169
+
170
+ ## `validate` subcommand
171
+
172
+ ```sh
173
+ erm validate input.wav cleaned.wav --cuts cuts.json
174
+ ```
175
+
176
+ Runs three deterministic checks:
177
+
178
+ - **Container sanity** — `ffprobe` reads the output without errors.
179
+ - **Duration math** — `output_duration ≈ input_duration - sum(cut lengths)`,
180
+ within 50ms.
181
+ - **No-filler invariant** — re-transcribe the output; assert no token in the
182
+ filler set survives.
183
+
184
+ Writes a JSON report to `--report PATH` (or auto-named next to the output)
185
+ and exits non-zero if any check fails.
186
+
187
+ ## Tests
188
+
189
+ ```sh
190
+ pytest
191
+ ```
192
+
193
+ The pure helpers (`find_fillers`, `invert_to_keep_ranges`,
194
+ `refine_boundaries`, `merge_close_cuts`, `expected_max_word_duration`,
195
+ `_voiced_runs_in_region`, …) run without faster-whisper or librosa imported.
196
+ Heavy deps are imported lazily inside `transcribe`, `render`,
197
+ `load_audio_mono`, and `is_sustained_vowel`.
198
+
199
+ ## Out of scope
200
+
201
+ - Removing `like`, `you know`, `I mean` — too risky for meaning.
202
+ - Languages other than English.
203
+ - Real-time / streaming.
erm-0.1.0/README.md ADDED
@@ -0,0 +1,171 @@
1
+ # erm
2
+
3
+ Local CLI that strips disfluencies (`um`, `uh`, `er`, `erm`, `ah`, `hmm`, `mhm`,
4
+ `mm`, `uh-huh`, plus any-length elongations like `ummmm` / `uhhhhh`) from
5
+ recordings of English speech.
6
+
7
+ It uses [`faster-whisper`](https://github.com/SYSTRAN/faster-whisper) (running
8
+ the `medium.en` Whisper model by default — override with `--model`) for
9
+ word-level timestamps, three audio-domain detectors that catch fillers Whisper
10
+ hides, and ffmpeg for the cuts. Each splice is snapped to a local energy
11
+ minimum and zero-crossing, optionally crossfaded with a length that scales
12
+ with the cut size, and laid over a constant looped sample of the recording's
13
+ own room tone so the noise floor stays uniform across edits.
14
+
15
+ ## Install
16
+
17
+ Requires Python 3.11+ and `ffmpeg` / `ffprobe` on `PATH`.
18
+
19
+ ```sh
20
+ python3.13 -m venv .venv
21
+ source .venv/bin/activate
22
+ pip install -e '.[dev]'
23
+ ```
24
+
25
+ ## Usage
26
+
27
+ ```sh
28
+ # Remove fillers; output and cut-list paths are auto-generated next to the input.
29
+ erm input.wav
30
+
31
+ # Specify output explicitly.
32
+ erm input.wav -o cleaned.wav
33
+
34
+ # Inspect what would be cut without rendering.
35
+ erm input.wav --dry-run
36
+
37
+ # Validate a rendered output against its source.
38
+ erm validate input.wav cleaned.wav --cuts cuts.json
39
+ ```
40
+
41
+ When `-o` / `--json` are omitted, output paths are written next to the input as
42
+ `{stem}-cleaned-{YYYYMMDD-HHMMSS}.wav` and `{stem}-cuts-{YYYYMMDD-HHMMSS}.json`.
43
+
44
+ ## How it works
45
+
46
+ 1. **Transcribe.** `faster-whisper` runs with `word_timestamps=True` and a
47
+ verbatim-bias `initial_prompt` so it emits filler tokens instead of
48
+ silently cleaning them up.
49
+ 2. **Detect.** Four passes produce candidate cut ranges:
50
+ - **Word-list match** — words whose normalized text is in `--fillers`,
51
+ including arbitrary-length elongations (e.g. `ummmm` matches the `um`
52
+ stem).
53
+ - **Gap fillers** — voiced regions in inter-word gaps longer than
54
+ `--gap-min-ms`. Catches fillers Whisper drops entirely.
55
+ - **Intra-word fillers** — long words whose interior splits across a
56
+ silence dip into multiple voiced runs. The non-vowel run whose duration
57
+ best matches the word's expected duration is treated as the real word;
58
+ siblings become cuts. Catches `"in, uhhhhh"` that Whisper rolls into one
59
+ `'in'` token.
60
+ - **Overlong words** — words much longer than `expected_max_word_duration`
61
+ for their text. The trailing portion is scanned for voiced runs.
62
+ Optionally pitch-confirmed (`--confirm-pitch`) by checking the cut
63
+ region looks like a sustained filler vowel (stable spectral centroid,
64
+ voiced ZCR), so we don't trim slow-but-real speech.
65
+ 3. **Refine.** Each cut endpoint snaps to a local RMS-energy minimum within
66
+ ±`--search-ms`, then to the nearest zero-crossing. Refinement is clamped
67
+ so it never crosses a neighboring word's timestamp.
68
+ 4. **Merge.** Cuts whose surviving fragment would be shorter than
69
+ `--merge-gap-ms` are collapsed into one — a 40ms surviving fragment
70
+ between two cuts gets eaten by the surrounding crossfades and would
71
+ otherwise produce an audible blip.
72
+ 5. **Render.** ffmpeg `atrim` + `acrossfade` renders the kept segments. Each
73
+ splice's crossfade length scales with that splice's cut size:
74
+ `clamp(min, cut_ms * factor, max)`. Crossfades are also clamped so they
75
+ never reach back across a real word boundary.
76
+ 6. **Room tone (optional, on by default).** A quiet region of the *original*
77
+ recording is sampled and looped under the output at `--room-tone-level-db`.
78
+ This keeps the noise floor identical everywhere, masking the residual
79
+ noise-floor mismatch at each splice.
80
+
81
+ ## Denoising
82
+
83
+ `--denoise` picks how ffmpeg's `afftdn` denoiser is used:
84
+
85
+ | Mode | Detection sees | ffmpeg cuts from | Notes |
86
+ |----------|----------------|--------------------------|-------|
87
+ | `none` | original | original | No denoising. |
88
+ | `pre` | denoised | denoised | Cleanest splices, but detection less sensitive (denoising flattens energy/pitch signals). |
89
+ | `post` | original | original; output denoised at end | Full detection sensitivity; splice noise-floor mismatch smoothed afterward. |
90
+ | `hybrid` (default) | original | denoised | Full detection sensitivity *and* clean splices. Recommended. |
91
+
92
+ Tune with `--denoise-nr` (reduction strength dB) and `--denoise-nf` (noise
93
+ floor dB).
94
+
95
+ ## Flags
96
+
97
+ ### Detection
98
+
99
+ | Flag | Default | Notes |
100
+ |------|---------|-------|
101
+ | `--model` | `medium.en` | Any faster-whisper model. `small.en` faster; `large-v3` more accurate. |
102
+ | `--fillers` | `ah,er,erm,hmm,mhm,mm,uh,uh-huh,um` | Comma-separated stems. Elongations matched dynamically. |
103
+ | `--detect-gaps` / `--no-detect-gaps` | on | Run gap + intra-word + overlong detectors. |
104
+ | `--gap-min-ms` | `350` | Minimum inter-word gap to scan for fillers. |
105
+ | `--gap-min-voiced-ms` / `--gap-max-voiced-ms` | `100` / `1500` | Voiced-run length bounds. |
106
+ | `--intraword-min-ms` | `550` | Minimum word length to scan internally. |
107
+ | `--confirm-pitch` / `--no-confirm-pitch` | on | Drop overlong/intra candidates that don't look like sustained filler vowels. |
108
+
109
+ ### Cuts and splices
110
+
111
+ | Flag | Default | Notes |
112
+ |------|---------|-------|
113
+ | `--search-ms` | `60` | How far each endpoint may slide to find a local energy minimum. |
114
+ | `--crossfade-ms` | *(unset)* | Force a fixed crossfade length for every splice. When unset, per-splice scaling is used. |
115
+ | `--min-crossfade-ms` / `--max-crossfade-ms` | `50` / `120` | Floor and ceiling for the per-splice crossfade scaling. |
116
+ | `--crossfade-factor` | `0.15` | `cut_ms * factor`, clamped to `[min, max]`. Higher = smoother but blurrier. |
117
+ | `--merge-gap-ms` | `120` | Merge two cuts whose surviving fragment would be shorter than this. |
118
+
119
+ ### Audio cleanup
120
+
121
+ | Flag | Default | Notes |
122
+ |------|---------|-------|
123
+ | `--denoise` | `hybrid` | `none` / `pre` / `post` / `hybrid` (see table above). |
124
+ | `--denoise-nr` | `12.0` | `afftdn` noise reduction (dB). |
125
+ | `--denoise-nf` | `-25.0` | `afftdn` noise floor (dB). |
126
+ | `--room-tone` / `--no-room-tone` | on | Loop a quiet sample of the original under the output. |
127
+ | `--room-tone-level-db` | `-12.0` | Attenuation applied to the looped tone. `-12` to `-20` is usually right. |
128
+ | `--room-tone-source` | `auto` | `auto` finds a quiet region; otherwise `START-END` in seconds (e.g. `0.05-1.4`). |
129
+
130
+ ### Output
131
+
132
+ | Flag | Default | Notes |
133
+ |------|---------|-------|
134
+ | `-o`, `--output` | auto-named next to input | Output `.wav` path. |
135
+ | `--json PATH` | auto-named next to input | Cut list JSON. |
136
+ | `--dry-run` | off | Print the cut list and exit; no audio rendered. |
137
+
138
+ ## `validate` subcommand
139
+
140
+ ```sh
141
+ erm validate input.wav cleaned.wav --cuts cuts.json
142
+ ```
143
+
144
+ Runs three deterministic checks:
145
+
146
+ - **Container sanity** — `ffprobe` reads the output without errors.
147
+ - **Duration math** — `output_duration ≈ input_duration - sum(cut lengths)`,
148
+ within 50ms.
149
+ - **No-filler invariant** — re-transcribe the output; assert no token in the
150
+ filler set survives.
151
+
152
+ Writes a JSON report to `--report PATH` (or auto-named next to the output)
153
+ and exits non-zero if any check fails.
154
+
155
+ ## Tests
156
+
157
+ ```sh
158
+ pytest
159
+ ```
160
+
161
+ The pure helpers (`find_fillers`, `invert_to_keep_ranges`,
162
+ `refine_boundaries`, `merge_close_cuts`, `expected_max_word_duration`,
163
+ `_voiced_runs_in_region`, …) run without faster-whisper or librosa imported.
164
+ Heavy deps are imported lazily inside `transcribe`, `render`,
165
+ `load_audio_mono`, and `is_sustained_vowel`.
166
+
167
+ ## Out of scope
168
+
169
+ - Removing `like`, `you know`, `I mean` — too risky for meaning.
170
+ - Languages other than English.
171
+ - Real-time / streaming.
@@ -0,0 +1,49 @@
1
+ [project]
2
+ name = "erm"
3
+ version = "0.1.0"
4
+ description = "Strip disfluencies (um, uh, er, ah, hmm) from spoken audio."
5
+ readme = "README.md"
6
+ requires-python = ">=3.11"
7
+ license = { text = "MIT" }
8
+ authors = [{ name = "Doug Calobrisi", email = "doug@calobrisi.com" }]
9
+ keywords = ["audio", "speech", "transcription", "filler-words", "disfluency", "whisper", "ffmpeg"]
10
+ classifiers = [
11
+ "Development Status :: 4 - Beta",
12
+ "Environment :: Console",
13
+ "Intended Audience :: End Users/Desktop",
14
+ "License :: OSI Approved :: MIT License",
15
+ "Operating System :: OS Independent",
16
+ "Programming Language :: Python :: 3",
17
+ "Programming Language :: Python :: 3.11",
18
+ "Programming Language :: Python :: 3.12",
19
+ "Programming Language :: Python :: 3.13",
20
+ "Topic :: Multimedia :: Sound/Audio :: Speech",
21
+ "Topic :: Multimedia :: Sound/Audio :: Editors",
22
+ ]
23
+ dependencies = [
24
+ "faster-whisper>=1.0.3",
25
+ "numpy>=1.26",
26
+ "librosa>=0.10",
27
+ "soundfile>=0.12",
28
+ ]
29
+
30
+ [project.optional-dependencies]
31
+ dev = ["pytest>=8.0"]
32
+
33
+ [project.urls]
34
+ Homepage = "https://github.com/dougcalobrisi/erm"
35
+ Repository = "https://github.com/dougcalobrisi/erm"
36
+ Issues = "https://github.com/dougcalobrisi/erm/issues"
37
+
38
+ [project.scripts]
39
+ erm = "erm.cli:main"
40
+
41
+ [build-system]
42
+ requires = ["setuptools>=68"]
43
+ build-backend = "setuptools.build_meta"
44
+
45
+ [tool.setuptools.packages.find]
46
+ where = ["src"]
47
+
48
+ [tool.pytest.ini_options]
49
+ markers = ["slow: tests that need the Whisper model (download required)"]
erm-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,58 @@
1
+ """erm: strip disfluencies from spoken audio.
2
+
3
+ The pure-helper modules (`fillers`, `ranges`, `refine`, `envelope`, `models`)
4
+ depend only on numpy + stdlib so the unit tests can run without
5
+ faster-whisper or librosa installed. Heavy deps (`librosa`,
6
+ `faster_whisper`) are imported lazily inside the functions that need them.
7
+ """
8
+
9
+ from .acoustic import is_sustained_vowel
10
+ from .asr import VERBATIM_PROMPT, transcribe
11
+ from .audio import find_quiet_region, load_audio_mono
12
+ from .cli import main
13
+ from .detect import (
14
+ detect_gap_fillers,
15
+ detect_intraword_fillers,
16
+ detect_overlong_words,
17
+ expected_max_word_duration,
18
+ )
19
+ from .ffmpeg_ops import (
20
+ denoise_to,
21
+ extract_segment,
22
+ ffprobe_duration,
23
+ overlay_room_tone,
24
+ render,
25
+ )
26
+ from .fillers import DEFAULT_FILLERS, find_fillers, is_filler, normalize_word
27
+ from .models import Cut, Word
28
+ from .ranges import invert_to_keep_ranges, merge_close_cuts
29
+ from .refine import refine_boundaries
30
+ from .validate import validate_output
31
+
32
+ __all__ = [
33
+ "Cut",
34
+ "DEFAULT_FILLERS",
35
+ "VERBATIM_PROMPT",
36
+ "Word",
37
+ "denoise_to",
38
+ "detect_gap_fillers",
39
+ "detect_intraword_fillers",
40
+ "detect_overlong_words",
41
+ "expected_max_word_duration",
42
+ "extract_segment",
43
+ "ffprobe_duration",
44
+ "find_fillers",
45
+ "find_quiet_region",
46
+ "invert_to_keep_ranges",
47
+ "is_filler",
48
+ "is_sustained_vowel",
49
+ "load_audio_mono",
50
+ "main",
51
+ "merge_close_cuts",
52
+ "normalize_word",
53
+ "overlay_room_tone",
54
+ "refine_boundaries",
55
+ "render",
56
+ "transcribe",
57
+ "validate_output",
58
+ ]
@@ -0,0 +1,11 @@
1
+ """`python -m erm` entrypoint."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+
7
+ from .cli import main
8
+
9
+
10
+ if __name__ == "__main__":
11
+ sys.exit(main())
@@ -0,0 +1,59 @@
1
+ """Acoustic feature checks (librosa-based, lazy-imported)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+
7
+
8
def is_sustained_vowel(
    audio: np.ndarray,
    sr: int,
    start_s: float,
    end_s: float,
    max_centroid_cv: float = 0.18,
    min_voiced_frac: float = 0.50,
) -> bool:
    """Return True if [start_s, end_s] looks acoustically like a sustained
    filler vowel ("uhhh", "ahhh", "ummm").

    Two features are checked on the selected region:

    (a) spectral-centroid stability: the coefficient of variation
        (std / mean) of the per-frame spectral centroid must not exceed
        `max_centroid_cv`; lower means the spectral energy stays in roughly
        the same place.
    (b) voicing: at least `min_voiced_frac` of the frames must have a
        zero-crossing rate strictly between 0.02 and 0.20, which excludes
        both near-silent frames and noisy (fricative-like) frames.

    Args:
        audio: Sample array. Arrays with more than one dimension are
            averaged over axis 1 before analysis.
        sr: Sample rate of `audio` in Hz.
        start_s: Region start in seconds; values before the start of the
            audio are clamped to 0.
        end_s: Region end in seconds; clamped to the audio length and never
            allowed to fall before the (clamped) start.
        max_centroid_cv: Upper bound on the centroid std/mean ratio.
        min_voiced_frac: Lower bound on the fraction of voiced-looking frames.

    Returns:
        False when the region is shorter than 60 ms (including empty or
        inverted regions), when fewer than three analysis frames are
        produced, or when the mean centroid is effectively zero; otherwise
        whether both thresholds are satisfied.
    """
    if audio.ndim > 1:
        # NOTE(review): averaging axis 1 assumes a (samples, channels)
        # layout — confirm against callers.
        audio = audio.mean(axis=1)

    first = max(0, int(start_s * sr))
    # Clamp the end into [first, audio.size]. Without the lower clamp a
    # negative `end_s` would become a negative slice index and silently
    # select audio counted from the *end* of the array.
    last = max(first, min(audio.size, int(end_s * sr)))
    seg = audio[first:last]
    if seg.size < int(0.06 * sr):
        return False

    # Heavy dependency: imported only once the cheap length guard has passed,
    # so rejected regions never pay for (or require) librosa.
    import librosa

    n_fft = 1024
    hop = max(1, int(0.020 * sr))  # 20 ms hop between analysis frames
    if seg.size < n_fft:
        # Zero-pad so at least one full FFT window is available.
        seg = np.pad(seg, (0, n_fft - seg.size), mode="constant")

    centroid = librosa.feature.spectral_centroid(
        y=seg, sr=sr, n_fft=n_fft, hop_length=hop,
    )[0]
    if centroid.size < 3:
        # Too few frames for a meaningful variation estimate.
        return False
    mean_c = float(centroid.mean())
    if mean_c <= 1e-6:
        # Avoid dividing by a (near-)zero mean centroid.
        return False
    cv = float(centroid.std() / mean_c)

    zcr = librosa.feature.zero_crossing_rate(
        y=seg, frame_length=n_fft, hop_length=hop,
    )[0]
    voiced = (zcr > 0.02) & (zcr < 0.20)
    voiced_frac = float(voiced.mean()) if voiced.size else 0.0

    return cv <= max_centroid_cv and voiced_frac >= min_voiced_frac
@@ -0,0 +1,43 @@
1
+ """faster-whisper transcription (lazy-imported)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ from .models import Word
8
+
9
+
10
# Prompt text that `transcribe` passes to Whisper as `initial_prompt` when
# verbatim output is requested: a few example fillers followed by an explicit
# instruction, joined with single spaces.
VERBATIM_PROMPT = " ".join(
    (
        "Um, uh, er, erm, ah, hmm.",
        "Like, you know, I mean, sort of.",
        "Verbatim transcription including all filler words and disfluencies.",
    )
)
14
+
15
+
16
def transcribe(
    path: str | Path,
    model_name: str = "medium.en",
    verbatim: bool = True,
) -> tuple[list[Word], float]:
    """Run faster-whisper on `path` and collect word-level timestamps.

    Args:
        path: Audio file to transcribe; converted to `str` before being
            handed to the model.
        model_name: Model identifier passed to `WhisperModel`.
        verbatim: When true, `VERBATIM_PROMPT` is supplied as the
            `initial_prompt` to bias Whisper toward keeping disfluencies;
            when false, no initial prompt is used.

    Returns:
        A `(words, duration_seconds)` pair. `words` contains one `Word` per
        transcribed word that has both a start and an end timestamp, with
        surrounding whitespace stripped from its text. `duration_seconds`
        is the duration reported by the model's transcription info.
    """
    from faster_whisper import WhisperModel  # heavy; lazy

    whisper = WhisperModel(model_name, device="auto", compute_type="auto")
    prompt = VERBATIM_PROMPT if verbatim else None
    segments, info = whisper.transcribe(
        str(path),
        word_timestamps=True,
        initial_prompt=prompt,
        # Conditioning on earlier text would dilute the prompt.
        condition_on_previous_text=False,
    )

    # Segments without words contribute nothing; words missing either
    # timestamp are dropped.
    words: list[Word] = [
        Word(text=tok.word.strip(), start=float(tok.start), end=float(tok.end))
        for segment in segments
        for tok in (segment.words or ())
        if tok.start is not None and tok.end is not None
    ]
    return words, float(info.duration)
@@ -0,0 +1,59 @@
1
+ """Audio loading and quiet-region selection."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Sequence
7
+
8
+ import numpy as np
9
+
10
+ from .models import Word
11
+
12
+
13
def load_audio_mono(path: str | Path, target_sr: int = 16_000) -> tuple[np.ndarray, int]:
    """Load `path` through `librosa.load` as a mono signal at `target_sr`.

    Returns:
        A `(samples, sample_rate)` pair where `samples` is cast to float32
        and `sample_rate` is the rate reported by librosa, as an `int`.
    """
    import librosa  # heavy; lazy

    samples, rate = librosa.load(str(path), sr=target_sr, mono=True)
    mono = samples.astype(np.float32)
    return mono, int(rate)
18
+
19
+
20
def find_quiet_region(
    audio: np.ndarray,
    sr: int,
    words: Sequence[Word],
    min_length_s: float = 0.4,
    max_length_s: float = 1.5,
    win_ms: float = 10.0,
) -> tuple[float, float] | None:
    """Pick a speech-free stretch of the recording to use as a room-tone sample.

    The choice is made purely from the word timestamps: the audio samples are
    used only to derive the total duration, and no energy measurement is
    performed. The gap before the earliest word is tried first (pre-roll
    usually has no speaker activity); the gap after the latest word end is
    the fallback. With no words at all, the whole recording is the single
    candidate.

    Args:
        audio: Sample array. Arrays with more than one dimension are averaged
            over axis 1 before the duration is computed.
        sr: Sample rate of `audio` in Hz.
        words: Transcribed words; only their `start` / `end` attributes are
            read. They need not be sorted and may overlap.
        min_length_s: Minimum acceptable length of the returned region.
        max_length_s: Maximum length of the returned region.
        win_ms: Currently unused; kept so existing callers keep working.

    Returns:
        `(start_s, end_s)` of the first candidate that is long enough after
        trimming 50 ms from each side, or `None` if neither candidate fits.
    """
    if audio.ndim > 1:
        # NOTE(review): averaging axis 1 assumes a (samples, channels)
        # layout — confirm against callers.
        audio = audio.mean(axis=1)
    # Only the sample count is needed, so no dtype/contiguity copy is made.
    total = float(audio.size) / sr

    word_list = list(words)
    if word_list:
        first_start = min(w.start for w in word_list)
        # Use the latest *end* over all words, not the end of the
        # latest-starting word: with overlapping timestamps the latter can
        # fall inside a longer word that is still being spoken.
        last_end = max(w.end for w in word_list)
        candidates = [(0.0, first_start), (last_end, total)]
    else:
        candidates = [(0.0, total)]

    # Trim 50ms off each side to avoid clipping the start of speech
    # or the tail of the previous word's silence-pad.
    pad = 0.05
    for start_s, end_s in candidates:
        if end_s - start_s < min_length_s + 2 * pad:
            continue
        s = start_s + pad
        e = min(end_s - pad, s + max_length_s)
        # Still required: `max_length_s` may be smaller than `min_length_s`.
        if e - s >= min_length_s:
            return (s, e)
    return None