erm 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- erm-0.1.0/LICENSE +21 -0
- erm-0.1.0/PKG-INFO +203 -0
- erm-0.1.0/README.md +171 -0
- erm-0.1.0/pyproject.toml +49 -0
- erm-0.1.0/setup.cfg +4 -0
- erm-0.1.0/src/erm/__init__.py +58 -0
- erm-0.1.0/src/erm/__main__.py +11 -0
- erm-0.1.0/src/erm/acoustic.py +59 -0
- erm-0.1.0/src/erm/asr.py +43 -0
- erm-0.1.0/src/erm/audio.py +59 -0
- erm-0.1.0/src/erm/cli.py +342 -0
- erm-0.1.0/src/erm/detect.py +326 -0
- erm-0.1.0/src/erm/envelope.py +74 -0
- erm-0.1.0/src/erm/ffmpeg_ops.py +156 -0
- erm-0.1.0/src/erm/fillers.py +59 -0
- erm-0.1.0/src/erm/models.py +22 -0
- erm-0.1.0/src/erm/ranges.py +63 -0
- erm-0.1.0/src/erm/refine.py +100 -0
- erm-0.1.0/src/erm/validate.py +73 -0
- erm-0.1.0/src/erm.egg-info/PKG-INFO +203 -0
- erm-0.1.0/src/erm.egg-info/SOURCES.txt +24 -0
- erm-0.1.0/src/erm.egg-info/dependency_links.txt +1 -0
- erm-0.1.0/src/erm.egg-info/entry_points.txt +2 -0
- erm-0.1.0/src/erm.egg-info/requires.txt +7 -0
- erm-0.1.0/src/erm.egg-info/top_level.txt +1 -0
- erm-0.1.0/tests/test_pure.py +257 -0
erm-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Doug Calobrisi
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
erm-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: erm
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Strip disfluencies (um, uh, er, ah, hmm) from spoken audio.
|
|
5
|
+
Author-email: Doug Calobrisi <doug@calobrisi.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/dougcalobrisi/erm
|
|
8
|
+
Project-URL: Repository, https://github.com/dougcalobrisi/erm
|
|
9
|
+
Project-URL: Issues, https://github.com/dougcalobrisi/erm/issues
|
|
10
|
+
Keywords: audio,speech,transcription,filler-words,disfluency,whisper,ffmpeg
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: End Users/Desktop
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
|
|
21
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Editors
|
|
22
|
+
Requires-Python: >=3.11
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: faster-whisper>=1.0.3
|
|
26
|
+
Requires-Dist: numpy>=1.26
|
|
27
|
+
Requires-Dist: librosa>=0.10
|
|
28
|
+
Requires-Dist: soundfile>=0.12
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
# erm
|
|
34
|
+
|
|
35
|
+
Local CLI that strips disfluencies (`um`, `uh`, `er`, `erm`, `ah`, `hmm`, `mhm`,
|
|
36
|
+
`mm`, `uh-huh`, plus any-length elongations like `ummmm` / `uhhhhh`) from
|
|
37
|
+
recordings of English speech.
|
|
38
|
+
|
|
39
|
+
It uses [`faster-whisper`](https://github.com/SYSTRAN/faster-whisper) (running
|
|
40
|
+
the `medium.en` Whisper model by default — override with `--model`) for
|
|
41
|
+
word-level timestamps, three audio-domain detectors that catch fillers Whisper
|
|
42
|
+
hides, and ffmpeg for the cuts. Each splice is snapped to a local energy
|
|
43
|
+
minimum and zero-crossing, optionally crossfaded with a length that scales
|
|
44
|
+
with the cut size, and laid over a constant looped sample of the recording's
|
|
45
|
+
own room tone so the noise floor stays uniform across edits.
|
|
46
|
+
|
|
47
|
+
## Install
|
|
48
|
+
|
|
49
|
+
Requires Python 3.11+ and `ffmpeg` / `ffprobe` on `PATH`.
|
|
50
|
+
|
|
51
|
+
```sh
|
|
52
|
+
python3.13 -m venv .venv
|
|
53
|
+
source .venv/bin/activate
|
|
54
|
+
pip install -e '.[dev]'
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Usage
|
|
58
|
+
|
|
59
|
+
```sh
|
|
60
|
+
# Remove fillers; output and cut-list paths are auto-generated next to the input.
|
|
61
|
+
erm input.wav
|
|
62
|
+
|
|
63
|
+
# Specify output explicitly.
|
|
64
|
+
erm input.wav -o cleaned.wav
|
|
65
|
+
|
|
66
|
+
# Inspect what would be cut without rendering.
|
|
67
|
+
erm input.wav --dry-run
|
|
68
|
+
|
|
69
|
+
# Validate a rendered output against its source.
|
|
70
|
+
erm validate input.wav cleaned.wav --cuts cuts.json
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
When `-o` / `--json` are omitted, output paths are written next to the input as
|
|
74
|
+
`{stem}-cleaned-{YYYYMMDD-HHMMSS}.wav` and `{stem}-cuts-{YYYYMMDD-HHMMSS}.json`.
|
|
75
|
+
|
|
76
|
+
## How it works
|
|
77
|
+
|
|
78
|
+
1. **Transcribe.** `faster-whisper` runs with `word_timestamps=True` and a
|
|
79
|
+
verbatim-bias `initial_prompt` so it emits filler tokens instead of
|
|
80
|
+
silently cleaning them up.
|
|
81
|
+
2. **Detect.** Four passes produce candidate cut ranges:
|
|
82
|
+
- **Word-list match** — words whose normalized text is in `--fillers`,
|
|
83
|
+
including arbitrary-length elongations (e.g. `ummmm` matches the `um`
|
|
84
|
+
stem).
|
|
85
|
+
- **Gap fillers** — voiced regions in inter-word gaps longer than
|
|
86
|
+
`--gap-min-ms`. Catches fillers Whisper drops entirely.
|
|
87
|
+
- **Intra-word fillers** — long words whose interior splits across a
|
|
88
|
+
silence dip into multiple voiced runs. The non-vowel run whose duration
|
|
89
|
+
best matches the word's expected duration is treated as the real word;
|
|
90
|
+
siblings become cuts. Catches `"in, uhhhhh"` that Whisper rolls into one
|
|
91
|
+
`'in'` token.
|
|
92
|
+
- **Overlong words** — words much longer than `expected_max_word_duration`
|
|
93
|
+
for their text. The trailing portion is scanned for voiced runs.
|
|
94
|
+
Optionally pitch-confirmed (`--confirm-pitch`) by checking the cut
|
|
95
|
+
region looks like a sustained filler vowel (stable spectral centroid,
|
|
96
|
+
voiced ZCR), so we don't trim slow-but-real speech.
|
|
97
|
+
3. **Refine.** Each cut endpoint snaps to a local RMS-energy minimum within
|
|
98
|
+
±`--search-ms`, then to the nearest zero-crossing. Refinement is clamped
|
|
99
|
+
so it never crosses a neighboring word's timestamp.
|
|
100
|
+
4. **Merge.** Cuts whose surviving fragment would be shorter than
|
|
101
|
+
`--merge-gap-ms` are collapsed into one — a 40ms surviving fragment
|
|
102
|
+
between two cuts gets eaten by the surrounding crossfades and would
|
|
103
|
+
otherwise produce an audible blip.
|
|
104
|
+
5. **Render.** ffmpeg `atrim` + `acrossfade` renders the kept segments. Each
|
|
105
|
+
splice's crossfade length scales with that splice's cut size:
|
|
106
|
+
`clamp(min, cut_ms * factor, max)`. Crossfades are also clamped so they
|
|
107
|
+
never reach back across a real word boundary.
|
|
108
|
+
6. **Room tone (optional, on by default).** A quiet region of the *original*
|
|
109
|
+
recording is sampled and looped under the output at `--room-tone-level-db`.
|
|
110
|
+
This keeps the noise floor identical everywhere, masking the residual
|
|
111
|
+
noise-floor mismatch at each splice.
|
|
112
|
+
|
|
113
|
+
## Denoising
|
|
114
|
+
|
|
115
|
+
`--denoise` picks how ffmpeg's `afftdn` denoiser is used:
|
|
116
|
+
|
|
117
|
+
| Mode | Detection sees | ffmpeg cuts from | Notes |
|
|
118
|
+
|----------|----------------|--------------------------|-------|
|
|
119
|
+
| `none` | original | original | No denoising. |
|
|
120
|
+
| `pre` | denoised | denoised | Cleanest splices, but detection less sensitive (denoising flattens energy/pitch signals). |
|
|
121
|
+
| `post` | original | original; output denoised at end | Full detection sensitivity; splice noise-floor mismatch smoothed afterward. |
|
|
122
|
+
| `hybrid` (default) | original | denoised | Full detection sensitivity *and* clean splices. Recommended. |
|
|
123
|
+
|
|
124
|
+
Tune with `--denoise-nr` (reduction strength dB) and `--denoise-nf` (noise
|
|
125
|
+
floor dB).
|
|
126
|
+
|
|
127
|
+
## Flags
|
|
128
|
+
|
|
129
|
+
### Detection
|
|
130
|
+
|
|
131
|
+
| Flag | Default | Notes |
|
|
132
|
+
|------|---------|-------|
|
|
133
|
+
| `--model` | `medium.en` | Any faster-whisper model. `small.en` faster; `large-v3` more accurate. |
|
|
134
|
+
| `--fillers` | `ah,er,erm,hmm,mhm,mm,uh,uh-huh,um` | Comma-separated stems. Elongations matched dynamically. |
|
|
135
|
+
| `--detect-gaps` / `--no-detect-gaps` | on | Run gap + intra-word + overlong detectors. |
|
|
136
|
+
| `--gap-min-ms` | `350` | Minimum inter-word gap to scan for fillers. |
|
|
137
|
+
| `--gap-min-voiced-ms` / `--gap-max-voiced-ms` | `100` / `1500` | Voiced-run length bounds. |
|
|
138
|
+
| `--intraword-min-ms` | `550` | Minimum word length to scan internally. |
|
|
139
|
+
| `--confirm-pitch` / `--no-confirm-pitch` | on | Drop overlong/intra candidates that don't look like sustained filler vowels. |
|
|
140
|
+
|
|
141
|
+
### Cuts and splices
|
|
142
|
+
|
|
143
|
+
| Flag | Default | Notes |
|
|
144
|
+
|------|---------|-------|
|
|
145
|
+
| `--search-ms` | `60` | How far each endpoint may slide to find a local energy minimum. |
|
|
146
|
+
| `--crossfade-ms` | *(unset)* | Force a fixed crossfade length for every splice. When unset, per-splice scaling is used. |
|
|
147
|
+
| `--min-crossfade-ms` / `--max-crossfade-ms` | `50` / `120` | Floor and ceiling for the per-splice crossfade scaling. |
|
|
148
|
+
| `--crossfade-factor` | `0.15` | `cut_ms * factor`, clamped to `[min, max]`. Higher = smoother but blurrier. |
|
|
149
|
+
| `--merge-gap-ms` | `120` | Merge two cuts whose surviving fragment would be shorter than this. |
|
|
150
|
+
|
|
151
|
+
### Audio cleanup
|
|
152
|
+
|
|
153
|
+
| Flag | Default | Notes |
|
|
154
|
+
|------|---------|-------|
|
|
155
|
+
| `--denoise` | `hybrid` | `none` / `pre` / `post` / `hybrid` (see table above). |
|
|
156
|
+
| `--denoise-nr` | `12.0` | `afftdn` noise reduction (dB). |
|
|
157
|
+
| `--denoise-nf` | `-25.0` | `afftdn` noise floor (dB). |
|
|
158
|
+
| `--room-tone` / `--no-room-tone` | on | Loop a quiet sample of the original under the output. |
|
|
159
|
+
| `--room-tone-level-db` | `-12.0` | Attenuation applied to the looped tone. `-12` to `-20` is usually right. |
|
|
160
|
+
| `--room-tone-source` | `auto` | `auto` finds a quiet region; otherwise `START-END` in seconds (e.g. `0.05-1.4`). |
|
|
161
|
+
|
|
162
|
+
### Output
|
|
163
|
+
|
|
164
|
+
| Flag | Default | Notes |
|
|
165
|
+
|------|---------|-------|
|
|
166
|
+
| `-o`, `--output` | auto-named next to input | Output `.wav` path. |
|
|
167
|
+
| `--json PATH` | auto-named next to input | Cut list JSON. |
|
|
168
|
+
| `--dry-run` | off | Print the cut list and exit; no audio rendered. |
|
|
169
|
+
|
|
170
|
+
## `validate` subcommand
|
|
171
|
+
|
|
172
|
+
```sh
|
|
173
|
+
erm validate input.wav cleaned.wav --cuts cuts.json
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
Runs three deterministic checks:
|
|
177
|
+
|
|
178
|
+
- **Container sanity** — `ffprobe` reads the output without errors.
|
|
179
|
+
- **Duration math** — `output_duration ≈ input_duration - sum(cut lengths)`,
|
|
180
|
+
within 50ms.
|
|
181
|
+
- **No-filler invariant** — re-transcribe the output; assert no token in the
|
|
182
|
+
filler set survives.
|
|
183
|
+
|
|
184
|
+
Writes a JSON report to `--report PATH` (or auto-named next to the output)
|
|
185
|
+
and exits non-zero if any check fails.
|
|
186
|
+
|
|
187
|
+
## Tests
|
|
188
|
+
|
|
189
|
+
```sh
|
|
190
|
+
pytest
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
The pure helpers (`find_fillers`, `invert_to_keep_ranges`,
|
|
194
|
+
`refine_boundaries`, `merge_close_cuts`, `expected_max_word_duration`,
|
|
195
|
+
`_voiced_runs_in_region`, …) run without faster-whisper or librosa imported.
|
|
196
|
+
Heavy deps are imported lazily inside `transcribe`, `render`,
|
|
197
|
+
`load_audio_mono`, and `is_sustained_vowel`.
|
|
198
|
+
|
|
199
|
+
## Out of scope
|
|
200
|
+
|
|
201
|
+
- Removing `like`, `you know`, `I mean` — too risky for meaning.
|
|
202
|
+
- Languages other than English.
|
|
203
|
+
- Real-time / streaming.
|
erm-0.1.0/README.md
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
# erm
|
|
2
|
+
|
|
3
|
+
Local CLI that strips disfluencies (`um`, `uh`, `er`, `erm`, `ah`, `hmm`, `mhm`,
|
|
4
|
+
`mm`, `uh-huh`, plus any-length elongations like `ummmm` / `uhhhhh`) from
|
|
5
|
+
recordings of English speech.
|
|
6
|
+
|
|
7
|
+
It uses [`faster-whisper`](https://github.com/SYSTRAN/faster-whisper) (running
|
|
8
|
+
the `medium.en` Whisper model by default — override with `--model`) for
|
|
9
|
+
word-level timestamps, three audio-domain detectors that catch fillers Whisper
|
|
10
|
+
hides, and ffmpeg for the cuts. Each splice is snapped to a local energy
|
|
11
|
+
minimum and zero-crossing, optionally crossfaded with a length that scales
|
|
12
|
+
with the cut size, and laid over a constant looped sample of the recording's
|
|
13
|
+
own room tone so the noise floor stays uniform across edits.
|
|
14
|
+
|
|
15
|
+
## Install
|
|
16
|
+
|
|
17
|
+
Requires Python 3.11+ and `ffmpeg` / `ffprobe` on `PATH`.
|
|
18
|
+
|
|
19
|
+
```sh
|
|
20
|
+
python3.13 -m venv .venv
|
|
21
|
+
source .venv/bin/activate
|
|
22
|
+
pip install -e '.[dev]'
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Usage
|
|
26
|
+
|
|
27
|
+
```sh
|
|
28
|
+
# Remove fillers; output and cut-list paths are auto-generated next to the input.
|
|
29
|
+
erm input.wav
|
|
30
|
+
|
|
31
|
+
# Specify output explicitly.
|
|
32
|
+
erm input.wav -o cleaned.wav
|
|
33
|
+
|
|
34
|
+
# Inspect what would be cut without rendering.
|
|
35
|
+
erm input.wav --dry-run
|
|
36
|
+
|
|
37
|
+
# Validate a rendered output against its source.
|
|
38
|
+
erm validate input.wav cleaned.wav --cuts cuts.json
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
When `-o` / `--json` are omitted, output paths are written next to the input as
|
|
42
|
+
`{stem}-cleaned-{YYYYMMDD-HHMMSS}.wav` and `{stem}-cuts-{YYYYMMDD-HHMMSS}.json`.
|
|
43
|
+
|
|
44
|
+
## How it works
|
|
45
|
+
|
|
46
|
+
1. **Transcribe.** `faster-whisper` runs with `word_timestamps=True` and a
|
|
47
|
+
verbatim-bias `initial_prompt` so it emits filler tokens instead of
|
|
48
|
+
silently cleaning them up.
|
|
49
|
+
2. **Detect.** Four passes produce candidate cut ranges:
|
|
50
|
+
- **Word-list match** — words whose normalized text is in `--fillers`,
|
|
51
|
+
including arbitrary-length elongations (e.g. `ummmm` matches the `um`
|
|
52
|
+
stem).
|
|
53
|
+
- **Gap fillers** — voiced regions in inter-word gaps longer than
|
|
54
|
+
`--gap-min-ms`. Catches fillers Whisper drops entirely.
|
|
55
|
+
- **Intra-word fillers** — long words whose interior splits across a
|
|
56
|
+
silence dip into multiple voiced runs. The non-vowel run whose duration
|
|
57
|
+
best matches the word's expected duration is treated as the real word;
|
|
58
|
+
siblings become cuts. Catches `"in, uhhhhh"` that Whisper rolls into one
|
|
59
|
+
`'in'` token.
|
|
60
|
+
- **Overlong words** — words much longer than `expected_max_word_duration`
|
|
61
|
+
for their text. The trailing portion is scanned for voiced runs.
|
|
62
|
+
Optionally pitch-confirmed (`--confirm-pitch`) by checking the cut
|
|
63
|
+
region looks like a sustained filler vowel (stable spectral centroid,
|
|
64
|
+
voiced ZCR), so we don't trim slow-but-real speech.
|
|
65
|
+
3. **Refine.** Each cut endpoint snaps to a local RMS-energy minimum within
|
|
66
|
+
±`--search-ms`, then to the nearest zero-crossing. Refinement is clamped
|
|
67
|
+
so it never crosses a neighboring word's timestamp.
|
|
68
|
+
4. **Merge.** Cuts whose surviving fragment would be shorter than
|
|
69
|
+
`--merge-gap-ms` are collapsed into one — a 40ms surviving fragment
|
|
70
|
+
between two cuts gets eaten by the surrounding crossfades and would
|
|
71
|
+
otherwise produce an audible blip.
|
|
72
|
+
5. **Render.** ffmpeg `atrim` + `acrossfade` renders the kept segments. Each
|
|
73
|
+
splice's crossfade length scales with that splice's cut size:
|
|
74
|
+
`clamp(min, cut_ms * factor, max)`. Crossfades are also clamped so they
|
|
75
|
+
never reach back across a real word boundary.
|
|
76
|
+
6. **Room tone (optional, on by default).** A quiet region of the *original*
|
|
77
|
+
recording is sampled and looped under the output at `--room-tone-level-db`.
|
|
78
|
+
This keeps the noise floor identical everywhere, masking the residual
|
|
79
|
+
noise-floor mismatch at each splice.
|
|
80
|
+
|
|
81
|
+
## Denoising
|
|
82
|
+
|
|
83
|
+
`--denoise` picks how ffmpeg's `afftdn` denoiser is used:
|
|
84
|
+
|
|
85
|
+
| Mode | Detection sees | ffmpeg cuts from | Notes |
|
|
86
|
+
|----------|----------------|--------------------------|-------|
|
|
87
|
+
| `none` | original | original | No denoising. |
|
|
88
|
+
| `pre` | denoised | denoised | Cleanest splices, but detection less sensitive (denoising flattens energy/pitch signals). |
|
|
89
|
+
| `post` | original | original; output denoised at end | Full detection sensitivity; splice noise-floor mismatch smoothed afterward. |
|
|
90
|
+
| `hybrid` (default) | original | denoised | Full detection sensitivity *and* clean splices. Recommended. |
|
|
91
|
+
|
|
92
|
+
Tune with `--denoise-nr` (reduction strength dB) and `--denoise-nf` (noise
|
|
93
|
+
floor dB).
|
|
94
|
+
|
|
95
|
+
## Flags
|
|
96
|
+
|
|
97
|
+
### Detection
|
|
98
|
+
|
|
99
|
+
| Flag | Default | Notes |
|
|
100
|
+
|------|---------|-------|
|
|
101
|
+
| `--model` | `medium.en` | Any faster-whisper model. `small.en` faster; `large-v3` more accurate. |
|
|
102
|
+
| `--fillers` | `ah,er,erm,hmm,mhm,mm,uh,uh-huh,um` | Comma-separated stems. Elongations matched dynamically. |
|
|
103
|
+
| `--detect-gaps` / `--no-detect-gaps` | on | Run gap + intra-word + overlong detectors. |
|
|
104
|
+
| `--gap-min-ms` | `350` | Minimum inter-word gap to scan for fillers. |
|
|
105
|
+
| `--gap-min-voiced-ms` / `--gap-max-voiced-ms` | `100` / `1500` | Voiced-run length bounds. |
|
|
106
|
+
| `--intraword-min-ms` | `550` | Minimum word length to scan internally. |
|
|
107
|
+
| `--confirm-pitch` / `--no-confirm-pitch` | on | Drop overlong/intra candidates that don't look like sustained filler vowels. |
|
|
108
|
+
|
|
109
|
+
### Cuts and splices
|
|
110
|
+
|
|
111
|
+
| Flag | Default | Notes |
|
|
112
|
+
|------|---------|-------|
|
|
113
|
+
| `--search-ms` | `60` | How far each endpoint may slide to find a local energy minimum. |
|
|
114
|
+
| `--crossfade-ms` | *(unset)* | Force a fixed crossfade length for every splice. When unset, per-splice scaling is used. |
|
|
115
|
+
| `--min-crossfade-ms` / `--max-crossfade-ms` | `50` / `120` | Floor and ceiling for the per-splice crossfade scaling. |
|
|
116
|
+
| `--crossfade-factor` | `0.15` | `cut_ms * factor`, clamped to `[min, max]`. Higher = smoother but blurrier. |
|
|
117
|
+
| `--merge-gap-ms` | `120` | Merge two cuts whose surviving fragment would be shorter than this. |
|
|
118
|
+
|
|
119
|
+
### Audio cleanup
|
|
120
|
+
|
|
121
|
+
| Flag | Default | Notes |
|
|
122
|
+
|------|---------|-------|
|
|
123
|
+
| `--denoise` | `hybrid` | `none` / `pre` / `post` / `hybrid` (see table above). |
|
|
124
|
+
| `--denoise-nr` | `12.0` | `afftdn` noise reduction (dB). |
|
|
125
|
+
| `--denoise-nf` | `-25.0` | `afftdn` noise floor (dB). |
|
|
126
|
+
| `--room-tone` / `--no-room-tone` | on | Loop a quiet sample of the original under the output. |
|
|
127
|
+
| `--room-tone-level-db` | `-12.0` | Attenuation applied to the looped tone. `-12` to `-20` is usually right. |
|
|
128
|
+
| `--room-tone-source` | `auto` | `auto` finds a quiet region; otherwise `START-END` in seconds (e.g. `0.05-1.4`). |
|
|
129
|
+
|
|
130
|
+
### Output
|
|
131
|
+
|
|
132
|
+
| Flag | Default | Notes |
|
|
133
|
+
|------|---------|-------|
|
|
134
|
+
| `-o`, `--output` | auto-named next to input | Output `.wav` path. |
|
|
135
|
+
| `--json PATH` | auto-named next to input | Cut list JSON. |
|
|
136
|
+
| `--dry-run` | off | Print the cut list and exit; no audio rendered. |
|
|
137
|
+
|
|
138
|
+
## `validate` subcommand
|
|
139
|
+
|
|
140
|
+
```sh
|
|
141
|
+
erm validate input.wav cleaned.wav --cuts cuts.json
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
Runs three deterministic checks:
|
|
145
|
+
|
|
146
|
+
- **Container sanity** — `ffprobe` reads the output without errors.
|
|
147
|
+
- **Duration math** — `output_duration ≈ input_duration - sum(cut lengths)`,
|
|
148
|
+
within 50ms.
|
|
149
|
+
- **No-filler invariant** — re-transcribe the output; assert no token in the
|
|
150
|
+
filler set survives.
|
|
151
|
+
|
|
152
|
+
Writes a JSON report to `--report PATH` (or auto-named next to the output)
|
|
153
|
+
and exits non-zero if any check fails.
|
|
154
|
+
|
|
155
|
+
## Tests
|
|
156
|
+
|
|
157
|
+
```sh
|
|
158
|
+
pytest
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
The pure helpers (`find_fillers`, `invert_to_keep_ranges`,
|
|
162
|
+
`refine_boundaries`, `merge_close_cuts`, `expected_max_word_duration`,
|
|
163
|
+
`_voiced_runs_in_region`, …) run without faster-whisper or librosa imported.
|
|
164
|
+
Heavy deps are imported lazily inside `transcribe`, `render`,
|
|
165
|
+
`load_audio_mono`, and `is_sustained_vowel`.
|
|
166
|
+
|
|
167
|
+
## Out of scope
|
|
168
|
+
|
|
169
|
+
- Removing `like`, `you know`, `I mean` — too risky for meaning.
|
|
170
|
+
- Languages other than English.
|
|
171
|
+
- Real-time / streaming.
|
erm-0.1.0/pyproject.toml
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "erm"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Strip disfluencies (um, uh, er, ah, hmm) from spoken audio."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.11"
|
|
7
|
+
license = { text = "MIT" }
|
|
8
|
+
authors = [{ name = "Doug Calobrisi", email = "doug@calobrisi.com" }]
|
|
9
|
+
keywords = ["audio", "speech", "transcription", "filler-words", "disfluency", "whisper", "ffmpeg"]
|
|
10
|
+
classifiers = [
|
|
11
|
+
"Development Status :: 4 - Beta",
|
|
12
|
+
"Environment :: Console",
|
|
13
|
+
"Intended Audience :: End Users/Desktop",
|
|
14
|
+
"License :: OSI Approved :: MIT License",
|
|
15
|
+
"Operating System :: OS Independent",
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Programming Language :: Python :: 3.11",
|
|
18
|
+
"Programming Language :: Python :: 3.12",
|
|
19
|
+
"Programming Language :: Python :: 3.13",
|
|
20
|
+
"Topic :: Multimedia :: Sound/Audio :: Speech",
|
|
21
|
+
"Topic :: Multimedia :: Sound/Audio :: Editors",
|
|
22
|
+
]
|
|
23
|
+
dependencies = [
|
|
24
|
+
"faster-whisper>=1.0.3",
|
|
25
|
+
"numpy>=1.26",
|
|
26
|
+
"librosa>=0.10",
|
|
27
|
+
"soundfile>=0.12",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[project.optional-dependencies]
|
|
31
|
+
dev = ["pytest>=8.0"]
|
|
32
|
+
|
|
33
|
+
[project.urls]
|
|
34
|
+
Homepage = "https://github.com/dougcalobrisi/erm"
|
|
35
|
+
Repository = "https://github.com/dougcalobrisi/erm"
|
|
36
|
+
Issues = "https://github.com/dougcalobrisi/erm/issues"
|
|
37
|
+
|
|
38
|
+
[project.scripts]
|
|
39
|
+
erm = "erm.cli:main"
|
|
40
|
+
|
|
41
|
+
[build-system]
|
|
42
|
+
requires = ["setuptools>=68"]
|
|
43
|
+
build-backend = "setuptools.build_meta"
|
|
44
|
+
|
|
45
|
+
[tool.setuptools.packages.find]
|
|
46
|
+
where = ["src"]
|
|
47
|
+
|
|
48
|
+
[tool.pytest.ini_options]
|
|
49
|
+
markers = ["slow: tests that need the Whisper model (download required)"]
|
erm-0.1.0/setup.cfg
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""erm: strip disfluencies from spoken audio.
|
|
2
|
+
|
|
3
|
+
The pure-helper modules (`fillers`, `ranges`, `refine`, `envelope`, `models`)
|
|
4
|
+
depend only on numpy + stdlib so the unit tests can run without
|
|
5
|
+
faster-whisper or librosa installed. Heavy deps (`librosa`,
|
|
6
|
+
`faster_whisper`) are imported lazily inside the functions that need them.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from .acoustic import is_sustained_vowel
|
|
10
|
+
from .asr import VERBATIM_PROMPT, transcribe
|
|
11
|
+
from .audio import find_quiet_region, load_audio_mono
|
|
12
|
+
from .cli import main
|
|
13
|
+
from .detect import (
|
|
14
|
+
detect_gap_fillers,
|
|
15
|
+
detect_intraword_fillers,
|
|
16
|
+
detect_overlong_words,
|
|
17
|
+
expected_max_word_duration,
|
|
18
|
+
)
|
|
19
|
+
from .ffmpeg_ops import (
|
|
20
|
+
denoise_to,
|
|
21
|
+
extract_segment,
|
|
22
|
+
ffprobe_duration,
|
|
23
|
+
overlay_room_tone,
|
|
24
|
+
render,
|
|
25
|
+
)
|
|
26
|
+
from .fillers import DEFAULT_FILLERS, find_fillers, is_filler, normalize_word
|
|
27
|
+
from .models import Cut, Word
|
|
28
|
+
from .ranges import invert_to_keep_ranges, merge_close_cuts
|
|
29
|
+
from .refine import refine_boundaries
|
|
30
|
+
from .validate import validate_output
|
|
31
|
+
|
|
32
|
+
__all__ = [
|
|
33
|
+
"Cut",
|
|
34
|
+
"DEFAULT_FILLERS",
|
|
35
|
+
"VERBATIM_PROMPT",
|
|
36
|
+
"Word",
|
|
37
|
+
"denoise_to",
|
|
38
|
+
"detect_gap_fillers",
|
|
39
|
+
"detect_intraword_fillers",
|
|
40
|
+
"detect_overlong_words",
|
|
41
|
+
"expected_max_word_duration",
|
|
42
|
+
"extract_segment",
|
|
43
|
+
"ffprobe_duration",
|
|
44
|
+
"find_fillers",
|
|
45
|
+
"find_quiet_region",
|
|
46
|
+
"invert_to_keep_ranges",
|
|
47
|
+
"is_filler",
|
|
48
|
+
"is_sustained_vowel",
|
|
49
|
+
"load_audio_mono",
|
|
50
|
+
"main",
|
|
51
|
+
"merge_close_cuts",
|
|
52
|
+
"normalize_word",
|
|
53
|
+
"overlay_room_tone",
|
|
54
|
+
"refine_boundaries",
|
|
55
|
+
"render",
|
|
56
|
+
"transcribe",
|
|
57
|
+
"validate_output",
|
|
58
|
+
]
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Acoustic feature checks (librosa-based, lazy-imported)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def is_sustained_vowel(
    audio: np.ndarray,
    sr: int,
    start_s: float,
    end_s: float,
    max_centroid_cv: float = 0.18,
    min_voiced_frac: float = 0.50,
) -> bool:
    """Return True if [start_s, end_s] looks acoustically like a sustained
    filler vowel ("uhhh", "ahhh", "ummm").

    Filler vowels have two distinguishing features compared to real word
    content: (a) the spectral energy stays in roughly the same place across
    the region (low spectral-centroid variation), and (b) most frames are
    voiced (ZCR in the voiced range, not silence or fricative noise).

    Args:
        audio: Sample array. Arrays with more than one dimension are averaged
            along axis 1 before analysis.
        sr: Sample rate of `audio` in Hz.
        start_s: Region start in seconds (clamped to the start of `audio`).
        end_s: Region end in seconds (clamped to the end of `audio`).
        max_centroid_cv: Upper bound on the std/mean ratio of the spectral
            centroid; lower means more stable.
        min_voiced_frac: Minimum fraction of frames whose zero-crossing rate
            must fall in the voiced band (0.02 < ZCR < 0.20).

    Returns:
        True only when both the centroid-stability and voiced-fraction tests
        pass. Regions shorter than 60 ms, regions yielding fewer than three
        analysis frames, and regions with a near-zero mean centroid are
        always rejected.
    """
    if audio.ndim > 1:
        # NOTE(review): axis=1 averaging presumes a (samples, channels)
        # layout — confirm against callers.
        audio = audio.mean(axis=1)
    s = max(0, int(start_s * sr))
    e = min(audio.size, int(end_s * sr))
    seg = audio[s:e]
    # Too short (< 60 ms) to judge; an inverted range yields an empty slice
    # and is rejected here as well.
    if seg.size < int(0.06 * sr):
        return False

    # Deferred until after the cheap guard so trivially rejected regions do
    # not pay for (or require) the heavy librosa import.
    import librosa  # heavy; lazy

    n_fft = 1024
    hop = max(1, int(0.020 * sr))  # 20 ms hop between analysis frames
    if seg.size < n_fft:
        # Zero-pad so at least one full FFT window is available.
        seg = np.pad(seg, (0, n_fft - seg.size), mode="constant")

    centroid = librosa.feature.spectral_centroid(
        y=seg, sr=sr, n_fft=n_fft, hop_length=hop,
    )[0]
    if centroid.size < 3:
        # Not enough frames for a meaningful variation estimate.
        return False
    mean_c = float(centroid.mean())
    if mean_c <= 1e-6:
        # Avoid dividing by a (near-)zero mean centroid.
        return False
    cv = float(centroid.std() / mean_c)

    zcr = librosa.feature.zero_crossing_rate(
        y=seg, frame_length=n_fft, hop_length=hop,
    )[0]
    voiced = (zcr > 0.02) & (zcr < 0.20)
    voiced_frac = float(voiced.mean()) if voiced.size else 0.0

    return cv <= max_centroid_cv and voiced_frac >= min_voiced_frac
|
erm-0.1.0/src/erm/asr.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""faster-whisper transcription (lazy-imported)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from .models import Word
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Initial prompt handed to Whisper by `transcribe` when verbatim mode is on:
# a run of example fillers followed by an explicit instruction, meant to bias
# the model toward keeping disfluencies in its output.
VERBATIM_PROMPT = (
    "Um, uh, er, erm, ah, hmm. "
    "Like, you know, I mean, sort of. "
    "Verbatim transcription including all filler words and disfluencies."
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def transcribe(
    path: str | Path,
    model_name: str = "medium.en",
    verbatim: bool = True,
) -> tuple[list[Word], float]:
    """Run faster-whisper on `path` and collect word-level timestamps.

    Args:
        path: Audio file to transcribe.
        model_name: faster-whisper model identifier.
        verbatim: When True, `VERBATIM_PROMPT` is supplied as the
            `initial_prompt` to bias Whisper toward keeping disfluencies,
            which it normally cleans up silently.

    Returns:
        `(words, duration_seconds)`. Words lacking a start or end timestamp
        are dropped; word text is stripped of surrounding whitespace.
    """
    from faster_whisper import WhisperModel  # heavy; lazy

    prompt = VERBATIM_PROMPT if verbatim else None
    whisper = WhisperModel(model_name, device="auto", compute_type="auto")
    segments, info = whisper.transcribe(
        str(path),
        word_timestamps=True,
        initial_prompt=prompt,
        condition_on_previous_text=False,  # otherwise the prompt gets diluted
    )

    # Flatten every segment's words, skipping segments without word data and
    # words with incomplete timing.
    collected: list[Word] = [
        Word(text=token.word.strip(), start=float(token.start), end=float(token.end))
        for segment in segments
        for token in (segment.words or ())
        if token.start is not None and token.end is not None
    ]
    return collected, float(info.duration)
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Audio loading and quiet-region selection."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Sequence
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
|
|
10
|
+
from .models import Word
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def load_audio_mono(path: str | Path, target_sr: int = 16_000) -> tuple[np.ndarray, int]:
    """Read an audio file via librosa, downmixed to mono and resampled.

    Args:
        path: File to load (passed to `librosa.load` as a string).
        target_sr: Sample rate requested from librosa.

    Returns:
        `(samples, sample_rate)` with samples cast to float32 and the rate
        as a plain int.
    """
    import librosa  # heavy; lazy

    samples, rate = librosa.load(str(path), sr=target_sr, mono=True)
    mono32 = samples.astype(np.float32)
    return mono32, int(rate)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def find_quiet_region(
    audio: np.ndarray,
    sr: int,
    words: Sequence[Word],
    min_length_s: float = 0.4,
    max_length_s: float = 1.5,
    win_ms: float = 10.0,
) -> tuple[float, float] | None:
    """Find a speech-free stretch of audio suitable as a room-tone sample.

    We need a region with no speech and only background noise (HVAC, mic
    hiss, room tone). The gap *before the first transcribed word* is usually
    the cleanest source — it's pre-roll silence with no speaker activity.
    Falls back to the gap after the last word if the leading gap is too
    short. With no words at all, the whole recording is the candidate.

    No energy analysis is performed: a region is considered quiet purely
    because no transcribed word covers it. `audio` is used only to derive
    the total duration.

    Args:
        audio: Sample array; for arrays with more than one dimension the
            first axis is taken as the sample axis.
        sr: Sample rate of `audio` in Hz.
        words: Transcribed words (any order); only `.start` / `.end` are read.
        min_length_s: Shortest acceptable region, in seconds.
        max_length_s: Longest region to return, in seconds.
        win_ms: Currently unused; kept for interface compatibility.

    Returns:
        `(start_s, end_s)` of the chosen region, or None when neither gap is
        long enough.
    """
    # Only the duration matters, so skip averaging/copying the samples.
    n_samples = audio.shape[0] if audio.ndim > 1 else audio.size
    total = float(n_samples) / sr

    spoken = list(words)
    candidates: list[tuple[float, float]]
    if spoken:
        first_start = min(w.start for w in spoken)
        # Use the latest *end*, not the end of the last-starting word: a long
        # word may overlap a later-starting word that finishes earlier.
        last_end = max(w.end for w in spoken)
        candidates = [(0.0, first_start), (last_end, total)]
    else:
        candidates = [(0.0, total)]

    # Trim 50ms off each side to avoid clipping the start of speech
    # or the tail of the previous word's silence-pad.
    pad = 0.05
    for start_s, end_s in candidates:
        if end_s - start_s < min_length_s + 2 * pad:
            continue
        s = start_s + pad
        e = min(end_s - pad, s + max_length_s)
        if e - s >= min_length_s:
            return (s, e)
    return None
|