speech-detect 0.2.4 (speech_detect-0.2.4.tar.gz)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- speech_detect-0.2.4/LICENSE +21 -0
- speech_detect-0.2.4/MANIFEST.in +5 -0
- speech_detect-0.2.4/PKG-INFO +327 -0
- speech_detect-0.2.4/README.md +309 -0
- speech_detect-0.2.4/pyproject.toml +27 -0
- speech_detect-0.2.4/requirements.txt +10 -0
- speech_detect-0.2.4/setup.cfg +4 -0
- speech_detect-0.2.4/speech_detect.egg-info/PKG-INFO +327 -0
- speech_detect-0.2.4/speech_detect.egg-info/SOURCES.txt +16 -0
- speech_detect-0.2.4/speech_detect.egg-info/dependency_links.txt +1 -0
- speech_detect-0.2.4/speech_detect.egg-info/requires.txt +5 -0
- speech_detect-0.2.4/speech_detect.egg-info/top_level.txt +1 -0
- speech_detect-0.2.4/src/__init__.py +32 -0
- speech_detect-0.2.4/src/exceptions.py +69 -0
- speech_detect-0.2.4/src/rms_calculator.py +215 -0
- speech_detect-0.2.4/src/sd_types.py +42 -0
- speech_detect-0.2.4/src/speech_detect.py +332 -0
- speech_detect-0.2.4/src/vad_parser.py +138 -0

speech_detect-0.2.4/LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 [Speech2SRT Team](https://github.com/speech2srt)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

speech_detect-0.2.4/PKG-INFO
@@ -0,0 +1,327 @@
Metadata-Version: 2.4
Name: speech-detect
Version: 0.2.4
Summary: A Python library for detecting speech segments and non-speech gaps in audio/video files using FSMN-VAD-ONNX with streaming processing
License: MIT
Project-URL: Homepage, https://github.com/speech2srt/speech-detect
Project-URL: Repository, https://github.com/speech2srt/speech-detect
Project-URL: Issues, https://github.com/speech2srt/speech-detect/issues
Requires-Python: >=3.10
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: numpy>=1.26.4
Requires-Dist: funasr-onnx>=0.4.1
Requires-Dist: ffmpeg-audio>=0.3.0
Requires-Dist: jieba>=0.42.1
Requires-Dist: torch>=2.9.1
Dynamic: license-file

# speech-detect

A Python library for detecting speech segments, non-speech gaps, and RMS energy curves in audio/video files using FSMN-VAD-ONNX with streaming processing.

## Features

### Core Functionality

- **Speech segment detection**: Detect all speech segments in audio/video files with precise timestamps
- **Non-speech gap derivation**: Automatically compute non-speech gaps (silence periods) from the detected speech segments
- **RMS energy detection**: Compute an RMS (Root Mean Square) energy curve for audio analysis and visualization

### Advanced Features

- **Adjacent segment merging**: Merge adjacent speech segments whose gaps are smaller than a threshold (useful for handling brief pauses such as breathing or thinking pauses)

### Technical Capabilities

- **Streaming processing**: Process large audio/video files in chunks without loading everything into memory
- **Memory efficient**: Constant memory usage regardless of audio file duration
- **Format support**: Supports all audio/video formats that FFmpeg supports (MP3, WAV, FLAC, Opus, MP4, AVI, etc.)
- **Time range support**: Accepts start time and duration parameters for partial processing

## Installation

```bash
pip install speech-detect
```

**Note**: This package requires:

- FFmpeg installed on your system and available in PATH
- FSMN-VAD-ONNX model files (see Model Setup below)

## Model Setup

This package requires FSMN-VAD-ONNX model files. The model is available on Hugging Face:

**Model Repository**: [funasr/fsmn-vad-onnx](https://huggingface.co/funasr/fsmn-vad-onnx)

### Download the Model

1. Install Git LFS (required for downloading large model files):

```bash
git lfs install
```

2. Clone the model repository:

```bash
git clone https://huggingface.co/funasr/fsmn-vad-onnx
```

This downloads the model files, including `model_quant.onnx`, `config.yaml`, `am.mvn`, etc.

3. Set the `MODEL_FSMN_VAD_DIR` environment variable to point to the model directory:

```bash
export MODEL_FSMN_VAD_DIR=/path/to/fsmn-vad-onnx
```

Alternatively, you can specify the model directory when initializing `SpeechDetector`:

```python
from speech_detect import SpeechDetector

detector = SpeechDetector(model_dir="/path/to/fsmn-vad-onnx")
```

## Quick Start

### Detect Speech Segments, Gaps, and RMS Energy Curve

```python
from speech_detect import SpeechDetector

# Initialize detector (reads MODEL_FSMN_VAD_DIR from environment)
detector = SpeechDetector()

# Detect speech segments, non-speech gaps, and RMS energy curve in an audio file
speech_segments, gaps, rms_curve = detector.detect("audio.mp3")

# speech_segments is a list of dictionaries: [{"start": 0, "end": 500}, ...]
for segment in speech_segments:
    start_ms = segment["start"]
    end_ms = segment["end"]
    duration = end_ms - start_ms
    print(f"Speech segment: {start_ms}ms - {end_ms}ms (duration: {duration}ms)")

# gaps is a list of dictionaries: [{"start": 0, "end": 500}, ...]
for gap in gaps:
    start_ms = gap["start"]
    end_ms = gap["end"]
    duration = end_ms - start_ms
    print(f"Non-speech gap: {start_ms}ms - {end_ms}ms (duration: {duration}ms)")

# rms_curve is a list of dictionaries: [{"ms": 0, "value": 0.123}, ...]
for point in rms_curve:
    time_ms = point["ms"]
    rms_value = point["value"]
    print(f"RMS at {time_ms}ms: {rms_value}")
```

### Processing a Specific Time Range

```python
# Process only the first 30 seconds
speech_segments, gaps, rms_curve = detector.detect(
    file_path="audio.mp3",
    start_ms=0,
    duration_ms=30000,
)

# Process from 10 seconds, for a duration of 5 seconds
speech_segments, gaps, rms_curve = detector.detect(
    file_path="audio.mp3",
    start_ms=10000,
    duration_ms=5000,
)
```

### Custom Chunk Size

```python
# Use 1-minute chunks instead of the default 20-minute chunks
speech_segments, gaps, rms_curve = detector.detect(
    file_path="audio.mp3",
    chunk_duration_sec=60,
)
```

### Merging Adjacent Segments

```python
# Merge adjacent segments with gaps smaller than 300ms
# Useful for handling brief pauses in speech (breathing, thinking pauses)
speech_segments, gaps, rms_curve = detector.detect(
    file_path="audio.mp3",
    merge_gap_threshold_ms=300,
)
```
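
To make the merge semantics concrete, here is a minimal pure-Python sketch of gap-threshold merging. It illustrates the behavior described above; it is not the library's internal code.

```python
# Illustrative sketch of gap-threshold merging, not the library's code.
def merge_segments(segments: list[dict], threshold_ms: int) -> list[dict]:
    merged: list[dict] = []
    for seg in segments:
        # Fuse with the previous segment when the gap is below the threshold
        if merged and seg["start"] - merged[-1]["end"] < threshold_ms:
            merged[-1]["end"] = seg["end"]
        else:
            merged.append(dict(seg))
    return merged

# A 200ms breathing pause disappears; the 900ms gap survives
print(merge_segments(
    [{"start": 0, "end": 500}, {"start": 700, "end": 1200}, {"start": 2100, "end": 3000}],
    threshold_ms=300,
))  # [{'start': 0, 'end': 1200}, {'start': 2100, 'end': 3000}]
```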

### Custom RMS Energy Detection Parameters

```python
# Customize the RMS calculation window size and output interval
# frame_size_ms: convolution window size (default: 100ms)
# output_interval_ms: output sampling interval (default: 50ms)
speech_segments, gaps, rms_curve = detector.detect(
    file_path="audio.mp3",
    rms_frame_size_ms=100,      # 100ms window for RMS calculation
    rms_output_interval_ms=50,  # output every 50ms for higher resolution
)

# The RMS curve can be used for audio visualization, energy analysis,
# or as input for other audio processing tasks
```

### Environment Variables for RMS Configuration

You can configure default RMS parameters using environment variables:

```bash
# Set the default frame size (default: 100ms)
export RMS_FRAME_SIZE_MS=100

# Set the default output interval (default: 50ms)
export RMS_OUTPUT_INTERVAL_MS=50
```

These environment variables are used as defaults when the `rms_frame_size_ms` and `rms_output_interval_ms` parameters are not explicitly passed to the `detect()` method.
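
As a quick illustration of the resulting precedence (explicit argument over environment variable over built-in default), here is a sketch assuming a detector created as in Quick Start:

```python
import os
from speech_detect import SpeechDetector

detector = SpeechDetector()

# Assumption for this sketch: the environment variable is consulted at call time.
os.environ["RMS_FRAME_SIZE_MS"] = "200"

# No explicit argument: the environment default (200ms window) applies
_, _, coarse_curve = detector.detect("audio.mp3")

# An explicit argument wins over the environment variable: 80ms window
_, _, fine_curve = detector.detect("audio.mp3", rms_frame_size_ms=80)
```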

## RMS Energy Design Principles

`speech-detect` always emits an RMS energy curve together with VAD results. The feature is designed to stay lightweight while providing meaningful downstream insight:

- **Always-on streaming measurement**: RMS is computed chunk by chunk using a sliding, normalized window, so memory usage stays constant even for multi-hour recordings.
- **Decoupled smoothing vs. resolution**: `rms_frame_size_ms` controls the convolution window (how aggressively noise is smoothed), while `rms_output_interval_ms` controls the sampling density. Allowing the interval to be smaller than the window lets you plot dense, smooth curves without losing the smoothing benefits; see the sketch after this list.
- **Deterministic timeline**: Both parameters are expressed in milliseconds and map directly to timestamps in the returned `rms_curve`, making it trivial to align energy data with speech segments, captions, or UI waveforms.
- **Downstream flexibility**: The curve can drive silence gating, highlight low-energy pauses, or simply power visual meters. Because it is always returned, callers can adopt it opportunistically without extra processing steps.
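
The following is a minimal NumPy sketch of a windowed RMS sampled on an independent interval. The function name and the 16 kHz mono float input are assumptions for the example; it illustrates the window/interval decoupling, not the library's implementation.

```python
import numpy as np

def windowed_rms(samples: np.ndarray, sample_rate: int = 16000,
                 frame_size_ms: int = 100, output_interval_ms: int = 50) -> list[dict]:
    """Windowed RMS sampled on a fixed interval (illustrative sketch)."""
    window = int(sample_rate * frame_size_ms / 1000)
    step = int(sample_rate * output_interval_ms / 1000)
    points = []
    for start in range(0, max(len(samples) - window + 1, 1), step):
        frame = samples[start:start + window]
        value = float(np.sqrt(np.mean(frame ** 2)))  # root mean square of the window
        points.append({"ms": int(start * 1000 / sample_rate), "value": value})
    return points

# One second of a 440 Hz tone at amplitude 0.5: RMS hovers near 0.5 / sqrt(2) ~= 0.354
t = np.arange(16000) / 16000
print(windowed_rms(0.5 * np.sin(2 * np.pi * 440 * t))[:3])
```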

## API Reference

### SpeechDetector

Main class for speech detection. All methods are instance methods.

#### `SpeechDetector.__init__(model_dir=None)`

Initialize the speech detector.

**Parameters:**

- `model_dir` (str, optional): Path to the FSMN-VAD model directory. If None, reads from the `MODEL_FSMN_VAD_DIR` environment variable.

**Note:** The FSMN-VAD model is distributed only in a quantized version, so `quantize=True` is always used internally.

**Raises:**

- `VadModelNotFoundError`: If the model directory is not found or not set
- `VadModelInitializationError`: If model initialization fails

#### `SpeechDetector.detect(file_path, chunk_duration_sec=None, start_ms=None, duration_ms=None, merge_gap_threshold_ms=None, rms_frame_size_ms=None, rms_output_interval_ms=None)`

Detect speech segments, non-speech gaps, and the RMS energy curve in an audio/video file using streaming processing.

**Parameters:**

- `file_path` (str): Path to the audio/video file (supports all FFmpeg formats)
- `chunk_duration_sec` (int, optional): Duration of each chunk in seconds. Defaults to 1200 (20 minutes). Must be > 0 if provided.
- `start_ms` (int, optional): Start position in milliseconds. None means from the beginning of the file. If None but `duration_ms` is provided, defaults to 0.
- `duration_ms` (int, optional): Total duration to process in milliseconds. None means process until the end. If specified, processing stops when this duration is reached.
- `merge_gap_threshold_ms` (int, optional): Gap threshold in milliseconds. Adjacent speech segments with gaps smaller than this threshold are merged into a single segment. None (the default) disables merging. If <= 0, a warning is logged and merging is disabled. Useful for handling brief pauses in speech (e.g., breathing, thinking pauses) that should be treated as part of continuous speech.
- `rms_frame_size_ms` (int, optional): Convolution window size in milliseconds for the RMS calculation. Defaults to 100ms (can be overridden by the `RMS_FRAME_SIZE_MS` environment variable). If <= 0, a warning is logged and the default value (100ms) is used.
- `rms_output_interval_ms` (int, optional): Output sampling interval in milliseconds for the RMS curve. Defaults to 50ms (can be overridden by the `RMS_OUTPUT_INTERVAL_MS` environment variable). If <= 0, a warning is logged and the default value (50ms) is used. If > `rms_frame_size_ms`, it is adjusted down to `rms_frame_size_ms`.

**Returns:**

- `tuple[list[VadSegment], list[VadSegment], list[RMSPoint]]`: Tuple of (speech_segments, gaps, rms_curve)
  - `speech_segments`: List of speech segments, format: `[{"start": ms, "end": ms}, ...]`
    - Timestamps are relative to the audio start (from 0)
    - Unit: milliseconds
  - `gaps`: List of non-speech gaps, format: `[{"start": ms, "end": ms}, ...]`
    - Timestamps are relative to the audio start (from 0)
    - Unit: milliseconds
  - `rms_curve`: RMS energy curve data, always computed and returned, format: `[{"ms": int, "value": float}, ...]`
    - `ms`: Time position in milliseconds (relative to the audio start, from 0)
    - `value`: RMS energy value (float, typically in the range [0.0, 1.0])
    - Unit: milliseconds for time, dimensionless for value

**Raises:**

- `VadProcessingError`: If processing fails

## Data Types

### VadSegment

A TypedDict representing a time segment (either a speech segment or a non-speech gap).

**Fields:**

- `start` (int): Segment start time in milliseconds
- `end` (int): Segment end time in milliseconds

**Example:**

```python
segment: VadSegment = {"start": 100, "end": 500}
```

### RMSPoint

A TypedDict representing a point on the RMS energy curve.

**Fields:**

- `ms` (int): Time position in milliseconds (relative to the audio start, from 0)
- `value` (float): RMS energy value (typically in the range [0.0, 1.0])

**Example:**

```python
point: RMSPoint = {"ms": 100, "value": 0.123}
```
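
For reference, the two shapes can be written as TypedDict declarations like the following sketch; the package's actual definitions (presumably in `src/sd_types.py`) may differ in detail.

```python
# Sketch matching the documented fields; not copied from the package source.
from typing import TypedDict

class VadSegment(TypedDict):
    start: int  # segment start, milliseconds
    end: int    # segment end, milliseconds

class RMSPoint(TypedDict):
    ms: int       # time position, milliseconds from audio start
    value: float  # RMS energy, typically within [0.0, 1.0]
```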

## Exceptions

### `VadModelNotFoundError`

Raised when the VAD model directory is not found or not set.

**Attributes:**

- `message`: Human-readable error message

### `VadModelInitializationError`

Raised when VAD model initialization fails.

**Attributes:**

- `message`: Primary error message
- `model_dir`: Path to the model directory that caused the error

### `VadProcessingError`

Raised when VAD processing fails.

**Attributes:**

- `message`: Primary error message
- `file_path`: Path to the file being processed
- `details`: Additional error details dictionary
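
A typical handling pattern might look like this sketch. It assumes the exception classes are importable from the top-level `speech_detect` package (suggested by `src/__init__.py`, but not confirmed here).

```python
# Hedged sketch: assumes top-level re-exports of the exception classes.
from speech_detect import (
    SpeechDetector,
    VadModelNotFoundError,
    VadModelInitializationError,
    VadProcessingError,
)

try:
    detector = SpeechDetector()  # may raise the two model errors
    speech_segments, gaps, rms_curve = detector.detect("audio.mp3")
except VadModelNotFoundError as err:
    print(f"Model directory missing: {err.message}")
except VadModelInitializationError as err:
    print(f"Failed to initialize model at {err.model_dir}: {err.message}")
except VadProcessingError as err:
    print(f"Processing {err.file_path} failed: {err.message} ({err.details})")
```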

## Requirements

- Python >= 3.10
- FFmpeg (must be installed separately)
- numpy >= 1.26.4
- scipy >= 1.11.0 (for RMS energy calculation)
- funasr-onnx >= 0.4.1
- ffmpeg-audio >= 0.3.0
- jieba >= 0.42.1
- torch >= 2.9.1
- setuptools == 80.8.0 (to avoid a UserWarning from jieba about the deprecated pkg_resources API)

## License

MIT License

speech_detect-0.2.4/README.md
@@ -0,0 +1,309 @@
# speech-detect

A Python library for detecting speech segments, non-speech gaps, and RMS energy curves in audio/video files using FSMN-VAD-ONNX with streaming processing.

## Features

### Core Functionality

- **Speech segment detection**: Detect all speech segments in audio/video files with precise timestamps
- **Non-speech gap derivation**: Automatically compute non-speech gaps (silence periods) from the detected speech segments
- **RMS energy detection**: Compute an RMS (Root Mean Square) energy curve for audio analysis and visualization

### Advanced Features

- **Adjacent segment merging**: Merge adjacent speech segments whose gaps are smaller than a threshold (useful for handling brief pauses such as breathing or thinking pauses)

### Technical Capabilities

- **Streaming processing**: Process large audio/video files in chunks without loading everything into memory
- **Memory efficient**: Constant memory usage regardless of audio file duration
- **Format support**: Supports all audio/video formats that FFmpeg supports (MP3, WAV, FLAC, Opus, MP4, AVI, etc.)
- **Time range support**: Accepts start time and duration parameters for partial processing

## Installation

```bash
pip install speech-detect
```

**Note**: This package requires:

- FFmpeg installed on your system and available in PATH
- FSMN-VAD-ONNX model files (see Model Setup below)

## Model Setup

This package requires FSMN-VAD-ONNX model files. The model is available on Hugging Face:

**Model Repository**: [funasr/fsmn-vad-onnx](https://huggingface.co/funasr/fsmn-vad-onnx)

### Download the Model

1. Install Git LFS (required for downloading large model files):

```bash
git lfs install
```

2. Clone the model repository:

```bash
git clone https://huggingface.co/funasr/fsmn-vad-onnx
```

This downloads the model files, including `model_quant.onnx`, `config.yaml`, `am.mvn`, etc.

3. Set the `MODEL_FSMN_VAD_DIR` environment variable to point to the model directory:

```bash
export MODEL_FSMN_VAD_DIR=/path/to/fsmn-vad-onnx
```

Alternatively, you can specify the model directory when initializing `SpeechDetector`:

```python
from speech_detect import SpeechDetector

detector = SpeechDetector(model_dir="/path/to/fsmn-vad-onnx")
```

## Quick Start

### Detect Speech Segments, Gaps, and RMS Energy Curve

```python
from speech_detect import SpeechDetector

# Initialize detector (reads MODEL_FSMN_VAD_DIR from environment)
detector = SpeechDetector()

# Detect speech segments, non-speech gaps, and RMS energy curve in an audio file
speech_segments, gaps, rms_curve = detector.detect("audio.mp3")

# speech_segments is a list of dictionaries: [{"start": 0, "end": 500}, ...]
for segment in speech_segments:
    start_ms = segment["start"]
    end_ms = segment["end"]
    duration = end_ms - start_ms
    print(f"Speech segment: {start_ms}ms - {end_ms}ms (duration: {duration}ms)")

# gaps is a list of dictionaries: [{"start": 0, "end": 500}, ...]
for gap in gaps:
    start_ms = gap["start"]
    end_ms = gap["end"]
    duration = end_ms - start_ms
    print(f"Non-speech gap: {start_ms}ms - {end_ms}ms (duration: {duration}ms)")

# rms_curve is a list of dictionaries: [{"ms": 0, "value": 0.123}, ...]
for point in rms_curve:
    time_ms = point["ms"]
    rms_value = point["value"]
    print(f"RMS at {time_ms}ms: {rms_value}")
```

### Processing a Specific Time Range

```python
# Process only the first 30 seconds
speech_segments, gaps, rms_curve = detector.detect(
    file_path="audio.mp3",
    start_ms=0,
    duration_ms=30000,
)

# Process from 10 seconds, for a duration of 5 seconds
speech_segments, gaps, rms_curve = detector.detect(
    file_path="audio.mp3",
    start_ms=10000,
    duration_ms=5000,
)
```

### Custom Chunk Size

```python
# Use 1-minute chunks instead of the default 20-minute chunks
speech_segments, gaps, rms_curve = detector.detect(
    file_path="audio.mp3",
    chunk_duration_sec=60,
)
```

### Merging Adjacent Segments

```python
# Merge adjacent segments with gaps smaller than 300ms
# Useful for handling brief pauses in speech (breathing, thinking pauses)
speech_segments, gaps, rms_curve = detector.detect(
    file_path="audio.mp3",
    merge_gap_threshold_ms=300,
)
```
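
To make the merge semantics concrete, here is a minimal pure-Python sketch of gap-threshold merging. It illustrates the behavior described above; it is not the library's internal code.

```python
# Illustrative sketch of gap-threshold merging, not the library's code.
def merge_segments(segments: list[dict], threshold_ms: int) -> list[dict]:
    merged: list[dict] = []
    for seg in segments:
        # Fuse with the previous segment when the gap is below the threshold
        if merged and seg["start"] - merged[-1]["end"] < threshold_ms:
            merged[-1]["end"] = seg["end"]
        else:
            merged.append(dict(seg))
    return merged

# A 200ms breathing pause disappears; the 900ms gap survives
print(merge_segments(
    [{"start": 0, "end": 500}, {"start": 700, "end": 1200}, {"start": 2100, "end": 3000}],
    threshold_ms=300,
))  # [{'start': 0, 'end': 1200}, {'start': 2100, 'end': 3000}]
```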

### Custom RMS Energy Detection Parameters

```python
# Customize the RMS calculation window size and output interval
# frame_size_ms: convolution window size (default: 100ms)
# output_interval_ms: output sampling interval (default: 50ms)
speech_segments, gaps, rms_curve = detector.detect(
    file_path="audio.mp3",
    rms_frame_size_ms=100,      # 100ms window for RMS calculation
    rms_output_interval_ms=50,  # output every 50ms for higher resolution
)

# The RMS curve can be used for audio visualization, energy analysis,
# or as input for other audio processing tasks
```

### Environment Variables for RMS Configuration

You can configure default RMS parameters using environment variables:

```bash
# Set the default frame size (default: 100ms)
export RMS_FRAME_SIZE_MS=100

# Set the default output interval (default: 50ms)
export RMS_OUTPUT_INTERVAL_MS=50
```

These environment variables are used as defaults when the `rms_frame_size_ms` and `rms_output_interval_ms` parameters are not explicitly passed to the `detect()` method.
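
As a quick illustration of the resulting precedence (explicit argument over environment variable over built-in default), here is a sketch assuming a detector created as in Quick Start:

```python
import os
from speech_detect import SpeechDetector

detector = SpeechDetector()

# Assumption for this sketch: the environment variable is consulted at call time.
os.environ["RMS_FRAME_SIZE_MS"] = "200"

# No explicit argument: the environment default (200ms window) applies
_, _, coarse_curve = detector.detect("audio.mp3")

# An explicit argument wins over the environment variable: 80ms window
_, _, fine_curve = detector.detect("audio.mp3", rms_frame_size_ms=80)
```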

## RMS Energy Design Principles

`speech-detect` always emits an RMS energy curve together with VAD results. The feature is designed to stay lightweight while providing meaningful downstream insight:

- **Always-on streaming measurement**: RMS is computed chunk by chunk using a sliding, normalized window, so memory usage stays constant even for multi-hour recordings.
- **Decoupled smoothing vs. resolution**: `rms_frame_size_ms` controls the convolution window (how aggressively noise is smoothed), while `rms_output_interval_ms` controls the sampling density. Allowing the interval to be smaller than the window lets you plot dense, smooth curves without losing the smoothing benefits; see the sketch after this list.
- **Deterministic timeline**: Both parameters are expressed in milliseconds and map directly to timestamps in the returned `rms_curve`, making it trivial to align energy data with speech segments, captions, or UI waveforms.
- **Downstream flexibility**: The curve can drive silence gating, highlight low-energy pauses, or simply power visual meters. Because it is always returned, callers can adopt it opportunistically without extra processing steps.
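
The following is a minimal NumPy sketch of a windowed RMS sampled on an independent interval. The function name and the 16 kHz mono float input are assumptions for the example; it illustrates the window/interval decoupling, not the library's implementation.

```python
import numpy as np

def windowed_rms(samples: np.ndarray, sample_rate: int = 16000,
                 frame_size_ms: int = 100, output_interval_ms: int = 50) -> list[dict]:
    """Windowed RMS sampled on a fixed interval (illustrative sketch)."""
    window = int(sample_rate * frame_size_ms / 1000)
    step = int(sample_rate * output_interval_ms / 1000)
    points = []
    for start in range(0, max(len(samples) - window + 1, 1), step):
        frame = samples[start:start + window]
        value = float(np.sqrt(np.mean(frame ** 2)))  # root mean square of the window
        points.append({"ms": int(start * 1000 / sample_rate), "value": value})
    return points

# One second of a 440 Hz tone at amplitude 0.5: RMS hovers near 0.5 / sqrt(2) ~= 0.354
t = np.arange(16000) / 16000
print(windowed_rms(0.5 * np.sin(2 * np.pi * 440 * t))[:3])
```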

## API Reference

### SpeechDetector

Main class for speech detection. All methods are instance methods.

#### `SpeechDetector.__init__(model_dir=None)`

Initialize the speech detector.

**Parameters:**

- `model_dir` (str, optional): Path to the FSMN-VAD model directory. If None, reads from the `MODEL_FSMN_VAD_DIR` environment variable.

**Note:** The FSMN-VAD model is distributed only in a quantized version, so `quantize=True` is always used internally.

**Raises:**

- `VadModelNotFoundError`: If the model directory is not found or not set
- `VadModelInitializationError`: If model initialization fails

#### `SpeechDetector.detect(file_path, chunk_duration_sec=None, start_ms=None, duration_ms=None, merge_gap_threshold_ms=None, rms_frame_size_ms=None, rms_output_interval_ms=None)`

Detect speech segments, non-speech gaps, and the RMS energy curve in an audio/video file using streaming processing.

**Parameters:**

- `file_path` (str): Path to the audio/video file (supports all FFmpeg formats)
- `chunk_duration_sec` (int, optional): Duration of each chunk in seconds. Defaults to 1200 (20 minutes). Must be > 0 if provided.
- `start_ms` (int, optional): Start position in milliseconds. None means from the beginning of the file. If None but `duration_ms` is provided, defaults to 0.
- `duration_ms` (int, optional): Total duration to process in milliseconds. None means process until the end. If specified, processing stops when this duration is reached.
- `merge_gap_threshold_ms` (int, optional): Gap threshold in milliseconds. Adjacent speech segments with gaps smaller than this threshold are merged into a single segment. None (the default) disables merging. If <= 0, a warning is logged and merging is disabled. Useful for handling brief pauses in speech (e.g., breathing, thinking pauses) that should be treated as part of continuous speech.
- `rms_frame_size_ms` (int, optional): Convolution window size in milliseconds for the RMS calculation. Defaults to 100ms (can be overridden by the `RMS_FRAME_SIZE_MS` environment variable). If <= 0, a warning is logged and the default value (100ms) is used.
- `rms_output_interval_ms` (int, optional): Output sampling interval in milliseconds for the RMS curve. Defaults to 50ms (can be overridden by the `RMS_OUTPUT_INTERVAL_MS` environment variable). If <= 0, a warning is logged and the default value (50ms) is used. If > `rms_frame_size_ms`, it is adjusted down to `rms_frame_size_ms`.

**Returns:**

- `tuple[list[VadSegment], list[VadSegment], list[RMSPoint]]`: Tuple of (speech_segments, gaps, rms_curve)
  - `speech_segments`: List of speech segments, format: `[{"start": ms, "end": ms}, ...]`
    - Timestamps are relative to the audio start (from 0)
    - Unit: milliseconds
  - `gaps`: List of non-speech gaps, format: `[{"start": ms, "end": ms}, ...]`
    - Timestamps are relative to the audio start (from 0)
    - Unit: milliseconds
  - `rms_curve`: RMS energy curve data, always computed and returned, format: `[{"ms": int, "value": float}, ...]`
    - `ms`: Time position in milliseconds (relative to the audio start, from 0)
    - `value`: RMS energy value (float, typically in the range [0.0, 1.0])
    - Unit: milliseconds for time, dimensionless for value

**Raises:**

- `VadProcessingError`: If processing fails

## Data Types

### VadSegment

A TypedDict representing a time segment (either a speech segment or a non-speech gap).

**Fields:**

- `start` (int): Segment start time in milliseconds
- `end` (int): Segment end time in milliseconds

**Example:**

```python
segment: VadSegment = {"start": 100, "end": 500}
```

### RMSPoint

A TypedDict representing a point on the RMS energy curve.

**Fields:**

- `ms` (int): Time position in milliseconds (relative to the audio start, from 0)
- `value` (float): RMS energy value (typically in the range [0.0, 1.0])

**Example:**

```python
point: RMSPoint = {"ms": 100, "value": 0.123}
```
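
For reference, the two shapes can be written as TypedDict declarations like the following sketch; the package's actual definitions (presumably in `src/sd_types.py`) may differ in detail.

```python
# Sketch matching the documented fields; not copied from the package source.
from typing import TypedDict

class VadSegment(TypedDict):
    start: int  # segment start, milliseconds
    end: int    # segment end, milliseconds

class RMSPoint(TypedDict):
    ms: int       # time position, milliseconds from audio start
    value: float  # RMS energy, typically within [0.0, 1.0]
```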

## Exceptions

### `VadModelNotFoundError`

Raised when the VAD model directory is not found or not set.

**Attributes:**

- `message`: Human-readable error message

### `VadModelInitializationError`

Raised when VAD model initialization fails.

**Attributes:**

- `message`: Primary error message
- `model_dir`: Path to the model directory that caused the error

### `VadProcessingError`

Raised when VAD processing fails.

**Attributes:**

- `message`: Primary error message
- `file_path`: Path to the file being processed
- `details`: Additional error details dictionary
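
A typical handling pattern might look like this sketch. It assumes the exception classes are importable from the top-level `speech_detect` package (suggested by `src/__init__.py`, but not confirmed here).

```python
# Hedged sketch: assumes top-level re-exports of the exception classes.
from speech_detect import (
    SpeechDetector,
    VadModelNotFoundError,
    VadModelInitializationError,
    VadProcessingError,
)

try:
    detector = SpeechDetector()  # may raise the two model errors
    speech_segments, gaps, rms_curve = detector.detect("audio.mp3")
except VadModelNotFoundError as err:
    print(f"Model directory missing: {err.message}")
except VadModelInitializationError as err:
    print(f"Failed to initialize model at {err.model_dir}: {err.message}")
except VadProcessingError as err:
    print(f"Processing {err.file_path} failed: {err.message} ({err.details})")
```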

## Requirements

- Python >= 3.10
- FFmpeg (must be installed separately)
- numpy >= 1.26.4
- scipy >= 1.11.0 (for RMS energy calculation)
- funasr-onnx >= 0.4.1
- ffmpeg-audio >= 0.3.0
- jieba >= 0.42.1
- torch >= 2.9.1
- setuptools == 80.8.0 (to avoid a UserWarning from jieba about the deprecated pkg_resources API)

## License

MIT License

speech_detect-0.2.4/pyproject.toml
@@ -0,0 +1,27 @@
[build-system]
requires = ["setuptools==80.8.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "speech-detect"
version = "0.2.4"
description = "A Python library for detecting speech segments and non-speech gaps in audio/video files using FSMN-VAD-ONNX with streaming processing"
readme = "README.md"
requires-python = ">=3.10"
license = {text = "MIT"}
dependencies = [
    "numpy>=1.26.4",
    "funasr-onnx>=0.4.1",
    "ffmpeg-audio>=0.3.0",
    "jieba>=0.42.1",
    "torch>=2.9.1",
]

[project.urls]
Homepage = "https://github.com/speech2srt/speech-detect"
Repository = "https://github.com/speech2srt/speech-detect"
Issues = "https://github.com/speech2srt/speech-detect/issues"

[tool.setuptools]
package-dir = {"speech_detect" = "src"}
packages = ["speech_detect"]