speech-detect 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- speech_detect-0.1.0/LICENSE +21 -0
- speech_detect-0.1.0/MANIFEST.in +5 -0
- speech_detect-0.1.0/PKG-INFO +234 -0
- speech_detect-0.1.0/README.md +218 -0
- speech_detect-0.1.0/pyproject.toml +25 -0
- speech_detect-0.1.0/requirements.txt +3 -0
- speech_detect-0.1.0/setup.cfg +4 -0
- speech_detect-0.1.0/speech_detect.egg-info/PKG-INFO +234 -0
- speech_detect-0.1.0/speech_detect.egg-info/SOURCES.txt +15 -0
- speech_detect-0.1.0/speech_detect.egg-info/dependency_links.txt +1 -0
- speech_detect-0.1.0/speech_detect.egg-info/requires.txt +3 -0
- speech_detect-0.1.0/speech_detect.egg-info/top_level.txt +1 -0
- speech_detect-0.1.0/src/__init__.py +31 -0
- speech_detect-0.1.0/src/exceptions.py +69 -0
- speech_detect-0.1.0/src/speech_detect.py +200 -0
- speech_detect-0.1.0/src/vad_parser.py +138 -0
- speech_detect-0.1.0/src/vad_types.py +27 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 [Speech2SRT Team](https://github.com/speech2srt)
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: speech-detect
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A Python library for detecting speech segments and non-speech gaps in audio/video files using FSMN-VAD-ONNX with streaming processing
|
|
5
|
+
License: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/speech2srt/speech-detect
|
|
7
|
+
Project-URL: Repository, https://github.com/speech2srt/speech-detect
|
|
8
|
+
Project-URL: Issues, https://github.com/speech2srt/speech-detect/issues
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Requires-Dist: numpy>=1.26.4
|
|
13
|
+
Requires-Dist: funasr-onnx>=0.4.1
|
|
14
|
+
Requires-Dist: ffmpeg-audio>=0.1.3
|
|
15
|
+
Dynamic: license-file
|
|
16
|
+
|
|
17
|
+
# speech-detect
|
|
18
|
+
|
|
19
|
+
A Python library for detecting speech segments and non-speech gaps in audio/video files using FSMN-VAD-ONNX with streaming processing.
|
|
20
|
+
|
|
21
|
+
## Features
|
|
22
|
+
|
|
23
|
+
- **Streaming VAD detection**: Process large audio/video files in chunks without loading everything into memory
|
|
24
|
+
- **Speech segment detection**: Detect all speech segments in audio/video files
|
|
25
|
+
- **Non-speech gap derivation**: Compute non-speech gaps from speech segments
|
|
26
|
+
- **Format support**: Supports all audio/video formats that FFmpeg supports (MP3, WAV, FLAC, Opus, MP4, etc.)
|
|
27
|
+
- **Time range support**: Support start time and duration parameters for partial processing
|
|
28
|
+
- **Memory efficient**: Constant memory usage regardless of audio file duration
|
|
29
|
+
|
|
30
|
+
## Installation
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pip install speech-detect
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
**Note**: This package requires:
|
|
37
|
+
|
|
38
|
+
- FFmpeg to be installed on your system and available in PATH
|
|
39
|
+
- FSMN-VAD-ONNX model files (see Model Setup below)
|
|
40
|
+
|
|
41
|
+
## Model Setup
|
|
42
|
+
|
|
43
|
+
This package requires FSMN-VAD-ONNX model files. The model is available on Hugging Face:
|
|
44
|
+
|
|
45
|
+
**Model Repository**: [funasr/fsmn-vad-onnx](https://huggingface.co/funasr/fsmn-vad-onnx)
|
|
46
|
+
|
|
47
|
+
### Download the Model
|
|
48
|
+
|
|
49
|
+
1. Install Git LFS (required for downloading large model files):
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
git lfs install
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
2. Clone the model repository:
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
git clone https://huggingface.co/funasr/fsmn-vad-onnx
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
This will download the model files including `model_quant.onnx`, `config.yaml`, `am.mvn`, etc.
|
|
62
|
+
|
|
63
|
+
3. Set the `MODEL_FSMN_VAD_DIR` environment variable to point to the model directory:
|
|
64
|
+
```bash
|
|
65
|
+
export MODEL_FSMN_VAD_DIR=/path/to/fsmn-vad-onnx
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Alternatively, you can specify the model directory when initializing `SpeechDetector`:
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
from speech_detect import SpeechDetector
|
|
72
|
+
|
|
73
|
+
detector = SpeechDetector(model_dir="/path/to/fsmn-vad-onnx")
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Quick Start
|
|
77
|
+
|
|
78
|
+
### Detect Speech Segments and Gaps
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from speech_detect import SpeechDetector
|
|
82
|
+
|
|
83
|
+
# Initialize detector (reads MODEL_FSMN_VAD_DIR from environment)
|
|
84
|
+
detector = SpeechDetector()
|
|
85
|
+
|
|
86
|
+
# Detect speech segments and non-speech gaps in an audio file
|
|
87
|
+
speech_segments, gaps = detector.detect("audio.mp3")
|
|
88
|
+
|
|
89
|
+
# speech_segments is a list of dictionaries: [{"start": 0, "end": 500}, ...]
|
|
90
|
+
for segment in speech_segments:
|
|
91
|
+
start_ms = segment["start"]
|
|
92
|
+
end_ms = segment["end"]
|
|
93
|
+
duration = end_ms - start_ms
|
|
94
|
+
print(f"Speech segment: {start_ms}ms - {end_ms}ms (duration: {duration}ms)")
|
|
95
|
+
|
|
96
|
+
# gaps is a list of dictionaries: [{"start": 0, "end": 500}, ...]
|
|
97
|
+
for gap in gaps:
|
|
98
|
+
start_ms = gap["start"]
|
|
99
|
+
end_ms = gap["end"]
|
|
100
|
+
duration = end_ms - start_ms
|
|
101
|
+
print(f"Non-speech gap: {start_ms}ms - {end_ms}ms (duration: {duration}ms)")
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### Processing Specific Time Range
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
# Process only the first 30 seconds
|
|
108
|
+
speech_segments, gaps = detector.detect(
|
|
109
|
+
file_path="audio.mp3",
|
|
110
|
+
start_ms=0,
|
|
111
|
+
duration_ms=30000,
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
# Process from 10 seconds, duration 5 seconds
|
|
115
|
+
speech_segments, gaps = detector.detect(
|
|
116
|
+
file_path="audio.mp3",
|
|
117
|
+
start_ms=10000,
|
|
118
|
+
duration_ms=5000,
|
|
119
|
+
)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Custom Chunk Size
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
# Use 1-minute chunks instead of default 20-minute chunks
|
|
126
|
+
speech_segments, gaps = detector.detect(
|
|
127
|
+
file_path="audio.mp3",
|
|
128
|
+
chunk_duration_sec=60,
|
|
129
|
+
)
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## API Reference
|
|
133
|
+
|
|
134
|
+
### SpeechDetector
|
|
135
|
+
|
|
136
|
+
Main class for speech detection. All methods are instance methods.
|
|
137
|
+
|
|
138
|
+
#### `SpeechDetector.__init__(model_dir=None)`
|
|
139
|
+
|
|
140
|
+
Initialize speech detector.
|
|
141
|
+
|
|
142
|
+
**Parameters:**
|
|
143
|
+
|
|
144
|
+
- `model_dir` (str, optional): Path to the FSMN-VAD model directory. If None, reads from `MODEL_FSMN_VAD_DIR` environment variable.
|
|
145
|
+
|
|
146
|
+
**Note:** The FSMN-VAD model only has a quantized version, so `quantize=True` is always used internally.
|
|
147
|
+
|
|
148
|
+
**Raises:**
|
|
149
|
+
|
|
150
|
+
- `VadModelNotFoundError`: If model directory is not found or not set
|
|
151
|
+
- `VadModelInitializationError`: If model initialization fails
|
|
152
|
+
|
|
153
|
+
#### `SpeechDetector.detect(file_path, chunk_duration_sec=None, start_ms=None, duration_ms=None)`
|
|
154
|
+
|
|
155
|
+
Detect speech segments in audio/video file using streaming processing.
|
|
156
|
+
|
|
157
|
+
**Parameters:**
|
|
158
|
+
|
|
159
|
+
- `file_path` (str): Path to the audio/video file (supports all FFmpeg formats)
|
|
160
|
+
- `chunk_duration_sec` (int, optional): Duration of each chunk in seconds. Defaults to 1200 (20 minutes). Must be > 0 if provided.
|
|
161
|
+
- `start_ms` (int, optional): Start position in milliseconds. None means from file beginning. If None but `duration_ms` is provided, defaults to 0.
|
|
162
|
+
- `duration_ms` (int, optional): Total duration to process in milliseconds. None means process until end. If specified, processing stops when this duration is reached.
|
|
163
|
+
|
|
164
|
+
**Returns:**
|
|
165
|
+
|
|
166
|
+
- `tuple[list[VadSegment], list[VadSegment]]`: Tuple of (speech_segments, gaps)
|
|
167
|
+
- `speech_segments`: List of speech segments, format: `[{"start": ms, "end": ms}, ...]`
|
|
168
|
+
- Timestamps are relative to audio start (from 0)
|
|
169
|
+
- Unit: milliseconds
|
|
170
|
+
- `gaps`: List of non-speech gaps, format: `[{"start": ms, "end": ms}, ...]`
|
|
171
|
+
- Timestamps are relative to audio start (from 0)
|
|
172
|
+
- Unit: milliseconds
|
|
173
|
+
|
|
174
|
+
**Raises:**
|
|
175
|
+
|
|
176
|
+
- `VadProcessingError`: If processing fails
|
|
177
|
+
|
|
178
|
+
## Data Types
|
|
179
|
+
|
|
180
|
+
### VadSegment
|
|
181
|
+
|
|
182
|
+
A TypedDict representing a time segment (can be a speech segment or a non-speech gap).
|
|
183
|
+
|
|
184
|
+
**Fields:**
|
|
185
|
+
|
|
186
|
+
- `start` (int): Segment start time in milliseconds
|
|
187
|
+
- `end` (int): Segment end time in milliseconds
|
|
188
|
+
|
|
189
|
+
**Example:**
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
segment: VadSegment = {"start": 100, "end": 500}
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
## Exceptions
|
|
196
|
+
|
|
197
|
+
### `VadModelNotFoundError`
|
|
198
|
+
|
|
199
|
+
Raised when VAD model directory is not found or not set.
|
|
200
|
+
|
|
201
|
+
**Attributes:**
|
|
202
|
+
|
|
203
|
+
- `message`: Human-readable error message
|
|
204
|
+
|
|
205
|
+
### `VadModelInitializationError`
|
|
206
|
+
|
|
207
|
+
Raised when VAD model initialization fails.
|
|
208
|
+
|
|
209
|
+
**Attributes:**
|
|
210
|
+
|
|
211
|
+
- `message`: Primary error message
|
|
212
|
+
- `model_dir`: Path to the model directory that caused the error
|
|
213
|
+
|
|
214
|
+
### `VadProcessingError`
|
|
215
|
+
|
|
216
|
+
Raised when VAD processing fails.
|
|
217
|
+
|
|
218
|
+
**Attributes:**
|
|
219
|
+
|
|
220
|
+
- `message`: Primary error message
|
|
221
|
+
- `file_path`: Path to the file being processed
|
|
222
|
+
- `details`: Additional error details dictionary
|
|
223
|
+
|
|
224
|
+
## Requirements
|
|
225
|
+
|
|
226
|
+
- Python >= 3.10
|
|
227
|
+
- FFmpeg (must be installed separately)
|
|
228
|
+
- numpy >= 1.26.4
|
|
229
|
+
- funasr-onnx >= 0.4.1
|
|
230
|
+
- ffmpeg-audio >= 0.1.3
|
|
231
|
+
|
|
232
|
+
## License
|
|
233
|
+
|
|
234
|
+
MIT License
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
# speech-detect
|
|
2
|
+
|
|
3
|
+
A Python library for detecting speech segments and non-speech gaps in audio/video files using FSMN-VAD-ONNX with streaming processing.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Streaming VAD detection**: Process large audio/video files in chunks without loading everything into memory
|
|
8
|
+
- **Speech segment detection**: Detect all speech segments in audio/video files
|
|
9
|
+
- **Non-speech gap derivation**: Compute non-speech gaps from speech segments
|
|
10
|
+
- **Format support**: Supports all audio/video formats that FFmpeg supports (MP3, WAV, FLAC, Opus, MP4, etc.)
|
|
11
|
+
- **Time range support**: Support start time and duration parameters for partial processing
|
|
12
|
+
- **Memory efficient**: Constant memory usage regardless of audio file duration
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
pip install speech-detect
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
**Note**: This package requires:
|
|
21
|
+
|
|
22
|
+
- FFmpeg to be installed on your system and available in PATH
|
|
23
|
+
- FSMN-VAD-ONNX model files (see Model Setup below)
|
|
24
|
+
|
|
25
|
+
## Model Setup
|
|
26
|
+
|
|
27
|
+
This package requires FSMN-VAD-ONNX model files. The model is available on Hugging Face:
|
|
28
|
+
|
|
29
|
+
**Model Repository**: [funasr/fsmn-vad-onnx](https://huggingface.co/funasr/fsmn-vad-onnx)
|
|
30
|
+
|
|
31
|
+
### Download the Model
|
|
32
|
+
|
|
33
|
+
1. Install Git LFS (required for downloading large model files):
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
git lfs install
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
2. Clone the model repository:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
git clone https://huggingface.co/funasr/fsmn-vad-onnx
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
This will download the model files including `model_quant.onnx`, `config.yaml`, `am.mvn`, etc.
|
|
46
|
+
|
|
47
|
+
3. Set the `MODEL_FSMN_VAD_DIR` environment variable to point to the model directory:
|
|
48
|
+
```bash
|
|
49
|
+
export MODEL_FSMN_VAD_DIR=/path/to/fsmn-vad-onnx
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Alternatively, you can specify the model directory when initializing `SpeechDetector`:
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
from speech_detect import SpeechDetector
|
|
56
|
+
|
|
57
|
+
detector = SpeechDetector(model_dir="/path/to/fsmn-vad-onnx")
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Quick Start
|
|
61
|
+
|
|
62
|
+
### Detect Speech Segments and Gaps
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
from speech_detect import SpeechDetector
|
|
66
|
+
|
|
67
|
+
# Initialize detector (reads MODEL_FSMN_VAD_DIR from environment)
|
|
68
|
+
detector = SpeechDetector()
|
|
69
|
+
|
|
70
|
+
# Detect speech segments and non-speech gaps in an audio file
|
|
71
|
+
speech_segments, gaps = detector.detect("audio.mp3")
|
|
72
|
+
|
|
73
|
+
# speech_segments is a list of dictionaries: [{"start": 0, "end": 500}, ...]
|
|
74
|
+
for segment in speech_segments:
|
|
75
|
+
start_ms = segment["start"]
|
|
76
|
+
end_ms = segment["end"]
|
|
77
|
+
duration = end_ms - start_ms
|
|
78
|
+
print(f"Speech segment: {start_ms}ms - {end_ms}ms (duration: {duration}ms)")
|
|
79
|
+
|
|
80
|
+
# gaps is a list of dictionaries: [{"start": 0, "end": 500}, ...]
|
|
81
|
+
for gap in gaps:
|
|
82
|
+
start_ms = gap["start"]
|
|
83
|
+
end_ms = gap["end"]
|
|
84
|
+
duration = end_ms - start_ms
|
|
85
|
+
print(f"Non-speech gap: {start_ms}ms - {end_ms}ms (duration: {duration}ms)")
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### Processing Specific Time Range
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
# Process only the first 30 seconds
|
|
92
|
+
speech_segments, gaps = detector.detect(
|
|
93
|
+
file_path="audio.mp3",
|
|
94
|
+
start_ms=0,
|
|
95
|
+
duration_ms=30000,
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
# Process from 10 seconds, duration 5 seconds
|
|
99
|
+
speech_segments, gaps = detector.detect(
|
|
100
|
+
file_path="audio.mp3",
|
|
101
|
+
start_ms=10000,
|
|
102
|
+
duration_ms=5000,
|
|
103
|
+
)
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Custom Chunk Size
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
# Use 1-minute chunks instead of default 20-minute chunks
|
|
110
|
+
speech_segments, gaps = detector.detect(
|
|
111
|
+
file_path="audio.mp3",
|
|
112
|
+
chunk_duration_sec=60,
|
|
113
|
+
)
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## API Reference
|
|
117
|
+
|
|
118
|
+
### SpeechDetector
|
|
119
|
+
|
|
120
|
+
Main class for speech detection. All methods are instance methods.
|
|
121
|
+
|
|
122
|
+
#### `SpeechDetector.__init__(model_dir=None)`
|
|
123
|
+
|
|
124
|
+
Initialize speech detector.
|
|
125
|
+
|
|
126
|
+
**Parameters:**
|
|
127
|
+
|
|
128
|
+
- `model_dir` (str, optional): Path to the FSMN-VAD model directory. If None, reads from `MODEL_FSMN_VAD_DIR` environment variable.
|
|
129
|
+
|
|
130
|
+
**Note:** The FSMN-VAD model only has a quantized version, so `quantize=True` is always used internally.
|
|
131
|
+
|
|
132
|
+
**Raises:**
|
|
133
|
+
|
|
134
|
+
- `VadModelNotFoundError`: If model directory is not found or not set
|
|
135
|
+
- `VadModelInitializationError`: If model initialization fails
|
|
136
|
+
|
|
137
|
+
#### `SpeechDetector.detect(file_path, chunk_duration_sec=None, start_ms=None, duration_ms=None)`
|
|
138
|
+
|
|
139
|
+
Detect speech segments in audio/video file using streaming processing.
|
|
140
|
+
|
|
141
|
+
**Parameters:**
|
|
142
|
+
|
|
143
|
+
- `file_path` (str): Path to the audio/video file (supports all FFmpeg formats)
|
|
144
|
+
- `chunk_duration_sec` (int, optional): Duration of each chunk in seconds. Defaults to 1200 (20 minutes). Must be > 0 if provided.
|
|
145
|
+
- `start_ms` (int, optional): Start position in milliseconds. None means from file beginning. If None but `duration_ms` is provided, defaults to 0.
|
|
146
|
+
- `duration_ms` (int, optional): Total duration to process in milliseconds. None means process until end. If specified, processing stops when this duration is reached.
|
|
147
|
+
|
|
148
|
+
**Returns:**
|
|
149
|
+
|
|
150
|
+
- `tuple[list[VadSegment], list[VadSegment]]`: Tuple of (speech_segments, gaps)
|
|
151
|
+
- `speech_segments`: List of speech segments, format: `[{"start": ms, "end": ms}, ...]`
|
|
152
|
+
- Timestamps are relative to audio start (from 0)
|
|
153
|
+
- Unit: milliseconds
|
|
154
|
+
- `gaps`: List of non-speech gaps, format: `[{"start": ms, "end": ms}, ...]`
|
|
155
|
+
- Timestamps are relative to audio start (from 0)
|
|
156
|
+
- Unit: milliseconds
|
|
157
|
+
|
|
158
|
+
**Raises:**
|
|
159
|
+
|
|
160
|
+
- `VadProcessingError`: If processing fails
|
|
161
|
+
|
|
162
|
+
## Data Types
|
|
163
|
+
|
|
164
|
+
### VadSegment
|
|
165
|
+
|
|
166
|
+
A TypedDict representing a time segment (can be a speech segment or a non-speech gap).
|
|
167
|
+
|
|
168
|
+
**Fields:**
|
|
169
|
+
|
|
170
|
+
- `start` (int): Segment start time in milliseconds
|
|
171
|
+
- `end` (int): Segment end time in milliseconds
|
|
172
|
+
|
|
173
|
+
**Example:**
|
|
174
|
+
|
|
175
|
+
```python
|
|
176
|
+
segment: VadSegment = {"start": 100, "end": 500}
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
## Exceptions
|
|
180
|
+
|
|
181
|
+
### `VadModelNotFoundError`
|
|
182
|
+
|
|
183
|
+
Raised when VAD model directory is not found or not set.
|
|
184
|
+
|
|
185
|
+
**Attributes:**
|
|
186
|
+
|
|
187
|
+
- `message`: Human-readable error message
|
|
188
|
+
|
|
189
|
+
### `VadModelInitializationError`
|
|
190
|
+
|
|
191
|
+
Raised when VAD model initialization fails.
|
|
192
|
+
|
|
193
|
+
**Attributes:**
|
|
194
|
+
|
|
195
|
+
- `message`: Primary error message
|
|
196
|
+
- `model_dir`: Path to the model directory that caused the error
|
|
197
|
+
|
|
198
|
+
### `VadProcessingError`
|
|
199
|
+
|
|
200
|
+
Raised when VAD processing fails.
|
|
201
|
+
|
|
202
|
+
**Attributes:**
|
|
203
|
+
|
|
204
|
+
- `message`: Primary error message
|
|
205
|
+
- `file_path`: Path to the file being processed
|
|
206
|
+
- `details`: Additional error details dictionary
|
|
207
|
+
|
|
208
|
+
## Requirements
|
|
209
|
+
|
|
210
|
+
- Python >= 3.10
|
|
211
|
+
- FFmpeg (must be installed separately)
|
|
212
|
+
- numpy >= 1.26.4
|
|
213
|
+
- funasr-onnx >= 0.4.1
|
|
214
|
+
- ffmpeg-audio >= 0.1.3
|
|
215
|
+
|
|
216
|
+
## License
|
|
217
|
+
|
|
218
|
+
MIT License
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=80.8.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "speech-detect"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A Python library for detecting speech segments and non-speech gaps in audio/video files using FSMN-VAD-ONNX with streaming processing"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = {text = "MIT"}
|
|
12
|
+
dependencies = [
|
|
13
|
+
"numpy>=1.26.4",
|
|
14
|
+
"funasr-onnx>=0.4.1",
|
|
15
|
+
"ffmpeg-audio>=0.1.3",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[project.urls]
|
|
19
|
+
Homepage = "https://github.com/speech2srt/speech-detect"
|
|
20
|
+
Repository = "https://github.com/speech2srt/speech-detect"
|
|
21
|
+
Issues = "https://github.com/speech2srt/speech-detect/issues"
|
|
22
|
+
|
|
23
|
+
[tool.setuptools]
|
|
24
|
+
package-dir = {"speech_detect" = "src"}
|
|
25
|
+
packages = ["speech_detect"]
|
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: speech-detect
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A Python library for detecting speech segments and non-speech gaps in audio/video files using FSMN-VAD-ONNX with streaming processing
|
|
5
|
+
License: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/speech2srt/speech-detect
|
|
7
|
+
Project-URL: Repository, https://github.com/speech2srt/speech-detect
|
|
8
|
+
Project-URL: Issues, https://github.com/speech2srt/speech-detect/issues
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Requires-Dist: numpy>=1.26.4
|
|
13
|
+
Requires-Dist: funasr-onnx>=0.4.1
|
|
14
|
+
Requires-Dist: ffmpeg-audio>=0.1.3
|
|
15
|
+
Dynamic: license-file
|
|
16
|
+
|
|
17
|
+
# speech-detect
|
|
18
|
+
|
|
19
|
+
A Python library for detecting speech segments and non-speech gaps in audio/video files using FSMN-VAD-ONNX with streaming processing.
|
|
20
|
+
|
|
21
|
+
## Features
|
|
22
|
+
|
|
23
|
+
- **Streaming VAD detection**: Process large audio/video files in chunks without loading everything into memory
|
|
24
|
+
- **Speech segment detection**: Detect all speech segments in audio/video files
|
|
25
|
+
- **Non-speech gap derivation**: Compute non-speech gaps from speech segments
|
|
26
|
+
- **Format support**: Supports all audio/video formats that FFmpeg supports (MP3, WAV, FLAC, Opus, MP4, etc.)
|
|
27
|
+
- **Time range support**: Support start time and duration parameters for partial processing
|
|
28
|
+
- **Memory efficient**: Constant memory usage regardless of audio file duration
|
|
29
|
+
|
|
30
|
+
## Installation
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pip install speech-detect
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
**Note**: This package requires:
|
|
37
|
+
|
|
38
|
+
- FFmpeg to be installed on your system and available in PATH
|
|
39
|
+
- FSMN-VAD-ONNX model files (see Model Setup below)
|
|
40
|
+
|
|
41
|
+
## Model Setup
|
|
42
|
+
|
|
43
|
+
This package requires FSMN-VAD-ONNX model files. The model is available on Hugging Face:
|
|
44
|
+
|
|
45
|
+
**Model Repository**: [funasr/fsmn-vad-onnx](https://huggingface.co/funasr/fsmn-vad-onnx)
|
|
46
|
+
|
|
47
|
+
### Download the Model
|
|
48
|
+
|
|
49
|
+
1. Install Git LFS (required for downloading large model files):
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
git lfs install
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
2. Clone the model repository:
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
git clone https://huggingface.co/funasr/fsmn-vad-onnx
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
This will download the model files including `model_quant.onnx`, `config.yaml`, `am.mvn`, etc.
|
|
62
|
+
|
|
63
|
+
3. Set the `MODEL_FSMN_VAD_DIR` environment variable to point to the model directory:
|
|
64
|
+
```bash
|
|
65
|
+
export MODEL_FSMN_VAD_DIR=/path/to/fsmn-vad-onnx
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Alternatively, you can specify the model directory when initializing `SpeechDetector`:
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
from speech_detect import SpeechDetector
|
|
72
|
+
|
|
73
|
+
detector = SpeechDetector(model_dir="/path/to/fsmn-vad-onnx")
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Quick Start
|
|
77
|
+
|
|
78
|
+
### Detect Speech Segments and Gaps
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from speech_detect import SpeechDetector
|
|
82
|
+
|
|
83
|
+
# Initialize detector (reads MODEL_FSMN_VAD_DIR from environment)
|
|
84
|
+
detector = SpeechDetector()
|
|
85
|
+
|
|
86
|
+
# Detect speech segments and non-speech gaps in an audio file
|
|
87
|
+
speech_segments, gaps = detector.detect("audio.mp3")
|
|
88
|
+
|
|
89
|
+
# speech_segments is a list of dictionaries: [{"start": 0, "end": 500}, ...]
|
|
90
|
+
for segment in speech_segments:
|
|
91
|
+
start_ms = segment["start"]
|
|
92
|
+
end_ms = segment["end"]
|
|
93
|
+
duration = end_ms - start_ms
|
|
94
|
+
print(f"Speech segment: {start_ms}ms - {end_ms}ms (duration: {duration}ms)")
|
|
95
|
+
|
|
96
|
+
# gaps is a list of dictionaries: [{"start": 0, "end": 500}, ...]
|
|
97
|
+
for gap in gaps:
|
|
98
|
+
start_ms = gap["start"]
|
|
99
|
+
end_ms = gap["end"]
|
|
100
|
+
duration = end_ms - start_ms
|
|
101
|
+
print(f"Non-speech gap: {start_ms}ms - {end_ms}ms (duration: {duration}ms)")
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### Processing Specific Time Range
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
# Process only the first 30 seconds
|
|
108
|
+
speech_segments, gaps = detector.detect(
|
|
109
|
+
file_path="audio.mp3",
|
|
110
|
+
start_ms=0,
|
|
111
|
+
duration_ms=30000,
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
# Process from 10 seconds, duration 5 seconds
|
|
115
|
+
speech_segments, gaps = detector.detect(
|
|
116
|
+
file_path="audio.mp3",
|
|
117
|
+
start_ms=10000,
|
|
118
|
+
duration_ms=5000,
|
|
119
|
+
)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Custom Chunk Size
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
# Use 1-minute chunks instead of default 20-minute chunks
|
|
126
|
+
speech_segments, gaps = detector.detect(
|
|
127
|
+
file_path="audio.mp3",
|
|
128
|
+
chunk_duration_sec=60,
|
|
129
|
+
)
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## API Reference
|
|
133
|
+
|
|
134
|
+
### SpeechDetector
|
|
135
|
+
|
|
136
|
+
Main class for speech detection. All methods are instance methods.
|
|
137
|
+
|
|
138
|
+
#### `SpeechDetector.__init__(model_dir=None)`
|
|
139
|
+
|
|
140
|
+
Initialize speech detector.
|
|
141
|
+
|
|
142
|
+
**Parameters:**
|
|
143
|
+
|
|
144
|
+
- `model_dir` (str, optional): Path to the FSMN-VAD model directory. If None, reads from `MODEL_FSMN_VAD_DIR` environment variable.
|
|
145
|
+
|
|
146
|
+
**Note:** The FSMN-VAD model only has a quantized version, so `quantize=True` is always used internally.
|
|
147
|
+
|
|
148
|
+
**Raises:**
|
|
149
|
+
|
|
150
|
+
- `VadModelNotFoundError`: If model directory is not found or not set
|
|
151
|
+
- `VadModelInitializationError`: If model initialization fails
|
|
152
|
+
|
|
153
|
+
#### `SpeechDetector.detect(file_path, chunk_duration_sec=None, start_ms=None, duration_ms=None)`
|
|
154
|
+
|
|
155
|
+
Detect speech segments in audio/video file using streaming processing.
|
|
156
|
+
|
|
157
|
+
**Parameters:**
|
|
158
|
+
|
|
159
|
+
- `file_path` (str): Path to the audio/video file (supports all FFmpeg formats)
|
|
160
|
+
- `chunk_duration_sec` (int, optional): Duration of each chunk in seconds. Defaults to 1200 (20 minutes). Must be > 0 if provided.
|
|
161
|
+
- `start_ms` (int, optional): Start position in milliseconds. None means from file beginning. If None but `duration_ms` is provided, defaults to 0.
|
|
162
|
+
- `duration_ms` (int, optional): Total duration to process in milliseconds. None means process until end. If specified, processing stops when this duration is reached.
|
|
163
|
+
|
|
164
|
+
**Returns:**
|
|
165
|
+
|
|
166
|
+
- `tuple[list[VadSegment], list[VadSegment]]`: Tuple of (speech_segments, gaps)
|
|
167
|
+
- `speech_segments`: List of speech segments, format: `[{"start": ms, "end": ms}, ...]`
|
|
168
|
+
- Timestamps are relative to audio start (from 0)
|
|
169
|
+
- Unit: milliseconds
|
|
170
|
+
- `gaps`: List of non-speech gaps, format: `[{"start": ms, "end": ms}, ...]`
|
|
171
|
+
- Timestamps are relative to audio start (from 0)
|
|
172
|
+
- Unit: milliseconds
|
|
173
|
+
|
|
174
|
+
**Raises:**
|
|
175
|
+
|
|
176
|
+
- `VadProcessingError`: If processing fails
|
|
177
|
+
|
|
178
|
+
## Data Types
|
|
179
|
+
|
|
180
|
+
### VadSegment
|
|
181
|
+
|
|
182
|
+
A TypedDict representing a time segment (can be a speech segment or a non-speech gap).
|
|
183
|
+
|
|
184
|
+
**Fields:**
|
|
185
|
+
|
|
186
|
+
- `start` (int): Segment start time in milliseconds
|
|
187
|
+
- `end` (int): Segment end time in milliseconds
|
|
188
|
+
|
|
189
|
+
**Example:**
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
segment: VadSegment = {"start": 100, "end": 500}
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
## Exceptions
|
|
196
|
+
|
|
197
|
+
### `VadModelNotFoundError`
|
|
198
|
+
|
|
199
|
+
Raised when VAD model directory is not found or not set.
|
|
200
|
+
|
|
201
|
+
**Attributes:**
|
|
202
|
+
|
|
203
|
+
- `message`: Human-readable error message
|
|
204
|
+
|
|
205
|
+
### `VadModelInitializationError`
|
|
206
|
+
|
|
207
|
+
Raised when VAD model initialization fails.
|
|
208
|
+
|
|
209
|
+
**Attributes:**
|
|
210
|
+
|
|
211
|
+
- `message`: Primary error message
|
|
212
|
+
- `model_dir`: Path to the model directory that caused the error
|
|
213
|
+
|
|
214
|
+
### `VadProcessingError`
|
|
215
|
+
|
|
216
|
+
Raised when VAD processing fails.
|
|
217
|
+
|
|
218
|
+
**Attributes:**
|
|
219
|
+
|
|
220
|
+
- `message`: Primary error message
|
|
221
|
+
- `file_path`: Path to the file being processed
|
|
222
|
+
- `details`: Additional error details dictionary
|
|
223
|
+
|
|
224
|
+
## Requirements
|
|
225
|
+
|
|
226
|
+
- Python >= 3.10
|
|
227
|
+
- FFmpeg (must be installed separately)
|
|
228
|
+
- numpy >= 1.26.4
|
|
229
|
+
- funasr-onnx >= 0.4.1
|
|
230
|
+
- ffmpeg-audio >= 0.1.3
|
|
231
|
+
|
|
232
|
+
## License
|
|
233
|
+
|
|
234
|
+
MIT License
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
MANIFEST.in
|
|
3
|
+
README.md
|
|
4
|
+
pyproject.toml
|
|
5
|
+
requirements.txt
|
|
6
|
+
speech_detect.egg-info/PKG-INFO
|
|
7
|
+
speech_detect.egg-info/SOURCES.txt
|
|
8
|
+
speech_detect.egg-info/dependency_links.txt
|
|
9
|
+
speech_detect.egg-info/requires.txt
|
|
10
|
+
speech_detect.egg-info/top_level.txt
|
|
11
|
+
src/__init__.py
|
|
12
|
+
src/exceptions.py
|
|
13
|
+
src/speech_detect.py
|
|
14
|
+
src/vad_parser.py
|
|
15
|
+
src/vad_types.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
speech_detect
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Speech Detect - A Python library for detecting speech segments and non-speech gaps in audio/video files.
|
|
3
|
+
|
|
4
|
+
This package provides utilities for:
|
|
5
|
+
- Streaming VAD detection using FSMN-VAD-ONNX model
|
|
6
|
+
- Speech segment detection
|
|
7
|
+
- Non-speech gap detection
|
|
8
|
+
- Support for all FFmpeg-supported audio/video formats
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
|
|
13
|
+
from .exceptions import VadModelInitializationError, VadModelNotFoundError, VadProcessingError
|
|
14
|
+
from .speech_detect import SpeechDetector
|
|
15
|
+
from .vad_types import VadSegment
|
|
16
|
+
|
|
17
|
+
__version__ = "0.1.0"
|
|
18
|
+
|
|
19
|
+
# Configure library root logger
|
|
20
|
+
# Use NullHandler to ensure library remains silent when user hasn't configured logging
|
|
21
|
+
# If user configures logging (e.g., logging.basicConfig()), logs will bubble up to root logger for processing
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
logger.addHandler(logging.NullHandler())
|
|
24
|
+
|
|
25
|
+
__all__ = [
|
|
26
|
+
"SpeechDetector",
|
|
27
|
+
"VadSegment",
|
|
28
|
+
"VadModelNotFoundError",
|
|
29
|
+
"VadModelInitializationError",
|
|
30
|
+
"VadProcessingError",
|
|
31
|
+
]
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Exception classes for VAD detection errors.
|
|
3
|
+
|
|
4
|
+
Provides a hierarchy of exceptions for different error conditions,
|
|
5
|
+
enabling precise error handling and debugging.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class VadModelNotFoundError(Exception):
    """
    Raised when VAD model directory is not found or not set.

    Signals that the model directory path was not supplied, was not
    available via the MODEL_FSMN_VAD_DIR environment variable, or points
    to a location that does not exist on disk.
    """

    def __init__(self, message: str):
        """
        Initialize exception.

        Args:
            message: Human-readable error message describing the issue.
        """
        super().__init__(message)
        # Expose the message as an attribute for structured error handling.
        self.message = message
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class VadModelInitializationError(Exception):
|
|
29
|
+
"""
|
|
30
|
+
Raised when VAD model initialization fails.
|
|
31
|
+
|
|
32
|
+
This exception indicates that the Fsmn_vad_online model failed to initialize
|
|
33
|
+
from the provided model directory, possibly due to missing files or corrupted model.
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
def __init__(self, message: str, model_dir: str = None):
|
|
37
|
+
"""
|
|
38
|
+
Initialize exception.
|
|
39
|
+
|
|
40
|
+
Args:
|
|
41
|
+
message: Primary error message (required).
|
|
42
|
+
model_dir: Path to the model directory that caused the error (optional).
|
|
43
|
+
"""
|
|
44
|
+
super().__init__(message)
|
|
45
|
+
self.message = message
|
|
46
|
+
self.model_dir = model_dir
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class VadProcessingError(Exception):
|
|
50
|
+
"""
|
|
51
|
+
Raised when VAD processing fails.
|
|
52
|
+
|
|
53
|
+
This exception indicates that an error occurred during the VAD detection process,
|
|
54
|
+
such as audio format issues, streaming errors, or model inference failures.
|
|
55
|
+
"""
|
|
56
|
+
|
|
57
|
+
def __init__(self, message: str, file_path: str = None, details: dict = None):
|
|
58
|
+
"""
|
|
59
|
+
Initialize exception.
|
|
60
|
+
|
|
61
|
+
Args:
|
|
62
|
+
message: Primary error message (required).
|
|
63
|
+
file_path: Path to the file being processed (optional).
|
|
64
|
+
details: Additional error details dictionary (optional).
|
|
65
|
+
"""
|
|
66
|
+
super().__init__(message)
|
|
67
|
+
self.message = message
|
|
68
|
+
self.file_path = file_path
|
|
69
|
+
self.details = details or {}
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Speech Detector
|
|
3
|
+
|
|
4
|
+
Detects speech segments and non-speech gaps in audio/video files using Fsmn_vad_online model with streaming processing.
|
|
5
|
+
Supports all FFmpeg-compatible formats and processes large files efficiently with constant memory usage.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
import os
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
from ffmpeg_audio import FFmpegAudio
|
|
13
|
+
from funasr_onnx import Fsmn_vad_online
|
|
14
|
+
|
|
15
|
+
from .exceptions import VadModelInitializationError, VadModelNotFoundError, VadProcessingError
|
|
16
|
+
from .vad_parser import VadParser
|
|
17
|
+
from .vad_types import VadSegment
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class SpeechDetector:
|
|
23
|
+
"""
|
|
24
|
+
Speech Detector (streaming only)
|
|
25
|
+
|
|
26
|
+
Performs speech activity detection on streaming audio data using FSMN-VAD-ONNX model
|
|
27
|
+
and derives speech segments and non-speech gaps. Designed for processing large audio/video files
|
|
28
|
+
with constant memory footprint regardless of file duration.
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
SAMPLE_RATE = 16000 # Fixed sample rate constant (Hz)
|
|
32
|
+
|
|
33
|
+
def __init__(self, model_dir: str = None):
|
|
34
|
+
"""
|
|
35
|
+
Initialize speech detector.
|
|
36
|
+
|
|
37
|
+
Args:
|
|
38
|
+
model_dir: Path to the model directory. If None, reads from MODEL_FSMN_VAD_DIR
|
|
39
|
+
environment variable.
|
|
40
|
+
|
|
41
|
+
Raises:
|
|
42
|
+
VadModelNotFoundError: Model directory path is not set or does not exist.
|
|
43
|
+
VadModelInitializationError: Model initialization failed.
|
|
44
|
+
"""
|
|
45
|
+
# Determine model directory path
|
|
46
|
+
if model_dir is None:
|
|
47
|
+
model_dir = os.getenv("MODEL_FSMN_VAD_DIR")
|
|
48
|
+
if not model_dir:
|
|
49
|
+
raise VadModelNotFoundError("MODEL_FSMN_VAD_DIR environment variable not set. " "Please set it to the path of the FSMN-VAD model directory.")
|
|
50
|
+
|
|
51
|
+
# Validate directory exists
|
|
52
|
+
if not os.path.exists(model_dir):
|
|
53
|
+
raise VadModelNotFoundError(f"Model directory not found: {model_dir}")
|
|
54
|
+
|
|
55
|
+
# Initialize model (FSMN VAD model only has quantized version, always use quantize=True)
|
|
56
|
+
try:
|
|
57
|
+
self.model = Fsmn_vad_online(model_dir, quantize=True)
|
|
58
|
+
self.model_dir = model_dir
|
|
59
|
+
except Exception as e:
|
|
60
|
+
raise VadModelInitializationError(
|
|
61
|
+
f"Failed to initialize VAD model from {model_dir}: {str(e)}",
|
|
62
|
+
model_dir=model_dir,
|
|
63
|
+
) from e
|
|
64
|
+
|
|
65
|
+
def detect(
|
|
66
|
+
self,
|
|
67
|
+
file_path: str,
|
|
68
|
+
chunk_duration_sec: int = None,
|
|
69
|
+
start_ms: int = None,
|
|
70
|
+
duration_ms: int = None,
|
|
71
|
+
) -> tuple["list[VadSegment]", "list[VadSegment]"]:
|
|
72
|
+
"""
|
|
73
|
+
Detect speech segments and non-speech gaps in audio/video file using streaming processing.
|
|
74
|
+
|
|
75
|
+
Processes audio file in chunks using ffmpeg-audio package's stream method,
|
|
76
|
+
suitable for large files. Memory usage is constant and independent of total audio duration.
|
|
77
|
+
|
|
78
|
+
Args:
|
|
79
|
+
file_path: Path to audio/video file (supports all FFmpeg-compatible formats).
|
|
80
|
+
chunk_duration_sec: Chunk duration in seconds. None uses default (20 minutes).
|
|
81
|
+
start_ms: Start position in milliseconds. None starts from beginning of file.
|
|
82
|
+
duration_ms: Total duration to process in milliseconds. None processes to end of file.
|
|
83
|
+
|
|
84
|
+
Returns:
|
|
85
|
+
tuple[list[VadSegment], list[VadSegment]]: Tuple of (speech_segments, gaps).
|
|
86
|
+
- speech_segments: List of speech segments, format: [{"start": ms, "end": ms}, ...]
|
|
87
|
+
- gaps: List of non-speech gaps, format: [{"start": ms, "end": ms}, ...]
|
|
88
|
+
Timestamps are relative to audio start (0-based), in milliseconds.
|
|
89
|
+
|
|
90
|
+
Raises:
|
|
91
|
+
VadProcessingError: Error occurred during processing.
|
|
92
|
+
"""
|
|
93
|
+
parser = VadParser()
|
|
94
|
+
param_dict = {"in_cache": []}
|
|
95
|
+
speech_segments = []
|
|
96
|
+
total_samples = 0
|
|
97
|
+
|
|
98
|
+
# Process each chunk in streaming fashion
|
|
99
|
+
chunk_count = 0
|
|
100
|
+
try:
|
|
101
|
+
for chunk in FFmpegAudio.stream(
|
|
102
|
+
file_path,
|
|
103
|
+
chunk_duration_sec=chunk_duration_sec,
|
|
104
|
+
start_ms=start_ms,
|
|
105
|
+
duration_ms=duration_ms,
|
|
106
|
+
):
|
|
107
|
+
chunk_count += 1
|
|
108
|
+
chunk_samples = len(chunk)
|
|
109
|
+
|
|
110
|
+
# Validate chunk format
|
|
111
|
+
if chunk.dtype != np.float32:
|
|
112
|
+
raise VadProcessingError(
|
|
113
|
+
f"Chunk dtype must be float32, got {chunk.dtype}",
|
|
114
|
+
file_path=file_path,
|
|
115
|
+
details={"chunk_index": chunk_count, "dtype": str(chunk.dtype)},
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
# Accumulate total sample count
|
|
119
|
+
total_samples += chunk_samples
|
|
120
|
+
|
|
121
|
+
# param_dict state is automatically maintained across chunks
|
|
122
|
+
result = self.model(audio_in=chunk, param_dict=param_dict)
|
|
123
|
+
|
|
124
|
+
# Parse model output
|
|
125
|
+
segments = parser.parse(result)
|
|
126
|
+
speech_segments.extend(segments)
|
|
127
|
+
|
|
128
|
+
except VadProcessingError:
|
|
129
|
+
# Re-raise VadProcessingError
|
|
130
|
+
raise
|
|
131
|
+
except Exception as e:
|
|
132
|
+
raise VadProcessingError(
|
|
133
|
+
f"Stream processing failed: {str(e)}",
|
|
134
|
+
file_path=file_path,
|
|
135
|
+
details={"chunk_index": chunk_count, "exception_type": type(e).__name__},
|
|
136
|
+
) from e
|
|
137
|
+
|
|
138
|
+
# Final flush to ensure all data is processed
|
|
139
|
+
try:
|
|
140
|
+
param_dict["is_final"] = True
|
|
141
|
+
final_result = self.model(audio_in=[], param_dict=param_dict)
|
|
142
|
+
final_segments = parser.parse(final_result)
|
|
143
|
+
speech_segments.extend(final_segments)
|
|
144
|
+
|
|
145
|
+
# Handle any unclosed segments
|
|
146
|
+
parser.flush()
|
|
147
|
+
except Exception as e:
|
|
148
|
+
raise VadProcessingError(
|
|
149
|
+
f"Final flush failed: {str(e)}",
|
|
150
|
+
file_path=file_path,
|
|
151
|
+
details={"exception_type": type(e).__name__},
|
|
152
|
+
) from e
|
|
153
|
+
|
|
154
|
+
# Derive non-speech gaps from speech segments
|
|
155
|
+
gaps = self._derive_non_speech_gaps(speech_segments, total_samples)
|
|
156
|
+
|
|
157
|
+
return speech_segments, gaps
|
|
158
|
+
|
|
159
|
+
@staticmethod
|
|
160
|
+
def _derive_non_speech_gaps(speech_segments: "list[VadSegment]", audio_length_samples: int) -> "list[VadSegment]":
|
|
161
|
+
"""
|
|
162
|
+
Derive non-speech gaps from speech segments.
|
|
163
|
+
|
|
164
|
+
Computes gaps between speech segments and at the beginning/end of audio.
|
|
165
|
+
Gaps represent periods of silence or non-speech audio.
|
|
166
|
+
|
|
167
|
+
Args:
|
|
168
|
+
speech_segments: List of speech segments, format: [{"start": ms, "end": ms}, ...]
|
|
169
|
+
audio_length_samples: Total number of audio samples.
|
|
170
|
+
|
|
171
|
+
Returns:
|
|
172
|
+
list[VadSegment]: List of non-speech gaps, format: [{"start": ms, "end": ms}, ...]
|
|
173
|
+
"""
|
|
174
|
+
# Calculate total audio duration in milliseconds
|
|
175
|
+
duration_ms = int(audio_length_samples / SpeechDetector.SAMPLE_RATE * 1000)
|
|
176
|
+
|
|
177
|
+
# If no speech segments, entire audio is non-speech
|
|
178
|
+
if not speech_segments:
|
|
179
|
+
return [{"start": 0, "end": duration_ms}]
|
|
180
|
+
|
|
181
|
+
gaps = []
|
|
182
|
+
|
|
183
|
+
# Check for gap at the beginning (before first speech segment)
|
|
184
|
+
first_speech = speech_segments[0]
|
|
185
|
+
if first_speech["start"] > 0:
|
|
186
|
+
gaps.append({"start": 0, "end": first_speech["start"]})
|
|
187
|
+
|
|
188
|
+
# Check for gaps between speech segments
|
|
189
|
+
for i in range(len(speech_segments) - 1):
|
|
190
|
+
prev_end = speech_segments[i]["end"]
|
|
191
|
+
next_start = speech_segments[i + 1]["start"]
|
|
192
|
+
if next_start > prev_end:
|
|
193
|
+
gaps.append({"start": prev_end, "end": next_start})
|
|
194
|
+
|
|
195
|
+
# Check for gap at the end (after last speech segment)
|
|
196
|
+
last_speech = speech_segments[-1]
|
|
197
|
+
if last_speech["end"] < duration_ms:
|
|
198
|
+
gaps.append({"start": last_speech["end"], "end": duration_ms})
|
|
199
|
+
|
|
200
|
+
return gaps
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""
|
|
2
|
+
VAD State Machine Parser
|
|
3
|
+
|
|
4
|
+
Parses fragmented output from Fsmn_vad_online model and converts it into
|
|
5
|
+
complete {start, end} segment semantics. Handles partial segments that span
|
|
6
|
+
multiple chunks in streaming processing.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
|
|
11
|
+
from .vad_types import VadSegment
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class VadParser:
    """
    State machine parser for Fsmn_vad_online fragmented output.

    Converts fragmented VAD model output into complete {start, end} segment semantics.
    Maintains state across chunks to handle segments that span multiple processing chunks.
    """

    def __init__(self):
        """Initialize VAD parser."""
        # Start timestamp (ms) of a speech segment whose end has not yet
        # been seen; -1 means no segment is currently open.
        self.current_start_ms = -1

    @staticmethod
    def _is_pair(item) -> bool:
        """
        Return True if *item* looks like a [beg, end] timestamp pair.

        A pair is a list/tuple of at least two elements whose first two
        entries are scalars (not nested lists/tuples).
        """
        return (
            isinstance(item, (list, tuple))
            and len(item) >= 2
            and not isinstance(item[0], (list, tuple))
            and not isinstance(item[1], (list, tuple))
        )

    def _process_segment_pair(self, beg_ms: int, end_ms: int, completed_segments: "list[VadSegment]") -> None:
        """
        Process a single [start, end] pair, supporting three cases:
        1. [[beg, -1]] -> Speech start detected but not ended (partial segment)
        2. [[-1, end]] -> Speech end detected (completes previous start)
        3. [[beg, end]] -> Complete segment within single chunk

        Args:
            beg_ms: Start time in milliseconds (int or float).
            end_ms: End time in milliseconds (int or float).
            completed_segments: List to store completed segments.
        """
        # Normalize non-numeric values to the -1 sentinel, truncate floats to int.
        beg_ms = int(beg_ms) if isinstance(beg_ms, (int, float)) else -1
        end_ms = int(end_ms) if isinstance(end_ms, (int, float)) else -1

        # Case 1: Speech start detected but not ended (partial segment)
        if beg_ms != -1 and end_ms == -1:
            # Fsmn_vad_online returns global timestamps, use directly
            self.current_start_ms = beg_ms
            # Don't add to completed_segments yet, wait for end

        # Case 2: Speech end detected (completes previous start)
        elif beg_ms == -1 and end_ms != -1:
            if self.current_start_ms != -1:
                completed_segments.append(
                    {
                        "start": self.current_start_ms,
                        "end": end_ms,  # Fsmn_vad_online returns global timestamps
                    }
                )
                self.current_start_ms = -1  # Reset state
            else:
                # Edge case: end without start (possibly from previous chunk), log warning
                logger.warning(f"[VAD Parser] Found end without start: end_ms={end_ms}")

        # Case 3: Complete segment within single chunk
        elif beg_ms != -1 and end_ms != -1:
            completed_segments.append(
                {
                    "start": beg_ms,  # Fsmn_vad_online returns global timestamps
                    "end": end_ms,
                }
            )
            # If there was an unclosed start, reset it (new complete segment encountered)
            self.current_start_ms = -1

    def parse(self, vad_output: list) -> "list[VadSegment]":
        """
        Parse VAD model inference output and return completed segments.

        Handles nested list structures and partial segments. Maintains state
        across calls to handle segments spanning multiple chunks.

        Args:
            vad_output: VAD model output, format: [[beg, end], ...] or [[beg, -1]], [[-1, end]]
                Note: Fsmn_vad_online returns global timestamps (relative to stream start).

        Returns:
            list[VadSegment]: List of completed speech segments, format: [{"start": ms, "end": ms}, ...]
        """
        if not vad_output:
            return []

        # Unwrap nested batch wrappers: [[[beg, end], ...]] -> [[beg, end], ...].
        # BUGFIX: the previous loop unwrapped ANY single-element list, so a
        # lone segment such as [[[100, 200]]] was flattened all the way down
        # to the bare pair [100, 200] and then rejected as an invalid segment.
        # Stop unwrapping as soon as the inner element is itself a pair.
        while (
            isinstance(vad_output, list)
            and len(vad_output) == 1
            and isinstance(vad_output[0], (list, tuple))
            and not self._is_pair(vad_output[0])
        ):
            vad_output = vad_output[0]

        # If we ended up holding a single bare [beg, end] pair, re-wrap it so
        # the loop below sees a list of pairs.
        if self._is_pair(vad_output):
            vad_output = [vad_output]

        completed_segments = []

        # Process normalized output format: [[beg, end], ...]
        for seg in vad_output:
            if self._is_pair(seg):
                # _process_segment_pair normalizes floats/non-numerics itself.
                self._process_segment_pair(seg[0], seg[1], completed_segments)
            else:
                # Skip invalid segment formats
                logger.warning(f"[VAD Parser] Invalid segment format: {seg}")

        return completed_segments

    def flush(self) -> "list[VadSegment]":
        """
        Flush parser state at end of stream.

        Handles any unclosed segments. If a start was detected but not ended,
        it's discarded (typically treated as silence at end of stream).

        Returns:
            list[VadSegment]: List of remaining completed segments (usually empty).
        """
        # If there's an unclosed start, it typically means audio was cut off mid-speech
        # Use simple discard logic (treat as silence at end)
        if self.current_start_ms != -1:
            logger.warning(f"[VAD Parser] Unclosed segment at end of stream: start_ms={self.current_start_ms}")
            self.current_start_ms = -1
        return []
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Type definitions for VAD detection.
|
|
3
|
+
|
|
4
|
+
Defines TypedDict structures for VAD segments used throughout the package.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import TypedDict
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class VadSegment(TypedDict):
    """
    A single time interval on the audio timeline.

    Used for both detected speech segments and the non-speech gaps derived
    between them. Timestamps are expressed in milliseconds relative to the
    start of the audio stream.

    Attributes:
        start: Segment start time in milliseconds (integer).
        end: Segment end time in milliseconds (integer).
    """

    start: int
    end: int
|