speech_detect-0.2.4.tar.gz

@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 [Speech2SRT Team](https://github.com/speech2srt)
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1,5 @@
+ include README.md
+ include LICENSE
+ include requirements.txt
+ recursive-include src *.py
+
@@ -0,0 +1,327 @@
+ Metadata-Version: 2.4
+ Name: speech-detect
+ Version: 0.2.4
+ Summary: A Python library for detecting speech segments and non-speech gaps in audio/video files using FSMN-VAD-ONNX with streaming processing
+ License: MIT
+ Project-URL: Homepage, https://github.com/speech2srt/speech-detect
+ Project-URL: Repository, https://github.com/speech2srt/speech-detect
+ Project-URL: Issues, https://github.com/speech2srt/speech-detect/issues
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: numpy>=1.26.4
+ Requires-Dist: funasr-onnx>=0.4.1
+ Requires-Dist: ffmpeg-audio>=0.3.0
+ Requires-Dist: jieba>=0.42.1
+ Requires-Dist: torch>=2.9.1
+ Dynamic: license-file
+
+ # speech-detect
+
+ A Python library for detecting speech segments, non-speech gaps, and RMS energy curves in audio/video files using FSMN-VAD-ONNX with streaming processing.
+
+ ## Features
+
+ ### Core Functionality
+
+ - **Speech segment detection**: Detect all speech segments in audio/video files with precise timestamps
+ - **Non-speech gap derivation**: Automatically compute non-speech gaps (silence periods) from detected speech segments
+ - **RMS energy detection**: Compute an RMS (Root Mean Square) energy curve for audio analysis and visualization
+
+ ### Advanced Features
+
+ - **Adjacent segment merging**: Merge adjacent speech segments separated by gaps smaller than a threshold (useful for handling brief pauses such as breathing or thinking pauses)
+
+ ### Technical Capabilities
+
+ - **Streaming processing**: Process large audio/video files in chunks without loading everything into memory
+ - **Memory efficient**: Constant memory usage regardless of audio file duration
+ - **Format support**: Handles any audio/video format that FFmpeg can decode (MP3, WAV, FLAC, Opus, MP4, AVI, etc.)
+ - **Time range support**: Accepts start-time and duration parameters for partial processing
+
+ ## Installation
+
+ ```bash
+ pip install speech-detect
+ ```
+
+ **Note**: This package requires:
+
+ - FFmpeg to be installed on your system and available in PATH
+ - FSMN-VAD-ONNX model files (see Model Setup below)
+
+ ## Model Setup
+
+ This package requires FSMN-VAD-ONNX model files. The model is available on Hugging Face:
+
+ **Model Repository**: [funasr/fsmn-vad-onnx](https://huggingface.co/funasr/fsmn-vad-onnx)
+
+ ### Download the Model
+
+ 1. Install Git LFS (required for downloading large model files):
+
+ ```bash
+ git lfs install
+ ```
+
+ 2. Clone the model repository:
+
+ ```bash
+ git clone https://huggingface.co/funasr/fsmn-vad-onnx
+ ```
+
+ This downloads the model files, including `model_quant.onnx`, `config.yaml`, `am.mvn`, etc.
+
+ 3. Set the `MODEL_FSMN_VAD_DIR` environment variable to point to the model directory:
+
+ ```bash
+ export MODEL_FSMN_VAD_DIR=/path/to/fsmn-vad-onnx
+ ```
+
+ Alternatively, you can specify the model directory when initializing `SpeechDetector`:
+
+ ```python
+ from speech_detect import SpeechDetector
+
+ detector = SpeechDetector(model_dir="/path/to/fsmn-vad-onnx")
+ ```
+
+ ## Quick Start
+
+ ### Detect Speech Segments, Gaps, and RMS Energy Curve
+
+ ```python
+ from speech_detect import SpeechDetector
+
+ # Initialize detector (reads MODEL_FSMN_VAD_DIR from environment)
+ detector = SpeechDetector()
+
+ # Detect speech segments, non-speech gaps, and the RMS energy curve in an audio file
+ speech_segments, gaps, rms_curve = detector.detect("audio.mp3")
+
+ # speech_segments is a list of dictionaries: [{"start": 0, "end": 500}, ...]
+ for segment in speech_segments:
+     start_ms = segment["start"]
+     end_ms = segment["end"]
+     duration = end_ms - start_ms
+     print(f"Speech segment: {start_ms}ms - {end_ms}ms (duration: {duration}ms)")
+
+ # gaps is a list of dictionaries: [{"start": 0, "end": 500}, ...]
+ for gap in gaps:
+     start_ms = gap["start"]
+     end_ms = gap["end"]
+     duration = end_ms - start_ms
+     print(f"Non-speech gap: {start_ms}ms - {end_ms}ms (duration: {duration}ms)")
+
+ # rms_curve is a list of dictionaries: [{"ms": 0, "value": 0.123}, ...]
+ for point in rms_curve:
+     time_ms = point["ms"]
+     rms_value = point["value"]
+     print(f"RMS at {time_ms}ms: {rms_value}")
+ ```
+
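+ The returned `gaps` are simply the complement of `speech_segments` over the processed range. For intuition, here is a minimal sketch of that derivation (illustrative only, not the library's internal code), assuming the segments are sorted and non-overlapping and the total duration `total_ms` is known:
+
+ ```python
+ def derive_gaps(speech_segments, total_ms):
+     """Illustrative sketch: gaps as the complement of sorted, non-overlapping segments."""
+     gaps = []
+     cursor = 0
+     for seg in speech_segments:
+         if seg["start"] > cursor:  # silence before this segment starts
+             gaps.append({"start": cursor, "end": seg["start"]})
+         cursor = max(cursor, seg["end"])
+     if cursor < total_ms:  # trailing silence after the last segment
+         gaps.append({"start": cursor, "end": total_ms})
+     return gaps
+ ```
+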
+ ### Processing a Specific Time Range
+
+ ```python
+ # Process only the first 30 seconds
+ speech_segments, gaps, rms_curve = detector.detect(
+     file_path="audio.mp3",
+     start_ms=0,
+     duration_ms=30000,
+ )
+
+ # Process from 10 seconds, for a duration of 5 seconds
+ speech_segments, gaps, rms_curve = detector.detect(
+     file_path="audio.mp3",
+     start_ms=10000,
+     duration_ms=5000,
+ )
+ ```
+
+ ### Custom Chunk Size
+
+ ```python
+ # Use 1-minute chunks instead of the default 20-minute chunks
+ speech_segments, gaps, rms_curve = detector.detect(
+     file_path="audio.mp3",
+     chunk_duration_sec=60,
+ )
+ ```
+
+ ### Merging Adjacent Segments
+
+ ```python
+ # Merge adjacent segments with gaps smaller than 300ms
+ # Useful for handling brief pauses in speech (breathing, thinking pauses)
+ speech_segments, gaps, rms_curve = detector.detect(
+     file_path="audio.mp3",
+     merge_gap_threshold_ms=300,
+ )
+ ```
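+
+ For intuition, the merging behavior is equivalent to the following sketch (a simplified stand-in, not the package's actual implementation): a segment whose gap to the previous one is below the threshold is fused into it.
+
+ ```python
+ def merge_adjacent(segments, threshold_ms):
+     """Illustrative stand-in for merge_gap_threshold_ms: fuse segments separated by short gaps."""
+     if not segments:
+         return []
+     merged = [dict(segments[0])]
+     for seg in segments[1:]:
+         if seg["start"] - merged[-1]["end"] < threshold_ms:
+             merged[-1]["end"] = max(merged[-1]["end"], seg["end"])  # absorb the short gap
+         else:
+             merged.append(dict(seg))
+     return merged
+ ```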
+
+ ### Custom RMS Energy Detection Parameters
+
+ ```python
+ # Customize the RMS calculation window size and output interval
+ # frame_size_ms: convolution window size (default: 100ms)
+ # output_interval_ms: output sampling interval (default: 50ms)
+ speech_segments, gaps, rms_curve = detector.detect(
+     file_path="audio.mp3",
+     rms_frame_size_ms=100,  # 100ms window for RMS calculation
+     rms_output_interval_ms=50,  # output every 50ms for higher resolution
+ )
+
+ # The RMS curve can be used for audio visualization, energy analysis, or as input for other audio processing tasks
+ ```
+
+ ### Environment Variables for RMS Configuration
+
+ You can configure default RMS parameters using environment variables:
+
+ ```bash
+ # Set default frame size (default: 100ms)
+ export RMS_FRAME_SIZE_MS=100
+
+ # Set default output interval (default: 50ms)
+ export RMS_OUTPUT_INTERVAL_MS=50
+ ```
+
+ These environment variables serve as defaults when the `rms_frame_size_ms` and `rms_output_interval_ms` parameters are not explicitly passed to the `detect()` method.
+
+ ## RMS Energy Design Principles
+
+ `speech-detect` always emits an RMS energy curve together with VAD results. The feature is designed to stay lightweight while providing meaningful downstream insight:
+
+ - **Always-on streaming measurement**: RMS is computed chunk by chunk using a sliding, normalized window, so memory usage stays constant even for multi-hour recordings.
+ - **Decoupled smoothing vs. resolution**: `rms_frame_size_ms` controls the convolution window (how aggressively noise is smoothed), while `rms_output_interval_ms` controls the sampling density. Allowing the interval to be smaller than the window lets you plot dense, smooth curves without losing smoothing benefits (see the sketch after this list).
+ - **Deterministic timeline**: Both parameters are expressed in milliseconds and map directly to timestamps in the returned `rms_curve`, making it trivial to align energy data with speech segments, captions, or UI waveforms.
+ - **Downstream flexibility**: The curve can drive silence gating, highlight low-energy pauses, or simply power visual meters. Because it is always returned, callers can adopt it opportunistically without extra processing steps.
+
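+ As a concrete illustration of the window/interval split, here is a hypothetical, non-streaming version of the computation over normalized 16 kHz mono samples (the library itself processes chunk by chunk, but the relationship between the two parameters is the same):
+
+ ```python
+ import numpy as np
+
+ def rms_curve(samples, sample_rate=16000, frame_size_ms=100, output_interval_ms=50):
+     """Hypothetical offline sketch: frame_size_ms sets smoothing, output_interval_ms sets density."""
+     window = int(sample_rate * frame_size_ms / 1000)
+     step = int(sample_rate * output_interval_ms / 1000)
+     # Moving average of squared samples over a normalized window, then square root.
+     mean_square = np.convolve(samples ** 2, np.ones(window) / window, mode="same")
+     rms = np.sqrt(mean_square)
+     return [{"ms": int(i * 1000 / sample_rate), "value": float(rms[i])}
+             for i in range(0, len(rms), step)]
+ ```
+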
+ ## API Reference
+
+ ### SpeechDetector
+
+ Main class for speech detection. All methods are instance methods.
+
+ #### `SpeechDetector.__init__(model_dir=None)`
+
+ Initialize the speech detector.
+
+ **Parameters:**
+
+ - `model_dir` (str, optional): Path to the FSMN-VAD model directory. If None, reads from the `MODEL_FSMN_VAD_DIR` environment variable.
+
+ **Note:** The FSMN-VAD model is only distributed as a quantized version, so `quantize=True` is always used internally.
+
+ **Raises:**
+
+ - `VadModelNotFoundError`: If the model directory is not found or not set
+ - `VadModelInitializationError`: If model initialization fails
+
+ #### `SpeechDetector.detect(file_path, chunk_duration_sec=None, start_ms=None, duration_ms=None, merge_gap_threshold_ms=None, rms_frame_size_ms=None, rms_output_interval_ms=None)`
+
+ Detect speech segments, non-speech gaps, and the RMS energy curve in an audio/video file using streaming processing.
+
+ **Parameters:**
+
+ - `file_path` (str): Path to the audio/video file (supports all FFmpeg formats)
+ - `chunk_duration_sec` (int, optional): Duration of each chunk in seconds. Defaults to 1200 (20 minutes). Must be > 0 if provided.
+ - `start_ms` (int, optional): Start position in milliseconds. None means from the file beginning. If None but `duration_ms` is provided, defaults to 0.
+ - `duration_ms` (int, optional): Total duration to process in milliseconds. None means process until the end. If specified, processing stops when this duration is reached.
+ - `merge_gap_threshold_ms` (int, optional): Gap threshold in milliseconds. Adjacent speech segments separated by gaps smaller than this threshold are merged into a single segment. None (the default) disables merging. If <= 0, a warning is logged and merging is disabled. Useful for handling brief pauses in speech (e.g., breathing, thinking pauses) that should be treated as part of continuous speech.
+ - `rms_frame_size_ms` (int, optional): Convolution window size in milliseconds for the RMS calculation. Defaults to 100ms (can be overridden by the `RMS_FRAME_SIZE_MS` environment variable). If <= 0, a warning is logged and the default value (100ms) is used.
+ - `rms_output_interval_ms` (int, optional): Output sampling interval in milliseconds for the RMS curve. Defaults to 50ms (can be overridden by the `RMS_OUTPUT_INTERVAL_MS` environment variable). If <= 0, a warning is logged and the default value (50ms) is used. If > `rms_frame_size_ms`, it is adjusted down to `rms_frame_size_ms`.
+
+ **Returns:**
+
+ - `tuple[list[VadSegment], list[VadSegment], list[RMSPoint]]`: Tuple of (speech_segments, gaps, rms_curve)
+   - `speech_segments`: List of speech segments, format: `[{"start": ms, "end": ms}, ...]`
+     - Timestamps are relative to the audio start (from 0)
+     - Unit: milliseconds
+   - `gaps`: List of non-speech gaps, format: `[{"start": ms, "end": ms}, ...]`
+     - Timestamps are relative to the audio start (from 0)
+     - Unit: milliseconds
+   - `rms_curve`: RMS energy curve data, always computed and returned, format: `[{"ms": int, "value": float}, ...]`
+     - `ms`: Time position in milliseconds (relative to the audio start, from 0)
+     - `value`: RMS energy value (float, typically in the range [0.0, 1.0])
+     - Unit: milliseconds for time, dimensionless for value
+
+ **Raises:**
+
+ - `VadProcessingError`: If processing fails
+
+ ## Data Types
+
+ ### VadSegment
+
+ A TypedDict representing a time segment (either a speech segment or a non-speech gap).
+
+ **Fields:**
+
+ - `start` (int): Segment start time in milliseconds
+ - `end` (int): Segment end time in milliseconds
+
+ **Example:**
+
+ ```python
+ segment: VadSegment = {"start": 100, "end": 500}
+ ```
+
+ ### RMSPoint
+
+ A TypedDict representing a point on the RMS energy curve.
+
+ **Fields:**
+
+ - `ms` (int): Time position in milliseconds (relative to the audio start, from 0)
+ - `value` (float): RMS energy value (typically in the range [0.0, 1.0])
+
+ **Example:**
+
+ ```python
+ point: RMSPoint = {"ms": 100, "value": 0.123}
+ ```
+
+ ## Exceptions
+
+ ### `VadModelNotFoundError`
+
+ Raised when the VAD model directory is not found or not set.
+
+ **Attributes:**
+
+ - `message`: Human-readable error message
+
+ ### `VadModelInitializationError`
+
+ Raised when VAD model initialization fails.
+
+ **Attributes:**
+
+ - `message`: Primary error message
+ - `model_dir`: Path to the model directory that caused the error
+
+ ### `VadProcessingError`
+
+ Raised when VAD processing fails.
+
+ **Attributes:**
+
+ - `message`: Primary error message
+ - `file_path`: Path to the file being processed
+ - `details`: Additional error details dictionary
+
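+ An illustrative way to handle these exceptions (assuming the exception classes are importable from the package root; adjust the import path if they live elsewhere):
+
+ ```python
+ # Import paths are assumed; the package may expose these classes differently.
+ from speech_detect import (
+     SpeechDetector,
+     VadModelNotFoundError,
+     VadModelInitializationError,
+     VadProcessingError,
+ )
+
+ try:
+     detector = SpeechDetector()
+     speech_segments, gaps, rms_curve = detector.detect("audio.mp3")
+ except VadModelNotFoundError as e:
+     print(f"Model directory missing: {e.message}")
+ except VadModelInitializationError as e:
+     print(f"Model init failed in {e.model_dir}: {e.message}")
+ except VadProcessingError as e:
+     print(f"Processing failed for {e.file_path}: {e.message} ({e.details})")
+ ```
+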
+ ## Requirements
+
+ - Python >= 3.10
+ - FFmpeg (must be installed separately)
+ - numpy >= 1.26.4
+ - scipy >= 1.11.0 (for RMS energy calculation)
+ - funasr-onnx >= 0.4.1
+ - ffmpeg-audio >= 0.3.0
+ - jieba >= 0.42.1
+ - torch >= 2.9.1
+ - setuptools == 80.8.0 (to avoid a UserWarning from jieba about the deprecated pkg_resources API)
+
+ ## License
+
+ MIT License
@@ -0,0 +1,309 @@
+ # speech-detect
+
+ A Python library for detecting speech segments, non-speech gaps, and RMS energy curves in audio/video files using FSMN-VAD-ONNX with streaming processing.
+
+ ## Features
+
+ ### Core Functionality
+
+ - **Speech segment detection**: Detect all speech segments in audio/video files with precise timestamps
+ - **Non-speech gap derivation**: Automatically compute non-speech gaps (silence periods) from detected speech segments
+ - **RMS energy detection**: Compute an RMS (Root Mean Square) energy curve for audio analysis and visualization
+
+ ### Advanced Features
+
+ - **Adjacent segment merging**: Merge adjacent speech segments separated by gaps smaller than a threshold (useful for handling brief pauses such as breathing or thinking pauses)
+
+ ### Technical Capabilities
+
+ - **Streaming processing**: Process large audio/video files in chunks without loading everything into memory
+ - **Memory efficient**: Constant memory usage regardless of audio file duration
+ - **Format support**: Handles any audio/video format that FFmpeg can decode (MP3, WAV, FLAC, Opus, MP4, AVI, etc.)
+ - **Time range support**: Accepts start-time and duration parameters for partial processing
+
+ ## Installation
+
+ ```bash
+ pip install speech-detect
+ ```
+
+ **Note**: This package requires:
+
+ - FFmpeg to be installed on your system and available in PATH
+ - FSMN-VAD-ONNX model files (see Model Setup below)
+
+ ## Model Setup
+
+ This package requires FSMN-VAD-ONNX model files. The model is available on Hugging Face:
+
+ **Model Repository**: [funasr/fsmn-vad-onnx](https://huggingface.co/funasr/fsmn-vad-onnx)
+
+ ### Download the Model
+
+ 1. Install Git LFS (required for downloading large model files):
+
+ ```bash
+ git lfs install
+ ```
+
+ 2. Clone the model repository:
+
+ ```bash
+ git clone https://huggingface.co/funasr/fsmn-vad-onnx
+ ```
+
+ This downloads the model files, including `model_quant.onnx`, `config.yaml`, `am.mvn`, etc.
+
+ 3. Set the `MODEL_FSMN_VAD_DIR` environment variable to point to the model directory:
+
+ ```bash
+ export MODEL_FSMN_VAD_DIR=/path/to/fsmn-vad-onnx
+ ```
+
+ Alternatively, you can specify the model directory when initializing `SpeechDetector`:
+
+ ```python
+ from speech_detect import SpeechDetector
+
+ detector = SpeechDetector(model_dir="/path/to/fsmn-vad-onnx")
+ ```
+
+ ## Quick Start
+
+ ### Detect Speech Segments, Gaps, and RMS Energy Curve
+
+ ```python
+ from speech_detect import SpeechDetector
+
+ # Initialize detector (reads MODEL_FSMN_VAD_DIR from environment)
+ detector = SpeechDetector()
+
+ # Detect speech segments, non-speech gaps, and the RMS energy curve in an audio file
+ speech_segments, gaps, rms_curve = detector.detect("audio.mp3")
+
+ # speech_segments is a list of dictionaries: [{"start": 0, "end": 500}, ...]
+ for segment in speech_segments:
+     start_ms = segment["start"]
+     end_ms = segment["end"]
+     duration = end_ms - start_ms
+     print(f"Speech segment: {start_ms}ms - {end_ms}ms (duration: {duration}ms)")
+
+ # gaps is a list of dictionaries: [{"start": 0, "end": 500}, ...]
+ for gap in gaps:
+     start_ms = gap["start"]
+     end_ms = gap["end"]
+     duration = end_ms - start_ms
+     print(f"Non-speech gap: {start_ms}ms - {end_ms}ms (duration: {duration}ms)")
+
+ # rms_curve is a list of dictionaries: [{"ms": 0, "value": 0.123}, ...]
+ for point in rms_curve:
+     time_ms = point["ms"]
+     rms_value = point["value"]
+     print(f"RMS at {time_ms}ms: {rms_value}")
+ ```
+
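+ The returned `gaps` are simply the complement of `speech_segments` over the processed range. For intuition, here is a minimal sketch of that derivation (illustrative only, not the library's internal code), assuming the segments are sorted and non-overlapping and the total duration `total_ms` is known:
+
+ ```python
+ def derive_gaps(speech_segments, total_ms):
+     """Illustrative sketch: gaps as the complement of sorted, non-overlapping segments."""
+     gaps = []
+     cursor = 0
+     for seg in speech_segments:
+         if seg["start"] > cursor:  # silence before this segment starts
+             gaps.append({"start": cursor, "end": seg["start"]})
+         cursor = max(cursor, seg["end"])
+     if cursor < total_ms:  # trailing silence after the last segment
+         gaps.append({"start": cursor, "end": total_ms})
+     return gaps
+ ```
+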
+ ### Processing a Specific Time Range
+
+ ```python
+ # Process only the first 30 seconds
+ speech_segments, gaps, rms_curve = detector.detect(
+     file_path="audio.mp3",
+     start_ms=0,
+     duration_ms=30000,
+ )
+
+ # Process from 10 seconds, for a duration of 5 seconds
+ speech_segments, gaps, rms_curve = detector.detect(
+     file_path="audio.mp3",
+     start_ms=10000,
+     duration_ms=5000,
+ )
+ ```
+
+ ### Custom Chunk Size
+
+ ```python
+ # Use 1-minute chunks instead of the default 20-minute chunks
+ speech_segments, gaps, rms_curve = detector.detect(
+     file_path="audio.mp3",
+     chunk_duration_sec=60,
+ )
+ ```
+
+ ### Merging Adjacent Segments
+
+ ```python
+ # Merge adjacent segments with gaps smaller than 300ms
+ # Useful for handling brief pauses in speech (breathing, thinking pauses)
+ speech_segments, gaps, rms_curve = detector.detect(
+     file_path="audio.mp3",
+     merge_gap_threshold_ms=300,
+ )
+ ```
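+
+ For intuition, the merging behavior is equivalent to the following sketch (a simplified stand-in, not the package's actual implementation): a segment whose gap to the previous one is below the threshold is fused into it.
+
+ ```python
+ def merge_adjacent(segments, threshold_ms):
+     """Illustrative stand-in for merge_gap_threshold_ms: fuse segments separated by short gaps."""
+     if not segments:
+         return []
+     merged = [dict(segments[0])]
+     for seg in segments[1:]:
+         if seg["start"] - merged[-1]["end"] < threshold_ms:
+             merged[-1]["end"] = max(merged[-1]["end"], seg["end"])  # absorb the short gap
+         else:
+             merged.append(dict(seg))
+     return merged
+ ```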
+
+ ### Custom RMS Energy Detection Parameters
+
+ ```python
+ # Customize the RMS calculation window size and output interval
+ # frame_size_ms: convolution window size (default: 100ms)
+ # output_interval_ms: output sampling interval (default: 50ms)
+ speech_segments, gaps, rms_curve = detector.detect(
+     file_path="audio.mp3",
+     rms_frame_size_ms=100,  # 100ms window for RMS calculation
+     rms_output_interval_ms=50,  # output every 50ms for higher resolution
+ )
+
+ # The RMS curve can be used for audio visualization, energy analysis, or as input for other audio processing tasks
+ ```
+
+ ### Environment Variables for RMS Configuration
+
+ You can configure default RMS parameters using environment variables:
+
+ ```bash
+ # Set default frame size (default: 100ms)
+ export RMS_FRAME_SIZE_MS=100
+
+ # Set default output interval (default: 50ms)
+ export RMS_OUTPUT_INTERVAL_MS=50
+ ```
+
+ These environment variables serve as defaults when the `rms_frame_size_ms` and `rms_output_interval_ms` parameters are not explicitly passed to the `detect()` method.
+
+ ## RMS Energy Design Principles
+
+ `speech-detect` always emits an RMS energy curve together with VAD results. The feature is designed to stay lightweight while providing meaningful downstream insight:
+
+ - **Always-on streaming measurement**: RMS is computed chunk by chunk using a sliding, normalized window, so memory usage stays constant even for multi-hour recordings.
+ - **Decoupled smoothing vs. resolution**: `rms_frame_size_ms` controls the convolution window (how aggressively noise is smoothed), while `rms_output_interval_ms` controls the sampling density. Allowing the interval to be smaller than the window lets you plot dense, smooth curves without losing smoothing benefits (see the sketch after this list).
+ - **Deterministic timeline**: Both parameters are expressed in milliseconds and map directly to timestamps in the returned `rms_curve`, making it trivial to align energy data with speech segments, captions, or UI waveforms.
+ - **Downstream flexibility**: The curve can drive silence gating, highlight low-energy pauses, or simply power visual meters. Because it is always returned, callers can adopt it opportunistically without extra processing steps.
+
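+ As a concrete illustration of the window/interval split, here is a hypothetical, non-streaming version of the computation over normalized 16 kHz mono samples (the library itself processes chunk by chunk, but the relationship between the two parameters is the same):
+
+ ```python
+ import numpy as np
+
+ def rms_curve(samples, sample_rate=16000, frame_size_ms=100, output_interval_ms=50):
+     """Hypothetical offline sketch: frame_size_ms sets smoothing, output_interval_ms sets density."""
+     window = int(sample_rate * frame_size_ms / 1000)
+     step = int(sample_rate * output_interval_ms / 1000)
+     # Moving average of squared samples over a normalized window, then square root.
+     mean_square = np.convolve(samples ** 2, np.ones(window) / window, mode="same")
+     rms = np.sqrt(mean_square)
+     return [{"ms": int(i * 1000 / sample_rate), "value": float(rms[i])}
+             for i in range(0, len(rms), step)]
+ ```
+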
+ ## API Reference
+
+ ### SpeechDetector
+
+ Main class for speech detection. All methods are instance methods.
+
+ #### `SpeechDetector.__init__(model_dir=None)`
+
+ Initialize the speech detector.
+
+ **Parameters:**
+
+ - `model_dir` (str, optional): Path to the FSMN-VAD model directory. If None, reads from the `MODEL_FSMN_VAD_DIR` environment variable.
+
+ **Note:** The FSMN-VAD model is only distributed as a quantized version, so `quantize=True` is always used internally.
+
+ **Raises:**
+
+ - `VadModelNotFoundError`: If the model directory is not found or not set
+ - `VadModelInitializationError`: If model initialization fails
+
+ #### `SpeechDetector.detect(file_path, chunk_duration_sec=None, start_ms=None, duration_ms=None, merge_gap_threshold_ms=None, rms_frame_size_ms=None, rms_output_interval_ms=None)`
+
+ Detect speech segments, non-speech gaps, and the RMS energy curve in an audio/video file using streaming processing.
+
+ **Parameters:**
+
+ - `file_path` (str): Path to the audio/video file (supports all FFmpeg formats)
+ - `chunk_duration_sec` (int, optional): Duration of each chunk in seconds. Defaults to 1200 (20 minutes). Must be > 0 if provided.
+ - `start_ms` (int, optional): Start position in milliseconds. None means from the file beginning. If None but `duration_ms` is provided, defaults to 0.
+ - `duration_ms` (int, optional): Total duration to process in milliseconds. None means process until the end. If specified, processing stops when this duration is reached.
+ - `merge_gap_threshold_ms` (int, optional): Gap threshold in milliseconds. Adjacent speech segments separated by gaps smaller than this threshold are merged into a single segment. None (the default) disables merging. If <= 0, a warning is logged and merging is disabled. Useful for handling brief pauses in speech (e.g., breathing, thinking pauses) that should be treated as part of continuous speech.
+ - `rms_frame_size_ms` (int, optional): Convolution window size in milliseconds for the RMS calculation. Defaults to 100ms (can be overridden by the `RMS_FRAME_SIZE_MS` environment variable). If <= 0, a warning is logged and the default value (100ms) is used.
+ - `rms_output_interval_ms` (int, optional): Output sampling interval in milliseconds for the RMS curve. Defaults to 50ms (can be overridden by the `RMS_OUTPUT_INTERVAL_MS` environment variable). If <= 0, a warning is logged and the default value (50ms) is used. If > `rms_frame_size_ms`, it is adjusted down to `rms_frame_size_ms`.
+
+ **Returns:**
+
+ - `tuple[list[VadSegment], list[VadSegment], list[RMSPoint]]`: Tuple of (speech_segments, gaps, rms_curve)
+   - `speech_segments`: List of speech segments, format: `[{"start": ms, "end": ms}, ...]`
+     - Timestamps are relative to the audio start (from 0)
+     - Unit: milliseconds
+   - `gaps`: List of non-speech gaps, format: `[{"start": ms, "end": ms}, ...]`
+     - Timestamps are relative to the audio start (from 0)
+     - Unit: milliseconds
+   - `rms_curve`: RMS energy curve data, always computed and returned, format: `[{"ms": int, "value": float}, ...]`
+     - `ms`: Time position in milliseconds (relative to the audio start, from 0)
+     - `value`: RMS energy value (float, typically in the range [0.0, 1.0])
+     - Unit: milliseconds for time, dimensionless for value
+
+ **Raises:**
+
+ - `VadProcessingError`: If processing fails
+
+ ## Data Types
+
+ ### VadSegment
+
+ A TypedDict representing a time segment (either a speech segment or a non-speech gap).
+
+ **Fields:**
+
+ - `start` (int): Segment start time in milliseconds
+ - `end` (int): Segment end time in milliseconds
+
+ **Example:**
+
+ ```python
+ segment: VadSegment = {"start": 100, "end": 500}
+ ```
+
+ ### RMSPoint
+
+ A TypedDict representing a point on the RMS energy curve.
+
+ **Fields:**
+
+ - `ms` (int): Time position in milliseconds (relative to the audio start, from 0)
+ - `value` (float): RMS energy value (typically in the range [0.0, 1.0])
+
+ **Example:**
+
+ ```python
+ point: RMSPoint = {"ms": 100, "value": 0.123}
+ ```
+
+ ## Exceptions
+
+ ### `VadModelNotFoundError`
+
+ Raised when the VAD model directory is not found or not set.
+
+ **Attributes:**
+
+ - `message`: Human-readable error message
+
+ ### `VadModelInitializationError`
+
+ Raised when VAD model initialization fails.
+
+ **Attributes:**
+
+ - `message`: Primary error message
+ - `model_dir`: Path to the model directory that caused the error
+
+ ### `VadProcessingError`
+
+ Raised when VAD processing fails.
+
+ **Attributes:**
+
+ - `message`: Primary error message
+ - `file_path`: Path to the file being processed
+ - `details`: Additional error details dictionary
+
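+ An illustrative way to handle these exceptions (assuming the exception classes are importable from the package root; adjust the import path if they live elsewhere):
+
+ ```python
+ # Import paths are assumed; the package may expose these classes differently.
+ from speech_detect import (
+     SpeechDetector,
+     VadModelNotFoundError,
+     VadModelInitializationError,
+     VadProcessingError,
+ )
+
+ try:
+     detector = SpeechDetector()
+     speech_segments, gaps, rms_curve = detector.detect("audio.mp3")
+ except VadModelNotFoundError as e:
+     print(f"Model directory missing: {e.message}")
+ except VadModelInitializationError as e:
+     print(f"Model init failed in {e.model_dir}: {e.message}")
+ except VadProcessingError as e:
+     print(f"Processing failed for {e.file_path}: {e.message} ({e.details})")
+ ```
+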
+ ## Requirements
+
+ - Python >= 3.10
+ - FFmpeg (must be installed separately)
+ - numpy >= 1.26.4
+ - scipy >= 1.11.0 (for RMS energy calculation)
+ - funasr-onnx >= 0.4.1
+ - ffmpeg-audio >= 0.3.0
+ - jieba >= 0.42.1
+ - torch >= 2.9.1
+ - setuptools == 80.8.0 (to avoid a UserWarning from jieba about the deprecated pkg_resources API)
+
+ ## License
+
+ MIT License
@@ -0,0 +1,27 @@
+ [build-system]
+ requires = ["setuptools==80.8.0", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "speech-detect"
+ version = "0.2.4"
+ description = "A Python library for detecting speech segments and non-speech gaps in audio/video files using FSMN-VAD-ONNX with streaming processing"
+ readme = "README.md"
+ requires-python = ">=3.10"
+ license = {text = "MIT"}
+ dependencies = [
+     "numpy>=1.26.4",
+     "funasr-onnx>=0.4.1",
+     "ffmpeg-audio>=0.3.0",
+     "jieba>=0.42.1",
+     "torch>=2.9.1",
+ ]
+
+ [project.urls]
+ Homepage = "https://github.com/speech2srt/speech-detect"
+ Repository = "https://github.com/speech2srt/speech-detect"
+ Issues = "https://github.com/speech2srt/speech-detect/issues"
+
+ [tool.setuptools]
+ package-dir = {"speech_detect" = "src"}
+ packages = ["speech_detect"]