speech-prep 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- speech_prep/__init__.py +34 -0
- speech_prep/core.py +203 -0
- speech_prep/detection.py +116 -0
- speech_prep/exceptions.py +49 -0
- speech_prep/processing.py +185 -0
- speech_prep/utils.py +130 -0
- speech_prep-0.1.3.dist-info/METADATA +220 -0
- speech_prep-0.1.3.dist-info/RECORD +10 -0
- speech_prep-0.1.3.dist-info/WHEEL +4 -0
- speech_prep-0.1.3.dist-info/licenses/LICENSE +21 -0
speech_prep/__init__.py
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
"""
|
2
|
+
Speech Prep - Audio preprocessing toolkit for speech-to-text applications.
|
3
|
+
|
4
|
+
This package provides tools to prepare audio files for speech-to-text processing,
|
5
|
+
including silence detection and removal, speed adjustment, and format conversion.
|
6
|
+
"""
|
7
|
+
|
8
|
+
from .core import SoundFile
|
9
|
+
from .exceptions import (
|
10
|
+
AudioPropertiesError,
|
11
|
+
FFmpegError,
|
12
|
+
FileValidationError,
|
13
|
+
SilenceDetectionError,
|
14
|
+
SpeechPrepError,
|
15
|
+
)
|
16
|
+
|
17
|
+
# Import version from hatch-vcs
|
18
|
+
try:
    from importlib.metadata import version as get_metadata_version

    # Ask the installed distribution metadata for the released version string.
    __version__ = get_metadata_version("speech-prep")
except ImportError:
    # PackageNotFoundError derives from ImportError, so this branch also
    # covers a source checkout where the distribution is not installed.
    __version__ = "0.0.0"
|
25
|
+
|
26
|
+
__all__ = [
|
27
|
+
"SoundFile",
|
28
|
+
"SpeechPrepError",
|
29
|
+
"FFmpegError",
|
30
|
+
"FileValidationError",
|
31
|
+
"AudioPropertiesError",
|
32
|
+
"SilenceDetectionError",
|
33
|
+
"__version__",
|
34
|
+
]
|
speech_prep/core.py
ADDED
@@ -0,0 +1,203 @@
|
|
1
|
+
"""Core SoundFile class for audio file manipulation."""
|
2
|
+
|
3
|
+
import logging
|
4
|
+
from pathlib import Path
|
5
|
+
from typing import Optional
|
6
|
+
|
7
|
+
from .detection import calculate_median_silence, detect_silence
|
8
|
+
from .exceptions import SpeechPrepError
|
9
|
+
from .processing import adjust_speed, convert_format, strip_silence
|
10
|
+
from .utils import format_time, get_audio_properties
|
11
|
+
|
12
|
+
# Configure package logger
|
13
|
+
logger = logging.getLogger("speech_prep")
|
14
|
+
if not logger.hasHandlers():
|
15
|
+
handler = logging.StreamHandler()
|
16
|
+
formatter = logging.Formatter("[%(levelname)s] %(name)s: %(message)s")
|
17
|
+
handler.setFormatter(formatter)
|
18
|
+
logger.addHandler(handler)
|
19
|
+
logger.setLevel(logging.WARNING)
|
20
|
+
|
21
|
+
|
22
|
+
class SoundFile:
    """Represents an audio file with silence detection and processing capabilities.

    Instances are compared and hashed by ``path`` only. The constructor probes
    the file and runs silence detection, so creating an instance invokes the
    external tools used by ``get_audio_properties`` and ``detect_silence``.
    """

    def __init__(
        self,
        path: Path,
        noise_threshold_db: int = -30,
        min_silence_duration: float = 0.5,
    ) -> None:
        """
        Initialize a SoundFile object.

        Args:
            path: Path to the audio file
            noise_threshold_db: Threshold (in dB) for silence detection
            min_silence_duration: Minimum duration (in seconds) to consider as silence

        Raises:
            FileNotFoundError: If ``path`` does not exist
            SpeechPrepError: If duration, size or format cannot be extracted
        """
        self.path = Path(path)
        if not self.path.exists():
            raise FileNotFoundError(f"Audio file not found: {self.path}")

        try:
            self.duration, self.file_size, self.format = get_audio_properties(self.path)
        except Exception as e:
            raise SpeechPrepError(f"Failed to extract metadata: {e}") from e

        try:
            self.silence_periods = detect_silence(
                self.path,
                noise_threshold_db=noise_threshold_db,
                min_silence_duration=min_silence_duration,
            )
        except Exception as e:
            # Silence detection stays best-effort, but the failure is reported
            # so that "no silence" and "detection failed" can be told apart.
            logger.warning("Silence detection failed for %s: %s", self.path, e)
            self.silence_periods = []

        if self.silence_periods:
            self.median_silence = calculate_median_silence(self.silence_periods)
        else:
            self.median_silence = 0.0

        self.noise_threshold_db = noise_threshold_db
        self.min_silence_duration = min_silence_duration

    def __eq__(self, other: object) -> bool:
        """Two files are equal if they reference the same path."""
        if not isinstance(other, SoundFile):
            # Let Python try the reflected comparison; the final result for
            # unrelated types is still False.
            return NotImplemented
        return self.path == other.path

    def __hash__(self) -> int:
        """Hash by path so that equal instances hash equally."""
        # Defining __eq__ alone sets __hash__ to None and makes instances
        # unusable in sets and as dict keys.
        return hash(self.path)

    def __str__(self) -> str:
        """
        Return a string representation of the SoundFile object.

        Displays a summary of audio properties and the first/last three silence periods.
        """
        basic_info = [
            f"SoundFile: {self.path}",
            f" Duration: {self.duration:.2f} seconds ({format_time(self.duration)})",
            f" Format: {self.format}",
            f" File size: {self.file_size / 1024 / 1024:.2f} MB",
            f" Silence periods: {len(self.silence_periods)} detected",
            f" Median silence: {self.median_silence:.2f} seconds",
        ]

        silence_info = [" Silence periods:"]
        if self.silence_periods:
            total_periods = len(self.silence_periods)
            # Show up to three periods from each end of the list.
            to_show = min(3, total_periods)

            def describe(index: int) -> str:
                start, end, duration = self.silence_periods[index]
                return (
                    f" {index + 1}: {start:.2f}s - {end:.2f}s ({duration:.2f}s) "
                    f"[{format_time(start)} - {format_time(end)}]"
                )

            silence_info.extend(describe(i) for i in range(to_show))

            # Ellipsis only when some periods are shown at neither end.
            if total_periods > 2 * to_show:
                silence_info.append(
                    f" ... {total_periods - 2 * to_show} more periods ..."
                )

            # Starting at max(...) prevents repeating a period already listed
            # in the head section; the range is empty for three or fewer.
            tail_start = max(to_show, total_periods - to_show)
            silence_info.extend(describe(i) for i in range(tail_start, total_periods))
        else:
            silence_info.append(" None detected")

        return "\n".join(basic_info + silence_info)

    def strip(
        self, output_path: Path, leading: bool = True, trailing: bool = True
    ) -> Optional["SoundFile"]:
        """
        Create a new audio file with leading and/or trailing silence removed.

        Args:
            output_path: Path to save the new file.
            leading: Whether to remove leading silence
            trailing: Whether to remove trailing silence

        Returns:
            A new SoundFile instance for the created file, or None if operation
            failed. When no silence periods were detected, nothing is written
            and this instance itself is returned.
        """
        if not self.silence_periods:
            logger.info(
                "No silence periods detected in %s, nothing to strip.", self.path
            )
            return self
        try:
            strip_silence(
                self.path,
                output_path,
                self.silence_periods,
                self.duration,
                leading,
                trailing,
            )
            return SoundFile(
                output_path, self.noise_threshold_db, self.min_silence_duration
            )
        except SpeechPrepError as e:
            logger.error("Error during strip: %s", e)
            return None

    def convert(
        self, output_path: Path, audio_bitrate: Optional[str] = None
    ) -> Optional["SoundFile"]:
        """
        Convert the audio file to a different format.

        Args:
            output_path: Path to save the converted file
            audio_bitrate: Optional bitrate for the output file (e.g., '192k', '320k')

        Returns:
            A new SoundFile instance for the converted file, or None if operation failed
        """
        try:
            convert_format(self.path, output_path, audio_bitrate)
            return SoundFile(
                output_path, self.noise_threshold_db, self.min_silence_duration
            )
        except SpeechPrepError as e:
            logger.error("Error during convert: %s", e)
            return None

    def speed(self, output_path: Path, speed_factor: float) -> Optional["SoundFile"]:
        """
        Create a new audio file with adjusted playback speed.

        Args:
            output_path: Path to save the new file.
            speed_factor: Speed multiplier (e.g., 2.0 for 2x speed, 0.5 for half speed)

        Returns:
            A new SoundFile instance for the created file, or None if operation failed
        """
        try:
            adjust_speed(self.path, output_path, speed_factor)
            # Pauses shrink (or grow) with the tempo change, so the minimum
            # silence duration for the new file is scaled by the same factor.
            adjusted_threshold = self.min_silence_duration / speed_factor
            logger.info(
                "Silence threshold: %.2fs for sped-up file", adjusted_threshold
            )
            return SoundFile(output_path, self.noise_threshold_db, adjusted_threshold)
        except SpeechPrepError as e:
            logger.error("Error during speed: %s", e)
            return None
|
speech_prep/detection.py
ADDED
@@ -0,0 +1,116 @@
|
|
1
|
+
"""Silence detection functionality for audio files."""
|
2
|
+
|
3
|
+
from pathlib import Path
|
4
|
+
import re
|
5
|
+
import subprocess
|
6
|
+
from typing import Optional
|
7
|
+
|
8
|
+
from .exceptions import SilenceDetectionError
|
9
|
+
|
10
|
+
|
11
|
+
def detect_silence(
    file_path: Path, noise_threshold_db: int, min_silence_duration: float
) -> list[tuple[float, float, float]]:
    """
    Run ffmpeg's silencedetect filter over a file and return the silent spans.

    Args:
        file_path: Path to the audio file
        noise_threshold_db: Level (in dB) below which audio counts as silence
        min_silence_duration: Shortest span (in seconds) reported as silence

    Returns:
        List of silence periods as (start, end, duration) tuples

    Raises:
        SilenceDetectionError: If ffmpeg is missing or exits with an error
    """
    detect_filter = (
        f"silencedetect=noise={noise_threshold_db}dB:d={min_silence_duration}"
    )
    # "-f null -" decodes the input without writing any output file.
    command = ["ffmpeg", "-i", str(file_path), "-af", detect_filter, "-f", "null", "-"]

    try:
        completed = subprocess.run(
            command, capture_output=True, text=True, check=True
        )
    except FileNotFoundError as exc:
        raise SilenceDetectionError(
            "ffmpeg not found. Please ensure ffmpeg is installed and accessible."
        ) from exc
    except subprocess.CalledProcessError as exc:
        raise SilenceDetectionError(
            f"Error detecting silence in file {file_path}: {exc.stderr}"
        ) from exc

    # The filter's report is read from the captured stderr stream.
    return parse_silence_output(completed.stderr)
|
56
|
+
|
57
|
+
|
58
|
+
def parse_silence_output(silence_output: str) -> list[tuple[float, float, float]]:
    """
    Parse the ffmpeg silence detection output to extract silence periods.

    Each ``silence_start`` line is paired with the next ``silence_end`` line.
    A ``silence_end`` without a preceding start is ignored. Timestamps are
    returned exactly as ffmpeg printed them, so a start may be slightly
    negative when the stream's first timestamp is offset.

    Args:
        silence_output: stderr output from ffmpeg silence detection

    Returns:
        List of silence periods as (start, end, duration) tuples
    """
    # ffmpeg prints these values as general floats: they may carry a sign
    # (e.g. "-0.00133333") or an exponent (e.g. "2.5e-05"). A digits-only
    # pattern would drop the former and truncate the latter.
    number = r"([-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)"
    start_pattern = re.compile(rf"silence_start: {number}")
    end_pattern = re.compile(
        rf"silence_end: {number} \| silence_duration: {number}"
    )

    silence_periods = []
    start_time: Optional[float] = None

    for line in silence_output.splitlines():
        if "silence_start" in line:
            match = start_pattern.search(line)
            if match:
                start_time = float(match.group(1))
        elif "silence_end" in line and start_time is not None:
            match = end_pattern.search(line)
            if match:
                end_time = float(match.group(1))
                silence_duration = float(match.group(2))
                silence_periods.append((start_time, end_time, silence_duration))
                # The pending start has been consumed.
                start_time = None

    return silence_periods
|
90
|
+
|
91
|
+
|
92
|
+
def calculate_median_silence(
    silence_periods: list[tuple[float, float, float]],
) -> float:
    """
    Return the median of the duration field across all silence periods.

    Args:
        silence_periods: List of silence periods as (start, end, duration) tuples

    Returns:
        Median silence duration in seconds, or 0.0 for an empty list
    """
    if not silence_periods:
        return 0.0

    ordered = sorted(duration for _start, _end, duration in silence_periods)
    middle, is_odd = divmod(len(ordered), 2)

    if is_odd:
        # Odd count: the single central value is the median.
        return ordered[middle]
    # Even count: average the two values around the centre.
    return (ordered[middle - 1] + ordered[middle]) / 2
|
@@ -0,0 +1,49 @@
|
|
1
|
+
"""Custom exceptions for the speech-prep package."""
|
2
|
+
|
3
|
+
from typing import Optional
|
4
|
+
|
5
|
+
|
6
|
+
class SpeechPrepError(Exception):
    """Root of the speech-prep exception hierarchy; catch this to handle any package error."""


class FFmpegError(SpeechPrepError):
    """Raised when an ffmpeg invocation fails or reports an error."""

    def __init__(
        self,
        message: str,
        stderr: Optional[str] = None,
        returncode: Optional[int] = None,
    ):
        """
        Store the message together with optional process diagnostics.

        Args:
            message: Error message
            stderr: Standard error output from ffmpeg
            returncode: Return code from ffmpeg process
        """
        super().__init__(message)
        # Kept as attributes so callers can inspect them; str(error) is only
        # the message.
        self.stderr = stderr
        self.returncode = returncode


class FileValidationError(SpeechPrepError):
    """Raised when a path fails file validation."""


class AudioPropertiesError(SpeechPrepError):
    """Raised when audio properties cannot be extracted."""


class SilenceDetectionError(SpeechPrepError):
    """Raised when silence detection fails."""
|
@@ -0,0 +1,185 @@
|
|
1
|
+
"""Audio processing operations for speech preparation."""
|
2
|
+
|
3
|
+
from pathlib import Path
|
4
|
+
import subprocess
|
5
|
+
from typing import Optional
|
6
|
+
|
7
|
+
from .exceptions import FFmpegError
|
8
|
+
|
9
|
+
|
10
|
+
def strip_silence(
    input_path: Path,
    output_path: Path,
    silence_periods: list[tuple[float, float, float]],
    total_duration: float,
    leading: bool = True,
    trailing: bool = True,
) -> None:
    """
    Create a new audio file with leading and/or trailing silence removed.

    Only the first silence period (when it begins at the start of the file)
    and the last one (when it reaches the end of the file) are considered;
    silence in the middle of the file is left untouched. The streams are
    copied without re-encoding.

    Args:
        input_path: Path to the input audio file
        output_path: Path to save the new file
        silence_periods: List of silence periods as (start, end, duration) tuples
        total_duration: Total duration of the audio file
        leading: Whether to remove leading silence
        trailing: Whether to remove trailing silence

    Raises:
        FFmpegError: If there are no silence periods, if no audio would remain
            after stripping, or if the ffmpeg operation fails
    """
    if not silence_periods:
        raise FFmpegError("No silence periods detected, nothing to strip")

    # Detected timestamps are floats and rarely land exactly on the file
    # boundaries, so both ends are matched with the same small tolerance.
    boundary_tolerance = 0.1

    start_time = 0.0
    end_time = total_duration

    if leading:
        first_silence = silence_periods[0]
        # An exact "== 0" test misses starts such as 0.02 or a slightly
        # negative value, leaving leading silence in place.
        if first_silence[0] < boundary_tolerance:
            start_time = first_silence[1]

    if trailing:
        last_silence = silence_periods[-1]
        if abs(last_silence[1] - total_duration) < boundary_tolerance:
            end_time = last_silence[0]

    # Happens when one silence period spans the whole file: the cut window
    # would be empty or inverted, so fail with a clear message up front.
    if end_time <= start_time:
        raise FFmpegError(
            "No audio would remain after stripping silence "
            f"(start {start_time:.2f}s, end {end_time:.2f}s)"
        )

    cmd = [
        "ffmpeg",
        "-y",  # Overwrite output file if it exists
        "-i",
        str(input_path),
        "-ss",
        str(start_time),  # Start time
        "-to",
        str(end_time),  # End time
        "-c",
        "copy",  # Copy streams without re-encoding
        str(output_path),
    ]

    print(f"Stripping silence: {start_time:.2f}s to {end_time:.2f}s")
    _run_ffmpeg_command(cmd, "stripping silence")
|
67
|
+
|
68
|
+
|
69
|
+
def convert_format(
    input_path: Path, output_path: Path, audio_bitrate: Optional[str] = None
) -> None:
    """
    Re-encode an audio file into the format given by the output path.

    No codec or container is passed explicitly, so ffmpeg is left to pick
    them for ``output_path``.

    Args:
        input_path: Path to the input audio file
        output_path: Path to save the converted file
        audio_bitrate: Optional bitrate for the output file (e.g., '192k', '320k')

    Raises:
        FFmpegError: If the ffmpeg operation fails
    """
    # The bitrate option is only passed when the caller asked for one.
    bitrate_args = ["-b:a", audio_bitrate] if audio_bitrate else []
    cmd = ["ffmpeg", "-y", "-i", str(input_path), *bitrate_args, str(output_path)]

    source_ext = input_path.suffix.lower().lstrip(".")
    target_ext = output_path.suffix.lower().lstrip(".")
    print(f"Converting {input_path.name} from {source_ext} to {target_ext}")

    _run_ffmpeg_command(cmd, "converting format")
|
98
|
+
|
99
|
+
|
100
|
+
def adjust_speed(input_path: Path, output_path: Path, speed_factor: float) -> None:
    """
    Create a new audio file with adjusted playback speed.

    The change is applied with ffmpeg's ``atempo`` filter. Factors outside
    the 0.5x-2.0x range are split into a chain of ``atempo`` stages whose
    product equals ``speed_factor``.

    Args:
        input_path: Path to the input audio file
        output_path: Path to save the speed-adjusted file
        speed_factor: Speed multiplier (e.g., 2.0 for 2x speed, 0.5 for half speed)

    Raises:
        FFmpegError: If the ffmpeg operation fails or speed_factor is invalid
    """
    import math  # local import keeps this function self-contained

    if speed_factor <= 0:
        raise FFmpegError("Speed factor must be positive")
    # inf would never leave the halving loop below, and nan would silently
    # fall through to a no-op filter, so both are rejected up front.
    if not math.isfinite(speed_factor):
        raise FFmpegError("Speed factor must be a finite number")

    atempo_filters = []
    remaining_factor = speed_factor

    # Peel off 2.0x stages until the remainder fits a single filter.
    while remaining_factor > 2.0:
        atempo_filters.append("atempo=2.0")
        remaining_factor /= 2.0

    # Same idea for slow-downs below 0.5x.
    while remaining_factor < 0.5:
        atempo_filters.append("atempo=0.5")
        remaining_factor /= 0.5

    # Skip the final stage when what is left is within 1% of no change.
    if abs(remaining_factor - 1.0) > 0.01:
        atempo_filters.append(f"atempo={remaining_factor}")

    filter_str = ",".join(atempo_filters) if atempo_filters else "atempo=1.0"

    # Pick an encoder the output container accepts. The added extensions are
    # presumably rejected with the MP3 fallback; unknown extensions keep it.
    codec_by_suffix = {
        ".mp3": "libmp3lame",
        ".wav": "pcm_s16le",
        ".flac": "flac",
        ".m4a": "aac",
        ".aac": "aac",
        ".ogg": "libvorbis",
        ".opus": "libopus",
    }
    codec = codec_by_suffix.get(output_path.suffix.lower(), "libmp3lame")

    cmd = [
        "ffmpeg",
        "-y",
        "-i",
        str(input_path),
        "-filter:a",
        filter_str,
        "-c:a",
        codec,
        str(output_path),
    ]

    print(f"Adjusting speed by factor {speed_factor}x using filter: {filter_str}")
    _run_ffmpeg_command(cmd, "adjusting speed")
|
162
|
+
|
163
|
+
|
164
|
+
def _run_ffmpeg_command(cmd: list[str], operation_name: str) -> None:
    """
    Execute an ffmpeg command line and translate failures into FFmpegError.

    Args:
        cmd: List of command arguments
        operation_name: Description of the operation for error messages

    Raises:
        FFmpegError: If the executable is missing or exits with an error
    """
    try:
        # Output is captured so a failure can carry ffmpeg's stderr.
        subprocess.run(cmd, capture_output=True, text=True, check=True)
    except FileNotFoundError as missing:
        raise FFmpegError(
            f"ffmpeg not found during {operation_name}. "
            "Please ensure ffmpeg is installed and accessible."
        ) from missing
    except subprocess.CalledProcessError as failure:
        raise FFmpegError(
            f"Error during {operation_name}",
            stderr=failure.stderr,
            returncode=failure.returncode,
        ) from failure
|
speech_prep/utils.py
ADDED
@@ -0,0 +1,130 @@
|
|
1
|
+
"""Utility functions for audio file operations."""
|
2
|
+
|
3
|
+
import json
|
4
|
+
from pathlib import Path
|
5
|
+
import subprocess
|
6
|
+
|
7
|
+
from .exceptions import AudioPropertiesError, FileValidationError
|
8
|
+
|
9
|
+
|
10
|
+
def validate_file(file_path: Path) -> bool:
    """
    Check that a path points at an existing regular file.

    Args:
        file_path: Path to the file to validate

    Returns:
        True if file is valid

    Raises:
        FileValidationError: If file doesn't exist or is not a regular file
    """
    if file_path.is_file():
        return True

    # Not a regular file: tell "something else is there" apart from "nothing there".
    if file_path.exists():
        raise FileValidationError(f"Path {file_path} is not a regular file")

    raise FileValidationError(f"File {file_path} does not exist")
|
30
|
+
|
31
|
+
|
32
|
+
def get_audio_properties(file_path: Path) -> tuple[float, int, str]:
    """
    Query ffprobe for a file's duration, size and container format name.

    Args:
        file_path: Path to the audio file

    Returns:
        Tuple of (duration, file_size, audio_format)

    Raises:
        AudioPropertiesError: If ffprobe is missing, fails, returns output that
            cannot be parsed, or reports a non-positive duration or size
    """
    command = [
        "ffprobe",
        "-v",
        "error",
        "-show_entries",
        "format=duration,size,format_name",
        "-of",
        "json",
        str(file_path),
    ]

    try:
        completed = subprocess.run(
            command, capture_output=True, text=True, check=True
        )
    except FileNotFoundError as exc:
        raise AudioPropertiesError(
            "ffprobe not found. Please ensure ffmpeg is installed and accessible."
        ) from exc
    except subprocess.CalledProcessError as exc:
        raise AudioPropertiesError(
            f"Error probing file {file_path}: {exc.stderr}"
        ) from exc

    try:
        container = json.loads(completed.stdout)["format"]
        duration = float(container["duration"])
        file_size = int(container["size"])
        # format_name can be a comma-separated list; keep the first entry.
        audio_format = container["format_name"].split(",")[0]
    except (KeyError, ValueError, json.JSONDecodeError) as exc:
        raise AudioPropertiesError(f"Error parsing probe data: {exc}") from exc

    if duration <= 0 or file_size <= 0:
        raise AudioPropertiesError(
            f"Invalid duration or file size for {file_path}. "
            f"Duration: {duration}, Size: {file_size}"
        )

    return duration, file_size, audio_format
|
88
|
+
|
89
|
+
|
90
|
+
def format_time(seconds: float) -> str:
    """
    Format seconds as HH:MM:SS.

    Fractional seconds are truncated toward zero. Hours are not capped at two
    digits, so long durations render as e.g. ``100:00:00``. Negative values
    are rendered as the formatted magnitude with a leading minus sign.

    Args:
        seconds: Time in seconds

    Returns:
        Formatted time string
    """
    whole_seconds = int(seconds)
    # divmod on a negative int borrows from the next unit (-5 would render as
    # "-1:59:55"), so the magnitude is formatted and the sign added back.
    sign = "-" if whole_seconds < 0 else ""
    hours, remainder = divmod(abs(whole_seconds), 3600)
    minutes, seconds_int = divmod(remainder, 60)
    return f"{sign}{hours:02}:{minutes:02}:{seconds_int:02}"
|
103
|
+
|
104
|
+
|
105
|
+
def run_ffmpeg_command(
    cmd: list[str], operation_name: str
) -> subprocess.CompletedProcess[str]:
    """
    Execute an ffmpeg command line and hand back the completed process.

    Args:
        cmd: List of command arguments
        operation_name: Description of the operation for error messages

    Returns:
        CompletedProcess result

    Raises:
        AudioPropertiesError: If the executable is missing or exits with an error
    """
    try:
        # stdout and stderr are captured as text on the returned object.
        return subprocess.run(cmd, capture_output=True, text=True, check=True)
    except FileNotFoundError as missing:
        raise AudioPropertiesError(
            "ffmpeg not found. Please ensure ffmpeg is installed and accessible."
        ) from missing
    except subprocess.CalledProcessError as failure:
        raise AudioPropertiesError(
            f"Error during {operation_name}: {failure.stderr}"
        ) from failure
|
@@ -0,0 +1,220 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: speech-prep
|
3
|
+
Version: 0.1.3
|
4
|
+
Summary: Audio preprocessing toolkit for speech-to-text applications using ffmpeg
|
5
|
+
Project-URL: Homepage, https://github.com/dimdasci/speech-prep
|
6
|
+
Project-URL: Repository, https://github.com/dimdasci/speech-prep
|
7
|
+
Project-URL: Issues, https://github.com/dimdasci/speech-prep/issues
|
8
|
+
Author-email: Dim Kharitonov <dimds@fastmail.com>
|
9
|
+
License: MIT
|
10
|
+
License-File: LICENSE
|
11
|
+
Keywords: audio,ffmpeg,preprocessing,silence-detection,speech-to-text
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
13
|
+
Classifier: Intended Audience :: Developers
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
20
|
+
Classifier: Topic :: Multimedia :: Sound/Audio
|
21
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
22
|
+
Requires-Python: >=3.9
|
23
|
+
Provides-Extra: dev
|
24
|
+
Description-Content-Type: text/markdown
|
25
|
+
|
26
|
+
# Speech Prep
|
27
|
+
|
28
|
+
Audio preprocessing toolkit for speech-to-text applications using FFmpeg.
|
29
|
+
|
30
|
+
## Overview
|
31
|
+
|
32
|
+
Speech Prep is a Python package designed to prepare audio files for speech-to-text processing. It provides tools for silence detection and removal, speed adjustment, and format conversion - all essential steps for optimizing audio before transcription.
|
33
|
+
|
34
|
+
## Features
|
35
|
+
|
36
|
+
- **Silence Detection**: Automatically detect silence periods in audio files
|
37
|
+
- **Silence Removal**: Remove leading/trailing silence to clean up recordings
|
38
|
+
- **Speed Adjustment**: Change playback speed while maintaining audio quality
|
39
|
+
- **Format Conversion**: Convert between different audio formats (MP3, WAV, FLAC, etc.)
|
40
|
+
- **Clean API**: Simple, intuitive interface with method chaining support
|
41
|
+
- **FFmpeg Integration**: Leverages the power and reliability of FFmpeg
|
42
|
+
|
43
|
+
## Requirements
|
44
|
+
|
45
|
+
- Python 3.9+
|
46
|
+
- FFmpeg (must be installed and accessible via PATH)
|
47
|
+
|
48
|
+
## Installation
|
49
|
+
|
50
|
+
```bash
|
51
|
+
# Install from PyPI
|
52
|
+
pip install speech-prep
|
53
|
+
|
54
|
+
# Or install from source
|
55
|
+
git clone https://github.com/dimdasci/speech-prep.git
|
56
|
+
cd speech-prep
|
57
|
+
uv sync # or pip install -e .
|
58
|
+
```
|
59
|
+
|
60
|
+
## Quick Start
|
61
|
+
|
62
|
+
```python
|
63
|
+
from speech_prep import SoundFile
|
64
|
+
from pathlib import Path
|
65
|
+
|
66
|
+
# Load an audio file
|
67
|
+
audio = SoundFile(Path("recording.wav"))
|
68
|
+
|
69
|
+
if audio:
|
70
|
+
print(f"Duration: {audio.duration:.2f} seconds")
|
71
|
+
print(f"Format: {audio.format}")
|
72
|
+
print(f"Silence periods detected: {len(audio.silence_periods)}")
|
73
|
+
|
74
|
+
# Clean up the audio for speech-to-text
|
75
|
+
cleaned = audio.strip(output_path=Path("recording_stripped.wav"))
|
76
|
+
faster = cleaned.speed(output_path=Path("recording_stripped_fast.wav"), speed_factor=1.2)
|
77
|
+
final = faster.convert(output_path=Path("clean.mp3"))
|
78
|
+
|
79
|
+
print(f"Processed file saved: {final.path}")
|
80
|
+
```
|
81
|
+
|
82
|
+
## Usage Examples
|
83
|
+
|
84
|
+
### Basic Operations
|
85
|
+
|
86
|
+
```python
|
87
|
+
from speech_prep import SoundFile
|
88
|
+
from pathlib import Path
|
89
|
+
|
90
|
+
# Load audio file
|
91
|
+
audio = SoundFile(Path("interview.wav"))
|
92
|
+
|
93
|
+
# View audio information
|
94
|
+
print(audio) # Shows duration, format, file size, and silence periods
|
95
|
+
|
96
|
+
# Remove silence from beginning and end
|
97
|
+
cleaned = audio.strip(output_path=Path("interview_stripped.wav"))
|
98
|
+
|
99
|
+
# Remove only leading silence
|
100
|
+
cleaned = audio.strip(output_path=Path("interview_leading.wav"), trailing=False)
|
101
|
+
|
102
|
+
# Speed up audio by 50%
|
103
|
+
faster = audio.speed(output_path=Path("interview_fast.wav"), speed_factor=1.5)
|
104
|
+
|
105
|
+
# Convert format
|
106
|
+
mp3_file = audio.convert(output_path=Path("output.mp3"))
|
107
|
+
```
|
108
|
+
|
109
|
+
### Processing Pipeline
|
110
|
+
|
111
|
+
```python
|
112
|
+
from speech_prep import SoundFile
|
113
|
+
from pathlib import Path
|
114
|
+
|
115
|
+
def prepare_for_transcription(input_file: Path, output_file: Path):
|
116
|
+
"""Prepare audio file for speech-to-text processing."""
|
117
|
+
# Load the original file
|
118
|
+
audio = SoundFile(input_file)
|
119
|
+
if not audio:
|
120
|
+
return None
|
121
|
+
# Processing pipeline
|
122
|
+
stripped = audio.strip(output_path=input_file.with_stem(input_file.stem + "_stripped"))
|
123
|
+
faster = stripped.speed(output_path=input_file.with_stem(input_file.stem + "_stripped_fast"), speed_factor=1.1)
|
124
|
+
processed = faster.convert(output_path=output_file)
|
125
|
+
if processed:
|
126
|
+
print(f"Original duration: {audio.duration:.2f}s")
|
127
|
+
print(f"Processed duration: {processed.duration:.2f}s")
|
128
|
+
print(f"Time saved: {audio.duration - processed.duration:.2f}s")
|
129
|
+
return processed
|
130
|
+
|
131
|
+
# Use the pipeline
|
132
|
+
result = prepare_for_transcription(
|
133
|
+
Path("long_meeting.wav"),
|
134
|
+
Path("ready_for_stt.mp3")
|
135
|
+
)
|
136
|
+
```
|
137
|
+
|
138
|
+
### Error Handling
|
139
|
+
|
140
|
+
```python
|
141
|
+
from speech_prep import SoundFile, SpeechPrepError, FFmpegError
|
142
|
+
from pathlib import Path
|
143
|
+
|
144
|
+
try:
|
145
|
+
audio = SoundFile(Path("audio.wav"))
|
146
|
+
if audio:
|
147
|
+
result = audio.strip(output_path=Path("audio_stripped.wav")).speed(output_path=Path("audio_fast.wav"), speed_factor=2.0)
|
148
|
+
print(f"Success: {result.path}")
|
149
|
+
else:
|
150
|
+
print("Failed to load audio file")
|
151
|
+
|
152
|
+
except FFmpegError as e:
|
153
|
+
print(f"FFmpeg error: {e}")
|
154
|
+
if e.stderr:
|
155
|
+
print(f"Details: {e.stderr}")
|
156
|
+
|
157
|
+
except SpeechPrepError as e:
|
158
|
+
print(f"Processing error: {e}")
|
159
|
+
```
|
160
|
+
|
161
|
+
### Custom Parameters
|
162
|
+
|
163
|
+
```python
|
164
|
+
from speech_prep import SoundFile
|
165
|
+
from pathlib import Path
|
166
|
+
|
167
|
+
# Custom silence detection settings
|
168
|
+
audio = SoundFile(
|
169
|
+
Path("audio.wav"),
|
170
|
+
noise_threshold_db=-40, # Stricter threshold: only audio quieter than -40 dB counts as silence
|
171
|
+
min_silence_duration=0.3 # Shorter minimum silence periods
|
172
|
+
)
|
173
|
+
|
174
|
+
# Custom output paths
|
175
|
+
cleaned = audio.strip(output_path=Path("custom_output.wav"))
|
176
|
+
|
177
|
+
# Custom conversion settings
|
178
|
+
mp3 = audio.convert(
|
179
|
+
output_path=Path("output.mp3"),
|
180
|
+
audio_bitrate="192k" # Custom bitrate
|
181
|
+
)
|
182
|
+
```
|
183
|
+
|
184
|
+
## API Reference
|
185
|
+
|
186
|
+
### SoundFile Class
|
187
|
+
|
188
|
+
#### Constructor
|
189
|
+
```python
|
190
|
+
SoundFile(file_path, noise_threshold_db=-30, min_silence_duration=0.5)
|
191
|
+
```
|
192
|
+
|
193
|
+
#### Methods
|
194
|
+
- **`strip(output_path, leading=True, trailing=True)`**: Remove silence
|
195
|
+
- **`speed(output_path, speed_factor)`**: Adjust playback speed
|
196
|
+
- **`convert(output_path, audio_bitrate=None)`**: Convert format
|
197
|
+
|
198
|
+
#### Properties
|
199
|
+
- **`path`**: Path to the audio file
|
200
|
+
- **`duration`**: Duration in seconds
|
201
|
+
- **`format`**: Audio format
|
202
|
+
- **`file_size`**: File size in bytes
|
203
|
+
- **`silence_periods`**: List of detected silence periods
|
204
|
+
- **`median_silence`**: Median silence duration
|
205
|
+
|
206
|
+
## Contributing
|
207
|
+
|
208
|
+
1. Fork the repository
|
209
|
+
2. Create your feature branch (`git checkout -b feature/amazing-feature`)
|
210
|
+
3. Commit your changes (`git commit -m 'Add amazing feature'`)
|
211
|
+
4. Push to the branch (`git push origin feature/amazing-feature`)
|
212
|
+
5. Open a Pull Request
|
213
|
+
|
214
|
+
## License
|
215
|
+
|
216
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
217
|
+
|
218
|
+
## Acknowledgments
|
219
|
+
|
220
|
+
- Built on top of the powerful [FFmpeg](https://ffmpeg.org/) multimedia framework
|
@@ -0,0 +1,10 @@
|
|
1
|
+
speech_prep/__init__.py,sha256=0Eu8vjSjvG3sOQbN9dsjtQkKcVBPcLthK4Eit0UrtAQ,839
|
2
|
+
speech_prep/core.py,sha256=pe4djUP1wQF4TJiaw1lg7xIvBzVHOMWP7dHgar3unt4,7567
|
3
|
+
speech_prep/detection.py,sha256=D5_WkTYoFDUIYA2u6cfWK6E_Rd5R6g1Lng0Hh1UGgBs,3495
|
4
|
+
speech_prep/exceptions.py,sha256=qZcIzM-IPltgJNtfmj5K4D8OJsL1zButmLnshas9m4M,1091
|
5
|
+
speech_prep/processing.py,sha256=421IqfAcRUqMtXBsiTypSp_4H0X3uh5UjQ8Af-nPaX0,5684
|
6
|
+
speech_prep/utils.py,sha256=_yjn1hoVVHfLc3nGAhD2n6bsevgweqNOt1rsDyahQnY,3585
|
7
|
+
speech_prep-0.1.3.dist-info/METADATA,sha256=8wP2R43DbY7JH9S8r1_DJlWKPsYMgi9CIIl8HpZMLsI,6616
|
8
|
+
speech_prep-0.1.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
9
|
+
speech_prep-0.1.3.dist-info/licenses/LICENSE,sha256=-M8NcLlGaRvQqThXHq5g0D9CUR05KMhdswCB9s_0Sds,1066
|
10
|
+
speech_prep-0.1.3.dist-info/RECORD,,
|
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2025 Dim Kharitonov
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|