speech-prep 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,34 @@
1
+ """
2
+ Speech Prep - Audio preprocessing toolkit for speech-to-text applications.
3
+
4
+ This package provides tools to prepare audio files for speech-to-text processing,
5
+ including silence detection and removal, speed adjustment, and format conversion.
6
+ """
7
+
8
+ from .core import SoundFile
9
+ from .exceptions import (
10
+ AudioPropertiesError,
11
+ FFmpegError,
12
+ FileValidationError,
13
+ SilenceDetectionError,
14
+ SpeechPrepError,
15
+ )
16
+
17
# Look up the version recorded in the installed distribution's metadata.
# The lookup stays inside the ``try`` so that an ImportError raised by either
# the import or the query falls back to the placeholder below.
try:
    from importlib.metadata import version as get_metadata_version

    __version__ = get_metadata_version("speech-prep")
except ImportError:
    # Placeholder used when no version could be resolved.
    __version__ = "0.0.0"

# Names re-exported as the package's public API.
__all__ = [
    "SoundFile",
    "SpeechPrepError",
    "FFmpegError",
    "FileValidationError",
    "AudioPropertiesError",
    "SilenceDetectionError",
    "__version__",
]
speech_prep/core.py ADDED
@@ -0,0 +1,203 @@
1
+ """Core SoundFile class for audio file manipulation."""
2
+
3
+ import logging
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ from .detection import calculate_median_silence, detect_silence
8
+ from .exceptions import SpeechPrepError
9
+ from .processing import adjust_speed, convert_format, strip_silence
10
+ from .utils import format_time, get_audio_properties
11
+
12
+ # Configure the "speech_prep" package logger: when logger.hasHandlers() reports none, attach a StreamHandler that formats records as "[LEVEL] name: message"; the level used is WARNING.
13
+ logger = logging.getLogger("speech_prep")
14
+ if not logger.hasHandlers():
15
+ handler = logging.StreamHandler()
16
+ formatter = logging.Formatter("[%(levelname)s] %(name)s: %(message)s")
17
+ handler.setFormatter(formatter)
18
+ logger.addHandler(handler)
19
+ logger.setLevel(logging.WARNING)
20
+
21
+
22
class SoundFile:
    """
    Represents an audio file with silence detection and processing capabilities.

    Instances are populated once, in ``__init__``, with:

    - ``path``: location of the audio file
    - ``duration``, ``file_size``, ``format``: values returned by
      ``get_audio_properties``
    - ``silence_periods``: list of ``(start, end, duration)`` tuples returned by
      ``detect_silence`` (empty when detection failed or found nothing)
    - ``median_silence``: median duration of the detected silence periods
    - ``noise_threshold_db``, ``min_silence_duration``: the detection settings,
      kept so derived files can be analysed with matching parameters

    Equality and hashing are both based on ``path`` only.
    """

    def __init__(
        self,
        path: Path,
        noise_threshold_db: int = -30,
        min_silence_duration: float = 0.5,
    ):
        """
        Initialize a SoundFile object.

        Args:
            path: Path to the audio file
            noise_threshold_db: Threshold (in dB) for silence detection
            min_silence_duration: Minimum duration (in seconds) to consider as silence

        Raises:
            FileNotFoundError: If ``path`` does not exist
            SpeechPrepError: If the audio metadata cannot be extracted
        """
        self.path = Path(path)
        if not self.path.exists():
            raise FileNotFoundError(f"Audio file not found: {self.path}")

        try:
            self.duration, self.file_size, self.format = get_audio_properties(self.path)
        except Exception as e:
            raise SpeechPrepError(f"Failed to extract metadata: {e}") from e

        try:
            self.silence_periods = detect_silence(
                self.path,
                noise_threshold_db=noise_threshold_db,
                min_silence_duration=min_silence_duration,
            )
        except Exception as e:
            # Silence detection is best-effort: the file stays usable without it,
            # but the failure is reported instead of being dropped silently.
            logger.warning("Silence detection failed for %s: %s", self.path, e)
            self.silence_periods = []

        if self.silence_periods:
            self.median_silence = calculate_median_silence(self.silence_periods)
        else:
            self.median_silence = 0.0

        self.noise_threshold_db = noise_threshold_db
        self.min_silence_duration = min_silence_duration

    def __eq__(self, other: object) -> bool:
        """Two files are equal if they reference the same path."""
        if not isinstance(other, SoundFile):
            return False
        return self.path == other.path

    def __hash__(self) -> int:
        """Hash on the path so that equal instances hash equally."""
        # Defining __eq__ alone would set __hash__ to None and make instances
        # unusable as dict keys or set members.
        return hash(self.path)

    def _describe_period(self, index: int) -> str:
        """Format the silence period at ``index`` as one line of the summary."""
        start, end, duration = self.silence_periods[index]
        return (
            f" {index + 1}: {start:.2f}s - {end:.2f}s ({duration:.2f}s) "
            f"[{format_time(start)} - {format_time(end)}]"
        )

    def __str__(self) -> str:
        """
        Return a string representation of the SoundFile object.

        Displays a summary of audio properties and the first/last three silence periods.
        """
        lines = [
            f"SoundFile: {self.path}",
            f" Duration: {self.duration:.2f} seconds ({format_time(self.duration)})",
            f" Format: {self.format}",
            f" File size: {self.file_size / 1024 / 1024:.2f} MB",
            f" Silence periods: {len(self.silence_periods)} detected",
            f" Median silence: {self.median_silence:.2f} seconds",
            " Silence periods:",
        ]

        if not self.silence_periods:
            lines.append(" None detected")
            return "\n".join(lines)

        total_periods = len(self.silence_periods)
        to_show = min(3, total_periods)

        # Head of the list.
        lines.extend(self._describe_period(i) for i in range(to_show))

        # Ellipsis only when periods are actually omitted between head and tail.
        if total_periods > 2 * to_show:
            lines.append(f" ... {total_periods - 2 * to_show} more periods ...")

        # Tail of the list; starting at max(...) avoids repeating head entries
        # when there are fewer than 2 * to_show periods.
        if total_periods > to_show:
            tail_start = max(to_show, total_periods - to_show)
            lines.extend(
                self._describe_period(i) for i in range(tail_start, total_periods)
            )

        return "\n".join(lines)

    def strip(
        self, output_path: Path, leading: bool = True, trailing: bool = True
    ) -> Optional["SoundFile"]:
        """
        Create a new audio file with leading and/or trailing silence removed.

        Args:
            output_path: Path to save the new file.
            leading: Whether to remove leading silence
            trailing: Whether to remove trailing silence
        Returns:
            A new SoundFile instance for the created file, this same instance
            (without writing ``output_path``) when no silence periods were
            detected, or None if operation failed
        """
        if not self.silence_periods:
            logger.info(
                "No silence periods detected in %s, nothing to strip.", self.path
            )
            return self
        try:
            strip_silence(
                self.path,
                output_path,
                self.silence_periods,
                self.duration,
                leading,
                trailing,
            )
            return SoundFile(
                output_path, self.noise_threshold_db, self.min_silence_duration
            )
        except SpeechPrepError as e:
            logger.error("Error during strip: %s", e)
            return None

    def convert(
        self, output_path: Path, audio_bitrate: Optional[str] = None
    ) -> Optional["SoundFile"]:
        """
        Convert the audio file to a different format.

        Args:
            output_path: Path to save the converted file
            audio_bitrate: Optional bitrate for the output file (e.g., '192k', '320k')

        Returns:
            A new SoundFile instance for the converted file, or None if operation failed
        """
        try:
            convert_format(self.path, output_path, audio_bitrate)
            return SoundFile(
                output_path, self.noise_threshold_db, self.min_silence_duration
            )
        except SpeechPrepError as e:
            logger.error("Error during convert: %s", e)
            return None

    def speed(self, output_path: Path, speed_factor: float) -> Optional["SoundFile"]:
        """
        Create a new audio file with adjusted playback speed.

        Args:
            output_path: Path to save the new file.
            speed_factor: Speed multiplier (e.g., 2.0 for 2x speed, 0.5 for half speed)

        Returns:
            A new SoundFile instance for the created file, or None if operation failed
        """
        try:
            # adjust_speed rejects non-positive factors before the division below.
            adjust_speed(self.path, output_path, speed_factor)
            # Pauses scale with playback speed, so the minimum silence duration
            # used to analyse the new file is scaled by the same factor.
            adjusted_threshold = self.min_silence_duration / speed_factor
            logger.info(
                "Silence threshold: %.2fs for sped-up file", adjusted_threshold
            )
            return SoundFile(output_path, self.noise_threshold_db, adjusted_threshold)
        except SpeechPrepError as e:
            logger.error("Error during speed: %s", e)
            return None
@@ -0,0 +1,116 @@
1
+ """Silence detection functionality for audio files."""
2
+
3
+ from pathlib import Path
4
+ import re
5
+ import subprocess
6
+ from typing import Optional
7
+
8
+ from .exceptions import SilenceDetectionError
9
+
10
+
11
def detect_silence(
    file_path: Path, noise_threshold_db: int, min_silence_duration: float
) -> list[tuple[float, float, float]]:
    """
    Detect silence periods using ffmpeg silencedetect filter.

    Args:
        file_path: Path to the audio file
        noise_threshold_db: Threshold (in dB) for silence detection
        min_silence_duration: Minimum duration (in seconds) to consider as silence

    Returns:
        List of silence periods as (start, end, duration) tuples

    Raises:
        SilenceDetectionError: If silence detection fails
    """
    silence_cmd = [
        "ffmpeg",
        "-i",
        str(file_path),
        "-af",
        f"silencedetect=noise={noise_threshold_db}dB:d={min_silence_duration}",
        # Decode and filter only: the null muxer discards the output.
        "-f",
        "null",
        "-",
    ]

    try:
        silence_proc = subprocess.run(
            silence_cmd,
            capture_output=True,
            text=True,
            # The captured output may contain bytes that are not valid in the
            # default encoding; replace them rather than letting a
            # UnicodeDecodeError escape.
            errors="replace",
            check=True,
        )
    except subprocess.CalledProcessError as e:
        raise SilenceDetectionError(
            f"Error detecting silence in file {file_path}: {e.stderr}"
        ) from e
    except FileNotFoundError as e:
        raise SilenceDetectionError(
            "ffmpeg not found. Please ensure ffmpeg is installed and accessible."
        ) from e

    # The silencedetect lines are read from the captured stderr stream.
    return parse_silence_output(silence_proc.stderr)
56
+
57
+
58
def parse_silence_output(silence_output: str) -> list[tuple[float, float, float]]:
    """
    Parse the ffmpeg silence detection output to extract silence periods.

    Each ``silence_start`` line is paired with the next ``silence_end`` line.
    Timestamps may carry a sign or an exponent (e.g. ``-0.015`` or ``2.5e-05``);
    a negative start is clamped to ``0.0`` so that silence at the very beginning
    of the file is kept instead of being dropped.

    Args:
        silence_output: stderr output from ffmpeg silence detection

    Returns:
        List of silence periods as (start, end, duration) tuples
    """
    # Decimal number with optional sign, fraction and exponent.
    number = r"[-+]?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?"
    start_pattern = re.compile(rf"silence_start: ({number})")
    end_pattern = re.compile(
        rf"silence_end: ({number}) \| silence_duration: ({number})"
    )

    silence_periods = []
    start_time: Optional[float] = None

    for line in silence_output.splitlines():
        if "silence_start" in line:
            match = start_pattern.search(line)
            if match:
                # A start before the stream origin still means "from the beginning".
                start_time = max(0.0, float(match.group(1)))
        elif "silence_end" in line and start_time is not None:
            match = end_pattern.search(line)
            if match:
                end_time = float(match.group(1))
                silence_duration = float(match.group(2))
                silence_periods.append((start_time, end_time, silence_duration))
                # Wait for the next silence_start before accepting another end.
                start_time = None

    return silence_periods
90
+
91
+
92
def calculate_median_silence(
    silence_periods: list[tuple[float, float, float]],
) -> float:
    """
    Calculate the median duration of silence periods.

    Args:
        silence_periods: List of silence periods as (start, end, duration) tuples

    Returns:
        Median silence duration in seconds, or 0.0 when the list is empty
    """
    from statistics import median

    if not silence_periods:
        # statistics.median raises on empty input; keep the 0.0 contract.
        return 0.0

    # median() sorts its input and averages the two middle values when the
    # count is even.
    return median(duration for _, _, duration in silence_periods)
@@ -0,0 +1,49 @@
1
+ """Custom exceptions for the speech-prep package."""
2
+
3
+ from typing import Optional
4
+
5
+
6
class SpeechPrepError(Exception):
    """Root of the speech-prep exception hierarchy.

    Every other exception defined in this module derives from this class.
    """
10
+
11
+
12
class FFmpegError(SpeechPrepError):
    """Error raised for a failed or unavailable ffmpeg invocation.

    The optional ``stderr`` and ``returncode`` attributes carry the process
    details when they are known; both default to ``None``.
    """

    def __init__(
        self,
        message: str,
        stderr: Optional[str] = None,
        returncode: Optional[int] = None,
    ):
        """
        Store the message together with optional process details.

        Args:
            message: Error message
            stderr: Standard error output from ffmpeg
            returncode: Return code from ffmpeg process
        """
        super().__init__(message)
        self.stderr = stderr
        self.returncode = returncode
32
+
33
+
34
class FileValidationError(SpeechPrepError):
    """Signals that a path failed file validation."""
38
+
39
+
40
class AudioPropertiesError(SpeechPrepError):
    """Signals that the audio properties of a file could not be extracted."""
44
+
45
+
46
class SilenceDetectionError(SpeechPrepError):
    """Signals that silence detection could not be completed."""
@@ -0,0 +1,185 @@
1
+ """Audio processing operations for speech preparation."""
2
+
3
+ from pathlib import Path
4
+ import subprocess
5
+ from typing import Optional
6
+
7
+ from .exceptions import FFmpegError
8
+
9
+
10
def strip_silence(
    input_path: Path,
    output_path: Path,
    silence_periods: list[tuple[float, float, float]],
    total_duration: float,
    leading: bool = True,
    trailing: bool = True,
) -> None:
    """
    Create a new audio file with leading and/or trailing silence removed.

    The first silence period counts as leading silence when it starts within
    0.1s of the beginning of the file; the last one counts as trailing silence
    when it ends within 0.1s of ``total_duration``.

    Args:
        input_path: Path to the input audio file
        output_path: Path to save the new file
        silence_periods: List of silence periods as (start, end, duration) tuples
        total_duration: Total duration of the audio file
        leading: Whether to remove leading silence
        trailing: Whether to remove trailing silence

    Raises:
        FFmpegError: If there are no silence periods, if nothing would remain
            after stripping, or if the ffmpeg operation fails
    """
    if not silence_periods:
        raise FFmpegError("No silence periods detected, nothing to strip")

    # Detected timestamps are not exact, so both file edges are matched with
    # the same small buffer instead of exact float equality.
    edge_tolerance = 0.1

    start_time = 0.0
    end_time = total_duration

    first_start, first_end, _ = silence_periods[0]
    if leading and first_start <= edge_tolerance:
        # The first silence period begins at the start of the file.
        start_time = first_end

    if trailing:
        last_start, last_end, _ = silence_periods[-1]
        if abs(last_end - total_duration) < edge_tolerance:
            # The last silence period runs to the end of the file.
            end_time = last_start

    if start_time >= end_time:
        # E.g. the whole file is a single silence period: there is no audio
        # left to keep, so fail clearly instead of passing an empty range on.
        raise FFmpegError(
            f"Nothing left after stripping silence from {input_path} "
            f"(start {start_time:.2f}s, end {end_time:.2f}s)"
        )

    # Use ffmpeg to cut the file
    cmd = [
        "ffmpeg",
        "-y",  # Overwrite output file if it exists
        "-i",
        str(input_path),
        "-ss",
        str(start_time),  # Start time
        "-to",
        str(end_time),  # End time
        "-c",
        "copy",  # Copy streams without re-encoding
        str(output_path),
    ]

    print(f"Stripping silence: {start_time:.2f}s to {end_time:.2f}s")
    _run_ffmpeg_command(cmd, "stripping silence")
67
+
68
+
69
def convert_format(
    input_path: Path, output_path: Path, audio_bitrate: Optional[str] = None
) -> None:
    """
    Re-encode an audio file into the format implied by the output path.

    Args:
        input_path: Path to the input audio file
        output_path: Path to save the converted file
        audio_bitrate: Optional bitrate for the output file (e.g., '192k', '320k')

    Raises:
        FFmpegError: If the ffmpeg operation fails
    """
    # The bitrate flag is only passed when a bitrate was requested.
    bitrate_args = ["-b:a", audio_bitrate] if audio_bitrate else []
    cmd = ["ffmpeg", "-y", "-i", str(input_path), *bitrate_args, str(output_path)]

    # Extensions without the dot, used only for the progress message.
    source_ext = input_path.suffix.lower().lstrip(".")
    target_ext = output_path.suffix.lower().lstrip(".")
    print(f"Converting {input_path.name} from {source_ext} to {target_ext}")

    _run_ffmpeg_command(cmd, "converting format")
98
+
99
+
100
def adjust_speed(input_path: Path, output_path: Path, speed_factor: float) -> None:
    """
    Create a new audio file with adjusted playback speed.

    Args:
        input_path: Path to the input audio file
        output_path: Path to save the speed-adjusted file
        speed_factor: Speed multiplier (e.g., 2.0 for 2x speed, 0.5 for half speed)

    Raises:
        FFmpegError: If the ffmpeg operation fails or speed_factor is invalid
            (non-positive, NaN or infinite)
    """
    if speed_factor <= 0:
        raise FFmpegError("Speed factor must be positive")
    if not speed_factor < float("inf"):
        # NaN and +inf both fail this comparison. Infinity would keep the
        # halving loop below running forever, and NaN would silently turn
        # into a no-op filter.
        raise FFmpegError("Speed factor must be a finite number")

    # Use ffmpeg's atempo filter for speed adjustment
    # Note: atempo filter is limited to 0.5x to 2.0x range
    # For factors outside this range, we need to chain multiple atempo filters
    atempo_filters = []
    remaining_factor = speed_factor

    while remaining_factor > 2.0:
        atempo_filters.append("atempo=2.0")
        remaining_factor /= 2.0

    while remaining_factor < 0.5:
        atempo_filters.append("atempo=0.5")
        remaining_factor /= 0.5

    # Add the final adjustment unless it is very close to 1.0
    if abs(remaining_factor - 1.0) > 0.01:
        atempo_filters.append(f"atempo={remaining_factor}")

    # An empty chain means "no change"; ffmpeg still needs a filter value.
    filter_str = ",".join(atempo_filters) if atempo_filters else "atempo=1.0"

    # Explicit encoders for the formats handled here. For any other extension
    # no encoder is forced, so ffmpeg chooses the default one for the output
    # container instead of an MP3 encoder that may not fit it.
    codec_by_suffix = {".mp3": "libmp3lame", ".wav": "pcm_s16le"}
    codec = codec_by_suffix.get(output_path.suffix.lower())

    cmd = [
        "ffmpeg",
        "-y",
        "-i",
        str(input_path),
        "-filter:a",
        filter_str,
    ]
    if codec is not None:
        cmd.extend(["-c:a", codec])
    cmd.append(str(output_path))

    print(f"Adjusting speed by factor {speed_factor}x using filter: {filter_str}")
    _run_ffmpeg_command(cmd, "adjusting speed")
162
+
163
+
164
def _run_ffmpeg_command(cmd: list[str], operation_name: str) -> None:
    """
    Run an ffmpeg command with error handling.

    Args:
        cmd: List of command arguments
        operation_name: Description of the operation for error messages

    Raises:
        FFmpegError: If ffmpeg command fails or the executable is not found
    """
    try:
        subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            # Replace undecodable bytes in the captured output: with strict
            # decoding a UnicodeDecodeError would escape instead of FFmpegError.
            errors="replace",
            check=True,
        )
    except subprocess.CalledProcessError as e:
        raise FFmpegError(
            f"Error during {operation_name}", stderr=e.stderr, returncode=e.returncode
        ) from e
    except FileNotFoundError as e:
        # Raised by subprocess when the executable itself cannot be found.
        raise FFmpegError(
            f"ffmpeg not found during {operation_name}. "
            "Please ensure ffmpeg is installed and accessible."
        ) from e
speech_prep/utils.py ADDED
@@ -0,0 +1,130 @@
1
+ """Utility functions for audio file operations."""
2
+
3
+ import json
4
+ from pathlib import Path
5
+ import subprocess
6
+
7
+ from .exceptions import AudioPropertiesError, FileValidationError
8
+
9
+
10
def validate_file(file_path: Path) -> bool:
    """
    Check that a path exists and refers to a regular file.

    Args:
        file_path: Path to the file to validate

    Returns:
        True if file is valid

    Raises:
        FileValidationError: If file doesn't exist or is not a regular file
    """
    if file_path.exists():
        if file_path.is_file():
            return True
        # The path exists but is something other than a regular file.
        raise FileValidationError(f"Path {file_path} is not a regular file")

    raise FileValidationError(f"File {file_path} does not exist")
30
+
31
+
32
def get_audio_properties(file_path: Path) -> tuple[float, int, str]:
    """
    Query ffprobe for a file's duration, size and container format name.

    Args:
        file_path: Path to the audio file

    Returns:
        Tuple of (duration, file_size, audio_format)

    Raises:
        AudioPropertiesError: If properties cannot be extracted
    """
    probe_cmd = [
        "ffprobe",
        "-v",
        "error",
        "-show_entries",
        "format=duration,size,format_name",
        "-of",
        "json",
        str(file_path),
    ]

    try:
        completed = subprocess.run(
            probe_cmd, capture_output=True, text=True, check=True
        )
    except FileNotFoundError as e:
        raise AudioPropertiesError(
            "ffprobe not found. Please ensure ffmpeg is installed and accessible."
        ) from e
    except subprocess.CalledProcessError as e:
        raise AudioPropertiesError(f"Error probing file {file_path}: {e.stderr}") from e

    try:
        format_section = json.loads(completed.stdout)["format"]
        duration = float(format_section["duration"])
        file_size = int(format_section["size"])
        # format_name may be a comma-separated list; keep the first entry.
        audio_format = format_section["format_name"].split(",")[0]
    except (KeyError, ValueError, json.JSONDecodeError) as e:
        raise AudioPropertiesError(f"Error parsing probe data: {e}") from e

    # Reject values that parsed correctly but are not positive.
    if duration <= 0 or file_size <= 0:
        raise AudioPropertiesError(
            f"Invalid duration or file size for {file_path}. "
            f"Duration: {duration}, Size: {file_size}"
        )

    return duration, file_size, audio_format
88
+
89
+
90
def format_time(seconds: float) -> str:
    """
    Render a duration in seconds as an HH:MM:SS string.

    The fractional part is discarded by ``int()`` before formatting.

    Args:
        seconds: Time in seconds

    Returns:
        Formatted time string
    """
    whole_seconds = int(seconds)
    hours = whole_seconds // 3600
    minutes = whole_seconds % 3600 // 60
    secs = whole_seconds % 60
    return f"{hours:02}:{minutes:02}:{secs:02}"
103
+
104
+
105
def run_ffmpeg_command(
    cmd: list[str], operation_name: str
) -> subprocess.CompletedProcess[str]:
    """
    Execute a command and translate process failures into package errors.

    Args:
        cmd: List of command arguments
        operation_name: Description of the operation for error messages

    Returns:
        CompletedProcess result

    Raises:
        AudioPropertiesError: If ffmpeg command fails
    """
    try:
        return subprocess.run(cmd, capture_output=True, text=True, check=True)
    except FileNotFoundError as e:
        # The executable named in cmd could not be found.
        raise AudioPropertiesError(
            "ffmpeg not found. Please ensure ffmpeg is installed and accessible."
        ) from e
    except subprocess.CalledProcessError as e:
        # check=True raises this on a non-zero exit status.
        raise AudioPropertiesError(f"Error during {operation_name}: {e.stderr}") from e
@@ -0,0 +1,220 @@
1
+ Metadata-Version: 2.4
2
+ Name: speech-prep
3
+ Version: 0.1.3
4
+ Summary: Audio preprocessing toolkit for speech-to-text applications using ffmpeg
5
+ Project-URL: Homepage, https://github.com/dimdasci/speech-prep
6
+ Project-URL: Repository, https://github.com/dimdasci/speech-prep
7
+ Project-URL: Issues, https://github.com/dimdasci/speech-prep/issues
8
+ Author-email: Dim Kharitonov <dimds@fastmail.com>
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: audio,ffmpeg,preprocessing,silence-detection,speech-to-text
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Multimedia :: Sound/Audio
21
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
22
+ Requires-Python: >=3.9
23
+ Provides-Extra: dev
24
+ Description-Content-Type: text/markdown
25
+
26
+ # Speech Prep
27
+
28
+ Audio preprocessing toolkit for speech-to-text applications using FFmpeg.
29
+
30
+ ## Overview
31
+
32
+ Speech Prep is a Python package designed to prepare audio files for speech-to-text processing. It provides tools for silence detection and removal, speed adjustment, and format conversion - all essential steps for optimizing audio before transcription.
33
+
34
+ ## Features
35
+
36
+ - **Silence Detection**: Automatically detect silence periods in audio files
37
+ - **Silence Removal**: Remove leading/trailing silence to clean up recordings
38
+ - **Speed Adjustment**: Change playback speed while maintaining audio quality
39
+ - **Format Conversion**: Convert between different audio formats (MP3, WAV, FLAC, etc.)
40
+ - **Clean API**: Simple, intuitive interface with method chaining support
41
+ - **FFmpeg Integration**: Leverages the power and reliability of FFmpeg
42
+
43
+ ## Requirements
44
+
45
+ - Python 3.9+
46
+ - FFmpeg (must be installed and accessible via PATH)
47
+
48
+ ## Installation
49
+
50
+ ```bash
51
+ # Install from PyPI
52
+ pip install speech-prep
53
+
54
+ # Or install from source
55
+ git clone https://github.com/dimdasci/speech-prep.git
56
+ cd speech-prep
57
+ uv sync # or pip install -e .
58
+ ```
59
+
60
+ ## Quick Start
61
+
62
+ ```python
63
+ from speech_prep import SoundFile
64
+ from pathlib import Path
65
+
66
+ # Load an audio file
67
+ audio = SoundFile(Path("recording.wav"))
68
+
69
+ if audio:
70
+ print(f"Duration: {audio.duration:.2f} seconds")
71
+ print(f"Format: {audio.format}")
72
+ print(f"Silence periods detected: {len(audio.silence_periods)}")
73
+
74
+ # Clean up the audio for speech-to-text
75
+ cleaned = audio.strip(output_path=Path("recording_stripped.wav"))
76
+ faster = cleaned.speed(output_path=Path("recording_stripped_fast.wav"), speed_factor=1.2)
77
+ final = faster.convert(output_path=Path("clean.mp3"))
78
+
79
+ print(f"Processed file saved: {final.path}")
80
+ ```
81
+
82
+ ## Usage Examples
83
+
84
+ ### Basic Operations
85
+
86
+ ```python
87
+ from speech_prep import SoundFile
88
+ from pathlib import Path
89
+
90
+ # Load audio file
91
+ audio = SoundFile(Path("interview.wav"))
92
+
93
+ # View audio information
94
+ print(audio) # Shows duration, format, file size, and silence periods
95
+
96
+ # Remove silence from beginning and end
97
+ cleaned = audio.strip(output_path=Path("interview_stripped.wav"))
98
+
99
+ # Remove only leading silence
100
+ cleaned = audio.strip(output_path=Path("interview_leading.wav"), trailing=False)
101
+
102
+ # Speed up audio by 50%
103
+ faster = audio.speed(output_path=Path("interview_fast.wav"), speed_factor=1.5)
104
+
105
+ # Convert format
106
+ mp3_file = audio.convert(output_path=Path("output.mp3"))
107
+ ```
108
+
109
+ ### Processing Pipeline
110
+
111
+ ```python
112
+ from speech_prep import SoundFile
113
+ from pathlib import Path
114
+
115
+ def prepare_for_transcription(input_file: Path, output_file: Path):
116
+ """Prepare audio file for speech-to-text processing."""
117
+ # Load the original file
118
+ audio = SoundFile(input_file)
119
+ if not audio:
120
+ return None
121
+ # Processing pipeline
122
+ stripped = audio.strip(output_path=input_file.with_stem(input_file.stem + "_stripped"))
123
+ faster = stripped.speed(output_path=input_file.with_stem(input_file.stem + "_stripped_fast"), speed_factor=1.1)
124
+ processed = faster.convert(output_path=output_file)
125
+ if processed:
126
+ print(f"Original duration: {audio.duration:.2f}s")
127
+ print(f"Processed duration: {processed.duration:.2f}s")
128
+ print(f"Time saved: {audio.duration - processed.duration:.2f}s")
129
+ return processed
130
+
131
+ # Use the pipeline
132
+ result = prepare_for_transcription(
133
+ Path("long_meeting.wav"),
134
+ Path("ready_for_stt.mp3")
135
+ )
136
+ ```
137
+
138
+ ### Error Handling
139
+
140
+ ```python
141
+ from speech_prep import SoundFile, SpeechPrepError, FFmpegError
142
+ from pathlib import Path
143
+
144
+ try:
145
+ audio = SoundFile(Path("audio.wav"))
146
+ if audio:
147
+ result = audio.strip(output_path=Path("audio_stripped.wav")).speed(output_path=Path("audio_fast.wav"), speed_factor=2.0)
148
+ print(f"Success: {result.path}")
149
+ else:
150
+ print("Failed to load audio file")
151
+
152
+ except FFmpegError as e:
153
+ print(f"FFmpeg error: {e}")
154
+ if e.stderr:
155
+ print(f"Details: {e.stderr}")
156
+
157
+ except SpeechPrepError as e:
158
+ print(f"Processing error: {e}")
159
+ ```
160
+
161
+ ### Custom Parameters
162
+
163
+ ```python
164
+ from speech_prep import SoundFile
165
+ from pathlib import Path
166
+
167
+ # Custom silence detection settings
168
+ audio = SoundFile(
169
+ Path("audio.wav"),
170
+ noise_threshold_db=-40, # More sensitive silence detection
171
+ min_silence_duration=0.3 # Shorter minimum silence periods
172
+ )
173
+
174
+ # Custom output paths
175
+ cleaned = audio.strip(output_path=Path("custom_output.wav"))
176
+
177
+ # Custom conversion settings
178
+ mp3 = audio.convert(
179
+ output_path=Path("output.mp3"),
180
+ audio_bitrate="192k" # Custom bitrate
181
+ )
182
+ ```
183
+
184
+ ## API Reference
185
+
186
+ ### SoundFile Class
187
+
188
+ #### Constructor
189
+ ```python
190
+ SoundFile(path, noise_threshold_db=-30, min_silence_duration=0.5)
191
+ ```
192
+
193
+ #### Methods
194
+ - **`strip(output_path, leading=True, trailing=True)`**: Remove leading and/or trailing silence
195
+ - **`speed(output_path, speed_factor)`**: Adjust playback speed
196
+ - **`convert(output_path, audio_bitrate=None)`**: Convert format
197
+
198
+ #### Properties
199
+ - **`path`**: Path to the audio file
200
+ - **`duration`**: Duration in seconds
201
+ - **`format`**: Audio format
202
+ - **`file_size`**: File size in bytes
203
+ - **`silence_periods`**: List of detected silence periods
204
+ - **`median_silence`**: Median silence duration
205
+
206
+ ## Contributing
207
+
208
+ 1. Fork the repository
209
+ 2. Create your feature branch (`git checkout -b feature/amazing-feature`)
210
+ 3. Commit your changes (`git commit -m 'Add amazing feature'`)
211
+ 4. Push to the branch (`git push origin feature/amazing-feature`)
212
+ 5. Open a Pull Request
213
+
214
+ ## License
215
+
216
+ This project is licensed under the MIT License - see the LICENSE file for details.
217
+
218
+ ## Acknowledgments
219
+
220
+ - Built on top of the powerful [FFmpeg](https://ffmpeg.org/) multimedia framework
@@ -0,0 +1,10 @@
1
+ speech_prep/__init__.py,sha256=0Eu8vjSjvG3sOQbN9dsjtQkKcVBPcLthK4Eit0UrtAQ,839
2
+ speech_prep/core.py,sha256=pe4djUP1wQF4TJiaw1lg7xIvBzVHOMWP7dHgar3unt4,7567
3
+ speech_prep/detection.py,sha256=D5_WkTYoFDUIYA2u6cfWK6E_Rd5R6g1Lng0Hh1UGgBs,3495
4
+ speech_prep/exceptions.py,sha256=qZcIzM-IPltgJNtfmj5K4D8OJsL1zButmLnshas9m4M,1091
5
+ speech_prep/processing.py,sha256=421IqfAcRUqMtXBsiTypSp_4H0X3uh5UjQ8Af-nPaX0,5684
6
+ speech_prep/utils.py,sha256=_yjn1hoVVHfLc3nGAhD2n6bsevgweqNOt1rsDyahQnY,3585
7
+ speech_prep-0.1.3.dist-info/METADATA,sha256=8wP2R43DbY7JH9S8r1_DJlWKPsYMgi9CIIl8HpZMLsI,6616
8
+ speech_prep-0.1.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
9
+ speech_prep-0.1.3.dist-info/licenses/LICENSE,sha256=-M8NcLlGaRvQqThXHq5g0D9CUR05KMhdswCB9s_0Sds,1066
10
+ speech_prep-0.1.3.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.27.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Dim Kharitonov
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.