speech-prep 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- speech_prep/__init__.py +2 -0
- speech_prep/core.py +7 -2
- speech_prep/formats.py +15 -0
- speech_prep/processing.py +30 -7
- speech_prep/utils.py +12 -3
- {speech_prep-0.1.3.dist-info → speech_prep-0.1.4.dist-info}/METADATA +30 -12
- speech_prep-0.1.4.dist-info/RECORD +11 -0
- speech_prep-0.1.3.dist-info/RECORD +0 -10
- {speech_prep-0.1.3.dist-info → speech_prep-0.1.4.dist-info}/WHEEL +0 -0
- {speech_prep-0.1.3.dist-info → speech_prep-0.1.4.dist-info}/licenses/LICENSE +0 -0
speech_prep/__init__.py
CHANGED
@@ -13,6 +13,7 @@ from .exceptions import (
|
|
13
13
|
SilenceDetectionError,
|
14
14
|
SpeechPrepError,
|
15
15
|
)
|
16
|
+
from .formats import AudioFormat
|
16
17
|
|
17
18
|
# Import version from hatch-vcs
|
18
19
|
try:
|
@@ -25,6 +26,7 @@ except ImportError:
|
|
25
26
|
|
26
27
|
__all__ = [
|
27
28
|
"SoundFile",
|
29
|
+
"AudioFormat",
|
28
30
|
"SpeechPrepError",
|
29
31
|
"FFmpegError",
|
30
32
|
"FileValidationError",
|
speech_prep/core.py
CHANGED
@@ -6,6 +6,7 @@ from typing import Optional
|
|
6
6
|
|
7
7
|
from .detection import calculate_median_silence, detect_silence
|
8
8
|
from .exceptions import SpeechPrepError
|
9
|
+
from .formats import AudioFormat
|
9
10
|
from .processing import adjust_speed, convert_format, strip_silence
|
10
11
|
from .utils import format_time, get_audio_properties
|
11
12
|
|
@@ -158,20 +159,24 @@ class SoundFile:
|
|
158
159
|
return None
|
159
160
|
|
160
161
|
def convert(
|
161
|
-
self,
|
162
|
+
self,
|
163
|
+
output_path: Path,
|
164
|
+
target_format: AudioFormat,
|
165
|
+
audio_bitrate: Optional[str] = None,
|
162
166
|
) -> Optional["SoundFile"]:
|
163
167
|
"""
|
164
168
|
Convert the audio file to a different format.
|
165
169
|
|
166
170
|
Args:
|
167
171
|
output_path: Path to save the converted file
|
172
|
+
target_format: Target audio format
|
168
173
|
audio_bitrate: Optional bitrate for the output file (e.g., '192k', '320k')
|
169
174
|
|
170
175
|
Returns:
|
171
176
|
A new SoundFile instance for the converted file, or None if operation failed
|
172
177
|
"""
|
173
178
|
try:
|
174
|
-
convert_format(self.path, output_path, audio_bitrate)
|
179
|
+
convert_format(self.path, output_path, target_format, audio_bitrate)
|
175
180
|
return SoundFile(
|
176
181
|
output_path, self.noise_threshold_db, self.min_silence_duration
|
177
182
|
)
|
speech_prep/formats.py
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
"""Enums for audio file formats."""
|
2
|
+
|
3
|
+
from enum import Enum
|
4
|
+
|
5
|
+
|
6
|
+
class AudioFormat(Enum):
|
7
|
+
"""Enum representing supported audio formats."""
|
8
|
+
|
9
|
+
MP3 = "mp3"
|
10
|
+
WAV = "wav"
|
11
|
+
FLAC = "flac"
|
12
|
+
AAC = "aac"
|
13
|
+
OGG = "ogg"
|
14
|
+
M4A = "m4a"
|
15
|
+
UNKNOWN = "unknown"
|
speech_prep/processing.py
CHANGED
@@ -5,6 +5,7 @@ import subprocess
|
|
5
5
|
from typing import Optional
|
6
6
|
|
7
7
|
from .exceptions import FFmpegError
|
8
|
+
from .formats import AudioFormat
|
8
9
|
|
9
10
|
|
10
11
|
def strip_silence(
|
@@ -67,7 +68,10 @@ def strip_silence(
|
|
67
68
|
|
68
69
|
|
69
70
|
def convert_format(
|
70
|
-
input_path: Path,
|
71
|
+
input_path: Path,
|
72
|
+
output_path: Path,
|
73
|
+
target_format: AudioFormat,
|
74
|
+
audio_bitrate: Optional[str] = None,
|
71
75
|
) -> None:
|
72
76
|
"""
|
73
77
|
Convert the audio file to a different format.
|
@@ -75,6 +79,7 @@ def convert_format(
|
|
75
79
|
Args:
|
76
80
|
input_path: Path to the input audio file
|
77
81
|
output_path: Path to save the converted file
|
82
|
+
target_format: Target audio format
|
78
83
|
audio_bitrate: Optional bitrate for the output file (e.g., '192k', '320k')
|
79
84
|
|
80
85
|
Raises:
|
@@ -90,9 +95,21 @@ def convert_format(
|
|
90
95
|
# Add output file
|
91
96
|
cmd.append(str(output_path))
|
92
97
|
|
93
|
-
|
94
|
-
|
95
|
-
|
98
|
+
# Determine the input format from the file extension
|
99
|
+
input_format = AudioFormat.UNKNOWN
|
100
|
+
try:
|
101
|
+
ext = input_path.suffix.lower().lstrip(".")
|
102
|
+
input_format = AudioFormat(ext)
|
103
|
+
except ValueError:
|
104
|
+
pass # Keep as UNKNOWN if not found
|
105
|
+
|
106
|
+
# Use the provided target_format
|
107
|
+
output_format = target_format
|
108
|
+
|
109
|
+
print(
|
110
|
+
f"Converting {input_path.name} from "
|
111
|
+
f"{input_format.value} to {output_format.value}"
|
112
|
+
)
|
96
113
|
|
97
114
|
_run_ffmpeg_command(cmd, "converting format")
|
98
115
|
|
@@ -136,10 +153,16 @@ def adjust_speed(input_path: Path, output_path: Path, speed_factor: float) -> No
|
|
136
153
|
filter_str = ",".join(atempo_filters) if atempo_filters else "atempo=1.0"
|
137
154
|
|
138
155
|
# Determine appropriate codec based on output format
|
139
|
-
output_format =
|
140
|
-
|
156
|
+
output_format = AudioFormat.UNKNOWN
|
157
|
+
try:
|
158
|
+
ext = output_path.suffix.lower().lstrip(".")
|
159
|
+
output_format = AudioFormat(ext)
|
160
|
+
except ValueError:
|
161
|
+
pass # Keep as UNKNOWN
|
162
|
+
|
163
|
+
if output_format == AudioFormat.MP3:
|
141
164
|
codec = "libmp3lame"
|
142
|
-
elif output_format ==
|
165
|
+
elif output_format == AudioFormat.WAV:
|
143
166
|
codec = "pcm_s16le"
|
144
167
|
else:
|
145
168
|
codec = "libmp3lame" # Default to mp3 codec
|
speech_prep/utils.py
CHANGED
@@ -5,6 +5,7 @@ from pathlib import Path
|
|
5
5
|
import subprocess
|
6
6
|
|
7
7
|
from .exceptions import AudioPropertiesError, FileValidationError
|
8
|
+
from .formats import AudioFormat
|
8
9
|
|
9
10
|
|
10
11
|
def validate_file(file_path: Path) -> bool:
|
@@ -29,7 +30,7 @@ def validate_file(file_path: Path) -> bool:
|
|
29
30
|
return True
|
30
31
|
|
31
32
|
|
32
|
-
def get_audio_properties(file_path: Path) -> tuple[float, int, str]:
|
33
|
+
def get_audio_properties(file_path: Path) -> tuple[float, int, AudioFormat]:
|
33
34
|
"""
|
34
35
|
Extract audio properties (duration, file size, format) using ffprobe.
|
35
36
|
|
@@ -37,7 +38,8 @@ def get_audio_properties(file_path: Path) -> tuple[float, int, str]:
|
|
37
38
|
file_path: Path to the audio file
|
38
39
|
|
39
40
|
Returns:
|
40
|
-
Tuple of (duration, file_size, audio_format)
|
41
|
+
Tuple of (duration, file_size, audio_format) where audio_format
|
42
|
+
is an AudioFormat enum representing the detected audio format
|
41
43
|
|
42
44
|
Raises:
|
43
45
|
AudioPropertiesError: If properties cannot be extracted
|
@@ -71,10 +73,17 @@ def get_audio_properties(file_path: Path) -> tuple[float, int, str]:
|
|
71
73
|
probe_data = json.loads(probe_result.stdout)["format"]
|
72
74
|
duration = float(probe_data["duration"])
|
73
75
|
file_size = int(probe_data["size"])
|
74
|
-
|
76
|
+
format_str = probe_data["format_name"].split(",")[
|
75
77
|
0
|
76
78
|
] # Get the first format name
|
77
79
|
|
80
|
+
# Convert format string to enum
|
81
|
+
try:
|
82
|
+
audio_format = AudioFormat(format_str.lower())
|
83
|
+
except ValueError:
|
84
|
+
# If not a direct match, use UNKNOWN
|
85
|
+
audio_format = AudioFormat.UNKNOWN
|
86
|
+
|
78
87
|
if duration <= 0 or file_size <= 0:
|
79
88
|
raise AudioPropertiesError(
|
80
89
|
f"Invalid duration or file size for {file_path}. "
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: speech-prep
|
3
|
-
Version: 0.1.3
|
3
|
+
Version: 0.1.4
|
4
4
|
Summary: Audio preprocessing toolkit for speech-to-text applications using ffmpeg
|
5
5
|
Project-URL: Homepage, https://github.com/dimdasci/speech-prep
|
6
6
|
Project-URL: Repository, https://github.com/dimdasci/speech-prep
|
@@ -60,21 +60,19 @@ uv sync # or pip install -e .
|
|
60
60
|
## Quick Start
|
61
61
|
|
62
62
|
```python
|
63
|
-
from speech_prep import SoundFile
|
63
|
+
from speech_prep import SoundFile, AudioFormat
|
64
64
|
from pathlib import Path
|
65
65
|
|
66
66
|
# Load an audio file
|
67
67
|
audio = SoundFile(Path("recording.wav"))
|
68
68
|
|
69
69
|
if audio:
|
70
|
-
print(
|
71
|
-
print(f"Format: {audio.format}")
|
72
|
-
print(f"Silence periods detected: {len(audio.silence_periods)}")
|
70
|
+
print(audio) # Shows duration, format, file size, and silence periods
|
73
71
|
|
74
72
|
# Clean up the audio for speech-to-text
|
75
73
|
cleaned = audio.strip(output_path=Path("recording_stripped.wav"))
|
76
74
|
faster = cleaned.speed(output_path=Path("recording_stripped_fast.wav"), speed_factor=1.2)
|
77
|
-
final = faster.convert(output_path=Path("clean.mp3"))
|
75
|
+
final = faster.convert(output_path=Path("clean.mp3"), target_format=AudioFormat.MP3)
|
78
76
|
|
79
77
|
print(f"Processed file saved: {final.path}")
|
80
78
|
```
|
@@ -84,7 +82,7 @@ if audio:
|
|
84
82
|
### Basic Operations
|
85
83
|
|
86
84
|
```python
|
87
|
-
from speech_prep import SoundFile
|
85
|
+
from speech_prep import SoundFile, AudioFormat
|
88
86
|
from pathlib import Path
|
89
87
|
|
90
88
|
# Load audio file
|
@@ -103,17 +101,18 @@ cleaned = audio.strip(output_path=Path("interview_leading.wav"), trailing=False)
|
|
103
101
|
faster = audio.speed(output_path=Path("interview_fast.wav"), speed_factor=1.5)
|
104
102
|
|
105
103
|
# Convert format
|
106
|
-
mp3_file = audio.convert(output_path=Path("output.mp3"))
|
104
|
+
mp3_file = audio.convert(output_path=Path("output.mp3"), target_format=AudioFormat.MP3)
|
107
105
|
```
|
108
106
|
|
109
107
|
### Processing Pipeline
|
110
108
|
|
111
109
|
```python
|
112
|
-
from speech_prep import SoundFile
|
110
|
+
from speech_prep import AudioFormat, SoundFile
|
113
111
|
from pathlib import Path
|
114
112
|
|
115
113
|
def prepare_for_transcription(input_file: Path, output_file: Path):
|
116
114
|
"""Prepare audio file for speech-to-text processing."""
|
115
|
+
|
117
116
|
# Load the original file
|
118
117
|
audio = SoundFile(input_file)
|
119
118
|
if not audio:
|
@@ -121,7 +120,7 @@ def prepare_for_transcription(input_file: Path, output_file: Path):
|
|
121
120
|
# Processing pipeline
|
122
121
|
stripped = audio.strip(output_path=input_file.with_stem(input_file.stem + "_stripped"))
|
123
122
|
faster = stripped.speed(output_path=input_file.with_stem(input_file.stem + "_stripped_fast"), speed_factor=1.1)
|
124
|
-
processed = faster.convert(output_path=output_file)
|
123
|
+
processed = faster.convert(output_path=output_file, target_format=AudioFormat.MP3)
|
125
124
|
if processed:
|
126
125
|
print(f"Original duration: {audio.duration:.2f}s")
|
127
126
|
print(f"Processed duration: {processed.duration:.2f}s")
|
@@ -175,8 +174,10 @@ audio = SoundFile(
|
|
175
174
|
cleaned = audio.strip(output_path=Path("custom_output.wav"))
|
176
175
|
|
177
176
|
# Custom conversion settings
|
177
|
+
from speech_prep import AudioFormat
|
178
178
|
mp3 = audio.convert(
|
179
179
|
output_path=Path("output.mp3"),
|
180
|
+
target_format=AudioFormat.MP3,
|
180
181
|
audio_bitrate="192k" # Custom bitrate
|
181
182
|
)
|
182
183
|
```
|
@@ -193,16 +194,33 @@ SoundFile(file_path, noise_threshold_db=-30, min_silence_duration=0.5)
|
|
193
194
|
#### Methods
|
194
195
|
- **`strip(output_path, leading=True, trailing=True)`**: Remove silence
|
195
196
|
- **`speed(output_path, speed_factor)`**: Adjust playback speed
|
196
|
-
- **`convert(output_path, audio_bitrate=None)`**: Convert format
|
197
|
+
- **`convert(output_path, target_format, audio_bitrate=None)`**: Convert format
|
197
198
|
|
198
199
|
#### Properties
|
199
200
|
- **`path`**: Path to the audio file
|
200
201
|
- **`duration`**: Duration in seconds
|
201
|
-
- **`format`**: Audio format
|
202
|
+
- **`format`**: Audio format (AudioFormat enum)
|
202
203
|
- **`file_size`**: File size in bytes
|
203
204
|
- **`silence_periods`**: List of detected silence periods
|
204
205
|
- **`median_silence`**: Median silence duration
|
205
206
|
|
207
|
+
### AudioFormat Enum
|
208
|
+
|
209
|
+
The `AudioFormat` enum represents supported audio formats:
|
210
|
+
|
211
|
+
```python
|
212
|
+
from speech_prep import AudioFormat
|
213
|
+
|
214
|
+
# Available formats
|
215
|
+
AudioFormat.MP3 # MP3 format
|
216
|
+
AudioFormat.WAV # WAV format
|
217
|
+
AudioFormat.FLAC # FLAC format
|
218
|
+
AudioFormat.AAC # AAC format
|
219
|
+
AudioFormat.OGG # OGG format
|
220
|
+
AudioFormat.M4A # M4A format
|
221
|
+
AudioFormat.UNKNOWN # Unknown/unsupported format
|
222
|
+
```
|
223
|
+
|
206
224
|
## Contributing
|
207
225
|
|
208
226
|
1. Fork the repository
|
@@ -0,0 +1,11 @@
|
|
1
|
+
speech_prep/__init__.py,sha256=BWVsOFBywQYAiykMB3XJX6JQww155M6R8NLxNCn3Z10,891
|
2
|
+
speech_prep/core.py,sha256=GCxmKlf_ovEiRRzM8vr3ucPjb1pWHrL-MzkfWAKtzgg,7715
|
3
|
+
speech_prep/detection.py,sha256=D5_WkTYoFDUIYA2u6cfWK6E_Rd5R6g1Lng0Hh1UGgBs,3495
|
4
|
+
speech_prep/exceptions.py,sha256=qZcIzM-IPltgJNtfmj5K4D8OJsL1zButmLnshas9m4M,1091
|
5
|
+
speech_prep/formats.py,sha256=fYeOMpMOrl3LX62L32xoAo2qYgxl43UYbywX_4j2nbw,262
|
6
|
+
speech_prep/processing.py,sha256=wFZEVt2nB4PSiRQu3thVBQnODe8DSdXVogo9b09L9q4,6231
|
7
|
+
speech_prep/utils.py,sha256=vz5OWIHvICTa2sz3__rDFxLeDXi4j8B5hvT5vdFblMM,3949
|
8
|
+
speech_prep-0.1.4.dist-info/METADATA,sha256=f1UUmZgGnH1TyFreBxi5XMGlSSMwQve9hzz4rAi13mY,7161
|
9
|
+
speech_prep-0.1.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
10
|
+
speech_prep-0.1.4.dist-info/licenses/LICENSE,sha256=-M8NcLlGaRvQqThXHq5g0D9CUR05KMhdswCB9s_0Sds,1066
|
11
|
+
speech_prep-0.1.4.dist-info/RECORD,,
|
@@ -1,10 +0,0 @@
|
|
1
|
-
speech_prep/__init__.py,sha256=0Eu8vjSjvG3sOQbN9dsjtQkKcVBPcLthK4Eit0UrtAQ,839
|
2
|
-
speech_prep/core.py,sha256=pe4djUP1wQF4TJiaw1lg7xIvBzVHOMWP7dHgar3unt4,7567
|
3
|
-
speech_prep/detection.py,sha256=D5_WkTYoFDUIYA2u6cfWK6E_Rd5R6g1Lng0Hh1UGgBs,3495
|
4
|
-
speech_prep/exceptions.py,sha256=qZcIzM-IPltgJNtfmj5K4D8OJsL1zButmLnshas9m4M,1091
|
5
|
-
speech_prep/processing.py,sha256=421IqfAcRUqMtXBsiTypSp_4H0X3uh5UjQ8Af-nPaX0,5684
|
6
|
-
speech_prep/utils.py,sha256=_yjn1hoVVHfLc3nGAhD2n6bsevgweqNOt1rsDyahQnY,3585
|
7
|
-
speech_prep-0.1.3.dist-info/METADATA,sha256=8wP2R43DbY7JH9S8r1_DJlWKPsYMgi9CIIl8HpZMLsI,6616
|
8
|
-
speech_prep-0.1.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
9
|
-
speech_prep-0.1.3.dist-info/licenses/LICENSE,sha256=-M8NcLlGaRvQqThXHq5g0D9CUR05KMhdswCB9s_0Sds,1066
|
10
|
-
speech_prep-0.1.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|