gptmed 0.5.5__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,33 @@
1
+ """
2
+ Data Preparation Module for GptMed
3
+
4
+ A comprehensive data preprocessing and cleaning framework for preparing various data types
5
+ (text, image, video, and audio) as a preprocessing baseline for the gptmed framework.
6
+
7
+ Usage:
8
+ >>> from gptmed.data_preparation import TextPreprocessor, ImagePreprocessor
9
+ >>>
10
+ >>> # Text preprocessing
11
+ >>> text_prep = TextPreprocessor()
12
+ >>> cleaned_text = text_prep.process('raw text data')
13
+ >>>
14
+ >>> # Image preprocessing
15
+ >>> image_prep = ImagePreprocessor()
16
+ >>> processed_image = image_prep.process('path/to/image.jpg')
17
+ """
18
+
19
+ from .base import BaseDataPreprocessor, PreprocessingConfig
20
+ from .text import TextPreprocessor
21
+ from .image import ImagePreprocessor
22
+ from .audio import AudioPreprocessor
23
+ from .video import VideoPreprocessor
24
+
25
+ __version__ = "0.1.0"
26
+ __all__ = [
27
+ "BaseDataPreprocessor",
28
+ "PreprocessingConfig",
29
+ "TextPreprocessor",
30
+ "ImagePreprocessor",
31
+ "AudioPreprocessor",
32
+ "VideoPreprocessor",
33
+ ]
@@ -0,0 +1,335 @@
1
+ """
2
+ Audio data preprocessing and cleaning module
3
+
4
+ Handles audio resampling, normalization, silence removal, and quality checks
5
+ """
6
+
7
+ import logging
8
+ from typing import Any, Dict, List, Optional, Tuple
9
+ from pathlib import Path
10
+ import json
11
+
12
+ from ..base import BaseDataPreprocessor, PreprocessingConfig
13
+
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class AudioPreprocessor(BaseDataPreprocessor):
    """
    Audio preprocessing with resampling, normalization, and validation

    Features:
    - Audio format validation (extension check + loadability)
    - Resampling to target sample rate
    - Peak amplitude normalization
    - Silence detection and removal
    - Duration validation
    - Metadata extraction
    - Stereo to mono conversion
    """

    def __init__(
        self,
        config: Optional[PreprocessingConfig] = None,
        target_sample_rate: int = 16000,
        mono: bool = True,
        normalize_amplitude: bool = True,
        remove_silence: bool = False,
        silence_threshold: float = 0.02,
        min_duration: float = 0.5,  # seconds
        max_duration: Optional[float] = None,  # seconds
        supported_formats: Optional[List[str]] = None,
    ):
        """
        Initialize audio preprocessor

        Args:
            config: PreprocessingConfig instance; a default audio config is
                built when None
            target_sample_rate: Target sample rate in Hz
            mono: Convert to mono if True
            normalize_amplitude: Peak-normalize amplitude into [-1, 1]
            remove_silence: Remove silent segments from audio
            silence_threshold: Threshold for silence detection (0-1)
            min_duration: Minimum audio duration in seconds
            max_duration: Maximum audio duration in seconds (None = no limit)
            supported_formats: List of supported audio file extensions
                (lowercase, without the leading dot)
        """
        if config is None:
            config = PreprocessingConfig(
                input_path="./data/raw/audio",
                output_path="./data/processed/audio",
                data_type="audio"
            )

        super().__init__(config)

        self.target_sample_rate = target_sample_rate
        self.mono = mono
        self.normalize_amplitude = normalize_amplitude
        self.remove_silence = remove_silence
        self.silence_threshold = silence_threshold
        self.min_duration = min_duration
        self.max_duration = max_duration
        self.supported_formats = supported_formats or ['wav', 'mp3', 'flac', 'ogg', 'm4a']

        self._import_audio_library()

    def _import_audio_library(self):
        """Attempt to import librosa and soundfile, recording availability flags."""
        self.librosa_available = False
        self.soundfile_available = False

        try:
            import librosa
            self.librosa = librosa
            self.librosa_available = True
        except ImportError:
            self.logger.warning(
                "librosa not available. Install with: pip install librosa"
            )

        try:
            import soundfile as sf
            self.soundfile = sf
            self.soundfile_available = True
        except ImportError:
            self.logger.warning(
                "soundfile not available. Install with: pip install soundfile"
            )

    def validate(self, data: Any) -> bool:
        """
        Validate audio input

        Args:
            data: Audio file path (str) or (audio_array, sample_rate) tuple

        Returns:
            True if valid, False otherwise
        """
        if not self.librosa_available:
            self.logger.error("librosa is required for audio processing")
            return False

        try:
            if isinstance(data, str):
                audio_path = Path(data)
                if not audio_path.exists():
                    self.logger.warning(f"Audio file not found: {data}")
                    return False

                # BUG FIX: match ".wav" etc., not a bare suffix — a file named
                # "foowav" must not pass the extension check.
                name = str(audio_path).lower()
                if not any(name.endswith(f".{fmt}") for fmt in self.supported_formats):
                    self.logger.warning(f"Unsupported format: {data}")
                    return False

                # Load to confirm the file is actually decodable
                y, sr = self.librosa.load(str(audio_path), sr=None, mono=False)
                duration = self.librosa.get_duration(y=y, sr=sr)

            elif isinstance(data, tuple) and len(data) == 2:  # (audio_array, sample_rate)
                y, sr = data
                duration = len(y) / sr
            else:
                self.logger.warning(f"Invalid audio type: {type(data)}")
                return False

            # Check duration constraints
            if duration < self.min_duration:
                self.logger.warning(f"Audio too short: {duration:.2f}s < {self.min_duration}s")
                return False

            if self.max_duration and duration > self.max_duration:
                self.logger.warning(f"Audio too long: {duration:.2f}s > {self.max_duration}s")
                return False

            return True

        except Exception as e:
            self.logger.error(f"Audio validation error: {str(e)}")
            return False

    def clean(self, audio_data: Any) -> Optional[Tuple]:
        """
        Clean audio data: mono conversion and optional silence removal.

        Args:
            audio_data: Audio file path or (audio_array, sample_rate) tuple

        Returns:
            Tuple of (cleaned_audio, sample_rate), or None on error
        """
        try:
            # Load audio if a path was given
            if isinstance(audio_data, str):
                y, sr = self.librosa.load(str(audio_data), sr=None, mono=False)
            else:
                y, sr = audio_data

            # Convert to mono (averaging channels); covers the 1-channel
            # (1, n) case as well since the mean of one channel is itself.
            if self.mono and getattr(y, "ndim", 1) > 1:
                y = self.librosa.to_mono(y)

            if self.remove_silence:
                # BUG FIX: librosa.effects.split() returns an array of
                # non-silent *interval index pairs*, not audio. The original
                # code returned the first index pair instead of samples.
                # Concatenate the non-silent segments to drop the silence.
                import numpy as np
                intervals = self.librosa.effects.split(y, top_db=40)
                if len(intervals) > 0:
                    y = np.concatenate([y[start:end] for start, end in intervals])

            return (y, sr)

        except Exception as e:
            self.logger.error(f"Audio cleaning error: {str(e)}")
            return None

    def normalize(self, audio_data: Tuple) -> Optional[Tuple]:
        """
        Normalize audio: resample to the target rate and peak-normalize.

        Args:
            audio_data: Tuple of (audio_array, sample_rate)

        Returns:
            Tuple of (normalized_audio, target_sample_rate), or None on error
        """
        try:
            y, sr = audio_data

            # Resample if needed
            if sr != self.target_sample_rate:
                y = self.librosa.resample(y, orig_sr=sr, target_sr=self.target_sample_rate)

            if self.normalize_amplitude:
                # BUG FIX: librosa has no effects.loudness or
                # util.peak_normalize; librosa.util.normalize performs peak
                # normalization, scaling the signal into [-1, 1].
                y = self.librosa.util.normalize(y)

            return (y, self.target_sample_rate)

        except Exception as e:
            self.logger.error(f"Audio normalization error: {str(e)}")
            return None

    def get_audio_stats(self, audio_path: str) -> Dict[str, Any]:
        """
        Get statistics about an audio file.

        Args:
            audio_path: Path to audio file

        Returns:
            Dictionary with audio statistics ({} if librosa is missing or on error)
        """
        try:
            if not self.librosa_available:
                return {}

            y, sr = self.librosa.load(str(audio_path), sr=None, mono=False)
            duration = self.librosa.get_duration(y=y, sr=sr)

            stats = {
                'file': str(audio_path),
                'sample_rate': sr,
                'duration_seconds': float(duration),
                'channels': 1 if len(y.shape) == 1 else y.shape[0],
                'total_samples': len(y) if len(y.shape) == 1 else y.shape[1],
                'file_size_bytes': Path(audio_path).stat().st_size,
                # RMS is only computed for mono signals; multi-channel reports 0
                'rms_energy': float(self.librosa.feature.rms(y=y)[0].mean()) if len(y.shape) == 1 else 0,
            }

            return stats

        except Exception as e:
            self.logger.error(f"Error getting audio stats: {str(e)}")
            return {}

    def batch_process_directory(
        self,
        input_dir: str,
        output_dir: Optional[str] = None,
        output_format: str = "wav",
    ) -> Dict[str, Any]:
        """
        Process all audio files in a directory

        Args:
            input_dir: Input directory path
            output_dir: Output directory path (defaults to config.output_path)
            output_format: Output audio format / file extension

        Returns:
            Dict with per-file 'results' and aggregate 'stats'
        """
        if not self.librosa_available:
            self.logger.error("librosa is required")
            return {'error': 'librosa not available'}

        output_dir = output_dir or self.config.output_path
        Path(output_dir).mkdir(parents=True, exist_ok=True)

        input_path = Path(input_dir)
        results = []

        for audio_file in input_path.iterdir():
            if audio_file.suffix.lower()[1:] not in self.supported_formats:
                continue

            try:
                # Validate
                if not self.validate(str(audio_file)):
                    self.stats['skipped'] += 1
                    continue

                # Load and process
                y, sr = self.librosa.load(str(audio_file), sr=None)
                cleaned = self.clean((y, sr))
                if cleaned is None:
                    # BUG FIX: clean() returns None on error; previously this
                    # was passed straight into normalize() and failed there.
                    raise RuntimeError("Cleaning failed")

                normalized = self.normalize(cleaned)
                if normalized is None:
                    raise RuntimeError("Normalization failed")

                # Save processed audio
                y_out, sr_out = normalized
                output_file = Path(output_dir) / f"{audio_file.stem}.{output_format}"

                if self.soundfile_available:
                    self.soundfile.write(str(output_file), y_out, sr_out)
                else:
                    # BUG FIX: librosa.output.write_wav was removed in
                    # librosa 0.8 — fail this file with a clear message
                    # instead of crashing with AttributeError.
                    raise RuntimeError("soundfile is required to write audio output")

                self.stats['output_count'] += 1

                results.append({
                    'file': str(audio_file),
                    'status': 'success',
                    'stats': self.get_audio_stats(str(audio_file))
                })

            except Exception as e:
                self.logger.error(f"Error processing {audio_file}: {str(e)}")
                self.stats['errors'] += 1
                results.append({
                    'file': str(audio_file),
                    'status': 'error',
                    'error': str(e)
                })

        self.logger.info(f"Processed {self.stats['output_count']} audio files")
        return {'results': results, 'stats': self.get_statistics()}
@@ -0,0 +1,196 @@
1
+ """
2
+ Base classes and configurations for data preprocessing
3
+ """
4
+
5
+ import os
6
+ import json
7
+ import logging
8
+ from abc import ABC, abstractmethod
9
+ from typing import Any, Dict, Optional, List
10
+ from dataclasses import dataclass, field, asdict
11
+ from pathlib import Path
12
+
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
@dataclass
class PreprocessingConfig:
    """Configuration shared by all data preprocessors."""

    input_path: str
    output_path: str
    data_type: str  # one of: 'text', 'image', 'audio', 'video'
    save_format: str = "json"  # json, csv, parquet
    batch_size: int = 32
    num_workers: int = 4
    verbose: bool = True
    clean_cache: bool = False
    validation_split: float = 0.1
    test_split: float = 0.1
    random_seed: int = 42
    custom_params: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return the configuration as a plain dictionary."""
        return asdict(self)

    def save(self, path: str) -> None:
        """Serialize this configuration to *path* as pretty-printed JSON."""
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(json.dumps(self.to_dict(), indent=2))
        logger.info(f"Config saved to {path}")

    @classmethod
    def load(cls, path: str) -> 'PreprocessingConfig':
        """Rebuild a PreprocessingConfig from a JSON file written by save()."""
        return cls(**json.loads(Path(path).read_text()))
51
+
52
+
53
class BaseDataPreprocessor(ABC):
    """Abstract base class for all data preprocessors.

    Subclasses supply validate/clean/normalize; this class provides the
    process() pipeline, batching, and statistics bookkeeping.
    """

    def __init__(self, config: 'PreprocessingConfig'):
        """
        Initialize preprocessor with configuration

        Args:
            config: PreprocessingConfig instance
        """
        self.config = config
        self.data_type = config.data_type
        self.logger = logging.getLogger(self.__class__.__name__)

        # Verbose mode surfaces per-item progress messages
        if config.verbose:
            self.logger.setLevel(logging.INFO)

        # Make sure the destination directory exists up front
        Path(self.config.output_path).mkdir(parents=True, exist_ok=True)

        # Statistics tracking
        self.stats = self._initial_stats()

    @staticmethod
    def _initial_stats() -> Dict[str, Any]:
        """Return a zeroed statistics dictionary."""
        return {
            'input_count': 0,
            'output_count': 0,
            'errors': 0,
            'skipped': 0,
            'processing_time': 0.0,
        }

    @abstractmethod
    def validate(self, data: Any) -> bool:
        """Return True when *data* is acceptable input, False otherwise."""

    @abstractmethod
    def clean(self, data: Any) -> Any:
        """Return a cleaned version of *data*."""

    @abstractmethod
    def normalize(self, data: Any) -> Any:
        """Return a normalized version of *data*."""

    def process(self, data: Any) -> Any:
        """Run the validate -> clean -> normalize pipeline on one item.

        Args:
            data: Input data

        Returns:
            Processed data, or None when the item is invalid or an error
            occurs (failures are logged and counted, never raised).
        """
        try:
            if not self.validate(data):
                self.logger.warning("Validation failed for input data")
                self.stats['skipped'] += 1
                return None

            normalized = self.normalize(self.clean(data))
            self.stats['output_count'] += 1
            return normalized

        except Exception as e:
            self.logger.error(f"Error processing data: {str(e)}")
            self.stats['errors'] += 1
            return None

    def batch_process(self, data_list: List[Any]) -> List[Any]:
        """Process every item in *data_list*, dropping failures.

        Args:
            data_list: List of data items

        Returns:
            List of successfully processed items (failed items are omitted)
        """
        total = len(data_list)
        self.stats['input_count'] = total
        # Emit roughly ten progress messages regardless of batch size
        step = max(1, total // 10)

        processed = []
        for idx, item in enumerate(data_list):
            if self.config.verbose and (idx + 1) % step == 0:
                self.logger.info(f"Processing: {idx + 1}/{total}")
            outcome = self.process(item)
            if outcome is not None:
                processed.append(outcome)

        self.logger.info(f"Batch processing complete. Stats: {self.stats}")
        return processed

    def get_statistics(self) -> Dict[str, Any]:
        """Return a copy of the processing statistics."""
        return self.stats.copy()

    def reset_statistics(self) -> None:
        """Zero out all statistics counters."""
        self.stats = self._initial_stats()

    def save_statistics(self, path: str) -> None:
        """Write the statistics dictionary to *path* as JSON."""
        Path(path).parent.mkdir(parents=True, exist_ok=True)
        with open(path, 'w') as f:
            json.dump(self.stats, f, indent=2)
        self.logger.info(f"Statistics saved to {path}")