gptmed 0.5.3__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,345 @@
1
+ """
2
+ CLI interface for data-preparation service
3
+
4
+ Provides command-line tools for preprocessing and cleaning various data types.
5
+
6
+ Usage:
7
+ data-preparation text --input data/raw/text --output data/processed/text
8
+ data-preparation image --input data/raw/images --output data/processed/images
9
+ data-preparation audio --input data/raw/audio --output data/processed/audio
10
+ data-preparation video --input data/raw/videos --output data/processed/videos
11
+ """
12
+
13
+ import sys
14
+ import argparse
15
+ import logging
16
+ import json
17
+ from pathlib import Path
18
+ from typing import Optional, Any
19
+
20
+ from .base import PreprocessingConfig
21
+ from .text import TextPreprocessor
22
+ from .image import ImagePreprocessor
23
+ from .audio import AudioPreprocessor
24
+ from .video import VideoPreprocessor
25
+
26
+
27
+ # Setup logging
28
# Configure root logging once for the CLI process, then create the logger
# used by every function in this module.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
33
+
34
+
35
class DataPreparationCLI:
    """CLI handler for data preparation tasks.

    Builds an ``argparse`` parser with one sub-command per data type
    (``text``, ``image``, ``audio``, ``video``).  Each sub-parser stores the
    matching ``process_*`` bound method as ``func``; ``run`` parses the
    command line and calls it.  Every ``process_*`` method returns a process
    exit code: ``0`` on success and ``1`` on failure.
    """

    def __init__(self):
        """Create the argument parser used by ``run``."""
        self.parser = self._create_parser()

    # ------------------------------------------------------------------
    # Parser construction
    # ------------------------------------------------------------------

    def _create_parser(self) -> argparse.ArgumentParser:
        """Create the top-level parser and register the four sub-commands."""
        parser = argparse.ArgumentParser(
            prog='data-preparation',
            description='Data preprocessing and cleaning toolkit for text, image, audio, and video',
            formatter_class=argparse.RawDescriptionHelpFormatter,
            epilog="""
Examples:
  # Text preprocessing
  data-preparation text \\
    --input ./data/raw/text \\
    --output ./data/processed/text \\
    --lowercase \\
    --remove-stopwords

  # Image preprocessing
  data-preparation image \\
    --input ./data/raw/images \\
    --output ./data/processed/images \\
    --target-size 224 224 \\
    --batch-size 32

  # Audio preprocessing
  data-preparation audio \\
    --input ./data/raw/audio \\
    --output ./data/processed/audio \\
    --target-sample-rate 16000 \\
    --mono

  # Video preprocessing
  data-preparation video \\
    --input ./data/raw/videos \\
    --output ./data/processed/videos \\
    --extract-frames \\
    --frame-sample-rate 30
""",
        )

        # Global arguments. They belong to the top-level parser, so on the
        # command line they must appear before the sub-command name.
        parser.add_argument(
            '--verbose', '-v',
            action='store_true',
            help='Enable verbose output'
        )
        parser.add_argument(
            '--version',
            action='version',
            version='data-preparation 0.1.0'
        )

        # Subcommands
        subparsers = parser.add_subparsers(dest='command', help='Data type to process')
        registrations = (
            ('text', 'Text data preprocessing', self._add_text_arguments),
            ('image', 'Image data preprocessing', self._add_image_arguments),
            ('audio', 'Audio data preprocessing', self._add_audio_arguments),
            ('video', 'Video data preprocessing', self._add_video_arguments),
        )
        for command, help_text, add_arguments in registrations:
            add_arguments(subparsers.add_parser(command, help=help_text))

        return parser

    def _add_text_arguments(self, parser: argparse.ArgumentParser) -> None:
        """Add text preprocessing arguments"""
        parser.add_argument('--input', required=True, help='Input text file or directory')
        parser.add_argument('--output', required=True, help='Output directory')
        parser.add_argument('--lowercase', action='store_true', help='Convert to lowercase')
        parser.add_argument('--remove-stopwords', action='store_true', help='Remove stopwords')
        parser.add_argument('--remove-punctuation', action='store_true', help='Remove punctuation')
        parser.add_argument('--min-length', type=int, default=3, help='Minimum text length')
        parser.add_argument('--max-length', type=int, help='Maximum text length')
        parser.add_argument('--batch-size', type=int, default=32, help='Batch processing size')
        parser.add_argument('--save-stats', action='store_true', help='Save processing statistics')
        parser.set_defaults(func=self.process_text)

    def _add_image_arguments(self, parser: argparse.ArgumentParser) -> None:
        """Add image preprocessing arguments"""
        parser.add_argument('--input', required=True, help='Input image directory')
        parser.add_argument('--output', required=True, help='Output directory')
        # The two values are handed to the image preprocessor unchanged, which
        # passes them to Pillow as a (width, height) size.
        parser.add_argument('--target-size', type=int, nargs=2, default=[224, 224],
                            help='Target image size (width height)')
        # A lone store_true flag with default=True can never be turned off, so
        # the flag is paired with an explicit negative form sharing its dest.
        parser.add_argument('--preserve-aspect', dest='preserve_aspect',
                            action='store_true', default=True,
                            help='Preserve aspect ratio (default)')
        parser.add_argument('--no-preserve-aspect', dest='preserve_aspect',
                            action='store_false',
                            help='Resize to the exact target size without preserving aspect ratio')
        parser.add_argument('--output-format', default='jpg', help='Output image format')
        parser.add_argument('--quality', type=int, default=95, help='JPEG quality (0-100)')
        parser.add_argument('--batch-size', type=int, default=32, help='Batch processing size')
        parser.set_defaults(func=self.process_image)

    def _add_audio_arguments(self, parser: argparse.ArgumentParser) -> None:
        """Add audio preprocessing arguments"""
        parser.add_argument('--input', required=True, help='Input audio directory')
        parser.add_argument('--output', required=True, help='Output directory')
        parser.add_argument('--target-sample-rate', type=int, default=16000,
                            help='Target sample rate (Hz)')
        parser.add_argument('--mono', action='store_true', help='Convert to mono')
        parser.add_argument('--remove-silence', action='store_true', help='Remove silence')
        parser.add_argument('--min-duration', type=float, default=0.5,
                            help='Minimum audio duration (seconds)')
        parser.add_argument('--output-format', default='wav', help='Output audio format')
        parser.add_argument('--batch-size', type=int, default=32, help='Batch processing size')
        parser.set_defaults(func=self.process_audio)

    def _add_video_arguments(self, parser: argparse.ArgumentParser) -> None:
        """Add video preprocessing arguments"""
        parser.add_argument('--input', required=True, help='Input video directory')
        parser.add_argument('--output', required=True, help='Output directory')
        parser.add_argument('--target-fps', type=int, default=30, help='Target frames per second')
        parser.add_argument('--target-resolution', type=int, nargs=2, default=[640, 480],
                            help='Target resolution (width height)')
        parser.add_argument('--extract-frames', action='store_true',
                            help='Extract frames from videos')
        parser.add_argument('--frame-sample-rate', type=int, default=30,
                            help='Extract every Nth frame')
        parser.add_argument('--min-duration', type=float, default=1.0,
                            help='Minimum video duration (seconds)')
        parser.add_argument('--batch-size', type=int, default=32, help='Batch processing size')
        parser.set_defaults(func=self.process_video)

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _build_config(args: argparse.Namespace, data_type: str) -> Any:
        """Build the PreprocessingConfig shared by every sub-command."""
        return PreprocessingConfig(
            input_path=args.input,
            output_path=args.output,
            data_type=data_type,
            batch_size=args.batch_size,
            verbose=args.verbose,
        )

    @staticmethod
    def _run_guarded(label: str, args: argparse.Namespace, job: Any) -> int:
        """Run ``job(args)`` and turn any exception into exit code 1.

        The error message is always logged; the traceback is attached only
        when ``--verbose`` was given.
        """
        try:
            logger.info(f"Starting {label} preprocessing...")
            return job(args)
        except Exception as e:
            logger.error(
                f"Error in {label} preprocessing: {str(e)}",
                exc_info=bool(getattr(args, 'verbose', False)),
            )
            return 1

    @staticmethod
    def _report_batch(results: Any, label: str, noun: str, source: str) -> int:
        """Log the outcome of a directory batch run and return the exit code.

        A result mapping carrying an ``'error'`` key (the image preprocessor
        returns one when Pillow is missing) is reported as a failure instead
        of surfacing later as a ``KeyError`` on ``'stats'``.
        NOTE(review): assumes the audio and video preprocessors return the
        same ``{'results': ..., 'stats': ...}`` shape -- confirm against them.
        """
        if isinstance(results, dict) and 'error' in results:
            logger.error(f"Error in {label} preprocessing: {results['error']}")
            return 1

        logger.info(f"Processed {noun} from {source}")
        logger.info(f"Results: {results['stats']}")
        logger.info(f"{label.capitalize()} preprocessing complete!")
        return 0

    # ------------------------------------------------------------------
    # Sub-command handlers
    # ------------------------------------------------------------------

    def process_text(self, args: argparse.Namespace) -> int:
        """Process text data.

        Args:
            args: Parsed arguments of the ``text`` sub-command.

        Returns:
            0 on success, 1 when the input path is missing or an error occurs.
        """
        return self._run_guarded('text', args, self._text_job)

    def _text_job(self, args: argparse.Namespace) -> int:
        """Run text preprocessing on a single file or on a directory."""
        preprocessor = TextPreprocessor(
            config=self._build_config(args, 'text'),
            remove_stopwords=args.remove_stopwords,
            remove_punctuation=args.remove_punctuation,
            lowercase=args.lowercase,
            min_length=args.min_length,
            max_length=args.max_length,
        )

        input_path = Path(args.input)

        # Process single file or directory
        if input_path.is_file():
            text = input_path.read_text(encoding='utf-8')
            # NOTE(review): the processed result is not written to
            # args.output on this path, although the log line below names it.
            preprocessor.batch_process([text])
            logger.info(f"Processed text. Output: {args.output}")
        elif input_path.is_dir():
            preprocessor.batch_process_files(args.input, args.output)
            logger.info(f"Processed directory: {args.input}")
        else:
            logger.error(f"Input path not found: {args.input}")
            return 1

        # Save statistics if requested
        if args.save_stats:
            output_dir = Path(args.output)
            # The single-file path above never creates the output directory.
            output_dir.mkdir(parents=True, exist_ok=True)
            stats_file = output_dir / 'processing_stats.json'
            preprocessor.save_statistics(str(stats_file))
            logger.info(f"Statistics saved to {stats_file}")

        logger.info("Text preprocessing complete!")
        return 0

    def process_image(self, args: argparse.Namespace) -> int:
        """Process image data.

        Args:
            args: Parsed arguments of the ``image`` sub-command.

        Returns:
            0 on success, 1 on any error (including a missing Pillow install).
        """
        return self._run_guarded('image', args, self._image_job)

    def _image_job(self, args: argparse.Namespace) -> int:
        """Resize/convert every supported image in the input directory."""
        preprocessor = ImagePreprocessor(
            config=self._build_config(args, 'image'),
            target_size=tuple(args.target_size),
            preserve_aspect_ratio=args.preserve_aspect,
        )
        results = preprocessor.batch_process_directory(
            args.input,
            args.output,
            output_format=args.output_format,
            quality=args.quality,
        )
        return self._report_batch(results, 'image', 'images', args.input)

    def process_audio(self, args: argparse.Namespace) -> int:
        """Process audio data.

        Args:
            args: Parsed arguments of the ``audio`` sub-command.

        Returns:
            0 on success, 1 on any error.
        """
        return self._run_guarded('audio', args, self._audio_job)

    def _audio_job(self, args: argparse.Namespace) -> int:
        """Run the audio preprocessor over the input directory."""
        preprocessor = AudioPreprocessor(
            config=self._build_config(args, 'audio'),
            target_sample_rate=args.target_sample_rate,
            mono=args.mono,
            remove_silence=args.remove_silence,
            min_duration=args.min_duration,
        )
        results = preprocessor.batch_process_directory(
            args.input,
            args.output,
            output_format=args.output_format,
        )
        return self._report_batch(results, 'audio', 'audio', args.input)

    def process_video(self, args: argparse.Namespace) -> int:
        """Process video data.

        Args:
            args: Parsed arguments of the ``video`` sub-command.

        Returns:
            0 on success, 1 on any error.
        """
        return self._run_guarded('video', args, self._video_job)

    def _video_job(self, args: argparse.Namespace) -> int:
        """Run the video preprocessor over the input directory."""
        preprocessor = VideoPreprocessor(
            config=self._build_config(args, 'video'),
            target_fps=args.target_fps,
            target_resolution=tuple(args.target_resolution),
            min_duration=args.min_duration,
        )
        results = preprocessor.batch_process_directory(
            args.input,
            args.output,
            extract_frames=args.extract_frames,
            frame_sample_rate=args.frame_sample_rate,
        )
        return self._report_batch(results, 'video', 'videos', args.input)

    # ------------------------------------------------------------------
    # Entry point
    # ------------------------------------------------------------------

    def run(self, args: Optional[list] = None) -> int:
        """Parse ``args`` and dispatch to the selected sub-command.

        Args:
            args: Argument list to parse; ``None`` lets argparse read
                ``sys.argv[1:]``.

        Returns:
            The handler's exit code, or 1 (after printing help) when no
            sub-command was given.
        """
        parsed_args = self.parser.parse_args(args)

        # Sub-commands are optional for argparse, so ``func`` is only present
        # when one was actually selected.
        if not hasattr(parsed_args, 'func'):
            self.parser.print_help()
            return 1

        return parsed_args.func(parsed_args)
336
+
337
+
338
def main():
    """Console entry point: run the CLI and exit with its status code."""
    exit_code = DataPreparationCLI().run()
    sys.exit(exit_code)


# Allow the module to be executed directly as a script.
if __name__ == '__main__':
    main()
@@ -0,0 +1,296 @@
1
+ """
2
+ Image data preprocessing and cleaning module
3
+
4
+ Handles image resizing, normalization, augmentation, and quality checks
5
+ """
6
+
7
+ import logging
8
+ from typing import Any, Dict, List, Optional, Tuple
9
+ from pathlib import Path
10
+ import json
11
+ from abc import ABC
12
+
13
+ from ..base import BaseDataPreprocessor, PreprocessingConfig
14
+
15
+
16
# Module-level logger named after this module. The ImagePreprocessor class
# below logs through ``self.logger`` rather than through this object.
logger = logging.getLogger(__name__)
17
+
18
+
19
class ImagePreprocessor(BaseDataPreprocessor):
    """
    Image preprocessing with validation, RGB conversion, and resizing

    Implemented features:
        - Validation of file extension, readability, and pixel dimensions
        - Transparency flattening onto white and conversion to RGB
        - Resizing to ``target_size``, optionally preserving aspect ratio
          (the image is then centred on a black canvas)
        - Per-file metadata extraction
        - Batch conversion of a directory to a single output format

    Pillow is imported lazily. When it is missing the instance is still
    created, ``validate`` returns False and ``batch_process_directory``
    returns ``{'error': 'PIL not available'}``.

    NOTE(review): ``self.logger``, ``self.config``, ``self.stats`` and
    ``get_statistics()`` are expected to come from BaseDataPreprocessor,
    which is not visible here.
    """

    def __init__(
        self,
        config: Optional[PreprocessingConfig] = None,
        target_size: Tuple[int, int] = (224, 224),
        normalize: bool = True,
        preserve_aspect_ratio: bool = True,
        min_size: Tuple[int, int] = (32, 32),
        max_size: Tuple[int, int] = (4096, 4096),
        supported_formats: Optional[List[str]] = None,
    ):
        """
        Initialize image preprocessor

        Args:
            config: PreprocessingConfig instance; a default one pointing at
                ./data/raw/images and ./data/processed/images is created
                when omitted
            target_size: Target image size (width, height), as passed to
                Pillow
            normalize: Pixel-normalization flag, stored as
                ``self.normalize_pixels``. No method in this class consults
                it yet.
            preserve_aspect_ratio: Whether to preserve aspect ratio when resizing
            min_size: Minimum allowed image size (width, height)
            max_size: Maximum allowed image size (width, height)
            supported_formats: List of supported file extensions without the
                leading dot
        """
        if config is None:
            config = PreprocessingConfig(
                input_path="./data/raw/images",
                output_path="./data/processed/images",
                data_type="image"
            )

        super().__init__(config)

        self.target_size = target_size
        # Stored under its own name: assigning the flag to ``self.normalize``
        # would shadow the normalize() method on the instance, and every
        # ``self.normalize(image)`` call would fail with
        # "'bool' object is not callable".
        self.normalize_pixels = normalize
        self.preserve_aspect_ratio = preserve_aspect_ratio
        self.min_size = min_size
        self.max_size = max_size
        self.supported_formats = supported_formats or ['jpg', 'jpeg', 'png', 'bmp', 'webp']

        self._import_image_library()

    def _import_image_library(self):
        """Attempt to import PIL/Pillow and record whether it is available"""
        try:
            from PIL import Image
            self.Image = Image
            self.pil_available = True
        except ImportError:
            self.logger.warning(
                "PIL/Pillow not available. Install with: pip install Pillow"
            )
            self.pil_available = False

    def _has_supported_extension(self, path: Path) -> bool:
        """Return True when the file suffix of ``path`` is a supported format"""
        allowed = {fmt.lower().lstrip('.') for fmt in self.supported_formats}
        return path.suffix.lower().lstrip('.') in allowed

    def _size_in_range(self, width: int, height: int) -> bool:
        """Return True when both dimensions lie within min_size..max_size"""
        min_w, min_h = self.min_size
        max_w, max_h = self.max_size
        return min_w <= width <= max_w and min_h <= height <= max_h

    def _lanczos_filter(self) -> Any:
        """Return the LANCZOS resampling constant for the installed Pillow"""
        # ``Image.Resampling`` only exists in newer Pillow releases; older
        # ones expose the constant directly on the Image module.
        return getattr(self.Image, 'Resampling', self.Image).LANCZOS

    def validate(self, data: Any) -> bool:
        """
        Validate image input

        Args:
            data: Image file path (str or Path) or PIL Image

        Returns:
            True if valid, False otherwise (never raises)
        """
        if not self.pil_available:
            self.logger.error("PIL/Pillow is required for image processing")
            return False

        try:
            if isinstance(data, (str, Path)):
                img_path = Path(data)
                if not img_path.exists():
                    self.logger.warning(f"Image file not found: {data}")
                    return False
                if not self._has_supported_extension(img_path):
                    self.logger.warning(f"Unsupported format: {data}")
                    return False

                # Opening is enough to read the size; the context manager
                # closes the file handle again.
                with self.Image.open(img_path) as img:
                    w, h = img.size

            elif hasattr(data, 'size'):  # PIL Image object
                w, h = data.size
            else:
                self.logger.warning(f"Invalid image type: {type(data)}")
                return False

            # Check each dimension on its own; comparing the (w, h) tuples
            # directly would be lexicographic and let e.g. (100, 10) pass a
            # (32, 32) minimum.
            if not self._size_in_range(w, h):
                self.logger.warning(f"Image size {(w, h)} outside allowed range")
                return False

            return True

        except Exception as e:
            self.logger.error(f"Image validation error: {str(e)}")
            return False

    def clean(self, image_data: Any) -> Any:
        """
        Clean image data

        Args:
            image_data: Image file path (str or Path) or PIL Image

        Returns:
            RGB PIL Image, or None when the image cannot be loaded/converted
        """
        try:
            # Load image if path
            if isinstance(image_data, (str, Path)):
                with self.Image.open(image_data) as opened:
                    # copy() loads the pixel data so the result stays usable
                    # after the file handle is closed.
                    img = opened.copy()
            else:
                img = image_data

            if img.mode in ('RGBA', 'LA', 'P'):
                # Go through RGBA so the alpha of LA images and palette
                # transparency are honoured as well, then flatten on white.
                rgba = img.convert('RGBA')
                rgb_img = self.Image.new('RGB', rgba.size, (255, 255, 255))
                rgb_img.paste(rgba, mask=rgba.split()[-1])
                img = rgb_img
            elif img.mode != 'RGB':
                img = img.convert('RGB')

            return img

        except Exception as e:
            self.logger.error(f"Image cleaning error: {str(e)}")
            return None

    def normalize(self, image: Any) -> Any:
        """
        Resize image to ``self.target_size``

        With ``preserve_aspect_ratio`` the image is shrunk to fit (it is
        never enlarged) and centred on a black canvas of the target size;
        otherwise it is resized to the exact target size. The input image is
        left unmodified.

        Args:
            image: PIL Image

        Returns:
            Resized PIL Image, or None on error
        """
        try:
            resample = self._lanczos_filter()

            if not self.preserve_aspect_ratio:
                return image.resize(self.target_size, resample)

            # thumbnail() works in place, so operate on a copy.
            fitted = image.copy()
            fitted.thumbnail(self.target_size, resample)

            # Pad to target size
            canvas = self.Image.new('RGB', self.target_size, (0, 0, 0))
            offset = (
                (self.target_size[0] - fitted.size[0]) // 2,
                (self.target_size[1] - fitted.size[1]) // 2
            )
            canvas.paste(fitted, offset)
            return canvas

        except Exception as e:
            self.logger.error(f"Image normalization error: {str(e)}")
            return None

    def get_image_stats(self, image_path: str) -> Dict[str, Any]:
        """
        Get statistics about image

        Args:
            image_path: Path to image file

        Returns:
            Dictionary with file, format, mode, width, height, size_bytes,
            aspect_ratio and size_mb; empty when Pillow is missing or the
            file cannot be read
        """
        try:
            if not self.pil_available:
                return {}

            with self.Image.open(image_path) as img:
                stats = {
                    'file': str(image_path),
                    'format': img.format,
                    'mode': img.mode,
                    'width': img.width,
                    'height': img.height,
                    'size_bytes': Path(image_path).stat().st_size,
                    'aspect_ratio': img.width / img.height if img.height > 0 else 0,
                }

            # Get file size in MB
            stats['size_mb'] = stats['size_bytes'] / (1024 * 1024)

            return stats

        except Exception as e:
            self.logger.error(f"Error getting image stats: {str(e)}")
            return {}

    def batch_process_directory(
        self,
        input_dir: str,
        output_dir: Optional[str] = None,
        output_format: str = "jpg",
        quality: int = 95,
    ) -> Dict[str, Any]:
        """
        Process all images in a directory

        Files are visited in sorted order. Each output is named after the
        input file's stem, so inputs that differ only by extension write to
        the same output file.

        Args:
            input_dir: Input directory path
            output_dir: Output directory path (defaults to config.output_path)
            output_format: Output image format
            quality: Output quality (applied for 'jpg' and 'jpeg')

        Returns:
            ``{'results': [...], 'stats': ...}``, or
            ``{'error': 'PIL not available'}`` when Pillow is missing
        """
        if not self.pil_available:
            self.logger.error("PIL/Pillow is required")
            return {'error': 'PIL not available'}

        target_dir = Path(output_dir or self.config.output_path)
        target_dir.mkdir(parents=True, exist_ok=True)

        save_as_jpeg = output_format.lower() in ('jpg', 'jpeg')
        results = []

        for img_file in sorted(Path(input_dir).iterdir()):
            if not self._has_supported_extension(img_file):
                continue

            try:
                # Validate and process
                if not self.validate(str(img_file)):
                    self.stats['skipped'] += 1
                    continue

                with self.Image.open(str(img_file)) as img:
                    cleaned = self.clean(img)
                    if cleaned is None:
                        raise ValueError("image cleaning failed")
                    normalized = self.normalize(cleaned)
                if normalized is None:
                    raise ValueError("image normalization failed")

                # Save processed image
                output_file = target_dir / f"{img_file.stem}.{output_format}"
                if save_as_jpeg:
                    normalized.save(str(output_file), 'JPEG', quality=quality)
                else:
                    normalized.save(str(output_file))

                self.stats['output_count'] += 1

                results.append({
                    'file': str(img_file),
                    'status': 'success',
                    'stats': self.get_image_stats(str(img_file))
                })

            except Exception as e:
                self.logger.error(f"Error processing {img_file}: {str(e)}")
                self.stats['errors'] += 1
                results.append({
                    'file': str(img_file),
                    'status': 'error',
                    'error': str(e)
                })

        self.logger.info(f"Processed {self.stats['output_count']} images")
        return {'results': results, 'stats': self.get_statistics()}