PyPI - gptmed - Versions diffs - 0.5.3__py3-none-any.whl → 0.6.0__py3-none-any.whl - Mend

gptmed 0.5.3__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

gptmed/data_preparation/__init__.py +33 -0
gptmed/data_preparation/audio/__init__.py +335 -0
gptmed/data_preparation/base.py +196 -0
gptmed/data_preparation/cli.py +345 -0
gptmed/data_preparation/image/__init__.py +296 -0
gptmed/data_preparation/text/__init__.py +268 -0
gptmed/data_preparation/video/__init__.py +415 -0
gptmed/framework/cli/__init__.py +18 -2
gptmed/framework/cli/__main__.py +7 -0
gptmed/framework/cli/startproject.py +845 -7
{gptmed-0.5.3.dist-info → gptmed-0.6.0.dist-info}/METADATA +7 -1
{gptmed-0.5.3.dist-info → gptmed-0.6.0.dist-info}/RECORD +16 -8
{gptmed-0.5.3.dist-info → gptmed-0.6.0.dist-info}/entry_points.txt +1 -0
{gptmed-0.5.3.dist-info → gptmed-0.6.0.dist-info}/WHEEL +0 -0
{gptmed-0.5.3.dist-info → gptmed-0.6.0.dist-info}/licenses/LICENSE +0 -0
{gptmed-0.5.3.dist-info → gptmed-0.6.0.dist-info}/top_level.txt +0 -0

gptmed/data_preparation/cli.py ADDED Viewed

@@ -0,0 +1,345 @@
+"""
+CLI interface for data-preparation service
+Provides command-line tools for preprocessing and cleaning various data types.
+Usage:
+    data-preparation text --input data/raw/text --output data/processed/text
+    data-preparation image --input data/raw/images --output data/processed/images
+    data-preparation audio --input data/raw/audio --output data/processed/audio
+    data-preparation video --input data/raw/videos --output data/processed/videos
+"""
+import sys
+import argparse
+import logging
+import json
+from pathlib import Path
+from typing import Optional, Any
+from .base import PreprocessingConfig
+from .text import TextPreprocessor
+from .image import ImagePreprocessor
+from .audio import AudioPreprocessor
+from .video import VideoPreprocessor
+# Setup logging
+# Configures the root logger when this module is imported: INFO level, with
+# timestamp, logger name and level prepended to every message.
+# NOTE(review): because this runs at import time it also affects any program
+# that merely imports this module — confirm that is intended.
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+# Module-level logger used by every method of DataPreparationCLI below.
+logger = logging.getLogger(__name__)
+class DataPreparationCLI:
+    """CLI handler for data preparation tasks"""
+    def __init__(self):
+        self.parser = self._create_parser()
+    def _create_parser(self) -> argparse.ArgumentParser:
+        """Create argument parser"""
+        parser = argparse.ArgumentParser(
+            prog='data-preparation',
+            description='Data preprocessing and cleaning toolkit for text, image, audio, and video',
+            formatter_class=argparse.RawDescriptionHelpFormatter,
+            epilog="""
+Examples:
+  # Text preprocessing
+  data-preparation text \\
+    --input ./data/raw/text \\
+    --output ./data/processed/text \\
+    --lowercase \\
+    --remove-stopwords
+  # Image preprocessing
+  data-preparation image \\
+    --input ./data/raw/images \\
+    --output ./data/processed/images \\
+    --target-size 224 224 \\
+    --batch-size 32
+  # Audio preprocessing
+  data-preparation audio \\
+    --input ./data/raw/audio \\
+    --output ./data/processed/audio \\
+    --target-sample-rate 16000 \\
+    --mono
+  # Video preprocessing
+  data-preparation video \\
+    --input ./data/raw/videos \\
+    --output ./data/processed/videos \\
+    --extract-frames \\
+    --frame-sample-rate 30
+            """
+        )
+        # Global arguments
+        parser.add_argument(
+            '--verbose', '-v',
+            action='store_true',
+            help='Enable verbose output'
+        )
+        parser.add_argument(
+            '--version',
+            action='version',
+            version='data-preparation 0.1.0'
+        )
+        # Subcommands
+        subparsers = parser.add_subparsers(dest='command', help='Data type to process')
+        # Text preprocessing
+        text_parser = subparsers.add_parser('text', help='Text data preprocessing')
+        self._add_text_arguments(text_parser)
+        # Image preprocessing
+        image_parser = subparsers.add_parser('image', help='Image data preprocessing')
+        self._add_image_arguments(image_parser)
+        # Audio preprocessing
+        audio_parser = subparsers.add_parser('audio', help='Audio data preprocessing')
+        self._add_audio_arguments(audio_parser)
+        # Video preprocessing
+        video_parser = subparsers.add_parser('video', help='Video data preprocessing')
+        self._add_video_arguments(video_parser)
+        return parser
+    def _add_text_arguments(self, parser: argparse.ArgumentParser) -> None:
+        """Add text preprocessing arguments"""
+        parser.add_argument('--input', required=True, help='Input text file or directory')
+        parser.add_argument('--output', required=True, help='Output directory')
+        parser.add_argument('--lowercase', action='store_true', help='Convert to lowercase')
+        parser.add_argument('--remove-stopwords', action='store_true', help='Remove stopwords')
+        parser.add_argument('--remove-punctuation', action='store_true', help='Remove punctuation')
+        parser.add_argument('--min-length', type=int, default=3, help='Minimum text length')
+        parser.add_argument('--max-length', type=int, help='Maximum text length')
+        parser.add_argument('--batch-size', type=int, default=32, help='Batch processing size')
+        parser.add_argument('--save-stats', action='store_true', help='Save processing statistics')
+        parser.set_defaults(func=self.process_text)
+    def _add_image_arguments(self, parser: argparse.ArgumentParser) -> None:
+        """Add image preprocessing arguments"""
+        parser.add_argument('--input', required=True, help='Input image directory')
+        parser.add_argument('--output', required=True, help='Output directory')
+        parser.add_argument('--target-size', type=int, nargs=2, default=[224, 224],
+                          help='Target image size (height width)')
+        parser.add_argument('--preserve-aspect', action='store_true', default=True,
+                          help='Preserve aspect ratio')
+        parser.add_argument('--output-format', default='jpg', help='Output image format')
+        parser.add_argument('--quality', type=int, default=95, help='JPEG quality (0-100)')
+        parser.add_argument('--batch-size', type=int, default=32, help='Batch processing size')
+        parser.set_defaults(func=self.process_image)
+    def _add_audio_arguments(self, parser: argparse.ArgumentParser) -> None:
+        """Add audio preprocessing arguments"""
+        parser.add_argument('--input', required=True, help='Input audio directory')
+        parser.add_argument('--output', required=True, help='Output directory')
+        parser.add_argument('--target-sample-rate', type=int, default=16000,
+                          help='Target sample rate (Hz)')
+        parser.add_argument('--mono', action='store_true', help='Convert to mono')
+        parser.add_argument('--remove-silence', action='store_true', help='Remove silence')
+        parser.add_argument('--min-duration', type=float, default=0.5,
+                          help='Minimum audio duration (seconds)')
+        parser.add_argument('--output-format', default='wav', help='Output audio format')
+        parser.add_argument('--batch-size', type=int, default=32, help='Batch processing size')
+        parser.set_defaults(func=self.process_audio)
+    def _add_video_arguments(self, parser: argparse.ArgumentParser) -> None:
+        """Add video preprocessing arguments"""
+        parser.add_argument('--input', required=True, help='Input video directory')
+        parser.add_argument('--output', required=True, help='Output directory')
+        parser.add_argument('--target-fps', type=int, default=30, help='Target frames per second')
+        parser.add_argument('--target-resolution', type=int, nargs=2, default=[640, 480],
+                          help='Target resolution (width height)')
+        parser.add_argument('--extract-frames', action='store_true',
+                          help='Extract frames from videos')
+        parser.add_argument('--frame-sample-rate', type=int, default=30,
+                          help='Extract every Nth frame')
+        parser.add_argument('--min-duration', type=float, default=1.0,
+                          help='Minimum video duration (seconds)')
+        parser.add_argument('--batch-size', type=int, default=32, help='Batch processing size')
+        parser.set_defaults(func=self.process_video)
+    def process_text(self, args: argparse.Namespace) -> int:
+        """Process text data"""
+        try:
+            logger.info("Starting text preprocessing...")
+            config = PreprocessingConfig(
+                input_path=args.input,
+                output_path=args.output,
+                data_type='text',
+                batch_size=args.batch_size,
+                verbose=args.verbose,
+            )
+            preprocessor = TextPreprocessor(
+                config=config,
+                remove_stopwords=args.remove_stopwords,
+                remove_punctuation=args.remove_punctuation,
+                lowercase=args.lowercase,
+                min_length=args.min_length,
+                max_length=args.max_length,
+            )
+            input_path = Path(args.input)
+            # Process single file or directory
+            if input_path.is_file():
+                with open(input_path, 'r', encoding='utf-8') as f:
+                    text = f.read()
+                results = preprocessor.batch_process([text])
+                logger.info(f"Processed text. Output: {args.output}")
+            elif input_path.is_dir():
+                results = preprocessor.batch_process_files(args.input, args.output)
+                logger.info(f"Processed directory: {args.input}")
+            else:
+                logger.error(f"Input path not found: {args.input}")
+                return 1
+            # Save statistics if requested
+            if args.save_stats:
+                stats_file = Path(args.output) / 'processing_stats.json'
+                preprocessor.save_statistics(str(stats_file))
+                logger.info(f"Statistics saved to {stats_file}")
+            logger.info("Text preprocessing complete!")
+            return 0
+        except Exception as e:
+            logger.error(f"Error in text preprocessing: {str(e)}")
+            return 1
+    def process_image(self, args: argparse.Namespace) -> int:
+        """Process image data"""
+        try:
+            logger.info("Starting image preprocessing...")
+            config = PreprocessingConfig(
+                input_path=args.input,
+                output_path=args.output,
+                data_type='image',
+                batch_size=args.batch_size,
+                verbose=args.verbose,
+            )
+            preprocessor = ImagePreprocessor(
+                config=config,
+                target_size=tuple(args.target_size),
+                preserve_aspect_ratio=args.preserve_aspect,
+            )
+            results = preprocessor.batch_process_directory(
+                args.input,
+                args.output,
+                output_format=args.output_format,
+                quality=args.quality,
+            )
+            logger.info(f"Processed images from {args.input}")
+            logger.info(f"Results: {results['stats']}")
+            logger.info("Image preprocessing complete!")
+            return 0
+        except Exception as e:
+            logger.error(f"Error in image preprocessing: {str(e)}")
+            return 1
+    def process_audio(self, args: argparse.Namespace) -> int:
+        """Process audio data"""
+        try:
+            logger.info("Starting audio preprocessing...")
+            config = PreprocessingConfig(
+                input_path=args.input,
+                output_path=args.output,
+                data_type='audio',
+                batch_size=args.batch_size,
+                verbose=args.verbose,
+            )
+            preprocessor = AudioPreprocessor(
+                config=config,
+                target_sample_rate=args.target_sample_rate,
+                mono=args.mono,
+                remove_silence=args.remove_silence,
+                min_duration=args.min_duration,
+            )
+            results = preprocessor.batch_process_directory(
+                args.input,
+                args.output,
+                output_format=args.output_format,
+            )
+            logger.info(f"Processed audio from {args.input}")
+            logger.info(f"Results: {results['stats']}")
+            logger.info("Audio preprocessing complete!")
+            return 0
+        except Exception as e:
+            logger.error(f"Error in audio preprocessing: {str(e)}")
+            return 1
+    def process_video(self, args: argparse.Namespace) -> int:
+        """Process video data"""
+        try:
+            logger.info("Starting video preprocessing...")
+            config = PreprocessingConfig(
+                input_path=args.input,
+                output_path=args.output,
+                data_type='video',
+                batch_size=args.batch_size,
+                verbose=args.verbose,
+            )
+            preprocessor = VideoPreprocessor(
+                config=config,
+                target_fps=args.target_fps,
+                target_resolution=tuple(args.target_resolution),
+                min_duration=args.min_duration,
+            )
+            results = preprocessor.batch_process_directory(
+                args.input,
+                args.output,
+                extract_frames=args.extract_frames,
+                frame_sample_rate=args.frame_sample_rate,
+            )
+            logger.info(f"Processed videos from {args.input}")
+            logger.info(f"Results: {results['stats']}")
+            logger.info("Video preprocessing complete!")
+            return 0
+        except Exception as e:
+            logger.error(f"Error in video preprocessing: {str(e)}")
+            return 1
+    def run(self, args: Optional[list] = None) -> int:
+        """Run CLI"""
+        parsed_args = self.parser.parse_args(args)
+        if not hasattr(parsed_args, 'func'):
+            self.parser.print_help()
+            return 1
+        return parsed_args.func(parsed_args)
+def main():
+    """Main entry point"""
+    cli = DataPreparationCLI()
+    sys.exit(cli.run())
+if __name__ == '__main__':
+    main()

gptmed/data_preparation/image/__init__.py ADDED Viewed

@@ -0,0 +1,296 @@
+"""
+Image data preprocessing and cleaning module
+Handles image resizing, normalization, augmentation, and quality checks
+"""
+import logging
+from typing import Any, Dict, List, Optional, Tuple
+from pathlib import Path
+import json
+from abc import ABC
+from ..base import BaseDataPreprocessor, PreprocessingConfig
+# Module-level logger. The ImagePreprocessor class below logs through
+# self.logger (presumably provided by BaseDataPreprocessor — confirm) rather
+# than through this name.
+logger = logging.getLogger(__name__)
+class ImagePreprocessor(BaseDataPreprocessor):
+    """
+    Image preprocessing with resizing, normalization, and validation
+    Features:
+        - Image format validation
+        - Resizing and aspect ratio preservation
+        - Normalization (pixel value scaling)
+        - Brightness/contrast adjustment
+        - Noise reduction
+        - Format conversion
+        - Metadata extraction
+        - Duplicate detection via hashing
+    """
+    def __init__(
+        self,
+        config: Optional[PreprocessingConfig] = None,
+        target_size: Tuple[int, int] = (224, 224),
+        normalize: bool = True,
+        preserve_aspect_ratio: bool = True,
+        min_size: Tuple[int, int] = (32, 32),
+        max_size: Tuple[int, int] = (4096, 4096),
+        supported_formats: Optional[List[str]] = None,
+    ):
+        """
+        Initialize image preprocessor
+        Args:
+            config: PreprocessingConfig instance
+            target_size: Target image size (height, width)
+            normalize: Whether to normalize pixel values
+            preserve_aspect_ratio: Whether to preserve aspect ratio when resizing
+            min_size: Minimum allowed image size
+            max_size: Maximum allowed image size
+            supported_formats: List of supported image formats
+        """
+        if config is None:
+            config = PreprocessingConfig(
+                input_path="./data/raw/images",
+                output_path="./data/processed/images",
+                data_type="image"
+            )
+        super().__init__(config)
+        self.target_size = target_size
+        self.normalize = normalize
+        self.preserve_aspect_ratio = preserve_aspect_ratio
+        self.min_size = min_size
+        self.max_size = max_size
+        self.supported_formats = supported_formats or ['jpg', 'jpeg', 'png', 'bmp', 'webp']
+        self._import_image_library()
+    def _import_image_library(self):
+        """Attempt to import PIL/Pillow"""
+        try:
+            from PIL import Image
+            self.Image = Image
+            self.pil_available = True
+        except ImportError:
+            self.logger.warning(
+                "PIL/Pillow not available. Install with: pip install Pillow"
+            )
+            self.pil_available = False
+    def validate(self, data: Any) -> bool:
+        """
+        Validate image input
+        Args:
+            data: Image file path (str) or PIL Image
+        Returns:
+            True if valid, False otherwise
+        """
+        if not self.pil_available:
+            self.logger.error("PIL/Pillow is required for image processing")
+            return False
+        try:
+            if isinstance(data, str):
+                img_path = Path(data)
+                if not img_path.exists():
+                    self.logger.warning(f"Image file not found: {data}")
+                    return False
+                if not any(str(img_path).lower().endswith(f) for f in self.supported_formats):
+                    self.logger.warning(f"Unsupported format: {data}")
+                    return False
+                # Try to open
+                img = self.Image.open(img_path)
+                w, h = img.size
+            elif hasattr(data, 'size'):  # PIL Image object
+                w, h = data.size
+            else:
+                self.logger.warning(f"Invalid image type: {type(data)}")
+                return False
+            # Check size constraints
+            if (w, h) < self.min_size or (w, h) > self.max_size:
+                self.logger.warning(f"Image size {(w, h)} outside allowed range")
+                return False
+            return True
+        except Exception as e:
+            self.logger.error(f"Image validation error: {str(e)}")
+            return False
+    def clean(self, image_data: Any) -> Any:
+        """
+        Clean image data
+        Args:
+            image_data: Image file path or PIL Image
+        Returns:
+            Cleaned PIL Image
+        """
+        try:
+            # Load image if path
+            if isinstance(image_data, str):
+                img = self.Image.open(image_data)
+            else:
+                img = image_data
+            # Convert to RGB if needed (remove alpha channel, convert grayscale)
+            if img.mode in ('RGBA', 'LA', 'P'):
+                rgb_img = self.Image.new('RGB', img.size, (255, 255, 255))
+                rgb_img.paste(img, mask=img.split()[-1] if img.mode == 'RGBA' else None)
+                img = rgb_img
+            elif img.mode != 'RGB':
+                img = img.convert('RGB')
+            return img
+        except Exception as e:
+            self.logger.error(f"Image cleaning error: {str(e)}")
+            return None
+    def normalize(self, image: Any) -> Any:
+        """
+        Normalize image
+        Args:
+            image: PIL Image
+        Returns:
+            Normalized image
+        """
+        try:
+            # Resize image
+            if self.preserve_aspect_ratio:
+                image.thumbnail(self.target_size, self.Image.Resampling.LANCZOS)
+                # Pad to target size
+                new_img = self.Image.new('RGB', self.target_size, (0, 0, 0))
+                offset = (
+                    (self.target_size[0] - image.size[0]) // 2,
+                    (self.target_size[1] - image.size[1]) // 2
+                )
+                new_img.paste(image, offset)
+                image = new_img
+            else:
+                image = image.resize(self.target_size, self.Image.Resampling.LANCZOS)
+            return image
+        except Exception as e:
+            self.logger.error(f"Image normalization error: {str(e)}")
+            return None
+    def get_image_stats(self, image_path: str) -> Dict[str, Any]:
+        """
+        Get statistics about image
+        Args:
+            image_path: Path to image file
+        Returns:
+            Dictionary with image statistics
+        """
+        try:
+            if not self.pil_available:
+                return {}
+            img = self.Image.open(image_path)
+            stats = {
+                'file': str(image_path),
+                'format': img.format,
+                'mode': img.mode,
+                'width': img.width,
+                'height': img.height,
+                'size_bytes': Path(image_path).stat().st_size,
+                'aspect_ratio': img.width / img.height if img.height > 0 else 0,
+            }
+            # Get file size in MB
+            stats['size_mb'] = stats['size_bytes'] / (1024 * 1024)
+            return stats
+        except Exception as e:
+            self.logger.error(f"Error getting image stats: {str(e)}")
+            return {}
+    def batch_process_directory(
+        self,
+        input_dir: str,
+        output_dir: Optional[str] = None,
+        output_format: str = "jpg",
+        quality: int = 95,
+    ) -> Dict[str, Any]:
+        """
+        Process all images in a directory
+        Args:
+            input_dir: Input directory path
+            output_dir: Output directory path
+            output_format: Output image format
+            quality: Output quality (for JPEG)
+        Returns:
+            Processing results
+        """
+        if not self.pil_available:
+            self.logger.error("PIL/Pillow is required")
+            return {'error': 'PIL not available'}
+        output_dir = output_dir or self.config.output_path
+        Path(output_dir).mkdir(parents=True, exist_ok=True)
+        input_path = Path(input_dir)
+        results = []
+        for img_file in input_path.iterdir():
+            if img_file.suffix.lower()[1:] not in self.supported_formats:
+                continue
+            try:
+                # Validate and process
+                if not self.validate(str(img_file)):
+                    self.stats['skipped'] += 1
+                    continue
+                img = self.Image.open(str(img_file))
+                cleaned = self.clean(img)
+                normalized = self.normalize(cleaned)
+                # Save processed image
+                output_file = Path(output_dir) / f"{img_file.stem}.{output_format}"
+                if output_format.lower() == 'jpg':
+                    normalized.save(str(output_file), 'JPEG', quality=quality)
+                else:
+                    normalized.save(str(output_file))
+                self.stats['output_count'] += 1
+                results.append({
+                    'file': str(img_file),
+                    'status': 'success',
+                    'stats': self.get_image_stats(str(img_file))
+                })
+            except Exception as e:
+                self.logger.error(f"Error processing {img_file}: {str(e)}")
+                self.stats['errors'] += 1
+                results.append({
+                    'file': str(img_file),
+                    'status': 'error',
+                    'error': str(e)
+                })
+        self.logger.info(f"Processed {self.stats['output_count']} images")
+        return {'results': results, 'stats': self.get_statistics()}

gptmed 0.5.3__py3-none-any.whl → 0.6.0__py3-none-any.whl

gptmed 0.5.3__py3-none-any.whl → 0.6.0__py3-none-any.whl