gptmed 0.5.3__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,268 @@
1
+ """
2
+ Text data preprocessing and cleaning module
3
+
4
+ Handles text normalization, cleaning, tokenization, and validation
5
+ """
6
+
7
+ import re
8
+ import string
9
+ import unicodedata
10
+ import logging
11
+ from typing import Any, Dict, List, Optional
12
+ from pathlib import Path
13
+ import json
14
+
15
+ from ..base import BaseDataPreprocessor, PreprocessingConfig
16
+
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class TextPreprocessor(BaseDataPreprocessor):
    """
    Text preprocessing with cleaning, normalization, and validation.

    Features:
    - Text cleaning (whitespace, HTML tags, URLs, emails, control characters)
    - Case normalization
    - Unicode normalization (NFD decomposition with combining-mark removal)
    - Stopword removal
    - Punctuation handling
    """

    # Patterns compiled once at class-creation time instead of on every
    # clean()/get_text_stats() call — these run once per document in batch
    # jobs, so the per-call re.sub string-pattern lookup was pure overhead.
    _HTML_TAG_RE = re.compile(r'<[^>]+>')
    _URL_RE = re.compile(r'http[s]?://\S+')
    _WWW_RE = re.compile(r'www\.\S+')
    _EMAIL_RE = re.compile(r'\S+@\S+')
    _MULTI_WS_RE = re.compile(r'\s+')
    _PUNCT_SPACING_RE = re.compile(r'\s+([.!?,;:])')
    _SENTENCE_SPLIT_RE = re.compile(r'[.!?]+')

    # Basic English stopword list; immutable and shared across instances.
    _STOPWORDS = frozenset({
        'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have',
        'i', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you',
        'do', 'at', 'this', 'but', 'his', 'by', 'from', 'is', 'was',
        'are', 'been', 'were', 'or', 'an', 'which', 'their', 'what',
        'so', 'up', 'out', 'if', 'about', 'who', 'get', 'them', 'me',
    })

    def __init__(
        self,
        config: Optional[PreprocessingConfig] = None,
        remove_stopwords: bool = False,
        remove_punctuation: bool = False,
        lowercase: bool = True,
        min_length: int = 3,
        max_length: Optional[int] = None,
    ):
        """
        Initialize text preprocessor.

        Args:
            config: PreprocessingConfig instance (a default text config is
                created when None)
            remove_stopwords: Whether to remove common stopwords
            remove_punctuation: Whether to remove punctuation
            lowercase: Whether to convert to lowercase
            min_length: Minimum text length to keep
            max_length: Maximum text length (None for unlimited)
        """
        if config is None:
            config = PreprocessingConfig(
                input_path="./data/raw",
                output_path="./data/processed",
                data_type="text"
            )

        super().__init__(config)

        self.remove_stopwords = remove_stopwords
        self.remove_punctuation = remove_punctuation
        self.lowercase = lowercase
        self.min_length = min_length
        self.max_length = max_length

        # Empty set when disabled so membership tests are cheap no-ops.
        self.stopwords = self._load_stopwords() if remove_stopwords else set()

    def _load_stopwords(self) -> set:
        """Return a mutable copy of the common English stopword set."""
        # Copy so callers that mutate self.stopwords don't corrupt the
        # shared class-level constant.
        return set(self._STOPWORDS)

    def validate(self, data: Any) -> bool:
        """
        Validate text input.

        Args:
            data: Input text

        Returns:
            True if valid, False otherwise
        """
        if not isinstance(data, str):
            self.logger.warning(f"Invalid text type: {type(data)}")
            return False

        # Length gate uses the stripped length so whitespace-only input
        # is rejected, matching how clean() would collapse it anyway.
        if len(data.strip()) < self.min_length:
            self.logger.debug(f"Text too short: {len(data)}")
            return False

        if self.max_length and len(data) > self.max_length:
            self.logger.debug(f"Text too long: {len(data)}")
            return False

        return True

    def clean(self, text: str) -> str:
        """
        Clean text by removing artifacts and normalizing.

        Args:
            text: Raw text

        Returns:
            Cleaned text
        """
        text = text.strip()

        # Normalize unicode to NFD (decomposed form), then drop combining
        # marks ('Mn') — this strips accents: 'é' -> 'e'.
        text = unicodedata.normalize('NFD', text)
        text = ''.join(ch for ch in text if unicodedata.category(ch) != 'Mn')

        # Remove HTML tags, URLs, and email addresses.
        text = self._HTML_TAG_RE.sub('', text)
        text = self._URL_RE.sub('', text)
        text = self._WWW_RE.sub('', text)
        text = self._EMAIL_RE.sub('', text)

        # Collapse runs of whitespace to a single space.
        text = self._MULTI_WS_RE.sub(' ', text)

        # Drop non-printable control characters (keep whitespace).
        text = ''.join(ch for ch in text if ch.isprintable() or ch.isspace())

        return text.strip()

    def normalize(self, text: str) -> str:
        """
        Normalize text.

        Args:
            text: Cleaned text

        Returns:
            Normalized text
        """
        if self.lowercase:
            text = text.lower()

        if self.remove_punctuation:
            # One C-level pass removes all punctuation characters.
            text = text.translate(str.maketrans('', '', string.punctuation))
        else:
            # Keep punctuation but normalize spacing before it ("word ." -> "word.").
            text = self._PUNCT_SPACING_RE.sub(r'\1', text)

        if self.remove_stopwords:
            # NOTE(review): when punctuation is kept, tokens like "the,"
            # will not match bare stopwords — preserved original behavior.
            words = text.split()
            words = [w for w in words if w not in self.stopwords]
            text = ' '.join(words)

        return text.strip()

    def tokenize(self, text: str) -> List[str]:
        """
        Simple whitespace word tokenization of fully processed text.

        Args:
            text: Text to tokenize

        Returns:
            List of tokens (empty list when processing rejects the text)
        """
        processed = self.process(text)
        if processed is None:
            return []

        return processed.split()

    def get_text_stats(self, text: str) -> Dict[str, Any]:
        """
        Get statistics about the text.

        Args:
            text: Input text

        Returns:
            Dictionary with text statistics (empty dict when processing
            rejects the text)
        """
        processed = self.process(text)
        if processed is None:
            return {}
        return self._compute_stats(text, processed)

    def _compute_stats(self, text: str, processed: str) -> Dict[str, Any]:
        """Build the stats dict from raw text and its processed form."""
        words = processed.split()
        sentences = self._SENTENCE_SPLIT_RE.split(processed)
        sentences = [s.strip() for s in sentences if s.strip()]

        return {
            'original_length': len(text),
            'cleaned_length': len(processed),
            'word_count': len(words),
            'sentence_count': len(sentences),
            'avg_word_length': sum(len(w) for w in words) / len(words) if words else 0,
            'unique_words': len(set(words)),
            'vocabulary_diversity': len(set(words)) / len(words) if words else 0,
        }

    def batch_process_files(
        self,
        input_dir: str,
        output_dir: Optional[str] = None,
        pattern: str = "*.txt"
    ) -> Dict[str, Any]:
        """
        Process multiple text files from a directory.

        Args:
            input_dir: Input directory path
            output_dir: Output directory path (uses config if None)
            pattern: File pattern to match

        Returns:
            Processing results: {'results': [...], 'stats': {...}}
        """
        output_dir = output_dir or self.config.output_path
        Path(output_dir).mkdir(parents=True, exist_ok=True)

        input_path = Path(input_dir)
        results = []

        for file_path in input_path.glob(pattern):
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()

                # Process exactly once. The previous implementation re-ran
                # the full pipeline inside get_text_stats(), doubling the
                # work and double-counting per-file pipeline statistics.
                processed = self.process(text)

                if processed:
                    output_file = Path(output_dir) / file_path.name
                    with open(output_file, 'w', encoding='utf-8') as f:
                        f.write(processed)

                    results.append({
                        'file': str(file_path),
                        'status': 'success',
                        'stats': self._compute_stats(text, processed)
                    })

            except Exception as e:
                self.logger.error(f"Error processing {file_path}: {str(e)}")
                results.append({
                    'file': str(file_path),
                    'status': 'error',
                    'error': str(e)
                })

        self.logger.info(f"Processed {len(results)} files")
        return {'results': results, 'stats': self.get_statistics()}
@@ -0,0 +1,415 @@
1
+ """
2
+ Video data preprocessing and cleaning module
3
+
4
+ Handles video frame extraction, resizing, quality checks, and metadata extraction
5
+ """
6
+
7
+ import logging
8
+ from typing import Any, Dict, List, Optional, Tuple
9
+ from pathlib import Path
10
+ import json
11
+
12
+ from ..base import BaseDataPreprocessor, PreprocessingConfig
13
+
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class VideoPreprocessor(BaseDataPreprocessor):
    """
    Video preprocessing with frame extraction, resizing, and validation.

    Features:
    - Video format validation
    - Frame extraction at specified intervals
    - Resolution resizing (optionally aspect-preserving with padding)
    - Duration and resolution validation
    - Metadata/statistics extraction

    Requires OpenCV (``opencv-python``); ffmpeg availability is detected
    but only recorded, not used directly here.
    """

    def __init__(
        self,
        config: Optional[PreprocessingConfig] = None,
        target_fps: int = 30,
        target_resolution: Tuple[int, int] = (640, 480),
        preserve_aspect_ratio: bool = True,
        min_duration: float = 1.0,  # seconds
        max_duration: Optional[float] = None,  # seconds
        min_width: int = 320,
        min_height: int = 240,
        frame_extraction_interval: Optional[int] = None,  # Extract every N frames
        extract_frames: bool = False,
        supported_formats: Optional[List[str]] = None,
    ):
        """
        Initialize video preprocessor.

        Args:
            config: PreprocessingConfig instance
            target_fps: Target frames per second
            target_resolution: Target resolution (width, height)
            preserve_aspect_ratio: Whether to preserve aspect ratio
            min_duration: Minimum video duration in seconds
            max_duration: Maximum video duration in seconds
            min_width: Minimum video width
            min_height: Minimum video height
            frame_extraction_interval: Extract every N frames (None = no extraction)
            extract_frames: Whether to extract frames to disk
            supported_formats: List of supported video formats (extensions
                without the leading dot)
        """
        if config is None:
            config = PreprocessingConfig(
                input_path="./data/raw/videos",
                output_path="./data/processed/videos",
                data_type="video"
            )

        super().__init__(config)

        self.target_fps = target_fps
        self.target_resolution = target_resolution
        self.preserve_aspect_ratio = preserve_aspect_ratio
        self.min_duration = min_duration
        self.max_duration = max_duration
        self.min_width = min_width
        self.min_height = min_height
        self.frame_extraction_interval = frame_extraction_interval
        # BUGFIX: previously stored as ``self.extract_frames``, which
        # shadowed the extract_frames() method on every instance and made
        # batch_process_directory()'s ``self.extract_frames(...)`` call
        # raise "TypeError: 'bool' object is not callable".
        self.extract_frames_enabled = extract_frames
        self.supported_formats = supported_formats or ['mp4', 'avi', 'mov', 'mkv', 'flv', 'wmv']

        self._import_video_library()

    def _import_video_library(self):
        """Detect optional video backends (OpenCV and ffmpeg)."""
        self.opencv_available = False
        self.ffmpeg_available = False

        try:
            import cv2
            self.cv2 = cv2
            self.opencv_available = True
        except ImportError:
            self.logger.warning(
                "OpenCV not available. Install with: pip install opencv-python"
            )

        try:
            import subprocess
            result = subprocess.run(['ffmpeg', '-version'], capture_output=True)
            self.ffmpeg_available = result.returncode == 0
        except Exception:
            # Narrowed from a bare ``except:`` (which also swallowed
            # KeyboardInterrupt/SystemExit). Typically FileNotFoundError
            # when ffmpeg is not on PATH; availability detection is
            # best-effort, so we log and continue.
            self.logger.warning(
                "ffmpeg not available. Install from: https://ffmpeg.org/download.html"
            )

    def validate(self, data: Any) -> bool:
        """
        Validate video input.

        Args:
            data: Video file path (str)

        Returns:
            True if valid, False otherwise
        """
        if not self.opencv_available:
            self.logger.error("OpenCV is required for video processing")
            return False

        try:
            if not isinstance(data, str):
                self.logger.warning(f"Invalid video type: {type(data)}")
                return False

            video_path = Path(data)
            if not video_path.exists():
                self.logger.warning(f"Video file not found: {data}")
                return False

            # BUGFIX: was a substring endswith() check, so a dot-less name
            # like "clip_mp4" passed. Use the actual file extension, which
            # also matches batch_process_directory()'s filter.
            if video_path.suffix.lower().lstrip('.') not in self.supported_formats:
                self.logger.warning(f"Unsupported format: {data}")
                return False

            # Try to open video.
            cap = self.cv2.VideoCapture(str(video_path))
            if not cap.isOpened():
                self.logger.warning(f"Cannot open video: {data}")
                return False

            # Read properties, then release the handle before validating.
            fps = cap.get(self.cv2.CAP_PROP_FPS)
            width = int(cap.get(self.cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(self.cv2.CAP_PROP_FRAME_HEIGHT))
            frame_count = int(cap.get(self.cv2.CAP_PROP_FRAME_COUNT))

            cap.release()

            if width < self.min_width or height < self.min_height:
                self.logger.warning(f"Video resolution too small: {width}x{height}")
                return False

            # Guard against fps == 0 (corrupt header) to avoid ZeroDivisionError.
            duration = frame_count / fps if fps > 0 else 0
            if duration < self.min_duration:
                self.logger.warning(f"Video too short: {duration:.2f}s")
                return False

            if self.max_duration and duration > self.max_duration:
                self.logger.warning(f"Video too long: {duration:.2f}s")
                return False

            return True

        except Exception as e:
            self.logger.error(f"Video validation error: {str(e)}")
            return False

    def clean(self, video_path: str) -> Any:
        """
        Open the video for processing (basic validation).

        Args:
            video_path: Path to video file

        Returns:
            OpenCV VideoCapture object, or None on failure. The caller is
            responsible for calling ``release()`` on the returned capture.
        """
        try:
            cap = self.cv2.VideoCapture(str(video_path))
            if not cap.isOpened():
                raise Exception(f"Cannot open video: {video_path}")

            return cap

        except Exception as e:
            self.logger.error(f"Video cleaning error: {str(e)}")
            return None

    def normalize(self, video_cap: Any) -> Any:
        """
        Normalize video properties.

        Args:
            video_cap: OpenCV VideoCapture object

        Returns:
            The same VideoCapture — OpenCV cannot change FPS/resolution on
            an open capture, so actual normalization happens per-frame
            during extraction (see _resize_frame).
        """
        return video_cap

    def extract_frames(
        self,
        video_path: str,
        output_dir: str,
        sample_rate: int = 1,
    ) -> List[str]:
        """
        Extract frames from video as JPEG files.

        Args:
            video_path: Path to video file
            output_dir: Directory to save frames
            sample_rate: Extract every Nth frame

        Returns:
            List of extracted frame paths (empty on failure)
        """
        if not self.opencv_available:
            self.logger.error("OpenCV is required")
            return []

        try:
            Path(output_dir).mkdir(parents=True, exist_ok=True)

            cap = self.cv2.VideoCapture(str(video_path))
            frame_count = 0
            extracted_count = 0
            extracted_frames = []

            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                if frame_count % sample_rate == 0:
                    # Resize (and pad, if aspect-preserving) before saving.
                    resized = self._resize_frame(frame)

                    # Zero-padded index keeps frames lexically ordered.
                    frame_path = Path(output_dir) / f"frame_{extracted_count:06d}.jpg"
                    self.cv2.imwrite(str(frame_path), resized)
                    extracted_frames.append(str(frame_path))
                    extracted_count += 1

                frame_count += 1

            cap.release()
            self.logger.info(f"Extracted {extracted_count} frames from video")

            return extracted_frames

        except Exception as e:
            self.logger.error(f"Frame extraction error: {str(e)}")
            return []

    def _resize_frame(self, frame: Any) -> Any:
        """
        Resize a single frame to the target resolution.

        When preserve_aspect_ratio is set, the frame is scaled to fit and
        then letterboxed with black borders to exactly target_resolution.

        Args:
            frame: OpenCV frame (H x W x C ndarray)

        Returns:
            Resized frame of size target_resolution
        """
        if self.preserve_aspect_ratio:
            h, w = frame.shape[:2]
            # Scale by the limiting dimension so the frame fits inside target.
            scale = min(
                self.target_resolution[0] / w,
                self.target_resolution[1] / h
            )
            new_w = int(w * scale)
            new_h = int(h * scale)
            resized = self.cv2.resize(frame, (new_w, new_h), interpolation=self.cv2.INTER_LANCZOS4)

            # Pad symmetrically to the exact target size (letterbox/pillarbox).
            top = (self.target_resolution[1] - new_h) // 2
            bottom = self.target_resolution[1] - new_h - top
            left = (self.target_resolution[0] - new_w) // 2
            right = self.target_resolution[0] - new_w - left

            padded = self.cv2.copyMakeBorder(
                resized, top, bottom, left, right,
                self.cv2.BORDER_CONSTANT, value=[0, 0, 0]
            )
            return padded
        else:
            return self.cv2.resize(frame, self.target_resolution, interpolation=self.cv2.INTER_LANCZOS4)

    def get_video_stats(self, video_path: str) -> Dict[str, Any]:
        """
        Get statistics about a video file.

        Args:
            video_path: Path to video file

        Returns:
            Dictionary with video statistics (empty dict on failure)
        """
        try:
            if not self.opencv_available:
                return {}

            cap = self.cv2.VideoCapture(str(video_path))

            fps = cap.get(self.cv2.CAP_PROP_FPS)
            width = int(cap.get(self.cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(self.cv2.CAP_PROP_FRAME_HEIGHT))
            frame_count = int(cap.get(self.cv2.CAP_PROP_FRAME_COUNT))

            cap.release()

            duration = frame_count / fps if fps > 0 else 0

            stats = {
                'file': str(video_path),
                'width': width,
                'height': height,
                'fps': float(fps),
                'frame_count': frame_count,
                'duration_seconds': float(duration),
                'resolution': f"{width}x{height}",
                'file_size_bytes': Path(video_path).stat().st_size,
                'aspect_ratio': width / height if height > 0 else 0,
            }

            return stats

        except Exception as e:
            self.logger.error(f"Error getting video stats: {str(e)}")
            return {}

    def batch_process_directory(
        self,
        input_dir: str,
        output_dir: Optional[str] = None,
        extract_frames: bool = False,
        frame_sample_rate: int = 30,  # Extract every 30th frame
    ) -> Dict[str, Any]:
        """
        Process all videos in a directory.

        Args:
            input_dir: Input directory path
            output_dir: Output directory path (uses config if None)
            extract_frames: Whether to extract frames
            frame_sample_rate: Sample rate for frame extraction

        Returns:
            Processing results: {'results': [...], 'stats': {...}}
        """
        if not self.opencv_available:
            self.logger.error("OpenCV is required")
            return {'error': 'OpenCV not available'}

        output_dir = output_dir or self.config.output_path
        Path(output_dir).mkdir(parents=True, exist_ok=True)

        input_path = Path(input_dir)
        results = []

        for video_file in input_path.iterdir():
            # Filter by extension (without the leading dot).
            if video_file.suffix.lower()[1:] not in self.supported_formats:
                continue

            try:
                if not self.validate(str(video_file)):
                    self.stats['skipped'] += 1
                    continue

                # Open the video; clean() logs and returns None on failure.
                cap = self.clean(str(video_file))
                if cap is None:
                    raise Exception("Failed to open video")

                self.normalize(cap)  # no-op today; kept for pipeline symmetry
                cap.release()

                # Extract frames if requested. This call previously crashed
                # because the bool __init__ attribute shadowed this method.
                frame_list = []
                if extract_frames:
                    frames_dir = Path(output_dir) / video_file.stem / "frames"
                    frame_list = self.extract_frames(
                        str(video_file),
                        str(frames_dir),
                        sample_rate=frame_sample_rate
                    )

                self.stats['output_count'] += 1

                results.append({
                    'file': str(video_file),
                    'status': 'success',
                    'frames_extracted': len(frame_list),
                    'stats': self.get_video_stats(str(video_file))
                })

            except Exception as e:
                self.logger.error(f"Error processing {video_file}: {str(e)}")
                self.stats['errors'] += 1
                results.append({
                    'file': str(video_file),
                    'status': 'error',
                    'error': str(e)
                })

        self.logger.info(f"Processed {self.stats['output_count']} videos")
        return {'results': results, 'stats': self.get_statistics()}
@@ -3,6 +3,22 @@ from .startproject import startproject
3
3
 
4
4
def main():
    """Parse CLI arguments and dispatch the ``startproject`` command.

    Expects ``gptmed startproject <projectname>`` with an optional
    ``--qna`` or ``--conversational`` flag selecting the project type;
    exits with status 1 on any malformed invocation.
    """
    args = sys.argv
    usage = "Usage: gptmed startproject <projectname> [--qna|--conversational]"

    # Guard clause: require the literal "startproject" subcommand plus a name.
    if len(args) < 3 or args[1] != "startproject":
        print(usage)
        sys.exit(1)

    name = args[2]

    # Map each recognized flag to its project type; None means "no flag given".
    flag_to_type = {"--qna": "qna", "--conversational": "conversational"}
    kind = None

    if len(args) > 3:
        flag = args[3]
        if flag in flag_to_type:
            kind = flag_to_type[flag]
        else:
            print(f"Invalid flag: {flag}")
            print(usage)
            sys.exit(1)

    startproject(name, kind)