gptmed 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gptmed/data_preparation/text/__init__.py +190 -100
- gptmed/data_preparation/text/base_strategy.py +28 -0
- gptmed/data_preparation/text/batch_pdf_to_jsonl.py +274 -0
- gptmed/data_preparation/text/case_normalizer.py +66 -0
- gptmed/data_preparation/text/pdf_processor.py +302 -0
- gptmed/data_preparation/text/pipeline.py +333 -0
- gptmed/data_preparation/text/preprocess_jsonl.py +466 -0
- gptmed/data_preparation/text/punctuation_handler.py +65 -0
- gptmed/data_preparation/text/stopword_remover.py +87 -0
- gptmed/data_preparation/text/text_cleaner.py +74 -0
- gptmed/data_preparation/text/text_statistics.py +137 -0
- gptmed/data_preparation/text/tokenize_jsonl.py +376 -0
- gptmed/data_preparation/text/tokenizer.py +74 -0
- gptmed/data_preparation/text/unicode_normalizer.py +61 -0
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/METADATA +2 -1
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/RECORD +20 -7
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/WHEEL +0 -0
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/entry_points.txt +0 -0
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/licenses/LICENSE +0 -0
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/top_level.txt +0 -0
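
Version 0.7.0 replaces the monolithic `TextPreprocessor` internals with composable strategy classes (see the `__init__.py` diff below). As a quick orientation, here is a minimal sketch of the new composition style, assuming gptmed 0.7.0 is installed; the sample string and the particular strategy order are illustrative assumptions, not taken from the package.

```python
# Sketch of the 0.7.0 strategy-based API; class names come from the diff below,
# the input text and chosen strategies are illustrative assumptions.
from gptmed.data_preparation.text import (
    TextPreprocessor,
    TextCleaner,
    UnicodeNormalizer,
    CaseNormalizer,
    StopwordRemover,
)

# Explicit composition: pass an ordered list of strategies.
pre = TextPreprocessor(
    strategies=[TextCleaner(), UnicodeNormalizer(), CaseNormalizer(mode='lower'), StopwordRemover()]
)
print(pre.process("Visit https://example.com for MORE details!"))

# Flag-driven composition: the pre-0.7.0 keyword arguments still build a default pipeline.
pre_default = TextPreprocessor(remove_stopwords=True, remove_punctuation=False)
```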
gptmed/data_preparation/text/__init__.py
@@ -1,40 +1,77 @@
 """
-Text data preprocessing and cleaning module
+Text data preprocessing and cleaning module - Modular Architecture

-
+This module provides strategy-based text preprocessing with support for:
+- Text cleaning (whitespace, special characters, HTML, URLs, emails)
+- Unicode normalization with multiple forms
+- Case conversion (lowercase, uppercase, title, sentence)
+- Punctuation handling (removal, spacing normalization)
+- Stopword removal with customizable lists
+- Tokenization (word and sentence level)
+- Text statistics extraction
+- PDF text extraction and CSV export
+- Batch file processing
+
+Architecture:
+    Each preprocessing feature is implemented as a separate strategy class following
+    the Strategy Design Pattern and SOLID principles. This allows:
+    - Independent composition of preprocessing steps
+    - Easy addition of new strategies without modifying existing code
+    - Single Responsibility: Each class has one reason to change
+    - Dependency Inversion: Code depends on TextPreprocessingStrategy interface
+
+Usage:
+    from gptmed.data_preparation.text import TextCleaner, CaseNormalizer, StopwordRemover
+
+    # Use individual strategies
+    cleaner = TextCleaner()
+    normalizer = CaseNormalizer(mode='lower')
+    stopwords = StopwordRemover()
+
+    # Compose into pipeline
+    text = "Your text here..."
+    text = cleaner.process(text)
+    text = normalizer.process(text)
+    text = stopwords.process(text)
+
+    # Get statistics
+    stats = cleaner.get_stats()
 """

-import re
-import string
-import unicodedata
 import logging
 from typing import Any, Dict, List, Optional
 from pathlib import Path
-import json

 from ..base import BaseDataPreprocessor, PreprocessingConfig

+# Import all strategy classes
+from .base_strategy import TextPreprocessingStrategy
+from .text_cleaner import TextCleaner
+from .unicode_normalizer import UnicodeNormalizer
+from .case_normalizer import CaseNormalizer
+from .punctuation_handler import PunctuationHandler
+from .stopword_remover import StopwordRemover
+from .tokenizer import Tokenizer
+from .text_statistics import TextStatistics
+from .pdf_processor import PDFProcessor
+

 logger = logging.getLogger(__name__)


 class TextPreprocessor(BaseDataPreprocessor):
     """
-
+    Orchestrator for composing text preprocessing strategies

-
-
-
-    - Unicode normalization
-    - Stopword removal
-    - Punctuation handling
-    - Language detection
-    - Sentiment preservation
+    This class provides a unified interface for text preprocessing by composing
+    individual strategy classes. It maintains backward compatibility with the
+    previous monolithic implementation while enabling flexible strategy composition.
     """

     def __init__(
         self,
         config: Optional[PreprocessingConfig] = None,
+        strategies: Optional[List[TextPreprocessingStrategy]] = None,
         remove_stopwords: bool = False,
         remove_punctuation: bool = False,
         lowercase: bool = True,
@@ -46,11 +83,12 @@ class TextPreprocessor(BaseDataPreprocessor):

         Args:
             config: PreprocessingConfig instance
-
-
-
-
-
+            strategies: List of strategy instances to use in order
+            remove_stopwords: Whether to remove common stopwords (default False)
+            remove_punctuation: Whether to remove punctuation (default False)
+            lowercase: Whether to convert to lowercase (default True)
+            min_length: Minimum text length to keep (default 3)
+            max_length: Maximum text length, None for unlimited (default None)
         """
         if config is None:
             config = PreprocessingConfig(
@@ -61,26 +99,48 @@ class TextPreprocessor(BaseDataPreprocessor):

         super().__init__(config)

-        self.remove_stopwords = remove_stopwords
-        self.remove_punctuation = remove_punctuation
-        self.lowercase = lowercase
         self.min_length = min_length
         self.max_length = max_length

-        #
-
+        # Initialize strategies
+        if strategies is None:
+            strategies = self._create_default_pipeline(
+                remove_stopwords=remove_stopwords,
+                remove_punctuation=remove_punctuation,
+                lowercase=lowercase,
+            )
+
+        self.strategies = strategies
+
+        # Initialize individual strategies for backward compatibility
+        self.text_cleaner = TextCleaner()
+        self.unicode_normalizer = UnicodeNormalizer()
+        self.tokenizer = Tokenizer()
+        self.text_stats = TextStatistics()
+        self.pdf_processor = PDFProcessor()

-    def
-
-
-
-
-
-
-
-
-
-
+    def _create_default_pipeline(
+        self,
+        remove_stopwords: bool = False,
+        remove_punctuation: bool = False,
+        lowercase: bool = True,
+    ) -> List[TextPreprocessingStrategy]:
+        """Create default preprocessing pipeline"""
+        pipeline = [
+            TextCleaner(),
+            UnicodeNormalizer(),
+        ]
+
+        if lowercase:
+            pipeline.append(CaseNormalizer(mode='lower'))
+
+        if remove_punctuation:
+            pipeline.append(PunctuationHandler(remove=True))
+
+        if remove_stopwords:
+            pipeline.append(StopwordRemover())
+
+        return pipeline

     def validate(self, data: Any) -> bool:
         """
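
For reference, the `_create_default_pipeline` hunk above means the flag-driven constructor assembles the following ordered pipeline. This is a sketch of the equivalence only, not additional library code:

```python
# Equivalent explicit pipeline for
# TextPreprocessor(remove_stopwords=True, remove_punctuation=True, lowercase=True),
# per _create_default_pipeline above.
pipeline = [
    TextCleaner(),                    # always first
    UnicodeNormalizer(),              # always second
    CaseNormalizer(mode='lower'),     # only when lowercase=True
    PunctuationHandler(remove=True),  # only when remove_punctuation=True
    StopwordRemover(),                # only when remove_stopwords=True
]
```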
@@ -106,73 +166,72 @@ class TextPreprocessor(BaseDataPreprocessor):

         return True

-    def clean(self,
+    def clean(self, data: Any) -> Optional[str]:
         """
-        Clean text
+        Clean text data (remove HTML, URLs, emails, etc.)

         Args:
-
+            data: Input text

         Returns:
             Cleaned text
         """
-
-
+        if not self.validate(data):
+            return None

-
-
-
-
-
-
-
-
-        text
-        text = re.sub(r'www\.\S+', '', text)
-
-        # Remove email addresses
-        text = re.sub(r'\S+@\S+', '', text)
+        text = data
+        # Apply TextCleaner from strategies
+        cleaner = TextCleaner()
+        text = cleaner.process(text)
+        return text
+
+    def normalize(self, data: Any) -> Optional[str]:
+        """
+        Normalize text (unicode, case, punctuation, stopwords)

-
-
+        Args:
+            data: Input text (may be already cleaned)
+
+        Returns:
+            Normalized text
+        """
+        if not isinstance(data, str):
+            return None

-
-
+        text = data
+        # Apply remaining strategies except TextCleaner
+        for strategy in self.strategies:
+            if not isinstance(strategy, TextCleaner):
+                text = strategy.process(text)
+            if not text:
+                return None

-        return text.strip()
+        return text.strip() if text else None

-    def
+    def process(self, data: Any) -> Optional[str]:
         """
-
+        Process text through the pipeline of strategies

         Args:
-
+            data: Input text

         Returns:
-
+            Processed text or None if validation fails
         """
-
-
-        text = text.lower()
-
-        # Optional punctuation removal
-        if self.remove_punctuation:
-            text = text.translate(str.maketrans('', '', string.punctuation))
-        else:
-            # Just normalize spacing around punctuation
-            text = re.sub(r'\s+([.!?,;:])', r'\1', text)
+        if not self.validate(data):
+            return None

-
-
-
-
-
+        text = data
+        for strategy in self.strategies:
+            text = strategy.process(text)
+            if not text:
+                return None

-        return text.strip()
+        return text.strip() if text else None

     def tokenize(self, text: str) -> List[str]:
         """
-
+        Tokenize text into words

         Args:
             text: Text to tokenize
@@ -180,16 +239,15 @@ class TextPreprocessor(BaseDataPreprocessor):
         Returns:
             List of tokens
         """
-        # Process text first
         processed = self.process(text)
         if processed is None:
             return []

-        return
+        return self.tokenizer.tokenize(processed)

     def get_text_stats(self, text: str) -> Dict[str, Any]:
         """
-        Get statistics about
+        Get statistics about text (before processing)

         Args:
             text: Input text
@@ -197,23 +255,7 @@ class TextPreprocessor(BaseDataPreprocessor):
         Returns:
             Dictionary with text statistics
         """
-
-        if processed is None:
-            return {}
-
-        words = processed.split()
-        sentences = re.split(r'[.!?]+', processed)
-        sentences = [s.strip() for s in sentences if s.strip()]
-
-        return {
-            'original_length': len(text),
-            'cleaned_length': len(processed),
-            'word_count': len(words),
-            'sentence_count': len(sentences),
-            'avg_word_length': sum(len(w) for w in words) / len(words) if words else 0,
-            'unique_words': len(set(words)),
-            'vocabulary_diversity': len(set(words)) / len(words) if words else 0,
-        }
+        return self.text_stats.get_text_stats(text)

     def batch_process_files(
         self,
@@ -266,3 +308,51 @@ class TextPreprocessor(BaseDataPreprocessor):

         self.logger.info(f"Processed {len(results)} files")
         return {'results': results, 'stats': self.get_statistics()}
+
+    # PDF-related methods delegated to PDFProcessor
+    def extract_text_from_pdf(self, pdf_path: str) -> Optional[str]:
+        """Extract text from a single PDF file"""
+        try:
+            return self.pdf_processor.extract_text_from_pdf(pdf_path)
+        except Exception as e:
+            self.logger.error(f"Error extracting PDF: {str(e)}")
+            return None
+
+    def batch_process_pdfs(self, input_dir: str, output_dir: Optional[str] = None) -> Dict[str, Any]:
+        """Process multiple PDF files from a directory"""
+        output_dir = output_dir or self.config.output_path
+        return self.pdf_processor.batch_process_pdfs(input_dir, output_dir)
+
+    def export_to_csv(
+        self,
+        pdf_dir: str,
+        output_csv: str,
+        process_text: bool = True,
+    ) -> Dict[str, Any]:
+        """Export PDF text to CSV with filename and text content columns"""
+        return self.pdf_processor.export_to_csv(pdf_dir, output_csv)
+
+    def export_to_csv_detailed(
+        self,
+        pdf_dir: str,
+        output_csv: str,
+        process_text: bool = True,
+        include_stats: bool = True,
+    ) -> Dict[str, Any]:
+        """Export PDF text to CSV with additional statistics"""
+        return self.pdf_processor.export_to_csv_detailed(pdf_dir, output_csv)
+
+
+# Export all strategy classes for direct import
+__all__ = [
+    'TextPreprocessor',
+    'TextPreprocessingStrategy',
+    'TextCleaner',
+    'UnicodeNormalizer',
+    'CaseNormalizer',
+    'PunctuationHandler',
+    'StopwordRemover',
+    'Tokenizer',
+    'TextStatistics',
+    'PDFProcessor',
+]
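
The hunk above also adds thin PDF wrappers on `TextPreprocessor` that delegate to `PDFProcessor`. A hedged usage sketch follows; the file and directory paths are placeholders, and note that in this hunk the `process_text` and `include_stats` arguments are accepted but not forwarded to the delegate.

```python
# Hypothetical calls against the delegating wrappers added above;
# "paper.pdf", "./pdfs" and "corpus.csv" are placeholders.
pre = TextPreprocessor()
text = pre.extract_text_from_pdf("paper.pdf")        # returns None and logs on failure
result = pre.export_to_csv("./pdfs", "corpus.csv")   # filename + text content columns per PDF
```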

gptmed/data_preparation/text/base_strategy.py (new file)
@@ -0,0 +1,28 @@
+"""
+Base strategy interface for text preprocessing
+"""
+
+from abc import ABC, abstractmethod
+from typing import Any, Dict
+
+
+class TextPreprocessingStrategy(ABC):
+    """Abstract base class for all text preprocessing strategies"""
+
+    @abstractmethod
+    def process(self, text: str) -> str:
+        """
+        Process the text according to the strategy
+
+        Args:
+            text: Input text to process
+
+        Returns:
+            Processed text
+        """
+        pass
+
+    @abstractmethod
+    def get_stats(self) -> Dict[str, Any]:
+        """Get statistics about the processing"""
+        pass
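
`base_strategy.py` above defines the extension point for the new architecture. A minimal sketch of a custom strategy plugged into the orchestrator, assuming only the interface shown in this diff; the whitespace-collapsing behaviour and the class name are invented for illustration:

```python
import re
from typing import Any, Dict

from gptmed.data_preparation.text import (
    TextPreprocessor,
    TextPreprocessingStrategy,
    TextCleaner,
)

# Hypothetical custom strategy implementing the TextPreprocessingStrategy ABC above.
class WhitespaceCollapser(TextPreprocessingStrategy):
    def __init__(self) -> None:
        self._processed = 0

    def process(self, text: str) -> str:
        # Collapse runs of whitespace into single spaces.
        self._processed += 1
        return re.sub(r'\s+', ' ', text).strip()

    def get_stats(self) -> Dict[str, Any]:
        return {'texts_processed': self._processed}

# Compose it with a stock strategy via the strategies parameter added in 0.7.0.
pre = TextPreprocessor(strategies=[TextCleaner(), WhitespaceCollapser()])
```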

gptmed/data_preparation/text/batch_pdf_to_jsonl.py (new file)
@@ -0,0 +1,274 @@
+"""
+Batch PDF Processing (Intermediate Step)
+
+Parallelly processes all PDFs from a directory and extracts text.
+Note: Does not save files - text is processed in-memory for preprocessing pipeline.
+
+Usage:
+    python3 batch_pdf_to_jsonl.py [--input-dir ./pdfs] [--output-dir ./output] [--workers 4]
+"""
+
+import sys
+import json
+import logging
+import time
+from pathlib import Path
+from typing import Dict, List, Optional, Any
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass, asdict
+
+# Adjust path if running from workspace
+sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
+
+from gptmed.data_preparation.text import PDFProcessor
+
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class PDFRecord:
+    """Data class for a single PDF record"""
+    filename: str
+    text: str
+    word_count: int
+    char_count: int
+    sentence_count: int
+    extraction_time: float
+    status: str
+
+
+class PDFBatchProcessor:
+    """Process multiple PDFs in parallel and export to JSONL"""
+
+    def __init__(
+        self,
+        input_dir: str = "./pdfs",
+        output_dir: str = "./output",
+        max_workers: int = 4,
+    ):
+        """
+        Initialize batch processor
+
+        Args:
+            input_dir: Directory containing PDF files
+            output_dir: Directory for output JSONL files
+            max_workers: Number of parallel workers
+        """
+        self.input_dir = Path(input_dir)
+        self.output_dir = Path(output_dir)
+        self.max_workers = max_workers
+
+        # Create directories if they don't exist
+        self.input_dir.mkdir(parents=True, exist_ok=True)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+
+        # Initialize PDF processor
+        self.pdf_processor = PDFProcessor(max_workers=max_workers, use_threading=True)
+
+        self.logger = logging.getLogger(self.__class__.__name__)
+
+    def _extract_pdf(self, pdf_file: Path) -> Optional[PDFRecord]:
+        """
+        Extract text from a single PDF file
+
+        Args:
+            pdf_file: Path to PDF file
+
+        Returns:
+            PDFRecord with extracted information or None if failed
+        """
+        start_time = time.time()
+
+        try:
+            text = self.pdf_processor.extract_text_from_pdf(str(pdf_file))
+
+            # Calculate statistics
+            word_count = len(text.split())
+            char_count = len(text)
+            sentence_count = len([s for s in text.split('.') if s.strip()])
+            extraction_time = time.time() - start_time
+
+            record = PDFRecord(
+                filename=pdf_file.name,
+                text=text,
+                word_count=word_count,
+                char_count=char_count,
+                sentence_count=sentence_count,
+                extraction_time=extraction_time,
+                status="success"
+            )
+
+            self.logger.info(
+                f"✓ Extracted: {pdf_file.name} "
+                f"({word_count} words, {char_count} chars) in {extraction_time:.2f}s"
+            )
+
+            return record
+
+        except Exception as e:
+            self.logger.error(f"✗ Failed to extract {pdf_file.name}: {str(e)}")
+            return PDFRecord(
+                filename=pdf_file.name,
+                text="",
+                word_count=0,
+                char_count=0,
+                sentence_count=0,
+                extraction_time=time.time() - start_time,
+                status=f"error: {str(e)}"
+            )
+
+    def _process_pdfs_parallel(self) -> List[PDFRecord]:
+        """
+        Process all PDFs in parallel
+
+        Returns:
+            List of PDFRecord objects
+        """
+        # Find all PDF files
+        pdf_files = sorted(self.input_dir.glob('*.pdf'))
+
+        if not pdf_files:
+            self.logger.warning(f"No PDF files found in {self.input_dir}")
+            return []
+
+        self.logger.info(
+            f"Processing {len(pdf_files)} PDF files with {self.max_workers} workers..."
+        )
+
+        records = []
+
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            # Submit all PDF extraction tasks
+            future_to_file = {
+                executor.submit(self._extract_pdf, pdf_file): pdf_file
+                for pdf_file in pdf_files
+            }
+
+            # Process completed tasks as they finish
+            for i, future in enumerate(as_completed(future_to_file), 1):
+                try:
+                    record = future.result()
+                    if record:
+                        records.append(record)
+                except Exception as e:
+                    self.logger.error(f"Error processing future: {str(e)}")
+
+        return records
+
+
+
+    def process(self) -> Dict[str, Any]:
+        """
+        Main processing pipeline
+
+        Returns:
+            Dictionary with processing results and statistics
+        """
+        start_time = time.time()
+
+        self.logger.info(f"Input directory: {self.input_dir}")
+        self.logger.info(f"Output directory: {self.output_dir}")
+
+        # Step 1: Process all PDFs in parallel
+        records = self._process_pdfs_parallel()
+
+        if not records:
+            self.logger.warning("No records to process")
+            return {
+                'status': 'failure',
+                'message': 'No PDFs were successfully processed',
+                'total_pdfs': 0,
+                'successful_extractions': 0,
+                'failed_extractions': 0,
+                'individual_jsonl_files': 0,
+                'combined_jsonl_created': False,
+                'total_time': 0,
+            }
+
+        # Count successes and failures
+        successful = [r for r in records if r.status == "success"]
+        failed = [r for r in records if r.status != "success"]
+
+        self.logger.info(f"\nExtraction Results:")
+        self.logger.info(f"  ✓ Successful: {len(successful)}/{len(records)}")
+        self.logger.info(f"  ✗ Failed: {len(failed)}/{len(records)}")
+
+        total_time = time.time() - start_time
+
+        # Calculate statistics
+        total_words = sum(r.word_count for r in successful)
+        total_chars = sum(r.char_count for r in successful)
+
+        # Print summary
+        self.logger.info(f"\n" + "="*60)
+        self.logger.info(f"Processing Summary")
+        self.logger.info(f"="*60)
+        self.logger.info(f"Total PDFs: {len(records)}")
+        self.logger.info(f"Successfully processed: {len(successful)}")
+        self.logger.info(f"Failed: {len(failed)}")
+        self.logger.info(f"Total words extracted: {total_words:,}")
+        self.logger.info(f"Total characters: {total_chars:,}")
+        self.logger.info(f"Note: Records processed in-memory for preprocessing")
+        self.logger.info(f"Total processing time: {total_time:.2f}s")
+        self.logger.info(f"="*60)
+
+        return {
+            'status': 'success',
+            'records': records,
+            'total_pdfs': len(records),
+            'successful_extractions': len(successful),
+            'failed_extractions': len(failed),
+            'total_words': total_words,
+            'total_characters': total_chars,
+            'total_time': total_time,
+        }
+
+
+def main():
+    """Main entry point"""
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description='Batch process PDFs and export to JSONL format'
+    )
+    parser.add_argument(
+        '--input-dir',
+        default='./pdfs',
+        help='Input directory containing PDFs (default: ./pdfs)'
+    )
+    parser.add_argument(
+        '--output-dir',
+        default='./output',
+        help='Output directory for JSONL files (default: ./output)'
+    )
+    parser.add_argument(
+        '--workers',
+        type=int,
+        default=4,
+        help='Number of parallel workers (default: 4)'
+    )
+
+    args = parser.parse_args()
+
+    # Create and run processor
+    processor = PDFBatchProcessor(
+        input_dir=args.input_dir,
+        output_dir=args.output_dir,
+        max_workers=args.workers,
+    )
+
+    result = processor.process()
+
+    # Return exit code
+    return 0 if result['status'] == 'success' else 1
+
+
+if __name__ == '__main__':
+    exit_code = main()
+    sys.exit(exit_code)
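
Besides the CLI entry point shown at the end of `batch_pdf_to_jsonl.py`, the class can also be driven from Python. A sketch, assuming the module path shown above is importable and using placeholder directories:

```python
# Hypothetical programmatic use of the new batch processor; directories are placeholders.
from gptmed.data_preparation.text.batch_pdf_to_jsonl import PDFBatchProcessor

processor = PDFBatchProcessor(input_dir="./pdfs", output_dir="./output", max_workers=4)
result = processor.process()
print(result["successful_extractions"], "ok /", result["failed_extractions"], "failed")
```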