gptmed 0.5.5__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,358 @@
+ """
+ Text data preprocessing and cleaning module - Modular Architecture
+
+ This module provides strategy-based text preprocessing with support for:
+ - Text cleaning (whitespace, special characters, HTML, URLs, emails)
+ - Unicode normalization with multiple forms
+ - Case conversion (lowercase, uppercase, title, sentence)
+ - Punctuation handling (removal, spacing normalization)
+ - Stopword removal with customizable lists
+ - Tokenization (word and sentence level)
+ - Text statistics extraction
+ - PDF text extraction and CSV export
+ - Batch file processing
+
+ Architecture:
+     Each preprocessing feature is implemented as a separate strategy class following
+     the Strategy Design Pattern and SOLID principles. This allows:
+     - Independent composition of preprocessing steps
+     - Easy addition of new strategies without modifying existing code
+     - Single Responsibility: each class has one reason to change
+     - Dependency Inversion: code depends on the TextPreprocessingStrategy interface
+
+ Usage:
+     from gptmed.data_preparation.text import TextCleaner, CaseNormalizer, StopwordRemover
+
+     # Use individual strategies
+     cleaner = TextCleaner()
+     normalizer = CaseNormalizer(mode='lower')
+     stopwords = StopwordRemover()
+
+     # Compose into a pipeline
+     text = "Your text here..."
+     text = cleaner.process(text)
+     text = normalizer.process(text)
+     text = stopwords.process(text)
+
+     # Get statistics
+     stats = cleaner.get_stats()
+ """
+
+ import logging
+ from typing import Any, Dict, List, Optional
+ from pathlib import Path
+
+ from ..base import BaseDataPreprocessor, PreprocessingConfig
+
+ # Import all strategy classes
+ from .base_strategy import TextPreprocessingStrategy
+ from .text_cleaner import TextCleaner
+ from .unicode_normalizer import UnicodeNormalizer
+ from .case_normalizer import CaseNormalizer
+ from .punctuation_handler import PunctuationHandler
+ from .stopword_remover import StopwordRemover
+ from .tokenizer import Tokenizer
+ from .text_statistics import TextStatistics
+ from .pdf_processor import PDFProcessor
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class TextPreprocessor(BaseDataPreprocessor):
+     """
+     Orchestrator for composing text preprocessing strategies
+
+     This class provides a unified interface for text preprocessing by composing
+     individual strategy classes. It maintains backward compatibility with the
+     previous monolithic implementation while enabling flexible strategy composition.
+     """
+
+     def __init__(
+         self,
+         config: Optional[PreprocessingConfig] = None,
+         strategies: Optional[List[TextPreprocessingStrategy]] = None,
+         remove_stopwords: bool = False,
+         remove_punctuation: bool = False,
+         lowercase: bool = True,
+         min_length: int = 3,
+         max_length: Optional[int] = None,
+     ):
+         """
+         Initialize the text preprocessor
+
+         Args:
+             config: PreprocessingConfig instance
+             strategies: List of strategy instances, applied in order
+             remove_stopwords: Whether to remove common stopwords (default False)
+             remove_punctuation: Whether to remove punctuation (default False)
+             lowercase: Whether to convert to lowercase (default True)
+             min_length: Minimum text length to keep (default 3)
+             max_length: Maximum text length, None for unlimited (default None)
+         """
+         if config is None:
+             config = PreprocessingConfig(
+                 input_path="./data/raw",
+                 output_path="./data/processed",
+                 data_type="text"
+             )
+
+         super().__init__(config)
+
+         self.min_length = min_length
+         self.max_length = max_length
+
+         # Build the default pipeline unless explicit strategies were given
+         if strategies is None:
+             strategies = self._create_default_pipeline(
+                 remove_stopwords=remove_stopwords,
+                 remove_punctuation=remove_punctuation,
+                 lowercase=lowercase,
+             )
+
+         self.strategies = strategies
+
+         # Individual strategy instances kept for backward compatibility
+         self.text_cleaner = TextCleaner()
+         self.unicode_normalizer = UnicodeNormalizer()
+         self.tokenizer = Tokenizer()
+         self.text_stats = TextStatistics()
+         self.pdf_processor = PDFProcessor()
+
+     def _create_default_pipeline(
+         self,
+         remove_stopwords: bool = False,
+         remove_punctuation: bool = False,
+         lowercase: bool = True,
+     ) -> List[TextPreprocessingStrategy]:
+         """Create the default preprocessing pipeline"""
+         pipeline = [
+             TextCleaner(),
+             UnicodeNormalizer(),
+         ]
+
+         if lowercase:
+             pipeline.append(CaseNormalizer(mode='lower'))
+
+         if remove_punctuation:
+             pipeline.append(PunctuationHandler(remove=True))
+
+         if remove_stopwords:
+             pipeline.append(StopwordRemover())
+
+         return pipeline
+
+     def validate(self, data: Any) -> bool:
+         """
+         Validate text input
+
+         Args:
+             data: Input text
+
+         Returns:
+             True if valid, False otherwise
+         """
+         if not isinstance(data, str):
+             self.logger.warning(f"Invalid text type: {type(data)}")
+             return False
+
+         if len(data.strip()) < self.min_length:
+             self.logger.debug(f"Text too short: {len(data)}")
+             return False
+
+         if self.max_length and len(data) > self.max_length:
+             self.logger.debug(f"Text too long: {len(data)}")
+             return False
+
+         return True
+
+     def clean(self, data: Any) -> Optional[str]:
+         """
+         Clean text data (remove HTML, URLs, emails, etc.)
+
+         Args:
+             data: Input text
+
+         Returns:
+             Cleaned text, or None if validation fails
+         """
+         if not self.validate(data):
+             return None
+
+         # Reuse the shared TextCleaner instance so its accumulated
+         # stats remain available via get_stats()
+         return self.text_cleaner.process(data)
+
+     def normalize(self, data: Any) -> Optional[str]:
+         """
+         Normalize text (unicode, case, punctuation, stopwords)
+
+         Args:
+             data: Input text (may already be cleaned)
+
+         Returns:
+             Normalized text
+         """
+         if not isinstance(data, str):
+             return None
+
+         text = data
+         # Apply the remaining strategies, skipping TextCleaner
+         for strategy in self.strategies:
+             if not isinstance(strategy, TextCleaner):
+                 text = strategy.process(text)
+                 if not text:
+                     return None
+
+         return text.strip() if text else None
+
+     def process(self, data: Any) -> Optional[str]:
+         """
+         Process text through the pipeline of strategies
+
+         Args:
+             data: Input text
+
+         Returns:
+             Processed text, or None if validation fails
+         """
+         if not self.validate(data):
+             return None
+
+         text = data
+         for strategy in self.strategies:
+             text = strategy.process(text)
+             if not text:
+                 return None
+
+         return text.strip() if text else None
+
+     def tokenize(self, text: str) -> List[str]:
+         """
+         Tokenize text into words
+
+         Args:
+             text: Text to tokenize
+
+         Returns:
+             List of tokens
+         """
+         processed = self.process(text)
+         if processed is None:
+             return []
+
+         return self.tokenizer.tokenize(processed)
+
+     def get_text_stats(self, text: str) -> Dict[str, Any]:
+         """
+         Get statistics about text (before processing)
+
+         Args:
+             text: Input text
+
+         Returns:
+             Dictionary with text statistics
+         """
+         return self.text_stats.get_text_stats(text)
+
+     def batch_process_files(
+         self,
+         input_dir: str,
+         output_dir: Optional[str] = None,
+         pattern: str = "*.txt"
+     ) -> Dict[str, Any]:
+         """
+         Process multiple text files from a directory
+
+         Args:
+             input_dir: Input directory path
+             output_dir: Output directory path (uses config if None)
+             pattern: File pattern to match
+
+         Returns:
+             Processing results
+         """
+         output_dir = output_dir or self.config.output_path
+         Path(output_dir).mkdir(parents=True, exist_ok=True)
+
+         input_path = Path(input_dir)
+         results = []
+
+         for file_path in input_path.glob(pattern):
+             try:
+                 with open(file_path, 'r', encoding='utf-8') as f:
+                     text = f.read()
+
+                 processed = self.process(text)
+
+                 if processed:
+                     output_file = Path(output_dir) / file_path.name
+                     with open(output_file, 'w', encoding='utf-8') as f:
+                         f.write(processed)
+
+                 results.append({
+                     'file': str(file_path),
+                     'status': 'success',
+                     'stats': self.get_text_stats(text)
+                 })
+
+             except Exception as e:
+                 self.logger.error(f"Error processing {file_path}: {str(e)}")
+                 results.append({
+                     'file': str(file_path),
+                     'status': 'error',
+                     'error': str(e)
+                 })
+
+         self.logger.info(f"Processed {len(results)} files")
+         return {'results': results, 'stats': self.get_statistics()}
+
+     # PDF-related methods delegated to PDFProcessor
+     def extract_text_from_pdf(self, pdf_path: str) -> Optional[str]:
+         """Extract text from a single PDF file"""
+         try:
+             return self.pdf_processor.extract_text_from_pdf(pdf_path)
+         except Exception as e:
+             self.logger.error(f"Error extracting PDF: {str(e)}")
+             return None
+
+     def batch_process_pdfs(self, input_dir: str, output_dir: Optional[str] = None) -> Dict[str, Any]:
+         """Process multiple PDF files from a directory"""
+         output_dir = output_dir or self.config.output_path
+         return self.pdf_processor.batch_process_pdfs(input_dir, output_dir)
+
+     def export_to_csv(
+         self,
+         pdf_dir: str,
+         output_csv: str,
+         process_text: bool = True,
+     ) -> Dict[str, Any]:
+         """Export PDF text to CSV with filename and text content columns"""
+         # Note: process_text is accepted for API compatibility but is not
+         # currently forwarded to PDFProcessor
+         return self.pdf_processor.export_to_csv(pdf_dir, output_csv)
+
+     def export_to_csv_detailed(
+         self,
+         pdf_dir: str,
+         output_csv: str,
+         process_text: bool = True,
+         include_stats: bool = True,
+     ) -> Dict[str, Any]:
+         """Export PDF text to CSV with additional statistics"""
+         # Note: process_text and include_stats are likewise not yet forwarded
+         return self.pdf_processor.export_to_csv_detailed(pdf_dir, output_csv)
+
+
+ # Export all strategy classes for direct import
+ __all__ = [
+     'TextPreprocessor',
+     'TextPreprocessingStrategy',
+     'TextCleaner',
+     'UnicodeNormalizer',
+     'CaseNormalizer',
+     'PunctuationHandler',
+     'StopwordRemover',
+     'Tokenizer',
+     'TextStatistics',
+     'PDFProcessor',
+ ]
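
The orchestrator above accepts any ordered list of strategies, so callers can override the default pipeline entirely. A minimal sketch, assuming gptmed 0.7.0 is installed and the constructors behave as shown in this diff:

    from gptmed.data_preparation.text import (
        TextPreprocessor,
        TextCleaner,
        UnicodeNormalizer,
        CaseNormalizer,
        PunctuationHandler,
    )

    # Strategies run in list order; this pipeline swaps the default
    # stopword step for punctuation removal.
    preprocessor = TextPreprocessor(
        strategies=[
            TextCleaner(),
            UnicodeNormalizer(),
            CaseNormalizer(mode='lower'),
            PunctuationHandler(remove=True),
        ]
    )

    text = "Visit https://example.com for MORE info!!!"
    processed = preprocessor.process(text)  # None if validation fails
    tokens = preprocessor.tokenize(text)    # [] when processing fails
    stats = preprocessor.get_text_stats(text)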
@@ -0,0 +1,28 @@
+ """
+ Base strategy interface for text preprocessing
+ """
+
+ from abc import ABC, abstractmethod
+ from typing import Any, Dict
+
+
+ class TextPreprocessingStrategy(ABC):
+     """Abstract base class for all text preprocessing strategies"""
+
+     @abstractmethod
+     def process(self, text: str) -> str:
+         """
+         Process the text according to the strategy
+
+         Args:
+             text: Input text to process
+
+         Returns:
+             Processed text
+         """
+         pass
+
+     @abstractmethod
+     def get_stats(self) -> Dict[str, Any]:
+         """Get statistics about the processing"""
+         pass
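
Since TextPreprocessingStrategy is a plain ABC with two methods, extending the pipeline means subclassing it. A minimal sketch of a custom strategy; WhitespaceCollapser is illustrative, not part of the package:

    import re
    from typing import Any, Dict

    from gptmed.data_preparation.text import TextPreprocessingStrategy

    class WhitespaceCollapser(TextPreprocessingStrategy):
        """Collapse runs of whitespace into single spaces"""

        def __init__(self) -> None:
            self._texts_processed = 0

        def process(self, text: str) -> str:
            self._texts_processed += 1
            return re.sub(r'\s+', ' ', text).strip()

        def get_stats(self) -> Dict[str, Any]:
            return {'texts_processed': self._texts_processed}

An instance can then be passed in the strategies list of TextPreprocessor, which calls process() on each strategy in order and never needs to know the concrete type (the Dependency Inversion point made in the module docstring above).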
@@ -0,0 +1,274 @@
+ """
+ Batch PDF Processing (Intermediate Step)
+
+ Processes all PDFs from a directory in parallel and extracts their text.
+ Note: Does not save files - text is kept in-memory for the preprocessing pipeline.
+
+ Usage:
+     python3 batch_pdf_to_jsonl.py [--input-dir ./pdfs] [--output-dir ./output] [--workers 4]
+ """
+
+ import sys
+ import json
+ import logging
+ import time
+ from pathlib import Path
+ from typing import Dict, List, Optional, Any
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from dataclasses import dataclass, asdict
+
+ # Adjust path if running from the workspace
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
+
+ from gptmed.data_preparation.text import PDFProcessor
+
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class PDFRecord:
+     """Data class for a single PDF record"""
+     filename: str
+     text: str
+     word_count: int
+     char_count: int
+     sentence_count: int
+     extraction_time: float
+     status: str
+
+
+ class PDFBatchProcessor:
+     """Process multiple PDFs in parallel and collect extraction records in memory"""
+
+     def __init__(
+         self,
+         input_dir: str = "./pdfs",
+         output_dir: str = "./output",
+         max_workers: int = 4,
+     ):
+         """
+         Initialize the batch processor
+
+         Args:
+             input_dir: Directory containing PDF files
+             output_dir: Directory for output files
+             max_workers: Number of parallel workers
+         """
+         self.input_dir = Path(input_dir)
+         self.output_dir = Path(output_dir)
+         self.max_workers = max_workers
+
+         # Create directories if they don't exist
+         self.input_dir.mkdir(parents=True, exist_ok=True)
+         self.output_dir.mkdir(parents=True, exist_ok=True)
+
+         # Initialize the PDF processor
+         self.pdf_processor = PDFProcessor(max_workers=max_workers, use_threading=True)
+
+         self.logger = logging.getLogger(self.__class__.__name__)
+
+     def _extract_pdf(self, pdf_file: Path) -> PDFRecord:
+         """
+         Extract text from a single PDF file
+
+         Args:
+             pdf_file: Path to the PDF file
+
+         Returns:
+             PDFRecord with extracted information, or an error record on failure
+         """
+         start_time = time.time()
+
+         try:
+             # Guard against extractors that return None for empty PDFs
+             text = self.pdf_processor.extract_text_from_pdf(str(pdf_file)) or ""
+
+             # Calculate statistics
+             word_count = len(text.split())
+             char_count = len(text)
+             sentence_count = len([s for s in text.split('.') if s.strip()])
+             extraction_time = time.time() - start_time
+
+             record = PDFRecord(
+                 filename=pdf_file.name,
+                 text=text,
+                 word_count=word_count,
+                 char_count=char_count,
+                 sentence_count=sentence_count,
+                 extraction_time=extraction_time,
+                 status="success"
+             )
+
+             self.logger.info(
+                 f"✓ Extracted: {pdf_file.name} "
+                 f"({word_count} words, {char_count} chars) in {extraction_time:.2f}s"
+             )
+
+             return record
+
+         except Exception as e:
+             self.logger.error(f"✗ Failed to extract {pdf_file.name}: {str(e)}")
+             return PDFRecord(
+                 filename=pdf_file.name,
+                 text="",
+                 word_count=0,
+                 char_count=0,
+                 sentence_count=0,
+                 extraction_time=time.time() - start_time,
+                 status=f"error: {str(e)}"
+             )
+
+     def _process_pdfs_parallel(self) -> List[PDFRecord]:
+         """
+         Process all PDFs in parallel
+
+         Returns:
+             List of PDFRecord objects
+         """
+         # Find all PDF files
+         pdf_files = sorted(self.input_dir.glob('*.pdf'))
+
+         if not pdf_files:
+             self.logger.warning(f"No PDF files found in {self.input_dir}")
+             return []
+
+         self.logger.info(
+             f"Processing {len(pdf_files)} PDF files with {self.max_workers} workers..."
+         )
+
+         records = []
+
+         with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+             # Submit all PDF extraction tasks
+             future_to_file = {
+                 executor.submit(self._extract_pdf, pdf_file): pdf_file
+                 for pdf_file in pdf_files
+             }
+
+             # Collect results as tasks finish
+             for future in as_completed(future_to_file):
+                 try:
+                     record = future.result()
+                     if record:
+                         records.append(record)
+                 except Exception as e:
+                     self.logger.error(f"Error processing future: {str(e)}")
+
+         return records
+
+     def process(self) -> Dict[str, Any]:
+         """
+         Main processing pipeline
+
+         Returns:
+             Dictionary with processing results and statistics
+         """
+         start_time = time.time()
+
+         self.logger.info(f"Input directory: {self.input_dir}")
+         self.logger.info(f"Output directory: {self.output_dir}")
+
+         # Step 1: Process all PDFs in parallel
+         records = self._process_pdfs_parallel()
+
+         if not records:
+             self.logger.warning("No records to process")
+             return {
+                 'status': 'failure',
+                 'message': 'No PDFs were successfully processed',
+                 'total_pdfs': 0,
+                 'successful_extractions': 0,
+                 'failed_extractions': 0,
+                 'total_words': 0,
+                 'total_characters': 0,
+                 'total_time': 0,
+             }
+
+         # Count successes and failures
+         successful = [r for r in records if r.status == "success"]
+         failed = [r for r in records if r.status != "success"]
+
+         self.logger.info("\nExtraction Results:")
+         self.logger.info(f"  ✓ Successful: {len(successful)}/{len(records)}")
+         self.logger.info(f"  ✗ Failed: {len(failed)}/{len(records)}")
+
+         total_time = time.time() - start_time
+
+         # Calculate statistics
+         total_words = sum(r.word_count for r in successful)
+         total_chars = sum(r.char_count for r in successful)
+
+         # Print summary
+         self.logger.info("\n" + "=" * 60)
+         self.logger.info("Processing Summary")
+         self.logger.info("=" * 60)
+         self.logger.info(f"Total PDFs: {len(records)}")
+         self.logger.info(f"Successfully processed: {len(successful)}")
+         self.logger.info(f"Failed: {len(failed)}")
+         self.logger.info(f"Total words extracted: {total_words:,}")
+         self.logger.info(f"Total characters: {total_chars:,}")
+         self.logger.info("Note: Records are kept in-memory for preprocessing")
+         self.logger.info(f"Total processing time: {total_time:.2f}s")
+         self.logger.info("=" * 60)
+
+         return {
+             'status': 'success',
+             'records': records,
+             'total_pdfs': len(records),
+             'successful_extractions': len(successful),
+             'failed_extractions': len(failed),
+             'total_words': total_words,
+             'total_characters': total_chars,
+             'total_time': total_time,
+         }
+
+
+
+ def main():
+     """Main entry point"""
+     import argparse
+
+     parser = argparse.ArgumentParser(
+         description='Batch process PDFs and extract text for the preprocessing pipeline'
+     )
+     parser.add_argument(
+         '--input-dir',
+         default='./pdfs',
+         help='Input directory containing PDFs (default: ./pdfs)'
+     )
+     parser.add_argument(
+         '--output-dir',
+         default='./output',
+         help='Output directory (default: ./output)'
+     )
+     parser.add_argument(
+         '--workers',
+         type=int,
+         default=4,
+         help='Number of parallel workers (default: 4)'
+     )
+
+     args = parser.parse_args()
+
+     # Create and run the processor
+     processor = PDFBatchProcessor(
+         input_dir=args.input_dir,
+         output_dir=args.output_dir,
+         max_workers=args.workers,
+     )
+
+     result = processor.process()
+
+     # Exit code: 0 on success, 1 on failure
+     return 0 if result['status'] == 'success' else 1
+
+
+ if __name__ == '__main__':
+     exit_code = main()
+     sys.exit(exit_code)
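
The batch processor can also be driven from Python rather than the CLI. A short sketch, assuming the script's module is importable and the record fields are as defined in PDFRecord above:

    processor = PDFBatchProcessor(input_dir='./pdfs', max_workers=8)
    result = processor.process()

    if result['status'] == 'success':
        # Records stay in memory; feed successful extractions downstream
        for record in result['records']:
            if record.status == 'success':
                print(record.filename, record.word_count)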