gptmed 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gptmed/data_preparation/text/__init__.py +190 -100
- gptmed/data_preparation/text/base_strategy.py +28 -0
- gptmed/data_preparation/text/batch_pdf_to_jsonl.py +274 -0
- gptmed/data_preparation/text/case_normalizer.py +66 -0
- gptmed/data_preparation/text/pdf_processor.py +302 -0
- gptmed/data_preparation/text/pipeline.py +333 -0
- gptmed/data_preparation/text/preprocess_jsonl.py +466 -0
- gptmed/data_preparation/text/punctuation_handler.py +65 -0
- gptmed/data_preparation/text/stopword_remover.py +87 -0
- gptmed/data_preparation/text/text_cleaner.py +74 -0
- gptmed/data_preparation/text/text_statistics.py +137 -0
- gptmed/data_preparation/text/tokenize_jsonl.py +376 -0
- gptmed/data_preparation/text/tokenizer.py +74 -0
- gptmed/data_preparation/text/unicode_normalizer.py +61 -0
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/METADATA +2 -1
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/RECORD +20 -7
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/WHEEL +0 -0
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/entry_points.txt +0 -0
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/licenses/LICENSE +0 -0
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/top_level.txt +0 -0
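
Version 0.7.0 replaces the monolithic `TextPreprocessor` internals with composable strategy classes (see the `__init__.py` diff below). As a quick orientation, here is a minimal sketch of the new composition style, assuming gptmed 0.7.0 is installed; the sample string and the particular strategy order are illustrative assumptions, not taken from the package.

```python
# Sketch of the 0.7.0 strategy-based API; class names come from the diff below,
# the input text and chosen strategies are illustrative assumptions.
from gptmed.data_preparation.text import (
    TextPreprocessor,
    TextCleaner,
    UnicodeNormalizer,
    CaseNormalizer,
    StopwordRemover,
)

# Explicit composition: pass an ordered list of strategies.
pre = TextPreprocessor(
    strategies=[TextCleaner(), UnicodeNormalizer(), CaseNormalizer(mode='lower'), StopwordRemover()]
)
print(pre.process("Visit https://example.com for MORE details!"))

# Flag-driven composition: the pre-0.7.0 keyword arguments still build a default pipeline.
pre_default = TextPreprocessor(remove_stopwords=True, remove_punctuation=False)
```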
gptmed/data_preparation/text/__init__.py
@@ -1,40 +1,77 @@
 """
-Text data preprocessing and cleaning module
+Text data preprocessing and cleaning module - Modular Architecture

-
+This module provides strategy-based text preprocessing with support for:
+- Text cleaning (whitespace, special characters, HTML, URLs, emails)
+- Unicode normalization with multiple forms
+- Case conversion (lowercase, uppercase, title, sentence)
+- Punctuation handling (removal, spacing normalization)
+- Stopword removal with customizable lists
+- Tokenization (word and sentence level)
+- Text statistics extraction
+- PDF text extraction and CSV export
+- Batch file processing
+
+Architecture:
+    Each preprocessing feature is implemented as a separate strategy class following
+    the Strategy Design Pattern and SOLID principles. This allows:
+    - Independent composition of preprocessing steps
+    - Easy addition of new strategies without modifying existing code
+    - Single Responsibility: Each class has one reason to change
+    - Dependency Inversion: Code depends on TextPreprocessingStrategy interface
+
+Usage:
+    from gptmed.data_preparation.text import TextCleaner, CaseNormalizer, StopwordRemover
+
+    # Use individual strategies
+    cleaner = TextCleaner()
+    normalizer = CaseNormalizer(mode='lower')
+    stopwords = StopwordRemover()
+
+    # Compose into pipeline
+    text = "Your text here..."
+    text = cleaner.process(text)
+    text = normalizer.process(text)
+    text = stopwords.process(text)
+
+    # Get statistics
+    stats = cleaner.get_stats()
 """

-import re
-import string
-import unicodedata
 import logging
 from typing import Any, Dict, List, Optional
 from pathlib import Path
-import json

 from ..base import BaseDataPreprocessor, PreprocessingConfig

+# Import all strategy classes
+from .base_strategy import TextPreprocessingStrategy
+from .text_cleaner import TextCleaner
+from .unicode_normalizer import UnicodeNormalizer
+from .case_normalizer import CaseNormalizer
+from .punctuation_handler import PunctuationHandler
+from .stopword_remover import StopwordRemover
+from .tokenizer import Tokenizer
+from .text_statistics import TextStatistics
+from .pdf_processor import PDFProcessor
+

 logger = logging.getLogger(__name__)


 class TextPreprocessor(BaseDataPreprocessor):
     """
-
+    Orchestrator for composing text preprocessing strategies

-
-
-
-    - Unicode normalization
-    - Stopword removal
-    - Punctuation handling
-    - Language detection
-    - Sentiment preservation
+    This class provides a unified interface for text preprocessing by composing
+    individual strategy classes. It maintains backward compatibility with the
+    previous monolithic implementation while enabling flexible strategy composition.
     """

     def __init__(
         self,
         config: Optional[PreprocessingConfig] = None,
+        strategies: Optional[List[TextPreprocessingStrategy]] = None,
         remove_stopwords: bool = False,
         remove_punctuation: bool = False,
         lowercase: bool = True,
@@ -46,11 +83,12 @@ class TextPreprocessor(BaseDataPreprocessor):

         Args:
             config: PreprocessingConfig instance
-
-
-
-
-
+            strategies: List of strategy instances to use in order
+            remove_stopwords: Whether to remove common stopwords (default False)
+            remove_punctuation: Whether to remove punctuation (default False)
+            lowercase: Whether to convert to lowercase (default True)
+            min_length: Minimum text length to keep (default 3)
+            max_length: Maximum text length, None for unlimited (default None)
         """
         if config is None:
             config = PreprocessingConfig(
@@ -61,26 +99,48 @@ class TextPreprocessor(BaseDataPreprocessor):

         super().__init__(config)

-        self.remove_stopwords = remove_stopwords
-        self.remove_punctuation = remove_punctuation
-        self.lowercase = lowercase
         self.min_length = min_length
         self.max_length = max_length

-        #
-
+        # Initialize strategies
+        if strategies is None:
+            strategies = self._create_default_pipeline(
+                remove_stopwords=remove_stopwords,
+                remove_punctuation=remove_punctuation,
+                lowercase=lowercase,
+            )
+
+        self.strategies = strategies
+
+        # Initialize individual strategies for backward compatibility
+        self.text_cleaner = TextCleaner()
+        self.unicode_normalizer = UnicodeNormalizer()
+        self.tokenizer = Tokenizer()
+        self.text_stats = TextStatistics()
+        self.pdf_processor = PDFProcessor()

-    def
-
-
-
-
-
-
-
-
-
-
+    def _create_default_pipeline(
+        self,
+        remove_stopwords: bool = False,
+        remove_punctuation: bool = False,
+        lowercase: bool = True,
+    ) -> List[TextPreprocessingStrategy]:
+        """Create default preprocessing pipeline"""
+        pipeline = [
+            TextCleaner(),
+            UnicodeNormalizer(),
+        ]
+
+        if lowercase:
+            pipeline.append(CaseNormalizer(mode='lower'))
+
+        if remove_punctuation:
+            pipeline.append(PunctuationHandler(remove=True))
+
+        if remove_stopwords:
+            pipeline.append(StopwordRemover())
+
+        return pipeline

     def validate(self, data: Any) -> bool:
         """
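
For reference, the `_create_default_pipeline` hunk above means the flag-driven constructor assembles the following ordered pipeline. This is a sketch of the equivalence only, not additional library code:

```python
# Equivalent explicit pipeline for
# TextPreprocessor(remove_stopwords=True, remove_punctuation=True, lowercase=True),
# per _create_default_pipeline above.
pipeline = [
    TextCleaner(),                    # always first
    UnicodeNormalizer(),              # always second
    CaseNormalizer(mode='lower'),     # only when lowercase=True
    PunctuationHandler(remove=True),  # only when remove_punctuation=True
    StopwordRemover(),                # only when remove_stopwords=True
]
```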
@@ -106,73 +166,72 @@ class TextPreprocessor(BaseDataPreprocessor):

         return True

-    def clean(self,
+    def clean(self, data: Any) -> Optional[str]:
         """
-        Clean text
+        Clean text data (remove HTML, URLs, emails, etc.)

         Args:
-
+            data: Input text

         Returns:
             Cleaned text
         """
-
-
+        if not self.validate(data):
+            return None

-
-
-
-
-
-
-
-
-        text
-        text = re.sub(r'www\.\S+', '', text)
-
-        # Remove email addresses
-        text = re.sub(r'\S+@\S+', '', text)
+        text = data
+        # Apply TextCleaner from strategies
+        cleaner = TextCleaner()
+        text = cleaner.process(text)
+        return text
+
+    def normalize(self, data: Any) -> Optional[str]:
+        """
+        Normalize text (unicode, case, punctuation, stopwords)

-
-
+        Args:
+            data: Input text (may be already cleaned)
+
+        Returns:
+            Normalized text
+        """
+        if not isinstance(data, str):
+            return None

-
-
+        text = data
+        # Apply remaining strategies except TextCleaner
+        for strategy in self.strategies:
+            if not isinstance(strategy, TextCleaner):
+                text = strategy.process(text)
+            if not text:
+                return None

-        return text.strip()
+        return text.strip() if text else None

-    def
+    def process(self, data: Any) -> Optional[str]:
         """
-
+        Process text through the pipeline of strategies

         Args:
-
+            data: Input text

         Returns:
-
+            Processed text or None if validation fails
         """
-
-
-        text = text.lower()
-
-        # Optional punctuation removal
-        if self.remove_punctuation:
-            text = text.translate(str.maketrans('', '', string.punctuation))
-        else:
-            # Just normalize spacing around punctuation
-            text = re.sub(r'\s+([.!?,;:])', r'\1', text)
+        if not self.validate(data):
+            return None

-
-
-
-
-
+        text = data
+        for strategy in self.strategies:
+            text = strategy.process(text)
+            if not text:
+                return None

-        return text.strip()
+        return text.strip() if text else None

     def tokenize(self, text: str) -> List[str]:
         """
-
+        Tokenize text into words

         Args:
             text: Text to tokenize
@@ -180,16 +239,15 @@ class TextPreprocessor(BaseDataPreprocessor):
         Returns:
             List of tokens
         """
-        # Process text first
         processed = self.process(text)
         if processed is None:
             return []

-        return
+        return self.tokenizer.tokenize(processed)

     def get_text_stats(self, text: str) -> Dict[str, Any]:
         """
-        Get statistics about
+        Get statistics about text (before processing)

         Args:
             text: Input text
@@ -197,23 +255,7 @@ class TextPreprocessor(BaseDataPreprocessor):
         Returns:
             Dictionary with text statistics
         """
-
-        if processed is None:
-            return {}
-
-        words = processed.split()
-        sentences = re.split(r'[.!?]+', processed)
-        sentences = [s.strip() for s in sentences if s.strip()]
-
-        return {
-            'original_length': len(text),
-            'cleaned_length': len(processed),
-            'word_count': len(words),
-            'sentence_count': len(sentences),
-            'avg_word_length': sum(len(w) for w in words) / len(words) if words else 0,
-            'unique_words': len(set(words)),
-            'vocabulary_diversity': len(set(words)) / len(words) if words else 0,
-        }
+        return self.text_stats.get_text_stats(text)

     def batch_process_files(
         self,
@@ -266,3 +308,51 @@ class TextPreprocessor(BaseDataPreprocessor):

         self.logger.info(f"Processed {len(results)} files")
         return {'results': results, 'stats': self.get_statistics()}
+
+    # PDF-related methods delegated to PDFProcessor
+    def extract_text_from_pdf(self, pdf_path: str) -> Optional[str]:
+        """Extract text from a single PDF file"""
+        try:
+            return self.pdf_processor.extract_text_from_pdf(pdf_path)
+        except Exception as e:
+            self.logger.error(f"Error extracting PDF: {str(e)}")
+            return None
+
+    def batch_process_pdfs(self, input_dir: str, output_dir: Optional[str] = None) -> Dict[str, Any]:
+        """Process multiple PDF files from a directory"""
+        output_dir = output_dir or self.config.output_path
+        return self.pdf_processor.batch_process_pdfs(input_dir, output_dir)
+
+    def export_to_csv(
+        self,
+        pdf_dir: str,
+        output_csv: str,
+        process_text: bool = True,
+    ) -> Dict[str, Any]:
+        """Export PDF text to CSV with filename and text content columns"""
+        return self.pdf_processor.export_to_csv(pdf_dir, output_csv)
+
+    def export_to_csv_detailed(
+        self,
+        pdf_dir: str,
+        output_csv: str,
+        process_text: bool = True,
+        include_stats: bool = True,
+    ) -> Dict[str, Any]:
+        """Export PDF text to CSV with additional statistics"""
+        return self.pdf_processor.export_to_csv_detailed(pdf_dir, output_csv)
+
+
+# Export all strategy classes for direct import
+__all__ = [
+    'TextPreprocessor',
+    'TextPreprocessingStrategy',
+    'TextCleaner',
+    'UnicodeNormalizer',
+    'CaseNormalizer',
+    'PunctuationHandler',
+    'StopwordRemover',
+    'Tokenizer',
+    'TextStatistics',
+    'PDFProcessor',
+]
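
The hunk above also adds thin PDF wrappers on `TextPreprocessor` that delegate to `PDFProcessor`. A hedged usage sketch follows; the file and directory paths are placeholders, and note that in this hunk the `process_text` and `include_stats` arguments are accepted but not forwarded to the delegate.

```python
# Hypothetical calls against the delegating wrappers added above;
# "paper.pdf", "./pdfs" and "corpus.csv" are placeholders.
pre = TextPreprocessor()
text = pre.extract_text_from_pdf("paper.pdf")        # returns None and logs on failure
result = pre.export_to_csv("./pdfs", "corpus.csv")   # filename + text content columns per PDF
```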

gptmed/data_preparation/text/base_strategy.py (new file)
@@ -0,0 +1,28 @@
+"""
+Base strategy interface for text preprocessing
+"""
+
+from abc import ABC, abstractmethod
+from typing import Any, Dict
+
+
+class TextPreprocessingStrategy(ABC):
+    """Abstract base class for all text preprocessing strategies"""
+
+    @abstractmethod
+    def process(self, text: str) -> str:
+        """
+        Process the text according to the strategy
+
+        Args:
+            text: Input text to process
+
+        Returns:
+            Processed text
+        """
+        pass
+
+    @abstractmethod
+    def get_stats(self) -> Dict[str, Any]:
+        """Get statistics about the processing"""
+        pass
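
`base_strategy.py` above defines the extension point for the new architecture. A minimal sketch of a custom strategy plugged into the orchestrator, assuming only the interface shown in this diff; the whitespace-collapsing behaviour and the class name are invented for illustration:

```python
import re
from typing import Any, Dict

from gptmed.data_preparation.text import (
    TextPreprocessor,
    TextPreprocessingStrategy,
    TextCleaner,
)

# Hypothetical custom strategy implementing the TextPreprocessingStrategy ABC above.
class WhitespaceCollapser(TextPreprocessingStrategy):
    def __init__(self) -> None:
        self._processed = 0

    def process(self, text: str) -> str:
        # Collapse runs of whitespace into single spaces.
        self._processed += 1
        return re.sub(r'\s+', ' ', text).strip()

    def get_stats(self) -> Dict[str, Any]:
        return {'texts_processed': self._processed}

# Compose it with a stock strategy via the strategies parameter added in 0.7.0.
pre = TextPreprocessor(strategies=[TextCleaner(), WhitespaceCollapser()])
```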

gptmed/data_preparation/text/batch_pdf_to_jsonl.py (new file)
@@ -0,0 +1,274 @@
+"""
+Batch PDF Processing (Intermediate Step)
+
+Parallelly processes all PDFs from a directory and extracts text.
+Note: Does not save files - text is processed in-memory for preprocessing pipeline.
+
+Usage:
+    python3 batch_pdf_to_jsonl.py [--input-dir ./pdfs] [--output-dir ./output] [--workers 4]
+"""
+
+import sys
+import json
+import logging
+import time
+from pathlib import Path
+from typing import Dict, List, Optional, Any
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass, asdict
+
+# Adjust path if running from workspace
+sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
+
+from gptmed.data_preparation.text import PDFProcessor
+
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class PDFRecord:
+    """Data class for a single PDF record"""
+    filename: str
+    text: str
+    word_count: int
+    char_count: int
+    sentence_count: int
+    extraction_time: float
+    status: str
+
+
+class PDFBatchProcessor:
+    """Process multiple PDFs in parallel and export to JSONL"""
+
+    def __init__(
+        self,
+        input_dir: str = "./pdfs",
+        output_dir: str = "./output",
+        max_workers: int = 4,
+    ):
+        """
+        Initialize batch processor
+
+        Args:
+            input_dir: Directory containing PDF files
+            output_dir: Directory for output JSONL files
+            max_workers: Number of parallel workers
+        """
+        self.input_dir = Path(input_dir)
+        self.output_dir = Path(output_dir)
+        self.max_workers = max_workers
+
+        # Create directories if they don't exist
+        self.input_dir.mkdir(parents=True, exist_ok=True)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+
+        # Initialize PDF processor
+        self.pdf_processor = PDFProcessor(max_workers=max_workers, use_threading=True)
+
+        self.logger = logging.getLogger(self.__class__.__name__)
+
+    def _extract_pdf(self, pdf_file: Path) -> Optional[PDFRecord]:
+        """
+        Extract text from a single PDF file
+
+        Args:
+            pdf_file: Path to PDF file
+
+        Returns:
+            PDFRecord with extracted information or None if failed
+        """
+        start_time = time.time()
+
+        try:
+            text = self.pdf_processor.extract_text_from_pdf(str(pdf_file))
+
+            # Calculate statistics
+            word_count = len(text.split())
+            char_count = len(text)
+            sentence_count = len([s for s in text.split('.') if s.strip()])
+            extraction_time = time.time() - start_time
+
+            record = PDFRecord(
+                filename=pdf_file.name,
+                text=text,
+                word_count=word_count,
+                char_count=char_count,
+                sentence_count=sentence_count,
+                extraction_time=extraction_time,
+                status="success"
+            )
+
+            self.logger.info(
+                f"✓ Extracted: {pdf_file.name} "
+                f"({word_count} words, {char_count} chars) in {extraction_time:.2f}s"
+            )
+
+            return record
+
+        except Exception as e:
+            self.logger.error(f"✗ Failed to extract {pdf_file.name}: {str(e)}")
+            return PDFRecord(
+                filename=pdf_file.name,
+                text="",
+                word_count=0,
+                char_count=0,
+                sentence_count=0,
+                extraction_time=time.time() - start_time,
+                status=f"error: {str(e)}"
+            )
+
+    def _process_pdfs_parallel(self) -> List[PDFRecord]:
+        """
+        Process all PDFs in parallel
+
+        Returns:
+            List of PDFRecord objects
+        """
+        # Find all PDF files
+        pdf_files = sorted(self.input_dir.glob('*.pdf'))
+
+        if not pdf_files:
+            self.logger.warning(f"No PDF files found in {self.input_dir}")
+            return []
+
+        self.logger.info(
+            f"Processing {len(pdf_files)} PDF files with {self.max_workers} workers..."
+        )
+
+        records = []
+
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            # Submit all PDF extraction tasks
+            future_to_file = {
+                executor.submit(self._extract_pdf, pdf_file): pdf_file
+                for pdf_file in pdf_files
+            }
+
+            # Process completed tasks as they finish
+            for i, future in enumerate(as_completed(future_to_file), 1):
+                try:
+                    record = future.result()
+                    if record:
+                        records.append(record)
+                except Exception as e:
+                    self.logger.error(f"Error processing future: {str(e)}")
+
+        return records
+
+
+
+    def process(self) -> Dict[str, Any]:
+        """
+        Main processing pipeline
+
+        Returns:
+            Dictionary with processing results and statistics
+        """
+        start_time = time.time()
+
+        self.logger.info(f"Input directory: {self.input_dir}")
+        self.logger.info(f"Output directory: {self.output_dir}")
+
+        # Step 1: Process all PDFs in parallel
+        records = self._process_pdfs_parallel()
+
+        if not records:
+            self.logger.warning("No records to process")
+            return {
+                'status': 'failure',
+                'message': 'No PDFs were successfully processed',
+                'total_pdfs': 0,
+                'successful_extractions': 0,
+                'failed_extractions': 0,
+                'individual_jsonl_files': 0,
+                'combined_jsonl_created': False,
+                'total_time': 0,
+            }
+
+        # Count successes and failures
+        successful = [r for r in records if r.status == "success"]
+        failed = [r for r in records if r.status != "success"]
+
+        self.logger.info(f"\nExtraction Results:")
+        self.logger.info(f"  ✓ Successful: {len(successful)}/{len(records)}")
+        self.logger.info(f"  ✗ Failed: {len(failed)}/{len(records)}")
+
+        total_time = time.time() - start_time
+
+        # Calculate statistics
+        total_words = sum(r.word_count for r in successful)
+        total_chars = sum(r.char_count for r in successful)
+
+        # Print summary
+        self.logger.info(f"\n" + "="*60)
+        self.logger.info(f"Processing Summary")
+        self.logger.info(f"="*60)
+        self.logger.info(f"Total PDFs: {len(records)}")
+        self.logger.info(f"Successfully processed: {len(successful)}")
+        self.logger.info(f"Failed: {len(failed)}")
+        self.logger.info(f"Total words extracted: {total_words:,}")
+        self.logger.info(f"Total characters: {total_chars:,}")
+        self.logger.info(f"Note: Records processed in-memory for preprocessing")
+        self.logger.info(f"Total processing time: {total_time:.2f}s")
+        self.logger.info(f"="*60)
+
+        return {
+            'status': 'success',
+            'records': records,
+            'total_pdfs': len(records),
+            'successful_extractions': len(successful),
+            'failed_extractions': len(failed),
+            'total_words': total_words,
+            'total_characters': total_chars,
+            'total_time': total_time,
+        }
+
+
+def main():
+    """Main entry point"""
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description='Batch process PDFs and export to JSONL format'
+    )
+    parser.add_argument(
+        '--input-dir',
+        default='./pdfs',
+        help='Input directory containing PDFs (default: ./pdfs)'
+    )
+    parser.add_argument(
+        '--output-dir',
+        default='./output',
+        help='Output directory for JSONL files (default: ./output)'
+    )
+    parser.add_argument(
+        '--workers',
+        type=int,
+        default=4,
+        help='Number of parallel workers (default: 4)'
+    )
+
+    args = parser.parse_args()
+
+    # Create and run processor
+    processor = PDFBatchProcessor(
+        input_dir=args.input_dir,
+        output_dir=args.output_dir,
+        max_workers=args.workers,
+    )
+
+    result = processor.process()
+
+    # Return exit code
+    return 0 if result['status'] == 'success' else 1
+
+
+if __name__ == '__main__':
+    exit_code = main()
+    sys.exit(exit_code)
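
Besides the CLI entry point shown at the end of `batch_pdf_to_jsonl.py`, the class can also be driven from Python. A sketch, assuming the module path shown above is importable and using placeholder directories:

```python
# Hypothetical programmatic use of the new batch processor; directories are placeholders.
from gptmed.data_preparation.text.batch_pdf_to_jsonl import PDFBatchProcessor

processor = PDFBatchProcessor(input_dir="./pdfs", output_dir="./output", max_workers=4)
result = processor.process()
print(result["successful_extractions"], "ok /", result["failed_extractions"], "failed")
```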