gptmed 0.6.0__tar.gz → 0.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gptmed-0.6.0/gptmed.egg-info → gptmed-0.7.0}/PKG-INFO +2 -1
- gptmed-0.7.0/gptmed/data_preparation/text/__init__.py +358 -0
- gptmed-0.7.0/gptmed/data_preparation/text/base_strategy.py +28 -0
- gptmed-0.7.0/gptmed/data_preparation/text/batch_pdf_to_jsonl.py +274 -0
- gptmed-0.7.0/gptmed/data_preparation/text/case_normalizer.py +66 -0
- gptmed-0.7.0/gptmed/data_preparation/text/pdf_processor.py +302 -0
- gptmed-0.7.0/gptmed/data_preparation/text/pipeline.py +333 -0
- gptmed-0.7.0/gptmed/data_preparation/text/preprocess_jsonl.py +466 -0
- gptmed-0.7.0/gptmed/data_preparation/text/punctuation_handler.py +65 -0
- gptmed-0.7.0/gptmed/data_preparation/text/stopword_remover.py +87 -0
- gptmed-0.7.0/gptmed/data_preparation/text/text_cleaner.py +74 -0
- gptmed-0.7.0/gptmed/data_preparation/text/text_statistics.py +137 -0
- gptmed-0.7.0/gptmed/data_preparation/text/tokenize_jsonl.py +376 -0
- gptmed-0.7.0/gptmed/data_preparation/text/tokenizer.py +74 -0
- gptmed-0.7.0/gptmed/data_preparation/text/unicode_normalizer.py +61 -0
- {gptmed-0.6.0 → gptmed-0.7.0/gptmed.egg-info}/PKG-INFO +2 -1
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed.egg-info/SOURCES.txt +13 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed.egg-info/requires.txt +1 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/pyproject.toml +2 -1
- gptmed-0.6.0/gptmed/data_preparation/text/__init__.py +0 -268
- {gptmed-0.6.0 → gptmed-0.7.0}/LICENSE +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/MANIFEST.in +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/README.md +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/__init__.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/api.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/configs/__init__.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/configs/config_loader.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/configs/configs.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/configs/train_config.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/configs/training_config.yaml +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/data/__init__.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/data/parsers/__init__.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/data/parsers/medquad_parser.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/data/parsers/text_formatter.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/data_preparation/__init__.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/data_preparation/audio/__init__.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/data_preparation/base.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/data_preparation/cli.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/data_preparation/image/__init__.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/data_preparation/video/__init__.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/framework/__init__.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/framework/cli/__init__.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/framework/cli/__main__.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/framework/cli/startproject.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/inference/__init__.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/inference/decoding_utils.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/inference/generation_config.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/inference/generator.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/inference/sampling.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/model/__init__.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/model/architecture/__init__.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/model/architecture/attention.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/model/architecture/decoder_block.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/model/architecture/embeddings.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/model/architecture/feedforward.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/model/architecture/transformer.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/model/configs/__init__.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/model/configs/model_config.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/observability/__init__.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/observability/base.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/observability/callbacks.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/observability/metrics_tracker.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/observability/redis_metrics_storage.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/services/__init__.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/services/device_manager.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/services/training_service.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/tokenizer/__init__.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/tokenizer/tokenize_data.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/tokenizer/train_tokenizer.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/training/__init__.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/training/dataset.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/training/train.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/training/trainer.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/training/utils.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/utils/__init__.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/utils/checkpoints.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed/utils/logging.py +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed.egg-info/dependency_links.txt +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed.egg-info/entry_points.txt +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/gptmed.egg-info/top_level.txt +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/requirements.txt +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/setup.cfg +0 -0
- {gptmed-0.6.0 → gptmed-0.7.0}/setup.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gptmed
-Version: 0.6.0
+Version: 0.7.0
 Summary: A lightweight GPT-based language model framework for training custom question-answering models on any domain
 Author-email: Sanjog Sigdel <sigdelsanjog@gmail.com>, Sanjog Sigdel <sanjog.sigdel@ku.edu.np>
 License-Expression: MIT
@@ -51,6 +51,7 @@ Requires-Dist: librosa>=0.10.0; extra == "data-preparation"
 Requires-Dist: soundfile>=0.12.0; extra == "data-preparation"
 Requires-Dist: opencv-python>=4.5.0; extra == "data-preparation"
 Requires-Dist: numpy>=1.24.0; extra == "data-preparation"
+Requires-Dist: PyPDF2>=3.0.0; extra == "data-preparation"
 Dynamic: license-file
 
 # GptMed 🤖
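
The only dependency change in 0.7.0 is the new PyPDF2 pin under the data-preparation extra, which backs the PDF extraction code added in the files below. As a rough, hedged sketch of what that dependency provides (this is plain PyPDF2 3.x usage, not gptmed's own PDFProcessor, whose implementation does not appear in this diff):

```python
# Illustrative PyPDF2 3.x sketch only; gptmed's pdf_processor.py is not shown in this diff.
from PyPDF2 import PdfReader

def extract_text(pdf_path: str) -> str:
    """Concatenate the extracted text of every page in a PDF."""
    reader = PdfReader(pdf_path)
    return "\n".join(page.extract_text() or "" for page in reader.pages)
```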
gptmed-0.7.0/gptmed/data_preparation/text/__init__.py (new file)
@@ -0,0 +1,358 @@
+"""
+Text data preprocessing and cleaning module - Modular Architecture
+
+This module provides strategy-based text preprocessing with support for:
+- Text cleaning (whitespace, special characters, HTML, URLs, emails)
+- Unicode normalization with multiple forms
+- Case conversion (lowercase, uppercase, title, sentence)
+- Punctuation handling (removal, spacing normalization)
+- Stopword removal with customizable lists
+- Tokenization (word and sentence level)
+- Text statistics extraction
+- PDF text extraction and CSV export
+- Batch file processing
+
+Architecture:
+    Each preprocessing feature is implemented as a separate strategy class following
+    the Strategy Design Pattern and SOLID principles. This allows:
+    - Independent composition of preprocessing steps
+    - Easy addition of new strategies without modifying existing code
+    - Single Responsibility: Each class has one reason to change
+    - Dependency Inversion: Code depends on TextPreprocessingStrategy interface
+
+Usage:
+    from gptmed.data_preparation.text import TextCleaner, CaseNormalizer, StopwordRemover
+
+    # Use individual strategies
+    cleaner = TextCleaner()
+    normalizer = CaseNormalizer(mode='lower')
+    stopwords = StopwordRemover()
+
+    # Compose into pipeline
+    text = "Your text here..."
+    text = cleaner.process(text)
+    text = normalizer.process(text)
+    text = stopwords.process(text)
+
+    # Get statistics
+    stats = cleaner.get_stats()
+"""
+
+import logging
+from typing import Any, Dict, List, Optional
+from pathlib import Path
+
+from ..base import BaseDataPreprocessor, PreprocessingConfig
+
+# Import all strategy classes
+from .base_strategy import TextPreprocessingStrategy
+from .text_cleaner import TextCleaner
+from .unicode_normalizer import UnicodeNormalizer
+from .case_normalizer import CaseNormalizer
+from .punctuation_handler import PunctuationHandler
+from .stopword_remover import StopwordRemover
+from .tokenizer import Tokenizer
+from .text_statistics import TextStatistics
+from .pdf_processor import PDFProcessor
+
+
+logger = logging.getLogger(__name__)
+
+
+class TextPreprocessor(BaseDataPreprocessor):
+    """
+    Orchestrator for composing text preprocessing strategies
+
+    This class provides a unified interface for text preprocessing by composing
+    individual strategy classes. It maintains backward compatibility with the
+    previous monolithic implementation while enabling flexible strategy composition.
+    """
+
+    def __init__(
+        self,
+        config: Optional[PreprocessingConfig] = None,
+        strategies: Optional[List[TextPreprocessingStrategy]] = None,
+        remove_stopwords: bool = False,
+        remove_punctuation: bool = False,
+        lowercase: bool = True,
+        min_length: int = 3,
+        max_length: Optional[int] = None,
+    ):
+        """
+        Initialize text preprocessor
+
+        Args:
+            config: PreprocessingConfig instance
+            strategies: List of strategy instances to use in order
+            remove_stopwords: Whether to remove common stopwords (default False)
+            remove_punctuation: Whether to remove punctuation (default False)
+            lowercase: Whether to convert to lowercase (default True)
+            min_length: Minimum text length to keep (default 3)
+            max_length: Maximum text length, None for unlimited (default None)
+        """
+        if config is None:
+            config = PreprocessingConfig(
+                input_path="./data/raw",
+                output_path="./data/processed",
+                data_type="text"
+            )
+
+        super().__init__(config)
+
+        self.min_length = min_length
+        self.max_length = max_length
+
+        # Initialize strategies
+        if strategies is None:
+            strategies = self._create_default_pipeline(
+                remove_stopwords=remove_stopwords,
+                remove_punctuation=remove_punctuation,
+                lowercase=lowercase,
+            )
+
+        self.strategies = strategies
+
+        # Initialize individual strategies for backward compatibility
+        self.text_cleaner = TextCleaner()
+        self.unicode_normalizer = UnicodeNormalizer()
+        self.tokenizer = Tokenizer()
+        self.text_stats = TextStatistics()
+        self.pdf_processor = PDFProcessor()
+
+    def _create_default_pipeline(
+        self,
+        remove_stopwords: bool = False,
+        remove_punctuation: bool = False,
+        lowercase: bool = True,
+    ) -> List[TextPreprocessingStrategy]:
+        """Create default preprocessing pipeline"""
+        pipeline = [
+            TextCleaner(),
+            UnicodeNormalizer(),
+        ]
+
+        if lowercase:
+            pipeline.append(CaseNormalizer(mode='lower'))
+
+        if remove_punctuation:
+            pipeline.append(PunctuationHandler(remove=True))
+
+        if remove_stopwords:
+            pipeline.append(StopwordRemover())
+
+        return pipeline
+
+    def validate(self, data: Any) -> bool:
+        """
+        Validate text input
+
+        Args:
+            data: Input text
+
+        Returns:
+            True if valid, False otherwise
+        """
+        if not isinstance(data, str):
+            self.logger.warning(f"Invalid text type: {type(data)}")
+            return False
+
+        if len(data.strip()) < self.min_length:
+            self.logger.debug(f"Text too short: {len(data)}")
+            return False
+
+        if self.max_length and len(data) > self.max_length:
+            self.logger.debug(f"Text too long: {len(data)}")
+            return False
+
+        return True
+
+    def clean(self, data: Any) -> Optional[str]:
+        """
+        Clean text data (remove HTML, URLs, emails, etc.)
+
+        Args:
+            data: Input text
+
+        Returns:
+            Cleaned text
+        """
+        if not self.validate(data):
+            return None
+
+        text = data
+        # Apply TextCleaner from strategies
+        cleaner = TextCleaner()
+        text = cleaner.process(text)
+        return text
+
+    def normalize(self, data: Any) -> Optional[str]:
+        """
+        Normalize text (unicode, case, punctuation, stopwords)
+
+        Args:
+            data: Input text (may be already cleaned)
+
+        Returns:
+            Normalized text
+        """
+        if not isinstance(data, str):
+            return None
+
+        text = data
+        # Apply remaining strategies except TextCleaner
+        for strategy in self.strategies:
+            if not isinstance(strategy, TextCleaner):
+                text = strategy.process(text)
+                if not text:
+                    return None
+
+        return text.strip() if text else None
+
+    def process(self, data: Any) -> Optional[str]:
+        """
+        Process text through the pipeline of strategies
+
+        Args:
+            data: Input text
+
+        Returns:
+            Processed text or None if validation fails
+        """
+        if not self.validate(data):
+            return None
+
+        text = data
+        for strategy in self.strategies:
+            text = strategy.process(text)
+            if not text:
+                return None
+
+        return text.strip() if text else None
+
+    def tokenize(self, text: str) -> List[str]:
+        """
+        Tokenize text into words
+
+        Args:
+            text: Text to tokenize
+
+        Returns:
+            List of tokens
+        """
+        processed = self.process(text)
+        if processed is None:
+            return []
+
+        return self.tokenizer.tokenize(processed)
+
+    def get_text_stats(self, text: str) -> Dict[str, Any]:
+        """
+        Get statistics about text (before processing)
+
+        Args:
+            text: Input text
+
+        Returns:
+            Dictionary with text statistics
+        """
+        return self.text_stats.get_text_stats(text)
+
+    def batch_process_files(
+        self,
+        input_dir: str,
+        output_dir: Optional[str] = None,
+        pattern: str = "*.txt"
+    ) -> Dict[str, Any]:
+        """
+        Process multiple text files from a directory
+
+        Args:
+            input_dir: Input directory path
+            output_dir: Output directory path (uses config if None)
+            pattern: File pattern to match
+
+        Returns:
+            Processing results
+        """
+        output_dir = output_dir or self.config.output_path
+        Path(output_dir).mkdir(parents=True, exist_ok=True)
+
+        input_path = Path(input_dir)
+        results = []
+
+        for file_path in input_path.glob(pattern):
+            try:
+                with open(file_path, 'r', encoding='utf-8') as f:
+                    text = f.read()
+
+                processed = self.process(text)
+
+                if processed:
+                    output_file = Path(output_dir) / file_path.name
+                    with open(output_file, 'w', encoding='utf-8') as f:
+                        f.write(processed)
+
+                results.append({
+                    'file': str(file_path),
+                    'status': 'success',
+                    'stats': self.get_text_stats(text)
+                })
+
+            except Exception as e:
+                self.logger.error(f"Error processing {file_path}: {str(e)}")
+                results.append({
+                    'file': str(file_path),
+                    'status': 'error',
+                    'error': str(e)
+                })
+
+        self.logger.info(f"Processed {len(results)} files")
+        return {'results': results, 'stats': self.get_statistics()}
+
+    # PDF-related methods delegated to PDFProcessor
+    def extract_text_from_pdf(self, pdf_path: str) -> Optional[str]:
+        """Extract text from a single PDF file"""
+        try:
+            return self.pdf_processor.extract_text_from_pdf(pdf_path)
+        except Exception as e:
+            self.logger.error(f"Error extracting PDF: {str(e)}")
+            return None
+
+    def batch_process_pdfs(self, input_dir: str, output_dir: Optional[str] = None) -> Dict[str, Any]:
+        """Process multiple PDF files from a directory"""
+        output_dir = output_dir or self.config.output_path
+        return self.pdf_processor.batch_process_pdfs(input_dir, output_dir)
+
+    def export_to_csv(
+        self,
+        pdf_dir: str,
+        output_csv: str,
+        process_text: bool = True,
+    ) -> Dict[str, Any]:
+        """Export PDF text to CSV with filename and text content columns"""
+        return self.pdf_processor.export_to_csv(pdf_dir, output_csv)
+
+    def export_to_csv_detailed(
+        self,
+        pdf_dir: str,
+        output_csv: str,
+        process_text: bool = True,
+        include_stats: bool = True,
+    ) -> Dict[str, Any]:
+        """Export PDF text to CSV with additional statistics"""
+        return self.pdf_processor.export_to_csv_detailed(pdf_dir, output_csv)
+
+
+# Export all strategy classes for direct import
+__all__ = [
+    'TextPreprocessor',
+    'TextPreprocessingStrategy',
+    'TextCleaner',
+    'UnicodeNormalizer',
+    'CaseNormalizer',
+    'PunctuationHandler',
+    'StopwordRemover',
+    'Tokenizer',
+    'TextStatistics',
+    'PDFProcessor',
+]
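
The new __init__.py above re-exports the individual strategies and adds the TextPreprocessor orchestrator. A minimal usage sketch based on the constructor and methods shown in the hunk (the sample text and directory paths are illustrative):

```python
from gptmed.data_preparation.text import TextPreprocessor

# Default pipeline is TextCleaner -> UnicodeNormalizer -> CaseNormalizer('lower');
# punctuation and stopword removal are opt-in flags.
pre = TextPreprocessor(remove_stopwords=True, remove_punctuation=True)

text = "Visit https://example.com for <b>more</b> info on diabetes!"
processed = pre.process(text)      # runs the full strategy pipeline, or returns None
tokens = pre.tokenize(text)        # processed text split into word tokens
stats = pre.get_text_stats(text)   # statistics on the raw input text

# Batch-process every .txt file in a directory (illustrative paths)
report = pre.batch_process_files("./data/raw", "./data/processed", pattern="*.txt")
print(report["stats"])
```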
gptmed-0.7.0/gptmed/data_preparation/text/base_strategy.py (new file)
@@ -0,0 +1,28 @@
+"""
+Base strategy interface for text preprocessing
+"""
+
+from abc import ABC, abstractmethod
+from typing import Any, Dict
+
+
+class TextPreprocessingStrategy(ABC):
+    """Abstract base class for all text preprocessing strategies"""
+
+    @abstractmethod
+    def process(self, text: str) -> str:
+        """
+        Process the text according to the strategy
+
+        Args:
+            text: Input text to process
+
+        Returns:
+            Processed text
+        """
+        pass
+
+    @abstractmethod
+    def get_stats(self) -> Dict[str, Any]:
+        """Get statistics about the processing"""
+        pass
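
base_strategy.py is the extension point the module docstring describes: a new preprocessing step only needs to implement process() and get_stats(). A hedged sketch of a custom strategy (DigitRemover is invented here for illustration and is not part of gptmed):

```python
import re
from typing import Any, Dict

from gptmed.data_preparation.text import TextPreprocessingStrategy, TextPreprocessor

class DigitRemover(TextPreprocessingStrategy):
    """Illustrative strategy: strip digits and count how many were removed."""

    def __init__(self) -> None:
        self.removed = 0

    def process(self, text: str) -> str:
        self.removed += sum(ch.isdigit() for ch in text)
        return re.sub(r"\d+", "", text)

    def get_stats(self) -> Dict[str, Any]:
        return {"digits_removed": self.removed}

# Custom strategies can be passed straight to the orchestrator.
pre = TextPreprocessor(strategies=[DigitRemover()])
print(pre.process("Patient aged 42 with 3 prior visits"))
```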
gptmed-0.7.0/gptmed/data_preparation/text/batch_pdf_to_jsonl.py (new file)
@@ -0,0 +1,274 @@
+"""
+Batch PDF Processing (Intermediate Step)
+
+Parallelly processes all PDFs from a directory and extracts text.
+Note: Does not save files - text is processed in-memory for preprocessing pipeline.
+
+Usage:
+    python3 batch_pdf_to_jsonl.py [--input-dir ./pdfs] [--output-dir ./output] [--workers 4]
+"""
+
+import sys
+import json
+import logging
+import time
+from pathlib import Path
+from typing import Dict, List, Optional, Any
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass, asdict
+
+# Adjust path if running from workspace
+sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
+
+from gptmed.data_preparation.text import PDFProcessor
+
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class PDFRecord:
+    """Data class for a single PDF record"""
+    filename: str
+    text: str
+    word_count: int
+    char_count: int
+    sentence_count: int
+    extraction_time: float
+    status: str
+
+
+class PDFBatchProcessor:
+    """Process multiple PDFs in parallel and export to JSONL"""
+
+    def __init__(
+        self,
+        input_dir: str = "./pdfs",
+        output_dir: str = "./output",
+        max_workers: int = 4,
+    ):
+        """
+        Initialize batch processor
+
+        Args:
+            input_dir: Directory containing PDF files
+            output_dir: Directory for output JSONL files
+            max_workers: Number of parallel workers
+        """
+        self.input_dir = Path(input_dir)
+        self.output_dir = Path(output_dir)
+        self.max_workers = max_workers
+
+        # Create directories if they don't exist
+        self.input_dir.mkdir(parents=True, exist_ok=True)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+
+        # Initialize PDF processor
+        self.pdf_processor = PDFProcessor(max_workers=max_workers, use_threading=True)
+
+        self.logger = logging.getLogger(self.__class__.__name__)
+
+    def _extract_pdf(self, pdf_file: Path) -> Optional[PDFRecord]:
+        """
+        Extract text from a single PDF file
+
+        Args:
+            pdf_file: Path to PDF file
+
+        Returns:
+            PDFRecord with extracted information or None if failed
+        """
+        start_time = time.time()
+
+        try:
+            text = self.pdf_processor.extract_text_from_pdf(str(pdf_file))
+
+            # Calculate statistics
+            word_count = len(text.split())
+            char_count = len(text)
+            sentence_count = len([s for s in text.split('.') if s.strip()])
+            extraction_time = time.time() - start_time
+
+            record = PDFRecord(
+                filename=pdf_file.name,
+                text=text,
+                word_count=word_count,
+                char_count=char_count,
+                sentence_count=sentence_count,
+                extraction_time=extraction_time,
+                status="success"
+            )
+
+            self.logger.info(
+                f"✓ Extracted: {pdf_file.name} "
+                f"({word_count} words, {char_count} chars) in {extraction_time:.2f}s"
+            )
+
+            return record
+
+        except Exception as e:
+            self.logger.error(f"✗ Failed to extract {pdf_file.name}: {str(e)}")
+            return PDFRecord(
+                filename=pdf_file.name,
+                text="",
+                word_count=0,
+                char_count=0,
+                sentence_count=0,
+                extraction_time=time.time() - start_time,
+                status=f"error: {str(e)}"
+            )
+
+    def _process_pdfs_parallel(self) -> List[PDFRecord]:
+        """
+        Process all PDFs in parallel
+
+        Returns:
+            List of PDFRecord objects
+        """
+        # Find all PDF files
+        pdf_files = sorted(self.input_dir.glob('*.pdf'))
+
+        if not pdf_files:
+            self.logger.warning(f"No PDF files found in {self.input_dir}")
+            return []
+
+        self.logger.info(
+            f"Processing {len(pdf_files)} PDF files with {self.max_workers} workers..."
+        )
+
+        records = []
+
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            # Submit all PDF extraction tasks
+            future_to_file = {
+                executor.submit(self._extract_pdf, pdf_file): pdf_file
+                for pdf_file in pdf_files
+            }
+
+            # Process completed tasks as they finish
+            for i, future in enumerate(as_completed(future_to_file), 1):
+                try:
+                    record = future.result()
+                    if record:
+                        records.append(record)
+                except Exception as e:
+                    self.logger.error(f"Error processing future: {str(e)}")
+
+        return records
+
+
+
+    def process(self) -> Dict[str, Any]:
+        """
+        Main processing pipeline
+
+        Returns:
+            Dictionary with processing results and statistics
+        """
+        start_time = time.time()
+
+        self.logger.info(f"Input directory: {self.input_dir}")
+        self.logger.info(f"Output directory: {self.output_dir}")
+
+        # Step 1: Process all PDFs in parallel
+        records = self._process_pdfs_parallel()
+
+        if not records:
+            self.logger.warning("No records to process")
+            return {
+                'status': 'failure',
+                'message': 'No PDFs were successfully processed',
+                'total_pdfs': 0,
+                'successful_extractions': 0,
+                'failed_extractions': 0,
+                'individual_jsonl_files': 0,
+                'combined_jsonl_created': False,
+                'total_time': 0,
+            }
+
+        # Count successes and failures
+        successful = [r for r in records if r.status == "success"]
+        failed = [r for r in records if r.status != "success"]
+
+        self.logger.info(f"\nExtraction Results:")
+        self.logger.info(f"  ✓ Successful: {len(successful)}/{len(records)}")
+        self.logger.info(f"  ✗ Failed: {len(failed)}/{len(records)}")
+
+        total_time = time.time() - start_time
+
+        # Calculate statistics
+        total_words = sum(r.word_count for r in successful)
+        total_chars = sum(r.char_count for r in successful)
+
+        # Print summary
+        self.logger.info(f"\n" + "="*60)
+        self.logger.info(f"Processing Summary")
+        self.logger.info(f"="*60)
+        self.logger.info(f"Total PDFs: {len(records)}")
+        self.logger.info(f"Successfully processed: {len(successful)}")
+        self.logger.info(f"Failed: {len(failed)}")
+        self.logger.info(f"Total words extracted: {total_words:,}")
+        self.logger.info(f"Total characters: {total_chars:,}")
+        self.logger.info(f"Note: Records processed in-memory for preprocessing")
+        self.logger.info(f"Total processing time: {total_time:.2f}s")
+        self.logger.info(f"="*60)
+
+        return {
+            'status': 'success',
+            'records': records,
+            'total_pdfs': len(records),
+            'successful_extractions': len(successful),
+            'failed_extractions': len(failed),
+            'total_words': total_words,
+            'total_characters': total_chars,
+            'total_time': total_time,
+        }
+
+
+def main():
+    """Main entry point"""
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description='Batch process PDFs and export to JSONL format'
+    )
+    parser.add_argument(
+        '--input-dir',
+        default='./pdfs',
+        help='Input directory containing PDFs (default: ./pdfs)'
+    )
+    parser.add_argument(
+        '--output-dir',
+        default='./output',
+        help='Output directory for JSONL files (default: ./output)'
+    )
+    parser.add_argument(
+        '--workers',
+        type=int,
+        default=4,
+        help='Number of parallel workers (default: 4)'
+    )
+
+    args = parser.parse_args()
+
+    # Create and run processor
+    processor = PDFBatchProcessor(
+        input_dir=args.input_dir,
+        output_dir=args.output_dir,
+        max_workers=args.workers,
+    )
+
+    result = processor.process()
+
+    # Return exit code
+    return 0 if result['status'] == 'success' else 1
+
+
+if __name__ == '__main__':
+    exit_code = main()
+    sys.exit(exit_code)
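
batch_pdf_to_jsonl.py can be run as a script (see its Usage docstring above) or driven programmatically. A small sketch of the programmatic path, assuming a ./pdfs directory of input files (the directory layout here is illustrative):

```python
from gptmed.data_preparation.text.batch_pdf_to_jsonl import PDFBatchProcessor

processor = PDFBatchProcessor(input_dir="./pdfs", output_dir="./output", max_workers=4)
result = processor.process()

if result["status"] == "success":
    # Records stay in memory; this intermediate step intentionally writes no files.
    for record in result["records"]:
        print(record.filename, record.word_count, record.status)
else:
    print(result["message"])
```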