gptmed-0.6.0-py3-none-any.whl → gptmed-0.7.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gptmed/data_preparation/text/__init__.py +190 -100
- gptmed/data_preparation/text/base_strategy.py +28 -0
- gptmed/data_preparation/text/batch_pdf_to_jsonl.py +274 -0
- gptmed/data_preparation/text/case_normalizer.py +66 -0
- gptmed/data_preparation/text/pdf_processor.py +302 -0
- gptmed/data_preparation/text/pipeline.py +333 -0
- gptmed/data_preparation/text/preprocess_jsonl.py +466 -0
- gptmed/data_preparation/text/punctuation_handler.py +65 -0
- gptmed/data_preparation/text/stopword_remover.py +87 -0
- gptmed/data_preparation/text/text_cleaner.py +74 -0
- gptmed/data_preparation/text/text_statistics.py +137 -0
- gptmed/data_preparation/text/tokenize_jsonl.py +376 -0
- gptmed/data_preparation/text/tokenizer.py +74 -0
- gptmed/data_preparation/text/unicode_normalizer.py +61 -0
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/METADATA +2 -1
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/RECORD +20 -7
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/WHEEL +0 -0
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/entry_points.txt +0 -0
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/licenses/LICENSE +0 -0
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/top_level.txt +0 -0
gptmed/data_preparation/text/case_normalizer.py
@@ -0,0 +1,66 @@
+"""
+Case normalization strategy - converts text case
+"""
+
+from typing import Any, Dict
+
+from .base_strategy import TextPreprocessingStrategy
+
+
+class CaseNormalizer(TextPreprocessingStrategy):
+    """
+    Handles text case conversion (lowercase, uppercase, title case)
+    """
+
+    def __init__(self, mode: str = 'lower'):
+        """
+        Initialize case normalizer
+
+        Args:
+            mode: Case conversion mode ('lower', 'upper', 'title', 'sentence')
+        """
+        if mode not in ('lower', 'upper', 'title', 'sentence'):
+            raise ValueError(f"Invalid mode: {mode}. Use 'lower', 'upper', 'title', or 'sentence'")
+
+        self.mode = mode
+        self.stats = {
+            'mode_used': mode,
+            'conversions_applied': 0,
+        }
+
+    def process(self, text: str) -> str:
+        """
+        Normalize text case
+
+        Args:
+            text: Text to normalize
+
+        Returns:
+            Case-normalized text
+        """
+        if not text:
+            return text
+
+        original = text
+
+        if self.mode == 'lower':
+            text = text.lower()
+        elif self.mode == 'upper':
+            text = text.upper()
+        elif self.mode == 'title':
+            text = text.title()
+        elif self.mode == 'sentence':
+            text = text[0].upper() + text[1:].lower() if len(text) > 0 else text
+
+        if text != original:
+            self.stats['conversions_applied'] += 1
+
+        return text
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get conversion statistics"""
+        return self.stats.copy()
+
+    def reset_stats(self) -> None:
+        """Reset statistics"""
+        self.stats['conversions_applied'] = 0
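
For orientation, a minimal usage sketch of the CaseNormalizer strategy added above (the import path is inferred from the file list; whether the class is also re-exported from the package __init__ is not shown in this diff):

from gptmed.data_preparation.text.case_normalizer import CaseNormalizer

# 'sentence' mode upper-cases the first character and lower-cases the rest
normalizer = CaseNormalizer(mode='sentence')
print(normalizer.process("BREAST CANCER screening guidelines"))
# -> "Breast cancer screening guidelines"
print(normalizer.get_stats())
# -> {'mode_used': 'sentence', 'conversions_applied': 1}
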
gptmed/data_preparation/text/pdf_processor.py
@@ -0,0 +1,302 @@
+"""
+PDF processing strategy - extracts text from PDFs and exports to CSV
+
+Supports parallel processing for batch operations using concurrent.futures
+for efficient handling of multiple PDF files.
+"""
+
+import csv
+import logging
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
+
+from .base_strategy import TextPreprocessingStrategy
+
+
+logger = logging.getLogger(__name__)
+
+
+class PDFProcessor(TextPreprocessingStrategy):
+    """
+    Processes PDF files: extracts text and exports to CSV
+
+    Supports both sequential and parallel batch processing for efficient
+    handling of multiple PDF files using ThreadPoolExecutor.
+    """
+
+    def __init__(self, max_workers: int = 4, use_threading: bool = True):
+        """
+        Initialize PDF processor
+
+        Args:
+            max_workers: Maximum number of parallel workers (default: 4)
+            use_threading: Use ThreadPoolExecutor (True) or ProcessPoolExecutor (False)
+                Threading is preferred for I/O-bound PDF extraction
+        """
+        self.PyPDF2 = None
+        self._import_pdf_library()
+
+        self.max_workers = max_workers
+        self.use_threading = use_threading
+
+        self.stats = {
+            'pdfs_processed': 0,
+            'total_text_extracted': 0,
+            'csv_exports': 0,
+            'parallel_jobs_completed': 0,
+            'processing_errors': 0,
+        }
+        self.last_extracted_texts: Dict[str, str] = {}
+
+    def _import_pdf_library(self) -> None:
+        """Import PyPDF2 library, raise error if not available"""
+        try:
+            import PyPDF2
+            self.PyPDF2 = PyPDF2
+        except ImportError:
+            raise ImportError(
+                "PyPDF2 is required for PDF processing. "
+                "Install it with: pip install gptmed[data-preparation]"
+            )
+
+    def process(self, text: str) -> str:
+        """
+        This strategy is not designed for streaming text processing.
+        Use extract_text_from_pdf() or batch_process_pdfs() instead.
+
+        Args:
+            text: Not used for PDF processing
+
+        Returns:
+            Empty string (PDFs are processed via dedicated methods)
+        """
+        return ""
+
+    def extract_text_from_pdf(self, pdf_path: str) -> str:
+        """
+        Extract text from a single PDF file
+
+        Args:
+            pdf_path: Path to PDF file
+
+        Returns:
+            Extracted text from PDF
+
+        Raises:
+            FileNotFoundError: If PDF file doesn't exist
+            Exception: If PDF processing fails
+        """
+        pdf_path = Path(pdf_path)
+
+        if not pdf_path.exists():
+            raise FileNotFoundError(f"PDF file not found: {pdf_path}")
+
+        try:
+            text_content = []
+            with open(pdf_path, 'rb') as file:
+                pdf_reader = self.PyPDF2.PdfReader(file)
+
+                for page_num in range(len(pdf_reader.pages)):
+                    page = pdf_reader.pages[page_num]
+                    text_content.append(page.extract_text())
+
+            full_text = '\n'.join(text_content)
+            self.last_extracted_texts[str(pdf_path)] = full_text
+            self.stats['pdfs_processed'] += 1
+            self.stats['total_text_extracted'] += len(full_text)
+
+            return full_text
+
+        except Exception as e:
+            raise Exception(f"Failed to process PDF {pdf_path}: {str(e)}")
+
+    def batch_process_pdfs(
+        self,
+        pdf_dir: str,
+        output_format: str = 'dict',
+        parallel: bool = True,
+        max_workers: Optional[int] = None,
+    ) -> Any:
+        """
+        Process multiple PDFs from a directory
+
+        Args:
+            pdf_dir: Directory containing PDF files
+            output_format: Format to return ('dict' or 'list')
+            parallel: Whether to use parallel processing (default: True)
+            max_workers: Override default max workers for this batch operation
+
+        Returns:
+            Dictionary or list of {filename: text} pairs
+        """
+        pdf_dir = Path(pdf_dir)
+
+        if not pdf_dir.is_dir():
+            raise NotADirectoryError(f"Directory not found: {pdf_dir}")
+
+        pdf_files = list(pdf_dir.glob('*.pdf'))
+
+        if not pdf_files:
+            logger.warning(f"No PDF files found in {pdf_dir}")
+            return {} if output_format == 'dict' else []
+
+        if parallel and len(pdf_files) > 1:
+            return self._batch_process_parallel(
+                pdf_files,
+                output_format,
+                max_workers or self.max_workers
+            )
+        else:
+            return self._batch_process_sequential(pdf_files, output_format)
+
+    def _batch_process_sequential(self, pdf_files: List[Path], output_format: str) -> Any:
+        """
+        Process PDFs sequentially (original method)
+
+        Args:
+            pdf_files: List of PDF file paths
+            output_format: Format to return ('dict' or 'list')
+
+        Returns:
+            Dictionary or list of {filename: text} pairs
+        """
+        results = {}
+
+        for pdf_file in pdf_files:
+            try:
+                text = self.extract_text_from_pdf(str(pdf_file))
+                results[pdf_file.name] = text
+            except Exception as e:
+                error_msg = f"Error: {str(e)}"
+                results[pdf_file.name] = error_msg
+                self.stats['processing_errors'] += 1
+                logger.error(f"Error processing {pdf_file.name}: {str(e)}")
+
+        if output_format == 'list':
+            return [{'filename': k, 'text': v} for k, v in results.items()]
+
+        return results
+
+    def _batch_process_parallel(
+        self,
+        pdf_files: List[Path],
+        output_format: str,
+        max_workers: int,
+    ) -> Any:
+        """
+        Process PDFs in parallel using ThreadPoolExecutor
+
+        Args:
+            pdf_files: List of PDF file paths
+            output_format: Format to return ('dict' or 'list')
+            max_workers: Maximum number of concurrent workers
+
+        Returns:
+            Dictionary or list of {filename: text} pairs
+        """
+        results = {}
+        executor_class = ThreadPoolExecutor if self.use_threading else ProcessPoolExecutor
+
+        logger.info(f"Starting parallel PDF processing with {max_workers} workers")
+
+        with executor_class(max_workers=max_workers) as executor:
+            # Submit all tasks
+            future_to_file = {
+                executor.submit(self.extract_text_from_pdf, str(pdf_file)): pdf_file
+                for pdf_file in pdf_files
+            }
+
+            # Process completed tasks as they finish
+            for i, future in enumerate(as_completed(future_to_file), 1):
+                pdf_file = future_to_file[future]
+
+                try:
+                    text = future.result()
+                    results[pdf_file.name] = text
+                    self.stats['parallel_jobs_completed'] += 1
+                    logger.debug(f"[{i}/{len(pdf_files)}] Completed: {pdf_file.name}")
+
+                except Exception as e:
+                    error_msg = f"Error: {str(e)}"
+                    results[pdf_file.name] = error_msg
+                    self.stats['processing_errors'] += 1
+                    logger.error(f"Error processing {pdf_file.name}: {str(e)}")
+
+        logger.info(f"Parallel processing complete: {len(results)} files processed")
+
+        if output_format == 'list':
+            return [{'filename': k, 'text': v} for k, v in results.items()]
+
+        return results
+
+    def export_to_csv(self, texts: Dict[str, str], output_file: str) -> None:
+        """
+        Export text data to CSV with filename and text content
+
+        Args:
+            texts: Dictionary of {filename: text_content}
+            output_file: Path to output CSV file
+        """
+        output_path = Path(output_file)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        try:
+            with open(output_path, 'w', newline='', encoding='utf-8') as f:
+                writer = csv.writer(f)
+                writer.writerow(['filename', 'text_content'])
+
+                for filename, text in texts.items():
+                    writer.writerow([filename, text])
+
+            self.stats['csv_exports'] += 1
+
+        except Exception as e:
+            raise Exception(f"Failed to export CSV: {str(e)}")
+
+    def export_to_csv_detailed(self, texts: Dict[str, str], output_file: str) -> None:
+        """
+        Export text data to CSV with detailed statistics
+
+        Args:
+            texts: Dictionary of {filename: text_content}
+            output_file: Path to output CSV file
+        """
+        output_path = Path(output_file)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        try:
+            with open(output_path, 'w', newline='', encoding='utf-8') as f:
+                writer = csv.writer(f)
+                writer.writerow([
+                    'filename', 'text_content', 'word_count',
+                    'char_count', 'sentence_count'
+                ])
+
+                for filename, text in texts.items():
+                    word_count = len(text.split())
+                    char_count = len(text)
+                    sentence_count = len([s for s in text.split('.') if s.strip()])
+
+                    writer.writerow([
+                        filename, text, word_count, char_count, sentence_count
+                    ])
+
+            self.stats['csv_exports'] += 1
+
+        except Exception as e:
+            raise Exception(f"Failed to export detailed CSV: {str(e)}")
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get PDF processing statistics"""
+        return self.stats.copy()
+
+    def reset_stats(self) -> None:
+        """Reset statistics"""
+        self.stats = {
+            'pdfs_processed': 0,
+            'total_text_extracted': 0,
+            'csv_exports': 0,
+            'parallel_jobs_completed': 0,
+            'processing_errors': 0,
+        }
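
A short sketch of how PDFProcessor might be driven, using only the methods shown in this file (the import path and the ./pdfs input directory are assumptions for illustration):

from gptmed.data_preparation.text.pdf_processor import PDFProcessor

# Extract every *.pdf in a directory with 4 worker threads, then dump to CSV
processor = PDFProcessor(max_workers=4, use_threading=True)
texts = processor.batch_process_pdfs('./pdfs', output_format='dict', parallel=True)
processor.export_to_csv_detailed(texts, './output/extracted_texts.csv')
print(processor.get_stats())

Note that failed files are reported inline as "Error: ..." strings in the returned mapping rather than raised, so callers may want to filter those entries before exporting.
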
gptmed/data_preparation/text/pipeline.py
@@ -0,0 +1,333 @@
+"""
+Complete PDF → Tokens Pipeline
+
+Orchestrates the full preprocessing pipeline:
+1. Extract text from PDFs (in-memory)
+2. Preprocess text (in-memory)
+3. Tokenize (saves merged_tokens.jsonl only)
+
+This is the main entry point for generating training data.
+
+Usage:
+    python3 pipeline.py \
+        --input-dir ./pdfs \
+        --output-dir ./output \
+        --tokenizer-method huggingface \
+        --tokenizer-model gpt2 \
+        --workers 4
+"""
+
+import sys
+import json
+import logging
+import time
+from pathlib import Path
+from typing import Dict, List, Any
+from dataclasses import asdict
+import argparse
+import importlib.util
+
+# Adjust path
+sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
+
+# Load batch_pdf_to_jsonl
+spec1 = importlib.util.spec_from_file_location("batch_pdf_to_jsonl", Path(__file__).parent / "batch_pdf_to_jsonl.py")
+batch_module = importlib.util.module_from_spec(spec1)
+spec1.loader.exec_module(batch_module)
+PDFBatchProcessor = batch_module.PDFBatchProcessor
+PDFRecord = batch_module.PDFRecord
+
+# Load preprocess_jsonl
+spec2 = importlib.util.spec_from_file_location("preprocess_jsonl", Path(__file__).parent / "preprocess_jsonl.py")
+preprocess_module = importlib.util.module_from_spec(spec2)
+spec2.loader.exec_module(preprocess_module)
+ComprehensiveJSONLPreprocessor = preprocess_module.ComprehensiveJSONLPreprocessor
+FullPreprocessedRecord = preprocess_module.FullPreprocessedRecord
+
+# Load tokenize_jsonl
+spec3 = importlib.util.spec_from_file_location("tokenize_jsonl", Path(__file__).parent / "tokenize_jsonl.py")
+tokenize_module = importlib.util.module_from_spec(spec3)
+spec3.loader.exec_module(tokenize_module)
+ParallelJSONLTokenizer = tokenize_module.ParallelJSONLTokenizer
+SimplifiedTokenizedRecord = tokenize_module.SimplifiedTokenizedRecord
+
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+
+class EndToEndPipeline:
+    """Complete PDF to training data pipeline"""
+
+    def __init__(
+        self,
+        input_dir: str = "./pdfs",
+        output_dir: str = "./output",
+        tokenizer_method: str = "huggingface",
+        tokenizer_model: str = "gpt2",
+        workers: int = 4,
+        case_mode: str = "lower",
+        remove_stopwords: bool = False,
+        remove_punctuation: bool = False,
+    ):
+        """
+        Initialize pipeline
+
+        Args:
+            input_dir: Directory containing PDFs
+            output_dir: Output directory for final results
+            tokenizer_method: Tokenization method (huggingface/custom/sentencepiece)
+            tokenizer_model: Tokenizer model name
+            workers: Number of parallel workers
+            case_mode: Case normalization (lower/upper/title/sentence)
+            remove_stopwords: Whether to remove stopwords
+            remove_punctuation: Whether to remove punctuation
+        """
+        self.input_dir = Path(input_dir)
+        self.output_dir = Path(output_dir)
+        self.tokenizer_method = tokenizer_method
+        self.tokenizer_model = tokenizer_model
+        self.workers = workers
+        self.case_mode = case_mode
+        self.remove_stopwords = remove_stopwords
+        self.remove_punctuation = remove_punctuation
+
+        # Create output directory
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+
+        self.logger = logging.getLogger(self.__class__.__name__)
+
+    def step1_extract_pdfs(self) -> List[PDFRecord]:
+        """Step 1: Extract text from PDFs in-memory"""
+        self.logger.info("\n" + "="*70)
+        self.logger.info("STEP 1: PDF EXTRACTION")
+        self.logger.info("="*70)
+
+        processor = PDFBatchProcessor(
+            input_dir=str(self.input_dir),
+            output_dir=str(self.output_dir),
+            max_workers=self.workers,
+        )
+
+        result = processor.process()
+        records = result.get('records', [])
+
+        self.logger.info(f"\n✓ Extracted {len(records)} PDF records")
+        return records
+
+    def step2_preprocess_text(self, records: List[PDFRecord]) -> List[FullPreprocessedRecord]:
+        """Step 2: Preprocess text in-memory"""
+        self.logger.info("\n" + "="*70)
+        self.logger.info("STEP 2: TEXT PREPROCESSING")
+        self.logger.info("="*70)
+
+        # Create a dummy temp file just to initialize the preprocessor
+        # We'll manually preprocess records using its methods
+        temp_input = self.output_dir / "_temp_input.jsonl"
+        with open(temp_input, 'w') as f:
+            f.write("{}\n")
+
+        try:
+            processor = ComprehensiveJSONLPreprocessor(
+                input_file=str(temp_input),
+                output_file=None,
+                case_mode=self.case_mode,
+                remove_stopwords=self.remove_stopwords,
+                remove_punctuation=self.remove_punctuation,
+            )
+        finally:
+            temp_input.unlink()
+
+        preprocessed_records = []
+
+        # Convert PDFRecord to dict format expected by preprocess_record
+        for record in records:
+            record_dict = {
+                'filename': record.filename,
+                'text': record.text,
+                'word_count': record.word_count,
+            }
+
+            # Preprocess the record
+            preprocessed = processor.preprocess_record(record_dict)
+            preprocessed_records.append(preprocessed)
+
+        self.logger.info(f"✓ Preprocessed {len(preprocessed_records)} records")
+
+        # Save preprocessed output
+        output_file = self.output_dir / "full_preprocessed.jsonl"
+        with open(output_file, 'w', encoding='utf-8') as f:
+            for record in preprocessed_records:
+                json.dump(asdict(record), f, ensure_ascii=False)
+                f.write('\n')
+
+        self.logger.info(f"✓ Saved: {output_file.name}")
+
+        return preprocessed_records
+
+    def step3_tokenize(self, preprocessed_records: List[FullPreprocessedRecord]) -> Dict[str, Any]:
+        """Step 3: Tokenize preprocessed text"""
+        self.logger.info("\n" + "="*70)
+        self.logger.info("STEP 3: TOKENIZATION")
+        self.logger.info("="*70)
+
+        # Save preprocessed records to temporary JSONL for tokenizer to consume
+        temp_file = self.output_dir / "_temp_preprocessed.jsonl"
+        with open(temp_file, 'w', encoding='utf-8') as f:
+            for record in preprocessed_records:
+                json.dump(asdict(record), f, ensure_ascii=False)
+                f.write('\n')
+
+        # Initialize tokenizer
+        tokens_dir = self.output_dir / "tokens"
+        tokenizer = ParallelJSONLTokenizer(
+            input_file=str(temp_file),
+            output_dir=str(tokens_dir),
+            method=self.tokenizer_method,
+            model_name=self.tokenizer_model,
+            workers=self.workers,
+        )
+
+        # Tokenize
+        result = tokenizer.process()
+
+        # Clean up temporary file
+        temp_file.unlink()
+
+        return result
+
+    def run(self) -> Dict[str, Any]:
+        """Execute full pipeline"""
+        start_time = time.time()
+
+        self.logger.info("\n")
+        self.logger.info("╔" + "="*68 + "╗")
+        self.logger.info("║" + " "*15 + "END-TO-END PDF → TOKENS PIPELINE" + " "*21 + "║")
+        self.logger.info("╚" + "="*68 + "╝")
+
+        try:
+            # Step 1: Extract PDFs
+            pdf_records = self.step1_extract_pdfs()
+
+            if not pdf_records:
+                self.logger.error("No PDF records extracted. Exiting.")
+                return {'status': 'failure', 'message': 'No PDFs extracted'}
+
+            # Step 2: Preprocess
+            preprocessed_records = self.step2_preprocess_text(pdf_records)
+
+            if not preprocessed_records:
+                self.logger.error("No records preprocessed. Exiting.")
+                return {'status': 'failure', 'message': 'Preprocessing failed'}
+
+            # Step 3: Tokenize
+            tokenization_result = self.step3_tokenize(preprocessed_records)
+
+            total_time = time.time() - start_time
+
+            # Final summary
+            self.logger.info("\n" + "="*70)
+            self.logger.info("PIPELINE COMPLETE")
+            self.logger.info("="*70)
+            self.logger.info(f"\nFinal Outputs:")
+            self.logger.info(f"  1. {self.output_dir}/full_preprocessed.jsonl (cleaned text)")
+            self.logger.info(f"  2. {self.output_dir}/tokens/merged_tokens.jsonl (training tokens)")
+            self.logger.info(f"\nTotal Time: {total_time:.2f}s")
+            self.logger.info(f"="*70 + "\n")
+
+            return {
+                'status': 'success',
+                'total_time': total_time,
+                'pdf_extraction': {
+                    'records': len(pdf_records),
+                },
+                'preprocessing': {
+                    'records': len(preprocessed_records),
+                },
+                'tokenization': tokenization_result,
+                'output_dir': str(self.output_dir),
+            }
+
+        except Exception as e:
+            self.logger.error(f"Pipeline failed: {str(e)}")
+            import traceback
+            traceback.print_exc()
+            return {'status': 'failure', 'error': str(e)}
+
+
+def main():
+    """Main entry point"""
+    parser = argparse.ArgumentParser(
+        description='Complete PDF to training tokens pipeline'
+    )
+    parser.add_argument(
+        '--input-dir',
+        default='./pdfs',
+        help='Input directory containing PDFs (default: ./pdfs)'
+    )
+    parser.add_argument(
+        '--output-dir',
+        default='./output',
+        help='Output directory for final results (default: ./output)'
+    )
+    parser.add_argument(
+        '--tokenizer-method',
+        default='huggingface',
+        choices=['huggingface', 'custom', 'sentencepiece'],
+        help='Tokenization method (default: huggingface)'
+    )
+    parser.add_argument(
+        '--tokenizer-model',
+        default='gpt2',
+        help='Tokenizer model name (default: gpt2)'
+    )
+    parser.add_argument(
+        '--workers',
+        type=int,
+        default=4,
+        help='Number of parallel workers (default: 4)'
+    )
+    parser.add_argument(
+        '--case-mode',
+        default='lower',
+        choices=['lower', 'upper', 'title', 'sentence'],
+        help='Case normalization mode (default: lower)'
+    )
+    parser.add_argument(
+        '--remove-stopwords',
+        action='store_true',
+        help='Remove common stopwords during preprocessing'
+    )
+    parser.add_argument(
+        '--remove-punctuation',
+        action='store_true',
+        help='Remove punctuation during preprocessing'
+    )
+
+    args = parser.parse_args()
+
+    # Create and run pipeline
+    pipeline = EndToEndPipeline(
+        input_dir=args.input_dir,
+        output_dir=args.output_dir,
+        tokenizer_method=args.tokenizer_method,
+        tokenizer_model=args.tokenizer_model,
+        workers=args.workers,
+        case_mode=args.case_mode,
+        remove_stopwords=args.remove_stopwords,
+        remove_punctuation=args.remove_punctuation,
+    )
+
+    result = pipeline.run()
+
+    # Exit with appropriate code
+    return 0 if result['status'] == 'success' else 1
+
+
+if __name__ == '__main__':
+    exit_code = main()
+    sys.exit(exit_code)
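
Besides the CLI shown in the module docstring, the pipeline can also be driven programmatically; a minimal sketch using only the constructor arguments and run() defined above (the import path is assumed from the file list):

from gptmed.data_preparation.text.pipeline import EndToEndPipeline

pipeline = EndToEndPipeline(
    input_dir='./pdfs',
    output_dir='./output',
    tokenizer_method='huggingface',
    tokenizer_model='gpt2',
    workers=4,
    case_mode='lower',
)
result = pipeline.run()
print(result['status'], result.get('total_time'))
# On success, outputs land in ./output/full_preprocessed.jsonl and ./output/tokens/merged_tokens.jsonl
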