gptmed-0.6.0-py3-none-any.whl → gptmed-0.7.0-py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in the supported public registries, and is provided for informational purposes only.
- gptmed/data_preparation/text/__init__.py +190 -100
- gptmed/data_preparation/text/base_strategy.py +28 -0
- gptmed/data_preparation/text/batch_pdf_to_jsonl.py +274 -0
- gptmed/data_preparation/text/case_normalizer.py +66 -0
- gptmed/data_preparation/text/pdf_processor.py +302 -0
- gptmed/data_preparation/text/pipeline.py +333 -0
- gptmed/data_preparation/text/preprocess_jsonl.py +466 -0
- gptmed/data_preparation/text/punctuation_handler.py +65 -0
- gptmed/data_preparation/text/stopword_remover.py +87 -0
- gptmed/data_preparation/text/text_cleaner.py +74 -0
- gptmed/data_preparation/text/text_statistics.py +137 -0
- gptmed/data_preparation/text/tokenize_jsonl.py +376 -0
- gptmed/data_preparation/text/tokenizer.py +74 -0
- gptmed/data_preparation/text/unicode_normalizer.py +61 -0
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/METADATA +2 -1
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/RECORD +20 -7
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/WHEEL +0 -0
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/entry_points.txt +0 -0
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/licenses/LICENSE +0 -0
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/top_level.txt +0 -0
gptmed/data_preparation/text/preprocess_jsonl.py (new file)
@@ -0,0 +1,466 @@
+"""
+Comprehensive JSONL Preprocessing Pipeline
+
+Uses ALL available text preprocessing strategies in sequence:
+1. TextCleaner - Remove HTML, URLs, emails, normalize whitespace
+2. UnicodeNormalizer - Normalize unicode characters (NFC/NFD/NFKC/NFKD)
+3. CaseNormalizer - Normalize case (lowercase/uppercase/title/sentence)
+4. PunctuationHandler - Handle punctuation (remove or normalize spacing)
+5. StopwordRemover - Remove common stopwords (optional)
+6. TextStatistics - Track detailed statistics
+
+Pipeline applies strategies in order and tracks statistics for each step.
+
+Usage:
+    python3 preprocess_jsonl.py \
+        --input-file ./output/combined_text.jsonl \
+        --output-file ./output/combined_text_full_preprocessed.jsonl \
+        [--remove-stopwords] [--remove-punctuation] [--case-mode lowercase]
+"""
+
+import sys
+import json
+import logging
+import time
+from pathlib import Path
+from typing import Dict, List, Optional, Any
+from dataclasses import dataclass, asdict
+
+# Adjust path if running from workspace
+sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
+
+from gptmed.data_preparation.text import (
+    TextCleaner,
+    UnicodeNormalizer,
+    CaseNormalizer,
+    PunctuationHandler,
+    StopwordRemover,
+    TextStatistics,
+)
+
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class PreprocessingStepStats:
+    """Statistics for each preprocessing step"""
+    step_name: str
+    word_count_before: int
+    word_count_after: int
+    char_count_before: int
+    char_count_after: int
+    reduction_percentage: float
+
+
+@dataclass
+class FullPreprocessedRecord:
+    """Complete preprocessed record - only final text in output"""
+    filename: str
+    text: str
+
+    # Statistics summary
+    original_word_count: int
+    final_word_count: int
+    total_reduction_percentage: float
+    status: str
+
+
+class ComprehensiveJSONLPreprocessor:
+    """Complete preprocessing pipeline using ALL strategies"""
+
+    def __init__(
+        self,
+        input_file: str,
+        output_file: str = None,
+        case_mode: str = 'lower',
+        remove_stopwords: bool = False,
+        remove_punctuation: bool = False,
+        unicode_form: str = 'NFC',
+    ):
+        """
+        Initialize comprehensive preprocessor
+
+        Args:
+            input_file: Path to input JSONL file
+            output_file: Path to output JSONL file
+            case_mode: Case normalization mode (lower/upper/title/sentence)
+            remove_stopwords: Whether to remove stopwords
+            remove_punctuation: Whether to remove punctuation
+            unicode_form: Unicode normalization form (NFC/NFD/NFKC/NFKD)
+        """
+        self.input_file = Path(input_file)
+
+        if output_file is None:
+            base_name = self.input_file.stem
+            output_file = self.input_file.parent / f"{base_name}_full_preprocessed.jsonl"
+
+        self.output_file = Path(output_file)
+        self.case_mode = case_mode
+        self.remove_stopwords = remove_stopwords
+        self.remove_punctuation = remove_punctuation
+        self.unicode_form = unicode_form
+
+        self.logger = logging.getLogger(self.__class__.__name__)
+
+        # Initialize ALL preprocessing strategies
+        self.text_cleaner = TextCleaner()
+        self.unicode_normalizer = UnicodeNormalizer(form=unicode_form)
+        self.case_normalizer = CaseNormalizer(mode=case_mode)
+        self.punctuation_handler = PunctuationHandler(
+            remove=remove_punctuation,
+            normalize_spacing=True
+        )
+        self.stopword_remover = StopwordRemover()
+        self.text_stats = TextStatistics()
+
+    def load_jsonl(self) -> List[Dict[str, Any]]:
+        """Load JSONL file"""
+        if not self.input_file.exists():
+            raise FileNotFoundError(f"Input file not found: {self.input_file}")
+
+        records = []
+        with open(self.input_file, 'r', encoding='utf-8') as f:
+            for line_num, line in enumerate(f, 1):
+                try:
+                    record = json.loads(line.strip())
+                    records.append(record)
+                except json.JSONDecodeError as e:
+                    self.logger.warning(f"Line {line_num}: Invalid JSON - {str(e)}")
+
+        self.logger.info(f"Loaded {len(records)} records from {self.input_file.name}")
+        return records
+
+    def preprocess_record(self, record: Dict[str, Any]) -> FullPreprocessedRecord:
+        """
+        Apply all preprocessing steps to a single record
+
+        Args:
+            record: Input JSONL record
+
+        Returns:
+            FullPreprocessedRecord with all steps tracked
+        """
+        try:
+            filename = record.get('filename', 'unknown')
+            original_text = record.get('text', '')
+
+            if not original_text:
+                return FullPreprocessedRecord(
+                    filename=filename,
+                    text='',
+                    original_word_count=0,
+                    final_word_count=0,
+                    total_reduction_percentage=0.0,
+                    status='empty_text'
+                )
+
+            # Track original stats
+            original_words = len(original_text.split())
+
+            current_text = original_text
+
+            # STEP 1: Text Cleaning
+            self.logger.debug(f"Step 1: Text Cleaning")
+            self.text_cleaner.stats = {
+                'html_tags_removed': 0,
+                'urls_removed': 0,
+                'emails_removed': 0,
+                'whitespace_normalized': 0,
+            }
+            current_text = self.text_cleaner.process(current_text)
+
+            # STEP 2: Unicode Normalization
+            self.logger.debug(f"Step 2: Unicode Normalization ({self.unicode_form})")
+            current_text = self.unicode_normalizer.process(current_text)
+
+            # STEP 3: Case Normalization
+            self.logger.debug(f"Step 3: Case Normalization ({self.case_mode})")
+            current_text = self.case_normalizer.process(current_text)
+
+            # STEP 4: Punctuation Handling
+            self.logger.debug(f"Step 4: Punctuation Handling")
+            current_text = self.punctuation_handler.process(current_text)
+
+            # STEP 5: Stopword Removal (optional)
+            if self.remove_stopwords:
+                self.logger.debug(f"Step 5: Stopword Removal")
+                current_text = self.stopword_remover.process(current_text)
+
+            final_text = current_text
+
+            # Final statistics
+            final_words = len(final_text.split())
+            reduction_pct = (
+                (original_words - final_words) / original_words * 100
+                if original_words > 0 else 0.0
+            )
+
+            return FullPreprocessedRecord(
+                filename=filename,
+                text=final_text,
+                original_word_count=original_words,
+                final_word_count=final_words,
+                total_reduction_percentage=reduction_pct,
+                status='success'
+            )
+
+        except Exception as e:
+            self.logger.error(f"Error preprocessing {record.get('filename', 'unknown')}: {str(e)}")
+            return FullPreprocessedRecord(
+                filename=record.get('filename', 'unknown'),
+                text='',
+                original_word_count=0,
+                final_word_count=0,
+                total_reduction_percentage=0.0,
+                status=f'error: {str(e)}'
+            )
+
+    def save_jsonl(self, records: List[FullPreprocessedRecord]) -> bool:
+        """Save full preprocessed records to JSONL"""
+        try:
+            self.output_file.parent.mkdir(parents=True, exist_ok=True)
+
+            with open(self.output_file, 'w', encoding='utf-8') as f:
+                for record in records:
+                    json.dump(asdict(record), f, ensure_ascii=False)
+                    f.write('\n')
+
+            self.logger.info(f"✓ Saved {len(records)} fully preprocessed records to {self.output_file.name}")
+            return True
+
+        except Exception as e:
+            self.logger.error(f"Error saving JSONL: {str(e)}")
+            return False
+
+    def process(self) -> Dict[str, Any]:
+        """Main processing pipeline"""
+        start_time = time.time()
+
+        self.logger.info("="*70)
+        self.logger.info("Comprehensive JSONL Preprocessing Pipeline")
+        self.logger.info("="*70)
+        self.logger.info(f"Input file: {self.input_file}")
+        self.logger.info(f"Output file: {self.output_file}")
+        self.logger.info(f"\nPipeline configuration:")
+        self.logger.info(f"  1. TextCleaner (remove HTML, URLs, emails)")
+        self.logger.info(f"  2. UnicodeNormalizer ({self.unicode_form})")
+        self.logger.info(f"  3. CaseNormalizer ({self.case_mode})")
+        self.logger.info(f"  4. PunctuationHandler (remove={self.remove_punctuation})")
+        self.logger.info(f"  5. StopwordRemover (enabled={self.remove_stopwords})")
+        self.logger.info("="*70)
+
+        # Load records
+        records = self.load_jsonl()
+
+        if not records:
+            self.logger.warning("No records to process")
+            return {'status': 'failure', 'total_records': 0}
+
+        # Preprocess each record
+        self.logger.info(f"\nPreprocessing {len(records)} records...")
+        preprocessed_records = []
+
+        for i, record in enumerate(records, 1):
+            preprocessed = self.preprocess_record(record)
+            preprocessed_records.append(preprocessed)
+
+            if i % max(1, len(records) // 10) == 0 or i == len(records):
+                self.logger.info(f"Progress: {i}/{len(records)} records processed")
+
+        # Save results
+        self.logger.info(f"\nSaving preprocessed records...")
+        saved = self.save_jsonl(preprocessed_records)
+
+        # Calculate statistics
+        successful = [r for r in preprocessed_records if r.status == 'success']
+        failed = [r for r in preprocessed_records if r.status != 'success']
+
+        total_time = time.time() - start_time
+
+        # Statistics
+        original_words = sum(r.original_word_count for r in successful)
+        final_words = sum(r.final_word_count for r in successful)
+        avg_reduction = (
+            sum(r.total_reduction_percentage for r in successful) / len(successful)
+            if successful else 0.0
+        )
+
+        # Print summary
+        self.logger.info(f"\n" + "="*70)
+        self.logger.info(f"Preprocessing Summary")
+        self.logger.info(f"="*70)
+        self.logger.info(f"Total records: {len(records)}")
+        self.logger.info(f"Successfully preprocessed: {len(successful)}")
+        self.logger.info(f"Failed: {len(failed)}")
+        self.logger.info(f"\nWord Count Statistics:")
+        self.logger.info(f"  Original total words: {original_words:,}")
+        self.logger.info(f"  Final total words: {final_words:,}")
+        self.logger.info(f"  Average reduction: {avg_reduction:.1f}%")
+        self.logger.info(f"\nOutput: {self.output_file.name}")
+        self.logger.info(f"Total time: {total_time:.2f}s")
+        self.logger.info(f"="*70)
+
+        return {
+            'status': 'success' if saved else 'failure',
+            'input_file': str(self.input_file),
+            'output_file': str(self.output_file),
+            'total_records': len(records),
+            'successful': len(successful),
+            'failed': len(failed),
+            'original_word_count': original_words,
+            'final_word_count': final_words,
+            'average_reduction_percentage': avg_reduction,
+            'total_time': total_time,
+            'pipeline_steps': [
+                'TextCleaner',
+                'UnicodeNormalizer',
+                'CaseNormalizer',
+                'PunctuationHandler',
+                f"StopwordRemover (enabled={self.remove_stopwords})",
+            ],
+        }
+
+
+def show_pipeline_details():
+    """Show details about each preprocessing step"""
+    print(f"""
+{'='*70}
+COMPREHENSIVE PREPROCESSING PIPELINE DETAILS
+{'='*70}
+
+STEP 1: TextCleaner
+  ✓ Removes HTML tags: <b>text</b> → text
+  ✓ Removes URLs: http://example.com → removed
+  ✓ Removes email addresses: user@example.com → removed
+  ✓ Normalizes whitespace: multiple spaces → single space
+  Stats: html_tags_removed, urls_removed, emails_removed
+
+STEP 2: UnicodeNormalizer
+  ✓ NFC (canonical composition) - DEFAULT
+  ✓ NFD (canonical decomposition)
+  ✓ NFKC (compatibility composition)
+  ✓ NFKD (compatibility decomposition)
+  Handles: accents, emojis, special characters
+  Stats: characters_removed, form_used
+
+STEP 3: CaseNormalizer
+  ✓ lowercase - RECOMMENDED for models
+  ✓ UPPERCASE
+  ✓ Title Case
+  ✓ Sentence case
+  Stats: mode_used, conversions_applied
+
+STEP 4: PunctuationHandler
+  ✓ Remove or keep punctuation
+  ✓ Normalize spacing around punctuation
+  Stats: punctuation_removed, spacing_normalized
+
+STEP 5: StopwordRemover (Optional)
+  ✓ Remove common English stopwords (the, a, an, etc.)
+  ✓ Customizable stopword lists
+  Stats: stopwords_removed
+
+OUTPUT:
+  - Each record includes text at every step
+  - Full statistics for each step
+  - Track exactly how text changes through pipeline
+  - Ready for tokenization with preprocessed_text field
+
+EXAMPLE TRANSFORMATION:
+
+Original:
+  "Hello WORLD! 🌍 Visit https://example.com for <b>MORE</b> info!"
+
+Step 1 (TextCleaner):
+  "Hello WORLD! 🌍 Visit for MORE info!"
+
+Step 2 (UnicodeNormalizer - NFC):
+  "Hello WORLD! 🌍 Visit for MORE info!"
+
+Step 3 (CaseNormalizer - lowercase):
+  "hello world! 🌍 visit for more info!"
+
+Step 4 (PunctuationHandler):
+  "hello world 🌍 visit for more info"
+
+Step 5 (StopwordRemover - if enabled):
+  "hello world visit more info"
+
+Final: "hello world 🌍 visit for more info" (or without stopwords)
+
+READY FOR TOKENIZATION:
+  Use the 'final_preprocessed_text' field in JSONL for:
+  - Hugging Face Tokenizer
+  - SentencePiece
+  - Custom tokenizers
+  - Model training
+{'='*70}
+""")
+
+
+if __name__ == '__main__':
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description='Comprehensive JSONL preprocessing using ALL strategies'
+    )
+    parser.add_argument(
+        '--input-file',
+        default='./output/combined_text.jsonl',
+        help='Input JSONL file'
+    )
+    parser.add_argument(
+        '--output-file',
+        default=None,
+        help='Output JSONL file (auto-generated if not specified)'
+    )
+    parser.add_argument(
+        '--case-mode',
+        default='lower',
+        choices=['lower', 'upper', 'title', 'sentence'],
+        help='Case normalization mode (default: lower)'
+    )
+    parser.add_argument(
+        '--remove-stopwords',
+        action='store_true',
+        help='Remove common stopwords'
+    )
+    parser.add_argument(
+        '--remove-punctuation',
+        action='store_true',
+        help='Remove punctuation marks'
+    )
+    parser.add_argument(
+        '--unicode-form',
+        default='NFC',
+        choices=['NFC', 'NFD', 'NFKC', 'NFKD'],
+        help='Unicode normalization form (default: NFC)'
+    )
+    parser.add_argument(
+        '--show-pipeline',
+        action='store_true',
+        help='Show pipeline details and exit'
+    )
+
+    args = parser.parse_args()
+
+    if args.show_pipeline:
+        show_pipeline_details()
+    else:
+        processor = ComprehensiveJSONLPreprocessor(
+            input_file=args.input_file,
+            output_file=args.output_file,
+            case_mode=args.case_mode,
+            remove_stopwords=args.remove_stopwords,
+            remove_punctuation=args.remove_punctuation,
+            unicode_form=args.unicode_form,
+        )
+
+        result = processor.process()
+        sys.exit(0 if result['status'] == 'success' else 1)
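For orientation, here is a minimal sketch (not part of the diff) of driving the new preprocessor from Python rather than through the CLI shown in the module docstring. The class, keyword arguments, and result keys come from `preprocess_jsonl.py` above; the input path is only an illustrative placeholder.

```python
# Illustrative sketch, not shipped in the package: run the 0.7.0 preprocessor
# programmatically. Replace the placeholder input path with a real JSONL file.
from gptmed.data_preparation.text.preprocess_jsonl import ComprehensiveJSONLPreprocessor

processor = ComprehensiveJSONLPreprocessor(
    input_file="./output/combined_text.jsonl",  # placeholder path
    case_mode="lower",
    remove_stopwords=True,
)
result = processor.process()
print(result["successful"], result["failed"], result["average_reduction_percentage"])
```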
gptmed/data_preparation/text/punctuation_handler.py (new file)
@@ -0,0 +1,65 @@
+"""
+Punctuation handling strategy - removes or normalizes punctuation
+"""
+
+import re
+import string
+from typing import Any, Dict
+
+from .base_strategy import TextPreprocessingStrategy
+
+
+class PunctuationHandler(TextPreprocessingStrategy):
+    """
+    Handles punctuation removal or normalization
+    """
+
+    def __init__(self, remove: bool = False, normalize_spacing: bool = True):
+        """
+        Initialize punctuation handler
+
+        Args:
+            remove: If True, removes all punctuation; if False, only normalizes spacing
+            normalize_spacing: If True, normalizes spacing around punctuation
+        """
+        self.remove = remove
+        self.normalize_spacing = normalize_spacing
+        self.stats = {
+            'punctuation_removed': 0,
+            'spacing_normalized': 0,
+        }
+
+    def process(self, text: str) -> str:
+        """
+        Handle punctuation in text
+
+        Args:
+            text: Text to process
+
+        Returns:
+            Processed text
+        """
+        if self.remove:
+            # Remove all punctuation
+            original_len = len(text)
+            text = text.translate(str.maketrans('', '', string.punctuation))
+            self.stats['punctuation_removed'] += original_len - len(text)
+        else:
+            # Normalize spacing around punctuation
+            if self.normalize_spacing:
+                # Remove space before punctuation
+                original = text
+                text = re.sub(r'\s+([.!?,;:])', r'\1', text)
+                if text != original:
+                    self.stats['spacing_normalized'] += 1
+
+        return text
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get punctuation handling statistics"""
+        return self.stats.copy()
+
+    def reset_stats(self) -> None:
+        """Reset statistics"""
+        self.stats['punctuation_removed'] = 0
+        self.stats['spacing_normalized'] = 0
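A quick sketch (not part of the diff) of the two `PunctuationHandler` modes defined above; the sample strings are made up.

```python
# Illustrative sketch: PunctuationHandler in its two modes.
from gptmed.data_preparation.text import PunctuationHandler

spacer = PunctuationHandler(remove=False, normalize_spacing=True)
print(spacer.process("Hello , world !"))   # "Hello, world!"

stripper = PunctuationHandler(remove=True)
print(stripper.process("Hello, world!"))   # "Hello world"
print(stripper.get_stats())                # {'punctuation_removed': 2, 'spacing_normalized': 0}
```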
gptmed/data_preparation/text/stopword_remover.py (new file)
@@ -0,0 +1,87 @@
+"""
+Stopword removal strategy - removes common words
+"""
+
+from typing import Any, Dict, Set, Optional
+
+from .base_strategy import TextPreprocessingStrategy
+
+
+class StopwordRemover(TextPreprocessingStrategy):
+    """
+    Removes common English stopwords from text
+    """
+
+    def __init__(self, stopwords: Optional[Set[str]] = None, enable: bool = True):
+        """
+        Initialize stopword remover
+
+        Args:
+            stopwords: Custom set of stopwords; uses default if None
+            enable: Whether stopword removal is enabled
+        """
+        self.enable = enable
+
+        if stopwords is None:
+            self.stopwords = self._get_default_stopwords()
+        else:
+            self.stopwords = stopwords
+
+        self.stats = {
+            'stopwords_removed': 0,
+            'enabled': enable,
+        }
+
+    @staticmethod
+    def _get_default_stopwords() -> Set[str]:
+        """Get default English stopwords"""
+        return {
+            'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have',
+            'i', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you',
+            'do', 'at', 'this', 'but', 'his', 'by', 'from', 'is', 'was',
+            'are', 'been', 'were', 'or', 'an', 'which', 'their', 'what',
+            'so', 'up', 'out', 'if', 'about', 'who', 'get', 'them', 'me',
+            'my', 'your', 'we', 'us', 'they', 'them', 'these', 'those',
+        }
+
+    def process(self, text: str) -> str:
+        """
+        Remove stopwords from text
+
+        Args:
+            text: Text to process
+
+        Returns:
+            Text with stopwords removed (if enabled)
+        """
+        if not self.enable:
+            return text
+
+        words = text.split()
+        original_count = len(words)
+
+        filtered_words = [w for w in words if w not in self.stopwords]
+
+        self.stats['stopwords_removed'] += original_count - len(filtered_words)
+
+        return ' '.join(filtered_words)
+
+    def add_stopword(self, word: str) -> None:
+        """Add a custom stopword"""
+        self.stopwords.add(word.lower())
+
+    def remove_stopword(self, word: str) -> None:
+        """Remove a stopword"""
+        self.stopwords.discard(word.lower())
+
+    def set_stopwords(self, stopwords: Set[str]) -> None:
+        """Replace all stopwords with a new set"""
+        self.stopwords = stopwords
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get stopword removal statistics"""
+        return self.stats.copy()
+
+    def reset_stats(self) -> None:
+        """Reset statistics"""
+        self.stats['stopwords_removed'] = 0
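A quick sketch (not part of the diff) of `StopwordRemover` with its built-in default list plus a custom addition; the sample sentence is made up.

```python
# Illustrative sketch: default stopword list plus a custom addition.
from gptmed.data_preparation.text import StopwordRemover

remover = StopwordRemover()
print(remover.process("the patient is on the new medication"))  # "patient new medication"

remover.add_stopword("new")                   # extend the default set
print(remover.process("the new medication"))  # "medication"
print(remover.get_stats())                    # running 'stopwords_removed' count
```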
gptmed/data_preparation/text/text_cleaner.py (new file)
@@ -0,0 +1,74 @@
+"""
+Text cleaning strategy - removes HTML, URLs, emails, and special characters
+"""
+
+import re
+from typing import Any, Dict
+
+from .base_strategy import TextPreprocessingStrategy
+
+
+class TextCleaner(TextPreprocessingStrategy):
+    """
+    Removes HTML tags, URLs, email addresses, and normalizes whitespace
+
+    This is the first step in the preprocessing pipeline.
+    """
+
+    def __init__(self):
+        self.stats = {
+            'html_tags_removed': 0,
+            'urls_removed': 0,
+            'emails_removed': 0,
+            'whitespace_normalized': 0,
+        }
+
+    def process(self, text: str) -> str:
+        """
+        Clean text by removing artifacts
+
+        Args:
+            text: Raw text to clean
+
+        Returns:
+            Cleaned text
+        """
+        # Remove leading/trailing whitespace
+        text = text.strip()
+
+        # Remove HTML tags
+        html_pattern = r'<[^>]+>'
+        html_matches = len(re.findall(html_pattern, text))
+        text = re.sub(html_pattern, '', text)
+        self.stats['html_tags_removed'] += html_matches
+
+        # Remove URLs
+        url_pattern = r'http[s]?://\S+|www\.\S+'
+        url_matches = len(re.findall(url_pattern, text))
+        text = re.sub(url_pattern, '', text)
+        self.stats['urls_removed'] += url_matches
+
+        # Remove email addresses
+        email_pattern = r'\S+@\S+'
+        email_matches = len(re.findall(email_pattern, text))
+        text = re.sub(email_pattern, '', text)
+        self.stats['emails_removed'] += email_matches
+
+        # Remove extra whitespace
+        original_spaces = len(re.findall(r'\s+', text))
+        text = re.sub(r'\s+', ' ', text)
+        self.stats['whitespace_normalized'] += original_spaces - len(re.findall(r'\s+', text))
+
+        # Remove common control characters
+        text = ''.join(ch for ch in text if ch.isprintable() or ch.isspace())
+
+        return text.strip()
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get cleaning statistics"""
+        return self.stats.copy()
+
+    def reset_stats(self) -> None:
+        """Reset statistics"""
+        for key in self.stats:
+            self.stats[key] = 0
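Finally, a quick sketch (not part of the diff) of `TextCleaner` on a string containing an HTML tag pair, a URL, and an email address; the sample text is made up.

```python
# Illustrative sketch: TextCleaner strips HTML tags, URLs, and email addresses.
from gptmed.data_preparation.text import TextCleaner

cleaner = TextCleaner()
cleaned = cleaner.process("See <b>results</b> at https://example.com or mail admin@example.com")
print(cleaned)              # "See results at or mail"
print(cleaner.get_stats())  # includes html_tags_removed=2, urls_removed=1, emails_removed=1
```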