gptmed 0.5.5__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,466 @@
+ """
+ Comprehensive JSONL Preprocessing Pipeline
+
+ Uses ALL available text preprocessing strategies in sequence:
+ 1. TextCleaner - Remove HTML, URLs, emails, normalize whitespace
+ 2. UnicodeNormalizer - Normalize unicode characters (NFC/NFD/NFKC/NFKD)
+ 3. CaseNormalizer - Normalize case (lowercase/uppercase/title/sentence)
+ 4. PunctuationHandler - Handle punctuation (remove or normalize spacing)
+ 5. StopwordRemover - Remove common stopwords (optional)
+ 6. TextStatistics - Track detailed statistics
+
+ Pipeline applies strategies in order and tracks statistics for each step.
+
+ Usage:
+     python3 preprocess_jsonl.py \
+         --input-file ./output/combined_text.jsonl \
+         --output-file ./output/combined_text_full_preprocessed.jsonl \
+         [--remove-stopwords] [--remove-punctuation] [--case-mode lower]
+ """
+
+ import sys
+ import json
+ import logging
+ import time
+ from pathlib import Path
+ from typing import Dict, List, Optional, Any
+ from dataclasses import dataclass, asdict
+
+ # Adjust path if running from workspace
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
+
+ from gptmed.data_preparation.text import (
+     TextCleaner,
+     UnicodeNormalizer,
+     CaseNormalizer,
+     PunctuationHandler,
+     StopwordRemover,
+     TextStatistics,
+ )
+
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class PreprocessingStepStats:
+     """Statistics for each preprocessing step"""
+     step_name: str
+     word_count_before: int
+     word_count_after: int
+     char_count_before: int
+     char_count_after: int
+     reduction_percentage: float
+
+
+ @dataclass
+ class FullPreprocessedRecord:
+     """Complete preprocessed record - only final text in output"""
+     filename: str
+     text: str
+
+     # Statistics summary
+     original_word_count: int
+     final_word_count: int
+     total_reduction_percentage: float
+     status: str
+
+
+ class ComprehensiveJSONLPreprocessor:
+     """Complete preprocessing pipeline using ALL strategies"""
+
+     def __init__(
+         self,
+         input_file: str,
+         output_file: Optional[str] = None,
+         case_mode: str = 'lower',
+         remove_stopwords: bool = False,
+         remove_punctuation: bool = False,
+         unicode_form: str = 'NFC',
+     ):
+         """
+         Initialize comprehensive preprocessor
+
+         Args:
+             input_file: Path to input JSONL file
+             output_file: Path to output JSONL file (auto-generated if None)
+             case_mode: Case normalization mode (lower/upper/title/sentence)
+             remove_stopwords: Whether to remove stopwords
+             remove_punctuation: Whether to remove punctuation
+             unicode_form: Unicode normalization form (NFC/NFD/NFKC/NFKD)
+         """
+         self.input_file = Path(input_file)
+
+         if output_file is None:
+             base_name = self.input_file.stem
+             output_file = self.input_file.parent / f"{base_name}_full_preprocessed.jsonl"
+
+         self.output_file = Path(output_file)
+         self.case_mode = case_mode
+         self.remove_stopwords = remove_stopwords
+         self.remove_punctuation = remove_punctuation
+         self.unicode_form = unicode_form
+
+         self.logger = logging.getLogger(self.__class__.__name__)
+
+         # Initialize ALL preprocessing strategies
+         self.text_cleaner = TextCleaner()
+         self.unicode_normalizer = UnicodeNormalizer(form=unicode_form)
+         self.case_normalizer = CaseNormalizer(mode=case_mode)
+         self.punctuation_handler = PunctuationHandler(
+             remove=remove_punctuation,
+             normalize_spacing=True
+         )
+         self.stopword_remover = StopwordRemover()
+         self.text_stats = TextStatistics()
+
+     def load_jsonl(self) -> List[Dict[str, Any]]:
+         """Load JSONL file"""
+         if not self.input_file.exists():
+             raise FileNotFoundError(f"Input file not found: {self.input_file}")
+
+         records = []
+         with open(self.input_file, 'r', encoding='utf-8') as f:
+             for line_num, line in enumerate(f, 1):
+                 if not line.strip():
+                     # Skip blank lines instead of logging them as invalid JSON
+                     continue
+                 try:
+                     record = json.loads(line.strip())
+                     records.append(record)
+                 except json.JSONDecodeError as e:
+                     self.logger.warning(f"Line {line_num}: Invalid JSON - {str(e)}")
+
+         self.logger.info(f"Loaded {len(records)} records from {self.input_file.name}")
+         return records
+
+     def preprocess_record(self, record: Dict[str, Any]) -> FullPreprocessedRecord:
+         """
+         Apply all preprocessing steps to a single record
+
+         Args:
+             record: Input JSONL record
+
+         Returns:
+             FullPreprocessedRecord with the final text and summary statistics
+         """
+         try:
+             filename = record.get('filename', 'unknown')
+             original_text = record.get('text', '')
+
+             if not original_text:
+                 return FullPreprocessedRecord(
+                     filename=filename,
+                     text='',
+                     original_word_count=0,
+                     final_word_count=0,
+                     total_reduction_percentage=0.0,
+                     status='empty_text'
+                 )
+
+             # Track original stats
+             original_words = len(original_text.split())
+
+             current_text = original_text
+
+             # STEP 1: Text Cleaning
+             self.logger.debug("Step 1: Text Cleaning")
+             self.text_cleaner.reset_stats()
+             current_text = self.text_cleaner.process(current_text)
+
+             # STEP 2: Unicode Normalization
+             self.logger.debug(f"Step 2: Unicode Normalization ({self.unicode_form})")
+             current_text = self.unicode_normalizer.process(current_text)
+
+             # STEP 3: Case Normalization
+             self.logger.debug(f"Step 3: Case Normalization ({self.case_mode})")
+             current_text = self.case_normalizer.process(current_text)
+
+             # STEP 4: Punctuation Handling
+             self.logger.debug("Step 4: Punctuation Handling")
+             current_text = self.punctuation_handler.process(current_text)
+
+             # STEP 5: Stopword Removal (optional)
+             if self.remove_stopwords:
+                 self.logger.debug("Step 5: Stopword Removal")
+                 current_text = self.stopword_remover.process(current_text)
+
+             final_text = current_text
+
+             # Final statistics
+             final_words = len(final_text.split())
+             reduction_pct = (
+                 (original_words - final_words) / original_words * 100
+                 if original_words > 0 else 0.0
+             )
+
+             return FullPreprocessedRecord(
+                 filename=filename,
+                 text=final_text,
+                 original_word_count=original_words,
+                 final_word_count=final_words,
+                 total_reduction_percentage=reduction_pct,
+                 status='success'
+             )
+
+         except Exception as e:
+             self.logger.error(f"Error preprocessing {record.get('filename', 'unknown')}: {str(e)}")
+             return FullPreprocessedRecord(
+                 filename=record.get('filename', 'unknown'),
+                 text='',
+                 original_word_count=0,
+                 final_word_count=0,
+                 total_reduction_percentage=0.0,
+                 status=f'error: {str(e)}'
+             )
+
+     def save_jsonl(self, records: List[FullPreprocessedRecord]) -> bool:
+         """Save full preprocessed records to JSONL"""
+         try:
+             self.output_file.parent.mkdir(parents=True, exist_ok=True)
+
+             with open(self.output_file, 'w', encoding='utf-8') as f:
+                 for record in records:
+                     json.dump(asdict(record), f, ensure_ascii=False)
+                     f.write('\n')
+
+             self.logger.info(f"✓ Saved {len(records)} fully preprocessed records to {self.output_file.name}")
+             return True
+
+         except Exception as e:
+             self.logger.error(f"Error saving JSONL: {str(e)}")
+             return False
+
+     def process(self) -> Dict[str, Any]:
+         """Main processing pipeline"""
+         start_time = time.time()
+
+         self.logger.info("="*70)
+         self.logger.info("Comprehensive JSONL Preprocessing Pipeline")
+         self.logger.info("="*70)
+         self.logger.info(f"Input file: {self.input_file}")
+         self.logger.info(f"Output file: {self.output_file}")
+         self.logger.info("\nPipeline configuration:")
+         self.logger.info(" 1. TextCleaner (remove HTML, URLs, emails)")
+         self.logger.info(f" 2. UnicodeNormalizer ({self.unicode_form})")
+         self.logger.info(f" 3. CaseNormalizer ({self.case_mode})")
+         self.logger.info(f" 4. PunctuationHandler (remove={self.remove_punctuation})")
+         self.logger.info(f" 5. StopwordRemover (enabled={self.remove_stopwords})")
+         self.logger.info("="*70)
+
+         # Load records
+         records = self.load_jsonl()
+
+         if not records:
+             self.logger.warning("No records to process")
+             return {'status': 'failure', 'total_records': 0}
+
+         # Preprocess each record
+         self.logger.info(f"\nPreprocessing {len(records)} records...")
+         preprocessed_records = []
+
+         for i, record in enumerate(records, 1):
+             preprocessed = self.preprocess_record(record)
+             preprocessed_records.append(preprocessed)
+
+             if i % max(1, len(records) // 10) == 0 or i == len(records):
+                 self.logger.info(f"Progress: {i}/{len(records)} records processed")
+
+         # Save results
+         self.logger.info("\nSaving preprocessed records...")
+         saved = self.save_jsonl(preprocessed_records)
+
+         # Calculate statistics
+         successful = [r for r in preprocessed_records if r.status == 'success']
+         failed = [r for r in preprocessed_records if r.status != 'success']
+
+         total_time = time.time() - start_time
+
+         # Statistics
+         original_words = sum(r.original_word_count for r in successful)
+         final_words = sum(r.final_word_count for r in successful)
+         avg_reduction = (
+             sum(r.total_reduction_percentage for r in successful) / len(successful)
+             if successful else 0.0
+         )
+
+         # Print summary
+         self.logger.info("\n" + "="*70)
+         self.logger.info("Preprocessing Summary")
+         self.logger.info("="*70)
+         self.logger.info(f"Total records: {len(records)}")
+         self.logger.info(f"Successfully preprocessed: {len(successful)}")
+         self.logger.info(f"Failed: {len(failed)}")
+         self.logger.info("\nWord Count Statistics:")
+         self.logger.info(f" Original total words: {original_words:,}")
+         self.logger.info(f" Final total words: {final_words:,}")
+         self.logger.info(f" Average reduction: {avg_reduction:.1f}%")
+         self.logger.info(f"\nOutput: {self.output_file.name}")
+         self.logger.info(f"Total time: {total_time:.2f}s")
+         self.logger.info("="*70)
+
+         return {
+             'status': 'success' if saved else 'failure',
+             'input_file': str(self.input_file),
+             'output_file': str(self.output_file),
+             'total_records': len(records),
+             'successful': len(successful),
+             'failed': len(failed),
+             'original_word_count': original_words,
+             'final_word_count': final_words,
+             'average_reduction_percentage': avg_reduction,
+             'total_time': total_time,
+             'pipeline_steps': [
+                 'TextCleaner',
+                 'UnicodeNormalizer',
+                 'CaseNormalizer',
+                 'PunctuationHandler',
+                 f"StopwordRemover (enabled={self.remove_stopwords})",
+             ],
+         }
+
+
+ def show_pipeline_details():
+     """Show details about each preprocessing step"""
+     print(f"""
+ {'='*70}
+ COMPREHENSIVE PREPROCESSING PIPELINE DETAILS
+ {'='*70}
+
+ STEP 1: TextCleaner
+   ✓ Removes HTML tags: <b>text</b> → text
+   ✓ Removes URLs: http://example.com → removed
+   ✓ Removes email addresses: user@example.com → removed
+   ✓ Normalizes whitespace: multiple spaces → single space
+   Stats: html_tags_removed, urls_removed, emails_removed
+
+ STEP 2: UnicodeNormalizer
+   ✓ NFC (canonical composition) - DEFAULT
+   ✓ NFD (canonical decomposition)
+   ✓ NFKC (compatibility composition)
+   ✓ NFKD (compatibility decomposition)
+   Handles: accents, emojis, special characters
+   Stats: characters_removed, form_used
+
+ STEP 3: CaseNormalizer
+   ✓ lowercase - RECOMMENDED for models
+   ✓ UPPERCASE
+   ✓ Title Case
+   ✓ Sentence case
+   Stats: mode_used, conversions_applied
+
+ STEP 4: PunctuationHandler
+   ✓ Remove or keep punctuation
+   ✓ Normalize spacing around punctuation
+   Stats: punctuation_removed, spacing_normalized
+
+ STEP 5: StopwordRemover (Optional)
+   ✓ Remove common English stopwords (the, a, an, etc.)
+   ✓ Customizable stopword lists
+   Stats: stopwords_removed
+
+ OUTPUT:
+   - Each record keeps the filename and the final preprocessed text ('text' field)
+   - Summary statistics: original/final word counts, reduction percentage, status
+   - Ready for tokenization straight from the 'text' field
+
+ EXAMPLE TRANSFORMATION:
+
+ Original:
+   "Hello WORLD! 🌍 Visit https://example.com for <b>MORE</b> info!"
+
+ Step 1 (TextCleaner):
+   "Hello WORLD! 🌍 Visit for MORE info!"
+
+ Step 2 (UnicodeNormalizer - NFC):
+   "Hello WORLD! 🌍 Visit for MORE info!"
+
+ Step 3 (CaseNormalizer - lowercase):
+   "hello world! 🌍 visit for more info!"
+
+ Step 4 (PunctuationHandler - with --remove-punctuation):
+   "hello world 🌍 visit for more info"
+
+ Step 5 (StopwordRemover - if enabled):
+   "hello world 🌍 visit more info"
+
+ Final: "hello world 🌍 visit for more info" (or "hello world 🌍 visit more info" with --remove-stopwords)
+
+ READY FOR TOKENIZATION:
+ Use the 'text' field in the output JSONL for:
+   - Hugging Face Tokenizer
+   - SentencePiece
+   - Custom tokenizers
+   - Model training
+ {'='*70}
+ """)
+
+
+ if __name__ == '__main__':
+     import argparse
+
+     parser = argparse.ArgumentParser(
+         description='Comprehensive JSONL preprocessing using ALL strategies'
+     )
+     parser.add_argument(
+         '--input-file',
+         default='./output/combined_text.jsonl',
+         help='Input JSONL file'
+     )
+     parser.add_argument(
+         '--output-file',
+         default=None,
+         help='Output JSONL file (auto-generated if not specified)'
+     )
+     parser.add_argument(
+         '--case-mode',
+         default='lower',
+         choices=['lower', 'upper', 'title', 'sentence'],
+         help='Case normalization mode (default: lower)'
+     )
+     parser.add_argument(
+         '--remove-stopwords',
+         action='store_true',
+         help='Remove common stopwords'
+     )
+     parser.add_argument(
+         '--remove-punctuation',
+         action='store_true',
+         help='Remove punctuation marks'
+     )
+     parser.add_argument(
+         '--unicode-form',
+         default='NFC',
+         choices=['NFC', 'NFD', 'NFKC', 'NFKD'],
+         help='Unicode normalization form (default: NFC)'
+     )
+     parser.add_argument(
+         '--show-pipeline',
+         action='store_true',
+         help='Show pipeline details and exit'
+     )
+
+     args = parser.parse_args()
+
+     if args.show_pipeline:
+         show_pipeline_details()
+     else:
+         processor = ComprehensiveJSONLPreprocessor(
+             input_file=args.input_file,
+             output_file=args.output_file,
+             case_mode=args.case_mode,
+             remove_stopwords=args.remove_stopwords,
+             remove_punctuation=args.remove_punctuation,
+             unicode_form=args.unicode_form,
+         )
+
+         result = processor.process()
+         sys.exit(0 if result['status'] == 'success' else 1)
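For orientation, a minimal sketch of driving the pipeline above from Python instead of the CLI. It is not part of the package diff: the `from preprocess_jsonl import ...` path is hypothetical (the wheel layout is not shown here), the sample filename is invented, and the exact cleaned text depends on the UnicodeNormalizer and CaseNormalizer implementations, which this diff does not include.

import json

from preprocess_jsonl import ComprehensiveJSONLPreprocessor  # hypothetical module path

# Input records are one JSON object per line with 'filename' and 'text' keys
with open('./output/combined_text.jsonl', 'w', encoding='utf-8') as f:
    f.write(json.dumps({
        'filename': 'note_001.txt',  # illustrative value
        'text': 'Visit https://example.com for <b>MORE</b> info!',
    }) + '\n')

processor = ComprehensiveJSONLPreprocessor(
    input_file='./output/combined_text.jsonl',
    case_mode='lower',
    remove_punctuation=True,
)
summary = processor.process()
print(summary['successful'], summary['average_reduction_percentage'])

# Each output line is asdict(FullPreprocessedRecord); with the steps behaving
# as described above it would look roughly like:
# {"filename": "note_001.txt", "text": "visit for more info",
#  "original_word_count": 5, "final_word_count": 4,
#  "total_reduction_percentage": 20.0, "status": "success"}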
@@ -0,0 +1,65 @@
+ """
+ Punctuation handling strategy - removes or normalizes punctuation
+ """
+
+ import re
+ import string
+ from typing import Any, Dict
+
+ from .base_strategy import TextPreprocessingStrategy
+
+
+ class PunctuationHandler(TextPreprocessingStrategy):
+     """
+     Handles punctuation removal or normalization
+     """
+
+     def __init__(self, remove: bool = False, normalize_spacing: bool = True):
+         """
+         Initialize punctuation handler
+
+         Args:
+             remove: If True, removes all punctuation; if False, only normalizes spacing
+             normalize_spacing: If True, normalizes spacing around punctuation
+         """
+         self.remove = remove
+         self.normalize_spacing = normalize_spacing
+         self.stats = {
+             'punctuation_removed': 0,
+             'spacing_normalized': 0,
+         }
+
+     def process(self, text: str) -> str:
+         """
+         Handle punctuation in text
+
+         Args:
+             text: Text to process
+
+         Returns:
+             Processed text
+         """
+         if self.remove:
+             # Remove all punctuation
+             original_len = len(text)
+             text = text.translate(str.maketrans('', '', string.punctuation))
+             self.stats['punctuation_removed'] += original_len - len(text)
+         else:
+             # Normalize spacing around punctuation
+             if self.normalize_spacing:
+                 # Remove space before punctuation
+                 original = text
+                 text = re.sub(r'\s+([.!?,;:])', r'\1', text)
+                 if text != original:
+                     self.stats['spacing_normalized'] += 1
+
+         return text
+
+     def get_stats(self) -> Dict[str, Any]:
+         """Get punctuation handling statistics"""
+         return self.stats.copy()
+
+     def reset_stats(self) -> None:
+         """Reset statistics"""
+         self.stats['punctuation_removed'] = 0
+         self.stats['spacing_normalized'] = 0
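A minimal standalone sketch of PunctuationHandler (not part of the diff), using the import path that preprocess_jsonl.py uses for the same class:

from gptmed.data_preparation.text import PunctuationHandler

# Default: keep punctuation, but drop spaces that precede it
handler = PunctuationHandler()
print(handler.process("Hello , world !"))  # -> "Hello, world!"

# remove=True strips punctuation entirely via str.translate
stripper = PunctuationHandler(remove=True)
print(stripper.process("Hello, world!"))   # -> "Hello world"
print(stripper.get_stats())                # {'punctuation_removed': 2, 'spacing_normalized': 0}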
@@ -0,0 +1,87 @@
+ """
+ Stopword removal strategy - removes common words
+ """
+
+ from typing import Any, Dict, Set, Optional
+
+ from .base_strategy import TextPreprocessingStrategy
+
+
+ class StopwordRemover(TextPreprocessingStrategy):
+     """
+     Removes common English stopwords from text
+     """
+
+     def __init__(self, stopwords: Optional[Set[str]] = None, enable: bool = True):
+         """
+         Initialize stopword remover
+
+         Args:
+             stopwords: Custom set of stopwords; uses default if None
+             enable: Whether stopword removal is enabled
+         """
+         self.enable = enable
+
+         if stopwords is None:
+             self.stopwords = self._get_default_stopwords()
+         else:
+             self.stopwords = stopwords
+
+         self.stats = {
+             'stopwords_removed': 0,
+             'enabled': enable,
+         }
+
+     @staticmethod
+     def _get_default_stopwords() -> Set[str]:
+         """Get default English stopwords"""
+         return {
+             'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have',
+             'i', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you',
+             'do', 'at', 'this', 'but', 'his', 'by', 'from', 'is', 'was',
+             'are', 'been', 'were', 'or', 'an', 'which', 'their', 'what',
+             'so', 'up', 'out', 'if', 'about', 'who', 'get', 'them', 'me',
+             'my', 'your', 'we', 'us', 'they', 'these', 'those',
+         }
+
+     def process(self, text: str) -> str:
+         """
+         Remove stopwords from text
+
+         Args:
+             text: Text to process
+
+         Returns:
+             Text with stopwords removed (if enabled)
+         """
+         if not self.enable:
+             return text
+
+         words = text.split()
+         original_count = len(words)
+
+         filtered_words = [w for w in words if w not in self.stopwords]
+
+         self.stats['stopwords_removed'] += original_count - len(filtered_words)
+
+         return ' '.join(filtered_words)
+
+     def add_stopword(self, word: str) -> None:
+         """Add a custom stopword"""
+         self.stopwords.add(word.lower())
+
+     def remove_stopword(self, word: str) -> None:
+         """Remove a stopword"""
+         self.stopwords.discard(word.lower())
+
+     def set_stopwords(self, stopwords: Set[str]) -> None:
+         """Replace all stopwords with a new set"""
+         self.stopwords = stopwords
+
+     def get_stats(self) -> Dict[str, Any]:
+         """Get stopword removal statistics"""
+         return self.stats.copy()
+
+     def reset_stats(self) -> None:
+         """Reset statistics"""
+         self.stats['stopwords_removed'] = 0
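A minimal sketch of customizing StopwordRemover (not part of the diff), again importing from the package path used by the pipeline script; since matching is an exact set lookup, it expects already lowercased, punctuation-free tokens:

from gptmed.data_preparation.text import StopwordRemover

remover = StopwordRemover()
print(remover.process("the model is ready for training"))  # -> "model ready training"

# Domain-specific adjustments
remover.add_stopword("patient")
remover.remove_stopword("not")
print(remover.process("the patient is not ready"))         # -> "not ready"
print(remover.get_stats())                                  # running 'stopwords_removed' total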
@@ -0,0 +1,74 @@
+ """
+ Text cleaning strategy - removes HTML, URLs, emails, and special characters
+ """
+
+ import re
+ from typing import Any, Dict
+
+ from .base_strategy import TextPreprocessingStrategy
+
+
+ class TextCleaner(TextPreprocessingStrategy):
+     """
+     Removes HTML tags, URLs, email addresses, and normalizes whitespace
+
+     This is the first step in the preprocessing pipeline.
+     """
+
+     def __init__(self):
+         self.stats = {
+             'html_tags_removed': 0,
+             'urls_removed': 0,
+             'emails_removed': 0,
+             'whitespace_normalized': 0,
+         }
+
+     def process(self, text: str) -> str:
+         """
+         Clean text by removing artifacts
+
+         Args:
+             text: Raw text to clean
+
+         Returns:
+             Cleaned text
+         """
+         # Remove leading/trailing whitespace
+         text = text.strip()
+
+         # Remove HTML tags
+         html_pattern = r'<[^>]+>'
+         html_matches = len(re.findall(html_pattern, text))
+         text = re.sub(html_pattern, '', text)
+         self.stats['html_tags_removed'] += html_matches
+
+         # Remove URLs
+         url_pattern = r'http[s]?://\S+|www\.\S+'
+         url_matches = len(re.findall(url_pattern, text))
+         text = re.sub(url_pattern, '', text)
+         self.stats['urls_removed'] += url_matches
+
+         # Remove email addresses
+         email_pattern = r'\S+@\S+'
+         email_matches = len(re.findall(email_pattern, text))
+         text = re.sub(email_pattern, '', text)
+         self.stats['emails_removed'] += email_matches
+
+         # Collapse runs of whitespace and count the characters dropped
+         extra_whitespace = sum(len(match) - 1 for match in re.findall(r'\s+', text))
+         text = re.sub(r'\s+', ' ', text)
+         self.stats['whitespace_normalized'] += extra_whitespace
+
+         # Remove common control characters
+         text = ''.join(ch for ch in text if ch.isprintable() or ch.isspace())
+
+         return text.strip()
+
+     def get_stats(self) -> Dict[str, Any]:
+         """Get cleaning statistics"""
+         return self.stats.copy()
+
+     def reset_stats(self) -> None:
+         """Reset statistics"""
+         for key in self.stats:
+             self.stats[key] = 0
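Finally, a minimal sketch of TextCleaner in isolation (not part of the diff), using the same package import as the pipeline script:

from gptmed.data_preparation.text import TextCleaner

cleaner = TextCleaner()
cleaned = cleaner.process("Contact   <b>us</b> at info@example.com or https://example.com today")
print(cleaned)              # -> "Contact us at or today"
print(cleaner.get_stats())  # html_tags_removed=2, urls_removed=1, emails_removed=1, plus whitespace count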