krira-augment 2.1.3__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,394 @@
1
+ """
2
+ Data Cleaner module for Krira Chunker V2.0.
3
+
4
+ Removes noise and normalizes text for downstream chunking operations.
5
+ This class applies regex-based filters to remove headers, footers,
6
+ and boilerplate text that would corrupt chunk quality.
7
+
8
+ Performance: O(1) memory usage regardless of file size with streaming support.
9
+ """
10
+
11
+ import re
12
+ import unicodedata
13
+ from dataclasses import dataclass, field
14
+ from typing import Generator, List, Optional, Pattern, Tuple
15
+
16
+
17
# =============================================================================
# REGEX PATTERNS
# Pre-compiled patterns for high performance
# =============================================================================

# === HEADER PATTERNS ===
# These patterns match common page headers found in documents
HEADER_PATTERNS: List[str] = [
    r'Page\s+\d+\s+of\s+\d+',  # "Page 1 of 10" - Standard page numbering
    r'Page\s+\d+',             # "Page 5" - Simple page numbering
    r'\d+\s*/\s*\d+',          # "5 / 10" - Fraction-style page numbers
]

# === FOOTER PATTERNS ===
# These patterns match common document footers
FOOTER_PATTERNS: List[str] = [
    r'©\s*\d{4}\s+[\w\s]+',    # "© 2024 Company Name" - Copyright with year
    r'Copyright\s+\d{4}',      # "Copyright 2024" - Alternative copyright format
    r'Confidential',           # "Confidential" - Common security footer
    r'All\s+Rights\s+Reserved',  # "All Rights Reserved" - Legal boilerplate
    r'CONFIDENTIAL',           # Uppercase variant
    r'PROPRIETARY[\s\w]*',     # Proprietary notices
]

# === PII PATTERNS ===
# Email pattern: Matches standard email format
# Format: local-part@domain.tld
# NOTE: the TLD class is [A-Za-z] — the original [A-Z|a-z] included a literal
# '|' inside the character class ('|' has no alternation meaning there), which
# wrongly accepted pipe characters in the top-level domain.
EMAIL_PATTERN = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

# Phone pattern: Matches various phone formats
# Supports: +1-555-123-4567, (555) 123-4567, 555.123.4567, etc.
PHONE_PATTERN = r'(\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
49
+
50
+
51
@dataclass
class CleaningConfig:
    """
    Configuration for DataCleaner.

    Controls noise removal, character normalization, privacy redaction,
    and the streaming buffer size. Invalid values raise at construction
    time via ``__post_init__``.

    Attributes:
        remove_headers: Strip 'Page X of Y' patterns commonly found in PDFs.
        remove_footers: Strip copyright notices and confidentiality statements.
        custom_patterns: User-defined regex patterns to remove.
        fix_unicode: Normalize Unicode (NFKC) to fix broken characters.
        normalize_whitespace: Convert multiple spaces/tabs to single space.
        preserve_line_breaks: Keep paragraph breaks (\\n\\n) intact.
        redact_pii: Mask emails and phone numbers with placeholders.
        chunk_buffer_size: Characters to process in each streaming buffer.
    """

    # --- Noise removal ---
    remove_headers: bool = True    # Strip "Page X of Y"-style header lines.
    remove_footers: bool = True    # Strip copyright/confidentiality boilerplate.
    custom_patterns: List[str] = field(default_factory=list)  # Extra regexes to delete.

    # --- Character normalization ---
    fix_unicode: bool = True            # Apply NFKC normalization (e.g. \u00a0 -> ' ').
    normalize_whitespace: bool = True   # Collapse runs of spaces/tabs to one space.
    preserve_line_breaks: bool = True   # Keep \n\n paragraph structure intact.

    # --- Privacy ---
    redact_pii: bool = False  # Mask emails/phones. WARNING: may impact data quality.

    # --- Performance ---
    chunk_buffer_size: int = 10_000  # Characters per buffer in streaming mode.

    def __post_init__(self) -> None:
        """Validate configuration parameters; reject non-positive buffer sizes."""
        if self.chunk_buffer_size > 0:
            return
        raise ValueError(
            f"chunk_buffer_size must be positive, got {self.chunk_buffer_size}"
        )
104
+
105
+
106
+ class DataCleaner:
107
+ """
108
+ Removes noise and normalizes text for downstream chunking.
109
+
110
+ This class applies regex-based filters to remove headers, footers,
111
+ and boilerplate text that would corrupt chunk quality.
112
+
113
+ Features:
114
+ - Unicode normalization (NFKC)
115
+ - Header/footer removal (Page X of Y, Copyright, etc.)
116
+ - Custom pattern matching
117
+ - PII redaction (email, phone)
118
+ - Whitespace normalization
119
+ - Streaming support for large files
120
+
121
+ Example:
122
+ >>> config = CleaningConfig(remove_headers=True, fix_unicode=True)
123
+ >>> cleaner = DataCleaner(config)
124
+ >>> cleaned = cleaner.clean_text("Page 1 of 10\\nActual content here")
125
+ >>> print(cleaned)
126
+ 'Actual content here'
127
+ """
128
+
129
+ # Placeholder strings for PII redaction
130
+ EMAIL_PLACEHOLDER = "<EMAIL>"
131
+ PHONE_PLACEHOLDER = "<PHONE>"
132
+
133
+ def __init__(self, config: CleaningConfig) -> None:
134
+ """
135
+ Initialize the cleaner with compiled regex patterns.
136
+
137
+ Args:
138
+ config: Configuration object controlling cleaning behavior.
139
+
140
+ Implementation Notes:
141
+ - Pre-compiles ALL regex patterns in __init__ for performance.
142
+ - Stores compiled patterns as instance variables.
143
+ - Validates config (raises ValueError if invalid).
144
+
145
+ Raises:
146
+ ValueError: If configuration is invalid.
147
+ TypeError: If config is not a CleaningConfig instance.
148
+ """
149
+ if not isinstance(config, CleaningConfig):
150
+ raise TypeError(
151
+ f"config must be CleaningConfig, got {type(config).__name__}"
152
+ )
153
+
154
+ self.config = config
155
+
156
+ # Pre-compile header patterns
157
+ self._header_patterns: List[Pattern[str]] = []
158
+ if config.remove_headers:
159
+ for pattern in HEADER_PATTERNS:
160
+ try:
161
+ self._header_patterns.append(
162
+ re.compile(pattern, re.IGNORECASE | re.MULTILINE)
163
+ )
164
+ except re.error as e:
165
+ raise ValueError(f"Invalid header pattern '{pattern}': {e}")
166
+
167
+ # Pre-compile footer patterns
168
+ self._footer_patterns: List[Pattern[str]] = []
169
+ if config.remove_footers:
170
+ for pattern in FOOTER_PATTERNS:
171
+ try:
172
+ self._footer_patterns.append(
173
+ re.compile(pattern, re.IGNORECASE | re.MULTILINE)
174
+ )
175
+ except re.error as e:
176
+ raise ValueError(f"Invalid footer pattern '{pattern}': {e}")
177
+
178
+ # Pre-compile custom patterns
179
+ self._custom_patterns: List[Pattern[str]] = []
180
+ for pattern in config.custom_patterns:
181
+ try:
182
+ self._custom_patterns.append(
183
+ re.compile(pattern, re.MULTILINE)
184
+ )
185
+ except re.error as e:
186
+ raise ValueError(f"Invalid custom pattern '{pattern}': {e}")
187
+
188
+ # Pre-compile PII patterns
189
+ self._email_pattern: Optional[Pattern[str]] = None
190
+ self._phone_pattern: Optional[Pattern[str]] = None
191
+ if config.redact_pii:
192
+ self._email_pattern = re.compile(EMAIL_PATTERN)
193
+ self._phone_pattern = re.compile(PHONE_PATTERN)
194
+
195
+ # Whitespace normalization pattern
196
+ # Matches 2+ spaces or tabs (not newlines)
197
+ self._multi_space_pattern = re.compile(r'[^\S\n]+')
198
+
199
+ # Multiple newline pattern (more than 2 consecutive)
200
+ self._multi_newline_pattern = re.compile(r'\n{3,}')
201
+
202
+ # Statistics tracking
203
+ self._stats = {
204
+ "bytes_cleaned": 0,
205
+ "patterns_removed": 0,
206
+ }
207
+
208
+ def clean_text(self, text: Optional[str]) -> str:
209
+ """
210
+ Apply all enabled cleaning filters to the input text.
211
+
212
+ Args:
213
+ text: Raw input string (can be empty or None).
214
+
215
+ Returns:
216
+ Cleaned string with noise removed.
217
+
218
+ Algorithm:
219
+ 1. If text is None or empty, return "".
220
+ 2. If fix_unicode: Apply unicodedata.normalize('NFKC', text).
221
+ 3. If remove_headers: Apply header removal regex.
222
+ 4. If remove_footers: Apply footer removal regex.
223
+ 5. Apply custom_patterns in order.
224
+ 6. If redact_pii: Mask emails and phones.
225
+ 7. If normalize_whitespace: Collapse multiple spaces.
226
+ 8. Return result.strip().
227
+
228
+ Edge Cases:
229
+ - Text with only whitespace should return "".
230
+ - Text with only headers/footers should return "".
231
+ - Unicode errors should NOT crash (uses errors='ignore').
232
+
233
+ Example:
234
+ >>> cleaner = DataCleaner(CleaningConfig())
235
+ >>> cleaner.clean_text("Page 1 of 5\\n\\nContent here")
236
+ 'Content here'
237
+ """
238
+ # Handle None or empty input
239
+ if text is None:
240
+ return ""
241
+
242
+ if not text:
243
+ return ""
244
+
245
+ original_len = len(text)
246
+ result = text
247
+ patterns_removed = 0
248
+
249
+ # Step 1: Unicode normalization
250
+ if self.config.fix_unicode:
251
+ try:
252
+ # NFKC: Compatibility decomposition followed by canonical composition
253
+ # Converts things like \u00a0 (non-breaking space) to regular space
254
+ result = unicodedata.normalize('NFKC', result)
255
+ except (TypeError, UnicodeError):
256
+ # If normalization fails, continue with original text
257
+ pass
258
+
259
+ # Step 2: Remove headers
260
+ if self.config.remove_headers:
261
+ for pattern in self._header_patterns:
262
+ result, count = pattern.subn('', result)
263
+ patterns_removed += count
264
+
265
+ # Step 3: Remove footers
266
+ if self.config.remove_footers:
267
+ for pattern in self._footer_patterns:
268
+ result, count = pattern.subn('', result)
269
+ patterns_removed += count
270
+
271
+ # Step 4: Apply custom patterns
272
+ for pattern in self._custom_patterns:
273
+ result, count = pattern.subn('', result)
274
+ patterns_removed += count
275
+
276
+ # Step 5: Redact PII
277
+ if self.config.redact_pii and self._email_pattern and self._phone_pattern:
278
+ result, email_count = self._email_pattern.subn(
279
+ self.EMAIL_PLACEHOLDER, result
280
+ )
281
+ result, phone_count = self._phone_pattern.subn(
282
+ self.PHONE_PLACEHOLDER, result
283
+ )
284
+ patterns_removed += email_count + phone_count
285
+
286
+ # Step 6: Normalize whitespace
287
+ if self.config.normalize_whitespace:
288
+ if self.config.preserve_line_breaks:
289
+ # Handle each line separately to preserve line breaks
290
+ lines = result.split('\n')
291
+ normalized_lines = []
292
+ for line in lines:
293
+ # Collapse multiple spaces/tabs to single space
294
+ normalized = self._multi_space_pattern.sub(' ', line)
295
+ normalized_lines.append(normalized.strip())
296
+ result = '\n'.join(normalized_lines)
297
+
298
+ # Collapse excessive newlines (more than 2) to double newline
299
+ result = self._multi_newline_pattern.sub('\n\n', result)
300
+ else:
301
+ # Convert all whitespace (including newlines) to single space
302
+ result = ' '.join(result.split())
303
+
304
+ # Final strip
305
+ result = result.strip()
306
+
307
+ # Update statistics
308
+ self._stats["bytes_cleaned"] += original_len
309
+ self._stats["patterns_removed"] += patterns_removed
310
+
311
+ return result
312
+
313
+ def clean_stream(
314
+ self,
315
+ text_stream: Generator[str, None, None]
316
+ ) -> Generator[str, None, None]:
317
+ """
318
+ Clean a stream of text chunks without loading all into memory.
319
+
320
+ Args:
321
+ text_stream: Generator yielding text strings.
322
+
323
+ Yields:
324
+ Cleaned text strings.
325
+
326
+ Implementation:
327
+ - Uses a sliding window buffer to handle patterns that span chunks.
328
+ - Buffer size controlled by config.chunk_buffer_size.
329
+ - Yields cleaned text as soon as buffer is processed.
330
+
331
+ Note:
332
+ For patterns that may span chunk boundaries (like multi-word
333
+ patterns), this method uses an overlap buffer to ensure
334
+ accurate detection.
335
+
336
+ Example:
337
+ >>> def text_generator():
338
+ ... yield "Page 1 of 10\\n"
339
+ ... yield "Content line 1\\n"
340
+ ... yield "Content line 2"
341
+ >>>
342
+ >>> cleaner = DataCleaner(CleaningConfig())
343
+ >>> for chunk in cleaner.clean_stream(text_generator()):
344
+ ... print(chunk)
345
+ """
346
+ buffer_size = self.config.chunk_buffer_size
347
+
348
+ # Overlap to handle patterns spanning chunk boundaries
349
+ # Use max pattern length estimate (100 chars should cover most cases)
350
+ overlap_size = min(100, buffer_size // 10)
351
+
352
+ buffer = ""
353
+
354
+ for chunk in text_stream:
355
+ if chunk is None:
356
+ continue
357
+
358
+ buffer += chunk
359
+
360
+ # Process when buffer is large enough
361
+ while len(buffer) >= buffer_size + overlap_size:
362
+ # Process the main portion
363
+ to_process = buffer[:buffer_size]
364
+ cleaned = self.clean_text(to_process)
365
+
366
+ if cleaned:
367
+ yield cleaned
368
+
369
+ # Keep overlap for next iteration (might contain split patterns)
370
+ buffer = buffer[buffer_size:]
371
+
372
+ # Process remaining buffer
373
+ if buffer:
374
+ cleaned = self.clean_text(buffer)
375
+ if cleaned:
376
+ yield cleaned
377
+
378
+ def get_stats(self) -> dict:
379
+ """
380
+ Return cleaning statistics.
381
+
382
+ Returns:
383
+ Dictionary with keys:
384
+ - 'bytes_cleaned': Total text bytes processed.
385
+ - 'patterns_removed': Count of regex pattern matches removed.
386
+ """
387
+ return dict(self._stats)
388
+
389
+ def reset_stats(self) -> None:
390
+ """Reset internal statistics counters."""
391
+ self._stats = {
392
+ "bytes_cleaned": 0,
393
+ "patterns_removed": 0,
394
+ }