krira-augment 2.1.3__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- krira_augment/__init__.py +515 -0
- krira_augment/_python/__init__.py +14 -0
- krira_augment/_python/cleaning.py +394 -0
- krira_augment/_python/pipeline.py +738 -0
- krira_augment/_python/transformation.py +551 -0
- krira_augment/_rust.cp313-win_amd64.pyd +0 -0
- krira_augment-2.1.3.dist-info/METADATA +722 -0
- krira_augment-2.1.3.dist-info/RECORD +10 -0
- krira_augment-2.1.3.dist-info/WHEEL +4 -0
- krira_augment-2.1.3.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,394 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data Cleaner module for Krira Chunker V2.0.
|
|
3
|
+
|
|
4
|
+
Removes noise and normalizes text for downstream chunking operations.
|
|
5
|
+
This class applies regex-based filters to remove headers, footers,
|
|
6
|
+
and boilerplate text that would corrupt chunk quality.
|
|
7
|
+
|
|
8
|
+
Performance: O(1) memory usage regardless of file size with streaming support.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
import unicodedata
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from typing import Generator, List, Optional, Pattern, Tuple
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# =============================================================================
# REGEX PATTERNS
# Raw pattern strings; DataCleaner.__init__ pre-compiles them once so the
# hot path (clean_text) never calls re.compile.
# =============================================================================

# === HEADER PATTERNS ===
# These patterns match common page headers found in documents.
HEADER_PATTERNS: List[str] = [
    r'Page\s+\d+\s+of\s+\d+',   # "Page 1 of 10" - Standard page numbering
    r'Page\s+\d+',              # "Page 5" - Simple page numbering
    r'\d+\s*/\s*\d+',           # "5 / 10" - Fraction-style page numbers
]

# === FOOTER PATTERNS ===
# These patterns match common document footers.
FOOTER_PATTERNS: List[str] = [
    r'©\s*\d{4}\s+[\w\s]+',     # "© 2024 Company Name" - Copyright with year
    r'Copyright\s+\d{4}',       # "Copyright 2024" - Alternative copyright format
    r'Confidential',            # "Confidential" - Common security footer
    r'All\s+Rights\s+Reserved', # "All Rights Reserved" - Legal boilerplate
    r'CONFIDENTIAL',            # Uppercase variant
    r'PROPRIETARY[\s\w]*',      # Proprietary notices
]

# === PII PATTERNS ===
# Email pattern: Matches standard email format (local-part@domain.tld).
# BUG FIX: the TLD class was previously '[A-Z|a-z]{2,}'. Inside a character
# class '|' is a literal pipe, not alternation, so strings such as 'a@b.x|'
# were wrongly accepted as emails. Corrected to '[A-Za-z]{2,}'.
EMAIL_PATTERN = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

# Phone pattern: Matches various phone formats.
# Supports: +1-555-123-4567, (555) 123-4567, 555.123.4567, etc.
PHONE_PATTERN = r'(\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
class CleaningConfig:
    """
    Configuration for DataCleaner.

    Controls every cleaning behaviour: noise removal, character
    normalization, privacy redaction, and streaming buffer size.

    Attributes:
        remove_headers: Strip 'Page X of Y' patterns commonly found in PDFs.
        remove_footers: Strip copyright notices and confidentiality statements.
        custom_patterns: User-defined regex patterns to remove.
        fix_unicode: Normalize Unicode (NFKC) to fix broken characters.
        normalize_whitespace: Convert multiple spaces/tabs to single space.
        preserve_line_breaks: Keep paragraph breaks (\\n\\n) intact.
        redact_pii: Mask emails and phone numbers with placeholders.
        chunk_buffer_size: Characters to process in each streaming buffer.
    """

    # --- Noise removal ---
    remove_headers: bool = True        # strip 'Page X of Y' style PDF headers
    remove_footers: bool = True        # strip copyright/confidentiality footers
    custom_patterns: List[str] = field(default_factory=list)  # extra regexes to delete

    # --- Character normalization ---
    fix_unicode: bool = True           # NFKC-normalize (e.g. \u00a0 -> regular space)
    normalize_whitespace: bool = True  # collapse runs of spaces/tabs to one space
    preserve_line_breaks: bool = True  # keep \n\n paragraph structure intact

    # --- Privacy ---
    redact_pii: bool = False           # mask emails/phones; may impact data quality

    # --- Performance ---
    chunk_buffer_size: int = 10_000    # characters per buffer in streaming mode

    def __post_init__(self) -> None:
        """Validate configuration parameters."""
        if self.chunk_buffer_size <= 0:
            raise ValueError(
                f"chunk_buffer_size must be positive, got {self.chunk_buffer_size}"
            )
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class DataCleaner:
    """
    Removes noise and normalizes text for downstream chunking.

    This class applies regex-based filters to remove headers, footers,
    and boilerplate text that would corrupt chunk quality.

    Features:
        - Unicode normalization (NFKC)
        - Header/footer removal (Page X of Y, Copyright, etc.)
        - Custom pattern matching
        - PII redaction (email, phone)
        - Whitespace normalization
        - Streaming support for large files

    Example:
        >>> config = CleaningConfig(remove_headers=True, fix_unicode=True)
        >>> cleaner = DataCleaner(config)
        >>> cleaner.clean_text("Page 1 of 10\\nActual content here")
        'Actual content here'
    """

    # Placeholder strings substituted for redacted PII.
    EMAIL_PLACEHOLDER = "<EMAIL>"
    PHONE_PLACEHOLDER = "<PHONE>"

    def __init__(self, config: CleaningConfig) -> None:
        """
        Initialize the cleaner with compiled regex patterns.

        All regexes are pre-compiled here so clean_text() performs no
        compilation on the hot path.

        Args:
            config: Configuration object controlling cleaning behavior.

        Raises:
            TypeError: If config is not a CleaningConfig instance.
            ValueError: If any header/footer/custom pattern fails to compile.
        """
        if not isinstance(config, CleaningConfig):
            raise TypeError(
                f"config must be CleaningConfig, got {type(config).__name__}"
            )

        self.config = config

        # Header/footer patterns are matched case-insensitively, line by line.
        ci_flags = re.IGNORECASE | re.MULTILINE
        self._header_patterns: List[Pattern[str]] = (
            self._compile_patterns(HEADER_PATTERNS, ci_flags, "header")
            if config.remove_headers
            else []
        )
        self._footer_patterns: List[Pattern[str]] = (
            self._compile_patterns(FOOTER_PATTERNS, ci_flags, "footer")
            if config.remove_footers
            else []
        )
        # Custom patterns stay case-sensitive: the user controls them.
        self._custom_patterns: List[Pattern[str]] = self._compile_patterns(
            config.custom_patterns, re.MULTILINE, "custom"
        )

        # PII patterns are compiled only when redaction is enabled.
        self._email_pattern: Optional[Pattern[str]] = None
        self._phone_pattern: Optional[Pattern[str]] = None
        if config.redact_pii:
            self._email_pattern = re.compile(EMAIL_PATTERN)
            self._phone_pattern = re.compile(PHONE_PATTERN)

        # Runs of horizontal whitespace (spaces/tabs — never newlines).
        self._multi_space_pattern = re.compile(r'[^\S\n]+')
        # Three or more consecutive newlines (collapsed to a paragraph break).
        self._multi_newline_pattern = re.compile(r'\n{3,}')

        # Statistics tracking. NOTE: "bytes_cleaned" actually counts
        # *characters* (len of str input); the key name is kept for
        # backward compatibility with existing get_stats() consumers.
        self._stats = {
            "bytes_cleaned": 0,
            "patterns_removed": 0,
        }

    @staticmethod
    def _compile_patterns(
        patterns: List[str], flags: int, label: str
    ) -> List[Pattern[str]]:
        """Compile *patterns* with *flags*; raise ValueError on a bad regex."""
        compiled: List[Pattern[str]] = []
        for pattern in patterns:
            try:
                compiled.append(re.compile(pattern, flags))
            except re.error as e:
                raise ValueError(f"Invalid {label} pattern '{pattern}': {e}")
        return compiled

    def clean_text(self, text: Optional[str]) -> str:
        """
        Apply all enabled cleaning filters to the input text.

        Args:
            text: Raw input string (may be empty or None).

        Returns:
            Cleaned string with noise removed; "" for None/empty input.

        Pipeline order:
            1. Unicode normalization (NFKC), if fix_unicode.
            2. Header removal, if remove_headers.
            3. Footer removal, if remove_footers.
            4. Custom patterns, in configured order.
            5. PII redaction, if redact_pii.
            6. Whitespace normalization, if normalize_whitespace.
            7. Final strip().

        Edge Cases:
            - Whitespace-only text returns "".
            - Text consisting solely of headers/footers returns "".
            - A failed Unicode normalization never raises; the original
              text is carried forward unchanged.

        Example:
            >>> cleaner = DataCleaner(CleaningConfig())
            >>> cleaner.clean_text("Page 1 of 5\\n\\nContent here")
            'Content here'
        """
        if not text:
            # Covers both None and the empty string.
            return ""

        original_len = len(text)
        result = text
        patterns_removed = 0

        # Step 1: Unicode normalization.
        if self.config.fix_unicode:
            try:
                # NFKC: compatibility decomposition + canonical composition;
                # converts e.g. \u00a0 (non-breaking space) to a regular space.
                result = unicodedata.normalize('NFKC', result)
            except (TypeError, UnicodeError):
                pass  # best effort: keep the un-normalized text

        # Steps 2-4: strip headers, footers, and custom patterns.
        # The lists are empty when the corresponding feature is disabled.
        for pattern in self._header_patterns:
            result, count = pattern.subn('', result)
            patterns_removed += count
        for pattern in self._footer_patterns:
            result, count = pattern.subn('', result)
            patterns_removed += count
        for pattern in self._custom_patterns:
            result, count = pattern.subn('', result)
            patterns_removed += count

        # Step 5: redact PII (patterns are non-None only when redact_pii).
        if self._email_pattern is not None:
            result, count = self._email_pattern.subn(
                self.EMAIL_PLACEHOLDER, result
            )
            patterns_removed += count
        if self._phone_pattern is not None:
            result, count = self._phone_pattern.subn(
                self.PHONE_PLACEHOLDER, result
            )
            patterns_removed += count

        # Step 6: normalize whitespace.
        if self.config.normalize_whitespace:
            result = self._normalize_whitespace(result)

        # Step 7: final strip.
        result = result.strip()

        # Update statistics.
        self._stats["bytes_cleaned"] += original_len
        self._stats["patterns_removed"] += patterns_removed

        return result

    def _normalize_whitespace(self, text: str) -> str:
        """Collapse whitespace, honouring config.preserve_line_breaks."""
        if self.config.preserve_line_breaks:
            # Normalize each line separately so \n structure survives.
            lines = [
                self._multi_space_pattern.sub(' ', line).strip()
                for line in text.split('\n')
            ]
            collapsed = '\n'.join(lines)
            # 3+ consecutive newlines -> exactly one blank line.
            return self._multi_newline_pattern.sub('\n\n', collapsed)
        # Flatten ALL whitespace (including newlines) to single spaces.
        return ' '.join(text.split())

    def clean_stream(
        self,
        text_stream: Generator[str, None, None]
    ) -> Generator[str, None, None]:
        """
        Clean a stream of text chunks without loading all into memory.

        Args:
            text_stream: Generator (or any iterable) yielding text strings.

        Yields:
            Cleaned, non-empty text strings.

        Implementation:
            Incoming text accumulates in a buffer of roughly
            config.chunk_buffer_size characters. When flushed, the buffer
            is split at the LAST newline inside the flush window so that
            line-oriented patterns (headers, footers, custom MULTILINE
            patterns) are never cut in half; the partial trailing line is
            carried over into the next flush.

            BUG FIX: the previous implementation reserved an "overlap"
            margin but then discarded it unprocessed and un-overlapped
            (buffer = buffer[buffer_size:]), so patterns spanning a buffer
            boundary could still be split. Cutting at a newline boundary
            fixes this for all line-oriented patterns.

        Example:
            >>> def text_generator():
            ...     yield "Page 1 of 10\\n"
            ...     yield "Content line 1\\n"
            ...     yield "Content line 2"
            >>>
            >>> cleaner = DataCleaner(CleaningConfig())
            >>> for chunk in cleaner.clean_stream(text_generator()):
            ...     print(chunk)
        """
        buffer_size = self.config.chunk_buffer_size
        buffer = ""

        for chunk in text_stream:
            if chunk is None:
                continue  # tolerate sloppy upstream generators

            buffer += chunk

            # Flush while the buffer is at least one window full.
            while len(buffer) >= buffer_size:
                # Prefer to cut just after the last newline in the window;
                # fall back to a hard cut when the window has no newline.
                cut = buffer.rfind('\n', 0, buffer_size)
                cut = buffer_size if cut == -1 else cut + 1

                cleaned = self.clean_text(buffer[:cut])
                if cleaned:
                    yield cleaned

                # Carry the (possibly partial) trailing line forward.
                buffer = buffer[cut:]

        # Process whatever remains after the stream is exhausted.
        if buffer:
            cleaned = self.clean_text(buffer)
            if cleaned:
                yield cleaned

    def get_stats(self) -> dict:
        """
        Return a copy of the cleaning statistics.

        Returns:
            Dictionary with keys:
            - 'bytes_cleaned': Total characters processed (key name kept
              for backward compatibility; this is len() of str input).
            - 'patterns_removed': Count of regex pattern matches removed.
        """
        return dict(self._stats)

    def reset_stats(self) -> None:
        """Reset internal statistics counters to zero."""
        self._stats = {
            "bytes_cleaned": 0,
            "patterns_removed": 0,
        }
|