krira_augment-2.1.3-cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- krira_augment/__init__.py +515 -0
- krira_augment/_python/__init__.py +14 -0
- krira_augment/_python/cleaning.py +394 -0
- krira_augment/_python/pipeline.py +738 -0
- krira_augment/_python/transformation.py +551 -0
- krira_augment/_rust.cp313-win_amd64.pyd +0 -0
- krira_augment-2.1.3.dist-info/METADATA +722 -0
- krira_augment-2.1.3.dist-info/RECORD +10 -0
- krira_augment-2.1.3.dist-info/WHEEL +4 -0
- krira_augment-2.1.3.dist-info/licenses/LICENSE +21 -0
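The diff below adds krira_augment/_python/pipeline.py, whose docstrings describe the intended entry point: build a PipelineConfig, wrap it in a KriraPipeline, and iterate over the chunk dictionaries it yields. For orientation, here is a minimal sketch assembled from those docstring examples; the JSONL output file and the final stats printout are illustrative assumptions, not behavior documented by the package.

import json

from krira_augment import CleaningConfig, KriraPipeline, PipelineConfig, TransformConfig

cfg = PipelineConfig(
    cleaning_config=CleaningConfig(remove_headers=True),
    transform_config=TransformConfig(output_format="markdown"),
)
pipeline = KriraPipeline(cfg)

# Each yielded chunk is a dict with "text" and "metadata"; write them out as JSONL.
# (Writing to chunks.jsonl is an assumption for the example, not part of the package.)
with open("chunks.jsonl", "w", encoding="utf-8") as out:
    for chunk in pipeline.process_file("data.csv"):
        out.write(json.dumps(chunk, ensure_ascii=False) + "\n")

# Counters documented by KriraPipeline.get_stats().
print(pipeline.get_stats())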
krira_augment/_python/pipeline.py (new file, 738 added lines)
"""
Pipeline Orchestrator for Krira Chunker V2.0.

Orchestrates the Clean -> Transform -> Chunk workflow.
This is the main entry point for users processing CSV and XLSX files.

Performance Requirements:
- Must process 1GB files in under 10 minutes on a 4-core CPU.
- Memory usage must remain constant O(1) regardless of file size.
- Uses streaming architecture (generators) throughout.
"""

import csv
import logging
import os
import sys
from dataclasses import dataclass, field
from io import StringIO
from pathlib import Path
from typing import Any, Dict, Generator, Iterator, List, Optional, Tuple, Union

# Import sibling modules
from .cleaning import CleaningConfig, DataCleaner
from .transformation import DataTransformer, TransformConfig


# Setup logger
LOGGER = logging.getLogger("krira_augment.pipeline")


# =============================================================================
# Configuration
# =============================================================================

@dataclass
class PipelineConfig:
    """
    Master configuration for the full Clean -> Transform -> Chunk pipeline.

    Combines CleaningConfig, TransformConfig, and chunk settings into a
    single configuration object.

    Attributes:
        cleaning_config: Configuration for the DataCleaner stage.
        transform_config: Configuration for the DataTransformer stage.
        chunk_config: Configuration for the chunking stage (optional, uses defaults).

        csv_batch_rows: Number of rows to process per batch for CSV files.
        xlsx_batch_rows: Number of rows to process per batch for XLSX files.
        log_progress_every: Log progress every N rows processed.
        encoding_fallbacks: List of encodings to try if UTF-8 fails.

    Example:
        >>> from krira_augment import PipelineConfig, CleaningConfig, TransformConfig
        >>> cfg = PipelineConfig(
        ...     cleaning_config=CleaningConfig(remove_headers=True),
        ...     transform_config=TransformConfig(output_format="markdown"),
        ... )
    """

    # Sub-configurations
    cleaning_config: CleaningConfig = field(default_factory=CleaningConfig)
    """Configuration for DataCleaner."""

    transform_config: TransformConfig = field(default_factory=TransformConfig)
    """Configuration for DataTransformer."""

    chunk_config: Optional[Any] = None
    """Configuration for chunking (ChunkConfig from Krira_Chunker). Uses defaults if None."""

    # Batch processing
    csv_batch_rows: int = 50_000
    """Number of rows to process per batch for CSV files."""

    xlsx_batch_rows: int = 25_000
    """Number of rows to process per batch for XLSX files."""

    # Progress logging
    log_progress_every: int = 100_000
    """Log progress status every N rows processed."""

    # Encoding fallbacks
    encoding_fallbacks: Tuple[str, ...] = ("utf-8", "latin-1", "cp1252", "utf-16")
    """List of encodings to try when reading files."""

    def __post_init__(self) -> None:
        """Validate configuration parameters."""
        if self.csv_batch_rows <= 0:
            raise ValueError(
                f"csv_batch_rows must be positive, got {self.csv_batch_rows}"
            )
        if self.xlsx_batch_rows <= 0:
            raise ValueError(
                f"xlsx_batch_rows must be positive, got {self.xlsx_batch_rows}"
            )


# =============================================================================
# Pipeline Exception Classes
# =============================================================================

class PipelineError(Exception):
    """Base exception for pipeline errors."""
    pass


class FileNotFoundPipelineError(PipelineError):
    """Raised when the input file is not found."""

    def __init__(self, file_path: str):
        self.file_path = file_path
        super().__init__(f"File not found: {file_path}")


class PermissionPipelineError(PipelineError):
    """Raised when file permissions prevent access."""

    def __init__(self, file_path: str):
        self.file_path = file_path
        super().__init__(
            f"Permission denied: {file_path}. "
            "Please check file permissions and try again."
        )


class UnsupportedFormatPipelineError(PipelineError):
    """Raised when the file format is not supported."""

    def __init__(self, file_path: str, extension: str):
        self.file_path = file_path
        self.extension = extension
        super().__init__(
            f"Unsupported file format: {extension}. "
            "Supported formats: .csv, .xlsx"
        )


# =============================================================================
# KriraPipeline
# =============================================================================

class KriraPipeline:
    """
    Orchestrates Clean -> Transform -> Chunk workflow.

    This is the main entry point for users. It chains DataCleaner,
    DataTransformer, and chunking in sequence.

    Features:
    - Streaming architecture for O(1) memory usage
    - Automatic file type detection (CSV, XLSX)
    - Multiple encoding fallback
    - Progress logging
    - Comprehensive error handling
    - Pipeline statistics

    Example:
        >>> from krira_augment import KriraPipeline, PipelineConfig
        >>> cfg = PipelineConfig()
        >>> pipeline = KriraPipeline(cfg)
        >>> for chunk in pipeline.process_file("data.csv"):
        ...     print(chunk["text"][:100])
    """

    def __init__(self, config: PipelineConfig) -> None:
        """
        Initialize all pipeline components.

        Args:
            config: Master configuration object.

        Implementation:
        - Instantiates DataCleaner with config.cleaning_config.
        - Instantiates DataTransformer with config.transform_config.
        - Validates that all configs are compatible.

        Raises:
            TypeError: If config is not a PipelineConfig instance.
        """
        if not isinstance(config, PipelineConfig):
            raise TypeError(
                f"config must be PipelineConfig, got {type(config).__name__}"
            )

        self.config = config

        # Initialize sub-components
        self.cleaner = DataCleaner(config.cleaning_config)
        self.transformer = DataTransformer(config.transform_config)

        # Lazy-load chunker (requires Krira_Chunker)
        self._chunker = None
        self._chunk_config = config.chunk_config

        # Pipeline statistics
        self._stats = {
            "rows_processed": 0,
            "chunks_created": 0,
            "bytes_cleaned": 0,
            "patterns_removed": 0,
            "files_processed": 0,
        }

        # Reduced batch size flag (for memory recovery)
        self._reduced_batch = False

    @property
    def chunker(self):
        """Lazy-load the chunker from Krira_Chunker."""
        if self._chunker is None:
            try:
                from Krira_Chunker import FastChunker, ChunkConfig

                if self._chunk_config is None:
                    self._chunk_config = ChunkConfig()

                self._chunker = FastChunker(self._chunk_config)
            except ImportError:
                LOGGER.warning(
                    "Krira_Chunker not available. "
                    "Using simple text chunking fallback."
                )
                self._chunker = None

        return self._chunker

    def _detect_separator(self, header_line: str) -> str:
        """
        Auto-detect CSV separator from header line.

        Args:
            header_line: First line of the CSV file.

        Returns:
            Detected separator character.
        """
        tab_count = header_line.count("\t")
        comma_count = header_line.count(",")
        semicolon_count = header_line.count(";")

        if tab_count > max(comma_count, semicolon_count):
            return "\t"
        elif semicolon_count > comma_count:
            return ";"
        return ","

    def _read_file_with_encoding(
        self,
        file_path: str
    ) -> Tuple[StringIO, str]:
        """
        Read file with automatic encoding detection.

        Args:
            file_path: Path to the file.

        Returns:
            Tuple of (StringIO with content, detected encoding).

        Raises:
            UnicodeDecodeError: If no encoding works.
        """
        for encoding in self.config.encoding_fallbacks:
            try:
                with open(file_path, 'r', encoding=encoding, errors='strict') as f:
                    content = f.read()
                return StringIO(content), encoding
            except (UnicodeDecodeError, UnicodeError):
                continue

        # Last resort: read with replace mode
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            content = f.read()
        return StringIO(content), 'utf-8-replace'

    def _process_csv_streaming(
        self,
        file_path: str
    ) -> Generator[Dict[str, Any], None, None]:
        """
        Process CSV file in streaming mode.

        Args:
            file_path: Path to CSV file.

        Yields:
            Chunk dictionaries with 'text' and 'metadata'.
        """
        batch_size = self.config.csv_batch_rows

        if self._reduced_batch:
            batch_size = batch_size // 4
            LOGGER.info("Using reduced batch size: %d", batch_size)

        try:
            content_io, encoding = self._read_file_with_encoding(file_path)
            LOGGER.debug("Reading CSV with encoding: %s", encoding)
        except Exception as e:
            LOGGER.error("Failed to read file: %s", e)
            raise

        # Detect separator
        first_line = content_io.readline()
        separator = self._detect_separator(first_line)
        content_io.seek(0)

        try:
            reader = csv.reader(content_io, delimiter=separator)
        except csv.Error as e:
            LOGGER.error("CSV parsing error: %s", e)
            raise

        # Read header
        try:
            headers = next(reader)
        except StopIteration:
            LOGGER.warning("Empty CSV file: %s", file_path)
            return

        # Clean headers
        headers = [h.strip() or f"col_{i+1}" for i, h in enumerate(headers)]

        base_meta = {
            "source": os.path.basename(file_path),
            "source_path": os.path.abspath(file_path),
            "source_type": "csv",
            "encoding": encoding,
        }

        batch_texts: List[str] = []
        batch_row_ids: List[int] = []
        chunk_index = 0

        for row_num, row in enumerate(reader, start=1):
            try:
                # Step 1: Transform row to text
                row_dict = dict(zip(headers, row))
                row_text = self.transformer.transform_row(row_dict)

                if not row_text or not row_text.strip():
                    continue

                # Step 2: Clean the text
                cleaned_text = self.cleaner.clean_text(row_text)

                if not cleaned_text:
                    continue

                batch_texts.append(cleaned_text)
                batch_row_ids.append(row_num)
                self._stats["rows_processed"] += 1

                # Log progress
                if row_num % self.config.log_progress_every == 0:
                    LOGGER.info("Processed %d rows...", row_num)

                # Process batch
                if len(batch_texts) >= batch_size:
                    for chunk in self._chunk_batch(
                        batch_texts,
                        batch_row_ids,
                        base_meta,
                        chunk_index
                    ):
                        chunk_index = chunk["metadata"]["chunk_index"] + 1
                        self._stats["chunks_created"] += 1
                        yield chunk

                    batch_texts = []
                    batch_row_ids = []

            except csv.Error as e:
                LOGGER.warning("Skipping malformed row %d: %s", row_num, e)
                continue

        # Flush remaining batch
        if batch_texts:
            for chunk in self._chunk_batch(
                batch_texts,
                batch_row_ids,
                base_meta,
                chunk_index
            ):
                self._stats["chunks_created"] += 1
                yield chunk

        # Update cleaning stats
        cleaner_stats = self.cleaner.get_stats()
        self._stats["bytes_cleaned"] += cleaner_stats["bytes_cleaned"]
        self._stats["patterns_removed"] += cleaner_stats["patterns_removed"]

    def _process_xlsx_streaming(
        self,
        file_path: str
    ) -> Generator[Dict[str, Any], None, None]:
        """
        Process XLSX file in streaming mode.

        Args:
            file_path: Path to XLSX file.

        Yields:
            Chunk dictionaries with 'text' and 'metadata'.
        """
        try:
            import openpyxl
        except ImportError:
            raise ImportError(
                "openpyxl is required for XLSX processing. "
                "Install with: pip install openpyxl"
            )

        batch_size = self.config.xlsx_batch_rows

        if self._reduced_batch:
            batch_size = batch_size // 4
            LOGGER.info("Using reduced batch size: %d", batch_size)

        try:
            wb = openpyxl.load_workbook(file_path, read_only=True, data_only=True)
        except Exception as e:
            LOGGER.error("Failed to open XLSX file: %s", e)
            raise

        chunk_index = 0

        try:
            for sheet_name in wb.sheetnames:
                ws = wb[sheet_name]

                base_meta = {
                    "source": os.path.basename(file_path),
                    "source_path": os.path.abspath(file_path),
                    "source_type": "xlsx",
                    "sheet": sheet_name,
                }

                rows_iter = ws.iter_rows(values_only=True)

                # Get headers
                try:
                    header_row = next(rows_iter)
                except StopIteration:
                    LOGGER.debug("Empty sheet: %s", sheet_name)
                    continue

                headers = [
                    str(h).strip() if h is not None and str(h).strip()
                    else f"col_{i+1}"
                    for i, h in enumerate(header_row)
                ]

                batch_texts: List[str] = []
                batch_row_ids: List[int] = []

                for row_num, row in enumerate(rows_iter, start=1):
                    try:
                        # Step 1: Transform row to text
                        row_text = self.transformer.excel_row_to_text(
                            headers, list(row)
                        )

                        if not row_text or not row_text.strip():
                            continue

                        # Step 2: Clean the text
                        cleaned_text = self.cleaner.clean_text(row_text)

                        if not cleaned_text:
                            continue

                        batch_texts.append(cleaned_text)
                        batch_row_ids.append(row_num)
                        self._stats["rows_processed"] += 1

                        # Log progress
                        if row_num % self.config.log_progress_every == 0:
                            LOGGER.info(
                                "Processed %d rows from sheet '%s'...",
                                row_num, sheet_name
                            )

                        # Process batch
                        if len(batch_texts) >= batch_size:
                            for chunk in self._chunk_batch(
                                batch_texts,
                                batch_row_ids,
                                base_meta,
                                chunk_index
                            ):
                                chunk_index = chunk["metadata"]["chunk_index"] + 1
                                self._stats["chunks_created"] += 1
                                yield chunk

                            batch_texts = []
                            batch_row_ids = []

                    except Exception as e:
                        LOGGER.warning(
                            "Skipping row %d in sheet '%s': %s",
                            row_num, sheet_name, e
                        )
                        continue

                # Flush remaining batch for this sheet
                if batch_texts:
                    for chunk in self._chunk_batch(
                        batch_texts,
                        batch_row_ids,
                        base_meta,
                        chunk_index
                    ):
                        chunk_index = chunk["metadata"]["chunk_index"] + 1
                        self._stats["chunks_created"] += 1
                        yield chunk

        finally:
            try:
                wb.close()
            except Exception:
                pass

        # Update cleaning stats
        cleaner_stats = self.cleaner.get_stats()
        self._stats["bytes_cleaned"] += cleaner_stats["bytes_cleaned"]
        self._stats["patterns_removed"] += cleaner_stats["patterns_removed"]

    def _chunk_batch(
        self,
        texts: List[str],
        row_ids: List[int],
        base_meta: Dict[str, Any],
        start_chunk_index: int
    ) -> Generator[Dict[str, Any], None, None]:
        """
        Chunk a batch of cleaned text rows.

        Args:
            texts: List of cleaned text strings.
            row_ids: Corresponding row IDs.
            base_meta: Base metadata for chunks.
            start_chunk_index: Starting chunk index.

        Yields:
            Chunk dictionaries.
        """
        if not texts:
            return

        # Use Krira_Chunker if available
        if self.chunker is not None:
            try:
                for chunk in self.chunker.chunk_units(
                    units=texts,
                    base_meta=base_meta,
                    joiner="\n",
                    locator=base_meta.get("source", "unknown"),
                    range_key="row",
                    range_vals=row_ids,
                    start_chunk_index=start_chunk_index,
                ):
                    yield chunk
                return
            except Exception as e:
                LOGGER.warning("Chunker failed, using fallback: %s", e)

        # Fallback: Simple chunking
        combined_text = "\n".join(texts)

        # Get chunk size from config or use default
        if self._chunk_config is not None:
            max_chars = getattr(self._chunk_config, 'max_chars', 2000)
        else:
            max_chars = 2000

        chunk_index = start_chunk_index
        start = 0

        while start < len(combined_text):
            end = min(start + max_chars, len(combined_text))

            # Try to break at newline
            if end < len(combined_text):
                newline_pos = combined_text.rfind('\n', start, end)
                if newline_pos > start:
                    end = newline_pos + 1

            chunk_text = combined_text[start:end].strip()

            if chunk_text:
                yield {
                    "text": chunk_text,
                    "metadata": {
                        **base_meta,
                        "chunk_index": chunk_index,
                        "char_start": start,
                        "char_end": end,
                    }
                }
                chunk_index += 1

            start = end

    def process_file(
        self,
        file_path: str
    ) -> Generator[Dict[str, Any], None, None]:
        """
        Process a file through the full Clean -> Transform -> Chunk pipeline.

        Args:
            file_path: Path to input file (CSV or XLSX).

        Yields:
            Chunk dictionaries with keys: 'text', 'metadata'.

        Algorithm:
        1. Validate file exists and is accessible.
        2. Detect file type from extension.
        3. Open file in streaming mode.
        4. For each row batch:
           a. Transform row to text format.
           b. Pass through cleaner.clean_text().
           c. Pass through chunker.
           d. Yield chunks.
        5. Close file handle.

        Performance Requirements:
        - Uses generators throughout (no list accumulation).
        - Memory usage stays under 500MB for 10GB files.
        - Logs progress every log_progress_every rows.

        Raises:
            FileNotFoundPipelineError: If file does not exist.
            PermissionPipelineError: If file cannot be read.
            UnsupportedFormatPipelineError: If file format not supported.
        """
        # Validate file exists
        if not os.path.exists(file_path):
            raise FileNotFoundPipelineError(file_path)

        # Check permissions
        if not os.access(file_path, os.R_OK):
            raise PermissionPipelineError(file_path)

        # Detect file type
        path = Path(file_path)
        extension = path.suffix.lower()

        self._stats["files_processed"] += 1
        LOGGER.info("Processing file: %s", file_path)

        try:
            if extension == ".csv":
                yield from self._process_csv_streaming(file_path)
            elif extension == ".xlsx":
                yield from self._process_xlsx_streaming(file_path)
            else:
                raise UnsupportedFormatPipelineError(file_path, extension)

        except MemoryError:
            # Handle memory errors by reducing batch size
            if not self._reduced_batch:
                LOGGER.warning(
                    "MemoryError encountered. Reducing batch size and retrying..."
                )
                self._reduced_batch = True

                # Reset stats for retry
                self._stats["rows_processed"] = 0
                self._stats["chunks_created"] = 0

                if extension == ".csv":
                    yield from self._process_csv_streaming(file_path)
                elif extension == ".xlsx":
                    yield from self._process_xlsx_streaming(file_path)
            else:
                raise

        LOGGER.info(
            "Completed: %d rows -> %d chunks",
            self._stats["rows_processed"],
            self._stats["chunks_created"]
        )

    def process_text(self, text: str) -> Generator[Dict[str, Any], None, None]:
        """
        Process raw text through Clean -> Chunk pipeline.

        Args:
            text: Raw input text.

        Yields:
            Chunk dictionaries.
        """
        # Clean the text
        cleaned = self.cleaner.clean_text(text)

        if not cleaned:
            return

        base_meta = {
            "source": "text_input",
            "source_type": "text",
        }

        yield from self._chunk_batch(
            texts=[cleaned],
            row_ids=[0],
            base_meta=base_meta,
            start_chunk_index=0
        )

    def get_stats(self) -> Dict[str, int]:
        """
        Return pipeline statistics after processing.

        Returns:
            Dictionary with keys:
            - 'rows_processed': Total rows read.
            - 'chunks_created': Total chunks generated.
            - 'bytes_cleaned': Total text bytes cleaned.
            - 'patterns_removed': Count of regex matches removed.
            - 'files_processed': Number of files processed.
        """
        return dict(self._stats)

    def reset_stats(self) -> None:
        """Reset all statistics counters."""
        self._stats = {
            "rows_processed": 0,
            "chunks_created": 0,
            "bytes_cleaned": 0,
            "patterns_removed": 0,
            "files_processed": 0,
        }
        self.cleaner.reset_stats()
        self.transformer.reset_stats()
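When Krira_Chunker is not installed, the chunker property returns None and _chunk_batch falls back to simple character-window chunking, so each yielded chunk carries chunk_index, char_start, and char_end in its metadata alongside the base fields. A minimal sketch of exercising that path through process_text (assuming the fallback chunker is in use; a real Krira_Chunker installation may attach different metadata fields):

from krira_augment import KriraPipeline, PipelineConfig

pipeline = KriraPipeline(PipelineConfig())

# process_text runs Clean -> Chunk on an in-memory string.
for chunk in pipeline.process_text("raw text pulled from some other source..."):
    meta = chunk["metadata"]
    # source/source_type come from base_meta; the offsets come from the fallback chunker.
    print(meta["source_type"], meta["chunk_index"], meta["char_start"], meta["char_end"])
    print(chunk["text"][:80])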