krira_augment-2.1.3-cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,738 @@
1
+ """
2
+ Pipeline Orchestrator for Krira Chunker V2.0.
3
+
4
+ Orchestrates the Clean -> Transform -> Chunk workflow.
5
+ This is the main entry point for users processing CSV and XLSX files.
6
+
7
+ Performance Requirements:
8
+ - Must process 1GB files in under 10 minutes on a 4-core CPU.
9
+ - Memory usage must remain constant O(1) regardless of file size.
10
+ - Uses streaming architecture (generators) throughout.
11
+ """
12
+
13
+ import csv
14
+ import logging
15
+ import os
16
+ import sys
17
+ from dataclasses import dataclass, field
18
+ from io import StringIO
19
+ from pathlib import Path
20
+ from typing import Any, Dict, Generator, Iterator, List, Optional, Tuple, Union
21
+
22
+ # Import sibling modules
23
+ from .cleaning import CleaningConfig, DataCleaner
24
+ from .transformation import DataTransformer, TransformConfig
25
+
26
+
27
+ # Setup logger
28
+ LOGGER = logging.getLogger("krira_augment.pipeline")
29
+
30
+
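The module logs progress through the "krira_augment.pipeline" logger configured above. For illustration, a minimal sketch of enabling those messages in an application, using only the standard logging module:

    import logging

    # Show the pipeline's progress messages (e.g. "Processed 100000 rows...").
    # Switch to DEBUG to also see encoding-detection output.
    logging.basicConfig(level=logging.INFO)
    logging.getLogger("krira_augment.pipeline").setLevel(logging.INFO)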
31
+ # =============================================================================
32
+ # Configuration
33
+ # =============================================================================
34
+
35
+ @dataclass
36
+ class PipelineConfig:
37
+ """
38
+ Master configuration for the full Clean -> Transform -> Chunk pipeline.
39
+
40
+ Combines CleaningConfig, TransformConfig, and chunk settings into a
41
+ single configuration object.
42
+
43
+ Attributes:
44
+ cleaning_config: Configuration for the DataCleaner stage.
45
+ transform_config: Configuration for the DataTransformer stage.
46
+ chunk_config: Configuration for the chunking stage (optional, uses defaults).
47
+
48
+ csv_batch_rows: Number of rows to process per batch for CSV files.
49
+ xlsx_batch_rows: Number of rows to process per batch for XLSX files.
50
+ log_progress_every: Log progress every N rows processed.
51
+ encoding_fallbacks: Ordered encodings to try when reading files (UTF-8 first).
52
+
53
+ Example:
54
+ >>> from krira_augment import PipelineConfig, CleaningConfig, TransformConfig
55
+ >>> cfg = PipelineConfig(
56
+ ... cleaning_config=CleaningConfig(remove_headers=True),
57
+ ... transform_config=TransformConfig(output_format="markdown"),
58
+ ... )
59
+ """
60
+
61
+ # Sub-configurations
62
+ cleaning_config: CleaningConfig = field(default_factory=CleaningConfig)
63
+ """Configuration for DataCleaner."""
64
+
65
+ transform_config: TransformConfig = field(default_factory=TransformConfig)
66
+ """Configuration for DataTransformer."""
67
+
68
+ chunk_config: Optional[Any] = None
69
+ """Configuration for chunking (ChunkConfig from Krira_Chunker). Uses defaults if None."""
70
+
71
+ # Batch processing
72
+ csv_batch_rows: int = 50_000
73
+ """Number of rows to process per batch for CSV files."""
74
+
75
+ xlsx_batch_rows: int = 25_000
76
+ """Number of rows to process per batch for XLSX files."""
77
+
78
+ # Progress logging
79
+ log_progress_every: int = 100_000
80
+ """Log progress status every N rows processed."""
81
+
82
+ # Encoding fallbacks
83
+ encoding_fallbacks: Tuple[str, ...] = ("utf-8", "latin-1", "cp1252", "utf-16")
84
+ """List of encodings to try when reading files."""
85
+
86
+ def __post_init__(self) -> None:
87
+ """Validate configuration parameters."""
88
+ if self.csv_batch_rows <= 0:
89
+ raise ValueError(
90
+ f"csv_batch_rows must be positive, got {self.csv_batch_rows}"
91
+ )
92
+ if self.xlsx_batch_rows <= 0:
93
+ raise ValueError(
94
+ f"xlsx_batch_rows must be positive, got {self.xlsx_batch_rows}"
95
+ )
96
+
97
+
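The batching and logging fields above are the main tuning knobs. An illustrative sketch with non-default values (the numbers are examples, not recommendations; CleaningConfig and TransformConfig are constructed with their defaults, as in the dataclass fields):

    from krira_augment import PipelineConfig, CleaningConfig, TransformConfig

    # Smaller batches lower peak memory at some cost in throughput.
    cfg = PipelineConfig(
        cleaning_config=CleaningConfig(),
        transform_config=TransformConfig(),
        csv_batch_rows=10_000,
        xlsx_batch_rows=5_000,
        log_progress_every=50_000,
        encoding_fallbacks=("utf-8", "cp1252"),
    )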
98
+ # =============================================================================
99
+ # Pipeline Exception Classes
100
+ # =============================================================================
101
+
102
+ class PipelineError(Exception):
103
+ """Base exception for pipeline errors."""
104
+ pass
105
+
106
+
107
+ class FileNotFoundPipelineError(PipelineError):
108
+ """Raised when the input file is not found."""
109
+
110
+ def __init__(self, file_path: str):
111
+ self.file_path = file_path
112
+ super().__init__(f"File not found: {file_path}")
113
+
114
+
115
+ class PermissionPipelineError(PipelineError):
116
+ """Raised when file permissions prevent access."""
117
+
118
+ def __init__(self, file_path: str):
119
+ self.file_path = file_path
120
+ super().__init__(
121
+ f"Permission denied: {file_path}. "
122
+ "Please check file permissions and try again."
123
+ )
124
+
125
+
126
+ class UnsupportedFormatPipelineError(PipelineError):
127
+ """Raised when the file format is not supported."""
128
+
129
+ def __init__(self, file_path: str, extension: str):
130
+ self.file_path = file_path
131
+ self.extension = extension
132
+ super().__init__(
133
+ f"Unsupported file format: {extension}. "
134
+ "Supported formats: .csv, .xlsx"
135
+ )
136
+
137
+
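A sketch of how a caller might handle these exceptions around the pipeline entry point defined below (`pipeline` is a configured KriraPipeline; `handle` is a hypothetical consumer):

    try:
        for chunk in pipeline.process_file("data.csv"):
            handle(chunk)  # hypothetical downstream consumer
    except FileNotFoundPipelineError as exc:
        print(f"Missing input file: {exc.file_path}")
    except UnsupportedFormatPipelineError as exc:
        print(f"Unsupported format {exc.extension}; convert to .csv or .xlsx first.")
    except PipelineError as exc:
        # Covers PermissionPipelineError and any other pipeline failure.
        print(f"Pipeline failed: {exc}")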
138
+ # =============================================================================
139
+ # KriraPipeline
140
+ # =============================================================================
141
+
142
+ class KriraPipeline:
143
+ """
144
+ Orchestrates the Clean -> Transform -> Chunk workflow.
145
+
146
+ This is the main entry point for users. It chains DataCleaner,
147
+ DataTransformer, and chunking in sequence.
148
+
149
+ Features:
150
+ - Streaming architecture for O(1) memory usage
151
+ - Automatic file type detection (CSV, XLSX)
152
+ - Multiple encoding fallback
153
+ - Progress logging
154
+ - Comprehensive error handling
155
+ - Pipeline statistics
156
+
157
+ Example:
158
+ >>> from krira_augment import KriraPipeline, PipelineConfig
159
+ >>> cfg = PipelineConfig()
160
+ >>> pipeline = KriraPipeline(cfg)
161
+ >>> for chunk in pipeline.process_file("data.csv"):
162
+ ... print(chunk["text"][:100])
163
+ """
164
+
165
+ def __init__(self, config: PipelineConfig) -> None:
166
+ """
167
+ Initialize all pipeline components.
168
+
169
+ Args:
170
+ config: Master configuration object.
171
+
172
+ Implementation:
173
+ - Instantiates DataCleaner with config.cleaning_config.
174
+ - Instantiates DataTransformer with config.transform_config.
175
+ - Validates that config is a PipelineConfig instance.
176
+
177
+ Raises:
178
+ TypeError: If config is not a PipelineConfig instance.
179
+ """
180
+ if not isinstance(config, PipelineConfig):
181
+ raise TypeError(
182
+ f"config must be PipelineConfig, got {type(config).__name__}"
183
+ )
184
+
185
+ self.config = config
186
+
187
+ # Initialize sub-components
188
+ self.cleaner = DataCleaner(config.cleaning_config)
189
+ self.transformer = DataTransformer(config.transform_config)
190
+
191
+ # Lazy-load chunker (requires Krira_Chunker)
192
+ self._chunker = None
193
+ self._chunk_config = config.chunk_config
194
+
195
+ # Pipeline statistics
196
+ self._stats = {
197
+ "rows_processed": 0,
198
+ "chunks_created": 0,
199
+ "bytes_cleaned": 0,
200
+ "patterns_removed": 0,
201
+ "files_processed": 0,
202
+ }
203
+
204
+ # Reduced batch size flag (for memory recovery)
205
+ self._reduced_batch = False
206
+
207
+ @property
208
+ def chunker(self):
209
+ """Lazy-load the chunker from Krira_Chunker."""
210
+ if self._chunker is None:
211
+ try:
212
+ from Krira_Chunker import FastChunker, ChunkConfig
213
+
214
+ if self._chunk_config is None:
215
+ self._chunk_config = ChunkConfig()
216
+
217
+ self._chunker = FastChunker(self._chunk_config)
218
+ except ImportError:
219
+ LOGGER.warning(
220
+ "Krira_Chunker not available. "
221
+ "Using simple text chunking fallback."
222
+ )
223
+ self._chunker = None
224
+
225
+ return self._chunker
226
+
227
+ def _detect_separator(self, header_line: str) -> str:
228
+ """
229
+ Auto-detect CSV separator from header line.
230
+
231
+ Args:
232
+ header_line: First line of the CSV file.
233
+
234
+ Returns:
235
+ Detected separator character.
236
+ """
237
+ tab_count = header_line.count("\t")
238
+ comma_count = header_line.count(",")
239
+ semicolon_count = header_line.count(";")
240
+
241
+ if tab_count > max(comma_count, semicolon_count):
242
+ return "\t"
243
+ elif semicolon_count > comma_count:
244
+ return ";"
245
+ return ","
246
+
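For illustration, the heuristic prefers tabs, then semicolons, and falls back to commas; calling the private helper directly (shown only to document the expected behaviour on a configured `pipeline` instance):

    assert pipeline._detect_separator("id\tname\tprice") == "\t"
    assert pipeline._detect_separator("id;name;price") == ";"
    assert pipeline._detect_separator("id,name,price") == ","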
247
+ def _read_file_with_encoding(
248
+ self,
249
+ file_path: str
250
+ ) -> Tuple[StringIO, str]:
251
+ """
252
+ Read file with automatic encoding detection.
253
+
254
+ Args:
255
+ file_path: Path to the file.
256
+
257
+ Returns:
258
+ Tuple of (StringIO with content, detected encoding).
259
+
260
+ Notes:
261
+ Falls back to UTF-8 with replacement characters if no candidate encoding decodes the file cleanly.
262
+ """
263
+ for encoding in self.config.encoding_fallbacks:
264
+ try:
265
+ with open(file_path, 'r', encoding=encoding, errors='strict') as f:
266
+ content = f.read()
267
+ return StringIO(content), encoding
268
+ except (UnicodeDecodeError, UnicodeError):
269
+ continue
270
+
271
+ # Last resort: read with replace mode
272
+ with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
273
+ content = f.read()
274
+ return StringIO(content), 'utf-8-replace'
275
+
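Two practical notes on the helper above: it materialises the whole decoded file in memory before parsing, and because latin-1 accepts every byte sequence, the encodings listed after it in the default fallbacks are never reached. A minimal alternative sketch (not part of this package) that probes only a leading sample and leaves the actual reading to the streaming parser:

    def detect_encoding_from_sample(path, fallbacks=("utf-8", "cp1252", "latin-1"),
                                    sample_bytes=1 << 20):
        # Hypothetical helper: decode only the first chunk of the file.
        with open(path, "rb") as fb:
            sample = fb.read(sample_bytes)
        if len(sample) == sample_bytes:
            # Drop a few trailing bytes so a split multi-byte character
            # does not cause a spurious decode failure.
            sample = sample[:-4]
        for encoding in fallbacks:
            try:
                sample.decode(encoding)
                return encoding
            except (UnicodeDecodeError, UnicodeError):
                continue
        return None  # caller can fall back to errors="replace"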
276
+ def _process_csv_streaming(
277
+ self,
278
+ file_path: str
279
+ ) -> Generator[Dict[str, Any], None, None]:
280
+ """
281
+ Process CSV file in streaming mode.
282
+
283
+ Args:
284
+ file_path: Path to CSV file.
285
+
286
+ Yields:
287
+ Chunk dictionaries with 'text' and 'metadata'.
288
+ """
289
+ batch_size = self.config.csv_batch_rows
290
+
291
+ if self._reduced_batch:
292
+ batch_size = batch_size // 4
293
+ LOGGER.info("Using reduced batch size: %d", batch_size)
294
+
295
+ try:
296
+ content_io, encoding = self._read_file_with_encoding(file_path)
297
+ LOGGER.debug("Reading CSV with encoding: %s", encoding)
298
+ except Exception as e:
299
+ LOGGER.error("Failed to read file: %s", e)
300
+ raise
301
+
302
+ # Detect separator
303
+ first_line = content_io.readline()
304
+ separator = self._detect_separator(first_line)
305
+ content_io.seek(0)
306
+
307
+ try:
308
+ reader = csv.reader(content_io, delimiter=separator)
309
+ except csv.Error as e:
310
+ LOGGER.error("CSV parsing error: %s", e)
311
+ raise
312
+
313
+ # Read header
314
+ try:
315
+ headers = next(reader)
316
+ except StopIteration:
317
+ LOGGER.warning("Empty CSV file: %s", file_path)
318
+ return
319
+
320
+ # Clean headers
321
+ headers = [h.strip() or f"col_{i+1}" for i, h in enumerate(headers)]
322
+
323
+ base_meta = {
324
+ "source": os.path.basename(file_path),
325
+ "source_path": os.path.abspath(file_path),
326
+ "source_type": "csv",
327
+ "encoding": encoding,
328
+ }
329
+
330
+ batch_texts: List[str] = []
331
+ batch_row_ids: List[int] = []
332
+ chunk_index = 0
333
+
334
+ for row_num, row in enumerate(reader, start=1):
335
+ try:
336
+ # Step 1: Transform row to text
337
+ row_dict = dict(zip(headers, row))
338
+ row_text = self.transformer.transform_row(row_dict)
339
+
340
+ if not row_text or not row_text.strip():
341
+ continue
342
+
343
+ # Step 2: Clean the text
344
+ cleaned_text = self.cleaner.clean_text(row_text)
345
+
346
+ if not cleaned_text:
347
+ continue
348
+
349
+ batch_texts.append(cleaned_text)
350
+ batch_row_ids.append(row_num)
351
+ self._stats["rows_processed"] += 1
352
+
353
+ # Log progress
354
+ if row_num % self.config.log_progress_every == 0:
355
+ LOGGER.info("Processed %d rows...", row_num)
356
+
357
+ # Process batch
358
+ if len(batch_texts) >= batch_size:
359
+ for chunk in self._chunk_batch(
360
+ batch_texts,
361
+ batch_row_ids,
362
+ base_meta,
363
+ chunk_index
364
+ ):
365
+ chunk_index = chunk["metadata"]["chunk_index"] + 1
366
+ self._stats["chunks_created"] += 1
367
+ yield chunk
368
+
369
+ batch_texts = []
370
+ batch_row_ids = []
371
+
372
+ except csv.Error as e:
373
+ LOGGER.warning("Skipping malformed row %d: %s", row_num, e)
374
+ continue
375
+
376
+ # Flush remaining batch
377
+ if batch_texts:
378
+ for chunk in self._chunk_batch(
379
+ batch_texts,
380
+ batch_row_ids,
381
+ base_meta,
382
+ chunk_index
383
+ ):
384
+ self._stats["chunks_created"] += 1
385
+ yield chunk
386
+
387
+ # Update cleaning stats
388
+ cleaner_stats = self.cleaner.get_stats()
389
+ self._stats["bytes_cleaned"] += cleaner_stats["bytes_cleaned"]
390
+ self._stats["patterns_removed"] += cleaner_stats["patterns_removed"]
391
+
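Because chunks are yielded as each batch completes, callers can persist them incrementally instead of collecting everything in a list. A sketch writing JSON Lines through the public entry point (file names are illustrative):

    import json

    with open("chunks.jsonl", "w", encoding="utf-8") as out:
        for chunk in pipeline.process_file("sales_2024.csv"):
            out.write(json.dumps(chunk, ensure_ascii=False) + "\n")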
392
+ def _process_xlsx_streaming(
393
+ self,
394
+ file_path: str
395
+ ) -> Generator[Dict[str, Any], None, None]:
396
+ """
397
+ Process XLSX file in streaming mode.
398
+
399
+ Args:
400
+ file_path: Path to XLSX file.
401
+
402
+ Yields:
403
+ Chunk dictionaries with 'text' and 'metadata'.
404
+ """
405
+ try:
406
+ import openpyxl
407
+ except ImportError:
408
+ raise ImportError(
409
+ "openpyxl is required for XLSX processing. "
410
+ "Install with: pip install openpyxl"
411
+ )
412
+
413
+ batch_size = self.config.xlsx_batch_rows
414
+
415
+ if self._reduced_batch:
416
+ batch_size = batch_size // 4
417
+ LOGGER.info("Using reduced batch size: %d", batch_size)
418
+
419
+ try:
420
+ wb = openpyxl.load_workbook(file_path, read_only=True, data_only=True)
421
+ except Exception as e:
422
+ LOGGER.error("Failed to open XLSX file: %s", e)
423
+ raise
424
+
425
+ chunk_index = 0
426
+
427
+ try:
428
+ for sheet_name in wb.sheetnames:
429
+ ws = wb[sheet_name]
430
+
431
+ base_meta = {
432
+ "source": os.path.basename(file_path),
433
+ "source_path": os.path.abspath(file_path),
434
+ "source_type": "xlsx",
435
+ "sheet": sheet_name,
436
+ }
437
+
438
+ rows_iter = ws.iter_rows(values_only=True)
439
+
440
+ # Get headers
441
+ try:
442
+ header_row = next(rows_iter)
443
+ except StopIteration:
444
+ LOGGER.debug("Empty sheet: %s", sheet_name)
445
+ continue
446
+
447
+ headers = [
448
+ str(h).strip() if h is not None and str(h).strip()
449
+ else f"col_{i+1}"
450
+ for i, h in enumerate(header_row)
451
+ ]
452
+
453
+ batch_texts: List[str] = []
454
+ batch_row_ids: List[int] = []
455
+
456
+ for row_num, row in enumerate(rows_iter, start=1):
457
+ try:
458
+ # Step 1: Transform row to text
459
+ row_text = self.transformer.excel_row_to_text(
460
+ headers, list(row)
461
+ )
462
+
463
+ if not row_text or not row_text.strip():
464
+ continue
465
+
466
+ # Step 2: Clean the text
467
+ cleaned_text = self.cleaner.clean_text(row_text)
468
+
469
+ if not cleaned_text:
470
+ continue
471
+
472
+ batch_texts.append(cleaned_text)
473
+ batch_row_ids.append(row_num)
474
+ self._stats["rows_processed"] += 1
475
+
476
+ # Log progress
477
+ if row_num % self.config.log_progress_every == 0:
478
+ LOGGER.info(
479
+ "Processed %d rows from sheet '%s'...",
480
+ row_num, sheet_name
481
+ )
482
+
483
+ # Process batch
484
+ if len(batch_texts) >= batch_size:
485
+ for chunk in self._chunk_batch(
486
+ batch_texts,
487
+ batch_row_ids,
488
+ base_meta,
489
+ chunk_index
490
+ ):
491
+ chunk_index = chunk["metadata"]["chunk_index"] + 1
492
+ self._stats["chunks_created"] += 1
493
+ yield chunk
494
+
495
+ batch_texts = []
496
+ batch_row_ids = []
497
+
498
+ except Exception as e:
499
+ LOGGER.warning(
500
+ "Skipping row %d in sheet '%s': %s",
501
+ row_num, sheet_name, e
502
+ )
503
+ continue
504
+
505
+ # Flush remaining batch for this sheet
506
+ if batch_texts:
507
+ for chunk in self._chunk_batch(
508
+ batch_texts,
509
+ batch_row_ids,
510
+ base_meta,
511
+ chunk_index
512
+ ):
513
+ chunk_index = chunk["metadata"]["chunk_index"] + 1
514
+ self._stats["chunks_created"] += 1
515
+ yield chunk
516
+
517
+ finally:
518
+ try:
519
+ wb.close()
520
+ except Exception:
521
+ pass
522
+
523
+ # Update cleaning stats
524
+ cleaner_stats = self.cleaner.get_stats()
525
+ self._stats["bytes_cleaned"] += cleaner_stats["bytes_cleaned"]
526
+ self._stats["patterns_removed"] += cleaner_stats["patterns_removed"]
527
+
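Chunks produced from a workbook carry the originating sheet name in their metadata, which allows simple per-sheet routing downstream; a brief sketch (the workbook name is illustrative):

    from collections import Counter

    chunks_per_sheet = Counter()
    for chunk in pipeline.process_file("report.xlsx"):
        chunks_per_sheet[chunk["metadata"]["sheet"]] += 1
    print(dict(chunks_per_sheet))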
528
+ def _chunk_batch(
529
+ self,
530
+ texts: List[str],
531
+ row_ids: List[int],
532
+ base_meta: Dict[str, Any],
533
+ start_chunk_index: int
534
+ ) -> Generator[Dict[str, Any], None, None]:
535
+ """
536
+ Chunk a batch of cleaned text rows.
537
+
538
+ Args:
539
+ texts: List of cleaned text strings.
540
+ row_ids: Corresponding row IDs.
541
+ base_meta: Base metadata for chunks.
542
+ start_chunk_index: Starting chunk index.
543
+
544
+ Yields:
545
+ Chunk dictionaries.
546
+ """
547
+ if not texts:
548
+ return
549
+
550
+ # Use Krira_Chunker if available
551
+ if self.chunker is not None:
552
+ try:
553
+ for chunk in self.chunker.chunk_units(
554
+ units=texts,
555
+ base_meta=base_meta,
556
+ joiner="\n",
557
+ locator=base_meta.get("source", "unknown"),
558
+ range_key="row",
559
+ range_vals=row_ids,
560
+ start_chunk_index=start_chunk_index,
561
+ ):
562
+ yield chunk
563
+ return
564
+ except Exception as e:
565
+ LOGGER.warning("Chunker failed, using fallback: %s", e)
566
+
567
+ # Fallback: Simple chunking
568
+ combined_text = "\n".join(texts)
569
+
570
+ # Get chunk size from config or use default
571
+ if self._chunk_config is not None:
572
+ max_chars = getattr(self._chunk_config, 'max_chars', 2000)
573
+ else:
574
+ max_chars = 2000
575
+
576
+ chunk_index = start_chunk_index
577
+ start = 0
578
+
579
+ while start < len(combined_text):
580
+ end = min(start + max_chars, len(combined_text))
581
+
582
+ # Try to break at newline
583
+ if end < len(combined_text):
584
+ newline_pos = combined_text.rfind('\n', start, end)
585
+ if newline_pos > start:
586
+ end = newline_pos + 1
587
+
588
+ chunk_text = combined_text[start:end].strip()
589
+
590
+ if chunk_text:
591
+ yield {
592
+ "text": chunk_text,
593
+ "metadata": {
594
+ **base_meta,
595
+ "chunk_index": chunk_index,
596
+ "char_start": start,
597
+ "char_end": end,
598
+ }
599
+ }
600
+ chunk_index += 1
601
+
602
+ start = end
603
+
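The fallback path only reads a max_chars attribute off chunk_config via getattr. When Krira_Chunker is not installed, a plain stand-in object is enough to steer the fallback chunk size; a hedged sketch (the stand-in class is hypothetical and is not Krira_Chunker's ChunkConfig):

    from dataclasses import dataclass

    @dataclass
    class FallbackChunkSettings:  # hypothetical stand-in, only meaningful without Krira_Chunker
        max_chars: int = 1200

    cfg = PipelineConfig(chunk_config=FallbackChunkSettings())
    pipeline = KriraPipeline(cfg)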
604
+ def process_file(
605
+ self,
606
+ file_path: str
607
+ ) -> Generator[Dict[str, Any], None, None]:
608
+ """
609
+ Process a file through the full Clean -> Transform -> Chunk pipeline.
610
+
611
+ Args:
612
+ file_path: Path to input file (CSV or XLSX).
613
+
614
+ Yields:
615
+ Chunk dictionaries with keys: 'text', 'metadata'.
616
+
617
+ Algorithm:
618
+ 1. Validate file exists and is accessible.
619
+ 2. Detect file type from extension.
620
+ 3. Open file in streaming mode.
621
+ 4. For each row batch:
622
+ a. Transform row to text format.
623
+ b. Pass through cleaner.clean_text().
624
+ c. Pass through chunker.
625
+ d. Yield chunks.
626
+ 5. Close file handle.
627
+
628
+ Performance Requirements:
629
+ - Uses generators throughout (no list accumulation).
630
+ - Memory use is bounded by the configured batch size rather than the total row count (note: CSV decoding currently buffers the decoded file text before parsing).
631
+ - Logs progress every log_progress_every rows.
632
+
633
+ Raises:
634
+ FileNotFoundPipelineError: If file does not exist.
635
+ PermissionPipelineError: If file cannot be read.
636
+ UnsupportedFormatPipelineError: If file format not supported.
637
+ """
638
+ # Validate file exists
639
+ if not os.path.exists(file_path):
640
+ raise FileNotFoundPipelineError(file_path)
641
+
642
+ # Check permissions
643
+ if not os.access(file_path, os.R_OK):
644
+ raise PermissionPipelineError(file_path)
645
+
646
+ # Detect file type
647
+ path = Path(file_path)
648
+ extension = path.suffix.lower()
649
+
650
+ self._stats["files_processed"] += 1
651
+ LOGGER.info("Processing file: %s", file_path)
652
+
653
+ try:
654
+ if extension == ".csv":
655
+ yield from self._process_csv_streaming(file_path)
656
+ elif extension == ".xlsx":
657
+ yield from self._process_xlsx_streaming(file_path)
658
+ else:
659
+ raise UnsupportedFormatPipelineError(file_path, extension)
660
+
661
+ except MemoryError:
662
+ # Handle memory errors by reducing batch size
663
+ if not self._reduced_batch:
664
+ LOGGER.warning(
665
+ "MemoryError encountered. Reducing batch size and retrying..."
666
+ )
667
+ self._reduced_batch = True
668
+
669
+ # Reset stats for retry
670
+ self._stats["rows_processed"] = 0
671
+ self._stats["chunks_created"] = 0
672
+
673
+ if extension == ".csv":
674
+ yield from self._process_csv_streaming(file_path)
675
+ elif extension == ".xlsx":
676
+ yield from self._process_xlsx_streaming(file_path)
677
+ else:
678
+ raise
679
+
680
+ LOGGER.info(
681
+ "Completed: %d rows -> %d chunks",
682
+ self._stats["rows_processed"],
683
+ self._stats["chunks_created"]
684
+ )
685
+
686
+ def process_text(self, text: str) -> Generator[Dict[str, Any], None, None]:
687
+ """
688
+ Process raw text through the Clean -> Chunk pipeline (the Transform stage is skipped for raw text).
689
+
690
+ Args:
691
+ text: Raw input text.
692
+
693
+ Yields:
694
+ Chunk dictionaries.
695
+ """
696
+ # Clean the text
697
+ cleaned = self.cleaner.clean_text(text)
698
+
699
+ if not cleaned:
700
+ return
701
+
702
+ base_meta = {
703
+ "source": "text_input",
704
+ "source_type": "text",
705
+ }
706
+
707
+ yield from self._chunk_batch(
708
+ texts=[cleaned],
709
+ row_ids=[0],
710
+ base_meta=base_meta,
711
+ start_chunk_index=0
712
+ )
713
+
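A short usage sketch for the raw-text entry point (the input string is illustrative):

    raw = "Order 1043: 2 x Widget A @ 19.99\nOrder 1044: 1 x Widget B @ 5.50"
    for chunk in pipeline.process_text(raw):
        print(chunk["metadata"]["chunk_index"], chunk["text"][:80])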
714
+ def get_stats(self) -> Dict[str, int]:
715
+ """
716
+ Return pipeline statistics after processing.
717
+
718
+ Returns:
719
+ Dictionary with keys:
720
+ - 'rows_processed': Total rows read.
721
+ - 'chunks_created': Total chunks generated.
722
+ - 'bytes_cleaned': Total text bytes cleaned.
723
+ - 'patterns_removed': Count of regex matches removed.
724
+ - 'files_processed': Number of files processed.
725
+ """
726
+ return dict(self._stats)
727
+
728
+ def reset_stats(self) -> None:
729
+ """Reset all statistics counters."""
730
+ self._stats = {
731
+ "rows_processed": 0,
732
+ "chunks_created": 0,
733
+ "bytes_cleaned": 0,
734
+ "patterns_removed": 0,
735
+ "files_processed": 0,
736
+ }
737
+ self.cleaner.reset_stats()
738
+ self.transformer.reset_stats()
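Finally, a sketch of reading and resetting the counters between runs (file names and the index function are illustrative):

    for path in ["q1.csv", "q2.csv", "inventory.xlsx"]:
        for chunk in pipeline.process_file(path):
            index(chunk)  # hypothetical downstream consumer
        print(path, pipeline.get_stats())
        pipeline.reset_stats()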