recursive-cleaner 0.7.1-py3-none-any.whl → 1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -62,6 +62,8 @@ class DataCleaner:
         report_path: str | None = "cleaning_report.md",
         dry_run: bool = False,
         auto_parse: bool = False,
+        tui: bool = False,
+        output_path: str = "cleaning_functions.py",
     ):
         self.backend = llm_backend
         self.file_path = file_path
@@ -86,7 +88,10 @@ class DataCleaner:
         self.report_path = report_path
         self.dry_run = dry_run
         self.auto_parse = auto_parse
+        self.tui = tui
+        self.output_path = output_path
         self.functions: list[dict] = []  # List of {name, docstring, code}
+        self._tui_renderer = None  # TUIRenderer instance when tui=True
         self._generated_parser: callable | None = None  # LLM-generated parser for unknown formats
         # Track recent function generation for saturation check
         self._recent_new_function_count = 0
@@ -119,10 +124,15 @@ class DataCleaner:
         try:
             self.on_progress(event)
         except Exception as e:
-            print(f" Warning: callback error: {e}")
+            if not self.tui:
+                print(f" Warning: callback error: {e}")

     def _call_llm_timed(self, prompt: str, chunk_index: int = 0) -> str:
         """Call LLM with timing and emit latency event."""
+        # Update TUI status before call
+        if self._tui_renderer:
+            self._tui_renderer.update_llm_status("calling")
+
         start = time.perf_counter()
         response = call_llm(self.backend, prompt)
         elapsed_ms = (time.perf_counter() - start) * 1000
@@ -133,6 +143,20 @@ class DataCleaner:
         self._latency_stats["min_ms"] = min(self._latency_stats["min_ms"], elapsed_ms)
         self._latency_stats["max_ms"] = max(self._latency_stats["max_ms"], elapsed_ms)

+        # Update TUI status and metrics after call
+        if self._tui_renderer:
+            self._tui_renderer.update_llm_status("idle")
+            latency_summary = self._get_latency_summary()
+            self._tui_renderer.update_metrics(
+                quality_delta=0.0,  # Quality delta calculated at end
+                latency_last=elapsed_ms,
+                latency_avg=latency_summary.get("avg_ms", 0.0),
+                latency_total=latency_summary.get("total_ms", 0.0),
+                llm_calls=latency_summary.get("call_count", 0),
+            )
+            self._tui_renderer.update_tokens(prompt, response)
+            self._tui_renderer.update_transmission(response)
+
         # Emit event
         self._emit("llm_call", chunk_index=chunk_index, latency_ms=round(elapsed_ms, 2))

@@ -216,7 +240,8 @@ class DataCleaner:
             response = self._call_llm_timed(prompt, chunk_index=chunks_processed - 1)
             assessment = parse_saturation_response(response)
         except Exception as e:
-            print(f" Warning: saturation check failed: {e}")
+            if not self.tui:
+                print(f" Warning: saturation check failed: {e}")
             return False  # Continue on error

         self._emit(
@@ -275,7 +300,8 @@ class DataCleaner:
         self.functions = state.get("functions", [])
         self._last_completed_chunk = state.get("last_completed_chunk", -1)
         self._total_chunks = state.get("total_chunks", 0)
-        print(f"Resumed from state: {self._last_completed_chunk + 1}/{self._total_chunks} chunks completed")
+        if not self.tui:
+            print(f"Resumed from state: {self._last_completed_chunk + 1}/{self._total_chunks} chunks completed")
         return True

     @classmethod
@@ -340,14 +366,16 @@ class DataCleaner:
         """Load file using LLM-generated parser, return JSONL chunks."""
         from .parser_generator import generate_parser

-        print(f"Unknown file format, generating parser...")
+        if not self.tui:
+            print(f"Unknown file format, generating parser...")
         self._emit("parser_generation_start")

         parser = generate_parser(self.backend, self.file_path)
         self._generated_parser = parser

         self._emit("parser_generation_complete")
-        print("Parser generated successfully.")
+        if not self.tui:
+            print("Parser generated successfully.")

         # Parse the file
         records = parser(self.file_path)
@@ -390,7 +418,8 @@ class DataCleaner:
         )

         if not chunks:
-            print("No data to process.")
+            if not self.tui:
+                print("No data to process.")
             return

         # Try to load existing state
@@ -409,13 +438,38 @@ class DataCleaner:

         self._total_chunks = len(chunks)

+        # Initialize TUI if enabled
+        if self.tui:
+            from .tui import HAS_RICH, TUIRenderer
+
+            if HAS_RICH:
+                self._tui_renderer = TUIRenderer(
+                    file_path=self.file_path,
+                    total_chunks=self._total_chunks,
+                    total_records=0,  # Could be calculated from chunks
+                )
+                self._tui_renderer.start()
+            else:
+                import logging
+
+                logging.warning(
+                    "tui=True but Rich not installed. "
+                    "Install with: pip install recursive-cleaner[tui]"
+                )
+
         for i, chunk in enumerate(chunks):
             # Skip already completed chunks
             if i <= self._last_completed_chunk:
-                if resumed:
+                if resumed and not self.tui:
                     print(f"Skipping chunk {i + 1}/{len(chunks)} (already completed)")
                 continue
-            print(f"Processing chunk {i + 1}/{len(chunks)}...")
+            if not self.tui:
+                print(f"Processing chunk {i + 1}/{len(chunks)}...")
+
+            # Update TUI with chunk progress
+            if self._tui_renderer:
+                self._tui_renderer.update_chunk(i, 0, self.max_iterations)
+
             self._process_chunk(chunk, i)
             # Mark chunk as completed and save state
             self._last_completed_chunk = i
@@ -429,7 +483,8 @@ class DataCleaner:
             ):
                 if self._check_saturation(i + 1):
                     self._emit("early_termination", chunk_index=i)
-                    print(f"Early termination: pattern discovery saturated at chunk {i + 1}")
+                    if not self.tui:
+                        print(f"Early termination: pattern discovery saturated at chunk {i + 1}")
                     break

         # Skip optimization and output in dry_run mode
@@ -439,7 +494,11 @@ class DataCleaner:
                 chunk_index=self._total_chunks - 1,
                 latency_stats=self._get_latency_summary(),
             )
-            print("Dry run complete. No functions generated or saved.")
+            # Stop TUI if running
+            if self._tui_renderer:
+                self._tui_renderer.stop()
+            if not self.tui:
+                print("Dry run complete. No functions generated or saved.")
             return

         # Two-pass optimization (if enabled and enough functions)
@@ -453,7 +512,22 @@ class DataCleaner:
             chunk_index=self._total_chunks - 1,
             latency_stats=self._get_latency_summary(),
         )
-        print(f"Done! Generated {len(self.functions)} functions.")
+
+        # Show TUI completion and stop
+        if self._tui_renderer:
+            latency_summary = self._get_latency_summary()
+            self._tui_renderer.show_complete({
+                "functions_count": len(self.functions),
+                "chunks_processed": self._total_chunks,
+                "quality_delta": 0.0,  # Could be calculated from metrics
+                "latency_total_ms": latency_summary.get("total_ms", 0.0),
+                "llm_calls": latency_summary.get("call_count", 0),
+                "output_file": self.output_path,
+            })
+            self._tui_renderer.stop()
+
+        if not self.tui:
+            print(f"Done! Generated {len(self.functions)} functions.")

     def _process_chunk(self, chunk: str, chunk_idx: int) -> None:
         """Process a single chunk, iterating until clean or max iterations."""
@@ -476,6 +550,11 @@ class DataCleaner:

         for iteration in range(self.max_iterations):
             self._emit("iteration", chunk_index=chunk_idx, iteration=iteration)
+
+            # Update TUI with iteration progress
+            if self._tui_renderer:
+                self._tui_renderer.update_chunk(chunk_idx, iteration, self.max_iterations)
+
             context = build_context(self.functions, self.context_budget)
             prompt = build_prompt(
                 self.instructions,
@@ -511,7 +590,8 @@ class DataCleaner:
                     function_name=result["name"],
                     error=safety_error,
                 )
-                print(f" Safety check failed: {safety_error}")
+                if not self.tui:
+                    print(f" Safety check failed: {safety_error}")
                 continue

             # Runtime validation if enabled
@@ -539,7 +619,8 @@ class DataCleaner:
                     function_name=result["name"],
                     error=error_msg,
                 )
-                print(f" Validation failed: {error_msg}")
+                if not self.tui:
+                    print(f" Validation failed: {error_msg}")
                 continue

             self.functions.append({
@@ -549,17 +630,25 @@ class DataCleaner:
                 })
                 # Track for saturation check
                 self._recent_new_function_count += 1
+
+                # Update TUI with new function
+                if self._tui_renderer:
+                    self._tui_renderer.add_function(result["name"], result["docstring"])
+
                 self._emit(
                     "function_generated",
                     chunk_index=chunk_idx,
                     function_name=result["name"],
                 )
-                print(f" Generated: {result['name']}")
+                if not self.tui:
+                    print(f" Generated: {result['name']}")
             else:
                 # LLM said needs_more_work but didn't provide code
-                print(f" Warning: iteration {iteration + 1} produced no function")
+                if not self.tui:
+                    print(f" Warning: iteration {iteration + 1} produced no function")

-        print(f" Warning: chunk {chunk_idx} hit max iterations ({self.max_iterations})")
+        if not self.tui:
+            print(f" Warning: chunk {chunk_idx} hit max iterations ({self.max_iterations})")
         self._emit("chunk_done", chunk_index=chunk_idx)

     def _process_chunk_dry_run(self, chunk: str, chunk_idx: int) -> None:
@@ -577,7 +666,8 @@ class DataCleaner:
             response = self._call_llm_timed(prompt, chunk_index=chunk_idx)
             result = parse_response(response)
         except ParseError as e:
-            print(f" Warning: parse error in dry run: {e}")
+            if not self.tui:
+                print(f" Warning: parse error in dry run: {e}")
             self._emit("chunk_done", chunk_index=chunk_idx)
             return

@@ -589,23 +679,25 @@ class DataCleaner:
             issues=issues,
         )

-        if issues:
-            unsolved = [i for i in issues if not i.get("solved", False)]
-            print(f" Found {len(issues)} issues ({len(unsolved)} unsolved)")
-        else:
-            print(" No issues detected")
+        if not self.tui:
+            if issues:
+                unsolved = [i for i in issues if not i.get("solved", False)]
+                print(f" Found {len(issues)} issues ({len(unsolved)} unsolved)")
+            else:
+                print(" No issues detected")

         self._emit("chunk_done", chunk_index=chunk_idx)

     def _write_output(self) -> None:
-        """Write generated functions to cleaning_functions.py."""
+        """Write generated functions to output file."""
         from .output import write_cleaning_file

         try:
-            write_cleaning_file(self.functions)
+            write_cleaning_file(self.functions, self.output_path)
         except OutputValidationError as e:
-            print(f" Error: {e}")
-            print(" Attempting to write valid functions only...")
+            if not self.tui:
+                print(f" Error: {e}")
+                print(" Attempting to write valid functions only...")
             # Try writing functions one by one, skipping invalid ones
             valid_functions = []
             for f in self.functions:
@@ -614,10 +706,11 @@ class DataCleaner:
                     ast.parse(f["code"])
                     valid_functions.append(f)
                 except SyntaxError:
-                    print(f" Skipping invalid function: {f['name']}")
+                    if not self.tui:
+                        print(f" Skipping invalid function: {f['name']}")
             if valid_functions:
-                write_cleaning_file(valid_functions)
-            else:
+                write_cleaning_file(valid_functions, self.output_path)
+            elif not self.tui:
                 print(" No valid functions to write.")

     def _write_report(self) -> None:
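For context, the sketch below shows how the two constructor options added in 1.0.0 (`tui` and `output_path`) might be used, based only on what this diff exposes: `tui=True` routes progress to a Rich-based TUIRenderer (with a logging fallback when Rich is missing) and suppresses the print() output, while `output_path` is forwarded to write_cleaning_file(). The import path, the stand-in backend, the input filename, and the `run()` entry point are assumptions for illustration; the diff does not show them.

```python
# Hypothetical usage sketch for recursive-cleaner 1.0.0; names marked below are assumed.
from recursive_cleaner import DataCleaner  # import path assumed, not shown in this diff


def my_backend(prompt: str) -> str:
    # Stand-in for a real LLM backend. The diff only shows the backend being
    # passed to call_llm(self.backend, prompt), so a simple callable is assumed.
    raise NotImplementedError("plug in a real LLM backend here")


cleaner = DataCleaner(
    llm_backend=my_backend,                      # parameter name taken from the diff (self.backend = llm_backend)
    file_path="orders.jsonl",                    # illustrative input file
    tui=True,                                    # new in 1.0.0: live terminal dashboard instead of print() output
    output_path="cleaners/orders_cleaning.py",   # new in 1.0.0: overrides the default "cleaning_functions.py"
)
cleaner.run()  # entry-point name assumed; the diff shows only the body of the chunk-processing loop
```

If Rich is not installed, the diff shows that `tui=True` degrades gracefully: a `logging.warning` suggests `pip install recursive-cleaner[tui]` and the cleaner continues without a renderer.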