recursive-cleaner 0.7.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- backends/__init__.py +2 -1
- backends/openai_backend.py +71 -0
- recursive_cleaner/__init__.py +5 -0
- recursive_cleaner/__main__.py +8 -0
- recursive_cleaner/apply.py +483 -0
- recursive_cleaner/cleaner.py +122 -29
- recursive_cleaner/cli.py +395 -0
- recursive_cleaner/tui.py +614 -0
- {recursive_cleaner-0.7.1.dist-info → recursive_cleaner-1.0.0.dist-info}/METADATA +119 -4
- {recursive_cleaner-0.7.1.dist-info → recursive_cleaner-1.0.0.dist-info}/RECORD +13 -7
- recursive_cleaner-1.0.0.dist-info/entry_points.txt +2 -0
- {recursive_cleaner-0.7.1.dist-info → recursive_cleaner-1.0.0.dist-info}/WHEEL +0 -0
- {recursive_cleaner-0.7.1.dist-info → recursive_cleaner-1.0.0.dist-info}/licenses/LICENSE +0 -0
recursive_cleaner/cleaner.py
CHANGED
|
@@ -62,6 +62,8 @@ class DataCleaner:
|
|
|
62
62
|
report_path: str | None = "cleaning_report.md",
|
|
63
63
|
dry_run: bool = False,
|
|
64
64
|
auto_parse: bool = False,
|
|
65
|
+
tui: bool = False,
|
|
66
|
+
output_path: str = "cleaning_functions.py",
|
|
65
67
|
):
|
|
66
68
|
self.backend = llm_backend
|
|
67
69
|
self.file_path = file_path
|
|
@@ -86,7 +88,10 @@ class DataCleaner:
|
|
|
86
88
|
self.report_path = report_path
|
|
87
89
|
self.dry_run = dry_run
|
|
88
90
|
self.auto_parse = auto_parse
|
|
91
|
+
self.tui = tui
|
|
92
|
+
self.output_path = output_path
|
|
89
93
|
self.functions: list[dict] = [] # List of {name, docstring, code}
|
|
94
|
+
self._tui_renderer = None # TUIRenderer instance when tui=True
|
|
90
95
|
self._generated_parser: callable | None = None # LLM-generated parser for unknown formats
|
|
91
96
|
# Track recent function generation for saturation check
|
|
92
97
|
self._recent_new_function_count = 0
|
|
@@ -119,10 +124,15 @@ class DataCleaner:
|
|
|
119
124
|
try:
|
|
120
125
|
self.on_progress(event)
|
|
121
126
|
except Exception as e:
|
|
122
|
-
|
|
127
|
+
if not self.tui:
|
|
128
|
+
print(f" Warning: callback error: {e}")
|
|
123
129
|
|
|
124
130
|
def _call_llm_timed(self, prompt: str, chunk_index: int = 0) -> str:
|
|
125
131
|
"""Call LLM with timing and emit latency event."""
|
|
132
|
+
# Update TUI status before call
|
|
133
|
+
if self._tui_renderer:
|
|
134
|
+
self._tui_renderer.update_llm_status("calling")
|
|
135
|
+
|
|
126
136
|
start = time.perf_counter()
|
|
127
137
|
response = call_llm(self.backend, prompt)
|
|
128
138
|
elapsed_ms = (time.perf_counter() - start) * 1000
|
|
@@ -133,6 +143,20 @@ class DataCleaner:
|
|
|
133
143
|
self._latency_stats["min_ms"] = min(self._latency_stats["min_ms"], elapsed_ms)
|
|
134
144
|
self._latency_stats["max_ms"] = max(self._latency_stats["max_ms"], elapsed_ms)
|
|
135
145
|
|
|
146
|
+
# Update TUI status and metrics after call
|
|
147
|
+
if self._tui_renderer:
|
|
148
|
+
self._tui_renderer.update_llm_status("idle")
|
|
149
|
+
latency_summary = self._get_latency_summary()
|
|
150
|
+
self._tui_renderer.update_metrics(
|
|
151
|
+
quality_delta=0.0, # Quality delta calculated at end
|
|
152
|
+
latency_last=elapsed_ms,
|
|
153
|
+
latency_avg=latency_summary.get("avg_ms", 0.0),
|
|
154
|
+
latency_total=latency_summary.get("total_ms", 0.0),
|
|
155
|
+
llm_calls=latency_summary.get("call_count", 0),
|
|
156
|
+
)
|
|
157
|
+
self._tui_renderer.update_tokens(prompt, response)
|
|
158
|
+
self._tui_renderer.update_transmission(response)
|
|
159
|
+
|
|
136
160
|
# Emit event
|
|
137
161
|
self._emit("llm_call", chunk_index=chunk_index, latency_ms=round(elapsed_ms, 2))
|
|
138
162
|
|
|
@@ -216,7 +240,8 @@ class DataCleaner:
|
|
|
216
240
|
response = self._call_llm_timed(prompt, chunk_index=chunks_processed - 1)
|
|
217
241
|
assessment = parse_saturation_response(response)
|
|
218
242
|
except Exception as e:
|
|
219
|
-
|
|
243
|
+
if not self.tui:
|
|
244
|
+
print(f" Warning: saturation check failed: {e}")
|
|
220
245
|
return False # Continue on error
|
|
221
246
|
|
|
222
247
|
self._emit(
|
|
@@ -275,7 +300,8 @@ class DataCleaner:
|
|
|
275
300
|
self.functions = state.get("functions", [])
|
|
276
301
|
self._last_completed_chunk = state.get("last_completed_chunk", -1)
|
|
277
302
|
self._total_chunks = state.get("total_chunks", 0)
|
|
278
|
-
|
|
303
|
+
if not self.tui:
|
|
304
|
+
print(f"Resumed from state: {self._last_completed_chunk + 1}/{self._total_chunks} chunks completed")
|
|
279
305
|
return True
|
|
280
306
|
|
|
281
307
|
@classmethod
|
|
@@ -340,14 +366,16 @@ class DataCleaner:
|
|
|
340
366
|
"""Load file using LLM-generated parser, return JSONL chunks."""
|
|
341
367
|
from .parser_generator import generate_parser
|
|
342
368
|
|
|
343
|
-
|
|
369
|
+
if not self.tui:
|
|
370
|
+
print(f"Unknown file format, generating parser...")
|
|
344
371
|
self._emit("parser_generation_start")
|
|
345
372
|
|
|
346
373
|
parser = generate_parser(self.backend, self.file_path)
|
|
347
374
|
self._generated_parser = parser
|
|
348
375
|
|
|
349
376
|
self._emit("parser_generation_complete")
|
|
350
|
-
|
|
377
|
+
if not self.tui:
|
|
378
|
+
print("Parser generated successfully.")
|
|
351
379
|
|
|
352
380
|
# Parse the file
|
|
353
381
|
records = parser(self.file_path)
|
|
@@ -390,7 +418,8 @@ class DataCleaner:
|
|
|
390
418
|
)
|
|
391
419
|
|
|
392
420
|
if not chunks:
|
|
393
|
-
|
|
421
|
+
if not self.tui:
|
|
422
|
+
print("No data to process.")
|
|
394
423
|
return
|
|
395
424
|
|
|
396
425
|
# Try to load existing state
|
|
@@ -409,13 +438,38 @@ class DataCleaner:
|
|
|
409
438
|
|
|
410
439
|
self._total_chunks = len(chunks)
|
|
411
440
|
|
|
441
|
+
# Initialize TUI if enabled
|
|
442
|
+
if self.tui:
|
|
443
|
+
from .tui import HAS_RICH, TUIRenderer
|
|
444
|
+
|
|
445
|
+
if HAS_RICH:
|
|
446
|
+
self._tui_renderer = TUIRenderer(
|
|
447
|
+
file_path=self.file_path,
|
|
448
|
+
total_chunks=self._total_chunks,
|
|
449
|
+
total_records=0, # Could be calculated from chunks
|
|
450
|
+
)
|
|
451
|
+
self._tui_renderer.start()
|
|
452
|
+
else:
|
|
453
|
+
import logging
|
|
454
|
+
|
|
455
|
+
logging.warning(
|
|
456
|
+
"tui=True but Rich not installed. "
|
|
457
|
+
"Install with: pip install recursive-cleaner[tui]"
|
|
458
|
+
)
|
|
459
|
+
|
|
412
460
|
for i, chunk in enumerate(chunks):
|
|
413
461
|
# Skip already completed chunks
|
|
414
462
|
if i <= self._last_completed_chunk:
|
|
415
|
-
if resumed:
|
|
463
|
+
if resumed and not self.tui:
|
|
416
464
|
print(f"Skipping chunk {i + 1}/{len(chunks)} (already completed)")
|
|
417
465
|
continue
|
|
418
|
-
|
|
466
|
+
if not self.tui:
|
|
467
|
+
print(f"Processing chunk {i + 1}/{len(chunks)}...")
|
|
468
|
+
|
|
469
|
+
# Update TUI with chunk progress
|
|
470
|
+
if self._tui_renderer:
|
|
471
|
+
self._tui_renderer.update_chunk(i, 0, self.max_iterations)
|
|
472
|
+
|
|
419
473
|
self._process_chunk(chunk, i)
|
|
420
474
|
# Mark chunk as completed and save state
|
|
421
475
|
self._last_completed_chunk = i
|
|
@@ -429,7 +483,8 @@ class DataCleaner:
|
|
|
429
483
|
):
|
|
430
484
|
if self._check_saturation(i + 1):
|
|
431
485
|
self._emit("early_termination", chunk_index=i)
|
|
432
|
-
|
|
486
|
+
if not self.tui:
|
|
487
|
+
print(f"Early termination: pattern discovery saturated at chunk {i + 1}")
|
|
433
488
|
break
|
|
434
489
|
|
|
435
490
|
# Skip optimization and output in dry_run mode
|
|
@@ -439,7 +494,11 @@ class DataCleaner:
|
|
|
439
494
|
chunk_index=self._total_chunks - 1,
|
|
440
495
|
latency_stats=self._get_latency_summary(),
|
|
441
496
|
)
|
|
442
|
-
|
|
497
|
+
# Stop TUI if running
|
|
498
|
+
if self._tui_renderer:
|
|
499
|
+
self._tui_renderer.stop()
|
|
500
|
+
if not self.tui:
|
|
501
|
+
print("Dry run complete. No functions generated or saved.")
|
|
443
502
|
return
|
|
444
503
|
|
|
445
504
|
# Two-pass optimization (if enabled and enough functions)
|
|
@@ -453,7 +512,22 @@ class DataCleaner:
|
|
|
453
512
|
chunk_index=self._total_chunks - 1,
|
|
454
513
|
latency_stats=self._get_latency_summary(),
|
|
455
514
|
)
|
|
456
|
-
|
|
515
|
+
|
|
516
|
+
# Show TUI completion and stop
|
|
517
|
+
if self._tui_renderer:
|
|
518
|
+
latency_summary = self._get_latency_summary()
|
|
519
|
+
self._tui_renderer.show_complete({
|
|
520
|
+
"functions_count": len(self.functions),
|
|
521
|
+
"chunks_processed": self._total_chunks,
|
|
522
|
+
"quality_delta": 0.0, # Could be calculated from metrics
|
|
523
|
+
"latency_total_ms": latency_summary.get("total_ms", 0.0),
|
|
524
|
+
"llm_calls": latency_summary.get("call_count", 0),
|
|
525
|
+
"output_file": self.output_path,
|
|
526
|
+
})
|
|
527
|
+
self._tui_renderer.stop()
|
|
528
|
+
|
|
529
|
+
if not self.tui:
|
|
530
|
+
print(f"Done! Generated {len(self.functions)} functions.")
|
|
457
531
|
|
|
458
532
|
def _process_chunk(self, chunk: str, chunk_idx: int) -> None:
|
|
459
533
|
"""Process a single chunk, iterating until clean or max iterations."""
|
|
@@ -476,6 +550,11 @@ class DataCleaner:
|
|
|
476
550
|
|
|
477
551
|
for iteration in range(self.max_iterations):
|
|
478
552
|
self._emit("iteration", chunk_index=chunk_idx, iteration=iteration)
|
|
553
|
+
|
|
554
|
+
# Update TUI with iteration progress
|
|
555
|
+
if self._tui_renderer:
|
|
556
|
+
self._tui_renderer.update_chunk(chunk_idx, iteration, self.max_iterations)
|
|
557
|
+
|
|
479
558
|
context = build_context(self.functions, self.context_budget)
|
|
480
559
|
prompt = build_prompt(
|
|
481
560
|
self.instructions,
|
|
@@ -511,7 +590,8 @@ class DataCleaner:
|
|
|
511
590
|
function_name=result["name"],
|
|
512
591
|
error=safety_error,
|
|
513
592
|
)
|
|
514
|
-
|
|
593
|
+
if not self.tui:
|
|
594
|
+
print(f" Safety check failed: {safety_error}")
|
|
515
595
|
continue
|
|
516
596
|
|
|
517
597
|
# Runtime validation if enabled
|
|
@@ -539,7 +619,8 @@ class DataCleaner:
|
|
|
539
619
|
function_name=result["name"],
|
|
540
620
|
error=error_msg,
|
|
541
621
|
)
|
|
542
|
-
|
|
622
|
+
if not self.tui:
|
|
623
|
+
print(f" Validation failed: {error_msg}")
|
|
543
624
|
continue
|
|
544
625
|
|
|
545
626
|
self.functions.append({
|
|
@@ -549,17 +630,25 @@ class DataCleaner:
|
|
|
549
630
|
})
|
|
550
631
|
# Track for saturation check
|
|
551
632
|
self._recent_new_function_count += 1
|
|
633
|
+
|
|
634
|
+
# Update TUI with new function
|
|
635
|
+
if self._tui_renderer:
|
|
636
|
+
self._tui_renderer.add_function(result["name"], result["docstring"])
|
|
637
|
+
|
|
552
638
|
self._emit(
|
|
553
639
|
"function_generated",
|
|
554
640
|
chunk_index=chunk_idx,
|
|
555
641
|
function_name=result["name"],
|
|
556
642
|
)
|
|
557
|
-
|
|
643
|
+
if not self.tui:
|
|
644
|
+
print(f" Generated: {result['name']}")
|
|
558
645
|
else:
|
|
559
646
|
# LLM said needs_more_work but didn't provide code
|
|
560
|
-
|
|
647
|
+
if not self.tui:
|
|
648
|
+
print(f" Warning: iteration {iteration + 1} produced no function")
|
|
561
649
|
|
|
562
|
-
|
|
650
|
+
if not self.tui:
|
|
651
|
+
print(f" Warning: chunk {chunk_idx} hit max iterations ({self.max_iterations})")
|
|
563
652
|
self._emit("chunk_done", chunk_index=chunk_idx)
|
|
564
653
|
|
|
565
654
|
def _process_chunk_dry_run(self, chunk: str, chunk_idx: int) -> None:
|
|
@@ -577,7 +666,8 @@ class DataCleaner:
|
|
|
577
666
|
response = self._call_llm_timed(prompt, chunk_index=chunk_idx)
|
|
578
667
|
result = parse_response(response)
|
|
579
668
|
except ParseError as e:
|
|
580
|
-
|
|
669
|
+
if not self.tui:
|
|
670
|
+
print(f" Warning: parse error in dry run: {e}")
|
|
581
671
|
self._emit("chunk_done", chunk_index=chunk_idx)
|
|
582
672
|
return
|
|
583
673
|
|
|
@@ -589,23 +679,25 @@ class DataCleaner:
|
|
|
589
679
|
issues=issues,
|
|
590
680
|
)
|
|
591
681
|
|
|
592
|
-
if
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
682
|
+
if not self.tui:
|
|
683
|
+
if issues:
|
|
684
|
+
unsolved = [i for i in issues if not i.get("solved", False)]
|
|
685
|
+
print(f" Found {len(issues)} issues ({len(unsolved)} unsolved)")
|
|
686
|
+
else:
|
|
687
|
+
print(" No issues detected")
|
|
597
688
|
|
|
598
689
|
self._emit("chunk_done", chunk_index=chunk_idx)
|
|
599
690
|
|
|
600
691
|
def _write_output(self) -> None:
|
|
601
|
-
"""Write generated functions to
|
|
692
|
+
"""Write generated functions to output file."""
|
|
602
693
|
from .output import write_cleaning_file
|
|
603
694
|
|
|
604
695
|
try:
|
|
605
|
-
write_cleaning_file(self.functions)
|
|
696
|
+
write_cleaning_file(self.functions, self.output_path)
|
|
606
697
|
except OutputValidationError as e:
|
|
607
|
-
|
|
608
|
-
|
|
698
|
+
if not self.tui:
|
|
699
|
+
print(f" Error: {e}")
|
|
700
|
+
print(" Attempting to write valid functions only...")
|
|
609
701
|
# Try writing functions one by one, skipping invalid ones
|
|
610
702
|
valid_functions = []
|
|
611
703
|
for f in self.functions:
|
|
@@ -614,10 +706,11 @@ class DataCleaner:
|
|
|
614
706
|
ast.parse(f["code"])
|
|
615
707
|
valid_functions.append(f)
|
|
616
708
|
except SyntaxError:
|
|
617
|
-
|
|
709
|
+
if not self.tui:
|
|
710
|
+
print(f" Skipping invalid function: {f['name']}")
|
|
618
711
|
if valid_functions:
|
|
619
|
-
write_cleaning_file(valid_functions)
|
|
620
|
-
|
|
712
|
+
write_cleaning_file(valid_functions, self.output_path)
|
|
713
|
+
elif not self.tui:
|
|
621
714
|
print(" No valid functions to write.")
|
|
622
715
|
|
|
623
716
|
def _write_report(self) -> None:
|