recursive-cleaner 0.7.0__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- recursive_cleaner/__init__.py +3 -0
- recursive_cleaner/cleaner.py +117 -26
- recursive_cleaner/tui.py +595 -0
- {recursive_cleaner-0.7.0.dist-info → recursive_cleaner-0.8.0.dist-info}/METADATA +55 -16
- {recursive_cleaner-0.7.0.dist-info → recursive_cleaner-0.8.0.dist-info}/RECORD +7 -6
- {recursive_cleaner-0.7.0.dist-info → recursive_cleaner-0.8.0.dist-info}/WHEEL +0 -0
- {recursive_cleaner-0.7.0.dist-info → recursive_cleaner-0.8.0.dist-info}/licenses/LICENSE +0 -0
recursive_cleaner/__init__.py
CHANGED
|
@@ -20,6 +20,7 @@ from recursive_cleaner.parsers import MARKITDOWN_EXTENSIONS, chunk_file, load_pa
|
|
|
20
20
|
from recursive_cleaner.prompt import build_prompt
|
|
21
21
|
from recursive_cleaner.response import extract_python_block, parse_response
|
|
22
22
|
from recursive_cleaner.parser_generator import check_parser_safety, generate_parser
|
|
23
|
+
from recursive_cleaner.tui import HAS_RICH, TUIRenderer
|
|
23
24
|
from recursive_cleaner.validation import check_code_safety, extract_sample_data, validate_function
|
|
24
25
|
|
|
25
26
|
__all__ = [
|
|
@@ -49,4 +50,6 @@ __all__ = [
|
|
|
49
50
|
"consolidate_with_agency",
|
|
50
51
|
"generate_parser",
|
|
51
52
|
"check_parser_safety",
|
|
53
|
+
"TUIRenderer",
|
|
54
|
+
"HAS_RICH",
|
|
52
55
|
]
|
recursive_cleaner/cleaner.py
CHANGED
|
@@ -62,6 +62,7 @@ class DataCleaner:
|
|
|
62
62
|
report_path: str | None = "cleaning_report.md",
|
|
63
63
|
dry_run: bool = False,
|
|
64
64
|
auto_parse: bool = False,
|
|
65
|
+
tui: bool = False,
|
|
65
66
|
):
|
|
66
67
|
self.backend = llm_backend
|
|
67
68
|
self.file_path = file_path
|
|
@@ -86,7 +87,9 @@ class DataCleaner:
|
|
|
86
87
|
self.report_path = report_path
|
|
87
88
|
self.dry_run = dry_run
|
|
88
89
|
self.auto_parse = auto_parse
|
|
90
|
+
self.tui = tui
|
|
89
91
|
self.functions: list[dict] = [] # List of {name, docstring, code}
|
|
92
|
+
self._tui_renderer = None # TUIRenderer instance when tui=True
|
|
90
93
|
self._generated_parser: callable | None = None # LLM-generated parser for unknown formats
|
|
91
94
|
# Track recent function generation for saturation check
|
|
92
95
|
self._recent_new_function_count = 0
|
|
@@ -119,10 +122,15 @@ class DataCleaner:
|
|
|
119
122
|
try:
|
|
120
123
|
self.on_progress(event)
|
|
121
124
|
except Exception as e:
|
|
122
|
-
|
|
125
|
+
if not self.tui:
|
|
126
|
+
print(f" Warning: callback error: {e}")
|
|
123
127
|
|
|
124
128
|
def _call_llm_timed(self, prompt: str, chunk_index: int = 0) -> str:
|
|
125
129
|
"""Call LLM with timing and emit latency event."""
|
|
130
|
+
# Update TUI status before call
|
|
131
|
+
if self._tui_renderer:
|
|
132
|
+
self._tui_renderer.update_llm_status("calling")
|
|
133
|
+
|
|
126
134
|
start = time.perf_counter()
|
|
127
135
|
response = call_llm(self.backend, prompt)
|
|
128
136
|
elapsed_ms = (time.perf_counter() - start) * 1000
|
|
@@ -133,6 +141,20 @@ class DataCleaner:
|
|
|
133
141
|
self._latency_stats["min_ms"] = min(self._latency_stats["min_ms"], elapsed_ms)
|
|
134
142
|
self._latency_stats["max_ms"] = max(self._latency_stats["max_ms"], elapsed_ms)
|
|
135
143
|
|
|
144
|
+
# Update TUI status and metrics after call
|
|
145
|
+
if self._tui_renderer:
|
|
146
|
+
self._tui_renderer.update_llm_status("idle")
|
|
147
|
+
latency_summary = self._get_latency_summary()
|
|
148
|
+
self._tui_renderer.update_metrics(
|
|
149
|
+
quality_delta=0.0, # Quality delta calculated at end
|
|
150
|
+
latency_last=elapsed_ms,
|
|
151
|
+
latency_avg=latency_summary.get("avg_ms", 0.0),
|
|
152
|
+
latency_total=latency_summary.get("total_ms", 0.0),
|
|
153
|
+
llm_calls=latency_summary.get("call_count", 0),
|
|
154
|
+
)
|
|
155
|
+
self._tui_renderer.update_tokens(prompt, response)
|
|
156
|
+
self._tui_renderer.update_transmission(response)
|
|
157
|
+
|
|
136
158
|
# Emit event
|
|
137
159
|
self._emit("llm_call", chunk_index=chunk_index, latency_ms=round(elapsed_ms, 2))
|
|
138
160
|
|
|
@@ -216,7 +238,8 @@ class DataCleaner:
|
|
|
216
238
|
response = self._call_llm_timed(prompt, chunk_index=chunks_processed - 1)
|
|
217
239
|
assessment = parse_saturation_response(response)
|
|
218
240
|
except Exception as e:
|
|
219
|
-
|
|
241
|
+
if not self.tui:
|
|
242
|
+
print(f" Warning: saturation check failed: {e}")
|
|
220
243
|
return False # Continue on error
|
|
221
244
|
|
|
222
245
|
self._emit(
|
|
@@ -275,7 +298,8 @@ class DataCleaner:
|
|
|
275
298
|
self.functions = state.get("functions", [])
|
|
276
299
|
self._last_completed_chunk = state.get("last_completed_chunk", -1)
|
|
277
300
|
self._total_chunks = state.get("total_chunks", 0)
|
|
278
|
-
|
|
301
|
+
if not self.tui:
|
|
302
|
+
print(f"Resumed from state: {self._last_completed_chunk + 1}/{self._total_chunks} chunks completed")
|
|
279
303
|
return True
|
|
280
304
|
|
|
281
305
|
@classmethod
|
|
@@ -340,14 +364,16 @@ class DataCleaner:
|
|
|
340
364
|
"""Load file using LLM-generated parser, return JSONL chunks."""
|
|
341
365
|
from .parser_generator import generate_parser
|
|
342
366
|
|
|
343
|
-
|
|
367
|
+
if not self.tui:
|
|
368
|
+
print(f"Unknown file format, generating parser...")
|
|
344
369
|
self._emit("parser_generation_start")
|
|
345
370
|
|
|
346
371
|
parser = generate_parser(self.backend, self.file_path)
|
|
347
372
|
self._generated_parser = parser
|
|
348
373
|
|
|
349
374
|
self._emit("parser_generation_complete")
|
|
350
|
-
|
|
375
|
+
if not self.tui:
|
|
376
|
+
print("Parser generated successfully.")
|
|
351
377
|
|
|
352
378
|
# Parse the file
|
|
353
379
|
records = parser(self.file_path)
|
|
@@ -390,7 +416,8 @@ class DataCleaner:
|
|
|
390
416
|
)
|
|
391
417
|
|
|
392
418
|
if not chunks:
|
|
393
|
-
|
|
419
|
+
if not self.tui:
|
|
420
|
+
print("No data to process.")
|
|
394
421
|
return
|
|
395
422
|
|
|
396
423
|
# Try to load existing state
|
|
@@ -409,13 +436,38 @@ class DataCleaner:
|
|
|
409
436
|
|
|
410
437
|
self._total_chunks = len(chunks)
|
|
411
438
|
|
|
439
|
+
# Initialize TUI if enabled
|
|
440
|
+
if self.tui:
|
|
441
|
+
from .tui import HAS_RICH, TUIRenderer
|
|
442
|
+
|
|
443
|
+
if HAS_RICH:
|
|
444
|
+
self._tui_renderer = TUIRenderer(
|
|
445
|
+
file_path=self.file_path,
|
|
446
|
+
total_chunks=self._total_chunks,
|
|
447
|
+
total_records=0, # Could be calculated from chunks
|
|
448
|
+
)
|
|
449
|
+
self._tui_renderer.start()
|
|
450
|
+
else:
|
|
451
|
+
import logging
|
|
452
|
+
|
|
453
|
+
logging.warning(
|
|
454
|
+
"tui=True but Rich not installed. "
|
|
455
|
+
"Install with: pip install recursive-cleaner[tui]"
|
|
456
|
+
)
|
|
457
|
+
|
|
412
458
|
for i, chunk in enumerate(chunks):
|
|
413
459
|
# Skip already completed chunks
|
|
414
460
|
if i <= self._last_completed_chunk:
|
|
415
|
-
if resumed:
|
|
461
|
+
if resumed and not self.tui:
|
|
416
462
|
print(f"Skipping chunk {i + 1}/{len(chunks)} (already completed)")
|
|
417
463
|
continue
|
|
418
|
-
|
|
464
|
+
if not self.tui:
|
|
465
|
+
print(f"Processing chunk {i + 1}/{len(chunks)}...")
|
|
466
|
+
|
|
467
|
+
# Update TUI with chunk progress
|
|
468
|
+
if self._tui_renderer:
|
|
469
|
+
self._tui_renderer.update_chunk(i, 0, self.max_iterations)
|
|
470
|
+
|
|
419
471
|
self._process_chunk(chunk, i)
|
|
420
472
|
# Mark chunk as completed and save state
|
|
421
473
|
self._last_completed_chunk = i
|
|
@@ -429,7 +481,8 @@ class DataCleaner:
|
|
|
429
481
|
):
|
|
430
482
|
if self._check_saturation(i + 1):
|
|
431
483
|
self._emit("early_termination", chunk_index=i)
|
|
432
|
-
|
|
484
|
+
if not self.tui:
|
|
485
|
+
print(f"Early termination: pattern discovery saturated at chunk {i + 1}")
|
|
433
486
|
break
|
|
434
487
|
|
|
435
488
|
# Skip optimization and output in dry_run mode
|
|
@@ -439,7 +492,11 @@ class DataCleaner:
|
|
|
439
492
|
chunk_index=self._total_chunks - 1,
|
|
440
493
|
latency_stats=self._get_latency_summary(),
|
|
441
494
|
)
|
|
442
|
-
|
|
495
|
+
# Stop TUI if running
|
|
496
|
+
if self._tui_renderer:
|
|
497
|
+
self._tui_renderer.stop()
|
|
498
|
+
if not self.tui:
|
|
499
|
+
print("Dry run complete. No functions generated or saved.")
|
|
443
500
|
return
|
|
444
501
|
|
|
445
502
|
# Two-pass optimization (if enabled and enough functions)
|
|
@@ -453,7 +510,22 @@ class DataCleaner:
|
|
|
453
510
|
chunk_index=self._total_chunks - 1,
|
|
454
511
|
latency_stats=self._get_latency_summary(),
|
|
455
512
|
)
|
|
456
|
-
|
|
513
|
+
|
|
514
|
+
# Show TUI completion and stop
|
|
515
|
+
if self._tui_renderer:
|
|
516
|
+
latency_summary = self._get_latency_summary()
|
|
517
|
+
self._tui_renderer.show_complete({
|
|
518
|
+
"functions_count": len(self.functions),
|
|
519
|
+
"chunks_processed": self._total_chunks,
|
|
520
|
+
"quality_delta": 0.0, # Could be calculated from metrics
|
|
521
|
+
"latency_total_ms": latency_summary.get("total_ms", 0.0),
|
|
522
|
+
"llm_calls": latency_summary.get("call_count", 0),
|
|
523
|
+
"output_file": "cleaning_functions.py",
|
|
524
|
+
})
|
|
525
|
+
self._tui_renderer.stop()
|
|
526
|
+
|
|
527
|
+
if not self.tui:
|
|
528
|
+
print(f"Done! Generated {len(self.functions)} functions.")
|
|
457
529
|
|
|
458
530
|
def _process_chunk(self, chunk: str, chunk_idx: int) -> None:
|
|
459
531
|
"""Process a single chunk, iterating until clean or max iterations."""
|
|
@@ -476,6 +548,11 @@ class DataCleaner:
|
|
|
476
548
|
|
|
477
549
|
for iteration in range(self.max_iterations):
|
|
478
550
|
self._emit("iteration", chunk_index=chunk_idx, iteration=iteration)
|
|
551
|
+
|
|
552
|
+
# Update TUI with iteration progress
|
|
553
|
+
if self._tui_renderer:
|
|
554
|
+
self._tui_renderer.update_chunk(chunk_idx, iteration, self.max_iterations)
|
|
555
|
+
|
|
479
556
|
context = build_context(self.functions, self.context_budget)
|
|
480
557
|
prompt = build_prompt(
|
|
481
558
|
self.instructions,
|
|
@@ -511,7 +588,8 @@ class DataCleaner:
|
|
|
511
588
|
function_name=result["name"],
|
|
512
589
|
error=safety_error,
|
|
513
590
|
)
|
|
514
|
-
|
|
591
|
+
if not self.tui:
|
|
592
|
+
print(f" Safety check failed: {safety_error}")
|
|
515
593
|
continue
|
|
516
594
|
|
|
517
595
|
# Runtime validation if enabled
|
|
@@ -539,7 +617,8 @@ class DataCleaner:
|
|
|
539
617
|
function_name=result["name"],
|
|
540
618
|
error=error_msg,
|
|
541
619
|
)
|
|
542
|
-
|
|
620
|
+
if not self.tui:
|
|
621
|
+
print(f" Validation failed: {error_msg}")
|
|
543
622
|
continue
|
|
544
623
|
|
|
545
624
|
self.functions.append({
|
|
@@ -549,17 +628,25 @@ class DataCleaner:
|
|
|
549
628
|
})
|
|
550
629
|
# Track for saturation check
|
|
551
630
|
self._recent_new_function_count += 1
|
|
631
|
+
|
|
632
|
+
# Update TUI with new function
|
|
633
|
+
if self._tui_renderer:
|
|
634
|
+
self._tui_renderer.add_function(result["name"], result["docstring"])
|
|
635
|
+
|
|
552
636
|
self._emit(
|
|
553
637
|
"function_generated",
|
|
554
638
|
chunk_index=chunk_idx,
|
|
555
639
|
function_name=result["name"],
|
|
556
640
|
)
|
|
557
|
-
|
|
641
|
+
if not self.tui:
|
|
642
|
+
print(f" Generated: {result['name']}")
|
|
558
643
|
else:
|
|
559
644
|
# LLM said needs_more_work but didn't provide code
|
|
560
|
-
|
|
645
|
+
if not self.tui:
|
|
646
|
+
print(f" Warning: iteration {iteration + 1} produced no function")
|
|
561
647
|
|
|
562
|
-
|
|
648
|
+
if not self.tui:
|
|
649
|
+
print(f" Warning: chunk {chunk_idx} hit max iterations ({self.max_iterations})")
|
|
563
650
|
self._emit("chunk_done", chunk_index=chunk_idx)
|
|
564
651
|
|
|
565
652
|
def _process_chunk_dry_run(self, chunk: str, chunk_idx: int) -> None:
|
|
@@ -577,7 +664,8 @@ class DataCleaner:
|
|
|
577
664
|
response = self._call_llm_timed(prompt, chunk_index=chunk_idx)
|
|
578
665
|
result = parse_response(response)
|
|
579
666
|
except ParseError as e:
|
|
580
|
-
|
|
667
|
+
if not self.tui:
|
|
668
|
+
print(f" Warning: parse error in dry run: {e}")
|
|
581
669
|
self._emit("chunk_done", chunk_index=chunk_idx)
|
|
582
670
|
return
|
|
583
671
|
|
|
@@ -589,11 +677,12 @@ class DataCleaner:
|
|
|
589
677
|
issues=issues,
|
|
590
678
|
)
|
|
591
679
|
|
|
592
|
-
if
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
680
|
+
if not self.tui:
|
|
681
|
+
if issues:
|
|
682
|
+
unsolved = [i for i in issues if not i.get("solved", False)]
|
|
683
|
+
print(f" Found {len(issues)} issues ({len(unsolved)} unsolved)")
|
|
684
|
+
else:
|
|
685
|
+
print(" No issues detected")
|
|
597
686
|
|
|
598
687
|
self._emit("chunk_done", chunk_index=chunk_idx)
|
|
599
688
|
|
|
@@ -604,8 +693,9 @@ class DataCleaner:
|
|
|
604
693
|
try:
|
|
605
694
|
write_cleaning_file(self.functions)
|
|
606
695
|
except OutputValidationError as e:
|
|
607
|
-
|
|
608
|
-
|
|
696
|
+
if not self.tui:
|
|
697
|
+
print(f" Error: {e}")
|
|
698
|
+
print(" Attempting to write valid functions only...")
|
|
609
699
|
# Try writing functions one by one, skipping invalid ones
|
|
610
700
|
valid_functions = []
|
|
611
701
|
for f in self.functions:
|
|
@@ -614,10 +704,11 @@ class DataCleaner:
|
|
|
614
704
|
ast.parse(f["code"])
|
|
615
705
|
valid_functions.append(f)
|
|
616
706
|
except SyntaxError:
|
|
617
|
-
|
|
707
|
+
if not self.tui:
|
|
708
|
+
print(f" Skipping invalid function: {f['name']}")
|
|
618
709
|
if valid_functions:
|
|
619
710
|
write_cleaning_file(valid_functions)
|
|
620
|
-
|
|
711
|
+
elif not self.tui:
|
|
621
712
|
print(" No valid functions to write.")
|
|
622
713
|
|
|
623
714
|
def _write_report(self) -> None:
|
recursive_cleaner/tui.py
ADDED
|
@@ -0,0 +1,595 @@
|
|
|
1
|
+
"""Rich TUI dashboard with Mission Control retro aesthetic."""
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from typing import Literal
|
|
6
|
+
|
|
7
|
+
# Graceful import - TUI features only available when Rich is installed
|
|
8
|
+
try:
|
|
9
|
+
from rich.box import DOUBLE
|
|
10
|
+
from rich.console import Console, Group
|
|
11
|
+
from rich.layout import Layout
|
|
12
|
+
from rich.live import Live
|
|
13
|
+
from rich.panel import Panel
|
|
14
|
+
from rich.progress import BarColumn, Progress, TextColumn
|
|
15
|
+
from rich.table import Table
|
|
16
|
+
from rich.text import Text
|
|
17
|
+
|
|
18
|
+
HAS_RICH = True
|
|
19
|
+
except ImportError:
|
|
20
|
+
HAS_RICH = False
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# ASCII art banner - chunky block style
|
|
24
|
+
ASCII_BANNER = """
|
|
25
|
+
██████╗ ███████╗ ██████╗██╗ ██╗██████╗ ███████╗██╗██╗ ██╗███████╗
|
|
26
|
+
██╔══██╗██╔════╝██╔════╝██║ ██║██╔══██╗██╔════╝██║██║ ██║██╔════╝
|
|
27
|
+
██████╔╝█████╗ ██║ ██║ ██║██████╔╝███████╗██║██║ ██║█████╗
|
|
28
|
+
██╔══██╗██╔══╝ ██║ ██║ ██║██╔══██╗╚════██║██║╚██╗ ██╔╝██╔══╝
|
|
29
|
+
██║ ██║███████╗╚██████╗╚██████╔╝██║ ██║███████║██║ ╚████╔╝ ███████╗
|
|
30
|
+
╚═╝ ╚═╝╚══════╝ ╚═════╝ ╚═════╝ ╚═╝ ╚═╝╚══════╝╚═╝ ╚═══╝ ╚══════╝
|
|
31
|
+
██████╗██╗ ███████╗ █████╗ ███╗ ██╗███████╗██████╗
|
|
32
|
+
██╔════╝██║ ██╔════╝██╔══██╗████╗ ██║██╔════╝██╔══██╗
|
|
33
|
+
██║ ██║ █████╗ ███████║██╔██╗ ██║█████╗ ██████╔╝
|
|
34
|
+
██║ ██║ ██╔══╝ ██╔══██║██║╚██╗██║██╔══╝ ██╔══██╗
|
|
35
|
+
╚██████╗███████╗███████╗██║ ██║██║ ╚████║███████╗██║ ██║
|
|
36
|
+
╚═════╝╚══════╝╚══════╝╚═╝ ╚═╝╚═╝ ╚═══╝╚══════╝╚═╝ ╚═╝
|
|
37
|
+
""".strip()
|
|
38
|
+
|
|
39
|
+
# Keep HEADER_TITLE for backwards compatibility with tests
|
|
40
|
+
HEADER_TITLE = "RECURSIVE CLEANER"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class FunctionInfo:
    """Display record for one generated cleaning function.

    Instances are created by ``TUIRenderer.add_function`` and stored in
    ``TUIState.functions``; they carry only the two strings given there.
    """

    # Function name as passed to the renderer.
    name: str
    # Docstring text passed alongside the name.
    docstring: str
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
class TUIState:
    """Mutable snapshot of the values the dashboard panels render.

    ``file_path`` and ``total_records`` are required; every other field has
    a default and is updated by the ``TUIRenderer.update_*`` methods.
    """

    # --- Header -----------------------------------------------------------
    file_path: str
    total_records: int
    version: str = "0.8.0"

    # --- Progress ---------------------------------------------------------
    # current_chunk / current_iteration are stored 1-based by update_chunk.
    current_chunk: int = 0
    total_chunks: int = 0
    current_iteration: int = 0
    max_iterations: int = 5

    # --- LLM status -------------------------------------------------------
    llm_status: Literal["idle", "calling"] = "idle"

    # --- Generated functions (append order) -------------------------------
    functions: list[FunctionInfo] = field(default_factory=list)

    # --- Latency metrics, in milliseconds ---------------------------------
    latency_last_ms: float = 0.0
    latency_avg_ms: float = 0.0
    latency_total_ms: float = 0.0
    llm_call_count: int = 0

    # --- Token estimates (accumulated as len(text) // 4) ------------------
    tokens_in: int = 0
    tokens_out: int = 0

    # --- Transmission log: most recent raw LLM response -------------------
    last_response: str = ""
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class TUIRenderer:
|
|
87
|
+
"""
|
|
88
|
+
Rich-based terminal dashboard with Mission Control retro aesthetic.
|
|
89
|
+
|
|
90
|
+
Shows live updates during cleaning runs with:
|
|
91
|
+
- ASCII art banner header
|
|
92
|
+
- Mission timer and status indicator
|
|
93
|
+
- Progress bar and chunk/iteration counters
|
|
94
|
+
- List of generated functions with checkmarks
|
|
95
|
+
- Token estimation and latency metrics
|
|
96
|
+
- Transmission log showing latest LLM response
|
|
97
|
+
"""
|
|
98
|
+
|
|
99
|
+
def __init__(self, file_path: str, total_chunks: int, total_records: int = 0):
    """
    Set up renderer state; nothing is drawn until ``start()`` is called.

    Args:
        file_path: Path to the data file being cleaned
        total_chunks: Total number of chunks to process
        total_records: Total number of records in the file
    """
    initial_state = TUIState(
        file_path=file_path,
        total_records=total_records,
        total_chunks=total_chunks,
    )
    self._state = initial_state
    # Provisional start time; start() resets it when the display goes live.
    self._start_time = time.time()

    # Layout and console only exist when Rich imported successfully.
    if HAS_RICH:
        self._layout = self._make_layout()
    else:
        self._layout = None
    self._live: "Live | None" = None
    if HAS_RICH:
        self._console = Console()
    else:
        self._console = None
|
|
117
|
+
|
|
118
|
+
def _make_layout(self) -> "Layout | None":
    """Create the dashboard layout structure.

    Layout (top to bottom):
    - header (size=14) - ASCII art banner "RECURSIVE CLEANER"
    - status_bar (size=3) - MISSION | TIME | STATUS
    - progress_bar (size=3) - CHUNK X/Y + progress bar
    - body (size=10..18, computed) - split horizontally into
        - left_panel - FUNCTIONS ACQUIRED, tokens, latency
        - right_panel - Parsed transmission log

    CRITICAL: Body uses fixed `size=` not `ratio=` to prevent panels from
    expanding infinitely and pushing header off screen on large terminals.

    NOTE: the fixed sections total 20 rows and the body is never smaller
    than 10, so the layout always requests at least 30 rows. On shorter
    terminals (e.g. 80x24) it is taller than the screen.

    Returns:
        The configured ``Layout``, or ``None`` when Rich is not installed.
    """
    if not HAS_RICH:
        return None

    # Console is already imported at module level whenever HAS_RICH is true,
    # so no function-local re-import is needed.
    term_height = Console().height or 24  # Default to 24 if unknown

    # Fixed heights for top sections
    header_height = 14  # ASCII banner (12 lines + border)
    status_height = 3
    progress_height = 3
    fixed_total = header_height + status_height + progress_height

    # Body gets the remaining space as a FIXED size (not ratio), clamped to
    # the range 10..18 rows to keep the dashboard tight.
    body_height = min(18, max(10, term_height - fixed_total - 2))

    layout = Layout()
    layout.split_column(
        Layout(name="header", size=header_height),
        Layout(name="status_bar", size=status_height),
        Layout(name="progress_bar", size=progress_height),
        Layout(name="body", size=body_height),  # FIXED size, not ratio
    )
    layout["body"].split_row(
        Layout(name="left_panel", ratio=1),
        Layout(name="right_panel", ratio=1),
    )
    return layout
|
|
163
|
+
|
|
164
|
+
def start(self) -> None:
    """Render the initial frame and begin the Rich live display.

    Does nothing when Rich is unavailable or no layout was built. The
    elapsed-time clock is reset here, so the timer counts from start().
    """
    if not HAS_RICH or self._layout is None:
        return

    self._start_time = time.time()
    # Populate every panel before the first live frame is shown.
    self._refresh()

    live_options = {
        "console": self._console,
        "refresh_per_second": 2,
        "vertical_overflow": "crop",
    }
    self._live = Live(self._layout, **live_options)
    self._live.start()
|
|
178
|
+
|
|
179
|
+
def stop(self) -> None:
    """Stop the live display if one is running and forget it.

    Safe to call when the display was never started or already stopped.
    """
    active_display = self._live
    if not active_display:
        return
    active_display.stop()
    self._live = None
|
|
184
|
+
|
|
185
|
+
def update_chunk(self, chunk_index: int, iteration: int, max_iterations: int) -> None:
    """
    Record the chunk/iteration currently being processed and redraw.

    Args:
        chunk_index: Current chunk index (0-based)
        iteration: Current iteration within chunk (0-based)
        max_iterations: Maximum iterations per chunk
    """
    state = self._state
    # Both counters are stored 1-based because they are shown to the user.
    state.current_chunk = chunk_index + 1
    state.current_iteration = iteration + 1
    state.max_iterations = max_iterations
    self._refresh()
|
|
198
|
+
|
|
199
|
+
def update_llm_status(self, status: Literal["calling", "idle"]) -> None:
    """
    Store the LLM call status and redraw the dashboard.

    Args:
        status: "calling" when LLM is being called, "idle" otherwise
    """
    setattr(self._state, "llm_status", status)
    self._refresh()
|
|
208
|
+
|
|
209
|
+
def add_function(self, name: str, docstring: str) -> None:
    """
    Append a newly generated function to the displayed list and redraw.

    Args:
        name: Function name
        docstring: Function docstring
    """
    entry = FunctionInfo(name=name, docstring=docstring)
    self._state.functions.append(entry)
    self._refresh()
|
|
219
|
+
|
|
220
|
+
def update_metrics(
    self,
    quality_delta: float,
    latency_last: float,
    latency_avg: float,
    latency_total: float,
    llm_calls: int,
) -> None:
    """
    Store the latest latency figures and redraw the dashboard.

    Args:
        quality_delta: Quality improvement percentage (ignored, kept for compatibility)
        latency_last: Last LLM call latency in ms
        latency_avg: Average LLM call latency in ms
        latency_total: Total LLM call time in ms
        llm_calls: Total number of LLM calls
    """
    # quality_delta is intentionally not stored anywhere.
    new_values = (
        ("latency_last_ms", latency_last),
        ("latency_avg_ms", latency_avg),
        ("latency_total_ms", latency_total),
        ("llm_call_count", llm_calls),
    )
    for attribute, value in new_values:
        setattr(self._state, attribute, value)
    self._refresh()
|
|
243
|
+
|
|
244
|
+
def update_tokens(self, prompt: str, response: str) -> None:
    """
    Add this call's estimated token usage to the running totals and redraw.

    The estimate is deliberately rough: len(text) // 4 per string.

    Args:
        prompt: The prompt sent to the LLM
        response: The response received from the LLM
    """
    state = self._state
    estimated_in = len(prompt) // 4
    estimated_out = len(response) // 4
    state.tokens_in = state.tokens_in + estimated_in
    state.tokens_out = state.tokens_out + estimated_out
    self._refresh()
|
|
257
|
+
|
|
258
|
+
def update_transmission(self, response: str) -> None:
    """
    Replace the transmission-log text with the latest LLM response and redraw.

    Args:
        response: The latest LLM response text
    """
    setattr(self._state, "last_response", response)
    self._refresh()
|
|
267
|
+
|
|
268
|
+
def _get_elapsed_time(self) -> str:
|
|
269
|
+
"""Get elapsed time as MM:SS string."""
|
|
270
|
+
elapsed = int(time.time() - self._start_time)
|
|
271
|
+
minutes = elapsed // 60
|
|
272
|
+
seconds = elapsed % 60
|
|
273
|
+
return f"{minutes:02d}:{seconds:02d}"
|
|
274
|
+
|
|
275
|
+
def show_complete(self, summary: dict) -> None:
    """
    Replace the dashboard with a "MISSION COMPLETE" summary panel.

    Args:
        summary: Dictionary with completion stats. Keys read here:
            - functions_count: Number of functions generated
              (defaults to the number of functions recorded in state)
            - chunks_processed: Number of chunks processed
              (defaults to the state's total_chunks)
            - output_file: Path to output file
              (defaults to "cleaning_functions.py")
            Other keys (e.g. latency_total_ms, llm_calls) are accepted
            but not displayed by this method.
    """
    if not HAS_RICH or self._layout is None:
        return

    state = self._state
    func_count = summary.get("functions_count", len(state.functions))
    chunks = summary.get("chunks_processed", state.total_chunks)
    elapsed = self._get_elapsed_time()

    # Token totals are shown in thousands.
    tokens_in_k = state.tokens_in / 1000
    tokens_out_k = state.tokens_out / 1000

    # (label, value) pairs in display order; the empty pair is a spacer row.
    rows = (
        (Text("Functions Acquired:", style="bold"), Text(str(func_count), style="green")),
        (Text("Chunks Processed:", style="bold"), Text(str(chunks))),
        (Text("Total Time:", style="bold"), Text(elapsed)),
        (
            Text("Tokens:", style="bold"),
            Text(f"~{tokens_in_k:.1f}k in / ~{tokens_out_k:.1f}k out"),
        ),
        (Text(""), Text("")),
        (
            Text("Output:", style="bold"),
            Text(summary.get("output_file", "cleaning_functions.py"), style="cyan"),
        ),
    )

    grid = Table.grid(padding=(0, 2))
    grid.add_column(justify="left")
    grid.add_column(justify="left")
    for label, value in rows:
        grid.add_row(label, value)

    complete_panel = Panel(
        grid,
        title="[bold green]MISSION COMPLETE[/bold green]",
        border_style="green",
        box=DOUBLE,
    )

    # Re-splitting the root layout discards the dashboard sections and
    # leaves the completion panel as its only child.
    self._layout.split_column(Layout(complete_panel, name="complete"))

    if self._live:
        self._live.update(self._layout)
|
|
340
|
+
|
|
341
|
+
def _refresh(self) -> None:
    """Rebuild every panel from the current state and push it to the live view.

    No-op when Rich is unavailable or no layout exists. The live display is
    only updated when one is currently running.
    """
    if not HAS_RICH or self._layout is None:
        return

    # Redraw order: top of the screen to the bottom, left before right.
    panel_refreshers = (
        self._refresh_header,
        self._refresh_status_bar,
        self._refresh_progress_bar,
        self._refresh_left_panel,
        self._refresh_right_panel,
    )
    for redraw in panel_refreshers:
        redraw()

    if self._live:
        self._live.update(self._layout)
|
|
354
|
+
|
|
355
|
+
def _refresh_header(self) -> None:
    """Draw the ASCII art banner into the "header" layout section."""
    if not HAS_RICH or self._layout is None:
        return

    panel_options = {"border_style": "cyan", "box": DOUBLE, "padding": (0, 1)}
    banner = Text(ASCII_BANNER, style="bold cyan")
    self._layout["header"].update(Panel(banner, **panel_options))
|
|
368
|
+
|
|
369
|
+
def _refresh_status_bar(self) -> None:
    """Draw the MISSION | TIME | STATUS row into the "status_bar" section."""
    if not HAS_RICH or self._layout is None:
        return

    state = self._state

    # Keep at most 30 characters of the path: "..." plus its last 27.
    shown_path = state.file_path
    if len(shown_path) > 30:
        shown_path = "..." + shown_path[-27:]

    elapsed = self._get_elapsed_time()

    # Filled green circle + ACTIVE while an LLM call is in flight,
    # empty dim circle + IDLE otherwise.
    is_calling = state.llm_status == "calling"
    if is_calling:
        indicator, indicator_style = "\u25cf", "green"
        status_label = Text("ACTIVE", style="bold green")
    else:
        indicator, indicator_style = "\u25cb", "dim"
        status_label = Text("IDLE", style="dim")

    mission_cell = Text()
    mission_cell.append("MISSION: ", style="bold")
    mission_cell.append(shown_path, style="cyan")

    time_cell = Text()
    time_cell.append("TIME: ", style="bold")
    time_cell.append(elapsed, style="cyan")

    status_cell = Text()
    status_cell.append("STATUS: ", style="bold")
    status_cell.append(f"{indicator} ", style=indicator_style)
    status_cell.append_text(status_label)

    # Three columns: mission (left, double width), time (centre), status (right).
    grid = Table.grid(padding=(0, 2), expand=True)
    for justify, ratio in (("left", 2), ("center", 1), ("right", 1)):
        grid.add_column(justify=justify, ratio=ratio)
    grid.add_row(mission_cell, time_cell, status_cell)

    self._layout["status_bar"].update(
        Panel(grid, border_style="cyan", box=DOUBLE, padding=(0, 1))
    )
|
|
417
|
+
|
|
418
|
+
def _refresh_progress_bar(self) -> None:
    """Draw the chunk counter and progress bar into the "progress_bar" section."""
    if not HAS_RICH or self._layout is None:
        return

    done = self._state.current_chunk
    total = self._state.total_chunks

    # Whole-number percentage; stays 0 when there are no chunks.
    progress_pct = int((done / total) * 100) if total > 0 else 0

    # A fresh Progress is built on every refresh; the counter and
    # percentage are baked into static text columns.
    columns = (
        TextColumn("[bold cyan]\u25ba[/bold cyan]"),
        TextColumn(f"CHUNK {done}/{total}"),
        BarColumn(bar_width=30, complete_style="cyan", finished_style="green"),
        TextColumn(f"{progress_pct}%"),
    )
    progress = Progress(*columns, expand=False)
    progress.add_task("", total=total, completed=done)

    self._layout["progress_bar"].update(
        Panel(progress, border_style="cyan", box=DOUBLE, padding=(0, 1))
    )
|
|
445
|
+
|
|
446
|
+
def _refresh_left_panel(self) -> None:
    """Refresh the left panel with the functions list and run metrics.

    Returns immediately when ``HAS_RICH`` is false or no layout exists.
    Otherwise builds a single-column grid containing:

    * a tree-style list of the most recent functions (at most six), each
      with a check mark, plus a "(+N more)" row when older ones are hidden;
    * a token row showing ``tokens_in`` / ``tokens_out`` in thousands;
    * a latency row showing the last and average LLM latency in seconds,
      or an em dash when no LLM call has been made yet.

    The grid is wrapped in a panel titled with the total function count
    and written to the ``"left_panel"`` layout slot.
    """
    if not HAS_RICH or self._layout is None:
        return

    func_count = len(self._state.functions)

    # Build function tree
    content = Table.grid(padding=(0, 0))
    content.add_column()

    # Show max 6 functions with tree structure; slicing keeps the newest
    # entries and is a no-op when there are fewer than the limit.
    max_display = 6
    display_funcs = self._state.functions[-max_display:]
    last_index = len(display_funcs) - 1

    for i, func in enumerate(display_funcs):
        func_text = Text()
        # Tree-style prefix: corner for the final row, tee for the others.
        if i == last_index:
            func_text.append("\u2514\u2500 ", style="dim cyan")  # Corner
        else:
            func_text.append("\u251c\u2500 ", style="dim cyan")  # Tee

        func_text.append(func.name, style="bold")
        func_text.append(" \u2713", style="green")  # Checkmark

        content.add_row(func_text)

    # Show "+N more" if truncated
    if func_count > max_display:
        hidden_count = func_count - max_display
        content.add_row(Text(f" (+{hidden_count} more)", style="dim italic"))

    # Add spacing
    content.add_row(Text(""))

    # Token stats, displayed in thousands
    tokens_in_k = self._state.tokens_in / 1000
    tokens_out_k = self._state.tokens_out / 1000
    tokens_text = Text()
    tokens_text.append("TOKENS: ", style="bold")
    tokens_text.append(f"~{tokens_in_k:.1f}k in / ~{tokens_out_k:.1f}k out", style="dim")
    content.add_row(tokens_text)

    # Latency stats
    latency_text = Text()
    latency_text.append("LATENCY: ", style="bold")
    if self._state.llm_call_count > 0:
        # Both latency fields are named *_ms, and both values are labelled
        # "s", so each must be converted from milliseconds to seconds.
        # Previously only the average was divided by 1000.
        last_s = self._state.latency_last_ms / 1000
        avg_s = self._state.latency_avg_ms / 1000
        latency_text.append(f"{last_s:.1f}s", style="cyan")
        latency_text.append(f" (avg {avg_s:.1f}s)", style="dim")
    else:
        latency_text.append("\u2014", style="dim")  # Em dash
    content.add_row(latency_text)

    left_panel = Panel(
        content,
        title=f"[bold cyan]FUNCTIONS ACQUIRED [{func_count}][/bold cyan]",
        border_style="cyan",
        box=DOUBLE,
    )
    self._layout["left_panel"].update(left_panel)
|
|
507
|
+
|
|
508
|
+
def _parse_response_for_display(self, response: str) -> str:
|
|
509
|
+
"""Parse LLM XML response into readable format for transmission log.
|
|
510
|
+
|
|
511
|
+
Args:
|
|
512
|
+
response: Raw LLM response text (XML format)
|
|
513
|
+
|
|
514
|
+
Returns:
|
|
515
|
+
Formatted string for display showing issues, function being
|
|
516
|
+
generated, and chunk status.
|
|
517
|
+
"""
|
|
518
|
+
import re
|
|
519
|
+
|
|
520
|
+
lines = []
|
|
521
|
+
|
|
522
|
+
try:
|
|
523
|
+
# Find all issues
|
|
524
|
+
issue_pattern = r'<issue[^>]*id="(\d+)"[^>]*solved="(true|false)"[^>]*>([^<]+)</issue>'
|
|
525
|
+
issues = re.findall(issue_pattern, response, re.DOTALL)
|
|
526
|
+
|
|
527
|
+
if issues:
|
|
528
|
+
lines.append("ISSUES DETECTED:")
|
|
529
|
+
for issue_id, solved, desc in issues[:8]: # Limit to 8 issues
|
|
530
|
+
marker = "\u2713" if solved == "true" else "\u2717" # checkmark or X
|
|
531
|
+
desc_clean = desc.strip()[:40] # Truncate description
|
|
532
|
+
lines.append(f" {marker} {desc_clean}")
|
|
533
|
+
if len(issues) > 8:
|
|
534
|
+
lines.append(f" (+{len(issues) - 8} more)")
|
|
535
|
+
lines.append("")
|
|
536
|
+
|
|
537
|
+
# Find function being generated
|
|
538
|
+
name_match = re.search(r'<name>([^<]+)</name>', response)
|
|
539
|
+
docstring_match = re.search(r'<docstring>([^<]+)</docstring>', response, re.DOTALL)
|
|
540
|
+
|
|
541
|
+
if name_match:
|
|
542
|
+
lines.append(f"GENERATING: {name_match.group(1).strip()}")
|
|
543
|
+
if docstring_match:
|
|
544
|
+
doc = docstring_match.group(1).strip()[:60]
|
|
545
|
+
lines.append(f' "{doc}..."')
|
|
546
|
+
lines.append("")
|
|
547
|
+
|
|
548
|
+
# Find chunk status
|
|
549
|
+
status_match = re.search(r'<chunk_status>([^<]+)</chunk_status>', response)
|
|
550
|
+
if status_match:
|
|
551
|
+
status = status_match.group(1).strip()
|
|
552
|
+
lines.append(f"STATUS: {status.upper()}")
|
|
553
|
+
|
|
554
|
+
if lines:
|
|
555
|
+
return "\n".join(lines)
|
|
556
|
+
except Exception:
|
|
557
|
+
pass
|
|
558
|
+
|
|
559
|
+
# Fallback: show truncated raw response
|
|
560
|
+
return response[:500] + "..." if len(response) > 500 else response
|
|
561
|
+
|
|
562
|
+
def _refresh_right_panel(self) -> None:
    """Redraw the transmission-log panel from the most recent LLM response.

    Returns immediately when ``HAS_RICH`` is false or no layout exists.
    Shows a placeholder while ``last_response`` is empty; otherwise shows
    the summary produced by ``_parse_response_for_display``. The result is
    written to the ``"right_panel"`` layout slot.
    """
    if not (HAS_RICH and self._layout is not None):
        return

    raw = self._state.last_response
    if raw:
        shown = self._parse_response_for_display(raw)
    else:
        shown = "(Awaiting transmission...)"

    panel = Panel(
        Text(shown, style="dim cyan"),
        title="[bold cyan]\u25c4\u25c4 TRANSMISSION LOG \u25ba\u25ba[/bold cyan]",
        border_style="cyan",
        box=DOUBLE,
    )
    self._layout["right_panel"].update(panel)
|
|
583
|
+
|
|
584
|
+
# Legacy method stubs for backwards compatibility
|
|
585
|
+
def _refresh_progress(self) -> None:
|
|
586
|
+
"""Legacy method - calls _refresh_progress_bar."""
|
|
587
|
+
self._refresh_progress_bar()
|
|
588
|
+
|
|
589
|
+
def _refresh_functions(self) -> None:
|
|
590
|
+
"""Legacy method - calls _refresh_left_panel."""
|
|
591
|
+
self._refresh_left_panel()
|
|
592
|
+
|
|
593
|
+
def _refresh_footer(self) -> None:
|
|
594
|
+
"""Legacy method - no longer used but kept for compatibility."""
|
|
595
|
+
pass
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: recursive-cleaner
|
|
3
|
-
Version: 0.7.0
|
|
3
|
+
Version: 0.8.0
|
|
4
4
|
Summary: LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions
|
|
5
5
|
Project-URL: Homepage, https://github.com/gaztrabisme/recursive-data-cleaner
|
|
6
6
|
Project-URL: Repository, https://github.com/gaztrabisme/recursive-data-cleaner
|
|
@@ -32,6 +32,8 @@ Provides-Extra: mlx
|
|
|
32
32
|
Requires-Dist: mlx-lm>=0.10.0; extra == 'mlx'
|
|
33
33
|
Provides-Extra: parquet
|
|
34
34
|
Requires-Dist: pyarrow>=14.0.0; extra == 'parquet'
|
|
35
|
+
Provides-Extra: tui
|
|
36
|
+
Requires-Dist: rich>=13.0; extra == 'tui'
|
|
35
37
|
Description-Content-Type: text/markdown
|
|
36
38
|
|
|
37
39
|
# Recursive Data Cleaner
|
|
@@ -40,7 +42,7 @@ LLM-powered incremental data cleaning for massive datasets. Process files in chu
|
|
|
40
42
|
|
|
41
43
|
## How It Works
|
|
42
44
|
|
|
43
|
-
1. **Chunk** your data (JSONL, CSV, JSON,
|
|
45
|
+
1. **Chunk** your data (JSONL, CSV, JSON, Parquet, PDF, Word, Excel, XML, and more)
|
|
44
46
|
2. **Analyze** each chunk with an LLM to identify issues
|
|
45
47
|
3. **Generate** one cleaning function per issue
|
|
46
48
|
4. **Validate** functions on holdout data before accepting
|
|
@@ -59,6 +61,21 @@ For Apple Silicon (MLX backend):
|
|
|
59
61
|
pip install -e ".[mlx]"
|
|
60
62
|
```
|
|
61
63
|
|
|
64
|
+
For document conversion (PDF, Word, Excel, HTML, etc.):
|
|
65
|
+
```bash
|
|
66
|
+
pip install -e ".[markitdown]"
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
For Parquet files:
|
|
70
|
+
```bash
|
|
71
|
+
pip install -e ".[parquet]"
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
For Terminal UI (Rich dashboard):
|
|
75
|
+
```bash
|
|
76
|
+
pip install -e ".[tui]"
|
|
77
|
+
```
|
|
78
|
+
|
|
62
79
|
## Quick Start
|
|
63
80
|
|
|
64
81
|
```python
|
|
@@ -111,6 +128,18 @@ cleaner.run() # Generates cleaning_functions.py
|
|
|
111
128
|
- **Cleaning Reports**: Markdown summary with functions, timing, quality delta
|
|
112
129
|
- **Dry-Run Mode**: Analyze data without generating functions
|
|
113
130
|
|
|
131
|
+
### Format Expansion (v0.7.0)
|
|
132
|
+
- **Markitdown Integration**: Convert 20+ formats (PDF, Word, Excel, PowerPoint, HTML, EPUB, etc.) to text
|
|
133
|
+
- **Parquet Support**: Load parquet files as structured data via pyarrow
|
|
134
|
+
- **LLM-Generated Parsers**: Auto-generate parsers for XML and unknown formats (`auto_parse=True`)
|
|
135
|
+
|
|
136
|
+
### Terminal UI (v0.8.0)
|
|
137
|
+
- **Mission Control Dashboard**: Rich-based live terminal UI with retro aesthetic
|
|
138
|
+
- **Real-time Progress**: Animated progress bars, chunk/iteration counters
|
|
139
|
+
- **Transmission Log**: Parsed LLM responses showing issues detected and functions being generated
|
|
140
|
+
- **Token Estimation**: Track estimated input/output tokens across the run
|
|
141
|
+
- **Graceful Fallback**: Works without Rich installed (falls back to callbacks)
|
|
142
|
+
|
|
114
143
|
## Configuration
|
|
115
144
|
|
|
116
145
|
```python
|
|
@@ -142,6 +171,12 @@ cleaner = DataCleaner(
|
|
|
142
171
|
report_path="report.md", # Markdown report output (None to disable)
|
|
143
172
|
dry_run=False, # Analyze without generating functions
|
|
144
173
|
|
|
174
|
+
# Format Expansion
|
|
175
|
+
auto_parse=False, # LLM generates parser for unknown formats
|
|
176
|
+
|
|
177
|
+
# Terminal UI
|
|
178
|
+
tui=True, # Enable Rich dashboard (requires [tui] extra)
|
|
179
|
+
|
|
145
180
|
# Progress & State
|
|
146
181
|
on_progress=callback, # Progress event callback
|
|
147
182
|
state_file="state.json", # Enable resume on interrupt
|
|
@@ -235,20 +270,22 @@ cleaner.run()
|
|
|
235
270
|
|
|
236
271
|
```
|
|
237
272
|
recursive_cleaner/
|
|
238
|
-
├── cleaner.py
|
|
239
|
-
├── context.py
|
|
240
|
-
├── dependencies.py
|
|
241
|
-
├── metrics.py
|
|
242
|
-
├── optimizer.py
|
|
243
|
-
├── output.py
|
|
244
|
-
├──
|
|
245
|
-
├──
|
|
246
|
-
├──
|
|
247
|
-
├──
|
|
248
|
-
├──
|
|
249
|
-
├──
|
|
273
|
+
├── cleaner.py # Main DataCleaner class
|
|
274
|
+
├── context.py # Docstring registry with FIFO eviction
|
|
275
|
+
├── dependencies.py # Topological sort for function ordering
|
|
276
|
+
├── metrics.py # Quality metrics before/after
|
|
277
|
+
├── optimizer.py # Two-pass consolidation with LLM agency
|
|
278
|
+
├── output.py # Function file generation + import consolidation
|
|
279
|
+
├── parser_generator.py # LLM-generated parsers for unknown formats
|
|
280
|
+
├── parsers.py # Chunking for all formats + sampling
|
|
281
|
+
├── prompt.py # LLM prompt templates
|
|
282
|
+
├── report.py # Markdown report generation
|
|
283
|
+
├── response.py # XML/markdown parsing + agency dataclasses
|
|
284
|
+
├── schema.py # Schema inference
|
|
285
|
+
├── tui.py # Rich terminal dashboard
|
|
286
|
+
├── validation.py # Runtime validation + holdout
|
|
250
287
|
└── vendor/
|
|
251
|
-
└── chunker.py
|
|
288
|
+
└── chunker.py # Vendored sentence-aware chunker
|
|
252
289
|
```
|
|
253
290
|
|
|
254
291
|
## Testing
|
|
@@ -257,7 +294,7 @@ recursive_cleaner/
|
|
|
257
294
|
pytest tests/ -v
|
|
258
295
|
```
|
|
259
296
|
|
|
260
|
-
|
|
297
|
+
465 tests covering all features. Test datasets in `test_cases/`:
|
|
261
298
|
- E-commerce product catalogs
|
|
262
299
|
- Healthcare patient records
|
|
263
300
|
- Financial transaction data
|
|
@@ -273,6 +310,8 @@ pytest tests/ -v
|
|
|
273
310
|
|
|
274
311
|
| Version | Features |
|
|
275
312
|
|---------|----------|
|
|
313
|
+
| v0.8.0 | Terminal UI with Rich dashboard, mission control aesthetic, transmission log |
|
|
314
|
+
| v0.7.0 | Markitdown (20+ formats), Parquet support, LLM-generated parsers |
|
|
276
315
|
| v0.6.0 | Latency metrics, import consolidation, cleaning report, dry-run mode |
|
|
277
316
|
| v0.5.1 | Dangerous code detection (AST-based security) |
|
|
278
317
|
| v0.5.0 | Two-pass optimization, early termination, LLM agency |
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
backends/__init__.py,sha256=FUgODeYSGBvT0-z6myVby6YeAHG0nEUgWLITBKobUew,121
|
|
2
2
|
backends/mlx_backend.py,sha256=0U6IqmDHyk4vjKzytvEcQvSUBryQTgFtsNOcpwFNKk8,2945
|
|
3
|
-
recursive_cleaner/__init__.py,sha256=
|
|
4
|
-
recursive_cleaner/cleaner.py,sha256=
|
|
3
|
+
recursive_cleaner/__init__.py,sha256=v0bNQ3H0d7n6cTOkuxuqG9bmnX9yeZBLZ_AfFM7edHI,1789
|
|
4
|
+
recursive_cleaner/cleaner.py,sha256=vZTMwaLlCmuh1qy3c-puEZrwS5gXt0u28d5iweQXbms,29801
|
|
5
5
|
recursive_cleaner/context.py,sha256=avMXRDxLd7nd8CKWtvPHQy1MFhBKiA0aUVVJIlWoLZ4,824
|
|
6
6
|
recursive_cleaner/dependencies.py,sha256=vlYeoGL517v3yUSWN0wYDuIs9OOuQwM_dCBADrlitW8,2080
|
|
7
7
|
recursive_cleaner/errors.py,sha256=hwRJF8NSmWy_FZHCxcZDZxLQ0zqvo5dX8ImkB9mrOYc,433
|
|
@@ -14,11 +14,12 @@ recursive_cleaner/prompt.py,sha256=ep0eOXz_XbhH3HduJ76LvzVSftonhcv4GLEecIqd3lY,6
|
|
|
14
14
|
recursive_cleaner/report.py,sha256=AWWneRjvl76ccLlExdkKJeY3GVFUG_LtmzVIJJT5cFI,4629
|
|
15
15
|
recursive_cleaner/response.py,sha256=3w0mLnqEPdB4daMSF0mtTcG0PTP-utb1HFtKuYA1ljw,9064
|
|
16
16
|
recursive_cleaner/schema.py,sha256=w2hcEdApR15KVI9SFWB3VfumMoHFwn1YJrktdfgPo8M,3925
|
|
17
|
+
recursive_cleaner/tui.py,sha256=FwG_uCwqUcvch5dRZmV-ba2JXD0XJkm9roXzPQ9iUSo,21633
|
|
17
18
|
recursive_cleaner/types.py,sha256=-GdCmsfHd3rfdfCi5c-RXqX4TyuCSHgA__3AF3bMhoQ,290
|
|
18
19
|
recursive_cleaner/validation.py,sha256=-KAolhw3GQyhHwmh0clEj8xqPD5O-R2AO5rx7vubIME,6442
|
|
19
20
|
recursive_cleaner/vendor/__init__.py,sha256=E87TjmjRzu8ty39nqThvBwM611yXlLKQZ6KGY_zp3Dk,117
|
|
20
21
|
recursive_cleaner/vendor/chunker.py,sha256=pDDbfF6FoSmUji0-RG4MletPxJ-VybGw0yfnhh0aMSQ,6730
|
|
21
|
-
recursive_cleaner-0.
|
|
22
|
-
recursive_cleaner-0.
|
|
23
|
-
recursive_cleaner-0.
|
|
24
|
-
recursive_cleaner-0.
|
|
22
|
+
recursive_cleaner-0.8.0.dist-info/METADATA,sha256=rVABzjvUZ-uzk35o5evbIlkRIbgEb29QPKSCoMI4_fs,11072
|
|
23
|
+
recursive_cleaner-0.8.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
24
|
+
recursive_cleaner-0.8.0.dist-info/licenses/LICENSE,sha256=P8hRMK-UqRbQDsVN9nr901wpZcqwXEHr28DXhBUheF0,1064
|
|
25
|
+
recursive_cleaner-0.8.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|