recursive-cleaner 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,6 +20,7 @@ from recursive_cleaner.parsers import MARKITDOWN_EXTENSIONS, chunk_file, load_pa
20
20
  from recursive_cleaner.prompt import build_prompt
21
21
  from recursive_cleaner.response import extract_python_block, parse_response
22
22
  from recursive_cleaner.parser_generator import check_parser_safety, generate_parser
23
+ from recursive_cleaner.tui import HAS_RICH, TUIRenderer
23
24
  from recursive_cleaner.validation import check_code_safety, extract_sample_data, validate_function
24
25
 
25
26
  __all__ = [
@@ -49,4 +50,6 @@ __all__ = [
49
50
  "consolidate_with_agency",
50
51
  "generate_parser",
51
52
  "check_parser_safety",
53
+ "TUIRenderer",
54
+ "HAS_RICH",
52
55
  ]
@@ -62,6 +62,7 @@ class DataCleaner:
62
62
  report_path: str | None = "cleaning_report.md",
63
63
  dry_run: bool = False,
64
64
  auto_parse: bool = False,
65
+ tui: bool = False,
65
66
  ):
66
67
  self.backend = llm_backend
67
68
  self.file_path = file_path
@@ -86,7 +87,9 @@ class DataCleaner:
86
87
  self.report_path = report_path
87
88
  self.dry_run = dry_run
88
89
  self.auto_parse = auto_parse
90
+ self.tui = tui
89
91
  self.functions: list[dict] = [] # List of {name, docstring, code}
92
+ self._tui_renderer = None # TUIRenderer instance when tui=True
90
93
  self._generated_parser: callable | None = None # LLM-generated parser for unknown formats
91
94
  # Track recent function generation for saturation check
92
95
  self._recent_new_function_count = 0
@@ -119,10 +122,15 @@ class DataCleaner:
119
122
  try:
120
123
  self.on_progress(event)
121
124
  except Exception as e:
122
- print(f" Warning: callback error: {e}")
125
+ if not self.tui:
126
+ print(f" Warning: callback error: {e}")
123
127
 
124
128
  def _call_llm_timed(self, prompt: str, chunk_index: int = 0) -> str:
125
129
  """Call LLM with timing and emit latency event."""
130
+ # Update TUI status before call
131
+ if self._tui_renderer:
132
+ self._tui_renderer.update_llm_status("calling")
133
+
126
134
  start = time.perf_counter()
127
135
  response = call_llm(self.backend, prompt)
128
136
  elapsed_ms = (time.perf_counter() - start) * 1000
@@ -133,6 +141,20 @@ class DataCleaner:
133
141
  self._latency_stats["min_ms"] = min(self._latency_stats["min_ms"], elapsed_ms)
134
142
  self._latency_stats["max_ms"] = max(self._latency_stats["max_ms"], elapsed_ms)
135
143
 
144
+ # Update TUI status and metrics after call
145
+ if self._tui_renderer:
146
+ self._tui_renderer.update_llm_status("idle")
147
+ latency_summary = self._get_latency_summary()
148
+ self._tui_renderer.update_metrics(
149
+ quality_delta=0.0, # Quality delta calculated at end
150
+ latency_last=elapsed_ms,
151
+ latency_avg=latency_summary.get("avg_ms", 0.0),
152
+ latency_total=latency_summary.get("total_ms", 0.0),
153
+ llm_calls=latency_summary.get("call_count", 0),
154
+ )
155
+ self._tui_renderer.update_tokens(prompt, response)
156
+ self._tui_renderer.update_transmission(response)
157
+
136
158
  # Emit event
137
159
  self._emit("llm_call", chunk_index=chunk_index, latency_ms=round(elapsed_ms, 2))
138
160
 
@@ -216,7 +238,8 @@ class DataCleaner:
216
238
  response = self._call_llm_timed(prompt, chunk_index=chunks_processed - 1)
217
239
  assessment = parse_saturation_response(response)
218
240
  except Exception as e:
219
- print(f" Warning: saturation check failed: {e}")
241
+ if not self.tui:
242
+ print(f" Warning: saturation check failed: {e}")
220
243
  return False # Continue on error
221
244
 
222
245
  self._emit(
@@ -275,7 +298,8 @@ class DataCleaner:
275
298
  self.functions = state.get("functions", [])
276
299
  self._last_completed_chunk = state.get("last_completed_chunk", -1)
277
300
  self._total_chunks = state.get("total_chunks", 0)
278
- print(f"Resumed from state: {self._last_completed_chunk + 1}/{self._total_chunks} chunks completed")
301
+ if not self.tui:
302
+ print(f"Resumed from state: {self._last_completed_chunk + 1}/{self._total_chunks} chunks completed")
279
303
  return True
280
304
 
281
305
  @classmethod
@@ -340,14 +364,16 @@ class DataCleaner:
340
364
  """Load file using LLM-generated parser, return JSONL chunks."""
341
365
  from .parser_generator import generate_parser
342
366
 
343
- print(f"Unknown file format, generating parser...")
367
+ if not self.tui:
368
+ print(f"Unknown file format, generating parser...")
344
369
  self._emit("parser_generation_start")
345
370
 
346
371
  parser = generate_parser(self.backend, self.file_path)
347
372
  self._generated_parser = parser
348
373
 
349
374
  self._emit("parser_generation_complete")
350
- print("Parser generated successfully.")
375
+ if not self.tui:
376
+ print("Parser generated successfully.")
351
377
 
352
378
  # Parse the file
353
379
  records = parser(self.file_path)
@@ -390,7 +416,8 @@ class DataCleaner:
390
416
  )
391
417
 
392
418
  if not chunks:
393
- print("No data to process.")
419
+ if not self.tui:
420
+ print("No data to process.")
394
421
  return
395
422
 
396
423
  # Try to load existing state
@@ -409,13 +436,38 @@ class DataCleaner:
409
436
 
410
437
  self._total_chunks = len(chunks)
411
438
 
439
+ # Initialize TUI if enabled
440
+ if self.tui:
441
+ from .tui import HAS_RICH, TUIRenderer
442
+
443
+ if HAS_RICH:
444
+ self._tui_renderer = TUIRenderer(
445
+ file_path=self.file_path,
446
+ total_chunks=self._total_chunks,
447
+ total_records=0, # Could be calculated from chunks
448
+ )
449
+ self._tui_renderer.start()
450
+ else:
451
+ import logging
452
+
453
+ logging.warning(
454
+ "tui=True but Rich not installed. "
455
+ "Install with: pip install recursive-cleaner[tui]"
456
+ )
457
+
412
458
  for i, chunk in enumerate(chunks):
413
459
  # Skip already completed chunks
414
460
  if i <= self._last_completed_chunk:
415
- if resumed:
461
+ if resumed and not self.tui:
416
462
  print(f"Skipping chunk {i + 1}/{len(chunks)} (already completed)")
417
463
  continue
418
- print(f"Processing chunk {i + 1}/{len(chunks)}...")
464
+ if not self.tui:
465
+ print(f"Processing chunk {i + 1}/{len(chunks)}...")
466
+
467
+ # Update TUI with chunk progress
468
+ if self._tui_renderer:
469
+ self._tui_renderer.update_chunk(i, 0, self.max_iterations)
470
+
419
471
  self._process_chunk(chunk, i)
420
472
  # Mark chunk as completed and save state
421
473
  self._last_completed_chunk = i
@@ -429,7 +481,8 @@ class DataCleaner:
429
481
  ):
430
482
  if self._check_saturation(i + 1):
431
483
  self._emit("early_termination", chunk_index=i)
432
- print(f"Early termination: pattern discovery saturated at chunk {i + 1}")
484
+ if not self.tui:
485
+ print(f"Early termination: pattern discovery saturated at chunk {i + 1}")
433
486
  break
434
487
 
435
488
  # Skip optimization and output in dry_run mode
@@ -439,7 +492,11 @@ class DataCleaner:
439
492
  chunk_index=self._total_chunks - 1,
440
493
  latency_stats=self._get_latency_summary(),
441
494
  )
442
- print("Dry run complete. No functions generated or saved.")
495
+ # Stop TUI if running
496
+ if self._tui_renderer:
497
+ self._tui_renderer.stop()
498
+ if not self.tui:
499
+ print("Dry run complete. No functions generated or saved.")
443
500
  return
444
501
 
445
502
  # Two-pass optimization (if enabled and enough functions)
@@ -453,7 +510,22 @@ class DataCleaner:
453
510
  chunk_index=self._total_chunks - 1,
454
511
  latency_stats=self._get_latency_summary(),
455
512
  )
456
- print(f"Done! Generated {len(self.functions)} functions.")
513
+
514
+ # Show TUI completion and stop
515
+ if self._tui_renderer:
516
+ latency_summary = self._get_latency_summary()
517
+ self._tui_renderer.show_complete({
518
+ "functions_count": len(self.functions),
519
+ "chunks_processed": self._total_chunks,
520
+ "quality_delta": 0.0, # Could be calculated from metrics
521
+ "latency_total_ms": latency_summary.get("total_ms", 0.0),
522
+ "llm_calls": latency_summary.get("call_count", 0),
523
+ "output_file": "cleaning_functions.py",
524
+ })
525
+ self._tui_renderer.stop()
526
+
527
+ if not self.tui:
528
+ print(f"Done! Generated {len(self.functions)} functions.")
457
529
 
458
530
  def _process_chunk(self, chunk: str, chunk_idx: int) -> None:
459
531
  """Process a single chunk, iterating until clean or max iterations."""
@@ -476,6 +548,11 @@ class DataCleaner:
476
548
 
477
549
  for iteration in range(self.max_iterations):
478
550
  self._emit("iteration", chunk_index=chunk_idx, iteration=iteration)
551
+
552
+ # Update TUI with iteration progress
553
+ if self._tui_renderer:
554
+ self._tui_renderer.update_chunk(chunk_idx, iteration, self.max_iterations)
555
+
479
556
  context = build_context(self.functions, self.context_budget)
480
557
  prompt = build_prompt(
481
558
  self.instructions,
@@ -511,7 +588,8 @@ class DataCleaner:
511
588
  function_name=result["name"],
512
589
  error=safety_error,
513
590
  )
514
- print(f" Safety check failed: {safety_error}")
591
+ if not self.tui:
592
+ print(f" Safety check failed: {safety_error}")
515
593
  continue
516
594
 
517
595
  # Runtime validation if enabled
@@ -539,7 +617,8 @@ class DataCleaner:
539
617
  function_name=result["name"],
540
618
  error=error_msg,
541
619
  )
542
- print(f" Validation failed: {error_msg}")
620
+ if not self.tui:
621
+ print(f" Validation failed: {error_msg}")
543
622
  continue
544
623
 
545
624
  self.functions.append({
@@ -549,17 +628,25 @@ class DataCleaner:
549
628
  })
550
629
  # Track for saturation check
551
630
  self._recent_new_function_count += 1
631
+
632
+ # Update TUI with new function
633
+ if self._tui_renderer:
634
+ self._tui_renderer.add_function(result["name"], result["docstring"])
635
+
552
636
  self._emit(
553
637
  "function_generated",
554
638
  chunk_index=chunk_idx,
555
639
  function_name=result["name"],
556
640
  )
557
- print(f" Generated: {result['name']}")
641
+ if not self.tui:
642
+ print(f" Generated: {result['name']}")
558
643
  else:
559
644
  # LLM said needs_more_work but didn't provide code
560
- print(f" Warning: iteration {iteration + 1} produced no function")
645
+ if not self.tui:
646
+ print(f" Warning: iteration {iteration + 1} produced no function")
561
647
 
562
- print(f" Warning: chunk {chunk_idx} hit max iterations ({self.max_iterations})")
648
+ if not self.tui:
649
+ print(f" Warning: chunk {chunk_idx} hit max iterations ({self.max_iterations})")
563
650
  self._emit("chunk_done", chunk_index=chunk_idx)
564
651
 
565
652
  def _process_chunk_dry_run(self, chunk: str, chunk_idx: int) -> None:
@@ -577,7 +664,8 @@ class DataCleaner:
577
664
  response = self._call_llm_timed(prompt, chunk_index=chunk_idx)
578
665
  result = parse_response(response)
579
666
  except ParseError as e:
580
- print(f" Warning: parse error in dry run: {e}")
667
+ if not self.tui:
668
+ print(f" Warning: parse error in dry run: {e}")
581
669
  self._emit("chunk_done", chunk_index=chunk_idx)
582
670
  return
583
671
 
@@ -589,11 +677,12 @@ class DataCleaner:
589
677
  issues=issues,
590
678
  )
591
679
 
592
- if issues:
593
- unsolved = [i for i in issues if not i.get("solved", False)]
594
- print(f" Found {len(issues)} issues ({len(unsolved)} unsolved)")
595
- else:
596
- print(" No issues detected")
680
+ if not self.tui:
681
+ if issues:
682
+ unsolved = [i for i in issues if not i.get("solved", False)]
683
+ print(f" Found {len(issues)} issues ({len(unsolved)} unsolved)")
684
+ else:
685
+ print(" No issues detected")
597
686
 
598
687
  self._emit("chunk_done", chunk_index=chunk_idx)
599
688
 
@@ -604,8 +693,9 @@ class DataCleaner:
604
693
  try:
605
694
  write_cleaning_file(self.functions)
606
695
  except OutputValidationError as e:
607
- print(f" Error: {e}")
608
- print(" Attempting to write valid functions only...")
696
+ if not self.tui:
697
+ print(f" Error: {e}")
698
+ print(" Attempting to write valid functions only...")
609
699
  # Try writing functions one by one, skipping invalid ones
610
700
  valid_functions = []
611
701
  for f in self.functions:
@@ -614,10 +704,11 @@ class DataCleaner:
614
704
  ast.parse(f["code"])
615
705
  valid_functions.append(f)
616
706
  except SyntaxError:
617
- print(f" Skipping invalid function: {f['name']}")
707
+ if not self.tui:
708
+ print(f" Skipping invalid function: {f['name']}")
618
709
  if valid_functions:
619
710
  write_cleaning_file(valid_functions)
620
- else:
711
+ elif not self.tui:
621
712
  print(" No valid functions to write.")
622
713
 
623
714
  def _write_report(self) -> None:
@@ -0,0 +1,595 @@
1
+ """Rich TUI dashboard with Mission Control retro aesthetic."""
2
+
3
+ import time
4
+ from dataclasses import dataclass, field
5
+ from typing import Literal
6
+
7
+ # Graceful import - TUI features only available when Rich is installed
8
+ try:
9
+ from rich.box import DOUBLE
10
+ from rich.console import Console, Group
11
+ from rich.layout import Layout
12
+ from rich.live import Live
13
+ from rich.panel import Panel
14
+ from rich.progress import BarColumn, Progress, TextColumn
15
+ from rich.table import Table
16
+ from rich.text import Text
17
+
18
+ HAS_RICH = True
19
+ except ImportError:
20
+ HAS_RICH = False
21
+
22
+
23
+ # ASCII art banner - chunky block style
24
+ ASCII_BANNER = """
25
+ ██████╗ ███████╗ ██████╗██╗ ██╗██████╗ ███████╗██╗██╗ ██╗███████╗
26
+ ██╔══██╗██╔════╝██╔════╝██║ ██║██╔══██╗██╔════╝██║██║ ██║██╔════╝
27
+ ██████╔╝█████╗ ██║ ██║ ██║██████╔╝███████╗██║██║ ██║█████╗
28
+ ██╔══██╗██╔══╝ ██║ ██║ ██║██╔══██╗╚════██║██║╚██╗ ██╔╝██╔══╝
29
+ ██║ ██║███████╗╚██████╗╚██████╔╝██║ ██║███████║██║ ╚████╔╝ ███████╗
30
+ ╚═╝ ╚═╝╚══════╝ ╚═════╝ ╚═════╝ ╚═╝ ╚═╝╚══════╝╚═╝ ╚═══╝ ╚══════╝
31
+ ██████╗██╗ ███████╗ █████╗ ███╗ ██╗███████╗██████╗
32
+ ██╔════╝██║ ██╔════╝██╔══██╗████╗ ██║██╔════╝██╔══██╗
33
+ ██║ ██║ █████╗ ███████║██╔██╗ ██║█████╗ ██████╔╝
34
+ ██║ ██║ ██╔══╝ ██╔══██║██║╚██╗██║██╔══╝ ██╔══██╗
35
+ ╚██████╗███████╗███████╗██║ ██║██║ ╚████║███████╗██║ ██║
36
+ ╚═════╝╚══════╝╚══════╝╚═╝ ╚═╝╚═╝ ╚═══╝╚══════╝╚═╝ ╚═╝
37
+ """.strip()
38
+
39
+ # Keep HEADER_TITLE for backwards compatibility with tests
40
+ HEADER_TITLE = "RECURSIVE CLEANER"
41
+
42
+
43
@dataclass
class FunctionInfo:
    """Name/docstring pair describing one LLM-generated cleaning function."""

    # Function identifier as emitted by the LLM.
    name: str
    # Short description shown next to the name in the functions panel.
    docstring: str
49
+
50
+
51
@dataclass
class TUIState:
    """Mutable snapshot of everything the dashboard renders.

    Fields are grouped by the panel that consumes them: header identity,
    chunk/iteration progress, LLM activity, generated functions,
    latency/token metrics, and the raw text of the latest LLM response.
    """

    # -- Header panel --
    file_path: str
    total_records: int
    version: str = "0.8.0"

    # -- Progress panel (chunk counts are 1-based for display) --
    current_chunk: int = 0
    total_chunks: int = 0
    current_iteration: int = 0
    max_iterations: int = 5

    # -- LLM activity indicator --
    llm_status: Literal["idle", "calling"] = "idle"

    # -- Generated cleaning functions, in generation order --
    functions: list[FunctionInfo] = field(default_factory=list)

    # -- Latency metrics (all durations in milliseconds) --
    latency_last_ms: float = 0.0
    latency_avg_ms: float = 0.0
    latency_total_ms: float = 0.0
    llm_call_count: int = 0

    # -- Rough token estimates (char count // 4, accumulated per call) --
    tokens_in: int = 0
    tokens_out: int = 0

    # -- Transmission log: raw text of the most recent LLM response --
    last_response: str = ""
84
+
85
+
86
class TUIRenderer:
    """
    Rich-based terminal dashboard with Mission Control retro aesthetic.

    Shows live updates during cleaning runs with:
    - ASCII art banner header
    - Mission timer and status indicator
    - Progress bar and chunk/iteration counters
    - List of generated functions with checkmarks
    - Token estimation and latency metrics
    - Transmission log showing latest LLM response

    Every render path checks HAS_RICH first, so all public methods are
    safe no-ops when Rich is not installed.
    """

    def __init__(self, file_path: str, total_chunks: int, total_records: int = 0):
        """
        Initialize TUI renderer.

        Args:
            file_path: Path to the data file being cleaned
            total_chunks: Total number of chunks to process
            total_records: Total number of records in the file
        """
        self._state = TUIState(
            file_path=file_path,
            total_chunks=total_chunks,
            total_records=total_records,
        )
        self._start_time = time.time()
        # Layout/console exist only when Rich is importable.
        self._layout = self._make_layout() if HAS_RICH else None
        self._live: "Live | None" = None
        self._console = Console() if HAS_RICH else None

    def _make_layout(self) -> "Layout":
        """Create the dashboard layout structure.

        Layout:
        - header (fixed) - ASCII art banner "RECURSIVE CLEANER"
        - status_bar (fixed) - MISSION | TIME | STATUS
        - progress_bar (fixed) - CHUNK X/Y + progress bar
        - body (fixed, computed from terminal height) - split horizontally:
          - left_panel - FUNCTIONS ACQUIRED, tokens, latency
          - right_panel - Parsed transmission log

        CRITICAL: Body uses fixed `size=` not `ratio=` to prevent panels from
        expanding infinitely and pushing the header off screen on large
        terminals. Works on terminals as small as 80x24.

        Returns:
            A populated rich Layout, or None when Rich is unavailable.
        """
        if not HAS_RICH:
            return None

        # Module-level Console is importable here because HAS_RICH is True.
        console = Console()
        term_height = console.height or 24  # Default to 24 if unknown

        # Fixed heights for top sections
        header_height = 14  # ASCII banner (12 lines + border)
        status_height = 3
        progress_height = 3
        fixed_total = header_height + status_height + progress_height

        # Body gets remaining space with a FIXED size (not ratio),
        # capped at 18 rows to keep the dashboard tight.
        body_height = min(18, max(10, term_height - fixed_total - 2))

        layout = Layout()
        layout.split_column(
            Layout(name="header", size=header_height),
            Layout(name="status_bar", size=status_height),
            Layout(name="progress_bar", size=progress_height),
            Layout(name="body", size=body_height),  # FIXED size, not ratio
        )
        layout["body"].split_row(
            Layout(name="left_panel", ratio=1),
            Layout(name="right_panel", ratio=1),
        )
        return layout

    def start(self) -> None:
        """Start the live TUI display (no-op without Rich)."""
        if not HAS_RICH or self._layout is None:
            return

        # Reset the mission clock at display start, not construction time.
        self._start_time = time.time()
        self._refresh()
        self._live = Live(
            self._layout,
            console=self._console,
            refresh_per_second=2,
            vertical_overflow="crop",
        )
        self._live.start()

    def stop(self) -> None:
        """Stop the live TUI display and release the Live handle."""
        if self._live:
            self._live.stop()
            self._live = None

    def update_chunk(self, chunk_index: int, iteration: int, max_iterations: int) -> None:
        """
        Update progress for current chunk and iteration.

        Args:
            chunk_index: Current chunk index (0-based)
            iteration: Current iteration within chunk (0-based)
            max_iterations: Maximum iterations per chunk
        """
        self._state.current_chunk = chunk_index + 1  # Convert to 1-based for display
        self._state.current_iteration = iteration + 1
        self._state.max_iterations = max_iterations
        self._refresh()

    def update_llm_status(self, status: Literal["calling", "idle"]) -> None:
        """
        Update LLM call status.

        Args:
            status: "calling" when LLM is being called, "idle" otherwise
        """
        self._state.llm_status = status
        self._refresh()

    def add_function(self, name: str, docstring: str) -> None:
        """
        Add a newly generated function to the display.

        Args:
            name: Function name
            docstring: Function docstring
        """
        self._state.functions.append(FunctionInfo(name=name, docstring=docstring))
        self._refresh()

    def update_metrics(
        self,
        quality_delta: float,
        latency_last: float,
        latency_avg: float,
        latency_total: float,
        llm_calls: int,
    ) -> None:
        """
        Update latency metrics.

        Args:
            quality_delta: Quality improvement percentage (ignored, kept for compatibility)
            latency_last: Last LLM call latency in ms
            latency_avg: Average LLM call latency in ms
            latency_total: Total LLM call time in ms
            llm_calls: Total number of LLM calls
        """
        self._state.latency_last_ms = latency_last
        self._state.latency_avg_ms = latency_avg
        self._state.latency_total_ms = latency_total
        self._state.llm_call_count = llm_calls
        self._refresh()

    def update_tokens(self, prompt: str, response: str) -> None:
        """
        Update token estimates.

        Rough estimate: len(text) // 4

        Args:
            prompt: The prompt sent to the LLM
            response: The response received from the LLM
        """
        self._state.tokens_in += len(prompt) // 4
        self._state.tokens_out += len(response) // 4
        self._refresh()

    def update_transmission(self, response: str) -> None:
        """
        Update the transmission log with latest LLM response.

        Args:
            response: The latest LLM response text
        """
        self._state.last_response = response
        self._refresh()

    def _get_elapsed_time(self) -> str:
        """Get elapsed time since start as an MM:SS string."""
        elapsed = int(time.time() - self._start_time)
        minutes = elapsed // 60
        seconds = elapsed % 60
        return f"{minutes:02d}:{seconds:02d}"

    def show_complete(self, summary: dict) -> None:
        """
        Show completion summary panel.

        Args:
            summary: Dictionary with completion stats including:
                - functions_count: Number of functions generated
                - chunks_processed: Number of chunks processed
                - latency_total_ms: Total LLM time in ms
                - llm_calls: Number of LLM calls
                - output_file: Path to output file
        """
        if not HAS_RICH or self._layout is None:
            return

        # Build completion panel content
        content = Table.grid(padding=(0, 2))
        content.add_column(justify="left")
        content.add_column(justify="left")

        func_count = summary.get("functions_count", len(self._state.functions))
        chunks = summary.get("chunks_processed", self._state.total_chunks)
        elapsed = self._get_elapsed_time()

        # Token stats
        tokens_in_k = self._state.tokens_in / 1000
        tokens_out_k = self._state.tokens_out / 1000

        content.add_row(
            Text("Functions Acquired:", style="bold"),
            Text(str(func_count), style="green"),
        )
        content.add_row(
            Text("Chunks Processed:", style="bold"),
            Text(str(chunks)),
        )
        content.add_row(
            Text("Total Time:", style="bold"),
            Text(elapsed),
        )
        content.add_row(
            Text("Tokens:", style="bold"),
            Text(f"~{tokens_in_k:.1f}k in / ~{tokens_out_k:.1f}k out"),
        )
        content.add_row(Text(""), Text(""))  # Spacer
        content.add_row(
            Text("Output:", style="bold"),
            Text(summary.get("output_file", "cleaning_functions.py"), style="cyan"),
        )

        # Build the complete panel with box drawing
        complete_panel = Panel(
            content,
            title="[bold green]MISSION COMPLETE[/bold green]",
            border_style="green",
            box=DOUBLE,
        )

        # Replace entire layout with completion panel
        self._layout.split_column(
            Layout(complete_panel, name="complete"),
        )

        if self._live:
            self._live.update(self._layout)

    def _refresh(self) -> None:
        """Refresh all panels with current state and push to the Live display."""
        if not HAS_RICH or self._layout is None:
            return

        self._refresh_header()
        self._refresh_status_bar()
        self._refresh_progress_bar()
        self._refresh_left_panel()
        self._refresh_right_panel()

        if self._live:
            self._live.update(self._layout)

    def _refresh_header(self) -> None:
        """Refresh the header panel with ASCII art banner."""
        if not HAS_RICH or self._layout is None:
            return

        banner_text = Text(ASCII_BANNER, style="bold cyan")
        header_panel = Panel(
            banner_text,
            border_style="cyan",
            box=DOUBLE,
            padding=(0, 1),
        )
        self._layout["header"].update(header_panel)

    def _refresh_status_bar(self) -> None:
        """Refresh the status bar with mission info, timer, and status."""
        if not HAS_RICH or self._layout is None:
            return

        # Truncate file path if too long
        file_path = self._state.file_path
        if len(file_path) > 30:
            file_path = "..." + file_path[-27:]

        elapsed = self._get_elapsed_time()

        # Status indicator
        if self._state.llm_status == "calling":
            status_text = Text("ACTIVE", style="bold green")
            status_indicator = "\u25cf"  # Filled circle
        else:
            status_text = Text("IDLE", style="dim")
            status_indicator = "\u25cb"  # Empty circle

        # Build status bar content
        status_table = Table.grid(padding=(0, 2), expand=True)
        status_table.add_column(justify="left", ratio=2)
        status_table.add_column(justify="center", ratio=1)
        status_table.add_column(justify="right", ratio=1)

        mission_text = Text()
        mission_text.append("MISSION: ", style="bold")
        mission_text.append(file_path, style="cyan")

        time_text = Text()
        time_text.append("TIME: ", style="bold")
        time_text.append(elapsed, style="cyan")

        status_combined = Text()
        status_combined.append("STATUS: ", style="bold")
        status_combined.append(f"{status_indicator} ", style="green" if self._state.llm_status == "calling" else "dim")
        status_combined.append_text(status_text)

        status_table.add_row(mission_text, time_text, status_combined)

        status_panel = Panel(
            status_table,
            border_style="cyan",
            box=DOUBLE,
            padding=(0, 1),
        )
        self._layout["status_bar"].update(status_panel)

    def _refresh_progress_bar(self) -> None:
        """Refresh the progress bar panel."""
        if not HAS_RICH or self._layout is None:
            return

        # Calculate progress percentage
        progress_pct = 0
        if self._state.total_chunks > 0:
            progress_pct = int((self._state.current_chunk / self._state.total_chunks) * 100)

        # Build progress bar using Rich Progress
        progress = Progress(
            TextColumn("[bold cyan]\u25ba[/bold cyan]"),
            TextColumn(f"CHUNK {self._state.current_chunk}/{self._state.total_chunks}"),
            BarColumn(bar_width=30, complete_style="cyan", finished_style="green"),
            TextColumn(f"{progress_pct}%"),
            expand=False,
        )
        # Task handle is not needed; the bar is rendered once per refresh.
        progress.add_task("", total=self._state.total_chunks, completed=self._state.current_chunk)

        progress_panel = Panel(
            progress,
            border_style="cyan",
            box=DOUBLE,
            padding=(0, 1),
        )
        self._layout["progress_bar"].update(progress_panel)

    def _refresh_left_panel(self) -> None:
        """Refresh the left panel with functions list and metrics."""
        if not HAS_RICH or self._layout is None:
            return

        func_count = len(self._state.functions)

        # Build function tree
        content = Table.grid(padding=(0, 0))
        content.add_column()

        # Show max 6 functions with tree structure
        max_display = 6
        display_funcs = self._state.functions[-max_display:] if func_count > max_display else self._state.functions

        for i, func in enumerate(display_funcs):
            func_text = Text()
            # Tree-style prefix
            if i == len(display_funcs) - 1:
                func_text.append("\u2514\u2500 ", style="dim cyan")  # Corner
            else:
                func_text.append("\u251c\u2500 ", style="dim cyan")  # Tee

            func_text.append(func.name, style="bold")
            func_text.append(" \u2713", style="green")  # Checkmark

            content.add_row(func_text)

        # Show "+N more" if truncated
        if func_count > max_display:
            hidden_count = func_count - max_display
            content.add_row(Text(f"  (+{hidden_count} more)", style="dim italic"))

        # Add spacing
        content.add_row(Text(""))

        # Token stats
        tokens_in_k = self._state.tokens_in / 1000
        tokens_out_k = self._state.tokens_out / 1000
        tokens_text = Text()
        tokens_text.append("TOKENS: ", style="bold")
        tokens_text.append(f"~{tokens_in_k:.1f}k in / ~{tokens_out_k:.1f}k out", style="dim")
        content.add_row(tokens_text)

        # Latency stats
        latency_text = Text()
        latency_text.append("LATENCY: ", style="bold")
        if self._state.llm_call_count > 0:
            # BUGFIX: both values are stored in ms — convert BOTH to seconds
            # for display (previously the last latency was shown as a raw ms
            # value with an "s" suffix while the avg was divided by 1000).
            latency_text.append(f"{self._state.latency_last_ms / 1000:.1f}s", style="cyan")
            latency_text.append(f" (avg {self._state.latency_avg_ms / 1000:.1f}s)", style="dim")
        else:
            latency_text.append("\u2014", style="dim")  # Em dash
        content.add_row(latency_text)

        left_panel = Panel(
            content,
            title=f"[bold cyan]FUNCTIONS ACQUIRED [{func_count}][/bold cyan]",
            border_style="cyan",
            box=DOUBLE,
        )
        self._layout["left_panel"].update(left_panel)

    def _parse_response_for_display(self, response: str) -> str:
        """Parse LLM XML response into readable format for transmission log.

        Args:
            response: Raw LLM response text (XML format)

        Returns:
            Formatted string for display showing issues, function being
            generated, and chunk status. Falls back to the (truncated)
            raw response when nothing recognizable is found or parsing
            raises.
        """
        import re

        lines = []

        try:
            # Find all issues
            issue_pattern = r'<issue[^>]*id="(\d+)"[^>]*solved="(true|false)"[^>]*>([^<]+)</issue>'
            issues = re.findall(issue_pattern, response, re.DOTALL)

            if issues:
                lines.append("ISSUES DETECTED:")
                for issue_id, solved, desc in issues[:8]:  # Limit to 8 issues
                    marker = "\u2713" if solved == "true" else "\u2717"  # checkmark or X
                    desc_clean = desc.strip()[:40]  # Truncate description
                    lines.append(f"  {marker} {desc_clean}")
                if len(issues) > 8:
                    lines.append(f"  (+{len(issues) - 8} more)")
                lines.append("")

            # Find function being generated
            name_match = re.search(r'<name>([^<]+)</name>', response)
            docstring_match = re.search(r'<docstring>([^<]+)</docstring>', response, re.DOTALL)

            if name_match:
                lines.append(f"GENERATING: {name_match.group(1).strip()}")
                if docstring_match:
                    doc = docstring_match.group(1).strip()[:60]
                    lines.append(f'  "{doc}..."')
                lines.append("")

            # Find chunk status
            status_match = re.search(r'<chunk_status>([^<]+)</chunk_status>', response)
            if status_match:
                status = status_match.group(1).strip()
                lines.append(f"STATUS: {status.upper()}")

            if lines:
                return "\n".join(lines)
        except Exception:
            # Best-effort display helper: never let a malformed response
            # crash the dashboard — fall through to the raw fallback.
            pass

        # Fallback: show truncated raw response
        return response[:500] + "..." if len(response) > 500 else response

    def _refresh_right_panel(self) -> None:
        """Refresh the right panel with parsed transmission log."""
        if not HAS_RICH or self._layout is None:
            return

        # Get last response and parse for display
        response = self._state.last_response
        if not response:
            display_text = "(Awaiting transmission...)"
        else:
            display_text = self._parse_response_for_display(response)

        log_text = Text(display_text, style="dim cyan")

        right_panel = Panel(
            log_text,
            title="[bold cyan]\u25c4\u25c4 TRANSMISSION LOG \u25ba\u25ba[/bold cyan]",
            border_style="cyan",
            box=DOUBLE,
        )
        self._layout["right_panel"].update(right_panel)

    # Legacy method stubs for backwards compatibility
    def _refresh_progress(self) -> None:
        """Legacy method - calls _refresh_progress_bar."""
        self._refresh_progress_bar()

    def _refresh_functions(self) -> None:
        """Legacy method - calls _refresh_left_panel."""
        self._refresh_left_panel()

    def _refresh_footer(self) -> None:
        """Legacy method - no longer used but kept for compatibility."""
        pass
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: recursive-cleaner
3
- Version: 0.7.1
3
+ Version: 0.8.0
4
4
  Summary: LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions
5
5
  Project-URL: Homepage, https://github.com/gaztrabisme/recursive-data-cleaner
6
6
  Project-URL: Repository, https://github.com/gaztrabisme/recursive-data-cleaner
@@ -32,6 +32,8 @@ Provides-Extra: mlx
32
32
  Requires-Dist: mlx-lm>=0.10.0; extra == 'mlx'
33
33
  Provides-Extra: parquet
34
34
  Requires-Dist: pyarrow>=14.0.0; extra == 'parquet'
35
+ Provides-Extra: tui
36
+ Requires-Dist: rich>=13.0; extra == 'tui'
35
37
  Description-Content-Type: text/markdown
36
38
 
37
39
  # Recursive Data Cleaner
@@ -69,6 +71,11 @@ For Parquet files:
69
71
  pip install -e ".[parquet]"
70
72
  ```
71
73
 
74
+ For Terminal UI (Rich dashboard):
75
+ ```bash
76
+ pip install -e ".[tui]"
77
+ ```
78
+
72
79
  ## Quick Start
73
80
 
74
81
  ```python
@@ -126,6 +133,13 @@ cleaner.run() # Generates cleaning_functions.py
126
133
  - **Parquet Support**: Load parquet files as structured data via pyarrow
127
134
  - **LLM-Generated Parsers**: Auto-generate parsers for XML and unknown formats (`auto_parse=True`)
128
135
 
136
+ ### Terminal UI (v0.8.0)
137
+ - **Mission Control Dashboard**: Rich-based live terminal UI with retro aesthetic
138
+ - **Real-time Progress**: Animated progress bars, chunk/iteration counters
139
+ - **Transmission Log**: Parsed LLM responses showing issues detected and functions being generated
140
+ - **Token Estimation**: Track estimated input/output tokens across the run
141
+ - **Graceful Fallback**: Works without Rich installed (falls back to callbacks)
142
+
129
143
  ## Configuration
130
144
 
131
145
  ```python
@@ -160,6 +174,9 @@ cleaner = DataCleaner(
160
174
  # Format Expansion
161
175
  auto_parse=False, # LLM generates parser for unknown formats
162
176
 
177
+ # Terminal UI
178
+ tui=True, # Enable Rich dashboard (requires [tui] extra)
179
+
163
180
  # Progress & State
164
181
  on_progress=callback, # Progress event callback
165
182
  state_file="state.json", # Enable resume on interrupt
@@ -265,6 +282,7 @@ recursive_cleaner/
265
282
  ├── report.py # Markdown report generation
266
283
  ├── response.py # XML/markdown parsing + agency dataclasses
267
284
  ├── schema.py # Schema inference
285
+ ├── tui.py # Rich terminal dashboard
268
286
  ├── validation.py # Runtime validation + holdout
269
287
  └── vendor/
270
288
  └── chunker.py # Vendored sentence-aware chunker
@@ -276,7 +294,7 @@ recursive_cleaner/
276
294
  pytest tests/ -v
277
295
  ```
278
296
 
279
- 432 tests covering all features. Test datasets in `test_cases/`:
297
+ 465 tests covering all features. Test datasets in `test_cases/`:
280
298
  - E-commerce product catalogs
281
299
  - Healthcare patient records
282
300
  - Financial transaction data
@@ -292,6 +310,7 @@ pytest tests/ -v
292
310
 
293
311
  | Version | Features |
294
312
  |---------|----------|
313
+ | v0.8.0 | Terminal UI with Rich dashboard, mission control aesthetic, transmission log |
295
314
  | v0.7.0 | Markitdown (20+ formats), Parquet support, LLM-generated parsers |
296
315
  | v0.6.0 | Latency metrics, import consolidation, cleaning report, dry-run mode |
297
316
  | v0.5.1 | Dangerous code detection (AST-based security) |
@@ -1,7 +1,7 @@
1
1
  backends/__init__.py,sha256=FUgODeYSGBvT0-z6myVby6YeAHG0nEUgWLITBKobUew,121
2
2
  backends/mlx_backend.py,sha256=0U6IqmDHyk4vjKzytvEcQvSUBryQTgFtsNOcpwFNKk8,2945
3
- recursive_cleaner/__init__.py,sha256=bG83PcmkxAYMC17FmKuyMJUrMnuukp32JO3rlCLyB-Q,1698
4
- recursive_cleaner/cleaner.py,sha256=J2X5bnk2OsWJyOn4BNR-cj0sqeKCylznfs_WEyMGxG8,26280
3
+ recursive_cleaner/__init__.py,sha256=v0bNQ3H0d7n6cTOkuxuqG9bmnX9yeZBLZ_AfFM7edHI,1789
4
+ recursive_cleaner/cleaner.py,sha256=vZTMwaLlCmuh1qy3c-puEZrwS5gXt0u28d5iweQXbms,29801
5
5
  recursive_cleaner/context.py,sha256=avMXRDxLd7nd8CKWtvPHQy1MFhBKiA0aUVVJIlWoLZ4,824
6
6
  recursive_cleaner/dependencies.py,sha256=vlYeoGL517v3yUSWN0wYDuIs9OOuQwM_dCBADrlitW8,2080
7
7
  recursive_cleaner/errors.py,sha256=hwRJF8NSmWy_FZHCxcZDZxLQ0zqvo5dX8ImkB9mrOYc,433
@@ -14,11 +14,12 @@ recursive_cleaner/prompt.py,sha256=ep0eOXz_XbhH3HduJ76LvzVSftonhcv4GLEecIqd3lY,6
14
14
  recursive_cleaner/report.py,sha256=AWWneRjvl76ccLlExdkKJeY3GVFUG_LtmzVIJJT5cFI,4629
15
15
  recursive_cleaner/response.py,sha256=3w0mLnqEPdB4daMSF0mtTcG0PTP-utb1HFtKuYA1ljw,9064
16
16
  recursive_cleaner/schema.py,sha256=w2hcEdApR15KVI9SFWB3VfumMoHFwn1YJrktdfgPo8M,3925
17
+ recursive_cleaner/tui.py,sha256=FwG_uCwqUcvch5dRZmV-ba2JXD0XJkm9roXzPQ9iUSo,21633
17
18
  recursive_cleaner/types.py,sha256=-GdCmsfHd3rfdfCi5c-RXqX4TyuCSHgA__3AF3bMhoQ,290
18
19
  recursive_cleaner/validation.py,sha256=-KAolhw3GQyhHwmh0clEj8xqPD5O-R2AO5rx7vubIME,6442
19
20
  recursive_cleaner/vendor/__init__.py,sha256=E87TjmjRzu8ty39nqThvBwM611yXlLKQZ6KGY_zp3Dk,117
20
21
  recursive_cleaner/vendor/chunker.py,sha256=pDDbfF6FoSmUji0-RG4MletPxJ-VybGw0yfnhh0aMSQ,6730
21
- recursive_cleaner-0.7.1.dist-info/METADATA,sha256=X5_HVPMIPUULKKIgDvqhN0ZRQQBcZ1lupGb9frLdCSI,10258
22
- recursive_cleaner-0.7.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
23
- recursive_cleaner-0.7.1.dist-info/licenses/LICENSE,sha256=P8hRMK-UqRbQDsVN9nr901wpZcqwXEHr28DXhBUheF0,1064
24
- recursive_cleaner-0.7.1.dist-info/RECORD,,
22
+ recursive_cleaner-0.8.0.dist-info/METADATA,sha256=rVABzjvUZ-uzk35o5evbIlkRIbgEb29QPKSCoMI4_fs,11072
23
+ recursive_cleaner-0.8.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
24
+ recursive_cleaner-0.8.0.dist-info/licenses/LICENSE,sha256=P8hRMK-UqRbQDsVN9nr901wpZcqwXEHr28DXhBUheF0,1064
25
+ recursive_cleaner-0.8.0.dist-info/RECORD,,