recursive-cleaner 0.7.0__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,6 +20,7 @@ from recursive_cleaner.parsers import MARKITDOWN_EXTENSIONS, chunk_file, load_pa
20
20
  from recursive_cleaner.prompt import build_prompt
21
21
  from recursive_cleaner.response import extract_python_block, parse_response
22
22
  from recursive_cleaner.parser_generator import check_parser_safety, generate_parser
23
+ from recursive_cleaner.tui import HAS_RICH, TUIRenderer
23
24
  from recursive_cleaner.validation import check_code_safety, extract_sample_data, validate_function
24
25
 
25
26
  __all__ = [
@@ -49,4 +50,6 @@ __all__ = [
49
50
  "consolidate_with_agency",
50
51
  "generate_parser",
51
52
  "check_parser_safety",
53
+ "TUIRenderer",
54
+ "HAS_RICH",
52
55
  ]
@@ -62,6 +62,7 @@ class DataCleaner:
62
62
  report_path: str | None = "cleaning_report.md",
63
63
  dry_run: bool = False,
64
64
  auto_parse: bool = False,
65
+ tui: bool = False,
65
66
  ):
66
67
  self.backend = llm_backend
67
68
  self.file_path = file_path
@@ -86,7 +87,9 @@ class DataCleaner:
86
87
  self.report_path = report_path
87
88
  self.dry_run = dry_run
88
89
  self.auto_parse = auto_parse
90
+ self.tui = tui
89
91
  self.functions: list[dict] = [] # List of {name, docstring, code}
92
+ self._tui_renderer = None # TUIRenderer instance when tui=True
90
93
  self._generated_parser: callable | None = None # LLM-generated parser for unknown formats
91
94
  # Track recent function generation for saturation check
92
95
  self._recent_new_function_count = 0
@@ -119,10 +122,15 @@ class DataCleaner:
119
122
  try:
120
123
  self.on_progress(event)
121
124
  except Exception as e:
122
- print(f" Warning: callback error: {e}")
125
+ if not self.tui:
126
+ print(f" Warning: callback error: {e}")
123
127
 
124
128
  def _call_llm_timed(self, prompt: str, chunk_index: int = 0) -> str:
125
129
  """Call LLM with timing and emit latency event."""
130
+ # Update TUI status before call
131
+ if self._tui_renderer:
132
+ self._tui_renderer.update_llm_status("calling")
133
+
126
134
  start = time.perf_counter()
127
135
  response = call_llm(self.backend, prompt)
128
136
  elapsed_ms = (time.perf_counter() - start) * 1000
@@ -133,6 +141,20 @@ class DataCleaner:
133
141
  self._latency_stats["min_ms"] = min(self._latency_stats["min_ms"], elapsed_ms)
134
142
  self._latency_stats["max_ms"] = max(self._latency_stats["max_ms"], elapsed_ms)
135
143
 
144
+ # Update TUI status and metrics after call
145
+ if self._tui_renderer:
146
+ self._tui_renderer.update_llm_status("idle")
147
+ latency_summary = self._get_latency_summary()
148
+ self._tui_renderer.update_metrics(
149
+ quality_delta=0.0, # Quality delta calculated at end
150
+ latency_last=elapsed_ms,
151
+ latency_avg=latency_summary.get("avg_ms", 0.0),
152
+ latency_total=latency_summary.get("total_ms", 0.0),
153
+ llm_calls=latency_summary.get("call_count", 0),
154
+ )
155
+ self._tui_renderer.update_tokens(prompt, response)
156
+ self._tui_renderer.update_transmission(response)
157
+
136
158
  # Emit event
137
159
  self._emit("llm_call", chunk_index=chunk_index, latency_ms=round(elapsed_ms, 2))
138
160
 
@@ -216,7 +238,8 @@ class DataCleaner:
216
238
  response = self._call_llm_timed(prompt, chunk_index=chunks_processed - 1)
217
239
  assessment = parse_saturation_response(response)
218
240
  except Exception as e:
219
- print(f" Warning: saturation check failed: {e}")
241
+ if not self.tui:
242
+ print(f" Warning: saturation check failed: {e}")
220
243
  return False # Continue on error
221
244
 
222
245
  self._emit(
@@ -275,7 +298,8 @@ class DataCleaner:
275
298
  self.functions = state.get("functions", [])
276
299
  self._last_completed_chunk = state.get("last_completed_chunk", -1)
277
300
  self._total_chunks = state.get("total_chunks", 0)
278
- print(f"Resumed from state: {self._last_completed_chunk + 1}/{self._total_chunks} chunks completed")
301
+ if not self.tui:
302
+ print(f"Resumed from state: {self._last_completed_chunk + 1}/{self._total_chunks} chunks completed")
279
303
  return True
280
304
 
281
305
  @classmethod
@@ -340,14 +364,16 @@ class DataCleaner:
340
364
  """Load file using LLM-generated parser, return JSONL chunks."""
341
365
  from .parser_generator import generate_parser
342
366
 
343
- print(f"Unknown file format, generating parser...")
367
+ if not self.tui:
368
+ print(f"Unknown file format, generating parser...")
344
369
  self._emit("parser_generation_start")
345
370
 
346
371
  parser = generate_parser(self.backend, self.file_path)
347
372
  self._generated_parser = parser
348
373
 
349
374
  self._emit("parser_generation_complete")
350
- print("Parser generated successfully.")
375
+ if not self.tui:
376
+ print("Parser generated successfully.")
351
377
 
352
378
  # Parse the file
353
379
  records = parser(self.file_path)
@@ -390,7 +416,8 @@ class DataCleaner:
390
416
  )
391
417
 
392
418
  if not chunks:
393
- print("No data to process.")
419
+ if not self.tui:
420
+ print("No data to process.")
394
421
  return
395
422
 
396
423
  # Try to load existing state
@@ -409,13 +436,38 @@ class DataCleaner:
409
436
 
410
437
  self._total_chunks = len(chunks)
411
438
 
439
+ # Initialize TUI if enabled
440
+ if self.tui:
441
+ from .tui import HAS_RICH, TUIRenderer
442
+
443
+ if HAS_RICH:
444
+ self._tui_renderer = TUIRenderer(
445
+ file_path=self.file_path,
446
+ total_chunks=self._total_chunks,
447
+ total_records=0, # Could be calculated from chunks
448
+ )
449
+ self._tui_renderer.start()
450
+ else:
451
+ import logging
452
+
453
+ logging.warning(
454
+ "tui=True but Rich not installed. "
455
+ "Install with: pip install recursive-cleaner[tui]"
456
+ )
457
+
412
458
  for i, chunk in enumerate(chunks):
413
459
  # Skip already completed chunks
414
460
  if i <= self._last_completed_chunk:
415
- if resumed:
461
+ if resumed and not self.tui:
416
462
  print(f"Skipping chunk {i + 1}/{len(chunks)} (already completed)")
417
463
  continue
418
- print(f"Processing chunk {i + 1}/{len(chunks)}...")
464
+ if not self.tui:
465
+ print(f"Processing chunk {i + 1}/{len(chunks)}...")
466
+
467
+ # Update TUI with chunk progress
468
+ if self._tui_renderer:
469
+ self._tui_renderer.update_chunk(i, 0, self.max_iterations)
470
+
419
471
  self._process_chunk(chunk, i)
420
472
  # Mark chunk as completed and save state
421
473
  self._last_completed_chunk = i
@@ -429,7 +481,8 @@ class DataCleaner:
429
481
  ):
430
482
  if self._check_saturation(i + 1):
431
483
  self._emit("early_termination", chunk_index=i)
432
- print(f"Early termination: pattern discovery saturated at chunk {i + 1}")
484
+ if not self.tui:
485
+ print(f"Early termination: pattern discovery saturated at chunk {i + 1}")
433
486
  break
434
487
 
435
488
  # Skip optimization and output in dry_run mode
@@ -439,7 +492,11 @@ class DataCleaner:
439
492
  chunk_index=self._total_chunks - 1,
440
493
  latency_stats=self._get_latency_summary(),
441
494
  )
442
- print("Dry run complete. No functions generated or saved.")
495
+ # Stop TUI if running
496
+ if self._tui_renderer:
497
+ self._tui_renderer.stop()
498
+ if not self.tui:
499
+ print("Dry run complete. No functions generated or saved.")
443
500
  return
444
501
 
445
502
  # Two-pass optimization (if enabled and enough functions)
@@ -453,7 +510,22 @@ class DataCleaner:
453
510
  chunk_index=self._total_chunks - 1,
454
511
  latency_stats=self._get_latency_summary(),
455
512
  )
456
- print(f"Done! Generated {len(self.functions)} functions.")
513
+
514
+ # Show TUI completion and stop
515
+ if self._tui_renderer:
516
+ latency_summary = self._get_latency_summary()
517
+ self._tui_renderer.show_complete({
518
+ "functions_count": len(self.functions),
519
+ "chunks_processed": self._total_chunks,
520
+ "quality_delta": 0.0, # Could be calculated from metrics
521
+ "latency_total_ms": latency_summary.get("total_ms", 0.0),
522
+ "llm_calls": latency_summary.get("call_count", 0),
523
+ "output_file": "cleaning_functions.py",
524
+ })
525
+ self._tui_renderer.stop()
526
+
527
+ if not self.tui:
528
+ print(f"Done! Generated {len(self.functions)} functions.")
457
529
 
458
530
  def _process_chunk(self, chunk: str, chunk_idx: int) -> None:
459
531
  """Process a single chunk, iterating until clean or max iterations."""
@@ -476,6 +548,11 @@ class DataCleaner:
476
548
 
477
549
  for iteration in range(self.max_iterations):
478
550
  self._emit("iteration", chunk_index=chunk_idx, iteration=iteration)
551
+
552
+ # Update TUI with iteration progress
553
+ if self._tui_renderer:
554
+ self._tui_renderer.update_chunk(chunk_idx, iteration, self.max_iterations)
555
+
479
556
  context = build_context(self.functions, self.context_budget)
480
557
  prompt = build_prompt(
481
558
  self.instructions,
@@ -511,7 +588,8 @@ class DataCleaner:
511
588
  function_name=result["name"],
512
589
  error=safety_error,
513
590
  )
514
- print(f" Safety check failed: {safety_error}")
591
+ if not self.tui:
592
+ print(f" Safety check failed: {safety_error}")
515
593
  continue
516
594
 
517
595
  # Runtime validation if enabled
@@ -539,7 +617,8 @@ class DataCleaner:
539
617
  function_name=result["name"],
540
618
  error=error_msg,
541
619
  )
542
- print(f" Validation failed: {error_msg}")
620
+ if not self.tui:
621
+ print(f" Validation failed: {error_msg}")
543
622
  continue
544
623
 
545
624
  self.functions.append({
@@ -549,17 +628,25 @@ class DataCleaner:
549
628
  })
550
629
  # Track for saturation check
551
630
  self._recent_new_function_count += 1
631
+
632
+ # Update TUI with new function
633
+ if self._tui_renderer:
634
+ self._tui_renderer.add_function(result["name"], result["docstring"])
635
+
552
636
  self._emit(
553
637
  "function_generated",
554
638
  chunk_index=chunk_idx,
555
639
  function_name=result["name"],
556
640
  )
557
- print(f" Generated: {result['name']}")
641
+ if not self.tui:
642
+ print(f" Generated: {result['name']}")
558
643
  else:
559
644
  # LLM said needs_more_work but didn't provide code
560
- print(f" Warning: iteration {iteration + 1} produced no function")
645
+ if not self.tui:
646
+ print(f" Warning: iteration {iteration + 1} produced no function")
561
647
 
562
- print(f" Warning: chunk {chunk_idx} hit max iterations ({self.max_iterations})")
648
+ if not self.tui:
649
+ print(f" Warning: chunk {chunk_idx} hit max iterations ({self.max_iterations})")
563
650
  self._emit("chunk_done", chunk_index=chunk_idx)
564
651
 
565
652
  def _process_chunk_dry_run(self, chunk: str, chunk_idx: int) -> None:
@@ -577,7 +664,8 @@ class DataCleaner:
577
664
  response = self._call_llm_timed(prompt, chunk_index=chunk_idx)
578
665
  result = parse_response(response)
579
666
  except ParseError as e:
580
- print(f" Warning: parse error in dry run: {e}")
667
+ if not self.tui:
668
+ print(f" Warning: parse error in dry run: {e}")
581
669
  self._emit("chunk_done", chunk_index=chunk_idx)
582
670
  return
583
671
 
@@ -589,11 +677,12 @@ class DataCleaner:
589
677
  issues=issues,
590
678
  )
591
679
 
592
- if issues:
593
- unsolved = [i for i in issues if not i.get("solved", False)]
594
- print(f" Found {len(issues)} issues ({len(unsolved)} unsolved)")
595
- else:
596
- print(" No issues detected")
680
+ if not self.tui:
681
+ if issues:
682
+ unsolved = [i for i in issues if not i.get("solved", False)]
683
+ print(f" Found {len(issues)} issues ({len(unsolved)} unsolved)")
684
+ else:
685
+ print(" No issues detected")
597
686
 
598
687
  self._emit("chunk_done", chunk_index=chunk_idx)
599
688
 
@@ -604,8 +693,9 @@ class DataCleaner:
604
693
  try:
605
694
  write_cleaning_file(self.functions)
606
695
  except OutputValidationError as e:
607
- print(f" Error: {e}")
608
- print(" Attempting to write valid functions only...")
696
+ if not self.tui:
697
+ print(f" Error: {e}")
698
+ print(" Attempting to write valid functions only...")
609
699
  # Try writing functions one by one, skipping invalid ones
610
700
  valid_functions = []
611
701
  for f in self.functions:
@@ -614,10 +704,11 @@ class DataCleaner:
614
704
  ast.parse(f["code"])
615
705
  valid_functions.append(f)
616
706
  except SyntaxError:
617
- print(f" Skipping invalid function: {f['name']}")
707
+ if not self.tui:
708
+ print(f" Skipping invalid function: {f['name']}")
618
709
  if valid_functions:
619
710
  write_cleaning_file(valid_functions)
620
- else:
711
+ elif not self.tui:
621
712
  print(" No valid functions to write.")
622
713
 
623
714
  def _write_report(self) -> None:
@@ -0,0 +1,595 @@
1
+ """Rich TUI dashboard with Mission Control retro aesthetic."""
2
+
3
+ import time
4
+ from dataclasses import dataclass, field
5
+ from typing import Literal
6
+
7
+ # Graceful import - TUI features only available when Rich is installed
8
+ try:
9
+ from rich.box import DOUBLE
10
+ from rich.console import Console, Group
11
+ from rich.layout import Layout
12
+ from rich.live import Live
13
+ from rich.panel import Panel
14
+ from rich.progress import BarColumn, Progress, TextColumn
15
+ from rich.table import Table
16
+ from rich.text import Text
17
+
18
+ HAS_RICH = True
19
+ except ImportError:
20
+ HAS_RICH = False
21
+
22
+
23
+ # ASCII art banner - chunky block style
24
+ ASCII_BANNER = """
25
+ ██████╗ ███████╗ ██████╗██╗ ██╗██████╗ ███████╗██╗██╗ ██╗███████╗
26
+ ██╔══██╗██╔════╝██╔════╝██║ ██║██╔══██╗██╔════╝██║██║ ██║██╔════╝
27
+ ██████╔╝█████╗ ██║ ██║ ██║██████╔╝███████╗██║██║ ██║█████╗
28
+ ██╔══██╗██╔══╝ ██║ ██║ ██║██╔══██╗╚════██║██║╚██╗ ██╔╝██╔══╝
29
+ ██║ ██║███████╗╚██████╗╚██████╔╝██║ ██║███████║██║ ╚████╔╝ ███████╗
30
+ ╚═╝ ╚═╝╚══════╝ ╚═════╝ ╚═════╝ ╚═╝ ╚═╝╚══════╝╚═╝ ╚═══╝ ╚══════╝
31
+ ██████╗██╗ ███████╗ █████╗ ███╗ ██╗███████╗██████╗
32
+ ██╔════╝██║ ██╔════╝██╔══██╗████╗ ██║██╔════╝██╔══██╗
33
+ ██║ ██║ █████╗ ███████║██╔██╗ ██║█████╗ ██████╔╝
34
+ ██║ ██║ ██╔══╝ ██╔══██║██║╚██╗██║██╔══╝ ██╔══██╗
35
+ ╚██████╗███████╗███████╗██║ ██║██║ ╚████║███████╗██║ ██║
36
+ ╚═════╝╚══════╝╚══════╝╚═╝ ╚═╝╚═╝ ╚═══╝╚══════╝╚═╝ ╚═╝
37
+ """.strip()
38
+
39
+ # Keep HEADER_TITLE for backwards compatibility with tests
40
+ HEADER_TITLE = "RECURSIVE CLEANER"
41
+
42
+
43
@dataclass
class FunctionInfo:
    """Display record for one generated cleaning function."""

    # Function identifier; rendered in the functions list of the left panel.
    name: str
    # Docstring text stored alongside the name.
    docstring: str
49
+
50
+
51
@dataclass
class TUIState:
    """Mutable snapshot of everything the dashboard renders.

    TUIRenderer writes to these fields from its ``update_*`` methods and
    reads them back when redrawing panels.
    """

    # --- Header -----------------------------------------------------------
    file_path: str
    total_records: int
    version: str = "0.8.0"

    # --- Progress (chunk/iteration are stored 1-based by update_chunk) ----
    current_chunk: int = 0
    total_chunks: int = 0
    current_iteration: int = 0
    max_iterations: int = 5

    # --- LLM status ---------------------------------------------------------
    llm_status: Literal["idle", "calling"] = "idle"

    # --- Generated functions (fresh list per instance) ---------------------
    functions: list[FunctionInfo] = field(default_factory=list)

    # --- Latency metrics, all in milliseconds -------------------------------
    latency_last_ms: float = 0.0
    latency_avg_ms: float = 0.0
    latency_total_ms: float = 0.0
    llm_call_count: int = 0

    # --- Running token estimates (len(text) // 4 per call) ------------------
    tokens_in: int = 0
    tokens_out: int = 0

    # --- Transmission log: most recent raw LLM response ---------------------
    last_response: str = ""
84
+
85
+
86
+ class TUIRenderer:
87
+ """
88
+ Rich-based terminal dashboard with Mission Control retro aesthetic.
89
+
90
+ Shows live updates during cleaning runs with:
91
+ - ASCII art banner header
92
+ - Mission timer and status indicator
93
+ - Progress bar and chunk/iteration counters
94
+ - List of generated functions with checkmarks
95
+ - Token estimation and latency metrics
96
+ - Transmission log showing latest LLM response
97
+ """
98
+
99
def __init__(self, file_path: str, total_chunks: int, total_records: int = 0):
    """
    Set up dashboard state; nothing is drawn until start() is called.

    Args:
        file_path: Path to the data file being cleaned
        total_chunks: Total number of chunks to process
        total_records: Total number of records in the file
    """
    initial_state = TUIState(
        file_path=file_path,
        total_chunks=total_chunks,
        total_records=total_records,
    )
    self._state = initial_state
    self._start_time = time.time()

    # Without Rich there is nothing to lay out; every render method then
    # returns early on the None layout.
    if HAS_RICH:
        self._layout = self._make_layout()
    else:
        self._layout = None

    self._live: "Live | None" = None

    if HAS_RICH:
        self._console = Console()
    else:
        self._console = None
117
+
118
def _make_layout(self) -> "Layout":
    """Create the dashboard layout structure.

    Layout (top to bottom):
    - header (size=14)      - ASCII art banner
    - status_bar (size=3)   - MISSION | TIME | STATUS
    - progress_bar (size=3) - CHUNK X/Y + progress bar
    - body (size=10..18)    - split horizontally into equal halves:
        - left_panel  - functions list, tokens, latency
        - right_panel - parsed transmission log

    The body uses a fixed ``size=`` rather than ``ratio=`` so the panels
    cannot grow without bound and push the header off screen on large
    terminals.

    Note: the fixed sections plus the minimum body height total 30 rows
    (14 + 3 + 3 + 10), so on shorter terminals the layout is taller than
    the screen.

    Returns:
        The configured Layout, or None when Rich is not installed.
    """
    if not HAS_RICH:
        return None

    # Console is already imported at module level whenever HAS_RICH is True,
    # so no function-scope import is needed here.
    term_height = Console().height or 24  # Default to 24 if unknown

    # Fixed heights for top sections
    header_height = 14  # ASCII banner (12 lines + border)
    status_height = 3
    progress_height = 3
    fixed_total = header_height + status_height + progress_height

    # Body gets the remaining space, clamped to the range 10..18 rows.
    body_height = min(18, max(10, term_height - fixed_total - 2))

    layout = Layout()
    layout.split_column(
        Layout(name="header", size=header_height),
        Layout(name="status_bar", size=status_height),
        Layout(name="progress_bar", size=progress_height),
        Layout(name="body", size=body_height),  # FIXED size, not ratio
    )
    layout["body"].split_row(
        Layout(name="left_panel", ratio=1),
        Layout(name="right_panel", ratio=1),
    )
    return layout
163
+
164
def start(self) -> None:
    """Reset the mission timer, render once, and begin the live display.

    Does nothing when Rich is unavailable or no layout was built.
    """
    if self._layout is None or not HAS_RICH:
        return

    self._start_time = time.time()
    # Populate every panel before Live takes over the terminal.
    self._refresh()

    live = Live(
        self._layout,
        console=self._console,
        refresh_per_second=2,
        vertical_overflow="crop",
    )
    self._live = live
    live.start()
178
+
179
def stop(self) -> None:
    """Stop the live display, if one is running, and forget it."""
    live = self._live
    if not live:
        return
    live.stop()
    self._live = None
184
+
185
def update_chunk(self, chunk_index: int, iteration: int, max_iterations: int) -> None:
    """
    Record the current chunk/iteration and redraw.

    Args:
        chunk_index: Current chunk index (0-based)
        iteration: Current iteration within chunk (0-based)
        max_iterations: Maximum iterations per chunk
    """
    state = self._state
    # Both counters are stored 1-based, ready for display.
    state.current_chunk = chunk_index + 1
    state.current_iteration = iteration + 1
    state.max_iterations = max_iterations
    self._refresh()
198
+
199
def update_llm_status(self, status: Literal["calling", "idle"]) -> None:
    """
    Store the LLM call status and redraw.

    Args:
        status: "calling" when LLM is being called, "idle" otherwise
    """
    state = self._state
    state.llm_status = status
    self._refresh()
208
+
209
def add_function(self, name: str, docstring: str) -> None:
    """
    Append a newly generated function to the state and redraw.

    Args:
        name: Function name
        docstring: Function docstring
    """
    entry = FunctionInfo(name=name, docstring=docstring)
    self._state.functions.append(entry)
    self._refresh()
219
+
220
def update_metrics(
    self,
    quality_delta: float,
    latency_last: float,
    latency_avg: float,
    latency_total: float,
    llm_calls: int,
) -> None:
    """
    Store the latest latency figures and redraw.

    Args:
        quality_delta: Quality improvement percentage (ignored, kept for compatibility)
        latency_last: Last LLM call latency in ms
        latency_avg: Average LLM call latency in ms
        latency_total: Total LLM call time in ms
        llm_calls: Total number of LLM calls
    """
    state = self._state
    (
        state.latency_last_ms,
        state.latency_avg_ms,
        state.latency_total_ms,
        state.llm_call_count,
    ) = (latency_last, latency_avg, latency_total, llm_calls)
    self._refresh()
243
+
244
def update_tokens(self, prompt: str, response: str) -> None:
    """
    Add this call's estimated tokens to the running totals and redraw.

    The estimate is one token per four characters: len(text) // 4.

    Args:
        prompt: The prompt sent to the LLM
        response: The response received from the LLM
    """
    state = self._state
    state.tokens_in = state.tokens_in + len(prompt) // 4
    state.tokens_out = state.tokens_out + len(response) // 4
    self._refresh()
257
+
258
def update_transmission(self, response: str) -> None:
    """
    Remember the latest LLM response for the transmission log and redraw.

    Args:
        response: The latest LLM response text
    """
    state = self._state
    state.last_response = response
    self._refresh()
267
+
268
+ def _get_elapsed_time(self) -> str:
269
+ """Get elapsed time as MM:SS string."""
270
+ elapsed = int(time.time() - self._start_time)
271
+ minutes = elapsed // 60
272
+ seconds = elapsed % 60
273
+ return f"{minutes:02d}:{seconds:02d}"
274
+
275
def show_complete(self, summary: dict) -> None:
    """
    Replace the dashboard with a "MISSION COMPLETE" summary panel.

    Args:
        summary: Completion stats. The keys read here are:
            - functions_count: Number of functions generated
              (defaults to the number of functions recorded in state)
            - chunks_processed: Number of chunks processed
              (defaults to the state's total_chunks)
            - output_file: Path to output file
              (defaults to "cleaning_functions.py")
            Other keys are accepted and ignored. Elapsed time and token
            estimates come from the renderer's own state.
    """
    if not HAS_RICH or self._layout is None:
        return

    state = self._state

    # (label, value) pairs in display order; None marks a blank spacer row.
    rows = [
        (
            "Functions Acquired:",
            Text(str(summary.get("functions_count", len(state.functions))), style="green"),
        ),
        (
            "Chunks Processed:",
            Text(str(summary.get("chunks_processed", state.total_chunks))),
        ),
        ("Total Time:", Text(self._get_elapsed_time())),
        (
            "Tokens:",
            Text(f"~{state.tokens_in / 1000:.1f}k in / ~{state.tokens_out / 1000:.1f}k out"),
        ),
        None,
        (
            "Output:",
            Text(summary.get("output_file", "cleaning_functions.py"), style="cyan"),
        ),
    ]

    grid = Table.grid(padding=(0, 2))
    grid.add_column(justify="left")
    grid.add_column(justify="left")
    for row in rows:
        if row is None:
            grid.add_row(Text(""), Text(""))  # Spacer
        else:
            label, value = row
            grid.add_row(Text(label, style="bold"), value)

    summary_panel = Panel(
        grid,
        title="[bold green]MISSION COMPLETE[/bold green]",
        border_style="green",
        box=DOUBLE,
    )

    # Re-splitting the root swaps the whole dashboard for the single panel.
    self._layout.split_column(
        Layout(summary_panel, name="complete"),
    )

    if self._live:
        self._live.update(self._layout)
340
+
341
def _refresh(self) -> None:
    """Rebuild every panel from current state and push it to the live display."""
    if not HAS_RICH or self._layout is None:
        return

    redraw_steps = (
        self._refresh_header,
        self._refresh_status_bar,
        self._refresh_progress_bar,
        self._refresh_left_panel,
        self._refresh_right_panel,
    )
    for redraw in redraw_steps:
        redraw()

    # Before start() there is no Live object; the layout is just updated.
    if self._live:
        self._live.update(self._layout)
354
+
355
def _refresh_header(self) -> None:
    """Render the ASCII art banner into the header slot."""
    if not HAS_RICH or self._layout is None:
        return

    self._layout["header"].update(
        Panel(
            Text(ASCII_BANNER, style="bold cyan"),
            border_style="cyan",
            box=DOUBLE,
            padding=(0, 1),
        )
    )
368
+
369
def _refresh_status_bar(self) -> None:
    """Render the MISSION | TIME | STATUS row into the status bar slot."""
    if not HAS_RICH or self._layout is None:
        return

    # Keep only the tail of long paths so the row stays on one line.
    shown_path = self._state.file_path
    if len(shown_path) > 30:
        shown_path = "..." + shown_path[-27:]

    clock = self._get_elapsed_time()

    # Filled green circle while a call is in flight, dim empty circle otherwise.
    calling = self._state.llm_status == "calling"
    if calling:
        state_label = Text("ACTIVE", style="bold green")
        glyph = "\u25cf"  # Filled circle
        glyph_style = "green"
    else:
        state_label = Text("IDLE", style="dim")
        glyph = "\u25cb"  # Empty circle
        glyph_style = "dim"

    grid = Table.grid(padding=(0, 2), expand=True)
    for justify, ratio in (("left", 2), ("center", 1), ("right", 1)):
        grid.add_column(justify=justify, ratio=ratio)

    mission_cell = Text()
    mission_cell.append("MISSION: ", style="bold")
    mission_cell.append(shown_path, style="cyan")

    time_cell = Text()
    time_cell.append("TIME: ", style="bold")
    time_cell.append(clock, style="cyan")

    status_cell = Text()
    status_cell.append("STATUS: ", style="bold")
    status_cell.append(f"{glyph} ", style=glyph_style)
    status_cell.append_text(state_label)

    grid.add_row(mission_cell, time_cell, status_cell)

    self._layout["status_bar"].update(
        Panel(
            grid,
            border_style="cyan",
            box=DOUBLE,
            padding=(0, 1),
        )
    )
417
+
418
def _refresh_progress_bar(self) -> None:
    """Render the "CHUNK X/Y" counter, bar, and percentage into the progress slot."""
    if not HAS_RICH or self._layout is None:
        return

    done = self._state.current_chunk
    total = self._state.total_chunks

    # Guard the division: total_chunks may be 0.
    percent = int((done / total) * 100) if total > 0 else 0

    # A fresh Progress is built on every redraw; its single task carries
    # the completed/total counts that drive the bar.
    bar = Progress(
        TextColumn("[bold cyan]\u25ba[/bold cyan]"),
        TextColumn(f"CHUNK {done}/{total}"),
        BarColumn(bar_width=30, complete_style="cyan", finished_style="green"),
        TextColumn(f"{percent}%"),
        expand=False,
    )
    bar.add_task("", total=total, completed=done)

    self._layout["progress_bar"].update(
        Panel(
            bar,
            border_style="cyan",
            box=DOUBLE,
            padding=(0, 1),
        )
    )
445
+
446
def _refresh_left_panel(self) -> None:
    """Render the functions list plus token and latency stats into the left panel.

    Shows at most the six most recently added functions in a tree-style
    list, followed by a "(+N more)" line when older ones are hidden, then
    the running token estimates and the last/average LLM latency.
    """
    if not HAS_RICH or self._layout is None:
        return

    functions = self._state.functions
    func_count = len(functions)

    # Build function tree
    content = Table.grid(padding=(0, 0))
    content.add_column()

    # Show max 6 functions; slicing a shorter list returns all of it.
    max_display = 6
    display_funcs = functions[-max_display:]
    last_index = len(display_funcs) - 1

    for i, func in enumerate(display_funcs):
        func_text = Text()
        # Tree-style prefix: corner for the final entry, tee for the rest.
        branch = "\u2514\u2500 " if i == last_index else "\u251c\u2500 "
        func_text.append(branch, style="dim cyan")
        func_text.append(func.name, style="bold")
        func_text.append(" \u2713", style="green")  # Checkmark
        content.add_row(func_text)

    # Show "+N more" if truncated
    if func_count > max_display:
        hidden_count = func_count - max_display
        content.add_row(Text(f" (+{hidden_count} more)", style="dim italic"))

    # Add spacing
    content.add_row(Text(""))

    # Token stats
    tokens_in_k = self._state.tokens_in / 1000
    tokens_out_k = self._state.tokens_out / 1000
    tokens_text = Text()
    tokens_text.append("TOKENS: ", style="bold")
    tokens_text.append(f"~{tokens_in_k:.1f}k in / ~{tokens_out_k:.1f}k out", style="dim")
    content.add_row(tokens_text)

    # Latency stats. State holds milliseconds; both figures are shown in
    # seconds, so both must be divided by 1000 (the last-call value was
    # previously printed as raw milliseconds with an "s" suffix).
    latency_text = Text()
    latency_text.append("LATENCY: ", style="bold")
    if self._state.llm_call_count > 0:
        latency_text.append(f"{self._state.latency_last_ms / 1000:.1f}s", style="cyan")
        latency_text.append(f" (avg {self._state.latency_avg_ms / 1000:.1f}s)", style="dim")
    else:
        latency_text.append("\u2014", style="dim")  # Em dash
    content.add_row(latency_text)

    left_panel = Panel(
        content,
        title=f"[bold cyan]FUNCTIONS ACQUIRED [{func_count}][/bold cyan]",
        border_style="cyan",
        box=DOUBLE,
    )
    self._layout["left_panel"].update(left_panel)
507
+
508
+ def _parse_response_for_display(self, response: str) -> str:
509
+ """Parse LLM XML response into readable format for transmission log.
510
+
511
+ Args:
512
+ response: Raw LLM response text (XML format)
513
+
514
+ Returns:
515
+ Formatted string for display showing issues, function being
516
+ generated, and chunk status.
517
+ """
518
+ import re
519
+
520
+ lines = []
521
+
522
+ try:
523
+ # Find all issues
524
+ issue_pattern = r'<issue[^>]*id="(\d+)"[^>]*solved="(true|false)"[^>]*>([^<]+)</issue>'
525
+ issues = re.findall(issue_pattern, response, re.DOTALL)
526
+
527
+ if issues:
528
+ lines.append("ISSUES DETECTED:")
529
+ for issue_id, solved, desc in issues[:8]: # Limit to 8 issues
530
+ marker = "\u2713" if solved == "true" else "\u2717" # checkmark or X
531
+ desc_clean = desc.strip()[:40] # Truncate description
532
+ lines.append(f" {marker} {desc_clean}")
533
+ if len(issues) > 8:
534
+ lines.append(f" (+{len(issues) - 8} more)")
535
+ lines.append("")
536
+
537
+ # Find function being generated
538
+ name_match = re.search(r'<name>([^<]+)</name>', response)
539
+ docstring_match = re.search(r'<docstring>([^<]+)</docstring>', response, re.DOTALL)
540
+
541
+ if name_match:
542
+ lines.append(f"GENERATING: {name_match.group(1).strip()}")
543
+ if docstring_match:
544
+ doc = docstring_match.group(1).strip()[:60]
545
+ lines.append(f' "{doc}..."')
546
+ lines.append("")
547
+
548
+ # Find chunk status
549
+ status_match = re.search(r'<chunk_status>([^<]+)</chunk_status>', response)
550
+ if status_match:
551
+ status = status_match.group(1).strip()
552
+ lines.append(f"STATUS: {status.upper()}")
553
+
554
+ if lines:
555
+ return "\n".join(lines)
556
+ except Exception:
557
+ pass
558
+
559
+ # Fallback: show truncated raw response
560
+ return response[:500] + "..." if len(response) > 500 else response
561
+
562
def _refresh_right_panel(self) -> None:
    """Render the transmission log (parsed latest LLM response) into the right panel."""
    if not HAS_RICH or self._layout is None:
        return

    latest = self._state.last_response
    # Placeholder until the first response arrives.
    shown = self._parse_response_for_display(latest) if latest else "(Awaiting transmission...)"

    self._layout["right_panel"].update(
        Panel(
            Text(shown, style="dim cyan"),
            title="[bold cyan]\u25c4\u25c4 TRANSMISSION LOG \u25ba\u25ba[/bold cyan]",
            border_style="cyan",
            box=DOUBLE,
        )
    )
583
+
584
+ # Legacy method stubs for backwards compatibility
585
+ def _refresh_progress(self) -> None:
586
+ """Legacy method - calls _refresh_progress_bar."""
587
+ self._refresh_progress_bar()
588
+
589
+ def _refresh_functions(self) -> None:
590
+ """Legacy method - calls _refresh_left_panel."""
591
+ self._refresh_left_panel()
592
+
593
+ def _refresh_footer(self) -> None:
594
+ """Legacy method - no longer used but kept for compatibility."""
595
+ pass
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: recursive-cleaner
3
- Version: 0.7.0
3
+ Version: 0.8.0
4
4
  Summary: LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions
5
5
  Project-URL: Homepage, https://github.com/gaztrabisme/recursive-data-cleaner
6
6
  Project-URL: Repository, https://github.com/gaztrabisme/recursive-data-cleaner
@@ -32,6 +32,8 @@ Provides-Extra: mlx
32
32
  Requires-Dist: mlx-lm>=0.10.0; extra == 'mlx'
33
33
  Provides-Extra: parquet
34
34
  Requires-Dist: pyarrow>=14.0.0; extra == 'parquet'
35
+ Provides-Extra: tui
36
+ Requires-Dist: rich>=13.0; extra == 'tui'
35
37
  Description-Content-Type: text/markdown
36
38
 
37
39
  # Recursive Data Cleaner
@@ -40,7 +42,7 @@ LLM-powered incremental data cleaning for massive datasets. Process files in chu
40
42
 
41
43
  ## How It Works
42
44
 
43
- 1. **Chunk** your data (JSONL, CSV, JSON, or text)
45
+ 1. **Chunk** your data (JSONL, CSV, JSON, Parquet, PDF, Word, Excel, XML, and more)
44
46
  2. **Analyze** each chunk with an LLM to identify issues
45
47
  3. **Generate** one cleaning function per issue
46
48
  4. **Validate** functions on holdout data before accepting
@@ -59,6 +61,21 @@ For Apple Silicon (MLX backend):
59
61
  pip install -e ".[mlx]"
60
62
  ```
61
63
 
64
+ For document conversion (PDF, Word, Excel, HTML, etc.):
65
+ ```bash
66
+ pip install -e ".[markitdown]"
67
+ ```
68
+
69
+ For Parquet files:
70
+ ```bash
71
+ pip install -e ".[parquet]"
72
+ ```
73
+
74
+ For Terminal UI (Rich dashboard):
75
+ ```bash
76
+ pip install -e ".[tui]"
77
+ ```
78
+
62
79
  ## Quick Start
63
80
 
64
81
  ```python
@@ -111,6 +128,18 @@ cleaner.run() # Generates cleaning_functions.py
111
128
  - **Cleaning Reports**: Markdown summary with functions, timing, quality delta
112
129
  - **Dry-Run Mode**: Analyze data without generating functions
113
130
 
131
+ ### Format Expansion (v0.7.0)
132
+ - **Markitdown Integration**: Convert 20+ formats (PDF, Word, Excel, PowerPoint, HTML, EPUB, etc.) to text
133
+ - **Parquet Support**: Load parquet files as structured data via pyarrow
134
+ - **LLM-Generated Parsers**: Auto-generate parsers for XML and unknown formats (`auto_parse=True`)
135
+
136
+ ### Terminal UI (v0.8.0)
137
+ - **Mission Control Dashboard**: Rich-based live terminal UI with retro aesthetic
138
+ - **Real-time Progress**: Animated progress bars, chunk/iteration counters
139
+ - **Transmission Log**: Parsed LLM responses showing issues detected and functions being generated
140
+ - **Token Estimation**: Track estimated input/output tokens across the run
141
+ - **Graceful Fallback**: Works without Rich installed (falls back to callbacks)
142
+
114
143
  ## Configuration
115
144
 
116
145
  ```python
@@ -142,6 +171,12 @@ cleaner = DataCleaner(
142
171
  report_path="report.md", # Markdown report output (None to disable)
143
172
  dry_run=False, # Analyze without generating functions
144
173
 
174
+ # Format Expansion
175
+ auto_parse=False, # LLM generates parser for unknown formats
176
+
177
+ # Terminal UI
178
+ tui=True, # Enable Rich dashboard (requires [tui] extra)
179
+
145
180
  # Progress & State
146
181
  on_progress=callback, # Progress event callback
147
182
  state_file="state.json", # Enable resume on interrupt
@@ -235,20 +270,22 @@ cleaner.run()
235
270
 
236
271
  ```
237
272
  recursive_cleaner/
238
- ├── cleaner.py # Main DataCleaner class (~580 lines)
239
- ├── context.py # Docstring registry with FIFO eviction
240
- ├── dependencies.py # Topological sort for function ordering
241
- ├── metrics.py # Quality metrics before/after
242
- ├── optimizer.py # Two-pass consolidation with LLM agency
243
- ├── output.py # Function file generation + import consolidation
244
- ├── parsers.py # Chunking for JSONL/CSV/JSON/text + sampling
245
- ├── prompt.py # LLM prompt templates
246
- ├── report.py # Markdown report generation
247
- ├── response.py # XML/markdown parsing + agency dataclasses
248
- ├── schema.py # Schema inference
249
- ├── validation.py # Runtime validation + holdout
273
+ ├── cleaner.py # Main DataCleaner class
274
+ ├── context.py # Docstring registry with FIFO eviction
275
+ ├── dependencies.py # Topological sort for function ordering
276
+ ├── metrics.py # Quality metrics before/after
277
+ ├── optimizer.py # Two-pass consolidation with LLM agency
278
+ ├── output.py # Function file generation + import consolidation
279
+ ├── parser_generator.py # LLM-generated parsers for unknown formats
280
+ ├── parsers.py # Chunking for all formats + sampling
281
+ ├── prompt.py # LLM prompt templates
282
+ ├── report.py # Markdown report generation
283
+ ├── response.py # XML/markdown parsing + agency dataclasses
284
+ ├── schema.py # Schema inference
285
+ ├── tui.py # Rich terminal dashboard
286
+ ├── validation.py # Runtime validation + holdout
250
287
  └── vendor/
251
- └── chunker.py # Vendored sentence-aware chunker
288
+ └── chunker.py # Vendored sentence-aware chunker
252
289
  ```
253
290
 
254
291
  ## Testing
@@ -257,7 +294,7 @@ recursive_cleaner/
257
294
  pytest tests/ -v
258
295
  ```
259
296
 
260
- 392 tests covering all features. Test datasets in `test_cases/`:
297
+ 465 tests covering all features. Test datasets in `test_cases/`:
261
298
  - E-commerce product catalogs
262
299
  - Healthcare patient records
263
300
  - Financial transaction data
@@ -273,6 +310,8 @@ pytest tests/ -v
273
310
 
274
311
  | Version | Features |
275
312
  |---------|----------|
313
+ | v0.8.0 | Terminal UI with Rich dashboard, mission control aesthetic, transmission log |
314
+ | v0.7.0 | Markitdown (20+ formats), Parquet support, LLM-generated parsers |
276
315
  | v0.6.0 | Latency metrics, import consolidation, cleaning report, dry-run mode |
277
316
  | v0.5.1 | Dangerous code detection (AST-based security) |
278
317
  | v0.5.0 | Two-pass optimization, early termination, LLM agency |
@@ -1,7 +1,7 @@
1
1
  backends/__init__.py,sha256=FUgODeYSGBvT0-z6myVby6YeAHG0nEUgWLITBKobUew,121
2
2
  backends/mlx_backend.py,sha256=0U6IqmDHyk4vjKzytvEcQvSUBryQTgFtsNOcpwFNKk8,2945
3
- recursive_cleaner/__init__.py,sha256=bG83PcmkxAYMC17FmKuyMJUrMnuukp32JO3rlCLyB-Q,1698
4
- recursive_cleaner/cleaner.py,sha256=J2X5bnk2OsWJyOn4BNR-cj0sqeKCylznfs_WEyMGxG8,26280
3
+ recursive_cleaner/__init__.py,sha256=v0bNQ3H0d7n6cTOkuxuqG9bmnX9yeZBLZ_AfFM7edHI,1789
4
+ recursive_cleaner/cleaner.py,sha256=vZTMwaLlCmuh1qy3c-puEZrwS5gXt0u28d5iweQXbms,29801
5
5
  recursive_cleaner/context.py,sha256=avMXRDxLd7nd8CKWtvPHQy1MFhBKiA0aUVVJIlWoLZ4,824
6
6
  recursive_cleaner/dependencies.py,sha256=vlYeoGL517v3yUSWN0wYDuIs9OOuQwM_dCBADrlitW8,2080
7
7
  recursive_cleaner/errors.py,sha256=hwRJF8NSmWy_FZHCxcZDZxLQ0zqvo5dX8ImkB9mrOYc,433
@@ -14,11 +14,12 @@ recursive_cleaner/prompt.py,sha256=ep0eOXz_XbhH3HduJ76LvzVSftonhcv4GLEecIqd3lY,6
14
14
  recursive_cleaner/report.py,sha256=AWWneRjvl76ccLlExdkKJeY3GVFUG_LtmzVIJJT5cFI,4629
15
15
  recursive_cleaner/response.py,sha256=3w0mLnqEPdB4daMSF0mtTcG0PTP-utb1HFtKuYA1ljw,9064
16
16
  recursive_cleaner/schema.py,sha256=w2hcEdApR15KVI9SFWB3VfumMoHFwn1YJrktdfgPo8M,3925
17
+ recursive_cleaner/tui.py,sha256=FwG_uCwqUcvch5dRZmV-ba2JXD0XJkm9roXzPQ9iUSo,21633
17
18
  recursive_cleaner/types.py,sha256=-GdCmsfHd3rfdfCi5c-RXqX4TyuCSHgA__3AF3bMhoQ,290
18
19
  recursive_cleaner/validation.py,sha256=-KAolhw3GQyhHwmh0clEj8xqPD5O-R2AO5rx7vubIME,6442
19
20
  recursive_cleaner/vendor/__init__.py,sha256=E87TjmjRzu8ty39nqThvBwM611yXlLKQZ6KGY_zp3Dk,117
20
21
  recursive_cleaner/vendor/chunker.py,sha256=pDDbfF6FoSmUji0-RG4MletPxJ-VybGw0yfnhh0aMSQ,6730
21
- recursive_cleaner-0.7.0.dist-info/METADATA,sha256=bSCS8YBPAYzBufVF41LDYAgpLnYc4JAynA4xkNVuKyo,9486
22
- recursive_cleaner-0.7.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
23
- recursive_cleaner-0.7.0.dist-info/licenses/LICENSE,sha256=P8hRMK-UqRbQDsVN9nr901wpZcqwXEHr28DXhBUheF0,1064
24
- recursive_cleaner-0.7.0.dist-info/RECORD,,
22
+ recursive_cleaner-0.8.0.dist-info/METADATA,sha256=rVABzjvUZ-uzk35o5evbIlkRIbgEb29QPKSCoMI4_fs,11072
23
+ recursive_cleaner-0.8.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
24
+ recursive_cleaner-0.8.0.dist-info/licenses/LICENSE,sha256=P8hRMK-UqRbQDsVN9nr901wpZcqwXEHr28DXhBUheF0,1064
25
+ recursive_cleaner-0.8.0.dist-info/RECORD,,