PyPI - recursive-cleaner - Versions diffs - 0.7.0__py3-none-any.whl → 0.8.0__py3-none-any.whl - Mend

recursive-cleaner 0.7.0 (py3-none-any.whl) → 0.8.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

recursive_cleaner/__init__.py CHANGED Viewed

@@ -20,6 +20,7 @@ from recursive_cleaner.parsers import MARKITDOWN_EXTENSIONS, chunk_file, load_pa
 from recursive_cleaner.prompt import build_prompt
 from recursive_cleaner.response import extract_python_block, parse_response
 from recursive_cleaner.parser_generator import check_parser_safety, generate_parser
+from recursive_cleaner.tui import HAS_RICH, TUIRenderer
 from recursive_cleaner.validation import check_code_safety, extract_sample_data, validate_function
 __all__ = [
@@ -49,4 +50,6 @@ __all__ = [
     "consolidate_with_agency",
     "generate_parser",
     "check_parser_safety",
+    "TUIRenderer",
+    "HAS_RICH",
 ]

recursive_cleaner/cleaner.py CHANGED Viewed

@@ -62,6 +62,7 @@ class DataCleaner:
         report_path: str | None = "cleaning_report.md",
         dry_run: bool = False,
         auto_parse: bool = False,
+        tui: bool = False,
     ):
         self.backend = llm_backend
         self.file_path = file_path
@@ -86,7 +87,9 @@ class DataCleaner:
         self.report_path = report_path
         self.dry_run = dry_run
         self.auto_parse = auto_parse
+        self.tui = tui
         self.functions: list[dict] = []  # List of {name, docstring, code}
+        self._tui_renderer = None  # TUIRenderer instance when tui=True
         self._generated_parser: callable | None = None  # LLM-generated parser for unknown formats
         # Track recent function generation for saturation check
         self._recent_new_function_count = 0
@@ -119,10 +122,15 @@ class DataCleaner:
         try:
             self.on_progress(event)
         except Exception as e:
-            print(f"  Warning: callback error: {e}")
+            if not self.tui:
+                print(f"  Warning: callback error: {e}")
     def _call_llm_timed(self, prompt: str, chunk_index: int = 0) -> str:
         """Call LLM with timing and emit latency event."""
+        # Update TUI status before call
+        if self._tui_renderer:
+            self._tui_renderer.update_llm_status("calling")
         start = time.perf_counter()
         response = call_llm(self.backend, prompt)
         elapsed_ms = (time.perf_counter() - start) * 1000
@@ -133,6 +141,20 @@ class DataCleaner:
         self._latency_stats["min_ms"] = min(self._latency_stats["min_ms"], elapsed_ms)
         self._latency_stats["max_ms"] = max(self._latency_stats["max_ms"], elapsed_ms)
+        # Update TUI status and metrics after call
+        if self._tui_renderer:
+            self._tui_renderer.update_llm_status("idle")
+            latency_summary = self._get_latency_summary()
+            self._tui_renderer.update_metrics(
+                quality_delta=0.0,  # Quality delta calculated at end
+                latency_last=elapsed_ms,
+                latency_avg=latency_summary.get("avg_ms", 0.0),
+                latency_total=latency_summary.get("total_ms", 0.0),
+                llm_calls=latency_summary.get("call_count", 0),
+            )
+            self._tui_renderer.update_tokens(prompt, response)
+            self._tui_renderer.update_transmission(response)
         # Emit event
         self._emit("llm_call", chunk_index=chunk_index, latency_ms=round(elapsed_ms, 2))
@@ -216,7 +238,8 @@ class DataCleaner:
             response = self._call_llm_timed(prompt, chunk_index=chunks_processed - 1)
             assessment = parse_saturation_response(response)
         except Exception as e:
-            print(f"  Warning: saturation check failed: {e}")
+            if not self.tui:
+                print(f"  Warning: saturation check failed: {e}")
             return False  # Continue on error
         self._emit(
@@ -275,7 +298,8 @@ class DataCleaner:
         self.functions = state.get("functions", [])
         self._last_completed_chunk = state.get("last_completed_chunk", -1)
         self._total_chunks = state.get("total_chunks", 0)
-        print(f"Resumed from state: {self._last_completed_chunk + 1}/{self._total_chunks} chunks completed")
+        if not self.tui:
+            print(f"Resumed from state: {self._last_completed_chunk + 1}/{self._total_chunks} chunks completed")
         return True
     @classmethod
@@ -340,14 +364,16 @@ class DataCleaner:
         """Load file using LLM-generated parser, return JSONL chunks."""
         from .parser_generator import generate_parser
-        print(f"Unknown file format, generating parser...")
+        if not self.tui:
+            print(f"Unknown file format, generating parser...")
         self._emit("parser_generation_start")
         parser = generate_parser(self.backend, self.file_path)
         self._generated_parser = parser
         self._emit("parser_generation_complete")
-        print("Parser generated successfully.")
+        if not self.tui:
+            print("Parser generated successfully.")
         # Parse the file
         records = parser(self.file_path)
@@ -390,7 +416,8 @@ class DataCleaner:
             )
         if not chunks:
-            print("No data to process.")
+            if not self.tui:
+                print("No data to process.")
             return
         # Try to load existing state
@@ -409,13 +436,38 @@ class DataCleaner:
         self._total_chunks = len(chunks)
+        # Initialize TUI if enabled
+        if self.tui:
+            from .tui import HAS_RICH, TUIRenderer
+            if HAS_RICH:
+                self._tui_renderer = TUIRenderer(
+                    file_path=self.file_path,
+                    total_chunks=self._total_chunks,
+                    total_records=0,  # Could be calculated from chunks
+                )
+                self._tui_renderer.start()
+            else:
+                import logging
+                logging.warning(
+                    "tui=True but Rich not installed. "
+                    "Install with: pip install recursive-cleaner[tui]"
+                )
         for i, chunk in enumerate(chunks):
             # Skip already completed chunks
             if i <= self._last_completed_chunk:
-                if resumed:
+                if resumed and not self.tui:
                     print(f"Skipping chunk {i + 1}/{len(chunks)} (already completed)")
                 continue
-            print(f"Processing chunk {i + 1}/{len(chunks)}...")
+            if not self.tui:
+                print(f"Processing chunk {i + 1}/{len(chunks)}...")
+            # Update TUI with chunk progress
+            if self._tui_renderer:
+                self._tui_renderer.update_chunk(i, 0, self.max_iterations)
             self._process_chunk(chunk, i)
             # Mark chunk as completed and save state
             self._last_completed_chunk = i
@@ -429,7 +481,8 @@ class DataCleaner:
             ):
                 if self._check_saturation(i + 1):
                     self._emit("early_termination", chunk_index=i)
-                    print(f"Early termination: pattern discovery saturated at chunk {i + 1}")
+                    if not self.tui:
+                        print(f"Early termination: pattern discovery saturated at chunk {i + 1}")
                     break
         # Skip optimization and output in dry_run mode
@@ -439,7 +492,11 @@ class DataCleaner:
                 chunk_index=self._total_chunks - 1,
                 latency_stats=self._get_latency_summary(),
             )
-            print("Dry run complete. No functions generated or saved.")
+            # Stop TUI if running
+            if self._tui_renderer:
+                self._tui_renderer.stop()
+            if not self.tui:
+                print("Dry run complete. No functions generated or saved.")
             return
         # Two-pass optimization (if enabled and enough functions)
@@ -453,7 +510,22 @@ class DataCleaner:
             chunk_index=self._total_chunks - 1,
             latency_stats=self._get_latency_summary(),
         )
-        print(f"Done! Generated {len(self.functions)} functions.")
+        # Show TUI completion and stop
+        if self._tui_renderer:
+            latency_summary = self._get_latency_summary()
+            self._tui_renderer.show_complete({
+                "functions_count": len(self.functions),
+                "chunks_processed": self._total_chunks,
+                "quality_delta": 0.0,  # Could be calculated from metrics
+                "latency_total_ms": latency_summary.get("total_ms", 0.0),
+                "llm_calls": latency_summary.get("call_count", 0),
+                "output_file": "cleaning_functions.py",
+            })
+            self._tui_renderer.stop()
+        if not self.tui:
+            print(f"Done! Generated {len(self.functions)} functions.")
     def _process_chunk(self, chunk: str, chunk_idx: int) -> None:
         """Process a single chunk, iterating until clean or max iterations."""
@@ -476,6 +548,11 @@ class DataCleaner:
         for iteration in range(self.max_iterations):
             self._emit("iteration", chunk_index=chunk_idx, iteration=iteration)
+            # Update TUI with iteration progress
+            if self._tui_renderer:
+                self._tui_renderer.update_chunk(chunk_idx, iteration, self.max_iterations)
             context = build_context(self.functions, self.context_budget)
             prompt = build_prompt(
                 self.instructions,
@@ -511,7 +588,8 @@ class DataCleaner:
                         function_name=result["name"],
                         error=safety_error,
                     )
-                    print(f"  Safety check failed: {safety_error}")
+                    if not self.tui:
+                        print(f"  Safety check failed: {safety_error}")
                     continue
                 # Runtime validation if enabled
@@ -539,7 +617,8 @@ class DataCleaner:
                             function_name=result["name"],
                             error=error_msg,
                         )
-                        print(f"  Validation failed: {error_msg}")
+                        if not self.tui:
+                            print(f"  Validation failed: {error_msg}")
                         continue
                 self.functions.append({
@@ -549,17 +628,25 @@ class DataCleaner:
                 })
                 # Track for saturation check
                 self._recent_new_function_count += 1
+                # Update TUI with new function
+                if self._tui_renderer:
+                    self._tui_renderer.add_function(result["name"], result["docstring"])
                 self._emit(
                     "function_generated",
                     chunk_index=chunk_idx,
                     function_name=result["name"],
                 )
-                print(f"  Generated: {result['name']}")
+                if not self.tui:
+                    print(f"  Generated: {result['name']}")
             else:
                 # LLM said needs_more_work but didn't provide code
-                print(f"  Warning: iteration {iteration + 1} produced no function")
+                if not self.tui:
+                    print(f"  Warning: iteration {iteration + 1} produced no function")
-        print(f"  Warning: chunk {chunk_idx} hit max iterations ({self.max_iterations})")
+        if not self.tui:
+            print(f"  Warning: chunk {chunk_idx} hit max iterations ({self.max_iterations})")
         self._emit("chunk_done", chunk_index=chunk_idx)
     def _process_chunk_dry_run(self, chunk: str, chunk_idx: int) -> None:
@@ -577,7 +664,8 @@ class DataCleaner:
             response = self._call_llm_timed(prompt, chunk_index=chunk_idx)
             result = parse_response(response)
         except ParseError as e:
-            print(f"  Warning: parse error in dry run: {e}")
+            if not self.tui:
+                print(f"  Warning: parse error in dry run: {e}")
             self._emit("chunk_done", chunk_index=chunk_idx)
             return
@@ -589,11 +677,12 @@ class DataCleaner:
             issues=issues,
         )
-        if issues:
-            unsolved = [i for i in issues if not i.get("solved", False)]
-            print(f"  Found {len(issues)} issues ({len(unsolved)} unsolved)")
-        else:
-            print("  No issues detected")
+        if not self.tui:
+            if issues:
+                unsolved = [i for i in issues if not i.get("solved", False)]
+                print(f"  Found {len(issues)} issues ({len(unsolved)} unsolved)")
+            else:
+                print("  No issues detected")
         self._emit("chunk_done", chunk_index=chunk_idx)
@@ -604,8 +693,9 @@ class DataCleaner:
         try:
             write_cleaning_file(self.functions)
         except OutputValidationError as e:
-            print(f"  Error: {e}")
-            print("  Attempting to write valid functions only...")
+            if not self.tui:
+                print(f"  Error: {e}")
+                print("  Attempting to write valid functions only...")
             # Try writing functions one by one, skipping invalid ones
             valid_functions = []
             for f in self.functions:
@@ -614,10 +704,11 @@ class DataCleaner:
                     ast.parse(f["code"])
                     valid_functions.append(f)
                 except SyntaxError:
-                    print(f"  Skipping invalid function: {f['name']}")
+                    if not self.tui:
+                        print(f"  Skipping invalid function: {f['name']}")
             if valid_functions:
                 write_cleaning_file(valid_functions)
-            else:
+            elif not self.tui:
                 print("  No valid functions to write.")
     def _write_report(self) -> None:

recursive_cleaner/tui.py ADDED Viewed

@@ -0,0 +1,595 @@
+"""Rich TUI dashboard with Mission Control retro aesthetic."""
+import time
+from dataclasses import dataclass, field
+from typing import Literal
+# Graceful import - TUI features only available when Rich is installed
+try:
+    from rich.box import DOUBLE
+    from rich.console import Console, Group
+    from rich.layout import Layout
+    from rich.live import Live
+    from rich.panel import Panel
+    from rich.progress import BarColumn, Progress, TextColumn
+    from rich.table import Table
+    from rich.text import Text
+    HAS_RICH = True
+except ImportError:
+    HAS_RICH = False
+# ASCII art banner - chunky block style
+ASCII_BANNER = """
+██████╗ ███████╗ ██████╗██╗   ██╗██████╗ ███████╗██╗██╗   ██╗███████╗
+██╔══██╗██╔════╝██╔════╝██║   ██║██╔══██╗██╔════╝██║██║   ██║██╔════╝
+██████╔╝█████╗  ██║     ██║   ██║██████╔╝███████╗██║██║   ██║█████╗
+██╔══██╗██╔══╝  ██║     ██║   ██║██╔══██╗╚════██║██║╚██╗ ██╔╝██╔══╝
+██║  ██║███████╗╚██████╗╚██████╔╝██║  ██║███████║██║ ╚████╔╝ ███████╗
+╚═╝  ╚═╝╚══════╝ ╚═════╝ ╚═════╝ ╚═╝  ╚═╝╚══════╝╚═╝  ╚═══╝  ╚══════╝
+ ██████╗██╗     ███████╗ █████╗ ███╗   ██╗███████╗██████╗
+██╔════╝██║     ██╔════╝██╔══██╗████╗  ██║██╔════╝██╔══██╗
+██║     ██║     █████╗  ███████║██╔██╗ ██║█████╗  ██████╔╝
+██║     ██║     ██╔══╝  ██╔══██║██║╚██╗██║██╔══╝  ██╔══██╗
+╚██████╗███████╗███████╗██║  ██║██║ ╚████║███████╗██║  ██║
+ ╚═════╝╚══════╝╚══════╝╚═╝  ╚═╝╚═╝  ╚═══╝╚══════╝╚═╝  ╚═╝
+""".strip()
+# Keep HEADER_TITLE for backwards compatibility with tests
+HEADER_TITLE = "RECURSIVE CLEANER"
+@dataclass
+class FunctionInfo:
+    """Info about a generated cleaning function."""
+    # Function name; this is the text rendered in the left panel's function list.
+    name: str
+    # Docstring stored alongside the name when the function is added.
+    docstring: str
+@dataclass
+class TUIState:
+    """Dashboard display state."""
+    # Header: path of the file being cleaned (shown, truncated, in the status bar)
+    file_path: str
+    # Header: record count supplied by the caller
+    total_records: int
+    # Package version string
+    version: str = "0.8.0"
+    # Progress: chunk counters are stored 1-based for display (see update_chunk)
+    current_chunk: int = 0
+    total_chunks: int = 0
+    current_iteration: int = 0
+    max_iterations: int = 5
+    # LLM Status: "calling" renders as ACTIVE, anything else as IDLE
+    llm_status: Literal["idle", "calling"] = "idle"
+    # Functions: accepted functions, in the order they were added
+    functions: list[FunctionInfo] = field(default_factory=list)
+    # Latency metrics: all values are in milliseconds
+    latency_last_ms: float = 0.0
+    latency_avg_ms: float = 0.0
+    latency_total_ms: float = 0.0
+    llm_call_count: int = 0
+    # Token estimation: running totals of len(text) // 4 (see update_tokens)
+    tokens_in: int = 0
+    tokens_out: int = 0
+    # Transmission log: raw text of the most recent LLM response
+    last_response: str = ""
+class TUIRenderer:
+    """
+    Rich-based terminal dashboard with Mission Control retro aesthetic.
+    Shows live updates during cleaning runs with:
+    - ASCII art banner header
+    - Mission timer and status indicator
+    - Progress bar and chunk/iteration counters
+    - List of generated functions with checkmarks
+    - Token estimation and latency metrics
+    - Transmission log showing latest LLM response
+    """
+    def __init__(self, file_path: str, total_chunks: int, total_records: int = 0):
+        """
+        Prepare dashboard state; nothing is drawn until start() is called.
+        Args:
+            file_path: Path to the data file being cleaned
+            total_chunks: Total number of chunks to process
+            total_records: Total number of records in the file
+        """
+        self._state = TUIState(
+            file_path=file_path,
+            total_records=total_records,
+            total_chunks=total_chunks,
+        )
+        self._start_time = time.time()
+        self._live: "Live | None" = None
+        # Without Rich there is neither a layout nor a console; every
+        # rendering method checks for that and returns early.
+        if HAS_RICH:
+            self._layout = self._make_layout()
+            self._console = Console()
+        else:
+            self._layout = None
+            self._console = None
+    def _make_layout(self) -> "Layout | None":
+        """Create the dashboard layout structure.
+        Layout (top to bottom):
+        - header (size=14)      - ASCII art banner "RECURSIVE CLEANER"
+        - status_bar (size=3)   - MISSION | TIME | STATUS
+        - progress_bar (size=3) - CHUNK X/Y + progress bar
+        - body (size=10..18)    - Split horizontally, FIXED size to prevent infinite expansion
+            - left_panel        - FUNCTIONS ACQUIRED, tokens, latency
+            - right_panel       - Parsed transmission log
+        CRITICAL: Body uses fixed `size=` not `ratio=` to prevent panels from
+        expanding infinitely and pushing header off screen on large terminals.
+        The fixed sections take 20 rows and the body never shrinks below 10,
+        so the full dashboard needs about 30 rows. On shorter terminals the
+        Live display created in start() uses vertical_overflow="crop".
+        Returns:
+            The configured Layout, or None when Rich is not installed.
+        """
+        if not HAS_RICH:
+            return None
+        # Console is imported at module level whenever HAS_RICH is True, so no
+        # function-local import is needed. A throwaway console is used here
+        # only to read the terminal height.
+        term_height = Console().height or 24  # Default to 24 if unknown
+        # Fixed heights for top sections
+        header_height = 14  # ASCII banner (12 lines + border)
+        status_height = 3
+        progress_height = 3
+        fixed_total = header_height + status_height + progress_height
+        # Body gets remaining space with a FIXED size (not ratio),
+        # clamped to the range 10..18 rows to keep it tight.
+        body_height = min(18, max(10, term_height - fixed_total - 2))
+        layout = Layout()
+        layout.split_column(
+            Layout(name="header", size=header_height),
+            Layout(name="status_bar", size=status_height),
+            Layout(name="progress_bar", size=progress_height),
+            Layout(name="body", size=body_height),  # FIXED size, not ratio
+        )
+        layout["body"].split_row(
+            Layout(name="left_panel", ratio=1),
+            Layout(name="right_panel", ratio=1),
+        )
+        return layout
+    def start(self) -> None:
+        """Start the live TUI display.
+        Does nothing when Rich is unavailable, when no layout was built, or
+        when the display is already running.
+        """
+        if not HAS_RICH or self._layout is None:
+            return
+        if self._live is not None:
+            # Already running: creating a second Live on the same console
+            # while the first is active is not supported by Rich.
+            return
+        self._start_time = time.time()
+        self._refresh()
+        self._live = Live(
+            self._layout,
+            console=self._console,
+            refresh_per_second=2,
+            vertical_overflow="crop",
+        )
+        self._live.start()
+    def stop(self) -> None:
+        """Shut down the live display, if one is running, and forget it."""
+        if not self._live:
+            return
+        self._live.stop()
+        self._live = None
+    def update_chunk(self, chunk_index: int, iteration: int, max_iterations: int) -> None:
+        """
+        Record which chunk and iteration are in progress, then redraw.
+        Args:
+            chunk_index: Current chunk index (0-based)
+            iteration: Current iteration within chunk (0-based)
+            max_iterations: Maximum iterations per chunk
+        """
+        state = self._state
+        # Both counters are stored 1-based because they are shown to the user.
+        state.current_chunk = chunk_index + 1
+        state.current_iteration = iteration + 1
+        state.max_iterations = max_iterations
+        self._refresh()
+    def update_llm_status(self, status: Literal["calling", "idle"]) -> None:
+        """
+        Store the LLM call state and redraw the dashboard.
+        Args:
+            status: "calling" while a request is in flight, "idle" otherwise
+        """
+        state = self._state
+        state.llm_status = status
+        self._refresh()
+    def add_function(self, name: str, docstring: str) -> None:
+        """
+        Append a newly generated function to the list and redraw.
+        Args:
+            name: Function name
+            docstring: Function docstring
+        """
+        entry = FunctionInfo(name=name, docstring=docstring)
+        self._state.functions.append(entry)
+        self._refresh()
+    def update_metrics(
+        self,
+        quality_delta: float,
+        latency_last: float,
+        latency_avg: float,
+        latency_total: float,
+        llm_calls: int,
+    ) -> None:
+        """
+        Store the latest latency figures and redraw.
+        Args:
+            quality_delta: Accepted for compatibility; not stored or displayed
+            latency_last: Last LLM call latency in ms
+            latency_avg: Average LLM call latency in ms
+            latency_total: Total LLM call time in ms
+            llm_calls: Total number of LLM calls
+        """
+        state = self._state
+        state.latency_last_ms = latency_last
+        state.latency_avg_ms = latency_avg
+        state.latency_total_ms = latency_total
+        state.llm_call_count = llm_calls
+        self._refresh()
+    def update_tokens(self, prompt: str, response: str) -> None:
+        """
+        Add one call's token estimates to the running totals and redraw.
+        The estimate is a rough heuristic: len(text) // 4
+        Args:
+            prompt: The prompt sent to the LLM
+            response: The response received from the LLM
+        """
+        state = self._state
+        prompt_estimate = len(prompt) // 4
+        response_estimate = len(response) // 4
+        state.tokens_in += prompt_estimate
+        state.tokens_out += response_estimate
+        self._refresh()
+    def update_transmission(self, response: str) -> None:
+        """
+        Remember the most recent LLM response for the transmission log and redraw.
+        Args:
+            response: The latest LLM response text
+        """
+        state = self._state
+        state.last_response = response
+        self._refresh()
+    def _get_elapsed_time(self) -> str:
+        """Format the whole seconds since the recorded start time as MM:SS."""
+        whole_seconds = int(time.time() - self._start_time)
+        mins, secs = divmod(whole_seconds, 60)
+        return f"{mins:02d}:{secs:02d}"
+    def show_complete(self, summary: dict) -> None:
+        """
+        Replace the dashboard with a single "MISSION COMPLETE" summary panel.
+        Args:
+            summary: Dictionary with completion stats. Keys read here:
+                - functions_count: Number of functions generated
+                  (defaults to the number of functions added so far)
+                - chunks_processed: Number of chunks processed
+                  (defaults to the total chunk count)
+                - output_file: Path to output file
+                  (defaults to "cleaning_functions.py")
+              Other keys are ignored; elapsed time and token totals come
+              from the renderer's own state.
+        """
+        if self._layout is None or not HAS_RICH:
+            return
+        state = self._state
+        func_count = summary.get("functions_count", len(state.functions))
+        chunks = summary.get("chunks_processed", state.total_chunks)
+        elapsed = self._get_elapsed_time()
+        # Token totals are shown in thousands
+        tokens_in_k = state.tokens_in / 1000
+        tokens_out_k = state.tokens_out / 1000
+        rows = [
+            (Text("Functions Acquired:", style="bold"), Text(str(func_count), style="green")),
+            (Text("Chunks Processed:", style="bold"), Text(str(chunks))),
+            (Text("Total Time:", style="bold"), Text(elapsed)),
+            (
+                Text("Tokens:", style="bold"),
+                Text(f"~{tokens_in_k:.1f}k in / ~{tokens_out_k:.1f}k out"),
+            ),
+            (Text(""), Text("")),  # Spacer
+            (
+                Text("Output:", style="bold"),
+                Text(summary.get("output_file", "cleaning_functions.py"), style="cyan"),
+            ),
+        ]
+        # Two left-aligned columns: label and value
+        grid = Table.grid(padding=(0, 2))
+        grid.add_column(justify="left")
+        grid.add_column(justify="left")
+        for label, value in rows:
+            grid.add_row(label, value)
+        complete_panel = Panel(
+            grid,
+            title="[bold green]MISSION COMPLETE[/bold green]",
+            border_style="green",
+            box=DOUBLE,
+        )
+        # Re-splitting the root layout discards the header/status/progress/body
+        # regions and leaves only the completion panel.
+        self._layout.split_column(
+            Layout(complete_panel, name="complete"),
+        )
+        if self._live:
+            self._live.update(self._layout)
+    def _refresh(self) -> None:
+        """Refresh all panels with current state.
+        Does nothing when Rich is unavailable, when no layout exists, or once
+        show_complete() has replaced the dashboard regions with the
+        completion panel.
+        """
+        if not HAS_RICH or self._layout is None:
+            return
+        # show_complete() re-splits the root layout into a single "complete"
+        # region, after which the named sections no longer exist and
+        # self._layout["header"] would raise KeyError. Layout.get() returns
+        # None for a missing name, so late update_* calls become no-ops.
+        if self._layout.get("header") is None:
+            return
+        self._refresh_header()
+        self._refresh_status_bar()
+        self._refresh_progress_bar()
+        self._refresh_left_panel()
+        self._refresh_right_panel()
+        if self._live:
+            self._live.update(self._layout)
+    def _refresh_header(self) -> None:
+        """Redraw the header region with the block-letter ASCII banner."""
+        if self._layout is None or not HAS_RICH:
+            return
+        self._layout["header"].update(
+            Panel(
+                Text(ASCII_BANNER, style="bold cyan"),
+                border_style="cyan",
+                box=DOUBLE,
+                padding=(0, 1),
+            )
+        )
+    def _refresh_status_bar(self) -> None:
+        """Redraw the status bar: file being cleaned, elapsed time, LLM status."""
+        if self._layout is None or not HAS_RICH:
+            return
+        # Keep only the tail of long paths so the bar fits (30 chars max)
+        shown_path = self._state.file_path
+        if len(shown_path) > 30:
+            shown_path = "..." + shown_path[-27:]
+        clock = self._get_elapsed_time()
+        # ACTIVE with a filled circle while a call is in flight, IDLE with an
+        # empty circle otherwise
+        calling = self._state.llm_status == "calling"
+        if calling:
+            label = Text("ACTIVE", style="bold green")
+            dot = "\u25cf"  # Filled circle
+            dot_style = "green"
+        else:
+            label = Text("IDLE", style="dim")
+            dot = "\u25cb"  # Empty circle
+            dot_style = "dim"
+        mission_cell = Text()
+        mission_cell.append("MISSION: ", style="bold")
+        mission_cell.append(shown_path, style="cyan")
+        time_cell = Text()
+        time_cell.append("TIME: ", style="bold")
+        time_cell.append(clock, style="cyan")
+        status_cell = Text()
+        status_cell.append("STATUS: ", style="bold")
+        status_cell.append(f"{dot} ", style=dot_style)
+        status_cell.append_text(label)
+        # Three columns: mission (wide, left), time (centre), status (right)
+        grid = Table.grid(padding=(0, 2), expand=True)
+        for justify, ratio in (("left", 2), ("center", 1), ("right", 1)):
+            grid.add_column(justify=justify, ratio=ratio)
+        grid.add_row(mission_cell, time_cell, status_cell)
+        self._layout["status_bar"].update(
+            Panel(
+                grid,
+                border_style="cyan",
+                box=DOUBLE,
+                padding=(0, 1),
+            )
+        )
+    def _refresh_progress_bar(self) -> None:
+        """Refresh the progress bar panel.
+        Shows "CHUNK current/total", a bar, and a whole-number percentage.
+        current_chunk is the 1-based number of the chunk being worked on, so
+        the bar counts the in-progress chunk as done.
+        """
+        if not HAS_RICH or self._layout is None:
+            return
+        state = self._state
+        # Percentage stays 0 when the total is unknown, avoiding division by zero
+        progress_pct = 0
+        if state.total_chunks > 0:
+            progress_pct = int((state.current_chunk / state.total_chunks) * 100)
+        # Build progress bar using Rich Progress
+        progress = Progress(
+            TextColumn("[bold cyan]\u25ba[/bold cyan]"),
+            TextColumn(f"CHUNK {state.current_chunk}/{state.total_chunks}"),
+            BarColumn(bar_width=30, complete_style="cyan", finished_style="green"),
+            TextColumn(f"{progress_pct}%"),
+            expand=False,
+        )
+        # The Progress is rebuilt on every refresh, so the task id returned by
+        # add_task is never needed and is not kept.
+        progress.add_task("", total=state.total_chunks, completed=state.current_chunk)
+        progress_panel = Panel(
+            progress,
+            border_style="cyan",
+            box=DOUBLE,
+            padding=(0, 1),
+        )
+        self._layout["progress_bar"].update(progress_panel)
+    def _refresh_left_panel(self) -> None:
+        """Refresh the left panel with functions list and metrics.
+        Shows the six most recently added functions as a tree, a "+N more"
+        line when older ones are hidden, then token and latency summaries.
+        """
+        if not HAS_RICH or self._layout is None:
+            return
+        state = self._state
+        func_count = len(state.functions)
+        # Build function tree
+        content = Table.grid(padding=(0, 0))
+        content.add_column()
+        # Show max 6 functions with tree structure; a negative slice already
+        # returns the whole list when there are fewer than max_display entries.
+        max_display = 6
+        display_funcs = state.functions[-max_display:]
+        last_index = len(display_funcs) - 1
+        for i, func in enumerate(display_funcs):
+            func_text = Text()
+            # Tree-style prefix: corner for the last entry, tee for the rest
+            branch = "\u2514\u2500 " if i == last_index else "\u251c\u2500 "
+            func_text.append(branch, style="dim cyan")
+            func_text.append(func.name, style="bold")
+            func_text.append(" \u2713", style="green")  # Checkmark
+            content.add_row(func_text)
+        # Show "+N more" if truncated
+        if func_count > max_display:
+            hidden_count = func_count - max_display
+            content.add_row(Text(f"   (+{hidden_count} more)", style="dim italic"))
+        # Add spacing
+        content.add_row(Text(""))
+        # Token stats, shown in thousands
+        tokens_in_k = state.tokens_in / 1000
+        tokens_out_k = state.tokens_out / 1000
+        tokens_text = Text()
+        tokens_text.append("TOKENS: ", style="bold")
+        tokens_text.append(f"~{tokens_in_k:.1f}k in / ~{tokens_out_k:.1f}k out", style="dim")
+        content.add_row(tokens_text)
+        # Latency stats. Both values are stored in milliseconds and displayed
+        # in seconds, so both must be divided by 1000.
+        latency_text = Text()
+        latency_text.append("LATENCY: ", style="bold")
+        if state.llm_call_count > 0:
+            latency_text.append(f"{state.latency_last_ms / 1000:.1f}s", style="cyan")
+            latency_text.append(f" (avg {state.latency_avg_ms / 1000:.1f}s)", style="dim")
+        else:
+            latency_text.append("\u2014", style="dim")  # Em dash
+        content.add_row(latency_text)
+        left_panel = Panel(
+            content,
+            title=f"[bold cyan]FUNCTIONS ACQUIRED [{func_count}][/bold cyan]",
+            border_style="cyan",
+            box=DOUBLE,
+        )
+        self._layout["left_panel"].update(left_panel)
+    def _parse_response_for_display(self, response: str) -> str:
+        """Parse LLM XML response into readable format for transmission log.
+        Args:
+            response: Raw LLM response text (XML format)
+        Returns:
+            Formatted string for display showing issues, function being
+            generated, and chunk status. When none of those tags are found,
+            the raw response is returned, cut to 500 characters plus "..."
+            if it is longer.
+        """
+        import re
+        lines = []
+        try:
+            # Find all issues (expects the id attribute before solved)
+            issue_pattern = r'<issue[^>]*id="(\d+)"[^>]*solved="(true|false)"[^>]*>([^<]+)</issue>'
+            issues = re.findall(issue_pattern, response, re.DOTALL)
+            if issues:
+                lines.append("ISSUES DETECTED:")
+                for _, solved, desc in issues[:8]:  # Limit to 8 issues
+                    marker = "\u2713" if solved == "true" else "\u2717"  # checkmark or X
+                    desc_clean = desc.strip()[:40]  # Truncate description
+                    lines.append(f"  {marker} {desc_clean}")
+                if len(issues) > 8:
+                    lines.append(f"  (+{len(issues) - 8} more)")
+                lines.append("")
+            # Find function being generated
+            name_match = re.search(r'<name>([^<]+)</name>', response)
+            docstring_match = re.search(r'<docstring>([^<]+)</docstring>', response, re.DOTALL)
+            if name_match:
+                lines.append(f"GENERATING: {name_match.group(1).strip()}")
+                if docstring_match:
+                    doc = docstring_match.group(1).strip()
+                    # Only mark the preview as cut when text was actually dropped
+                    if len(doc) > 60:
+                        doc = doc[:60] + "..."
+                    lines.append(f'  "{doc}"')
+                lines.append("")
+            # Find chunk status
+            status_match = re.search(r'<chunk_status>([^<]+)</chunk_status>', response)
+            if status_match:
+                status = status_match.group(1).strip()
+                lines.append(f"STATUS: {status.upper()}")
+            if lines:
+                return "\n".join(lines)
+        except TypeError:
+            # Best effort: a non-text response cannot be searched with these
+            # patterns, so fall through to the raw fallback below.
+            pass
+        # Fallback: show truncated raw response
+        return (response[:500] + "...") if len(response) > 500 else response
+    def _refresh_right_panel(self) -> None:
+        """Redraw the transmission log from the most recent LLM response."""
+        if self._layout is None or not HAS_RICH:
+            return
+        raw = self._state.last_response
+        # Placeholder until the first response arrives; afterwards the parsed summary
+        shown = self._parse_response_for_display(raw) if raw else "(Awaiting transmission...)"
+        self._layout["right_panel"].update(
+            Panel(
+                Text(shown, style="dim cyan"),
+                title="[bold cyan]\u25c4\u25c4 TRANSMISSION LOG \u25ba\u25ba[/bold cyan]",
+                border_style="cyan",
+                box=DOUBLE,
+            )
+        )
+    # Legacy method stubs for backwards compatibility
+    def _refresh_progress(self) -> None:
+        """Legacy alias that forwards to _refresh_progress_bar."""
+        redraw = self._refresh_progress_bar
+        redraw()
+    def _refresh_functions(self) -> None:
+        """Legacy alias that forwards to _refresh_left_panel."""
+        redraw = self._refresh_left_panel
+        redraw()
+    def _refresh_footer(self) -> None:
+        """Legacy no-op retained so existing callers keep working."""
+        return None

{recursive_cleaner-0.7.0.dist-info → recursive_cleaner-0.8.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: recursive-cleaner
-Version: 0.7.0
+Version: 0.8.0
 Summary: LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions
 Project-URL: Homepage, https://github.com/gaztrabisme/recursive-data-cleaner
 Project-URL: Repository, https://github.com/gaztrabisme/recursive-data-cleaner
@@ -32,6 +32,8 @@ Provides-Extra: mlx
 Requires-Dist: mlx-lm>=0.10.0; extra == 'mlx'
 Provides-Extra: parquet
 Requires-Dist: pyarrow>=14.0.0; extra == 'parquet'
+Provides-Extra: tui
+Requires-Dist: rich>=13.0; extra == 'tui'
 Description-Content-Type: text/markdown
 # Recursive Data Cleaner
@@ -40,7 +42,7 @@ LLM-powered incremental data cleaning for massive datasets. Process files in chu
 ## How It Works
-1. **Chunk** your data (JSONL, CSV, JSON, or text)
+1. **Chunk** your data (JSONL, CSV, JSON, Parquet, PDF, Word, Excel, XML, and more)
 2. **Analyze** each chunk with an LLM to identify issues
 3. **Generate** one cleaning function per issue
 4. **Validate** functions on holdout data before accepting
@@ -59,6 +61,21 @@ For Apple Silicon (MLX backend):
 pip install -e ".[mlx]"
 ```
+For document conversion (PDF, Word, Excel, HTML, etc.):
+```bash
+pip install -e ".[markitdown]"
+```
+For Parquet files:
+```bash
+pip install -e ".[parquet]"
+```
+For Terminal UI (Rich dashboard):
+```bash
+pip install -e ".[tui]"
+```
 ## Quick Start
 ```python
@@ -111,6 +128,18 @@ cleaner.run()  # Generates cleaning_functions.py
 - **Cleaning Reports**: Markdown summary with functions, timing, quality delta
 - **Dry-Run Mode**: Analyze data without generating functions
+### Format Expansion (v0.7.0)
+- **Markitdown Integration**: Convert 20+ formats (PDF, Word, Excel, PowerPoint, HTML, EPUB, etc.) to text
+- **Parquet Support**: Load parquet files as structured data via pyarrow
+- **LLM-Generated Parsers**: Auto-generate parsers for XML and unknown formats (`auto_parse=True`)
+### Terminal UI (v0.8.0)
+- **Mission Control Dashboard**: Rich-based live terminal UI with retro aesthetic
+- **Real-time Progress**: Animated progress bars, chunk/iteration counters
+- **Transmission Log**: Parsed LLM responses showing issues detected and functions being generated
+- **Token Estimation**: Track estimated input/output tokens across the run
+- **Graceful Fallback**: Works without Rich installed (falls back to callbacks)
 ## Configuration
 ```python
@@ -142,6 +171,12 @@ cleaner = DataCleaner(
     report_path="report.md",    # Markdown report output (None to disable)
     dry_run=False,              # Analyze without generating functions
+    # Format Expansion
+    auto_parse=False,           # LLM generates parser for unknown formats
+    # Terminal UI
+    tui=True,                   # Enable Rich dashboard (requires [tui] extra)
     # Progress & State
     on_progress=callback,       # Progress event callback
     state_file="state.json",    # Enable resume on interrupt
@@ -235,20 +270,22 @@ cleaner.run()
 ```
 recursive_cleaner/
-├── cleaner.py       # Main DataCleaner class (~580 lines)
-├── context.py       # Docstring registry with FIFO eviction
-├── dependencies.py  # Topological sort for function ordering
-├── metrics.py       # Quality metrics before/after
-├── optimizer.py     # Two-pass consolidation with LLM agency
-├── output.py        # Function file generation + import consolidation
-├── parsers.py       # Chunking for JSONL/CSV/JSON/text + sampling
-├── prompt.py        # LLM prompt templates
-├── report.py        # Markdown report generation
-├── response.py      # XML/markdown parsing + agency dataclasses
-├── schema.py        # Schema inference
-├── validation.py    # Runtime validation + holdout
+├── cleaner.py          # Main DataCleaner class
+├── context.py          # Docstring registry with FIFO eviction
+├── dependencies.py     # Topological sort for function ordering
+├── metrics.py          # Quality metrics before/after
+├── optimizer.py        # Two-pass consolidation with LLM agency
+├── output.py           # Function file generation + import consolidation
+├── parser_generator.py # LLM-generated parsers for unknown formats
+├── parsers.py          # Chunking for all formats + sampling
+├── prompt.py           # LLM prompt templates
+├── report.py           # Markdown report generation
+├── response.py         # XML/markdown parsing + agency dataclasses
+├── schema.py           # Schema inference
+├── tui.py              # Rich terminal dashboard
+├── validation.py       # Runtime validation + holdout
 └── vendor/
-    └── chunker.py   # Vendored sentence-aware chunker
+    └── chunker.py      # Vendored sentence-aware chunker
 ```
 ## Testing
@@ -257,7 +294,7 @@ recursive_cleaner/
 pytest tests/ -v
 ```
-392 tests covering all features. Test datasets in `test_cases/`:
+465 tests covering all features. Test datasets in `test_cases/`:
 - E-commerce product catalogs
 - Healthcare patient records
 - Financial transaction data
@@ -273,6 +310,8 @@ pytest tests/ -v
 | Version | Features |
 |---------|----------|
+| v0.8.0 | Terminal UI with Rich dashboard, mission control aesthetic, transmission log |
+| v0.7.0 | Markitdown (20+ formats), Parquet support, LLM-generated parsers |
 | v0.6.0 | Latency metrics, import consolidation, cleaning report, dry-run mode |
 | v0.5.1 | Dangerous code detection (AST-based security) |
 | v0.5.0 | Two-pass optimization, early termination, LLM agency |

{recursive_cleaner-0.7.0.dist-info → recursive_cleaner-0.8.0.dist-info}/RECORD RENAMED Viewed

@@ -1,7 +1,7 @@
 backends/__init__.py,sha256=FUgODeYSGBvT0-z6myVby6YeAHG0nEUgWLITBKobUew,121
 backends/mlx_backend.py,sha256=0U6IqmDHyk4vjKzytvEcQvSUBryQTgFtsNOcpwFNKk8,2945
-recursive_cleaner/__init__.py,sha256=bG83PcmkxAYMC17FmKuyMJUrMnuukp32JO3rlCLyB-Q,1698
-recursive_cleaner/cleaner.py,sha256=J2X5bnk2OsWJyOn4BNR-cj0sqeKCylznfs_WEyMGxG8,26280
+recursive_cleaner/__init__.py,sha256=v0bNQ3H0d7n6cTOkuxuqG9bmnX9yeZBLZ_AfFM7edHI,1789
+recursive_cleaner/cleaner.py,sha256=vZTMwaLlCmuh1qy3c-puEZrwS5gXt0u28d5iweQXbms,29801
 recursive_cleaner/context.py,sha256=avMXRDxLd7nd8CKWtvPHQy1MFhBKiA0aUVVJIlWoLZ4,824
 recursive_cleaner/dependencies.py,sha256=vlYeoGL517v3yUSWN0wYDuIs9OOuQwM_dCBADrlitW8,2080
 recursive_cleaner/errors.py,sha256=hwRJF8NSmWy_FZHCxcZDZxLQ0zqvo5dX8ImkB9mrOYc,433
@@ -14,11 +14,12 @@ recursive_cleaner/prompt.py,sha256=ep0eOXz_XbhH3HduJ76LvzVSftonhcv4GLEecIqd3lY,6
 recursive_cleaner/report.py,sha256=AWWneRjvl76ccLlExdkKJeY3GVFUG_LtmzVIJJT5cFI,4629
 recursive_cleaner/response.py,sha256=3w0mLnqEPdB4daMSF0mtTcG0PTP-utb1HFtKuYA1ljw,9064
 recursive_cleaner/schema.py,sha256=w2hcEdApR15KVI9SFWB3VfumMoHFwn1YJrktdfgPo8M,3925
+recursive_cleaner/tui.py,sha256=FwG_uCwqUcvch5dRZmV-ba2JXD0XJkm9roXzPQ9iUSo,21633
 recursive_cleaner/types.py,sha256=-GdCmsfHd3rfdfCi5c-RXqX4TyuCSHgA__3AF3bMhoQ,290
 recursive_cleaner/validation.py,sha256=-KAolhw3GQyhHwmh0clEj8xqPD5O-R2AO5rx7vubIME,6442
 recursive_cleaner/vendor/__init__.py,sha256=E87TjmjRzu8ty39nqThvBwM611yXlLKQZ6KGY_zp3Dk,117
 recursive_cleaner/vendor/chunker.py,sha256=pDDbfF6FoSmUji0-RG4MletPxJ-VybGw0yfnhh0aMSQ,6730
-recursive_cleaner-0.7.0.dist-info/METADATA,sha256=bSCS8YBPAYzBufVF41LDYAgpLnYc4JAynA4xkNVuKyo,9486
-recursive_cleaner-0.7.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-recursive_cleaner-0.7.0.dist-info/licenses/LICENSE,sha256=P8hRMK-UqRbQDsVN9nr901wpZcqwXEHr28DXhBUheF0,1064
-recursive_cleaner-0.7.0.dist-info/RECORD,,
+recursive_cleaner-0.8.0.dist-info/METADATA,sha256=rVABzjvUZ-uzk35o5evbIlkRIbgEb29QPKSCoMI4_fs,11072
+recursive_cleaner-0.8.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+recursive_cleaner-0.8.0.dist-info/licenses/LICENSE,sha256=P8hRMK-UqRbQDsVN9nr901wpZcqwXEHr28DXhBUheF0,1064
+recursive_cleaner-0.8.0.dist-info/RECORD,,

{recursive_cleaner-0.7.0.dist-info → recursive_cleaner-0.8.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{recursive_cleaner-0.7.0.dist-info → recursive_cleaner-0.8.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

recursive-cleaner 0.7.0__py3-none-any.whl → 0.8.0__py3-none-any.whl

recursive-cleaner 0.7.0__py3-none-any.whl → 0.8.0__py3-none-any.whl