recursive-cleaner 0.8.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,395 @@
1
+ """CLI interface for Recursive Data Cleaner."""
2
+
3
+ import argparse
4
+ import os
5
+ import sys
6
+
7
+
8
+ def create_backend(provider: str, model: str, base_url: str | None, api_key: str | None):
9
+ """
10
+ Factory function to create the appropriate backend.
11
+
12
+ Args:
13
+ provider: Backend provider ("mlx" or "openai")
14
+ model: Model name/path
15
+ base_url: Optional API base URL (for openai-compatible servers)
16
+ api_key: Optional API key
17
+
18
+ Returns:
19
+ LLMBackend instance
20
+
21
+ Raises:
22
+ SystemExit: With code 2 if provider is invalid or import fails
23
+ """
24
+ if provider == "mlx":
25
+ try:
26
+ from backends import MLXBackend
27
+ return MLXBackend(model_path=model)
28
+ except ImportError:
29
+ print("Error: MLX backend requires mlx-lm. Install with: pip install mlx-lm", file=sys.stderr)
30
+ sys.exit(2)
31
+ elif provider == "openai":
32
+ try:
33
+ from backends import OpenAIBackend
34
+ return OpenAIBackend(model=model, api_key=api_key, base_url=base_url)
35
+ except ImportError as e:
36
+ print(f"Error: {e}", file=sys.stderr)
37
+ sys.exit(2)
38
+ else:
39
+ print(f"Error: Unknown provider '{provider}'. Use 'mlx' or 'openai'.", file=sys.stderr)
40
+ sys.exit(2)
41
+
42
+
43
def read_instructions(value: str) -> str:
    """
    Resolve the instructions argument to its text content.

    Three forms are supported: "-" reads from stdin, "@path" loads a
    file, and anything else is returned as inline text.

    Args:
        value: Instructions string or @file.txt path

    Returns:
        Instructions text
    """
    if value == "-":
        return sys.stdin.read().strip()
    if not value.startswith("@"):
        return value

    # "@path" form: load the instructions from the named file.
    file_path = value[1:]
    try:
        with open(file_path, "r") as f:
            return f.read().strip()
    except FileNotFoundError:
        print(f"Error: Instructions file not found: {file_path}", file=sys.stderr)
        sys.exit(1)
    except IOError as e:
        print(f"Error reading instructions file: {e}", file=sys.stderr)
        sys.exit(1)
67
+
68
+
69
def cmd_generate(args) -> int:
    """Run the generate command: produce cleaning functions for a data file."""
    from recursive_cleaner import DataCleaner

    # Fail fast on a missing input file before constructing the backend.
    if not os.path.exists(args.file):
        print(f"Error: File not found: {args.file}", file=sys.stderr)
        return 1

    llm = create_backend(args.provider, args.model, args.base_url, args.api_key)
    cleaning_instructions = read_instructions(args.instructions) if args.instructions else ""

    # Plain-text progress reporting, used only when the Rich TUI is off.
    def on_progress(event):
        if not args.tui:
            if event.get("type", "") == "function_generated":
                print(f" Generated: {event.get('function_name', '')}")

    try:
        cleaner = DataCleaner(
            llm_backend=llm,
            file_path=args.file,
            chunk_size=args.chunk_size,
            instructions=cleaning_instructions,
            max_iterations=args.max_iterations,
            mode=args.mode,
            state_file=args.state_file,
            report_path=args.report if args.report else None,
            tui=args.tui,
            optimize=args.optimize,
            track_metrics=args.track_metrics,
            early_termination=args.early_termination,
            on_progress=None if args.tui else on_progress,
            output_path=args.output,
        )
        cleaner.run()
        return 0
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 3
116
+
117
+
118
def cmd_analyze(args) -> int:
    """Run the analyze command: dry-run issue detection without generating functions."""
    from recursive_cleaner import DataCleaner

    # Fail fast on a missing input file before constructing the backend.
    if not os.path.exists(args.file):
        print(f"Error: File not found: {args.file}", file=sys.stderr)
        return 1

    llm = create_backend(args.provider, args.model, args.base_url, args.api_key)
    cleaning_instructions = read_instructions(args.instructions) if args.instructions else ""

    # Per-chunk issue summary, printed only in plain (non-TUI) mode.
    def on_progress(event):
        if args.tui:
            return
        if event.get("type", "") != "issues_detected":
            return
        issues = event.get("issues", [])
        chunk_idx = event.get("chunk_index", 0)
        unsolved = [i for i in issues if not i.get("solved", False)]
        print(f"Chunk {chunk_idx + 1}: {len(issues)} issues ({len(unsolved)} unsolved)")

    try:
        cleaner = DataCleaner(
            llm_backend=llm,
            file_path=args.file,
            chunk_size=args.chunk_size,
            instructions=cleaning_instructions,
            max_iterations=args.max_iterations,
            mode=args.mode,
            dry_run=True,
            tui=args.tui,
            on_progress=None if args.tui else on_progress,
        )
        cleaner.run()
        return 0
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 3
163
+
164
+
165
def cmd_apply(args) -> int:
    """Run the apply command: execute generated cleaning functions on a data file."""
    from recursive_cleaner.apply import apply_cleaning

    # Both the input data and the functions module must exist up front.
    if not os.path.exists(args.file):
        print(f"Error: File not found: {args.file}", file=sys.stderr)
        return 1
    if not os.path.exists(args.functions):
        print(f"Error: Functions file not found: {args.functions}", file=sys.stderr)
        return 1

    try:
        result_path = apply_cleaning(
            input_path=args.file,
            functions_path=args.functions,
            output_path=args.output,
        )
        print(f"Cleaned data written to: {result_path}")
        return 0
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except ImportError as e:
        # Exit code 2 signals a backend/import problem, matching create_backend.
        print(f"Error: {e}", file=sys.stderr)
        return 2
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 3
196
+
197
+
198
def cmd_resume(args) -> int:
    """Run the resume command: continue a previous run from its checkpoint file."""
    from recursive_cleaner import DataCleaner

    # The checkpoint must exist before we spend time building a backend.
    if not os.path.exists(args.state_file):
        print(f"Error: State file not found: {args.state_file}", file=sys.stderr)
        return 1

    llm = create_backend(args.provider, args.model, args.base_url, args.api_key)

    try:
        restored = DataCleaner.resume(args.state_file, llm)
        restored.run()
        return 0
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except ValueError as e:
        # A corrupted or incompatible checkpoint surfaces as ValueError.
        print(f"Error: Invalid state file: {e}", file=sys.stderr)
        return 1
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 3
223
+
224
+
225
def _add_backend_args(sub: argparse.ArgumentParser) -> None:
    """Add the LLM backend options shared by generate, analyze, and resume."""
    sub.add_argument(
        "-p", "--provider", required=True, choices=["mlx", "openai"],
        help="LLM provider (mlx or openai)"
    )
    sub.add_argument(
        "-m", "--model", required=True, help="Model name/path"
    )
    sub.add_argument(
        "--base-url", help="API base URL (for openai-compatible servers)"
    )
    sub.add_argument(
        "--api-key", help="API key (or use OPENAI_API_KEY env var)"
    )


def _add_analysis_args(sub: argparse.ArgumentParser) -> None:
    """Add input/instruction/chunking options shared by generate and analyze."""
    sub.add_argument("file", metavar="FILE", help="Path to input data file")
    _add_backend_args(sub)
    sub.add_argument(
        "-i", "--instructions", default="",
        help="Cleaning instructions (text or @file.txt)"
    )
    sub.add_argument(
        "--chunk-size", type=int, default=50, help="Items per chunk (default: 50)"
    )
    sub.add_argument(
        "--max-iterations", type=int, default=5,
        help="Max iterations per chunk (default: 5)"
    )
    sub.add_argument(
        "--mode", choices=["auto", "structured", "text"], default="auto",
        help="Processing mode (default: auto)"
    )
    sub.add_argument(
        "--tui", action="store_true", help="Enable Rich terminal dashboard"
    )


def create_parser() -> argparse.ArgumentParser:
    """Create the argument parser with all subcommands.

    The shared option groups are factored into private helpers so the four
    subcommands cannot drift apart (same flags, defaults, and help text).

    Returns:
        Configured argparse.ArgumentParser with generate/analyze/resume/apply
        subcommands, each carrying its handler in the ``func`` default.
    """
    parser = argparse.ArgumentParser(
        prog="recursive-cleaner",
        description="LLM-powered incremental data cleaning pipeline",
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # --- generate command: full pipeline producing cleaning functions ---
    gen_parser = subparsers.add_parser(
        "generate",
        help="Generate cleaning functions from data file",
    )
    _add_analysis_args(gen_parser)
    gen_parser.add_argument(
        "-o", "--output", default="cleaning_functions.py",
        help="Output file path (default: cleaning_functions.py)"
    )
    gen_parser.add_argument(
        "--report", default="cleaning_report.md",
        help="Report file path (empty to disable, default: cleaning_report.md)"
    )
    gen_parser.add_argument(
        "--state-file", help="Checkpoint file for resume"
    )
    gen_parser.add_argument(
        "--optimize", action="store_true", help="Consolidate redundant functions"
    )
    gen_parser.add_argument(
        "--track-metrics", action="store_true", help="Measure before/after quality"
    )
    gen_parser.add_argument(
        "--early-termination", action="store_true",
        help="Stop on pattern saturation"
    )
    gen_parser.set_defaults(func=cmd_generate)

    # --- analyze command: dry-run issue detection only ---
    analyze_parser = subparsers.add_parser(
        "analyze",
        help="Dry-run analysis without generating functions",
    )
    _add_analysis_args(analyze_parser)
    analyze_parser.set_defaults(func=cmd_analyze)

    # --- resume command: continue from a checkpoint file ---
    resume_parser = subparsers.add_parser(
        "resume",
        help="Resume from checkpoint file",
    )
    resume_parser.add_argument(
        "state_file", metavar="STATE_FILE", help="Path to checkpoint JSON file"
    )
    _add_backend_args(resume_parser)
    resume_parser.set_defaults(func=cmd_resume)

    # --- apply command: run generated functions over a data file ---
    apply_parser = subparsers.add_parser(
        "apply",
        help="Apply cleaning functions to data file",
    )
    apply_parser.add_argument("file", metavar="FILE", help="Path to input data file")
    apply_parser.add_argument(
        "-f", "--functions", required=True,
        help="Path to cleaning_functions.py"
    )
    apply_parser.add_argument(
        "-o", "--output", help="Output file path (default: <input>.cleaned.<ext>)"
    )
    apply_parser.set_defaults(func=cmd_apply)

    return parser
372
+
373
+
374
def main(args: list[str] | None = None) -> int:
    """
    Main entry point for the CLI.

    Args:
        args: Command-line arguments (defaults to sys.argv[1:])

    Returns:
        Exit code (0=success, 1=general error, 2=backend error, 3=validation error)
    """
    parser = create_parser()
    namespace = parser.parse_args(args)

    # No subcommand given: show usage instead of failing.
    if namespace.command is None:
        parser.print_help()
        return 0

    # Dispatch to the handler the chosen subparser registered via set_defaults.
    return namespace.func(namespace)
392
+
393
+
394
+ if __name__ == "__main__":
395
+ sys.exit(main())
@@ -52,7 +52,8 @@ CONSOLIDATION_TEMPLATE = '''You are reviewing cleaning functions for consolidati
52
52
  </docstring>
53
53
  <code>
54
54
  ```python
55
- def merged_function_name(record):
55
+ def merged_function_name(record: dict) -> dict:
56
+ # Modify fields, return record
56
57
  ...
57
58
  ```
58
59
  </code>
@@ -108,9 +109,10 @@ Tags: domain, action, detail
108
109
  </docstring>
109
110
  <code>
110
111
  ```python
111
- def function_name(data):
112
- # Complete implementation
113
- pass
112
+ def function_name(record: dict) -> dict:
113
+ # Modify field(s) in the record
114
+ record["field"] = cleaned_value
115
+ return record
114
116
  ```
115
117
  </code>
116
118
  </function_to_generate>
@@ -120,6 +122,8 @@ def function_name(data):
120
122
 
121
123
  RULES:
122
124
  - ONE function per response
125
+ - Function signature: takes a dict (one record), returns the modified dict
126
+ - Modify fields directly on the record, then return it
123
127
  - If all issues solved: <chunk_status>clean</chunk_status>, omit <function_to_generate>
124
128
  - Include imports inside the function or document needed imports in docstring
125
129
  - Function must be idempotent (safe to run multiple times)
recursive_cleaner/tui.py CHANGED
@@ -505,19 +505,28 @@ class TUIRenderer:
505
505
  )
506
506
  self._layout["left_panel"].update(left_panel)
507
507
 
508
- def _parse_response_for_display(self, response: str) -> str:
509
- """Parse LLM XML response into readable format for transmission log.
508
+ def _colorize_transmission(self, response: str) -> "Text":
509
+ """Parse LLM XML response into colorized Rich Text for transmission log.
510
+
511
+ Color scheme:
512
+ - Issues (solved): dim
513
+ - Issues (unsolved): bright_white with cycling accent (blue/magenta/cyan/yellow)
514
+ - Function names: green
515
+ - Docstrings: italic
516
+ - Status clean: green
517
+ - Status needs_more_work: yellow
510
518
 
511
519
  Args:
512
520
  response: Raw LLM response text (XML format)
513
521
 
514
522
  Returns:
515
- Formatted string for display showing issues, function being
516
- generated, and chunk status.
523
+ Rich Text object with colors applied.
517
524
  """
518
525
  import re
519
526
 
520
- lines = []
527
+ ISSUE_COLORS = ["blue", "magenta", "cyan", "yellow"]
528
+ text = Text()
529
+ unsolved_index = 0
521
530
 
522
531
  try:
523
532
  # Find all issues
@@ -525,53 +534,63 @@ class TUIRenderer:
525
534
  issues = re.findall(issue_pattern, response, re.DOTALL)
526
535
 
527
536
  if issues:
528
- lines.append("ISSUES DETECTED:")
537
+ text.append("ISSUES DETECTED:\n", style="bold cyan")
529
538
  for issue_id, solved, desc in issues[:8]: # Limit to 8 issues
530
- marker = "\u2713" if solved == "true" else "\u2717" # checkmark or X
531
539
  desc_clean = desc.strip()[:40] # Truncate description
532
- lines.append(f" {marker} {desc_clean}")
540
+ if solved == "true":
541
+ text.append(" \u2713 ", style="green")
542
+ text.append(f"{desc_clean}\n", style="dim")
543
+ else:
544
+ accent = ISSUE_COLORS[unsolved_index % len(ISSUE_COLORS)]
545
+ text.append(" \u2717 ", style=accent)
546
+ text.append(f"{desc_clean}\n", style="bright_white")
547
+ unsolved_index += 1
533
548
  if len(issues) > 8:
534
- lines.append(f" (+{len(issues) - 8} more)")
535
- lines.append("")
549
+ text.append(f" (+{len(issues) - 8} more)\n", style="dim")
550
+ text.append("\n")
536
551
 
537
552
  # Find function being generated
538
553
  name_match = re.search(r'<name>([^<]+)</name>', response)
539
554
  docstring_match = re.search(r'<docstring>([^<]+)</docstring>', response, re.DOTALL)
540
555
 
541
556
  if name_match:
542
- lines.append(f"GENERATING: {name_match.group(1).strip()}")
557
+ text.append("GENERATING: ", style="bold cyan")
558
+ text.append(f"{name_match.group(1).strip()}\n", style="green bold")
543
559
  if docstring_match:
544
560
  doc = docstring_match.group(1).strip()[:60]
545
- lines.append(f' "{doc}..."')
546
- lines.append("")
561
+ text.append(f' "{doc}..."\n', style="italic")
562
+ text.append("\n")
547
563
 
548
564
  # Find chunk status
549
565
  status_match = re.search(r'<chunk_status>([^<]+)</chunk_status>', response)
550
566
  if status_match:
551
567
  status = status_match.group(1).strip()
552
- lines.append(f"STATUS: {status.upper()}")
553
-
554
- if lines:
555
- return "\n".join(lines)
568
+ text.append("STATUS: ", style="bold cyan")
569
+ if status == "clean":
570
+ text.append(status.upper(), style="green bold")
571
+ else:
572
+ text.append(status.upper().replace("_", " "), style="yellow bold")
573
+
574
+ if text.plain:
575
+ return text
556
576
  except Exception:
557
577
  pass
558
578
 
559
579
  # Fallback: show truncated raw response
560
- return response[:500] + "..." if len(response) > 500 else response
580
+ fallback = response[:500] + "..." if len(response) > 500 else response
581
+ return Text(fallback, style="dim cyan")
561
582
 
562
583
  def _refresh_right_panel(self) -> None:
563
- """Refresh the right panel with parsed transmission log."""
584
+ """Refresh the right panel with colorized transmission log."""
564
585
  if not HAS_RICH or self._layout is None:
565
586
  return
566
587
 
567
- # Get last response and parse for display
588
+ # Get last response and colorize for display
568
589
  response = self._state.last_response
569
590
  if not response:
570
- display_text = "(Awaiting transmission...)"
591
+ log_text = Text("(Awaiting transmission...)", style="dim cyan")
571
592
  else:
572
- display_text = self._parse_response_for_display(response)
573
-
574
- log_text = Text(display_text, style="dim cyan")
593
+ log_text = self._colorize_transmission(response)
575
594
 
576
595
  right_panel = Panel(
577
596
  log_text,
@@ -160,7 +160,10 @@ def validate_function(
160
160
  # Structured mode: sample_data is list[dict]
161
161
  for i, record in enumerate(sample_data):
162
162
  try:
163
- func(record)
163
+ result = func(record)
164
+ # Verify function returns a dict (not string, int, etc.)
165
+ if not isinstance(result, dict):
166
+ return False, f"Function must return dict, got {type(result).__name__}"
164
167
  except Exception as e:
165
168
  return False, f"Runtime error on sample {i}: {type(e).__name__}: {e}"
166
169
 
@@ -200,3 +203,39 @@ def extract_sample_data(
200
203
  except json.JSONDecodeError:
201
204
  continue
202
205
  return samples
206
+
207
+
208
+ def extract_modified_fields(code: str) -> set[str]:
209
+ """
210
+ Extract field names that are modified via record["field"] = ... pattern.
211
+
212
+ Args:
213
+ code: Python source code of the function
214
+
215
+ Returns:
216
+ Set of field names that are assigned to
217
+ """
218
+ try:
219
+ tree = ast.parse(code)
220
+ except SyntaxError:
221
+ return set()
222
+
223
+ fields = set()
224
+ # Common parameter names for the data/record argument
225
+ data_names = {"record", "data"}
226
+
227
+ for node in ast.walk(tree):
228
+ # Look for assignment statements
229
+ if isinstance(node, ast.Assign):
230
+ for target in node.targets:
231
+ # Check if target is a subscript: record["field"] or data["field"]
232
+ if isinstance(target, ast.Subscript):
233
+ # The value should be a Name node (record or data)
234
+ if isinstance(target.value, ast.Name):
235
+ if target.value.id in data_names:
236
+ # The slice should be a string constant
237
+ if isinstance(target.slice, ast.Constant):
238
+ if isinstance(target.slice.value, str):
239
+ fields.add(target.slice.value)
240
+
241
+ return fields