recursive-cleaner 0.8.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
backends/__init__.py CHANGED
@@ -1,5 +1,6 @@
1
1
  """Backend implementations for Recursive Data Cleaner."""
2
2
 
3
3
  from .mlx_backend import MLXBackend
4
+ from .openai_backend import OpenAIBackend
4
5
 
5
- __all__ = ["MLXBackend"]
6
+ __all__ = ["MLXBackend", "OpenAIBackend"]
@@ -0,0 +1,71 @@
1
+ """OpenAI-compatible backend for Recursive Data Cleaner."""
2
+
3
+ import os
4
+
5
+
6
+ class OpenAIBackend:
7
+ """
8
+ OpenAI-compatible backend implementation.
9
+
10
+ Works with OpenAI API, LM Studio, Ollama, and other OpenAI-compatible servers.
11
+ Conforms to the LLMBackend protocol.
12
+ """
13
+
14
+ def __init__(
15
+ self,
16
+ model: str,
17
+ api_key: str | None = None,
18
+ base_url: str | None = None,
19
+ max_tokens: int = 4096,
20
+ temperature: float = 0.7,
21
+ ):
22
+ """
23
+ Initialize the OpenAI backend.
24
+
25
+ Args:
26
+ model: Model name (e.g., "gpt-4o", "gpt-3.5-turbo")
27
+ api_key: API key (defaults to OPENAI_API_KEY env var, or "not-needed" for local)
28
+ base_url: API base URL (defaults to OpenAI's API)
29
+ max_tokens: Maximum tokens to generate
30
+ temperature: Sampling temperature
31
+ """
32
+ try:
33
+ import openai
34
+ except ImportError:
35
+ raise ImportError(
36
+ "OpenAI SDK not installed. Install with: pip install openai"
37
+ )
38
+
39
+ self.model = model
40
+ self.max_tokens = max_tokens
41
+ self.temperature = temperature
42
+
43
+ # Resolve API key: explicit > env var > "not-needed" for local servers
44
+ if api_key is not None:
45
+ resolved_key = api_key
46
+ else:
47
+ resolved_key = os.environ.get("OPENAI_API_KEY", "not-needed")
48
+
49
+ # Create client
50
+ self._client = openai.OpenAI(
51
+ api_key=resolved_key,
52
+ base_url=base_url,
53
+ )
54
+
55
+ def generate(self, prompt: str) -> str:
56
+ """
57
+ Generate a response from the LLM.
58
+
59
+ Args:
60
+ prompt: The input prompt
61
+
62
+ Returns:
63
+ The generated text response
64
+ """
65
+ response = self._client.chat.completions.create(
66
+ model=self.model,
67
+ messages=[{"role": "user", "content": prompt}],
68
+ max_tokens=self.max_tokens,
69
+ temperature=self.temperature,
70
+ )
71
+ return response.choices[0].message.content or ""
@@ -1,5 +1,6 @@
1
1
  """Recursive Data Cleaner - LLM-powered incremental data cleaning pipeline."""
2
2
 
3
+ from recursive_cleaner.apply import apply_cleaning
3
4
  from recursive_cleaner.cleaner import DataCleaner
4
5
  from recursive_cleaner.context import build_context
5
6
  from recursive_cleaner.dependencies import resolve_dependencies
@@ -24,6 +25,7 @@ from recursive_cleaner.tui import HAS_RICH, TUIRenderer
24
25
  from recursive_cleaner.validation import check_code_safety, extract_sample_data, validate_function
25
26
 
26
27
  __all__ = [
28
+ "apply_cleaning",
27
29
  "CleanerError",
28
30
  "ParseError",
29
31
  "MaxIterationsError",
@@ -0,0 +1,8 @@
1
+ """Entry point for python -m recursive_cleaner."""
2
+
3
+ import sys
4
+
5
+ from recursive_cleaner.cli import main
6
+
7
+ if __name__ == "__main__":
8
+ sys.exit(main())
@@ -0,0 +1,483 @@
1
+ """Apply cleaning functions to data files."""
2
+
3
+ import csv
4
+ import importlib.util
5
+ import json
6
+ from pathlib import Path
7
+ from typing import Callable
8
+
9
+ from .parsers import MARKITDOWN_EXTENSIONS
10
+
11
+ # Text formats that should be converted to markdown (excludes spreadsheets)
12
+ TEXT_MARKITDOWN_EXTENSIONS = MARKITDOWN_EXTENSIONS - {".xlsx", ".xls", ".ods"}
13
+
14
+
15
def load_cleaning_module(functions_path: str):
    """
    Dynamically import a cleaning_functions.py file.

    Args:
        functions_path: Path to the cleaning functions file

    Returns:
        The imported module

    Raises:
        FileNotFoundError: If the functions file doesn't exist
        ImportError: If the module cannot be imported
    """
    module_path = Path(functions_path)
    if not module_path.exists():
        raise FileNotFoundError(f"Functions file not found: {functions_path}")

    # Build a spec for the file; a missing loader means it is not importable.
    spec = importlib.util.spec_from_file_location("cleaning_module", module_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load module from: {functions_path}")

    loaded = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(loaded)
    return loaded
40
+
41
+
42
def get_default_output_path(input_path: str, force_ext: str | None = None) -> str:
    """
    Generate default output path: input.cleaned.ext

    Args:
        input_path: Path to the input file
        force_ext: Override the output extension (e.g., ".xlsx" for .xls files)

    Returns:
        Path string for the output file
    """
    source = Path(input_path)
    lowered = source.suffix.lower()

    if force_ext:
        out_ext = force_ext
    elif lowered == ".xls":
        # Legacy .xls inputs are written back as .xlsx
        out_ext = ".xlsx"
    elif lowered == ".txt" or lowered in TEXT_MARKITDOWN_EXTENSIONS:
        # Text-like formats are emitted as markdown
        out_ext = ".md"
    else:
        out_ext = source.suffix

    return str(source.with_suffix(f".cleaned{out_ext}"))
68
+
69
+
70
+ def apply_to_jsonl(
71
+ input_path: str,
72
+ output_path: str,
73
+ clean_fn: Callable,
74
+ on_progress: Callable[[dict], None] | None = None,
75
+ ) -> int:
76
+ """
77
+ Stream JSONL: read line, clean, write line.
78
+
79
+ Args:
80
+ input_path: Path to input JSONL file
81
+ output_path: Path for output JSONL file
82
+ clean_fn: Cleaning function to apply to each record
83
+ on_progress: Optional progress callback
84
+
85
+ Returns:
86
+ Number of records processed
87
+ """
88
+ records_processed = 0
89
+
90
+ with open(input_path, "r", encoding="utf-8") as infile, \
91
+ open(output_path, "w", encoding="utf-8") as outfile:
92
+ for line in infile:
93
+ line = line.strip()
94
+ if not line:
95
+ continue
96
+
97
+ record = json.loads(line)
98
+ cleaned = clean_fn(record)
99
+ outfile.write(json.dumps(cleaned) + "\n")
100
+
101
+ records_processed += 1
102
+ if on_progress:
103
+ on_progress({"type": "apply_progress", "records_processed": records_processed})
104
+
105
+ return records_processed
106
+
107
+
108
+ def apply_to_csv(
109
+ input_path: str,
110
+ output_path: str,
111
+ clean_fn: Callable,
112
+ on_progress: Callable[[dict], None] | None = None,
113
+ ) -> int:
114
+ """
115
+ Stream CSV: DictReader to clean each row, DictWriter to output.
116
+
117
+ Args:
118
+ input_path: Path to input CSV file
119
+ output_path: Path for output CSV file
120
+ clean_fn: Cleaning function to apply to each record
121
+ on_progress: Optional progress callback
122
+
123
+ Returns:
124
+ Number of records processed
125
+ """
126
+ records_processed = 0
127
+
128
+ with open(input_path, "r", encoding="utf-8", newline="") as infile:
129
+ reader = csv.DictReader(infile)
130
+ fieldnames = reader.fieldnames
131
+
132
+ if not fieldnames:
133
+ return 0
134
+
135
+ with open(output_path, "w", encoding="utf-8", newline="") as outfile:
136
+ writer = csv.DictWriter(outfile, fieldnames=fieldnames)
137
+ writer.writeheader()
138
+
139
+ for row in reader:
140
+ cleaned = clean_fn(row)
141
+ writer.writerow(cleaned)
142
+
143
+ records_processed += 1
144
+ if on_progress:
145
+ on_progress({"type": "apply_progress", "records_processed": records_processed})
146
+
147
+ return records_processed
148
+
149
+
150
+ def apply_to_json(
151
+ input_path: str,
152
+ output_path: str,
153
+ clean_fn: Callable,
154
+ on_progress: Callable[[dict], None] | None = None,
155
+ ) -> int:
156
+ """
157
+ Batch JSON array: load all, clean each, write array.
158
+
159
+ Args:
160
+ input_path: Path to input JSON file
161
+ output_path: Path for output JSON file
162
+ clean_fn: Cleaning function to apply to each record
163
+ on_progress: Optional progress callback
164
+
165
+ Returns:
166
+ Number of records processed
167
+ """
168
+ with open(input_path, "r", encoding="utf-8") as f:
169
+ data = json.load(f)
170
+
171
+ if not isinstance(data, list):
172
+ # Single object - wrap, clean, unwrap
173
+ cleaned = clean_fn(data)
174
+ with open(output_path, "w", encoding="utf-8") as f:
175
+ json.dump(cleaned, f, indent=2)
176
+ if on_progress:
177
+ on_progress({"type": "apply_progress", "records_processed": 1})
178
+ return 1
179
+
180
+ cleaned_data = []
181
+ for i, record in enumerate(data):
182
+ cleaned = clean_fn(record)
183
+ cleaned_data.append(cleaned)
184
+
185
+ if on_progress:
186
+ on_progress({"type": "apply_progress", "records_processed": i + 1})
187
+
188
+ with open(output_path, "w", encoding="utf-8") as f:
189
+ json.dump(cleaned_data, f, indent=2)
190
+
191
+ return len(cleaned_data)
192
+
193
+
194
def apply_to_parquet(
    input_path: str,
    output_path: str,
    clean_fn: Callable,
    on_progress: Callable[[dict], None] | None = None,
) -> int:
    """
    Batch Parquet: load as list of dicts, clean each, write back.

    Args:
        input_path: Path to input Parquet file
        output_path: Path for output Parquet file
        clean_fn: Cleaning function to apply to each record
        on_progress: Optional progress callback

    Returns:
        Number of records processed

    Raises:
        ImportError: If pyarrow is not installed
    """
    try:
        import pyarrow as pa
        import pyarrow.parquet as pq
    except ImportError:
        raise ImportError(
            "pyarrow is required for parquet files. "
            "Install with: pip install recursive-cleaner[parquet]"
        )

    # Materialize the whole table as plain dicts before cleaning.
    rows = pq.read_table(input_path).to_pylist()

    cleaned_rows = []
    for idx, row in enumerate(rows, start=1):
        cleaned_rows.append(clean_fn(row))

        if on_progress is not None:
            on_progress({"type": "apply_progress", "records_processed": idx})

    # Round-trip back through Arrow to produce the output parquet file.
    pq.write_table(pa.Table.from_pylist(cleaned_rows), output_path)

    return len(cleaned_rows)
240
+
241
+
242
def apply_to_excel(
    input_path: str,
    output_path: str,
    clean_fn: Callable,
    on_progress: Callable[[dict], None] | None = None,
) -> int:
    """
    Batch Excel: load as list of dicts, clean each, write back.

    Args:
        input_path: Path to input Excel file (.xlsx or .xls)
        output_path: Path for output Excel file (.xlsx)
        clean_fn: Cleaning function to apply to each record
        on_progress: Optional progress callback

    Returns:
        Number of records processed

    Raises:
        ImportError: If openpyxl (or xlrd for .xls) is not installed
    """
    extension = Path(input_path).suffix.lower()

    if extension == ".xls":
        # Legacy .xls files need xlrd for reading.
        try:
            import xlrd
        except ImportError:
            raise ImportError(
                "xlrd is required for .xls files. "
                "Install with: pip install recursive-cleaner[excel]"
            )

        book = xlrd.open_workbook(input_path)
        first_sheet = book.sheet_by_index(0)

        if first_sheet.nrows < 1:
            return 0

        # Row 0 holds the column headers.
        headers = [str(first_sheet.cell_value(0, c)) for c in range(first_sheet.ncols)]
        records = [
            {col: first_sheet.cell_value(r, c) for c, col in enumerate(headers)}
            for r in range(1, first_sheet.nrows)
        ]
    else:
        # Modern .xlsx files are read with openpyxl.
        try:
            from openpyxl import load_workbook
        except ImportError:
            raise ImportError(
                "openpyxl is required for .xlsx files. "
                "Install with: pip install recursive-cleaner[excel]"
            )

        book = load_workbook(input_path, read_only=True)
        all_rows = list(book.active.iter_rows(values_only=True))
        if not all_rows:
            return 0

        # Row 0 holds the column headers; None headers become "".
        headers = [str(h) if h is not None else "" for h in all_rows[0]]
        records = []
        for data_row in all_rows[1:]:
            # Rows may be shorter than the header; pad with None.
            records.append({
                col: (data_row[c] if c < len(data_row) else None)
                for c, col in enumerate(headers)
            })

        book.close()

    # Clean each record, reporting progress as we go.
    cleaned_rows = []
    for idx, rec in enumerate(records, start=1):
        cleaned_rows.append(clean_fn(rec))

        if on_progress is not None:
            on_progress({"type": "apply_progress", "records_processed": idx})

    # Output is always written as .xlsx via openpyxl.
    try:
        from openpyxl import Workbook
    except ImportError:
        raise ImportError(
            "openpyxl is required for writing Excel files. "
            "Install with: pip install recursive-cleaner[excel]"
        )

    out_book = Workbook()
    out_sheet = out_book.active

    if cleaned_rows:
        # Header row comes from the first cleaned record's keys.
        columns = list(cleaned_rows[0].keys())
        out_sheet.append(columns)

        for rec in cleaned_rows:
            out_sheet.append([rec.get(col) for col in columns])

    out_book.save(output_path)

    return len(cleaned_rows)
351
+
352
+
353
+ def apply_to_text(
354
+ input_path: str,
355
+ output_path: str,
356
+ clean_fn: Callable,
357
+ on_progress: Callable[[dict], None] | None = None,
358
+ ) -> int:
359
+ """
360
+ Process text/document files: extract text, clean, write as markdown.
361
+
362
+ Args:
363
+ input_path: Path to input file (.txt or markitdown format)
364
+ output_path: Path for output markdown file
365
+ clean_fn: Cleaning function to apply to the text
366
+ on_progress: Optional progress callback
367
+
368
+ Returns:
369
+ Number of records processed (always 1 for text)
370
+
371
+ Raises:
372
+ ImportError: If markitdown is not installed (for non-.txt files)
373
+ """
374
+ suffix = Path(input_path).suffix.lower()
375
+
376
+ if suffix == ".txt":
377
+ # Plain text - read directly
378
+ with open(input_path, "r", encoding="utf-8") as f:
379
+ content = f.read()
380
+ else:
381
+ # Use markitdown for other formats
382
+ try:
383
+ from markitdown import MarkItDown
384
+ except ImportError:
385
+ raise ImportError(
386
+ "markitdown is required for this file type. "
387
+ "Install with: pip install recursive-cleaner[markitdown]"
388
+ )
389
+
390
+ md = MarkItDown()
391
+ result = md.convert(input_path)
392
+ content = result.text_content
393
+
394
+ # Clean the text content
395
+ cleaned = clean_fn(content)
396
+
397
+ # Write as markdown
398
+ with open(output_path, "w", encoding="utf-8") as f:
399
+ f.write(cleaned)
400
+
401
+ if on_progress:
402
+ on_progress({"type": "apply_progress", "records_processed": 1})
403
+
404
+ return 1
405
+
406
+
407
def apply_cleaning(
    input_path: str,
    functions_path: str,
    output_path: str | None = None,
    on_progress: Callable[[dict], None] | None = None,
) -> str:
    """
    Apply cleaning functions to a data file.

    Args:
        input_path: Path to input data file
        functions_path: Path to cleaning_functions.py
        output_path: Path for output file (default: input.cleaned.ext)
        on_progress: Optional progress callback

    Returns:
        Path to output file

    Raises:
        FileNotFoundError: If input or functions file not found
        ImportError: If functions file cannot be imported
        ValueError: If input format is unsupported
    """
    if not Path(input_path).exists():
        raise FileNotFoundError(f"Input file not found: {input_path}")

    # Import the generated module and pull out its entry point.
    module = load_cleaning_module(functions_path)
    if not hasattr(module, "clean_data"):
        raise ImportError(f"Functions file missing clean_data() function: {functions_path}")
    clean_fn = module.clean_data

    suffix = Path(input_path).suffix.lower()
    if output_path is None:
        output_path = get_default_output_path(input_path)

    # Dispatch table for the structured formats.
    dispatch = {
        ".jsonl": apply_to_jsonl,
        ".csv": apply_to_csv,
        ".json": apply_to_json,
        ".parquet": apply_to_parquet,
        ".xlsx": apply_to_excel,
        ".xls": apply_to_excel,
    }
    handler = dispatch.get(suffix)

    # Fall back to the text handler for .txt and markitdown formats
    # (spreadsheet extensions are excluded from that set).
    if handler is None and (suffix == ".txt" or suffix in TEXT_MARKITDOWN_EXTENSIONS):
        handler = apply_to_text

    if handler is None:
        raise ValueError(f"Unsupported format: {suffix}")

    # total_records is unknown up front for streaming formats.
    if on_progress is not None:
        on_progress({"type": "apply_start", "total_records": None})

    total = handler(input_path, output_path, clean_fn, on_progress)

    if on_progress is not None:
        on_progress({
            "type": "apply_complete",
            "total_records": total,
            "output_path": output_path,
        })

    return output_path
@@ -63,6 +63,7 @@ class DataCleaner:
63
63
  dry_run: bool = False,
64
64
  auto_parse: bool = False,
65
65
  tui: bool = False,
66
+ output_path: str = "cleaning_functions.py",
66
67
  ):
67
68
  self.backend = llm_backend
68
69
  self.file_path = file_path
@@ -88,6 +89,7 @@ class DataCleaner:
88
89
  self.dry_run = dry_run
89
90
  self.auto_parse = auto_parse
90
91
  self.tui = tui
92
+ self.output_path = output_path
91
93
  self.functions: list[dict] = [] # List of {name, docstring, code}
92
94
  self._tui_renderer = None # TUIRenderer instance when tui=True
93
95
  self._generated_parser: callable | None = None # LLM-generated parser for unknown formats
@@ -520,7 +522,7 @@ class DataCleaner:
520
522
  "quality_delta": 0.0, # Could be calculated from metrics
521
523
  "latency_total_ms": latency_summary.get("total_ms", 0.0),
522
524
  "llm_calls": latency_summary.get("call_count", 0),
523
- "output_file": "cleaning_functions.py",
525
+ "output_file": self.output_path,
524
526
  })
525
527
  self._tui_renderer.stop()
526
528
 
@@ -687,11 +689,11 @@ class DataCleaner:
687
689
  self._emit("chunk_done", chunk_index=chunk_idx)
688
690
 
689
691
  def _write_output(self) -> None:
690
- """Write generated functions to cleaning_functions.py."""
692
+ """Write generated functions to output file."""
691
693
  from .output import write_cleaning_file
692
694
 
693
695
  try:
694
- write_cleaning_file(self.functions)
696
+ write_cleaning_file(self.functions, self.output_path)
695
697
  except OutputValidationError as e:
696
698
  if not self.tui:
697
699
  print(f" Error: {e}")
@@ -707,7 +709,7 @@ class DataCleaner:
707
709
  if not self.tui:
708
710
  print(f" Skipping invalid function: {f['name']}")
709
711
  if valid_functions:
710
- write_cleaning_file(valid_functions)
712
+ write_cleaning_file(valid_functions, self.output_path)
711
713
  elif not self.tui:
712
714
  print(" No valid functions to write.")
713
715
 
@@ -0,0 +1,395 @@
1
+ """CLI interface for Recursive Data Cleaner."""
2
+
3
+ import argparse
4
+ import os
5
+ import sys
6
+
7
+
8
+ def create_backend(provider: str, model: str, base_url: str | None, api_key: str | None):
9
+ """
10
+ Factory function to create the appropriate backend.
11
+
12
+ Args:
13
+ provider: Backend provider ("mlx" or "openai")
14
+ model: Model name/path
15
+ base_url: Optional API base URL (for openai-compatible servers)
16
+ api_key: Optional API key
17
+
18
+ Returns:
19
+ LLMBackend instance
20
+
21
+ Raises:
22
+ SystemExit: With code 2 if provider is invalid or import fails
23
+ """
24
+ if provider == "mlx":
25
+ try:
26
+ from backends import MLXBackend
27
+ return MLXBackend(model_path=model)
28
+ except ImportError:
29
+ print("Error: MLX backend requires mlx-lm. Install with: pip install mlx-lm", file=sys.stderr)
30
+ sys.exit(2)
31
+ elif provider == "openai":
32
+ try:
33
+ from backends import OpenAIBackend
34
+ return OpenAIBackend(model=model, api_key=api_key, base_url=base_url)
35
+ except ImportError as e:
36
+ print(f"Error: {e}", file=sys.stderr)
37
+ sys.exit(2)
38
+ else:
39
+ print(f"Error: Unknown provider '{provider}'. Use 'mlx' or 'openai'.", file=sys.stderr)
40
+ sys.exit(2)
41
+
42
+
43
def read_instructions(value: str) -> str:
    """
    Read instructions from inline text or file.

    Args:
        value: Instructions string, @file.txt path, or "-" for stdin

    Returns:
        Instructions text
    """
    if value.startswith("@"):
        # "@path" means: read the instructions from that file.
        file_path = value[1:]
        try:
            with open(file_path, "r") as handle:
                return handle.read().strip()
        except FileNotFoundError:
            print(f"Error: Instructions file not found: {file_path}", file=sys.stderr)
            sys.exit(1)
        except IOError as e:
            print(f"Error reading instructions file: {e}", file=sys.stderr)
            sys.exit(1)

    if value == "-":
        # "-" means: read the instructions from stdin.
        return sys.stdin.read().strip()

    return value
67
+
68
+
69
def cmd_generate(args) -> int:
    """Handle the generate command.

    Returns 0 on success, 1 for missing-file errors, 3 for anything else.
    """
    from recursive_cleaner import DataCleaner

    # Bail out early when the input file is missing.
    if not os.path.exists(args.file):
        print(f"Error: File not found: {args.file}", file=sys.stderr)
        return 1

    backend = create_backend(args.provider, args.model, args.base_url, args.api_key)
    instructions = read_instructions(args.instructions) if args.instructions else ""

    def on_progress(event):
        # Plain-text progress line for non-TUI runs.
        if not args.tui and event.get("type", "") == "function_generated":
            print(f" Generated: {event.get('function_name', '')}")

    try:
        pipeline = DataCleaner(
            llm_backend=backend,
            file_path=args.file,
            chunk_size=args.chunk_size,
            instructions=instructions,
            max_iterations=args.max_iterations,
            mode=args.mode,
            state_file=args.state_file,
            report_path=args.report if args.report else None,
            tui=args.tui,
            optimize=args.optimize,
            track_metrics=args.track_metrics,
            early_termination=args.early_termination,
            on_progress=on_progress if not args.tui else None,
            output_path=args.output,
        )
        pipeline.run()
        return 0
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 3
116
+
117
+
118
def cmd_analyze(args) -> int:
    """Handle the analyze command (dry-run mode).

    Returns 0 on success, 1 for missing-file errors, 3 for anything else.
    """
    from recursive_cleaner import DataCleaner

    # Bail out early when the input file is missing.
    if not os.path.exists(args.file):
        print(f"Error: File not found: {args.file}", file=sys.stderr)
        return 1

    backend = create_backend(args.provider, args.model, args.base_url, args.api_key)
    instructions = read_instructions(args.instructions) if args.instructions else ""

    def on_progress(event):
        # Summarize detected issues per chunk for non-TUI runs.
        if not args.tui and event.get("type", "") == "issues_detected":
            found = event.get("issues", [])
            index = event.get("chunk_index", 0)
            open_count = sum(1 for item in found if not item.get("solved", False))
            print(f"Chunk {index + 1}: {len(found)} issues ({open_count} unsolved)")

    try:
        pipeline = DataCleaner(
            llm_backend=backend,
            file_path=args.file,
            chunk_size=args.chunk_size,
            instructions=instructions,
            max_iterations=args.max_iterations,
            mode=args.mode,
            dry_run=True,
            tui=args.tui,
            on_progress=on_progress if not args.tui else None,
        )
        pipeline.run()
        return 0
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 3
163
+
164
+
165
def cmd_apply(args) -> int:
    """Handle the apply command.

    Returns 0 on success, 1 for missing files, 2 for import
    problems, 3 for any other failure.
    """
    from recursive_cleaner.apply import apply_cleaning

    # Both the data file and the generated functions file must exist.
    for label, candidate in (("File", args.file), ("Functions file", args.functions)):
        if not os.path.exists(candidate):
            print(f"Error: {label} not found: {candidate}", file=sys.stderr)
            return 1

    try:
        destination = apply_cleaning(
            input_path=args.file,
            functions_path=args.functions,
            output_path=args.output,
        )
        print(f"Cleaned data written to: {destination}")
        return 0
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except ImportError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 2
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 3
+ return 3
196
+
197
+
198
def cmd_resume(args) -> int:
    """Handle the resume command.

    Returns 0 on success, 1 for missing/invalid state file,
    3 for any other failure.
    """
    from recursive_cleaner import DataCleaner

    # The checkpoint file must exist before we build a backend.
    if not os.path.exists(args.state_file):
        print(f"Error: State file not found: {args.state_file}", file=sys.stderr)
        return 1

    backend = create_backend(args.provider, args.model, args.base_url, args.api_key)

    try:
        restored = DataCleaner.resume(args.state_file, backend)
        restored.run()
        return 0
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except ValueError as e:
        print(f"Error: Invalid state file: {e}", file=sys.stderr)
        return 1
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 3
223
+
224
+
225
def create_parser() -> argparse.ArgumentParser:
    """Create the argument parser with all subcommands."""

    def add_backend_options(sub, *, with_instructions: bool) -> None:
        # Options shared by every command that talks to an LLM backend.
        sub.add_argument(
            "-p", "--provider", required=True, choices=["mlx", "openai"],
            help="LLM provider (mlx or openai)"
        )
        sub.add_argument(
            "-m", "--model", required=True, help="Model name/path"
        )
        if with_instructions:
            sub.add_argument(
                "-i", "--instructions", default="",
                help="Cleaning instructions (text or @file.txt)"
            )
        sub.add_argument(
            "--base-url", help="API base URL (for openai-compatible servers)"
        )
        sub.add_argument(
            "--api-key", help="API key (or use OPENAI_API_KEY env var)"
        )

    def add_chunking_options(sub) -> None:
        # Options controlling chunked processing, shared by generate/analyze.
        sub.add_argument(
            "--chunk-size", type=int, default=50, help="Items per chunk (default: 50)"
        )
        sub.add_argument(
            "--max-iterations", type=int, default=5,
            help="Max iterations per chunk (default: 5)"
        )
        sub.add_argument(
            "--mode", choices=["auto", "structured", "text"], default="auto",
            help="Processing mode (default: auto)"
        )

    parser = argparse.ArgumentParser(
        prog="recursive-cleaner",
        description="LLM-powered incremental data cleaning pipeline",
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # --- generate command ---
    gen = subparsers.add_parser(
        "generate",
        help="Generate cleaning functions from data file",
    )
    gen.add_argument("file", metavar="FILE", help="Path to input data file")
    add_backend_options(gen, with_instructions=True)
    add_chunking_options(gen)
    gen.add_argument(
        "-o", "--output", default="cleaning_functions.py",
        help="Output file path (default: cleaning_functions.py)"
    )
    gen.add_argument(
        "--report", default="cleaning_report.md",
        help="Report file path (empty to disable, default: cleaning_report.md)"
    )
    gen.add_argument("--state-file", help="Checkpoint file for resume")
    gen.add_argument("--tui", action="store_true", help="Enable Rich terminal dashboard")
    gen.add_argument("--optimize", action="store_true", help="Consolidate redundant functions")
    gen.add_argument("--track-metrics", action="store_true", help="Measure before/after quality")
    gen.add_argument(
        "--early-termination", action="store_true",
        help="Stop on pattern saturation"
    )
    gen.set_defaults(func=cmd_generate)

    # --- analyze command ---
    analyze = subparsers.add_parser(
        "analyze",
        help="Dry-run analysis without generating functions",
    )
    analyze.add_argument("file", metavar="FILE", help="Path to input data file")
    add_backend_options(analyze, with_instructions=True)
    add_chunking_options(analyze)
    analyze.add_argument("--tui", action="store_true", help="Enable Rich terminal dashboard")
    analyze.set_defaults(func=cmd_analyze)

    # --- resume command ---
    resume = subparsers.add_parser(
        "resume",
        help="Resume from checkpoint file",
    )
    resume.add_argument(
        "state_file", metavar="STATE_FILE", help="Path to checkpoint JSON file"
    )
    add_backend_options(resume, with_instructions=False)
    resume.set_defaults(func=cmd_resume)

    # --- apply command ---
    apply_cmd = subparsers.add_parser(
        "apply",
        help="Apply cleaning functions to data file",
    )
    apply_cmd.add_argument("file", metavar="FILE", help="Path to input data file")
    apply_cmd.add_argument(
        "-f", "--functions", required=True,
        help="Path to cleaning_functions.py"
    )
    apply_cmd.add_argument(
        "-o", "--output", help="Output file path (default: <input>.cleaned.<ext>)"
    )
    apply_cmd.set_defaults(func=cmd_apply)

    return parser
372
+
373
+
374
def main(args: list[str] | None = None) -> int:
    """
    Main entry point for the CLI.

    Args:
        args: Command-line arguments (defaults to sys.argv[1:])

    Returns:
        Exit code (0=success, 1=general error, 2=backend error, 3=validation error)
    """
    parser = create_parser()
    namespace = parser.parse_args(args)

    # No subcommand given: show usage and exit successfully.
    if namespace.command is None:
        parser.print_help()
        return 0

    # Dispatch to the handler installed via set_defaults(func=...).
    return namespace.func(namespace)
392
+
393
+
394
+ if __name__ == "__main__":
395
+ sys.exit(main())
recursive_cleaner/tui.py CHANGED
@@ -505,19 +505,28 @@ class TUIRenderer:
505
505
  )
506
506
  self._layout["left_panel"].update(left_panel)
507
507
 
508
- def _parse_response_for_display(self, response: str) -> str:
509
- """Parse LLM XML response into readable format for transmission log.
508
+ def _colorize_transmission(self, response: str) -> "Text":
509
+ """Parse LLM XML response into colorized Rich Text for transmission log.
510
+
511
+ Color scheme:
512
+ - Issues (solved): dim
513
+ - Issues (unsolved): bright_white with cycling accent (blue/magenta/cyan/yellow)
514
+ - Function names: green
515
+ - Docstrings: italic
516
+ - Status clean: green
517
+ - Status needs_more_work: yellow
510
518
 
511
519
  Args:
512
520
  response: Raw LLM response text (XML format)
513
521
 
514
522
  Returns:
515
- Formatted string for display showing issues, function being
516
- generated, and chunk status.
523
+ Rich Text object with colors applied.
517
524
  """
518
525
  import re
519
526
 
520
- lines = []
527
+ ISSUE_COLORS = ["blue", "magenta", "cyan", "yellow"]
528
+ text = Text()
529
+ unsolved_index = 0
521
530
 
522
531
  try:
523
532
  # Find all issues
@@ -525,53 +534,63 @@ class TUIRenderer:
525
534
  issues = re.findall(issue_pattern, response, re.DOTALL)
526
535
 
527
536
  if issues:
528
- lines.append("ISSUES DETECTED:")
537
+ text.append("ISSUES DETECTED:\n", style="bold cyan")
529
538
  for issue_id, solved, desc in issues[:8]: # Limit to 8 issues
530
- marker = "\u2713" if solved == "true" else "\u2717" # checkmark or X
531
539
  desc_clean = desc.strip()[:40] # Truncate description
532
- lines.append(f" {marker} {desc_clean}")
540
+ if solved == "true":
541
+ text.append(" \u2713 ", style="green")
542
+ text.append(f"{desc_clean}\n", style="dim")
543
+ else:
544
+ accent = ISSUE_COLORS[unsolved_index % len(ISSUE_COLORS)]
545
+ text.append(" \u2717 ", style=accent)
546
+ text.append(f"{desc_clean}\n", style="bright_white")
547
+ unsolved_index += 1
533
548
  if len(issues) > 8:
534
- lines.append(f" (+{len(issues) - 8} more)")
535
- lines.append("")
549
+ text.append(f" (+{len(issues) - 8} more)\n", style="dim")
550
+ text.append("\n")
536
551
 
537
552
  # Find function being generated
538
553
  name_match = re.search(r'<name>([^<]+)</name>', response)
539
554
  docstring_match = re.search(r'<docstring>([^<]+)</docstring>', response, re.DOTALL)
540
555
 
541
556
  if name_match:
542
- lines.append(f"GENERATING: {name_match.group(1).strip()}")
557
+ text.append("GENERATING: ", style="bold cyan")
558
+ text.append(f"{name_match.group(1).strip()}\n", style="green bold")
543
559
  if docstring_match:
544
560
  doc = docstring_match.group(1).strip()[:60]
545
- lines.append(f' "{doc}..."')
546
- lines.append("")
561
+ text.append(f' "{doc}..."\n', style="italic")
562
+ text.append("\n")
547
563
 
548
564
  # Find chunk status
549
565
  status_match = re.search(r'<chunk_status>([^<]+)</chunk_status>', response)
550
566
  if status_match:
551
567
  status = status_match.group(1).strip()
552
- lines.append(f"STATUS: {status.upper()}")
553
-
554
- if lines:
555
- return "\n".join(lines)
568
+ text.append("STATUS: ", style="bold cyan")
569
+ if status == "clean":
570
+ text.append(status.upper(), style="green bold")
571
+ else:
572
+ text.append(status.upper().replace("_", " "), style="yellow bold")
573
+
574
+ if text.plain:
575
+ return text
556
576
  except Exception:
557
577
  pass
558
578
 
559
579
  # Fallback: show truncated raw response
560
- return response[:500] + "..." if len(response) > 500 else response
580
+ fallback = response[:500] + "..." if len(response) > 500 else response
581
+ return Text(fallback, style="dim cyan")
561
582
 
562
583
  def _refresh_right_panel(self) -> None:
563
- """Refresh the right panel with parsed transmission log."""
584
+ """Refresh the right panel with colorized transmission log."""
564
585
  if not HAS_RICH or self._layout is None:
565
586
  return
566
587
 
567
- # Get last response and parse for display
588
+ # Get last response and colorize for display
568
589
  response = self._state.last_response
569
590
  if not response:
570
- display_text = "(Awaiting transmission...)"
591
+ log_text = Text("(Awaiting transmission...)", style="dim cyan")
571
592
  else:
572
- display_text = self._parse_response_for_display(response)
573
-
574
- log_text = Text(display_text, style="dim cyan")
593
+ log_text = self._colorize_transmission(response)
575
594
 
576
595
  right_panel = Panel(
577
596
  log_text,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: recursive-cleaner
3
- Version: 0.8.0
3
+ Version: 1.0.0
4
4
  Summary: LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions
5
5
  Project-URL: Homepage, https://github.com/gaztrabisme/recursive-data-cleaner
6
6
  Project-URL: Repository, https://github.com/gaztrabisme/recursive-data-cleaner
@@ -9,7 +9,7 @@ Author: Gary Tran
9
9
  License-Expression: MIT
10
10
  License-File: LICENSE
11
11
  Keywords: automation,data-cleaning,data-quality,etl,llm,machine-learning
12
- Classifier: Development Status :: 4 - Beta
12
+ Classifier: Development Status :: 5 - Production/Stable
13
13
  Classifier: Intended Audience :: Developers
14
14
  Classifier: Intended Audience :: Science/Research
15
15
  Classifier: License :: OSI Approved :: MIT License
@@ -26,10 +26,15 @@ Requires-Dist: tenacity>=8.0
26
26
  Provides-Extra: dev
27
27
  Requires-Dist: pytest-cov>=4.0; extra == 'dev'
28
28
  Requires-Dist: pytest>=7.0; extra == 'dev'
29
+ Provides-Extra: excel
30
+ Requires-Dist: openpyxl>=3.0.0; extra == 'excel'
31
+ Requires-Dist: xlrd>=2.0.0; extra == 'excel'
29
32
  Provides-Extra: markitdown
30
33
  Requires-Dist: markitdown>=0.1.0; extra == 'markitdown'
31
34
  Provides-Extra: mlx
32
35
  Requires-Dist: mlx-lm>=0.10.0; extra == 'mlx'
36
+ Provides-Extra: openai
37
+ Requires-Dist: openai>=1.0.0; extra == 'openai'
33
38
  Provides-Extra: parquet
34
39
  Requires-Dist: pyarrow>=14.0.0; extra == 'parquet'
35
40
  Provides-Extra: tui
@@ -140,6 +145,91 @@ cleaner.run() # Generates cleaning_functions.py
140
145
  - **Token Estimation**: Track estimated input/output tokens across the run
141
146
  - **Graceful Fallback**: Works without Rich installed (falls back to callbacks)
142
147
 
148
+ ### CLI (v0.9.0)
149
+ - **Command Line Interface**: Use without writing Python code
150
+ - **Multiple Backends**: MLX (Apple Silicon) and OpenAI-compatible (OpenAI, LM Studio, Ollama)
151
+ - **Four Commands**: `generate`, `analyze` (dry-run), `resume`, `apply`
152
+
153
+ ### Apply Mode (v1.0.0)
154
+ - **Apply Cleaning Functions**: Apply generated functions to full datasets
155
+ - **Data Formats**: JSONL, CSV, JSON, Parquet, Excel (.xlsx/.xls) — output is written in the same format as the input
156
+ - **Text Formats**: PDF, Word, HTML, etc. — output is written as Markdown
157
+ - **Streaming**: Memory-efficient line-by-line processing for JSONL/CSV
158
+ - **Colored TUI**: Enhanced transmission log with syntax-highlighted XML parsing
159
+
160
+ ## Command Line Interface
161
+
162
+ After installation, the `recursive-cleaner` command is available:
163
+
164
+ ```bash
165
+ # Generate cleaning functions with MLX (Apple Silicon)
166
+ recursive-cleaner generate data.jsonl \
167
+ --provider mlx \
168
+ --model "lmstudio-community/Qwen3-80B-MLX-4bit" \
169
+ --instructions "Normalize phone numbers to E.164" \
170
+ --output cleaning_functions.py
171
+
172
+ # Use OpenAI
173
+ export OPENAI_API_KEY=your-key
174
+ recursive-cleaner generate data.jsonl \
175
+ --provider openai \
176
+ --model gpt-4o \
177
+ --instructions "Fix date formats"
178
+
179
+ # Use LM Studio or Ollama (OpenAI-compatible)
180
+ recursive-cleaner generate data.jsonl \
181
+ --provider openai \
182
+ --model "qwen/qwen3-vl-30b" \
183
+ --base-url http://localhost:1234/v1 \
184
+ --instructions "Normalize prices"
185
+
186
+ # Dry-run analysis
187
+ recursive-cleaner analyze data.jsonl \
188
+ --provider openai \
189
+ --model gpt-4o \
190
+ --instructions @instructions.txt
191
+
192
+ # Resume from checkpoint
193
+ recursive-cleaner resume cleaning_state.json \
194
+ --provider mlx \
195
+ --model "model-path"
196
+
197
+ # Apply cleaning functions to data
198
+ recursive-cleaner apply data.jsonl \
199
+ --functions cleaning_functions.py \
200
+ --output cleaned_data.jsonl
201
+
202
+ # Apply to Excel (outputs same format)
203
+ recursive-cleaner apply sales.xlsx \
204
+ --functions cleaning_functions.py
205
+
206
+ # Apply to PDF (outputs markdown)
207
+ recursive-cleaner apply document.pdf \
208
+ --functions cleaning_functions.py \
209
+ --output cleaned.md
210
+ ```
211
+
212
+ ### CLI Options
213
+
214
+ ```
215
+ recursive-cleaner generate <FILE> [OPTIONS]
216
+
217
+ Required:
218
+ FILE Input data file
219
+ -p, --provider {mlx,openai} LLM provider
220
+ -m, --model MODEL Model name/path
221
+
222
+ Optional:
223
+ -i, --instructions TEXT Cleaning instructions (or @file.txt)
224
+ --base-url URL API URL for OpenAI-compatible servers
225
+ --chunk-size N Items per chunk (default: 50)
226
+ --max-iterations N Max iterations per chunk (default: 5)
227
+ -o, --output PATH Output file (default: cleaning_functions.py)
228
+ --tui Enable Rich dashboard
229
+ --optimize Consolidate redundant functions
230
+ --track-metrics Measure before/after quality
231
+ ```
232
+
143
233
  ## Configuration
144
234
 
145
235
  ```python
@@ -270,6 +360,7 @@ cleaner.run()
270
360
 
271
361
  ```
272
362
  recursive_cleaner/
363
+ ├── cli.py # Command line interface
273
364
  ├── cleaner.py # Main DataCleaner class
274
365
  ├── context.py # Docstring registry with FIFO eviction
275
366
  ├── dependencies.py # Topological sort for function ordering
@@ -286,6 +377,10 @@ recursive_cleaner/
286
377
  ├── validation.py # Runtime validation + holdout
287
378
  └── vendor/
288
379
  └── chunker.py # Vendored sentence-aware chunker
380
+
381
+ backends/
382
+ ├── mlx_backend.py # MLX-LM backend for Apple Silicon
383
+ └── openai_backend.py # OpenAI-compatible backend
289
384
  ```
290
385
 
291
386
  ## Testing
@@ -294,14 +389,14 @@ recursive_cleaner/
294
389
  pytest tests/ -v
295
390
  ```
296
391
 
297
- 465 tests covering all features. Test datasets in `test_cases/`:
392
+ 548 tests covering all features. Test datasets in `test_cases/`:
298
393
  - E-commerce product catalogs
299
394
  - Healthcare patient records
300
395
  - Financial transaction data
301
396
 
302
397
  ## Philosophy
303
398
 
304
- - **Simplicity over extensibility**: ~3,000 lines that do one thing well
399
+ - **Simplicity over extensibility**: ~5,000 lines that do one thing well
305
400
  - **stdlib over dependencies**: Only `tenacity` required
306
401
  - **Retry over recover**: On error, retry with error in prompt
307
402
  - **Wu wei**: Let the LLM make decisions about data it understands
@@ -310,6 +405,7 @@ pytest tests/ -v
310
405
 
311
406
  | Version | Features |
312
407
  |---------|----------|
408
+ | v0.9.0 | CLI tool with MLX and OpenAI-compatible backends (LM Studio, Ollama) |
313
409
  | v0.8.0 | Terminal UI with Rich dashboard, mission control aesthetic, transmission log |
314
410
  | v0.7.0 | Markitdown (20+ formats), Parquet support, LLM-generated parsers |
315
411
  | v0.6.0 | Latency metrics, import consolidation, cleaning report, dry-run mode |
@@ -1,7 +1,11 @@
1
- backends/__init__.py,sha256=FUgODeYSGBvT0-z6myVby6YeAHG0nEUgWLITBKobUew,121
1
+ backends/__init__.py,sha256=vWcPASV0GGEAydzOSjdrknkSHoGbSs4edtuv9HIzBhI,180
2
2
  backends/mlx_backend.py,sha256=0U6IqmDHyk4vjKzytvEcQvSUBryQTgFtsNOcpwFNKk8,2945
3
- recursive_cleaner/__init__.py,sha256=v0bNQ3H0d7n6cTOkuxuqG9bmnX9yeZBLZ_AfFM7edHI,1789
4
- recursive_cleaner/cleaner.py,sha256=vZTMwaLlCmuh1qy3c-puEZrwS5gXt0u28d5iweQXbms,29801
3
+ backends/openai_backend.py,sha256=vKWsXKltBv_tJDoQfQ_7KVMZDfomhFFN2vl1oZ1KGbQ,2057
4
+ recursive_cleaner/__init__.py,sha256=xCFlkqmmBoa7ntUZQnRQxVMv9iLeOvmboDS_j2EHfZI,1862
5
+ recursive_cleaner/__main__.py,sha256=WXmMaL_myHPsG_qXAhZDufD43Ydsd25RV2IPeW2Kg08,152
6
+ recursive_cleaner/apply.py,sha256=hjeljhZNiOuwz9m09RYVLl_z_9tet7LwubH6cb_Wy6Y,13855
7
+ recursive_cleaner/cleaner.py,sha256=kPOQ44hgiJzABiqdmjg2hqd7Ot9uxKUSOe8_jz0UBQc,29911
8
+ recursive_cleaner/cli.py,sha256=Sk_qYKxSn1PiPmMLKkyj9VxsseHaSXmSlGazxfmkTFc,12807
5
9
  recursive_cleaner/context.py,sha256=avMXRDxLd7nd8CKWtvPHQy1MFhBKiA0aUVVJIlWoLZ4,824
6
10
  recursive_cleaner/dependencies.py,sha256=vlYeoGL517v3yUSWN0wYDuIs9OOuQwM_dCBADrlitW8,2080
7
11
  recursive_cleaner/errors.py,sha256=hwRJF8NSmWy_FZHCxcZDZxLQ0zqvo5dX8ImkB9mrOYc,433
@@ -14,12 +18,13 @@ recursive_cleaner/prompt.py,sha256=ep0eOXz_XbhH3HduJ76LvzVSftonhcv4GLEecIqd3lY,6
14
18
  recursive_cleaner/report.py,sha256=AWWneRjvl76ccLlExdkKJeY3GVFUG_LtmzVIJJT5cFI,4629
15
19
  recursive_cleaner/response.py,sha256=3w0mLnqEPdB4daMSF0mtTcG0PTP-utb1HFtKuYA1ljw,9064
16
20
  recursive_cleaner/schema.py,sha256=w2hcEdApR15KVI9SFWB3VfumMoHFwn1YJrktdfgPo8M,3925
17
- recursive_cleaner/tui.py,sha256=FwG_uCwqUcvch5dRZmV-ba2JXD0XJkm9roXzPQ9iUSo,21633
21
+ recursive_cleaner/tui.py,sha256=zuiFPtMh3K-sC1CWZoaoUmgZ3rESkl10gYcqMzpVqiM,22598
18
22
  recursive_cleaner/types.py,sha256=-GdCmsfHd3rfdfCi5c-RXqX4TyuCSHgA__3AF3bMhoQ,290
19
23
  recursive_cleaner/validation.py,sha256=-KAolhw3GQyhHwmh0clEj8xqPD5O-R2AO5rx7vubIME,6442
20
24
  recursive_cleaner/vendor/__init__.py,sha256=E87TjmjRzu8ty39nqThvBwM611yXlLKQZ6KGY_zp3Dk,117
21
25
  recursive_cleaner/vendor/chunker.py,sha256=pDDbfF6FoSmUji0-RG4MletPxJ-VybGw0yfnhh0aMSQ,6730
22
- recursive_cleaner-0.8.0.dist-info/METADATA,sha256=rVABzjvUZ-uzk35o5evbIlkRIbgEb29QPKSCoMI4_fs,11072
23
- recursive_cleaner-0.8.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
24
- recursive_cleaner-0.8.0.dist-info/licenses/LICENSE,sha256=P8hRMK-UqRbQDsVN9nr901wpZcqwXEHr28DXhBUheF0,1064
25
- recursive_cleaner-0.8.0.dist-info/RECORD,,
26
+ recursive_cleaner-1.0.0.dist-info/METADATA,sha256=L86ATNd8JxmPp32HKaO6PPwkmq4sIE3Mdvgx3pmUulE,14285
27
+ recursive_cleaner-1.0.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
28
+ recursive_cleaner-1.0.0.dist-info/entry_points.txt,sha256=S5nbi0rnifpShxdXGExeZnd65UZfp8K7DNyuKPST6nk,65
29
+ recursive_cleaner-1.0.0.dist-info/licenses/LICENSE,sha256=P8hRMK-UqRbQDsVN9nr901wpZcqwXEHr28DXhBUheF0,1064
30
+ recursive_cleaner-1.0.0.dist-info/RECORD,,
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ recursive-cleaner = recursive_cleaner.cli:main