recursive-cleaner 0.8.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,395 @@
1
+ """CLI interface for Recursive Data Cleaner."""
2
+
3
+ import argparse
4
+ import os
5
+ import sys
6
+
7
+
8
+ def create_backend(provider: str, model: str, base_url: str | None, api_key: str | None):
9
+ """
10
+ Factory function to create the appropriate backend.
11
+
12
+ Args:
13
+ provider: Backend provider ("mlx" or "openai")
14
+ model: Model name/path
15
+ base_url: Optional API base URL (for openai-compatible servers)
16
+ api_key: Optional API key
17
+
18
+ Returns:
19
+ LLMBackend instance
20
+
21
+ Raises:
22
+ SystemExit: With code 2 if provider is invalid or import fails
23
+ """
24
+ if provider == "mlx":
25
+ try:
26
+ from backends import MLXBackend
27
+ return MLXBackend(model_path=model)
28
+ except ImportError:
29
+ print("Error: MLX backend requires mlx-lm. Install with: pip install mlx-lm", file=sys.stderr)
30
+ sys.exit(2)
31
+ elif provider == "openai":
32
+ try:
33
+ from backends import OpenAIBackend
34
+ return OpenAIBackend(model=model, api_key=api_key, base_url=base_url)
35
+ except ImportError as e:
36
+ print(f"Error: {e}", file=sys.stderr)
37
+ sys.exit(2)
38
+ else:
39
+ print(f"Error: Unknown provider '{provider}'. Use 'mlx' or 'openai'.", file=sys.stderr)
40
+ sys.exit(2)
41
+
42
+
43
def read_instructions(value: str) -> str:
    """
    Resolve the instructions argument to its text content.

    Three forms are supported: "-" reads from stdin, "@path" loads a
    file, and anything else is returned as inline text.

    Args:
        value: Instructions string or @file.txt path

    Returns:
        Instructions text
    """
    if value == "-":
        return sys.stdin.read().strip()
    if not value.startswith("@"):
        return value

    # "@path" form: load the instructions from the named file.
    file_path = value[1:]
    try:
        with open(file_path, "r") as f:
            return f.read().strip()
    except FileNotFoundError:
        print(f"Error: Instructions file not found: {file_path}", file=sys.stderr)
        sys.exit(1)
    except IOError as e:
        print(f"Error reading instructions file: {e}", file=sys.stderr)
        sys.exit(1)
67
+
68
+
69
def cmd_generate(args) -> int:
    """Run the generate command: produce cleaning functions for a data file."""
    from recursive_cleaner import DataCleaner

    # Fail fast on a missing input file before constructing the backend.
    if not os.path.exists(args.file):
        print(f"Error: File not found: {args.file}", file=sys.stderr)
        return 1

    llm = create_backend(args.provider, args.model, args.base_url, args.api_key)
    cleaning_instructions = read_instructions(args.instructions) if args.instructions else ""

    # Plain-text progress reporting, used only when the Rich TUI is off.
    def on_progress(event):
        if not args.tui:
            if event.get("type", "") == "function_generated":
                print(f" Generated: {event.get('function_name', '')}")

    try:
        cleaner = DataCleaner(
            llm_backend=llm,
            file_path=args.file,
            chunk_size=args.chunk_size,
            instructions=cleaning_instructions,
            max_iterations=args.max_iterations,
            mode=args.mode,
            state_file=args.state_file,
            report_path=args.report if args.report else None,
            tui=args.tui,
            optimize=args.optimize,
            track_metrics=args.track_metrics,
            early_termination=args.early_termination,
            on_progress=None if args.tui else on_progress,
            output_path=args.output,
        )
        cleaner.run()
        return 0
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 3
116
+
117
+
118
def cmd_analyze(args) -> int:
    """Run the analyze command: dry-run issue detection without generating functions."""
    from recursive_cleaner import DataCleaner

    # Fail fast on a missing input file before constructing the backend.
    if not os.path.exists(args.file):
        print(f"Error: File not found: {args.file}", file=sys.stderr)
        return 1

    llm = create_backend(args.provider, args.model, args.base_url, args.api_key)
    cleaning_instructions = read_instructions(args.instructions) if args.instructions else ""

    # Per-chunk issue summary, printed only in plain (non-TUI) mode.
    def on_progress(event):
        if args.tui:
            return
        if event.get("type", "") != "issues_detected":
            return
        issues = event.get("issues", [])
        chunk_idx = event.get("chunk_index", 0)
        unsolved = [i for i in issues if not i.get("solved", False)]
        print(f"Chunk {chunk_idx + 1}: {len(issues)} issues ({len(unsolved)} unsolved)")

    try:
        cleaner = DataCleaner(
            llm_backend=llm,
            file_path=args.file,
            chunk_size=args.chunk_size,
            instructions=cleaning_instructions,
            max_iterations=args.max_iterations,
            mode=args.mode,
            dry_run=True,
            tui=args.tui,
            on_progress=None if args.tui else on_progress,
        )
        cleaner.run()
        return 0
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 3
163
+
164
+
165
def cmd_apply(args) -> int:
    """Run the apply command: execute generated cleaning functions on a data file."""
    from recursive_cleaner.apply import apply_cleaning

    # Both the input data and the functions module must exist up front.
    if not os.path.exists(args.file):
        print(f"Error: File not found: {args.file}", file=sys.stderr)
        return 1
    if not os.path.exists(args.functions):
        print(f"Error: Functions file not found: {args.functions}", file=sys.stderr)
        return 1

    try:
        result_path = apply_cleaning(
            input_path=args.file,
            functions_path=args.functions,
            output_path=args.output,
        )
        print(f"Cleaned data written to: {result_path}")
        return 0
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except ImportError as e:
        # Exit code 2 signals a backend/import problem, matching create_backend.
        print(f"Error: {e}", file=sys.stderr)
        return 2
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 3
196
+
197
+
198
def cmd_resume(args) -> int:
    """Run the resume command: continue a previous run from its checkpoint file."""
    from recursive_cleaner import DataCleaner

    # The checkpoint must exist before we spend time building a backend.
    if not os.path.exists(args.state_file):
        print(f"Error: State file not found: {args.state_file}", file=sys.stderr)
        return 1

    llm = create_backend(args.provider, args.model, args.base_url, args.api_key)

    try:
        restored = DataCleaner.resume(args.state_file, llm)
        restored.run()
        return 0
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except ValueError as e:
        # A corrupted or incompatible checkpoint surfaces as ValueError.
        print(f"Error: Invalid state file: {e}", file=sys.stderr)
        return 1
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 3
223
+
224
+
225
def _add_backend_args(sub: argparse.ArgumentParser) -> None:
    """Add the LLM backend options shared by generate, analyze, and resume."""
    sub.add_argument(
        "-p", "--provider", required=True, choices=["mlx", "openai"],
        help="LLM provider (mlx or openai)"
    )
    sub.add_argument(
        "-m", "--model", required=True, help="Model name/path"
    )
    sub.add_argument(
        "--base-url", help="API base URL (for openai-compatible servers)"
    )
    sub.add_argument(
        "--api-key", help="API key (or use OPENAI_API_KEY env var)"
    )


def _add_analysis_args(sub: argparse.ArgumentParser) -> None:
    """Add input/instruction/chunking options shared by generate and analyze."""
    sub.add_argument("file", metavar="FILE", help="Path to input data file")
    _add_backend_args(sub)
    sub.add_argument(
        "-i", "--instructions", default="",
        help="Cleaning instructions (text or @file.txt)"
    )
    sub.add_argument(
        "--chunk-size", type=int, default=50, help="Items per chunk (default: 50)"
    )
    sub.add_argument(
        "--max-iterations", type=int, default=5,
        help="Max iterations per chunk (default: 5)"
    )
    sub.add_argument(
        "--mode", choices=["auto", "structured", "text"], default="auto",
        help="Processing mode (default: auto)"
    )
    sub.add_argument(
        "--tui", action="store_true", help="Enable Rich terminal dashboard"
    )


def create_parser() -> argparse.ArgumentParser:
    """Create the argument parser with all subcommands.

    The shared option groups are factored into private helpers so the four
    subcommands cannot drift apart (same flags, defaults, and help text).

    Returns:
        Configured argparse.ArgumentParser with generate/analyze/resume/apply
        subcommands, each carrying its handler in the ``func`` default.
    """
    parser = argparse.ArgumentParser(
        prog="recursive-cleaner",
        description="LLM-powered incremental data cleaning pipeline",
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # --- generate command: full pipeline producing cleaning functions ---
    gen_parser = subparsers.add_parser(
        "generate",
        help="Generate cleaning functions from data file",
    )
    _add_analysis_args(gen_parser)
    gen_parser.add_argument(
        "-o", "--output", default="cleaning_functions.py",
        help="Output file path (default: cleaning_functions.py)"
    )
    gen_parser.add_argument(
        "--report", default="cleaning_report.md",
        help="Report file path (empty to disable, default: cleaning_report.md)"
    )
    gen_parser.add_argument(
        "--state-file", help="Checkpoint file for resume"
    )
    gen_parser.add_argument(
        "--optimize", action="store_true", help="Consolidate redundant functions"
    )
    gen_parser.add_argument(
        "--track-metrics", action="store_true", help="Measure before/after quality"
    )
    gen_parser.add_argument(
        "--early-termination", action="store_true",
        help="Stop on pattern saturation"
    )
    gen_parser.set_defaults(func=cmd_generate)

    # --- analyze command: dry-run issue detection only ---
    analyze_parser = subparsers.add_parser(
        "analyze",
        help="Dry-run analysis without generating functions",
    )
    _add_analysis_args(analyze_parser)
    analyze_parser.set_defaults(func=cmd_analyze)

    # --- resume command: continue from a checkpoint file ---
    resume_parser = subparsers.add_parser(
        "resume",
        help="Resume from checkpoint file",
    )
    resume_parser.add_argument(
        "state_file", metavar="STATE_FILE", help="Path to checkpoint JSON file"
    )
    _add_backend_args(resume_parser)
    resume_parser.set_defaults(func=cmd_resume)

    # --- apply command: run generated functions over a data file ---
    apply_parser = subparsers.add_parser(
        "apply",
        help="Apply cleaning functions to data file",
    )
    apply_parser.add_argument("file", metavar="FILE", help="Path to input data file")
    apply_parser.add_argument(
        "-f", "--functions", required=True,
        help="Path to cleaning_functions.py"
    )
    apply_parser.add_argument(
        "-o", "--output", help="Output file path (default: <input>.cleaned.<ext>)"
    )
    apply_parser.set_defaults(func=cmd_apply)

    return parser
372
+
373
+
374
def main(args: list[str] | None = None) -> int:
    """
    Main entry point for the CLI.

    Args:
        args: Command-line arguments (defaults to sys.argv[1:])

    Returns:
        Exit code (0=success, 1=general error, 2=backend error, 3=validation error)
    """
    parser = create_parser()
    namespace = parser.parse_args(args)

    # No subcommand given: show usage instead of failing.
    if namespace.command is None:
        parser.print_help()
        return 0

    # Dispatch to the handler the chosen subparser registered via set_defaults.
    return namespace.func(namespace)
392
+
393
+
394
+ if __name__ == "__main__":
395
+ sys.exit(main())
@@ -52,7 +52,8 @@ CONSOLIDATION_TEMPLATE = '''You are reviewing cleaning functions for consolidati
52
52
  </docstring>
53
53
  <code>
54
54
  ```python
55
- def merged_function_name(record):
55
+ def merged_function_name(record: dict) -> dict:
56
+ # Modify fields, return record
56
57
  ...
57
58
  ```
58
59
  </code>
@@ -108,9 +109,10 @@ Tags: domain, action, detail
108
109
  </docstring>
109
110
  <code>
110
111
  ```python
111
- def function_name(data):
112
- # Complete implementation
113
- pass
112
+ def function_name(record: dict) -> dict:
113
+ # Modify field(s) in the record
114
+ record["field"] = cleaned_value
115
+ return record
114
116
  ```
115
117
  </code>
116
118
  </function_to_generate>
@@ -120,6 +122,8 @@ def function_name(data):
120
122
 
121
123
  RULES:
122
124
  - ONE function per response
125
+ - Function signature: takes a dict (one record), returns the modified dict
126
+ - Modify fields directly on the record, then return it
123
127
  - If all issues solved: <chunk_status>clean</chunk_status>, omit <function_to_generate>
124
128
  - Include imports inside the function or document needed imports in docstring
125
129
  - Function must be idempotent (safe to run multiple times)
recursive_cleaner/tui.py CHANGED
@@ -505,19 +505,28 @@ class TUIRenderer:
505
505
  )
506
506
  self._layout["left_panel"].update(left_panel)
507
507
 
508
- def _parse_response_for_display(self, response: str) -> str:
509
- """Parse LLM XML response into readable format for transmission log.
508
+ def _colorize_transmission(self, response: str) -> "Text":
509
+ """Parse LLM XML response into colorized Rich Text for transmission log.
510
+
511
+ Color scheme:
512
+ - Issues (solved): dim
513
+ - Issues (unsolved): bright_white with cycling accent (blue/magenta/cyan/yellow)
514
+ - Function names: green
515
+ - Docstrings: italic
516
+ - Status clean: green
517
+ - Status needs_more_work: yellow
510
518
 
511
519
  Args:
512
520
  response: Raw LLM response text (XML format)
513
521
 
514
522
  Returns:
515
- Formatted string for display showing issues, function being
516
- generated, and chunk status.
523
+ Rich Text object with colors applied.
517
524
  """
518
525
  import re
519
526
 
520
- lines = []
527
+ ISSUE_COLORS = ["blue", "magenta", "cyan", "yellow"]
528
+ text = Text()
529
+ unsolved_index = 0
521
530
 
522
531
  try:
523
532
  # Find all issues
@@ -525,53 +534,63 @@ class TUIRenderer:
525
534
  issues = re.findall(issue_pattern, response, re.DOTALL)
526
535
 
527
536
  if issues:
528
- lines.append("ISSUES DETECTED:")
537
+ text.append("ISSUES DETECTED:\n", style="bold cyan")
529
538
  for issue_id, solved, desc in issues[:8]: # Limit to 8 issues
530
- marker = "\u2713" if solved == "true" else "\u2717" # checkmark or X
531
539
  desc_clean = desc.strip()[:40] # Truncate description
532
- lines.append(f" {marker} {desc_clean}")
540
+ if solved == "true":
541
+ text.append(" \u2713 ", style="green")
542
+ text.append(f"{desc_clean}\n", style="dim")
543
+ else:
544
+ accent = ISSUE_COLORS[unsolved_index % len(ISSUE_COLORS)]
545
+ text.append(" \u2717 ", style=accent)
546
+ text.append(f"{desc_clean}\n", style="bright_white")
547
+ unsolved_index += 1
533
548
  if len(issues) > 8:
534
- lines.append(f" (+{len(issues) - 8} more)")
535
- lines.append("")
549
+ text.append(f" (+{len(issues) - 8} more)\n", style="dim")
550
+ text.append("\n")
536
551
 
537
552
  # Find function being generated
538
553
  name_match = re.search(r'<name>([^<]+)</name>', response)
539
554
  docstring_match = re.search(r'<docstring>([^<]+)</docstring>', response, re.DOTALL)
540
555
 
541
556
  if name_match:
542
- lines.append(f"GENERATING: {name_match.group(1).strip()}")
557
+ text.append("GENERATING: ", style="bold cyan")
558
+ text.append(f"{name_match.group(1).strip()}\n", style="green bold")
543
559
  if docstring_match:
544
560
  doc = docstring_match.group(1).strip()[:60]
545
- lines.append(f' "{doc}..."')
546
- lines.append("")
561
+ text.append(f' "{doc}..."\n', style="italic")
562
+ text.append("\n")
547
563
 
548
564
  # Find chunk status
549
565
  status_match = re.search(r'<chunk_status>([^<]+)</chunk_status>', response)
550
566
  if status_match:
551
567
  status = status_match.group(1).strip()
552
- lines.append(f"STATUS: {status.upper()}")
553
-
554
- if lines:
555
- return "\n".join(lines)
568
+ text.append("STATUS: ", style="bold cyan")
569
+ if status == "clean":
570
+ text.append(status.upper(), style="green bold")
571
+ else:
572
+ text.append(status.upper().replace("_", " "), style="yellow bold")
573
+
574
+ if text.plain:
575
+ return text
556
576
  except Exception:
557
577
  pass
558
578
 
559
579
  # Fallback: show truncated raw response
560
- return response[:500] + "..." if len(response) > 500 else response
580
+ fallback = response[:500] + "..." if len(response) > 500 else response
581
+ return Text(fallback, style="dim cyan")
561
582
 
562
583
  def _refresh_right_panel(self) -> None:
563
- """Refresh the right panel with parsed transmission log."""
584
+ """Refresh the right panel with colorized transmission log."""
564
585
  if not HAS_RICH or self._layout is None:
565
586
  return
566
587
 
567
- # Get last response and parse for display
588
+ # Get last response and colorize for display
568
589
  response = self._state.last_response
569
590
  if not response:
570
- display_text = "(Awaiting transmission...)"
591
+ log_text = Text("(Awaiting transmission...)", style="dim cyan")
571
592
  else:
572
- display_text = self._parse_response_for_display(response)
573
-
574
- log_text = Text(display_text, style="dim cyan")
593
+ log_text = self._colorize_transmission(response)
575
594
 
576
595
  right_panel = Panel(
577
596
  log_text,
@@ -160,7 +160,10 @@ def validate_function(
160
160
  # Structured mode: sample_data is list[dict]
161
161
  for i, record in enumerate(sample_data):
162
162
  try:
163
- func(record)
163
+ result = func(record)
164
+ # Verify function returns a dict (not string, int, etc.)
165
+ if not isinstance(result, dict):
166
+ return False, f"Function must return dict, got {type(result).__name__}"
164
167
  except Exception as e:
165
168
  return False, f"Runtime error on sample {i}: {type(e).__name__}: {e}"
166
169
 
@@ -200,3 +203,39 @@ def extract_sample_data(
200
203
  except json.JSONDecodeError:
201
204
  continue
202
205
  return samples
206
+
207
+
208
+ def extract_modified_fields(code: str) -> set[str]:
209
+ """
210
+ Extract field names that are modified via record["field"] = ... pattern.
211
+
212
+ Args:
213
+ code: Python source code of the function
214
+
215
+ Returns:
216
+ Set of field names that are assigned to
217
+ """
218
+ try:
219
+ tree = ast.parse(code)
220
+ except SyntaxError:
221
+ return set()
222
+
223
+ fields = set()
224
+ # Common parameter names for the data/record argument
225
+ data_names = {"record", "data"}
226
+
227
+ for node in ast.walk(tree):
228
+ # Look for assignment statements
229
+ if isinstance(node, ast.Assign):
230
+ for target in node.targets:
231
+ # Check if target is a subscript: record["field"] or data["field"]
232
+ if isinstance(target, ast.Subscript):
233
+ # The value should be a Name node (record or data)
234
+ if isinstance(target.value, ast.Name):
235
+ if target.value.id in data_names:
236
+ # The slice should be a string constant
237
+ if isinstance(target.slice, ast.Constant):
238
+ if isinstance(target.slice.value, str):
239
+ fields.add(target.slice.value)
240
+
241
+ return fields