recursive-cleaner 0.7.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
@@ -0,0 +1,395 @@
+ """CLI interface for Recursive Data Cleaner."""
+
+ import argparse
+ import os
+ import sys
+
+
+ def create_backend(provider: str, model: str, base_url: str | None, api_key: str | None):
+     """
+     Factory function to create the appropriate backend.
+
+     Args:
+         provider: Backend provider ("mlx" or "openai")
+         model: Model name/path
+         base_url: Optional API base URL (for openai-compatible servers)
+         api_key: Optional API key
+
+     Returns:
+         LLMBackend instance
+
+     Raises:
+         SystemExit: With code 2 if provider is invalid or import fails
+     """
+     if provider == "mlx":
+         try:
+             from backends import MLXBackend
+             return MLXBackend(model_path=model)
+         except ImportError:
+             print("Error: MLX backend requires mlx-lm. Install with: pip install mlx-lm", file=sys.stderr)
+             sys.exit(2)
+     elif provider == "openai":
+         try:
+             from backends import OpenAIBackend
+             return OpenAIBackend(model=model, api_key=api_key, base_url=base_url)
+         except ImportError as e:
+             print(f"Error: {e}", file=sys.stderr)
+             sys.exit(2)
+     else:
+         print(f"Error: Unknown provider '{provider}'. Use 'mlx' or 'openai'.", file=sys.stderr)
+         sys.exit(2)
+
+
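The factory above can also be called directly from Python rather than through argparse. A minimal sketch for the OpenAI-compatible path, where the model name and server URL are placeholders rather than values shipped with the package, and the OPENAI_API_KEY fallback is what the CLI help text describes rather than something visible in this file:

    backend = create_backend(
        provider="openai",
        model="gpt-4o-mini",                   # placeholder model name
        base_url="http://localhost:8000/v1",   # placeholder OpenAI-compatible server
        api_key=None,                          # per the --api-key help, OPENAI_API_KEY may be used instead
    )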
+ def read_instructions(value: str) -> str:
+     """
+     Read instructions from inline text, a file, or stdin.
+
+     Args:
+         value: Instructions string, @file.txt path, or "-" to read from stdin
+
+     Returns:
+         Instructions text
+     """
+     if value.startswith("@"):
+         file_path = value[1:]
+         try:
+             with open(file_path, "r") as f:
+                 return f.read().strip()
+         except FileNotFoundError:
+             print(f"Error: Instructions file not found: {file_path}", file=sys.stderr)
+             sys.exit(1)
+         except IOError as e:
+             print(f"Error reading instructions file: {e}", file=sys.stderr)
+             sys.exit(1)
+     elif value == "-":
+         return sys.stdin.read().strip()
+     return value
+
+
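The three accepted forms of the instructions argument map onto the branches above; a small usage sketch with a placeholder file name:

    read_instructions("normalize phone numbers to E.164")   # plain text is returned as-is
    read_instructions("@cleaning_notes.txt")                 # "@" prefix reads the named file
    read_instructions("-")                                    # "-" reads from standard input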
+ def cmd_generate(args) -> int:
+     """Handle the generate command."""
+     from recursive_cleaner import DataCleaner
+
+     # Check if file exists
+     if not os.path.exists(args.file):
+         print(f"Error: File not found: {args.file}", file=sys.stderr)
+         return 1
+
+     # Create backend
+     backend = create_backend(args.provider, args.model, args.base_url, args.api_key)
+
+     # Read instructions
+     instructions = read_instructions(args.instructions) if args.instructions else ""
+
+     # Create progress callback for non-TUI mode
+     def on_progress(event):
+         if not args.tui:
+             event_type = event.get("type", "")
+             if event_type == "function_generated":
+                 print(f" Generated: {event.get('function_name', '')}")
+
+     try:
+         cleaner = DataCleaner(
+             llm_backend=backend,
+             file_path=args.file,
+             chunk_size=args.chunk_size,
+             instructions=instructions,
+             max_iterations=args.max_iterations,
+             mode=args.mode,
+             state_file=args.state_file,
+             report_path=args.report if args.report else None,
+             tui=args.tui,
+             optimize=args.optimize,
+             track_metrics=args.track_metrics,
+             early_termination=args.early_termination,
+             on_progress=on_progress if not args.tui else None,
+             output_path=args.output,
+         )
+         cleaner.run()
+         return 0
+     except FileNotFoundError as e:
+         print(f"Error: {e}", file=sys.stderr)
+         return 1
+     except Exception as e:
+         print(f"Error: {e}", file=sys.stderr)
+         return 3
+
+
+ def cmd_analyze(args) -> int:
+     """Handle the analyze command (dry-run mode)."""
+     from recursive_cleaner import DataCleaner
+
+     # Check if file exists
+     if not os.path.exists(args.file):
+         print(f"Error: File not found: {args.file}", file=sys.stderr)
+         return 1
+
+     # Create backend
+     backend = create_backend(args.provider, args.model, args.base_url, args.api_key)
+
+     # Read instructions
+     instructions = read_instructions(args.instructions) if args.instructions else ""
+
+     # Progress callback for analysis output
+     def on_progress(event):
+         if not args.tui:
+             event_type = event.get("type", "")
+             if event_type == "issues_detected":
+                 issues = event.get("issues", [])
+                 chunk_idx = event.get("chunk_index", 0)
+                 unsolved = [i for i in issues if not i.get("solved", False)]
+                 print(f"Chunk {chunk_idx + 1}: {len(issues)} issues ({len(unsolved)} unsolved)")
+
+     try:
+         cleaner = DataCleaner(
+             llm_backend=backend,
+             file_path=args.file,
+             chunk_size=args.chunk_size,
+             instructions=instructions,
+             max_iterations=args.max_iterations,
+             mode=args.mode,
+             dry_run=True,
+             tui=args.tui,
+             on_progress=on_progress if not args.tui else None,
+         )
+         cleaner.run()
+         return 0
+     except FileNotFoundError as e:
+         print(f"Error: {e}", file=sys.stderr)
+         return 1
+     except Exception as e:
+         print(f"Error: {e}", file=sys.stderr)
+         return 3
+
+
+ def cmd_apply(args) -> int:
+     """Handle the apply command."""
+     from recursive_cleaner.apply import apply_cleaning
+
+     # Check if input file exists
+     if not os.path.exists(args.file):
+         print(f"Error: File not found: {args.file}", file=sys.stderr)
+         return 1
+
+     # Check if functions file exists
+     if not os.path.exists(args.functions):
+         print(f"Error: Functions file not found: {args.functions}", file=sys.stderr)
+         return 1
+
+     try:
+         output_path = apply_cleaning(
+             input_path=args.file,
+             functions_path=args.functions,
+             output_path=args.output,
+         )
+         print(f"Cleaned data written to: {output_path}")
+         return 0
+     except FileNotFoundError as e:
+         print(f"Error: {e}", file=sys.stderr)
+         return 1
+     except ImportError as e:
+         print(f"Error: {e}", file=sys.stderr)
+         return 2
+     except Exception as e:
+         print(f"Error: {e}", file=sys.stderr)
+         return 3
+
+
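Because cmd_apply is a thin wrapper, the same step can be driven without the CLI by calling apply_cleaning directly. A sketch with placeholder paths; passing output_path=None mirrors the CLI, which per the -o help text then derives <input>.cleaned.<ext>:

    from recursive_cleaner.apply import apply_cleaning

    cleaned_path = apply_cleaning(
        input_path="reviews.jsonl",               # placeholder input file
        functions_path="cleaning_functions.py",   # file produced by the generate command
        output_path=None,                         # let the library pick the default output name
    )
    print(cleaned_path)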
+ def cmd_resume(args) -> int:
+     """Handle the resume command."""
+     from recursive_cleaner import DataCleaner
+
+     # Check if state file exists
+     if not os.path.exists(args.state_file):
+         print(f"Error: State file not found: {args.state_file}", file=sys.stderr)
+         return 1
+
+     # Create backend
+     backend = create_backend(args.provider, args.model, args.base_url, args.api_key)
+
+     try:
+         cleaner = DataCleaner.resume(args.state_file, backend)
+         cleaner.run()
+         return 0
+     except FileNotFoundError as e:
+         print(f"Error: {e}", file=sys.stderr)
+         return 1
+     except ValueError as e:
+         print(f"Error: Invalid state file: {e}", file=sys.stderr)
+         return 1
+     except Exception as e:
+         print(f"Error: {e}", file=sys.stderr)
+         return 3
+
+
+ def create_parser() -> argparse.ArgumentParser:
+     """Create the argument parser with all subcommands."""
+     parser = argparse.ArgumentParser(
+         prog="recursive-cleaner",
+         description="LLM-powered incremental data cleaning pipeline",
+     )
+
+     subparsers = parser.add_subparsers(dest="command", help="Available commands")
+
+     # --- generate command ---
+     gen_parser = subparsers.add_parser(
+         "generate",
+         help="Generate cleaning functions from data file",
+     )
+     gen_parser.add_argument("file", metavar="FILE", help="Path to input data file")
+     gen_parser.add_argument(
+         "-p", "--provider", required=True, choices=["mlx", "openai"],
+         help="LLM provider (mlx or openai)"
+     )
+     gen_parser.add_argument(
+         "-m", "--model", required=True, help="Model name/path"
+     )
+     gen_parser.add_argument(
+         "-i", "--instructions", default="",
+         help="Cleaning instructions (text or @file.txt)"
+     )
+     gen_parser.add_argument(
+         "--base-url", help="API base URL (for openai-compatible servers)"
+     )
+     gen_parser.add_argument(
+         "--api-key", help="API key (or use OPENAI_API_KEY env var)"
+     )
+     gen_parser.add_argument(
+         "--chunk-size", type=int, default=50, help="Items per chunk (default: 50)"
+     )
+     gen_parser.add_argument(
+         "--max-iterations", type=int, default=5,
+         help="Max iterations per chunk (default: 5)"
+     )
+     gen_parser.add_argument(
+         "--mode", choices=["auto", "structured", "text"], default="auto",
+         help="Processing mode (default: auto)"
+     )
+     gen_parser.add_argument(
+         "-o", "--output", default="cleaning_functions.py",
+         help="Output file path (default: cleaning_functions.py)"
+     )
+     gen_parser.add_argument(
+         "--report", default="cleaning_report.md",
+         help="Report file path (empty to disable, default: cleaning_report.md)"
+     )
+     gen_parser.add_argument(
+         "--state-file", help="Checkpoint file for resume"
+     )
+     gen_parser.add_argument(
+         "--tui", action="store_true", help="Enable Rich terminal dashboard"
+     )
+     gen_parser.add_argument(
+         "--optimize", action="store_true", help="Consolidate redundant functions"
+     )
+     gen_parser.add_argument(
+         "--track-metrics", action="store_true", help="Measure before/after quality"
+     )
+     gen_parser.add_argument(
+         "--early-termination", action="store_true",
+         help="Stop on pattern saturation"
+     )
+     gen_parser.set_defaults(func=cmd_generate)
+
+     # --- analyze command ---
+     analyze_parser = subparsers.add_parser(
+         "analyze",
+         help="Dry-run analysis without generating functions",
+     )
+     analyze_parser.add_argument("file", metavar="FILE", help="Path to input data file")
+     analyze_parser.add_argument(
+         "-p", "--provider", required=True, choices=["mlx", "openai"],
+         help="LLM provider (mlx or openai)"
+     )
+     analyze_parser.add_argument(
+         "-m", "--model", required=True, help="Model name/path"
+     )
+     analyze_parser.add_argument(
+         "-i", "--instructions", default="",
+         help="Cleaning instructions (text or @file.txt)"
+     )
+     analyze_parser.add_argument(
+         "--base-url", help="API base URL (for openai-compatible servers)"
+     )
+     analyze_parser.add_argument(
+         "--api-key", help="API key (or use OPENAI_API_KEY env var)"
+     )
+     analyze_parser.add_argument(
+         "--chunk-size", type=int, default=50, help="Items per chunk (default: 50)"
+     )
+     analyze_parser.add_argument(
+         "--max-iterations", type=int, default=5,
+         help="Max iterations per chunk (default: 5)"
+     )
+     analyze_parser.add_argument(
+         "--mode", choices=["auto", "structured", "text"], default="auto",
+         help="Processing mode (default: auto)"
+     )
+     analyze_parser.add_argument(
+         "--tui", action="store_true", help="Enable Rich terminal dashboard"
+     )
+     analyze_parser.set_defaults(func=cmd_analyze)
+
+     # --- resume command ---
+     resume_parser = subparsers.add_parser(
+         "resume",
+         help="Resume from checkpoint file",
+     )
+     resume_parser.add_argument(
+         "state_file", metavar="STATE_FILE", help="Path to checkpoint JSON file"
+     )
+     resume_parser.add_argument(
+         "-p", "--provider", required=True, choices=["mlx", "openai"],
+         help="LLM provider (mlx or openai)"
+     )
+     resume_parser.add_argument(
+         "-m", "--model", required=True, help="Model name/path"
+     )
+     resume_parser.add_argument(
+         "--base-url", help="API base URL (for openai-compatible servers)"
+     )
+     resume_parser.add_argument(
+         "--api-key", help="API key (or use OPENAI_API_KEY env var)"
+     )
+     resume_parser.set_defaults(func=cmd_resume)
+
+     # --- apply command ---
+     apply_parser = subparsers.add_parser(
+         "apply",
+         help="Apply cleaning functions to data file",
+     )
+     apply_parser.add_argument("file", metavar="FILE", help="Path to input data file")
+     apply_parser.add_argument(
+         "-f", "--functions", required=True,
+         help="Path to cleaning_functions.py"
+     )
+     apply_parser.add_argument(
+         "-o", "--output", help="Output file path (default: <input>.cleaned.<ext>)"
+     )
+     apply_parser.set_defaults(func=cmd_apply)
+
+     return parser
+
+
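Each subparser binds its handler via set_defaults(func=...), so parse_args returns a namespace whose func attribute is the matching cmd_* function. A minimal sketch of exercising the parser directly, with placeholder file and model names:

    parser = create_parser()
    ns = parser.parse_args(["analyze", "data.csv", "-p", "openai", "-m", "gpt-4o-mini"])
    exit_code = ns.func(ns)   # dispatches to cmd_analyze, exactly as main() does below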
+ def main(args: list[str] | None = None) -> int:
+     """
+     Main entry point for the CLI.
+
+     Args:
+         args: Command-line arguments (defaults to sys.argv[1:])
+
+     Returns:
+         Exit code (0=success, 1=general error, 2=backend error, 3=validation error)
+     """
+     parser = create_parser()
+     parsed = parser.parse_args(args)
+
+     if parsed.command is None:
+         parser.print_help()
+         return 0
+
+     return parsed.func(parsed)
+
+
+ if __name__ == "__main__":
+     sys.exit(main())
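Since main() accepts an explicit argument list, the whole pipeline can be scripted or tested without a shell. A sketch of a generate-then-apply run; the data file, instructions file, model name, and server URL are placeholders:

    rc = main([
        "generate", "reviews.jsonl",
        "--provider", "openai",
        "--model", "gpt-4o-mini",
        "--base-url", "http://localhost:8000/v1",
        "--instructions", "@cleaning_notes.txt",
        "--output", "cleaning_functions.py",
    ])
    if rc == 0:
        rc = main([
            "apply", "reviews.jsonl",
            "--functions", "cleaning_functions.py",
            "--output", "reviews.cleaned.jsonl",
        ])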