recursive-cleaner 0.8.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- backends/__init__.py +2 -1
- backends/openai_backend.py +71 -0
- recursive_cleaner/__init__.py +2 -0
- recursive_cleaner/__main__.py +8 -0
- recursive_cleaner/apply.py +483 -0
- recursive_cleaner/cleaner.py +6 -4
- recursive_cleaner/cli.py +395 -0
- recursive_cleaner/tui.py +43 -24
- {recursive_cleaner-0.8.0.dist-info → recursive_cleaner-1.0.0.dist-info}/METADATA +100 -4
- {recursive_cleaner-0.8.0.dist-info → recursive_cleaner-1.0.0.dist-info}/RECORD +13 -8
- recursive_cleaner-1.0.0.dist-info/entry_points.txt +2 -0
- {recursive_cleaner-0.8.0.dist-info → recursive_cleaner-1.0.0.dist-info}/WHEEL +0 -0
- {recursive_cleaner-0.8.0.dist-info → recursive_cleaner-1.0.0.dist-info}/licenses/LICENSE +0 -0
backends/__init__.py
CHANGED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""OpenAI-compatible backend for Recursive Data Cleaner."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class OpenAIBackend:
    """
    OpenAI-compatible backend implementation.

    Works with OpenAI API, LM Studio, Ollama, and other OpenAI-compatible servers.
    Conforms to the LLMBackend protocol.
    """

    def __init__(
        self,
        model: str,
        api_key: str | None = None,
        base_url: str | None = None,
        max_tokens: int = 4096,
        temperature: float = 0.7,
    ):
        """
        Initialize the OpenAI backend.

        Args:
            model: Model name (e.g., "gpt-4o", "gpt-3.5-turbo")
            api_key: API key (defaults to OPENAI_API_KEY env var, or "not-needed" for local)
            base_url: API base URL (defaults to OpenAI's API)
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature

        Raises:
            ImportError: If the openai SDK is not installed.
        """
        try:
            import openai
        except ImportError:
            raise ImportError(
                "OpenAI SDK not installed. Install with: pip install openai"
            )

        self.model = model
        self.max_tokens = max_tokens
        self.temperature = temperature

        # Key resolution order: explicit argument, then env var, then the
        # "not-needed" placeholder accepted by local OpenAI-compatible servers.
        if api_key is not None:
            resolved_key = api_key
        else:
            resolved_key = os.environ.get("OPENAI_API_KEY", "not-needed")

        self._client = openai.OpenAI(api_key=resolved_key, base_url=base_url)

    def generate(self, prompt: str) -> str:
        """
        Generate a response from the LLM.

        Args:
            prompt: The input prompt

        Returns:
            The generated text response (empty string when the API returns None)
        """
        completion = self._client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=self.max_tokens,
            temperature=self.temperature,
        )
        return completion.choices[0].message.content or ""
|
recursive_cleaner/__init__.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Recursive Data Cleaner - LLM-powered incremental data cleaning pipeline."""
|
|
2
2
|
|
|
3
|
+
from recursive_cleaner.apply import apply_cleaning
|
|
3
4
|
from recursive_cleaner.cleaner import DataCleaner
|
|
4
5
|
from recursive_cleaner.context import build_context
|
|
5
6
|
from recursive_cleaner.dependencies import resolve_dependencies
|
|
@@ -24,6 +25,7 @@ from recursive_cleaner.tui import HAS_RICH, TUIRenderer
|
|
|
24
25
|
from recursive_cleaner.validation import check_code_safety, extract_sample_data, validate_function
|
|
25
26
|
|
|
26
27
|
__all__ = [
|
|
28
|
+
"apply_cleaning",
|
|
27
29
|
"CleanerError",
|
|
28
30
|
"ParseError",
|
|
29
31
|
"MaxIterationsError",
|
|
@@ -0,0 +1,483 @@
|
|
|
1
|
+
"""Apply cleaning functions to data files."""
|
|
2
|
+
|
|
3
|
+
import csv
|
|
4
|
+
import importlib.util
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Callable
|
|
8
|
+
|
|
9
|
+
from .parsers import MARKITDOWN_EXTENSIONS
|
|
10
|
+
|
|
11
|
+
# Text formats that should be converted to markdown (excludes spreadsheets)
|
|
12
|
+
TEXT_MARKITDOWN_EXTENSIONS = MARKITDOWN_EXTENSIONS - {".xlsx", ".xls", ".ods"}
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def load_cleaning_module(functions_path: str):
    """
    Dynamically import a cleaning_functions.py file.

    Args:
        functions_path: Path to the cleaning functions file

    Returns:
        The imported module

    Raises:
        FileNotFoundError: If the functions file doesn't exist
        ImportError: If the module cannot be imported
    """
    source = Path(functions_path)
    if not source.exists():
        raise FileNotFoundError(f"Functions file not found: {functions_path}")

    spec = importlib.util.spec_from_file_location("cleaning_module", source)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load module from: {functions_path}")

    # Execute the file inside a fresh module namespace so its functions
    # become attributes of the returned module object.
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def get_default_output_path(input_path: str, force_ext: str | None = None) -> str:
|
|
43
|
+
"""
|
|
44
|
+
Generate default output path: input.cleaned.ext
|
|
45
|
+
|
|
46
|
+
Args:
|
|
47
|
+
input_path: Path to the input file
|
|
48
|
+
force_ext: Override the output extension (e.g., ".xlsx" for .xls files)
|
|
49
|
+
|
|
50
|
+
Returns:
|
|
51
|
+
Path string for the output file
|
|
52
|
+
"""
|
|
53
|
+
path = Path(input_path)
|
|
54
|
+
suffix = path.suffix.lower()
|
|
55
|
+
|
|
56
|
+
if force_ext:
|
|
57
|
+
ext = force_ext
|
|
58
|
+
elif suffix == ".xls":
|
|
59
|
+
# .xls files are written as .xlsx
|
|
60
|
+
ext = ".xlsx"
|
|
61
|
+
elif suffix == ".txt" or suffix in TEXT_MARKITDOWN_EXTENSIONS:
|
|
62
|
+
# Text formats output as markdown
|
|
63
|
+
ext = ".md"
|
|
64
|
+
else:
|
|
65
|
+
ext = path.suffix
|
|
66
|
+
|
|
67
|
+
return str(path.with_suffix(f".cleaned{ext}"))
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def apply_to_jsonl(
|
|
71
|
+
input_path: str,
|
|
72
|
+
output_path: str,
|
|
73
|
+
clean_fn: Callable,
|
|
74
|
+
on_progress: Callable[[dict], None] | None = None,
|
|
75
|
+
) -> int:
|
|
76
|
+
"""
|
|
77
|
+
Stream JSONL: read line, clean, write line.
|
|
78
|
+
|
|
79
|
+
Args:
|
|
80
|
+
input_path: Path to input JSONL file
|
|
81
|
+
output_path: Path for output JSONL file
|
|
82
|
+
clean_fn: Cleaning function to apply to each record
|
|
83
|
+
on_progress: Optional progress callback
|
|
84
|
+
|
|
85
|
+
Returns:
|
|
86
|
+
Number of records processed
|
|
87
|
+
"""
|
|
88
|
+
records_processed = 0
|
|
89
|
+
|
|
90
|
+
with open(input_path, "r", encoding="utf-8") as infile, \
|
|
91
|
+
open(output_path, "w", encoding="utf-8") as outfile:
|
|
92
|
+
for line in infile:
|
|
93
|
+
line = line.strip()
|
|
94
|
+
if not line:
|
|
95
|
+
continue
|
|
96
|
+
|
|
97
|
+
record = json.loads(line)
|
|
98
|
+
cleaned = clean_fn(record)
|
|
99
|
+
outfile.write(json.dumps(cleaned) + "\n")
|
|
100
|
+
|
|
101
|
+
records_processed += 1
|
|
102
|
+
if on_progress:
|
|
103
|
+
on_progress({"type": "apply_progress", "records_processed": records_processed})
|
|
104
|
+
|
|
105
|
+
return records_processed
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def apply_to_csv(
|
|
109
|
+
input_path: str,
|
|
110
|
+
output_path: str,
|
|
111
|
+
clean_fn: Callable,
|
|
112
|
+
on_progress: Callable[[dict], None] | None = None,
|
|
113
|
+
) -> int:
|
|
114
|
+
"""
|
|
115
|
+
Stream CSV: DictReader to clean each row, DictWriter to output.
|
|
116
|
+
|
|
117
|
+
Args:
|
|
118
|
+
input_path: Path to input CSV file
|
|
119
|
+
output_path: Path for output CSV file
|
|
120
|
+
clean_fn: Cleaning function to apply to each record
|
|
121
|
+
on_progress: Optional progress callback
|
|
122
|
+
|
|
123
|
+
Returns:
|
|
124
|
+
Number of records processed
|
|
125
|
+
"""
|
|
126
|
+
records_processed = 0
|
|
127
|
+
|
|
128
|
+
with open(input_path, "r", encoding="utf-8", newline="") as infile:
|
|
129
|
+
reader = csv.DictReader(infile)
|
|
130
|
+
fieldnames = reader.fieldnames
|
|
131
|
+
|
|
132
|
+
if not fieldnames:
|
|
133
|
+
return 0
|
|
134
|
+
|
|
135
|
+
with open(output_path, "w", encoding="utf-8", newline="") as outfile:
|
|
136
|
+
writer = csv.DictWriter(outfile, fieldnames=fieldnames)
|
|
137
|
+
writer.writeheader()
|
|
138
|
+
|
|
139
|
+
for row in reader:
|
|
140
|
+
cleaned = clean_fn(row)
|
|
141
|
+
writer.writerow(cleaned)
|
|
142
|
+
|
|
143
|
+
records_processed += 1
|
|
144
|
+
if on_progress:
|
|
145
|
+
on_progress({"type": "apply_progress", "records_processed": records_processed})
|
|
146
|
+
|
|
147
|
+
return records_processed
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def apply_to_json(
|
|
151
|
+
input_path: str,
|
|
152
|
+
output_path: str,
|
|
153
|
+
clean_fn: Callable,
|
|
154
|
+
on_progress: Callable[[dict], None] | None = None,
|
|
155
|
+
) -> int:
|
|
156
|
+
"""
|
|
157
|
+
Batch JSON array: load all, clean each, write array.
|
|
158
|
+
|
|
159
|
+
Args:
|
|
160
|
+
input_path: Path to input JSON file
|
|
161
|
+
output_path: Path for output JSON file
|
|
162
|
+
clean_fn: Cleaning function to apply to each record
|
|
163
|
+
on_progress: Optional progress callback
|
|
164
|
+
|
|
165
|
+
Returns:
|
|
166
|
+
Number of records processed
|
|
167
|
+
"""
|
|
168
|
+
with open(input_path, "r", encoding="utf-8") as f:
|
|
169
|
+
data = json.load(f)
|
|
170
|
+
|
|
171
|
+
if not isinstance(data, list):
|
|
172
|
+
# Single object - wrap, clean, unwrap
|
|
173
|
+
cleaned = clean_fn(data)
|
|
174
|
+
with open(output_path, "w", encoding="utf-8") as f:
|
|
175
|
+
json.dump(cleaned, f, indent=2)
|
|
176
|
+
if on_progress:
|
|
177
|
+
on_progress({"type": "apply_progress", "records_processed": 1})
|
|
178
|
+
return 1
|
|
179
|
+
|
|
180
|
+
cleaned_data = []
|
|
181
|
+
for i, record in enumerate(data):
|
|
182
|
+
cleaned = clean_fn(record)
|
|
183
|
+
cleaned_data.append(cleaned)
|
|
184
|
+
|
|
185
|
+
if on_progress:
|
|
186
|
+
on_progress({"type": "apply_progress", "records_processed": i + 1})
|
|
187
|
+
|
|
188
|
+
with open(output_path, "w", encoding="utf-8") as f:
|
|
189
|
+
json.dump(cleaned_data, f, indent=2)
|
|
190
|
+
|
|
191
|
+
return len(cleaned_data)
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def apply_to_parquet(
    input_path: str,
    output_path: str,
    clean_fn: Callable,
    on_progress: Callable[[dict], None] | None = None,
) -> int:
    """
    Batch Parquet: load as list of dicts, clean each, write back.

    Args:
        input_path: Path to input Parquet file
        output_path: Path for output Parquet file
        clean_fn: Cleaning function to apply to each record
        on_progress: Optional progress callback

    Returns:
        Number of records processed

    Raises:
        ImportError: If pyarrow is not installed
    """
    try:
        import pyarrow as pa
        import pyarrow.parquet as pq
    except ImportError:
        raise ImportError(
            "pyarrow is required for parquet files. "
            "Install with: pip install recursive-cleaner[parquet]"
        )

    # Materialize rows as plain dicts so clean_fn sees ordinary records.
    rows = pq.read_table(input_path).to_pylist()

    results = []
    for index, row in enumerate(rows):
        results.append(clean_fn(row))

        if on_progress:
            on_progress({"type": "apply_progress", "records_processed": index + 1})

    # Round-trip the cleaned records back into a parquet table.
    pq.write_table(pa.Table.from_pylist(results), output_path)

    return len(results)
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def apply_to_excel(
    input_path: str,
    output_path: str,
    clean_fn: Callable,
    on_progress: Callable[[dict], None] | None = None,
) -> int:
    """
    Batch Excel: load as list of dicts, clean each, write back.

    Args:
        input_path: Path to input Excel file (.xlsx or .xls)
        output_path: Path for output Excel file (.xlsx)
        clean_fn: Cleaning function to apply to each record
        on_progress: Optional progress callback

    Returns:
        Number of records processed

    Raises:
        ImportError: If openpyxl (or xlrd for .xls) is not installed
    """
    suffix = Path(input_path).suffix.lower()

    if suffix == ".xls":
        # Legacy .xls requires xlrd to read.
        try:
            import xlrd
        except ImportError:
            raise ImportError(
                "xlrd is required for .xls files. "
                "Install with: pip install recursive-cleaner[excel]"
            )

        workbook = xlrd.open_workbook(input_path)
        sheet = workbook.sheet_by_index(0)

        if sheet.nrows < 1:
            return 0

        # First row is headers
        headers = [str(sheet.cell_value(0, col)) for col in range(sheet.ncols)]
        records = [
            {header: sheet.cell_value(row_idx, col_idx)
             for col_idx, header in enumerate(headers)}
            for row_idx in range(1, sheet.nrows)
        ]
    else:
        # Use openpyxl for .xlsx files
        try:
            from openpyxl import load_workbook
        except ImportError:
            raise ImportError(
                "openpyxl is required for .xlsx files. "
                "Install with: pip install recursive-cleaner[excel]"
            )

        workbook = load_workbook(input_path, read_only=True)
        sheet = workbook.active

        rows = list(sheet.iter_rows(values_only=True))
        if not rows:
            # Fix: release the read-only workbook's file handle before the
            # early return (it was previously left open on this path).
            workbook.close()
            return 0

        # First row is headers; None header cells become empty strings.
        headers = [str(h) if h is not None else "" for h in rows[0]]
        records = []
        for row in rows[1:]:
            # Short rows are padded with None for missing trailing cells.
            row_data = {
                header: (row[col_idx] if col_idx < len(row) else None)
                for col_idx, header in enumerate(headers)
            }
            records.append(row_data)

        workbook.close()

    # Clean records
    cleaned_data = []
    for i, record in enumerate(records):
        cleaned_data.append(clean_fn(record))

        if on_progress:
            on_progress({"type": "apply_progress", "records_processed": i + 1})

    # Write back as xlsx using openpyxl
    try:
        from openpyxl import Workbook
    except ImportError:
        raise ImportError(
            "openpyxl is required for writing Excel files. "
            "Install with: pip install recursive-cleaner[excel]"
        )

    wb = Workbook()
    ws = wb.active

    if cleaned_data:
        # Header row comes from the first cleaned record's keys; data rows
        # are aligned to those keys (missing keys become empty cells).
        fieldnames = list(cleaned_data[0].keys())
        ws.append(fieldnames)
        for record in cleaned_data:
            ws.append([record.get(k) for k in fieldnames])

    wb.save(output_path)

    return len(cleaned_data)
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
def apply_to_text(
|
|
354
|
+
input_path: str,
|
|
355
|
+
output_path: str,
|
|
356
|
+
clean_fn: Callable,
|
|
357
|
+
on_progress: Callable[[dict], None] | None = None,
|
|
358
|
+
) -> int:
|
|
359
|
+
"""
|
|
360
|
+
Process text/document files: extract text, clean, write as markdown.
|
|
361
|
+
|
|
362
|
+
Args:
|
|
363
|
+
input_path: Path to input file (.txt or markitdown format)
|
|
364
|
+
output_path: Path for output markdown file
|
|
365
|
+
clean_fn: Cleaning function to apply to the text
|
|
366
|
+
on_progress: Optional progress callback
|
|
367
|
+
|
|
368
|
+
Returns:
|
|
369
|
+
Number of records processed (always 1 for text)
|
|
370
|
+
|
|
371
|
+
Raises:
|
|
372
|
+
ImportError: If markitdown is not installed (for non-.txt files)
|
|
373
|
+
"""
|
|
374
|
+
suffix = Path(input_path).suffix.lower()
|
|
375
|
+
|
|
376
|
+
if suffix == ".txt":
|
|
377
|
+
# Plain text - read directly
|
|
378
|
+
with open(input_path, "r", encoding="utf-8") as f:
|
|
379
|
+
content = f.read()
|
|
380
|
+
else:
|
|
381
|
+
# Use markitdown for other formats
|
|
382
|
+
try:
|
|
383
|
+
from markitdown import MarkItDown
|
|
384
|
+
except ImportError:
|
|
385
|
+
raise ImportError(
|
|
386
|
+
"markitdown is required for this file type. "
|
|
387
|
+
"Install with: pip install recursive-cleaner[markitdown]"
|
|
388
|
+
)
|
|
389
|
+
|
|
390
|
+
md = MarkItDown()
|
|
391
|
+
result = md.convert(input_path)
|
|
392
|
+
content = result.text_content
|
|
393
|
+
|
|
394
|
+
# Clean the text content
|
|
395
|
+
cleaned = clean_fn(content)
|
|
396
|
+
|
|
397
|
+
# Write as markdown
|
|
398
|
+
with open(output_path, "w", encoding="utf-8") as f:
|
|
399
|
+
f.write(cleaned)
|
|
400
|
+
|
|
401
|
+
if on_progress:
|
|
402
|
+
on_progress({"type": "apply_progress", "records_processed": 1})
|
|
403
|
+
|
|
404
|
+
return 1
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
def apply_cleaning(
    input_path: str,
    functions_path: str,
    output_path: str | None = None,
    on_progress: Callable[[dict], None] | None = None,
) -> str:
    """
    Apply cleaning functions to a data file.

    Args:
        input_path: Path to input data file
        functions_path: Path to cleaning_functions.py
        output_path: Path for output file (default: input.cleaned.ext)
        on_progress: Optional progress callback

    Returns:
        Path to output file

    Raises:
        FileNotFoundError: If input or functions file not found
        ImportError: If functions file cannot be imported
        ValueError: If input format is unsupported
    """
    if not Path(input_path).exists():
        raise FileNotFoundError(f"Input file not found: {input_path}")

    # The generated module must expose a clean_data() entry point.
    module = load_cleaning_module(functions_path)
    if not hasattr(module, "clean_data"):
        raise ImportError(f"Functions file missing clean_data() function: {functions_path}")
    clean_fn = module.clean_data

    suffix = Path(input_path).suffix.lower()
    if output_path is None:
        output_path = get_default_output_path(input_path)

    # Structured formats dispatch directly; text-like formats fall through
    # to the markdown handler below.
    handlers = {
        ".jsonl": apply_to_jsonl,
        ".csv": apply_to_csv,
        ".json": apply_to_json,
        ".parquet": apply_to_parquet,
        ".xlsx": apply_to_excel,
        ".xls": apply_to_excel,
    }
    handler = handlers.get(suffix)

    if handler is None and (suffix == ".txt" or suffix in TEXT_MARKITDOWN_EXTENSIONS):
        handler = apply_to_text

    if handler is None:
        raise ValueError(f"Unsupported format: {suffix}")

    # total_records is unknown up front for streaming formats.
    if on_progress:
        on_progress({"type": "apply_start", "total_records": None})

    count = handler(input_path, output_path, clean_fn, on_progress)

    if on_progress:
        on_progress({
            "type": "apply_complete",
            "total_records": count,
            "output_path": output_path,
        })

    return output_path
|
recursive_cleaner/cleaner.py
CHANGED
|
@@ -63,6 +63,7 @@ class DataCleaner:
|
|
|
63
63
|
dry_run: bool = False,
|
|
64
64
|
auto_parse: bool = False,
|
|
65
65
|
tui: bool = False,
|
|
66
|
+
output_path: str = "cleaning_functions.py",
|
|
66
67
|
):
|
|
67
68
|
self.backend = llm_backend
|
|
68
69
|
self.file_path = file_path
|
|
@@ -88,6 +89,7 @@ class DataCleaner:
|
|
|
88
89
|
self.dry_run = dry_run
|
|
89
90
|
self.auto_parse = auto_parse
|
|
90
91
|
self.tui = tui
|
|
92
|
+
self.output_path = output_path
|
|
91
93
|
self.functions: list[dict] = [] # List of {name, docstring, code}
|
|
92
94
|
self._tui_renderer = None # TUIRenderer instance when tui=True
|
|
93
95
|
self._generated_parser: callable | None = None # LLM-generated parser for unknown formats
|
|
@@ -520,7 +522,7 @@ class DataCleaner:
|
|
|
520
522
|
"quality_delta": 0.0, # Could be calculated from metrics
|
|
521
523
|
"latency_total_ms": latency_summary.get("total_ms", 0.0),
|
|
522
524
|
"llm_calls": latency_summary.get("call_count", 0),
|
|
523
|
-
"output_file":
|
|
525
|
+
"output_file": self.output_path,
|
|
524
526
|
})
|
|
525
527
|
self._tui_renderer.stop()
|
|
526
528
|
|
|
@@ -687,11 +689,11 @@ class DataCleaner:
|
|
|
687
689
|
self._emit("chunk_done", chunk_index=chunk_idx)
|
|
688
690
|
|
|
689
691
|
def _write_output(self) -> None:
|
|
690
|
-
"""Write generated functions to
|
|
692
|
+
"""Write generated functions to output file."""
|
|
691
693
|
from .output import write_cleaning_file
|
|
692
694
|
|
|
693
695
|
try:
|
|
694
|
-
write_cleaning_file(self.functions)
|
|
696
|
+
write_cleaning_file(self.functions, self.output_path)
|
|
695
697
|
except OutputValidationError as e:
|
|
696
698
|
if not self.tui:
|
|
697
699
|
print(f" Error: {e}")
|
|
@@ -707,7 +709,7 @@ class DataCleaner:
|
|
|
707
709
|
if not self.tui:
|
|
708
710
|
print(f" Skipping invalid function: {f['name']}")
|
|
709
711
|
if valid_functions:
|
|
710
|
-
write_cleaning_file(valid_functions)
|
|
712
|
+
write_cleaning_file(valid_functions, self.output_path)
|
|
711
713
|
elif not self.tui:
|
|
712
714
|
print(" No valid functions to write.")
|
|
713
715
|
|
recursive_cleaner/cli.py
ADDED
|
@@ -0,0 +1,395 @@
|
|
|
1
|
+
"""CLI interface for Recursive Data Cleaner."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def create_backend(provider: str, model: str, base_url: str | None, api_key: str | None):
|
|
9
|
+
"""
|
|
10
|
+
Factory function to create the appropriate backend.
|
|
11
|
+
|
|
12
|
+
Args:
|
|
13
|
+
provider: Backend provider ("mlx" or "openai")
|
|
14
|
+
model: Model name/path
|
|
15
|
+
base_url: Optional API base URL (for openai-compatible servers)
|
|
16
|
+
api_key: Optional API key
|
|
17
|
+
|
|
18
|
+
Returns:
|
|
19
|
+
LLMBackend instance
|
|
20
|
+
|
|
21
|
+
Raises:
|
|
22
|
+
SystemExit: With code 2 if provider is invalid or import fails
|
|
23
|
+
"""
|
|
24
|
+
if provider == "mlx":
|
|
25
|
+
try:
|
|
26
|
+
from backends import MLXBackend
|
|
27
|
+
return MLXBackend(model_path=model)
|
|
28
|
+
except ImportError:
|
|
29
|
+
print("Error: MLX backend requires mlx-lm. Install with: pip install mlx-lm", file=sys.stderr)
|
|
30
|
+
sys.exit(2)
|
|
31
|
+
elif provider == "openai":
|
|
32
|
+
try:
|
|
33
|
+
from backends import OpenAIBackend
|
|
34
|
+
return OpenAIBackend(model=model, api_key=api_key, base_url=base_url)
|
|
35
|
+
except ImportError as e:
|
|
36
|
+
print(f"Error: {e}", file=sys.stderr)
|
|
37
|
+
sys.exit(2)
|
|
38
|
+
else:
|
|
39
|
+
print(f"Error: Unknown provider '{provider}'. Use 'mlx' or 'openai'.", file=sys.stderr)
|
|
40
|
+
sys.exit(2)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def read_instructions(value: str) -> str:
    """
    Read instructions from inline text or file.

    Args:
        value: Instructions string, "@file.txt" to read from a file,
            or "-" to read from stdin.

    Returns:
        Instructions text (stripped when read from a file or stdin).

    Raises:
        SystemExit: With code 1 if the @-referenced file cannot be read.
    """
    if value.startswith("@"):
        file_path = value[1:]
        try:
            # Explicit UTF-8 so instruction files decode identically on every
            # platform (the default encoding is locale-dependent on Windows).
            with open(file_path, "r", encoding="utf-8") as f:
                return f.read().strip()
        except FileNotFoundError:
            print(f"Error: Instructions file not found: {file_path}", file=sys.stderr)
            sys.exit(1)
        except IOError as e:
            print(f"Error reading instructions file: {e}", file=sys.stderr)
            sys.exit(1)
    elif value == "-":
        return sys.stdin.read().strip()
    return value
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def cmd_generate(args) -> int:
    """Handle the generate command."""
    from recursive_cleaner import DataCleaner

    # Bail out early if the input data file is missing.
    if not os.path.exists(args.file):
        print(f"Error: File not found: {args.file}", file=sys.stderr)
        return 1

    llm = create_backend(args.provider, args.model, args.base_url, args.api_key)
    instr = read_instructions(args.instructions) if args.instructions else ""

    def on_progress(event):
        # Plain-text progress reporting for non-TUI runs.
        if args.tui:
            return
        if event.get("type", "") == "function_generated":
            print(f" Generated: {event.get('function_name', '')}")

    try:
        cleaner = DataCleaner(
            llm_backend=llm,
            file_path=args.file,
            chunk_size=args.chunk_size,
            instructions=instr,
            max_iterations=args.max_iterations,
            mode=args.mode,
            state_file=args.state_file,
            report_path=args.report if args.report else None,
            tui=args.tui,
            optimize=args.optimize,
            track_metrics=args.track_metrics,
            early_termination=args.early_termination,
            on_progress=on_progress if not args.tui else None,
            output_path=args.output,
        )
        cleaner.run()
        return 0
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 3
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def cmd_analyze(args) -> int:
    """Handle the analyze command (dry-run mode)."""
    from recursive_cleaner import DataCleaner

    # Bail out early if the input data file is missing.
    if not os.path.exists(args.file):
        print(f"Error: File not found: {args.file}", file=sys.stderr)
        return 1

    llm = create_backend(args.provider, args.model, args.base_url, args.api_key)
    instr = read_instructions(args.instructions) if args.instructions else ""

    def on_progress(event):
        # Summarize detected issues per chunk in plain-text mode.
        if args.tui:
            return
        if event.get("type", "") != "issues_detected":
            return
        issues = event.get("issues", [])
        chunk_idx = event.get("chunk_index", 0)
        open_count = sum(1 for item in issues if not item.get("solved", False))
        print(f"Chunk {chunk_idx + 1}: {len(issues)} issues ({open_count} unsolved)")

    try:
        cleaner = DataCleaner(
            llm_backend=llm,
            file_path=args.file,
            chunk_size=args.chunk_size,
            instructions=instr,
            max_iterations=args.max_iterations,
            mode=args.mode,
            dry_run=True,
            tui=args.tui,
            on_progress=on_progress if not args.tui else None,
        )
        cleaner.run()
        return 0
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 3
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def cmd_apply(args) -> int:
    """Handle the apply command."""
    from recursive_cleaner.apply import apply_cleaning

    # Both the data file and the generated functions file must exist.
    if not os.path.exists(args.file):
        print(f"Error: File not found: {args.file}", file=sys.stderr)
        return 1
    if not os.path.exists(args.functions):
        print(f"Error: Functions file not found: {args.functions}", file=sys.stderr)
        return 1

    try:
        destination = apply_cleaning(
            input_path=args.file,
            functions_path=args.functions,
            output_path=args.output,
        )
        print(f"Cleaned data written to: {destination}")
        return 0
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except ImportError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 2
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 3
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def cmd_resume(args) -> int:
    """Handle the resume command."""
    from recursive_cleaner import DataCleaner

    # A previous run's state file is required to resume.
    if not os.path.exists(args.state_file):
        print(f"Error: State file not found: {args.state_file}", file=sys.stderr)
        return 1

    llm = create_backend(args.provider, args.model, args.base_url, args.api_key)

    try:
        cleaner = DataCleaner.resume(args.state_file, llm)
        cleaner.run()
        return 0
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except ValueError as e:
        print(f"Error: Invalid state file: {e}", file=sys.stderr)
        return 1
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 3
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def create_parser() -> argparse.ArgumentParser:
|
|
226
|
+
"""Create the argument parser with all subcommands."""
|
|
227
|
+
parser = argparse.ArgumentParser(
|
|
228
|
+
prog="recursive-cleaner",
|
|
229
|
+
description="LLM-powered incremental data cleaning pipeline",
|
|
230
|
+
)
|
|
231
|
+
|
|
232
|
+
subparsers = parser.add_subparsers(dest="command", help="Available commands")
|
|
233
|
+
|
|
234
|
+
# --- generate command ---
|
|
235
|
+
gen_parser = subparsers.add_parser(
|
|
236
|
+
"generate",
|
|
237
|
+
help="Generate cleaning functions from data file",
|
|
238
|
+
)
|
|
239
|
+
gen_parser.add_argument("file", metavar="FILE", help="Path to input data file")
|
|
240
|
+
gen_parser.add_argument(
|
|
241
|
+
"-p", "--provider", required=True, choices=["mlx", "openai"],
|
|
242
|
+
help="LLM provider (mlx or openai)"
|
|
243
|
+
)
|
|
244
|
+
gen_parser.add_argument(
|
|
245
|
+
"-m", "--model", required=True, help="Model name/path"
|
|
246
|
+
)
|
|
247
|
+
gen_parser.add_argument(
|
|
248
|
+
"-i", "--instructions", default="",
|
|
249
|
+
help="Cleaning instructions (text or @file.txt)"
|
|
250
|
+
)
|
|
251
|
+
gen_parser.add_argument(
|
|
252
|
+
"--base-url", help="API base URL (for openai-compatible servers)"
|
|
253
|
+
)
|
|
254
|
+
gen_parser.add_argument(
|
|
255
|
+
"--api-key", help="API key (or use OPENAI_API_KEY env var)"
|
|
256
|
+
)
|
|
257
|
+
gen_parser.add_argument(
|
|
258
|
+
"--chunk-size", type=int, default=50, help="Items per chunk (default: 50)"
|
|
259
|
+
)
|
|
260
|
+
gen_parser.add_argument(
|
|
261
|
+
"--max-iterations", type=int, default=5,
|
|
262
|
+
help="Max iterations per chunk (default: 5)"
|
|
263
|
+
)
|
|
264
|
+
gen_parser.add_argument(
|
|
265
|
+
"--mode", choices=["auto", "structured", "text"], default="auto",
|
|
266
|
+
help="Processing mode (default: auto)"
|
|
267
|
+
)
|
|
268
|
+
gen_parser.add_argument(
|
|
269
|
+
"-o", "--output", default="cleaning_functions.py",
|
|
270
|
+
help="Output file path (default: cleaning_functions.py)"
|
|
271
|
+
)
|
|
272
|
+
gen_parser.add_argument(
|
|
273
|
+
"--report", default="cleaning_report.md",
|
|
274
|
+
help="Report file path (empty to disable, default: cleaning_report.md)"
|
|
275
|
+
)
|
|
276
|
+
gen_parser.add_argument(
|
|
277
|
+
"--state-file", help="Checkpoint file for resume"
|
|
278
|
+
)
|
|
279
|
+
gen_parser.add_argument(
|
|
280
|
+
"--tui", action="store_true", help="Enable Rich terminal dashboard"
|
|
281
|
+
)
|
|
282
|
+
gen_parser.add_argument(
|
|
283
|
+
"--optimize", action="store_true", help="Consolidate redundant functions"
|
|
284
|
+
)
|
|
285
|
+
gen_parser.add_argument(
|
|
286
|
+
"--track-metrics", action="store_true", help="Measure before/after quality"
|
|
287
|
+
)
|
|
288
|
+
gen_parser.add_argument(
|
|
289
|
+
"--early-termination", action="store_true",
|
|
290
|
+
help="Stop on pattern saturation"
|
|
291
|
+
)
|
|
292
|
+
gen_parser.set_defaults(func=cmd_generate)
|
|
293
|
+
|
|
294
|
+
# --- analyze command ---
|
|
295
|
+
analyze_parser = subparsers.add_parser(
|
|
296
|
+
"analyze",
|
|
297
|
+
help="Dry-run analysis without generating functions",
|
|
298
|
+
)
|
|
299
|
+
analyze_parser.add_argument("file", metavar="FILE", help="Path to input data file")
|
|
300
|
+
analyze_parser.add_argument(
|
|
301
|
+
"-p", "--provider", required=True, choices=["mlx", "openai"],
|
|
302
|
+
help="LLM provider (mlx or openai)"
|
|
303
|
+
)
|
|
304
|
+
analyze_parser.add_argument(
|
|
305
|
+
"-m", "--model", required=True, help="Model name/path"
|
|
306
|
+
)
|
|
307
|
+
analyze_parser.add_argument(
|
|
308
|
+
"-i", "--instructions", default="",
|
|
309
|
+
help="Cleaning instructions (text or @file.txt)"
|
|
310
|
+
)
|
|
311
|
+
analyze_parser.add_argument(
|
|
312
|
+
"--base-url", help="API base URL (for openai-compatible servers)"
|
|
313
|
+
)
|
|
314
|
+
analyze_parser.add_argument(
|
|
315
|
+
"--api-key", help="API key (or use OPENAI_API_KEY env var)"
|
|
316
|
+
)
|
|
317
|
+
analyze_parser.add_argument(
|
|
318
|
+
"--chunk-size", type=int, default=50, help="Items per chunk (default: 50)"
|
|
319
|
+
)
|
|
320
|
+
analyze_parser.add_argument(
|
|
321
|
+
"--max-iterations", type=int, default=5,
|
|
322
|
+
help="Max iterations per chunk (default: 5)"
|
|
323
|
+
)
|
|
324
|
+
analyze_parser.add_argument(
|
|
325
|
+
"--mode", choices=["auto", "structured", "text"], default="auto",
|
|
326
|
+
help="Processing mode (default: auto)"
|
|
327
|
+
)
|
|
328
|
+
analyze_parser.add_argument(
|
|
329
|
+
"--tui", action="store_true", help="Enable Rich terminal dashboard"
|
|
330
|
+
)
|
|
331
|
+
analyze_parser.set_defaults(func=cmd_analyze)
|
|
332
|
+
|
|
333
|
+
# --- resume command ---
|
|
334
|
+
resume_parser = subparsers.add_parser(
|
|
335
|
+
"resume",
|
|
336
|
+
help="Resume from checkpoint file",
|
|
337
|
+
)
|
|
338
|
+
resume_parser.add_argument(
|
|
339
|
+
"state_file", metavar="STATE_FILE", help="Path to checkpoint JSON file"
|
|
340
|
+
)
|
|
341
|
+
resume_parser.add_argument(
|
|
342
|
+
"-p", "--provider", required=True, choices=["mlx", "openai"],
|
|
343
|
+
help="LLM provider (mlx or openai)"
|
|
344
|
+
)
|
|
345
|
+
resume_parser.add_argument(
|
|
346
|
+
"-m", "--model", required=True, help="Model name/path"
|
|
347
|
+
)
|
|
348
|
+
resume_parser.add_argument(
|
|
349
|
+
"--base-url", help="API base URL (for openai-compatible servers)"
|
|
350
|
+
)
|
|
351
|
+
resume_parser.add_argument(
|
|
352
|
+
"--api-key", help="API key (or use OPENAI_API_KEY env var)"
|
|
353
|
+
)
|
|
354
|
+
resume_parser.set_defaults(func=cmd_resume)
|
|
355
|
+
|
|
356
|
+
# --- apply command ---
|
|
357
|
+
apply_parser = subparsers.add_parser(
|
|
358
|
+
"apply",
|
|
359
|
+
help="Apply cleaning functions to data file",
|
|
360
|
+
)
|
|
361
|
+
apply_parser.add_argument("file", metavar="FILE", help="Path to input data file")
|
|
362
|
+
apply_parser.add_argument(
|
|
363
|
+
"-f", "--functions", required=True,
|
|
364
|
+
help="Path to cleaning_functions.py"
|
|
365
|
+
)
|
|
366
|
+
apply_parser.add_argument(
|
|
367
|
+
"-o", "--output", help="Output file path (default: <input>.cleaned.<ext>)"
|
|
368
|
+
)
|
|
369
|
+
apply_parser.set_defaults(func=cmd_apply)
|
|
370
|
+
|
|
371
|
+
return parser
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def main(args: list[str] | None = None) -> int:
|
|
375
|
+
"""
|
|
376
|
+
Main entry point for the CLI.
|
|
377
|
+
|
|
378
|
+
Args:
|
|
379
|
+
args: Command-line arguments (defaults to sys.argv[1:])
|
|
380
|
+
|
|
381
|
+
Returns:
|
|
382
|
+
Exit code (0=success, 1=general error, 2=backend error, 3=validation error)
|
|
383
|
+
"""
|
|
384
|
+
parser = create_parser()
|
|
385
|
+
parsed = parser.parse_args(args)
|
|
386
|
+
|
|
387
|
+
if parsed.command is None:
|
|
388
|
+
parser.print_help()
|
|
389
|
+
return 0
|
|
390
|
+
|
|
391
|
+
return parsed.func(parsed)
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
if __name__ == "__main__":
|
|
395
|
+
sys.exit(main())
|
recursive_cleaner/tui.py
CHANGED
|
@@ -505,19 +505,28 @@ class TUIRenderer:
|
|
|
505
505
|
)
|
|
506
506
|
self._layout["left_panel"].update(left_panel)
|
|
507
507
|
|
|
508
|
-
def
|
|
509
|
-
"""Parse LLM XML response into
|
|
508
|
+
def _colorize_transmission(self, response: str) -> "Text":
|
|
509
|
+
"""Parse LLM XML response into colorized Rich Text for transmission log.
|
|
510
|
+
|
|
511
|
+
Color scheme:
|
|
512
|
+
- Issues (solved): dim
|
|
513
|
+
- Issues (unsolved): bright_white with cycling accent (blue/magenta/cyan/yellow)
|
|
514
|
+
- Function names: green
|
|
515
|
+
- Docstrings: italic
|
|
516
|
+
- Status clean: green
|
|
517
|
+
- Status needs_more_work: yellow
|
|
510
518
|
|
|
511
519
|
Args:
|
|
512
520
|
response: Raw LLM response text (XML format)
|
|
513
521
|
|
|
514
522
|
Returns:
|
|
515
|
-
|
|
516
|
-
generated, and chunk status.
|
|
523
|
+
Rich Text object with colors applied.
|
|
517
524
|
"""
|
|
518
525
|
import re
|
|
519
526
|
|
|
520
|
-
|
|
527
|
+
ISSUE_COLORS = ["blue", "magenta", "cyan", "yellow"]
|
|
528
|
+
text = Text()
|
|
529
|
+
unsolved_index = 0
|
|
521
530
|
|
|
522
531
|
try:
|
|
523
532
|
# Find all issues
|
|
@@ -525,53 +534,63 @@ class TUIRenderer:
|
|
|
525
534
|
issues = re.findall(issue_pattern, response, re.DOTALL)
|
|
526
535
|
|
|
527
536
|
if issues:
|
|
528
|
-
|
|
537
|
+
text.append("ISSUES DETECTED:\n", style="bold cyan")
|
|
529
538
|
for issue_id, solved, desc in issues[:8]: # Limit to 8 issues
|
|
530
|
-
marker = "\u2713" if solved == "true" else "\u2717" # checkmark or X
|
|
531
539
|
desc_clean = desc.strip()[:40] # Truncate description
|
|
532
|
-
|
|
540
|
+
if solved == "true":
|
|
541
|
+
text.append(" \u2713 ", style="green")
|
|
542
|
+
text.append(f"{desc_clean}\n", style="dim")
|
|
543
|
+
else:
|
|
544
|
+
accent = ISSUE_COLORS[unsolved_index % len(ISSUE_COLORS)]
|
|
545
|
+
text.append(" \u2717 ", style=accent)
|
|
546
|
+
text.append(f"{desc_clean}\n", style="bright_white")
|
|
547
|
+
unsolved_index += 1
|
|
533
548
|
if len(issues) > 8:
|
|
534
|
-
|
|
535
|
-
|
|
549
|
+
text.append(f" (+{len(issues) - 8} more)\n", style="dim")
|
|
550
|
+
text.append("\n")
|
|
536
551
|
|
|
537
552
|
# Find function being generated
|
|
538
553
|
name_match = re.search(r'<name>([^<]+)</name>', response)
|
|
539
554
|
docstring_match = re.search(r'<docstring>([^<]+)</docstring>', response, re.DOTALL)
|
|
540
555
|
|
|
541
556
|
if name_match:
|
|
542
|
-
|
|
557
|
+
text.append("GENERATING: ", style="bold cyan")
|
|
558
|
+
text.append(f"{name_match.group(1).strip()}\n", style="green bold")
|
|
543
559
|
if docstring_match:
|
|
544
560
|
doc = docstring_match.group(1).strip()[:60]
|
|
545
|
-
|
|
546
|
-
|
|
561
|
+
text.append(f' "{doc}..."\n', style="italic")
|
|
562
|
+
text.append("\n")
|
|
547
563
|
|
|
548
564
|
# Find chunk status
|
|
549
565
|
status_match = re.search(r'<chunk_status>([^<]+)</chunk_status>', response)
|
|
550
566
|
if status_match:
|
|
551
567
|
status = status_match.group(1).strip()
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
568
|
+
text.append("STATUS: ", style="bold cyan")
|
|
569
|
+
if status == "clean":
|
|
570
|
+
text.append(status.upper(), style="green bold")
|
|
571
|
+
else:
|
|
572
|
+
text.append(status.upper().replace("_", " "), style="yellow bold")
|
|
573
|
+
|
|
574
|
+
if text.plain:
|
|
575
|
+
return text
|
|
556
576
|
except Exception:
|
|
557
577
|
pass
|
|
558
578
|
|
|
559
579
|
# Fallback: show truncated raw response
|
|
560
|
-
|
|
580
|
+
fallback = response[:500] + "..." if len(response) > 500 else response
|
|
581
|
+
return Text(fallback, style="dim cyan")
|
|
561
582
|
|
|
562
583
|
def _refresh_right_panel(self) -> None:
|
|
563
|
-
"""Refresh the right panel with
|
|
584
|
+
"""Refresh the right panel with colorized transmission log."""
|
|
564
585
|
if not HAS_RICH or self._layout is None:
|
|
565
586
|
return
|
|
566
587
|
|
|
567
|
-
# Get last response and
|
|
588
|
+
# Get last response and colorize for display
|
|
568
589
|
response = self._state.last_response
|
|
569
590
|
if not response:
|
|
570
|
-
|
|
591
|
+
log_text = Text("(Awaiting transmission...)", style="dim cyan")
|
|
571
592
|
else:
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
log_text = Text(display_text, style="dim cyan")
|
|
593
|
+
log_text = self._colorize_transmission(response)
|
|
575
594
|
|
|
576
595
|
right_panel = Panel(
|
|
577
596
|
log_text,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: recursive-cleaner
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 1.0.0
|
|
4
4
|
Summary: LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions
|
|
5
5
|
Project-URL: Homepage, https://github.com/gaztrabisme/recursive-data-cleaner
|
|
6
6
|
Project-URL: Repository, https://github.com/gaztrabisme/recursive-data-cleaner
|
|
@@ -9,7 +9,7 @@ Author: Gary Tran
|
|
|
9
9
|
License-Expression: MIT
|
|
10
10
|
License-File: LICENSE
|
|
11
11
|
Keywords: automation,data-cleaning,data-quality,etl,llm,machine-learning
|
|
12
|
-
Classifier: Development Status ::
|
|
12
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
13
13
|
Classifier: Intended Audience :: Developers
|
|
14
14
|
Classifier: Intended Audience :: Science/Research
|
|
15
15
|
Classifier: License :: OSI Approved :: MIT License
|
|
@@ -26,10 +26,15 @@ Requires-Dist: tenacity>=8.0
|
|
|
26
26
|
Provides-Extra: dev
|
|
27
27
|
Requires-Dist: pytest-cov>=4.0; extra == 'dev'
|
|
28
28
|
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
29
|
+
Provides-Extra: excel
|
|
30
|
+
Requires-Dist: openpyxl>=3.0.0; extra == 'excel'
|
|
31
|
+
Requires-Dist: xlrd>=2.0.0; extra == 'excel'
|
|
29
32
|
Provides-Extra: markitdown
|
|
30
33
|
Requires-Dist: markitdown>=0.1.0; extra == 'markitdown'
|
|
31
34
|
Provides-Extra: mlx
|
|
32
35
|
Requires-Dist: mlx-lm>=0.10.0; extra == 'mlx'
|
|
36
|
+
Provides-Extra: openai
|
|
37
|
+
Requires-Dist: openai>=1.0.0; extra == 'openai'
|
|
33
38
|
Provides-Extra: parquet
|
|
34
39
|
Requires-Dist: pyarrow>=14.0.0; extra == 'parquet'
|
|
35
40
|
Provides-Extra: tui
|
|
@@ -140,6 +145,91 @@ cleaner.run() # Generates cleaning_functions.py
|
|
|
140
145
|
- **Token Estimation**: Track estimated input/output tokens across the run
|
|
141
146
|
- **Graceful Fallback**: Works without Rich installed (falls back to callbacks)
|
|
142
147
|
|
|
148
|
+
### CLI (v0.9.0)
|
|
149
|
+
- **Command Line Interface**: Use without writing Python code
|
|
150
|
+
- **Multiple Backends**: MLX (Apple Silicon) and OpenAI-compatible (OpenAI, LM Studio, Ollama)
|
|
151
|
+
- **Four Commands**: `generate`, `analyze` (dry-run), `resume`, `apply`
|
|
152
|
+
|
|
153
|
+
### Apply Mode (v1.0.0)
|
|
154
|
+
- **Apply Cleaning Functions**: Apply generated functions to full datasets
|
|
155
|
+
- **Data Formats**: JSONL, CSV, JSON, Parquet, Excel (.xlsx/.xls) output same format
|
|
156
|
+
- **Text Formats**: PDF, Word, HTML, etc. output as Markdown
|
|
157
|
+
- **Streaming**: Memory-efficient line-by-line processing for JSONL/CSV
|
|
158
|
+
- **Colored TUI**: Enhanced transmission log with syntax-highlighted XML parsing
|
|
159
|
+
|
|
160
|
+
## Command Line Interface
|
|
161
|
+
|
|
162
|
+
After installation, the `recursive-cleaner` command is available:
|
|
163
|
+
|
|
164
|
+
```bash
|
|
165
|
+
# Generate cleaning functions with MLX (Apple Silicon)
|
|
166
|
+
recursive-cleaner generate data.jsonl \
|
|
167
|
+
--provider mlx \
|
|
168
|
+
--model "lmstudio-community/Qwen3-80B-MLX-4bit" \
|
|
169
|
+
--instructions "Normalize phone numbers to E.164" \
|
|
170
|
+
--output cleaning_functions.py
|
|
171
|
+
|
|
172
|
+
# Use OpenAI
|
|
173
|
+
export OPENAI_API_KEY=your-key
|
|
174
|
+
recursive-cleaner generate data.jsonl \
|
|
175
|
+
--provider openai \
|
|
176
|
+
--model gpt-4o \
|
|
177
|
+
--instructions "Fix date formats"
|
|
178
|
+
|
|
179
|
+
# Use LM Studio or Ollama (OpenAI-compatible)
|
|
180
|
+
recursive-cleaner generate data.jsonl \
|
|
181
|
+
--provider openai \
|
|
182
|
+
--model "qwen/qwen3-vl-30b" \
|
|
183
|
+
--base-url http://localhost:1234/v1 \
|
|
184
|
+
--instructions "Normalize prices"
|
|
185
|
+
|
|
186
|
+
# Dry-run analysis
|
|
187
|
+
recursive-cleaner analyze data.jsonl \
|
|
188
|
+
--provider openai \
|
|
189
|
+
--model gpt-4o \
|
|
190
|
+
--instructions @instructions.txt
|
|
191
|
+
|
|
192
|
+
# Resume from checkpoint
|
|
193
|
+
recursive-cleaner resume cleaning_state.json \
|
|
194
|
+
--provider mlx \
|
|
195
|
+
--model "model-path"
|
|
196
|
+
|
|
197
|
+
# Apply cleaning functions to data
|
|
198
|
+
recursive-cleaner apply data.jsonl \
|
|
199
|
+
--functions cleaning_functions.py \
|
|
200
|
+
--output cleaned_data.jsonl
|
|
201
|
+
|
|
202
|
+
# Apply to Excel (outputs same format)
|
|
203
|
+
recursive-cleaner apply sales.xlsx \
|
|
204
|
+
--functions cleaning_functions.py
|
|
205
|
+
|
|
206
|
+
# Apply to PDF (outputs markdown)
|
|
207
|
+
recursive-cleaner apply document.pdf \
|
|
208
|
+
--functions cleaning_functions.py \
|
|
209
|
+
--output cleaned.md
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
### CLI Options
|
|
213
|
+
|
|
214
|
+
```
|
|
215
|
+
recursive-cleaner generate <FILE> [OPTIONS]
|
|
216
|
+
|
|
217
|
+
Required:
|
|
218
|
+
FILE Input data file
|
|
219
|
+
-p, --provider {mlx,openai} LLM provider
|
|
220
|
+
-m, --model MODEL Model name/path
|
|
221
|
+
|
|
222
|
+
Optional:
|
|
223
|
+
-i, --instructions TEXT Cleaning instructions (or @file.txt)
|
|
224
|
+
--base-url URL API URL for OpenAI-compatible servers
|
|
225
|
+
--chunk-size N Items per chunk (default: 50)
|
|
226
|
+
--max-iterations N Max iterations per chunk (default: 5)
|
|
227
|
+
-o, --output PATH Output file (default: cleaning_functions.py)
|
|
228
|
+
--tui Enable Rich dashboard
|
|
229
|
+
--optimize Consolidate redundant functions
|
|
230
|
+
--track-metrics Measure before/after quality
|
|
231
|
+
```
|
|
232
|
+
|
|
143
233
|
## Configuration
|
|
144
234
|
|
|
145
235
|
```python
|
|
@@ -270,6 +360,7 @@ cleaner.run()
|
|
|
270
360
|
|
|
271
361
|
```
|
|
272
362
|
recursive_cleaner/
|
|
363
|
+
├── cli.py # Command line interface
|
|
273
364
|
├── cleaner.py # Main DataCleaner class
|
|
274
365
|
├── context.py # Docstring registry with FIFO eviction
|
|
275
366
|
├── dependencies.py # Topological sort for function ordering
|
|
@@ -286,6 +377,10 @@ recursive_cleaner/
|
|
|
286
377
|
├── validation.py # Runtime validation + holdout
|
|
287
378
|
└── vendor/
|
|
288
379
|
└── chunker.py # Vendored sentence-aware chunker
|
|
380
|
+
|
|
381
|
+
backends/
|
|
382
|
+
├── mlx_backend.py # MLX-LM backend for Apple Silicon
|
|
383
|
+
└── openai_backend.py # OpenAI-compatible backend
|
|
289
384
|
```
|
|
290
385
|
|
|
291
386
|
## Testing
|
|
@@ -294,14 +389,14 @@ recursive_cleaner/
|
|
|
294
389
|
pytest tests/ -v
|
|
295
390
|
```
|
|
296
391
|
|
|
297
|
-
|
|
392
|
+
548 tests covering all features. Test datasets in `test_cases/`:
|
|
298
393
|
- E-commerce product catalogs
|
|
299
394
|
- Healthcare patient records
|
|
300
395
|
- Financial transaction data
|
|
301
396
|
|
|
302
397
|
## Philosophy
|
|
303
398
|
|
|
304
|
-
- **Simplicity over extensibility**: ~
|
|
399
|
+
- **Simplicity over extensibility**: ~5,000 lines that do one thing well
|
|
305
400
|
- **stdlib over dependencies**: Only `tenacity` required
|
|
306
401
|
- **Retry over recover**: On error, retry with error in prompt
|
|
307
402
|
- **Wu wei**: Let the LLM make decisions about data it understands
|
|
@@ -310,6 +405,7 @@ pytest tests/ -v
|
|
|
310
405
|
|
|
311
406
|
| Version | Features |
|
|
312
407
|
|---------|----------|
|
|
408
|
+
| v0.9.0 | CLI tool with MLX and OpenAI-compatible backends (LM Studio, Ollama) |
|
|
313
409
|
| v0.8.0 | Terminal UI with Rich dashboard, mission control aesthetic, transmission log |
|
|
314
410
|
| v0.7.0 | Markitdown (20+ formats), Parquet support, LLM-generated parsers |
|
|
315
411
|
| v0.6.0 | Latency metrics, import consolidation, cleaning report, dry-run mode |
|
|
@@ -1,7 +1,11 @@
|
|
|
1
|
-
backends/__init__.py,sha256=
|
|
1
|
+
backends/__init__.py,sha256=vWcPASV0GGEAydzOSjdrknkSHoGbSs4edtuv9HIzBhI,180
|
|
2
2
|
backends/mlx_backend.py,sha256=0U6IqmDHyk4vjKzytvEcQvSUBryQTgFtsNOcpwFNKk8,2945
|
|
3
|
-
|
|
4
|
-
recursive_cleaner/
|
|
3
|
+
backends/openai_backend.py,sha256=vKWsXKltBv_tJDoQfQ_7KVMZDfomhFFN2vl1oZ1KGbQ,2057
|
|
4
|
+
recursive_cleaner/__init__.py,sha256=xCFlkqmmBoa7ntUZQnRQxVMv9iLeOvmboDS_j2EHfZI,1862
|
|
5
|
+
recursive_cleaner/__main__.py,sha256=WXmMaL_myHPsG_qXAhZDufD43Ydsd25RV2IPeW2Kg08,152
|
|
6
|
+
recursive_cleaner/apply.py,sha256=hjeljhZNiOuwz9m09RYVLl_z_9tet7LwubH6cb_Wy6Y,13855
|
|
7
|
+
recursive_cleaner/cleaner.py,sha256=kPOQ44hgiJzABiqdmjg2hqd7Ot9uxKUSOe8_jz0UBQc,29911
|
|
8
|
+
recursive_cleaner/cli.py,sha256=Sk_qYKxSn1PiPmMLKkyj9VxsseHaSXmSlGazxfmkTFc,12807
|
|
5
9
|
recursive_cleaner/context.py,sha256=avMXRDxLd7nd8CKWtvPHQy1MFhBKiA0aUVVJIlWoLZ4,824
|
|
6
10
|
recursive_cleaner/dependencies.py,sha256=vlYeoGL517v3yUSWN0wYDuIs9OOuQwM_dCBADrlitW8,2080
|
|
7
11
|
recursive_cleaner/errors.py,sha256=hwRJF8NSmWy_FZHCxcZDZxLQ0zqvo5dX8ImkB9mrOYc,433
|
|
@@ -14,12 +18,13 @@ recursive_cleaner/prompt.py,sha256=ep0eOXz_XbhH3HduJ76LvzVSftonhcv4GLEecIqd3lY,6
|
|
|
14
18
|
recursive_cleaner/report.py,sha256=AWWneRjvl76ccLlExdkKJeY3GVFUG_LtmzVIJJT5cFI,4629
|
|
15
19
|
recursive_cleaner/response.py,sha256=3w0mLnqEPdB4daMSF0mtTcG0PTP-utb1HFtKuYA1ljw,9064
|
|
16
20
|
recursive_cleaner/schema.py,sha256=w2hcEdApR15KVI9SFWB3VfumMoHFwn1YJrktdfgPo8M,3925
|
|
17
|
-
recursive_cleaner/tui.py,sha256=
|
|
21
|
+
recursive_cleaner/tui.py,sha256=zuiFPtMh3K-sC1CWZoaoUmgZ3rESkl10gYcqMzpVqiM,22598
|
|
18
22
|
recursive_cleaner/types.py,sha256=-GdCmsfHd3rfdfCi5c-RXqX4TyuCSHgA__3AF3bMhoQ,290
|
|
19
23
|
recursive_cleaner/validation.py,sha256=-KAolhw3GQyhHwmh0clEj8xqPD5O-R2AO5rx7vubIME,6442
|
|
20
24
|
recursive_cleaner/vendor/__init__.py,sha256=E87TjmjRzu8ty39nqThvBwM611yXlLKQZ6KGY_zp3Dk,117
|
|
21
25
|
recursive_cleaner/vendor/chunker.py,sha256=pDDbfF6FoSmUji0-RG4MletPxJ-VybGw0yfnhh0aMSQ,6730
|
|
22
|
-
recursive_cleaner-0.
|
|
23
|
-
recursive_cleaner-0.
|
|
24
|
-
recursive_cleaner-0.
|
|
25
|
-
recursive_cleaner-0.
|
|
26
|
+
recursive_cleaner-1.0.0.dist-info/METADATA,sha256=L86ATNd8JxmPp32HKaO6PPwkmq4sIE3Mdvgx3pmUulE,14285
|
|
27
|
+
recursive_cleaner-1.0.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
28
|
+
recursive_cleaner-1.0.0.dist-info/entry_points.txt,sha256=S5nbi0rnifpShxdXGExeZnd65UZfp8K7DNyuKPST6nk,65
|
|
29
|
+
recursive_cleaner-1.0.0.dist-info/licenses/LICENSE,sha256=P8hRMK-UqRbQDsVN9nr901wpZcqwXEHr28DXhBUheF0,1064
|
|
30
|
+
recursive_cleaner-1.0.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|