recursive-cleaner 0.8.0-py3-none-any.whl → 1.0.1-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- backends/__init__.py +2 -1
- backends/openai_backend.py +71 -0
- recursive_cleaner/__init__.py +4 -1
- recursive_cleaner/__main__.py +8 -0
- recursive_cleaner/apply.py +483 -0
- recursive_cleaner/cleaner.py +27 -5
- recursive_cleaner/cli.py +395 -0
- recursive_cleaner/prompt.py +8 -4
- recursive_cleaner/tui.py +43 -24
- recursive_cleaner/validation.py +40 -1
- {recursive_cleaner-0.8.0.dist-info → recursive_cleaner-1.0.1.dist-info}/METADATA +100 -4
- {recursive_cleaner-0.8.0.dist-info → recursive_cleaner-1.0.1.dist-info}/RECORD +15 -10
- recursive_cleaner-1.0.1.dist-info/entry_points.txt +2 -0
- {recursive_cleaner-0.8.0.dist-info → recursive_cleaner-1.0.1.dist-info}/WHEEL +0 -0
- {recursive_cleaner-0.8.0.dist-info → recursive_cleaner-1.0.1.dist-info}/licenses/LICENSE +0 -0
backends/openai_backend.py ADDED
@@ -0,0 +1,71 @@
+"""OpenAI-compatible backend for Recursive Data Cleaner."""
+
+import os
+
+
+class OpenAIBackend:
+    """
+    OpenAI-compatible backend implementation.
+
+    Works with OpenAI API, LM Studio, Ollama, and other OpenAI-compatible servers.
+    Conforms to the LLMBackend protocol.
+    """
+
+    def __init__(
+        self,
+        model: str,
+        api_key: str | None = None,
+        base_url: str | None = None,
+        max_tokens: int = 4096,
+        temperature: float = 0.7,
+    ):
+        """
+        Initialize the OpenAI backend.
+
+        Args:
+            model: Model name (e.g., "gpt-4o", "gpt-3.5-turbo")
+            api_key: API key (defaults to OPENAI_API_KEY env var, or "not-needed" for local)
+            base_url: API base URL (defaults to OpenAI's API)
+            max_tokens: Maximum tokens to generate
+            temperature: Sampling temperature
+        """
+        try:
+            import openai
+        except ImportError:
+            raise ImportError(
+                "OpenAI SDK not installed. Install with: pip install openai"
+            )
+
+        self.model = model
+        self.max_tokens = max_tokens
+        self.temperature = temperature
+
+        # Resolve API key: explicit > env var > "not-needed" for local servers
+        if api_key is not None:
+            resolved_key = api_key
+        else:
+            resolved_key = os.environ.get("OPENAI_API_KEY", "not-needed")
+
+        # Create client
+        self._client = openai.OpenAI(
+            api_key=resolved_key,
+            base_url=base_url,
+        )
+
+    def generate(self, prompt: str) -> str:
+        """
+        Generate a response from the LLM.
+
+        Args:
+            prompt: The input prompt
+
+        Returns:
+            The generated text response
+        """
+        response = self._client.chat.completions.create(
+            model=self.model,
+            messages=[{"role": "user", "content": prompt}],
+            max_tokens=self.max_tokens,
+            temperature=self.temperature,
+        )
+        return response.choices[0].message.content or ""
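
The backend resolves credentials in a fixed order (explicit argument, then OPENAI_API_KEY, then the "not-needed" placeholder), so the same class covers hosted and local servers. A minimal usage sketch, assuming the module path from the wheel layout above; the model name and local URL are placeholders, not values from this release:

from backends.openai_backend import OpenAIBackend

# Local OpenAI-compatible server (LM Studio, Ollama, ...). With no explicit
# key and no OPENAI_API_KEY set, the backend falls back to "not-needed".
backend = OpenAIBackend(
    model="qwen2.5-7b-instruct",          # placeholder model name
    base_url="http://localhost:1234/v1",  # placeholder local endpoint
    temperature=0.2,
)
print(backend.generate("Reply OK if you can read this."))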
recursive_cleaner/__init__.py CHANGED
@@ -1,5 +1,6 @@
 """Recursive Data Cleaner - LLM-powered incremental data cleaning pipeline."""
 
+from recursive_cleaner.apply import apply_cleaning
 from recursive_cleaner.cleaner import DataCleaner
 from recursive_cleaner.context import build_context
 from recursive_cleaner.dependencies import resolve_dependencies
@@ -21,9 +22,10 @@ from recursive_cleaner.prompt import build_prompt
 from recursive_cleaner.response import extract_python_block, parse_response
 from recursive_cleaner.parser_generator import check_parser_safety, generate_parser
 from recursive_cleaner.tui import HAS_RICH, TUIRenderer
-from recursive_cleaner.validation import check_code_safety, extract_sample_data, validate_function
+from recursive_cleaner.validation import check_code_safety, extract_modified_fields, extract_sample_data, validate_function
 
 __all__ = [
+    "apply_cleaning",
     "CleanerError",
     "ParseError",
     "MaxIterationsError",
@@ -41,6 +43,7 @@ __all__ = [
     "validate_function",
     "extract_sample_data",
     "check_code_safety",
+    "extract_modified_fields",
     "resolve_dependencies",
     "QualityMetrics",
     "measure_quality",
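
With these exports, both halves of the pipeline are reachable from the package root; a sketch of the new import surface:

# Previously only the generation side was exported at the top level; 1.0.1
# adds the apply step and the field extractor to the public API.
from recursive_cleaner import DataCleaner, apply_cleaning, extract_modified_fields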
recursive_cleaner/apply.py ADDED
@@ -0,0 +1,483 @@
+"""Apply cleaning functions to data files."""
+
+import csv
+import importlib.util
+import json
+from pathlib import Path
+from typing import Callable
+
+from .parsers import MARKITDOWN_EXTENSIONS
+
+# Text formats that should be converted to markdown (excludes spreadsheets)
+TEXT_MARKITDOWN_EXTENSIONS = MARKITDOWN_EXTENSIONS - {".xlsx", ".xls", ".ods"}
+
+
+def load_cleaning_module(functions_path: str):
+    """
+    Dynamically import a cleaning_functions.py file.
+
+    Args:
+        functions_path: Path to the cleaning functions file
+
+    Returns:
+        The imported module
+
+    Raises:
+        FileNotFoundError: If the functions file doesn't exist
+        ImportError: If the module cannot be imported
+    """
+    path = Path(functions_path)
+    if not path.exists():
+        raise FileNotFoundError(f"Functions file not found: {functions_path}")
+
+    spec = importlib.util.spec_from_file_location("cleaning_module", path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load module from: {functions_path}")
+
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+def get_default_output_path(input_path: str, force_ext: str | None = None) -> str:
+    """
+    Generate default output path: input.cleaned.ext
+
+    Args:
+        input_path: Path to the input file
+        force_ext: Override the output extension (e.g., ".xlsx" for .xls files)
+
+    Returns:
+        Path string for the output file
+    """
+    path = Path(input_path)
+    suffix = path.suffix.lower()
+
+    if force_ext:
+        ext = force_ext
+    elif suffix == ".xls":
+        # .xls files are written as .xlsx
+        ext = ".xlsx"
+    elif suffix == ".txt" or suffix in TEXT_MARKITDOWN_EXTENSIONS:
+        # Text formats output as markdown
+        ext = ".md"
+    else:
+        ext = path.suffix
+
+    return str(path.with_suffix(f".cleaned{ext}"))
+
+
+def apply_to_jsonl(
+    input_path: str,
+    output_path: str,
+    clean_fn: Callable,
+    on_progress: Callable[[dict], None] | None = None,
+) -> int:
+    """
+    Stream JSONL: read line, clean, write line.
+
+    Args:
+        input_path: Path to input JSONL file
+        output_path: Path for output JSONL file
+        clean_fn: Cleaning function to apply to each record
+        on_progress: Optional progress callback
+
+    Returns:
+        Number of records processed
+    """
+    records_processed = 0
+
+    with open(input_path, "r", encoding="utf-8") as infile, \
+         open(output_path, "w", encoding="utf-8") as outfile:
+        for line in infile:
+            line = line.strip()
+            if not line:
+                continue
+
+            record = json.loads(line)
+            cleaned = clean_fn(record)
+            outfile.write(json.dumps(cleaned) + "\n")
+
+            records_processed += 1
+            if on_progress:
+                on_progress({"type": "apply_progress", "records_processed": records_processed})
+
+    return records_processed
+
+
+def apply_to_csv(
+    input_path: str,
+    output_path: str,
+    clean_fn: Callable,
+    on_progress: Callable[[dict], None] | None = None,
+) -> int:
+    """
+    Stream CSV: DictReader to clean each row, DictWriter to output.
+
+    Args:
+        input_path: Path to input CSV file
+        output_path: Path for output CSV file
+        clean_fn: Cleaning function to apply to each record
+        on_progress: Optional progress callback
+
+    Returns:
+        Number of records processed
+    """
+    records_processed = 0
+
+    with open(input_path, "r", encoding="utf-8", newline="") as infile:
+        reader = csv.DictReader(infile)
+        fieldnames = reader.fieldnames
+
+        if not fieldnames:
+            return 0
+
+        with open(output_path, "w", encoding="utf-8", newline="") as outfile:
+            writer = csv.DictWriter(outfile, fieldnames=fieldnames)
+            writer.writeheader()
+
+            for row in reader:
+                cleaned = clean_fn(row)
+                writer.writerow(cleaned)
+
+                records_processed += 1
+                if on_progress:
+                    on_progress({"type": "apply_progress", "records_processed": records_processed})
+
+    return records_processed
+
+
+def apply_to_json(
+    input_path: str,
+    output_path: str,
+    clean_fn: Callable,
+    on_progress: Callable[[dict], None] | None = None,
+) -> int:
+    """
+    Batch JSON array: load all, clean each, write array.
+
+    Args:
+        input_path: Path to input JSON file
+        output_path: Path for output JSON file
+        clean_fn: Cleaning function to apply to each record
+        on_progress: Optional progress callback
+
+    Returns:
+        Number of records processed
+    """
+    with open(input_path, "r", encoding="utf-8") as f:
+        data = json.load(f)
+
+    if not isinstance(data, list):
+        # Single object - wrap, clean, unwrap
+        cleaned = clean_fn(data)
+        with open(output_path, "w", encoding="utf-8") as f:
+            json.dump(cleaned, f, indent=2)
+        if on_progress:
+            on_progress({"type": "apply_progress", "records_processed": 1})
+        return 1
+
+    cleaned_data = []
+    for i, record in enumerate(data):
+        cleaned = clean_fn(record)
+        cleaned_data.append(cleaned)
+
+        if on_progress:
+            on_progress({"type": "apply_progress", "records_processed": i + 1})
+
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(cleaned_data, f, indent=2)
+
+    return len(cleaned_data)
+
+
+def apply_to_parquet(
+    input_path: str,
+    output_path: str,
+    clean_fn: Callable,
+    on_progress: Callable[[dict], None] | None = None,
+) -> int:
+    """
+    Batch Parquet: load as list of dicts, clean each, write back.
+
+    Args:
+        input_path: Path to input Parquet file
+        output_path: Path for output Parquet file
+        clean_fn: Cleaning function to apply to each record
+        on_progress: Optional progress callback
+
+    Returns:
+        Number of records processed
+
+    Raises:
+        ImportError: If pyarrow is not installed
+    """
+    try:
+        import pyarrow as pa
+        import pyarrow.parquet as pq
+    except ImportError:
+        raise ImportError(
+            "pyarrow is required for parquet files. "
+            "Install with: pip install recursive-cleaner[parquet]"
+        )
+
+    table = pq.read_table(input_path)
+    records = table.to_pylist()
+
+    cleaned_data = []
+    for i, record in enumerate(records):
+        cleaned = clean_fn(record)
+        cleaned_data.append(cleaned)
+
+        if on_progress:
+            on_progress({"type": "apply_progress", "records_processed": i + 1})
+
+    # Write back as parquet
+    cleaned_table = pa.Table.from_pylist(cleaned_data)
+    pq.write_table(cleaned_table, output_path)
+
+    return len(cleaned_data)
+
+
+def apply_to_excel(
+    input_path: str,
+    output_path: str,
+    clean_fn: Callable,
+    on_progress: Callable[[dict], None] | None = None,
+) -> int:
+    """
+    Batch Excel: load as list of dicts, clean each, write back.
+
+    Args:
+        input_path: Path to input Excel file (.xlsx or .xls)
+        output_path: Path for output Excel file (.xlsx)
+        clean_fn: Cleaning function to apply to each record
+        on_progress: Optional progress callback
+
+    Returns:
+        Number of records processed
+
+    Raises:
+        ImportError: If openpyxl (or xlrd for .xls) is not installed
+    """
+    suffix = Path(input_path).suffix.lower()
+
+    if suffix == ".xls":
+        # Use xlrd for .xls files
+        try:
+            import xlrd
+        except ImportError:
+            raise ImportError(
+                "xlrd is required for .xls files. "
+                "Install with: pip install recursive-cleaner[excel]"
+            )
+
+        workbook = xlrd.open_workbook(input_path)
+        sheet = workbook.sheet_by_index(0)
+
+        if sheet.nrows < 1:
+            return 0
+
+        # First row is headers
+        headers = [str(sheet.cell_value(0, col)) for col in range(sheet.ncols)]
+        records = []
+        for row_idx in range(1, sheet.nrows):
+            row_data = {}
+            for col_idx, header in enumerate(headers):
+                row_data[header] = sheet.cell_value(row_idx, col_idx)
+            records.append(row_data)
+    else:
+        # Use openpyxl for .xlsx files
+        try:
+            from openpyxl import load_workbook
+        except ImportError:
+            raise ImportError(
+                "openpyxl is required for .xlsx files. "
+                "Install with: pip install recursive-cleaner[excel]"
+            )
+
+        workbook = load_workbook(input_path, read_only=True)
+        sheet = workbook.active
+
+        rows = list(sheet.iter_rows(values_only=True))
+        if not rows:
+            return 0
+
+        # First row is headers
+        headers = [str(h) if h is not None else "" for h in rows[0]]
+        records = []
+        for row in rows[1:]:
+            row_data = {}
+            for col_idx, header in enumerate(headers):
+                value = row[col_idx] if col_idx < len(row) else None
+                row_data[header] = value
+            records.append(row_data)
+
+        workbook.close()
+
+    # Clean records
+    cleaned_data = []
+    for i, record in enumerate(records):
+        cleaned = clean_fn(record)
+        cleaned_data.append(cleaned)
+
+        if on_progress:
+            on_progress({"type": "apply_progress", "records_processed": i + 1})
+
+    # Write back as xlsx using openpyxl
+    try:
+        from openpyxl import Workbook
+    except ImportError:
+        raise ImportError(
+            "openpyxl is required for writing Excel files. "
+            "Install with: pip install recursive-cleaner[excel]"
+        )
+
+    wb = Workbook()
+    ws = wb.active
+
+    if cleaned_data:
+        # Write headers
+        fieldnames = list(cleaned_data[0].keys())
+        ws.append(fieldnames)
+
+        # Write data rows
+        for record in cleaned_data:
+            ws.append([record.get(k) for k in fieldnames])
+
+    wb.save(output_path)
+
+    return len(cleaned_data)
+
+
+def apply_to_text(
+    input_path: str,
+    output_path: str,
+    clean_fn: Callable,
+    on_progress: Callable[[dict], None] | None = None,
+) -> int:
+    """
+    Process text/document files: extract text, clean, write as markdown.
+
+    Args:
+        input_path: Path to input file (.txt or markitdown format)
+        output_path: Path for output markdown file
+        clean_fn: Cleaning function to apply to the text
+        on_progress: Optional progress callback
+
+    Returns:
+        Number of records processed (always 1 for text)
+
+    Raises:
+        ImportError: If markitdown is not installed (for non-.txt files)
+    """
+    suffix = Path(input_path).suffix.lower()
+
+    if suffix == ".txt":
+        # Plain text - read directly
+        with open(input_path, "r", encoding="utf-8") as f:
+            content = f.read()
+    else:
+        # Use markitdown for other formats
+        try:
+            from markitdown import MarkItDown
+        except ImportError:
+            raise ImportError(
+                "markitdown is required for this file type. "
+                "Install with: pip install recursive-cleaner[markitdown]"
+            )
+
+        md = MarkItDown()
+        result = md.convert(input_path)
+        content = result.text_content
+
+    # Clean the text content
+    cleaned = clean_fn(content)
+
+    # Write as markdown
+    with open(output_path, "w", encoding="utf-8") as f:
+        f.write(cleaned)
+
+    if on_progress:
+        on_progress({"type": "apply_progress", "records_processed": 1})
+
+    return 1
+
+
+def apply_cleaning(
+    input_path: str,
+    functions_path: str,
+    output_path: str | None = None,
+    on_progress: Callable[[dict], None] | None = None,
+) -> str:
+    """
+    Apply cleaning functions to a data file.
+
+    Args:
+        input_path: Path to input data file
+        functions_path: Path to cleaning_functions.py
+        output_path: Path for output file (default: input.cleaned.ext)
+        on_progress: Optional progress callback
+
+    Returns:
+        Path to output file
+
+    Raises:
+        FileNotFoundError: If input or functions file not found
+        ImportError: If functions file cannot be imported
+        ValueError: If input format is unsupported
+    """
+    # Validate input file exists
+    if not Path(input_path).exists():
+        raise FileNotFoundError(f"Input file not found: {input_path}")
+
+    # Load cleaning module
+    module = load_cleaning_module(functions_path)
+
+    # Get the clean_data function
+    if not hasattr(module, "clean_data"):
+        raise ImportError(f"Functions file missing clean_data() function: {functions_path}")
+
+    clean_fn = module.clean_data
+
+    # Determine output path
+    suffix = Path(input_path).suffix.lower()
+    if output_path is None:
+        output_path = get_default_output_path(input_path)
+
+    # Route by format
+    format_handlers = {
+        ".jsonl": apply_to_jsonl,
+        ".csv": apply_to_csv,
+        ".json": apply_to_json,
+        ".parquet": apply_to_parquet,
+        ".xlsx": apply_to_excel,
+        ".xls": apply_to_excel,
+    }
+
+    handler = format_handlers.get(suffix)
+
+    # Check for text formats (.txt and markitdown extensions, excluding spreadsheets)
+    if handler is None:
+        if suffix == ".txt" or suffix in TEXT_MARKITDOWN_EXTENSIONS:
+            handler = apply_to_text
+
+    if handler is None:
+        raise ValueError(f"Unsupported format: {suffix}")
+
+    # Emit start event (total_records unknown for streaming formats)
+    if on_progress:
+        on_progress({"type": "apply_start", "total_records": None})
+
+    # Apply cleaning
+    total_records = handler(input_path, output_path, clean_fn, on_progress)
+
+    # Emit complete event
+    if on_progress:
+        on_progress({
+            "type": "apply_complete",
+            "total_records": total_records,
+            "output_path": output_path,
+        })
+
+    return output_path
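
Dispatch in apply_cleaning() is purely extension-based, so a full round trip needs only a data file plus a cleaning_functions.py exposing clean_data(). A sketch of the call pattern under assumed file names; the callback mirrors the apply_start/apply_progress/apply_complete events emitted above:

from recursive_cleaner.apply import apply_cleaning

# cleaning_functions.py is normally written by DataCleaner; any module with a
# clean_data(record) function satisfies the loader.
def report(event: dict) -> None:
    if event["type"] == "apply_complete":
        print(f"{event['total_records']} records -> {event['output_path']}")

# With output_path omitted, a users.jsonl input is written to
# users.cleaned.jsonl via get_default_output_path().
result = apply_cleaning("users.jsonl", "cleaning_functions.py", on_progress=report)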
recursive_cleaner/cleaner.py CHANGED
@@ -17,7 +17,7 @@ from .prompt import build_prompt
 from .response import parse_response
 from .schema import format_schema_for_prompt, infer_schema
 from .types import LLMBackend
-from .validation import check_code_safety, extract_sample_data, split_holdout, validate_function
+from .validation import check_code_safety, extract_modified_fields, extract_sample_data, split_holdout, validate_function
 
 STATE_VERSION = "0.5.0"
 
@@ -63,6 +63,7 @@ class DataCleaner:
         dry_run: bool = False,
         auto_parse: bool = False,
         tui: bool = False,
+        output_path: str = "cleaning_functions.py",
     ):
         self.backend = llm_backend
         self.file_path = file_path
@@ -88,6 +89,7 @@ class DataCleaner:
         self.dry_run = dry_run
         self.auto_parse = auto_parse
         self.tui = tui
+        self.output_path = output_path
         self.functions: list[dict] = []  # List of {name, docstring, code}
         self._tui_renderer = None  # TUIRenderer instance when tui=True
         self._generated_parser: callable | None = None  # LLM-generated parser for unknown formats
@@ -108,6 +110,8 @@ class DataCleaner:
             "min_ms": float("inf"),
             "max_ms": 0.0,
         }
+        # Track fields already covered by generated functions (per chunk)
+        self._fields_covered: set[str] = set()
 
     def _emit(self, event_type: str, chunk_index: int = 0, **kwargs) -> None:
         """Emit a progress event to the callback, if set."""
@@ -520,7 +524,7 @@ class DataCleaner:
                 "quality_delta": 0.0,  # Could be calculated from metrics
                 "latency_total_ms": latency_summary.get("total_ms", 0.0),
                 "llm_calls": latency_summary.get("call_count", 0),
-                "output_file":
+                "output_file": self.output_path,
             })
             self._tui_renderer.stop()
 
@@ -531,6 +535,8 @@ class DataCleaner:
         """Process a single chunk, iterating until clean or max iterations."""
         self._emit("chunk_start", chunk_index=chunk_idx)
         error_feedback = ""
+        # Reset fields covered for new chunk
+        self._fields_covered = set()
 
         # Dry run mode: just detect issues, don't generate functions
         if self.dry_run:
@@ -592,6 +598,20 @@ class DataCleaner:
                 print(f" Safety check failed: {safety_error}")
                 continue
 
+            # Check for duplicate field coverage
+            new_fields = extract_modified_fields(result["code"])
+            overlap = new_fields & self._fields_covered
+            if overlap:
+                field_list = ", ".join(sorted(overlap))
+                error_feedback = f"You already generated a function for field(s): {field_list}. This issue is solved. Move on to the next unsolved issue."
+                self._emit(
+                    "duplicate_field",
+                    chunk_index=chunk_idx,
+                    function_name=result["name"],
+                    fields=list(overlap),
+                )
+                continue
+
             # Runtime validation if enabled
             if self.validate_runtime:
                 # Use holdout data if available, else sample from generation chunk
@@ -626,6 +646,8 @@ class DataCleaner:
                 "docstring": result["docstring"],
                 "code": result["code"],
             })
+            # Track fields covered by this function
+            self._fields_covered.update(new_fields)
             # Track for saturation check
             self._recent_new_function_count += 1
 
@@ -687,11 +709,11 @@ class DataCleaner:
         self._emit("chunk_done", chunk_index=chunk_idx)
 
     def _write_output(self) -> None:
-        """Write generated functions to
+        """Write generated functions to output file."""
         from .output import write_cleaning_file
 
         try:
-            write_cleaning_file(self.functions)
+            write_cleaning_file(self.functions, self.output_path)
         except OutputValidationError as e:
             if not self.tui:
                 print(f" Error: {e}")
@@ -707,7 +729,7 @@ class DataCleaner:
             if not self.tui:
                 print(f" Skipping invalid function: {f['name']}")
         if valid_functions:
-            write_cleaning_file(valid_functions)
+            write_cleaning_file(valid_functions, self.output_path)
         elif not self.tui:
             print(" No valid functions to write.")
 
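
The duplicate-field guard above converts repeated LLM output into corrective feedback instead of redundant functions. A standalone sketch of that loop; extract_modified_fields ships in validation.py, whose diff is not part of this excerpt, so a simplified stand-in is used here:

import re

def extract_modified_fields(code: str) -> set[str]:
    # Simplified stand-in: collect dict keys the generated function assigns to,
    # e.g. record["email"] = ... -> {"email"}.
    return set(re.findall(r"record\[[\"'](\w+)[\"']\]\s*=", code))

fields_covered: set[str] = set()
candidates = [
    'def f1(record):\n    record["email"] = record["email"].strip()\n    return record',
    'def f2(record):\n    record["email"] = record["email"].lower()\n    return record',
]
for code in candidates:
    new_fields = extract_modified_fields(code)
    overlap = new_fields & fields_covered
    if overlap:
        # In cleaner.py this message becomes error_feedback for the next prompt.
        print("already solved:", ", ".join(sorted(overlap)))
        continue
    fields_covered.update(new_fields)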