recursive-cleaner 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- recursive_cleaner/__init__.py +2 -1
- recursive_cleaner/cleaner.py +21 -1
- recursive_cleaner/prompt.py +8 -4
- recursive_cleaner/validation.py +40 -1
- {recursive_cleaner-1.0.0.dist-info → recursive_cleaner-1.0.1.dist-info}/METADATA +2 -2
- {recursive_cleaner-1.0.0.dist-info → recursive_cleaner-1.0.1.dist-info}/RECORD +9 -9
- {recursive_cleaner-1.0.0.dist-info → recursive_cleaner-1.0.1.dist-info}/WHEEL +0 -0
- {recursive_cleaner-1.0.0.dist-info → recursive_cleaner-1.0.1.dist-info}/entry_points.txt +0 -0
- {recursive_cleaner-1.0.0.dist-info → recursive_cleaner-1.0.1.dist-info}/licenses/LICENSE +0 -0
recursive_cleaner/__init__.py
CHANGED
|
@@ -22,7 +22,7 @@ from recursive_cleaner.prompt import build_prompt
|
|
|
22
22
|
from recursive_cleaner.response import extract_python_block, parse_response
|
|
23
23
|
from recursive_cleaner.parser_generator import check_parser_safety, generate_parser
|
|
24
24
|
from recursive_cleaner.tui import HAS_RICH, TUIRenderer
|
|
25
|
-
from recursive_cleaner.validation import check_code_safety, extract_sample_data, validate_function
|
|
25
|
+
from recursive_cleaner.validation import check_code_safety, extract_modified_fields, extract_sample_data, validate_function
|
|
26
26
|
|
|
27
27
|
__all__ = [
|
|
28
28
|
"apply_cleaning",
|
|
@@ -43,6 +43,7 @@ __all__ = [
|
|
|
43
43
|
"validate_function",
|
|
44
44
|
"extract_sample_data",
|
|
45
45
|
"check_code_safety",
|
|
46
|
+
"extract_modified_fields",
|
|
46
47
|
"resolve_dependencies",
|
|
47
48
|
"QualityMetrics",
|
|
48
49
|
"measure_quality",
|
recursive_cleaner/cleaner.py
CHANGED
|
@@ -17,7 +17,7 @@ from .prompt import build_prompt
|
|
|
17
17
|
from .response import parse_response
|
|
18
18
|
from .schema import format_schema_for_prompt, infer_schema
|
|
19
19
|
from .types import LLMBackend
|
|
20
|
-
from .validation import check_code_safety, extract_sample_data, split_holdout, validate_function
|
|
20
|
+
from .validation import check_code_safety, extract_modified_fields, extract_sample_data, split_holdout, validate_function
|
|
21
21
|
|
|
22
22
|
STATE_VERSION = "0.5.0"
|
|
23
23
|
|
|
@@ -110,6 +110,8 @@ class DataCleaner:
|
|
|
110
110
|
"min_ms": float("inf"),
|
|
111
111
|
"max_ms": 0.0,
|
|
112
112
|
}
|
|
113
|
+
# Track fields already covered by generated functions (per chunk)
|
|
114
|
+
self._fields_covered: set[str] = set()
|
|
113
115
|
|
|
114
116
|
def _emit(self, event_type: str, chunk_index: int = 0, **kwargs) -> None:
|
|
115
117
|
"""Emit a progress event to the callback, if set."""
|
|
@@ -533,6 +535,8 @@ class DataCleaner:
|
|
|
533
535
|
"""Process a single chunk, iterating until clean or max iterations."""
|
|
534
536
|
self._emit("chunk_start", chunk_index=chunk_idx)
|
|
535
537
|
error_feedback = ""
|
|
538
|
+
# Reset fields covered for new chunk
|
|
539
|
+
self._fields_covered = set()
|
|
536
540
|
|
|
537
541
|
# Dry run mode: just detect issues, don't generate functions
|
|
538
542
|
if self.dry_run:
|
|
@@ -594,6 +598,20 @@ class DataCleaner:
|
|
|
594
598
|
print(f" Safety check failed: {safety_error}")
|
|
595
599
|
continue
|
|
596
600
|
|
|
601
|
+
# Check for duplicate field coverage
|
|
602
|
+
new_fields = extract_modified_fields(result["code"])
|
|
603
|
+
overlap = new_fields & self._fields_covered
|
|
604
|
+
if overlap:
|
|
605
|
+
field_list = ", ".join(sorted(overlap))
|
|
606
|
+
error_feedback = f"You already generated a function for field(s): {field_list}. This issue is solved. Move on to the next unsolved issue."
|
|
607
|
+
self._emit(
|
|
608
|
+
"duplicate_field",
|
|
609
|
+
chunk_index=chunk_idx,
|
|
610
|
+
function_name=result["name"],
|
|
611
|
+
fields=list(overlap),
|
|
612
|
+
)
|
|
613
|
+
continue
|
|
614
|
+
|
|
597
615
|
# Runtime validation if enabled
|
|
598
616
|
if self.validate_runtime:
|
|
599
617
|
# Use holdout data if available, else sample from generation chunk
|
|
@@ -628,6 +646,8 @@ class DataCleaner:
|
|
|
628
646
|
"docstring": result["docstring"],
|
|
629
647
|
"code": result["code"],
|
|
630
648
|
})
|
|
649
|
+
# Track fields covered by this function
|
|
650
|
+
self._fields_covered.update(new_fields)
|
|
631
651
|
# Track for saturation check
|
|
632
652
|
self._recent_new_function_count += 1
|
|
633
653
|
|
recursive_cleaner/prompt.py
CHANGED
|
@@ -52,7 +52,8 @@ CONSOLIDATION_TEMPLATE = '''You are reviewing cleaning functions for consolidati
|
|
|
52
52
|
</docstring>
|
|
53
53
|
<code>
|
|
54
54
|
```python
|
|
55
|
-
def merged_function_name(record):
|
|
55
|
+
def merged_function_name(record: dict) -> dict:
|
|
56
|
+
# Modify fields, return record
|
|
56
57
|
...
|
|
57
58
|
```
|
|
58
59
|
</code>
|
|
@@ -108,9 +109,10 @@ Tags: domain, action, detail
|
|
|
108
109
|
</docstring>
|
|
109
110
|
<code>
|
|
110
111
|
```python
|
|
111
|
-
def function_name(
|
|
112
|
-
#
|
|
113
|
-
|
|
112
|
+
def function_name(record: dict) -> dict:
|
|
113
|
+
# Modify field(s) in the record
|
|
114
|
+
record["field"] = cleaned_value
|
|
115
|
+
return record
|
|
114
116
|
```
|
|
115
117
|
</code>
|
|
116
118
|
</function_to_generate>
|
|
@@ -120,6 +122,8 @@ def function_name(data):
|
|
|
120
122
|
|
|
121
123
|
RULES:
|
|
122
124
|
- ONE function per response
|
|
125
|
+
- Function signature: takes a dict (one record), returns the modified dict
|
|
126
|
+
- Modify fields directly on the record, then return it
|
|
123
127
|
- If all issues solved: <chunk_status>clean</chunk_status>, omit <function_to_generate>
|
|
124
128
|
- Include imports inside the function or document needed imports in docstring
|
|
125
129
|
- Function must be idempotent (safe to run multiple times)
|
recursive_cleaner/validation.py
CHANGED
|
@@ -160,7 +160,10 @@ def validate_function(
|
|
|
160
160
|
# Structured mode: sample_data is list[dict]
|
|
161
161
|
for i, record in enumerate(sample_data):
|
|
162
162
|
try:
|
|
163
|
-
func(record)
|
|
163
|
+
result = func(record)
|
|
164
|
+
# Verify function returns a dict (not string, int, etc.)
|
|
165
|
+
if not isinstance(result, dict):
|
|
166
|
+
return False, f"Function must return dict, got {type(result).__name__}"
|
|
164
167
|
except Exception as e:
|
|
165
168
|
return False, f"Runtime error on sample {i}: {type(e).__name__}: {e}"
|
|
166
169
|
|
|
@@ -200,3 +203,39 @@ def extract_sample_data(
|
|
|
200
203
|
except json.JSONDecodeError:
|
|
201
204
|
continue
|
|
202
205
|
return samples
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def extract_modified_fields(code: str) -> set[str]:
    """
    Extract field names that are modified via record["field"] = ... pattern.

    Recognises plain, augmented (``record["n"] += 1``) and annotated
    assignments, including subscripts nested inside tuple/list unpacking
    targets. The subscripted name must be ``record``, ``data``, or the first
    positional parameter of a top-level function defined in ``code``.

    Args:
        code: Python source code of the function

    Returns:
        Set of field names that are assigned to. Empty when ``code`` cannot
        be parsed or contains no matching assignment.
    """
    try:
        tree = ast.parse(code)
    except (SyntaxError, ValueError):
        # ValueError: some Python versions reject null bytes in source this
        # way instead of raising SyntaxError.
        return set()

    # Common parameter names for the data/record argument, kept so existing
    # callers see at least the fields they saw before.
    data_names = {"record", "data"}
    # Also accept whatever the generated function actually calls its first
    # positional parameter (e.g. ``def clean(row): row["x"] = ...``).
    for stmt in tree.body:
        if isinstance(stmt, (ast.FunctionDef, ast.AsyncFunctionDef)):
            params = stmt.args.posonlyargs + stmt.args.args
            if params:
                data_names.add(params[0].arg)

    fields: set[str] = set()
    for node in ast.walk(tree):
        if isinstance(node, ast.Assign):
            pending = list(node.targets)
        elif isinstance(node, ast.AugAssign):
            pending = [node.target]
        elif isinstance(node, ast.AnnAssign) and node.value is not None:
            # A bare annotation (no value) does not assign anything.
            pending = [node.target]
        else:
            continue

        while pending:
            target = pending.pop()
            if isinstance(target, (ast.Tuple, ast.List)):
                # Unpacking: record["a"], record["b"] = ...
                pending.extend(target.elts)
                continue
            if not isinstance(target, ast.Subscript):
                continue
            owner, key = target.value, target.slice
            # Only literal string keys on a known data name count as a field.
            if (
                isinstance(owner, ast.Name)
                and owner.id in data_names
                and isinstance(key, ast.Constant)
                and isinstance(key.value, str)
            ):
                fields.add(key.value)

    return fields
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: recursive-cleaner
|
|
3
|
-
Version: 1.0.0
|
|
3
|
+
Version: 1.0.1
|
|
4
4
|
Summary: LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions
|
|
5
5
|
Project-URL: Homepage, https://github.com/gaztrabisme/recursive-data-cleaner
|
|
6
6
|
Project-URL: Repository, https://github.com/gaztrabisme/recursive-data-cleaner
|
|
@@ -389,7 +389,7 @@ backends/
|
|
|
389
389
|
pytest tests/ -v
|
|
390
390
|
```
|
|
391
391
|
|
|
392
|
-
|
|
392
|
+
555 tests covering all features. Test datasets in `test_cases/`:
|
|
393
393
|
- E-commerce product catalogs
|
|
394
394
|
- Healthcare patient records
|
|
395
395
|
- Financial transaction data
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
backends/__init__.py,sha256=vWcPASV0GGEAydzOSjdrknkSHoGbSs4edtuv9HIzBhI,180
|
|
2
2
|
backends/mlx_backend.py,sha256=0U6IqmDHyk4vjKzytvEcQvSUBryQTgFtsNOcpwFNKk8,2945
|
|
3
3
|
backends/openai_backend.py,sha256=vKWsXKltBv_tJDoQfQ_7KVMZDfomhFFN2vl1oZ1KGbQ,2057
|
|
4
|
-
recursive_cleaner/__init__.py,sha256
|
|
4
|
+
recursive_cleaner/__init__.py,sha256=-NesTf9deCVOxkadFuyfVl-IjfbEHlYcMNAaAW9kUuw,1918
|
|
5
5
|
recursive_cleaner/__main__.py,sha256=WXmMaL_myHPsG_qXAhZDufD43Ydsd25RV2IPeW2Kg08,152
|
|
6
6
|
recursive_cleaner/apply.py,sha256=hjeljhZNiOuwz9m09RYVLl_z_9tet7LwubH6cb_Wy6Y,13855
|
|
7
|
-
recursive_cleaner/cleaner.py,sha256=
|
|
7
|
+
recursive_cleaner/cleaner.py,sha256=lLe7LNaVYwukDhBTxLs8ezsQf7fes9m9OX7g9nGo760,30954
|
|
8
8
|
recursive_cleaner/cli.py,sha256=Sk_qYKxSn1PiPmMLKkyj9VxsseHaSXmSlGazxfmkTFc,12807
|
|
9
9
|
recursive_cleaner/context.py,sha256=avMXRDxLd7nd8CKWtvPHQy1MFhBKiA0aUVVJIlWoLZ4,824
|
|
10
10
|
recursive_cleaner/dependencies.py,sha256=vlYeoGL517v3yUSWN0wYDuIs9OOuQwM_dCBADrlitW8,2080
|
|
@@ -14,17 +14,17 @@ recursive_cleaner/optimizer.py,sha256=lnQC9Y1ClkW4po1eYa2bnYYu4smiDuUpMPPX6EN1UQ
|
|
|
14
14
|
recursive_cleaner/output.py,sha256=quTlZYtKZm9h37mbnwQmEjg0q8VQSZWEqwaHfhSAd3s,6106
|
|
15
15
|
recursive_cleaner/parser_generator.py,sha256=enn6_okGWB2ddVkwI7ytndT04S4QEVAk6cbmb7shxcM,3905
|
|
16
16
|
recursive_cleaner/parsers.py,sha256=HCS2UiVFhboq_go4DyWUygkJTkpfYkFj9_hqWiGIEXo,14572
|
|
17
|
-
recursive_cleaner/prompt.py,sha256=
|
|
17
|
+
recursive_cleaner/prompt.py,sha256=yqwUyB6Z51Oqhvxz3mNijZraXr-QEUYQ_ubyiryZSrU,6730
|
|
18
18
|
recursive_cleaner/report.py,sha256=AWWneRjvl76ccLlExdkKJeY3GVFUG_LtmzVIJJT5cFI,4629
|
|
19
19
|
recursive_cleaner/response.py,sha256=3w0mLnqEPdB4daMSF0mtTcG0PTP-utb1HFtKuYA1ljw,9064
|
|
20
20
|
recursive_cleaner/schema.py,sha256=w2hcEdApR15KVI9SFWB3VfumMoHFwn1YJrktdfgPo8M,3925
|
|
21
21
|
recursive_cleaner/tui.py,sha256=zuiFPtMh3K-sC1CWZoaoUmgZ3rESkl10gYcqMzpVqiM,22598
|
|
22
22
|
recursive_cleaner/types.py,sha256=-GdCmsfHd3rfdfCi5c-RXqX4TyuCSHgA__3AF3bMhoQ,290
|
|
23
|
-
recursive_cleaner/validation.py,sha256
|
|
23
|
+
recursive_cleaner/validation.py,sha256=IlXz5EhXaUb0mJlaH0ygFH1ePPWHVfgjL-5ZawyKicY,7910
|
|
24
24
|
recursive_cleaner/vendor/__init__.py,sha256=E87TjmjRzu8ty39nqThvBwM611yXlLKQZ6KGY_zp3Dk,117
|
|
25
25
|
recursive_cleaner/vendor/chunker.py,sha256=pDDbfF6FoSmUji0-RG4MletPxJ-VybGw0yfnhh0aMSQ,6730
|
|
26
|
-
recursive_cleaner-1.0.
|
|
27
|
-
recursive_cleaner-1.0.
|
|
28
|
-
recursive_cleaner-1.0.
|
|
29
|
-
recursive_cleaner-1.0.
|
|
30
|
-
recursive_cleaner-1.0.
|
|
26
|
+
recursive_cleaner-1.0.1.dist-info/METADATA,sha256=qEmuiRPtRjuigM29FgjrkUUZm0YV91xNjuc7j16NhKU,14285
|
|
27
|
+
recursive_cleaner-1.0.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
28
|
+
recursive_cleaner-1.0.1.dist-info/entry_points.txt,sha256=S5nbi0rnifpShxdXGExeZnd65UZfp8K7DNyuKPST6nk,65
|
|
29
|
+
recursive_cleaner-1.0.1.dist-info/licenses/LICENSE,sha256=P8hRMK-UqRbQDsVN9nr901wpZcqwXEHr28DXhBUheF0,1064
|
|
30
|
+
recursive_cleaner-1.0.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|