recursive-cleaner 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- backends/__init__.py +5 -0
- backends/mlx_backend.py +95 -0
- recursive_cleaner/__init__.py +46 -0
- recursive_cleaner/cleaner.py +628 -0
- recursive_cleaner/context.py +27 -0
- recursive_cleaner/dependencies.py +59 -0
- recursive_cleaner/errors.py +17 -0
- recursive_cleaner/metrics.py +163 -0
- recursive_cleaner/optimizer.py +336 -0
- recursive_cleaner/output.py +197 -0
- recursive_cleaner/parsers.py +325 -0
- recursive_cleaner/prompt.py +218 -0
- recursive_cleaner/report.py +138 -0
- recursive_cleaner/response.py +292 -0
- recursive_cleaner/schema.py +117 -0
- recursive_cleaner/types.py +11 -0
- recursive_cleaner/validation.py +202 -0
- recursive_cleaner/vendor/__init__.py +4 -0
- recursive_cleaner/vendor/chunker.py +187 -0
- recursive_cleaner-0.6.0.dist-info/METADATA +282 -0
- recursive_cleaner-0.6.0.dist-info/RECORD +23 -0
- recursive_cleaner-0.6.0.dist-info/WHEEL +4 -0
- recursive_cleaner-0.6.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: recursive-cleaner
|
|
3
|
+
Version: 0.6.0
|
|
4
|
+
Summary: LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions
|
|
5
|
+
Project-URL: Homepage, https://github.com/gaztrabisme/recursive-data-cleaner
|
|
6
|
+
Project-URL: Repository, https://github.com/gaztrabisme/recursive-data-cleaner
|
|
7
|
+
Project-URL: Issues, https://github.com/gaztrabisme/recursive-data-cleaner/issues
|
|
8
|
+
Author: Gary Tou
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: automation,data-cleaning,data-quality,etl,llm,machine-learning
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Classifier: Topic :: Text Processing :: General
|
|
23
|
+
Classifier: Topic :: Utilities
|
|
24
|
+
Requires-Python: >=3.10
|
|
25
|
+
Requires-Dist: tenacity>=8.0
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest-cov>=4.0; extra == 'dev'
|
|
28
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
29
|
+
Provides-Extra: mlx
|
|
30
|
+
Requires-Dist: mlx-lm>=0.10.0; extra == 'mlx'
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
|
|
33
|
+
# Recursive Data Cleaner
|
|
34
|
+
|
|
35
|
+
LLM-powered incremental data cleaning for massive datasets. Process files in chunks, identify quality issues, and automatically generate Python cleaning functions.
|
|
36
|
+
|
|
37
|
+
## How It Works
|
|
38
|
+
|
|
39
|
+
1. **Chunk** your data (JSONL, CSV, JSON, or text)
|
|
40
|
+
2. **Analyze** each chunk with an LLM to identify issues
|
|
41
|
+
3. **Generate** one cleaning function per issue
|
|
42
|
+
4. **Validate** functions on holdout data before accepting
|
|
43
|
+
5. **Output** a ready-to-use `cleaning_functions.py`
|
|
44
|
+
|
|
45
|
+
The system maintains a "docstring registry" — feeding generated function descriptions back into prompts so the LLM knows what's already solved and avoids duplicate work.
|
|
46
|
+
|
|
47
|
+
## Installation
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install -e .
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
For Apple Silicon (MLX backend):
|
|
54
|
+
```bash
|
|
55
|
+
pip install -e ".[mlx]"
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Quick Start
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from recursive_cleaner import DataCleaner
|
|
62
|
+
from backends import MLXBackend
|
|
63
|
+
|
|
64
|
+
# Any LLM with generate(prompt) -> str works
|
|
65
|
+
llm = MLXBackend(model_path="your-model")
|
|
66
|
+
|
|
67
|
+
cleaner = DataCleaner(
|
|
68
|
+
llm_backend=llm,
|
|
69
|
+
file_path="messy_data.jsonl",
|
|
70
|
+
chunk_size=50,
|
|
71
|
+
instructions="""
|
|
72
|
+
- Normalize phone numbers to E.164
|
|
73
|
+
- Fix typos in status field (valid: active, pending, churned)
|
|
74
|
+
- Convert dates to ISO 8601
|
|
75
|
+
""",
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
cleaner.run() # Generates cleaning_functions.py
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Features
|
|
82
|
+
|
|
83
|
+
### Core
|
|
84
|
+
- **Chunked Processing**: Handle files larger than LLM context windows
|
|
85
|
+
- **Incremental Generation**: One function per issue, building up a complete solution
|
|
86
|
+
- **Docstring Registry**: Automatic context management with FIFO eviction
|
|
87
|
+
- **AST Validation**: All generated code validated before output
|
|
88
|
+
- **Error Recovery**: Retries with error feedback on parse failures
|
|
89
|
+
|
|
90
|
+
### Data Quality (v0.4.0+)
|
|
91
|
+
- **Holdout Validation**: Test functions on unseen 20% of each chunk
|
|
92
|
+
- **Sampling Strategies**: Sequential, random, or stratified sampling
|
|
93
|
+
- **Quality Metrics**: Before/after comparison with improvement reports
|
|
94
|
+
- **Dependency Resolution**: Topological sort for correct function ordering
|
|
95
|
+
|
|
96
|
+
### Optimization (v0.5.0+)
|
|
97
|
+
- **Two-Pass Consolidation**: Merge redundant functions after generation
|
|
98
|
+
- **Early Termination**: Stop when LLM detects pattern saturation
|
|
99
|
+
- **LLM Agency**: Model decides chunk cleanliness and saturation
|
|
100
|
+
|
|
101
|
+
### Security (v0.5.1+)
|
|
102
|
+
- **Dangerous Code Detection**: AST-based detection of exec, eval, subprocess, network calls
|
|
103
|
+
|
|
104
|
+
### Observability (v0.6.0)
|
|
105
|
+
- **Latency Metrics**: Track min/max/avg/total LLM call times
|
|
106
|
+
- **Import Consolidation**: Deduplicate and merge imports in output
|
|
107
|
+
- **Cleaning Reports**: Markdown summary with functions, timing, quality delta
|
|
108
|
+
- **Dry-Run Mode**: Analyze data without generating functions
|
|
109
|
+
|
|
110
|
+
## Configuration
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
cleaner = DataCleaner(
|
|
114
|
+
# Required
|
|
115
|
+
llm_backend=llm,
|
|
116
|
+
file_path="data.jsonl",
|
|
117
|
+
|
|
118
|
+
# Chunking
|
|
119
|
+
chunk_size=50, # Items per chunk (or chars for text mode)
|
|
120
|
+
max_iterations=5, # Max iterations per chunk
|
|
121
|
+
context_budget=8000, # Max chars for docstring context
|
|
122
|
+
|
|
123
|
+
# Validation
|
|
124
|
+
validate_runtime=True, # Test functions before accepting
|
|
125
|
+
schema_sample_size=10, # Records for schema inference
|
|
126
|
+
holdout_ratio=0.2, # Fraction held out for validation
|
|
127
|
+
|
|
128
|
+
# Sampling
|
|
129
|
+
sampling_strategy="stratified", # "sequential", "random", "stratified"
|
|
130
|
+
stratify_field="status", # Field for stratified sampling
|
|
131
|
+
|
|
132
|
+
# Optimization
|
|
133
|
+
optimize=True, # Consolidate redundant functions
|
|
134
|
+
early_termination=True, # Stop when patterns saturate
|
|
135
|
+
track_metrics=True, # Measure before/after quality
|
|
136
|
+
|
|
137
|
+
# Observability
|
|
138
|
+
report_path="report.md", # Markdown report output (None to disable)
|
|
139
|
+
dry_run=False, # Analyze without generating functions
|
|
140
|
+
|
|
141
|
+
# Progress & State
|
|
142
|
+
on_progress=callback, # Progress event callback
|
|
143
|
+
state_file="state.json", # Enable resume on interrupt
|
|
144
|
+
)
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## Progress Events
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
def on_progress(event):
|
|
151
|
+
match event["type"]:
|
|
152
|
+
case "chunk_start":
|
|
153
|
+
print(f"Chunk {event['chunk_index']}/{event['total_chunks']}")
|
|
154
|
+
case "llm_call":
|
|
155
|
+
print(f"LLM latency: {event['latency_ms']}ms")
|
|
156
|
+
case "function_generated":
|
|
157
|
+
print(f"Generated: {event['function_name']}")
|
|
158
|
+
case "issues_detected": # dry-run mode
|
|
159
|
+
print(f"Found {len(event['issues'])} issues")
|
|
160
|
+
case "complete":
|
|
161
|
+
stats = event["latency_stats"]
|
|
162
|
+
print(f"Done! Avg latency: {stats['avg_ms']}ms")
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
## Output
|
|
166
|
+
|
|
167
|
+
The cleaner generates `cleaning_functions.py`:
|
|
168
|
+
|
|
169
|
+
```python
|
|
170
|
+
# Auto-generated cleaning functions
|
|
171
|
+
import re
|
|
172
|
+
|
|
173
|
+
def normalize_phone_numbers(data):
|
|
174
|
+
"""Normalize phone numbers to E.164 format."""
|
|
175
|
+
# ... implementation ...
|
|
176
|
+
|
|
177
|
+
def fix_status_typos(data):
|
|
178
|
+
"""Fix typos in status field."""
|
|
179
|
+
# ... implementation ...
|
|
180
|
+
|
|
181
|
+
def clean_data(data):
|
|
182
|
+
"""Apply all cleaning functions in order."""
|
|
183
|
+
data = normalize_phone_numbers(data)
|
|
184
|
+
data = fix_status_typos(data)
|
|
185
|
+
return data
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
## Custom LLM Backend
|
|
189
|
+
|
|
190
|
+
Implement the simple protocol:
|
|
191
|
+
|
|
192
|
+
```python
|
|
193
|
+
class MyBackend:
|
|
194
|
+
def generate(self, prompt: str) -> str:
|
|
195
|
+
# Call your LLM (OpenAI, Anthropic, local, etc.)
|
|
196
|
+
return response
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
## Text Mode
|
|
200
|
+
|
|
201
|
+
For plain text files (e.g., documents or text extracted from PDFs):
|
|
202
|
+
|
|
203
|
+
```python
|
|
204
|
+
cleaner = DataCleaner(
|
|
205
|
+
llm_backend=llm,
|
|
206
|
+
file_path="document.txt",
|
|
207
|
+
chunk_size=4000, # Characters, not items
|
|
208
|
+
instructions="Fix OCR errors, normalize whitespace",
|
|
209
|
+
)
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
Text mode uses sentence-aware chunking to avoid splitting mid-sentence.
|
|
213
|
+
|
|
214
|
+
## Resume on Interrupt
|
|
215
|
+
|
|
216
|
+
```python
|
|
217
|
+
# Start with state file
|
|
218
|
+
cleaner = DataCleaner(
|
|
219
|
+
llm_backend=llm,
|
|
220
|
+
file_path="huge_file.jsonl",
|
|
221
|
+
state_file="cleaning_state.json",
|
|
222
|
+
)
|
|
223
|
+
cleaner.run()
|
|
224
|
+
|
|
225
|
+
# If interrupted, resume later:
|
|
226
|
+
cleaner = DataCleaner.resume("cleaning_state.json", llm)
|
|
227
|
+
cleaner.run()
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
## Architecture
|
|
231
|
+
|
|
232
|
+
```
|
|
233
|
+
recursive_cleaner/
|
|
234
|
+
├── cleaner.py # Main DataCleaner class (~580 lines)
|
|
235
|
+
├── context.py # Docstring registry with FIFO eviction
|
|
236
|
+
├── dependencies.py # Topological sort for function ordering
|
|
237
|
+
├── metrics.py # Quality metrics before/after
|
|
238
|
+
├── optimizer.py # Two-pass consolidation with LLM agency
|
|
239
|
+
├── output.py # Function file generation + import consolidation
|
|
240
|
+
├── parsers.py # Chunking for JSONL/CSV/JSON/text + sampling
|
|
241
|
+
├── prompt.py # LLM prompt templates
|
|
242
|
+
├── report.py # Markdown report generation
|
|
243
|
+
├── response.py # XML/markdown parsing + agency dataclasses
|
|
244
|
+
├── schema.py # Schema inference
|
|
245
|
+
├── validation.py # Runtime validation + holdout
|
|
246
|
+
└── vendor/
|
|
247
|
+
└── chunker.py # Vendored sentence-aware chunker
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
## Testing
|
|
251
|
+
|
|
252
|
+
```bash
|
|
253
|
+
pytest tests/ -v
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
392 tests covering all features. Test datasets in `test_cases/`:
|
|
257
|
+
- E-commerce product catalogs
|
|
258
|
+
- Healthcare patient records
|
|
259
|
+
- Financial transaction data
|
|
260
|
+
|
|
261
|
+
## Philosophy
|
|
262
|
+
|
|
263
|
+
- **Simplicity over extensibility**: ~3,000 lines that do one thing well
|
|
264
|
+
- **stdlib over dependencies**: Only `tenacity` required
|
|
265
|
+
- **Retry over recover**: On error, retry with error in prompt
|
|
266
|
+
- **Wu wei**: Let the LLM make decisions about data it understands
|
|
267
|
+
|
|
268
|
+
## Version History
|
|
269
|
+
|
|
270
|
+
| Version | Features |
|
|
271
|
+
|---------|----------|
|
|
272
|
+
| v0.6.0 | Latency metrics, import consolidation, cleaning report, dry-run mode |
|
|
273
|
+
| v0.5.1 | Dangerous code detection (AST-based security) |
|
|
274
|
+
| v0.5.0 | Two-pass optimization, early termination, LLM agency |
|
|
275
|
+
| v0.4.0 | Holdout validation, dependency resolution, sampling, quality metrics |
|
|
276
|
+
| v0.3.0 | Text mode with sentence-aware chunking |
|
|
277
|
+
| v0.2.0 | Runtime validation, schema inference, callbacks, incremental saves |
|
|
278
|
+
| v0.1.0 | Core pipeline, chunking, docstring registry |
|
|
279
|
+
|
|
280
|
+
## License
|
|
281
|
+
|
|
282
|
+
MIT
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
backends/__init__.py,sha256=FUgODeYSGBvT0-z6myVby6YeAHG0nEUgWLITBKobUew,121
|
|
2
|
+
backends/mlx_backend.py,sha256=0U6IqmDHyk4vjKzytvEcQvSUBryQTgFtsNOcpwFNKk8,2945
|
|
3
|
+
recursive_cleaner/__init__.py,sha256=Wm_sQdh1mkJ-DByo4lMiAIiLWFRfj8G2Limr0eSG0s0,1416
|
|
4
|
+
recursive_cleaner/cleaner.py,sha256=vSrogf8T1AquLakmqbpgvuFoTD6_AZp_hrG3vJxx9gk,24340
|
|
5
|
+
recursive_cleaner/context.py,sha256=avMXRDxLd7nd8CKWtvPHQy1MFhBKiA0aUVVJIlWoLZ4,824
|
|
6
|
+
recursive_cleaner/dependencies.py,sha256=vlYeoGL517v3yUSWN0wYDuIs9OOuQwM_dCBADrlitW8,2080
|
|
7
|
+
recursive_cleaner/errors.py,sha256=hwRJF8NSmWy_FZHCxcZDZxLQ0zqvo5dX8ImkB9mrOYc,433
|
|
8
|
+
recursive_cleaner/metrics.py,sha256=C6RlvZMTtVm7kdRhfMZu4xd-R-i1EQGMT5FCasPOO3A,5003
|
|
9
|
+
recursive_cleaner/optimizer.py,sha256=lnQC9Y1ClkW4po1eYa2bnYYu4smiDuUpMPPX6EN1UQ0,9700
|
|
10
|
+
recursive_cleaner/output.py,sha256=quTlZYtKZm9h37mbnwQmEjg0q8VQSZWEqwaHfhSAd3s,6106
|
|
11
|
+
recursive_cleaner/parsers.py,sha256=39oMg0WGRFV_eRBzOfB7LIGXMP1cIDwdeYk4UOlw140,10595
|
|
12
|
+
recursive_cleaner/prompt.py,sha256=ep0eOXz_XbhH3HduJ76LvzVSftonhcv4GLEecIqd3lY,6484
|
|
13
|
+
recursive_cleaner/report.py,sha256=AWWneRjvl76ccLlExdkKJeY3GVFUG_LtmzVIJJT5cFI,4629
|
|
14
|
+
recursive_cleaner/response.py,sha256=3w0mLnqEPdB4daMSF0mtTcG0PTP-utb1HFtKuYA1ljw,9064
|
|
15
|
+
recursive_cleaner/schema.py,sha256=w2hcEdApR15KVI9SFWB3VfumMoHFwn1YJrktdfgPo8M,3925
|
|
16
|
+
recursive_cleaner/types.py,sha256=-GdCmsfHd3rfdfCi5c-RXqX4TyuCSHgA__3AF3bMhoQ,290
|
|
17
|
+
recursive_cleaner/validation.py,sha256=-KAolhw3GQyhHwmh0clEj8xqPD5O-R2AO5rx7vubIME,6442
|
|
18
|
+
recursive_cleaner/vendor/__init__.py,sha256=E87TjmjRzu8ty39nqThvBwM611yXlLKQZ6KGY_zp3Dk,117
|
|
19
|
+
recursive_cleaner/vendor/chunker.py,sha256=pDDbfF6FoSmUji0-RG4MletPxJ-VybGw0yfnhh0aMSQ,6730
|
|
20
|
+
recursive_cleaner-0.6.0.dist-info/METADATA,sha256=qo4yp828JAo3dAww0zezaZZGdYH4y-UdYu0UcZVBIKE,9127
|
|
21
|
+
recursive_cleaner-0.6.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
22
|
+
recursive_cleaner-0.6.0.dist-info/licenses/LICENSE,sha256=P8hRMK-UqRbQDsVN9nr901wpZcqwXEHr28DXhBUheF0,1064
|
|
23
|
+
recursive_cleaner-0.6.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 GazTrab
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|