recursive-cleaner 0.8.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- backends/__init__.py +2 -1
- backends/openai_backend.py +71 -0
- recursive_cleaner/__init__.py +4 -1
- recursive_cleaner/__main__.py +8 -0
- recursive_cleaner/apply.py +483 -0
- recursive_cleaner/cleaner.py +27 -5
- recursive_cleaner/cli.py +395 -0
- recursive_cleaner/prompt.py +8 -4
- recursive_cleaner/tui.py +43 -24
- recursive_cleaner/validation.py +40 -1
- {recursive_cleaner-0.8.0.dist-info → recursive_cleaner-1.0.1.dist-info}/METADATA +100 -4
- {recursive_cleaner-0.8.0.dist-info → recursive_cleaner-1.0.1.dist-info}/RECORD +15 -10
- recursive_cleaner-1.0.1.dist-info/entry_points.txt +2 -0
- {recursive_cleaner-0.8.0.dist-info → recursive_cleaner-1.0.1.dist-info}/WHEEL +0 -0
- {recursive_cleaner-0.8.0.dist-info → recursive_cleaner-1.0.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: recursive-cleaner
|
|
3
|
-
Version: 0.8.0
|
|
3
|
+
Version: 1.0.1
|
|
4
4
|
Summary: LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions
|
|
5
5
|
Project-URL: Homepage, https://github.com/gaztrabisme/recursive-data-cleaner
|
|
6
6
|
Project-URL: Repository, https://github.com/gaztrabisme/recursive-data-cleaner
|
|
@@ -9,7 +9,7 @@ Author: Gary Tran
|
|
|
9
9
|
License-Expression: MIT
|
|
10
10
|
License-File: LICENSE
|
|
11
11
|
Keywords: automation,data-cleaning,data-quality,etl,llm,machine-learning
|
|
12
|
-
Classifier: Development Status ::
|
|
12
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
13
13
|
Classifier: Intended Audience :: Developers
|
|
14
14
|
Classifier: Intended Audience :: Science/Research
|
|
15
15
|
Classifier: License :: OSI Approved :: MIT License
|
|
@@ -26,10 +26,15 @@ Requires-Dist: tenacity>=8.0
|
|
|
26
26
|
Provides-Extra: dev
|
|
27
27
|
Requires-Dist: pytest-cov>=4.0; extra == 'dev'
|
|
28
28
|
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
29
|
+
Provides-Extra: excel
|
|
30
|
+
Requires-Dist: openpyxl>=3.0.0; extra == 'excel'
|
|
31
|
+
Requires-Dist: xlrd>=2.0.0; extra == 'excel'
|
|
29
32
|
Provides-Extra: markitdown
|
|
30
33
|
Requires-Dist: markitdown>=0.1.0; extra == 'markitdown'
|
|
31
34
|
Provides-Extra: mlx
|
|
32
35
|
Requires-Dist: mlx-lm>=0.10.0; extra == 'mlx'
|
|
36
|
+
Provides-Extra: openai
|
|
37
|
+
Requires-Dist: openai>=1.0.0; extra == 'openai'
|
|
33
38
|
Provides-Extra: parquet
|
|
34
39
|
Requires-Dist: pyarrow>=14.0.0; extra == 'parquet'
|
|
35
40
|
Provides-Extra: tui
|
|
@@ -140,6 +145,91 @@ cleaner.run() # Generates cleaning_functions.py
|
|
|
140
145
|
- **Token Estimation**: Track estimated input/output tokens across the run
|
|
141
146
|
- **Graceful Fallback**: Works without Rich installed (falls back to callbacks)
|
|
142
147
|
|
|
148
|
+
### CLI (v0.9.0)
|
|
149
|
+
- **Command Line Interface**: Use without writing Python code
|
|
150
|
+
- **Multiple Backends**: MLX (Apple Silicon) and OpenAI-compatible (OpenAI, LM Studio, Ollama)
|
|
151
|
+
- **Four Commands**: `generate`, `analyze` (dry-run), `resume`, `apply`
|
|
152
|
+
|
|
153
|
+
### Apply Mode (v1.0.0)
|
|
154
|
+
- **Apply Cleaning Functions**: Apply generated functions to full datasets
|
|
155
|
+
- **Data Formats**: JSONL, CSV, JSON, Parquet, Excel (.xlsx/.xls) output same format
|
|
156
|
+
- **Text Formats**: PDF, Word, HTML, etc. output as Markdown
|
|
157
|
+
- **Streaming**: Memory-efficient line-by-line processing for JSONL/CSV
|
|
158
|
+
- **Colored TUI**: Enhanced transmission log with syntax-highlighted XML parsing
|
|
159
|
+
|
|
160
|
+
## Command Line Interface
|
|
161
|
+
|
|
162
|
+
After installation, the `recursive-cleaner` command is available:
|
|
163
|
+
|
|
164
|
+
```bash
|
|
165
|
+
# Generate cleaning functions with MLX (Apple Silicon)
|
|
166
|
+
recursive-cleaner generate data.jsonl \
|
|
167
|
+
--provider mlx \
|
|
168
|
+
--model "lmstudio-community/Qwen3-80B-MLX-4bit" \
|
|
169
|
+
--instructions "Normalize phone numbers to E.164" \
|
|
170
|
+
--output cleaning_functions.py
|
|
171
|
+
|
|
172
|
+
# Use OpenAI
|
|
173
|
+
export OPENAI_API_KEY=your-key
|
|
174
|
+
recursive-cleaner generate data.jsonl \
|
|
175
|
+
--provider openai \
|
|
176
|
+
--model gpt-4o \
|
|
177
|
+
--instructions "Fix date formats"
|
|
178
|
+
|
|
179
|
+
# Use LM Studio or Ollama (OpenAI-compatible)
|
|
180
|
+
recursive-cleaner generate data.jsonl \
|
|
181
|
+
--provider openai \
|
|
182
|
+
--model "qwen/qwen3-vl-30b" \
|
|
183
|
+
--base-url http://localhost:1234/v1 \
|
|
184
|
+
--instructions "Normalize prices"
|
|
185
|
+
|
|
186
|
+
# Dry-run analysis
|
|
187
|
+
recursive-cleaner analyze data.jsonl \
|
|
188
|
+
--provider openai \
|
|
189
|
+
--model gpt-4o \
|
|
190
|
+
--instructions @instructions.txt
|
|
191
|
+
|
|
192
|
+
# Resume from checkpoint
|
|
193
|
+
recursive-cleaner resume cleaning_state.json \
|
|
194
|
+
--provider mlx \
|
|
195
|
+
--model "model-path"
|
|
196
|
+
|
|
197
|
+
# Apply cleaning functions to data
|
|
198
|
+
recursive-cleaner apply data.jsonl \
|
|
199
|
+
--functions cleaning_functions.py \
|
|
200
|
+
--output cleaned_data.jsonl
|
|
201
|
+
|
|
202
|
+
# Apply to Excel (outputs same format)
|
|
203
|
+
recursive-cleaner apply sales.xlsx \
|
|
204
|
+
--functions cleaning_functions.py
|
|
205
|
+
|
|
206
|
+
# Apply to PDF (outputs markdown)
|
|
207
|
+
recursive-cleaner apply document.pdf \
|
|
208
|
+
--functions cleaning_functions.py \
|
|
209
|
+
--output cleaned.md
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
### CLI Options
|
|
213
|
+
|
|
214
|
+
```
|
|
215
|
+
recursive-cleaner generate <FILE> [OPTIONS]
|
|
216
|
+
|
|
217
|
+
Required:
|
|
218
|
+
FILE Input data file
|
|
219
|
+
-p, --provider {mlx,openai} LLM provider
|
|
220
|
+
-m, --model MODEL Model name/path
|
|
221
|
+
|
|
222
|
+
Optional:
|
|
223
|
+
-i, --instructions TEXT Cleaning instructions (or @file.txt)
|
|
224
|
+
--base-url URL API URL for OpenAI-compatible servers
|
|
225
|
+
--chunk-size N Items per chunk (default: 50)
|
|
226
|
+
--max-iterations N Max iterations per chunk (default: 5)
|
|
227
|
+
-o, --output PATH Output file (default: cleaning_functions.py)
|
|
228
|
+
--tui Enable Rich dashboard
|
|
229
|
+
--optimize Consolidate redundant functions
|
|
230
|
+
--track-metrics Measure before/after quality
|
|
231
|
+
```
|
|
232
|
+
|
|
143
233
|
## Configuration
|
|
144
234
|
|
|
145
235
|
```python
|
|
@@ -270,6 +360,7 @@ cleaner.run()
|
|
|
270
360
|
|
|
271
361
|
```
|
|
272
362
|
recursive_cleaner/
|
|
363
|
+
├── cli.py # Command line interface
|
|
273
364
|
├── cleaner.py # Main DataCleaner class
|
|
274
365
|
├── context.py # Docstring registry with FIFO eviction
|
|
275
366
|
├── dependencies.py # Topological sort for function ordering
|
|
@@ -286,6 +377,10 @@ recursive_cleaner/
|
|
|
286
377
|
├── validation.py # Runtime validation + holdout
|
|
287
378
|
└── vendor/
|
|
288
379
|
└── chunker.py # Vendored sentence-aware chunker
|
|
380
|
+
|
|
381
|
+
backends/
|
|
382
|
+
├── mlx_backend.py # MLX-LM backend for Apple Silicon
|
|
383
|
+
└── openai_backend.py # OpenAI-compatible backend
|
|
289
384
|
```
|
|
290
385
|
|
|
291
386
|
## Testing
|
|
@@ -294,14 +389,14 @@ recursive_cleaner/
|
|
|
294
389
|
pytest tests/ -v
|
|
295
390
|
```
|
|
296
391
|
|
|
297
|
-
|
|
392
|
+
555 tests covering all features. Test datasets in `test_cases/`:
|
|
298
393
|
- E-commerce product catalogs
|
|
299
394
|
- Healthcare patient records
|
|
300
395
|
- Financial transaction data
|
|
301
396
|
|
|
302
397
|
## Philosophy
|
|
303
398
|
|
|
304
|
-
- **Simplicity over extensibility**: ~
|
|
399
|
+
- **Simplicity over extensibility**: ~5,000 lines that do one thing well
|
|
305
400
|
- **stdlib over dependencies**: Only `tenacity` required
|
|
306
401
|
- **Retry over recover**: On error, retry with error in prompt
|
|
307
402
|
- **Wu wei**: Let the LLM make decisions about data it understands
|
|
@@ -310,6 +405,7 @@ pytest tests/ -v
|
|
|
310
405
|
|
|
311
406
|
| Version | Features |
|
|
312
407
|
|---------|----------|
|
|
408
|
+
| v0.9.0 | CLI tool with MLX and OpenAI-compatible backends (LM Studio, Ollama) |
|
|
313
409
|
| v0.8.0 | Terminal UI with Rich dashboard, mission control aesthetic, transmission log |
|
|
314
410
|
| v0.7.0 | Markitdown (20+ formats), Parquet support, LLM-generated parsers |
|
|
315
411
|
| v0.6.0 | Latency metrics, import consolidation, cleaning report, dry-run mode |
|
|
@@ -1,7 +1,11 @@
|
|
|
1
|
-
backends/__init__.py,sha256=
|
|
1
|
+
backends/__init__.py,sha256=vWcPASV0GGEAydzOSjdrknkSHoGbSs4edtuv9HIzBhI,180
|
|
2
2
|
backends/mlx_backend.py,sha256=0U6IqmDHyk4vjKzytvEcQvSUBryQTgFtsNOcpwFNKk8,2945
|
|
3
|
-
|
|
4
|
-
recursive_cleaner/
|
|
3
|
+
backends/openai_backend.py,sha256=vKWsXKltBv_tJDoQfQ_7KVMZDfomhFFN2vl1oZ1KGbQ,2057
|
|
4
|
+
recursive_cleaner/__init__.py,sha256=-NesTf9deCVOxkadFuyfVl-IjfbEHlYcMNAaAW9kUuw,1918
|
|
5
|
+
recursive_cleaner/__main__.py,sha256=WXmMaL_myHPsG_qXAhZDufD43Ydsd25RV2IPeW2Kg08,152
|
|
6
|
+
recursive_cleaner/apply.py,sha256=hjeljhZNiOuwz9m09RYVLl_z_9tet7LwubH6cb_Wy6Y,13855
|
|
7
|
+
recursive_cleaner/cleaner.py,sha256=lLe7LNaVYwukDhBTxLs8ezsQf7fes9m9OX7g9nGo760,30954
|
|
8
|
+
recursive_cleaner/cli.py,sha256=Sk_qYKxSn1PiPmMLKkyj9VxsseHaSXmSlGazxfmkTFc,12807
|
|
5
9
|
recursive_cleaner/context.py,sha256=avMXRDxLd7nd8CKWtvPHQy1MFhBKiA0aUVVJIlWoLZ4,824
|
|
6
10
|
recursive_cleaner/dependencies.py,sha256=vlYeoGL517v3yUSWN0wYDuIs9OOuQwM_dCBADrlitW8,2080
|
|
7
11
|
recursive_cleaner/errors.py,sha256=hwRJF8NSmWy_FZHCxcZDZxLQ0zqvo5dX8ImkB9mrOYc,433
|
|
@@ -10,16 +14,17 @@ recursive_cleaner/optimizer.py,sha256=lnQC9Y1ClkW4po1eYa2bnYYu4smiDuUpMPPX6EN1UQ
|
|
|
10
14
|
recursive_cleaner/output.py,sha256=quTlZYtKZm9h37mbnwQmEjg0q8VQSZWEqwaHfhSAd3s,6106
|
|
11
15
|
recursive_cleaner/parser_generator.py,sha256=enn6_okGWB2ddVkwI7ytndT04S4QEVAk6cbmb7shxcM,3905
|
|
12
16
|
recursive_cleaner/parsers.py,sha256=HCS2UiVFhboq_go4DyWUygkJTkpfYkFj9_hqWiGIEXo,14572
|
|
13
|
-
recursive_cleaner/prompt.py,sha256=
|
|
17
|
+
recursive_cleaner/prompt.py,sha256=yqwUyB6Z51Oqhvxz3mNijZraXr-QEUYQ_ubyiryZSrU,6730
|
|
14
18
|
recursive_cleaner/report.py,sha256=AWWneRjvl76ccLlExdkKJeY3GVFUG_LtmzVIJJT5cFI,4629
|
|
15
19
|
recursive_cleaner/response.py,sha256=3w0mLnqEPdB4daMSF0mtTcG0PTP-utb1HFtKuYA1ljw,9064
|
|
16
20
|
recursive_cleaner/schema.py,sha256=w2hcEdApR15KVI9SFWB3VfumMoHFwn1YJrktdfgPo8M,3925
|
|
17
|
-
recursive_cleaner/tui.py,sha256=
|
|
21
|
+
recursive_cleaner/tui.py,sha256=zuiFPtMh3K-sC1CWZoaoUmgZ3rESkl10gYcqMzpVqiM,22598
|
|
18
22
|
recursive_cleaner/types.py,sha256=-GdCmsfHd3rfdfCi5c-RXqX4TyuCSHgA__3AF3bMhoQ,290
|
|
19
|
-
recursive_cleaner/validation.py,sha256
|
|
23
|
+
recursive_cleaner/validation.py,sha256=IlXz5EhXaUb0mJlaH0ygFH1ePPWHVfgjL-5ZawyKicY,7910
|
|
20
24
|
recursive_cleaner/vendor/__init__.py,sha256=E87TjmjRzu8ty39nqThvBwM611yXlLKQZ6KGY_zp3Dk,117
|
|
21
25
|
recursive_cleaner/vendor/chunker.py,sha256=pDDbfF6FoSmUji0-RG4MletPxJ-VybGw0yfnhh0aMSQ,6730
|
|
22
|
-
recursive_cleaner-0.
|
|
23
|
-
recursive_cleaner-0.8.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
24
|
-
recursive_cleaner-0.8.0.dist-info/licenses/LICENSE,sha256=P8hRMK-UqRbQDsVN9nr901wpZcqwXEHr28DXhBUheF0,1064
|
|
25
|
-
recursive_cleaner-0.8.0.dist-info/RECORD,,
|
|
26
|
+
recursive_cleaner-1.0.1.dist-info/METADATA,sha256=qEmuiRPtRjuigM29FgjrkUUZm0YV91xNjuc7j16NhKU,14285
|
|
27
|
+
recursive_cleaner-1.0.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
28
|
+
recursive_cleaner-1.0.1.dist-info/entry_points.txt,sha256=S5nbi0rnifpShxdXGExeZnd65UZfp8K7DNyuKPST6nk,65
|
|
29
|
+
recursive_cleaner-1.0.1.dist-info/licenses/LICENSE,sha256=P8hRMK-UqRbQDsVN9nr901wpZcqwXEHr28DXhBUheF0,1064
|
|
30
|
+
recursive_cleaner-1.0.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|