recursive-cleaner 0.7.0__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {recursive_cleaner-0.7.0.dist-info → recursive_cleaner-0.7.1.dist-info}/METADATA +36 -16
- {recursive_cleaner-0.7.0.dist-info → recursive_cleaner-0.7.1.dist-info}/RECORD +4 -4
- {recursive_cleaner-0.7.0.dist-info → recursive_cleaner-0.7.1.dist-info}/WHEEL +0 -0
- {recursive_cleaner-0.7.0.dist-info → recursive_cleaner-0.7.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: recursive-cleaner
|
|
3
|
-
Version: 0.7.0
|
|
3
|
+
Version: 0.7.1
|
|
4
4
|
Summary: LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions
|
|
5
5
|
Project-URL: Homepage, https://github.com/gaztrabisme/recursive-data-cleaner
|
|
6
6
|
Project-URL: Repository, https://github.com/gaztrabisme/recursive-data-cleaner
|
|
@@ -40,7 +40,7 @@ LLM-powered incremental data cleaning for massive datasets. Process files in chu
|
|
|
40
40
|
|
|
41
41
|
## How It Works
|
|
42
42
|
|
|
43
|
-
1. **Chunk** your data (JSONL, CSV, JSON,
|
|
43
|
+
1. **Chunk** your data (JSONL, CSV, JSON, Parquet, PDF, Word, Excel, XML, and more)
|
|
44
44
|
2. **Analyze** each chunk with an LLM to identify issues
|
|
45
45
|
3. **Generate** one cleaning function per issue
|
|
46
46
|
4. **Validate** functions on holdout data before accepting
|
|
@@ -59,6 +59,16 @@ For Apple Silicon (MLX backend):
|
|
|
59
59
|
pip install -e ".[mlx]"
|
|
60
60
|
```
|
|
61
61
|
|
|
62
|
+
For document conversion (PDF, Word, Excel, HTML, etc.):
|
|
63
|
+
```bash
|
|
64
|
+
pip install -e ".[markitdown]"
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
For Parquet files:
|
|
68
|
+
```bash
|
|
69
|
+
pip install -e ".[parquet]"
|
|
70
|
+
```
|
|
71
|
+
|
|
62
72
|
## Quick Start
|
|
63
73
|
|
|
64
74
|
```python
|
|
@@ -111,6 +121,11 @@ cleaner.run() # Generates cleaning_functions.py
|
|
|
111
121
|
- **Cleaning Reports**: Markdown summary with functions, timing, quality delta
|
|
112
122
|
- **Dry-Run Mode**: Analyze data without generating functions
|
|
113
123
|
|
|
124
|
+
### Format Expansion (v0.7.0)
|
|
125
|
+
- **Markitdown Integration**: Convert 20+ formats (PDF, Word, Excel, PowerPoint, HTML, EPUB, etc.) to text
|
|
126
|
+
- **Parquet Support**: Load parquet files as structured data via pyarrow
|
|
127
|
+
- **LLM-Generated Parsers**: Auto-generate parsers for XML and unknown formats (`auto_parse=True`)
|
|
128
|
+
|
|
114
129
|
## Configuration
|
|
115
130
|
|
|
116
131
|
```python
|
|
@@ -142,6 +157,9 @@ cleaner = DataCleaner(
|
|
|
142
157
|
report_path="report.md", # Markdown report output (None to disable)
|
|
143
158
|
dry_run=False, # Analyze without generating functions
|
|
144
159
|
|
|
160
|
+
# Format Expansion
|
|
161
|
+
auto_parse=False, # LLM generates parser for unknown formats
|
|
162
|
+
|
|
145
163
|
# Progress & State
|
|
146
164
|
on_progress=callback, # Progress event callback
|
|
147
165
|
state_file="state.json", # Enable resume on interrupt
|
|
@@ -235,20 +253,21 @@ cleaner.run()
|
|
|
235
253
|
|
|
236
254
|
```
|
|
237
255
|
recursive_cleaner/
|
|
238
|
-
├── cleaner.py
|
|
239
|
-
├── context.py
|
|
240
|
-
├── dependencies.py
|
|
241
|
-
├── metrics.py
|
|
242
|
-
├── optimizer.py
|
|
243
|
-
├── output.py
|
|
244
|
-
├── parsers.py
|
|
245
|
-
├── prompt.py
|
|
246
|
-
├── report.py
|
|
247
|
-
├── response.py
|
|
248
|
-
├── schema.py
|
|
249
|
-
├── validation.py
|
|
256
|
+
├── cleaner.py # Main DataCleaner class
|
|
257
|
+
├── context.py # Docstring registry with FIFO eviction
|
|
258
|
+
├── dependencies.py # Topological sort for function ordering
|
|
259
|
+
├── metrics.py # Quality metrics before/after
|
|
260
|
+
├── optimizer.py # Two-pass consolidation with LLM agency
|
|
261
|
+
├── output.py # Function file generation + import consolidation
|
|
262
|
+
├── parser_generator.py # LLM-generated parsers for unknown formats
|
|
263
|
+
├── parsers.py # Chunking for all formats + sampling
|
|
264
|
+
├── prompt.py # LLM prompt templates
|
|
265
|
+
├── report.py # Markdown report generation
|
|
266
|
+
├── response.py # XML/markdown parsing + agency dataclasses
|
|
267
|
+
├── schema.py # Schema inference
|
|
268
|
+
├── validation.py # Runtime validation + holdout
|
|
250
269
|
└── vendor/
|
|
251
|
-
└── chunker.py
|
|
270
|
+
└── chunker.py # Vendored sentence-aware chunker
|
|
252
271
|
```
|
|
253
272
|
|
|
254
273
|
## Testing
|
|
@@ -257,7 +276,7 @@ recursive_cleaner/
|
|
|
257
276
|
pytest tests/ -v
|
|
258
277
|
```
|
|
259
278
|
|
|
260
|
-
|
|
279
|
+
432 tests covering all features. Test datasets in `test_cases/`:
|
|
261
280
|
- E-commerce product catalogs
|
|
262
281
|
- Healthcare patient records
|
|
263
282
|
- Financial transaction data
|
|
@@ -273,6 +292,7 @@ pytest tests/ -v
|
|
|
273
292
|
|
|
274
293
|
| Version | Features |
|
|
275
294
|
|---------|----------|
|
|
295
|
+
| v0.7.0 | Markitdown (20+ formats), Parquet support, LLM-generated parsers |
|
|
276
296
|
| v0.6.0 | Latency metrics, import consolidation, cleaning report, dry-run mode |
|
|
277
297
|
| v0.5.1 | Dangerous code detection (AST-based security) |
|
|
278
298
|
| v0.5.0 | Two-pass optimization, early termination, LLM agency |
|
|
@@ -18,7 +18,7 @@ recursive_cleaner/types.py,sha256=-GdCmsfHd3rfdfCi5c-RXqX4TyuCSHgA__3AF3bMhoQ,29
|
|
|
18
18
|
recursive_cleaner/validation.py,sha256=-KAolhw3GQyhHwmh0clEj8xqPD5O-R2AO5rx7vubIME,6442
|
|
19
19
|
recursive_cleaner/vendor/__init__.py,sha256=E87TjmjRzu8ty39nqThvBwM611yXlLKQZ6KGY_zp3Dk,117
|
|
20
20
|
recursive_cleaner/vendor/chunker.py,sha256=pDDbfF6FoSmUji0-RG4MletPxJ-VybGw0yfnhh0aMSQ,6730
|
|
21
|
-
recursive_cleaner-0.7.0.dist-info/METADATA,sha256=…
|
|
22
|
-
recursive_cleaner-0.7.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
23
|
-
recursive_cleaner-0.7.0.dist-info/licenses/LICENSE,sha256=P8hRMK-UqRbQDsVN9nr901wpZcqwXEHr28DXhBUheF0,1064
|
|
24
|
-
recursive_cleaner-0.7.0.dist-info/RECORD,,
|
|
21
|
+
recursive_cleaner-0.7.1.dist-info/METADATA,sha256=X5_HVPMIPUULKKIgDvqhN0ZRQQBcZ1lupGb9frLdCSI,10258
|
|
22
|
+
recursive_cleaner-0.7.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
23
|
+
recursive_cleaner-0.7.1.dist-info/licenses/LICENSE,sha256=P8hRMK-UqRbQDsVN9nr901wpZcqwXEHr28DXhBUheF0,1064
|
|
24
|
+
recursive_cleaner-0.7.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|