recursive-cleaner 0.6.1__tar.gz → 0.7.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/PKG-INFO +40 -16
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/README.md +35 -15
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/TODO.md +27 -15
- recursive_cleaner-0.7.1/docs/contracts/v070-success-criteria.md +13 -0
- recursive_cleaner-0.7.1/docs/workflow-state.md +26 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/pyproject.toml +7 -1
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/__init__.py +7 -1
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/cleaner.py +62 -14
- recursive_cleaner-0.7.1/recursive_cleaner/parser_generator.py +123 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/parsers.py +131 -1
- recursive_cleaner-0.7.1/tests/test_parser_generator.py +611 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_parsers.py +216 -0
- recursive_cleaner-0.6.1/docs/workflow-state.md +0 -45
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/.gitignore +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/CLAUDE.md +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/LICENSE +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/backends/__init__.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/backends/mlx_backend.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/contracts/api-contract.md +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/contracts/data-schema.md +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/contracts/success-criteria.md +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/contracts/text-mode-contract.md +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/contracts/tier2-contract.md +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/contracts/tier4-contract.md +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/contracts/tier4-success-criteria.md +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/contracts/two-pass-contract.md +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/handoffs/tier4-handoff.md +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/implementation-plan-tier4.md +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/implementation-plan-v03.md +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/implementation-plan-v04.md +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/implementation-plan-v05.md +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/implementation-plan.md +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/langchain-analysis.md +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/langgraph-analysis.md +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/mlx-lm-guide.md +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/other-frameworks-analysis.md +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/refactor-assessment/data/dependency.json +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/refactor-assessment/data/stats.json +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/refactor-assessment/plan.md +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/refactor-assessment/report.md +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/research/chonkie-extraction.md +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/research/chonkie.md +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/research/markitdown.md +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/smolagents-analysis.md +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/context.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/dependencies.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/errors.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/metrics.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/optimizer.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/output.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/prompt.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/report.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/response.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/schema.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/types.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/validation.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/vendor/__init__.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/vendor/chunker.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/test_cases/ecommerce_instructions.txt +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/test_cases/ecommerce_products.jsonl +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/test_cases/financial_instructions.txt +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/test_cases/financial_transactions.jsonl +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/test_cases/healthcare_instructions.txt +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/test_cases/healthcare_patients.jsonl +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/test_cases/run_ecommerce_test.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/test_cases/run_financial_test.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/test_cases/run_healthcare_test.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/__init__.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_callbacks.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_cleaner.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_context.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_dependencies.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_dry_run.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_holdout.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_incremental.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_integration.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_latency.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_metrics.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_optimizer.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_output.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_report.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_sampling.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_schema.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_text_mode.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_validation.py +0 -0
- {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_vendor_chunker.py +0 -0
{recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: recursive-cleaner
-Version: 0.6.1
+Version: 0.7.1
 Summary: LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions
 Project-URL: Homepage, https://github.com/gaztrabisme/recursive-data-cleaner
 Project-URL: Repository, https://github.com/gaztrabisme/recursive-data-cleaner
@@ -26,8 +26,12 @@ Requires-Dist: tenacity>=8.0
 Provides-Extra: dev
 Requires-Dist: pytest-cov>=4.0; extra == 'dev'
 Requires-Dist: pytest>=7.0; extra == 'dev'
+Provides-Extra: markitdown
+Requires-Dist: markitdown>=0.1.0; extra == 'markitdown'
 Provides-Extra: mlx
 Requires-Dist: mlx-lm>=0.10.0; extra == 'mlx'
+Provides-Extra: parquet
+Requires-Dist: pyarrow>=14.0.0; extra == 'parquet'
 Description-Content-Type: text/markdown
 
 # Recursive Data Cleaner
@@ -36,7 +40,7 @@ LLM-powered incremental data cleaning for massive datasets. Process files in chu
 
 ## How It Works
 
-1. **Chunk** your data (JSONL, CSV, JSON,
+1. **Chunk** your data (JSONL, CSV, JSON, Parquet, PDF, Word, Excel, XML, and more)
 2. **Analyze** each chunk with an LLM to identify issues
 3. **Generate** one cleaning function per issue
 4. **Validate** functions on holdout data before accepting
@@ -55,6 +59,16 @@ For Apple Silicon (MLX backend):
 pip install -e ".[mlx]"
 ```
 
+For document conversion (PDF, Word, Excel, HTML, etc.):
+```bash
+pip install -e ".[markitdown]"
+```
+
+For Parquet files:
+```bash
+pip install -e ".[parquet]"
+```
+
 ## Quick Start
 
 ```python
@@ -107,6 +121,11 @@ cleaner.run() # Generates cleaning_functions.py
 - **Cleaning Reports**: Markdown summary with functions, timing, quality delta
 - **Dry-Run Mode**: Analyze data without generating functions
 
+### Format Expansion (v0.7.0)
+- **Markitdown Integration**: Convert 20+ formats (PDF, Word, Excel, PowerPoint, HTML, EPUB, etc.) to text
+- **Parquet Support**: Load parquet files as structured data via pyarrow
+- **LLM-Generated Parsers**: Auto-generate parsers for XML and unknown formats (`auto_parse=True`)
+
 ## Configuration
 
 ```python
@@ -138,6 +157,9 @@ cleaner = DataCleaner(
     report_path="report.md",  # Markdown report output (None to disable)
     dry_run=False,  # Analyze without generating functions
 
+    # Format Expansion
+    auto_parse=False,  # LLM generates parser for unknown formats
+
     # Progress & State
     on_progress=callback,  # Progress event callback
     state_file="state.json",  # Enable resume on interrupt
@@ -231,20 +253,21 @@ cleaner.run()
 
 ```
 recursive_cleaner/
-├── cleaner.py
-├── context.py
-├── dependencies.py
-├── metrics.py
-├── optimizer.py
-├── output.py
-├── parsers.py
-├── prompt.py
-├── report.py
-├── response.py
-├── schema.py
-├── validation.py
+├── cleaner.py           # Main DataCleaner class
+├── context.py           # Docstring registry with FIFO eviction
+├── dependencies.py      # Topological sort for function ordering
+├── metrics.py           # Quality metrics before/after
+├── optimizer.py         # Two-pass consolidation with LLM agency
+├── output.py            # Function file generation + import consolidation
+├── parser_generator.py  # LLM-generated parsers for unknown formats
+├── parsers.py           # Chunking for all formats + sampling
+├── prompt.py            # LLM prompt templates
+├── report.py            # Markdown report generation
+├── response.py          # XML/markdown parsing + agency dataclasses
+├── schema.py            # Schema inference
+├── validation.py        # Runtime validation + holdout
 └── vendor/
-    └── chunker.py
+    └── chunker.py       # Vendored sentence-aware chunker
 ```
 
 ## Testing
@@ -253,7 +276,7 @@ recursive_cleaner/
 pytest tests/ -v
 ```
 
-392 tests covering all features. Test datasets in `test_cases/`:
+432 tests covering all features. Test datasets in `test_cases/`:
 - E-commerce product catalogs
 - Healthcare patient records
 - Financial transaction data
@@ -269,6 +292,7 @@ pytest tests/ -v
 
 | Version | Features |
 |---------|----------|
+| v0.7.0 | Markitdown (20+ formats), Parquet support, LLM-generated parsers |
 | v0.6.0 | Latency metrics, import consolidation, cleaning report, dry-run mode |
 | v0.5.1 | Dangerous code detection (AST-based security) |
 | v0.5.0 | Two-pass optimization, early termination, LLM agency |
{recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/README.md

@@ -4,7 +4,7 @@ LLM-powered incremental data cleaning for massive datasets. Process files in chu
 
 ## How It Works
 
-1. **Chunk** your data (JSONL, CSV, JSON,
+1. **Chunk** your data (JSONL, CSV, JSON, Parquet, PDF, Word, Excel, XML, and more)
 2. **Analyze** each chunk with an LLM to identify issues
 3. **Generate** one cleaning function per issue
 4. **Validate** functions on holdout data before accepting
@@ -23,6 +23,16 @@ For Apple Silicon (MLX backend):
 pip install -e ".[mlx]"
 ```
 
+For document conversion (PDF, Word, Excel, HTML, etc.):
+```bash
+pip install -e ".[markitdown]"
+```
+
+For Parquet files:
+```bash
+pip install -e ".[parquet]"
+```
+
 ## Quick Start
 
 ```python
@@ -75,6 +85,11 @@ cleaner.run() # Generates cleaning_functions.py
 - **Cleaning Reports**: Markdown summary with functions, timing, quality delta
 - **Dry-Run Mode**: Analyze data without generating functions
 
+### Format Expansion (v0.7.0)
+- **Markitdown Integration**: Convert 20+ formats (PDF, Word, Excel, PowerPoint, HTML, EPUB, etc.) to text
+- **Parquet Support**: Load parquet files as structured data via pyarrow
+- **LLM-Generated Parsers**: Auto-generate parsers for XML and unknown formats (`auto_parse=True`)
+
 ## Configuration
 
 ```python
@@ -106,6 +121,9 @@ cleaner = DataCleaner(
     report_path="report.md",  # Markdown report output (None to disable)
     dry_run=False,  # Analyze without generating functions
 
+    # Format Expansion
+    auto_parse=False,  # LLM generates parser for unknown formats
+
     # Progress & State
     on_progress=callback,  # Progress event callback
     state_file="state.json",  # Enable resume on interrupt
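The `auto_parse` flag in the configuration hunk above is the only new constructor parameter in this release. A minimal sketch of turning it on, assuming `DataCleaner` is importable from the package root and given any backend object exposing the `generate(prompt) -> str` method this package expects; the stub backend below is illustrative, not part of the diff:

```python
from recursive_cleaner import DataCleaner  # assumed top-level export


class StubBackend:
    """Illustrative placeholder; swap in a real backend such as the MLX one."""

    def generate(self, prompt: str) -> str:
        raise NotImplementedError("wire up a real model here")


cleaner = DataCleaner(
    llm_backend=StubBackend(),
    file_path="records.xml",  # extension outside the natively supported set
    auto_parse=True,          # let the LLM generate and validate a parser first
)
cleaner.run()
```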
@@ -199,20 +217,21 @@ cleaner.run()
 
 ```
 recursive_cleaner/
-├── cleaner.py
-├── context.py
-├── dependencies.py
-├── metrics.py
-├── optimizer.py
-├── output.py
-├── parsers.py
-├── prompt.py
-├── report.py
-├── response.py
-├── schema.py
-├── validation.py
+├── cleaner.py           # Main DataCleaner class
+├── context.py           # Docstring registry with FIFO eviction
+├── dependencies.py      # Topological sort for function ordering
+├── metrics.py           # Quality metrics before/after
+├── optimizer.py         # Two-pass consolidation with LLM agency
+├── output.py            # Function file generation + import consolidation
+├── parser_generator.py  # LLM-generated parsers for unknown formats
+├── parsers.py           # Chunking for all formats + sampling
+├── prompt.py            # LLM prompt templates
+├── report.py            # Markdown report generation
+├── response.py          # XML/markdown parsing + agency dataclasses
+├── schema.py            # Schema inference
+├── validation.py        # Runtime validation + holdout
 └── vendor/
-    └── chunker.py
+    └── chunker.py       # Vendored sentence-aware chunker
 ```
 
 ## Testing
@@ -221,7 +240,7 @@ recursive_cleaner/
 pytest tests/ -v
 ```
 
-392 tests covering all features. Test datasets in `test_cases/`:
+432 tests covering all features. Test datasets in `test_cases/`:
 - E-commerce product catalogs
 - Healthcare patient records
 - Financial transaction data
@@ -237,6 +256,7 @@ pytest tests/ -v
 
 | Version | Features |
 |---------|----------|
+| v0.7.0 | Markitdown (20+ formats), Parquet support, LLM-generated parsers |
 | v0.6.0 | Latency metrics, import consolidation, cleaning report, dry-run mode |
 | v0.5.1 | Dangerous code detection (AST-based security) |
 | v0.5.0 | Two-pass optimization, early termination, LLM agency |
{recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/TODO.md

@@ -60,30 +60,42 @@ These patterns proved high-value with low implementation effort:
 
 ---
 
-##
+## Tier 5: Format Expansion & UI (v0.7.0) - PLANNED
+
+### Markitdown Integration
+- [ ] Add markitdown as optional dependency
+- [ ] Auto-convert 20+ formats: Excel, HTML, Word, PDF, PowerPoint, EPUB, etc.
+- [ ] Preprocessing step before chunking
+- **Approach**: `pip install recursive-cleaner[markitdown]`
+
+### Parquet Support
+- [ ] Native parser using pyarrow
+- [ ] Read as list of dicts (same as JSONL)
+- **Approach**: Optional dependency, ~10 lines of code
+
+### LLM-Generated Parsers
+- [ ] For XML and unknown formats
+- [ ] Send sample to LLM: "Generate a function to parse this into list of records"
+- [ ] Validate generated parser on sample before using
+- **Approach**: Wu wei - let LLM decide how to parse data it understands
+
+### Terminal UI (Textual)
+- [ ] Optional `[ui]` extra dependency
+- [ ] Live dashboard showing: chunk progress, function generation, latency sparkline
+- [ ] Pure terminal, no browser needed
+- **Approach**: `pip install recursive-cleaner[ui]`
 
-
+---
 
-
-- LLM rates confidence in each generated function (high/medium/low)
-- Low confidence = flag for human review
-- **Question**: Does this actually help users, or just add noise?
+## Future Considerations
 
-
-- User provides expected input→output pairs
-- Validate generated functions match expectations
-- **Question**: How to handle functions that transform data differently but correctly?
+Ideas that might be valuable but need more thought.
 
 ### Multi-File Batch Mode
 - Process multiple files with shared function registry
 - Functions learned from file A applied to file B
 - **Question**: How to handle schema differences between files?
 
-### Summary Buffer Memory
-- Compress old function docstrings into summaries
-- Keep recent functions verbatim
-- **Question**: Does FIFO eviction already work well enough?
-
 ---
 
 ## Explicitly Deferred
recursive_cleaner-0.7.1/docs/contracts/v070-success-criteria.md

@@ -0,0 +1,13 @@
+# Success Criteria - v0.7.0 Format Expansion
+
+## Project-Level Success
+- [ ] Markitdown integration converts 20+ formats to text before chunking
+- [ ] Parquet files load as list of dicts like JSONL/CSV
+- [ ] LLM-generated parsers handle XML and unknown formats
+- [ ] All new formats integrate seamlessly with existing cleaning pipeline
+- [ ] Optional dependencies don't break core functionality when not installed
+- [ ] All 392 existing tests still pass
+
+## Phase Success Criteria
+
+[To be populated during planning]
recursive_cleaner-0.7.1/docs/workflow-state.md

@@ -0,0 +1,26 @@
+# Workflow State - v0.7.0 Format Expansion
+
+## Current Phase
+Research
+
+## Awaiting
+Subagent Completion (Research)
+
+## Blockers
+None
+
+## Progress
+- [ ] Research complete
+- [ ] Contracts approved
+- [ ] Plan approved
+- [ ] Phase 1: Markitdown integration
+- [ ] Phase 1 audit
+- [ ] Phase 2: Parquet support
+- [ ] Phase 2 audit
+- [ ] Phase 3: LLM-generated parsers
+- [ ] Phase 3 audit
+
+## Previous Version (v0.6.0)
+- **Tests**: 392 passing
+- **Lines**: 2,967 total
+- **Status**: Released on GitHub + PyPI
{recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "recursive-cleaner"
-version = "0.6.1"
+version = "0.7.1"
 description = "LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions"
 readme = "README.md"
 license = "MIT"
@@ -46,6 +46,12 @@ dev = [
 mlx = [
     "mlx-lm>=0.10.0",
 ]
+markitdown = [
+    "markitdown>=0.1.0",
+]
+parquet = [
+    "pyarrow>=14.0.0",
+]
 
 [project.urls]
 Homepage = "https://github.com/gaztrabisme/recursive-data-cleaner"
{recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/__init__.py

@@ -16,9 +16,10 @@ from recursive_cleaner.optimizer import (
     group_by_salience,
 )
 from recursive_cleaner.output import write_cleaning_file
-from recursive_cleaner.parsers import chunk_file
+from recursive_cleaner.parsers import MARKITDOWN_EXTENSIONS, chunk_file, load_parquet, preprocess_with_markitdown
 from recursive_cleaner.prompt import build_prompt
 from recursive_cleaner.response import extract_python_block, parse_response
+from recursive_cleaner.parser_generator import check_parser_safety, generate_parser
 from recursive_cleaner.validation import check_code_safety, extract_sample_data, validate_function
 
 __all__ = [
@@ -27,6 +28,9 @@ __all__ = [
     "MaxIterationsError",
     "OutputValidationError",
     "chunk_file",
+    "MARKITDOWN_EXTENSIONS",
+    "load_parquet",
+    "preprocess_with_markitdown",
     "parse_response",
     "extract_python_block",
     "build_context",
@@ -43,4 +47,6 @@ __all__ = [
     "extract_tags",
     "group_by_salience",
     "consolidate_with_agency",
+    "generate_parser",
+    "check_parser_safety",
 ]
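The `parsers.py` hunk itself (+131 lines) is not included in this listing, so the bodies of the newly exported `load_parquet` and `preprocess_with_markitdown` are not visible here. A rough sketch of what the names and the pinned dependencies suggest, built on pyarrow's `read_table(...).to_pylist()` and markitdown's `MarkItDown().convert(...)`; the function bodies are assumptions, only the two library calls are documented public APIs:

```python
import pyarrow.parquet as pq  # provided by the [parquet] extra


def load_parquet(file_path: str) -> list[dict]:
    """Hypothetical body: read a Parquet file as a list of dicts, like JSONL."""
    return pq.read_table(file_path).to_pylist()


def preprocess_with_markitdown(file_path: str) -> str:
    """Hypothetical body: convert a document (PDF, Word, Excel, ...) to text."""
    from markitdown import MarkItDown  # provided by the [markitdown] extra

    return MarkItDown().convert(file_path).text_content
```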
{recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/cleaner.py

@@ -12,7 +12,7 @@ from tenacity import retry, stop_after_attempt, wait_exponential
 from .context import build_context
 from .errors import OutputValidationError, ParseError
 from .metrics import QualityMetrics, compare_quality, load_structured_data, measure_quality
-from .parsers import chunk_file
+from .parsers import MARKITDOWN_EXTENSIONS, chunk_file
 from .prompt import build_prompt
 from .response import parse_response
 from .schema import format_schema_for_prompt, infer_schema
@@ -61,6 +61,7 @@ class DataCleaner:
         saturation_check_interval: int = 20,
         report_path: str | None = "cleaning_report.md",
         dry_run: bool = False,
+        auto_parse: bool = False,
     ):
         self.backend = llm_backend
         self.file_path = file_path
@@ -84,7 +85,9 @@ class DataCleaner:
         self.saturation_check_interval = saturation_check_interval
         self.report_path = report_path
         self.dry_run = dry_run
+        self.auto_parse = auto_parse
         self.functions: list[dict] = []  # List of {name, docstring, code}
+        self._generated_parser: callable | None = None  # LLM-generated parser for unknown formats
         # Track recent function generation for saturation check
         self._recent_new_function_count = 0
         self._last_check_function_count = 0
@@ -319,27 +322,72 @@
     def _detect_mode(self) -> Literal["structured", "text"]:
         """Detect mode from file extension."""
         suffix = Path(self.file_path).suffix.lower()
-        structured_extensions = {".jsonl", ".csv", ".json"}
+        # Markitdown formats are processed as text
+        if suffix in MARKITDOWN_EXTENSIONS:
+            return "text"
+        structured_extensions = {".jsonl", ".csv", ".json", ".parquet"}
         if suffix in structured_extensions:
             return "structured"
         return "text"
 
+    def _is_known_extension(self) -> bool:
+        """Check if file extension is natively supported."""
+        suffix = Path(self.file_path).suffix.lower()
+        known = {".jsonl", ".csv", ".json", ".parquet", ".txt"}
+        return suffix in known or suffix in MARKITDOWN_EXTENSIONS
+
+    def _load_with_auto_parser(self) -> list[str]:
+        """Load file using LLM-generated parser, return JSONL chunks."""
+        from .parser_generator import generate_parser
+
+        print(f"Unknown file format, generating parser...")
+        self._emit("parser_generation_start")
+
+        parser = generate_parser(self.backend, self.file_path)
+        self._generated_parser = parser
+
+        self._emit("parser_generation_complete")
+        print("Parser generated successfully.")
+
+        # Parse the file
+        records = parser(self.file_path)
+        if not records:
+            return []
+
+        # Convert to JSONL chunks
+        import json
+        chunks = []
+        for i in range(0, len(records), self.chunk_size):
+            chunk_records = records[i:i + self.chunk_size]
+            chunk_lines = [json.dumps(r) for r in chunk_records]
+            chunks.append("\n".join(chunk_lines))
+
+        return chunks
+
     def run(self) -> None:
         """Run the cleaning pipeline."""
-        # Resolve effective mode
-        if self.mode == "auto":
-            self._effective_mode = self._detect_mode()
+        # Check if we should use auto-parser for unknown formats
+        use_auto_parser = self.auto_parse and not self._is_known_extension()
+
+        if use_auto_parser:
+            # LLM generates parser, always structured mode
+            self._effective_mode = "structured"
+            chunks = self._load_with_auto_parser()
         else:
-            self._effective_mode = self.mode
+            # Resolve effective mode
+            if self.mode == "auto":
+                self._effective_mode = self._detect_mode()
+            else:
+                self._effective_mode = self.mode
 
-        chunks = chunk_file(
-            self.file_path,
-            self.chunk_size,
-            mode=self._effective_mode,
-            chunk_overlap=self.chunk_overlap,
-            sampling_strategy=self.sampling_strategy,
-            stratify_field=self.stratify_field,
-        )
+            chunks = chunk_file(
+                self.file_path,
+                self.chunk_size,
+                mode=self._effective_mode,
+                chunk_overlap=self.chunk_overlap,
+                sampling_strategy=self.sampling_strategy,
+                stratify_field=self.stratify_field,
+            )
 
         if not chunks:
             print("No data to process.")
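A concrete trace of the JSONL conversion inside `_load_with_auto_parser` above: parsed records are re-serialized `chunk_size` records at a time, so downstream stages see the same chunk shape that `chunk_file` produces for native structured formats. Toy data, same loop:

```python
import json

records = [{"id": 1}, {"id": 2}, {"id": 3}]  # as a generated parser might return
chunk_size = 2

chunks = []
for i in range(0, len(records), chunk_size):
    chunk_lines = [json.dumps(r) for r in records[i:i + chunk_size]]
    chunks.append("\n".join(chunk_lines))

print(chunks)  # ['{"id": 1}\n{"id": 2}', '{"id": 3}']
```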
recursive_cleaner-0.7.1/recursive_cleaner/parser_generator.py

@@ -0,0 +1,123 @@
+"""LLM-generated parser for unknown file formats."""
+
+import ast
+import re
+from pathlib import Path
+
+from .types import LLMBackend
+
+# Dangerous patterns for parser code (allows 'open' since parsers need file I/O)
+_DANGEROUS_IMPORTS = frozenset({
+    "os", "subprocess", "sys", "shutil", "socket", "urllib",
+    "requests", "httplib", "ftplib", "smtplib", "pickle",
+})
+_DANGEROUS_CALLS = frozenset({"eval", "exec", "compile", "__import__"})
+
+PARSER_PROMPT = '''You are a data parsing expert. Generate a Python function to parse this file format.
+
+=== SAMPLE (first 4KB) ===
+{sample}
+
+=== TASK ===
+Generate a function with this EXACT signature:
+
+```python
+def parse_file(file_path: str) -> list[dict]:
+    """Parse the file into a list of records."""
+    # Your implementation
+```
+
+RULES:
+- Return list of dicts, one dict per logical record
+- Use only stdlib (xml.etree, json, re, csv)
+- Handle the ENTIRE file, not just this sample
+- Be defensive about malformed data
+- Include necessary imports inside or before the function
+'''
+
+
+def check_parser_safety(code: str) -> list[str]:
+    """Check parser code for dangerous patterns. Returns list of issues."""
+    issues = []
+    try:
+        tree = ast.parse(code)
+    except SyntaxError as e:
+        return [f"Syntax error: {e}"]
+
+    for node in ast.walk(tree):
+        if isinstance(node, ast.Import):
+            for alias in node.names:
+                module = alias.name.split(".")[0]
+                if module in _DANGEROUS_IMPORTS:
+                    issues.append(f"Dangerous import: {alias.name}")
+        if isinstance(node, ast.ImportFrom):
+            if node.module:
+                module = node.module.split(".")[0]
+                if module in _DANGEROUS_IMPORTS:
+                    issues.append(f"Dangerous import: from {node.module}")
+        if isinstance(node, ast.Call):
+            if isinstance(node.func, ast.Name):
+                if node.func.id in _DANGEROUS_CALLS:
+                    issues.append(f"Dangerous call: {node.func.id}()")
+    return issues
+
+
+def extract_python_block(text: str) -> str:
+    """Extract code from ```python ... ``` block."""
+    match = re.search(r"```python\s*(.*?)\s*```", text, re.DOTALL)
+    return match.group(1).strip() if match else text.strip()
+
+
+def generate_parser(llm_backend: LLMBackend, file_path: str) -> callable:
+    """
+    Generate a parser function for an unknown file format.
+
+    Args:
+        llm_backend: LLM backend implementing generate(prompt) -> str
+        file_path: Path to the file to parse
+
+    Returns:
+        A callable parse_file(file_path) -> list[dict]
+
+    Raises:
+        ValueError: If generated code is unsafe, has invalid syntax,
+            or doesn't return list of dicts
+    """
+    path = Path(file_path)
+    with open(path, "r", errors="replace") as f:
+        sample = f.read(4096)
+
+    prompt = PARSER_PROMPT.format(sample=sample)
+    response = llm_backend.generate(prompt)
+    code = extract_python_block(response)
+
+    # Validate syntax
+    try:
+        ast.parse(code)
+    except SyntaxError as e:
+        raise ValueError(f"Generated parser has invalid syntax: {e}")
+
+    # Security check
+    issues = check_parser_safety(code)
+    if issues:
+        raise ValueError(f"Generated parser contains dangerous code: {issues}")
+
+    # Execute to get function
+    namespace: dict = {}
+    exec(code, namespace)
+
+    if "parse_file" not in namespace:
+        raise ValueError("Generated code must define 'parse_file' function")
+
+    parser = namespace["parse_file"]
+
+    # Validate on actual file
+    result = parser(file_path)
+    if not isinstance(result, list):
+        raise ValueError(f"Parser must return list, got {type(result).__name__}")
+    if result and not isinstance(result[0], dict):
+        raise ValueError(
+            f"Parser must return list of dicts, got list of {type(result[0]).__name__}"
+        )
+
+    return parser