recursive-cleaner 0.6.1__tar.gz → 0.7.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/PKG-INFO +40 -16
  2. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/README.md +35 -15
  3. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/TODO.md +27 -15
  4. recursive_cleaner-0.7.1/docs/contracts/v070-success-criteria.md +13 -0
  5. recursive_cleaner-0.7.1/docs/workflow-state.md +26 -0
  6. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/pyproject.toml +7 -1
  7. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/__init__.py +7 -1
  8. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/cleaner.py +62 -14
  9. recursive_cleaner-0.7.1/recursive_cleaner/parser_generator.py +123 -0
  10. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/parsers.py +131 -1
  11. recursive_cleaner-0.7.1/tests/test_parser_generator.py +611 -0
  12. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_parsers.py +216 -0
  13. recursive_cleaner-0.6.1/docs/workflow-state.md +0 -45
  14. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/.gitignore +0 -0
  15. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/CLAUDE.md +0 -0
  16. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/LICENSE +0 -0
  17. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/backends/__init__.py +0 -0
  18. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/backends/mlx_backend.py +0 -0
  19. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/contracts/api-contract.md +0 -0
  20. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/contracts/data-schema.md +0 -0
  21. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/contracts/success-criteria.md +0 -0
  22. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/contracts/text-mode-contract.md +0 -0
  23. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/contracts/tier2-contract.md +0 -0
  24. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/contracts/tier4-contract.md +0 -0
  25. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/contracts/tier4-success-criteria.md +0 -0
  26. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/contracts/two-pass-contract.md +0 -0
  27. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/handoffs/tier4-handoff.md +0 -0
  28. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/implementation-plan-tier4.md +0 -0
  29. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/implementation-plan-v03.md +0 -0
  30. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/implementation-plan-v04.md +0 -0
  31. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/implementation-plan-v05.md +0 -0
  32. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/implementation-plan.md +0 -0
  33. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/langchain-analysis.md +0 -0
  34. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/langgraph-analysis.md +0 -0
  35. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/mlx-lm-guide.md +0 -0
  36. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/other-frameworks-analysis.md +0 -0
  37. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/refactor-assessment/data/dependency.json +0 -0
  38. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/refactor-assessment/data/stats.json +0 -0
  39. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/refactor-assessment/plan.md +0 -0
  40. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/refactor-assessment/report.md +0 -0
  41. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/research/chonkie-extraction.md +0 -0
  42. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/research/chonkie.md +0 -0
  43. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/research/markitdown.md +0 -0
  44. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/docs/smolagents-analysis.md +0 -0
  45. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/context.py +0 -0
  46. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/dependencies.py +0 -0
  47. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/errors.py +0 -0
  48. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/metrics.py +0 -0
  49. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/optimizer.py +0 -0
  50. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/output.py +0 -0
  51. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/prompt.py +0 -0
  52. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/report.py +0 -0
  53. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/response.py +0 -0
  54. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/schema.py +0 -0
  55. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/types.py +0 -0
  56. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/validation.py +0 -0
  57. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/vendor/__init__.py +0 -0
  58. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/recursive_cleaner/vendor/chunker.py +0 -0
  59. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/test_cases/ecommerce_instructions.txt +0 -0
  60. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/test_cases/ecommerce_products.jsonl +0 -0
  61. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/test_cases/financial_instructions.txt +0 -0
  62. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/test_cases/financial_transactions.jsonl +0 -0
  63. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/test_cases/healthcare_instructions.txt +0 -0
  64. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/test_cases/healthcare_patients.jsonl +0 -0
  65. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/test_cases/run_ecommerce_test.py +0 -0
  66. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/test_cases/run_financial_test.py +0 -0
  67. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/test_cases/run_healthcare_test.py +0 -0
  68. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/__init__.py +0 -0
  69. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_callbacks.py +0 -0
  70. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_cleaner.py +0 -0
  71. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_context.py +0 -0
  72. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_dependencies.py +0 -0
  73. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_dry_run.py +0 -0
  74. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_holdout.py +0 -0
  75. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_incremental.py +0 -0
  76. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_integration.py +0 -0
  77. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_latency.py +0 -0
  78. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_metrics.py +0 -0
  79. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_optimizer.py +0 -0
  80. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_output.py +0 -0
  81. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_report.py +0 -0
  82. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_sampling.py +0 -0
  83. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_schema.py +0 -0
  84. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_text_mode.py +0 -0
  85. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_validation.py +0 -0
  86. {recursive_cleaner-0.6.1 → recursive_cleaner-0.7.1}/tests/test_vendor_chunker.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: recursive-cleaner
-Version: 0.6.1
+Version: 0.7.1
 Summary: LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions
 Project-URL: Homepage, https://github.com/gaztrabisme/recursive-data-cleaner
 Project-URL: Repository, https://github.com/gaztrabisme/recursive-data-cleaner
@@ -26,8 +26,12 @@ Requires-Dist: tenacity>=8.0
 Provides-Extra: dev
 Requires-Dist: pytest-cov>=4.0; extra == 'dev'
 Requires-Dist: pytest>=7.0; extra == 'dev'
+Provides-Extra: markitdown
+Requires-Dist: markitdown>=0.1.0; extra == 'markitdown'
 Provides-Extra: mlx
 Requires-Dist: mlx-lm>=0.10.0; extra == 'mlx'
+Provides-Extra: parquet
+Requires-Dist: pyarrow>=14.0.0; extra == 'parquet'
 Description-Content-Type: text/markdown

 # Recursive Data Cleaner
@@ -36,7 +40,7 @@ LLM-powered incremental data cleaning for massive datasets. Process files in chu

 ## How It Works

-1. **Chunk** your data (JSONL, CSV, JSON, or text)
+1. **Chunk** your data (JSONL, CSV, JSON, Parquet, PDF, Word, Excel, XML, and more)
 2. **Analyze** each chunk with an LLM to identify issues
 3. **Generate** one cleaning function per issue
 4. **Validate** functions on holdout data before accepting
@@ -55,6 +59,16 @@ For Apple Silicon (MLX backend):
 pip install -e ".[mlx]"
 ```

+For document conversion (PDF, Word, Excel, HTML, etc.):
+```bash
+pip install -e ".[markitdown]"
+```
+
+For Parquet files:
+```bash
+pip install -e ".[parquet]"
+```
+
 ## Quick Start

 ```python
@@ -107,6 +121,11 @@ cleaner.run() # Generates cleaning_functions.py
 - **Cleaning Reports**: Markdown summary with functions, timing, quality delta
 - **Dry-Run Mode**: Analyze data without generating functions

+### Format Expansion (v0.7.0)
+- **Markitdown Integration**: Convert 20+ formats (PDF, Word, Excel, PowerPoint, HTML, EPUB, etc.) to text
+- **Parquet Support**: Load parquet files as structured data via pyarrow
+- **LLM-Generated Parsers**: Auto-generate parsers for XML and unknown formats (`auto_parse=True`)
+
 ## Configuration

 ```python
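The Parquet bullet above means `.parquet` rides the existing structured path: rows load as dict records and are re-serialized into JSONL-style chunks, exactly like JSONL/CSV input. A minimal sketch of that entry point, assuming the `chunk_file` signature visible in the `cleaner.py` hunk further down (the file name is illustrative, and the `[parquet]` extra must be installed):

```python
from recursive_cleaner.parsers import chunk_file

# With pyarrow available via the [parquet] extra, a .parquet file takes the
# same structured route as JSONL/CSV: each row becomes one dict record.
chunks = chunk_file(
    "products.parquet",  # illustrative path, not a file shipped with the package
    50,                  # records per chunk
    mode="structured",
)
print(f"{len(chunks)} chunks ready for LLM analysis")
```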
@@ -138,6 +157,9 @@ cleaner = DataCleaner(
     report_path="report.md",      # Markdown report output (None to disable)
     dry_run=False,                # Analyze without generating functions

+    # Format Expansion
+    auto_parse=False,             # LLM generates parser for unknown formats
+
     # Progress & State
     on_progress=callback,         # Progress event callback
     state_file="state.json",      # Enable resume on interrupt
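`auto_parse` is the only new configuration knob in this release, and per the `DataCleaner.run()` hunk further down it only fires when the file extension is unknown; known formats never trigger parser generation. A schematic sketch (the backend stub and file name are hypothetical, and other constructor arguments are elided):

```python
from recursive_cleaner.cleaner import DataCleaner

class StubBackend:
    """Hypothetical stand-in: the package only requires generate(prompt) -> str."""
    def generate(self, prompt: str) -> str:
        raise NotImplementedError("wire up a real model here")

cleaner = DataCleaner(
    llm_backend=StubBackend(),
    file_path="export.custom",  # unknown extension, so the auto-parser path applies
    auto_parse=True,            # LLM writes and validates a parse_file() first
)
# cleaner.run() would generate the parser, re-chunk records as JSONL,
# then continue through the normal structured-mode pipeline.
```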
@@ -231,20 +253,21 @@ cleaner.run()

 ```
 recursive_cleaner/
-├── cleaner.py        # Main DataCleaner class (~580 lines)
-├── context.py        # Docstring registry with FIFO eviction
-├── dependencies.py   # Topological sort for function ordering
-├── metrics.py        # Quality metrics before/after
-├── optimizer.py      # Two-pass consolidation with LLM agency
-├── output.py         # Function file generation + import consolidation
-├── parsers.py        # Chunking for JSONL/CSV/JSON/text + sampling
-├── prompt.py         # LLM prompt templates
-├── report.py         # Markdown report generation
-├── response.py       # XML/markdown parsing + agency dataclasses
-├── schema.py         # Schema inference
-├── validation.py     # Runtime validation + holdout
+├── cleaner.py            # Main DataCleaner class
+├── context.py            # Docstring registry with FIFO eviction
+├── dependencies.py       # Topological sort for function ordering
+├── metrics.py            # Quality metrics before/after
+├── optimizer.py          # Two-pass consolidation with LLM agency
+├── output.py             # Function file generation + import consolidation
+├── parser_generator.py   # LLM-generated parsers for unknown formats
+├── parsers.py            # Chunking for all formats + sampling
+├── prompt.py             # LLM prompt templates
+├── report.py             # Markdown report generation
+├── response.py           # XML/markdown parsing + agency dataclasses
+├── schema.py             # Schema inference
+├── validation.py         # Runtime validation + holdout
 └── vendor/
-    └── chunker.py    # Vendored sentence-aware chunker
+    └── chunker.py        # Vendored sentence-aware chunker
 ```

 ## Testing
@@ -253,7 +276,7 @@ recursive_cleaner/
 pytest tests/ -v
 ```

-392 tests covering all features. Test datasets in `test_cases/`:
+432 tests covering all features. Test datasets in `test_cases/`:
 - E-commerce product catalogs
 - Healthcare patient records
 - Financial transaction data
@@ -269,6 +292,7 @@ pytest tests/ -v

 | Version | Features |
 |---------|----------|
+| v0.7.0 | Markitdown (20+ formats), Parquet support, LLM-generated parsers |
 | v0.6.0 | Latency metrics, import consolidation, cleaning report, dry-run mode |
 | v0.5.1 | Dangerous code detection (AST-based security) |
 | v0.5.0 | Two-pass optimization, early termination, LLM agency |
@@ -4,7 +4,7 @@ LLM-powered incremental data cleaning for massive datasets. Process files in chu

 ## How It Works

-1. **Chunk** your data (JSONL, CSV, JSON, or text)
+1. **Chunk** your data (JSONL, CSV, JSON, Parquet, PDF, Word, Excel, XML, and more)
 2. **Analyze** each chunk with an LLM to identify issues
 3. **Generate** one cleaning function per issue
 4. **Validate** functions on holdout data before accepting
@@ -23,6 +23,16 @@ For Apple Silicon (MLX backend):
 pip install -e ".[mlx]"
 ```

+For document conversion (PDF, Word, Excel, HTML, etc.):
+```bash
+pip install -e ".[markitdown]"
+```
+
+For Parquet files:
+```bash
+pip install -e ".[parquet]"
+```
+
 ## Quick Start

 ```python
@@ -75,6 +85,11 @@ cleaner.run() # Generates cleaning_functions.py
 - **Cleaning Reports**: Markdown summary with functions, timing, quality delta
 - **Dry-Run Mode**: Analyze data without generating functions

+### Format Expansion (v0.7.0)
+- **Markitdown Integration**: Convert 20+ formats (PDF, Word, Excel, PowerPoint, HTML, EPUB, etc.) to text
+- **Parquet Support**: Load parquet files as structured data via pyarrow
+- **LLM-Generated Parsers**: Auto-generate parsers for XML and unknown formats (`auto_parse=True`)
+
 ## Configuration

 ```python
@@ -106,6 +121,9 @@ cleaner = DataCleaner(
     report_path="report.md",      # Markdown report output (None to disable)
     dry_run=False,                # Analyze without generating functions

+    # Format Expansion
+    auto_parse=False,             # LLM generates parser for unknown formats
+
     # Progress & State
     on_progress=callback,         # Progress event callback
     state_file="state.json",      # Enable resume on interrupt
@@ -199,20 +217,21 @@ cleaner.run()

 ```
 recursive_cleaner/
-├── cleaner.py        # Main DataCleaner class (~580 lines)
-├── context.py        # Docstring registry with FIFO eviction
-├── dependencies.py   # Topological sort for function ordering
-├── metrics.py        # Quality metrics before/after
-├── optimizer.py      # Two-pass consolidation with LLM agency
-├── output.py         # Function file generation + import consolidation
-├── parsers.py        # Chunking for JSONL/CSV/JSON/text + sampling
-├── prompt.py         # LLM prompt templates
-├── report.py         # Markdown report generation
-├── response.py       # XML/markdown parsing + agency dataclasses
-├── schema.py         # Schema inference
-├── validation.py     # Runtime validation + holdout
+├── cleaner.py            # Main DataCleaner class
+├── context.py            # Docstring registry with FIFO eviction
+├── dependencies.py       # Topological sort for function ordering
+├── metrics.py            # Quality metrics before/after
+├── optimizer.py          # Two-pass consolidation with LLM agency
+├── output.py             # Function file generation + import consolidation
+├── parser_generator.py   # LLM-generated parsers for unknown formats
+├── parsers.py            # Chunking for all formats + sampling
+├── prompt.py             # LLM prompt templates
+├── report.py             # Markdown report generation
+├── response.py           # XML/markdown parsing + agency dataclasses
+├── schema.py             # Schema inference
+├── validation.py         # Runtime validation + holdout
 └── vendor/
-    └── chunker.py    # Vendored sentence-aware chunker
+    └── chunker.py        # Vendored sentence-aware chunker
 ```

 ## Testing
@@ -221,7 +240,7 @@ recursive_cleaner/
 pytest tests/ -v
 ```

-392 tests covering all features. Test datasets in `test_cases/`:
+432 tests covering all features. Test datasets in `test_cases/`:
 - E-commerce product catalogs
 - Healthcare patient records
 - Financial transaction data
@@ -237,6 +256,7 @@ pytest tests/ -v

 | Version | Features |
 |---------|----------|
+| v0.7.0 | Markitdown (20+ formats), Parquet support, LLM-generated parsers |
 | v0.6.0 | Latency metrics, import consolidation, cleaning report, dry-run mode |
 | v0.5.1 | Dangerous code detection (AST-based security) |
 | v0.5.0 | Two-pass optimization, early termination, LLM agency |
@@ -60,30 +60,42 @@ These patterns proved high-value with low implementation effort:

 ---

-## Future Considerations
+## Tier 5: Format Expansion & UI (v0.7.0) - PLANNED
+
+### Markitdown Integration
+- [ ] Add markitdown as optional dependency
+- [ ] Auto-convert 20+ formats: Excel, HTML, Word, PDF, PowerPoint, EPUB, etc.
+- [ ] Preprocessing step before chunking
+- **Approach**: `pip install recursive-cleaner[markitdown]`
+
+### Parquet Support
+- [ ] Native parser using pyarrow
+- [ ] Read as list of dicts (same as JSONL)
+- **Approach**: Optional dependency, ~10 lines of code
+
+### LLM-Generated Parsers
+- [ ] For XML and unknown formats
+- [ ] Send sample to LLM: "Generate a function to parse this into list of records"
+- [ ] Validate generated parser on sample before using
+- **Approach**: Wu wei - let LLM decide how to parse data it understands
+
+### Terminal UI (Textual)
+- [ ] Optional `[ui]` extra dependency
+- [ ] Live dashboard showing: chunk progress, function generation, latency sparkline
+- [ ] Pure terminal, no browser needed
+- **Approach**: `pip install recursive-cleaner[ui]`

-Ideas that might be valuable but need more thought.
+---

-### Confidence Scoring
-- LLM rates confidence in each generated function (high/medium/low)
-- Low confidence = flag for human review
-- **Question**: Does this actually help users, or just add noise?
+## Future Considerations

-### Before/After Examples
-- User provides expected input→output pairs
-- Validate generated functions match expectations
-- **Question**: How to handle functions that transform data differently but correctly?
+Ideas that might be valuable but need more thought.

 ### Multi-File Batch Mode
 - Process multiple files with shared function registry
 - Functions learned from file A applied to file B
 - **Question**: How to handle schema differences between files?

-### Summary Buffer Memory
-- Compress old function docstrings into summaries
-- Keep recent functions verbatim
-- **Question**: Does FIFO eviction already work well enough?
-
 ---

 ## Explicitly Deferred
@@ -0,0 +1,13 @@
+# Success Criteria - v0.7.0 Format Expansion
+
+## Project-Level Success
+- [ ] Markitdown integration converts 20+ formats to text before chunking
+- [ ] Parquet files load as list of dicts like JSONL/CSV
+- [ ] LLM-generated parsers handle XML and unknown formats
+- [ ] All new formats integrate seamlessly with existing cleaning pipeline
+- [ ] Optional dependencies don't break core functionality when not installed
+- [ ] All 392 existing tests still pass
+
+## Phase Success Criteria
+
+[To be populated during planning]
@@ -0,0 +1,26 @@
+# Workflow State - v0.7.0 Format Expansion
+
+## Current Phase
+Research
+
+## Awaiting
+Subagent Completion (Research)
+
+## Blockers
+None
+
+## Progress
+- [ ] Research complete
+- [ ] Contracts approved
+- [ ] Plan approved
+- [ ] Phase 1: Markitdown integration
+- [ ] Phase 1 audit
+- [ ] Phase 2: Parquet support
+- [ ] Phase 2 audit
+- [ ] Phase 3: LLM-generated parsers
+- [ ] Phase 3 audit
+
+## Previous Version (v0.6.0)
+- **Tests**: 392 passing
+- **Lines**: 2,967 total
+- **Status**: Released on GitHub + PyPI
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

 [project]
 name = "recursive-cleaner"
-version = "0.6.1"
+version = "0.7.1"
 description = "LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions"
 readme = "README.md"
 license = "MIT"
@@ -46,6 +46,12 @@ dev = [
 mlx = [
     "mlx-lm>=0.10.0",
 ]
+markitdown = [
+    "markitdown>=0.1.0",
+]
+parquet = [
+    "pyarrow>=14.0.0",
+]

 [project.urls]
 Homepage = "https://github.com/gaztrabisme/recursive-data-cleaner"
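Both extras are meant to stay optional; the success criteria above require core functionality to survive without them. The guard inside `parsers.py` isn't visible in this diff, but the exported `load_parquet` presumably follows the standard lazy-import pattern; a sketch under that assumption:

```python
# Hypothetical sketch of load_parquet; the shipped parsers.py implementation
# is not shown in this diff and may differ.
def load_parquet(file_path: str) -> list[dict]:
    """Load a Parquet file as a list of dicts, mirroring JSONL/CSV records."""
    try:
        import pyarrow.parquet as pq  # provided by the [parquet] extra
    except ImportError as err:
        raise ImportError(
            "Parquet support requires pyarrow: pip install 'recursive-cleaner[parquet]'"
        ) from err
    table = pq.read_table(file_path)  # columnar read of the whole file
    return table.to_pylist()          # one dict per row
```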
@@ -16,9 +16,10 @@ from recursive_cleaner.optimizer import (
     group_by_salience,
 )
 from recursive_cleaner.output import write_cleaning_file
-from recursive_cleaner.parsers import chunk_file
+from recursive_cleaner.parsers import MARKITDOWN_EXTENSIONS, chunk_file, load_parquet, preprocess_with_markitdown
 from recursive_cleaner.prompt import build_prompt
 from recursive_cleaner.response import extract_python_block, parse_response
+from recursive_cleaner.parser_generator import check_parser_safety, generate_parser
 from recursive_cleaner.validation import check_code_safety, extract_sample_data, validate_function

 __all__ = [
@@ -27,6 +28,9 @@ __all__ = [
     "MaxIterationsError",
     "OutputValidationError",
     "chunk_file",
+    "MARKITDOWN_EXTENSIONS",
+    "load_parquet",
+    "preprocess_with_markitdown",
     "parse_response",
     "extract_python_block",
     "build_context",
@@ -43,4 +47,6 @@ __all__ = [
     "extract_tags",
     "group_by_salience",
     "consolidate_with_agency",
+    "generate_parser",
+    "check_parser_safety",
 ]
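For reference, the new names this hunk re-exports at the package root; a quick import check (purely illustrative):

```python
# All five v0.7.x additions are importable from the package root per the new __all__.
from recursive_cleaner import (
    MARKITDOWN_EXTENSIONS,
    check_parser_safety,
    generate_parser,
    load_parquet,
    preprocess_with_markitdown,
)

print(len(MARKITDOWN_EXTENSIONS), "markitdown-handled suffixes registered")
```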
@@ -12,7 +12,7 @@ from tenacity import retry, stop_after_attempt, wait_exponential
 from .context import build_context
 from .errors import OutputValidationError, ParseError
 from .metrics import QualityMetrics, compare_quality, load_structured_data, measure_quality
-from .parsers import chunk_file
+from .parsers import MARKITDOWN_EXTENSIONS, chunk_file
 from .prompt import build_prompt
 from .response import parse_response
 from .schema import format_schema_for_prompt, infer_schema
@@ -61,6 +61,7 @@ class DataCleaner:
         saturation_check_interval: int = 20,
         report_path: str | None = "cleaning_report.md",
         dry_run: bool = False,
+        auto_parse: bool = False,
     ):
         self.backend = llm_backend
         self.file_path = file_path
@@ -84,7 +85,9 @@
         self.saturation_check_interval = saturation_check_interval
         self.report_path = report_path
         self.dry_run = dry_run
+        self.auto_parse = auto_parse
         self.functions: list[dict] = []  # List of {name, docstring, code}
+        self._generated_parser: callable | None = None  # LLM-generated parser for unknown formats
         # Track recent function generation for saturation check
         self._recent_new_function_count = 0
         self._last_check_function_count = 0
@@ -319,27 +322,72 @@ class DataCleaner:
     def _detect_mode(self) -> Literal["structured", "text"]:
         """Detect mode from file extension."""
         suffix = Path(self.file_path).suffix.lower()
-        structured_extensions = {".jsonl", ".csv", ".json"}
+        # Markitdown formats are processed as text
+        if suffix in MARKITDOWN_EXTENSIONS:
+            return "text"
+        structured_extensions = {".jsonl", ".csv", ".json", ".parquet"}
         if suffix in structured_extensions:
             return "structured"
         return "text"

+    def _is_known_extension(self) -> bool:
+        """Check if file extension is natively supported."""
+        suffix = Path(self.file_path).suffix.lower()
+        known = {".jsonl", ".csv", ".json", ".parquet", ".txt"}
+        return suffix in known or suffix in MARKITDOWN_EXTENSIONS
+
+    def _load_with_auto_parser(self) -> list[str]:
+        """Load file using LLM-generated parser, return JSONL chunks."""
+        from .parser_generator import generate_parser
+
+        print(f"Unknown file format, generating parser...")
+        self._emit("parser_generation_start")
+
+        parser = generate_parser(self.backend, self.file_path)
+        self._generated_parser = parser
+
+        self._emit("parser_generation_complete")
+        print("Parser generated successfully.")
+
+        # Parse the file
+        records = parser(self.file_path)
+        if not records:
+            return []
+
+        # Convert to JSONL chunks
+        import json
+        chunks = []
+        for i in range(0, len(records), self.chunk_size):
+            chunk_records = records[i:i + self.chunk_size]
+            chunk_lines = [json.dumps(r) for r in chunk_records]
+            chunks.append("\n".join(chunk_lines))
+
+        return chunks
+
     def run(self) -> None:
         """Run the cleaning pipeline."""
-        # Resolve effective mode
-        if self.mode == "auto":
-            self._effective_mode = self._detect_mode()
+        # Check if we should use auto-parser for unknown formats
+        use_auto_parser = self.auto_parse and not self._is_known_extension()
+
+        if use_auto_parser:
+            # LLM generates parser, always structured mode
+            self._effective_mode = "structured"
+            chunks = self._load_with_auto_parser()
         else:
-            self._effective_mode = self.mode
+            # Resolve effective mode
+            if self.mode == "auto":
+                self._effective_mode = self._detect_mode()
+            else:
+                self._effective_mode = self.mode

-        chunks = chunk_file(
-            self.file_path,
-            self.chunk_size,
-            mode=self._effective_mode,
-            chunk_overlap=self.chunk_overlap,
-            sampling_strategy=self.sampling_strategy,
-            stratify_field=self.stratify_field,
-        )
+            chunks = chunk_file(
+                self.file_path,
+                self.chunk_size,
+                mode=self._effective_mode,
+                chunk_overlap=self.chunk_overlap,
+                sampling_strategy=self.sampling_strategy,
+                stratify_field=self.stratify_field,
+            )

         if not chunks:
             print("No data to process.")
@@ -0,0 +1,123 @@
+"""LLM-generated parser for unknown file formats."""
+
+import ast
+import re
+from pathlib import Path
+
+from .types import LLMBackend
+
+# Dangerous patterns for parser code (allows 'open' since parsers need file I/O)
+_DANGEROUS_IMPORTS = frozenset({
+    "os", "subprocess", "sys", "shutil", "socket", "urllib",
+    "requests", "httplib", "ftplib", "smtplib", "pickle",
+})
+_DANGEROUS_CALLS = frozenset({"eval", "exec", "compile", "__import__"})
+
+PARSER_PROMPT = '''You are a data parsing expert. Generate a Python function to parse this file format.
+
+=== SAMPLE (first 4KB) ===
+{sample}
+
+=== TASK ===
+Generate a function with this EXACT signature:
+
+```python
+def parse_file(file_path: str) -> list[dict]:
+    """Parse the file into a list of records."""
+    # Your implementation
+```
+
+RULES:
+- Return list of dicts, one dict per logical record
+- Use only stdlib (xml.etree, json, re, csv)
+- Handle the ENTIRE file, not just this sample
+- Be defensive about malformed data
+- Include necessary imports inside or before the function
+'''
+
+
+def check_parser_safety(code: str) -> list[str]:
+    """Check parser code for dangerous patterns. Returns list of issues."""
+    issues = []
+    try:
+        tree = ast.parse(code)
+    except SyntaxError as e:
+        return [f"Syntax error: {e}"]
+
+    for node in ast.walk(tree):
+        if isinstance(node, ast.Import):
+            for alias in node.names:
+                module = alias.name.split(".")[0]
+                if module in _DANGEROUS_IMPORTS:
+                    issues.append(f"Dangerous import: {alias.name}")
+        if isinstance(node, ast.ImportFrom):
+            if node.module:
+                module = node.module.split(".")[0]
+                if module in _DANGEROUS_IMPORTS:
+                    issues.append(f"Dangerous import: from {node.module}")
+        if isinstance(node, ast.Call):
+            if isinstance(node.func, ast.Name):
+                if node.func.id in _DANGEROUS_CALLS:
+                    issues.append(f"Dangerous call: {node.func.id}()")
+    return issues
+
+
+def extract_python_block(text: str) -> str:
+    """Extract code from ```python ... ``` block."""
+    match = re.search(r"```python\s*(.*?)\s*```", text, re.DOTALL)
+    return match.group(1).strip() if match else text.strip()
+
+
+def generate_parser(llm_backend: LLMBackend, file_path: str) -> callable:
+    """
+    Generate a parser function for an unknown file format.
+
+    Args:
+        llm_backend: LLM backend implementing generate(prompt) -> str
+        file_path: Path to the file to parse
+
+    Returns:
+        A callable parse_file(file_path) -> list[dict]
+
+    Raises:
+        ValueError: If generated code is unsafe, has invalid syntax,
+            or doesn't return list of dicts
+    """
+    path = Path(file_path)
+    with open(path, "r", errors="replace") as f:
+        sample = f.read(4096)
+
+    prompt = PARSER_PROMPT.format(sample=sample)
+    response = llm_backend.generate(prompt)
+    code = extract_python_block(response)
+
+    # Validate syntax
+    try:
+        ast.parse(code)
+    except SyntaxError as e:
+        raise ValueError(f"Generated parser has invalid syntax: {e}")
+
+    # Security check
+    issues = check_parser_safety(code)
+    if issues:
+        raise ValueError(f"Generated parser contains dangerous code: {issues}")
+
+    # Execute to get function
+    namespace: dict = {}
+    exec(code, namespace)
+
+    if "parse_file" not in namespace:
+        raise ValueError("Generated code must define 'parse_file' function")
+
+    parser = namespace["parse_file"]
+
+    # Validate on actual file
+    result = parser(file_path)
+    if not isinstance(result, list):
+        raise ValueError(f"Parser must return list, got {type(result).__name__}")
+    if result and not isinstance(result[0], dict):
+        raise ValueError(
+            f"Parser must return list of dicts, got list of {type(result[0]).__name__}"
+        )
+
+    return parser
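The whole loop in `generate_parser` — sample, prompt, extract, syntax-check, safety-check, exec, validate on the real file — can be exercised without a live model by stubbing the backend. A minimal sketch (the `CannedBackend` class and throwaway file are illustrative only):

```python
import tempfile

from recursive_cleaner.parser_generator import check_parser_safety, generate_parser

class CannedBackend:
    """Stub satisfying the generate(prompt) -> str contract with a fixed answer."""
    def generate(self, prompt: str) -> str:
        return (
            "```python\n"
            "import json\n"
            "def parse_file(file_path):\n"
            "    with open(file_path) as f:\n"
            "        return [json.loads(line) for line in f if line.strip()]\n"
            "```"
        )

# A tiny line-delimited file the canned parser understands.
with tempfile.NamedTemporaryFile("w", suffix=".weird", delete=False) as f:
    f.write('{"id": 1}\n{"id": 2}\n')
    path = f.name

parser = generate_parser(CannedBackend(), path)  # passes syntax, safety, and output checks
assert parser(path) == [{"id": 1}, {"id": 2}]

# The AST gate rejects parsers that reach for subprocess, eval, and friends.
assert check_parser_safety("import subprocess") == ["Dangerous import: subprocess"]
```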