recursive-cleaner 0.7.0__tar.gz → 0.7.1__tar.gz

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/PKG-INFO +36 -16
  2. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/README.md +35 -15
  3. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/pyproject.toml +1 -1
  4. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/.gitignore +0 -0
  5. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/CLAUDE.md +0 -0
  6. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/LICENSE +0 -0
  7. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/TODO.md +0 -0
  8. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/backends/__init__.py +0 -0
  9. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/backends/mlx_backend.py +0 -0
  10. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/docs/contracts/api-contract.md +0 -0
  11. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/docs/contracts/data-schema.md +0 -0
  12. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/docs/contracts/success-criteria.md +0 -0
  13. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/docs/contracts/text-mode-contract.md +0 -0
  14. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/docs/contracts/tier2-contract.md +0 -0
  15. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/docs/contracts/tier4-contract.md +0 -0
  16. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/docs/contracts/tier4-success-criteria.md +0 -0
  17. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/docs/contracts/two-pass-contract.md +0 -0
  18. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/docs/contracts/v070-success-criteria.md +0 -0
  19. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/docs/handoffs/tier4-handoff.md +0 -0
  20. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/docs/implementation-plan-tier4.md +0 -0
  21. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/docs/implementation-plan-v03.md +0 -0
  22. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/docs/implementation-plan-v04.md +0 -0
  23. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/docs/implementation-plan-v05.md +0 -0
  24. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/docs/implementation-plan.md +0 -0
  25. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/docs/langchain-analysis.md +0 -0
  26. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/docs/langgraph-analysis.md +0 -0
  27. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/docs/mlx-lm-guide.md +0 -0
  28. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/docs/other-frameworks-analysis.md +0 -0
  29. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/docs/refactor-assessment/data/dependency.json +0 -0
  30. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/docs/refactor-assessment/data/stats.json +0 -0
  31. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/docs/refactor-assessment/plan.md +0 -0
  32. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/docs/refactor-assessment/report.md +0 -0
  33. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/docs/research/chonkie-extraction.md +0 -0
  34. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/docs/research/chonkie.md +0 -0
  35. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/docs/research/markitdown.md +0 -0
  36. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/docs/smolagents-analysis.md +0 -0
  37. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/docs/workflow-state.md +0 -0
  38. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/recursive_cleaner/__init__.py +0 -0
  39. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/recursive_cleaner/cleaner.py +0 -0
  40. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/recursive_cleaner/context.py +0 -0
  41. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/recursive_cleaner/dependencies.py +0 -0
  42. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/recursive_cleaner/errors.py +0 -0
  43. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/recursive_cleaner/metrics.py +0 -0
  44. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/recursive_cleaner/optimizer.py +0 -0
  45. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/recursive_cleaner/output.py +0 -0
  46. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/recursive_cleaner/parser_generator.py +0 -0
  47. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/recursive_cleaner/parsers.py +0 -0
  48. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/recursive_cleaner/prompt.py +0 -0
  49. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/recursive_cleaner/report.py +0 -0
  50. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/recursive_cleaner/response.py +0 -0
  51. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/recursive_cleaner/schema.py +0 -0
  52. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/recursive_cleaner/types.py +0 -0
  53. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/recursive_cleaner/validation.py +0 -0
  54. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/recursive_cleaner/vendor/__init__.py +0 -0
  55. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/recursive_cleaner/vendor/chunker.py +0 -0
  56. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/test_cases/ecommerce_instructions.txt +0 -0
  57. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/test_cases/ecommerce_products.jsonl +0 -0
  58. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/test_cases/financial_instructions.txt +0 -0
  59. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/test_cases/financial_transactions.jsonl +0 -0
  60. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/test_cases/healthcare_instructions.txt +0 -0
  61. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/test_cases/healthcare_patients.jsonl +0 -0
  62. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/test_cases/run_ecommerce_test.py +0 -0
  63. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/test_cases/run_financial_test.py +0 -0
  64. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/test_cases/run_healthcare_test.py +0 -0
  65. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/tests/__init__.py +0 -0
  66. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/tests/test_callbacks.py +0 -0
  67. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/tests/test_cleaner.py +0 -0
  68. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/tests/test_context.py +0 -0
  69. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/tests/test_dependencies.py +0 -0
  70. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/tests/test_dry_run.py +0 -0
  71. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/tests/test_holdout.py +0 -0
  72. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/tests/test_incremental.py +0 -0
  73. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/tests/test_integration.py +0 -0
  74. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/tests/test_latency.py +0 -0
  75. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/tests/test_metrics.py +0 -0
  76. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/tests/test_optimizer.py +0 -0
  77. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/tests/test_output.py +0 -0
  78. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/tests/test_parser_generator.py +0 -0
  79. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/tests/test_parsers.py +0 -0
  80. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/tests/test_report.py +0 -0
  81. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/tests/test_sampling.py +0 -0
  82. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/tests/test_schema.py +0 -0
  83. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/tests/test_text_mode.py +0 -0
  84. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/tests/test_validation.py +0 -0
  85. {recursive_cleaner-0.7.0 → recursive_cleaner-0.7.1}/tests/test_vendor_chunker.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: recursive-cleaner
3
- Version: 0.7.0
3
+ Version: 0.7.1
4
4
  Summary: LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions
5
5
  Project-URL: Homepage, https://github.com/gaztrabisme/recursive-data-cleaner
6
6
  Project-URL: Repository, https://github.com/gaztrabisme/recursive-data-cleaner
@@ -40,7 +40,7 @@ LLM-powered incremental data cleaning for massive datasets. Process files in chu
40
40
 
41
41
  ## How It Works
42
42
 
43
- 1. **Chunk** your data (JSONL, CSV, JSON, or text)
43
+ 1. **Chunk** your data (JSONL, CSV, JSON, Parquet, PDF, Word, Excel, XML, and more)
44
44
  2. **Analyze** each chunk with an LLM to identify issues
45
45
  3. **Generate** one cleaning function per issue
46
46
  4. **Validate** functions on holdout data before accepting
@@ -59,6 +59,16 @@ For Apple Silicon (MLX backend):
59
59
  pip install -e ".[mlx]"
60
60
  ```
61
61
 
62
+ For document conversion (PDF, Word, Excel, HTML, etc.):
63
+ ```bash
64
+ pip install -e ".[markitdown]"
65
+ ```
66
+
67
+ For Parquet files:
68
+ ```bash
69
+ pip install -e ".[parquet]"
70
+ ```
71
+
62
72
  ## Quick Start
63
73
 
64
74
  ```python
@@ -111,6 +121,11 @@ cleaner.run() # Generates cleaning_functions.py
111
121
  - **Cleaning Reports**: Markdown summary with functions, timing, quality delta
112
122
  - **Dry-Run Mode**: Analyze data without generating functions
113
123
 
124
+ ### Format Expansion (v0.7.0)
125
+ - **Markitdown Integration**: Convert 20+ formats (PDF, Word, Excel, PowerPoint, HTML, EPUB, etc.) to text
126
+ - **Parquet Support**: Load parquet files as structured data via pyarrow
127
+ - **LLM-Generated Parsers**: Auto-generate parsers for XML and unknown formats (`auto_parse=True`)
128
+
114
129
  ## Configuration
115
130
 
116
131
  ```python
@@ -142,6 +157,9 @@ cleaner = DataCleaner(
142
157
  report_path="report.md", # Markdown report output (None to disable)
143
158
  dry_run=False, # Analyze without generating functions
144
159
 
160
+ # Format Expansion
161
+ auto_parse=False, # LLM generates parser for unknown formats
162
+
145
163
  # Progress & State
146
164
  on_progress=callback, # Progress event callback
147
165
  state_file="state.json", # Enable resume on interrupt
@@ -235,20 +253,21 @@ cleaner.run()
235
253
 
236
254
  ```
237
255
  recursive_cleaner/
238
- ├── cleaner.py # Main DataCleaner class (~580 lines)
239
- ├── context.py # Docstring registry with FIFO eviction
240
- ├── dependencies.py # Topological sort for function ordering
241
- ├── metrics.py # Quality metrics before/after
242
- ├── optimizer.py # Two-pass consolidation with LLM agency
243
- ├── output.py # Function file generation + import consolidation
244
- ├── parsers.py # Chunking for JSONL/CSV/JSON/text + sampling
245
- ├── prompt.py # LLM prompt templates
246
- ├── report.py # Markdown report generation
247
- ├── response.py # XML/markdown parsing + agency dataclasses
248
- ├── schema.py # Schema inference
249
- ├── validation.py # Runtime validation + holdout
256
+ ├── cleaner.py # Main DataCleaner class
257
+ ├── context.py # Docstring registry with FIFO eviction
258
+ ├── dependencies.py # Topological sort for function ordering
259
+ ├── metrics.py # Quality metrics before/after
260
+ ├── optimizer.py # Two-pass consolidation with LLM agency
261
+ ├── output.py # Function file generation + import consolidation
262
+ ├── parser_generator.py # LLM-generated parsers for unknown formats
263
+ ├── parsers.py # Chunking for all formats + sampling
264
+ ├── prompt.py # LLM prompt templates
265
+ ├── report.py # Markdown report generation
266
+ ├── response.py # XML/markdown parsing + agency dataclasses
267
+ ├── schema.py # Schema inference
268
+ ├── validation.py # Runtime validation + holdout
250
269
  └── vendor/
251
- └── chunker.py # Vendored sentence-aware chunker
270
+ └── chunker.py # Vendored sentence-aware chunker
252
271
  ```
253
272
 
254
273
  ## Testing
@@ -257,7 +276,7 @@ recursive_cleaner/
257
276
  pytest tests/ -v
258
277
  ```
259
278
 
260
- 392 tests covering all features. Test datasets in `test_cases/`:
279
+ 432 tests covering all features. Test datasets in `test_cases/`:
261
280
  - E-commerce product catalogs
262
281
  - Healthcare patient records
263
282
  - Financial transaction data
@@ -273,6 +292,7 @@ pytest tests/ -v
273
292
 
274
293
  | Version | Features |
275
294
  |---------|----------|
295
+ | v0.7.0 | Markitdown (20+ formats), Parquet support, LLM-generated parsers |
276
296
  | v0.6.0 | Latency metrics, import consolidation, cleaning report, dry-run mode |
277
297
  | v0.5.1 | Dangerous code detection (AST-based security) |
278
298
  | v0.5.0 | Two-pass optimization, early termination, LLM agency |
@@ -4,7 +4,7 @@ LLM-powered incremental data cleaning for massive datasets. Process files in chu
4
4
 
5
5
  ## How It Works
6
6
 
7
- 1. **Chunk** your data (JSONL, CSV, JSON, or text)
7
+ 1. **Chunk** your data (JSONL, CSV, JSON, Parquet, PDF, Word, Excel, XML, and more)
8
8
  2. **Analyze** each chunk with an LLM to identify issues
9
9
  3. **Generate** one cleaning function per issue
10
10
  4. **Validate** functions on holdout data before accepting
@@ -23,6 +23,16 @@ For Apple Silicon (MLX backend):
23
23
  pip install -e ".[mlx]"
24
24
  ```
25
25
 
26
+ For document conversion (PDF, Word, Excel, HTML, etc.):
27
+ ```bash
28
+ pip install -e ".[markitdown]"
29
+ ```
30
+
31
+ For Parquet files:
32
+ ```bash
33
+ pip install -e ".[parquet]"
34
+ ```
35
+
26
36
  ## Quick Start
27
37
 
28
38
  ```python
@@ -75,6 +85,11 @@ cleaner.run() # Generates cleaning_functions.py
75
85
  - **Cleaning Reports**: Markdown summary with functions, timing, quality delta
76
86
  - **Dry-Run Mode**: Analyze data without generating functions
77
87
 
88
+ ### Format Expansion (v0.7.0)
89
+ - **Markitdown Integration**: Convert 20+ formats (PDF, Word, Excel, PowerPoint, HTML, EPUB, etc.) to text
90
+ - **Parquet Support**: Load parquet files as structured data via pyarrow
91
+ - **LLM-Generated Parsers**: Auto-generate parsers for XML and unknown formats (`auto_parse=True`)
92
+
78
93
  ## Configuration
79
94
 
80
95
  ```python
@@ -106,6 +121,9 @@ cleaner = DataCleaner(
106
121
  report_path="report.md", # Markdown report output (None to disable)
107
122
  dry_run=False, # Analyze without generating functions
108
123
 
124
+ # Format Expansion
125
+ auto_parse=False, # LLM generates parser for unknown formats
126
+
109
127
  # Progress & State
110
128
  on_progress=callback, # Progress event callback
111
129
  state_file="state.json", # Enable resume on interrupt
@@ -199,20 +217,21 @@ cleaner.run()
199
217
 
200
218
  ```
201
219
  recursive_cleaner/
202
- ├── cleaner.py # Main DataCleaner class (~580 lines)
203
- ├── context.py # Docstring registry with FIFO eviction
204
- ├── dependencies.py # Topological sort for function ordering
205
- ├── metrics.py # Quality metrics before/after
206
- ├── optimizer.py # Two-pass consolidation with LLM agency
207
- ├── output.py # Function file generation + import consolidation
208
- ├── parsers.py # Chunking for JSONL/CSV/JSON/text + sampling
209
- ├── prompt.py # LLM prompt templates
210
- ├── report.py # Markdown report generation
211
- ├── response.py # XML/markdown parsing + agency dataclasses
212
- ├── schema.py # Schema inference
213
- ├── validation.py # Runtime validation + holdout
220
+ ├── cleaner.py # Main DataCleaner class
221
+ ├── context.py # Docstring registry with FIFO eviction
222
+ ├── dependencies.py # Topological sort for function ordering
223
+ ├── metrics.py # Quality metrics before/after
224
+ ├── optimizer.py # Two-pass consolidation with LLM agency
225
+ ├── output.py # Function file generation + import consolidation
226
+ ├── parser_generator.py # LLM-generated parsers for unknown formats
227
+ ├── parsers.py # Chunking for all formats + sampling
228
+ ├── prompt.py # LLM prompt templates
229
+ ├── report.py # Markdown report generation
230
+ ├── response.py # XML/markdown parsing + agency dataclasses
231
+ ├── schema.py # Schema inference
232
+ ├── validation.py # Runtime validation + holdout
214
233
  └── vendor/
215
- └── chunker.py # Vendored sentence-aware chunker
234
+ └── chunker.py # Vendored sentence-aware chunker
216
235
  ```
217
236
 
218
237
  ## Testing
@@ -221,7 +240,7 @@ recursive_cleaner/
221
240
  pytest tests/ -v
222
241
  ```
223
242
 
224
- 392 tests covering all features. Test datasets in `test_cases/`:
243
+ 432 tests covering all features. Test datasets in `test_cases/`:
225
244
  - E-commerce product catalogs
226
245
  - Healthcare patient records
227
246
  - Financial transaction data
@@ -237,6 +256,7 @@ pytest tests/ -v
237
256
 
238
257
  | Version | Features |
239
258
  |---------|----------|
259
+ | v0.7.0 | Markitdown (20+ formats), Parquet support, LLM-generated parsers |
240
260
  | v0.6.0 | Latency metrics, import consolidation, cleaning report, dry-run mode |
241
261
  | v0.5.1 | Dangerous code detection (AST-based security) |
242
262
  | v0.5.0 | Two-pass optimization, early termination, LLM agency |
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "recursive-cleaner"
7
- version = "0.7.0"
7
+ version = "0.7.1"
8
8
  description = "LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions"
9
9
  readme = "README.md"
10
10
  license = "MIT"