recursive-cleaner 0.7.1.tar.gz → 1.0.0.tar.gz

This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
Files changed (114)
  1. recursive_cleaner-1.0.0/AGENTS.md +1 -0
  2. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/CLAUDE.md +14 -2
  3. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/PKG-INFO +119 -4
  4. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/README.md +110 -2
  5. recursive_cleaner-1.0.0/TODO.md +119 -0
  6. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/backends/__init__.py +2 -1
  7. recursive_cleaner-1.0.0/backends/openai_backend.py +71 -0
  8. recursive_cleaner-1.0.0/demo_tui.py +54 -0
  9. recursive_cleaner-1.0.0/docs/contracts/v080-api-contract.md +62 -0
  10. recursive_cleaner-1.0.0/docs/contracts/v080-data-schema.md +90 -0
  11. recursive_cleaner-1.0.0/docs/contracts/v080-success-criteria.md +70 -0
  12. recursive_cleaner-1.0.0/docs/contracts/v090-cli-contract.md +197 -0
  13. recursive_cleaner-1.0.0/docs/contracts/v090-success-criteria.md +153 -0
  14. recursive_cleaner-1.0.0/docs/contracts/v100-api-contract.md +124 -0
  15. recursive_cleaner-1.0.0/docs/contracts/v100-success-criteria.md +127 -0
  16. recursive_cleaner-1.0.0/docs/handoffs/v090-research-handoff.md +71 -0
  17. recursive_cleaner-1.0.0/docs/handoffs/v100-research-handoff.md +46 -0
  18. recursive_cleaner-1.0.0/docs/implementation-plan-v080.md +182 -0
  19. recursive_cleaner-1.0.0/docs/research/cli-backend-patterns.md +302 -0
  20. recursive_cleaner-1.0.0/docs/research/cli-local-research.md +187 -0
  21. recursive_cleaner-1.0.0/docs/research/rich-tui-patterns.md +110 -0
  22. recursive_cleaner-1.0.0/docs/research/v100-apply-mode-research.md +294 -0
  23. recursive_cleaner-1.0.0/docs/v090-implementation-plan.md +147 -0
  24. recursive_cleaner-1.0.0/docs/v100-implementation-plan.md +234 -0
  25. recursive_cleaner-1.0.0/docs/workflow-state.md +50 -0
  26. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/pyproject.toml +15 -2
  27. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/__init__.py +5 -0
  28. recursive_cleaner-1.0.0/recursive_cleaner/__main__.py +8 -0
  29. recursive_cleaner-1.0.0/recursive_cleaner/apply.py +483 -0
  30. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/cleaner.py +122 -29
  31. recursive_cleaner-1.0.0/recursive_cleaner/cli.py +395 -0
  32. recursive_cleaner-1.0.0/recursive_cleaner/tui.py +614 -0
  33. recursive_cleaner-1.0.0/tests/test_apply.py +645 -0
  34. recursive_cleaner-1.0.0/tests/test_cli.py +436 -0
  35. recursive_cleaner-1.0.0/tests/test_openai_backend.py +235 -0
  36. recursive_cleaner-1.0.0/tests/test_tui.py +758 -0
  37. recursive_cleaner-0.7.1/TODO.md +0 -129
  38. recursive_cleaner-0.7.1/docs/workflow-state.md +0 -26
  39. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/.gitignore +0 -0
  40. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/LICENSE +0 -0
  41. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/backends/mlx_backend.py +0 -0
  42. {recursive_cleaner-0.7.1/docs → recursive_cleaner-1.0.0/docs/archive}/langchain-analysis.md +0 -0
  43. {recursive_cleaner-0.7.1/docs → recursive_cleaner-1.0.0/docs/archive}/langgraph-analysis.md +0 -0
  44. {recursive_cleaner-0.7.1/docs → recursive_cleaner-1.0.0/docs/archive}/other-frameworks-analysis.md +0 -0
  45. {recursive_cleaner-0.7.1/docs → recursive_cleaner-1.0.0/docs/archive}/smolagents-analysis.md +0 -0
  46. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/contracts/api-contract.md +0 -0
  47. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/contracts/data-schema.md +0 -0
  48. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/contracts/success-criteria.md +0 -0
  49. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/contracts/text-mode-contract.md +0 -0
  50. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/contracts/tier2-contract.md +0 -0
  51. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/contracts/tier4-contract.md +0 -0
  52. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/contracts/tier4-success-criteria.md +0 -0
  53. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/contracts/two-pass-contract.md +0 -0
  54. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/contracts/v070-success-criteria.md +0 -0
  55. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/handoffs/tier4-handoff.md +0 -0
  56. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/implementation-plan-tier4.md +0 -0
  57. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/implementation-plan-v03.md +0 -0
  58. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/implementation-plan-v04.md +0 -0
  59. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/implementation-plan-v05.md +0 -0
  60. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/implementation-plan.md +0 -0
  61. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/mlx-lm-guide.md +0 -0
  62. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/refactor-assessment/data/dependency.json +0 -0
  63. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/refactor-assessment/data/stats.json +0 -0
  64. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/refactor-assessment/plan.md +0 -0
  65. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/refactor-assessment/report.md +0 -0
  66. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/research/chonkie-extraction.md +0 -0
  67. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/research/chonkie.md +0 -0
  68. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/research/markitdown.md +0 -0
  69. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/context.py +0 -0
  70. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/dependencies.py +0 -0
  71. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/errors.py +0 -0
  72. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/metrics.py +0 -0
  73. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/optimizer.py +0 -0
  74. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/output.py +0 -0
  75. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/parser_generator.py +0 -0
  76. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/parsers.py +0 -0
  77. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/prompt.py +0 -0
  78. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/report.py +0 -0
  79. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/response.py +0 -0
  80. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/schema.py +0 -0
  81. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/types.py +0 -0
  82. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/validation.py +0 -0
  83. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/vendor/__init__.py +0 -0
  84. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/vendor/chunker.py +0 -0
  85. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/test_cases/ecommerce_instructions.txt +0 -0
  86. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/test_cases/ecommerce_products.jsonl +0 -0
  87. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/test_cases/financial_instructions.txt +0 -0
  88. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/test_cases/financial_transactions.jsonl +0 -0
  89. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/test_cases/healthcare_instructions.txt +0 -0
  90. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/test_cases/healthcare_patients.jsonl +0 -0
  91. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/test_cases/run_ecommerce_test.py +0 -0
  92. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/test_cases/run_financial_test.py +0 -0
  93. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/test_cases/run_healthcare_test.py +0 -0
  94. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/__init__.py +0 -0
  95. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_callbacks.py +0 -0
  96. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_cleaner.py +0 -0
  97. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_context.py +0 -0
  98. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_dependencies.py +0 -0
  99. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_dry_run.py +0 -0
  100. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_holdout.py +0 -0
  101. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_incremental.py +0 -0
  102. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_integration.py +0 -0
  103. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_latency.py +0 -0
  104. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_metrics.py +0 -0
  105. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_optimizer.py +0 -0
  106. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_output.py +0 -0
  107. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_parser_generator.py +0 -0
  108. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_parsers.py +0 -0
  109. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_report.py +0 -0
  110. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_sampling.py +0 -0
  111. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_schema.py +0 -0
  112. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_text_mode.py +0 -0
  113. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_validation.py +0 -0
  114. {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_vendor_chunker.py +0 -0
recursive_cleaner-1.0.0/AGENTS.md +1 -0
@@ -0,0 +1 @@
+ CLAUDE.md
{recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/CLAUDE.md +14 -2
@@ -4,7 +4,11 @@

  | Version | Status | Date |
  |---------|--------|------|
- | v0.6.0 | **Implemented** | 2025-01-15 |
+ | v1.0.0 | **Implemented** | 2025-01-30 |
+ | v0.9.0 | Implemented | 2025-01-19 |
+ | v0.8.0 | Implemented | 2025-01-19 |
+ | v0.7.0 | Implemented | 2025-01-17 |
+ | v0.6.0 | Implemented | 2025-01-15 |
  | v0.5.1 | Implemented | 2025-01-15 |
  | v0.5.0 | Implemented | 2025-01-15 |
  | v0.4.0 | Implemented | 2025-01-15 |
@@ -12,9 +16,13 @@
  | v0.2.0 | Implemented | 2025-01-14 |
  | v0.1.0 | Implemented | 2025-01-14 |

- **Current State**: v0.6.0 complete. 392 tests passing, 2,967 lines total.
+ **Current State**: v1.0.0 complete. 548 tests passing.

  ### Version History
+ - **v1.0.0**: Apply mode for applying cleaning functions to data, Excel support, TUI color enhancement
+ - **v0.9.0**: CLI tool with MLX and OpenAI-compatible backends (LM Studio, Ollama)
+ - **v0.8.0**: Terminal UI with Rich dashboard, mission control aesthetic, transmission log
+ - **v0.7.0**: Markitdown integration (20+ formats), Parquet support, LLM-generated parsers
  - **v0.6.0**: Latency metrics, import consolidation, cleaning report, dry-run mode
  - **v0.5.1**: Dangerous code detection (AST-based security)
  - **v0.5.0**: Two-pass optimization with LLM agency (consolidation, early termination)
@@ -69,6 +77,8 @@ cleaner = DataCleaner(
  # Observability (v0.6.0)
  report_path="cleaning_report.md", # Generate markdown report (None to disable)
  dry_run=False, # Set True to analyze without generating functions
+ # Terminal UI (v0.8.0)
+ tui=True, # Enable Rich dashboard (requires pip install recursive-cleaner[tui])
  )

  cleaner.run() # Outputs: cleaning_functions.py, cleaning_report.md
@@ -159,6 +169,7 @@ recursive_cleaner/
  report.py # Markdown report generation (~120 lines) [v0.6.0]
  response.py # XML/markdown parsing + agency dataclasses (~292 lines)
  schema.py # Schema inference (~117 lines) [v0.2.0]
+ tui.py # Rich terminal dashboard (~520 lines) [v0.8.0]
  types.py # LLMBackend protocol (~11 lines)
  validation.py # Runtime validation + safety checks (~200 lines)
  vendor/
@@ -187,6 +198,7 @@ tests/ # 392 tests
  test_sampling.py # Sampling strategy tests [v0.4.0]
  test_schema.py # Schema inference tests
  test_text_mode.py # Text mode tests [v0.3.0]
+ test_tui.py # Terminal UI tests [v0.8.0]
  test_validation.py # Runtime validation + safety tests
  test_vendor_chunker.py # Vendored chunker tests [v0.3.0]

{recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/PKG-INFO +119 -4
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: recursive-cleaner
- Version: 0.7.1
+ Version: 1.0.0
  Summary: LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions
  Project-URL: Homepage, https://github.com/gaztrabisme/recursive-data-cleaner
  Project-URL: Repository, https://github.com/gaztrabisme/recursive-data-cleaner
@@ -9,7 +9,7 @@ Author: Gary Tran
  License-Expression: MIT
  License-File: LICENSE
  Keywords: automation,data-cleaning,data-quality,etl,llm,machine-learning
- Classifier: Development Status :: 4 - Beta
+ Classifier: Development Status :: 5 - Production/Stable
  Classifier: Intended Audience :: Developers
  Classifier: Intended Audience :: Science/Research
  Classifier: License :: OSI Approved :: MIT License
@@ -26,12 +26,19 @@ Requires-Dist: tenacity>=8.0
  Provides-Extra: dev
  Requires-Dist: pytest-cov>=4.0; extra == 'dev'
  Requires-Dist: pytest>=7.0; extra == 'dev'
+ Provides-Extra: excel
+ Requires-Dist: openpyxl>=3.0.0; extra == 'excel'
+ Requires-Dist: xlrd>=2.0.0; extra == 'excel'
  Provides-Extra: markitdown
  Requires-Dist: markitdown>=0.1.0; extra == 'markitdown'
  Provides-Extra: mlx
  Requires-Dist: mlx-lm>=0.10.0; extra == 'mlx'
+ Provides-Extra: openai
+ Requires-Dist: openai>=1.0.0; extra == 'openai'
  Provides-Extra: parquet
  Requires-Dist: pyarrow>=14.0.0; extra == 'parquet'
+ Provides-Extra: tui
+ Requires-Dist: rich>=13.0; extra == 'tui'
  Description-Content-Type: text/markdown

  # Recursive Data Cleaner
@@ -69,6 +76,11 @@ For Parquet files:
  pip install -e ".[parquet]"
  ```

+ For Terminal UI (Rich dashboard):
+ ```bash
+ pip install -e ".[tui]"
+ ```
+
  ## Quick Start

  ```python
@@ -126,6 +138,98 @@ cleaner.run() # Generates cleaning_functions.py
  - **Parquet Support**: Load parquet files as structured data via pyarrow
  - **LLM-Generated Parsers**: Auto-generate parsers for XML and unknown formats (`auto_parse=True`)

+ ### Terminal UI (v0.8.0)
+ - **Mission Control Dashboard**: Rich-based live terminal UI with retro aesthetic
+ - **Real-time Progress**: Animated progress bars, chunk/iteration counters
+ - **Transmission Log**: Parsed LLM responses showing issues detected and functions being generated
+ - **Token Estimation**: Track estimated input/output tokens across the run
+ - **Graceful Fallback**: Works without Rich installed (falls back to callbacks)
+
+ ### CLI (v0.9.0)
+ - **Command Line Interface**: Use without writing Python code
+ - **Multiple Backends**: MLX (Apple Silicon) and OpenAI-compatible (OpenAI, LM Studio, Ollama)
+ - **Four Commands**: `generate`, `analyze` (dry-run), `resume`, `apply`
+
+ ### Apply Mode (v1.0.0)
+ - **Apply Cleaning Functions**: Apply generated functions to full datasets
+ - **Data Formats**: JSONL, CSV, JSON, Parquet, Excel (.xlsx/.xls) output same format
+ - **Text Formats**: PDF, Word, HTML, etc. output as Markdown
+ - **Streaming**: Memory-efficient line-by-line processing for JSONL/CSV
+ - **Colored TUI**: Enhanced transmission log with syntax-highlighted XML parsing
+
+ ## Command Line Interface
+
+ After installation, the `recursive-cleaner` command is available:
+
+ ```bash
+ # Generate cleaning functions with MLX (Apple Silicon)
+ recursive-cleaner generate data.jsonl \
+ --provider mlx \
+ --model "lmstudio-community/Qwen3-80B-MLX-4bit" \
+ --instructions "Normalize phone numbers to E.164" \
+ --output cleaning_functions.py
+
+ # Use OpenAI
+ export OPENAI_API_KEY=your-key
+ recursive-cleaner generate data.jsonl \
+ --provider openai \
+ --model gpt-4o \
+ --instructions "Fix date formats"
+
+ # Use LM Studio or Ollama (OpenAI-compatible)
+ recursive-cleaner generate data.jsonl \
+ --provider openai \
+ --model "qwen/qwen3-vl-30b" \
+ --base-url http://localhost:1234/v1 \
+ --instructions "Normalize prices"
+
+ # Dry-run analysis
+ recursive-cleaner analyze data.jsonl \
+ --provider openai \
+ --model gpt-4o \
+ --instructions @instructions.txt
+
+ # Resume from checkpoint
+ recursive-cleaner resume cleaning_state.json \
+ --provider mlx \
+ --model "model-path"
+
+ # Apply cleaning functions to data
+ recursive-cleaner apply data.jsonl \
+ --functions cleaning_functions.py \
+ --output cleaned_data.jsonl
+
+ # Apply to Excel (outputs same format)
+ recursive-cleaner apply sales.xlsx \
+ --functions cleaning_functions.py
+
+ # Apply to PDF (outputs markdown)
+ recursive-cleaner apply document.pdf \
+ --functions cleaning_functions.py \
+ --output cleaned.md
+ ```
+
+ ### CLI Options
+
+ ```
+ recursive-cleaner generate <FILE> [OPTIONS]
+
+ Required:
+ FILE Input data file
+ -p, --provider {mlx,openai} LLM provider
+ -m, --model MODEL Model name/path
+
+ Optional:
+ -i, --instructions TEXT Cleaning instructions (or @file.txt)
+ --base-url URL API URL for OpenAI-compatible servers
+ --chunk-size N Items per chunk (default: 50)
+ --max-iterations N Max iterations per chunk (default: 5)
+ -o, --output PATH Output file (default: cleaning_functions.py)
+ --tui Enable Rich dashboard
+ --optimize Consolidate redundant functions
+ --track-metrics Measure before/after quality
+ ```
+
  ## Configuration

  ```python
@@ -160,6 +264,9 @@ cleaner = DataCleaner(
  # Format Expansion
  auto_parse=False, # LLM generates parser for unknown formats

+ # Terminal UI
+ tui=True, # Enable Rich dashboard (requires [tui] extra)
+
  # Progress & State
  on_progress=callback, # Progress event callback
  state_file="state.json", # Enable resume on interrupt
@@ -253,6 +360,7 @@ cleaner.run()

  ```
  recursive_cleaner/
+ ├── cli.py # Command line interface
  ├── cleaner.py # Main DataCleaner class
  ├── context.py # Docstring registry with FIFO eviction
  ├── dependencies.py # Topological sort for function ordering
@@ -265,9 +373,14 @@ recursive_cleaner/
  ├── report.py # Markdown report generation
  ├── response.py # XML/markdown parsing + agency dataclasses
  ├── schema.py # Schema inference
+ ├── tui.py # Rich terminal dashboard
  ├── validation.py # Runtime validation + holdout
  └── vendor/
  └── chunker.py # Vendored sentence-aware chunker
+
+ backends/
+ ├── mlx_backend.py # MLX-LM backend for Apple Silicon
+ └── openai_backend.py # OpenAI-compatible backend
  ```

  ## Testing
@@ -276,14 +389,14 @@ recursive_cleaner/
  pytest tests/ -v
  ```

- 432 tests covering all features. Test datasets in `test_cases/`:
+ 548 tests covering all features. Test datasets in `test_cases/`:
  - E-commerce product catalogs
  - Healthcare patient records
  - Financial transaction data

  ## Philosophy

- - **Simplicity over extensibility**: ~3,000 lines that do one thing well
+ - **Simplicity over extensibility**: ~5,000 lines that do one thing well
  - **stdlib over dependencies**: Only `tenacity` required
  - **Retry over recover**: On error, retry with error in prompt
  - **Wu wei**: Let the LLM make decisions about data it understands
@@ -292,6 +405,8 @@ pytest tests/ -v

  | Version | Features |
  |---------|----------|
+ | v0.9.0 | CLI tool with MLX and OpenAI-compatible backends (LM Studio, Ollama) |
+ | v0.8.0 | Terminal UI with Rich dashboard, mission control aesthetic, transmission log |
  | v0.7.0 | Markitdown (20+ formats), Parquet support, LLM-generated parsers |
  | v0.6.0 | Latency metrics, import consolidation, cleaning report, dry-run mode |
  | v0.5.1 | Dangerous code detection (AST-based security) |
{recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/README.md +110 -2
@@ -33,6 +33,11 @@ For Parquet files:
  pip install -e ".[parquet]"
  ```

+ For Terminal UI (Rich dashboard):
+ ```bash
+ pip install -e ".[tui]"
+ ```
+
  ## Quick Start

  ```python
@@ -90,6 +95,98 @@ cleaner.run() # Generates cleaning_functions.py
  - **Parquet Support**: Load parquet files as structured data via pyarrow
  - **LLM-Generated Parsers**: Auto-generate parsers for XML and unknown formats (`auto_parse=True`)

+ ### Terminal UI (v0.8.0)
+ - **Mission Control Dashboard**: Rich-based live terminal UI with retro aesthetic
+ - **Real-time Progress**: Animated progress bars, chunk/iteration counters
+ - **Transmission Log**: Parsed LLM responses showing issues detected and functions being generated
+ - **Token Estimation**: Track estimated input/output tokens across the run
+ - **Graceful Fallback**: Works without Rich installed (falls back to callbacks)
+
+ ### CLI (v0.9.0)
+ - **Command Line Interface**: Use without writing Python code
+ - **Multiple Backends**: MLX (Apple Silicon) and OpenAI-compatible (OpenAI, LM Studio, Ollama)
+ - **Four Commands**: `generate`, `analyze` (dry-run), `resume`, `apply`
+
+ ### Apply Mode (v1.0.0)
+ - **Apply Cleaning Functions**: Apply generated functions to full datasets
+ - **Data Formats**: JSONL, CSV, JSON, Parquet, Excel (.xlsx/.xls) output same format
+ - **Text Formats**: PDF, Word, HTML, etc. output as Markdown
+ - **Streaming**: Memory-efficient line-by-line processing for JSONL/CSV
+ - **Colored TUI**: Enhanced transmission log with syntax-highlighted XML parsing
+
+ ## Command Line Interface
+
+ After installation, the `recursive-cleaner` command is available:
+
+ ```bash
+ # Generate cleaning functions with MLX (Apple Silicon)
+ recursive-cleaner generate data.jsonl \
+ --provider mlx \
+ --model "lmstudio-community/Qwen3-80B-MLX-4bit" \
+ --instructions "Normalize phone numbers to E.164" \
+ --output cleaning_functions.py
+
+ # Use OpenAI
+ export OPENAI_API_KEY=your-key
+ recursive-cleaner generate data.jsonl \
+ --provider openai \
+ --model gpt-4o \
+ --instructions "Fix date formats"
+
+ # Use LM Studio or Ollama (OpenAI-compatible)
+ recursive-cleaner generate data.jsonl \
+ --provider openai \
+ --model "qwen/qwen3-vl-30b" \
+ --base-url http://localhost:1234/v1 \
+ --instructions "Normalize prices"
+
+ # Dry-run analysis
+ recursive-cleaner analyze data.jsonl \
+ --provider openai \
+ --model gpt-4o \
+ --instructions @instructions.txt
+
+ # Resume from checkpoint
+ recursive-cleaner resume cleaning_state.json \
+ --provider mlx \
+ --model "model-path"
+
+ # Apply cleaning functions to data
+ recursive-cleaner apply data.jsonl \
+ --functions cleaning_functions.py \
+ --output cleaned_data.jsonl
+
+ # Apply to Excel (outputs same format)
+ recursive-cleaner apply sales.xlsx \
+ --functions cleaning_functions.py
+
+ # Apply to PDF (outputs markdown)
+ recursive-cleaner apply document.pdf \
+ --functions cleaning_functions.py \
+ --output cleaned.md
+ ```
+
+ ### CLI Options
+
+ ```
+ recursive-cleaner generate <FILE> [OPTIONS]
+
+ Required:
+ FILE Input data file
+ -p, --provider {mlx,openai} LLM provider
+ -m, --model MODEL Model name/path
+
+ Optional:
+ -i, --instructions TEXT Cleaning instructions (or @file.txt)
+ --base-url URL API URL for OpenAI-compatible servers
+ --chunk-size N Items per chunk (default: 50)
+ --max-iterations N Max iterations per chunk (default: 5)
+ -o, --output PATH Output file (default: cleaning_functions.py)
+ --tui Enable Rich dashboard
+ --optimize Consolidate redundant functions
+ --track-metrics Measure before/after quality
+ ```
+
  ## Configuration

  ```python
@@ -124,6 +221,9 @@ cleaner = DataCleaner(
  # Format Expansion
  auto_parse=False, # LLM generates parser for unknown formats

+ # Terminal UI
+ tui=True, # Enable Rich dashboard (requires [tui] extra)
+
  # Progress & State
  on_progress=callback, # Progress event callback
  state_file="state.json", # Enable resume on interrupt
@@ -217,6 +317,7 @@ cleaner.run()

  ```
  recursive_cleaner/
+ ├── cli.py # Command line interface
  ├── cleaner.py # Main DataCleaner class
  ├── context.py # Docstring registry with FIFO eviction
  ├── dependencies.py # Topological sort for function ordering
@@ -229,9 +330,14 @@ recursive_cleaner/
  ├── report.py # Markdown report generation
  ├── response.py # XML/markdown parsing + agency dataclasses
  ├── schema.py # Schema inference
+ ├── tui.py # Rich terminal dashboard
  ├── validation.py # Runtime validation + holdout
  └── vendor/
  └── chunker.py # Vendored sentence-aware chunker
+
+ backends/
+ ├── mlx_backend.py # MLX-LM backend for Apple Silicon
+ └── openai_backend.py # OpenAI-compatible backend
  ```

  ## Testing
@@ -240,14 +346,14 @@ recursive_cleaner/
  pytest tests/ -v
  ```

- 432 tests covering all features. Test datasets in `test_cases/`:
+ 548 tests covering all features. Test datasets in `test_cases/`:
  - E-commerce product catalogs
  - Healthcare patient records
  - Financial transaction data

  ## Philosophy

- - **Simplicity over extensibility**: ~3,000 lines that do one thing well
+ - **Simplicity over extensibility**: ~5,000 lines that do one thing well
  - **stdlib over dependencies**: Only `tenacity` required
  - **Retry over recover**: On error, retry with error in prompt
  - **Wu wei**: Let the LLM make decisions about data it understands
@@ -256,6 +362,8 @@ pytest tests/ -v

  | Version | Features |
  |---------|----------|
+ | v0.9.0 | CLI tool with MLX and OpenAI-compatible backends (LM Studio, Ollama) |
+ | v0.8.0 | Terminal UI with Rich dashboard, mission control aesthetic, transmission log |
  | v0.7.0 | Markitdown (20+ formats), Parquet support, LLM-generated parsers |
  | v0.6.0 | Latency metrics, import consolidation, cleaning report, dry-run mode |
  | v0.5.1 | Dangerous code detection (AST-based security) |
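
Both README and PKG-INFO now list the `tui=` flag alongside the existing `on_progress=` callback in the Configuration section. A minimal sketch of how the two might be combined in user code; the one-argument callback signature, model path, and data file below are assumptions for illustration, not taken from the package:

```python
# Sketch only: pairing the v0.8.0 tui flag with the pre-existing on_progress
# callback. The callback signature and printed payload are assumptions;
# model path and file name are placeholders.
from backends import MLXBackend
from recursive_cleaner import DataCleaner

def log_progress(event):
    # Plain-text fallback that still reports progress when Rich is not installed.
    print(event)

llm = MLXBackend(model_path="lmstudio-community/Qwen3-80B-MLX-4bit")

cleaner = DataCleaner(
    llm_backend=llm,
    file_path="data.jsonl",
    instructions="Normalize phone numbers to E.164",
    tui=True,                  # Rich dashboard when the [tui] extra is installed
    on_progress=log_progress,  # callback-based progress reporting
)
cleaner.run()  # writes cleaning_functions.py
```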
recursive_cleaner-1.0.0/TODO.md +119 -0
@@ -0,0 +1,119 @@
+ # TODO - Recursive Data Cleaner Roadmap
+
+ ## Current Version: v0.9.0
+
+ 502 tests passing, ~3,400 lines. CLI complete.
+
+ ---
+
+ ## Completed Work
+
+ | Version | Features |
+ |---------|----------|
+ | v0.1.0 | Core pipeline, chunking, docstring registry |
+ | v0.2.0 | Runtime validation, schema inference, callbacks, incremental saves |
+ | v0.3.0 | Text mode with sentence-aware chunking |
+ | v0.4.0 | Holdout validation, dependency resolution, smart sampling, quality metrics |
+ | v0.5.0 | Two-pass optimization, early termination, LLM agency |
+ | v0.5.1 | Dangerous code detection (AST-based security) |
+ | v0.6.0 | Latency metrics, import consolidation, cleaning report, dry-run mode |
+ | v0.7.0 | Markitdown (20+ formats), Parquet support, LLM-generated parsers |
+ | v0.8.0 | Terminal UI with Rich dashboard, mission control aesthetic |
+ | v0.9.0 | CLI tool with MLX and OpenAI-compatible backends |
+
+ ---
+
+ ## Version Progression
+
+ | Version | Theme |
+ |---------|-------|
+ | v0.1-0.2 | Core pipeline + validation |
+ | v0.3-0.4 | Data quality assurance |
+ | v0.5-0.6 | Optimization + observability |
+ | v0.7-0.8 | Accessibility (formats + UI) |
+ | v0.9-1.0 | Complete workflow |
+
+ ---
+
+ ## Roadmap to v1.0
+
+ ### v0.9.0 - CLI Tool ✅ COMPLETE
+
+ CLI implemented with:
+ - `recursive_cleaner/cli.py` - argparse CLI (346 lines)
+ - `backends/openai_backend.py` - OpenAI-compatible backend (71 lines)
+ - Commands: `generate`, `analyze`, `resume`
+ - Backends: MLX, OpenAI, LM Studio, Ollama (via --base-url)
+
+ ### v1.0.0 - Apply Mode (~150 lines)
+
+ The final step: actually cleaning the data, not just generating functions.
+
+ ```python
+ cleaner = DataCleaner(...)
+ cleaner.run() # Generates cleaning_functions.py
+
+ # NEW: Apply to full dataset
+ cleaner.apply(output_path="cleaned_data.jsonl")
+ ```
+
+ **Implementation:**
+ - [ ] `DataCleaner.apply(output_path)` method
+ - [ ] Stream-process file applying generated functions
+ - [ ] Progress callbacks for large files
+ - [ ] Validate output schema matches input
+ - [ ] CLI integration: `recursive-cleaner apply`
+
+ ---
+
+ ## Patterns That Worked
+
+ These patterns proved high-value with low implementation effort:
+
+ 1. **AST walking** - Dependency detection, dangerous code detection. ~50 lines each.
+ 2. **LLM agency** - Let model decide chunk cleanliness, saturation, consolidation. Elegant.
+ 3. **Retry with feedback** - On error, append error to prompt and retry. No complex recovery.
+ 4. **Holdout validation** - Test on unseen data before accepting. Catches edge cases.
+ 5. **Simple data structures** - List of dicts, JSON serialization. Easy to debug/resume.
+
+ ---
+
+ ## What We're Not Doing
+
+ | Feature | Reason |
+ |---------|--------|
+ | Global deduplication | Adds complexity, breaks chunk-based philosophy |
+ | Built-in LLM backends | Users bring their own, keeps us dependency-free |
+ | Config files (YAML/TOML) | Python is already config, YAGNI |
+ | Plugin system | No interfaces for things with one implementation |
+ | Async multi-chunk | Complexity not justified; sequential is predictable |
+ | Vector retrieval | Adds chromadb dependency; FIFO works for typical use |
+
+ ---
+
+ ## Line Count Budget
+
+ | Component | Current | After v1.0 |
+ |-----------|---------|------------|
+ | Core library | ~3,000 | ~3,350 |
+ | Tests | ~4,000 | ~4,400 |
+
+ Staying under 3,500 lines for the library keeps us true to the philosophy.
+
+ ---
+
+ ## Philosophy Reminder
+
+ From CLAUDE.md:
+ - **Simplicity over extensibility** - Keep it lean
+ - **stdlib over dependencies** - Only tenacity required
+ - **Functions over classes** - Unless state genuinely helps
+ - **Delete over abstract** - No interfaces for single implementations
+ - **Retry over recover** - On error, retry with error in prompt
+ - **Wu wei** - Let the LLM make decisions about data it understands
+
+ ---
+
+ ## Known Limitation
+
+ **Stateful ops within chunks only** - Deduplication and aggregations don't work globally. This is architectural and accepted.
{recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/backends/__init__.py +2 -1
@@ -1,5 +1,6 @@
  """Backend implementations for Recursive Data Cleaner."""

  from .mlx_backend import MLXBackend
+ from .openai_backend import OpenAIBackend

- __all__ = ["MLXBackend"]
+ __all__ = ["MLXBackend", "OpenAIBackend"]
recursive_cleaner-1.0.0/backends/openai_backend.py +71 -0
@@ -0,0 +1,71 @@
+ """OpenAI-compatible backend for Recursive Data Cleaner."""
+
+ import os
+
+
+ class OpenAIBackend:
+     """
+     OpenAI-compatible backend implementation.
+
+     Works with OpenAI API, LM Studio, Ollama, and other OpenAI-compatible servers.
+     Conforms to the LLMBackend protocol.
+     """
+
+     def __init__(
+         self,
+         model: str,
+         api_key: str | None = None,
+         base_url: str | None = None,
+         max_tokens: int = 4096,
+         temperature: float = 0.7,
+     ):
+         """
+         Initialize the OpenAI backend.
+
+         Args:
+             model: Model name (e.g., "gpt-4o", "gpt-3.5-turbo")
+             api_key: API key (defaults to OPENAI_API_KEY env var, or "not-needed" for local)
+             base_url: API base URL (defaults to OpenAI's API)
+             max_tokens: Maximum tokens to generate
+             temperature: Sampling temperature
+         """
+         try:
+             import openai
+         except ImportError:
+             raise ImportError(
+                 "OpenAI SDK not installed. Install with: pip install openai"
+             )
+
+         self.model = model
+         self.max_tokens = max_tokens
+         self.temperature = temperature
+
+         # Resolve API key: explicit > env var > "not-needed" for local servers
+         if api_key is not None:
+             resolved_key = api_key
+         else:
+             resolved_key = os.environ.get("OPENAI_API_KEY", "not-needed")
+
+         # Create client
+         self._client = openai.OpenAI(
+             api_key=resolved_key,
+             base_url=base_url,
+         )
+
+     def generate(self, prompt: str) -> str:
+         """
+         Generate a response from the LLM.
+
+         Args:
+             prompt: The input prompt
+
+         Returns:
+             The generated text response
+         """
+         response = self._client.chat.completions.create(
+             model=self.model,
+             messages=[{"role": "user", "content": prompt}],
+             max_tokens=self.max_tokens,
+             temperature=self.temperature,
+         )
+         return response.choices[0].message.content or ""
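
For orientation, the new backend drops into the same `DataCleaner` call the demo script below uses. A minimal sketch, assuming a local OpenAI-compatible server; the model name, base URL, file path, and instructions are placeholder values borrowed from the README examples:

```python
# Sketch: wiring OpenAIBackend into DataCleaner against a local
# OpenAI-compatible server (LM Studio / Ollama). Model name, base URL,
# and file path are illustrative placeholders from the README examples.
from backends import OpenAIBackend
from recursive_cleaner import DataCleaner

llm = OpenAIBackend(
    model="qwen/qwen3-vl-30b",
    base_url="http://localhost:1234/v1",  # omit to use the default OpenAI endpoint
    temperature=0.3,
)

cleaner = DataCleaner(
    llm_backend=llm,
    file_path="data.jsonl",
    instructions="Normalize prices",
)
cleaner.run()  # writes cleaning_functions.py
```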
recursive_cleaner-1.0.0/demo_tui.py +54 -0
@@ -0,0 +1,54 @@
+ #!/usr/bin/env python3
+ """
+ Demo script to showcase the Rich TUI with real MLX backend.
+
+ Run with:
+     python demo_tui.py
+
+ Requirements:
+     pip install recursive-cleaner[mlx,tui]
+ """
+
+ from backends import MLXBackend
+ from recursive_cleaner import DataCleaner
+
+ # Use a smaller/faster model for demo (change to your preferred model)
+ MODEL = "lmstudio-community/Qwen3-Next-80B-A3B-Instruct-MLX-4bit"
+
+ print("=" * 60)
+ print(" RECURSIVE DATA CLEANER - TUI DEMO")
+ print("=" * 60)
+ print(f"\nLoading model: {MODEL}")
+ print("This may take a moment on first run...\n")
+
+ llm = MLXBackend(
+     model_path=MODEL,
+     max_tokens=2048,
+     temperature=0.3,  # Lower for more consistent output
+     verbose=False,  # Disable token streaming to avoid interfering with TUI
+ )
+
+ cleaner = DataCleaner(
+     llm_backend=llm,
+     file_path="test_cases/ecommerce_products.jsonl",
+     chunk_size=5,  # Small chunks for demo
+     max_iterations=3,  # Limit iterations per chunk
+     instructions="""
+     E-commerce product data cleaning:
+     - Normalize prices to float (remove $ symbols)
+     - Fix category typos and normalize to Title Case
+     - Convert weights to kg as float
+     - Ensure stock_quantity is non-negative integer
+     """,
+     tui=True,  # Enable the Rich dashboard!
+     track_metrics=True,
+ )
+
+ print("\nStarting cleaner with TUI enabled...")
+ print("Watch the dashboard below!\n")
+
+ cleaner.run()
+
+ print("\n" + "=" * 60)
+ print("Demo complete! Check cleaning_functions.py for output.")
+ print("=" * 60)