recursive-cleaner 0.7.1__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- recursive_cleaner-1.0.0/AGENTS.md +1 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/CLAUDE.md +14 -2
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/PKG-INFO +119 -4
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/README.md +110 -2
- recursive_cleaner-1.0.0/TODO.md +119 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/backends/__init__.py +2 -1
- recursive_cleaner-1.0.0/backends/openai_backend.py +71 -0
- recursive_cleaner-1.0.0/demo_tui.py +54 -0
- recursive_cleaner-1.0.0/docs/contracts/v080-api-contract.md +62 -0
- recursive_cleaner-1.0.0/docs/contracts/v080-data-schema.md +90 -0
- recursive_cleaner-1.0.0/docs/contracts/v080-success-criteria.md +70 -0
- recursive_cleaner-1.0.0/docs/contracts/v090-cli-contract.md +197 -0
- recursive_cleaner-1.0.0/docs/contracts/v090-success-criteria.md +153 -0
- recursive_cleaner-1.0.0/docs/contracts/v100-api-contract.md +124 -0
- recursive_cleaner-1.0.0/docs/contracts/v100-success-criteria.md +127 -0
- recursive_cleaner-1.0.0/docs/handoffs/v090-research-handoff.md +71 -0
- recursive_cleaner-1.0.0/docs/handoffs/v100-research-handoff.md +46 -0
- recursive_cleaner-1.0.0/docs/implementation-plan-v080.md +182 -0
- recursive_cleaner-1.0.0/docs/research/cli-backend-patterns.md +302 -0
- recursive_cleaner-1.0.0/docs/research/cli-local-research.md +187 -0
- recursive_cleaner-1.0.0/docs/research/rich-tui-patterns.md +110 -0
- recursive_cleaner-1.0.0/docs/research/v100-apply-mode-research.md +294 -0
- recursive_cleaner-1.0.0/docs/v090-implementation-plan.md +147 -0
- recursive_cleaner-1.0.0/docs/v100-implementation-plan.md +234 -0
- recursive_cleaner-1.0.0/docs/workflow-state.md +50 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/pyproject.toml +15 -2
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/__init__.py +5 -0
- recursive_cleaner-1.0.0/recursive_cleaner/__main__.py +8 -0
- recursive_cleaner-1.0.0/recursive_cleaner/apply.py +483 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/cleaner.py +122 -29
- recursive_cleaner-1.0.0/recursive_cleaner/cli.py +395 -0
- recursive_cleaner-1.0.0/recursive_cleaner/tui.py +614 -0
- recursive_cleaner-1.0.0/tests/test_apply.py +645 -0
- recursive_cleaner-1.0.0/tests/test_cli.py +436 -0
- recursive_cleaner-1.0.0/tests/test_openai_backend.py +235 -0
- recursive_cleaner-1.0.0/tests/test_tui.py +758 -0
- recursive_cleaner-0.7.1/TODO.md +0 -129
- recursive_cleaner-0.7.1/docs/workflow-state.md +0 -26
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/.gitignore +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/LICENSE +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/backends/mlx_backend.py +0 -0
- {recursive_cleaner-0.7.1/docs → recursive_cleaner-1.0.0/docs/archive}/langchain-analysis.md +0 -0
- {recursive_cleaner-0.7.1/docs → recursive_cleaner-1.0.0/docs/archive}/langgraph-analysis.md +0 -0
- {recursive_cleaner-0.7.1/docs → recursive_cleaner-1.0.0/docs/archive}/other-frameworks-analysis.md +0 -0
- {recursive_cleaner-0.7.1/docs → recursive_cleaner-1.0.0/docs/archive}/smolagents-analysis.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/contracts/api-contract.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/contracts/data-schema.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/contracts/success-criteria.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/contracts/text-mode-contract.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/contracts/tier2-contract.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/contracts/tier4-contract.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/contracts/tier4-success-criteria.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/contracts/two-pass-contract.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/contracts/v070-success-criteria.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/handoffs/tier4-handoff.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/implementation-plan-tier4.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/implementation-plan-v03.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/implementation-plan-v04.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/implementation-plan-v05.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/implementation-plan.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/mlx-lm-guide.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/refactor-assessment/data/dependency.json +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/refactor-assessment/data/stats.json +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/refactor-assessment/plan.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/refactor-assessment/report.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/research/chonkie-extraction.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/research/chonkie.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/docs/research/markitdown.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/context.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/dependencies.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/errors.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/metrics.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/optimizer.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/output.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/parser_generator.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/parsers.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/prompt.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/report.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/response.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/schema.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/types.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/validation.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/vendor/__init__.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/recursive_cleaner/vendor/chunker.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/test_cases/ecommerce_instructions.txt +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/test_cases/ecommerce_products.jsonl +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/test_cases/financial_instructions.txt +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/test_cases/financial_transactions.jsonl +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/test_cases/healthcare_instructions.txt +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/test_cases/healthcare_patients.jsonl +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/test_cases/run_ecommerce_test.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/test_cases/run_financial_test.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/test_cases/run_healthcare_test.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/__init__.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_callbacks.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_cleaner.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_context.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_dependencies.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_dry_run.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_holdout.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_incremental.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_integration.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_latency.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_metrics.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_optimizer.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_output.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_parser_generator.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_parsers.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_report.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_sampling.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_schema.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_text_mode.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_validation.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-1.0.0}/tests/test_vendor_chunker.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
CLAUDE.md
|
|
@@ -4,7 +4,11 @@
|
|
|
4
4
|
|
|
5
5
|
| Version | Status | Date |
|
|
6
6
|
|---------|--------|------|
|
|
7
|
-
|
|
|
7
|
+
| v1.0.0 | **Implemented** | 2025-01-30 |
|
|
8
|
+
| v0.9.0 | Implemented | 2025-01-19 |
|
|
9
|
+
| v0.8.0 | Implemented | 2025-01-19 |
|
|
10
|
+
| v0.7.0 | Implemented | 2025-01-17 |
|
|
11
|
+
| v0.6.0 | Implemented | 2025-01-15 |
|
|
8
12
|
| v0.5.1 | Implemented | 2025-01-15 |
|
|
9
13
|
| v0.5.0 | Implemented | 2025-01-15 |
|
|
10
14
|
| v0.4.0 | Implemented | 2025-01-15 |
|
|
@@ -12,9 +16,13 @@
|
|
|
12
16
|
| v0.2.0 | Implemented | 2025-01-14 |
|
|
13
17
|
| v0.1.0 | Implemented | 2025-01-14 |
|
|
14
18
|
|
|
15
|
-
**Current State**:
|
|
19
|
+
**Current State**: v1.0.0 complete. 548 tests passing.
|
|
16
20
|
|
|
17
21
|
### Version History
|
|
22
|
+
- **v1.0.0**: Apply mode for applying cleaning functions to data, Excel support, TUI color enhancement
|
|
23
|
+
- **v0.9.0**: CLI tool with MLX and OpenAI-compatible backends (LM Studio, Ollama)
|
|
24
|
+
- **v0.8.0**: Terminal UI with Rich dashboard, mission control aesthetic, transmission log
|
|
25
|
+
- **v0.7.0**: Markitdown integration (20+ formats), Parquet support, LLM-generated parsers
|
|
18
26
|
- **v0.6.0**: Latency metrics, import consolidation, cleaning report, dry-run mode
|
|
19
27
|
- **v0.5.1**: Dangerous code detection (AST-based security)
|
|
20
28
|
- **v0.5.0**: Two-pass optimization with LLM agency (consolidation, early termination)
|
|
@@ -69,6 +77,8 @@ cleaner = DataCleaner(
|
|
|
69
77
|
# Observability (v0.6.0)
|
|
70
78
|
report_path="cleaning_report.md", # Generate markdown report (None to disable)
|
|
71
79
|
dry_run=False, # Set True to analyze without generating functions
|
|
80
|
+
# Terminal UI (v0.8.0)
|
|
81
|
+
tui=True, # Enable Rich dashboard (requires pip install recursive-cleaner[tui])
|
|
72
82
|
)
|
|
73
83
|
|
|
74
84
|
cleaner.run() # Outputs: cleaning_functions.py, cleaning_report.md
|
|
@@ -159,6 +169,7 @@ recursive_cleaner/
|
|
|
159
169
|
report.py # Markdown report generation (~120 lines) [v0.6.0]
|
|
160
170
|
response.py # XML/markdown parsing + agency dataclasses (~292 lines)
|
|
161
171
|
schema.py # Schema inference (~117 lines) [v0.2.0]
|
|
172
|
+
tui.py # Rich terminal dashboard (~614 lines) [v0.8.0, colors in v1.0.0]
|
|
162
173
|
types.py # LLMBackend protocol (~11 lines)
|
|
163
174
|
validation.py # Runtime validation + safety checks (~200 lines)
|
|
164
175
|
vendor/
|
|
@@ -187,6 +198,7 @@ tests/ # 392 tests
|
|
|
187
198
|
test_sampling.py # Sampling strategy tests [v0.4.0]
|
|
188
199
|
test_schema.py # Schema inference tests
|
|
189
200
|
test_text_mode.py # Text mode tests [v0.3.0]
|
|
201
|
+
test_tui.py # Terminal UI tests [v0.8.0]
|
|
190
202
|
test_validation.py # Runtime validation + safety tests
|
|
191
203
|
test_vendor_chunker.py # Vendored chunker tests [v0.3.0]
|
|
192
204
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: recursive-cleaner
|
|
3
|
-
Version: 0.7.1
|
|
3
|
+
Version: 1.0.0
|
|
4
4
|
Summary: LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions
|
|
5
5
|
Project-URL: Homepage, https://github.com/gaztrabisme/recursive-data-cleaner
|
|
6
6
|
Project-URL: Repository, https://github.com/gaztrabisme/recursive-data-cleaner
|
|
@@ -9,7 +9,7 @@ Author: Gary Tran
|
|
|
9
9
|
License-Expression: MIT
|
|
10
10
|
License-File: LICENSE
|
|
11
11
|
Keywords: automation,data-cleaning,data-quality,etl,llm,machine-learning
|
|
12
|
-
Classifier: Development Status ::
|
|
12
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
13
13
|
Classifier: Intended Audience :: Developers
|
|
14
14
|
Classifier: Intended Audience :: Science/Research
|
|
15
15
|
Classifier: License :: OSI Approved :: MIT License
|
|
@@ -26,12 +26,19 @@ Requires-Dist: tenacity>=8.0
|
|
|
26
26
|
Provides-Extra: dev
|
|
27
27
|
Requires-Dist: pytest-cov>=4.0; extra == 'dev'
|
|
28
28
|
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
29
|
+
Provides-Extra: excel
|
|
30
|
+
Requires-Dist: openpyxl>=3.0.0; extra == 'excel'
|
|
31
|
+
Requires-Dist: xlrd>=2.0.0; extra == 'excel'
|
|
29
32
|
Provides-Extra: markitdown
|
|
30
33
|
Requires-Dist: markitdown>=0.1.0; extra == 'markitdown'
|
|
31
34
|
Provides-Extra: mlx
|
|
32
35
|
Requires-Dist: mlx-lm>=0.10.0; extra == 'mlx'
|
|
36
|
+
Provides-Extra: openai
|
|
37
|
+
Requires-Dist: openai>=1.0.0; extra == 'openai'
|
|
33
38
|
Provides-Extra: parquet
|
|
34
39
|
Requires-Dist: pyarrow>=14.0.0; extra == 'parquet'
|
|
40
|
+
Provides-Extra: tui
|
|
41
|
+
Requires-Dist: rich>=13.0; extra == 'tui'
|
|
35
42
|
Description-Content-Type: text/markdown
|
|
36
43
|
|
|
37
44
|
# Recursive Data Cleaner
|
|
@@ -69,6 +76,11 @@ For Parquet files:
|
|
|
69
76
|
pip install -e ".[parquet]"
|
|
70
77
|
```
|
|
71
78
|
|
|
79
|
+
For Terminal UI (Rich dashboard):
|
|
80
|
+
```bash
|
|
81
|
+
pip install -e ".[tui]"
|
|
82
|
+
```
|
|
83
|
+
|
|
72
84
|
## Quick Start
|
|
73
85
|
|
|
74
86
|
```python
|
|
@@ -126,6 +138,98 @@ cleaner.run() # Generates cleaning_functions.py
|
|
|
126
138
|
- **Parquet Support**: Load parquet files as structured data via pyarrow
|
|
127
139
|
- **LLM-Generated Parsers**: Auto-generate parsers for XML and unknown formats (`auto_parse=True`)
|
|
128
140
|
|
|
141
|
+
### Terminal UI (v0.8.0)
|
|
142
|
+
- **Mission Control Dashboard**: Rich-based live terminal UI with retro aesthetic
|
|
143
|
+
- **Real-time Progress**: Animated progress bars, chunk/iteration counters
|
|
144
|
+
- **Transmission Log**: Parsed LLM responses showing issues detected and functions being generated
|
|
145
|
+
- **Token Estimation**: Track estimated input/output tokens across the run
|
|
146
|
+
- **Graceful Fallback**: Works without Rich installed (falls back to callbacks)
|
|
147
|
+
|
|
148
|
+
### CLI (v0.9.0)
|
|
149
|
+
- **Command Line Interface**: Use without writing Python code
|
|
150
|
+
- **Multiple Backends**: MLX (Apple Silicon) and OpenAI-compatible (OpenAI, LM Studio, Ollama)
|
|
151
|
+
- **Four Commands**: `generate`, `analyze` (dry-run), `resume`, and `apply` (`apply` was added in v1.0.0)
|
|
152
|
+
|
|
153
|
+
### Apply Mode (v1.0.0)
|
|
154
|
+
- **Apply Cleaning Functions**: Apply generated functions to full datasets
|
|
155
|
+
- **Data Formats**: JSONL, CSV, JSON, Parquet, and Excel (.xlsx/.xls) inputs are written back in the same format as the input
|
|
156
|
+
- **Text Formats**: PDF, Word, HTML, and other document inputs are written out as Markdown
|
|
157
|
+
- **Streaming**: Memory-efficient line-by-line processing for JSONL/CSV
|
|
158
|
+
- **Colored TUI**: Enhanced transmission log with syntax-highlighted XML parsing
|
|
159
|
+
|
|
160
|
+
## Command Line Interface
|
|
161
|
+
|
|
162
|
+
After installation, the `recursive-cleaner` command is available:
|
|
163
|
+
|
|
164
|
+
```bash
|
|
165
|
+
# Generate cleaning functions with MLX (Apple Silicon)
|
|
166
|
+
recursive-cleaner generate data.jsonl \
|
|
167
|
+
--provider mlx \
|
|
168
|
+
--model "lmstudio-community/Qwen3-80B-MLX-4bit" \
|
|
169
|
+
--instructions "Normalize phone numbers to E.164" \
|
|
170
|
+
--output cleaning_functions.py
|
|
171
|
+
|
|
172
|
+
# Use OpenAI
|
|
173
|
+
export OPENAI_API_KEY=your-key
|
|
174
|
+
recursive-cleaner generate data.jsonl \
|
|
175
|
+
--provider openai \
|
|
176
|
+
--model gpt-4o \
|
|
177
|
+
--instructions "Fix date formats"
|
|
178
|
+
|
|
179
|
+
# Use LM Studio or Ollama (OpenAI-compatible)
|
|
180
|
+
recursive-cleaner generate data.jsonl \
|
|
181
|
+
--provider openai \
|
|
182
|
+
--model "qwen/qwen3-vl-30b" \
|
|
183
|
+
--base-url http://localhost:1234/v1 \
|
|
184
|
+
--instructions "Normalize prices"
|
|
185
|
+
|
|
186
|
+
# Dry-run analysis
|
|
187
|
+
recursive-cleaner analyze data.jsonl \
|
|
188
|
+
--provider openai \
|
|
189
|
+
--model gpt-4o \
|
|
190
|
+
--instructions @instructions.txt
|
|
191
|
+
|
|
192
|
+
# Resume from checkpoint
|
|
193
|
+
recursive-cleaner resume cleaning_state.json \
|
|
194
|
+
--provider mlx \
|
|
195
|
+
--model "model-path"
|
|
196
|
+
|
|
197
|
+
# Apply cleaning functions to data
|
|
198
|
+
recursive-cleaner apply data.jsonl \
|
|
199
|
+
--functions cleaning_functions.py \
|
|
200
|
+
--output cleaned_data.jsonl
|
|
201
|
+
|
|
202
|
+
# Apply to Excel (outputs same format)
|
|
203
|
+
recursive-cleaner apply sales.xlsx \
|
|
204
|
+
--functions cleaning_functions.py
|
|
205
|
+
|
|
206
|
+
# Apply to PDF (outputs markdown)
|
|
207
|
+
recursive-cleaner apply document.pdf \
|
|
208
|
+
--functions cleaning_functions.py \
|
|
209
|
+
--output cleaned.md
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
### CLI Options
|
|
213
|
+
|
|
214
|
+
```
|
|
215
|
+
recursive-cleaner generate <FILE> [OPTIONS]
|
|
216
|
+
|
|
217
|
+
Required:
|
|
218
|
+
FILE Input data file
|
|
219
|
+
-p, --provider {mlx,openai} LLM provider
|
|
220
|
+
-m, --model MODEL Model name/path
|
|
221
|
+
|
|
222
|
+
Optional:
|
|
223
|
+
-i, --instructions TEXT Cleaning instructions (or @file.txt)
|
|
224
|
+
--base-url URL API URL for OpenAI-compatible servers
|
|
225
|
+
--chunk-size N Items per chunk (default: 50)
|
|
226
|
+
--max-iterations N Max iterations per chunk (default: 5)
|
|
227
|
+
-o, --output PATH Output file (default: cleaning_functions.py)
|
|
228
|
+
--tui Enable Rich dashboard
|
|
229
|
+
--optimize Consolidate redundant functions
|
|
230
|
+
--track-metrics Measure before/after quality
|
|
231
|
+
```
|
|
232
|
+
|
|
129
233
|
## Configuration
|
|
130
234
|
|
|
131
235
|
```python
|
|
@@ -160,6 +264,9 @@ cleaner = DataCleaner(
|
|
|
160
264
|
# Format Expansion
|
|
161
265
|
auto_parse=False, # LLM generates parser for unknown formats
|
|
162
266
|
|
|
267
|
+
# Terminal UI
|
|
268
|
+
tui=True, # Enable Rich dashboard (requires [tui] extra)
|
|
269
|
+
|
|
163
270
|
# Progress & State
|
|
164
271
|
on_progress=callback, # Progress event callback
|
|
165
272
|
state_file="state.json", # Enable resume on interrupt
|
|
@@ -253,6 +360,7 @@ cleaner.run()
|
|
|
253
360
|
|
|
254
361
|
```
|
|
255
362
|
recursive_cleaner/
|
|
363
|
+
├── cli.py # Command line interface
|
|
256
364
|
├── cleaner.py # Main DataCleaner class
|
|
257
365
|
├── context.py # Docstring registry with FIFO eviction
|
|
258
366
|
├── dependencies.py # Topological sort for function ordering
|
|
@@ -265,9 +373,14 @@ recursive_cleaner/
|
|
|
265
373
|
├── report.py # Markdown report generation
|
|
266
374
|
├── response.py # XML/markdown parsing + agency dataclasses
|
|
267
375
|
├── schema.py # Schema inference
|
|
376
|
+
├── tui.py # Rich terminal dashboard
|
|
268
377
|
├── validation.py # Runtime validation + holdout
|
|
269
378
|
└── vendor/
|
|
270
379
|
└── chunker.py # Vendored sentence-aware chunker
|
|
380
|
+
|
|
381
|
+
backends/
|
|
382
|
+
├── mlx_backend.py # MLX-LM backend for Apple Silicon
|
|
383
|
+
└── openai_backend.py # OpenAI-compatible backend
|
|
271
384
|
```
|
|
272
385
|
|
|
273
386
|
## Testing
|
|
@@ -276,14 +389,14 @@ recursive_cleaner/
|
|
|
276
389
|
pytest tests/ -v
|
|
277
390
|
```
|
|
278
391
|
|
|
279
|
-
|
|
392
|
+
548 tests covering all features. Test datasets in `test_cases/`:
|
|
280
393
|
- E-commerce product catalogs
|
|
281
394
|
- Healthcare patient records
|
|
282
395
|
- Financial transaction data
|
|
283
396
|
|
|
284
397
|
## Philosophy
|
|
285
398
|
|
|
286
|
-
- **Simplicity over extensibility**: ~
|
|
399
|
+
- **Simplicity over extensibility**: ~5,000 lines that do one thing well
|
|
287
400
|
- **stdlib over dependencies**: Only `tenacity` required
|
|
288
401
|
- **Retry over recover**: On error, retry with error in prompt
|
|
289
402
|
- **Wu wei**: Let the LLM make decisions about data it understands
|
|
@@ -292,6 +405,8 @@ pytest tests/ -v
|
|
|
292
405
|
|
|
293
406
|
| Version | Features |
|
|
294
407
|
|---------|----------|
|
|
408
|
+
| v0.9.0 | CLI tool with MLX and OpenAI-compatible backends (LM Studio, Ollama) |
|
|
409
|
+
| v0.8.0 | Terminal UI with Rich dashboard, mission control aesthetic, transmission log |
|
|
295
410
|
| v0.7.0 | Markitdown (20+ formats), Parquet support, LLM-generated parsers |
|
|
296
411
|
| v0.6.0 | Latency metrics, import consolidation, cleaning report, dry-run mode |
|
|
297
412
|
| v0.5.1 | Dangerous code detection (AST-based security) |
|
|
@@ -33,6 +33,11 @@ For Parquet files:
|
|
|
33
33
|
pip install -e ".[parquet]"
|
|
34
34
|
```
|
|
35
35
|
|
|
36
|
+
For Terminal UI (Rich dashboard):
|
|
37
|
+
```bash
|
|
38
|
+
pip install -e ".[tui]"
|
|
39
|
+
```
|
|
40
|
+
|
|
36
41
|
## Quick Start
|
|
37
42
|
|
|
38
43
|
```python
|
|
@@ -90,6 +95,98 @@ cleaner.run() # Generates cleaning_functions.py
|
|
|
90
95
|
- **Parquet Support**: Load parquet files as structured data via pyarrow
|
|
91
96
|
- **LLM-Generated Parsers**: Auto-generate parsers for XML and unknown formats (`auto_parse=True`)
|
|
92
97
|
|
|
98
|
+
### Terminal UI (v0.8.0)
|
|
99
|
+
- **Mission Control Dashboard**: Rich-based live terminal UI with retro aesthetic
|
|
100
|
+
- **Real-time Progress**: Animated progress bars, chunk/iteration counters
|
|
101
|
+
- **Transmission Log**: Parsed LLM responses showing issues detected and functions being generated
|
|
102
|
+
- **Token Estimation**: Track estimated input/output tokens across the run
|
|
103
|
+
- **Graceful Fallback**: Works without Rich installed (falls back to callbacks)
|
|
104
|
+
|
|
105
|
+
### CLI (v0.9.0)
|
|
106
|
+
- **Command Line Interface**: Use without writing Python code
|
|
107
|
+
- **Multiple Backends**: MLX (Apple Silicon) and OpenAI-compatible (OpenAI, LM Studio, Ollama)
|
|
108
|
+
- **Four Commands**: `generate`, `analyze` (dry-run), `resume`, and `apply` (`apply` was added in v1.0.0)
|
|
109
|
+
|
|
110
|
+
### Apply Mode (v1.0.0)
|
|
111
|
+
- **Apply Cleaning Functions**: Apply generated functions to full datasets
|
|
112
|
+
- **Data Formats**: JSONL, CSV, JSON, Parquet, and Excel (.xlsx/.xls) inputs are written back in the same format as the input
|
|
113
|
+
- **Text Formats**: PDF, Word, HTML, and other document inputs are written out as Markdown
|
|
114
|
+
- **Streaming**: Memory-efficient line-by-line processing for JSONL/CSV
|
|
115
|
+
- **Colored TUI**: Enhanced transmission log with syntax-highlighted XML parsing
|
|
116
|
+
|
|
117
|
+
## Command Line Interface
|
|
118
|
+
|
|
119
|
+
After installation, the `recursive-cleaner` command is available:
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
# Generate cleaning functions with MLX (Apple Silicon)
|
|
123
|
+
recursive-cleaner generate data.jsonl \
|
|
124
|
+
--provider mlx \
|
|
125
|
+
--model "lmstudio-community/Qwen3-80B-MLX-4bit" \
|
|
126
|
+
--instructions "Normalize phone numbers to E.164" \
|
|
127
|
+
--output cleaning_functions.py
|
|
128
|
+
|
|
129
|
+
# Use OpenAI
|
|
130
|
+
export OPENAI_API_KEY=your-key
|
|
131
|
+
recursive-cleaner generate data.jsonl \
|
|
132
|
+
--provider openai \
|
|
133
|
+
--model gpt-4o \
|
|
134
|
+
--instructions "Fix date formats"
|
|
135
|
+
|
|
136
|
+
# Use LM Studio or Ollama (OpenAI-compatible)
|
|
137
|
+
recursive-cleaner generate data.jsonl \
|
|
138
|
+
--provider openai \
|
|
139
|
+
--model "qwen/qwen3-vl-30b" \
|
|
140
|
+
--base-url http://localhost:1234/v1 \
|
|
141
|
+
--instructions "Normalize prices"
|
|
142
|
+
|
|
143
|
+
# Dry-run analysis
|
|
144
|
+
recursive-cleaner analyze data.jsonl \
|
|
145
|
+
--provider openai \
|
|
146
|
+
--model gpt-4o \
|
|
147
|
+
--instructions @instructions.txt
|
|
148
|
+
|
|
149
|
+
# Resume from checkpoint
|
|
150
|
+
recursive-cleaner resume cleaning_state.json \
|
|
151
|
+
--provider mlx \
|
|
152
|
+
--model "model-path"
|
|
153
|
+
|
|
154
|
+
# Apply cleaning functions to data
|
|
155
|
+
recursive-cleaner apply data.jsonl \
|
|
156
|
+
--functions cleaning_functions.py \
|
|
157
|
+
--output cleaned_data.jsonl
|
|
158
|
+
|
|
159
|
+
# Apply to Excel (outputs same format)
|
|
160
|
+
recursive-cleaner apply sales.xlsx \
|
|
161
|
+
--functions cleaning_functions.py
|
|
162
|
+
|
|
163
|
+
# Apply to PDF (outputs markdown)
|
|
164
|
+
recursive-cleaner apply document.pdf \
|
|
165
|
+
--functions cleaning_functions.py \
|
|
166
|
+
--output cleaned.md
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
### CLI Options
|
|
170
|
+
|
|
171
|
+
```
|
|
172
|
+
recursive-cleaner generate <FILE> [OPTIONS]
|
|
173
|
+
|
|
174
|
+
Required:
|
|
175
|
+
FILE Input data file
|
|
176
|
+
-p, --provider {mlx,openai} LLM provider
|
|
177
|
+
-m, --model MODEL Model name/path
|
|
178
|
+
|
|
179
|
+
Optional:
|
|
180
|
+
-i, --instructions TEXT Cleaning instructions (or @file.txt)
|
|
181
|
+
--base-url URL API URL for OpenAI-compatible servers
|
|
182
|
+
--chunk-size N Items per chunk (default: 50)
|
|
183
|
+
--max-iterations N Max iterations per chunk (default: 5)
|
|
184
|
+
-o, --output PATH Output file (default: cleaning_functions.py)
|
|
185
|
+
--tui Enable Rich dashboard
|
|
186
|
+
--optimize Consolidate redundant functions
|
|
187
|
+
--track-metrics Measure before/after quality
|
|
188
|
+
```
|
|
189
|
+
|
|
93
190
|
## Configuration
|
|
94
191
|
|
|
95
192
|
```python
|
|
@@ -124,6 +221,9 @@ cleaner = DataCleaner(
|
|
|
124
221
|
# Format Expansion
|
|
125
222
|
auto_parse=False, # LLM generates parser for unknown formats
|
|
126
223
|
|
|
224
|
+
# Terminal UI
|
|
225
|
+
tui=True, # Enable Rich dashboard (requires [tui] extra)
|
|
226
|
+
|
|
127
227
|
# Progress & State
|
|
128
228
|
on_progress=callback, # Progress event callback
|
|
129
229
|
state_file="state.json", # Enable resume on interrupt
|
|
@@ -217,6 +317,7 @@ cleaner.run()
|
|
|
217
317
|
|
|
218
318
|
```
|
|
219
319
|
recursive_cleaner/
|
|
320
|
+
├── cli.py # Command line interface
|
|
220
321
|
├── cleaner.py # Main DataCleaner class
|
|
221
322
|
├── context.py # Docstring registry with FIFO eviction
|
|
222
323
|
├── dependencies.py # Topological sort for function ordering
|
|
@@ -229,9 +330,14 @@ recursive_cleaner/
|
|
|
229
330
|
├── report.py # Markdown report generation
|
|
230
331
|
├── response.py # XML/markdown parsing + agency dataclasses
|
|
231
332
|
├── schema.py # Schema inference
|
|
333
|
+
├── tui.py # Rich terminal dashboard
|
|
232
334
|
├── validation.py # Runtime validation + holdout
|
|
233
335
|
└── vendor/
|
|
234
336
|
└── chunker.py # Vendored sentence-aware chunker
|
|
337
|
+
|
|
338
|
+
backends/
|
|
339
|
+
├── mlx_backend.py # MLX-LM backend for Apple Silicon
|
|
340
|
+
└── openai_backend.py # OpenAI-compatible backend
|
|
235
341
|
```
|
|
236
342
|
|
|
237
343
|
## Testing
|
|
@@ -240,14 +346,14 @@ recursive_cleaner/
|
|
|
240
346
|
pytest tests/ -v
|
|
241
347
|
```
|
|
242
348
|
|
|
243
|
-
|
|
349
|
+
548 tests covering all features. Test datasets in `test_cases/`:
|
|
244
350
|
- E-commerce product catalogs
|
|
245
351
|
- Healthcare patient records
|
|
246
352
|
- Financial transaction data
|
|
247
353
|
|
|
248
354
|
## Philosophy
|
|
249
355
|
|
|
250
|
-
- **Simplicity over extensibility**: ~
|
|
356
|
+
- **Simplicity over extensibility**: ~5,000 lines that do one thing well
|
|
251
357
|
- **stdlib over dependencies**: Only `tenacity` required
|
|
252
358
|
- **Retry over recover**: On error, retry with error in prompt
|
|
253
359
|
- **Wu wei**: Let the LLM make decisions about data it understands
|
|
@@ -256,6 +362,8 @@ pytest tests/ -v
|
|
|
256
362
|
|
|
257
363
|
| Version | Features |
|
|
258
364
|
|---------|----------|
|
|
365
|
+
| v0.9.0 | CLI tool with MLX and OpenAI-compatible backends (LM Studio, Ollama) |
|
|
366
|
+
| v0.8.0 | Terminal UI with Rich dashboard, mission control aesthetic, transmission log |
|
|
259
367
|
| v0.7.0 | Markitdown (20+ formats), Parquet support, LLM-generated parsers |
|
|
260
368
|
| v0.6.0 | Latency metrics, import consolidation, cleaning report, dry-run mode |
|
|
261
369
|
| v0.5.1 | Dangerous code detection (AST-based security) |
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
# TODO - Recursive Data Cleaner Roadmap
|
|
2
|
+
|
|
3
|
+
## Current Version: v0.9.0
|
|
4
|
+
|
|
5
|
+
502 tests passing, ~3,400 lines. CLI complete.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Completed Work
|
|
10
|
+
|
|
11
|
+
| Version | Features |
|
|
12
|
+
|---------|----------|
|
|
13
|
+
| v0.1.0 | Core pipeline, chunking, docstring registry |
|
|
14
|
+
| v0.2.0 | Runtime validation, schema inference, callbacks, incremental saves |
|
|
15
|
+
| v0.3.0 | Text mode with sentence-aware chunking |
|
|
16
|
+
| v0.4.0 | Holdout validation, dependency resolution, smart sampling, quality metrics |
|
|
17
|
+
| v0.5.0 | Two-pass optimization, early termination, LLM agency |
|
|
18
|
+
| v0.5.1 | Dangerous code detection (AST-based security) |
|
|
19
|
+
| v0.6.0 | Latency metrics, import consolidation, cleaning report, dry-run mode |
|
|
20
|
+
| v0.7.0 | Markitdown (20+ formats), Parquet support, LLM-generated parsers |
|
|
21
|
+
| v0.8.0 | Terminal UI with Rich dashboard, mission control aesthetic |
|
|
22
|
+
| v0.9.0 | CLI tool with MLX and OpenAI-compatible backends |
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Version Progression
|
|
27
|
+
|
|
28
|
+
| Version | Theme |
|
|
29
|
+
|---------|-------|
|
|
30
|
+
| v0.1-0.2 | Core pipeline + validation |
|
|
31
|
+
| v0.3-0.4 | Data quality assurance |
|
|
32
|
+
| v0.5-0.6 | Optimization + observability |
|
|
33
|
+
| v0.7-0.8 | Accessibility (formats + UI) |
|
|
34
|
+
| v0.9-1.0 | Complete workflow |
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
## Roadmap to v1.0
|
|
39
|
+
|
|
40
|
+
### v0.9.0 - CLI Tool ✅ COMPLETE
|
|
41
|
+
|
|
42
|
+
CLI implemented with:
|
|
43
|
+
- `recursive_cleaner/cli.py` - argparse CLI (346 lines)
|
|
44
|
+
- `backends/openai_backend.py` - OpenAI-compatible backend (71 lines)
|
|
45
|
+
- Commands: `generate`, `analyze`, `resume`
|
|
46
|
+
- Backends: MLX, OpenAI, LM Studio, Ollama (via --base-url)
|
|
47
|
+
|
|
48
|
+
### v1.0.0 - Apply Mode (~150 lines)
|
|
49
|
+
|
|
50
|
+
The final step: actually cleaning the data, not just generating functions.
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
cleaner = DataCleaner(...)
|
|
54
|
+
cleaner.run() # Generates cleaning_functions.py
|
|
55
|
+
|
|
56
|
+
# NEW: Apply to full dataset
|
|
57
|
+
cleaner.apply(output_path="cleaned_data.jsonl")
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
**Implementation:**
|
|
61
|
+
- [ ] `DataCleaner.apply(output_path)` method
|
|
62
|
+
- [ ] Stream-process file applying generated functions
|
|
63
|
+
- [ ] Progress callbacks for large files
|
|
64
|
+
- [ ] Validate output schema matches input
|
|
65
|
+
- [ ] CLI integration: `recursive-cleaner apply`
|
|
66
|
+
|
|
67
|
+
---
|
|
68
|
+
|
|
69
|
+
## Patterns That Worked
|
|
70
|
+
|
|
71
|
+
These patterns proved high-value with low implementation effort:
|
|
72
|
+
|
|
73
|
+
1. **AST walking** - Dependency detection, dangerous code detection. ~50 lines each.
|
|
74
|
+
2. **LLM agency** - Let model decide chunk cleanliness, saturation, consolidation. Elegant.
|
|
75
|
+
3. **Retry with feedback** - On error, append error to prompt and retry. No complex recovery.
|
|
76
|
+
4. **Holdout validation** - Test on unseen data before accepting. Catches edge cases.
|
|
77
|
+
5. **Simple data structures** - List of dicts, JSON serialization. Easy to debug/resume.
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## What We're Not Doing
|
|
82
|
+
|
|
83
|
+
| Feature | Reason |
|
|
84
|
+
|---------|--------|
|
|
85
|
+
| Global deduplication | Adds complexity, breaks chunk-based philosophy |
|
|
86
|
+
| Built-in LLM backends | Users bring their own, keeps us dependency-free |
|
|
87
|
+
| Config files (YAML/TOML) | Python is already config, YAGNI |
|
|
88
|
+
| Plugin system | No interfaces for things with one implementation |
|
|
89
|
+
| Async multi-chunk | Complexity not justified; sequential is predictable |
|
|
90
|
+
| Vector retrieval | Adds chromadb dependency; FIFO works for typical use |
|
|
91
|
+
|
|
92
|
+
---
|
|
93
|
+
|
|
94
|
+
## Line Count Budget
|
|
95
|
+
|
|
96
|
+
| Component | Current | After v1.0 |
|
|
97
|
+
|-----------|---------|------------|
|
|
98
|
+
| Core library | ~3,000 | ~3,350 |
|
|
99
|
+
| Tests | ~4,000 | ~4,400 |
|
|
100
|
+
|
|
101
|
+
Staying under 3,500 lines for the library keeps us true to the philosophy.
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## Philosophy Reminder
|
|
106
|
+
|
|
107
|
+
From CLAUDE.md:
|
|
108
|
+
- **Simplicity over extensibility** - Keep it lean
|
|
109
|
+
- **stdlib over dependencies** - Only tenacity required
|
|
110
|
+
- **Functions over classes** - Unless state genuinely helps
|
|
111
|
+
- **Delete over abstract** - No interfaces for single implementations
|
|
112
|
+
- **Retry over recover** - On error, retry with error in prompt
|
|
113
|
+
- **Wu wei** - Let the LLM make decisions about data it understands
|
|
114
|
+
|
|
115
|
+
---
|
|
116
|
+
|
|
117
|
+
## Known Limitation
|
|
118
|
+
|
|
119
|
+
**Stateful ops within chunks only** - Deduplication and aggregations don't work globally. This is architectural and accepted.
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""OpenAI-compatible backend for Recursive Data Cleaner."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class OpenAIBackend:
|
|
7
|
+
"""
|
|
8
|
+
OpenAI-compatible backend implementation.
|
|
9
|
+
|
|
10
|
+
Works with OpenAI API, LM Studio, Ollama, and other OpenAI-compatible servers.
|
|
11
|
+
Conforms to the LLMBackend protocol.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
def __init__(
|
|
15
|
+
self,
|
|
16
|
+
model: str,
|
|
17
|
+
api_key: str | None = None,
|
|
18
|
+
base_url: str | None = None,
|
|
19
|
+
max_tokens: int = 4096,
|
|
20
|
+
temperature: float = 0.7,
|
|
21
|
+
):
|
|
22
|
+
"""
|
|
23
|
+
Initialize the OpenAI backend.
|
|
24
|
+
|
|
25
|
+
Args:
|
|
26
|
+
model: Model name (e.g., "gpt-4o", "gpt-3.5-turbo")
|
|
27
|
+
api_key: API key (defaults to OPENAI_API_KEY env var, or "not-needed" for local)
|
|
28
|
+
base_url: API base URL (defaults to OpenAI's API)
|
|
29
|
+
max_tokens: Maximum tokens to generate
|
|
30
|
+
temperature: Sampling temperature
|
|
31
|
+
"""
|
|
32
|
+
try:
|
|
33
|
+
import openai
|
|
34
|
+
except ImportError:
|
|
35
|
+
raise ImportError(
|
|
36
|
+
"OpenAI SDK not installed. Install with: pip install openai"
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
self.model = model
|
|
40
|
+
self.max_tokens = max_tokens
|
|
41
|
+
self.temperature = temperature
|
|
42
|
+
|
|
43
|
+
# Resolve API key: explicit > env var > "not-needed" for local servers
|
|
44
|
+
if api_key is not None:
|
|
45
|
+
resolved_key = api_key
|
|
46
|
+
else:
|
|
47
|
+
resolved_key = os.environ.get("OPENAI_API_KEY", "not-needed")
|
|
48
|
+
|
|
49
|
+
# Create client
|
|
50
|
+
self._client = openai.OpenAI(
|
|
51
|
+
api_key=resolved_key,
|
|
52
|
+
base_url=base_url,
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
def generate(self, prompt: str) -> str:
|
|
56
|
+
"""
|
|
57
|
+
Generate a response from the LLM.
|
|
58
|
+
|
|
59
|
+
Args:
|
|
60
|
+
prompt: The input prompt
|
|
61
|
+
|
|
62
|
+
Returns:
|
|
63
|
+
The generated text response
|
|
64
|
+
"""
|
|
65
|
+
response = self._client.chat.completions.create(
|
|
66
|
+
model=self.model,
|
|
67
|
+
messages=[{"role": "user", "content": prompt}],
|
|
68
|
+
max_tokens=self.max_tokens,
|
|
69
|
+
temperature=self.temperature,
|
|
70
|
+
)
|
|
71
|
+
return response.choices[0].message.content or ""
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
#!/usr/bin/env python3
"""
Demo script that showcases the Rich TUI with a real MLX backend.

Run with:
    python demo_tui.py

Requirements:
    pip install recursive-cleaner[mlx,tui]
"""

from backends import MLXBackend
from recursive_cleaner import DataCleaner

# Model identifier handed to MLXBackend; change it to your preferred model.
MODEL = "lmstudio-community/Qwen3-Next-80B-A3B-Instruct-MLX-4bit"

# Cleaning goals passed to the cleaner as free-form instructions.
INSTRUCTIONS = """
E-commerce product data cleaning:
- Normalize prices to float (remove $ symbols)
- Fix category typos and normalize to Title Case
- Convert weights to kg as float
- Ensure stock_quantity is non-negative integer
"""

# Opening banner, followed by a note about the model being loaded.
print("=" * 60, "  RECURSIVE DATA CLEANER - TUI DEMO", "=" * 60, sep="\n")
print(
    f"\nLoading model: {MODEL}",
    "This may take a moment on first run...\n",
    sep="\n",
)

llm = MLXBackend(
    model_path=MODEL,
    max_tokens=2048,
    temperature=0.3,  # lower temperature for more consistent output
    verbose=False,  # no token streaming, so it does not interfere with the TUI
)

cleaner = DataCleaner(
    llm_backend=llm,
    file_path="test_cases/ecommerce_products.jsonl",
    chunk_size=5,  # small chunks for the demo
    max_iterations=3,  # cap on iterations per chunk
    instructions=INSTRUCTIONS,
    tui=True,  # turn on the Rich dashboard
    track_metrics=True,
)

print("\nStarting cleaner with TUI enabled...", "Watch the dashboard below!\n", sep="\n")

cleaner.run()

# Closing banner.
print(
    "",
    "=" * 60,
    "Demo complete! Check cleaning_functions.py for output.",
    "=" * 60,
    sep="\n",
)
|