recursive-cleaner 0.7.1__tar.gz → 0.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/CLAUDE.md +10 -2
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/PKG-INFO +21 -2
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/README.md +18 -1
- recursive_cleaner-0.8.0/demo_tui.py +54 -0
- recursive_cleaner-0.8.0/docs/contracts/v080-api-contract.md +62 -0
- recursive_cleaner-0.8.0/docs/contracts/v080-data-schema.md +90 -0
- recursive_cleaner-0.8.0/docs/contracts/v080-success-criteria.md +70 -0
- recursive_cleaner-0.8.0/docs/implementation-plan-v080.md +182 -0
- recursive_cleaner-0.8.0/docs/research/rich-tui-patterns.md +110 -0
- recursive_cleaner-0.8.0/docs/workflow-state.md +24 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/pyproject.toml +4 -1
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/recursive_cleaner/__init__.py +3 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/recursive_cleaner/cleaner.py +117 -26
- recursive_cleaner-0.8.0/recursive_cleaner/tui.py +595 -0
- recursive_cleaner-0.8.0/tests/test_tui.py +758 -0
- recursive_cleaner-0.7.1/docs/workflow-state.md +0 -26
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/.gitignore +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/LICENSE +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/TODO.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/backends/__init__.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/backends/mlx_backend.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/docs/contracts/api-contract.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/docs/contracts/data-schema.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/docs/contracts/success-criteria.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/docs/contracts/text-mode-contract.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/docs/contracts/tier2-contract.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/docs/contracts/tier4-contract.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/docs/contracts/tier4-success-criteria.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/docs/contracts/two-pass-contract.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/docs/contracts/v070-success-criteria.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/docs/handoffs/tier4-handoff.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/docs/implementation-plan-tier4.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/docs/implementation-plan-v03.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/docs/implementation-plan-v04.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/docs/implementation-plan-v05.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/docs/implementation-plan.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/docs/langchain-analysis.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/docs/langgraph-analysis.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/docs/mlx-lm-guide.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/docs/other-frameworks-analysis.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/docs/refactor-assessment/data/dependency.json +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/docs/refactor-assessment/data/stats.json +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/docs/refactor-assessment/plan.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/docs/refactor-assessment/report.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/docs/research/chonkie-extraction.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/docs/research/chonkie.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/docs/research/markitdown.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/docs/smolagents-analysis.md +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/recursive_cleaner/context.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/recursive_cleaner/dependencies.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/recursive_cleaner/errors.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/recursive_cleaner/metrics.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/recursive_cleaner/optimizer.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/recursive_cleaner/output.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/recursive_cleaner/parser_generator.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/recursive_cleaner/parsers.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/recursive_cleaner/prompt.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/recursive_cleaner/report.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/recursive_cleaner/response.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/recursive_cleaner/schema.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/recursive_cleaner/types.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/recursive_cleaner/validation.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/recursive_cleaner/vendor/__init__.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/recursive_cleaner/vendor/chunker.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/test_cases/ecommerce_instructions.txt +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/test_cases/ecommerce_products.jsonl +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/test_cases/financial_instructions.txt +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/test_cases/financial_transactions.jsonl +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/test_cases/healthcare_instructions.txt +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/test_cases/healthcare_patients.jsonl +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/test_cases/run_ecommerce_test.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/test_cases/run_financial_test.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/test_cases/run_healthcare_test.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/tests/__init__.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/tests/test_callbacks.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/tests/test_cleaner.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/tests/test_context.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/tests/test_dependencies.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/tests/test_dry_run.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/tests/test_holdout.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/tests/test_incremental.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/tests/test_integration.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/tests/test_latency.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/tests/test_metrics.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/tests/test_optimizer.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/tests/test_output.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/tests/test_parser_generator.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/tests/test_parsers.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/tests/test_report.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/tests/test_sampling.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/tests/test_schema.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/tests/test_text_mode.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/tests/test_validation.py +0 -0
- {recursive_cleaner-0.7.1 → recursive_cleaner-0.8.0}/tests/test_vendor_chunker.py +0 -0
|
@@ -4,7 +4,9 @@
|
|
|
4
4
|
|
|
5
5
|
| Version | Status | Date |
|
|
6
6
|
|---------|--------|------|
|
|
7
|
-
| v0.6.0 | **Implemented** | 2025-01-15 |
|
|
7
|
+
| v0.8.0 | **Implemented** | 2025-01-19 |
|
|
8
|
+
| v0.7.0 | Implemented | 2025-01-17 |
|
|
9
|
+
| v0.6.0 | Implemented | 2025-01-15 |
|
|
8
10
|
| v0.5.1 | Implemented | 2025-01-15 |
|
|
9
11
|
| v0.5.0 | Implemented | 2025-01-15 |
|
|
10
12
|
| v0.4.0 | Implemented | 2025-01-15 |
|
|
@@ -12,9 +14,11 @@
|
|
|
12
14
|
| v0.2.0 | Implemented | 2025-01-14 |
|
|
13
15
|
| v0.1.0 | Implemented | 2025-01-14 |
|
|
14
16
|
|
|
15
|
-
**Current State**: v0.
|
|
17
|
+
**Current State**: v0.8.0 complete. 465 tests passing.
|
|
16
18
|
|
|
17
19
|
### Version History
|
|
20
|
+
- **v0.8.0**: Terminal UI with Rich dashboard, mission control aesthetic, transmission log
|
|
21
|
+
- **v0.7.0**: Markitdown integration (20+ formats), Parquet support, LLM-generated parsers
|
|
18
22
|
- **v0.6.0**: Latency metrics, import consolidation, cleaning report, dry-run mode
|
|
19
23
|
- **v0.5.1**: Dangerous code detection (AST-based security)
|
|
20
24
|
- **v0.5.0**: Two-pass optimization with LLM agency (consolidation, early termination)
|
|
@@ -69,6 +73,8 @@ cleaner = DataCleaner(
|
|
|
69
73
|
# Observability (v0.6.0)
|
|
70
74
|
report_path="cleaning_report.md", # Generate markdown report (None to disable)
|
|
71
75
|
dry_run=False, # Set True to analyze without generating functions
|
|
76
|
+
# Terminal UI (v0.8.0)
|
|
77
|
+
tui=True, # Enable Rich dashboard (requires pip install recursive-cleaner[tui])
|
|
72
78
|
)
|
|
73
79
|
|
|
74
80
|
cleaner.run() # Outputs: cleaning_functions.py, cleaning_report.md
|
|
@@ -159,6 +165,7 @@ recursive_cleaner/
|
|
|
159
165
|
report.py # Markdown report generation (~120 lines) [v0.6.0]
|
|
160
166
|
response.py # XML/markdown parsing + agency dataclasses (~292 lines)
|
|
161
167
|
schema.py # Schema inference (~117 lines) [v0.2.0]
|
|
168
|
+
tui.py # Rich terminal dashboard (~520 lines) [v0.8.0]
|
|
162
169
|
types.py # LLMBackend protocol (~11 lines)
|
|
163
170
|
validation.py # Runtime validation + safety checks (~200 lines)
|
|
164
171
|
vendor/
|
|
@@ -187,6 +194,7 @@ tests/ # 392 tests
|
|
|
187
194
|
test_sampling.py # Sampling strategy tests [v0.4.0]
|
|
188
195
|
test_schema.py # Schema inference tests
|
|
189
196
|
test_text_mode.py # Text mode tests [v0.3.0]
|
|
197
|
+
test_tui.py # Terminal UI tests [v0.8.0]
|
|
190
198
|
test_validation.py # Runtime validation + safety tests
|
|
191
199
|
test_vendor_chunker.py # Vendored chunker tests [v0.3.0]
|
|
192
200
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: recursive-cleaner
|
|
3
|
-
Version: 0.7.1
|
|
3
|
+
Version: 0.8.0
|
|
4
4
|
Summary: LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions
|
|
5
5
|
Project-URL: Homepage, https://github.com/gaztrabisme/recursive-data-cleaner
|
|
6
6
|
Project-URL: Repository, https://github.com/gaztrabisme/recursive-data-cleaner
|
|
@@ -32,6 +32,8 @@ Provides-Extra: mlx
|
|
|
32
32
|
Requires-Dist: mlx-lm>=0.10.0; extra == 'mlx'
|
|
33
33
|
Provides-Extra: parquet
|
|
34
34
|
Requires-Dist: pyarrow>=14.0.0; extra == 'parquet'
|
|
35
|
+
Provides-Extra: tui
|
|
36
|
+
Requires-Dist: rich>=13.0; extra == 'tui'
|
|
35
37
|
Description-Content-Type: text/markdown
|
|
36
38
|
|
|
37
39
|
# Recursive Data Cleaner
|
|
@@ -69,6 +71,11 @@ For Parquet files:
|
|
|
69
71
|
pip install -e ".[parquet]"
|
|
70
72
|
```
|
|
71
73
|
|
|
74
|
+
For Terminal UI (Rich dashboard):
|
|
75
|
+
```bash
|
|
76
|
+
pip install -e ".[tui]"
|
|
77
|
+
```
|
|
78
|
+
|
|
72
79
|
## Quick Start
|
|
73
80
|
|
|
74
81
|
```python
|
|
@@ -126,6 +133,13 @@ cleaner.run() # Generates cleaning_functions.py
|
|
|
126
133
|
- **Parquet Support**: Load parquet files as structured data via pyarrow
|
|
127
134
|
- **LLM-Generated Parsers**: Auto-generate parsers for XML and unknown formats (`auto_parse=True`)
|
|
128
135
|
|
|
136
|
+
### Terminal UI (v0.8.0)
|
|
137
|
+
- **Mission Control Dashboard**: Rich-based live terminal UI with retro aesthetic
|
|
138
|
+
- **Real-time Progress**: Animated progress bars, chunk/iteration counters
|
|
139
|
+
- **Transmission Log**: Parsed LLM responses showing issues detected and functions being generated
|
|
140
|
+
- **Token Estimation**: Track estimated input/output tokens across the run
|
|
141
|
+
- **Graceful Fallback**: Works without Rich installed (falls back to callbacks)
|
|
142
|
+
|
|
129
143
|
## Configuration
|
|
130
144
|
|
|
131
145
|
```python
|
|
@@ -160,6 +174,9 @@ cleaner = DataCleaner(
|
|
|
160
174
|
# Format Expansion
|
|
161
175
|
auto_parse=False, # LLM generates parser for unknown formats
|
|
162
176
|
|
|
177
|
+
# Terminal UI
|
|
178
|
+
tui=True, # Enable Rich dashboard (requires [tui] extra)
|
|
179
|
+
|
|
163
180
|
# Progress & State
|
|
164
181
|
on_progress=callback, # Progress event callback
|
|
165
182
|
state_file="state.json", # Enable resume on interrupt
|
|
@@ -265,6 +282,7 @@ recursive_cleaner/
|
|
|
265
282
|
├── report.py # Markdown report generation
|
|
266
283
|
├── response.py # XML/markdown parsing + agency dataclasses
|
|
267
284
|
├── schema.py # Schema inference
|
|
285
|
+
├── tui.py # Rich terminal dashboard
|
|
268
286
|
├── validation.py # Runtime validation + holdout
|
|
269
287
|
└── vendor/
|
|
270
288
|
└── chunker.py # Vendored sentence-aware chunker
|
|
@@ -276,7 +294,7 @@ recursive_cleaner/
|
|
|
276
294
|
pytest tests/ -v
|
|
277
295
|
```
|
|
278
296
|
|
|
279
|
-
|
|
297
|
+
465 tests covering all features. Test datasets in `test_cases/`:
|
|
280
298
|
- E-commerce product catalogs
|
|
281
299
|
- Healthcare patient records
|
|
282
300
|
- Financial transaction data
|
|
@@ -292,6 +310,7 @@ pytest tests/ -v
|
|
|
292
310
|
|
|
293
311
|
| Version | Features |
|
|
294
312
|
|---------|----------|
|
|
313
|
+
| v0.8.0 | Terminal UI with Rich dashboard, mission control aesthetic, transmission log |
|
|
295
314
|
| v0.7.0 | Markitdown (20+ formats), Parquet support, LLM-generated parsers |
|
|
296
315
|
| v0.6.0 | Latency metrics, import consolidation, cleaning report, dry-run mode |
|
|
297
316
|
| v0.5.1 | Dangerous code detection (AST-based security) |
|
|
@@ -33,6 +33,11 @@ For Parquet files:
|
|
|
33
33
|
pip install -e ".[parquet]"
|
|
34
34
|
```
|
|
35
35
|
|
|
36
|
+
For Terminal UI (Rich dashboard):
|
|
37
|
+
```bash
|
|
38
|
+
pip install -e ".[tui]"
|
|
39
|
+
```
|
|
40
|
+
|
|
36
41
|
## Quick Start
|
|
37
42
|
|
|
38
43
|
```python
|
|
@@ -90,6 +95,13 @@ cleaner.run() # Generates cleaning_functions.py
|
|
|
90
95
|
- **Parquet Support**: Load parquet files as structured data via pyarrow
|
|
91
96
|
- **LLM-Generated Parsers**: Auto-generate parsers for XML and unknown formats (`auto_parse=True`)
|
|
92
97
|
|
|
98
|
+
### Terminal UI (v0.8.0)
|
|
99
|
+
- **Mission Control Dashboard**: Rich-based live terminal UI with retro aesthetic
|
|
100
|
+
- **Real-time Progress**: Animated progress bars, chunk/iteration counters
|
|
101
|
+
- **Transmission Log**: Parsed LLM responses showing issues detected and functions being generated
|
|
102
|
+
- **Token Estimation**: Track estimated input/output tokens across the run
|
|
103
|
+
- **Graceful Fallback**: Works without Rich installed (falls back to callbacks)
|
|
104
|
+
|
|
93
105
|
## Configuration
|
|
94
106
|
|
|
95
107
|
```python
|
|
@@ -124,6 +136,9 @@ cleaner = DataCleaner(
|
|
|
124
136
|
# Format Expansion
|
|
125
137
|
auto_parse=False, # LLM generates parser for unknown formats
|
|
126
138
|
|
|
139
|
+
# Terminal UI
|
|
140
|
+
tui=True, # Enable Rich dashboard (requires [tui] extra)
|
|
141
|
+
|
|
127
142
|
# Progress & State
|
|
128
143
|
on_progress=callback, # Progress event callback
|
|
129
144
|
state_file="state.json", # Enable resume on interrupt
|
|
@@ -229,6 +244,7 @@ recursive_cleaner/
|
|
|
229
244
|
├── report.py # Markdown report generation
|
|
230
245
|
├── response.py # XML/markdown parsing + agency dataclasses
|
|
231
246
|
├── schema.py # Schema inference
|
|
247
|
+
├── tui.py # Rich terminal dashboard
|
|
232
248
|
├── validation.py # Runtime validation + holdout
|
|
233
249
|
└── vendor/
|
|
234
250
|
└── chunker.py # Vendored sentence-aware chunker
|
|
@@ -240,7 +256,7 @@ recursive_cleaner/
|
|
|
240
256
|
pytest tests/ -v
|
|
241
257
|
```
|
|
242
258
|
|
|
243
|
-
|
|
259
|
+
465 tests covering all features. Test datasets in `test_cases/`:
|
|
244
260
|
- E-commerce product catalogs
|
|
245
261
|
- Healthcare patient records
|
|
246
262
|
- Financial transaction data
|
|
@@ -256,6 +272,7 @@ pytest tests/ -v
|
|
|
256
272
|
|
|
257
273
|
| Version | Features |
|
|
258
274
|
|---------|----------|
|
|
275
|
+
| v0.8.0 | Terminal UI with Rich dashboard, mission control aesthetic, transmission log |
|
|
259
276
|
| v0.7.0 | Markitdown (20+ formats), Parquet support, LLM-generated parsers |
|
|
260
277
|
| v0.6.0 | Latency metrics, import consolidation, cleaning report, dry-run mode |
|
|
261
278
|
| v0.5.1 | Dangerous code detection (AST-based security) |
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Demo script to showcase the Rich TUI with real MLX backend.
|
|
4
|
+
|
|
5
|
+
Run with:
|
|
6
|
+
python demo_tui.py
|
|
7
|
+
|
|
8
|
+
Requirements:
|
|
9
|
+
pip install recursive-cleaner[mlx,tui]
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from backends import MLXBackend
|
|
13
|
+
from recursive_cleaner import DataCleaner
|
|
14
|
+
|
|
15
|
+
# Model used for the demo (large 80B MoE checkpoint; swap in a smaller/faster model if preferred)
|
|
16
|
+
MODEL = "lmstudio-community/Qwen3-Next-80B-A3B-Instruct-MLX-4bit"
|
|
17
|
+
|
|
18
|
+
print("=" * 60)
|
|
19
|
+
print(" RECURSIVE DATA CLEANER - TUI DEMO")
|
|
20
|
+
print("=" * 60)
|
|
21
|
+
print(f"\nLoading model: {MODEL}")
|
|
22
|
+
print("This may take a moment on first run...\n")
|
|
23
|
+
|
|
24
|
+
llm = MLXBackend(
|
|
25
|
+
model_path=MODEL,
|
|
26
|
+
max_tokens=2048,
|
|
27
|
+
temperature=0.3, # Lower for more consistent output
|
|
28
|
+
verbose=False, # Disable token streaming to avoid interfering with TUI
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
cleaner = DataCleaner(
|
|
32
|
+
llm_backend=llm,
|
|
33
|
+
file_path="test_cases/ecommerce_products.jsonl",
|
|
34
|
+
chunk_size=5, # Small chunks for demo
|
|
35
|
+
max_iterations=3, # Limit iterations per chunk
|
|
36
|
+
instructions="""
|
|
37
|
+
E-commerce product data cleaning:
|
|
38
|
+
- Normalize prices to float (remove $ symbols)
|
|
39
|
+
- Fix category typos and normalize to Title Case
|
|
40
|
+
- Convert weights to kg as float
|
|
41
|
+
- Ensure stock_quantity is non-negative integer
|
|
42
|
+
""",
|
|
43
|
+
tui=True, # Enable the Rich dashboard!
|
|
44
|
+
track_metrics=True,
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
print("\nStarting cleaner with TUI enabled...")
|
|
48
|
+
print("Watch the dashboard below!\n")
|
|
49
|
+
|
|
50
|
+
cleaner.run()
|
|
51
|
+
|
|
52
|
+
print("\n" + "=" * 60)
|
|
53
|
+
print("Demo complete! Check cleaning_functions.py for output.")
|
|
54
|
+
print("=" * 60)
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# API Contract: Rich TUI (v0.8.0)
|
|
2
|
+
|
|
3
|
+
## New Parameter
|
|
4
|
+
|
|
5
|
+
```python
|
|
6
|
+
DataCleaner(
|
|
7
|
+
...,
|
|
8
|
+
tui: bool = False, # Enable Rich terminal dashboard
|
|
9
|
+
)
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
## Behavior Matrix
|
|
13
|
+
|
|
14
|
+
| `tui` | Rich installed | Behavior |
|
|
15
|
+
|-------|----------------|----------|
|
|
16
|
+
| `False` | Any | Existing callback-based output (no change) |
|
|
17
|
+
| `True` | Yes | Live dashboard replaces console output (`on_progress` callbacks still fire) |
|
|
18
|
+
| `True` | No | Warning logged, falls back to callbacks |
|
|
19
|
+
|
|
20
|
+
## New Optional Dependency
|
|
21
|
+
|
|
22
|
+
```toml
|
|
23
|
+
[project.optional-dependencies]
|
|
24
|
+
tui = ["rich>=13.0"]
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install "recursive-cleaner[tui]"
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## TUI Module API
|
|
32
|
+
|
|
33
|
+
### `recursive_cleaner/tui.py`
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
# Check availability
|
|
37
|
+
HAS_RICH: bool
|
|
38
|
+
|
|
39
|
+
# Main renderer class
|
|
40
|
+
class TUIRenderer:
|
|
41
|
+
def __init__(self, file_path: str, total_chunks: int, total_records: int)
|
|
42
|
+
def start(self) -> None
|
|
43
|
+
def stop(self) -> None
|
|
44
|
+
def update_chunk(self, chunk_index: int, iteration: int, max_iterations: int) -> None
|
|
45
|
+
def update_llm_status(self, status: str) -> None # "calling" | "idle"
|
|
46
|
+
def add_function(self, name: str, docstring: str) -> None
|
|
47
|
+
def update_metrics(self, quality_delta: float, latency_last: float, latency_avg: float, latency_total: float, llm_calls: int) -> None
|
|
48
|
+
def show_complete(self, summary: dict) -> None
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Integration with DataCleaner
|
|
52
|
+
|
|
53
|
+
When `tui=True` and Rich available:
|
|
54
|
+
1. `on_progress` callback still fires (for logging, state tracking)
|
|
55
|
+
2. TUI replaces console output, not callbacks
|
|
56
|
+
3. TUI auto-stops on completion or error
|
|
57
|
+
|
|
58
|
+
## No Breaking Changes
|
|
59
|
+
|
|
60
|
+
- All existing parameters unchanged
|
|
61
|
+
- All existing callbacks unchanged
|
|
62
|
+
- `tui=False` (default) = identical to v0.7.0 behavior
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# Data Schema: TUI Display State (v0.8.0)
|
|
2
|
+
|
|
3
|
+
## Dashboard State
|
|
4
|
+
|
|
5
|
+
```python
|
|
6
|
+
@dataclass
|
|
7
|
+
class TUIState:
|
|
8
|
+
# Header
|
|
9
|
+
file_path: str
|
|
10
|
+
total_records: int
|
|
11
|
+
version: str = "0.8.0"
|
|
12
|
+
|
|
13
|
+
# Progress
|
|
14
|
+
current_chunk: int = 0
|
|
15
|
+
total_chunks: int = 0
|
|
16
|
+
current_iteration: int = 0
|
|
17
|
+
max_iterations: int = 5
|
|
18
|
+
|
|
19
|
+
# LLM Status
|
|
20
|
+
llm_status: Literal["idle", "calling"] = "idle"
|
|
21
|
+
|
|
22
|
+
# Functions
|
|
23
|
+
functions: list[FunctionInfo] = field(default_factory=list)
|
|
24
|
+
|
|
25
|
+
# Metrics
|
|
26
|
+
quality_delta: float = 0.0 # Percentage improvement
|
|
27
|
+
latency_last_ms: float = 0.0
|
|
28
|
+
latency_avg_ms: float = 0.0
|
|
29
|
+
latency_total_ms: float = 0.0
|
|
30
|
+
llm_call_count: int = 0
|
|
31
|
+
|
|
32
|
+
@dataclass
|
|
33
|
+
class FunctionInfo:
|
|
34
|
+
name: str
|
|
35
|
+
docstring: str # First 50 chars displayed
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Dashboard Layout Schema
|
|
39
|
+
|
|
40
|
+
```
|
|
41
|
+
┌─────────────────────────────────────────────────────────┐
|
|
42
|
+
│ {file_path} v{version} │ <- HEADER (size=3)
|
|
43
|
+
├────────────────────┬────────────────────────────────────┤
|
|
44
|
+
│ PROGRESS │ FUNCTIONS ({len(functions)}) │ <- BODY
|
|
45
|
+
│ [████░░░░░░] {%} │ ├─ {functions[0].name} │
|
|
46
|
+
│ Chunk {cur}/{tot} │ ├─ {functions[1].name} │
|
|
47
|
+
│ Iter {i}/{max} │ └─ {functions[2].name} │
|
|
48
|
+
│ │ (+{n} more) │
|
|
49
|
+
│ {spinner} {status}│ QUALITY: +{quality_delta}% │
|
|
50
|
+
├────────────────────┴────────────────────────────────────┤
|
|
51
|
+
│ ⏱️ {latency_last}ms │ avg {latency_avg}ms │ {llm_calls} │ <- FOOTER (size=3)
|
|
52
|
+
└─────────────────────────────────────────────────────────┘
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Color Scheme
|
|
56
|
+
|
|
57
|
+
| Element | Color | Condition |
|
|
58
|
+
|---------|-------|-----------|
|
|
59
|
+
| Header title | cyan | Always |
|
|
60
|
+
| Progress bar | yellow | In progress |
|
|
61
|
+
| Progress bar | green | Chunk complete |
|
|
62
|
+
| Spinner | yellow | LLM calling |
|
|
63
|
+
| Function names | green | Always |
|
|
64
|
+
| Quality delta | green | Positive |
|
|
65
|
+
| Quality delta | red | Negative |
|
|
66
|
+
| Latency | dim white | Always |
|
|
67
|
+
|
|
68
|
+
## Spinner States
|
|
69
|
+
|
|
70
|
+
| `llm_status` | Display |
|
|
71
|
+
|--------------|---------|
|
|
72
|
+
| `"calling"` | Animated spinner + "Calling LLM..." |
|
|
73
|
+
| `"idle"` | Static checkmark or empty |
|
|
74
|
+
|
|
75
|
+
## Completion Summary
|
|
76
|
+
|
|
77
|
+
On `show_complete()`:
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
┌─────────────────────────────────────────────────────────┐
|
|
81
|
+
│ ✓ COMPLETE │
|
|
82
|
+
├─────────────────────────────────────────────────────────┤
|
|
83
|
+
│ Functions generated: {n} │
|
|
84
|
+
│ Chunks processed: {total_chunks} │
|
|
85
|
+
│ Quality improvement: +{quality_delta}% │
|
|
86
|
+
│ Total time: {latency_total}ms ({llm_calls} LLM calls) │
|
|
87
|
+
│ │
|
|
88
|
+
│ Output: cleaning_functions.py │
|
|
89
|
+
└─────────────────────────────────────────────────────────┘
|
|
90
|
+
```
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# Success Criteria: Rich TUI (v0.8.0)
|
|
2
|
+
|
|
3
|
+
## Project-Level Success
|
|
4
|
+
|
|
5
|
+
- [ ] `pip install recursive-cleaner[tui]` installs rich>=13.0
|
|
6
|
+
- [ ] `DataCleaner(..., tui=True)` shows live dashboard
|
|
7
|
+
- [ ] Dashboard displays all state from data schema contract
|
|
8
|
+
- [ ] Falls back gracefully when Rich not installed
|
|
9
|
+
- [ ] All 432 existing tests pass
|
|
10
|
+
- [ ] Zero breaking changes to existing API
|
|
11
|
+
|
|
12
|
+
## Phase 1: Core TUI Module
|
|
13
|
+
|
|
14
|
+
**Deliverables:**
|
|
15
|
+
- [ ] `recursive_cleaner/tui.py` with `TUIRenderer` class
|
|
16
|
+
- [ ] `HAS_RICH` check with graceful import
|
|
17
|
+
- [ ] Basic `start()` / `stop()` lifecycle
|
|
18
|
+
- [ ] Static layout matching schema (header, body split, footer)
|
|
19
|
+
|
|
20
|
+
**Success Criteria:**
|
|
21
|
+
- [ ] `from recursive_cleaner.tui import TUIRenderer, HAS_RICH` works
|
|
22
|
+
- [ ] `TUIRenderer` can be instantiated without Rich (no crash)
|
|
23
|
+
- [ ] With Rich: `start()` shows layout, `stop()` exits cleanly
|
|
24
|
+
- [ ] Layout has correct sections per data schema
|
|
25
|
+
|
|
26
|
+
**Tests:**
|
|
27
|
+
- [ ] test_tui_import_without_rich
|
|
28
|
+
- [ ] test_tui_renderer_lifecycle
|
|
29
|
+
- [ ] test_tui_layout_structure
|
|
30
|
+
|
|
31
|
+
## Phase 2: Dynamic Updates
|
|
32
|
+
|
|
33
|
+
**Deliverables:**
|
|
34
|
+
- [ ] `update_chunk()` updates progress bar and counters
|
|
35
|
+
- [ ] `update_llm_status()` shows/hides spinner
|
|
36
|
+
- [ ] `add_function()` appends to function list
|
|
37
|
+
- [ ] `update_metrics()` updates footer stats
|
|
38
|
+
|
|
39
|
+
**Success Criteria:**
|
|
40
|
+
- [ ] Progress bar fills based on chunk_index/total_chunks
|
|
41
|
+
- [ ] Spinner animates when status="calling", stops when "idle"
|
|
42
|
+
- [ ] Functions list grows, shows "+N more" when >5 functions
|
|
43
|
+
- [ ] Metrics panel shows formatted latency and counts
|
|
44
|
+
|
|
45
|
+
**Tests:**
|
|
46
|
+
- [ ] test_progress_updates
|
|
47
|
+
- [ ] test_spinner_states
|
|
48
|
+
- [ ] test_function_list_display
|
|
49
|
+
- [ ] test_metrics_display
|
|
50
|
+
|
|
51
|
+
## Phase 3: Integration & Polish
|
|
52
|
+
|
|
53
|
+
**Deliverables:**
|
|
54
|
+
- [ ] `tui=True` parameter on DataCleaner
|
|
55
|
+
- [ ] Integration: TUI updates from cleaner loop
|
|
56
|
+
- [ ] `show_complete()` with summary panel
|
|
57
|
+
- [ ] Fallback warning when Rich not installed
|
|
58
|
+
- [ ] Color transitions (yellow→green on chunk complete)
|
|
59
|
+
|
|
60
|
+
**Success Criteria:**
|
|
61
|
+
- [ ] Full cleaner run with `tui=True` shows live dashboard
|
|
62
|
+
- [ ] Completion shows summary with all stats
|
|
63
|
+
- [ ] `tui=True` without Rich logs warning, uses callbacks
|
|
64
|
+
- [ ] Chunk completion triggers green color flash
|
|
65
|
+
|
|
66
|
+
**Tests:**
|
|
67
|
+
- [ ] test_datacleaner_tui_integration
|
|
68
|
+
- [ ] test_tui_fallback_warning
|
|
69
|
+
- [ ] test_completion_summary
|
|
70
|
+
- [ ] test_color_transitions
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
# Implementation Plan: Rich TUI (v0.8.0)
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
Add optional Rich-based terminal dashboard for visual progress tracking during data cleaning runs.
|
|
6
|
+
|
|
7
|
+
## Technology Stack
|
|
8
|
+
|
|
9
|
+
| Layer | Choice | Rationale |
|
|
10
|
+
|-------|--------|-----------|
|
|
11
|
+
| TUI Library | Rich >=13.0 | Simple API, same author as Textual, 50KB |
|
|
12
|
+
| Pattern | Live + Layout | Mission control style, update sections independently |
|
|
13
|
+
| Fallback | Plain callbacks | Zero-dep baseline preserved |
|
|
14
|
+
|
|
15
|
+
## Phase Breakdown
|
|
16
|
+
|
|
17
|
+
### Phase 1: Core TUI Module
|
|
18
|
+
|
|
19
|
+
**Objective:** Create standalone TUI renderer with basic layout.
|
|
20
|
+
|
|
21
|
+
**Deliverables:**
|
|
22
|
+
- [ ] `recursive_cleaner/tui.py` (~150 lines)
|
|
23
|
+
- [ ] `tests/test_tui.py` (basic tests)
|
|
24
|
+
- [ ] `pyproject.toml` update for `[tui]` extra
|
|
25
|
+
|
|
26
|
+
**Implementation:**
|
|
27
|
+
```python
|
|
28
|
+
# tui.py structure
|
|
29
|
+
try:
|
|
30
|
+
from rich.live import Live
|
|
31
|
+
from rich.layout import Layout
|
|
32
|
+
from rich.panel import Panel
|
|
33
|
+
HAS_RICH = True
|
|
34
|
+
except ImportError:
|
|
35
|
+
HAS_RICH = False
|
|
36
|
+
|
|
37
|
+
class TUIRenderer:
|
|
38
|
+
def __init__(self, file_path, total_chunks, total_records):
|
|
39
|
+
self._state = TUIState(...)
|
|
40
|
+
self._layout = self._make_layout() if HAS_RICH else None
|
|
41
|
+
self._live = None
|
|
42
|
+
|
|
43
|
+
def _make_layout(self):
|
|
44
|
+
layout = Layout()
|
|
45
|
+
layout.split_column(
|
|
46
|
+
Layout(name="header", size=3),
|
|
47
|
+
Layout(name="body"),
|
|
48
|
+
Layout(name="footer", size=3)
|
|
49
|
+
)
|
|
50
|
+
layout["body"].split_row(
|
|
51
|
+
Layout(name="progress"),
|
|
52
|
+
Layout(name="functions")
|
|
53
|
+
)
|
|
54
|
+
return layout
|
|
55
|
+
|
|
56
|
+
def start(self):
|
|
57
|
+
if not HAS_RICH:
|
|
58
|
+
return
|
|
59
|
+
self._live = Live(self._layout, refresh_per_second=2)
|
|
60
|
+
self._live.start()
|
|
61
|
+
|
|
62
|
+
def stop(self):
|
|
63
|
+
if self._live:
|
|
64
|
+
self._live.stop()
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
**Success Criteria:**
|
|
68
|
+
- Import works with/without Rich
|
|
69
|
+
- Layout renders with correct sections
|
|
70
|
+
- Start/stop lifecycle works
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
### Phase 2: Dynamic Updates
|
|
75
|
+
|
|
76
|
+
**Objective:** Wire up all state updates to visual components.
|
|
77
|
+
|
|
78
|
+
**Deliverables:**
|
|
79
|
+
- [ ] `update_chunk()` - progress bar + counters
|
|
80
|
+
- [ ] `update_llm_status()` - spinner control
|
|
81
|
+
- [ ] `add_function()` - function list panel
|
|
82
|
+
- [ ] `update_metrics()` - footer stats
|
|
83
|
+
- [ ] Additional tests for each update method
|
|
84
|
+
|
|
85
|
+
**Implementation:**
|
|
86
|
+
```python
|
|
87
|
+
def update_chunk(self, chunk_index, iteration, max_iterations):
|
|
88
|
+
self._state.current_chunk = chunk_index
|
|
89
|
+
self._state.current_iteration = iteration
|
|
90
|
+
self._refresh_progress_panel()
|
|
91
|
+
|
|
92
|
+
def _refresh_progress_panel(self):
|
|
93
|
+
progress = Progress(BarColumn(), TextColumn("{task.percentage:.0f}%"))
|
|
94
|
+
task = progress.add_task("", total=self._state.total_chunks)
|
|
95
|
+
progress.update(task, completed=self._state.current_chunk)
|
|
96
|
+
|
|
97
|
+
content = Group(
|
|
98
|
+
progress,
|
|
99
|
+
Text(f"Chunk {self._state.current_chunk}/{self._state.total_chunks}"),
|
|
100
|
+
Text(f"Iteration {self._state.current_iteration}/{self._state.max_iterations}"),
|
|
101
|
+
self._make_spinner()
|
|
102
|
+
)
|
|
103
|
+
self._layout["progress"].update(Panel(content, title="Progress"))
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
**Success Criteria:**
|
|
107
|
+
- Progress bar animates smoothly
|
|
108
|
+
- Spinner shows during LLM calls
|
|
109
|
+
- Function list grows dynamically
|
|
110
|
+
- Metrics update in real-time
|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
### Phase 3: Integration & Polish
|
|
115
|
+
|
|
116
|
+
**Objective:** Connect TUI to DataCleaner and add finishing touches.
|
|
117
|
+
|
|
118
|
+
**Deliverables:**
|
|
119
|
+
- [ ] `tui=True` parameter on DataCleaner.__init__
|
|
120
|
+
- [ ] TUI updates from main processing loop
|
|
121
|
+
- [ ] `show_complete()` summary panel
|
|
122
|
+
- [ ] Fallback warning via logging
|
|
123
|
+
- [ ] Color transitions on chunk completion
|
|
124
|
+
- [ ] Integration tests
|
|
125
|
+
|
|
126
|
+
**Implementation in cleaner.py:**
|
|
127
|
+
```python
|
|
128
|
+
def __init__(self, ..., tui: bool = False):
|
|
129
|
+
self.tui = tui
|
|
130
|
+
self._tui_renderer = None
|
|
131
|
+
|
|
132
|
+
def run(self):
|
|
133
|
+
if self.tui:
|
|
134
|
+
from recursive_cleaner.tui import TUIRenderer, HAS_RICH
|
|
135
|
+
if HAS_RICH:
|
|
136
|
+
self._tui_renderer = TUIRenderer(...)
|
|
137
|
+
self._tui_renderer.start()
|
|
138
|
+
else:
|
|
139
|
+
import logging
|
|
140
|
+
logging.warning("tui=True but Rich not installed. pip install recursive-cleaner[tui]")
|
|
141
|
+
|
|
142
|
+
# ... existing loop with TUI updates injected ...
|
|
143
|
+
|
|
144
|
+
if self._tui_renderer:
|
|
145
|
+
self._tui_renderer.show_complete(summary)
|
|
146
|
+
self._tui_renderer.stop()
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
**Success Criteria:**
|
|
150
|
+
- Full run with tui=True shows dashboard
|
|
151
|
+
- Fallback logs warning, uses callbacks
|
|
152
|
+
- Completion summary displays all stats
|
|
153
|
+
- Green flash on chunk completion
|
|
154
|
+
|
|
155
|
+
---
|
|
156
|
+
|
|
157
|
+
## Risk Register
|
|
158
|
+
|
|
159
|
+
| Risk | Likelihood | Impact | Mitigation |
|
|
160
|
+
|------|------------|--------|------------|
|
|
161
|
+
| Terminal size too small | Low | Medium | Use `vertical_overflow="crop"` |
|
|
162
|
+
| Rich version incompatibility | Low | Medium | Require `>=13.0` as a lower bound (stable API) |
|
|
163
|
+
| Performance overhead | Low | Low | refresh_per_second=2 is fine |
|
|
164
|
+
|
|
165
|
+
## Out of Scope
|
|
166
|
+
|
|
167
|
+
- Keyboard interactivity (pause/resume)
|
|
168
|
+
- Mouse support
|
|
169
|
+
- Scrollable function list
|
|
170
|
+
- Custom themes
|
|
171
|
+
- Textual upgrade path
|
|
172
|
+
|
|
173
|
+
## File Changes Summary
|
|
174
|
+
|
|
175
|
+
| File | Change |
|
|
176
|
+
|------|--------|
|
|
177
|
+
| `recursive_cleaner/tui.py` | NEW (~200 lines) |
|
|
178
|
+
| `recursive_cleaner/cleaner.py` | Add `tui` param, TUI integration |
|
|
179
|
+
| `recursive_cleaner/__init__.py` | Export TUIRenderer, HAS_RICH |
|
|
180
|
+
| `pyproject.toml` | Add `[tui]` optional dependency |
|
|
181
|
+
| `tests/test_tui.py` | NEW (~15 tests) |
|
|
182
|
+
| `README.md` | Document TUI feature |
|