recursive-cleaner 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- recursive_cleaner-0.6.0/.gitignore +57 -0
- recursive_cleaner-0.6.0/CLAUDE.md +455 -0
- recursive_cleaner-0.6.0/CLAUDE_ADVANCED.md +955 -0
- recursive_cleaner-0.6.0/LICENSE +21 -0
- recursive_cleaner-0.6.0/PKG-INFO +282 -0
- recursive_cleaner-0.6.0/README.md +250 -0
- recursive_cleaner-0.6.0/TODO.md +117 -0
- recursive_cleaner-0.6.0/backends/__init__.py +5 -0
- recursive_cleaner-0.6.0/backends/mlx_backend.py +95 -0
- recursive_cleaner-0.6.0/docs/contracts/api-contract.md +209 -0
- recursive_cleaner-0.6.0/docs/contracts/data-schema.md +101 -0
- recursive_cleaner-0.6.0/docs/contracts/success-criteria.md +114 -0
- recursive_cleaner-0.6.0/docs/contracts/text-mode-contract.md +178 -0
- recursive_cleaner-0.6.0/docs/contracts/tier2-contract.md +178 -0
- recursive_cleaner-0.6.0/docs/contracts/tier4-contract.md +203 -0
- recursive_cleaner-0.6.0/docs/contracts/tier4-success-criteria.md +108 -0
- recursive_cleaner-0.6.0/docs/contracts/two-pass-contract.md +272 -0
- recursive_cleaner-0.6.0/docs/handoffs/tier4-handoff.md +63 -0
- recursive_cleaner-0.6.0/docs/implementation-plan-tier4.md +132 -0
- recursive_cleaner-0.6.0/docs/implementation-plan-v03.md +95 -0
- recursive_cleaner-0.6.0/docs/implementation-plan-v04.md +148 -0
- recursive_cleaner-0.6.0/docs/implementation-plan-v05.md +190 -0
- recursive_cleaner-0.6.0/docs/implementation-plan.md +184 -0
- recursive_cleaner-0.6.0/docs/langchain-analysis.md +640 -0
- recursive_cleaner-0.6.0/docs/langgraph-analysis.md +507 -0
- recursive_cleaner-0.6.0/docs/mlx-lm-guide.md +425 -0
- recursive_cleaner-0.6.0/docs/other-frameworks-analysis.md +386 -0
- recursive_cleaner-0.6.0/docs/refactor-assessment/data/dependency.json +368 -0
- recursive_cleaner-0.6.0/docs/refactor-assessment/data/stats.json +5070 -0
- recursive_cleaner-0.6.0/docs/refactor-assessment/plan.md +75 -0
- recursive_cleaner-0.6.0/docs/refactor-assessment/report.md +149 -0
- recursive_cleaner-0.6.0/docs/research/chonkie-extraction.md +357 -0
- recursive_cleaner-0.6.0/docs/research/chonkie.md +367 -0
- recursive_cleaner-0.6.0/docs/research/markitdown.md +513 -0
- recursive_cleaner-0.6.0/docs/smolagents-analysis.md +545 -0
- recursive_cleaner-0.6.0/docs/workflow-state.md +45 -0
- recursive_cleaner-0.6.0/pyproject.toml +60 -0
- recursive_cleaner-0.6.0/recursive_cleaner/__init__.py +46 -0
- recursive_cleaner-0.6.0/recursive_cleaner/cleaner.py +628 -0
- recursive_cleaner-0.6.0/recursive_cleaner/context.py +27 -0
- recursive_cleaner-0.6.0/recursive_cleaner/dependencies.py +59 -0
- recursive_cleaner-0.6.0/recursive_cleaner/errors.py +17 -0
- recursive_cleaner-0.6.0/recursive_cleaner/metrics.py +163 -0
- recursive_cleaner-0.6.0/recursive_cleaner/optimizer.py +336 -0
- recursive_cleaner-0.6.0/recursive_cleaner/output.py +197 -0
- recursive_cleaner-0.6.0/recursive_cleaner/parsers.py +325 -0
- recursive_cleaner-0.6.0/recursive_cleaner/prompt.py +218 -0
- recursive_cleaner-0.6.0/recursive_cleaner/report.py +138 -0
- recursive_cleaner-0.6.0/recursive_cleaner/response.py +292 -0
- recursive_cleaner-0.6.0/recursive_cleaner/schema.py +117 -0
- recursive_cleaner-0.6.0/recursive_cleaner/types.py +11 -0
- recursive_cleaner-0.6.0/recursive_cleaner/validation.py +202 -0
- recursive_cleaner-0.6.0/recursive_cleaner/vendor/__init__.py +4 -0
- recursive_cleaner-0.6.0/recursive_cleaner/vendor/chunker.py +187 -0
- recursive_cleaner-0.6.0/test_cases/ecommerce_instructions.txt +12 -0
- recursive_cleaner-0.6.0/test_cases/ecommerce_products.jsonl +60 -0
- recursive_cleaner-0.6.0/test_cases/financial_instructions.txt +14 -0
- recursive_cleaner-0.6.0/test_cases/financial_transactions.jsonl +60 -0
- recursive_cleaner-0.6.0/test_cases/healthcare_instructions.txt +14 -0
- recursive_cleaner-0.6.0/test_cases/healthcare_patients.jsonl +55 -0
- recursive_cleaner-0.6.0/test_cases/run_ecommerce_test.py +45 -0
- recursive_cleaner-0.6.0/test_cases/run_financial_test.py +47 -0
- recursive_cleaner-0.6.0/test_cases/run_healthcare_test.py +47 -0
- recursive_cleaner-0.6.0/tests/__init__.py +1 -0
- recursive_cleaner-0.6.0/tests/test_callbacks.py +195 -0
- recursive_cleaner-0.6.0/tests/test_cleaner.py +156 -0
- recursive_cleaner-0.6.0/tests/test_context.py +68 -0
- recursive_cleaner-0.6.0/tests/test_dependencies.py +260 -0
- recursive_cleaner-0.6.0/tests/test_dry_run.py +215 -0
- recursive_cleaner-0.6.0/tests/test_holdout.py +304 -0
- recursive_cleaner-0.6.0/tests/test_incremental.py +351 -0
- recursive_cleaner-0.6.0/tests/test_integration.py +943 -0
- recursive_cleaner-0.6.0/tests/test_latency.py +208 -0
- recursive_cleaner-0.6.0/tests/test_metrics.py +334 -0
- recursive_cleaner-0.6.0/tests/test_optimizer.py +1504 -0
- recursive_cleaner-0.6.0/tests/test_output.py +263 -0
- recursive_cleaner-0.6.0/tests/test_parsers.py +366 -0
- recursive_cleaner-0.6.0/tests/test_report.py +188 -0
- recursive_cleaner-0.6.0/tests/test_sampling.py +322 -0
- recursive_cleaner-0.6.0/tests/test_schema.py +267 -0
- recursive_cleaner-0.6.0/tests/test_text_mode.py +469 -0
- recursive_cleaner-0.6.0/tests/test_validation.py +559 -0
- recursive_cleaner-0.6.0/tests/test_vendor_chunker.py +341 -0
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.Python
|
|
7
|
+
build/
|
|
8
|
+
develop-eggs/
|
|
9
|
+
dist/
|
|
10
|
+
downloads/
|
|
11
|
+
eggs/
|
|
12
|
+
.eggs/
|
|
13
|
+
lib/
|
|
14
|
+
lib64/
|
|
15
|
+
parts/
|
|
16
|
+
sdist/
|
|
17
|
+
var/
|
|
18
|
+
wheels/
|
|
19
|
+
*.egg-info/
|
|
20
|
+
.installed.cfg
|
|
21
|
+
*.egg
|
|
22
|
+
|
|
23
|
+
# Virtual environments
|
|
24
|
+
.env
|
|
25
|
+
.venv
|
|
26
|
+
env/
|
|
27
|
+
venv/
|
|
28
|
+
ENV/
|
|
29
|
+
|
|
30
|
+
# IDE
|
|
31
|
+
.idea/
|
|
32
|
+
.vscode/
|
|
33
|
+
*.swp
|
|
34
|
+
*.swo
|
|
35
|
+
|
|
36
|
+
# Testing
|
|
37
|
+
.pytest_cache/
|
|
38
|
+
.coverage
|
|
39
|
+
htmlcov/
|
|
40
|
+
samples/
|
|
41
|
+
|
|
42
|
+
# Generated files
|
|
43
|
+
cleaning_functions.py
|
|
44
|
+
cleaning_functions_*.py
|
|
45
|
+
cleaning_report.md
|
|
46
|
+
*_report.md
|
|
47
|
+
*_state.json
|
|
48
|
+
test_data.jsonl
|
|
49
|
+
test_cases/*_cleaning_functions.py
|
|
50
|
+
run_mlx_tests.py
|
|
51
|
+
|
|
52
|
+
# Cloned framework repos (for research only)
|
|
53
|
+
docs/frameworks/
|
|
54
|
+
|
|
55
|
+
# OS
|
|
56
|
+
.DS_Store
|
|
57
|
+
Thumbs.db
|
|
@@ -0,0 +1,455 @@
|
|
|
1
|
+
# CLAUDE.md - Recursive Docstring Data Cleaning Pipeline
|
|
2
|
+
|
|
3
|
+
## Project Status
|
|
4
|
+
|
|
5
|
+
| Version | Status | Date |
|
|
6
|
+
|---------|--------|------|
|
|
7
|
+
| v0.6.0 | **Implemented** | 2025-01-15 |
|
|
8
|
+
| v0.5.1 | Implemented | 2025-01-15 |
|
|
9
|
+
| v0.5.0 | Implemented | 2025-01-15 |
|
|
10
|
+
| v0.4.0 | Implemented | 2025-01-15 |
|
|
11
|
+
| v0.3.0 | Implemented | 2025-01-14 |
|
|
12
|
+
| v0.2.0 | Implemented | 2025-01-14 |
|
|
13
|
+
| v0.1.0 | Implemented | 2025-01-14 |
|
|
14
|
+
|
|
15
|
+
**Current State**: v0.6.0 complete. 392 tests passing, 2,967 lines total.
|
|
16
|
+
|
|
17
|
+
### Version History
|
|
18
|
+
- **v0.6.0**: Latency metrics, import consolidation, cleaning report, dry-run mode
|
|
19
|
+
- **v0.5.1**: Dangerous code detection (AST-based security)
|
|
20
|
+
- **v0.5.0**: Two-pass optimization with LLM agency (consolidation, early termination)
|
|
21
|
+
- **v0.4.0**: Holdout validation, dependency resolution, smart sampling, quality metrics
|
|
22
|
+
- **v0.3.0**: Text mode with vendored sentence-aware chunker
|
|
23
|
+
- **v0.2.0**: Runtime validation, schema inference, callbacks, incremental saves
|
|
24
|
+
- **v0.1.0**: Core pipeline
|
|
25
|
+
|
|
26
|
+
## Project Overview
|
|
27
|
+
|
|
28
|
+
A Python library that uses LLMs to incrementally build data cleaning solutions for massive datasets. The system processes data in chunks, identifies quality issues, generates Python functions to solve them one at a time, and maintains awareness of existing solutions through docstring feedback loops.
|
|
29
|
+
|
|
30
|
+
**Core Philosophy**: Elegant, clean, lean, path of least resistance. Trade computational efficiency for human time savings. No frameworks, no abstractions we don't need, just a while loop with good error handling.
|
|
31
|
+
|
|
32
|
+
## Design Principles
|
|
33
|
+
|
|
34
|
+
1. **Simplicity over extensibility** - A 500-line library that does one thing well beats a 5000-line framework
|
|
35
|
+
2. **stdlib over dependencies** - Use `ast.parse()`, `xml.etree`, not custom parsers
|
|
36
|
+
3. **Functions over classes** - Unless state genuinely helps
|
|
37
|
+
4. **Delete over abstract** - No interfaces for things with one implementation
|
|
38
|
+
5. **Retry over recover** - On error, retry with error message appended to prompt
|
|
39
|
+
|
|
40
|
+
## Target User Experience
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from recursive_cleaner import DataCleaner
|
|
44
|
+
|
|
45
|
+
cleaner = DataCleaner(
|
|
46
|
+
llm_backend=my_ollama_client, # User-provided LLM interface
|
|
47
|
+
file_path="messy_customers.jsonl",
|
|
48
|
+
chunk_size=50, # items per chunk
|
|
49
|
+
instructions="""
|
|
50
|
+
CRM export data that needs:
|
|
51
|
+
- Phone numbers normalized to E.164 format
|
|
52
|
+
- Fix typos in 'status' field (valid: active, pending, churned)
|
|
53
|
+
- Remove duplicates by email
|
|
54
|
+
- All dates to ISO 8601
|
|
55
|
+
""",
|
|
56
|
+
# Validation & schema (v0.2.0)
|
|
57
|
+
on_progress=lambda e: print(f"{e['type']}: {e.get('chunk_index', '')}"),
|
|
58
|
+
state_file="cleaning_state.json", # Resume on interrupt
|
|
59
|
+
validate_runtime=True, # Test functions before accepting
|
|
60
|
+
schema_sample_size=10, # Infer schema from first N records
|
|
61
|
+
# Sampling & metrics (v0.4.0)
|
|
62
|
+
holdout_ratio=0.2, # Test on hidden 20% of each chunk
|
|
63
|
+
sampling_strategy="stratified", # "sequential", "random", or "stratified"
|
|
64
|
+
stratify_field="status", # Field for stratified sampling
|
|
65
|
+
track_metrics=True, # Measure before/after quality
|
|
66
|
+
# Optimization (v0.5.0)
|
|
67
|
+
optimize=True, # Consolidate redundant functions after generation
|
|
68
|
+
early_termination=True, # Stop when patterns saturate
|
|
69
|
+
# Observability (v0.6.0)
|
|
70
|
+
report_path="cleaning_report.md", # Generate markdown report (None to disable)
|
|
71
|
+
dry_run=False, # Set True to analyze without generating functions
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
cleaner.run() # Outputs: cleaning_functions.py, cleaning_report.md
|
|
75
|
+
|
|
76
|
+
# Check improvement metrics
|
|
77
|
+
print(cleaner.get_improvement_report())
|
|
78
|
+
|
|
79
|
+
# Or resume from saved state
|
|
80
|
+
cleaner = DataCleaner.resume("cleaning_state.json", my_ollama_client)
|
|
81
|
+
cleaner.run()
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## Core Concepts
|
|
85
|
+
|
|
86
|
+
### 1. Chunked Processing
|
|
87
|
+
Large files exceed LLM context windows. Process in chunks:
|
|
88
|
+
- **Text files**: By character count (default 4000)
|
|
89
|
+
- **CSV/JSON/JSONL**: By item count (default 50)
|
|
90
|
+
|
|
91
|
+
### 2. Docstring Registry (Context Memory)
|
|
92
|
+
Each generated function's docstring is fed back into subsequent prompts. The registry is a simple list: starting from the most recent function, entries are included until the character budget is exhausted.
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
def build_context(functions: list[dict], max_chars: int = 8000) -> str:
|
|
96
|
+
"""Most recent functions that fit in budget. That's it."""
|
|
97
|
+
ctx = ""
|
|
98
|
+
for f in reversed(functions):
|
|
99
|
+
entry = f"## {f['name']}\n{f['docstring']}\n\n"
|
|
100
|
+
if len(ctx) + len(entry) > max_chars:
|
|
101
|
+
break
|
|
102
|
+
ctx = entry + ctx
|
|
103
|
+
return ctx or "(No functions generated yet)"
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### 3. Single-Problem Focus
|
|
107
|
+
Per chunk iteration:
|
|
108
|
+
1. LLM identifies ALL issues in chunk
|
|
109
|
+
2. LLM checks which are already solved (by reviewing docstrings)
|
|
110
|
+
3. LLM generates code for ONLY the first unsolved issue
|
|
111
|
+
4. Repeat until "clean" or max iterations (default 5)
|
|
112
|
+
|
|
113
|
+
### 4. XML Output with Markdown Code Blocks
|
|
114
|
+
XML wrapper for structure, markdown fences for code (handles LLM variance):
|
|
115
|
+
|
|
116
|
+
```xml
|
|
117
|
+
<cleaning_analysis>
|
|
118
|
+
<issues_detected>
|
|
119
|
+
<issue id="1" solved="false">Phone numbers have inconsistent formats</issue>
|
|
120
|
+
<issue id="2" solved="true">Already handled by normalize_dates()</issue>
|
|
121
|
+
</issues_detected>
|
|
122
|
+
|
|
123
|
+
<function_to_generate>
|
|
124
|
+
<name>normalize_phone_numbers</name>
|
|
125
|
+
<docstring>
|
|
126
|
+
Normalize phone numbers to E.164 format.
|
|
127
|
+
Handles: +1-555-1234, (555) 123-4567, raw digits
|
|
128
|
+
</docstring>
|
|
129
|
+
<code>
|
|
130
|
+
```python
|
|
131
|
+
import re
|
|
132
|
+
|
|
133
|
+
def normalize_phone_numbers(data):
|
|
134
|
+
# Implementation...
|
|
135
|
+
pass
|
|
136
|
+
```
|
|
137
|
+
</code>
|
|
138
|
+
</function_to_generate>
|
|
139
|
+
|
|
140
|
+
<chunk_status>needs_more_work</chunk_status>
|
|
141
|
+
</cleaning_analysis>
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
## The Lean Architecture (~2,967 lines total)
|
|
145
|
+
|
|
146
|
+
### File Structure (Implemented)
|
|
147
|
+
```
|
|
148
|
+
recursive_cleaner/
|
|
149
|
+
__init__.py # Public exports (~45 lines)
|
|
150
|
+
cleaner.py # Main DataCleaner class (~628 lines)
|
|
151
|
+
context.py # Docstring registry with FIFO eviction (~27 lines)
|
|
152
|
+
dependencies.py # Topological sort for function ordering (~59 lines) [v0.4.0]
|
|
153
|
+
errors.py # 4 exception classes (~17 lines)
|
|
154
|
+
metrics.py # Quality metrics before/after (~163 lines) [v0.4.0]
|
|
155
|
+
optimizer.py # Two-pass consolidation with LLM agency (~336 lines) [v0.5.0]
|
|
156
|
+
output.py # Function file generation (~195 lines)
|
|
157
|
+
parsers.py # Chunk text/csv/json/jsonl with sampling (~325 lines)
|
|
158
|
+
prompt.py # LLM prompt templates (~218 lines)
|
|
159
|
+
report.py # Markdown report generation (~138 lines) [v0.6.0]
|
|
160
|
+
response.py # XML/markdown parsing + agency dataclasses (~292 lines)
|
|
161
|
+
schema.py # Schema inference (~117 lines) [v0.2.0]
|
|
162
|
+
types.py # LLMBackend protocol (~11 lines)
|
|
163
|
+
validation.py # Runtime validation + safety checks (~200 lines)
|
|
164
|
+
vendor/
|
|
165
|
+
__init__.py # Vendor exports (~4 lines)
|
|
166
|
+
chunker.py # Vendored sentence-aware chunker (~187 lines) [v0.3.0]
|
|
167
|
+
|
|
168
|
+
backends/
|
|
169
|
+
__init__.py # Backend exports
|
|
170
|
+
mlx_backend.py # MLX-LM backend for Apple Silicon
|
|
171
|
+
|
|
172
|
+
tests/ # 392 tests
|
|
173
|
+
test_callbacks.py # Progress callback tests
|
|
174
|
+
test_cleaner.py # DataCleaner tests
|
|
175
|
+
test_context.py # Context management tests
|
|
176
|
+
test_dependencies.py # Dependency resolution tests [v0.4.0]
|
|
177
|
+
test_dry_run.py # Dry run mode tests [v0.6.0]
|
|
178
|
+
test_holdout.py # Holdout validation tests [v0.4.0]
|
|
179
|
+
test_incremental.py # Incremental save tests
|
|
180
|
+
test_integration.py # End-to-end tests
|
|
181
|
+
test_latency.py # Latency metrics tests [v0.6.0]
|
|
182
|
+
test_metrics.py # Quality metrics tests [v0.4.0]
|
|
183
|
+
test_optimizer.py # Two-pass optimization tests [v0.5.0]
|
|
184
|
+
test_output.py # Output generation tests
|
|
185
|
+
test_parsers.py # Parsing tests
|
|
186
|
+
test_report.py # Cleaning report tests [v0.6.0]
|
|
187
|
+
test_sampling.py # Sampling strategy tests [v0.4.0]
|
|
188
|
+
test_schema.py # Schema inference tests
|
|
189
|
+
test_text_mode.py # Text mode tests [v0.3.0]
|
|
190
|
+
test_validation.py # Runtime validation + safety tests
|
|
191
|
+
test_vendor_chunker.py # Vendored chunker tests [v0.3.0]
|
|
192
|
+
|
|
193
|
+
test_cases/ # Comprehensive test datasets
|
|
194
|
+
ecommerce_*.jsonl # Product catalog data
|
|
195
|
+
healthcare_*.jsonl # Patient records
|
|
196
|
+
financial_*.jsonl # Transaction data
|
|
197
|
+
|
|
198
|
+
docs/ # Orchestrated dev docs
|
|
199
|
+
contracts/ # API and data contracts
|
|
200
|
+
research/ # Research findings
|
|
201
|
+
handoffs/ # Phase completion handoffs
|
|
202
|
+
|
|
203
|
+
pyproject.toml
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
### Error Classes (~17 lines)
|
|
207
|
+
```python
|
|
208
|
+
class CleanerError(Exception):
|
|
209
|
+
"""Base error for the pipeline"""
|
|
210
|
+
|
|
211
|
+
class ParseError(CleanerError):
|
|
212
|
+
"""XML or code extraction failed - retry with error feedback"""
|
|
213
|
+
|
|
214
|
+
class MaxIterationsError(CleanerError):
|
|
215
|
+
"""Chunk never marked clean - skip and continue"""
|
|
216
|
+
|
|
217
|
+
class OutputValidationError(CleanerError):
|
|
218
|
+
"""Generated output file has invalid Python syntax"""
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
### LLM Backend Protocol (5 lines)
|
|
222
|
+
```python
|
|
223
|
+
from typing import Protocol
|
|
224
|
+
|
|
225
|
+
class LLMBackend(Protocol):
|
|
226
|
+
def generate(self, prompt: str) -> str: ...
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
### Retry Logic (use tenacity)
|
|
230
|
+
```python
|
|
231
|
+
from tenacity import retry, stop_after_attempt, wait_exponential
|
|
232
|
+
|
|
233
|
+
@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10))
|
|
234
|
+
def call_llm(backend: LLMBackend, prompt: str) -> str:
|
|
235
|
+
return backend.generate(prompt)
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
### Response Parsing (~30 lines)
|
|
239
|
+
```python
|
|
240
|
+
import ast
|
|
241
|
+
import re
|
|
242
|
+
import xml.etree.ElementTree as ET
|
|
243
|
+
|
|
244
|
+
def parse_response(text: str) -> dict:
|
|
245
|
+
"""Extract structured data from LLM response."""
|
|
246
|
+
try:
|
|
247
|
+
# Find XML content
|
|
248
|
+
root = ET.fromstring(f"<root>{text}</root>")
|
|
249
|
+
|
|
250
|
+
# Extract code from markdown fence
|
|
251
|
+
code_elem = root.find(".//code")
|
|
252
|
+
code_text = code_elem.text if code_elem is not None else ""
|
|
253
|
+
code = extract_python_block(code_text)
|
|
254
|
+
|
|
255
|
+
# Validate Python syntax
|
|
256
|
+
ast.parse(code)
|
|
257
|
+
|
|
258
|
+
return {
|
|
259
|
+
"issues": parse_issues(root),
|
|
260
|
+
"name": root.findtext(".//name", "").strip(),
|
|
261
|
+
"docstring": root.findtext(".//docstring", "").strip(),
|
|
262
|
+
"code": code,
|
|
263
|
+
"status": root.findtext(".//chunk_status", "needs_more_work").strip()
|
|
264
|
+
}
|
|
265
|
+
except ET.ParseError as e:
|
|
266
|
+
raise ParseError(f"Invalid XML: {e}")
|
|
267
|
+
except SyntaxError as e:
|
|
268
|
+
raise ParseError(f"Invalid Python: {e}")
|
|
269
|
+
|
|
270
|
+
def extract_python_block(text: str) -> str:
|
|
271
|
+
"""Extract code from ```python ... ``` block."""
|
|
272
|
+
match = re.search(r"```python\s*(.*?)\s*```", text, re.DOTALL)
|
|
273
|
+
return match.group(1) if match else text.strip()
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
### The Main Loop (~80 lines)
|
|
277
|
+
```python
|
|
278
|
+
class DataCleaner:
|
|
279
|
+
def __init__(self, llm_backend, file_path, chunk_size=50,
|
|
280
|
+
instructions="", max_iterations=5, context_budget=8000):
|
|
281
|
+
self.backend = llm_backend
|
|
282
|
+
self.file_path = file_path
|
|
283
|
+
self.chunk_size = chunk_size
|
|
284
|
+
self.instructions = instructions
|
|
285
|
+
self.max_iterations = max_iterations
|
|
286
|
+
self.context_budget = context_budget
|
|
287
|
+
self.functions = [] # List of {name, docstring, code}
|
|
288
|
+
|
|
289
|
+
def run(self):
|
|
290
|
+
chunks = self._load_chunks()
|
|
291
|
+
|
|
292
|
+
for i, chunk in enumerate(chunks):
|
|
293
|
+
print(f"Processing chunk {i+1}/{len(chunks)}...")
|
|
294
|
+
self._process_chunk(chunk, i)
|
|
295
|
+
|
|
296
|
+
self._write_output()
|
|
297
|
+
print(f"Done! Generated {len(self.functions)} functions.")
|
|
298
|
+
|
|
299
|
+
def _process_chunk(self, chunk, chunk_idx):
|
|
300
|
+
for iteration in range(self.max_iterations):
|
|
301
|
+
prompt = self._build_prompt(chunk)
|
|
302
|
+
|
|
303
|
+
try:
|
|
304
|
+
response = call_llm(self.backend, prompt)
|
|
305
|
+
result = parse_response(response)
|
|
306
|
+
except ParseError as e:
|
|
307
|
+
# Retry with error feedback
|
|
308
|
+
prompt += f"\n\nYour previous response had an error: {e}\nPlease try again."
|
|
309
|
+
continue
|
|
310
|
+
|
|
311
|
+
if result["status"] == "clean":
|
|
312
|
+
return
|
|
313
|
+
|
|
314
|
+
if result["code"]:
|
|
315
|
+
self.functions.append({
|
|
316
|
+
"name": result["name"],
|
|
317
|
+
"docstring": result["docstring"],
|
|
318
|
+
"code": result["code"]
|
|
319
|
+
})
|
|
320
|
+
|
|
321
|
+
print(f" Warning: chunk {chunk_idx} hit max iterations")
|
|
322
|
+
|
|
323
|
+
def _build_prompt(self, chunk):
|
|
324
|
+
context = build_context(self.functions, self.context_budget)
|
|
325
|
+
return PROMPT_TEMPLATE.format(
|
|
326
|
+
instructions=self.instructions,
|
|
327
|
+
context=context,
|
|
328
|
+
chunk=chunk
|
|
329
|
+
)
|
|
330
|
+
|
|
331
|
+
def _write_output(self):
|
|
332
|
+
# Generate cleaning_functions.py with all functions
|
|
333
|
+
# and a clean_data() entrypoint
|
|
334
|
+
...
|
|
335
|
+
```
|
|
336
|
+
|
|
337
|
+
## Prompt Template
|
|
338
|
+
|
|
339
|
+
```python
|
|
340
|
+
PROMPT_TEMPLATE = '''You are a data cleaning expert. Analyze data and generate Python functions to fix issues.
|
|
341
|
+
|
|
342
|
+
=== USER'S CLEANING GOALS ===
|
|
343
|
+
{instructions}
|
|
344
|
+
|
|
345
|
+
=== EXISTING FUNCTIONS (DO NOT RECREATE) ===
|
|
346
|
+
{context}
|
|
347
|
+
|
|
348
|
+
=== DATA CHUNK ===
|
|
349
|
+
{chunk}
|
|
350
|
+
|
|
351
|
+
=== TASK ===
|
|
352
|
+
1. List ALL data quality issues
|
|
353
|
+
2. Mark each as solved="true" if an existing function handles it
|
|
354
|
+
3. Generate code for ONLY the FIRST unsolved issue
|
|
355
|
+
4. Use this EXACT format:
|
|
356
|
+
|
|
357
|
+
<cleaning_analysis>
|
|
358
|
+
<issues_detected>
|
|
359
|
+
<issue id="1" solved="true|false">Description</issue>
|
|
360
|
+
</issues_detected>
|
|
361
|
+
|
|
362
|
+
<function_to_generate>
|
|
363
|
+
<name>function_name</name>
|
|
364
|
+
<docstring>What it does, edge cases handled</docstring>
|
|
365
|
+
<code>
|
|
366
|
+
```python
|
|
367
|
+
def function_name(data):
|
|
368
|
+
# Complete implementation
|
|
369
|
+
pass
|
|
370
|
+
```
|
|
371
|
+
</code>
|
|
372
|
+
</function_to_generate>
|
|
373
|
+
|
|
374
|
+
<chunk_status>clean|needs_more_work</chunk_status>
|
|
375
|
+
</cleaning_analysis>
|
|
376
|
+
|
|
377
|
+
RULES:
|
|
378
|
+
- ONE function per response
|
|
379
|
+
- If all issues solved: <chunk_status>clean</chunk_status>, omit <function_to_generate>
|
|
380
|
+
- Include imports in function or at top
|
|
381
|
+
- Function must be idempotent'''
|
|
382
|
+
```
|
|
383
|
+
|
|
384
|
+
## Dependencies
|
|
385
|
+
|
|
386
|
+
```toml
|
|
387
|
+
[project]
|
|
388
|
+
dependencies = [
|
|
389
|
+
"tenacity>=8.0", # Retry logic (battle-tested, 1 decorator)
|
|
390
|
+
]
|
|
391
|
+
```
|
|
392
|
+
|
|
393
|
+
That's it. No langchain, no frameworks, no abstractions.
|
|
394
|
+
|
|
395
|
+
## Edge Cases
|
|
396
|
+
|
|
397
|
+
| Case | Handling |
|
|
398
|
+
|------|----------|
|
|
399
|
+
| Malformed XML | Retry with error appended to prompt (max 3) |
|
|
400
|
+
| Invalid Python | Retry with syntax error in prompt (max 3) |
|
|
401
|
+
| `__main__` imports | Reject during parsing, retry with error feedback |
|
|
402
|
+
| Duplicate functions | Skip duplicates, keep first occurrence |
|
|
403
|
+
| Invalid combined output | Fall back to writing only valid functions |
|
|
404
|
+
| Chunk never "clean" | Skip after 5 iterations, log warning |
|
|
405
|
+
| Empty chunk | Skip without LLM call |
|
|
406
|
+
| Context too large | FIFO eviction, keep most recent functions |
|
|
407
|
+
|
|
408
|
+
## Known Limitations
|
|
409
|
+
|
|
410
|
+
1. **Stateful operations** (deduplication, aggregations) only work within chunks, not globally
|
|
411
|
+
2. ~~**Function ordering** follows generation order, not dependency order~~ → Fixed in v0.4.0 (dependency resolution)
|
|
412
|
+
3. ~~**No runtime testing** of generated functions before output~~ → Fixed in v0.2.0 (runtime validation)
|
|
413
|
+
4. ~~**Redundant functions** when similar issues appear in different chunks~~ → Fixed in v0.5.0 (two-pass consolidation)
|
|
414
|
+
|
|
415
|
+
## LLM Agency (v0.5.0)
|
|
416
|
+
|
|
417
|
+
The LLM now has agency over key decisions:
|
|
418
|
+
|
|
419
|
+
| Decision Point | LLM Decides |
|
|
420
|
+
|----------------|-------------|
|
|
421
|
+
| Chunk cleanliness | `chunk_status: clean/needs_more_work` |
|
|
422
|
+
| Consolidation complete | `complete: true/false` in self-assessment |
|
|
423
|
+
| Pattern saturation | `saturated: true/false` for early termination |
|
|
424
|
+
|
|
425
|
+
This follows the wu wei principle: let the model that understands the data make decisions about the data.
|
|
426
|
+
|
|
427
|
+
## Observability (v0.6.0)
|
|
428
|
+
|
|
429
|
+
New features for monitoring and analysis:
|
|
430
|
+
|
|
431
|
+
| Feature | Description |
|
|
432
|
+
|---------|-------------|
|
|
433
|
+
| Latency Metrics | Track LLM call timing (min/max/avg/total) via `llm_call` events |
|
|
434
|
+
| Import Consolidation | Merge duplicate imports, combine `from x import a, b` |
|
|
435
|
+
| Cleaning Report | Markdown summary with functions, metrics, latency stats |
|
|
436
|
+
| Dry-Run Mode | Analyze data without generating functions (`dry_run=True`) |
|
|
437
|
+
|
|
438
|
+
New events emitted:
|
|
439
|
+
- `llm_call` - After each LLM call with `latency_ms`
|
|
440
|
+
- `issues_detected` - In dry-run mode with detected issues
|
|
441
|
+
- `dry_run_complete` - End of dry run with stats
|
|
442
|
+
- `complete` now includes `latency_stats` dict
|
|
443
|
+
|
|
444
|
+
## Success Criteria
|
|
445
|
+
|
|
446
|
+
A user with a 500MB JSONL file and clear instructions can:
|
|
447
|
+
1. Write 5 lines of setup
|
|
448
|
+
2. Run and walk away
|
|
449
|
+
3. Return to working `cleaning_functions.py`
|
|
450
|
+
4. Tweak edge cases
|
|
451
|
+
5. Apply to full dataset
|
|
452
|
+
|
|
453
|
+
---
|
|
454
|
+
|
|
455
|
+
**For A/B testing with advanced patterns, see `CLAUDE_ADVANCED.md`**
|