recursive-cleaner 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. recursive_cleaner-0.6.0/.gitignore +57 -0
  2. recursive_cleaner-0.6.0/CLAUDE.md +455 -0
  3. recursive_cleaner-0.6.0/CLAUDE_ADVANCED.md +955 -0
  4. recursive_cleaner-0.6.0/LICENSE +21 -0
  5. recursive_cleaner-0.6.0/PKG-INFO +282 -0
  6. recursive_cleaner-0.6.0/README.md +250 -0
  7. recursive_cleaner-0.6.0/TODO.md +117 -0
  8. recursive_cleaner-0.6.0/backends/__init__.py +5 -0
  9. recursive_cleaner-0.6.0/backends/mlx_backend.py +95 -0
  10. recursive_cleaner-0.6.0/docs/contracts/api-contract.md +209 -0
  11. recursive_cleaner-0.6.0/docs/contracts/data-schema.md +101 -0
  12. recursive_cleaner-0.6.0/docs/contracts/success-criteria.md +114 -0
  13. recursive_cleaner-0.6.0/docs/contracts/text-mode-contract.md +178 -0
  14. recursive_cleaner-0.6.0/docs/contracts/tier2-contract.md +178 -0
  15. recursive_cleaner-0.6.0/docs/contracts/tier4-contract.md +203 -0
  16. recursive_cleaner-0.6.0/docs/contracts/tier4-success-criteria.md +108 -0
  17. recursive_cleaner-0.6.0/docs/contracts/two-pass-contract.md +272 -0
  18. recursive_cleaner-0.6.0/docs/handoffs/tier4-handoff.md +63 -0
  19. recursive_cleaner-0.6.0/docs/implementation-plan-tier4.md +132 -0
  20. recursive_cleaner-0.6.0/docs/implementation-plan-v03.md +95 -0
  21. recursive_cleaner-0.6.0/docs/implementation-plan-v04.md +148 -0
  22. recursive_cleaner-0.6.0/docs/implementation-plan-v05.md +190 -0
  23. recursive_cleaner-0.6.0/docs/implementation-plan.md +184 -0
  24. recursive_cleaner-0.6.0/docs/langchain-analysis.md +640 -0
  25. recursive_cleaner-0.6.0/docs/langgraph-analysis.md +507 -0
  26. recursive_cleaner-0.6.0/docs/mlx-lm-guide.md +425 -0
  27. recursive_cleaner-0.6.0/docs/other-frameworks-analysis.md +386 -0
  28. recursive_cleaner-0.6.0/docs/refactor-assessment/data/dependency.json +368 -0
  29. recursive_cleaner-0.6.0/docs/refactor-assessment/data/stats.json +5070 -0
  30. recursive_cleaner-0.6.0/docs/refactor-assessment/plan.md +75 -0
  31. recursive_cleaner-0.6.0/docs/refactor-assessment/report.md +149 -0
  32. recursive_cleaner-0.6.0/docs/research/chonkie-extraction.md +357 -0
  33. recursive_cleaner-0.6.0/docs/research/chonkie.md +367 -0
  34. recursive_cleaner-0.6.0/docs/research/markitdown.md +513 -0
  35. recursive_cleaner-0.6.0/docs/smolagents-analysis.md +545 -0
  36. recursive_cleaner-0.6.0/docs/workflow-state.md +45 -0
  37. recursive_cleaner-0.6.0/pyproject.toml +60 -0
  38. recursive_cleaner-0.6.0/recursive_cleaner/__init__.py +46 -0
  39. recursive_cleaner-0.6.0/recursive_cleaner/cleaner.py +628 -0
  40. recursive_cleaner-0.6.0/recursive_cleaner/context.py +27 -0
  41. recursive_cleaner-0.6.0/recursive_cleaner/dependencies.py +59 -0
  42. recursive_cleaner-0.6.0/recursive_cleaner/errors.py +17 -0
  43. recursive_cleaner-0.6.0/recursive_cleaner/metrics.py +163 -0
  44. recursive_cleaner-0.6.0/recursive_cleaner/optimizer.py +336 -0
  45. recursive_cleaner-0.6.0/recursive_cleaner/output.py +197 -0
  46. recursive_cleaner-0.6.0/recursive_cleaner/parsers.py +325 -0
  47. recursive_cleaner-0.6.0/recursive_cleaner/prompt.py +218 -0
  48. recursive_cleaner-0.6.0/recursive_cleaner/report.py +138 -0
  49. recursive_cleaner-0.6.0/recursive_cleaner/response.py +292 -0
  50. recursive_cleaner-0.6.0/recursive_cleaner/schema.py +117 -0
  51. recursive_cleaner-0.6.0/recursive_cleaner/types.py +11 -0
  52. recursive_cleaner-0.6.0/recursive_cleaner/validation.py +202 -0
  53. recursive_cleaner-0.6.0/recursive_cleaner/vendor/__init__.py +4 -0
  54. recursive_cleaner-0.6.0/recursive_cleaner/vendor/chunker.py +187 -0
  55. recursive_cleaner-0.6.0/test_cases/ecommerce_instructions.txt +12 -0
  56. recursive_cleaner-0.6.0/test_cases/ecommerce_products.jsonl +60 -0
  57. recursive_cleaner-0.6.0/test_cases/financial_instructions.txt +14 -0
  58. recursive_cleaner-0.6.0/test_cases/financial_transactions.jsonl +60 -0
  59. recursive_cleaner-0.6.0/test_cases/healthcare_instructions.txt +14 -0
  60. recursive_cleaner-0.6.0/test_cases/healthcare_patients.jsonl +55 -0
  61. recursive_cleaner-0.6.0/test_cases/run_ecommerce_test.py +45 -0
  62. recursive_cleaner-0.6.0/test_cases/run_financial_test.py +47 -0
  63. recursive_cleaner-0.6.0/test_cases/run_healthcare_test.py +47 -0
  64. recursive_cleaner-0.6.0/tests/__init__.py +1 -0
  65. recursive_cleaner-0.6.0/tests/test_callbacks.py +195 -0
  66. recursive_cleaner-0.6.0/tests/test_cleaner.py +156 -0
  67. recursive_cleaner-0.6.0/tests/test_context.py +68 -0
  68. recursive_cleaner-0.6.0/tests/test_dependencies.py +260 -0
  69. recursive_cleaner-0.6.0/tests/test_dry_run.py +215 -0
  70. recursive_cleaner-0.6.0/tests/test_holdout.py +304 -0
  71. recursive_cleaner-0.6.0/tests/test_incremental.py +351 -0
  72. recursive_cleaner-0.6.0/tests/test_integration.py +943 -0
  73. recursive_cleaner-0.6.0/tests/test_latency.py +208 -0
  74. recursive_cleaner-0.6.0/tests/test_metrics.py +334 -0
  75. recursive_cleaner-0.6.0/tests/test_optimizer.py +1504 -0
  76. recursive_cleaner-0.6.0/tests/test_output.py +263 -0
  77. recursive_cleaner-0.6.0/tests/test_parsers.py +366 -0
  78. recursive_cleaner-0.6.0/tests/test_report.py +188 -0
  79. recursive_cleaner-0.6.0/tests/test_sampling.py +322 -0
  80. recursive_cleaner-0.6.0/tests/test_schema.py +267 -0
  81. recursive_cleaner-0.6.0/tests/test_text_mode.py +469 -0
  82. recursive_cleaner-0.6.0/tests/test_validation.py +559 -0
  83. recursive_cleaner-0.6.0/tests/test_vendor_chunker.py +341 -0
@@ -0,0 +1,57 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual environments
24
+ .env
25
+ .venv
26
+ env/
27
+ venv/
28
+ ENV/
29
+
30
+ # IDE
31
+ .idea/
32
+ .vscode/
33
+ *.swp
34
+ *.swo
35
+
36
+ # Testing
37
+ .pytest_cache/
38
+ .coverage
39
+ htmlcov/
40
+ samples/
41
+
42
+ # Generated files
43
+ cleaning_functions.py
44
+ cleaning_functions_*.py
45
+ cleaning_report.md
46
+ *_report.md
47
+ *_state.json
48
+ test_data.jsonl
49
+ test_cases/*_cleaning_functions.py
50
+ run_mlx_tests.py
51
+
52
+ # Cloned framework repos (for research only)
53
+ docs/frameworks/
54
+
55
+ # OS
56
+ .DS_Store
57
+ Thumbs.db
@@ -0,0 +1,455 @@
1
+ # CLAUDE.md - Recursive Docstring Data Cleaning Pipeline
2
+
3
+ ## Project Status
4
+
5
+ | Version | Status | Date |
6
+ |---------|--------|------|
7
+ | v0.6.0 | **Implemented** | 2025-01-15 |
8
+ | v0.5.1 | Implemented | 2025-01-15 |
9
+ | v0.5.0 | Implemented | 2025-01-15 |
10
+ | v0.4.0 | Implemented | 2025-01-15 |
11
+ | v0.3.0 | Implemented | 2025-01-14 |
12
+ | v0.2.0 | Implemented | 2025-01-14 |
13
+ | v0.1.0 | Implemented | 2025-01-14 |
14
+
15
+ **Current State**: v0.6.0 complete. 392 tests passing, 2,967 lines total.
16
+
17
+ ### Version History
18
+ - **v0.6.0**: Latency metrics, import consolidation, cleaning report, dry-run mode
19
+ - **v0.5.1**: Dangerous code detection (AST-based security)
20
+ - **v0.5.0**: Two-pass optimization with LLM agency (consolidation, early termination)
21
+ - **v0.4.0**: Holdout validation, dependency resolution, smart sampling, quality metrics
22
+ - **v0.3.0**: Text mode with vendored sentence-aware chunker
23
+ - **v0.2.0**: Runtime validation, schema inference, callbacks, incremental saves
24
+ - **v0.1.0**: Core pipeline
25
+
26
+ ## Project Overview
27
+
28
+ A Python library that uses LLMs to incrementally build data cleaning solutions for massive datasets. The system processes data in chunks, identifies quality issues, generates Python functions to solve them one at a time, and maintains awareness of existing solutions through docstring feedback loops.
29
+
30
+ **Core Philosophy**: Elegant, clean, lean, path of least resistance. Trade computational efficiency for human time savings. No frameworks, no abstractions we don't need, just a while loop with good error handling.
31
+
32
+ ## Design Principles
33
+
34
+ 1. **Simplicity over extensibility** - A 500-line library that does one thing well beats a 5000-line framework
35
+ 2. **stdlib over dependencies** - Use `ast.parse()`, `xml.etree`, not custom parsers
36
+ 3. **Functions over classes** - Unless state genuinely helps
37
+ 4. **Delete over abstract** - No interfaces for things with one implementation
38
+ 5. **Retry over recover** - On error, retry with error message appended to prompt
39
+
40
+ ## Target User Experience
41
+
42
+ ```python
43
+ from recursive_cleaner import DataCleaner
44
+
45
+ cleaner = DataCleaner(
46
+ llm_backend=my_ollama_client, # User-provided LLM interface
47
+ file_path="messy_customers.jsonl",
48
+ chunk_size=50, # items per chunk
49
+ instructions="""
50
+ CRM export data that needs:
51
+ - Phone numbers normalized to E.164 format
52
+ - Fix typos in 'status' field (valid: active, pending, churned)
53
+ - Remove duplicates by email
54
+ - All dates to ISO 8601
55
+ """,
56
+ # Validation & schema (v0.2.0)
57
+ on_progress=lambda e: print(f"{e['type']}: {e.get('chunk_index', '')}"),
58
+ state_file="cleaning_state.json", # Resume on interrupt
59
+ validate_runtime=True, # Test functions before accepting
60
+ schema_sample_size=10, # Infer schema from first N records
61
+ # Sampling & metrics (v0.4.0)
62
+ holdout_ratio=0.2, # Test on hidden 20% of each chunk
63
+ sampling_strategy="stratified", # "sequential", "random", or "stratified"
64
+ stratify_field="status", # Field for stratified sampling
65
+ track_metrics=True, # Measure before/after quality
66
+ # Optimization (v0.5.0)
67
+ optimize=True, # Consolidate redundant functions after generation
68
+ early_termination=True, # Stop when patterns saturate
69
+ # Observability (v0.6.0)
70
+ report_path="cleaning_report.md", # Generate markdown report (None to disable)
71
+ dry_run=False, # Set True to analyze without generating functions
72
+ )
73
+
74
+ cleaner.run() # Outputs: cleaning_functions.py, cleaning_report.md
75
+
76
+ # Check improvement metrics
77
+ print(cleaner.get_improvement_report())
78
+
79
+ # Or resume from saved state
80
+ cleaner = DataCleaner.resume("cleaning_state.json", my_ollama_client)
81
+ cleaner.run()
82
+ ```
83
+
84
+ ## Core Concepts
85
+
86
+ ### 1. Chunked Processing
87
+ Large files exceed LLM context windows. Process in chunks:
88
+ - **Text files**: By character count (default 4000)
89
+ - **CSV/JSON/JSONL**: By item count (default 50)
90
+
91
+ ### 2. Docstring Registry (Context Memory)
92
+ Each generated function's docstring is fed back into subsequent prompts. Simple list, most recent N functions, character budget.
93
+
94
+ ```python
95
+ def build_context(functions: list[dict], max_chars: int = 8000) -> str:
96
+ """Most recent functions that fit in budget. That's it."""
97
+ ctx = ""
98
+ for f in reversed(functions):
99
+ entry = f"## {f['name']}\n{f['docstring']}\n\n"
100
+ if len(ctx) + len(entry) > max_chars:
101
+ break
102
+ ctx = entry + ctx
103
+ return ctx or "(No functions generated yet)"
104
+ ```
105
+
106
+ ### 3. Single-Problem Focus
107
+ Per chunk iteration:
108
+ 1. LLM identifies ALL issues in chunk
109
+ 2. LLM checks which are already solved (by reviewing docstrings)
110
+ 3. LLM generates code for ONLY the first unsolved issue
111
+ 4. Repeat until "clean" or max iterations (default 5)
112
+
113
+ ### 4. XML Output with Markdown Code Blocks
114
+ XML wrapper for structure, markdown fences for code (handles LLM variance):
115
+
116
+ ```xml
117
+ <cleaning_analysis>
118
+ <issues_detected>
119
+ <issue id="1" solved="false">Phone numbers have inconsistent formats</issue>
120
+ <issue id="2" solved="true">Already handled by normalize_dates()</issue>
121
+ </issues_detected>
122
+
123
+ <function_to_generate>
124
+ <name>normalize_phone_numbers</name>
125
+ <docstring>
126
+ Normalize phone numbers to E.164 format.
127
+ Handles: +1-555-1234, (555) 123-4567, raw digits
128
+ </docstring>
129
+ <code>
130
+ ```python
131
+ import re
132
+
133
+ def normalize_phone_numbers(data):
134
+ # Implementation...
135
+ pass
136
+ ```
137
+ </code>
138
+ </function_to_generate>
139
+
140
+ <chunk_status>needs_more_work</chunk_status>
141
+ </cleaning_analysis>
142
+ ```
143
+
144
+ ## The Lean Architecture (~2,967 lines total)
145
+
146
+ ### File Structure (Implemented)
147
+ ```
148
+ recursive_cleaner/
149
+ __init__.py # Public exports (~45 lines)
150
+ cleaner.py # Main DataCleaner class (~628 lines)
151
+ context.py # Docstring registry with FIFO eviction (~27 lines)
152
+ dependencies.py # Topological sort for function ordering (~59 lines) [v0.4.0]
153
+ errors.py # 4 exception classes (~17 lines)
154
+ metrics.py # Quality metrics before/after (~163 lines) [v0.4.0]
155
+ optimizer.py # Two-pass consolidation with LLM agency (~336 lines) [v0.5.0]
156
+ output.py # Function file generation (~195 lines)
157
+ parsers.py # Chunk text/csv/json/jsonl with sampling (~325 lines)
158
+ prompt.py # LLM prompt templates (~218 lines)
159
+ report.py # Markdown report generation (~138 lines) [v0.6.0]
160
+ response.py # XML/markdown parsing + agency dataclasses (~292 lines)
161
+ schema.py # Schema inference (~117 lines) [v0.2.0]
162
+ types.py # LLMBackend protocol (~11 lines)
163
+ validation.py # Runtime validation + safety checks (~200 lines)
164
+ vendor/
165
+ __init__.py # Vendor exports (~4 lines)
166
+ chunker.py # Vendored sentence-aware chunker (~187 lines) [v0.3.0]
167
+
168
+ backends/
169
+ __init__.py # Backend exports
170
+ mlx_backend.py # MLX-LM backend for Apple Silicon
171
+
172
+ tests/ # 392 tests
173
+ test_callbacks.py # Progress callback tests
174
+ test_cleaner.py # DataCleaner tests
175
+ test_context.py # Context management tests
176
+ test_dependencies.py # Dependency resolution tests [v0.4.0]
177
+ test_dry_run.py # Dry run mode tests [v0.6.0]
178
+ test_holdout.py # Holdout validation tests [v0.4.0]
179
+ test_incremental.py # Incremental save tests
180
+ test_integration.py # End-to-end tests
181
+ test_latency.py # Latency metrics tests [v0.6.0]
182
+ test_metrics.py # Quality metrics tests [v0.4.0]
183
+ test_optimizer.py # Two-pass optimization tests [v0.5.0]
184
+ test_output.py # Output generation tests
185
+ test_parsers.py # Parsing tests
186
+ test_report.py # Cleaning report tests [v0.6.0]
187
+ test_sampling.py # Sampling strategy tests [v0.4.0]
188
+ test_schema.py # Schema inference tests
189
+ test_text_mode.py # Text mode tests [v0.3.0]
190
+ test_validation.py # Runtime validation + safety tests
191
+ test_vendor_chunker.py # Vendored chunker tests [v0.3.0]
192
+
193
+ test_cases/ # Comprehensive test datasets
194
+ ecommerce_*.jsonl # Product catalog data
195
+ healthcare_*.jsonl # Patient records
196
+ financial_*.jsonl # Transaction data
197
+
198
+ docs/ # Orchestrated dev docs
199
+ contracts/ # API and data contracts
200
+ research/ # Research findings
201
+ handoffs/ # Phase completion handoffs
202
+
203
+ pyproject.toml
204
+ ```
205
+
206
+ ### Error Classes (17 lines)
207
+ ```python
208
+ class CleanerError(Exception):
209
+ """Base error for the pipeline"""
210
+
211
+ class ParseError(CleanerError):
212
+ """XML or code extraction failed - retry with error feedback"""
213
+
214
+ class MaxIterationsError(CleanerError):
215
+ """Chunk never marked clean - skip and continue"""
216
+
217
+ class OutputValidationError(CleanerError):
218
+ """Generated output file has invalid Python syntax"""
219
+ ```
220
+
221
+ ### LLM Backend Protocol (5 lines)
222
+ ```python
223
+ from typing import Protocol
224
+
225
+ class LLMBackend(Protocol):
226
+ def generate(self, prompt: str) -> str: ...
227
+ ```
228
+
229
+ ### Retry Logic (use tenacity)
230
+ ```python
231
+ from tenacity import retry, stop_after_attempt, wait_exponential
232
+
233
+ @retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10))
234
+ def call_llm(backend: LLMBackend, prompt: str) -> str:
235
+ return backend.generate(prompt)
236
+ ```
237
+
238
+ ### Response Parsing (~30 lines)
239
+ ```python
240
+ import ast
241
+ import re
242
+ import xml.etree.ElementTree as ET
243
+
244
+ def parse_response(text: str) -> dict:
245
+ """Extract structured data from LLM response."""
246
+ try:
247
+ # Find XML content
248
+ root = ET.fromstring(f"<root>{text}</root>")
249
+
250
+ # Extract code from markdown fence
251
+ code_elem = root.find(".//code")
252
+ code_text = code_elem.text if code_elem is not None else ""
253
+ code = extract_python_block(code_text)
254
+
255
+ # Validate Python syntax
256
+ ast.parse(code)
257
+
258
+ return {
259
+ "issues": parse_issues(root),
260
+ "name": root.findtext(".//name", "").strip(),
261
+ "docstring": root.findtext(".//docstring", "").strip(),
262
+ "code": code,
263
+ "status": root.findtext(".//chunk_status", "needs_more_work").strip()
264
+ }
265
+ except ET.ParseError as e:
266
+ raise ParseError(f"Invalid XML: {e}")
267
+ except SyntaxError as e:
268
+ raise ParseError(f"Invalid Python: {e}")
269
+
270
+ def extract_python_block(text: str) -> str:
271
+ """Extract code from ```python ... ``` block."""
272
+ match = re.search(r"```python\s*(.*?)\s*```", text, re.DOTALL)
273
+ return match.group(1) if match else text.strip()
274
+ ```
275
+
276
+ ### The Main Loop (~80 lines)
277
+ ```python
278
+ class DataCleaner:
279
+ def __init__(self, llm_backend, file_path, chunk_size=50,
280
+ instructions="", max_iterations=5, context_budget=8000):
281
+ self.backend = llm_backend
282
+ self.file_path = file_path
283
+ self.chunk_size = chunk_size
284
+ self.instructions = instructions
285
+ self.max_iterations = max_iterations
286
+ self.context_budget = context_budget
287
+ self.functions = [] # List of {name, docstring, code}
288
+
289
+ def run(self):
290
+ chunks = self._load_chunks()
291
+
292
+ for i, chunk in enumerate(chunks):
293
+ print(f"Processing chunk {i+1}/{len(chunks)}...")
294
+ self._process_chunk(chunk, i)
295
+
296
+ self._write_output()
297
+ print(f"Done! Generated {len(self.functions)} functions.")
298
+
299
+ def _process_chunk(self, chunk, chunk_idx):
300
+ error_feedback = ""
301
+ for iteration in range(self.max_iterations):
302
+ prompt = self._build_prompt(chunk) + error_feedback
303
+ try:
304
+ response = call_llm(self.backend, prompt)
305
+ result = parse_response(response)
306
+ except ParseError as e:
307
+ # Retry with error feedback (persisted across iterations, since prompt is rebuilt each pass)
308
+ error_feedback = f"\n\nYour previous response had an error: {e}\nPlease try again."
309
+ continue
310
+
311
+ if result["status"] == "clean":
312
+ return
313
+
314
+ if result["code"]:
315
+ self.functions.append({
316
+ "name": result["name"],
317
+ "docstring": result["docstring"],
318
+ "code": result["code"]
319
+ })
320
+
321
+ print(f" Warning: chunk {chunk_idx} hit max iterations")
322
+
323
+ def _build_prompt(self, chunk):
324
+ context = build_context(self.functions, self.context_budget)
325
+ return PROMPT_TEMPLATE.format(
326
+ instructions=self.instructions,
327
+ context=context,
328
+ chunk=chunk
329
+ )
330
+
331
+ def _write_output(self):
332
+ # Generate cleaning_functions.py with all functions
333
+ # and a clean_data() entrypoint
334
+ ...
335
+ ```
336
+
337
+ ## Prompt Template
338
+
339
+ ```python
340
+ PROMPT_TEMPLATE = '''You are a data cleaning expert. Analyze data and generate Python functions to fix issues.
341
+
342
+ === USER'S CLEANING GOALS ===
343
+ {instructions}
344
+
345
+ === EXISTING FUNCTIONS (DO NOT RECREATE) ===
346
+ {context}
347
+
348
+ === DATA CHUNK ===
349
+ {chunk}
350
+
351
+ === TASK ===
352
+ 1. List ALL data quality issues
353
+ 2. Mark each as solved="true" if an existing function handles it
354
+ 3. Generate code for ONLY the FIRST unsolved issue
355
+ 4. Use this EXACT format:
356
+
357
+ <cleaning_analysis>
358
+ <issues_detected>
359
+ <issue id="1" solved="true|false">Description</issue>
360
+ </issues_detected>
361
+
362
+ <function_to_generate>
363
+ <name>function_name</name>
364
+ <docstring>What it does, edge cases handled</docstring>
365
+ <code>
366
+ ```python
367
+ def function_name(data):
368
+ # Complete implementation
369
+ pass
370
+ ```
371
+ </code>
372
+ </function_to_generate>
373
+
374
+ <chunk_status>clean|needs_more_work</chunk_status>
375
+ </cleaning_analysis>
376
+
377
+ RULES:
378
+ - ONE function per response
379
+ - If all issues solved: <chunk_status>clean</chunk_status>, omit <function_to_generate>
380
+ - Include imports in function or at top
381
+ - Function must be idempotent'''
382
+ ```
383
+
384
+ ## Dependencies
385
+
386
+ ```toml
387
+ [project]
388
+ dependencies = [
389
+ "tenacity>=8.0", # Retry logic (battle-tested, 1 decorator)
390
+ ]
391
+ ```
392
+
393
+ That's it. No langchain, no frameworks, no abstractions.
394
+
395
+ ## Edge Cases
396
+
397
+ | Case | Handling |
398
+ |------|----------|
399
+ | Malformed XML | Retry with error appended to prompt (max 3) |
400
+ | Invalid Python | Retry with syntax error in prompt (max 3) |
401
+ | `__main__` imports | Reject during parsing, retry with error feedback |
402
+ | Duplicate functions | Skip duplicates, keep first occurrence |
403
+ | Invalid combined output | Fall back to writing only valid functions |
404
+ | Chunk never "clean" | Skip after 5 iterations, log warning |
405
+ | Empty chunk | Skip without LLM call |
406
+ | Context too large | FIFO eviction, keep most recent functions |
407
+
408
+ ## Known Limitations
409
+
410
+ 1. **Stateful operations** (deduplication, aggregations) only work within chunks, not globally
411
+ 2. ~~**Function ordering** follows generation order, not dependency order~~ → Fixed in v0.4.0 (dependency resolution)
412
+ 3. ~~**No runtime testing** of generated functions before output~~ → Fixed in v0.2.0 (runtime validation)
413
+ 4. ~~**Redundant functions** when similar issues appear in different chunks~~ → Fixed in v0.5.0 (two-pass consolidation)
414
+
415
+ ## LLM Agency (v0.5.0)
416
+
417
+ The LLM now has agency over key decisions:
418
+
419
+ | Decision Point | LLM Decides |
420
+ |----------------|-------------|
421
+ | Chunk cleanliness | `chunk_status: clean/needs_more_work` |
422
+ | Consolidation complete | `complete: true/false` in self-assessment |
423
+ | Pattern saturation | `saturated: true/false` for early termination |
424
+
425
+ This follows the wu wei principle: let the model that understands the data make decisions about the data.
426
+
427
+ ## Observability (v0.6.0)
428
+
429
+ New features for monitoring and analysis:
430
+
431
+ | Feature | Description |
432
+ |---------|-------------|
433
+ | Latency Metrics | Track LLM call timing (min/max/avg/total) via `llm_call` events |
434
+ | Import Consolidation | Merge duplicate imports, combine `from x import a, b` |
435
+ | Cleaning Report | Markdown summary with functions, metrics, latency stats |
436
+ | Dry-Run Mode | Analyze data without generating functions (`dry_run=True`) |
437
+
438
+ New events emitted:
439
+ - `llm_call` - After each LLM call with `latency_ms`
440
+ - `issues_detected` - In dry-run mode with detected issues
441
+ - `dry_run_complete` - End of dry run with stats
442
+ - `complete` now includes `latency_stats` dict
443
+
444
+ ## Success Criteria
445
+
446
+ User with 500MB JSONL + clear instructions can:
447
+ 1. Write 5 lines of setup
448
+ 2. Run and walk away
449
+ 3. Return to working `cleaning_functions.py`
450
+ 4. Tweak edge cases
451
+ 5. Apply to full dataset
452
+
453
+ ---
454
+
455
+ **For A/B testing with advanced patterns, see `CLAUDE_ADVANCED.md`**