recursive-cleaner 0.6.0__tar.gz → 0.6.1__tar.gz
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/PKG-INFO +7 -2
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/README.md +5 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/pyproject.toml +2 -2
- recursive_cleaner-0.6.0/CLAUDE_ADVANCED.md +0 -955
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/.gitignore +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/CLAUDE.md +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/LICENSE +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/TODO.md +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/backends/__init__.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/backends/mlx_backend.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/docs/contracts/api-contract.md +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/docs/contracts/data-schema.md +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/docs/contracts/success-criteria.md +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/docs/contracts/text-mode-contract.md +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/docs/contracts/tier2-contract.md +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/docs/contracts/tier4-contract.md +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/docs/contracts/tier4-success-criteria.md +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/docs/contracts/two-pass-contract.md +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/docs/handoffs/tier4-handoff.md +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/docs/implementation-plan-tier4.md +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/docs/implementation-plan-v03.md +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/docs/implementation-plan-v04.md +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/docs/implementation-plan-v05.md +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/docs/implementation-plan.md +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/docs/langchain-analysis.md +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/docs/langgraph-analysis.md +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/docs/mlx-lm-guide.md +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/docs/other-frameworks-analysis.md +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/docs/refactor-assessment/data/dependency.json +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/docs/refactor-assessment/data/stats.json +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/docs/refactor-assessment/plan.md +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/docs/refactor-assessment/report.md +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/docs/research/chonkie-extraction.md +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/docs/research/chonkie.md +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/docs/research/markitdown.md +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/docs/smolagents-analysis.md +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/docs/workflow-state.md +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/recursive_cleaner/__init__.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/recursive_cleaner/cleaner.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/recursive_cleaner/context.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/recursive_cleaner/dependencies.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/recursive_cleaner/errors.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/recursive_cleaner/metrics.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/recursive_cleaner/optimizer.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/recursive_cleaner/output.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/recursive_cleaner/parsers.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/recursive_cleaner/prompt.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/recursive_cleaner/report.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/recursive_cleaner/response.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/recursive_cleaner/schema.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/recursive_cleaner/types.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/recursive_cleaner/validation.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/recursive_cleaner/vendor/__init__.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/recursive_cleaner/vendor/chunker.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/test_cases/ecommerce_instructions.txt +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/test_cases/ecommerce_products.jsonl +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/test_cases/financial_instructions.txt +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/test_cases/financial_transactions.jsonl +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/test_cases/healthcare_instructions.txt +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/test_cases/healthcare_patients.jsonl +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/test_cases/run_ecommerce_test.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/test_cases/run_financial_test.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/test_cases/run_healthcare_test.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/tests/__init__.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/tests/test_callbacks.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/tests/test_cleaner.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/tests/test_context.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/tests/test_dependencies.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/tests/test_dry_run.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/tests/test_holdout.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/tests/test_incremental.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/tests/test_integration.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/tests/test_latency.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/tests/test_metrics.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/tests/test_optimizer.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/tests/test_output.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/tests/test_parsers.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/tests/test_report.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/tests/test_sampling.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/tests/test_schema.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/tests/test_text_mode.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/tests/test_validation.py +0 -0
- {recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/tests/test_vendor_chunker.py +0 -0
{recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/PKG-INFO

@@ -1,11 +1,11 @@
 Metadata-Version: 2.4
 Name: recursive-cleaner
-Version: 0.6.0
+Version: 0.6.1
 Summary: LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions
 Project-URL: Homepage, https://github.com/gaztrabisme/recursive-data-cleaner
 Project-URL: Repository, https://github.com/gaztrabisme/recursive-data-cleaner
 Project-URL: Issues, https://github.com/gaztrabisme/recursive-data-cleaner/issues
-Author: Gary
+Author: Gary Tran
 License-Expression: MIT
 License-File: LICENSE
 Keywords: automation,data-cleaning,data-quality,etl,llm,machine-learning

@@ -277,6 +277,11 @@ pytest tests/ -v
 | v0.2.0 | Runtime validation, schema inference, callbacks, incremental saves |
 | v0.1.0 | Core pipeline, chunking, docstring registry |
 
+## Acknowledgments
+
+- Sentence-aware text chunking adapted from [Chonkie](https://github.com/chonkie-inc/chonkie) (MIT License)
+- Development assisted by [Claude Code](https://claude.ai/claude-code)
+
 ## License
 
 MIT
{recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/README.md

@@ -245,6 +245,11 @@ pytest tests/ -v
 | v0.2.0 | Runtime validation, schema inference, callbacks, incremental saves |
 | v0.1.0 | Core pipeline, chunking, docstring registry |
 
+## Acknowledgments
+
+- Sentence-aware text chunking adapted from [Chonkie](https://github.com/chonkie-inc/chonkie) (MIT License)
+- Development assisted by [Claude Code](https://claude.ai/claude-code)
+
 ## License
 
 MIT
{recursive_cleaner-0.6.0 → recursive_cleaner-0.6.1}/pyproject.toml

@@ -4,13 +4,13 @@ build-backend = "hatchling.build"
 
 [project]
 name = "recursive-cleaner"
-version = "0.6.0"
+version = "0.6.1"
 description = "LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions"
 readme = "README.md"
 license = "MIT"
 requires-python = ">=3.10"
 authors = [
-    { name = "Gary" },
+    { name = "Gary Tran" },
 ]
 keywords = [
     "data-cleaning",
recursive_cleaner-0.6.0/CLAUDE_ADVANCED.md (deleted)

@@ -1,955 +0,0 @@
-# CLAUDE_ADVANCED.md - Framework-Inspired Architecture
-
-> **For A/B testing against `CLAUDE.md` (lean version)**
->
-> This document describes an architecture borrowing patterns from smolagents, LangChain, LangGraph, AutoGen, CrewAI, and Pydantic-AI. More extensible, more abstracted, potentially more robust for complex use cases.
-
-## Project Overview
-
-Same as lean version: LLM-powered incremental data cleaning with docstring feedback loops.
-
-**Philosophy**: Borrow battle-tested patterns from production agent frameworks. More infrastructure upfront, potentially better error recovery and extensibility.
-
-## Architecture Patterns Adopted
-
-### From smolagents (HuggingFace)
-- Typed memory steps with dataclasses
-- Error hierarchy with logging integration
-- AST-based code validation
-- Callback system for extensibility
-- Jinja2 prompt templates
-
-### From LangChain
-- Tenacity retry with exponential backoff + jitter
-- Output fixing parser (LLM-powered error recovery)
-- Summary buffer memory for context management
-- Structured exception hierarchy
-
-### From LangGraph
-- TypedDict state with annotated reducers
-- Checkpoint interface for persistence
-- Conditional routing for iteration control
-
-### From Pydantic-AI
-- Output validator pattern (separated validation)
-- ModelRetry exception for retry-with-feedback
-- Clean exception categorization
-
-### From CrewAI
-- Guardrail system for chunk completion validation
-- Task-centric workflow definition
-
----
-
-## File Structure (~800 lines total)
-
-```
-recursive_cleaner/
-    __init__.py
-    cleaner.py       # Main orchestrator (~200 lines)
-    state.py         # TypedDict state + reducers (~50 lines)
-    memory.py        # Summary buffer registry (~100 lines)
-    parsing.py       # XML + code extraction (~80 lines)
-    validation.py    # Guardrails + validators (~100 lines)
-    errors.py        # Exception hierarchy (~40 lines)
-    callbacks.py     # Event hooks (~60 lines)
-    prompts/
-        system.yaml  # Jinja2 templates
-        retry.yaml
-    checkpoints.py   # Optional persistence (~80 lines)
-
-pyproject.toml
-```
-
----
-
-## Exception Hierarchy (errors.py)
-
-Inspired by smolagents + Pydantic-AI:
-
-```python
-from dataclasses import dataclass
-from typing import Any
-
-class CleanerError(Exception):
-    """Base exception for all pipeline errors"""
-    def __init__(self, message: str, context: dict[str, Any] | None = None):
-        super().__init__(message)
-        self.message = message
-        self.context = context or {}
-
-class ParseError(CleanerError):
-    """XML parsing failed"""
-    pass
-
-class CodeValidationError(CleanerError):
-    """Generated Python code is invalid"""
-    def __init__(self, message: str, code: str, syntax_error: str):
-        super().__init__(message, {"code": code, "syntax_error": syntax_error})
-        self.code = code
-        self.syntax_error = syntax_error
-
-class RetryableError(CleanerError):
-    """Error that should trigger retry with feedback to LLM"""
-    def __init__(self, message: str, retry_prompt: str):
-        super().__init__(message)
-        self.retry_prompt = retry_prompt
-
-class MaxIterationsError(CleanerError):
-    """Chunk processing exceeded iteration limit"""
-    def __init__(self, chunk_index: int, iterations: int):
-        super().__init__(f"Chunk {chunk_index} exceeded {iterations} iterations")
-        self.chunk_index = chunk_index
-        self.iterations = iterations
-
-class ChunkSkippedError(CleanerError):
-    """Chunk was skipped due to unrecoverable errors"""
-    pass
-```
-
----
-
-## Typed State with Reducers (state.py)
-
-Inspired by LangGraph:
-
-```python
-from typing import Annotated, TypedDict
-from dataclasses import dataclass, field
-from datetime import datetime
-
-def merge_dicts(current: dict, new: dict) -> dict:
-    """Reducer: merge new keys into existing dict"""
-    result = current.copy()
-    result.update(new)
-    return result
-
-def append_list(current: list, new: list) -> list:
-    """Reducer: append new items to list"""
-    return current + new
-
-@dataclass
-class GeneratedFunction:
-    name: str
-    docstring: str
-    code: str
-    issues_solved: list[str]
-    chunk_index: int
-    created_at: datetime = field(default_factory=datetime.now)
-
-@dataclass
-class ChunkResult:
-    chunk_index: int
-    iterations: int
-    status: str  # "clean" | "max_iterations" | "skipped"
-    functions_generated: list[str]
-    issues_found: int
-    issues_solved: int
-
-class PipelineState(TypedDict):
-    # Accumulated across all chunks
-    functions: Annotated[list[GeneratedFunction], append_list]
-    docstring_registry: Annotated[dict[str, str], merge_dicts]
-    chunk_results: Annotated[list[ChunkResult], append_list]
-
-    # Current processing context
-    current_chunk_index: int
-    current_iteration: int
-    total_chunks: int
-
-    # Metadata
-    file_path: str
-    started_at: str
-    errors: Annotated[list[dict], append_list]
-```
-
----
-
-## Summary Buffer Memory (memory.py)
-
-Inspired by LangChain's ConversationSummaryBufferMemory:
-
-```python
-from dataclasses import dataclass
-from typing import Protocol
-
-class LLMBackend(Protocol):
-    def generate(self, prompt: str) -> str: ...
-
-@dataclass
-class DocstringRegistry:
-    """
-    Manages docstring context with token budget.
-    When budget exceeded: summarize old functions, keep recent verbatim.
-    """
-    llm_backend: LLMBackend
-    max_tokens: int = 4000
-    summary: str = ""
-    recent_functions: list[dict] = None  # {name, docstring}
-
-    def __post_init__(self):
-        self.recent_functions = self.recent_functions or []
-
-    def add(self, name: str, docstring: str):
-        self.recent_functions.append({"name": name, "docstring": docstring})
-        self._prune_if_needed()
-
-    def _count_tokens(self) -> int:
-        # Approximate: 1 token ~= 4 chars
-        total = len(self.summary)
-        for f in self.recent_functions:
-            total += len(f["name"]) + len(f["docstring"]) + 10
-        return total // 4
-
-    def _prune_if_needed(self):
-        if self._count_tokens() <= self.max_tokens:
-            return
-
-        # Move oldest to summary
-        to_summarize = []
-        while self._count_tokens() > self.max_tokens * 0.7 and self.recent_functions:
-            to_summarize.append(self.recent_functions.pop(0))
-
-        if to_summarize:
-            self._update_summary(to_summarize)
-
-    def _update_summary(self, functions: list[dict]):
-        func_text = "\n".join(f"- {f['name']}: {f['docstring'][:100]}..." for f in functions)
-        prompt = f"""Summarize these data cleaning functions in 2-3 sentences.
-Focus on what types of data issues they handle.
-
-Previous summary: {self.summary or '(none)'}
-
-New functions:
-{func_text}
-
-Concise summary:"""
-
-        self.summary = self.llm_backend.generate(prompt).strip()
-
-    def get_context(self) -> str:
-        parts = []
-        if self.summary:
-            parts.append(f"**Previously generated functions (summarized):**\n{self.summary}")
-        if self.recent_functions:
-            recent = "\n\n".join(
-                f"## {f['name']}\n{f['docstring']}"
-                for f in self.recent_functions
-            )
-            parts.append(f"**Recent functions (full docstrings):**\n{recent}")
-        return "\n\n".join(parts) or "(No functions generated yet)"
-
-
-# Alternative: Vector-based retrieval for large registries
-@dataclass
-class VectorDocstringRegistry:
-    """
-    For 50+ functions: embed docstrings, retrieve top-K relevant per chunk.
-    Requires: chromadb or faiss
-    """
-    embedder: any  # User-provided embedding function
-    store: any  # Vector store
-    top_k: int = 10
-
-    def add(self, name: str, docstring: str):
-        embedding = self.embedder(docstring)
-        self.store.add(name, docstring, embedding)
-
-    def get_context(self, chunk_text: str) -> str:
-        """Retrieve most relevant docstrings for this chunk"""
-        chunk_embedding = self.embedder(chunk_text)
-        results = self.store.query(chunk_embedding, k=self.top_k)
-        return "\n\n".join(f"## {r.name}\n{r.docstring}" for r in results)
-```
-
----
-
-## Validation & Guardrails (validation.py)
-
-Inspired by CrewAI + Pydantic-AI:
-
-```python
-import ast
-from dataclasses import dataclass
-from typing import Callable
-
-@dataclass
-class ValidationResult:
-    success: bool
-    error: str | None = None
-    retry_prompt: str | None = None
-
-class CodeValidator:
-    """AST-based validation of generated Python code"""
-
-    DANGEROUS_IMPORTS = {"os", "subprocess", "sys", "shutil", "pathlib"}
-    DANGEROUS_CALLS = {"eval", "exec", "compile", "__import__"}
-
-    def validate(self, code: str) -> ValidationResult:
-        # Syntax check
-        try:
-            tree = ast.parse(code)
-        except SyntaxError as e:
-            return ValidationResult(
-                success=False,
-                error=f"Syntax error: {e}",
-                retry_prompt=f"Your code has a syntax error on line {e.lineno}: {e.msg}\nPlease fix and regenerate."
-            )
-
-        # Check for dangerous patterns
-        for node in ast.walk(tree):
-            if isinstance(node, ast.Import):
-                for alias in node.names:
-                    if alias.name.split('.')[0] in self.DANGEROUS_IMPORTS:
-                        return ValidationResult(
-                            success=False,
-                            error=f"Dangerous import: {alias.name}",
-                            retry_prompt=f"Do not import {alias.name}. Use only safe data processing imports."
-                        )
-
-            if isinstance(node, ast.Call):
-                if isinstance(node.func, ast.Name) and node.func.id in self.DANGEROUS_CALLS:
-                    return ValidationResult(
-                        success=False,
-                        error=f"Dangerous function: {node.func.id}",
-                        retry_prompt=f"Do not use {node.func.id}(). It's a security risk."
-                    )
-
-        # Check has docstring
-        for node in ast.walk(tree):
-            if isinstance(node, ast.FunctionDef):
-                if not ast.get_docstring(node):
-                    return ValidationResult(
-                        success=False,
-                        error="Function missing docstring",
-                        retry_prompt="Your function must have a docstring explaining what it does."
-                    )
-
-        return ValidationResult(success=True)
-
-
-@dataclass
-class ChunkGuardrail:
-    """Validates chunk processing results"""
-    max_iterations: int = 5
-    code_validator: CodeValidator = None
-
-    def __post_init__(self):
-        self.code_validator = self.code_validator or CodeValidator()
-
-    def check(self, result: dict, iteration: int) -> ValidationResult:
-        # Check if done
-        if result.get("status") == "clean":
-            return ValidationResult(success=True)
-
-        # Check iteration limit
-        if iteration >= self.max_iterations:
-            return ValidationResult(
-                success=False,
-                error=f"Max iterations ({self.max_iterations}) reached"
-            )
-
-        # Validate generated code
-        if result.get("code"):
-            code_result = self.code_validator.validate(result["code"])
-            if not code_result.success:
-                return code_result
-
-        return ValidationResult(success=True)
-
-
-class OutputFixingParser:
-    """
-    LLM-powered error recovery for malformed outputs.
-    Inspired by LangChain's OutputFixingParser.
-    """
-
-    def __init__(self, llm_backend, max_retries: int = 2):
-        self.llm_backend = llm_backend
-        self.max_retries = max_retries
-
-    def parse_with_fix(self, text: str, parse_fn: Callable, original_prompt: str) -> dict:
-        last_error = None
-
-        for attempt in range(self.max_retries + 1):
-            try:
-                return parse_fn(text)
-            except Exception as e:
-                last_error = e
-                if attempt < self.max_retries:
-                    fix_prompt = f"""Your previous response could not be parsed.
-
-Original request:
-{original_prompt[:500]}...
-
-Your response:
-{text[:1000]}...
-
-Error: {e}
-
-Please provide your response again, ensuring valid XML format with the code wrapped in ```python blocks."""
-
-                    text = self.llm_backend.generate(fix_prompt)
-
-        raise last_error
-```
-
----
-
-## Callback System (callbacks.py)
-
-Inspired by smolagents + LangChain:
-
-```python
-from abc import ABC
-from dataclasses import dataclass
-from datetime import datetime
-from typing import Any
-
-@dataclass
-class Event:
-    timestamp: datetime
-    event_type: str
-    data: dict[str, Any]
-
-class CleanerCallback(ABC):
-    """Base class for pipeline event handlers"""
-
-    def on_run_start(self, file_path: str, total_chunks: int): pass
-    def on_chunk_start(self, chunk_index: int, chunk_preview: str): pass
-    def on_iteration_start(self, chunk_index: int, iteration: int): pass
-    def on_llm_call(self, prompt_preview: str): pass
-    def on_llm_response(self, response_preview: str, latency_ms: float): pass
-    def on_function_generated(self, name: str, docstring: str): pass
-    def on_validation_error(self, error: str, retry_prompt: str | None): pass
-    def on_chunk_complete(self, chunk_index: int, status: str, iterations: int): pass
-    def on_run_complete(self, total_functions: int, total_errors: int): pass
-    def on_error(self, error: Exception, context: dict): pass
-
-
-class LoggingCallback(CleanerCallback):
-    """Logs events to console and file"""
-
-    def __init__(self, log_file: str | None = None, verbose: bool = True):
-        self.log_file = log_file
-        self.verbose = verbose
-        self.events: list[Event] = []
-
-    def _log(self, message: str):
-        timestamp = datetime.now().strftime("%H:%M:%S")
-        line = f"[{timestamp}] {message}"
-        if self.verbose:
-            print(line)
-        if self.log_file:
-            with open(self.log_file, "a") as f:
-                f.write(line + "\n")
-
-    def on_run_start(self, file_path: str, total_chunks: int):
-        self._log(f"Starting pipeline: {file_path} ({total_chunks} chunks)")
-
-    def on_chunk_start(self, chunk_index: int, chunk_preview: str):
-        self._log(f"Processing chunk {chunk_index}...")
-
-    def on_function_generated(self, name: str, docstring: str):
-        self._log(f" Generated: {name}()")
-
-    def on_validation_error(self, error: str, retry_prompt: str | None):
-        self._log(f" Validation error: {error}")
-
-    def on_chunk_complete(self, chunk_index: int, status: str, iterations: int):
-        self._log(f" Chunk {chunk_index} complete: {status} ({iterations} iterations)")
-
-    def on_run_complete(self, total_functions: int, total_errors: int):
-        self._log(f"Pipeline complete! {total_functions} functions, {total_errors} errors")
-
-
-class MetricsCallback(CleanerCallback):
-    """Collects metrics for analysis"""
-
-    def __init__(self):
-        self.metrics = {
-            "llm_calls": 0,
-            "total_latency_ms": 0,
-            "functions_generated": 0,
-            "chunks_processed": 0,
-            "chunks_skipped": 0,
-            "validation_errors": 0,
-            "retries": 0,
-        }
-
-    def on_llm_response(self, response_preview: str, latency_ms: float):
-        self.metrics["llm_calls"] += 1
-        self.metrics["total_latency_ms"] += latency_ms
-
-    def on_function_generated(self, name: str, docstring: str):
-        self.metrics["functions_generated"] += 1
-
-    def on_validation_error(self, error: str, retry_prompt: str | None):
-        self.metrics["validation_errors"] += 1
-        if retry_prompt:
-            self.metrics["retries"] += 1
-
-    def get_summary(self) -> dict:
-        m = self.metrics
-        return {
-            **m,
-            "avg_latency_ms": m["total_latency_ms"] / max(m["llm_calls"], 1),
-        }
-```
-
----
-
-## Jinja2 Prompt Templates (prompts/system.yaml)
-
-Inspired by smolagents:
-
-```yaml
-# prompts/system.yaml
-system_prompt: |
-  You are a data cleaning expert. Your job is to analyze data chunks and generate Python functions to fix quality issues.
-
-  === USER'S CLEANING GOALS ===
-  {{ instructions }}
-
-  === EXISTING CLEANING FUNCTIONS ===
-  {% if docstring_context %}
-  {{ docstring_context }}
-  {% else %}
-  (No functions generated yet)
-  {% endif %}
-
-  === DATA CHUNK {{ chunk_index }}/{{ total_chunks }} ===
-  {{ chunk_data }}
-
-  === YOUR TASK ===
-  1. Identify ALL data quality issues in this chunk
-  2. Check if each issue is already solved by existing functions
-  3. Generate code for ONLY the FIRST unsolved issue
-  4. Return your analysis in this EXACT format:
-
-  <cleaning_analysis>
-  <issues_detected>
-  <issue id="1" solved="true|false">Description</issue>
-  </issues_detected>
-
-  <function_to_generate>
-  <name>function_name</name>
-  <docstring>
-  What this function does.
-  What edge cases it handles.
-  </docstring>
-  <code>
-  ```python
-  def function_name(data):
-      """Docstring here"""
-      # Implementation
-      pass
-  ```
-  </code>
-  </function_to_generate>
-
-  <chunk_status>clean|needs_more_work</chunk_status>
-  </cleaning_analysis>
-
-  CRITICAL RULES:
-  - Generate EXACTLY ONE function per response
-  - If all issues solved: set <chunk_status>clean</chunk_status> and omit <function_to_generate>
-  - Include all imports at the top of the code block
-  - Functions must be idempotent (safe to run multiple times)
-  - Always include a docstring in your function
-```
-
-```yaml
-# prompts/retry.yaml
-retry_prompt: |
-  Your previous response had an error:
-
-  {{ error_message }}
-
-  {% if retry_hint %}
-  Hint: {{ retry_hint }}
-  {% endif %}
-
-  Please regenerate your response in the correct format.
-```
-
----
-
-## Checkpoint Persistence (checkpoints.py)
-
-Inspired by LangGraph:
-
-```python
-import json
-from abc import ABC, abstractmethod
-from dataclasses import asdict
-from datetime import datetime
-from pathlib import Path
-from typing import Any
-
-class CheckpointSaver(ABC):
-    """Abstract interface for state persistence"""
-
-    @abstractmethod
-    def save(self, state: dict, step: int) -> None: ...
-
-    @abstractmethod
-    def load(self, step: int | None = None) -> dict | None: ...
-
-    @abstractmethod
-    def list_steps(self) -> list[int]: ...
-
-    @abstractmethod
-    def delete(self, step: int) -> None: ...
-
-
-class FileCheckpointSaver(CheckpointSaver):
-    """Save checkpoints to JSON files"""
-
-    def __init__(self, directory: str = ".checkpoints"):
-        self.directory = Path(directory)
-        self.directory.mkdir(exist_ok=True)
-
-    def _path(self, step: int) -> Path:
-        return self.directory / f"checkpoint_{step:06d}.json"
-
-    def save(self, state: dict, step: int) -> None:
-        data = {
-            "step": step,
-            "timestamp": datetime.now().isoformat(),
-            "state": self._serialize(state),
-        }
-        with open(self._path(step), "w") as f:
-            json.dump(data, f, indent=2, default=str)
-
-    def load(self, step: int | None = None) -> dict | None:
-        if step is None:
-            steps = self.list_steps()
-            if not steps:
-                return None
-            step = max(steps)
-
-        path = self._path(step)
-        if not path.exists():
-            return None
-
-        with open(path) as f:
-            data = json.load(f)
-        return data["state"]
-
-    def list_steps(self) -> list[int]:
-        steps = []
-        for path in self.directory.glob("checkpoint_*.json"):
-            try:
-                step = int(path.stem.split("_")[1])
-                steps.append(step)
-            except (IndexError, ValueError):
-                continue
-        return sorted(steps)
-
-    def delete(self, step: int) -> None:
-        path = self._path(step)
-        if path.exists():
-            path.unlink()
-
-    def _serialize(self, obj: Any) -> Any:
-        if hasattr(obj, "__dict__"):
-            return {k: self._serialize(v) for k, v in obj.__dict__.items()}
-        if isinstance(obj, list):
-            return [self._serialize(v) for v in obj]
-        if isinstance(obj, dict):
-            return {k: self._serialize(v) for k, v in obj.items()}
-        if isinstance(obj, datetime):
-            return obj.isoformat()
-        return obj
-
-
-class InMemoryCheckpointSaver(CheckpointSaver):
-    """For testing - stores checkpoints in memory"""
-
-    def __init__(self):
-        self.checkpoints: dict[int, dict] = {}
-
-    def save(self, state: dict, step: int) -> None:
-        self.checkpoints[step] = state.copy()
-
-    def load(self, step: int | None = None) -> dict | None:
-        if step is None:
-            if not self.checkpoints:
-                return None
-            step = max(self.checkpoints.keys())
-        return self.checkpoints.get(step)
-
-    def list_steps(self) -> list[int]:
-        return sorted(self.checkpoints.keys())
-
-    def delete(self, step: int) -> None:
-        self.checkpoints.pop(step, None)
-```
-
----
-
-## Main Orchestrator (cleaner.py)
-
-```python
-import time
-from dataclasses import dataclass, field
-from datetime import datetime
-from typing import Protocol
-from jinja2 import Environment, PackageLoader
-
-from .state import PipelineState, GeneratedFunction, ChunkResult
-from .memory import DocstringRegistry
-from .validation import ChunkGuardrail, OutputFixingParser, CodeValidator
-from .callbacks import CleanerCallback, LoggingCallback
-from .checkpoints import CheckpointSaver
-from .errors import *
-from .parsing import parse_response
-
-from tenacity import retry, stop_after_attempt, wait_exponential_jitter
-
-class LLMBackend(Protocol):
-    def generate(self, prompt: str) -> str: ...
-
-@dataclass
-class DataCleaner:
-    llm_backend: LLMBackend
-    file_path: str
-    instructions: str
-    chunk_size: int = 50
-    max_iterations: int = 5
-    context_budget: int = 4000
-    callbacks: list[CleanerCallback] = field(default_factory=list)
-    checkpointer: CheckpointSaver | None = None
-    checkpoint_interval: int = 10  # Save every N chunks
-
-    def __post_init__(self):
-        self.env = Environment(loader=PackageLoader("recursive_cleaner", "prompts"))
-        self.system_template = self.env.get_template("system.yaml")
-        self.retry_template = self.env.get_template("retry.yaml")
-
-        self.registry = DocstringRegistry(self.llm_backend, self.context_budget)
-        self.guardrail = ChunkGuardrail(self.max_iterations)
-        self.output_fixer = OutputFixingParser(self.llm_backend)
-
-        self.state: PipelineState = {
-            "functions": [],
-            "docstring_registry": {},
-            "chunk_results": [],
-            "current_chunk_index": 0,
-            "current_iteration": 0,
-            "total_chunks": 0,
-            "file_path": self.file_path,
-            "started_at": datetime.now().isoformat(),
-            "errors": [],
-        }
-
-        # Add default logging callback if none provided
-        if not self.callbacks:
-            self.callbacks.append(LoggingCallback())
-
-    def run(self, resume_from: int | None = None):
-        """Run the cleaning pipeline"""
-        chunks = self._load_chunks()
-        self.state["total_chunks"] = len(chunks)
-
-        # Resume from checkpoint if requested
-        start_index = 0
-        if resume_from is not None and self.checkpointer:
-            saved_state = self.checkpointer.load(resume_from)
-            if saved_state:
-                self.state = saved_state
-                start_index = saved_state["current_chunk_index"]
-
-        self._emit("on_run_start", self.file_path, len(chunks))
-
-        for i, chunk in enumerate(chunks[start_index:], start=start_index):
-            self.state["current_chunk_index"] = i
-            self._emit("on_chunk_start", i, chunk[:100])
-
-            try:
-                result = self._process_chunk(chunk, i)
-                self.state["chunk_results"].append(result)
-            except ChunkSkippedError as e:
-                self.state["errors"].append({"chunk": i, "error": str(e)})
-
-            # Checkpoint periodically
-            if self.checkpointer and i % self.checkpoint_interval == 0:
-                self.checkpointer.save(self.state, i)
-
-        self._write_output()
-        self._emit("on_run_complete", len(self.state["functions"]), len(self.state["errors"]))
-
-    def _process_chunk(self, chunk: str, chunk_index: int) -> ChunkResult:
-        functions_generated = []
-        issues_found = 0
-
-        for iteration in range(self.max_iterations):
-            self.state["current_iteration"] = iteration
-            self._emit("on_iteration_start", chunk_index, iteration)
-
-            prompt = self._build_prompt(chunk, chunk_index)
-
-            # Call LLM with retry
-            start_time = time.time()
-            try:
-                response = self._call_llm_with_retry(prompt)
-            except Exception as e:
-                self._emit("on_error", e, {"chunk": chunk_index, "iteration": iteration})
-                raise ChunkSkippedError(f"LLM call failed: {e}")
-
-            latency_ms = (time.time() - start_time) * 1000
-            self._emit("on_llm_response", response[:100], latency_ms)
-
-            # Parse with auto-fix
-            try:
-                result = self.output_fixer.parse_with_fix(response, parse_response, prompt)
-            except Exception as e:
-                self._emit("on_validation_error", str(e), None)
-                continue
-
-            issues_found = len(result.get("issues", []))
-
-            # Validate
-            validation = self.guardrail.check(result, iteration)
-            if not validation.success:
-                if validation.retry_prompt:
-                    self._emit("on_validation_error", validation.error, validation.retry_prompt)
-                    # Retry with feedback
-                    continue
-                else:
-                    # Non-recoverable
-                    break
-
-            # Check if clean
-            if result.get("status") == "clean":
-                self._emit("on_chunk_complete", chunk_index, "clean", iteration + 1)
-                return ChunkResult(
-                    chunk_index=chunk_index,
-                    iterations=iteration + 1,
-                    status="clean",
-                    functions_generated=functions_generated,
-                    issues_found=issues_found,
-                    issues_solved=issues_found,
-                )
-
-            # Store generated function
-            if result.get("code"):
-                func = GeneratedFunction(
-                    name=result["name"],
-                    docstring=result["docstring"],
-                    code=result["code"],
-                    issues_solved=[],  # Could parse from issues
-                    chunk_index=chunk_index,
-                )
-                self.state["functions"].append(func)
-                self.registry.add(func.name, func.docstring)
-                functions_generated.append(func.name)
-                self._emit("on_function_generated", func.name, func.docstring)
-
-        # Max iterations reached
-        self._emit("on_chunk_complete", chunk_index, "max_iterations", self.max_iterations)
-        return ChunkResult(
-            chunk_index=chunk_index,
-            iterations=self.max_iterations,
-            status="max_iterations",
-            functions_generated=functions_generated,
-            issues_found=issues_found,
-            issues_solved=len(functions_generated),
-        )
-
-    @retry(
-        stop=stop_after_attempt(3),
-        wait=wait_exponential_jitter(initial=1, max=30, jitter=5),
-    )
-    def _call_llm_with_retry(self, prompt: str) -> str:
-        self._emit("on_llm_call", prompt[:100])
-        return self.llm_backend.generate(prompt)
-
-    def _build_prompt(self, chunk: str, chunk_index: int) -> str:
-        return self.system_template.render(
-            instructions=self.instructions,
-            docstring_context=self.registry.get_context(),
-            chunk_index=chunk_index + 1,
-            total_chunks=self.state["total_chunks"],
-            chunk_data=chunk,
-        )
-
-    def _emit(self, event: str, *args, **kwargs):
-        for callback in self.callbacks:
-            method = getattr(callback, event, None)
-            if method:
-                try:
-                    method(*args, **kwargs)
-                except Exception:
-                    pass  # Don't let callback errors break pipeline
-
-    def _load_chunks(self) -> list[str]:
-        # Implementation for text/csv/json chunking
-        ...
-
-    def _write_output(self):
-        # Generate cleaning_functions.py
-        ...
-```
-
----
-
-## Dependencies
-
-```toml
-[project]
-dependencies = [
-    "tenacity>=8.0",  # Retry logic
-    "jinja2>=3.0",  # Prompt templates
-]
-
-[project.optional-dependencies]
-vector = [
-    "chromadb>=0.4",  # Vector store for large registries
-]
-```
-
----
-
-## Comparison: Lean vs Advanced
-
-| Feature | Lean (CLAUDE.md) | Advanced (this file) |
-|---------|------------------|----------------------|
-| Lines of code | ~300 | ~800 |
-| Dependencies | tenacity | tenacity, jinja2, (chromadb) |
-| Error handling | 3 exception classes | 6 exception classes + hierarchy |
-| Context management | FIFO eviction | Summary buffer + optional vector |
-| Output recovery | Retry with error msg | LLM-powered output fixer |
-| Validation | ast.parse() | AST + dangerous code detection |
-| Extensibility | None | Callback system |
-| Persistence | None | Checkpoint interface |
-| Templates | f-strings | Jinja2 YAML files |
-| State | Plain dicts | TypedDict with reducers |
-
----
-
-## When to Use Each
-
-**Use Lean (CLAUDE.md) when:**
-- Getting started / prototyping
-- Simple, predictable data
-- You want minimal dependencies
-- You'll monitor the run yourself
-
-**Use Advanced (this file) when:**
-- Running on large datasets (1000+ chunks)
-- Need to resume interrupted runs
-- Want detailed metrics/logging
-- Integrating into larger systems
-- Need security validation of generated code
-
----
-
-## References
-
-See `docs/` folder for detailed framework analyses:
-- `smolagents-analysis.md` - Memory steps, error hierarchy, AST validation
-- `langchain-analysis.md` - Tenacity retry, summary buffer, output fixing
-- `langgraph-analysis.md` - TypedDict state, reducers, checkpoints
-- `other-frameworks-analysis.md` - Guardrails, validation patterns