recursive-cleaner 0.6.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- backends/__init__.py +5 -0
- backends/mlx_backend.py +95 -0
- recursive_cleaner/__init__.py +46 -0
- recursive_cleaner/cleaner.py +628 -0
- recursive_cleaner/context.py +27 -0
- recursive_cleaner/dependencies.py +59 -0
- recursive_cleaner/errors.py +17 -0
- recursive_cleaner/metrics.py +163 -0
- recursive_cleaner/optimizer.py +336 -0
- recursive_cleaner/output.py +197 -0
- recursive_cleaner/parsers.py +325 -0
- recursive_cleaner/prompt.py +218 -0
- recursive_cleaner/report.py +138 -0
- recursive_cleaner/response.py +292 -0
- recursive_cleaner/schema.py +117 -0
- recursive_cleaner/types.py +11 -0
- recursive_cleaner/validation.py +202 -0
- recursive_cleaner/vendor/__init__.py +4 -0
- recursive_cleaner/vendor/chunker.py +187 -0
- recursive_cleaner-0.6.0.dist-info/METADATA +282 -0
- recursive_cleaner-0.6.0.dist-info/RECORD +23 -0
- recursive_cleaner-0.6.0.dist-info/WHEEL +4 -0
- recursive_cleaner-0.6.0.dist-info/licenses/LICENSE +21 -0
backends/__init__.py
ADDED
backends/mlx_backend.py
ADDED
@@ -0,0 +1,95 @@
"""MLX-LM backend for Recursive Data Cleaner."""

from mlx_lm import load, generate
from mlx_lm.sample_utils import make_sampler, make_logits_processors


class MLXBackend:
    """
    MLX-LM backend implementation for Apple Silicon.

    Conforms to the LLMBackend protocol.
    """

    def __init__(
        self,
        model_path: str = "lmstudio-community/Qwen3-Next-80B-A3B-Instruct-MLX-4bit",
        max_tokens: int = 4096,
        temperature: float = 0.7,
        top_p: float = 0.9,
        repetition_penalty: float = 1.1,
        verbose: bool = False,
    ):
        """
        Initialize the MLX backend.

        Args:
            model_path: HuggingFace model path or local path
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature (0 = deterministic)
            top_p: Nucleus sampling threshold
            repetition_penalty: Penalty for repeated tokens
            verbose: Whether to print loading info
        """
        self.model_path = model_path
        self.max_tokens = max_tokens
        self.temperature = temperature
        self.top_p = top_p
        self.repetition_penalty = repetition_penalty
        self.verbose = verbose

        self._model = None
        self._tokenizer = None
        self._sampler = None
        self._logits_processors = None

    def _ensure_loaded(self):
        """Lazy load the model on first use."""
        if self._model is None:
            if self.verbose:
                print(f"Loading model: {self.model_path}")
            self._model, self._tokenizer = load(self.model_path)

            # Create sampler and processors
            self._sampler = make_sampler(temp=self.temperature, top_p=self.top_p)
            self._logits_processors = make_logits_processors(
                repetition_penalty=self.repetition_penalty
            )

            if self.verbose:
                print("Model loaded successfully")

    def generate(self, prompt: str) -> str:
        """
        Generate a response from the LLM.

        Args:
            prompt: The input prompt

        Returns:
            The generated text response
        """
        self._ensure_loaded()

        # Apply chat template if available (for instruction-tuned models)
        if hasattr(self._tokenizer, 'apply_chat_template'):
            messages = [{"role": "user", "content": prompt}]
            formatted_prompt = self._tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )
        else:
            formatted_prompt = prompt

        response = generate(
            self._model,
            self._tokenizer,
            prompt=formatted_prompt,
            max_tokens=self.max_tokens,
            sampler=self._sampler,
            logits_processors=self._logits_processors,
            verbose=self.verbose,
        )

        return response
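Taken on its own, the backend is directly usable; construction is cheap because load() only runs on the first generate() call. A minimal usage sketch (the prompt string is illustrative, everything else follows the defaults shown above):

from backends.mlx_backend import MLXBackend

# Instantiation does not load weights; _ensure_loaded() defers that
# to the first generate() call. temperature=0 selects deterministic
# decoding per the docstring.
backend = MLXBackend(temperature=0.0, verbose=True)

reply = backend.generate("Describe one common data-quality issue in CSV files.")
print(reply)

Swapping in a smaller model is just a matter of passing a different model_path; nothing else in the class is model-specific.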
recursive_cleaner/__init__.py
ADDED
@@ -0,0 +1,46 @@
"""Recursive Data Cleaner - LLM-powered incremental data cleaning pipeline."""

from recursive_cleaner.cleaner import DataCleaner
from recursive_cleaner.context import build_context
from recursive_cleaner.dependencies import resolve_dependencies
from recursive_cleaner.errors import (
    CleanerError,
    MaxIterationsError,
    OutputValidationError,
    ParseError,
)
from recursive_cleaner.metrics import QualityMetrics, compare_quality, measure_quality
from recursive_cleaner.optimizer import (
    consolidate_with_agency,
    extract_tags,
    group_by_salience,
)
from recursive_cleaner.output import write_cleaning_file
from recursive_cleaner.parsers import chunk_file
from recursive_cleaner.prompt import build_prompt
from recursive_cleaner.response import extract_python_block, parse_response
from recursive_cleaner.validation import check_code_safety, extract_sample_data, validate_function

__all__ = [
    "CleanerError",
    "ParseError",
    "MaxIterationsError",
    "OutputValidationError",
    "chunk_file",
    "parse_response",
    "extract_python_block",
    "build_context",
    "build_prompt",
    "write_cleaning_file",
    "DataCleaner",
    "validate_function",
    "extract_sample_data",
    "check_code_safety",
    "resolve_dependencies",
    "QualityMetrics",
    "measure_quality",
    "compare_quality",
    "extract_tags",
    "group_by_salience",
    "consolidate_with_agency",
]
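Every name in __all__ is re-exported at the package root, so callers can import the whole public surface from one place:

from recursive_cleaner import DataCleaner, chunk_file, measure_quality

One piece this diff does not show is the LLMBackend protocol that mlx_backend.py claims to conform to; recursive_cleaner/types.py (11 lines) is a plausible home for it, but that is a guess. Judging from MLXBackend's public surface, the protocol would reduce to a single method. A hedged reconstruction, for orientation only:

from typing import Protocol

class LLMBackend(Protocol):
    """Assumed shape of the backend contract: any object exposing
    generate(prompt) -> str can stand in for MLXBackend, e.g. a
    stub backend in tests."""

    def generate(self, prompt: str) -> str: ...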