recursive_cleaner-0.6.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,117 @@
+ """Schema inference for data files."""
+
+ import csv
+ import json
+ from io import StringIO
+ from pathlib import Path
+
+
+ def infer_schema(file_path: str, sample_size: int = 10) -> dict:
+     """
+     Infer data schema from first N records.
+
+     Args:
+         file_path: Path to data file
+         sample_size: Number of records to sample
+
+     Returns:
+         {"fields": [...], "types": {...}, "samples": {...}, "nullable": {...}}
+     """
+     path = Path(file_path)
+     suffix = path.suffix.lower()
+
+     if not path.exists():
+         return {"fields": [], "types": {}, "samples": {}, "nullable": {}}
+
+     content = path.read_text(encoding="utf-8")
+     if not content.strip():
+         return {"fields": [], "types": {}, "samples": {}, "nullable": {}}
+
+     if suffix == ".jsonl":
+         return _infer_jsonl(content, sample_size)
+     elif suffix == ".csv":
+         return _infer_csv(content, sample_size)
+     elif suffix == ".json":
+         return _infer_json(content, sample_size)
+     else:
+         return {"fields": [], "types": {}, "samples": {}, "nullable": {}}
+
+
+ def _infer_jsonl(content: str, sample_size: int) -> dict:
+     """Infer schema from JSONL content."""
+     lines = [line.strip() for line in content.strip().split("\n") if line.strip()]
+     records = [json.loads(line) for line in lines[:sample_size]]
+     return _infer_from_records(records)
+
+
+ def _infer_csv(content: str, sample_size: int) -> dict:
+     """Infer schema from CSV content."""
+     reader = csv.DictReader(StringIO(content))
+     records = [row for _, row in zip(range(sample_size), reader)]
+     return _infer_from_records(records)
+
+
+ def _infer_json(content: str, sample_size: int) -> dict:
+     """Infer schema from JSON content."""
+     data = json.loads(content)
+     if isinstance(data, list):
+         records = data[:sample_size]
+     elif isinstance(data, dict):
+         records = [data]
+     else:
+         return {"fields": [], "types": {}, "samples": {}, "nullable": {}}
+     return _infer_from_records(records)
+
+
+ def _infer_from_records(records: list[dict]) -> dict:
+     """Build schema from list of record dicts."""
+     if not records:
+         return {"fields": [], "types": {}, "samples": {}, "nullable": {}}
+
+     fields = list(dict.fromkeys(k for r in records for k in r.keys()))
+     types = {}
+     samples = {}
+     nullable = {}
+
+     for field in fields:
+         values = [r.get(field) for r in records if field in r]
+         nullable[field] = any(v is None for v in values)
+         non_null = [v for v in values if v is not None]
+         samples[field] = non_null[:3]
+         types[field] = _infer_type(non_null)
+
+     return {"fields": fields, "types": types, "samples": samples, "nullable": nullable}
+
+
+ def _infer_type(values: list) -> str:
+     """Infer type from list of non-null values."""
+     if not values:
+         return "unknown"
+     type_map = {str: "str", int: "int", float: "float", bool: "bool", list: "list", dict: "dict"}
+     seen = set()
+     for v in values:
+         for py_type, name in type_map.items():
+             if type(v) is py_type:
+                 seen.add(name)
+                 break
+         else:
+             seen.add("unknown")
+     if len(seen) == 1:
+         return seen.pop()
+     return "mixed"
+
+
+ def format_schema_for_prompt(schema: dict) -> str:
+     """Format schema dict as human-readable string for prompt injection."""
+     if not schema.get("fields"):
+         return ""
+     lines = ["Fields detected:"]
+     for field in schema["fields"]:
+         ftype = schema["types"].get(field, "unknown")
+         is_nullable = schema["nullable"].get(field, False)
+         samples = schema["samples"].get(field, [])
+         type_str = f"{ftype}, nullable" if is_nullable else ftype
+         sample_strs = [repr(s) if isinstance(s, str) else str(s) for s in samples]
+         sample_part = ", ".join(sample_strs) if sample_strs else "no samples"
+         lines.append(f"- {field} ({type_str}): {sample_part}")
+     return "\n".join(lines)
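A minimal usage sketch for the schema helpers above. The import path, file name, and field values are assumed for illustration; none of them appear in this diff.

from recursive_cleaner.schema import infer_schema, format_schema_for_prompt  # path assumed

schema = infer_schema("users.jsonl", sample_size=10)
# e.g. {"fields": ["id", "email"], "types": {"id": "int", "email": "str"},
#       "samples": {"id": [1, 2, 3], "email": ["a@example.com"]},
#       "nullable": {"id": False, "email": True}}

print(format_schema_for_prompt(schema))
# Fields detected:
# - id (int): 1, 2, 3
# - email (str, nullable): 'a@example.com'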
@@ -0,0 +1,11 @@
+ """Type definitions for the recursive cleaner pipeline."""
+
+ from typing import Protocol
+
+
+ class LLMBackend(Protocol):
+     """Protocol for LLM backend implementations."""
+
+     def generate(self, prompt: str) -> str:
+         """Generate a response from the LLM given a prompt."""
+         ...
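Because LLMBackend is a typing.Protocol, any object exposing a matching generate(self, prompt: str) -> str method satisfies it structurally; no inheritance is required. A hypothetical stub backend, for illustration only:

class EchoBackend:
    def generate(self, prompt: str) -> str:
        return f"echo: {prompt}"

backend: LLMBackend = EchoBackend()  # accepted by type checkers: the signature matches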
@@ -0,0 +1,202 @@
+ """Runtime validation for generated cleaning functions."""
+
+ import ast
+ import json
+ import re
+ from typing import Literal
+
+ # Modules that could access filesystem, run commands, or exfiltrate data
+ DANGEROUS_IMPORTS = frozenset({
+     "os",
+     "subprocess",
+     "sys",
+     "shutil",
+     "pathlib",
+     "socket",
+     "urllib",
+     "requests",
+     "httplib",
+     "ftplib",
+     "smtplib",
+     "pickle",
+ })
+
+ # Built-in functions that allow arbitrary code execution
+ DANGEROUS_CALLS = frozenset({
+     "eval",
+     "exec",
+     "compile",
+     "__import__",
+     "open",  # Data cleaning functions receive data as args, shouldn't need file I/O
+ })
+
+
+ def check_code_safety(code: str) -> tuple[bool, str | None]:
+     """
+     Check if generated code contains dangerous patterns.
+
+     Catches common LLM mistakes like importing os or using eval().
+     Not a security sandbox - won't catch obfuscated/adversarial code.
+
+     Args:
+         code: Python source code to check
+
+     Returns:
+         (True, None) if code appears safe
+         (False, error_message) if dangerous pattern detected
+     """
+     try:
+         tree = ast.parse(code)
+     except SyntaxError as e:
+         return False, f"Syntax error: {e}"
+
+     for node in ast.walk(tree):
+         # Check: import os, import subprocess, etc.
+         if isinstance(node, ast.Import):
+             for alias in node.names:
+                 module = alias.name.split(".")[0]
+                 if module in DANGEROUS_IMPORTS:
+                     return False, f"Dangerous import: {alias.name}"
+
+         # Check: from os import path, from subprocess import run, etc.
+         if isinstance(node, ast.ImportFrom):
+             if node.module:
+                 module = node.module.split(".")[0]
+                 if module in DANGEROUS_IMPORTS:
+                     return False, f"Dangerous import: from {node.module}"
+
+         # Check: eval(...), exec(...), open(...), etc.
+         if isinstance(node, ast.Call):
+             if isinstance(node.func, ast.Name):
+                 if node.func.id in DANGEROUS_CALLS:
+                     return False, f"Dangerous function call: {node.func.id}()"
+
+     return True, None
+
+
+ def split_holdout(
+     chunk: str, holdout_ratio: float, mode: Literal["structured", "text"] = "structured"
+ ) -> tuple[str, str]:
+     """
+     Split chunk into generation and holdout portions.
+
+     Args:
+         chunk: Raw chunk string (JSONL for structured, plain text for text mode)
+         holdout_ratio: Fraction to hold out (0.0-0.5)
+         mode: "structured" splits by JSONL records, "text" splits at sentence boundary
+
+     Returns:
+         (generation_data, holdout_data) - both as strings
+     """
+     if holdout_ratio <= 0:
+         return chunk, ""
+
+     if mode == "structured":
+         lines = [ln for ln in chunk.strip().split("\n") if ln.strip()]
+         if not lines:
+             return chunk, ""
+         holdout_count = max(1, int(len(lines) * holdout_ratio))
+         split_idx = len(lines) - holdout_count
+         return "\n".join(lines[:split_idx]), "\n".join(lines[split_idx:])
+     else:
+         # Text mode: split at sentence boundary
+         sentences = re.split(r"(?<=[.!?])\s+", chunk.strip())
+         if len(sentences) <= 1:
+             return chunk, ""
+         holdout_count = max(1, int(len(sentences) * holdout_ratio))
+         split_idx = len(sentences) - holdout_count
+         return " ".join(sentences[:split_idx]), " ".join(sentences[split_idx:])
+
+
+ def validate_function(
+     code: str,
+     sample_data: list[dict] | str,
+     function_name: str,
+     mode: Literal["structured", "text"] = "structured",
+ ) -> tuple[bool, str | None]:
+     """
+     Execute generated function on sample data to catch runtime errors.
+
+     Args:
+         code: The Python source code of the function
+         sample_data: List of data records (structured) or text string (text mode)
+         function_name: Name of the function to call
+         mode: "structured" for dict records, "text" for string input
+
+     Returns:
+         (True, None) if function executes without error
+         (False, error_message) if function raises an exception
+     """
+     # Handle empty data
+     if mode == "text":
+         if not sample_data or (isinstance(sample_data, str) and not sample_data.strip()):
+             return True, None
+     else:
+         if not sample_data:
+             return True, None
+
+     # Create isolated namespace and execute the code
+     namespace: dict = {}
+     try:
+         exec(code, namespace)
+     except Exception as e:
+         return False, f"Code compilation failed: {type(e).__name__}: {e}"
+
+     # Get the function from namespace
+     func = namespace.get(function_name)
+     if func is None:
+         return False, f"Function '{function_name}' not found in code"
+
+     if mode == "text":
+         # Text mode: sample_data is a string
+         try:
+             result = func(sample_data)
+             # Verify function returns a string
+             if not isinstance(result, str):
+                 return False, f"Function must return str, got {type(result).__name__}"
+         except Exception as e:
+             return False, f"Runtime error on text input: {type(e).__name__}: {e}"
+     else:
+         # Structured mode: sample_data is list[dict]
+         for i, record in enumerate(sample_data):
+             try:
+                 func(record)
+             except Exception as e:
+                 return False, f"Runtime error on sample {i}: {type(e).__name__}: {e}"
+
+     return True, None
+
+
+ def extract_sample_data(
+     chunk: str, max_samples: int = 3, mode: Literal["structured", "text"] = "structured"
+ ) -> list[dict] | str:
+     """
+     Extract sample data from a chunk string.
+
+     Args:
+         chunk: Raw chunk string
+         max_samples: Maximum number of samples to extract (structured mode only)
+         mode: "structured" for JSONL parsing, "text" for raw string
+
+     Returns:
+         List of parsed JSON objects (structured) or the chunk string (text)
+     """
+     if mode == "text":
+         # Text mode: return the chunk as-is for validation
+         return chunk
+
+     # Structured mode: parse JSONL
+     samples = []
+     for line in chunk.strip().split("\n"):
+         line = line.strip()
+         if not line:
+             continue
+         try:
+             obj = json.loads(line)
+             if isinstance(obj, dict):
+                 samples.append(obj)
+                 if len(samples) >= max_samples:
+                     break
+         except json.JSONDecodeError:
+             continue
+     return samples
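A hypothetical sketch of how these helpers might be chained; the generated function and the sample chunk below are invented for illustration and are not part of the package.

# Stand-in for code returned by the LLM backend.
generated_code = '''
def clean_record(record: dict) -> dict:
    record["email"] = record.get("email", "").strip().lower()
    return record
'''

ok, err = check_code_safety(generated_code)  # AST scan: no dangerous imports or calls here
if ok:
    samples = extract_sample_data('{"email": " A@X.COM "}')  # -> [{"email": " A@X.COM "}]
    ok, err = validate_function(generated_code, samples, "clean_record")
print(ok, err)  # (True, None) once the code parses, the function is found, and it runs on every sample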
@@ -0,0 +1,4 @@
+ """Vendored third-party code."""
+ from .chunker import Chunk, SentenceChunker
+
+ __all__ = ["Chunk", "SentenceChunker"]
@@ -0,0 +1,187 @@
+ """Minimal sentence-aware text chunker extracted from chonkie.
+
+ Sentence chunking algorithm adapted from Chonkie
+ https://github.com/chonkie-inc/chonkie
+ Copyright (c) 2025 Chonkie
+ Licensed under the MIT License
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ """
+
+ from bisect import bisect_left
+ from dataclasses import dataclass
+ from itertools import accumulate
+ from typing import Literal, Optional, Union
+
+
+ @dataclass
+ class Chunk:
+     """A chunk of text with position metadata."""
+
+     text: str
+     start_index: int
+     end_index: int
+     token_count: int  # In our case, character count
+
+     def __len__(self) -> int:
+         return len(self.text)
+
+
+ class SentenceChunker:
+     """Split text into chunks based on sentence boundaries.
+
+     Args:
+         chunk_size: Maximum characters per chunk
+         chunk_overlap: Characters to overlap between chunks
+         min_sentences_per_chunk: Minimum sentences per chunk
+         min_characters_per_sentence: Minimum characters for valid sentence
+         delim: Sentence boundary delimiters
+         include_delim: Where to include delimiter ("prev", "next", or None)
+     """
+
+     def __init__(
+         self,
+         chunk_size: int = 4000,
+         chunk_overlap: int = 200,
+         min_sentences_per_chunk: int = 1,
+         min_characters_per_sentence: int = 12,
+         delim: Union[str, list[str]] = [". ", "! ", "? ", "\n"],
+         include_delim: Optional[Literal["prev", "next"]] = "prev",
+     ):
+         if chunk_size <= 0:
+             raise ValueError("chunk_size must be positive")
+         if chunk_overlap >= chunk_size:
+             raise ValueError("chunk_overlap must be less than chunk_size")
+         if min_sentences_per_chunk < 1:
+             raise ValueError("min_sentences_per_chunk must be at least 1")
+
+         self.chunk_size = chunk_size
+         self.chunk_overlap = chunk_overlap
+         self.min_sentences_per_chunk = min_sentences_per_chunk
+         self.min_characters_per_sentence = min_characters_per_sentence
+         self.delim = [delim] if isinstance(delim, str) else delim
+         self.include_delim = include_delim
+         self._sep = "\x00"  # Internal separator (null char)
+
+     def _split_into_sentences(self, text: str) -> list[str]:
+         """Split text into sentences based on delimiters."""
+         t = text
+         for d in self.delim:
+             if self.include_delim == "prev":
+                 t = t.replace(d, d + self._sep)
+             elif self.include_delim == "next":
+                 t = t.replace(d, self._sep + d)
+             else:
+                 t = t.replace(d, self._sep)
+
+         splits = [s for s in t.split(self._sep) if s]
+
+         # Merge short splits with previous sentence
+         sentences = []
+         current = ""
+         for s in splits:
+             if len(s) < self.min_characters_per_sentence:
+                 current += s
+             elif current:
+                 current += s
+                 sentences.append(current)
+                 current = ""
+             else:
+                 sentences.append(s)
+
+             if len(current) >= self.min_characters_per_sentence:
+                 sentences.append(current)
+                 current = ""
+
+         if current:
+             sentences.append(current)
+
+         return sentences
+
+     def chunk(self, text: str) -> list[Chunk]:
+         """Split text into overlapping chunks based on sentences."""
+         if not text.strip():
+             return []
+
+         # Split into sentences with positions and character counts
+         sentence_texts = self._split_into_sentences(text)
+         if not sentence_texts:
+             return []
+
+         # Calculate positions
+         positions = []
+         current_pos = 0
+         for sent in sentence_texts:
+             positions.append(current_pos)
+             current_pos += len(sent)
+
+         # Character counts (our "tokens")
+         char_counts = [len(s) for s in sentence_texts]
+
+         # Cumulative character counts for bisect
+         char_sums = list(accumulate([0] + char_counts))
+
+         chunks = []
+         pos = 0
+
+         while pos < len(sentence_texts):
+             # Find split point using bisect
+             target = char_sums[pos] + self.chunk_size
+             split_idx = bisect_left(char_sums, target) - 1
+             split_idx = max(split_idx, pos + 1)  # At least one sentence
+             split_idx = min(split_idx, len(sentence_texts))
+
+             # Handle minimum sentences requirement
+             if split_idx - pos < self.min_sentences_per_chunk:
+                 if pos + self.min_sentences_per_chunk <= len(sentence_texts):
+                     split_idx = pos + self.min_sentences_per_chunk
+                 else:
+                     split_idx = len(sentence_texts)
+
+             # Create chunk
+             chunk_sentences = sentence_texts[pos:split_idx]
+             chunk_text = "".join(chunk_sentences)
+             chunks.append(
+                 Chunk(
+                     text=chunk_text,
+                     start_index=positions[pos],
+                     end_index=positions[pos] + len(chunk_text),
+                     token_count=len(chunk_text),
+                 )
+             )
+
+             # Calculate next position with overlap
+             if self.chunk_overlap > 0 and split_idx < len(sentence_texts):
+                 overlap_chars = 0
+                 overlap_idx = split_idx - 1
+
+                 while overlap_idx > pos and overlap_chars < self.chunk_overlap:
+                     next_chars = overlap_chars + char_counts[overlap_idx]
+                     if next_chars > self.chunk_overlap:
+                         break
+                     overlap_chars = next_chars
+                     overlap_idx -= 1
+
+                 pos = overlap_idx + 1
+             else:
+                 pos = split_idx
+
+         return chunks
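A brief usage sketch for the vendored chunker. The import path and the sample text are assumed for illustration; only the class itself comes from the diff above.

from recursive_cleaner.vendor import SentenceChunker  # path assumed

long_text = (
    "Sentence one sets the scene. Sentence two follows along! "
    "Does sentence three ask a question? Sentence four wraps up the paragraph.\n"
) * 10

chunker = SentenceChunker(chunk_size=200, chunk_overlap=40)
for c in chunker.chunk(long_text):
    # token_count is a character count in this vendored variant; consecutive
    # chunks overlap by up to chunk_overlap characters.
    print(c.start_index, c.end_index, c.token_count)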