recursive_cleaner-0.6.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- backends/__init__.py +5 -0
- backends/mlx_backend.py +95 -0
- recursive_cleaner/__init__.py +46 -0
- recursive_cleaner/cleaner.py +628 -0
- recursive_cleaner/context.py +27 -0
- recursive_cleaner/dependencies.py +59 -0
- recursive_cleaner/errors.py +17 -0
- recursive_cleaner/metrics.py +163 -0
- recursive_cleaner/optimizer.py +336 -0
- recursive_cleaner/output.py +197 -0
- recursive_cleaner/parsers.py +325 -0
- recursive_cleaner/prompt.py +218 -0
- recursive_cleaner/report.py +138 -0
- recursive_cleaner/response.py +292 -0
- recursive_cleaner/schema.py +117 -0
- recursive_cleaner/types.py +11 -0
- recursive_cleaner/validation.py +202 -0
- recursive_cleaner/vendor/__init__.py +4 -0
- recursive_cleaner/vendor/chunker.py +187 -0
- recursive_cleaner-0.6.0.dist-info/METADATA +282 -0
- recursive_cleaner-0.6.0.dist-info/RECORD +23 -0
- recursive_cleaner-0.6.0.dist-info/WHEEL +4 -0
- recursive_cleaner-0.6.0.dist-info/licenses/LICENSE +21 -0
recursive_cleaner/schema.py
@@ -0,0 +1,117 @@
+"""Schema inference for data files."""
+
+import csv
+import json
+from io import StringIO
+from pathlib import Path
+
+
+def infer_schema(file_path: str, sample_size: int = 10) -> dict:
+    """
+    Infer data schema from first N records.
+
+    Args:
+        file_path: Path to data file
+        sample_size: Number of records to sample
+
+    Returns:
+        {"fields": [...], "types": {...}, "samples": {...}, "nullable": {...}}
+    """
+    path = Path(file_path)
+    suffix = path.suffix.lower()
+
+    if not path.exists():
+        return {"fields": [], "types": {}, "samples": {}, "nullable": {}}
+
+    content = path.read_text(encoding="utf-8")
+    if not content.strip():
+        return {"fields": [], "types": {}, "samples": {}, "nullable": {}}
+
+    if suffix == ".jsonl":
+        return _infer_jsonl(content, sample_size)
+    elif suffix == ".csv":
+        return _infer_csv(content, sample_size)
+    elif suffix == ".json":
+        return _infer_json(content, sample_size)
+    else:
+        return {"fields": [], "types": {}, "samples": {}, "nullable": {}}
+
+
+def _infer_jsonl(content: str, sample_size: int) -> dict:
+    """Infer schema from JSONL content."""
+    lines = [line.strip() for line in content.strip().split("\n") if line.strip()]
+    records = [json.loads(line) for line in lines[:sample_size]]
+    return _infer_from_records(records)
+
+
+def _infer_csv(content: str, sample_size: int) -> dict:
+    """Infer schema from CSV content."""
+    reader = csv.DictReader(StringIO(content))
+    records = [row for _, row in zip(range(sample_size), reader)]
+    return _infer_from_records(records)
+
+
+def _infer_json(content: str, sample_size: int) -> dict:
+    """Infer schema from JSON content."""
+    data = json.loads(content)
+    if isinstance(data, list):
+        records = data[:sample_size]
+    elif isinstance(data, dict):
+        records = [data]
+    else:
+        return {"fields": [], "types": {}, "samples": {}, "nullable": {}}
+    return _infer_from_records(records)
+
+
+def _infer_from_records(records: list[dict]) -> dict:
+    """Build schema from list of record dicts."""
+    if not records:
+        return {"fields": [], "types": {}, "samples": {}, "nullable": {}}
+
+    fields = list(dict.fromkeys(k for r in records for k in r.keys()))
+    types = {}
+    samples = {}
+    nullable = {}
+
+    for field in fields:
+        values = [r.get(field) for r in records if field in r]
+        nullable[field] = any(v is None for v in values)
+        non_null = [v for v in values if v is not None]
+        samples[field] = non_null[:3]
+        types[field] = _infer_type(non_null)
+
+    return {"fields": fields, "types": types, "samples": samples, "nullable": nullable}
+
+
+def _infer_type(values: list) -> str:
+    """Infer type from list of non-null values."""
+    if not values:
+        return "unknown"
+    type_map = {str: "str", int: "int", float: "float", bool: "bool", list: "list", dict: "dict"}
+    seen = set()
+    for v in values:
+        for py_type, name in type_map.items():
+            if type(v) is py_type:
+                seen.add(name)
+                break
+        else:
+            seen.add("unknown")
+    if len(seen) == 1:
+        return seen.pop()
+    return "mixed"
+
+
+def format_schema_for_prompt(schema: dict) -> str:
+    """Format schema dict as human-readable string for prompt injection."""
+    if not schema.get("fields"):
+        return ""
+    lines = ["Fields detected:"]
+    for field in schema["fields"]:
+        ftype = schema["types"].get(field, "unknown")
+        is_nullable = schema["nullable"].get(field, False)
+        samples = schema["samples"].get(field, [])
+        type_str = f"{ftype}, nullable" if is_nullable else ftype
+        sample_strs = [repr(s) if isinstance(s, str) else str(s) for s in samples]
+        sample_part = ", ".join(sample_strs) if sample_strs else "no samples"
+        lines.append(f"- {field} ({type_str}): {sample_part}")
+    return "\n".join(lines)
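Usage sketch for recursive_cleaner/schema.py (illustrative only, not part of the diff; the events.jsonl file and its contents are hypothetical): infer_schema samples the file, and format_schema_for_prompt renders the result for prompt injection.

# Hypothetical example: infer a schema from a small JSONL file and render it for a prompt.
from pathlib import Path
from recursive_cleaner.schema import format_schema_for_prompt, infer_schema

Path("events.jsonl").write_text(
    '{"user": "alice", "age": 31, "note": null}\n'
    '{"user": "bob", "age": 27, "note": "vip"}\n',
    encoding="utf-8",
)

schema = infer_schema("events.jsonl", sample_size=10)
# schema["types"] -> {"user": "str", "age": "int", "note": "str"}
# schema["nullable"]["note"] -> True, because the first record has note = null
print(format_schema_for_prompt(schema))
# Fields detected:
# - user (str): 'alice', 'bob'
# - age (int): 31, 27
# - note (str, nullable): 'vip'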
recursive_cleaner/types.py
@@ -0,0 +1,11 @@
+"""Type definitions for the recursive cleaner pipeline."""
+
+from typing import Protocol
+
+
+class LLMBackend(Protocol):
+    """Protocol for LLM backend implementations."""
+
+    def generate(self, prompt: str) -> str:
+        """Generate a response from the LLM given a prompt."""
+        ...
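Because LLMBackend is a typing.Protocol, any object exposing generate(prompt) -> str satisfies it structurally; no subclassing is required. A sketch with a hypothetical stand-in backend:

# Hypothetical example: EchoBackend is not part of the package, it only illustrates the protocol.
from recursive_cleaner.types import LLMBackend


class EchoBackend:
    """Trivial backend that echoes the prompt back instead of calling a model."""

    def generate(self, prompt: str) -> str:
        return f"echo: {prompt}"


def run(backend: LLMBackend, prompt: str) -> str:
    # Type checkers accept EchoBackend here because Protocol matching is structural.
    return backend.generate(prompt)


print(run(EchoBackend(), "normalize whitespace"))  # -> "echo: normalize whitespace"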
recursive_cleaner/validation.py
@@ -0,0 +1,202 @@
+"""Runtime validation for generated cleaning functions."""
+
+import ast
+import json
+import re
+from typing import Literal
+
+# Modules that could access filesystem, run commands, or exfiltrate data
+DANGEROUS_IMPORTS = frozenset({
+    "os",
+    "subprocess",
+    "sys",
+    "shutil",
+    "pathlib",
+    "socket",
+    "urllib",
+    "requests",
+    "httplib",
+    "ftplib",
+    "smtplib",
+    "pickle",
+})
+
+# Built-in functions that allow arbitrary code execution
+DANGEROUS_CALLS = frozenset({
+    "eval",
+    "exec",
+    "compile",
+    "__import__",
+    "open",  # Data cleaning functions receive data as args, shouldn't need file I/O
+})
+
+
+def check_code_safety(code: str) -> tuple[bool, str | None]:
+    """
+    Check if generated code contains dangerous patterns.
+
+    Catches common LLM mistakes like importing os or using eval().
+    Not a security sandbox - won't catch obfuscated/adversarial code.
+
+    Args:
+        code: Python source code to check
+
+    Returns:
+        (True, None) if code appears safe
+        (False, error_message) if dangerous pattern detected
+    """
+    try:
+        tree = ast.parse(code)
+    except SyntaxError as e:
+        return False, f"Syntax error: {e}"
+
+    for node in ast.walk(tree):
+        # Check: import os, import subprocess, etc.
+        if isinstance(node, ast.Import):
+            for alias in node.names:
+                module = alias.name.split(".")[0]
+                if module in DANGEROUS_IMPORTS:
+                    return False, f"Dangerous import: {alias.name}"
+
+        # Check: from os import path, from subprocess import run, etc.
+        if isinstance(node, ast.ImportFrom):
+            if node.module:
+                module = node.module.split(".")[0]
+                if module in DANGEROUS_IMPORTS:
+                    return False, f"Dangerous import: from {node.module}"
+
+        # Check: eval(...), exec(...), open(...), etc.
+        if isinstance(node, ast.Call):
+            if isinstance(node.func, ast.Name):
+                if node.func.id in DANGEROUS_CALLS:
+                    return False, f"Dangerous function call: {node.func.id}()"
+
+    return True, None
+
+
+def split_holdout(
+    chunk: str, holdout_ratio: float, mode: Literal["structured", "text"] = "structured"
+) -> tuple[str, str]:
+    """
+    Split chunk into generation and holdout portions.
+
+    Args:
+        chunk: Raw chunk string (JSONL for structured, plain text for text mode)
+        holdout_ratio: Fraction to hold out (0.0-0.5)
+        mode: "structured" splits by JSONL records, "text" splits at sentence boundary
+
+    Returns:
+        (generation_data, holdout_data) - both as strings
+    """
+    if holdout_ratio <= 0:
+        return chunk, ""
+
+    if mode == "structured":
+        lines = [ln for ln in chunk.strip().split("\n") if ln.strip()]
+        if not lines:
+            return chunk, ""
+        holdout_count = max(1, int(len(lines) * holdout_ratio))
+        split_idx = len(lines) - holdout_count
+        return "\n".join(lines[:split_idx]), "\n".join(lines[split_idx:])
+    else:
+        # Text mode: split at sentence boundary
+        sentences = re.split(r"(?<=[.!?])\s+", chunk.strip())
+        if len(sentences) <= 1:
+            return chunk, ""
+        holdout_count = max(1, int(len(sentences) * holdout_ratio))
+        split_idx = len(sentences) - holdout_count
+        return " ".join(sentences[:split_idx]), " ".join(sentences[split_idx:])
+
+
+def validate_function(
+    code: str,
+    sample_data: list[dict] | str,
+    function_name: str,
+    mode: Literal["structured", "text"] = "structured",
+) -> tuple[bool, str | None]:
+    """
+    Execute generated function on sample data to catch runtime errors.
+
+    Args:
+        code: The Python source code of the function
+        sample_data: List of data records (structured) or text string (text mode)
+        function_name: Name of the function to call
+        mode: "structured" for dict records, "text" for string input
+
+    Returns:
+        (True, None) if function executes without error
+        (False, error_message) if function raises an exception
+    """
+    # Handle empty data
+    if mode == "text":
+        if not sample_data or (isinstance(sample_data, str) and not sample_data.strip()):
+            return True, None
+    else:
+        if not sample_data:
+            return True, None
+
+    # Create isolated namespace and execute the code
+    namespace: dict = {}
+    try:
+        exec(code, namespace)
+    except Exception as e:
+        return False, f"Code compilation failed: {type(e).__name__}: {e}"
+
+    # Get the function from namespace
+    func = namespace.get(function_name)
+    if func is None:
+        return False, f"Function '{function_name}' not found in code"
+
+    if mode == "text":
+        # Text mode: sample_data is a string
+        try:
+            result = func(sample_data)
+            # Verify function returns a string
+            if not isinstance(result, str):
+                return False, f"Function must return str, got {type(result).__name__}"
+        except Exception as e:
+            return False, f"Runtime error on text input: {type(e).__name__}: {e}"
+    else:
+        # Structured mode: sample_data is list[dict]
+        for i, record in enumerate(sample_data):
+            try:
+                func(record)
+            except Exception as e:
+                return False, f"Runtime error on sample {i}: {type(e).__name__}: {e}"
+
+    return True, None
+
+
+def extract_sample_data(
+    chunk: str, max_samples: int = 3, mode: Literal["structured", "text"] = "structured"
+) -> list[dict] | str:
+    """
+    Extract sample data from a chunk string.
+
+    Args:
+        chunk: Raw chunk string
+        max_samples: Maximum number of samples to extract (structured mode only)
+        mode: "structured" for JSONL parsing, "text" for raw string
+
+    Returns:
+        List of parsed JSON objects (structured) or the chunk string (text)
+    """
+    if mode == "text":
+        # Text mode: return the chunk as-is for validation
+        return chunk

+    # Structured mode: parse JSONL
+    samples = []
+    for line in chunk.strip().split("\n"):
+        line = line.strip()
+        if not line:
+            continue
+        try:
+            obj = json.loads(line)
+            if isinstance(obj, dict):
+                samples.append(obj)
+                if len(samples) >= max_samples:
+                    break
+        except json.JSONDecodeError:
+            continue
+    return samples
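The helpers above compose into a small check pipeline: reserve a holdout slice of the chunk, statically screen the generated code, then execute it against sampled records. A sketch of that flow (the chunk contents and the generated clean_record function are hypothetical):

# Hypothetical example: validate a generated cleaning function against chunk samples.
from recursive_cleaner.validation import (
    check_code_safety,
    extract_sample_data,
    split_holdout,
    validate_function,
)

chunk = '{"name": " Alice "}\n{"name": "BOB"}\n{"name": "  carol"}\n'

# 3 records at holdout_ratio=0.3 -> max(1, int(0.9)) = 1 holdout record.
gen_data, holdout = split_holdout(chunk, holdout_ratio=0.3, mode="structured")

generated_code = '''
def clean_record(record: dict) -> dict:
    record["name"] = record["name"].strip().title()
    return record
'''

ok, err = check_code_safety(generated_code)
assert ok, err  # no dangerous imports or calls detected

samples = extract_sample_data(gen_data, max_samples=3, mode="structured")
ok, err = validate_function(generated_code, samples, "clean_record", mode="structured")
assert ok, err  # the function ran without raising on every sampled record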
recursive_cleaner/vendor/chunker.py
@@ -0,0 +1,187 @@
+"""Minimal sentence-aware text chunker extracted from chonkie.
+
+Sentence chunking algorithm adapted from Chonkie
+https://github.com/chonkie-inc/chonkie
+Copyright (c) 2025 Chonkie
+Licensed under the MIT License
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+
+from bisect import bisect_left
+from dataclasses import dataclass
+from itertools import accumulate
+from typing import Literal, Optional, Union
+
+
+@dataclass
+class Chunk:
+    """A chunk of text with position metadata."""
+
+    text: str
+    start_index: int
+    end_index: int
+    token_count: int  # In our case, character count
+
+    def __len__(self) -> int:
+        return len(self.text)
+
+
+class SentenceChunker:
+    """Split text into chunks based on sentence boundaries.
+
+    Args:
+        chunk_size: Maximum characters per chunk
+        chunk_overlap: Characters to overlap between chunks
+        min_sentences_per_chunk: Minimum sentences per chunk
+        min_characters_per_sentence: Minimum characters for valid sentence
+        delim: Sentence boundary delimiters
+        include_delim: Where to include delimiter ("prev", "next", or None)
+    """
+
+    def __init__(
+        self,
+        chunk_size: int = 4000,
+        chunk_overlap: int = 200,
+        min_sentences_per_chunk: int = 1,
+        min_characters_per_sentence: int = 12,
+        delim: Union[str, list[str]] = [". ", "! ", "? ", "\n"],
+        include_delim: Optional[Literal["prev", "next"]] = "prev",
+    ):
+        if chunk_size <= 0:
+            raise ValueError("chunk_size must be positive")
+        if chunk_overlap >= chunk_size:
+            raise ValueError("chunk_overlap must be less than chunk_size")
+        if min_sentences_per_chunk < 1:
+            raise ValueError("min_sentences_per_chunk must be at least 1")
+
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.min_sentences_per_chunk = min_sentences_per_chunk
+        self.min_characters_per_sentence = min_characters_per_sentence
+        self.delim = [delim] if isinstance(delim, str) else delim
+        self.include_delim = include_delim
+        self._sep = "\x00"  # Internal separator (null char)
+
+    def _split_into_sentences(self, text: str) -> list[str]:
+        """Split text into sentences based on delimiters."""
+        t = text
+        for d in self.delim:
+            if self.include_delim == "prev":
+                t = t.replace(d, d + self._sep)
+            elif self.include_delim == "next":
+                t = t.replace(d, self._sep + d)
+            else:
+                t = t.replace(d, self._sep)
+
+        splits = [s for s in t.split(self._sep) if s]
+
+        # Merge short splits with previous sentence
+        sentences = []
+        current = ""
+        for s in splits:
+            if len(s) < self.min_characters_per_sentence:
+                current += s
+            elif current:
+                current += s
+                sentences.append(current)
+                current = ""
+            else:
+                sentences.append(s)
+
+            if len(current) >= self.min_characters_per_sentence:
+                sentences.append(current)
+                current = ""
+
+        if current:
+            sentences.append(current)
+
+        return sentences
+
+    def chunk(self, text: str) -> list[Chunk]:
+        """Split text into overlapping chunks based on sentences."""
+        if not text.strip():
+            return []
+
+        # Split into sentences with positions and character counts
+        sentence_texts = self._split_into_sentences(text)
+        if not sentence_texts:
+            return []
+
+        # Calculate positions
+        positions = []
+        current_pos = 0
+        for sent in sentence_texts:
+            positions.append(current_pos)
+            current_pos += len(sent)
+
+        # Character counts (our "tokens")
+        char_counts = [len(s) for s in sentence_texts]
+
+        # Cumulative character counts for bisect
+        char_sums = list(accumulate([0] + char_counts))
+
+        chunks = []
+        pos = 0
+
+        while pos < len(sentence_texts):
+            # Find split point using bisect
+            target = char_sums[pos] + self.chunk_size
+            split_idx = bisect_left(char_sums, target) - 1
+            split_idx = max(split_idx, pos + 1)  # At least one sentence
+            split_idx = min(split_idx, len(sentence_texts))
+
+            # Handle minimum sentences requirement
+            if split_idx - pos < self.min_sentences_per_chunk:
+                if pos + self.min_sentences_per_chunk <= len(sentence_texts):
+                    split_idx = pos + self.min_sentences_per_chunk
+                else:
+                    split_idx = len(sentence_texts)
+
+            # Create chunk
+            chunk_sentences = sentence_texts[pos:split_idx]
+            chunk_text = "".join(chunk_sentences)
+            chunks.append(
+                Chunk(
+                    text=chunk_text,
+                    start_index=positions[pos],
+                    end_index=positions[pos] + len(chunk_text),
+                    token_count=len(chunk_text),
+                )
+            )
+
+            # Calculate next position with overlap
+            if self.chunk_overlap > 0 and split_idx < len(sentence_texts):
+                overlap_chars = 0
+                overlap_idx = split_idx - 1
+
+                while overlap_idx > pos and overlap_chars < self.chunk_overlap:
+                    next_chars = overlap_chars + char_counts[overlap_idx]
+                    if next_chars > self.chunk_overlap:
+                        break
+                    overlap_chars = next_chars
+                    overlap_idx -= 1
+
+                pos = overlap_idx + 1
+            else:
+                pos = split_idx
+
+        return chunks
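Note that token_count is really a character count, so chunk_size and chunk_overlap are character budgets and overlap is applied at whole-sentence granularity. A short usage sketch (the sample text is made up):

# Hypothetical example: chunk a paragraph with the vendored SentenceChunker.
from recursive_cleaner.vendor.chunker import SentenceChunker

text = (
    "Data cleaning starts with inspection. Each record is sampled and typed. "
    "Generated functions are then validated before use. Holdout records measure drift. "
    "Finally, the cleaned output is written back to disk."
)

chunker = SentenceChunker(chunk_size=120, chunk_overlap=60)
for c in chunker.chunk(text):
    print(c.start_index, c.end_index, repr(c.text))
# The next chunk backs up over whole trailing sentences while they still fit
# within the chunk_overlap character budget, so consecutive chunks share text.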