autochunks 0.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autochunk/__init__.py +9 -0
- autochunk/__main__.py +5 -0
- autochunk/adapters/__init__.py +3 -0
- autochunk/adapters/haystack.py +68 -0
- autochunk/adapters/langchain.py +81 -0
- autochunk/adapters/llamaindex.py +94 -0
- autochunk/autochunker.py +606 -0
- autochunk/chunkers/__init__.py +100 -0
- autochunk/chunkers/agentic.py +184 -0
- autochunk/chunkers/base.py +16 -0
- autochunk/chunkers/contextual_retrieval.py +151 -0
- autochunk/chunkers/fixed_length.py +110 -0
- autochunk/chunkers/html_section.py +225 -0
- autochunk/chunkers/hybrid_semantic_stat.py +199 -0
- autochunk/chunkers/layout_aware.py +192 -0
- autochunk/chunkers/parent_child.py +172 -0
- autochunk/chunkers/proposition.py +175 -0
- autochunk/chunkers/python_ast.py +248 -0
- autochunk/chunkers/recursive_character.py +215 -0
- autochunk/chunkers/semantic_local.py +140 -0
- autochunk/chunkers/sentence_aware.py +102 -0
- autochunk/cli.py +135 -0
- autochunk/config.py +76 -0
- autochunk/embedding/__init__.py +22 -0
- autochunk/embedding/adapter.py +14 -0
- autochunk/embedding/base.py +33 -0
- autochunk/embedding/hashing.py +42 -0
- autochunk/embedding/local.py +154 -0
- autochunk/embedding/ollama.py +66 -0
- autochunk/embedding/openai.py +62 -0
- autochunk/embedding/tokenizer.py +9 -0
- autochunk/enrichment/__init__.py +0 -0
- autochunk/enrichment/contextual.py +29 -0
- autochunk/eval/__init__.py +0 -0
- autochunk/eval/harness.py +177 -0
- autochunk/eval/metrics.py +27 -0
- autochunk/eval/ragas_eval.py +234 -0
- autochunk/eval/synthetic.py +104 -0
- autochunk/quality/__init__.py +31 -0
- autochunk/quality/deduplicator.py +326 -0
- autochunk/quality/overlap_optimizer.py +402 -0
- autochunk/quality/post_processor.py +245 -0
- autochunk/quality/scorer.py +459 -0
- autochunk/retrieval/__init__.py +0 -0
- autochunk/retrieval/in_memory.py +47 -0
- autochunk/retrieval/parent_child.py +4 -0
- autochunk/storage/__init__.py +0 -0
- autochunk/storage/cache.py +34 -0
- autochunk/storage/plan.py +40 -0
- autochunk/utils/__init__.py +0 -0
- autochunk/utils/hashing.py +8 -0
- autochunk/utils/io.py +176 -0
- autochunk/utils/logger.py +64 -0
- autochunk/utils/telemetry.py +44 -0
- autochunk/utils/text.py +199 -0
- autochunks-0.0.8.dist-info/METADATA +133 -0
- autochunks-0.0.8.dist-info/RECORD +61 -0
- autochunks-0.0.8.dist-info/WHEEL +5 -0
- autochunks-0.0.8.dist-info/entry_points.txt +2 -0
- autochunks-0.0.8.dist-info/licenses/LICENSE +15 -0
- autochunks-0.0.8.dist-info/top_level.txt +1 -0
autochunk/chunkers/recursive_character.py
ADDED
@@ -0,0 +1,215 @@

from __future__ import annotations
from typing import List, Callable, Optional, Pattern
import re
from .base import BaseChunker, Chunk
from ..utils.text import count_tokens, get_tokens, decode_tokens

class RecursiveCharacterChunker(BaseChunker):
    """
    Recursive Character Chunker with Tiered Separators.

    BEST-OF-BREED FEATURES:
    1. Regex Separator Support: Use regex patterns via `is_separator_regex=True`.
    2. Keep Separator Mode: Preserves delimiters at chunk boundaries.
    3. Start Index Tracking: Records character offset for citation purposes.
    4. Adaptive Fallback: Falls back to token-split when separators are exhausted.
    5. Code Block Awareness: Avoids splitting inside fenced code blocks.
    """
    name = "recursive_character"

    # Default separator hierarchy (most significant first)
    DEFAULT_SEPARATORS = ["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " ", ""]

    def __init__(self,
                 separators: List[str] = None,
                 is_separator_regex: bool = False,
                 keep_separator: bool = True,
                 tokenizer: str = "auto"):
        """
        Initialize the chunker.

        Args:
            separators: List of separators in priority order
            is_separator_regex: If True, treat separators as regex patterns
            keep_separator: If True, include separator in the chunk that precedes it
            tokenizer: Backend for tokenization
        """
        self.separators = separators or self.DEFAULT_SEPARATORS
        self.is_separator_regex = is_separator_regex
        self.keep_separator = keep_separator
        self.tokenizer = tokenizer

    def chunk(self,
              doc_id: str,
              text: str,
              base_token_size: int = 512,
              overlap: int = 64,
              add_start_index: bool = False,
              respect_code_blocks: bool = True,
              **params) -> List[Chunk]:
        """
        Recursively split text using separator hierarchy.

        Args:
            doc_id: Document identifier
            text: Input text
            base_token_size: Target chunk size in tokens
            overlap: Number of tokens to overlap between chunks
            add_start_index: If True, record character start position
            respect_code_blocks: If True, avoid splitting inside ``` blocks

        Returns:
            List of Chunk objects
        """
        separators = params.get("separators", self.separators)

        # Extract code blocks if needed
        code_block_ranges = []
        if respect_code_blocks:
            from ..utils.text import extract_code_blocks
            code_blocks = extract_code_blocks(text)
            code_block_ranges = [(b["start"], b["end"]) for b in code_blocks]

        def _is_in_code_block(pos: int) -> bool:
            for start, end in code_block_ranges:
                if start <= pos < end:
                    return True
            return False

        def _split_with_separator(input_text: str, separator: str, is_regex: bool) -> List[str]:
            """Split text while optionally keeping the separator."""
            if is_regex:
                pattern = separator
            else:
                pattern = re.escape(separator)

            if self.keep_separator:
                # Use capturing group to keep separator
                raw_splits = re.split(f"({pattern})", input_text)
                # Merge separator with preceding text
                splits = []
                for i in range(0, len(raw_splits) - 1, 2):
                    combined = raw_splits[i] + raw_splits[i + 1]
                    if combined:
                        splits.append(combined)
                if len(raw_splits) % 2 == 1 and raw_splits[-1]:
                    splits.append(raw_splits[-1])
                return splits
            else:
                return [s for s in re.split(pattern, input_text) if s]

        def _recursive_split(input_text: str, seps: List[str], char_offset: int = 0) -> List[tuple]:
            """
            Recursively split text.
            Returns list of (text, start_char_index) tuples.
            """
            token_count = count_tokens(input_text, tokenizer=self.tokenizer)

            # Base case: fits in one chunk
            if token_count <= base_token_size:
                return [(input_text, char_offset)]

            # No more separators: fallback to token splitting
            if not seps:
                all_tokens = get_tokens(input_text, tokenizer=self.tokenizer)
                results = []
                step = max(1, base_token_size - overlap)
                pos = 0
                curr_char = char_offset

                while pos < len(all_tokens):
                    window = all_tokens[pos : pos + base_token_size]
                    chunk_text = decode_tokens(window)
                    results.append((chunk_text, curr_char))

                    stepped = all_tokens[pos : pos + step]
                    curr_char += len(decode_tokens(stepped))
                    pos += step

                    if pos >= len(all_tokens):
                        break

                return results

            # Try current separator
            curr_sep = seps[0]
            remaining_seps = seps[1:]

            splits = _split_with_separator(input_text, curr_sep, self.is_separator_regex)

            # If separator didn't help, try next
            if len(splits) <= 1:
                return _recursive_split(input_text, remaining_seps, char_offset)

            # Merge splits into chunks
            final_chunks = []
            buffer = []
            buffer_tokens = 0
            buffer_start = char_offset
            current_char = char_offset

            for split_text in splits:
                split_tokens = count_tokens(split_text, tokenizer=self.tokenizer)

                # If a single split is too big, recurse deeper
                if split_tokens > base_token_size:
                    # Flush buffer first
                    if buffer:
                        final_chunks.append(("".join(buffer), buffer_start))
                        buffer = []
                        buffer_tokens = 0

                    # Recurse on the large split
                    sub_chunks = _recursive_split(split_text, remaining_seps, current_char)
                    final_chunks.extend(sub_chunks)
                    current_char += len(split_text)
                    buffer_start = current_char
                    continue

                # Would this overflow the buffer?
                if buffer_tokens + split_tokens > base_token_size and buffer:
                    final_chunks.append(("".join(buffer), buffer_start))

                    # Handle overlap: keep last N tokens worth of text
                    overlap_buffer = []
                    overlap_tokens = 0
                    for prev in reversed(buffer):
                        prev_tokens = count_tokens(prev, tokenizer=self.tokenizer)
                        if overlap_tokens + prev_tokens <= overlap:
                            overlap_buffer.insert(0, prev)
                            overlap_tokens += prev_tokens
                        else:
                            break

                    buffer = overlap_buffer
                    buffer_tokens = overlap_tokens
                    # Adjust start position for overlap
                    buffer_start = current_char - len("".join(overlap_buffer))

                buffer.append(split_text)
                buffer_tokens += split_tokens
                current_char += len(split_text)

            if buffer:
                final_chunks.append(("".join(buffer), buffer_start))

            return final_chunks

        # Execute recursive pipeline
        raw_chunks = _recursive_split(text, separators, 0)

        # Wrap in Chunk objects
        return [
            Chunk(
                id=f"{doc_id}#rc#{i}",
                doc_id=doc_id,
                text=chunk_text.strip(),
                meta={
                    "chunk_index": i,
                    "strategy": "recursive_character",
                    "token_count": count_tokens(chunk_text, tokenizer=self.tokenizer),
                    **({"start_index": start_idx} if add_start_index else {})
                }
            ) for i, (chunk_text, start_idx) in enumerate(raw_chunks) if chunk_text.strip()
        ]
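A minimal usage sketch of the chunker above (not part of the package diff; it assumes the wheel is installed as `autochunk` and that `Chunk` exposes the `id` and `meta` fields used in its constructor):

from autochunk.chunkers.recursive_character import RecursiveCharacterChunker

# Hypothetical document; base_token_size and overlap mirror the defaults shown above.
doc = ("Intro paragraph.\n\nSection one discusses separators. "
       "Section two discusses overlap.\n\n") * 50

chunker = RecursiveCharacterChunker(keep_separator=True)
chunks = chunker.chunk(
    doc_id="doc-001",
    text=doc,
    base_token_size=256,
    overlap=32,
    add_start_index=True,   # records "start_index" in chunk.meta for citations
)

for c in chunks[:3]:
    # Each Chunk carries its strategy, token count, and (optionally) start offset.
    print(c.id, c.meta.get("token_count"), c.meta.get("start_index"))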
autochunk/chunkers/semantic_local.py
ADDED
@@ -0,0 +1,140 @@

from __future__ import annotations
import numpy as np
from typing import List, Any, Callable
from .base import BaseChunker, Chunk
from ..utils.text import split_sentences, count_tokens

class SemanticLocalChunker(BaseChunker):
    """
    Semantic Chunker using Window-based Gradient Similarity.
    Detects topic shifts by comparing sliding windows of sentence embeddings.

    BEST-OF-BREED IMPROVEMENTS:
    1. Windowed Similarity: Instead of comparing adjacent sentences, it compares the
       "semantic momentum" of the previous and following windows. This suppresses noise
       from short/outlier sentences.
    2. Dynamic Percentile: Uses robust percentile-based peak detection for boundaries.
    3. Multi-Factor Safety: Combines semantic drift with strict token caps to ensure LLM
       context-window compliance.
    """
    name = "semantic_local"

    def chunk(self,
              doc_id: str,
              text: str,
              embedding_fn: Callable[[List[str]], List[List[float]]] = None,
              threshold_percentile: float = 0.9,
              window_size: int = 3,
              **params) -> List[Chunk]:

        sentences = split_sentences(text)
        if len(sentences) <= 1:
            return [Chunk(id=f"{doc_id}#sl#0", doc_id=doc_id, text=text, meta={"chunk_index": 0})]

        if embedding_fn is None:
            # Fall back to SentenceAware if no embeddings are available
            from .sentence_aware import SentenceAwareChunker
            return SentenceAwareChunker().chunk(doc_id, text, **params)

        # 1. Vectorize all sentences
        embeddings = np.array(embedding_fn(sentences))
        n = len(embeddings)

        # 2. Calculate Windowed Distances (Vectorized Gradient)
        # We use a moving-average window to calculate semantic momentum.
        def moving_average(a, n=3):
            ret = np.cumsum(a, axis=0)
            ret[n:] = ret[n:] - ret[:-n]
            return ret[n - 1:] / n

        # Pre-calculate windowed means (shifted to align with the gaps between sentences),
        # i.e. compare the windows [i-w+1:i+1] and [i+1:i+w+1].

        # This could be done efficiently by calculating all possible window means once
        # and then slicing them.

        distances = []
        # For simplicity and robustness with edge cases (min/max window), we keep the loop
        # but optimize the inside with pre-calculated norms; a fully vectorized path would
        # use np.convolve instead.

        norms = np.linalg.norm(embeddings, axis=1)
        # Avoid division by zero
        norms[norms == 0] = 1e-9
        norm_embeddings = embeddings / norms[:, np.newaxis]

        for i in range(n - 1):
            start_prev = max(0, i - window_size + 1)
            end_prev = i + 1
            start_next = i + 1
            end_next = min(n, i + 1 + window_size)

            # Use raw embeddings for the mean to preserve the magnitude signal if desired,
            # or normalized embeddings for pure cosine; here we take the raw mean and
            # normalize afterwards.
            vec_prev = np.mean(embeddings[start_prev:end_prev], axis=0)
            vec_next = np.mean(embeddings[start_next:end_next], axis=0)

            norm_p = np.linalg.norm(vec_prev)
            norm_n = np.linalg.norm(vec_next)

            if norm_p < 1e-9 or norm_n < 1e-9:
                distances.append(0.0)
            else:
                sim = np.dot(vec_prev, vec_next) / (norm_p * norm_n)
                distances.append(float(1 - sim))

        # 3. Peak Detection
        if not distances:
            return [Chunk(id=f"{doc_id}#sl#0", doc_id=doc_id, text=text, meta={"chunk_index": 0})]

        breakpoint_threshold = np.percentile(distances, threshold_percentile * 100)

        # 4. Greedy Assembly with Safety Caps
        safety_max_tokens = params.get("base_token_size", 512) * 2  # 2x the target size is usually a good semantic ceiling
        if "safety_max_tokens" in params:
            safety_max_tokens = params["safety_max_tokens"]

        chunks = []
        curr_buffer = []
        curr_tokens = 0

        for i, sentence in enumerate(sentences):
            sent_tokens = count_tokens(sentence)

            # Decide whether to split BEFORE adding this sentence
            is_semantic_split = False
            if i > 0 and i - 1 < len(distances):
                if distances[i - 1] >= breakpoint_threshold and distances[i - 1] > 0:
                    is_semantic_split = True

            # Safety split: don't let semantic chunks grow without bound
            is_safety_split = curr_tokens + sent_tokens > safety_max_tokens

            if (is_semantic_split or is_safety_split) and curr_buffer:
                chunks.append(self._make_chunk(doc_id, curr_buffer, len(chunks), "safety" if is_safety_split else "semantic"))
                curr_buffer = []
                curr_tokens = 0

            curr_buffer.append(sentence)
            curr_tokens += sent_tokens

        if curr_buffer:
            chunks.append(self._make_chunk(doc_id, curr_buffer, len(chunks), "final"))

        return chunks

    def _make_chunk(self, doc_id: str, buffer: List[str], index: int, split_reason: str) -> Chunk:
        text = " ".join(buffer).strip()
        return Chunk(
            id=f"{doc_id}#sl#{index}",
            doc_id=doc_id,
            text=text,
            meta={
                "chunk_index": index,
                "strategy": "semantic_local",
                "split_reason": split_reason,
                "token_count": count_tokens(text)
            }
        )
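A usage sketch for the semantic chunker (illustrative only; `toy_embed` is not the package's hashing embedding backend, it is a stand-in that merely satisfies the `Callable[[List[str]], List[List[float]]]` signature shown above):

import numpy as np
from autochunk.chunkers.semantic_local import SemanticLocalChunker

def toy_embed(sentences):
    # Deterministic pseudo-random vectors seeded from a hash of each sentence,
    # standing in for a real embedding model.
    vecs = []
    for s in sentences:
        rng = np.random.default_rng(abs(hash(s)) % (2**32))
        vecs.append(rng.normal(size=64).tolist())
    return vecs

text = (
    "Solar panels convert sunlight into electricity. Inverters change DC to AC. "
    "Meanwhile, sourdough needs a mature starter. Hydration affects the crumb."
)

chunker = SemanticLocalChunker()
chunks = chunker.chunk(
    doc_id="doc-002",
    text=text,
    embedding_fn=toy_embed,       # List[str] -> List[List[float]]
    threshold_percentile=0.9,     # boundary = 90th percentile of windowed distances
    window_size=2,
    base_token_size=256,          # feeds the 2x safety ceiling shown above
)

for c in chunks:
    print(c.meta["split_reason"], "-", c.text[:60])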
autochunk/chunkers/sentence_aware.py
ADDED
@@ -0,0 +1,102 @@

from __future__ import annotations
from typing import List, Any
from .base import BaseChunker, Chunk
from ..utils.text import split_sentences, count_tokens

class SentenceAwareChunker(BaseChunker):
    """
    Sentence-Aware Chunker with Look-back Overlap.
    Groups sentences while respecting token limits and providing context continuity.

    BEST-OF-BREED IMPROVEMENTS:
    1. Sentence Overlap: Repeats the last N sentences for transition context.
    2. Intelligent Oversize Handling: Uses recursive splitting for "monster sentences"
       instead of a crude fixed-length fallback.
    3. NLTK Integration: Leverages the updated sentence splitter.
    """
    name = "sentence_aware"

    def chunk(self, doc_id: str, text: str, base_token_size: int = 512, overlap: int = 64, **params) -> List[Chunk]:
        sentences = split_sentences(text)
        chunks = []
        current_buffer = []
        current_tokens = 0
        idx = 0

        # Overlap is defined in whole sentences where possible, budgeted by tokens;
        # the standard approach is token-bounded sentence overlap.

        for s in sentences:
            sent_tokens = count_tokens(s)

            # 1. Handle "monster sentences" (a single sentence > limit)
            if sent_tokens > base_token_size:
                # Flush anything already in the buffer first
                if current_buffer:
                    chunks.append(self._make_chunk(doc_id, current_buffer, idx))
                    idx += 1
                    # Prepare overlap from the end of the buffer
                    current_buffer, current_tokens = self._get_overlap(current_buffer, overlap)

                # Use recursive logic for this giant sentence
                from .recursive_character import RecursiveCharacterChunker
                sub_chunker = RecursiveCharacterChunker()
                sub_chunks = sub_chunker.chunk(f"{doc_id}_monster", s, base_token_size=base_token_size, overlap=overlap)

                for sc in sub_chunks:
                    chunks.append(Chunk(
                        id=f"{doc_id}#sa#{idx}",
                        doc_id=doc_id,
                        text=sc.text,
                        meta={"chunk_index": idx, "strategy": "sentence_aware_recursive"}
                    ))
                    idx += 1

                # Reset buffer after a monster sentence break
                current_buffer = []
                current_tokens = 0
                continue

            # 2. Regular accumulation
            if current_tokens + sent_tokens > base_token_size:
                # Flush
                chunks.append(self._make_chunk(doc_id, current_buffer, idx))
                idx += 1

                # Context overlap (sentence-level)
                current_buffer, current_tokens = self._get_overlap(current_buffer, overlap)

            current_buffer.append(s)
            current_tokens += sent_tokens

        if current_buffer:
            chunks.append(self._make_chunk(doc_id, current_buffer, idx))

        return chunks

    def _get_overlap(self, buffer: List[str], overlap_limit: int) -> tuple[List[str], int]:
        """Calculates the suffix of sentences to carry over for overlap."""
        overlap_buffer = []
        overlap_tokens = 0
        for s in reversed(buffer):
            t = count_tokens(s)
            if overlap_tokens + t <= overlap_limit:
                overlap_buffer.insert(0, s)
                overlap_tokens += t
            else:
                break
        return overlap_buffer, overlap_tokens

    def _make_chunk(self, doc_id: str, buffer: List[str], index: int) -> Chunk:
        text = " ".join(buffer).strip()
        return Chunk(
            id=f"{doc_id}#sa#{index}",
            doc_id=doc_id,
            text=text,
            meta={
                "chunk_index": index,
                "strategy": "sentence_aware",
                "token_count": count_tokens(text)
            }
        )
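A small sketch of the look-back overlap behaviour (illustrative; exact counts depend on the tokenizer behind `count_tokens`):

from autochunk.chunkers.sentence_aware import SentenceAwareChunker

# A short technical passage; token limits are deliberately small so the
# look-back overlap is visible in the output.
text = (
    "The scheduler assigns tasks to workers. Each worker reports progress every second. "
    "Failed tasks are retried up to three times. Results are written to the store. "
    "A final report summarizes throughput and error rates."
)

chunker = SentenceAwareChunker()
chunks = chunker.chunk(doc_id="doc-003", text=text, base_token_size=20, overlap=8)

for c in chunks:
    # Consecutive chunks should share their trailing/leading sentences,
    # up to roughly 8 tokens of carried-over context.
    print(c.id, "|", c.text)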
autochunk/cli.py
ADDED
@@ -0,0 +1,135 @@

import argparse
import sys
import os
import json
import yaml
from typing import List, Optional

from .autochunker import AutoChunker
from .config import AutoChunkConfig, EvalConfig, ProxyConfig, NetworkConfig, RagasConfig
from .storage.plan import Plan
from .utils.logger import logger

def cmd_optimize(args):
    """Run the optimization search from CLI."""
    logger.info(f"Starting CLI Optimization on: {args.docs}")

    cfg = AutoChunkConfig(
        mode=args.mode,
        embedding_provider=args.embedding_provider,
        embedding_model_or_path=args.embedding_model,
        cache_dir=args.cache_dir
    )

    # Optional override for proxy
    if args.no_proxy:
        cfg.proxy_config.enabled = False

    chunker = AutoChunker(
        mode=cfg.mode,
        eval_config=EvalConfig(objective=args.objective),
        embedding_provider=cfg.embedding_provider,
        embedding_model_or_path=cfg.embedding_model_or_path,
        cache_dir=cfg.cache_dir
    )

    # Enable RAGAS if flag is set
    if args.analyze_ragas:
        chunker.cfg.ragas_config.enabled = True
        if hasattr(args, 'ragas_llm_provider') and args.ragas_llm_provider:
            chunker.cfg.ragas_config.llm_provider = args.ragas_llm_provider
        if hasattr(args, 'ragas_llm_model') and args.ragas_llm_model:
            chunker.cfg.ragas_config.llm_model = args.ragas_llm_model
        logger.info(f"RAGAS enabled. LLM Provider: {chunker.cfg.ragas_config.llm_provider}")

    def on_progress(msg, step=None):
        prefix = f"[Stage {step}] " if step else ""
        logger.info(f"{prefix}{msg}")

    plan, report = chunker.optimize(
        documents=args.docs,
        on_progress=on_progress
    )

    # Save the plan
    Plan.write(args.out, plan)
    logger.success(f"Optimization complete! Winning plan saved to: {args.out}")

    if args.report:
        with open(args.report, 'w') as f:
            json.dump(report, f, indent=2)
        logger.info(f"Full metrics report saved to: {args.report}")

def cmd_apply(args):
    """Apply a winning plan to a corpus."""
    logger.info(f"Applying plan {args.plan} to docs: {args.docs}")

    plan = Plan.read(args.plan)

    # Initialize chunker with plan's embedding settings
    chunker = AutoChunker(
        embedding_provider=plan.embedding.get("name"),
        embedding_model_or_path=plan.embedding.get("model")
    )

    chunks = plan.apply(args.docs, chunker)

    with open(args.out, 'w', encoding='utf-8') as f:
        json.dump(chunks, f, ensure_ascii=False, indent=2)

    logger.success(f"Successfully created {len(chunks)} chunks. Selection saved to: {args.out}")

def cmd_serve(args):
    """Launch the Dashboard."""
    import uvicorn
    from .web.server import app
    logger.info(f"Launching AutoChunks Dashboard on http://{args.host}:{args.port}")
    uvicorn.run(app, host=args.host, port=args.port)

def main():
    parser = argparse.ArgumentParser(
        description="AutoChunks: Autonomous Retrieval Optimization for RAG",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    subparsers = parser.add_subparsers(dest="command", help="Command to execute")

    # Optimize command
    opt_p = subparsers.add_parser("optimize", help="Search for the best chunking strategy")
    opt_p.add_argument("--docs", required=True, help="Path to document folder")
    opt_p.add_argument("--mode", default="light", choices=["light", "full"], help="Evaluation depth: controls synthetic QA sampling density")
    opt_p.add_argument("--objective", default="balanced", choices=["balanced", "quality", "cost", "latency", "metric_only"], help="Optimization objective")
    opt_p.add_argument("--embedding-provider", default="hashing", help="Embedding provider")
    opt_p.add_argument("--embedding-model", default="BAAI/bge-small-en-v1.5", help="Model name or path")
    opt_p.add_argument("--out", default="best_plan.yaml", help="Output path for the winning plan")
    opt_p.add_argument("--report", help="Output path for the full JSON metrics report")
    opt_p.add_argument("--cache-dir", default=".ac_cache", help="Cache directory")
    opt_p.add_argument("--no-proxy", action="store_true", help="Disable representative sampling (process all docs)")
    opt_p.add_argument("--analyze-ragas", action="store_true", help="Enable RAGAS LLM-based evaluation metrics (Context Precision/Recall)")
    opt_p.add_argument("--ragas-llm-provider", default="auto", choices=["auto", "openai", "ollama", "huggingface"], help="LLM provider for RAGAS (auto detects OpenAI key or Ollama)")
    opt_p.add_argument("--ragas-llm-model", default=None, help="Model name for RAGAS LLM (e.g., gpt-4o-mini, llama3.2)")

    # Apply command
    app_p = subparsers.add_parser("apply", help="Execute a saved plan on a corpus")
    app_p.add_argument("--plan", required=True, help="Path to the .yaml plan file")
    app_p.add_argument("--docs", required=True, help="Path to documents to chunk")
    app_p.add_argument("--out", default="chunks.json", help="Output path for processed chunks")

    # Serve command
    srv_p = subparsers.add_parser("serve", help="Start the Web Dashboard")
    srv_p.add_argument("--host", default="0.0.0.0", help="Host address")
    srv_p.add_argument("--port", type=int, default=8000, help="Port number")

    args = parser.parse_args()

    if args.command == "optimize":
        cmd_optimize(args)
    elif args.command == "apply":
        cmd_apply(args)
    elif args.command == "serve":
        cmd_serve(args)
    else:
        parser.print_help()

if __name__ == "__main__":
    main()
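For reference, a hedged sketch of the programmatic equivalent of the optimize subcommand, using only the calls that `cmd_optimize` itself makes above (`./docs` is a hypothetical input folder):

from autochunk.autochunker import AutoChunker
from autochunk.config import EvalConfig
from autochunk.storage.plan import Plan

# Mirrors cmd_optimize: search for a chunking strategy over ./docs
# and persist the winning plan.
chunker = AutoChunker(
    mode="light",
    eval_config=EvalConfig(objective="balanced"),
    embedding_provider="hashing",
    embedding_model_or_path="BAAI/bge-small-en-v1.5",
    cache_dir=".ac_cache",
)

plan, report = chunker.optimize(
    documents="./docs",
    on_progress=lambda msg, step=None: print(f"[Stage {step}] {msg}" if step else msg),
)

Plan.write("best_plan.yaml", plan)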
autochunk/config.py
ADDED
@@ -0,0 +1,76 @@

from __future__ import annotations
from dataclasses import dataclass, field
from typing import Callable, Optional, List, Dict, Any

@dataclass
class EvalConfig:
    metrics: List[str] = field(default_factory=lambda: ["mrr@10", "ndcg@10", "recall@50"])
    objective: str = "balanced"  # quality|cost|latency|balanced
    k: int = 10
    latency_target_ms: int = 250
    cost_budget_usd: float = 5.0
    parent_child_eval: bool = False

@dataclass
class RetrievalStrategy:
    type: str = "standard"  # or parent_child
    child_token_size: int = 128
    parent_token_size: int = 1024

@dataclass
class ProxyConfig:
    enabled: bool = False
    cluster_k: int = 5
    proxy_percent: int = 10
    verify_percent: int = 20

@dataclass
class RagasConfig:
    enabled: bool = False
    metrics: List[str] = field(default_factory=lambda: ["context_precision", "context_recall"])
    sample_size: int = 20  # Limit RAGAS to a subset to save costs
    llm_provider: str = "openai"  # openai|ollama|huggingface
    llm_model: Optional[str] = None  # Model name (e.g., "gpt-4o-mini", "llama3.2", "microsoft/Phi-3-mini-4k-instruct")
    api_key: Optional[str] = field(default=None, repr=False)  # API Key for OpenAI/Cloud providers

@dataclass
class SafetyConstraints:
    max_chunks_per_doc: int = 5000
    min_avg_chunk_tokens: int = 120
    max_redundant_overlap_ratio: float = 0.35

@dataclass
class ParallelConfig:
    embedding_concurrency: int = 4
    retriever_concurrency: int = 4
    batch_size: int = 32

@dataclass
class NetworkConfig:
    proxy_url: Optional[str] = None
    local_models_path: Optional[str] = None
    trusted_orgs: List[str] = field(default_factory=lambda: ["ds4sd", "RapidAI", "BAAI", "sentence-transformers"])

@dataclass
class TokenizerConfig:
    name: str = "whitespace"
    vocab_source: str = "custom"

@dataclass
class AutoChunkConfig:
    eval_config: EvalConfig = field(default_factory=EvalConfig)
    retrieval_strategy: RetrievalStrategy = field(default_factory=RetrievalStrategy)
    proxy_config: ProxyConfig = field(default_factory=ProxyConfig)
    ragas_config: RagasConfig = field(default_factory=RagasConfig)
    safety: SafetyConstraints = field(default_factory=SafetyConstraints)
    parallel: ParallelConfig = field(default_factory=ParallelConfig)
    tokenizer: TokenizerConfig = field(default_factory=TokenizerConfig)
    network: NetworkConfig = field(default_factory=NetworkConfig)
    embedding_provider: str = "hashing"  # hashing|local|tei|openai
    embedding_model_or_path: str = "BAAI/bge-small-en-v1.5"
    embedding_api_key: Optional[str] = field(default=None, repr=False)  # API Key for cloud embedding providers
    mode: str = "light"  # light|full|incremental
    cache_dir: str = ".ac_cache"
    telemetry_enabled: bool = False  # Enable Tracing (Arize Phoenix)
    metadata_enrichment: Dict[str, Any] = field(default_factory=dict)