autochunks 0.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autochunk/__init__.py +9 -0
- autochunk/__main__.py +5 -0
- autochunk/adapters/__init__.py +3 -0
- autochunk/adapters/haystack.py +68 -0
- autochunk/adapters/langchain.py +81 -0
- autochunk/adapters/llamaindex.py +94 -0
- autochunk/autochunker.py +606 -0
- autochunk/chunkers/__init__.py +100 -0
- autochunk/chunkers/agentic.py +184 -0
- autochunk/chunkers/base.py +16 -0
- autochunk/chunkers/contextual_retrieval.py +151 -0
- autochunk/chunkers/fixed_length.py +110 -0
- autochunk/chunkers/html_section.py +225 -0
- autochunk/chunkers/hybrid_semantic_stat.py +199 -0
- autochunk/chunkers/layout_aware.py +192 -0
- autochunk/chunkers/parent_child.py +172 -0
- autochunk/chunkers/proposition.py +175 -0
- autochunk/chunkers/python_ast.py +248 -0
- autochunk/chunkers/recursive_character.py +215 -0
- autochunk/chunkers/semantic_local.py +140 -0
- autochunk/chunkers/sentence_aware.py +102 -0
- autochunk/cli.py +135 -0
- autochunk/config.py +76 -0
- autochunk/embedding/__init__.py +22 -0
- autochunk/embedding/adapter.py +14 -0
- autochunk/embedding/base.py +33 -0
- autochunk/embedding/hashing.py +42 -0
- autochunk/embedding/local.py +154 -0
- autochunk/embedding/ollama.py +66 -0
- autochunk/embedding/openai.py +62 -0
- autochunk/embedding/tokenizer.py +9 -0
- autochunk/enrichment/__init__.py +0 -0
- autochunk/enrichment/contextual.py +29 -0
- autochunk/eval/__init__.py +0 -0
- autochunk/eval/harness.py +177 -0
- autochunk/eval/metrics.py +27 -0
- autochunk/eval/ragas_eval.py +234 -0
- autochunk/eval/synthetic.py +104 -0
- autochunk/quality/__init__.py +31 -0
- autochunk/quality/deduplicator.py +326 -0
- autochunk/quality/overlap_optimizer.py +402 -0
- autochunk/quality/post_processor.py +245 -0
- autochunk/quality/scorer.py +459 -0
- autochunk/retrieval/__init__.py +0 -0
- autochunk/retrieval/in_memory.py +47 -0
- autochunk/retrieval/parent_child.py +4 -0
- autochunk/storage/__init__.py +0 -0
- autochunk/storage/cache.py +34 -0
- autochunk/storage/plan.py +40 -0
- autochunk/utils/__init__.py +0 -0
- autochunk/utils/hashing.py +8 -0
- autochunk/utils/io.py +176 -0
- autochunk/utils/logger.py +64 -0
- autochunk/utils/telemetry.py +44 -0
- autochunk/utils/text.py +199 -0
- autochunks-0.0.8.dist-info/METADATA +133 -0
- autochunks-0.0.8.dist-info/RECORD +61 -0
- autochunks-0.0.8.dist-info/WHEEL +5 -0
- autochunks-0.0.8.dist-info/entry_points.txt +2 -0
- autochunks-0.0.8.dist-info/licenses/LICENSE +15 -0
- autochunks-0.0.8.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
from typing import List, Optional, Dict, Any
|
|
4
|
+
import re
|
|
5
|
+
from .base import BaseChunker, Chunk
|
|
6
|
+
from ..utils.text import count_tokens
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
from bs4 import BeautifulSoup, NavigableString, Tag
|
|
10
|
+
BS4_AVAILABLE = True
|
|
11
|
+
except ImportError:
|
|
12
|
+
BS4_AVAILABLE = False
|
|
13
|
+
|
|
14
|
+
class HTMLSectionChunker(BaseChunker):
    """
    DOM-Aware HTML Chunker for structural web content splitting.

    BEST-OF-BREED FEATURES:
    1. DOM-Tree Navigation: Respects HTML hierarchy (doesn't blindly split tags).
    2. Structural Metadata: Tracks DOM path IDs (e.g. body > main > article).
    3. Semantic Grouping: Keeps tables, lists, and definition lists intact.
    4. Header Hierarchy: Uses H1-H6 as natural boundaries.
    """
    name = "html_section"

    # Tags that act as hard section boundaries
    SECTION_TAGS = {'body', 'main', 'section', 'article', 'nav', 'aside', 'footer', 'header'}

    # Tags that are logical blocks (like paragraphs)
    BLOCK_TAGS = {'p', 'div', 'blockquote', 'pre', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'figure', 'li', 'td', 'th'}

    # Tags that should be kept atomic if possible
    ATOMIC_TAGS = {'table', 'ul', 'ol', 'dl', 'code', 'pre'}

    # Heading tags used for section-boundary detection
    HEADER_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}

    def __init__(self,
                 base_token_size: int = 512,
                 max_token_size: int = 2048,
                 respect_headers: bool = True):
        """
        Initialize the chunker.

        Args:
            base_token_size: Soft target size for a chunk, in tokens.
            max_token_size: Hard cap; a single DOM block larger than this is
                re-split with RecursiveCharacterChunker.
            respect_headers: If True, start a new chunk whenever an H1-H6
                element is encountered.
        """
        self.base_token_size = base_token_size
        self.max_token_size = max_token_size
        self.respect_headers = respect_headers

    def chunk(self, doc_id: str, text: str, **params) -> List[Chunk]:
        """
        Chunk HTML text using DOM analysis.

        Args:
            doc_id: Document ID
            text: HTML source string

        Returns:
            List of Chunk objects carrying ``dom_path`` metadata.
        """
        if not BS4_AVAILABLE:
            from ..utils.logger import logger
            logger.warning("BeautifulSoup not installed, falling back to RecursiveCharacterChunker")
            from .recursive_character import RecursiveCharacterChunker
            return RecursiveCharacterChunker().chunk(doc_id, text, base_token_size=self.base_token_size)

        # Parse HTML. Prefer lxml for speed/robustness, but bs4 raises
        # FeatureNotFound when lxml isn't installed — fall back to the
        # always-available stdlib parser instead of crashing.
        try:
            soup = BeautifulSoup(text, 'lxml')
        except Exception:
            soup = BeautifulSoup(text, 'html.parser')

        # Remove noisy tags
        for t in soup(['script', 'style', 'noscript', 'meta', 'link']):
            t.decompose()

        chunks: List[Chunk] = []
        current_chunk_text: List[str] = []
        current_chunk_tokens = 0
        current_meta: Dict[str, Any] = {}

        # Traverse DOM depth-first
        elements = self._flatten_dom(soup.body if soup.body else soup)

        chunk_idx = 0

        for elem_text, dom_path, is_header in elements:
            token_count = count_tokens(elem_text)

            # If a single element is huge, flush the accumulator and
            # delegate splitting of that element to the character chunker.
            if token_count > self.max_token_size:
                if current_chunk_text:
                    self._flush_chunk(doc_id, chunk_idx, current_chunk_text, current_meta, chunks)
                    chunk_idx += 1
                    current_chunk_text = []
                    current_chunk_tokens = 0

                # Split the huge element
                from .recursive_character import RecursiveCharacterChunker
                sub_chunks = RecursiveCharacterChunker().chunk(
                    f"{doc_id}_sub", elem_text, base_token_size=self.base_token_size
                )
                for sc in sub_chunks:
                    chunks.append(Chunk(
                        id=f"{doc_id}#html#{chunk_idx}",
                        doc_id=doc_id,
                        text=sc.text,
                        meta={**current_meta, "chunk_index": chunk_idx, "dom_path": dom_path, "subtype": "large_element_split"}
                    ))
                    chunk_idx += 1
                continue

            # Split when either:
            # 1. a header starts a new section (and we honor headers), or
            # 2. adding this element would exceed the soft size target.
            is_new_section = is_header and self.respect_headers
            is_full = (current_chunk_tokens + token_count) > self.base_token_size

            if (is_new_section or is_full) and current_chunk_text:
                self._flush_chunk(doc_id, chunk_idx, current_chunk_text, current_meta, chunks)
                chunk_idx += 1
                current_chunk_text = []
                current_chunk_tokens = 0

            # First element of a fresh chunk seeds its metadata
            # (specifically the starting DOM path / header flag).
            if not current_chunk_text:
                current_meta = {"dom_path": dom_path, "is_header": is_header}

            current_chunk_text.append(elem_text)
            current_chunk_tokens += token_count

        # Final flush
        if current_chunk_text:
            self._flush_chunk(doc_id, chunk_idx, current_chunk_text, current_meta, chunks)

        return chunks

    def _flush_chunk(self, doc_id, idx, text_parts, meta, chunks_list):
        """Join accumulated parts and append a Chunk (skips empty text)."""
        full_text = "\n\n".join(text_parts).strip()
        if not full_text:
            return

        chunks_list.append(Chunk(
            id=f"{doc_id}#html#{idx}",
            doc_id=doc_id,
            text=full_text,
            meta={
                "chunk_index": idx,
                "strategy": "html_section",
                "token_count": count_tokens(full_text),
                **meta
            }
        ))

    def _flatten_dom(self, node) -> List[tuple[str, str, bool]]:
        """
        Flatten DOM into text blocks with metadata.

        Returns list of (text, dom_path, is_header).

        Delegates to ``_recursive_extract``, which controls its own recursion
        and therefore never emits the same text twice. (A flat walk over
        ``node.descendants`` cannot skip an atomic subtree once its whole
        text has been taken, which would duplicate content.)
        """
        return self._recursive_extract(node)

    def _recursive_extract(self, node, path="") -> List[tuple[str, str, bool]]:
        """
        Recursively extract (text, dom_path, is_header) triples from a node.

        Atomic tags (tables, lists, code) are emitted as a single block;
        block tags recurse with an extended path; inline tags recurse with
        the parent's path unchanged.
        """
        results = []

        if isinstance(node, NavigableString):
            text = str(node).strip()
            if text:
                return [(text, path, False)]
            return []

        if isinstance(node, Tag):
            new_path = f"{path} > {node.name}" if path else node.name
            is_header = node.name in self.HEADER_TAGS

            # Atomic tags: return all contained text as one block
            if node.name in self.ATOMIC_TAGS:
                text = node.get_text(separator="\n", strip=True)
                if text:
                    return [(text, new_path, is_header)]
                return []

            # Block tags: recurse into children with the extended path
            if node.name in self.BLOCK_TAGS:
                block_content = []
                for child in node.children:
                    block_content.extend(self._recursive_extract(child, new_path))
                return block_content

            # Inline tags (span, b, etc.): keep the parent path and continue
            for child in node.children:
                results.extend(self._recursive_extract(child, path))

        return results
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
import numpy as np
|
|
4
|
+
from typing import List, Any, Callable, Dict
|
|
5
|
+
from .base import BaseChunker, Chunk
|
|
6
|
+
from ..utils.text import split_sentences, count_tokens
|
|
7
|
+
|
|
8
|
+
class HybridSemanticStatChunker(BaseChunker):
    """
    Hybrid Chunker combining Semantic Similarity with Statistical Constraints.

    BEST-OF-BREED FEATURES:
    1. Windowed Similarity: Uses window-averaged embeddings for noise suppression.
    2. Percentile-Based Threshold: Adaptive boundary detection like SemanticLocal.
    3. Statistical Forces: Token pressure, sentence length variance, and entropy.
    4. Multi-Factor Scoring: Configurable weights for semantic vs statistical signals.
    """
    name = "hybrid_semantic_stat"

    def __init__(self,
                 alpha: float = 0.6,
                 beta: float = 0.4,
                 window_size: int = 3,
                 threshold_percentile: float = 0.85):
        """
        Initialize the chunker.

        Args:
            alpha: Weight for semantic similarity signal (0-1)
            beta: Weight for statistical signal (0-1)
            window_size: Number of sentences for windowed similarity
            threshold_percentile: Percentile for adaptive threshold (0-1)
        """
        self.alpha = alpha
        self.beta = beta
        self.window_size = window_size
        self.threshold_percentile = threshold_percentile

    def chunk(self,
              doc_id: str,
              text: str,
              embedding_fn: Callable[[List[str]], List[List[float]]] = None,
              alpha: float = None,
              beta: float = None,
              base_token_size: int = 512,
              **params) -> List[Chunk]:
        """
        Split text using hybrid semantic-statistical analysis.

        Args:
            doc_id: Document identifier
            text: Input text
            embedding_fn: Function to generate embeddings for sentences
            alpha: Override semantic weight
            beta: Override statistical weight
            base_token_size: Target chunk size

        Returns:
            List of Chunk objects
        """
        # Per-call overrides fall back to constructor defaults.
        alpha = alpha if alpha is not None else self.alpha
        beta = beta if beta is not None else self.beta

        sentences = split_sentences(text)
        # Degenerate input: nothing to split, return the text as one chunk.
        if len(sentences) <= 1:
            return [Chunk(id=f"{doc_id}#hss#0", doc_id=doc_id, text=text, meta={"chunk_index": 0})]

        # Without embeddings there is no semantic signal; fall back to a
        # purely sentence/size-based strategy rather than failing.
        if embedding_fn is None:
            from .sentence_aware import SentenceAwareChunker
            return SentenceAwareChunker().chunk(doc_id, text, base_token_size=base_token_size)

        # 1. Get embeddings — one row per sentence.
        embeddings = np.array(embedding_fn(sentences))

        # 2. Calculate per-sentence metrics (token counts and their stats,
        #    used below for the length-anomaly signal).
        sent_lengths = [count_tokens(s) for s in sentences]
        avg_length = np.mean(sent_lengths)
        std_length = np.std(sent_lengths) if len(sent_lengths) > 1 else 1.0

        # 3. Calculate windowed semantic distances (Vectorized)
        n = len(embeddings)
        semantic_distances = []

        # Pre-calculate norms; clamp near-zero norms to avoid divide-by-zero.
        # NOTE(review): `norms` is not read again below — the per-window
        # norms are recomputed inside the loop.
        norms = np.linalg.norm(embeddings, axis=1)
        norms[norms < 1e-9] = 1e-9

        # For each candidate boundary i (between sentence i and i+1), compare
        # the mean embedding of up to `window_size` sentences before vs after.
        for i in range(n - 1):
            start_prev = max(0, i - self.window_size + 1)
            end_prev = i + 1
            start_next = i + 1
            end_next = min(n, i + 1 + self.window_size)

            vec_prev = np.mean(embeddings[start_prev:end_prev], axis=0)
            vec_next = np.mean(embeddings[start_next:end_next], axis=0)

            norm_p = np.linalg.norm(vec_prev)
            norm_n = np.linalg.norm(vec_next)

            # Zero-norm windows carry no signal: treat as distance 0.
            if norm_p < 1e-9 or norm_n < 1e-9:
                dist = 0.0
            else:
                sim = np.dot(vec_prev, vec_next) / (norm_p * norm_n)
                dist = float(1 - sim)  # cosine distance in [0, 2]
            semantic_distances.append(dist)

        # 4. Calculate boundary scores (Vectorized Signals)
        semantic_signals = np.array(semantic_distances) if semantic_distances else np.zeros(n-1)

        # Token pressure: grows quadratically with accumulated tokens and
        # saturates at 1.0 once the running total reaches base_token_size.
        cumulative_tokens = np.cumsum(sent_lengths)[:-1]
        token_pressures = np.minimum(1.0, (cumulative_tokens / base_token_size) ** 2)

        # Length anomaly: z-score of sentence i's length, capped at 3 sigma.
        length_zs = np.abs(np.array(sent_lengths[:-1]) - avg_length) / (std_length + 1e-6)
        length_signals = np.minimum(1.0, length_zs / 3)

        # Combined statistical signal (fixed 70/30 internal weighting).
        stat_signals = 0.7 * token_pressures + 0.3 * length_signals

        # Final boundary score: alpha-weighted semantic + beta-weighted stats.
        combined_scores = (alpha * semantic_signals) + (beta * stat_signals)

        # Prepare score_info for the split loop (legacy compatibility with split-logic)
        boundary_scores = []
        for i in range(n - 1):
            boundary_scores.append({
                "position": i,
                "semantic": float(semantic_signals[i]),
                "statistical": float(stat_signals[i]),
                "combined": float(combined_scores[i]),
                "running_tokens": int(cumulative_tokens[i])
            })

        # 5. Determine adaptive threshold: only the top
        #    (1 - threshold_percentile) fraction of boundaries split.
        if boundary_scores:
            all_combined = [b["combined"] for b in boundary_scores]
            threshold = np.percentile(all_combined, self.threshold_percentile * 100)
        else:
            threshold = 0.5

        # 6. Build chunks using detected boundaries
        chunks = []
        curr_sentences = [sentences[0]]
        curr_tokens = sent_lengths[0]

        for i, score_info in enumerate(boundary_scores):
            should_split = False
            split_reason = "none"

            # Semantic+Statistical split
            if score_info["combined"] >= threshold:
                should_split = True
                split_reason = "hybrid"

            # Safety split (hard token limit, 30% over target). Note this
            # deliberately overrides split_reason if both conditions fire.
            next_sent_tokens = sent_lengths[i + 1] if i + 1 < len(sent_lengths) else 0
            if curr_tokens + next_sent_tokens > base_token_size * 1.3:
                should_split = True
                split_reason = "safety"

            if should_split and curr_sentences:
                chunk_text = " ".join(curr_sentences)
                chunks.append(Chunk(
                    id=f"{doc_id}#hss#{len(chunks)}",
                    doc_id=doc_id,
                    text=chunk_text,
                    meta={
                        "chunk_index": len(chunks),
                        "strategy": "hybrid_semantic_stat",
                        "split_reason": split_reason,
                        "boundary_score": score_info["combined"],
                        "token_count": count_tokens(chunk_text)
                    }
                ))
                curr_sentences = []
                curr_tokens = 0

            # Add next sentence to buffer (boundary i sits before sentence i+1)
            if i + 1 < len(sentences):
                curr_sentences.append(sentences[i + 1])
                curr_tokens += sent_lengths[i + 1]

        # Final chunk: whatever remains in the buffer after the last boundary.
        if curr_sentences:
            chunk_text = " ".join(curr_sentences)
            chunks.append(Chunk(
                id=f"{doc_id}#hss#{len(chunks)}",
                doc_id=doc_id,
                text=chunk_text,
                meta={
                    "chunk_index": len(chunks),
                    "strategy": "hybrid_semantic_stat",
                    "split_reason": "final",
                    "token_count": count_tokens(chunk_text)
                }
            ))

        return chunks
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
from typing import List, Dict, Any
|
|
4
|
+
import re
|
|
5
|
+
from .base import BaseChunker, Chunk
|
|
6
|
+
from ..utils.text import count_tokens, extract_code_blocks
|
|
7
|
+
|
|
8
|
+
class LayoutAwareChunker(BaseChunker):
    """
    Document Chunker with Structure Preservation.

    BEST-OF-BREED FEATURES:
    1. Table Inheritance: Re-attaches table headers to split table fragments.
    2. Header Lineage: Prepends [Section: X > Y] for retrieval context.
    3. Code Block Integrity: Never splits inside fenced code blocks.
    4. List Awareness: Keeps list items together when possible.
    5. Multi-Format: Handles Markdown, HTML tables, and plain text.
    """
    name = "layout_aware"

    def __init__(self,
                 prepend_lineage: bool = True,
                 preserve_code_blocks: bool = True,
                 preserve_tables: bool = True):
        """
        Initialize the chunker.

        Args:
            prepend_lineage: If True, prepend section hierarchy to chunk text
            preserve_code_blocks: If True, avoid splitting inside ``` blocks
            preserve_tables: If True, re-attach table headers to fragments
        """
        self.prepend_lineage = prepend_lineage
        self.preserve_code_blocks = preserve_code_blocks
        self.preserve_tables = preserve_tables

    def chunk(self, doc_id: str, text: str, base_token_size: int = 512, **params) -> List[Chunk]:
        """
        Split text while respecting document structure.

        Args:
            doc_id: Document identifier
            text: Input text (Markdown preferred)
            base_token_size: Target chunk size in tokens

        Returns:
            List of Chunk objects with structural metadata
        """
        # NOTE(review): the extracted code blocks are not consumed below;
        # kept for parity with the original in case extract_code_blocks
        # has side effects — confirm and drop if it is a pure function.
        code_blocks = extract_code_blocks(text) if self.preserve_code_blocks else []

        lines = text.split("\n")
        chunks: List[Chunk] = []
        buffer: List[str] = []
        buffer_tokens = 0

        # State tracking
        header_stack: List[str] = []   # Current header hierarchy
        table_header = None            # Current table header row
        table_separator = None         # Table separator row |---|
        in_code_block = False
        code_block_buffer: List[str] = []

        for line_idx, line in enumerate(lines):
            stripped = line.strip()
            line_tokens = count_tokens(line)

            # Track fenced code blocks: buffer lines between ``` fences and
            # treat the whole fence as one indivisible unit.
            if stripped.startswith("```"):
                if not in_code_block:
                    # Starting a code block
                    in_code_block = True
                    code_block_buffer = [line]
                    continue
                else:
                    # Ending a code block
                    in_code_block = False
                    code_block_buffer.append(line)
                    code_block_text = "\n".join(code_block_buffer)
                    code_block_tokens = count_tokens(code_block_text)

                    # Flush buffer if adding the code block would overflow
                    if buffer_tokens + code_block_tokens > base_token_size and buffer:
                        chunks.append(self._make_chunk(doc_id, buffer, len(chunks), list(header_stack)))
                        buffer = []
                        buffer_tokens = 0

                    buffer.append(code_block_text)
                    buffer_tokens += code_block_tokens
                    code_block_buffer = []
                    continue

            if in_code_block:
                code_block_buffer.append(line)
                continue

            # Preserve interior blank lines, but never start a buffer with one
            if not stripped:
                if buffer:
                    buffer.append("")
                continue

            # Header detection - update lineage
            if stripped.startswith("#"):
                match = re.match(r'^(#+)\s+(.+)$', stripped)
                if match:
                    level = len(match.group(1))
                    title = match.group(2).strip()

                    # BUGFIX: flush the previous section BEFORE updating the
                    # stack, under a copy of the lineage it was actually
                    # written under. (Previously the stack was trimmed to the
                    # new header's level first, so the flushed chunk could
                    # receive a truncated/wrong lineage.)
                    if buffer:
                        chunks.append(self._make_chunk(doc_id, buffer, len(chunks), list(header_stack)))
                        buffer = []
                        buffer_tokens = 0

                    # Trim stack to parent level and add new header
                    header_stack = header_stack[:level - 1]
                    header_stack.append(title)

            # Table detection
            is_table_row = "|" in stripped and not stripped.startswith("```")
            is_separator_row = bool(is_table_row and re.match(r'^[\|\s\-:]+$', stripped))

            if is_table_row:
                if table_header is None and not is_separator_row:
                    table_header = line
                elif is_separator_row:
                    table_separator = line
            elif table_header:
                # Exiting table
                table_header = None
                table_separator = None

            # Assembly logic: flush on overflow
            if buffer_tokens + line_tokens > base_token_size and buffer:
                chunks.append(self._make_chunk(doc_id, buffer, len(chunks), list(header_stack)))
                buffer = []
                buffer_tokens = 0

                # Table inheritance: re-attach the header (and separator) so
                # a table fragment starting a new chunk stays readable.
                if is_table_row and self.preserve_tables and table_header and line != table_header:
                    buffer.append(table_header)
                    buffer_tokens += count_tokens(table_header)
                    if table_separator:
                        buffer.append(table_separator)
                        buffer_tokens += count_tokens(table_separator)

            buffer.append(line)
            buffer_tokens += line_tokens

        # Handle remaining code block if the fence was never closed
        if code_block_buffer:
            code_block_text = "\n".join(code_block_buffer)
            buffer.append(code_block_text)
            buffer_tokens += count_tokens(code_block_text)

        # Final chunk
        if buffer:
            chunks.append(self._make_chunk(doc_id, buffer, len(chunks), list(header_stack)))

        return chunks

    def _make_chunk(self, doc_id: str, lines: List[str], index: int, lineage: List[str]) -> Chunk:
        """Create a chunk with proper formatting and metadata."""
        # Work on a copy so the caller's buffer is never mutated.
        lines = list(lines)

        # Clean up empty lines at start/end
        while lines and not lines[0].strip():
            lines.pop(0)
        while lines and not lines[-1].strip():
            lines.pop()

        body_text = "\n".join(lines)

        # Prepend lineage for improved retrieval
        if self.prepend_lineage and lineage:
            lineage_str = " > ".join(lineage)
            final_text = f"[Section: {lineage_str}]\n{body_text}"
        else:
            final_text = body_text

        return Chunk(
            id=f"{doc_id}#la#{index}",
            doc_id=doc_id,
            text=final_text,
            meta={
                "chunk_index": index,
                "strategy": "layout_aware",
                # Store a copy so later mutation of the caller's stack can
                # never retroactively change this chunk's metadata.
                "lineage": list(lineage),
                "lineage_str": " > ".join(lineage) if lineage else "",
                "token_count": count_tokens(final_text)
            }
        )