autochunks-0.0.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. autochunk/__init__.py +9 -0
  2. autochunk/__main__.py +5 -0
  3. autochunk/adapters/__init__.py +3 -0
  4. autochunk/adapters/haystack.py +68 -0
  5. autochunk/adapters/langchain.py +81 -0
  6. autochunk/adapters/llamaindex.py +94 -0
  7. autochunk/autochunker.py +606 -0
  8. autochunk/chunkers/__init__.py +100 -0
  9. autochunk/chunkers/agentic.py +184 -0
  10. autochunk/chunkers/base.py +16 -0
  11. autochunk/chunkers/contextual_retrieval.py +151 -0
  12. autochunk/chunkers/fixed_length.py +110 -0
  13. autochunk/chunkers/html_section.py +225 -0
  14. autochunk/chunkers/hybrid_semantic_stat.py +199 -0
  15. autochunk/chunkers/layout_aware.py +192 -0
  16. autochunk/chunkers/parent_child.py +172 -0
  17. autochunk/chunkers/proposition.py +175 -0
  18. autochunk/chunkers/python_ast.py +248 -0
  19. autochunk/chunkers/recursive_character.py +215 -0
  20. autochunk/chunkers/semantic_local.py +140 -0
  21. autochunk/chunkers/sentence_aware.py +102 -0
  22. autochunk/cli.py +135 -0
  23. autochunk/config.py +76 -0
  24. autochunk/embedding/__init__.py +22 -0
  25. autochunk/embedding/adapter.py +14 -0
  26. autochunk/embedding/base.py +33 -0
  27. autochunk/embedding/hashing.py +42 -0
  28. autochunk/embedding/local.py +154 -0
  29. autochunk/embedding/ollama.py +66 -0
  30. autochunk/embedding/openai.py +62 -0
  31. autochunk/embedding/tokenizer.py +9 -0
  32. autochunk/enrichment/__init__.py +0 -0
  33. autochunk/enrichment/contextual.py +29 -0
  34. autochunk/eval/__init__.py +0 -0
  35. autochunk/eval/harness.py +177 -0
  36. autochunk/eval/metrics.py +27 -0
  37. autochunk/eval/ragas_eval.py +234 -0
  38. autochunk/eval/synthetic.py +104 -0
  39. autochunk/quality/__init__.py +31 -0
  40. autochunk/quality/deduplicator.py +326 -0
  41. autochunk/quality/overlap_optimizer.py +402 -0
  42. autochunk/quality/post_processor.py +245 -0
  43. autochunk/quality/scorer.py +459 -0
  44. autochunk/retrieval/__init__.py +0 -0
  45. autochunk/retrieval/in_memory.py +47 -0
  46. autochunk/retrieval/parent_child.py +4 -0
  47. autochunk/storage/__init__.py +0 -0
  48. autochunk/storage/cache.py +34 -0
  49. autochunk/storage/plan.py +40 -0
  50. autochunk/utils/__init__.py +0 -0
  51. autochunk/utils/hashing.py +8 -0
  52. autochunk/utils/io.py +176 -0
  53. autochunk/utils/logger.py +64 -0
  54. autochunk/utils/telemetry.py +44 -0
  55. autochunk/utils/text.py +199 -0
  56. autochunks-0.0.8.dist-info/METADATA +133 -0
  57. autochunks-0.0.8.dist-info/RECORD +61 -0
  58. autochunks-0.0.8.dist-info/WHEEL +5 -0
  59. autochunks-0.0.8.dist-info/entry_points.txt +2 -0
  60. autochunks-0.0.8.dist-info/licenses/LICENSE +15 -0
  61. autochunks-0.0.8.dist-info/top_level.txt +1 -0
autochunk/chunkers/html_section.py
@@ -0,0 +1,225 @@
+
+from __future__ import annotations
+from typing import List, Optional, Dict, Any
+import re
+from .base import BaseChunker, Chunk
+from ..utils.text import count_tokens
+
+try:
+    from bs4 import BeautifulSoup, NavigableString, Tag
+    BS4_AVAILABLE = True
+except ImportError:
+    BS4_AVAILABLE = False
+
+class HTMLSectionChunker(BaseChunker):
+    """
+    DOM-Aware HTML Chunker for structural web content splitting.
+
+    BEST-OF-BREED FEATURES:
+    1. DOM-Tree Navigation: Respects HTML hierarchy (doesn't blindly split tags).
+    2. Structural Metadata: Tracks DOM path IDs (e.g. body > main > article).
+    3. Semantic Grouping: Keeps tables, lists, and definition lists intact.
+    4. Header Hierarchy: Uses H1-H6 as natural boundaries.
+    """
+    name = "html_section"
+
+    # Tags that act as hard section boundaries
+    SECTION_TAGS = {'body', 'main', 'section', 'article', 'nav', 'aside', 'footer', 'header'}
+
+    # Tags that are logical blocks (like paragraphs)
+    BLOCK_TAGS = {'p', 'div', 'blockquote', 'pre', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'figure', 'li', 'td', 'th'}
+
+    # Tags that should be kept atomic if possible
+    ATOMIC_TAGS = {'table', 'ul', 'ol', 'dl', 'code', 'pre'}
+
+    def __init__(self,
+                 base_token_size: int = 512,
+                 max_token_size: int = 2048,
+                 respect_headers: bool = True):
+        self.base_token_size = base_token_size
+        self.max_token_size = max_token_size
+        self.respect_headers = respect_headers
+
+    def chunk(self, doc_id: str, text: str, **params) -> List[Chunk]:
+        """
+        Chunk HTML text using DOM analysis.
+
+        Args:
+            doc_id: Document ID
+            text: HTML source string
+        """
+        if not BS4_AVAILABLE:
+            from ..utils.logger import logger
+            logger.warning("BeautifulSoup not installed, falling back to RecursiveCharacterChunker")
+            from .recursive_character import RecursiveCharacterChunker
+            return RecursiveCharacterChunker().chunk(doc_id, text, base_token_size=self.base_token_size)
+
+        # Parse HTML
+        soup = BeautifulSoup(text, 'lxml')
+
+        # Remove noisy tags
+        for t in soup(['script', 'style', 'noscript', 'meta', 'link']):
+            t.decompose()
+
+        chunks = []
+        current_chunk_text = []
+        current_chunk_tokens = 0
+        current_meta = {}
+
+        # Traverse DOM depth-first
+        elements = self._flatten_dom(soup.body if soup.body else soup)
+
+        chunk_idx = 0
+
+        for elem_text, dom_path, is_header in elements:
+            token_count = count_tokens(elem_text)
+
+            # If a single element is huge, split it (fallback)
+            if token_count > self.max_token_size:
+                # Flush current accumulator first
+                if current_chunk_text:
+                    self._flush_chunk(doc_id, chunk_idx, current_chunk_text, current_meta, chunks)
+                    chunk_idx += 1
+                    current_chunk_text = []
+                    current_chunk_tokens = 0
+
+                # Split the huge element
+                from .recursive_character import RecursiveCharacterChunker
+                sub_chunks = RecursiveCharacterChunker().chunk(
+                    f"{doc_id}_sub", elem_text, base_token_size=self.base_token_size
+                )
+                for sc in sub_chunks:
+                    chunks.append(Chunk(
+                        id=f"{doc_id}#html#{chunk_idx}",
+                        doc_id=doc_id,
+                        text=sc.text,
+                        meta={**current_meta, "chunk_index": chunk_idx, "dom_path": dom_path, "subtype": "large_element_split"}
+                    ))
+                    chunk_idx += 1
+                continue
+
+            # Check if we should split:
+            # 1. Header detected (and we have content)
+            # 2. Size limit reached
+            is_new_section = is_header and self.respect_headers
+            is_full = (current_chunk_tokens + token_count) > self.base_token_size
+
+            if (is_new_section or is_full) and current_chunk_text:
+                self._flush_chunk(doc_id, chunk_idx, current_chunk_text, current_meta, chunks)
+                chunk_idx += 1
+                current_chunk_text = []
+                current_chunk_tokens = 0
+                # Use metadata from the new starting element (specifically header info)
+                current_meta = {"dom_path": dom_path, "is_header": is_header}
+
+            if not current_chunk_text:
+                current_meta = {"dom_path": dom_path, "is_header": is_header}
+
+            current_chunk_text.append(elem_text)
+            current_chunk_tokens += token_count
+
+        # Final flush
+        if current_chunk_text:
+            self._flush_chunk(doc_id, chunk_idx, current_chunk_text, current_meta, chunks)
+
+        return chunks
+
+    def _flush_chunk(self, doc_id, idx, text_parts, meta, chunks_list):
+        full_text = "\n\n".join(text_parts).strip()
+        if not full_text:
+            return
+
+        chunks_list.append(Chunk(
+            id=f"{doc_id}#html#{idx}",
+            doc_id=doc_id,
+            text=full_text,
+            meta={
+                "chunk_index": idx,
+                "strategy": "html_section",
+                "token_count": count_tokens(full_text),
+                **meta
+            }
+        ))
+
+    def _flatten_dom(self, node) -> List[tuple[str, str, bool]]:
+        """
+        Flatten DOM into text blocks with metadata.
+        Returns list of (text, dom_path, is_header).
+        """
+        results = []
+
+        # Helper to get path
+        def get_path(tag):
+            path = []
+            p = tag.parent
+            while p and p.name != '[document]':
+                path.insert(0, p.name)
+                p = p.parent
+            path.append(tag.name)
+            return " > ".join(path)
+
+        # Iterate only over block elements or meaningful leaf nodes.
+        # This is a simplification: we walk and extract text from "safe" blocks.
+        for element in node.descendants:
+            if isinstance(element, Tag):
+                if element.name in self.BLOCK_TAGS or element.name in self.ATOMIC_TAGS:
+                    # Check if this element contains OTHER block tags. If so, don't extract yet (wait for children).
+                    # Exception: ATOMIC_TAGS (tables, etc.) - extract WHOLE text
+                    has_block_children = any(child.name in self.BLOCK_TAGS for child in element.find_all(recursive=False))
+
+                    if element.name in self.ATOMIC_TAGS or not has_block_children:
+                        # Extract text
+                        text = element.get_text(separator=" ", strip=True)
+                        if text:
+                            is_header = element.name in {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
+                            path = get_path(element)
+                            results.append((text, path, is_header))
+
+                    # node.descendants is a flat generator, so atomic tags would be
+                    # revisited through their children and we cannot easily skip
+                    # subtrees inside this loop. Relying only on "leaf-like" blocks
+                    # is fragile, which is why the recursive generator below is used.
+
+        # Better approach: recursive generator
+        return self._recursive_extract(node)
+
+    def _recursive_extract(self, node, path="") -> List[tuple[str, str, bool]]:
+        results = []
+
+        if isinstance(node, NavigableString):
+            text = str(node).strip()
+            if text:
+                return [(text, path, False)]
+            return []
+
+        if isinstance(node, Tag):
+            new_path = f"{path} > {node.name}" if path else node.name
+            is_header = node.name in {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
+
+            # Headers: return their full text as one block so the is_header flag
+            # survives (pulling the text from child text nodes would mark it False).
+            if is_header:
+                text = node.get_text(separator=" ", strip=True)
+                return [(text, new_path, True)] if text else []
+
+            # Atomic tags: return all text as one block
+            if node.name in self.ATOMIC_TAGS:
+                text = node.get_text(separator="\n", strip=True)
+                if text:
+                    return [(text, new_path, is_header)]
+                return []
+
+            # Block tags: process content
+            if node.name in self.BLOCK_TAGS:
+                # Process children
+                block_content = []
+                for child in node.children:
+                    block_content.extend(self._recursive_extract(child, new_path))
+
+                # If we gathered content, maybe we should join it if it's small?
+                # For now, just return specific detailed blocks
+                return block_content
+
+            # Inline tags (span, b, etc.): just continue
+            for child in node.children:
+                results.extend(self._recursive_extract(child, path))  # passing parent path for inline
+
+        return results
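
For orientation, a minimal usage sketch of HTMLSectionChunker follows. It is not taken from the package: the import path is inferred from the autochunk/chunkers/html_section.py entry in the file listing above, and it assumes beautifulsoup4 and lxml are installed.

```python
# Hypothetical usage sketch (not part of the package).
# Import path inferred from autochunk/chunkers/html_section.py.
from autochunk.chunkers.html_section import HTMLSectionChunker

html = """
<html><body>
  <h1>Billing</h1>
  <p>Invoices are generated on the first of each month.</p>
  <h2>Refunds</h2>
  <p>Refunds are processed within five business days.</p>
  <table><tr><th>Plan</th><th>Price</th></tr><tr><td>Pro</td><td>$20</td></tr></table>
</body></html>
"""

chunker = HTMLSectionChunker(base_token_size=256, respect_headers=True)
chunks = chunker.chunk(doc_id="billing-page", text=html)

for c in chunks:
    # Each Chunk carries the DOM path and header flag gathered during traversal.
    print(c.meta.get("dom_path"), "|", c.text[:60])
```

As the chunk() method above shows, if bs4 is not importable the chunker silently falls back to RecursiveCharacterChunker.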
autochunk/chunkers/hybrid_semantic_stat.py
@@ -0,0 +1,199 @@
+
+from __future__ import annotations
+import numpy as np
+from typing import List, Any, Callable, Dict
+from .base import BaseChunker, Chunk
+from ..utils.text import split_sentences, count_tokens
+
+class HybridSemanticStatChunker(BaseChunker):
+    """
+    Hybrid Chunker combining Semantic Similarity with Statistical Constraints.
+
+    BEST-OF-BREED FEATURES:
+    1. Windowed Similarity: Uses window-averaged embeddings for noise suppression.
+    2. Percentile-Based Threshold: Adaptive boundary detection like SemanticLocal.
+    3. Statistical Forces: Token pressure, sentence length variance, and entropy.
+    4. Multi-Factor Scoring: Configurable weights for semantic vs statistical signals.
+    """
+    name = "hybrid_semantic_stat"
+
+    def __init__(self,
+                 alpha: float = 0.6,
+                 beta: float = 0.4,
+                 window_size: int = 3,
+                 threshold_percentile: float = 0.85):
+        """
+        Initialize the chunker.
+
+        Args:
+            alpha: Weight for semantic similarity signal (0-1)
+            beta: Weight for statistical signal (0-1)
+            window_size: Number of sentences for windowed similarity
+            threshold_percentile: Percentile for adaptive threshold (0-1)
+        """
+        self.alpha = alpha
+        self.beta = beta
+        self.window_size = window_size
+        self.threshold_percentile = threshold_percentile
+
+    def chunk(self,
+              doc_id: str,
+              text: str,
+              embedding_fn: Callable[[List[str]], List[List[float]]] = None,
+              alpha: float = None,
+              beta: float = None,
+              base_token_size: int = 512,
+              **params) -> List[Chunk]:
+        """
+        Split text using hybrid semantic-statistical analysis.
+
+        Args:
+            doc_id: Document identifier
+            text: Input text
+            embedding_fn: Function to generate embeddings for sentences
+            alpha: Override semantic weight
+            beta: Override statistical weight
+            base_token_size: Target chunk size
+
+        Returns:
+            List of Chunk objects
+        """
+        alpha = alpha if alpha is not None else self.alpha
+        beta = beta if beta is not None else self.beta
+
+        sentences = split_sentences(text)
+        if len(sentences) <= 1:
+            return [Chunk(id=f"{doc_id}#hss#0", doc_id=doc_id, text=text, meta={"chunk_index": 0})]
+
+        if embedding_fn is None:
+            from .sentence_aware import SentenceAwareChunker
+            return SentenceAwareChunker().chunk(doc_id, text, base_token_size=base_token_size)
+
+        # 1. Get embeddings
+        embeddings = np.array(embedding_fn(sentences))
+
+        # 2. Calculate per-sentence metrics
+        sent_lengths = [count_tokens(s) for s in sentences]
+        avg_length = np.mean(sent_lengths)
+        std_length = np.std(sent_lengths) if len(sent_lengths) > 1 else 1.0
+
+        # 3. Calculate windowed semantic distances
+        n = len(embeddings)
+        semantic_distances = []
+
+        # Pre-calculate norms for efficient similarity calculation
+        norms = np.linalg.norm(embeddings, axis=1)
+        norms[norms < 1e-9] = 1e-9
+
+        for i in range(n - 1):
+            start_prev = max(0, i - self.window_size + 1)
+            end_prev = i + 1
+            start_next = i + 1
+            end_next = min(n, i + 1 + self.window_size)
+
+            vec_prev = np.mean(embeddings[start_prev:end_prev], axis=0)
+            vec_next = np.mean(embeddings[start_next:end_next], axis=0)
+
+            norm_p = np.linalg.norm(vec_prev)
+            norm_n = np.linalg.norm(vec_next)
+
+            if norm_p < 1e-9 or norm_n < 1e-9:
+                dist = 0.0
+            else:
+                sim = np.dot(vec_prev, vec_next) / (norm_p * norm_n)
+                dist = float(1 - sim)
+            semantic_distances.append(dist)
+
+        # 4. Calculate boundary scores (Vectorized Signals)
+        semantic_signals = np.array(semantic_distances) if semantic_distances else np.zeros(n - 1)
+
+        # Vectorize cumulative token pressure
+        cumulative_tokens = np.cumsum(sent_lengths)[:-1]
+        token_pressures = np.minimum(1.0, (cumulative_tokens / base_token_size) ** 2)
+
+        # Vectorize length anomaly signals
+        length_zs = np.abs(np.array(sent_lengths[:-1]) - avg_length) / (std_length + 1e-6)
+        length_signals = np.minimum(1.0, length_zs / 3)
+
+        # Combined statistical signal
+        stat_signals = 0.7 * token_pressures + 0.3 * length_signals
+
+        # Vectorized combined boundary scores
+        combined_scores = (alpha * semantic_signals) + (beta * stat_signals)
+
+        # Prepare score_info for the split loop (legacy compatibility with split logic)
+        boundary_scores = []
+        for i in range(n - 1):
+            boundary_scores.append({
+                "position": i,
+                "semantic": float(semantic_signals[i]),
+                "statistical": float(stat_signals[i]),
+                "combined": float(combined_scores[i]),
+                "running_tokens": int(cumulative_tokens[i])
+            })
+
+        # 5. Determine adaptive threshold
+        if boundary_scores:
+            all_combined = [b["combined"] for b in boundary_scores]
+            threshold = np.percentile(all_combined, self.threshold_percentile * 100)
+        else:
+            threshold = 0.5
+
+        # 6. Build chunks using detected boundaries
+        chunks = []
+        curr_sentences = [sentences[0]]
+        curr_tokens = sent_lengths[0]
+
+        for i, score_info in enumerate(boundary_scores):
+            should_split = False
+            split_reason = "none"
+
+            # Semantic + statistical split
+            if score_info["combined"] >= threshold:
+                should_split = True
+                split_reason = "hybrid"
+
+            # Safety split (hard token limit)
+            next_sent_tokens = sent_lengths[i + 1] if i + 1 < len(sent_lengths) else 0
+            if curr_tokens + next_sent_tokens > base_token_size * 1.3:
+                should_split = True
+                split_reason = "safety"

+            if should_split and curr_sentences:
+                chunk_text = " ".join(curr_sentences)
+                chunks.append(Chunk(
+                    id=f"{doc_id}#hss#{len(chunks)}",
+                    doc_id=doc_id,
+                    text=chunk_text,
+                    meta={
+                        "chunk_index": len(chunks),
+                        "strategy": "hybrid_semantic_stat",
+                        "split_reason": split_reason,
+                        "boundary_score": score_info["combined"],
+                        "token_count": count_tokens(chunk_text)
+                    }
+                ))
+                curr_sentences = []
+                curr_tokens = 0
+
+            # Add next sentence to buffer
+            if i + 1 < len(sentences):
+                curr_sentences.append(sentences[i + 1])
+                curr_tokens += sent_lengths[i + 1]
+
+        # Final chunk
+        if curr_sentences:
+            chunk_text = " ".join(curr_sentences)
+            chunks.append(Chunk(
+                id=f"{doc_id}#hss#{len(chunks)}",
+                doc_id=doc_id,
+                text=chunk_text,
+                meta={
+                    "chunk_index": len(chunks),
+                    "strategy": "hybrid_semantic_stat",
+                    "split_reason": "final",
+                    "token_count": count_tokens(chunk_text)
+                }
+            ))
+
+        return chunks
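
A minimal usage sketch for HybridSemanticStatChunker follows, again with an inferred import path. The toy hash-based embedder exists only to illustrate the expected embedding_fn signature (List[str] -> List[List[float]]); in practice you would plug in a real sentence-embedding model, and without an embedding_fn the chunker falls back to SentenceAwareChunker as shown above.

```python
# Hypothetical usage sketch (not part of the package).
# Import path inferred from autochunk/chunkers/hybrid_semantic_stat.py.
import hashlib
from autochunk.chunkers.hybrid_semantic_stat import HybridSemanticStatChunker

def toy_embed(sentences):
    """Deterministic 8-dim pseudo-embeddings derived from a hash of each sentence."""
    vectors = []
    for s in sentences:
        digest = hashlib.sha256(s.lower().encode("utf-8")).digest()
        vectors.append([b / 255.0 for b in digest[:8]])
    return vectors

text = (
    "The billing cycle starts on the first of the month. "
    "Invoices are emailed automatically. "
    "Refunds follow a different process. "
    "They are reviewed manually and take five business days."
)

chunker = HybridSemanticStatChunker(alpha=0.6, beta=0.4, window_size=2)
chunks = chunker.chunk(doc_id="billing-doc", text=text,
                       embedding_fn=toy_embed, base_token_size=64)

for c in chunks:
    # split_reason is "hybrid", "safety", or "final" depending on why the boundary fired.
    print(c.meta.get("split_reason"), "|", c.text)
```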
autochunk/chunkers/layout_aware.py
@@ -0,0 +1,192 @@
+
+from __future__ import annotations
+from typing import List, Dict, Any
+import re
+from .base import BaseChunker, Chunk
+from ..utils.text import count_tokens, extract_code_blocks
+
+class LayoutAwareChunker(BaseChunker):
+    """
+    Document Chunker with Structure Preservation.
+
+    BEST-OF-BREED FEATURES:
+    1. Table Inheritance: Re-attaches table headers to split table fragments.
+    2. Header Lineage: Prepends [Section: X > Y] for retrieval context.
+    3. Code Block Integrity: Never splits inside fenced code blocks.
+    4. List Awareness: Keeps list items together when possible.
+    5. Multi-Format: Handles Markdown, HTML tables, and plain text.
+    """
+    name = "layout_aware"
+
+    def __init__(self,
+                 prepend_lineage: bool = True,
+                 preserve_code_blocks: bool = True,
+                 preserve_tables: bool = True):
+        """
+        Initialize the chunker.
+
+        Args:
+            prepend_lineage: If True, prepend section hierarchy to chunk text
+            preserve_code_blocks: If True, avoid splitting inside ``` blocks
+            preserve_tables: If True, re-attach table headers to fragments
+        """
+        self.prepend_lineage = prepend_lineage
+        self.preserve_code_blocks = preserve_code_blocks
+        self.preserve_tables = preserve_tables
+
+    def chunk(self, doc_id: str, text: str, base_token_size: int = 512, **params) -> List[Chunk]:
+        """
+        Split text while respecting document structure.
+
+        Args:
+            doc_id: Document identifier
+            text: Input text (Markdown preferred)
+            base_token_size: Target chunk size in tokens
+
+        Returns:
+            List of Chunk objects with structural metadata
+        """
+        # Extract structural elements
+        code_blocks = extract_code_blocks(text) if self.preserve_code_blocks else []
+
+        lines = text.split("\n")
+        chunks = []
+        buffer = []
+        buffer_tokens = 0
+
+        # State tracking
+        header_stack = []       # Current header hierarchy
+        table_header = None     # Current table header row
+        table_separator = None  # Table separator row |---|
+        in_code_block = False
+        code_block_buffer = []
+
+        for line_idx, line in enumerate(lines):
+            stripped = line.strip()
+            line_tokens = count_tokens(line)
+
+            # Track fenced code blocks
+            if stripped.startswith("```"):
+                if not in_code_block:
+                    # Starting a code block
+                    in_code_block = True
+                    code_block_buffer = [line]
+                    continue
+                else:
+                    # Ending a code block
+                    in_code_block = False
+                    code_block_buffer.append(line)
+                    code_block_text = "\n".join(code_block_buffer)
+                    code_block_tokens = count_tokens(code_block_text)
+
+                    # Flush buffer if adding code block would overflow
+                    if buffer_tokens + code_block_tokens > base_token_size and buffer:
+                        chunks.append(self._make_chunk(doc_id, buffer, len(chunks), header_stack))
+                        buffer = []
+                        buffer_tokens = 0
+
+                    buffer.append(code_block_text)
+                    buffer_tokens += code_block_tokens
+                    code_block_buffer = []
+                    continue
+
+            if in_code_block:
+                code_block_buffer.append(line)
+                continue
+
+            # Skip empty lines but preserve them in buffer
+            if not stripped:
+                if buffer:
+                    buffer.append("")
+                continue
+
+            # Header detection - update lineage
+            if stripped.startswith("#"):
+                # Count header level
+                match = re.match(r'^(#+)\s+(.+)$', stripped)
+                if match:
+                    # Hard break on new header: flush the previous section under its
+                    # own lineage before the stack is updated for the new section.
+                    if buffer:
+                        chunks.append(self._make_chunk(doc_id, buffer, len(chunks), header_stack))
+                        buffer = []
+                        buffer_tokens = 0
+
+                    level = len(match.group(1))
+                    title = match.group(2).strip()
+
+                    # Trim stack to parent level and add new header
+                    header_stack = header_stack[:level-1]
+                    header_stack.append(title)
+
+            # Table detection
+            is_table_row = "|" in stripped and not stripped.startswith("```")
+            is_separator_row = is_table_row and re.match(r'^[\|\s\-:]+$', stripped)
+
+            if is_table_row:
+                if table_header is None and not is_separator_row:
+                    table_header = line
+                elif is_separator_row:
+                    table_separator = line
+            elif table_header:
+                # Exiting table
+                table_header = None
+                table_separator = None
+
+            # Assembly logic
+            if buffer_tokens + line_tokens > base_token_size and buffer:
+                chunks.append(self._make_chunk(doc_id, buffer, len(chunks), header_stack))
+                buffer = []
+                buffer_tokens = 0
+
+                # Table inheritance: add header to new chunk
+                if is_table_row and self.preserve_tables and table_header and line != table_header:
+                    buffer.append(table_header)
+                    buffer_tokens += count_tokens(table_header)
+                    if table_separator:
+                        buffer.append(table_separator)
+                        buffer_tokens += count_tokens(table_separator)
+
+            buffer.append(line)
+            buffer_tokens += line_tokens
+
+        # Handle remaining code block if unclosed
+        if code_block_buffer:
+            code_block_text = "\n".join(code_block_buffer)
+            buffer.append(code_block_text)
+            buffer_tokens += count_tokens(code_block_text)
+
+        # Final chunk
+        if buffer:
+            chunks.append(self._make_chunk(doc_id, buffer, len(chunks), header_stack))
+
+        return chunks
+
+    def _make_chunk(self, doc_id: str, lines: List[str], index: int, lineage: List[str]) -> Chunk:
+        """Create a chunk with proper formatting and metadata."""
+        # Clean up empty lines at start/end
+        while lines and not lines[0].strip():
+            lines.pop(0)
+        while lines and not lines[-1].strip():
+            lines.pop()
+
+        body_text = "\n".join(lines)
+
+        # Prepend lineage for improved retrieval
+        if self.prepend_lineage and lineage:
+            lineage_str = " > ".join(lineage)
+            final_text = f"[Section: {lineage_str}]\n{body_text}"
+        else:
+            final_text = body_text
+
+        return Chunk(
+            id=f"{doc_id}#la#{index}",
+            doc_id=doc_id,
+            text=final_text,
+            meta={
+                "chunk_index": index,
+                "strategy": "layout_aware",
+                "lineage": lineage,
+                "lineage_str": " > ".join(lineage) if lineage else "",
+                "token_count": count_tokens(final_text)
+            }
+        )
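
Finally, a minimal usage sketch for LayoutAwareChunker on a small Markdown document. As before, the import path is inferred from the file listing, and the sample input and chunk-size value are illustrative only.

```python
# Hypothetical usage sketch (not part of the package).
# Import path inferred from autochunk/chunkers/layout_aware.py.
from autochunk.chunkers.layout_aware import LayoutAwareChunker

markdown = """# User Guide

## Installation

Run the installer and accept the defaults.

## Pricing

| Plan | Price |
|------|-------|
| Free | $0    |
| Pro  | $20   |
"""

chunker = LayoutAwareChunker(prepend_lineage=True, preserve_tables=True)
chunks = chunker.chunk(doc_id="guide", text=markdown, base_token_size=128)

for c in chunks:
    # lineage_str reflects the header hierarchy, e.g. "User Guide > Pricing";
    # with prepend_lineage=True the chunk text also starts with "[Section: ...]".
    print(c.meta["lineage_str"], "|", c.text.splitlines()[0])
```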