longparser 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- longparser/__init__.py +104 -0
- longparser/chunkers/__init__.py +5 -0
- longparser/chunkers/hybrid_chunker.py +1046 -0
- longparser/extractors/__init__.py +9 -0
- longparser/extractors/base.py +62 -0
- longparser/extractors/docling_extractor.py +2065 -0
- longparser/extractors/latex_ocr.py +404 -0
- longparser/integrations/__init__.py +31 -0
- longparser/integrations/langchain.py +138 -0
- longparser/integrations/llamaindex.py +157 -0
- longparser/pipeline/__init__.py +8 -0
- longparser/pipeline/orchestrator.py +230 -0
- longparser/py.typed +0 -0
- longparser/schemas.py +247 -0
- longparser/server/__init__.py +22 -0
- longparser/server/app.py +1045 -0
- longparser/server/chat/__init__.py +39 -0
- longparser/server/chat/callbacks.py +110 -0
- longparser/server/chat/engine.py +341 -0
- longparser/server/chat/graph.py +176 -0
- longparser/server/chat/llm_chain.py +153 -0
- longparser/server/chat/retriever.py +111 -0
- longparser/server/chat/schemas.py +164 -0
- longparser/server/db.py +656 -0
- longparser/server/embeddings.py +181 -0
- longparser/server/queue.py +97 -0
- longparser/server/routers/__init__.py +0 -0
- longparser/server/schemas.py +204 -0
- longparser/server/vectorstores.py +443 -0
- longparser/server/worker.py +480 -0
- longparser/utils/__init__.py +5 -0
- longparser/utils/rtl_detector.py +93 -0
- longparser-0.1.0.dist-info/METADATA +337 -0
- longparser-0.1.0.dist-info/RECORD +36 -0
- longparser-0.1.0.dist-info/WHEEL +5 -0
- longparser-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1046 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Hybrid Chunker for LongParser — RAG-optimized document splitting.
|
|
3
|
+
|
|
4
|
+
Combines 6 strategies:
|
|
5
|
+
0. Autonomous equation detection (pre-pass)
|
|
6
|
+
1. Structure-aware (hierarchical) chunking
|
|
7
|
+
2. Layout-aware block classification
|
|
8
|
+
3. Token-window packing with overlap
|
|
9
|
+
4. Table-aware chunking
|
|
10
|
+
5. List-aware chunking
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import re
|
|
16
|
+
import logging
|
|
17
|
+
import unicodedata
|
|
18
|
+
from typing import Optional
|
|
19
|
+
|
|
20
|
+
from ..schemas import Block, BlockType, Chunk, ChunkingConfig
|
|
21
|
+
|
|
22
|
+
# Module-level logger named after this module; every progress and
# diagnostic message emitted by the chunker goes through it.
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
# ---------------------------------------------------------------------------
|
|
25
|
+
# Constants for autonomous equation detection
|
|
26
|
+
# ---------------------------------------------------------------------------
|
|
27
|
+
|
|
28
|
+
# Greek letters (lowercase + uppercase). Used for the math-character density
# and for the separate Greek-letter score in _is_equation_candidate.
_GREEK = set("αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ")

# Common math operators & symbols
_MATH_SYMBOLS = set("∑∏∫∂∇±×÷≤≥≠≈∞∈∉⊂⊃⊆⊇∪∩∧∨¬⊕⊗→←↔⇒⇐⇔∀∃∅⟨⟩⟦⟧")

# Unicode subscript / superscript characters (digits, signs, parentheses and
# the handful of letters that exist in sub/superscript form)
_SUB_SUPER = set("₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₒₓₔₕₖₗₘₙₚₛₜ"
                 "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿⁱ")

# Union of the three sets above, for a single O(1) membership test per char
_MATH_CHARS = _GREEK | _MATH_SYMBOLS | _SUB_SUPER

# Regex patterns for equation-like text. _eq_pattern_hits counts how many of
# these patterns match at least once (not the number of matches), so each
# entry contributes at most one hit per block.
_EQ_PATTERNS = [
    # Variable subscript patterns: x_i, y_j, a_1, etc.
    re.compile(r"\b[a-zA-Z]_[a-zA-Z0-9]+\b"),
    # Function-style definition: a letter, a parenthesised argument list made
    # of letters/commas/whitespace, then "=" — e.g. f(x) = ..., g(a, b) = ...
    # (a bare "y = ..." is NOT matched: the parentheses are required)
    re.compile(r"[a-zA-Z]\s*\([a-zA-Z,\s]*\)\s*="),
    # Summation / product / integral signs, plus capital Greek sigma and pi
    re.compile(r"∑|∏|∫|Σ|Π"),
    # Math function names that may survive OCR as whole words
    re.compile(r"\b(?:frac|sqrt|log|exp|sin|cos|tan|argmax|argmin)\b"),
    # LaTeX-like commands that may survive OCR: a backslash followed by letters
    re.compile(r"\\[a-zA-Z]+"),
    # Tensor/dimension-like notation: one lowercase letter followed by digits
    # (d1, d2, d3)
    re.compile(r"\b[a-z]\d+\b"),
    # Two Greek letters separated by a comma: κ,λ or κ, λ
    # (Latin letters such as "x, y" are NOT matched by this pattern)
    re.compile(r"[α-ωΑ-Ω],\s*[α-ωΑ-Ω]"),
    # Empty parentheses followed by an uppercase and then a lowercase letter,
    # e.g. the OCR residue "( ) D n"
    re.compile(r"\(\s*\)\s*[A-Z]\s*[a-z]"),
    # Three or more consecutive single lowercase letters, each followed by
    # whitespace: "n y n y", "a i y "
    re.compile(r"(?:\b[a-z]\b\s+){3,}"),
    # Math-domain vocabulary (case-insensitive)
    re.compile(r"\b(?:cardinality|dimension|tensor|kernel|initializer)\b", re.IGNORECASE),
]

# Phrases that introduce or surround equations. Matched anywhere in the text
# it is applied to (no anchoring).
_EQ_LEAD_PHRASES = re.compile(
    r"(?:defined\s+as|given\s+by|expressed\s+as|computed\s+as|"
    r"formally|where\s+\w+\s+(?:is|are|denotes?))",
    re.IGNORECASE,
)

# Phrases at the end of a line that introduce an upcoming equation. With
# re.MULTILINE, "$" matches at the end of every line, not only at the end of
# the whole string; an optional trailing comma or period is tolerated.
_EQ_TAIL_PHRASES = re.compile(
    r"(?:defined\s+as\s*[,.]?\s*$|given\s+by\s*[,.]?\s*$|"
    r"expressed\s+as\s*[,.]?\s*$|computed\s+as\s*[,.]?\s*$)",
    re.IGNORECASE | re.MULTILINE,
)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# ---------------------------------------------------------------------------
|
|
81
|
+
# Tokenizer (word-split approximation, no tiktoken dependency)
|
|
82
|
+
# ---------------------------------------------------------------------------
|
|
83
|
+
|
|
84
|
+
def _count_tokens(text: str) -> int:
|
|
85
|
+
"""Approximate token count (≈ 0.75 tokens per whitespace-split word)."""
|
|
86
|
+
words = text.split()
|
|
87
|
+
return max(1, int(len(words) * 1.33)) # words * 1.33 ≈ tokens
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# ---------------------------------------------------------------------------
|
|
91
|
+
# Strategy 0: Autonomous equation detection
|
|
92
|
+
# ---------------------------------------------------------------------------
|
|
93
|
+
|
|
94
|
+
# Regex for text made up solely of separator characters: whitespace,
# underscores, hyphens, equals signs, periods, middle dots, bullets and
# box-drawing horizontals. Anchored at both ends and requires at least one
# character, so the empty string does not match.
_SEPARATOR_ONLY = re.compile(r"^[\s_\-=\.·•─━═]+$")
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _is_separator_only(text: str) -> bool:
    """Tell whether *text* consists solely of separator characters.

    Falsy input (``None`` or ``""``) and whitespace-only text yield False;
    otherwise the stripped text must match ``_SEPARATOR_ONLY`` in full.
    """
    if not text:
        return False
    return _SEPARATOR_ONLY.match(text.strip()) is not None
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _math_char_density(text: str) -> float:
    """Return the share of characters in *text* that look mathematical.

    A character counts when it belongs to ``_MATH_CHARS`` or when its Unicode
    general category is ``Sm`` (math symbol) or ``So`` (other symbol). The
    denominator is the full length of *text*, whitespace included. Empty
    text gives 0.0.
    """
    if not text:
        return 0.0

    math_like = 0
    for ch in text:
        if ch in _MATH_CHARS or unicodedata.category(ch) in ("Sm", "So"):
            math_like += 1
    return math_like / len(text)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _eq_pattern_hits(text: str) -> int:
    """Return how many patterns in ``_EQ_PATTERNS`` match *text* at least once.

    Each pattern contributes at most one hit, however often it matches.
    """
    matching = [pattern for pattern in _EQ_PATTERNS if pattern.search(text)]
    return len(matching)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _is_equation_candidate(block: Block, prev_block: Optional[Block] = None) -> bool:
    """
    Decide whether a paragraph block should be re-tagged as an equation.

    Only ``BlockType.PARAGRAPH`` blocks with non-blank text are considered.
    The block collects points from independent heuristics and qualifies
    when the total reaches 2.0:

    - math-character density > 5% → +2.0; otherwise > 1% → +1.0
    - equation patterns matched: ≥ 3 → +2.0; 2 → +1.5; 1 → +0.5
    - text shorter than 80 chars containing a stand-alone ASCII letter → +1.0
    - the last 120 chars of the previous block contain a lead-in phrase
      (or end a line with one) → +1.5
    - the block's own text contains a lead-in phrase → +1.0
    - Greek letters: ≥ 2 → +1.5; exactly 1 → +0.5
    """
    if block.type != BlockType.PARAGRAPH:
        return False

    body = block.text.strip()
    if not body:
        return False

    total = 0.0

    # Share of math-class characters (thresholds are low on purpose so that
    # OCR'd math, which loses many symbols, can still score).
    ratio = _math_char_density(body)
    if ratio > 0.05:
        total += 2.0
    elif ratio > 0.01:
        total += 1.0

    # Number of distinct equation-like regex patterns that matched.
    n_hits = _eq_pattern_hits(body)
    if n_hits >= 3:
        total += 2.0
    elif n_hits >= 2:
        total += 1.5
    elif n_hits >= 1:
        total += 0.5

    # Short fragment holding a single letter not glued to other word chars.
    if len(body) < 80 and re.search(r"(?<!\w)[a-zA-Z](?!\w)", body):
        total += 1.0

    # Context: does the tail of the preceding block announce an equation?
    if prev_block and prev_block.text:
        tail = prev_block.text.strip()[-120:]
        if _EQ_LEAD_PHRASES.search(tail) or _EQ_TAIL_PHRASES.search(tail):
            total += 1.5

    # Context: does this block itself use equation-introducing wording?
    if _EQ_LEAD_PHRASES.search(body):
        total += 1.0

    # Greek letters are scored on their own, on top of the density measure.
    n_greek = len([ch for ch in body if ch in _GREEK])
    if n_greek >= 2:
        total += 1.5
    elif n_greek >= 1:
        total += 0.5

    return total >= 2.0
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _detect_equations(blocks: list[Block]) -> list[Block]:
    """
    Pre-pass that re-tags equation-looking paragraphs as equations.

    Each block is tested with ``_is_equation_candidate`` together with the
    block that precedes it in the list (``None`` for the first block).
    Matching blocks have their ``type`` set to ``BlockType.EQUATION`` in
    place, and the same list object is returned. One log line is written
    per re-tagged block, plus a final summary line.
    """
    count = 0
    previous = None
    for current in blocks:
        if _is_equation_candidate(current, previous):
            logger.info(
                f" [EQ-DETECT] Re-tagged block {current.block_id} "
                f"(order={current.order_index}, page={current.provenance.page_number}) "
                f"as equation — preview: {current.text[:80]!r}"
            )
            current.type = BlockType.EQUATION
            count += 1
        previous = current

    logger.info(f" [EQ-DETECT] Re-tagged {count} paragraph(s) → equation")
    return blocks
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
# ---------------------------------------------------------------------------
|
|
216
|
+
# Strategy 4: Table-aware chunking (with smart rendering + profiling)
|
|
217
|
+
# ---------------------------------------------------------------------------
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _build_ordered_grid(table) -> dict[int, dict[int, str]]:
|
|
221
|
+
"""
|
|
222
|
+
Build a 2D dict from table cells: rows[r][c] = text.
|
|
223
|
+
Enforces column order (Fix B).
|
|
224
|
+
"""
|
|
225
|
+
rows: dict[int, dict[int, str]] = {}
|
|
226
|
+
for cell in table.cells:
|
|
227
|
+
r = cell.row_index
|
|
228
|
+
c = cell.col_index
|
|
229
|
+
rows.setdefault(r, {})[c] = cell.text
|
|
230
|
+
return rows
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def _detect_header_rows(table) -> list[int]:
|
|
234
|
+
"""
|
|
235
|
+
Detect header row indices using column_header flag (Gap #2).
|
|
236
|
+
Falls back to row 0 if no flags are set.
|
|
237
|
+
"""
|
|
238
|
+
header_rows = set()
|
|
239
|
+
for cell in table.cells:
|
|
240
|
+
# Check if the cell has a column_header flag (from Docling)
|
|
241
|
+
if getattr(cell, 'column_header', False):
|
|
242
|
+
header_rows.add(cell.row_index)
|
|
243
|
+
|
|
244
|
+
if header_rows:
|
|
245
|
+
return sorted(header_rows)
|
|
246
|
+
# Fallback: treat row 0 as header
|
|
247
|
+
return [0]
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def _get_column_names(grid: dict[int, dict[int, str]], header_rows: list[int], n_cols: int) -> list[str]:
|
|
251
|
+
"""
|
|
252
|
+
Extract column names from header rows.
|
|
253
|
+
Synthesizes col_0..col_n if headers are empty.
|
|
254
|
+
"""
|
|
255
|
+
names = [""] * n_cols
|
|
256
|
+
for hr in header_rows:
|
|
257
|
+
row_data = grid.get(hr, {})
|
|
258
|
+
for c in range(n_cols):
|
|
259
|
+
val = row_data.get(c, "").strip()
|
|
260
|
+
if val:
|
|
261
|
+
if names[c]:
|
|
262
|
+
names[c] += f" {val}"
|
|
263
|
+
else:
|
|
264
|
+
names[c] = val
|
|
265
|
+
|
|
266
|
+
# Synthesize if still empty
|
|
267
|
+
for c in range(n_cols):
|
|
268
|
+
if not names[c]:
|
|
269
|
+
names[c] = f"col_{c}"
|
|
270
|
+
|
|
271
|
+
return names
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def _render_row_as_record(row_idx: int, row_data: dict[int, str], col_names: list[str], n_cols: int) -> str:
|
|
275
|
+
"""Render a single row as: Row N: col_a=val; col_b=val; ..."""
|
|
276
|
+
parts = []
|
|
277
|
+
for c in range(n_cols):
|
|
278
|
+
val = row_data.get(c, "").strip()
|
|
279
|
+
if val:
|
|
280
|
+
parts.append(f"{col_names[c]}={val}")
|
|
281
|
+
return f"Row {row_idx}: " + "; ".join(parts) if parts else ""
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def _render_row_as_pipe(row_data: dict[int, str], n_cols: int) -> str:
|
|
285
|
+
"""Render a single row as pipe-delimited."""
|
|
286
|
+
return " | ".join(row_data.get(c, "") for c in range(n_cols))
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def _guess_col_type(values: list[str]) -> str:
|
|
290
|
+
"""Guess column type from sample values."""
|
|
291
|
+
if not values:
|
|
292
|
+
return "string"
|
|
293
|
+
|
|
294
|
+
non_empty = [v for v in values if v.strip()]
|
|
295
|
+
if not non_empty:
|
|
296
|
+
return "string"
|
|
297
|
+
|
|
298
|
+
# Check numeric
|
|
299
|
+
num_count = 0
|
|
300
|
+
for v in non_empty[:20]: # Sample first 20
|
|
301
|
+
try:
|
|
302
|
+
float(v.replace(",", "").replace("$", "").replace("%", ""))
|
|
303
|
+
num_count += 1
|
|
304
|
+
except ValueError:
|
|
305
|
+
pass
|
|
306
|
+
if num_count > len(non_empty[:20]) * 0.7:
|
|
307
|
+
return "number"
|
|
308
|
+
|
|
309
|
+
# Check date-like
|
|
310
|
+
import re
|
|
311
|
+
date_pattern = re.compile(r'\d{1,4}[-/]\d{1,2}[-/]\d{1,4}')
|
|
312
|
+
date_count = sum(1 for v in non_empty[:20] if date_pattern.search(v))
|
|
313
|
+
if date_count > len(non_empty[:20]) * 0.5:
|
|
314
|
+
return "date"
|
|
315
|
+
|
|
316
|
+
return "string"
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def _generate_schema_chunk(
    block, table, grid, header_rows, col_names, n_cols, data_row_indices
) -> Chunk:
    """
    Build the ``table_schema`` chunk that summarises a table (Fix E + Gap #5).

    The chunk text lists the table id (the block id), the number of data
    rows and columns, one profile line per column (name, guessed type,
    percentage of blank cells among the data rows) and up to five sample
    rows rendered as ``name=value`` pairs.

    ``table`` and ``header_rows`` are accepted but not read here.
    """
    page = block.provenance.page_number
    row_total = len(data_row_indices)

    # Per-column profile: guessed type and share of blank cells.
    profile_lines = []
    for col in range(n_cols):
        column_values = [grid.get(r, {}).get(col, "") for r in data_row_indices]
        inferred = _guess_col_type(column_values)
        blanks = len([v for v in column_values if not v.strip()])
        if column_values:
            null_pct = f"{(blanks / len(column_values) * 100):.0f}%"
        else:
            null_pct = "0%"
        profile_lines.append(f" - {col_names[col]} ({inferred}, {null_pct} null)")

    # Up to five leading data rows, rendered with every column (blank or not).
    n_samples = min(5, row_total)
    sample_lines = []
    for r_idx in data_row_indices[:n_samples]:
        cells = grid.get(r_idx, {})
        rendered = "; ".join(
            f"{col_names[col]}={cells.get(col, '')}" for col in range(n_cols)
        )
        sample_lines.append(f" Row {r_idx}: " + rendered)

    schema_text = "\n".join([
        "[TABLE SCHEMA]",
        f"Table ID: {block.block_id}",
        f"Rows: {row_total} (data rows), Columns: {n_cols}",
        "Columns:",
        *profile_lines,
        f"Sample Rows ({n_samples}):",
        *sample_lines,
    ])

    return Chunk(
        text=schema_text,
        token_count=_count_tokens(schema_text),
        chunk_type="table_schema",
        section_path=list(block.hierarchy_path),
        page_numbers=[page],
        block_ids=[block.block_id],
        metadata={"schema": True, "n_rows": row_total, "n_cols": n_cols},
    )
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
def _chunk_table(block: Block, config: ChunkingConfig) -> list[Chunk]:
    """
    Create chunks from a table block.

    Implements:
      Fix B: Column-ordered rendering
      Fix C: Token-aware row batching (header repeated)
      Fix D: Row-as-record format for RAG
      Fix E: Schema chunk per table
      Fix F: Wide-table column banding
      Gap #2: Smart header detection
      Gap #4: Chunk metadata (row ranges)
      Gap #5: Schema chunk profiling

    Args:
        block: Table block. ``block.table`` may be falsy, in which case the
            block's plain text becomes a single ``"table"`` chunk.
        config: Reads ``generate_schema_chunks``, ``wide_table_col_threshold``,
            ``table_chunk_format`` (``"row_record"`` renders name=value
            records; any other value renders pipe rows, with a header line
            only for ``"pipe"``) and ``max_tokens``.

    Returns:
        An optional schema chunk followed by the data chunks, band by band.
        Each data chunk's metadata carries ``row_start`` / ``row_end`` (the
        first and last row index actually rendered into that chunk) and
        ``col_band`` (the column indices of the band, or ``None`` when the
        table was not split into bands).

    A single row larger than ``max_tokens`` is still emitted, alone in its
    own chunk.
    """
    chunks: list[Chunk] = []
    table = block.table
    page = block.provenance.page_number

    if not table:
        # No structured table data — fall back to a single text chunk.
        chunks.append(Chunk(
            text=block.text,
            token_count=_count_tokens(block.text),
            chunk_type="table",
            section_path=list(block.hierarchy_path),
            page_numbers=[page],
            block_ids=[block.block_id],
        ))
        return chunks

    n_cols = table.n_cols

    # Fix B: cells indexed by (row, col) so columns render in index order.
    grid = _build_ordered_grid(table)

    # Gap #2: header rows and the column names derived from them.
    header_rows = _detect_header_rows(table)
    col_names = _get_column_names(grid, header_rows, n_cols)

    # Data rows = every populated row that is not a header row.
    header_set = set(header_rows)
    data_row_indices = sorted(r for r in grid if r not in header_set)

    # Fix E + Gap #5: schema chunk.
    if config.generate_schema_chunks and data_row_indices:
        chunks.append(_generate_schema_chunk(
            block, table, grid, header_rows, col_names, n_cols, data_row_indices
        ))

    # Fix F: wide tables are split into column bands. Column 0 is repeated in
    # every band as the key column; the other columns go 12 to a band.
    if n_cols > config.wide_table_col_threshold:
        band_size = 12
        other_cols = list(range(1, n_cols))
        bands = [
            [0] + other_cols[i:i + band_size]
            for i in range(0, len(other_cols), band_size)
        ]
    else:
        bands = [list(range(n_cols))]  # Single band with all columns

    is_pipe = config.table_chunk_format == "pipe"
    is_record = config.table_chunk_format == "row_record"
    multi_band = len(bands) > 1

    def _emit(row_texts, header_text, band_cols, row_start, row_end):
        """Append one table chunk built from the buffered row texts."""
        if is_pipe and header_text:
            chunk_text = header_text + "\n" + "\n".join(row_texts)
        else:
            chunk_text = "\n".join(row_texts)
        chunks.append(Chunk(
            text=chunk_text,
            token_count=_count_tokens(chunk_text),
            chunk_type="table",
            section_path=list(block.hierarchy_path),
            page_numbers=[page],
            block_ids=[block.block_id],
            metadata={
                "row_start": row_start,
                "row_end": row_end,
                "col_band": band_cols if multi_band else None,
            },
        ))

    for band_cols in bands:
        band_col_names = [col_names[c] for c in band_cols]
        band_width = len(band_cols)

        # The header line is only used (and only budgeted) for pipe format;
        # row_record rows carry their column names inline.
        header_text = " | ".join(band_col_names) if is_pipe else ""
        header_tokens = _count_tokens(header_text) if header_text else 0

        # Fix C: token-aware row batching.
        pending: list[str] = []
        pending_tokens = header_tokens
        first_row = None
        last_row = None

        for r_idx in data_row_indices:
            source_row = grid.get(r_idx, {})
            # Key the row by POSITION within the band, not by absolute column
            # index: both renderers look columns up as 0..band_width-1, so an
            # absolute keying would drop every non-key column of any band
            # after the first.
            row_data = {
                pos: source_row.get(c, "") for pos, c in enumerate(band_cols)
            }

            # Fix D: render based on format.
            if is_record:
                row_text = _render_row_as_record(
                    r_idx, row_data, band_col_names, band_width
                )
            else:
                row_text = _render_row_as_pipe(row_data, band_width)

            if not row_text.strip():
                continue

            row_tokens = _count_tokens(row_text)

            # Flush first if this row would push the batch over budget.
            if pending and pending_tokens + row_tokens > config.max_tokens:
                _emit(pending, header_text, band_cols, first_row, last_row)
                pending = []
                pending_tokens = header_tokens

            if not pending:
                first_row = r_idx
            pending.append(row_text)
            pending_tokens += row_tokens
            # Track the last row really rendered: row indices may have gaps
            # (header rows in between, skipped blank rows), so "next row - 1"
            # is not a reliable end marker.
            last_row = r_idx

        # Flush remaining rows.
        if pending:
            _emit(pending, header_text, band_cols, first_row, last_row)

    return chunks
|
|
519
|
+
|
|
520
|
+
|
|
521
|
+
# ---------------------------------------------------------------------------
|
|
522
|
+
# Strategy 5: List-aware chunking
|
|
523
|
+
# ---------------------------------------------------------------------------
|
|
524
|
+
|
|
525
|
+
def _extract_list_groups(blocks: list[Block]) -> list[tuple[int, int]]:
    """
    Locate runs of consecutive list items, plus their lead-in paragraph.

    Returns inclusive ``(start, end)`` index pairs into *blocks*. ``end`` is
    the last item of a run of ``BlockType.LIST_ITEM`` blocks. ``start`` is
    the first item of the run, or the block just before it when that block
    is a ``BlockType.PARAGRAPH``.
    """
    spans: list[tuple[int, int]] = []
    total = len(blocks)
    pos = 0

    while pos < total:
        if blocks[pos].type != BlockType.LIST_ITEM:
            pos += 1
            continue

        # A paragraph directly before the run is treated as its lead-in.
        has_lead_in = pos > 0 and blocks[pos - 1].type == BlockType.PARAGRAPH
        first = pos - 1 if has_lead_in else pos

        # Walk forward to the last consecutive list item.
        last = pos
        while last + 1 < total and blocks[last + 1].type == BlockType.LIST_ITEM:
            last += 1

        spans.append((first, last))
        pos = last + 1

    return spans
|
|
550
|
+
|
|
551
|
+
|
|
552
|
+
# ---------------------------------------------------------------------------
|
|
553
|
+
# Main chunker class
|
|
554
|
+
# ---------------------------------------------------------------------------
|
|
555
|
+
|
|
556
|
+
class HybridChunker:
|
|
557
|
+
"""
|
|
558
|
+
Hybrid chunking engine combining 6 strategies for RAG-optimized output.
|
|
559
|
+
|
|
560
|
+
Usage:
|
|
561
|
+
chunker = HybridChunker(ChunkingConfig())
|
|
562
|
+
chunks = chunker.chunk(blocks)
|
|
563
|
+
"""
|
|
564
|
+
|
|
565
|
+
def __init__(self, config: Optional[ChunkingConfig] = None):
|
|
566
|
+
self.config = config or ChunkingConfig()
|
|
567
|
+
|
|
568
|
+
def chunk(self, blocks: list[Block]) -> list[Chunk]:
    """
    Run the full hybrid chunking pipeline over *blocks*.

    Stages, in order:
      0. If ``config.detect_equations``: re-tag equation-looking paragraphs
         (this changes ``type`` on the caller's block objects in place).
      1. If ``config.exclude_headers_footers``: drop HEADER / FOOTER blocks.
      2. Drop blocks whose text is made of separator characters only.
      3. Group the remaining blocks by ``hierarchy_path`` and chunk each
         group with ``_chunk_section``.
      4. Pass all chunks through ``_merge_small_chunks``, then
         ``_apply_overlap``.

    Returns the resulting list of chunks.
    """
    cfg = self.config
    logger.info(f"[HybridChunker] Starting — {len(blocks)} blocks, "
                f"max_tokens={self.config.max_tokens}")

    # --- Strategy 0: equation detection ---
    if cfg.detect_equations:
        blocks = _detect_equations(blocks)

    # --- Filter headers/footers ---
    if cfg.exclude_headers_footers:
        kept = [
            b for b in blocks
            if b.type not in (BlockType.HEADER, BlockType.FOOTER)
        ]
        filtered = len(blocks) - len(kept)
        blocks = kept
        if filtered:
            logger.info(f" Filtered {filtered} header/footer block(s)")

    # --- Filter separator-only blocks (underscores, dashes, etc.) ---
    meaningful = [b for b in blocks if not _is_separator_only(b.text)]
    sep_filtered = len(blocks) - len(meaningful)
    blocks = meaningful
    if sep_filtered:
        logger.info(f" Filtered {sep_filtered} separator-only block(s)")

    # --- Strategy 1: group by section ---
    section_groups = self._group_by_section(blocks)
    logger.info(f" {len(section_groups)} section group(s)")

    all_chunks: list[Chunk] = []
    for path, members in section_groups:
        all_chunks += self._chunk_section(path, members)

    # --- Merge small chunks, then apply overlap ---
    all_chunks = self._merge_small_chunks(all_chunks)
    all_chunks = self._apply_overlap(all_chunks)

    logger.info(f"[HybridChunker] Done — {len(all_chunks)} chunks produced")
    return all_chunks
|
|
625
|
+
|
|
626
|
+
# -----------------------------------------------------------------------
|
|
627
|
+
# Strategy 1: Structure-aware grouping
|
|
628
|
+
# -----------------------------------------------------------------------
|
|
629
|
+
|
|
630
|
+
def _group_by_section(
|
|
631
|
+
self, blocks: list[Block]
|
|
632
|
+
) -> list[tuple[list[str], list[Block]]]:
|
|
633
|
+
"""Group blocks by hierarchy_path (section boundaries)."""
|
|
634
|
+
groups: list[tuple[list[str], list[Block]]] = []
|
|
635
|
+
current_path: list[str] | None = None
|
|
636
|
+
current_blocks: list[Block] = []
|
|
637
|
+
|
|
638
|
+
for block in blocks:
|
|
639
|
+
path = tuple(block.hierarchy_path)
|
|
640
|
+
if path != tuple(current_path or []):
|
|
641
|
+
if current_blocks:
|
|
642
|
+
groups.append((list(current_path or []), current_blocks))
|
|
643
|
+
current_path = list(block.hierarchy_path)
|
|
644
|
+
current_blocks = [block]
|
|
645
|
+
else:
|
|
646
|
+
current_blocks.append(block)
|
|
647
|
+
|
|
648
|
+
if current_blocks:
|
|
649
|
+
groups.append((list(current_path or []), current_blocks))
|
|
650
|
+
|
|
651
|
+
return groups
|
|
652
|
+
|
|
653
|
+
# -----------------------------------------------------------------------
|
|
654
|
+
# Per-section chunking
|
|
655
|
+
# -----------------------------------------------------------------------
|
|
656
|
+
|
|
657
|
+
def _chunk_section(
    self, section_path: list[str], blocks: list[Block]
) -> list[Chunk]:
    """
    Process a section's blocks using strategies 2-5.

    Order of the returned chunks (not document order):
      - Tables → dedicated table chunks, with adjacent captions attached
      - List groups → dedicated list chunks
      - Remaining blocks → token-window packing (equations stay with
        their surrounding context there)

    Caption handling: a CAPTION block directly before a table is prepended
    to the table's first chunk, and one directly after is appended to its
    last chunk. A caption is attached to at most one table; when it sits
    between two tables the preceding table claims it. If the table produced
    no chunks, its captions are left for token-window packing rather than
    dropped.

    Args:
        section_path: Hierarchy path shared by all *blocks*.
        blocks: The section's blocks, in document order.

    Returns:
        The chunks produced for this section.
    """
    chunks: list[Chunk] = []

    # Indices into `blocks` already turned into chunks.
    consumed: set[int] = set()

    # --- Strategy 4: Table-aware ---
    for i, block in enumerate(blocks):
        if block.type != BlockType.TABLE:
            continue

        table_chunks = _chunk_table(block, self.config)
        chunks.extend(table_chunks)
        consumed.add(i)

        if not table_chunks:
            # Nothing to hang a caption on; keep captions available for
            # token-window packing so their text is not lost.
            continue

        # Caption directly before the table → prepend to the first chunk.
        # Skip it if an earlier table already claimed it as its trailing
        # caption, otherwise the same caption ends up in two chunks.
        before = i - 1
        if (before >= 0 and before not in consumed
                and blocks[before].type == BlockType.CAPTION):
            caption = blocks[before]
            if caption.text:
                first = table_chunks[0]
                first.text = caption.text + "\n\n" + first.text
                first.token_count = _count_tokens(first.text)
                first.block_ids.insert(0, caption.block_id)
            consumed.add(before)

        # Caption directly after the table → append to the last chunk.
        after = i + 1
        if (after < len(blocks) and after not in consumed
                and blocks[after].type == BlockType.CAPTION):
            caption = blocks[after]
            # Guard against empty/None text, mirroring the leading case.
            if caption.text:
                last = table_chunks[-1]
                last.text += "\n\n" + caption.text
                last.token_count = _count_tokens(last.text)
                last.block_ids.append(caption.block_id)
            consumed.add(after)

    # --- Strategy 5: List-aware ---
    # List groups are searched among the blocks tables did not consume;
    # `remaining_indices` maps positions in that view back to `blocks`.
    remaining = [(i, b) for i, b in enumerate(blocks) if i not in consumed]
    remaining_blocks = [b for _, b in remaining]
    remaining_indices = [i for i, _ in remaining]

    for group_start, group_end in _extract_list_groups(remaining_blocks):
        group_blocks = remaining_blocks[group_start:group_end + 1]
        group_text = "\n\n".join(b.text for b in group_blocks if b.text)
        tokens = _count_tokens(group_text)

        if tokens <= self.config.max_tokens:
            chunks.append(Chunk(
                text=group_text,
                token_count=tokens,
                chunk_type="list",
                section_path=section_path,
                page_numbers=sorted(set(
                    b.provenance.page_number for b in group_blocks
                )),
                block_ids=[b.block_id for b in group_blocks],
            ))
        else:
            # Too large for one chunk: split at bullet boundaries.
            chunks.extend(
                self._split_list_group(group_blocks, section_path)
            )

        for j in range(group_start, group_end + 1):
            consumed.add(remaining_indices[j])

    # --- Strategy 3: Token-window packing for remaining blocks ---
    final_remaining = [
        b for i, b in enumerate(blocks) if i not in consumed
    ]
    if final_remaining:
        chunks.extend(self._pack_blocks(final_remaining, section_path))

    return chunks
|
|
739
|
+
|
|
740
|
+
# -----------------------------------------------------------------------
|
|
741
|
+
# Strategy 3: Token-window packing
|
|
742
|
+
# -----------------------------------------------------------------------
|
|
743
|
+
|
|
744
|
+
def _pack_blocks(
    self, blocks: list[Block], section_path: list[str]
) -> list[Chunk]:
    """
    Pack blocks into chunks respecting max_tokens.

    Blocks are appended to a running window in order. When the next block
    would push the window's summed per-block token count past
    ``self.config.max_tokens`` (and the window is not empty) the window is
    flushed as a chunk. A single block larger than the limit is not split
    here; it ends up in a window of its own.

    Equations are kept with their surrounding context: if the block that
    triggers a flush is an equation, the last block of the window is
    carried over into the new window so it stays next to the equation.
    Page number and equation flag are tracked per pending block, so a
    carried block takes its provenance with it instead of leaving its
    page (or its "equation" label) behind on the flushed chunk.

    Args:
        blocks: Blocks to pack, in order. Blocks whose stripped text is
            empty are skipped.
        section_path: Section path stored on every emitted chunk.

    Returns:
        The packed chunks. A chunk is typed "equation" when it contains
        an equation block, otherwise "section". A trailing window below
        ``self.config.min_tokens`` is merged into the previously emitted
        chunk when there is one.
    """
    chunks: list[Chunk] = []
    # One (text, block_id, page_number, token_count, is_equation) entry
    # per block currently waiting in the window.
    pending: list[tuple[str, str, int, int, bool]] = []
    pending_tokens = 0

    def summarize(entries):
        """Collapse window entries into (text, block_ids, pages, has_equation)."""
        return (
            "\n\n".join(entry[0] for entry in entries),
            [entry[1] for entry in entries],
            sorted({entry[2] for entry in entries}),
            any(entry[4] for entry in entries),
        )

    for block in blocks:
        text = block.text.strip()
        if not text:
            continue

        block_tokens = _count_tokens(text)
        is_equation = block.type == BlockType.EQUATION

        # If adding this block would exceed the limit, flush
        if (pending_tokens + block_tokens > self.config.max_tokens
                and pending):
            carried = None

            # Glue logic: an equation should not be separated from the
            # paragraph that introduces it, so move that paragraph into
            # the new window together with all of its metadata.
            if is_equation:
                carried = pending.pop()
                pending_tokens -= carried[3]

            # The window can be empty here if the carried block was its
            # only entry; in that case nothing is emitted.
            if pending:
                chunk_text, ids, pages, has_equation = summarize(pending)
                chunks.append(Chunk(
                    text=chunk_text,
                    token_count=_count_tokens(chunk_text),
                    chunk_type="equation" if has_equation else "section",
                    section_path=section_path,
                    page_numbers=pages,
                    block_ids=ids,
                    equation_detected=has_equation,
                ))

            pending = []
            pending_tokens = 0

            if carried is not None:
                pending.append(carried)
                pending_tokens = carried[3]

        pending.append((
            text,
            block.block_id,
            block.provenance.page_number,
            block_tokens,
            is_equation,
        ))
        pending_tokens += block_tokens

    # Flush remaining
    if pending:
        chunk_text, ids, pages, has_equation = summarize(pending)
        tail_tokens = _count_tokens(chunk_text)

        # Only emit if meets min_tokens or is the only content
        if tail_tokens >= self.config.min_tokens or not chunks:
            chunks.append(Chunk(
                text=chunk_text,
                token_count=tail_tokens,
                chunk_type="equation" if has_equation else "section",
                section_path=section_path,
                page_numbers=pages,
                block_ids=ids,
                equation_detected=has_equation,
            ))
        else:
            # Too small to stand alone: merge into previous chunk
            prev = chunks[-1]
            prev.text += "\n\n" + chunk_text
            prev.token_count = _count_tokens(prev.text)
            prev.block_ids.extend(ids)
            prev.page_numbers = sorted(set(prev.page_numbers) | set(pages))
            if has_equation:
                prev.equation_detected = True
                prev.chunk_type = "equation"

    return chunks
|
|
844
|
+
|
|
845
|
+
# -----------------------------------------------------------------------
|
|
846
|
+
# List splitting helper
|
|
847
|
+
# -----------------------------------------------------------------------
|
|
848
|
+
|
|
849
|
+
def _split_list_group(
    self, blocks: list[Block], section_path: list[str]
) -> list[Chunk]:
    """
    Break an oversized list group into several "list" chunks.

    Items are accumulated in order and a chunk is cut between two items
    whenever adding the next one would push the running per-item token
    sum past ``self.config.max_tokens``. Individual items are never split,
    and items whose stripped text is empty are ignored.

    Args:
        blocks: The blocks of one list group, in order.
        section_path: Section path stored on every emitted chunk.

    Returns:
        The resulting chunks, each with ``chunk_type="list"``.
    """

    def build_list_chunk(texts, ids, pages):
        """Create one "list" chunk from the accumulated batch."""
        joined = "\n\n".join(texts)
        return Chunk(
            text=joined,
            token_count=_count_tokens(joined),
            chunk_type="list",
            section_path=section_path,
            page_numbers=sorted(pages),
            block_ids=list(ids),
        )

    produced: list[Chunk] = []
    batch_texts: list[str] = []
    batch_ids: list[str] = []
    batch_pages: set[int] = set()
    batch_tokens = 0

    for item in blocks:
        item_text = item.text.strip()
        if not item_text:
            continue

        item_tokens = _count_tokens(item_text)
        overflows = batch_tokens + item_tokens > self.config.max_tokens

        # Close the current batch before an item that would overflow it;
        # an empty batch is never closed, so an oversized item stands alone.
        if overflows and batch_texts:
            produced.append(
                build_list_chunk(batch_texts, batch_ids, batch_pages)
            )
            batch_texts, batch_ids, batch_pages = [], [], set()
            batch_tokens = 0

        batch_texts.append(item_text)
        batch_ids.append(item.block_id)
        batch_pages.add(item.provenance.page_number)
        batch_tokens += item_tokens

    # Whatever is still accumulated becomes the last chunk.
    if batch_texts:
        produced.append(build_list_chunk(batch_texts, batch_ids, batch_pages))

    return produced
|
|
898
|
+
|
|
899
|
+
# -----------------------------------------------------------------------
|
|
900
|
+
# Merge small chunks
|
|
901
|
+
# -----------------------------------------------------------------------
|
|
902
|
+
|
|
903
|
+
def _merge_small_chunks(self, chunks: list[Chunk]) -> list[Chunk]:
    """
    Post-processing pass that folds chunks below ``min_tokens`` into a
    neighbouring chunk. Target chunks are modified in place.

    Pass 1 walks the chunks in order and appends each small chunk to the
    chunk kept just before it when both share the same section_path.

    Pass 2 handles the small chunks that are still left, trying in order:
        1. Next chunk (same section_path) — content is prepended
        2. Previous kept chunk (any section) — content is appended
        3. Next chunk (any section) — content is prepended
        4. Otherwise the chunk is kept as is and a warning is logged

    Returns the input list unchanged when it is empty or when
    ``self.config.min_tokens`` is not positive.
    """
    if not chunks:
        return chunks
    if self.config.min_tokens <= 0:
        return chunks

    # ---- Pass 1: merge backwards within the same section ----
    backward_merged: list[Chunk] = []
    for candidate in chunks:
        too_small = candidate.token_count < self.config.min_tokens
        if (too_small
                and backward_merged
                and backward_merged[-1].section_path == candidate.section_path):
            self._absorb(backward_merged[-1], candidate)
            logger.debug(
                f" [MERGE] Merged small chunk {candidate.token_count} "
                f"tokens into previous (same section)"
            )
        else:
            # Large enough, or no same-section predecessor yet: keep it
            # for now; pass 2 deals with any leftover small chunk.
            backward_merged.append(candidate)

    # ---- Pass 2: merge leftovers forward, or across sections ----
    final: list[Chunk] = []
    total = len(backward_merged)
    for position, candidate in enumerate(backward_merged):
        # token_count is read here, so a chunk that already received
        # prepended content from its predecessor is judged by its new size.
        if candidate.token_count >= self.config.min_tokens:
            final.append(candidate)
            continue

        successor = (
            backward_merged[position + 1] if position + 1 < total else None
        )

        if (successor is not None
                and successor.section_path == candidate.section_path):
            self._absorb_prepend(successor, candidate)
            logger.debug(
                f" [MERGE] Merged small chunk {candidate.token_count} "
                f"tokens into next (same section)"
            )
        elif final:
            self._absorb(final[-1], candidate)
            logger.debug(
                f" [MERGE] Merged small chunk {candidate.token_count} "
                f"tokens into previous (cross-section)"
            )
        elif successor is not None:
            # Reached e.g. by a small first chunk followed by a
            # different section.
            self._absorb_prepend(successor, candidate)
            logger.debug(
                f" [MERGE] Merged small chunk {candidate.token_count} "
                f"tokens into next (cross-section)"
            )
        else:
            # Nothing to merge with: keep the chunk rather than drop
            # its content.
            logger.warning(
                f" [MERGE] Could not merge isolated small chunk: {candidate.token_count} tokens"
            )
            final.append(candidate)

    before, after = len(chunks), len(final)
    if before != after:
        logger.info(
            f" [MERGE] Merged {before - after} small chunk(s) "
            f"(min_tokens={self.config.min_tokens}): {before} → {after}"
        )

    return final
|
|
984
|
+
|
|
985
|
+
@staticmethod
def _absorb(target: Chunk, small: Chunk) -> None:
    """
    Append the content of ``small`` to the end of ``target`` (in place).

    Text is joined with a blank line, the token count is recomputed from
    the combined text, block ids are appended, and page numbers become
    the sorted union of both chunks. If ``small`` carries an equation,
    ``target`` is flagged and re-typed as "equation".
    """
    target.text = "\n\n".join((target.text, small.text))
    target.token_count = _count_tokens(target.text)
    target.block_ids.extend(small.block_ids)

    combined_pages = set(target.page_numbers)
    combined_pages.update(small.page_numbers)
    target.page_numbers = sorted(combined_pages)

    if not small.equation_detected:
        return
    target.equation_detected = True
    target.chunk_type = "equation"
|
|
995
|
+
|
|
996
|
+
@staticmethod
def _absorb_prepend(target: Chunk, small: Chunk) -> None:
    """
    Insert the content of ``small`` at the start of ``target`` (in place).

    Text is joined with a blank line, the token count is recomputed from
    the combined text, the block ids of ``small`` are placed before those
    of ``target``, and page numbers become the sorted union of both
    chunks. If ``small`` carries an equation, ``target`` is flagged and
    re-typed as "equation".
    """
    target.text = "\n\n".join((small.text, target.text))
    target.token_count = _count_tokens(target.text)
    target.block_ids = small.block_ids + target.block_ids

    combined_pages = set(target.page_numbers)
    combined_pages.update(small.page_numbers)
    target.page_numbers = sorted(combined_pages)

    if not small.equation_detected:
        return
    target.equation_detected = True
    target.chunk_type = "equation"
|
|
1006
|
+
|
|
1007
|
+
# -----------------------------------------------------------------------
|
|
1008
|
+
# Overlap
|
|
1009
|
+
# -----------------------------------------------------------------------
|
|
1010
|
+
|
|
1011
|
+
def _apply_overlap(self, chunks: list[Chunk]) -> list[Chunk]:
    """
    Apply block-level overlap between consecutive chunks
    within the same section.

    The last ``self.config.overlap_blocks`` paragraphs (separated by a
    blank line) of each chunk are prepended to the chunk that follows it,
    and that chunk's ``token_count`` and ``overlap_with_previous`` are
    updated. Chunks are modified in place.

    The overlap is always taken from the previous chunk's own text, as it
    was before this method prepended anything to it, so content never
    cascades from two chunks back.

    No overlap is added when:
        - the two chunks have different section paths,
        - either chunk is a table chunk,
        - the overlap would contain an equation placeholder,
        - the overlap text would be blank.

    Args:
        chunks: Chunks in document order.

    Returns:
        The same list that was passed in.
    """
    if self.config.overlap_blocks <= 0 or len(chunks) < 2:
        return chunks

    # Text of the chunk about to act as "previous", captured before any
    # overlap prefix is added to it.
    own_text = chunks[0].text

    for prev, curr in zip(chunks, chunks[1:]):
        prev_own_text = own_text
        # Snapshot now: curr.text may be extended below, and curr is the
        # "previous" chunk of the next iteration.
        own_text = curr.text

        # Only overlap within same section
        if prev.section_path != curr.section_path:
            continue

        # Skip overlap for table chunks
        if prev.chunk_type == "table" or curr.chunk_type == "table":
            continue

        # Get last N paragraphs of previous chunk as overlap
        overlap_parts = prev_own_text.split("\n\n")[-self.config.overlap_blocks:]

        # Avoid duplicating equations in overlap
        if any("⟦EQUATION⟧" in part for part in overlap_parts):
            continue

        overlap_text = "\n\n".join(overlap_parts)
        # A blank overlap would only add empty lines while still marking
        # the chunk as overlapping.
        if not overlap_text.strip():
            continue

        curr.text = overlap_text + "\n\n" + curr.text
        curr.token_count = _count_tokens(curr.text)
        curr.overlap_with_previous = True

    return chunks
|