longparser 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1046 @@
1
+ """
2
+ Hybrid Chunker for LongParser — RAG-optimized document splitting.
3
+
4
+ Combines 6 strategies:
5
+ 0. Autonomous equation detection (pre-pass)
6
+ 1. Structure-aware (hierarchical) chunking
7
+ 2. Layout-aware block classification
8
+ 3. Token-window packing with overlap
9
+ 4. Table-aware chunking
10
+ 5. List-aware chunking
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import re
16
+ import logging
17
+ import unicodedata
18
+ from typing import Optional
19
+
20
+ from ..schemas import Block, BlockType, Chunk, ChunkingConfig
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ # ---------------------------------------------------------------------------
25
+ # Constants for autonomous equation detection
26
+ # ---------------------------------------------------------------------------
27
+
28
+ # Greek letters (lowercase + uppercase)
29
+ _GREEK = set("αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ")
30
+
31
+ # Common math operators & symbols
32
+ _MATH_SYMBOLS = set("∑∏∫∂∇±×÷≤≥≠≈∞∈∉⊂⊃⊆⊇∪∩∧∨¬⊕⊗→←↔⇒⇐⇔∀∃∅⟨⟩⟦⟧")
33
+
34
+ # Subscript / superscript Unicode ranges
35
+ _SUB_SUPER = set("₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₒₓₔₕₖₗₘₙₚₛₜ"
36
+ "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿⁱ")
37
+
38
+ # Combined set for fast lookup
39
+ _MATH_CHARS = _GREEK | _MATH_SYMBOLS | _SUB_SUPER
40
+
41
+ # Regex patterns for equation-like text
42
+ _EQ_PATTERNS = [
43
+ # Variable subscript patterns: x_i, y_j, a_1, etc.
44
+ re.compile(r"\b[a-zA-Z]_[a-zA-Z0-9]+\b"),
45
+ # Inline math with equals: f(x) = ..., y = ...
46
+ re.compile(r"[a-zA-Z]\s*\([a-zA-Z,\s]*\)\s*="),
47
+ # Summation/product patterns
48
+ re.compile(r"∑|∏|∫|Σ|Π"),
49
+ # Fractions and common math notation
50
+ re.compile(r"\b(?:frac|sqrt|log|exp|sin|cos|tan|argmax|argmin)\b"),
51
+ # LaTeX-like patterns that may survive OCR
52
+ re.compile(r"\\[a-zA-Z]+"),
53
+ # Tensor/dimension-like notation: d1, d2, d3, n(D)
54
+ re.compile(r"\b[a-z]\d+\b"),
55
+ # Comma-separated Greek or single-letter variables: κ,λ,ν or x, y, z
56
+ re.compile(r"[α-ωΑ-Ω],\s*[α-ωΑ-Ω]"),
57
+ # Parenthetical math notation: ( ) D n, (x), f(x)
58
+ re.compile(r"\(\s*\)\s*[A-Z]\s*[a-z]"),
59
+ # Scattered single-letter math variables with spaces: n y n y, a i y
60
+ re.compile(r"(?:\b[a-z]\b\s+){3,}"),
61
+ # Cardinality / dimensionality notation
62
+ re.compile(r"\b(?:cardinality|dimension|tensor|kernel|initializer)\b", re.IGNORECASE),
63
+ ]
64
+
65
+ # Phrases that introduce or surround equations
66
+ _EQ_LEAD_PHRASES = re.compile(
67
+ r"(?:defined\s+as|given\s+by|expressed\s+as|computed\s+as|"
68
+ r"formally|where\s+\w+\s+(?:is|are|denotes?))",
69
+ re.IGNORECASE,
70
+ )
71
+
72
+ # Phrases at the end of a block that introduce an upcoming equation
73
+ _EQ_TAIL_PHRASES = re.compile(
74
+ r"(?:defined\s+as\s*[,.]?\s*$|given\s+by\s*[,.]?\s*$|"
75
+ r"expressed\s+as\s*[,.]?\s*$|computed\s+as\s*[,.]?\s*$)",
76
+ re.IGNORECASE | re.MULTILINE,
77
+ )
78
+
79
+
80
+ # ---------------------------------------------------------------------------
81
+ # Tokenizer (word-split approximation, no tiktoken dependency)
82
+ # ---------------------------------------------------------------------------
83
+
84
+ def _count_tokens(text: str) -> int:
85
+ """Approximate token count (≈ 0.75 tokens per whitespace-split word)."""
86
+ words = text.split()
87
+ return max(1, int(len(words) * 1.33)) # words * 1.33 ≈ tokens
88
+
89
+
90
+ # ---------------------------------------------------------------------------
91
+ # Strategy 0: Autonomous equation detection
92
+ # ---------------------------------------------------------------------------
93
+
94
+ # Regex for blocks that contain only separator characters
95
+ _SEPARATOR_ONLY = re.compile(r"^[\s_\-=\.·•─━═]+$")
96
+
97
+
98
+ def _is_separator_only(text: str) -> bool:
99
+ """Return True if text is only separator chars (underscores, dashes, etc.)."""
100
+ return bool(text and _SEPARATOR_ONLY.match(text.strip()))
101
+
102
+
103
+ def _math_char_density(text: str) -> float:
104
+ """Fraction of chars that are math-class Unicode."""
105
+ if not text:
106
+ return 0.0
107
+ count = sum(1 for ch in text if ch in _MATH_CHARS or
108
+ unicodedata.category(ch) in ("Sm", "So"))
109
+ return count / len(text)
110
+
111
+
112
def _eq_pattern_hits(text: str) -> int:
    """Return how many of the ``_EQ_PATTERNS`` regexes match somewhere in *text*.

    Each pattern contributes at most one hit, regardless of how many times
    it matches.
    """
    matching = [pat for pat in _EQ_PATTERNS if pat.search(text) is not None]
    return len(matching)
115
+
116
+
117
def _is_equation_candidate(block: Block, prev_block: Optional[Block] = None) -> bool:
    """
    Decide whether a paragraph block should be re-tagged as an equation.

    Only ``BlockType.PARAGRAPH`` blocks with non-blank text are considered.
    The block earns points from independent heuristics and qualifies when
    the total reaches 2.0:

      - math-character density  > 5% → +2.0, otherwise > 1% → +1.0
      - equation-pattern hits   ≥ 3 → +2.0, ≥ 2 → +1.5, ≥ 1 → +0.5
      - text shorter than 80 chars containing an isolated letter → +1.0
      - lead-in phrase within the last 120 chars of the previous block → +1.5
      - lead-in phrase inside the block's own text → +1.0
      - Greek letters           ≥ 2 → +1.5, ≥ 1 → +0.5
    """
    if block.type != BlockType.PARAGRAPH:
        return False

    text = block.text.strip()
    if not text:
        return False

    # Every contribution is a multiple of 0.5, so the float sum is exact.
    points: list[float] = []

    # Math-character density (thresholds kept low for OCR'd math text).
    density = _math_char_density(text)
    if density > 0.05:
        points.append(2.0)
    elif density > 0.01:
        points.append(1.0)

    # Number of distinct equation-like regexes that match.
    hits = _eq_pattern_hits(text)
    if hits >= 3:
        points.append(2.0)
    elif hits >= 2:
        points.append(1.5)
    elif hits >= 1:
        points.append(0.5)

    # Short fragment containing a stand-alone single letter (a variable).
    if len(text) < 80 and re.search(r"(?<!\w)[a-zA-Z](?!\w)", text):
        points.append(1.0)

    # The tail of the preceding block announces an equation.
    if prev_block and prev_block.text:
        prev_tail = prev_block.text.strip()[-120:]
        if _EQ_LEAD_PHRASES.search(prev_tail) or _EQ_TAIL_PHRASES.search(prev_tail):
            points.append(1.5)

    # The block itself carries equation-contextual wording.
    if _EQ_LEAD_PHRASES.search(text):
        points.append(1.0)

    # Greek letter presence.
    greek_count = len([ch for ch in text if ch in _GREEK])
    if greek_count >= 2:
        points.append(1.5)
    elif greek_count >= 1:
        points.append(0.5)

    return sum(points) >= 2.0
192
+
193
+
194
def _detect_equations(blocks: list[Block]) -> list[Block]:
    """
    Pre-pass that re-tags equation-looking paragraph blocks.

    Each block is scored by ``_is_equation_candidate`` together with the
    block that precedes it in the list.  Qualifying blocks have their
    ``type`` set to ``BlockType.EQUATION`` in place; the same list object is
    returned.
    """
    retagged = 0
    prev: Optional[Block] = None

    for block in blocks:
        if _is_equation_candidate(block, prev):
            logger.info(
                f" [EQ-DETECT] Re-tagged block {block.block_id} "
                f"(order={block.order_index}, page={block.provenance.page_number}) "
                f"as equation — preview: {block.text[:80]!r}"
            )
            block.type = BlockType.EQUATION
            retagged += 1
        # The (possibly re-tagged) block becomes the context of the next one.
        prev = block

    logger.info(f" [EQ-DETECT] Re-tagged {retagged} paragraph(s) → equation")
    return blocks
213
+
214
+
215
+ # ---------------------------------------------------------------------------
216
+ # Strategy 4: Table-aware chunking (with smart rendering + profiling)
217
+ # ---------------------------------------------------------------------------
218
+
219
+
220
+ def _build_ordered_grid(table) -> dict[int, dict[int, str]]:
221
+ """
222
+ Build a 2D dict from table cells: rows[r][c] = text.
223
+ Enforces column order (Fix B).
224
+ """
225
+ rows: dict[int, dict[int, str]] = {}
226
+ for cell in table.cells:
227
+ r = cell.row_index
228
+ c = cell.col_index
229
+ rows.setdefault(r, {})[c] = cell.text
230
+ return rows
231
+
232
+
233
+ def _detect_header_rows(table) -> list[int]:
234
+ """
235
+ Detect header row indices using column_header flag (Gap #2).
236
+ Falls back to row 0 if no flags are set.
237
+ """
238
+ header_rows = set()
239
+ for cell in table.cells:
240
+ # Check if the cell has a column_header flag (from Docling)
241
+ if getattr(cell, 'column_header', False):
242
+ header_rows.add(cell.row_index)
243
+
244
+ if header_rows:
245
+ return sorted(header_rows)
246
+ # Fallback: treat row 0 as header
247
+ return [0]
248
+
249
+
250
+ def _get_column_names(grid: dict[int, dict[int, str]], header_rows: list[int], n_cols: int) -> list[str]:
251
+ """
252
+ Extract column names from header rows.
253
+ Synthesizes col_0..col_n if headers are empty.
254
+ """
255
+ names = [""] * n_cols
256
+ for hr in header_rows:
257
+ row_data = grid.get(hr, {})
258
+ for c in range(n_cols):
259
+ val = row_data.get(c, "").strip()
260
+ if val:
261
+ if names[c]:
262
+ names[c] += f" {val}"
263
+ else:
264
+ names[c] = val
265
+
266
+ # Synthesize if still empty
267
+ for c in range(n_cols):
268
+ if not names[c]:
269
+ names[c] = f"col_{c}"
270
+
271
+ return names
272
+
273
+
274
+ def _render_row_as_record(row_idx: int, row_data: dict[int, str], col_names: list[str], n_cols: int) -> str:
275
+ """Render a single row as: Row N: col_a=val; col_b=val; ..."""
276
+ parts = []
277
+ for c in range(n_cols):
278
+ val = row_data.get(c, "").strip()
279
+ if val:
280
+ parts.append(f"{col_names[c]}={val}")
281
+ return f"Row {row_idx}: " + "; ".join(parts) if parts else ""
282
+
283
+
284
+ def _render_row_as_pipe(row_data: dict[int, str], n_cols: int) -> str:
285
+ """Render a single row as pipe-delimited."""
286
+ return " | ".join(row_data.get(c, "") for c in range(n_cols))
287
+
288
+
289
+ def _guess_col_type(values: list[str]) -> str:
290
+ """Guess column type from sample values."""
291
+ if not values:
292
+ return "string"
293
+
294
+ non_empty = [v for v in values if v.strip()]
295
+ if not non_empty:
296
+ return "string"
297
+
298
+ # Check numeric
299
+ num_count = 0
300
+ for v in non_empty[:20]: # Sample first 20
301
+ try:
302
+ float(v.replace(",", "").replace("$", "").replace("%", ""))
303
+ num_count += 1
304
+ except ValueError:
305
+ pass
306
+ if num_count > len(non_empty[:20]) * 0.7:
307
+ return "number"
308
+
309
+ # Check date-like
310
+ import re
311
+ date_pattern = re.compile(r'\d{1,4}[-/]\d{1,2}[-/]\d{1,4}')
312
+ date_count = sum(1 for v in non_empty[:20] if date_pattern.search(v))
313
+ if date_count > len(non_empty[:20]) * 0.5:
314
+ return "date"
315
+
316
+ return "string"
317
+
318
+
319
def _generate_schema_chunk(
    block, table, grid, header_rows, col_names, n_cols, data_row_indices
) -> Chunk:
    """
    Build the ``table_schema`` chunk that summarises a table.

    The chunk text lists the table id, the number of data rows and columns,
    one profile line per column (name, guessed type, share of blank cells
    among the data rows) and up to five sample data rows rendered as
    ``name=value`` pairs.  ``table`` and ``header_rows`` are accepted but not
    read here.
    """
    page = block.provenance.page_number
    n_data = len(data_row_indices)

    lines = [
        "[TABLE SCHEMA]",
        f"Table ID: {block.block_id}",
        f"Rows: {n_data} (data rows), Columns: {n_cols}",
        "Columns:",
    ]

    # One profile line per column: type guess plus blank-cell percentage.
    for c in range(n_cols):
        values = [grid.get(r, {}).get(c, "") for r in data_row_indices]
        col_type = _guess_col_type(values)
        if values:
            blanks = len([v for v in values if not v.strip()])
            null_pct = f"{(blanks / len(values) * 100):.0f}%"
        else:
            null_pct = "0%"
        lines.append(f" - {col_names[c]} ({col_type}, {null_pct} null)")

    # Leading data rows (at most five) as examples.
    sample_count = min(5, n_data)
    lines.append(f"Sample Rows ({sample_count}):")
    for r_idx in data_row_indices[:sample_count]:
        row_data = grid.get(r_idx, {})
        rendered = "; ".join(
            f"{col_names[c]}={row_data.get(c, '')}" for c in range(n_cols)
        )
        lines.append(f" Row {r_idx}: " + rendered)

    schema_text = "\n".join(lines)
    return Chunk(
        text=schema_text,
        token_count=_count_tokens(schema_text),
        chunk_type="table_schema",
        section_path=list(block.hierarchy_path),
        page_numbers=[page],
        block_ids=[block.block_id],
        metadata={"schema": True, "n_rows": n_data, "n_cols": n_cols},
    )
367
+
368
+
369
def _chunk_table(block: Block, config: ChunkingConfig) -> list[Chunk]:
    """
    Create chunks from a table block.

    Pipeline:
      1. Without structured table data, emit the block text as a single
         ``table`` chunk.
      2. Index the cells, detect header rows and derive column names.
      3. Optionally (``config.generate_schema_chunks``) emit a schema chunk.
      4. When the table has more columns than
         ``config.wide_table_col_threshold``, split the columns into bands:
         column 0 is repeated in every band as the key column and the
         remaining columns are taken 12 at a time.
      5. Per band, render each data row (``row_record`` format, otherwise
         pipe-delimited) and batch rows into chunks that stay within
         ``config.max_tokens``; in ``pipe`` format the header line is
         repeated at the top of every chunk.

    Each data chunk carries ``row_start`` / ``row_end`` metadata (the first
    and last row actually rendered into it) and ``col_band`` (the band's
    column indices, or ``None`` when the table is not banded).

    Args:
        block: The table block; ``block.table`` may be missing/falsy.
        config: Chunking configuration.

    Returns:
        The schema chunk (if any) followed by the data chunks.
    """
    chunks: list[Chunk] = []
    table = block.table
    page = block.provenance.page_number

    if not table:
        # No structured table data — fall back to a single text chunk.
        fallback_text = block.text or ""
        chunks.append(Chunk(
            text=fallback_text,
            token_count=_count_tokens(fallback_text),
            chunk_type="table",
            section_path=list(block.hierarchy_path),
            page_numbers=[page],
            block_ids=[block.block_id],
        ))
        return chunks

    n_cols = table.n_cols
    grid = _build_ordered_grid(table)
    header_rows = _detect_header_rows(table)
    col_names = _get_column_names(grid, header_rows, n_cols)

    # Data rows are all populated rows that are not header rows.
    header_set = set(header_rows)
    data_row_indices = sorted(r for r in grid if r not in header_set)

    if config.generate_schema_chunks and data_row_indices:
        chunks.append(_generate_schema_chunk(
            block, table, grid, header_rows, col_names, n_cols, data_row_indices
        ))

    # Wide tables are split into column bands.  A table with a single column
    # has nothing to band, so it always takes the single-band path instead
    # of producing no bands (and therefore no data chunks) at all.
    if n_cols > config.wide_table_col_threshold and n_cols > 1:
        key_col = 0
        band_size = 12
        remaining_cols = list(range(1, n_cols))
        bands = [
            [key_col] + remaining_cols[i:i + band_size]
            for i in range(0, len(remaining_cols), band_size)
        ]
    else:
        bands = [list(range(n_cols))]

    is_banded = len(bands) > 1
    pipe_format = config.table_chunk_format == "pipe"
    record_format = config.table_chunk_format == "row_record"

    def _emit(row_texts, header_text, row_start, row_end, band_cols) -> None:
        """Append one data chunk built from the batched row texts."""
        body = "\n".join(row_texts)
        # header_text is only non-empty in pipe format.
        chunk_text = header_text + "\n" + body if header_text else body
        chunks.append(Chunk(
            text=chunk_text,
            token_count=_count_tokens(chunk_text),
            chunk_type="table",
            section_path=list(block.hierarchy_path),
            page_numbers=[page],
            block_ids=[block.block_id],
            metadata={
                "row_start": row_start,
                "row_end": row_end,
                "col_band": band_cols if is_banded else None,
            },
        ))

    for band_cols in bands:
        band_col_names = [col_names[c] for c in band_cols]

        # Record format carries the column names inline, so only the pipe
        # format needs a header line.
        header_text = " | ".join(band_col_names) if pipe_format else ""
        header_tokens = _count_tokens(header_text) if header_text else 0

        current_row_texts: list[str] = []
        current_tokens = header_tokens
        chunk_row_start = None
        chunk_row_end = None

        for r_idx in data_row_indices:
            grid_row = grid.get(r_idx, {})
            # The renderers address cells by position 0..len(band)-1 (matching
            # band_col_names), so re-key the band's cells positionally.
            # Keying by the real column index made every band after the
            # first render only its key column.
            row_data = {
                pos: grid_row.get(c, "") for pos, c in enumerate(band_cols)
            }

            if record_format:
                row_text = _render_row_as_record(
                    r_idx, row_data, band_col_names, len(band_cols)
                )
            else:
                row_text = _render_row_as_pipe(row_data, len(band_cols))

            if not row_text.strip():
                continue

            row_tokens = _count_tokens(row_text)

            # Flush before adding a row that would exceed the budget.
            if current_row_texts and current_tokens + row_tokens > config.max_tokens:
                _emit(current_row_texts, header_text,
                      chunk_row_start, chunk_row_end, band_cols)
                current_row_texts = []
                current_tokens = header_tokens
                chunk_row_start = None

            # Track the rows actually rendered so the metadata stays correct
            # when rows are skipped or row indices are not contiguous.
            if chunk_row_start is None:
                chunk_row_start = r_idx
            chunk_row_end = r_idx

            current_row_texts.append(row_text)
            current_tokens += row_tokens

        if current_row_texts:
            _emit(current_row_texts, header_text,
                  chunk_row_start, chunk_row_end, band_cols)

    return chunks
519
+
520
+
521
+ # ---------------------------------------------------------------------------
522
+ # Strategy 5: List-aware chunking
523
+ # ---------------------------------------------------------------------------
524
+
525
+ def _extract_list_groups(blocks: list[Block]) -> list[tuple[int, int]]:
526
+ """
527
+ Identify contiguous list_item sequences with their lead-in paragraph.
528
+ Returns list of (start_index, end_index) inclusive ranges.
529
+ """
530
+ groups: list[tuple[int, int]] = []
531
+ i = 0
532
+ while i < len(blocks):
533
+ if blocks[i].type == BlockType.LIST_ITEM:
534
+ # Look back for a lead-in paragraph
535
+ start = i
536
+ if i > 0 and blocks[i - 1].type == BlockType.PARAGRAPH:
537
+ start = i - 1
538
+
539
+ # Extend to all consecutive list items
540
+ end = i
541
+ while end + 1 < len(blocks) and blocks[end + 1].type == BlockType.LIST_ITEM:
542
+ end += 1
543
+
544
+ groups.append((start, end))
545
+ i = end + 1
546
+ else:
547
+ i += 1
548
+
549
+ return groups
550
+
551
+
552
+ # ---------------------------------------------------------------------------
553
+ # Main chunker class
554
+ # ---------------------------------------------------------------------------
555
+
556
+ class HybridChunker:
557
+ """
558
+ Hybrid chunking engine combining 6 strategies for RAG-optimized output.
559
+
560
+ Usage:
561
+ chunker = HybridChunker(ChunkingConfig())
562
+ chunks = chunker.chunk(blocks)
563
+ """
564
+
565
+ def __init__(self, config: Optional[ChunkingConfig] = None):
566
+ self.config = config or ChunkingConfig()
567
+
568
def chunk(self, blocks: list[Block]) -> list[Chunk]:
    """
    Run the full hybrid chunking pipeline on a list of blocks.

    Stages, in order:
      0. Equation detection (when ``config.detect_equations``); this
         re-tags blocks of the input list in place.
      1. Drop header/footer blocks (when ``config.exclude_headers_footers``).
      2. Drop blocks whose text is only separator characters.
      3. Group the remaining blocks by ``hierarchy_path`` and chunk each
         section.
      4. Merge chunks that are too small.
      5. Apply overlap between chunks.
    """
    cfg = self.config
    logger.info(f"[HybridChunker] Starting — {len(blocks)} blocks, "
                f"max_tokens={cfg.max_tokens}")

    # --- Strategy 0: equation detection ---
    if cfg.detect_equations:
        blocks = _detect_equations(blocks)

    # --- Filter headers/footers ---
    if cfg.exclude_headers_footers:
        page_furniture = (BlockType.HEADER, BlockType.FOOTER)
        kept = [b for b in blocks if b.type not in page_furniture]
        filtered = len(blocks) - len(kept)
        blocks = kept
        if filtered:
            logger.info(f" Filtered {filtered} header/footer block(s)")

    # --- Filter separator-only blocks (underscores, dashes, etc.) ---
    kept = [b for b in blocks if not _is_separator_only(b.text)]
    sep_filtered = len(blocks) - len(kept)
    blocks = kept
    if sep_filtered:
        logger.info(f" Filtered {sep_filtered} separator-only block(s)")

    # --- Strategy 1: group by section, then chunk each section ---
    section_groups = self._group_by_section(blocks)
    logger.info(f" {len(section_groups)} section group(s)")

    all_chunks: list[Chunk] = []
    for section_path, section_blocks in section_groups:
        all_chunks.extend(self._chunk_section(section_path, section_blocks))

    # --- Post-processing: merge small chunks, then apply overlap ---
    all_chunks = self._merge_small_chunks(all_chunks)
    all_chunks = self._apply_overlap(all_chunks)

    logger.info(f"[HybridChunker] Done — {len(all_chunks)} chunks produced")
    return all_chunks
625
+
626
+ # -----------------------------------------------------------------------
627
+ # Strategy 1: Structure-aware grouping
628
+ # -----------------------------------------------------------------------
629
+
630
+ def _group_by_section(
631
+ self, blocks: list[Block]
632
+ ) -> list[tuple[list[str], list[Block]]]:
633
+ """Group blocks by hierarchy_path (section boundaries)."""
634
+ groups: list[tuple[list[str], list[Block]]] = []
635
+ current_path: list[str] | None = None
636
+ current_blocks: list[Block] = []
637
+
638
+ for block in blocks:
639
+ path = tuple(block.hierarchy_path)
640
+ if path != tuple(current_path or []):
641
+ if current_blocks:
642
+ groups.append((list(current_path or []), current_blocks))
643
+ current_path = list(block.hierarchy_path)
644
+ current_blocks = [block]
645
+ else:
646
+ current_blocks.append(block)
647
+
648
+ if current_blocks:
649
+ groups.append((list(current_path or []), current_blocks))
650
+
651
+ return groups
652
+
653
+ # -----------------------------------------------------------------------
654
+ # Per-section chunking
655
+ # -----------------------------------------------------------------------
656
+
657
def _chunk_section(
    self, section_path: list[str], blocks: list[Block]
) -> list[Chunk]:
    """
    Chunk the blocks of one section.

    Processing order (which is also the order of the returned chunks):
      1. Tables → dedicated table chunks.  A caption block directly above
         a table is prepended to the table's first chunk; one directly
         below is appended to its last chunk.
      2. List groups (consecutive list items plus a lead-in paragraph) →
         one ``list`` chunk each, split at item boundaries when the group
         exceeds ``config.max_tokens``.
      3. Everything else → token-window packing via ``_pack_blocks``.

    Args:
        section_path: Hierarchy path shared by ``blocks``.
        blocks: The section's blocks in document order.

    Returns:
        Table chunks, then list chunks, then packed chunks.
    """
    chunks: list[Chunk] = []

    # Indices (into ``blocks``) already turned into table or list chunks.
    consumed: set[int] = set()

    # --- Strategy 4: Table-aware ---
    for i, block in enumerate(blocks):
        if block.type != BlockType.TABLE:
            continue

        table_chunks = _chunk_table(block, self.config)
        chunks.extend(table_chunks)
        consumed.add(i)

        if not table_chunks:
            # Nothing to attach a caption to.  Leave adjacent captions
            # unconsumed so they reach the later stages instead of being
            # silently dropped.
            continue

        # Caption directly above the table → prepend to the first chunk.
        if i > 0 and blocks[i - 1].type == BlockType.CAPTION:
            caption = blocks[i - 1]
            if caption.text:
                first = table_chunks[0]
                first.text = caption.text + "\n\n" + first.text
                first.token_count = _count_tokens(first.text)
                first.block_ids.insert(0, caption.block_id)
            consumed.add(i - 1)

        # Caption directly below the table → append to the last chunk.
        if i + 1 < len(blocks) and blocks[i + 1].type == BlockType.CAPTION:
            caption = blocks[i + 1]
            # Same guard as for the caption above: a caption without text
            # must not be concatenated (None would raise, "" would only add
            # a dangling separator).
            if caption.text:
                last = table_chunks[-1]
                last.text += "\n\n" + caption.text
                last.token_count = _count_tokens(last.text)
                last.block_ids.append(caption.block_id)
            consumed.add(i + 1)

    # --- Strategy 5: List-aware (on whatever the tables left over) ---
    remaining = [(i, b) for i, b in enumerate(blocks) if i not in consumed]
    remaining_indices = [i for i, _ in remaining]
    remaining_blocks = [b for _, b in remaining]

    for group_start, group_end in _extract_list_groups(remaining_blocks):
        group_blocks = remaining_blocks[group_start:group_end + 1]
        group_text = "\n\n".join(b.text for b in group_blocks if b.text)
        tokens = _count_tokens(group_text)

        if tokens <= self.config.max_tokens:
            chunks.append(Chunk(
                text=group_text,
                token_count=tokens,
                chunk_type="list",
                section_path=section_path,
                page_numbers=sorted(set(
                    b.provenance.page_number for b in group_blocks
                )),
                block_ids=[b.block_id for b in group_blocks],
            ))
        else:
            # Too large for one chunk: split at bullet boundaries.
            chunks.extend(
                self._split_list_group(group_blocks, section_path)
            )

        # Map the group's positions back to indices into ``blocks``.
        consumed.update(remaining_indices[group_start:group_end + 1])

    # --- Strategy 3: Token-window packing for the remaining blocks ---
    final_remaining = [
        b for i, b in enumerate(blocks) if i not in consumed
    ]
    if final_remaining:
        chunks.extend(self._pack_blocks(final_remaining, section_path))

    return chunks
739
+
740
+ # -----------------------------------------------------------------------
741
+ # Strategy 3: Token-window packing
742
+ # -----------------------------------------------------------------------
743
+
744
+ def _pack_blocks(
745
+ self, blocks: list[Block], section_path: list[str]
746
+ ) -> list[Chunk]:
747
+ """
748
+ Pack blocks into chunks respecting max_tokens.
749
+ Equations are kept with their surrounding context.
750
+ """
751
+ chunks: list[Chunk] = []
752
+ current_texts: list[str] = []
753
+ current_ids: list[str] = []
754
+ current_pages: set[int] = set()
755
+ current_tokens = 0
756
+ has_equation = False
757
+
758
+ for block in blocks:
759
+ text = block.text.strip()
760
+ if not text:
761
+ continue
762
+
763
+ block_tokens = _count_tokens(text)
764
+
765
+ # If adding this block would exceed the limit, flush
766
+ if (current_tokens + block_tokens > self.config.max_tokens
767
+ and current_texts):
768
+
769
+ carry_text = None
770
+ carry_id = None
771
+ carry_tokens = 0
772
+
773
+ # Glue logic: if next block (this block) is an equation,
774
+ # keep the last paragraph with it
775
+ if block.type == BlockType.EQUATION and len(current_texts) > 0:
776
+ carry_text = current_texts.pop()
777
+ carry_id = current_ids.pop()
778
+ carry_tokens = _count_tokens(carry_text)
779
+ current_tokens -= carry_tokens
780
+
781
+ chunk_text = "\n\n".join(current_texts)
782
+ if chunk_text.strip():
783
+ chunk_type = "equation" if has_equation else "section"
784
+ chunks.append(Chunk(
785
+ text=chunk_text,
786
+ token_count=_count_tokens(chunk_text),
787
+ chunk_type=chunk_type,
788
+ section_path=section_path,
789
+ page_numbers=sorted(current_pages),
790
+ block_ids=list(current_ids),
791
+ equation_detected=has_equation,
792
+ ))
793
+
794
+ current_texts = []
795
+ current_ids = []
796
+ current_pages = set()
797
+ current_tokens = 0
798
+ has_equation = False
799
+
800
+ if carry_text:
801
+ current_texts.append(carry_text)
802
+ current_ids.append(carry_id)
803
+ current_tokens += carry_tokens
804
+ # Note: We assume the carried block is close enough to the next block
805
+ # that simply adding the next block's page will suffice for provenance.
806
+
807
+ current_texts.append(text)
808
+ current_ids.append(block.block_id)
809
+ current_pages.add(block.provenance.page_number)
810
+ current_tokens += block_tokens
811
+
812
+ if block.type == BlockType.EQUATION:
813
+ has_equation = True
814
+
815
+ # Flush remaining
816
+ if current_texts:
817
+ chunk_text = "\n\n".join(current_texts)
818
+ chunk_type = "equation" if has_equation else "section"
819
+ # Only emit if meets min_tokens or is the only content
820
+ if _count_tokens(chunk_text) >= self.config.min_tokens or not chunks:
821
+ chunks.append(Chunk(
822
+ text=chunk_text,
823
+ token_count=_count_tokens(chunk_text),
824
+ chunk_type=chunk_type,
825
+ section_path=section_path,
826
+ page_numbers=sorted(current_pages),
827
+ block_ids=list(current_ids),
828
+ equation_detected=has_equation,
829
+ ))
830
+ elif chunks:
831
+ # Merge into previous chunk
832
+ prev = chunks[-1]
833
+ prev.text += "\n\n" + chunk_text
834
+ prev.token_count = _count_tokens(prev.text)
835
+ prev.block_ids.extend(current_ids)
836
+ prev.page_numbers = sorted(
837
+ set(prev.page_numbers) | current_pages
838
+ )
839
+ if has_equation:
840
+ prev.equation_detected = True
841
+ prev.chunk_type = "equation"
842
+
843
+ return chunks
844
+
845
+ # -----------------------------------------------------------------------
846
+ # List splitting helper
847
+ # -----------------------------------------------------------------------
848
+
849
+ def _split_list_group(
850
+ self, blocks: list[Block], section_path: list[str]
851
+ ) -> list[Chunk]:
852
+ """Split a list group that exceeds max_tokens at bullet boundaries."""
853
+ chunks: list[Chunk] = []
854
+ current_texts: list[str] = []
855
+ current_ids: list[str] = []
856
+ current_pages: set[int] = set()
857
+ current_tokens = 0
858
+
859
+ for block in blocks:
860
+ text = block.text.strip()
861
+ if not text:
862
+ continue
863
+ block_tokens = _count_tokens(text)
864
+
865
+ if (current_tokens + block_tokens > self.config.max_tokens
866
+ and current_texts):
867
+ chunk_text = "\n\n".join(current_texts)
868
+ chunks.append(Chunk(
869
+ text=chunk_text,
870
+ token_count=_count_tokens(chunk_text),
871
+ chunk_type="list",
872
+ section_path=section_path,
873
+ page_numbers=sorted(current_pages),
874
+ block_ids=list(current_ids),
875
+ ))
876
+ current_texts = []
877
+ current_ids = []
878
+ current_pages = set()
879
+ current_tokens = 0
880
+
881
+ current_texts.append(text)
882
+ current_ids.append(block.block_id)
883
+ current_pages.add(block.provenance.page_number)
884
+ current_tokens += block_tokens
885
+
886
+ if current_texts:
887
+ chunk_text = "\n\n".join(current_texts)
888
+ chunks.append(Chunk(
889
+ text=chunk_text,
890
+ token_count=_count_tokens(chunk_text),
891
+ chunk_type="list",
892
+ section_path=section_path,
893
+ page_numbers=sorted(current_pages),
894
+ block_ids=list(current_ids),
895
+ ))
896
+
897
+ return chunks
898
+
899
+ # -----------------------------------------------------------------------
900
+ # Merge small chunks
901
+ # -----------------------------------------------------------------------
902
+
903
def _merge_small_chunks(self, chunks: list[Chunk]) -> list[Chunk]:
    """
    Fold under-sized chunks into a neighbouring chunk.

    A chunk is "small" when its ``token_count`` is below
    ``self.config.min_tokens``. Two sweeps are made:

    * Sweep 1 appends every small chunk to the chunk kept immediately
      before it, provided both share the same ``section_path``.
    * Sweep 2 deals with whatever is still small, trying in order:
        1. the next chunk, if it is in the same section (prepend);
        2. the previously kept chunk, whatever its section (append);
        3. the next chunk, whatever its section (prepend);
        4. nothing fits: log a warning and keep the chunk unchanged.

    Merging goes through ``_absorb`` / ``_absorb_prepend``, which mutate
    the receiving chunk, so objects in *chunks* may be modified. When
    *chunks* is empty or ``min_tokens`` is not positive the input list is
    returned as-is.
    """
    if not chunks:
        return chunks
    if self.config.min_tokens <= 0:
        return chunks

    # Sweep 1: merge backwards, but only inside the same section.
    kept: list[Chunk] = []
    for item in chunks:
        if (
            item.token_count < self.config.min_tokens
            and kept
            and kept[-1].section_path == item.section_path
        ):
            self._absorb(kept[-1], item)
            logger.debug(
                f" [MERGE] Merged small chunk {item.token_count} "
                f"tokens into previous (same section)"
            )
        else:
            # Large enough, or no same-section predecessor yet: hold on
            # to it and let sweep 2 decide.
            kept.append(item)

    # Sweep 2: anything still small looks forward first, then backward.
    result: list[Chunk] = []
    total = len(kept)
    for pos, item in enumerate(kept):
        # token_count is read here, so a chunk that grew because an
        # earlier small chunk was prepended to it is judged on its new size.
        if item.token_count >= self.config.min_tokens:
            result.append(item)
            continue

        successor = kept[pos + 1] if pos + 1 < total else None

        if successor is not None and successor.section_path == item.section_path:
            self._absorb_prepend(successor, item)
            logger.debug(
                f" [MERGE] Merged small chunk {item.token_count} "
                f"tokens into next (same section)"
            )
        elif result:
            self._absorb(result[-1], item)
            logger.debug(
                f" [MERGE] Merged small chunk {item.token_count} "
                f"tokens into previous (cross-section)"
            )
        elif successor is not None:
            # Reached only while nothing has been emitted yet,
            # e.g. for the very first chunk.
            self._absorb_prepend(successor, item)
            logger.debug(
                f" [MERGE] Merged small chunk {item.token_count} "
                f"tokens into next (cross-section)"
            )
        else:
            # No neighbour on either side: keep the chunk rather than
            # drop its content.
            logger.warning(
                f" [MERGE] Could not merge isolated small chunk: {item.token_count} tokens"
            )
            result.append(item)

    removed = len(chunks) - len(result)
    if removed:
        logger.info(
            f" [MERGE] Merged {removed} small chunk(s) "
            f"(min_tokens={self.config.min_tokens}): {len(chunks)} → {len(result)}"
        )

    return result
984
+
985
@staticmethod
def _absorb(target: Chunk, small: Chunk) -> None:
    """
    Merge *small* onto the end of *target*, mutating *target* in place.

    The two texts are joined with a blank line, ``token_count`` is
    recomputed from the combined text, block ids are appended and the
    page numbers become the sorted union of both chunks. If *small* was
    flagged ``equation_detected``, *target* takes over that flag and its
    ``chunk_type`` becomes ``"equation"``.
    """
    # Provenance bookkeeping first; neither line depends on the text.
    target.block_ids.extend(small.block_ids)
    target.page_numbers = sorted({*target.page_numbers, *small.page_numbers})

    target.text = "\n\n".join((target.text, small.text))
    target.token_count = _count_tokens(target.text)

    if not small.equation_detected:
        return
    target.equation_detected = True
    target.chunk_type = "equation"
995
+
996
@staticmethod
def _absorb_prepend(target: Chunk, small: Chunk) -> None:
    """
    Merge *small* onto the front of *target*, mutating *target* in place.

    *small*'s text is placed before *target*'s with a blank line between
    them, ``token_count`` is recomputed from the combined text, *small*'s
    block ids are put ahead of *target*'s (as a new list) and the page
    numbers become the sorted union of both chunks. If *small* was flagged
    ``equation_detected``, *target* takes over that flag and its
    ``chunk_type`` becomes ``"equation"``.
    """
    # Provenance bookkeeping first; neither line depends on the text.
    target.block_ids = [*small.block_ids, *target.block_ids]
    target.page_numbers = sorted({*target.page_numbers, *small.page_numbers})

    target.text = "\n\n".join((small.text, target.text))
    target.token_count = _count_tokens(target.text)

    if not small.equation_detected:
        return
    target.equation_detected = True
    target.chunk_type = "equation"
1006
+
1007
+ # -----------------------------------------------------------------------
1008
+ # Overlap
1009
+ # -----------------------------------------------------------------------
1010
+
1011
def _apply_overlap(self, chunks: list[Chunk]) -> list[Chunk]:
    """
    Prepend the tail of each chunk to the chunk that follows it.

    For every pair of consecutive chunks in the same ``section_path``,
    the last ``self.config.overlap_blocks`` paragraphs of the earlier
    chunk (paragraphs are the pieces between blank lines) are copied to
    the front of the later chunk. The later chunk's ``token_count`` is
    recomputed and ``overlap_with_previous`` is set to ``True``.

    A pair is left untouched when:
      * the chunks belong to different sections;
      * either chunk has ``chunk_type == "table"``;
      * the would-be overlap contains an ``⟦EQUATION⟧`` marker;
      * the would-be overlap has no non-blank content.

    The overlap is always taken from the earlier chunk's text as it was
    on entry, so text borrowed from one chunk never cascades into the
    chunk after next. Only ``text``, ``token_count`` and
    ``overlap_with_previous`` are changed; ``block_ids`` and
    ``page_numbers`` are not.

    Args:
        chunks: Chunks in document order. Mutated in place.

    Returns:
        The same list object that was passed in.
    """
    overlap_blocks = self.config.overlap_blocks
    if overlap_blocks <= 0 or len(chunks) < 2:
        return chunks

    # Snapshot the texts before touching anything: chunk i is modified
    # before it serves as the "previous" chunk for i + 1, and reading its
    # live text would leak chunk i - 1's paragraphs forward whenever
    # chunk i has fewer than `overlap_blocks` paragraphs of its own.
    original_texts = [chunk.text for chunk in chunks]

    for prev, curr, prev_text in zip(chunks, chunks[1:], original_texts):
        # Only overlap within the same section.
        if prev.section_path != curr.section_path:
            continue

        # Tables are kept self-contained on both sides.
        if prev.chunk_type == "table" or curr.chunk_type == "table":
            continue

        # Last N paragraphs of the previous chunk's own content.
        overlap_parts = prev_text.split("\n\n")[-overlap_blocks:]

        # Avoid duplicating equations in overlap.
        if any("⟦EQUATION⟧" in part for part in overlap_parts):
            continue

        # split() never returns an empty list, so check for real content
        # instead; otherwise an empty previous chunk would prepend a bare
        # blank line and be reported as overlap.
        if not any(part.strip() for part in overlap_parts):
            continue

        curr.text = "\n\n".join(overlap_parts) + "\n\n" + curr.text
        curr.token_count = _count_tokens(curr.text)
        curr.overlap_with_previous = True

    return chunks