longparser 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2065 @@
1
+ """Docling-based document extractor with Tesseract CLI OCR and HierarchicalChunker.
2
+
3
+ Uses:
4
+ - Tesseract CLI for OCR
5
+ - Layout analysis always enabled
6
+ - TableFormer for table structure
7
+ - HierarchicalChunker for heading hierarchy
8
+ - iterate_items() for reading-order block extraction
9
+
10
+ No hardcoded heuristics — relies entirely on Docling's native capabilities.
11
+ """
12
+
13
+ from pathlib import Path
14
+ from typing import Optional, Tuple, List, Dict
15
+ import os
16
+ import time
17
+ import logging
18
+ import hashlib
19
+ import uuid
20
+ import re
21
+ from dataclasses import dataclass
22
+ from docling.datamodel.pipeline_options import (
23
+ PdfPipelineOptions,
24
+ TesseractCliOcrOptions,
25
+ )
26
+ from docling.datamodel.base_models import InputFormat
27
+ from docling.document_converter import (
28
+ DocumentConverter,
29
+ PdfFormatOption,
30
+ WordFormatOption,
31
+ PowerpointFormatOption,
32
+ ExcelFormatOption,
33
+ CsvFormatOption,
34
+ )
35
+
36
+ from docling_core.transforms.chunker import HierarchicalChunker
37
+ from docling_core.types.doc import (
38
+ SectionHeaderItem,
39
+ TableItem,
40
+ PictureItem,
41
+ TextItem,
42
+ ListItem,
43
+ DocItemLabel,
44
+ )
45
+
46
+ # TitleItem is used by Docling for PPTX slide titles (not SectionHeaderItem)
47
+ try:
48
+ from docling_core.types.doc import TitleItem
49
+ except ImportError:
50
+ TitleItem = None # Fallback for older docling versions
51
+
52
+ from ..schemas import (
53
+ Document, Page, Block, Table, TableCell,
54
+ BlockType, ExtractorType, ProcessingConfig,
55
+ BoundingBox, Provenance, Confidence, BlockFlags,
56
+ DocumentMetadata, PageProfile, ExtractionMetadata,
57
+ )
58
+ from .base import BaseExtractor
59
+
60
+ logger = logging.getLogger(__name__)
61
+
62
+ # Pattern to detect structured leading markers in headings.
63
+ # Matches alphanumeric + punctuation prefixes followed by whitespace:
64
+ # "I.", "II.", "A.", "1.", "2.3", "IV", "a)", etc.
65
+ _MARKER_RE = re.compile(r'^([A-Za-z0-9][A-Za-z0-9.()]*)[.\s]\s*')
66
+
67
+ # Pattern used to detect garbled math in paragraph blocks.
68
+ _MATH_RE = re.compile(
69
+ r'[\u2211\u220F\u222B\u221A\u00B1\u2264\u2265\u2248\u2260\u03B1-\u03C9\u03A3]'
70
+ r'|[a-z]\s*=\s*[a-z0-9]',
71
+ re.IGNORECASE,
72
+ )
73
+
74
+
75
+ def _iou_px(a: dict, b: dict) -> float:
76
+ """Compute IoU between two pixel-space bbox dicts {x0,y0,x1,y1}."""
77
+ xi0, yi0 = max(a["x0"], b["x0"]), max(a["y0"], b["y0"])
78
+ xi1, yi1 = min(a["x1"], b["x1"]), min(a["y1"], b["y1"])
79
+ inter = max(0, xi1 - xi0) * max(0, yi1 - yi0)
80
+ ua = (a["x1"] - a["x0"]) * (a["y1"] - a["y0"])
81
+ ub = (b["x1"] - b["x0"]) * (b["y1"] - b["y0"])
82
+ union = ua + ub - inter
83
+ return inter / union if union > 0 else 0.0
84
+
85
+
86
+ def _is_mfd_candidate(page_no: int, page_blocks, docling_formula_count: int) -> bool:
87
+ """Return True if MFD should scan this page.
88
+
89
+ Runs MFD if Docling found few/no formulas OR at least one non-equation
90
+ block on this page contains garbled math Unicode.
91
+ """
92
+ if docling_formula_count > 3:
93
+ return False # Docling handled it well; trust it
94
+ garbled = any(
95
+ _MATH_RE.search(b.text)
96
+ for b in page_blocks
97
+ if getattr(b, "type", None) is not None and str(b.type) != "equation"
98
+ )
99
+ return docling_formula_count == 0 or garbled
100
+
101
+
102
+ @dataclass
103
+ class _HeadingInfo:
104
+ """Internal heading tracking."""
105
+ text: str
106
+ level: int
107
+ hierarchy_path: List[str]
108
+
109
+
110
@dataclass
class PptxParaInfo:
    """Per-paragraph facts read directly from python-pptx.

    Fields:
        indent_level: 0-8, taken from ``paragraph.level``.
        is_title: True for TITLE / CENTER_TITLE placeholders.
        is_subtitle: True for SUBTITLE placeholders.
        is_list: True if Docling would treat the paragraph as a list item.
        bullet_type: One of 'Bullet', 'Numbered', 'None'.
        is_footer: True for DATE / FOOTER / SLIDE_NUMBER placeholders
            (defaults to False).
    """

    indent_level: int
    is_title: bool
    is_subtitle: bool
    is_list: bool
    bullet_type: str
    is_footer: bool = False
119
+
120
+
121
@dataclass
class HierarchyChunk:
    """A text chunk together with its position in the heading hierarchy.

    Fields, in constructor order: the chunk ``text``, the ``heading_path``
    (list of strings), the integer ``level``, the ``page_number`` and the
    ``order_index``.
    """

    text: str
    heading_path: List[str]
    level: int
    page_number: int
    order_index: int
129
+
130
+
131
+ class DoclingExtractor(BaseExtractor):
132
+ """
133
+ Document extractor using Docling with Tesseract CLI OCR.
134
+
135
+ Relies entirely on Docling's native APIs:
136
+ - iterate_items() for reading-order traversal with hierarchy level
137
+ - SectionHeaderItem / TextItem / TableItem / ListItem / PictureItem for type detection
138
+ - item.label (DocItemLabel) for fine-grained classification
139
+ - item.prov for page number and bounding box
140
+ - page.size for actual page dimensions
141
+ - HierarchicalChunker for heading hierarchy paths
142
+
143
+ Heading hierarchy is inferred autonomously from:
144
+ 1. Pattern Priority (Numbered vs Unnumbered)
145
+ 2. Position Awareness (Late Arrival Rule)
146
+ 3. Font-size clustering
147
+
148
+ No hardcoded numbering conventions.
149
+ """
150
+
151
+ extractor_type = ExtractorType.DOCLING
152
+ version = "3.0.0"
153
+
154
+ def __init__(self, tesseract_lang: List[str] = None, tessdata_path: str = None, force_full_page_ocr: bool = False):
155
+ """
156
+ Initialize Docling extractor.
157
+
158
+ Args:
159
+ tesseract_lang: Languages for Tesseract OCR (default: ["eng"])
160
+ tessdata_path: Path to tessdata directory with language models and configs.
161
+ If None, uses system default.
162
+ force_full_page_ocr: If True, OCR entire page even if embedded text exists.
163
+ Required for PDFs with broken Unicode mapping.
164
+ """
165
+ self._converter = None
166
+ self._chunker = None
167
+ self._initialized = False
168
+ self._languages = tesseract_lang or ["eng"]
169
+ self._tessdata_dir = tessdata_path
170
+ self._force_full_page_ocr = force_full_page_ocr
171
+
172
def _create_converter(self, config: ProcessingConfig, formula_enrichment: Optional[bool] = None) -> DocumentConverter:
    """Create a DocumentConverter with Tesseract CLI OCR.

    Args:
        config: Processing configuration.  Reads ``do_ocr``,
            ``do_table_structure``, ``formula_ocr``, ``formula_mode``,
            ``export_images`` and ``force_full_page_ocr``.
        formula_enrichment: Explicit override for Docling's formula
            enrichment.  When ``None`` the setting is derived from
            ``config``: enabled only if ``formula_ocr`` is truthy and
            ``formula_mode == "full"``.

    Returns:
        A converter whose PDF, DOCX, PPTX, XLSX and CSV format options all
        share the same pipeline options object.
    """
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = config.do_ocr
    pipeline_options.do_table_structure = config.do_table_structure

    # Formula enrichment is independent of do_ocr.  "fast" and "smart" modes
    # keep it off for the initial pass; only "full" turns it on.
    if formula_enrichment is not None:
        pipeline_options.do_formula_enrichment = formula_enrichment
    else:
        pipeline_options.do_formula_enrichment = (
            bool(config.formula_ocr) and config.formula_mode == "full"
        )

    # Page images are always generated; picture images only on request.
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = config.export_images
    pipeline_options.images_scale = 2.0

    # Full-page OCR is forced when either the per-call config or the
    # constructor argument asks for it (the latter was previously ignored).
    force_full_page = bool(config.force_full_page_ocr or self._force_full_page_ocr)

    pipeline_options.ocr_options = TesseractCliOcrOptions(
        lang=self._languages,
        tesseract_cmd="tesseract",
        path=self._tessdata_dir,
        force_full_page_ocr=force_full_page,
    )

    return DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
            InputFormat.DOCX: WordFormatOption(pipeline_options=pipeline_options),
            InputFormat.PPTX: PowerpointFormatOption(pipeline_options=pipeline_options),
            InputFormat.XLSX: ExcelFormatOption(pipeline_options=pipeline_options),
            InputFormat.CSV: CsvFormatOption(pipeline_options=pipeline_options),
        }
    )
214
+
215
+
216
def _run_docling(self, file_path: Path, config: ProcessingConfig):
    """Run Docling conversion and return the conversion result.

    The returned object is what ``DocumentConverter.convert`` produced; the
    document itself is available as ``result.document`` and is modified in
    place by the formula handling below.

    Behaviour by input type and ``config.formula_mode``:

    * Non-PDF: plain conversion.  For DOCX/PPTX (unless mode is "fast"),
      equations from ``_extract_docx_equations`` / ``_extract_pptx_equations``
      are written as ``$$...$$`` into Docling's formula blocks, paired by
      order and gated by a length-ratio check.
    * PDF, mode "fast": conversion plus Unicode-math normalization.
    * PDF, any other non-"smart" mode: conversion only.
    * PDF, mode "smart": formula blocks are cropped and sent to LaTeX-OCR
      (bounded by ``smart_max_equations`` / ``smart_max_ocr_seconds``), then
      an optional MFD fallback scans candidate pages for missed equations,
      and finally all text not starting with ``$$`` is normalized.
      Documents with more than 100 pages only get normalization.

    The converter and chunker are created on the first call and reused;
    later calls do not rebuild them even if ``config`` differs.

    Raises:
        Exception: any error raised during conversion or post-processing is
            logged and re-raised unchanged.
    """
    if not self._initialized:
        logger.info("Initializing Docling pipeline...")
        self._converter = self._create_converter(config)
        self._chunker = HierarchicalChunker()
        self._initialized = True
        logger.info(f"Docling pipeline initialized (formula_mode={config.formula_mode})")

    file_path = Path(file_path)
    logger.info(f"Extracting with Docling: {file_path.name}")

    formula_labels = {"formula", "equation"}

    def _label_of(doc_item) -> str:
        """Lower-cased string form of an item's label ('' if it has none)."""
        return str(getattr(doc_item, "label", "")).lower()

    def _normalize_all(document, skip_display_math: bool = False) -> None:
        """Apply Unicode-math normalization to every item that has string text."""
        for doc_item, _level in document.iterate_items():
            text = getattr(doc_item, "text", None)
            if not isinstance(text, str):
                continue
            if skip_display_math and text.startswith("$$"):
                continue
            doc_item.text = self._normalize_unicode_math(text)

    def _load_ocr():
        """Instantiate the LaTeX-OCR wrapper, or return None if it cannot be imported."""
        try:
            from .latex_ocr import LaTeXOCR
            return LaTeXOCR(backend=os.getenv("LONGPARSER_LATEX_OCR_BACKEND", "pix2tex"))
        except ImportError:
            logger.warning("latex_ocr module not available. Skipping formula OCR.")
            return None

    def _px_boxes(doc_item, page_no, page_h, sx, sy):
        """Yield pixel-space bbox dicts for the item's provenance entries on page_no."""
        for prov in getattr(doc_item, "prov", None) or []:
            if getattr(prov, "page_no", None) != page_no:
                continue
            bbox = getattr(prov, "bbox", None)
            if bbox is None:
                continue
            tl = bbox.to_top_left_origin(page_h)
            yield {
                "x0": int(tl.l * sx), "y0": int(tl.t * sy),
                "x1": int(tl.r * sx), "y1": int(tl.b * sy),
            }

    try:
        ext = file_path.suffix.lower()

        # ── Non-PDF formats: standard conversion ────────────────────────────
        if ext != ".pdf":
            result = self._converter.convert(str(file_path))

            # DOCX/PPTX: inject OMML equations as LaTeX
            if ext in (".docx", ".pptx") and config.formula_mode != "fast":
                if ext == ".docx":
                    latex_eqs = self._extract_docx_equations(file_path)
                else:
                    latex_eqs = self._extract_pptx_equations(file_path)

                if latex_eqs:
                    # Formula blocks in Docling output, in reading order
                    formula_blocks = []
                    for item, _level in result.document.iterate_items():
                        label = getattr(item, "label", None)
                        if label and ("formula" in str(label).lower() or "equation" in str(label).lower()):
                            formula_blocks.append(item)

                    # Order-based substitution with alignment gate
                    injected = 0
                    for block, latex in zip(formula_blocks, latex_eqs):
                        orig_len = len(block.text.strip()) if block.text else 0
                        latex_len = len(latex.strip())

                        # Asymmetric gate: allow if Docling text is empty/garbled
                        if orig_len < 3 and latex_len > 3:
                            block.text = f"$${latex}$$"
                            injected += 1
                        elif latex_len > 0 and 0.2 <= (orig_len + 5) / (latex_len + 5) <= 5.0:
                            block.text = f"$${latex}$$"
                            injected += 1
                        else:
                            logger.debug("Skipping equation inject: ratio out of range")

                    if len(formula_blocks) != len(latex_eqs):
                        logger.warning(
                            f"{ext.upper()} equation count mismatch: "
                            f"extracted={len(latex_eqs)}, docling={len(formula_blocks)}. "
                            f"Injected {injected}."
                        )
                    else:
                        logger.info(f"Injected {injected} LaTeX equations from {ext.upper()}")

            return result

        # ── PDF handling ─────────────────────────────────────────────────────
        # The converter was built from config, so this single pass is already
        # the enriched one in "full" mode and the plain one otherwise.
        result = self._converter.convert(str(file_path))

        if config.formula_mode != "smart":
            if config.formula_mode == "fast":
                # Make Unicode math nicer; no OCR in fast mode.
                _normalize_all(result.document)
            return result

        # ── Smart mode: BBox crop → LaTeX-OCR ───────────────────────────────
        num_pages = len(result.document.pages)

        # Page cap: if PDF is huge, fall back to normalization only
        if num_pages > 100:
            logger.info(f"Smart mode disabled: {num_pages} pages exceeds cap (100). "
                        "Falling back to Unicode normalization only.")
            _normalize_all(result.document)
            return result

        # Bound up-front so the MFD fallback below can rely on them even when
        # Docling reported no formula blocks at all.
        ocr = None
        ocr_load_attempted = False
        processed = 0

        # Find equation items (FORMULA-labeled blocks only)
        equation_items = self._find_equation_items(result.document)

        if equation_items:
            # Merge adjacent formula fragments
            merged_items, union_bboxes, blank_ids = self._merge_adjacent_formulas(
                equation_items, result.document
            )

            ocr = _load_ocr()
            ocr_load_attempted = True

            if ocr and ocr.available:
                t0 = time.monotonic()
                # Per-equation timeout for the OCR call.
                # NOTE(review): with a single worker, a call that times out
                # keeps running and later submissions queue behind it.
                per_eq_timeout = float(os.getenv("LONGPARSER_FORMULA_PER_EQ_TIMEOUT", "30"))
                from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeout
                executor = ThreadPoolExecutor(max_workers=1)
                try:
                    for item, page_no in merged_items:
                        # Circuit breaker: equation count
                        if processed >= config.smart_max_equations:
                            logger.info(f"Circuit breaker: {processed} equations reached limit")
                            break
                        # Circuit breaker: time budget
                        if time.monotonic() - t0 > config.smart_max_ocr_seconds:
                            logger.info(f"Circuit breaker: OCR time budget exceeded "
                                        f"({time.monotonic() - t0:.1f}s > {config.smart_max_ocr_seconds}s)")
                            break

                        crop = self._crop_equation_bbox(
                            result.document, item, page_no, union_bboxes
                        )
                        if crop is None:
                            continue

                        try:
                            future = executor.submit(ocr.recognize, crop)
                            latex = future.result(timeout=per_eq_timeout)
                        except FuturesTimeout:
                            logger.info(f"Equation OCR timed out after {per_eq_timeout}s, skipping")
                            continue
                        except Exception as e:
                            logger.debug(f"Equation OCR error: {e}")
                            continue

                        if latex:
                            item.text = f"$${latex}$$"
                            processed += 1
                finally:
                    # Do not wait for a possibly stuck OCR call.
                    executor.shutdown(wait=False)

                # Blank leftover fragments (merged items whose text was absorbed)
                for item, _level in result.document.iterate_items():
                    if id(item) in blank_ids:
                        item.text = ""

                logger.info(f"Smart mode: OCR'd {processed} equations in "
                            f"{time.monotonic() - t0:.2f}s")
            else:
                logger.info("LaTeX-OCR not available. Using Unicode normalization only.")
        else:
            logger.info("Smart mode: No FORMULA blocks detected by Docling.")

        # ── MFD fallback: scan candidate pages for missed equations ─────────
        try:
            from .latex_ocr import MFDBackend
            mfd = MFDBackend.get()
        except Exception as e:
            logger.debug(f"MFD backend unavailable: {e}")
            mfd = None

        # The OCR backend has not been loaded yet when Docling found no
        # formula blocks; load it now so the fallback can run in that case.
        if mfd and mfd.available and not ocr_load_attempted:
            ocr = _load_ocr()

        if mfd and mfd.available and ocr and ocr.available:
            from types import SimpleNamespace

            t0_mfd = time.monotonic()
            max_ocr_secs = config.smart_max_ocr_seconds

            # iterate_items() yields (item, hierarchy level), not a page
            # number, so pages are resolved from each item's provenance.
            # Items are stored by reference, so later text/label edits are
            # visible through these lists.
            items_by_page = {}
            for doc_item, _level in result.document.iterate_items():
                for prov in getattr(doc_item, "prov", None) or []:
                    pg = getattr(prov, "page_no", None)
                    if pg is None:
                        continue
                    bucket = items_by_page.setdefault(pg, [])
                    if not bucket or bucket[-1] is not doc_item:
                        bucket.append(doc_item)

            for page_no, page_obj in result.document.pages.items():
                # Budget gate: stop scanning pages once the MFD pass itself
                # has used 40% of the OCR time budget.
                if time.monotonic() - t0_mfd > max_ocr_secs * 0.4:
                    logger.info("MFD: time budget low, stopping page scan")
                    break

                page_items = items_by_page.get(page_no, [])

                # Count Docling formulas on this page for candidate gating
                docling_formula_count = sum(
                    1 for it in page_items if _label_of(it) in formula_labels
                )

                # Minimal proxies (text + label string) for _is_mfd_candidate
                page_proxy = [
                    SimpleNamespace(text=it.text, type=str(getattr(it, "label", "")))
                    for it in page_items
                    if getattr(it, "text", None)
                ]

                if not _is_mfd_candidate(page_no, page_proxy, docling_formula_count):
                    continue

                # Page image rendered by Docling (generate_page_images)
                try:
                    page_img = page_obj.image.pil_image
                except Exception:
                    continue
                if page_img is None:
                    continue

                mfd_boxes = mfd.detect(page_img)
                if not mfd_boxes:
                    continue

                # Scale factors from page units to image pixels
                img_w, img_h = page_img.size
                page_w = page_obj.size.width
                page_h = page_obj.size.height
                sx = img_w / page_w if page_w else 1.0
                sy = img_h / page_h if page_h else 1.0

                # Pixel-space bboxes of existing Docling formula items on this page
                existing_formula_px = [
                    box
                    for it in page_items
                    if _label_of(it) in formula_labels
                    for box in _px_boxes(it, page_no, page_h, sx, sy)
                ]

                for mbox in mfd_boxes:
                    # Circuit breakers
                    if processed >= config.smart_max_equations:
                        break
                    if time.monotonic() - t0_mfd > max_ocr_secs:
                        break

                    # Skip if already covered by a Docling formula bbox
                    if any(_iou_px(mbox, ex) > 0.5 for ex in existing_formula_px):
                        continue

                    # Crop with 15% padding on each side, clamped to the image
                    pad_x = (mbox["x1"] - mbox["x0"]) * 0.15
                    pad_y = (mbox["y1"] - mbox["y0"]) * 0.15
                    cx0 = max(0, mbox["x0"] - pad_x)
                    cy0 = max(0, mbox["y0"] - pad_y)
                    cx1 = min(img_w, mbox["x1"] + pad_x)
                    cy1 = min(img_h, mbox["y1"] + pad_y)
                    if (cx1 - cx0) < 64 or (cy1 - cy0) < 64:
                        continue
                    crop = page_img.crop((int(cx0), int(cy0), int(cx1), int(cy1)))

                    latex = ocr.recognize(crop)
                    if not latex:
                        continue

                    processed += 1
                    delim = "$$" if mbox["type"] == "isolated" else "$"
                    latex_text = f"{delim}{latex}{delim}"

                    # Replace-first: find an overlapping garbled non-formula item
                    replaced = False
                    for item in page_items:
                        text = getattr(item, "text", None)
                        if not text:
                            continue
                        if _label_of(item) in formula_labels:
                            continue
                        if not _MATH_RE.search(text):
                            continue
                        for item_px in _px_boxes(item, page_no, page_h, sx, sy):
                            if _iou_px(item_px, mbox) > 0.5:
                                item.text = latex_text
                                # Best effort: relabel as formula so
                                # downstream sees it correctly.
                                try:
                                    item.label = type(item.label)("formula")
                                except Exception as e:
                                    logger.debug(f"MFD: could not relabel item: {e}")
                                replaced = True
                                logger.debug(f"MFD: replaced garbled block on page {page_no}")
                                break
                        if replaced:
                            break

                    if not replaced:
                        logger.debug(f"MFD: no overlapping garbled block found on page {page_no}; "
                                     f"new equation injected as standalone")
                        # Best effort: append a minimal formula TextItem to the
                        # document's text list; failures are only logged.
                        try:
                            from docling_core.types.doc import TextItem as _TextItem, DocItemLabel as _DIL
                            new_item = _TextItem(
                                label=_DIL.FORMULA,
                                text=latex_text,
                                prov=[],
                            )
                            result.document.texts.append(new_item)
                        except Exception as e:
                            logger.debug(f"MFD: could not inject new item: {e}")

            logger.info(f"MFD fallback finished. Total OCR'd: {processed}")

        # Normalize remaining text (items not replaced with $$...$$ LaTeX)
        _normalize_all(result.document, skip_display_math=True)

        return result

    except Exception as e:
        logger.error(f"Docling extraction failed: {e}")
        raise
568
+
569
+ def _cluster_font_sizes(self, heights: List[float], tolerance: float = 0.15) -> List[List[float]]:
570
+ """
571
+ Cluster heading bbox heights into distinct font-size groups.
572
+
573
+ Uses relative tolerance: two heights belong to the same cluster
574
+ if they are within `tolerance` (15%) of the cluster's mean height.
575
+
576
+ Returns:
577
+ List of clusters, sorted from largest mean height to smallest.
578
+ Each cluster is a list of heights that belong to it.
579
+ """
580
+ if not heights:
581
+ return []
582
+
583
+ sorted_heights = sorted(set(heights), reverse=True)
584
+ clusters = []
585
+
586
+ for h in sorted_heights:
587
+ placed = False
588
+ for cluster in clusters:
589
+ cluster_mean = sum(cluster) / len(cluster)
590
+ # Relative difference check
591
+ if abs(h - cluster_mean) / max(cluster_mean, 0.1) <= tolerance:
592
+ cluster.append(h)
593
+ placed = True
594
+ break
595
+ if not placed:
596
+ clusters.append([h])
597
+
598
+ # Sort clusters by mean height descending (largest font first)
599
+ clusters.sort(key=lambda c: sum(c) / len(c), reverse=True)
600
+ return clusters
601
+
602
@staticmethod
def _extract_marker(text: str) -> Optional[str]:
    """
    Return the leading marker/prefix of a heading, if it has one.

    The text is stripped and matched against ``_MARKER_RE``, which picks up
    prefixes such as "I.", "A.", "1.", "IV." or "2.3".

    Returns:
        The captured marker string, or None when the pattern does not match.
    """
    match = _MARKER_RE.match(text.strip())
    if match is None:
        return None
    return match.group(1)
615
+
616
+ @staticmethod
617
+ def _classify_marker_type(marker: str) -> str:
618
+ """
619
+ Classify a marker using strict numbering patterns.
620
+
621
+ Returns:
622
+ 'numeric' for 1, 1.1, 1.1.1
623
+ 'alpha' for A, B, A.1
624
+ 'roman' for I, II, IV
625
+ 'other' for bullets or non-structural markers
626
+ """
627
+ if not marker:
628
+ return 'other'
629
+
630
+ marker = marker.strip()
631
+
632
+ # Strict Numeric: 1. or 1.1 or 1.1.1
633
+ if re.match(r'^\d+(\.\d+)*\.?$', marker):
634
+ return 'numeric'
635
+
636
+ # Strict Roman: I. or IV. (common uppercase roman)
637
+ if re.match(r'^[IVX]+\.?$', marker):
638
+ return 'roman'
639
+
640
+ # Strict Alpha: A. or A.1
641
+ if re.match(r'^[A-Z](\.[0-9]+)*\.?$', marker):
642
+ return 'alpha'
643
+
644
+ return 'other'
645
+
646
def _sub_cluster_by_markers(
    self,
    texts_in_cluster: List[str],
    base_level: int,
) -> Dict[str, int]:
    """
    Sub-differentiate headings that share a font-size cluster by looking at
    their leading markers.

    Algorithm:
        1. Classify each heading's marker with ``_classify_marker_type``
           (headings without a marker get no type).
        2. Keep only marker types that occur at least twice.  If fewer than
           two such types exist, every heading stays at ``base_level``.
        3. For each remaining type compute the average gap between the
           positions of its consecutive headings.  Parent sections have
           LARGER gaps because child sections sit between them.
        4. Rank types by gap, largest first, and assign ``base_level``,
           ``base_level + 1``, ...  Equal gaps are resolved in favour of
           the type that appears first, so the result is deterministic.

    Args:
        texts_in_cluster: Heading texts in this font-size cluster, in
            document order.
        base_level: Level assigned by font-size clustering.

    Returns:
        Dict mapping heading text -> adjusted heading level.  Headings whose
        type was not ranked keep ``base_level``.
    """
    if len(texts_in_cluster) <= 1:
        return {t: base_level for t in texts_in_cluster}

    # Step 1: classify markers
    text_info = []  # [(text, mtype)]
    type_counts: Dict[str, int] = {}
    for text in texts_in_cluster:
        marker = self._extract_marker(text)
        mtype = self._classify_marker_type(marker) if marker else None
        text_info.append((text, mtype))
        if mtype:
            type_counts[mtype] = type_counts.get(mtype, 0) + 1

    # Step 2: need at least 2 distinct types with 2+ headings each
    active_types = {t for t, c in type_counts.items() if c >= 2}
    if len(active_types) <= 1:
        return {t: base_level for t in texts_in_cluster}

    # Positions per active type; dict order = order of first appearance.
    type_positions: Dict[str, List[int]] = {}
    for idx, (_text, mtype) in enumerate(text_info):
        if mtype in active_types:
            type_positions.setdefault(mtype, []).append(idx)

    # Step 3: average gap.  Every active type has >= 2 positions, and the
    # consecutive gaps sum to (last - first).
    type_avg_span = {
        mtype: (positions[-1] - positions[0]) / (len(positions) - 1)
        for mtype, positions in type_positions.items()
    }

    # Step 4: largest gap = parent.  The first position breaks ties so the
    # ranking does not depend on set/hash iteration order.
    sorted_types = sorted(
        type_positions,
        key=lambda t: (-type_avg_span[t], type_positions[t][0]),
    )
    type_to_sublevel = {
        mtype: base_level + rank for rank, mtype in enumerate(sorted_types)
    }

    result = {
        text: type_to_sublevel.get(mtype, base_level)
        for text, mtype in text_info
    }

    logger.debug(
        f"Sub-clustered {len(texts_in_cluster)} headings at level {base_level}: "
        f"types={dict(type_counts)}, spans={type_avg_span}, "
        f"sub-levels={type_to_sublevel}"
    )

    return result
734
+
735
def _build_hierarchy_map(self, docling_doc) -> Tuple[Dict[str, List[str]], Dict[str, int]]:
    """
    Build two mappings for a Docling document:
        1. item self_ref -> heading path (from ``self._chunker``)
        2. heading text  -> heading level (font-size + marker analysis)

    Heading level inference:
        Phase 1: Font-size clustering — section headers are grouped by the
                 bbox height of their first provenance entry.  The largest
                 cluster is processed first.
        Phase 2: Marker-pattern analysis — within each cluster,
                 ``_sub_cluster_by_markers`` may create sub-levels
                 (e.g. "I." parent, "A." child).
        Late-arrival rule: a heading whose marker type is 'other', that
                 appears after the first numeric/alpha/roman heading and is
                 not one of a small set of standard titles, is mapped to
                 level -1.

    Headings are keyed by text; when the same text occurs more than once,
    only the first occurrence contributes its height and position.

    Returns:
        Tuple of (ref_to_path, heading_to_level).  Both are empty when the
        document has no section headers.  ``ref_to_path`` stays empty if
        the chunker raises (a warning is logged).
    """
    ref_to_path: Dict[str, List[str]] = {}
    heading_to_level: Dict[str, int] = {}

    # --- Step 1: Collect heading texts and bbox heights (document order) ---
    heading_heights: Dict[str, float] = {}
    heading_order: List[str] = []

    for item, _level in docling_doc.iterate_items():
        if not isinstance(item, SectionHeaderItem):
            continue
        text = getattr(item, 'text', '')
        if not text or text in heading_heights:
            continue

        height = 0.0
        prov = getattr(item, 'prov', [])
        if prov:
            bbox = getattr(prov[0], 'bbox', None)
            if bbox:
                height = abs(getattr(bbox, 't', 0) - getattr(bbox, 'b', 0))

        heading_heights[text] = height
        heading_order.append(text)

    if not heading_heights:
        logger.info("No section headers found in document")
        return ref_to_path, heading_to_level

    # --- Step 2: Font-size clustering ---
    clusters = self._cluster_font_sizes(list(heading_heights.values()))
    height_to_cidx = {
        h: idx for idx, cluster in enumerate(clusters) for h in cluster
    }

    cluster_texts: Dict[int, List[str]] = {}
    for text in heading_order:
        cidx = height_to_cidx.get(heading_heights[text], 0)
        cluster_texts.setdefault(cidx, []).append(text)

    # --- Step 3: Marker-pattern sub-clustering & Late-Arrival Logic ---

    # Classify every heading once and remember its document position, so
    # the loops below need no repeated list.index() scans.
    marker_types: Dict[str, str] = {}
    for text in heading_order:
        marker = self._extract_marker(text)
        marker_types[text] = self._classify_marker_type(marker) if marker else 'other'
    order_index = {text: idx for idx, text in enumerate(heading_order)}

    # 3a. First "strong" (numbered) heading in the entire document
    strong_types = ('numeric', 'alpha', 'roman')
    first_strong_index = next(
        (idx for idx, text in enumerate(heading_order)
         if marker_types[text] in strong_types),
        None,
    )
    logger.info(f"First strong heading index: {first_strong_index}")

    # Unnumbered titles that are never demoted by the late-arrival rule
    standard_titles = {
        "introduction", "abstract", "background", "objective",
        "conclusion", "references", "appendix"
    }

    # 3b. Assign levels with Late-Arrival check
    current_level = 1
    for cidx in sorted(cluster_texts):
        valid_texts = []
        demoted_texts = []

        for text in cluster_texts[cidx]:
            arrives_late = (
                first_strong_index is not None
                and order_index[text] > first_strong_index
            )
            if (
                marker_types[text] == 'other'
                and arrives_late
                and text.strip().lower() not in standard_titles
            ):
                # Demote to -1 to force Paragraph type
                demoted_texts.append(text)
            else:
                valid_texts.append(text)

        if valid_texts:
            sub_levels = self._sub_cluster_by_markers(valid_texts, base_level=current_level)
            heading_to_level.update(sub_levels)
            # The next (smaller-font) cluster starts below the deepest
            # sub-level used here.
            current_level = max(sub_levels.values(), default=current_level) + 1

        for text in demoted_texts:
            heading_to_level[text] = -1

    # Log results
    level_counts: Dict[int, int] = {}
    demoted_count = 0
    for lvl in heading_to_level.values():
        if lvl == -1:
            demoted_count += 1
        else:
            level_counts[lvl] = level_counts.get(lvl, 0) + 1

    # Report the number of headings per cluster, not the number of distinct
    # heights the cluster happens to contain.
    cluster_info = ", ".join(
        f"h{i+1}={sum(c)/len(c):.1f}px ({len(cluster_texts.get(i, ()))} headings)"
        for i, c in enumerate(clusters)
    )
    logger.info(
        f"Heading levels analyzed: {len(heading_to_level)} total. "
        f"Valid levels={dict(sorted(level_counts.items()))}, "
        f"Demoted (Text)={demoted_count} "
        f"[clusters: {cluster_info}]"
    )

    # --- Step 4: Build ref_to_path from the chunker ---
    try:
        for chunk in self._chunker.chunk(docling_doc):
            heading_path = []
            if hasattr(chunk, 'meta') and chunk.meta:
                if hasattr(chunk.meta, 'headings') and chunk.meta.headings:
                    heading_path = list(chunk.meta.headings)

                if hasattr(chunk.meta, 'doc_items') and chunk.meta.doc_items:
                    for doc_item in chunk.meta.doc_items:
                        ref = getattr(doc_item, 'self_ref', None)
                        if ref:
                            ref_to_path[ref] = heading_path
    except Exception as e:
        logger.warning(f"HierarchicalChunker failed, hierarchy paths will be empty: {e}")

    return ref_to_path, heading_to_level
892
+
893
+ def _get_page_dimensions(self, docling_doc) -> Dict[int, Tuple[float, float]]:
894
+ """
895
+ Extract actual page dimensions from Docling document.
896
+
897
+ Returns:
898
+ Dict mapping page_no (0-based) -> (width, height)
899
+ """
900
+ dims = {}
901
+ if hasattr(docling_doc, 'pages') and docling_doc.pages:
902
+ for page_no, page in docling_doc.pages.items():
903
+ width, height = 612.0, 792.0 # Fallback to US Letter
904
+ if hasattr(page, 'size') and page.size:
905
+ width = float(page.size.width) if hasattr(page.size, 'width') else 612.0
906
+ height = float(page.size.height) if hasattr(page.size, 'height') else 792.0
907
+ dims[page_no - 1] = (width, height) # Convert to 0-based
908
+ return dims
909
+
910
def _extract_bbox(self, prov) -> BoundingBox:
    """Build a BoundingBox from a provenance entry.

    Two bbox shapes are understood: an object exposing ``l``/``t``/``r``/``b``
    attributes, or a list/tuple holding at least four coordinates. A missing
    or empty bbox, or any other shape, yields an all-zero box.
    """
    raw_box = getattr(prov, 'bbox', None) if prov else None

    coords = None
    if raw_box:
        if hasattr(raw_box, 'l'):
            coords = (raw_box.l, raw_box.t, raw_box.r, raw_box.b)
        elif isinstance(raw_box, (list, tuple)) and len(raw_box) >= 4:
            coords = (raw_box[0], raw_box[1], raw_box[2], raw_box[3])

    if coords is None:
        return BoundingBox(x0=0, y0=0, x1=0, y1=0)

    left, top, right, bottom = (float(value) for value in coords)
    return BoundingBox(x0=left, y0=top, x1=right, y1=bottom)
931
+
932
def _get_item_provenance(self, item) -> Tuple[int, BoundingBox]:
    """
    Read the page index and bounding box of a Docling item.

    Only the first provenance entry that carries a ``page_no`` is used.
    Items without such an entry map to page 0 and an all-zero box.

    Returns:
        Tuple of (page_number_0based, BoundingBox)
    """
    for entry in getattr(item, 'prov', None) or ():
        if hasattr(entry, 'page_no'):
            # Docling's page_no is shifted down by one to get a 0-based index.
            return entry.page_no - 1, self._extract_bbox(entry)

    return 0, BoundingBox(x0=0, y0=0, x1=0, y1=0)
950
+
951
def _determine_block_type(self, item, level: int, heading_to_level: Dict[str, int] = None) -> Tuple[BlockType, Optional[int]]:
    """
    Classify a Docling item as one of our block types.

    Concrete item classes are checked first; ``item.label`` is consulted
    only for items that no class check matched. Heading items
    (SectionHeaderItem, and TitleItem when that class is available) look
    up their text in ``heading_to_level``:

    - a mapped level of -1 means the heading was demoted, so the item is
      reported as a paragraph;
    - any other mapped level is used as-is;
    - unmapped headings fall back to ``max(1, level)``.

    Returns:
        Tuple of (BlockType, heading_level_or_None)
    """
    # PPTX slide titles arrive as TitleItem; they are treated like section headers.
    heading_like = isinstance(item, SectionHeaderItem) or (
        TitleItem is not None and isinstance(item, TitleItem)
    )
    if heading_like:
        heading_text = getattr(item, 'text', '')
        if not heading_to_level or heading_text not in heading_to_level:
            return BlockType.HEADING, max(1, level)
        mapped_level = heading_to_level[heading_text]
        if mapped_level == -1:
            return BlockType.PARAGRAPH, None
        return BlockType.HEADING, mapped_level

    for item_cls, kind in (
        (TableItem, BlockType.TABLE),
        (ListItem, BlockType.LIST_ITEM),
        (PictureItem, BlockType.FIGURE),
    ):
        if isinstance(item, item_cls):
            return kind, None

    # Label-based classification; the order of the checks matters because
    # the tests are substring matches on the lower-cased label.
    label = getattr(item, 'label', None)
    if label:
        tag = label.lower() if isinstance(label, str) else str(label).lower()

        def mentions(*words):
            return any(word in tag for word in words)

        if mentions('caption'):
            return BlockType.CAPTION, None
        if mentions('footer', 'footnote'):
            return BlockType.FOOTER, None
        if mentions('header') and not mentions('section'):
            return BlockType.HEADER, None
        if mentions('equation', 'formula'):
            return BlockType.EQUATION, None
        if mentions('code'):
            return BlockType.CODE, None
        if mentions('title'):
            return BlockType.HEADING, max(1, level)

    # Nothing matched: treat as ordinary paragraph text.
    return BlockType.PARAGRAPH, None
1019
+
1020
def _get_item_text(self, item, docling_doc=None) -> str:
    """Return the textual content of a Docling item.

    Strategy, in order:

    1. Tables are rendered with ``export_to_markdown(doc=docling_doc)``.
    2. Any item with a non-empty ``text`` attribute returns that text.
    3. Otherwise ``export_to_markdown()`` is tried without arguments.

    A failing export is logged at debug level and the next strategy is
    tried, so this method stays best-effort and never raises because of
    an export error. Returns ``""`` when no strategy produced text.
    """
    can_export = hasattr(item, 'export_to_markdown')

    # For tables, prefer markdown with doc context for proper rendering
    if can_export and isinstance(item, TableItem):
        try:
            return item.export_to_markdown(doc=docling_doc)
        except Exception as exc:
            logger.debug(f"Table markdown export failed, falling back to item text: {exc}")

    plain_text = getattr(item, 'text', None)
    if plain_text:
        return plain_text

    if can_export:
        try:
            return item.export_to_markdown()
        except Exception as exc:
            logger.debug(f"Markdown export failed for {type(item).__name__}: {exc}")

    return ""
1036
+
1037
+ def _get_item_confidence(self, item) -> float:
1038
+ """Extract confidence from a Docling item, defaulting to 1.0."""
1039
+ if hasattr(item, 'confidence') and item.confidence is not None:
1040
+ return float(item.confidence)
1041
+ return 1.0
1042
+
1043
def _build_pptx_text_map(self, file_path: Path) -> Dict[int, Dict[str, PptxParaInfo]]:
    """
    Use python-pptx to build a per-slide map of text -> paragraph info.

    Each slide's shapes are handed to ``_extract_pptx_shape_info``, which
    fills the slide's map. Afterwards, for presentations with at least
    three slides, any text that appears on more than half of the slides
    and is not already tagged as title, subtitle or footer is re-tagged
    as footer (repeated header/footer noise).

    Args:
        file_path: Path of the presentation to open.

    Returns:
        Dict[slide_idx (0-based), Dict[normalized_text, PptxParaInfo]].
        Empty when python-pptx is not installed or the file cannot be
        opened; both cases are logged as warnings.
    """
    try:
        from pptx import Presentation
        from pptx.enum.shapes import PP_PLACEHOLDER_TYPE
    except ImportError:
        logger.warning("python-pptx not installed, cannot build PPTX indent map")
        return {}

    try:
        prs = Presentation(str(file_path))
    except Exception as e:
        logger.warning(f"Failed to open PPTX with python-pptx: {e}")
        return {}

    pptx_map: Dict[int, Dict[str, PptxParaInfo]] = {}

    for slide_idx, slide in enumerate(prs.slides):
        slide_map: Dict[str, PptxParaInfo] = {}
        found_title = False

        # Check if slide 0 has an actual SUBTITLE placeholder.
        # If it does, the positional subtitle heuristic is not needed.
        has_subtitle_placeholder = False
        if slide_idx == 0:
            for s in slide.shapes:
                if not s.is_placeholder:
                    continue
                try:
                    if s.placeholder_format.type == PP_PLACEHOLDER_TYPE.SUBTITLE:
                        has_subtitle_placeholder = True
                        break
                except Exception:
                    # Best effort: a placeholder whose type cannot be read
                    # is simply not counted as a subtitle.
                    continue

        for shape in slide.shapes:
            found_title = self._extract_pptx_shape_info(
                shape, slide_map, slide_idx=slide_idx, found_title=found_title,
                has_subtitle_placeholder=has_subtitle_placeholder,
            )

        pptx_map[slide_idx] = slide_map

    # Post-processing: detect repeated text across slides (footer/header noise).
    # Text appearing on >50% of slides is likely a repeated footer/header element.
    num_slides = len(pptx_map)
    if num_slides >= 3:  # Only apply for presentations with enough slides
        text_slide_count: Dict[str, int] = {}
        for slide_map in pptx_map.values():
            for text, info in slide_map.items():
                if not info.is_title and not info.is_subtitle and not info.is_footer:
                    text_slide_count[text] = text_slide_count.get(text, 0) + 1

        threshold = num_slides * 0.5
        repeated_texts = {t for t, count in text_slide_count.items() if count > threshold}

        if repeated_texts:
            logger.info(f"Detected {len(repeated_texts)} repeated footer/header texts across slides")
            for slide_map in pptx_map.values():
                for text in repeated_texts:
                    if text in slide_map:
                        slide_map[text] = PptxParaInfo(
                            indent_level=0, is_title=False, is_subtitle=False,
                            is_list=False, bullet_type='None', is_footer=True,
                        )

    total_entries = sum(len(m) for m in pptx_map.values())
    logger.info(f"Built PPTX text map: {len(pptx_map)} slides, {total_entries} text entries")
    return pptx_map
1122
+
1123
def _extract_pptx_shape_info(self, shape, slide_map: Dict[str, PptxParaInfo],
                             slide_idx: int = 0, found_title: bool = False,
                             has_subtitle_placeholder: bool = False) -> bool:
    """Record paragraph info for one shape in ``slide_map``, handling groups recursively.

    Args:
        shape: python-pptx shape to inspect.
        slide_map: Per-slide map of normalized text -> PptxParaInfo; it is
            mutated in place. Only the first occurrence of a text is stored.
        slide_idx: 0-based index of the slide the shape belongs to.
        found_title: Whether a title shape was already seen on this slide.
        has_subtitle_placeholder: Whether the slide has a real SUBTITLE
            placeholder; when True the positional subtitle heuristic is off.

    Returns:
        Whether a title shape has been found so far (for subtitle detection).
        Returned unchanged when python-pptx cannot be imported.
    """
    try:
        from pptx.enum.shapes import PP_PLACEHOLDER_TYPE as PP_PLACEHOLDER
        from pptx.enum.shapes import MSO_SHAPE_TYPE
    except ImportError:
        return found_title

    # python-pptx raises NotImplementedError from ``shape_type`` for shapes it
    # cannot classify; such a shape is not a group, and must not abort the
    # whole slide scan.
    try:
        is_group = shape.shape_type == MSO_SHAPE_TYPE.GROUP
    except NotImplementedError:
        is_group = False

    # Handle group shapes recursively
    if is_group:
        for child_shape in shape.shapes:
            found_title = self._extract_pptx_shape_info(
                child_shape, slide_map, slide_idx=slide_idx, found_title=found_title,
                has_subtitle_placeholder=has_subtitle_placeholder,
            )
        return found_title

    if not hasattr(shape, 'text_frame') or not shape.has_text_frame:
        return found_title

    # Determine if this shape is a title/subtitle/footer placeholder
    is_title_shape = False
    is_subtitle_shape = False
    is_footer_shape = False
    if shape.is_placeholder:
        try:
            ph_type = shape.placeholder_format.type
            if ph_type in (PP_PLACEHOLDER.TITLE, PP_PLACEHOLDER.CENTER_TITLE):
                is_title_shape = True
            elif ph_type == PP_PLACEHOLDER.SUBTITLE:
                is_subtitle_shape = True
            elif ph_type in (PP_PLACEHOLDER.DATE, PP_PLACEHOLDER.FOOTER, PP_PLACEHOLDER.SLIDE_NUMBER):
                is_footer_shape = True
        except Exception:
            # Best effort: an unreadable placeholder type is treated as body text.
            pass

    # Footer/date/slide-number shapes are recorded with is_footer=True so
    # that their text can be filtered during block conversion.
    if is_footer_shape:
        for paragraph in shape.text_frame.paragraphs:
            text = paragraph.text.strip()
            if text:
                norm_text = ' '.join(text.split())
                if norm_text not in slide_map:
                    slide_map[norm_text] = PptxParaInfo(
                        indent_level=0, is_title=False, is_subtitle=False,
                        is_list=False, bullet_type='None', is_footer=True,
                    )
        return found_title

    # Detect subtitle: on the title slide (slide 0), the first non-placeholder
    # text shape after the TITLE is a subtitle, but ONLY if there's no actual
    # SUBTITLE placeholder on the slide (to avoid false positives). Once one
    # subtitle entry exists in slide_map, later text shapes are ordinary text.
    is_subtitle_by_position = (
        slide_idx == 0 and found_title and not is_title_shape
        and not shape.is_placeholder and not has_subtitle_placeholder
        and not any(info.is_subtitle for info in slide_map.values())
    )

    # DrawingML namespace used for the bullet lookups below.
    ns = {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}

    first_para_in_shape = True
    for paragraph in shape.text_frame.paragraphs:
        text = paragraph.text.strip()
        if not text:
            continue

        indent_level = paragraph.level if paragraph.level is not None else 0

        # Detect bullet/numbered list
        is_list = False
        bullet_type = 'None'
        p_elem = paragraph._element
        if p_elem.find('.//a:buChar', namespaces=ns) is not None:
            is_list = True
            bullet_type = 'Bullet'
        elif p_elem.find('.//a:buAutoNum', namespaces=ns) is not None:
            is_list = True
            bullet_type = 'Numbered'
        elif indent_level > 0 and not is_title_shape:
            # Indented paragraph without an explicit bullet element.
            is_list = True

        # Mark as subtitle only for the FIRST paragraph of a positional subtitle shape
        mark_subtitle = is_subtitle_shape or (is_subtitle_by_position and first_para_in_shape)

        # Titles and subtitles are never list items and carry no indent.
        if is_title_shape or is_subtitle_shape or mark_subtitle:
            indent_level = 0
            is_list = False
            bullet_type = 'None'

        # Normalize text for matching (Docling may strip/normalize differently)
        norm_text = ' '.join(text.split())

        # Only store first occurrence per slide (duplicate text on same slide is rare)
        if norm_text not in slide_map:
            slide_map[norm_text] = PptxParaInfo(
                indent_level=indent_level,
                is_title=is_title_shape,
                is_subtitle=mark_subtitle,
                is_list=is_list,
                bullet_type=bullet_type,
            )

        first_para_in_shape = False

    # Track that we've seen a title shape
    if is_title_shape:
        found_title = True

    return found_title
1243
+
1244
def extract(
    self,
    file_path: Path,
    config: ProcessingConfig,
    page_numbers: Optional[List[int]] = None,
) -> Tuple[Document, ExtractionMetadata]:
    """
    Extract document using Docling.

    Uses iterate_items() for reading-order block extraction
    and HierarchicalChunker for heading hierarchy paths.
    For PPTX files, uses python-pptx directly for indent levels and
    assigns every slide title heading level 2 instead of running the
    hierarchy analysis.

    Args:
        file_path: Path to document file
        config: Processing configuration
        page_numbers: Optional specific pages to keep. Values are compared
            against ``Page.page_number``, which ``_convert_to_pages`` fills
            with 0-based page indices.

    Returns:
        Tuple of (Document, ExtractionMetadata). ``total_pages`` in the
        document metadata counts the pages that remain after filtering.
    """
    file_path = Path(file_path)
    is_pptx = file_path.suffix.lower() in ('.pptx', '.ppt')

    # Calculate file hash in chunks so large files are never held in memory whole
    digest = hashlib.md5()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            digest.update(chunk)
    file_hash = digest.hexdigest()

    # Get conversion result
    result = self._run_docling(file_path, config)
    docling_doc = result.document

    # Build PPTX-specific indent map if applicable
    pptx_text_map = None
    if is_pptx:
        pptx_text_map = self._build_pptx_text_map(file_path)
        # For PPTX: skip font-size clustering, use simple heading levels.
        # All slide titles become h2 (since they're all peer-level slides).
        # TitleItem may be None on older docling versions.
        if TitleItem is None:
            heading_types = (SectionHeaderItem,)
        else:
            heading_types = (SectionHeaderItem, TitleItem)
        heading_to_level = {}
        for item, _level in docling_doc.iterate_items():
            if isinstance(item, heading_types):
                text = getattr(item, 'text', '')
                if text:
                    heading_to_level[text] = 2  # All PPTX titles = h2
        hierarchy_map = {}
        logger.info(f"PPTX mode: assigned {len(heading_to_level)} headings to level 2")
    else:
        # Standard PDF/DOCX path: use font-size clustering
        hierarchy_map, heading_to_level = self._build_hierarchy_map(docling_doc)

    logger.info(f"Built hierarchy map with {len(hierarchy_map)} item mappings")

    # Get actual page dimensions
    page_dims = self._get_page_dimensions(docling_doc)

    # Convert to our Document format using iterate_items()
    pages = self._convert_to_pages(
        docling_doc,
        hierarchy_map,
        heading_to_level,
        page_dims,
        file_path,
        file_hash,
        exclude_headers_footers=config.exclude_page_headers_footers,
        pptx_text_map=pptx_text_map,
    )

    # Filter pages if specific ones requested
    if page_numbers is not None:
        pages = [p for p in pages if p.page_number in page_numbers]

    # Build document
    doc = Document(
        metadata=DocumentMetadata(
            source_file=str(file_path),
            file_hash=file_hash,
            total_pages=len(pages),
        ),
        pages=pages,
    )

    # Extraction metadata
    strategy_desc = "PPTX mode (python-pptx indent map)" if is_pptx else "PDF/DOCX mode (font-size clustering)"
    meta = ExtractionMetadata(
        strategy_used="docling",
        ocr_backend_used="tesseract_cli",
        reasons=[f"Used Docling with Tesseract CLI OCR, iterate_items() for block extraction. {strategy_desc}"],
    )

    total_blocks = sum(len(p.blocks) for p in pages)
    logger.info(f"Extracted {len(pages)} pages, {total_blocks} blocks")

    return doc, meta
1341
+
1342
def _build_table_from_item(self, item, docling_doc=None) -> Optional[Table]:
    """
    Convert Docling TableItem.data into our Table schema.

    Uses table_cells with their row/col offsets; spans are computed as
    end offset minus start offset. Falls back to export_to_dataframe()
    when the direct conversion fails or yields no cells; in the fallback
    the DataFrame column names become row 0 and every cell has span 1.

    Returns:
        The Table, or None when ``item`` is not a TableItem, has no data,
        has zero rows/columns, or both conversion paths fail (failures
        are logged as warnings).
    """
    if not isinstance(item, TableItem):
        return None

    # ``data`` can be present but unset; treat that like a missing table.
    table_data = getattr(item, 'data', None)
    if table_data is None:
        return None
    if table_data.num_rows == 0 or table_data.num_cols == 0:
        return None

    try:
        cells = [
            TableCell(
                r0=dcell.start_row_offset_idx,
                c0=dcell.start_col_offset_idx,
                rspan=dcell.end_row_offset_idx - dcell.start_row_offset_idx,
                cspan=dcell.end_col_offset_idx - dcell.start_col_offset_idx,
                text=dcell.text,
            )
            for dcell in table_data.table_cells
        ]
        if cells:
            return Table(
                n_rows=table_data.num_rows,
                n_cols=table_data.num_cols,
                cells=cells,
            )
    except Exception as e:
        logger.warning(f"Direct table cell conversion failed: {e}")

    # Fallback: use export_to_dataframe()
    try:
        import pandas as pd
        df = item.export_to_dataframe(doc=docling_doc)
        if df is not None and not df.empty:
            n_rows = len(df) + 1  # +1 for header row
            n_cols = len(df.columns)
            # Header row
            cells = [
                TableCell(r0=0, c0=c_idx, rspan=1, cspan=1, text=str(col_name))
                for c_idx, col_name in enumerate(df.columns)
            ]
            # Data rows start at row 1, below the header row
            for r_idx, (_, row) in enumerate(df.iterrows(), start=1):
                for c_idx, val in enumerate(row):
                    cells.append(TableCell(
                        r0=r_idx, c0=c_idx, rspan=1, cspan=1,
                        text=str(val) if pd.notna(val) else "",
                    ))
            return Table(
                n_rows=n_rows,
                n_cols=n_cols,
                cells=cells,
            )
    except Exception as e:
        logger.warning(f"DataFrame fallback also failed: {e}")

    return None
1406
+
1407
def _convert_to_pages(
    self,
    docling_doc,
    hierarchy_map: Dict[str, List[str]],
    heading_to_level: Dict[str, int],
    page_dims: Dict[int, Tuple[float, float]],
    file_path: Path,
    file_hash: str,
    exclude_headers_footers: bool = True,
    pptx_text_map: Optional[Dict[int, Dict[str, 'PptxParaInfo']]] = None,
) -> List[Page]:
    """
    Convert Docling document to our Page format using iterate_items().

    No synthetic heading injection, no inline heading regex; page sizes
    come from ``page_dims`` (612.0 x 792.0 only when a page is missing
    there).

    Tracks TableItem children to prevent duplicate blocks.
    When pptx_text_map is provided, uses it to set indent_level on blocks,
    to drop footer text and slide-number noise, and to promote subtitles
    to heading level 3.

    Args:
        docling_doc: Docling document to walk.
        hierarchy_map: item self_ref -> heading path.
        heading_to_level: heading text -> level (-1 means demoted).
        page_dims: 0-based page index -> (width, height).
        file_path: Source file, recorded in each block's provenance.
        file_hash: Not used by this method.
        exclude_headers_footers: Drop HEADER/FOOTER blocks when True.
        pptx_text_map: 0-based slide index -> normalized text -> PptxParaInfo.

    Returns:
        Pages sorted by page number; ``order_index`` restarts at 0 on
        every page.
    """
    pages_dict: Dict[int, Page] = {}
    block_idx = 0

    # Collect all refs that belong to table cells so those items can be
    # skipped when they also appear as standalone items.
    table_child_refs: set = set()
    for item, _level in docling_doc.iterate_items():
        if not isinstance(item, TableItem):
            continue
        if not (hasattr(item, 'data') and item.data):
            continue
        for dcell in item.data.table_cells:
            if hasattr(dcell, 'ref') and dcell.ref:
                ref = getattr(dcell.ref, 'cref', getattr(dcell.ref, 'self_ref', None))
                if ref:
                    table_child_refs.add(ref)

    # iterate_items() provides (item, level) in reading order
    for item, level in docling_doc.iterate_items():
        # Skip items that are children of a table
        item_ref = getattr(item, 'self_ref', None)
        if item_ref and item_ref in table_child_refs:
            continue

        # Get page (0-based) and bbox from provenance
        page_num, bbox = self._get_item_provenance(item)

        # Determine block type using Docling's native types + heading level map
        block_type, heading_level = self._determine_block_type(item, level, heading_to_level)

        # Filter headers and footers if requested
        if exclude_headers_footers and block_type in (BlockType.HEADER, BlockType.FOOTER):
            continue

        # Get text (prefer markdown for equations to get LaTeX)
        if block_type == BlockType.EQUATION and hasattr(item, 'export_to_markdown'):
            try:
                text = item.export_to_markdown()
            except Exception:
                text = self._get_item_text(item, docling_doc)
        else:
            text = self._get_item_text(item, docling_doc)

        if not text:
            continue

        # Wrap equations with markers
        if block_type == BlockType.EQUATION:
            text = f"⟦EQUATION⟧\n{text.strip()}\n⟦/EQUATION⟧"

        # Get hierarchy path from chunker map
        hierarchy_path = hierarchy_map.get(item_ref, [])

        # Get native confidence
        item_confidence = self._get_item_confidence(item)

        # Create page if needed, with actual dimensions
        if page_num not in pages_dict:
            width, height = page_dims.get(page_num, (612.0, 792.0))
            pages_dict[page_num] = Page(
                page_number=page_num,
                width=width,
                height=height,
                blocks=[],
                profile=PageProfile(page_number=page_num),
            )

        # Build block
        table_obj = None
        if block_type == BlockType.TABLE:
            table_obj = self._build_table_from_item(item, docling_doc)
            if table_obj:
                logger.info(
                    f" Populated Block.table: {table_obj.n_rows} rows × "
                    f"{table_obj.n_cols} cols, {len(table_obj.cells)} cells"
                )

        # Determine indent_level and filter footers from PPTX text map
        indent_level = 0
        if pptx_text_map is not None:
            norm_block_text = ' '.join(text.strip().split())
            # page_num is already 0-based (see _get_item_provenance) and the
            # pptx_text_map keys are 0-based slide indices, so they line up
            # directly. Subtracting one here would match every slide after
            # the first against the previous slide's texts.
            slide_map = pptx_text_map.get(page_num, {})
            pptx_info = slide_map.get(norm_block_text)
            if pptx_info:
                if pptx_info.is_footer:
                    # Skip footer/date/slide-number content
                    continue
                indent_level = pptx_info.indent_level
                # Promote subtitle to heading level 3
                if pptx_info.is_subtitle and block_type != BlockType.HEADING:
                    block_type = BlockType.HEADING
                    heading_level = 3

            # Filter slide number patterns (e.g., "1 / 22", "12/22")
            if re.match(r'^\d+\s*/\s*\d+$', norm_block_text):
                continue
            # Filter single-character noise (common Beamer artifact)
            if len(norm_block_text) <= 1 and block_type not in (BlockType.HEADING,):
                continue

        block = Block(
            type=block_type,
            text=text,
            order_index=block_idx,
            heading_level=heading_level,
            indent_level=indent_level,
            hierarchy_path=hierarchy_path,
            provenance=Provenance(
                source_file=str(file_path),
                page_number=page_num,
                bbox=bbox,
                extractor=ExtractorType.DOCLING,
                extractor_version=self.version,
            ),
            confidence=Confidence(
                overall=item_confidence,
                text_confidence=item_confidence,
                layout_confidence=item_confidence,
            ),
            table=table_obj,
        )

        pages_dict[page_num].blocks.append(block)
        block_idx += 1

    # Sort pages by page number and reindex blocks per page
    pages = sorted(pages_dict.values(), key=lambda p: p.page_number)
    for page in pages:
        for i, block in enumerate(page.blocks):
            block.order_index = i

    return pages
1563
+
1564
def extract_page(
    self,
    file_path: Path,
    page_number: int,
    config: ProcessingConfig,
) -> Page:
    """Extract a single page.

    Runs ``extract`` restricted to ``page_number`` and returns the first
    page of the resulting document.

    Raises:
        ValueError: if the extraction yields no page.
    """
    document, _meta = self.extract(file_path, config, page_numbers=[page_number])
    if not document.pages:
        raise ValueError(f"Page {page_number} not found in {file_path}")
    return document.pages[0]
1575
+
1576
def get_hierarchy(
    self,
    file_path: Path,
    config: ProcessingConfig,
) -> List[HierarchyChunk]:
    """
    Get document hierarchy using HierarchicalChunker.

    Each chunk is reported with its heading path, a level equal to the
    length of that path, its position in chunk order, and the 0-based
    page of the first provenance entry found among its doc items
    (page 0 when none carries a ``page_no``).

    Returns list of chunks with hierarchy information.
    """
    file_path = Path(file_path)

    # Get conversion result
    result = self._run_docling(file_path, config)
    chunks = list(self._chunker.chunk(result.document))

    hierarchy_chunks = []
    for idx, chunk in enumerate(chunks):
        heading_path = []
        page_num = 0

        meta = getattr(chunk, 'meta', None)
        if meta:
            if hasattr(meta, 'headings'):
                heading_path = list(meta.headings or [])

            # Take the FIRST page_no across all doc items, so the chunk is
            # attributed to the page it starts on.
            first_page_no = next(
                (
                    prov.page_no
                    for item in getattr(meta, 'doc_items', None) or []
                    for prov in getattr(item, 'prov', None) or []
                    if hasattr(prov, 'page_no')
                ),
                None,
            )
            if first_page_no is not None:
                page_num = first_page_no - 1  # Convert to 0-based

        hierarchy_chunks.append(HierarchyChunk(
            text=chunk.text,
            heading_path=heading_path,
            level=len(heading_path),
            page_number=page_num,
            order_index=idx,
        ))

    return hierarchy_chunks
1617
+
1618
def to_markdown(self, doc: Document) -> str:
    """Convert document to Markdown.

    Every block is followed by an empty line. Rendering per block type:

    - HEADING with a truthy heading_level: ``#`` prefix, capped at 6.
    - LIST_ITEM: ``- `` bullet, prefixed by one space per indent level.
    - HEADER (page headers): text as-is.
    - Anything else: text with a backslash inserted before a ``#`` that
      is the first non-whitespace character of ANY line, so that code
      comments are not rendered as headings. HEADING blocks that reach
      this branch (no heading level) are left unescaped.
    """
    lines = []

    for page in doc.pages:
        for block in page.blocks:
            if block.type == BlockType.HEADING and block.heading_level:
                prefix = "#" * min(block.heading_level, 6)
                rendered = f"{prefix} {block.text}"
            elif block.type == BlockType.LIST_ITEM:
                indent = " " * block.indent_level
                rendered = f"{indent}- {block.text}"
            elif block.type == BlockType.HEADER:
                # Page headers (e.g. running headers)
                rendered = block.text
            else:
                rendered = block.text
                if block.type != BlockType.HEADING:
                    # Escape the leading # of every line, not only the first
                    # one, so multi-line text cannot produce headings.
                    rendered = re.sub(r'^(\s*)#', r'\1\\#', rendered, flags=re.MULTILINE)
            lines.append(rendered)
            lines.append("")

    return "\n".join(lines)
1646
+
1647
+ def _sanitize_filename(self, name: str) -> str:
1648
+ """Sanitize string for filename."""
1649
+ return "".join(c for c in name if c.isalnum() or c in ('-', '_')).strip()
1650
+
1651
def save_images(self, output_dir: Path) -> List[Path]:
    """
    Save extracted images (pages, figures, tables) as PNG files.

    Works on the conversion result stored in ``self._last_result``. A
    failure to save one image is logged as a warning and does not stop
    the remaining images; any other failure is logged as an error.

    Args:
        output_dir: Directory to save images (created if missing)

    Returns:
        List of saved image paths; empty when no conversion result exists
    """
    result = self._last_result
    if not result:
        logger.warning("No conversion result available to save images from")
        return []

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    saved_paths = []

    # (item class, file-name prefix, prefix of the random fallback name,
    #  warning text used when saving fails)
    export_specs = (
        (PictureItem, "figure", "picture", "Failed to save figure image"),
        (TableItem, "table", "table", "Failed to save table image"),
    )

    try:
        document = result.document

        # Save page images
        if hasattr(document, 'pages'):
            for page_no, page in document.pages.items():
                page_image = getattr(page, 'image', None)
                if not (page_image and hasattr(page_image, 'pil_image')):
                    continue
                target = output_dir / f"page_{page_no}.png"
                try:
                    page_image.pil_image.save(target, format="PNG")
                    saved_paths.append(target)
                except Exception as e:
                    logger.warning(f"Failed to save page image {page_no}: {e}")

        # Save figures and tables
        for element, _level in document.iterate_items():
            for item_cls, file_prefix, fallback_prefix, failure_text in export_specs:
                if not isinstance(element, item_cls):
                    continue
                try:
                    img = element.get_image(document)
                    if img:
                        safe_ref = self._sanitize_filename(element.self_ref)
                        if not safe_ref:
                            # Nothing usable left of the ref: use a random name.
                            safe_ref = f"{fallback_prefix}_{uuid.uuid4().hex[:8]}"
                        target = output_dir / f"{file_prefix}_{safe_ref}.png"
                        img.save(target, format="PNG")
                        saved_paths.append(target)
                except Exception as e:
                    logger.warning(f"{failure_text}: {e}")

    except Exception as e:
        logger.error(f"Failed to save images: {e}")

    return saved_paths
1715
+
1716
+ # ------------------------------------------------------------------
1717
+ # LaTeX-OCR helpers (PDF smart mode)
1718
+ # ------------------------------------------------------------------
1719
+
1720
+ def _find_equation_items(self, doc) -> List[tuple]:
1721
+ """Find FORMULA-labeled items. Returns [(item, page_no), ...]."""
1722
+ equation_items = []
1723
+ for item, _ in doc.iterate_items():
1724
+ label = getattr(item, "label", None)
1725
+ if label is None:
1726
+ continue
1727
+ label_str = str(label).lower()
1728
+ if "formula" in label_str or "equation" in label_str:
1729
+ # Get page number from provenance
1730
+ page_no = 1
1731
+ if hasattr(item, "prov") and item.prov:
1732
+ page_no = item.prov[0].page_no
1733
+ equation_items.append((item, page_no))
1734
+ return equation_items
1735
+
1736
+ def _merge_adjacent_formulas(self, items: List[tuple], doc) -> tuple:
1737
+ """Merge vertically adjacent FORMULA bboxes in pixel space.
1738
+
1739
+ Returns:
1740
+ merged_items: list of (item, page_no)
1741
+ union_bboxes: dict of id(item) -> (x0, y0, x1, y1) in pixels
1742
+ blank_ids: set of id(item) for leftover fragments to blank
1743
+ """
1744
+ union_bboxes: Dict[int, tuple] = {}
1745
+ blank_ids: set = set()
1746
+
1747
+ if len(items) < 2:
1748
+ return items, union_bboxes, blank_ids
1749
+
1750
+ # Convert to pixel-space, matching prov by page_no
1751
+ pixel_items = []
1752
+ for item, page_no in items:
1753
+ # Find provenance matching this page
1754
+ prov = None
1755
+ for p in getattr(item, 'prov', []):
1756
+ if getattr(p, 'page_no', None) == page_no:
1757
+ prov = p
1758
+ break
1759
+ if not prov or not prov.bbox:
1760
+ continue
1761
+
1762
+ page = doc.pages.get(page_no)
1763
+ if page is None or not hasattr(page, 'image') or page.image is None:
1764
+ continue
1765
+
1766
+ try:
1767
+ pil_img = page.image.pil_image
1768
+ img_w, img_h = pil_img.size
1769
+ page_w = page.size.width
1770
+ page_h = page.size.height
1771
+
1772
+ tl = prov.bbox.to_top_left_origin(page_h)
1773
+ sx, sy = img_w / page_w, img_h / page_h
1774
+ px_bbox = (tl.l * sx, tl.t * sy, tl.r * sx, tl.b * sy)
1775
+ pixel_items.append((item, page_no, px_bbox))
1776
+ except Exception as e:
1777
+ logger.debug(f"Skipping formula merge for item: {e}")
1778
+ continue
1779
+
1780
+ if len(pixel_items) < 2:
1781
+ return items, union_bboxes, blank_ids
1782
+
1783
+ GAP_PX = 20
1784
+ H_OVERLAP_MIN = 0.3
1785
+ OVERLAP_Y_ALLOW = 5
1786
+
1787
+ groups = [[pixel_items[0]]]
1788
+ for entry in pixel_items[1:]:
1789
+ _, pg, (x0, y0, x1, y1) = entry
1790
+ prev = groups[-1][-1]
1791
+ _, prev_pg, (px0, py0, px1, py1) = prev
1792
+
1793
+ # Directed gap (y increases downward in pixel space)
1794
+ gap = y0 - py1
1795
+ if pg != prev_pg or gap < -OVERLAP_Y_ALLOW or gap > GAP_PX:
1796
+ groups.append([entry])
1797
+ continue
1798
+
1799
+ h_overlap = max(0, min(x1, px1) - max(x0, px0))
1800
+ h_extent = max(x1, px1) - min(x0, px0)
1801
+ if h_extent > 0 and h_overlap / h_extent >= H_OVERLAP_MIN:
1802
+ groups[-1].append(entry)
1803
+ else:
1804
+ groups.append([entry])
1805
+
1806
+ merged = []
1807
+ for group in groups:
1808
+ anchor_item, anchor_pg, _ = group[0]
1809
+ if len(group) == 1:
1810
+ merged.append((anchor_item, anchor_pg))
1811
+ else:
1812
+ # Compute union bbox in pixel space
1813
+ ux0 = min(e[2][0] for e in group)
1814
+ uy0 = min(e[2][1] for e in group)
1815
+ ux1 = max(e[2][2] for e in group)
1816
+ uy1 = max(e[2][3] for e in group)
1817
+ union_bboxes[id(anchor_item)] = (ux0, uy0, ux1, uy1)
1818
+ merged.append((anchor_item, anchor_pg))
1819
+ # Mark non-anchor items for blanking
1820
+ for e in group[1:]:
1821
+ blank_ids.add(id(e[0]))
1822
+
1823
+ return merged, union_bboxes, blank_ids
1824
+
1825
+ def _crop_equation_bbox(self, doc, item, page_no: int,
1826
+ union_bboxes: Dict[int, tuple] = None):
1827
+ """Crop equation image from page. Returns PIL Image or None."""
1828
+ page = doc.pages.get(page_no)
1829
+ if page is None or not hasattr(page, 'image') or page.image is None:
1830
+ return None
1831
+
1832
+ try:
1833
+ pil_img = page.image.pil_image
1834
+ img_w, img_h = pil_img.size
1835
+ except Exception:
1836
+ return None
1837
+
1838
+ # Check for merged union bbox first
1839
+ if union_bboxes and id(item) in union_bboxes:
1840
+ x0, y0, x1, y1 = union_bboxes[id(item)]
1841
+ else:
1842
+ # Standard provenance → pixel transform
1843
+ prov = None
1844
+ for p in getattr(item, 'prov', []):
1845
+ if getattr(p, 'page_no', None) == page_no:
1846
+ prov = p
1847
+ break
1848
+ if not prov or not prov.bbox:
1849
+ return None
1850
+
1851
+ page_w = page.size.width
1852
+ page_h = page.size.height
1853
+
1854
+ tl = prov.bbox.to_top_left_origin(page_h)
1855
+ sx, sy = img_w / page_w, img_h / page_h
1856
+ x0, y0 = tl.l * sx, tl.t * sy
1857
+ x1, y1 = tl.r * sx, tl.b * sy
1858
+
1859
+ # Rotation/sanity: coords must be within image
1860
+ if x0 < 0 or y0 < 0 or x1 > img_w or y1 > img_h:
1861
+ logger.debug(f"BBox outside image bounds on page {page_no}, skipping")
1862
+ return None
1863
+
1864
+ # 15% padding + clamp
1865
+ pad_x = (x1 - x0) * 0.15
1866
+ pad_y = (y1 - y0) * 0.15
1867
+ x0, y0 = max(0, x0 - pad_x), max(0, y0 - pad_y)
1868
+ x1, y1 = min(img_w, x1 + pad_x), min(img_h, y1 + pad_y)
1869
+
1870
+ # Minimum 64px crop
1871
+ if (x1 - x0) < 64 or (y1 - y0) < 64:
1872
+ logger.debug(f"Crop too small ({x1-x0:.0f}×{y1-y0:.0f}px) on page {page_no}")
1873
+ return None
1874
+
1875
+ return pil_img.crop((int(x0), int(y0), int(x1), int(y1)))
1876
+
1877
+ # ------------------------------------------------------------------
1878
+ # DOCX/PPTX equation extraction
1879
+ # ------------------------------------------------------------------
1880
+
1881
def _extract_docx_equations(self, file_path: Path) -> List[str]:
    """Return the equations of a DOCX file as normalized LaTeX strings.

    Relies on the optional ``docxlatex`` package (imported lazily) and on
    its ``Document.get_equations()`` call. Whitespace-only results are
    dropped; the rest go through ``self._normalize_latex``.

    Best effort: a missing dependency or any extraction error is logged as
    a warning and yields an empty list.
    """
    try:
        from docxlatex import Document as DocxLatexDoc

        raw_equations = DocxLatexDoc(str(file_path)).get_equations()
        cleaned = []
        for equation in raw_equations:
            if equation.strip():
                cleaned.append(self._normalize_latex(equation))
        return cleaned
    except ImportError:
        logger.warning("docxlatex not installed. Skipping DOCX equation extraction.")
        return []
    except Exception as e:
        logger.warning(f"DOCX equation extraction failed: {e}")
        return []
1894
+
1895
def _extract_pptx_equations(self, file_path: Path) -> List[str]:
    """Scan PPTX slide XML for <m:oMath> nodes.

    For every ``oMath`` element the text of its ``m:t`` descendants is
    joined with spaces and passed through ``self._normalize_latex``.
    Slides are visited in numeric order (slide2 before slide10).

    The archive is treated as untrusted input: the number of zip entries,
    the number of slides, the per-slide and total uncompressed size, and
    the compression ratio are all capped, and XML is parsed with
    ``defusedxml``.

    Returns:
        Equation strings in slide order. Empty when ``defusedxml`` is not
        installed, the archive exceeds the entry cap, or it cannot be
        opened. Failures are logged, never raised.
    """
    import zipfile
    try:
        import defusedxml.ElementTree as ET
    except ImportError:
        logger.warning("defusedxml not installed. Skipping PPTX equation extraction.")
        return []

    equations = []
    MATH_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math"
    MAX_SLIDES = 50
    MAX_BYTES_PER_SLIDE = 10 * 1024 * 1024
    MAX_TOTAL_BYTES = 100 * 1024 * 1024
    MAX_COMPRESSION_RATIO = 100
    MAX_ENTRIES = 500

    omath_tag = f"{{{MATH_NS}}}oMath"
    text_tag = f"{{{MATH_NS}}}t"

    def slide_order(name):
        # Sort "slide<N>.xml" by N; a plain string sort would put slide10
        # before slide2. Names without a number go last, alphabetically.
        match = re.search(r"slide(\d+)\.xml$", name)
        if match:
            return (0, int(match.group(1)), name)
        return (1, 0, name)

    try:
        with zipfile.ZipFile(str(file_path)) as z:
            # Zip entry count cap
            entry_count = len(z.infolist())
            if entry_count > MAX_ENTRIES:
                logger.warning(f"PPTX has {entry_count} entries (>{MAX_ENTRIES}). Skipping.")
                return []

            slide_files = sorted(
                (
                    n for n in z.namelist()
                    if n.startswith("ppt/slides/slide") and n.endswith(".xml")
                ),
                key=slide_order,
            )[:MAX_SLIDES]

            total_bytes = 0
            for name in slide_files:
                info = z.getinfo(name)
                if info.file_size > MAX_BYTES_PER_SLIDE:
                    logger.debug(f"Skipping {name}: too large")
                    continue
                if info.compress_size > 0 and info.file_size / info.compress_size > MAX_COMPRESSION_RATIO:
                    logger.debug(f"Skipping {name}: suspicious compression ratio")
                    continue
                total_bytes += info.file_size
                if total_bytes > MAX_TOTAL_BYTES:
                    logger.warning("PPTX total uncompressed bytes exceeded cap")
                    break

                try:
                    # Close the entry handle as soon as the tree is built
                    with z.open(name) as slide_xml:
                        tree = ET.parse(slide_xml)
                    for omath in tree.iter(omath_tag):
                        # Extract text content from m:t elements
                        texts = [t.text for t in omath.iter(text_tag) if t.text]
                        if texts:
                            equations.append(self._normalize_latex(" ".join(texts)))
                except Exception as e:
                    # One unreadable slide must not abort the others
                    logger.debug(f"Failed to parse {name}: {e}")
    except Exception as e:
        logger.warning(f"PPTX equation extraction failed: {e}")

    return equations
1955
+
1956
+ def _normalize_latex(self, latex: str) -> str:
1957
+ """Fix whitespace artifacts in converted LaTeX."""
1958
+ if not latex:
1959
+ return latex
1960
+ # Collapse broken control sequences: \f r a c → \frac
1961
+ prev = None
1962
+ while prev != latex:
1963
+ prev = latex
1964
+ latex = re.sub(r'\\([a-zA-Z]+)\s+([a-zA-Z])', r'\\\1\2', latex)
1965
+ # Trim repeated spaces
1966
+ latex = re.sub(r'\s+', ' ', latex).strip()
1967
+ return latex
1968
+
1969
+ def _normalize_unicode_math(self, text: str) -> str:
1970
+ """
1971
+ Convert Unicode math symbols to LaTeX-lite notation.
1972
+ Only applies if text does NOT look like it already has LaTeX formatting.
1973
+ """
1974
+ if not text:
1975
+ return text
1976
+
1977
+ # Scope check: Don't touch if it looks like LaTeX already
1978
+ if "$" in text or "\\" in text:
1979
+ return text
1980
+
1981
+ # Common math symbols
1982
+ replacements = [
1983
+ (r"²", "^2"), (r"³", "^3"),
1984
+ (r"₁", "_1"), (r"₂", "_2"), (r"₃", "_3"), (r"ᵢ", "_i"), (r"ⱼ", "_j"), (r"ₙ", "_n"),
1985
+ (r"∑", r"\\sum"), (r"∫", r"\\int"), (r"∞", r"\\infty"),
1986
+ (r"√", r"\\sqrt"), (r"∂", r"\\partial"), (r"∇", r"\\nabla"),
1987
+ (r"≈", r"\\approx"), (r"≠", r"\\neq"), (r"≤", r"\\leq"), (r"≥", r"\\geq"),
1988
+ (r"α", r"\\alpha"), (r"β", r"\\beta"), (r"γ", r"\\gamma"), (r"θ", r"\\theta"),
1989
+ (r"π", r"\\pi"), (r"µ", r"\\mu"), (r"σ", r"\\sigma"), (r"Ω", r"\\Omega"),
1990
+ (r"∈", r"\\in"), (r"∀", r"\\forall"), (r"∃", r"\\exists"),
1991
+ (r"→", r"\\to"), (r"⇒", r"\\implies"), (r"±", r"\\pm"),
1992
+ ]
1993
+
1994
+ normalized = text
1995
+ for char, latex in replacements:
1996
+ normalized = normalized.replace(char, latex)
1997
+
1998
+ return normalized
1999
+
2000
+ def _detect_math_heavy_pages(self, doc, threshold: int = 3) -> List[int]:
2001
+ """
2002
+ Identify pages that contain significant math content.
2003
+ Returns a list of 1-based page numbers.
2004
+ """
2005
+ math_pages = set()
2006
+ math_symbols = set("∑∫√∂∇≈≤≥∞αβγθπµσΩ∈∀∃⇒±")
2007
+
2008
+ # Efficient pass: Iterate all items once
2009
+ page_math_scores = {} # page_no -> score
2010
+
2011
+ for item, _ in doc.iterate_items():
2012
+ # Get page number (1-based)
2013
+ page_no = 1
2014
+ if hasattr(item, "prov") and item.prov:
2015
+ # prov is a list of Provenance items
2016
+ page_no = item.prov[0].page_no
2017
+
2018
+ # Check for Formula label
2019
+ # Docling label enum or string: "formula", "equation"
2020
+ label = getattr(item, "label", "").lower() if hasattr(item, "label") else ""
2021
+ if "formula" in label or "equation" in label:
2022
+ page_math_scores[page_no] = page_math_scores.get(page_no, 0) + 10 # High score for explicit label
2023
+
2024
+ # Check text content
2025
+ text = getattr(item, "text", "")
2026
+ if text:
2027
+ # Unicode density check
2028
+ symbol_count = sum(1 for char in text if char in math_symbols)
2029
+
2030
+ # Superscript/Subscript check
2031
+ # ranges: super (²³¹⁰...): \u00B2, \u00B3, \u00B9, \u2070-\u207F
2032
+ # sub (₀₁...): \u2080-\u209C
2033
+ sub_super_count = 0
2034
+ for char in text:
2035
+ code = ord(char)
2036
+ if (0x2070 <= code <= 0x207F) or (0x2080 <= code <= 0x209C) or code in [0xB2, 0xB3, 0xB9]:
2037
+ sub_super_count += 1
2038
+
2039
+ page_math_scores[page_no] = page_math_scores.get(page_no, 0) + symbol_count + (sub_super_count * 0.5)
2040
+
2041
+ # Filter pages exceeding threshold
2042
+ for page_no, score in page_math_scores.items():
2043
+ if score >= threshold:
2044
+ math_pages.add(page_no)
2045
+
2046
+ return sorted(list(math_pages))
2047
+
2048
+ def _is_enriched_page_valid(self, doc, page_no: int) -> bool:
2049
+ """
2050
+ Check if an enriched page has valid output (detect garbled text).
2051
+ """
2052
+ # Get text for specific page from the doc
2053
+ page_text = ""
2054
+ # iterate_items(page_no) is supported
2055
+ for item, _ in doc.iterate_items(page_no=page_no):
2056
+ page_text += getattr(item, "text", "") + " "
2057
+
2058
+ if not page_text.strip():
2059
+ return True # Empty page is "valid" in the sense of not garbled
2060
+
2061
+ # Check for garble markers
2062
+ if "/C0" in page_text or "/C1" in page_text:
2063
+ return False
2064
+
2065
+ return True