longparser 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- longparser/__init__.py +104 -0
- longparser/chunkers/__init__.py +5 -0
- longparser/chunkers/hybrid_chunker.py +1046 -0
- longparser/extractors/__init__.py +9 -0
- longparser/extractors/base.py +62 -0
- longparser/extractors/docling_extractor.py +2065 -0
- longparser/extractors/latex_ocr.py +404 -0
- longparser/integrations/__init__.py +31 -0
- longparser/integrations/langchain.py +138 -0
- longparser/integrations/llamaindex.py +157 -0
- longparser/pipeline/__init__.py +8 -0
- longparser/pipeline/orchestrator.py +230 -0
- longparser/py.typed +0 -0
- longparser/schemas.py +247 -0
- longparser/server/__init__.py +22 -0
- longparser/server/app.py +1045 -0
- longparser/server/chat/__init__.py +39 -0
- longparser/server/chat/callbacks.py +110 -0
- longparser/server/chat/engine.py +341 -0
- longparser/server/chat/graph.py +176 -0
- longparser/server/chat/llm_chain.py +153 -0
- longparser/server/chat/retriever.py +111 -0
- longparser/server/chat/schemas.py +164 -0
- longparser/server/db.py +656 -0
- longparser/server/embeddings.py +181 -0
- longparser/server/queue.py +97 -0
- longparser/server/routers/__init__.py +0 -0
- longparser/server/schemas.py +204 -0
- longparser/server/vectorstores.py +443 -0
- longparser/server/worker.py +480 -0
- longparser/utils/__init__.py +5 -0
- longparser/utils/rtl_detector.py +93 -0
- longparser-0.1.0.dist-info/METADATA +337 -0
- longparser-0.1.0.dist-info/RECORD +36 -0
- longparser-0.1.0.dist-info/WHEEL +5 -0
- longparser-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,2065 @@
|
|
|
1
|
+
"""Docling-based document extractor with Tesseract CLI OCR and HierarchicalChunker.
|
|
2
|
+
|
|
3
|
+
Uses:
|
|
4
|
+
- Tesseract CLI for OCR
|
|
5
|
+
- Layout analysis always enabled
|
|
6
|
+
- TableFormer for table structure
|
|
7
|
+
- HierarchicalChunker for heading hierarchy
|
|
8
|
+
- iterate_items() for reading-order block extraction
|
|
9
|
+
|
|
10
|
+
No hardcoded heuristics — relies entirely on Docling's native capabilities.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Optional, Tuple, List, Dict
|
|
15
|
+
import os
|
|
16
|
+
import time
|
|
17
|
+
import logging
|
|
18
|
+
import hashlib
|
|
19
|
+
import uuid
|
|
20
|
+
import re
|
|
21
|
+
from dataclasses import dataclass
|
|
22
|
+
from docling.datamodel.pipeline_options import (
|
|
23
|
+
PdfPipelineOptions,
|
|
24
|
+
TesseractCliOcrOptions,
|
|
25
|
+
)
|
|
26
|
+
from docling.datamodel.base_models import InputFormat
|
|
27
|
+
from docling.document_converter import (
|
|
28
|
+
DocumentConverter,
|
|
29
|
+
PdfFormatOption,
|
|
30
|
+
WordFormatOption,
|
|
31
|
+
PowerpointFormatOption,
|
|
32
|
+
ExcelFormatOption,
|
|
33
|
+
CsvFormatOption,
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
from docling_core.transforms.chunker import HierarchicalChunker
|
|
37
|
+
from docling_core.types.doc import (
|
|
38
|
+
SectionHeaderItem,
|
|
39
|
+
TableItem,
|
|
40
|
+
PictureItem,
|
|
41
|
+
TextItem,
|
|
42
|
+
ListItem,
|
|
43
|
+
DocItemLabel,
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
# TitleItem is used by Docling for PPTX slide titles (not SectionHeaderItem)
|
|
47
|
+
try:
|
|
48
|
+
from docling_core.types.doc import TitleItem
|
|
49
|
+
except ImportError:
|
|
50
|
+
TitleItem = None # Fallback for older docling versions
|
|
51
|
+
|
|
52
|
+
from ..schemas import (
|
|
53
|
+
Document, Page, Block, Table, TableCell,
|
|
54
|
+
BlockType, ExtractorType, ProcessingConfig,
|
|
55
|
+
BoundingBox, Provenance, Confidence, BlockFlags,
|
|
56
|
+
DocumentMetadata, PageProfile, ExtractionMetadata,
|
|
57
|
+
)
|
|
58
|
+
from .base import BaseExtractor
|
|
59
|
+
|
|
60
|
+
logger = logging.getLogger(__name__)
|
|
61
|
+
|
|
62
|
+
# Pattern to detect structured leading markers in headings.
# Group 1 captures a run that starts with an ASCII letter or digit and
# continues with letters, digits, '.', '(' or ')'. The run must be followed
# by a '.' or a whitespace character; any further whitespace is consumed.
# Examples of captured markers: "I.", "II.", "A.", "1.", "2.3", "IV", "a)".
# NOTE: an ordinary leading word followed by a space is captured as well;
# callers are expected to classify the captured marker afterwards.
_MARKER_RE = re.compile(r'^([A-Za-z0-9][A-Za-z0-9.()]*)[.\s]\s*')

# Pattern used to detect garbled math in paragraph blocks.
# Matches either one of the listed math symbols / Greek letters, or a simple
# "<letter> = <letter or digit>" assignment. Matching is case-insensitive.
_MATH_RE = re.compile(
    r'[\u2211\u220F\u222B\u221A\u00B1\u2264\u2265\u2248\u2260\u03B1-\u03C9\u03A3]'
    r'|[a-z]\s*=\s*[a-z0-9]',
    re.IGNORECASE,
)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _iou_px(a: dict, b: dict) -> float:
|
|
76
|
+
"""Compute IoU between two pixel-space bbox dicts {x0,y0,x1,y1}."""
|
|
77
|
+
xi0, yi0 = max(a["x0"], b["x0"]), max(a["y0"], b["y0"])
|
|
78
|
+
xi1, yi1 = min(a["x1"], b["x1"]), min(a["y1"], b["y1"])
|
|
79
|
+
inter = max(0, xi1 - xi0) * max(0, yi1 - yi0)
|
|
80
|
+
ua = (a["x1"] - a["x0"]) * (a["y1"] - a["y0"])
|
|
81
|
+
ub = (b["x1"] - b["x0"]) * (b["y1"] - b["y0"])
|
|
82
|
+
union = ua + ub - inter
|
|
83
|
+
return inter / union if union > 0 else 0.0
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _is_mfd_candidate(page_no: int, page_blocks, docling_formula_count: int) -> bool:
|
|
87
|
+
"""Return True if MFD should scan this page.
|
|
88
|
+
|
|
89
|
+
Runs MFD if Docling found few/no formulas OR at least one non-equation
|
|
90
|
+
block on this page contains garbled math Unicode.
|
|
91
|
+
"""
|
|
92
|
+
if docling_formula_count > 3:
|
|
93
|
+
return False # Docling handled it well; trust it
|
|
94
|
+
garbled = any(
|
|
95
|
+
_MATH_RE.search(b.text)
|
|
96
|
+
for b in page_blocks
|
|
97
|
+
if getattr(b, "type", None) is not None and str(b.type) != "equation"
|
|
98
|
+
)
|
|
99
|
+
return docling_formula_count == 0 or garbled
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
@dataclass
|
|
103
|
+
class _HeadingInfo:
|
|
104
|
+
"""Internal heading tracking."""
|
|
105
|
+
text: str
|
|
106
|
+
level: int
|
|
107
|
+
hierarchy_path: List[str]
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
@dataclass
class PptxParaInfo:
    """Per-paragraph facts read directly from python-pptx."""

    # Indent level taken from paragraph.level (0-8).
    indent_level: int
    # True for TITLE / CENTER_TITLE placeholders.
    is_title: bool
    # True for SUBTITLE placeholders.
    is_subtitle: bool
    # True if Docling would treat the paragraph as a list item.
    is_list: bool
    # One of 'Bullet', 'Numbered', 'None'.
    bullet_type: str
    # True for DATE / FOOTER / SLIDE_NUMBER placeholders.
    is_footer: bool = False
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
@dataclass
class HierarchyChunk:
    """A text chunk together with its position in the heading hierarchy."""

    # Chunk text.
    text: str
    # Heading path for the chunk, as a list of strings.
    heading_path: List[str]
    # Hierarchy level of the chunk.
    level: int
    # Page number of the chunk.
    page_number: int
    # Order index of the chunk.
    order_index: int
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
class DoclingExtractor(BaseExtractor):
|
|
132
|
+
"""
|
|
133
|
+
Document extractor using Docling with Tesseract CLI OCR.
|
|
134
|
+
|
|
135
|
+
Relies entirely on Docling's native APIs:
|
|
136
|
+
- iterate_items() for reading-order traversal with hierarchy level
|
|
137
|
+
- SectionHeaderItem / TextItem / TableItem / ListItem / PictureItem for type detection
|
|
138
|
+
- item.label (DocItemLabel) for fine-grained classification
|
|
139
|
+
- item.prov for page number and bounding box
|
|
140
|
+
- page.size for actual page dimensions
|
|
141
|
+
- HierarchicalChunker for heading hierarchy paths
|
|
142
|
+
|
|
143
|
+
Heading hierarchy is inferred autonomously from:
|
|
144
|
+
1. Pattern Priority (Numbered vs Unnumbered)
|
|
145
|
+
2. Position Awareness (Late Arrival Rule)
|
|
146
|
+
3. Font-size clustering
|
|
147
|
+
|
|
148
|
+
No hardcoded numbering conventions.
|
|
149
|
+
"""
|
|
150
|
+
|
|
151
|
+
extractor_type = ExtractorType.DOCLING
|
|
152
|
+
version = "3.0.0"
|
|
153
|
+
|
|
154
|
+
def __init__(self, tesseract_lang: List[str] = None, tessdata_path: str = None, force_full_page_ocr: bool = False):
|
|
155
|
+
"""
|
|
156
|
+
Initialize Docling extractor.
|
|
157
|
+
|
|
158
|
+
Args:
|
|
159
|
+
tesseract_lang: Languages for Tesseract OCR (default: ["eng"])
|
|
160
|
+
tessdata_path: Path to tessdata directory with language models and configs.
|
|
161
|
+
If None, uses system default.
|
|
162
|
+
force_full_page_ocr: If True, OCR entire page even if embedded text exists.
|
|
163
|
+
Required for PDFs with broken Unicode mapping.
|
|
164
|
+
"""
|
|
165
|
+
self._converter = None
|
|
166
|
+
self._chunker = None
|
|
167
|
+
self._initialized = False
|
|
168
|
+
self._languages = tesseract_lang or ["eng"]
|
|
169
|
+
self._tessdata_dir = tessdata_path
|
|
170
|
+
self._force_full_page_ocr = force_full_page_ocr
|
|
171
|
+
|
|
172
|
+
def _create_converter(self, config: ProcessingConfig, formula_enrichment: Optional[bool] = None) -> DocumentConverter:
    """Create a DocumentConverter with Tesseract CLI OCR.

    Args:
        config: Processing options. Reads ``do_ocr``, ``do_table_structure``,
            ``formula_ocr``, ``formula_mode``, ``export_images`` and
            ``force_full_page_ocr``.
        formula_enrichment: Explicit override for Docling's formula
            enrichment. When None, enrichment is enabled only if
            ``config.formula_ocr`` is truthy and ``config.formula_mode`` is
            "full" (the "fast" and "smart" modes start without it).

    Returns:
        A converter in which PDF, DOCX, PPTX, XLSX and CSV all share the
        same pipeline options.
    """
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = config.do_ocr
    pipeline_options.do_table_structure = config.do_table_structure

    # Formula enrichment is decided independently of do_ocr.
    if formula_enrichment is not None:
        pipeline_options.do_formula_enrichment = formula_enrichment
    else:
        pipeline_options.do_formula_enrichment = (
            bool(config.formula_ocr) and config.formula_mode == "full"
        )

    # Page images are always rendered; picture export follows the config.
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = config.export_images
    pipeline_options.images_scale = 2.0

    # Honour the constructor's force_full_page_ocr flag as well as the
    # per-call config, so that either one being set forces full-page OCR.
    force_full_page = bool(config.force_full_page_ocr or self._force_full_page_ocr)

    pipeline_options.ocr_options = TesseractCliOcrOptions(
        lang=self._languages,
        tesseract_cmd="tesseract",
        path=self._tessdata_dir,
        force_full_page_ocr=force_full_page,
    )

    return DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
            InputFormat.DOCX: WordFormatOption(pipeline_options=pipeline_options),
            InputFormat.PPTX: PowerpointFormatOption(pipeline_options=pipeline_options),
            InputFormat.XLSX: ExcelFormatOption(pipeline_options=pipeline_options),
            InputFormat.CSV: CsvFormatOption(pipeline_options=pipeline_options),
        }
    )
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _run_docling(self, file_path: Path, config: ProcessingConfig):
    """Convert ``file_path`` with Docling and post-process formula text.

    Non-PDF inputs are converted directly. For DOCX/PPTX (unless
    ``config.formula_mode`` is "fast") the equations returned by
    ``_extract_docx_equations`` / ``_extract_pptx_equations`` are substituted,
    in order, into the formula-labelled items of the Docling output.

    PDF inputs are converted once and then handled by ``config.formula_mode``:

    * "fast": every text item goes through ``_normalize_unicode_math``.
    * "smart": formula items are cropped and sent to LaTeX-OCR, an optional
      MFD pass looks for equations Docling missed, and the remaining text is
      normalized. Documents with more than 100 pages only get the
      normalization.
    * anything else: the conversion result is returned untouched.

    Args:
        file_path: Document to convert.
        config: Processing options (formula mode and the smart-mode limits
            ``smart_max_equations`` / ``smart_max_ocr_seconds``).

    Returns:
        The object returned by the converter's ``convert`` call; the Docling
        document is available as its ``document`` attribute.

    Raises:
        Exception: Anything raised during conversion or post-processing is
            logged and re-raised unchanged.
    """
    # The converter is created on the first call, from that call's config,
    # and reused afterwards.
    if not self._initialized:
        logger.info("Initializing Docling pipeline...")
        self._converter = self._create_converter(config)
        self._chunker = HierarchicalChunker()
        self._initialized = True
        logger.info(f"Docling pipeline initialized (formula_mode={config.formula_mode})")

    file_path = Path(file_path)

    logger.info(f"Extracting with Docling: {file_path.name}")

    formula_labels = {"formula", "equation"}

    def _label_of(item) -> str:
        """Lower-cased string form of an item's label ('' when absent)."""
        return str(getattr(item, "label", "")).lower()

    def _normalize_texts(document, skip_display_math=False, skip_ids=frozenset()):
        """Apply Unicode-math normalization to every string-valued item text."""
        for item, _ in document.iterate_items():
            text = getattr(item, "text", None)
            if not isinstance(text, str):
                continue
            if id(item) in skip_ids:
                continue
            if skip_display_math and text.startswith("$$"):
                continue
            item.text = self._normalize_unicode_math(text)

    def _boxes_px(item, page_no, page_h, sx, sy):
        """Pixel-space boxes of the item's provenance entries on ``page_no``."""
        boxes = []
        for prov in getattr(item, "prov", None) or []:
            if getattr(prov, "page_no", None) != page_no:
                continue
            bbox = getattr(prov, "bbox", None)
            if bbox is None:
                continue
            tl = bbox.to_top_left_origin(page_h)
            boxes.append({
                "x0": int(tl.l * sx), "y0": int(tl.t * sy),
                "x1": int(tl.r * sx), "y1": int(tl.b * sy),
            })
        return boxes

    def _load_latex_ocr():
        """Build the LaTeX-OCR engine, or return None if it cannot be imported."""
        backend = os.getenv("LONGPARSER_LATEX_OCR_BACKEND", "pix2tex")
        try:
            from .latex_ocr import LaTeXOCR
            return LaTeXOCR(backend=backend)
        except ImportError:
            logger.warning("latex_ocr module not available. Skipping formula OCR.")
            return None

    try:
        # Powerpoint/Excel/Word/CSV use standard conversion
        ext = file_path.suffix.lower()
        if ext != ".pdf":
            result = self._converter.convert(str(file_path))

            # DOCX/PPTX: inject OMML equations as LaTeX
            if ext in (".docx", ".pptx") and config.formula_mode != "fast":
                if ext == ".docx":
                    latex_eqs = self._extract_docx_equations(file_path)
                else:
                    latex_eqs = self._extract_pptx_equations(file_path)

                if latex_eqs:
                    # Formula blocks in Docling output, in reading order.
                    formula_blocks = [
                        item for item, _ in result.document.iterate_items()
                        if any(tag in _label_of(item) for tag in ("formula", "equation"))
                    ]

                    # Order-based substitution with alignment gate.
                    injected = 0
                    for block, latex in zip(formula_blocks, latex_eqs):
                        orig_len = len(block.text.strip()) if block.text else 0
                        latex_len = len(latex.strip())

                        # Asymmetric gate: always accept when Docling's text
                        # is empty/garbled; otherwise require the two lengths
                        # to be within a factor of five of each other.
                        docling_text_unusable = orig_len < 3 and latex_len > 3
                        lengths_comparable = (
                            latex_len > 0
                            and 0.2 <= (orig_len + 5) / (latex_len + 5) <= 5.0
                        )
                        if docling_text_unusable or lengths_comparable:
                            block.text = f"$${latex}$$"
                            injected += 1
                        else:
                            logger.debug("Skipping equation inject: ratio out of range")

                    if len(formula_blocks) != len(latex_eqs):
                        logger.warning(
                            f"{ext.upper()} equation count mismatch: "
                            f"extracted={len(latex_eqs)}, docling={len(formula_blocks)}. "
                            f"Injected {injected}."
                        )
                    else:
                        logger.info(f"Injected {injected} LaTeX equations from {ext.upper()}")

            return result

        # --- PDF Handling with Smart Formula Mode ---

        # Pass 1: standard conversion with whatever enrichment setting the
        # converter was created with.
        result = self._converter.convert(str(file_path))
        document = result.document

        # If not smart mode, we are done (Full or Fast)
        if config.formula_mode != "smart":
            if config.formula_mode == "fast":
                # Make Unicode math nicer; no OCR in this mode.
                _normalize_texts(document)
            return result

        # --- Smart Mode: BBox Crop → LaTeX-OCR ---

        num_pages = len(document.pages)

        # Page cap: if PDF is huge, fallback to fast
        if num_pages > 100:
            logger.info(f"Smart mode disabled: {num_pages} pages exceeds cap (100). "
                        "Falling back to Unicode normalization only.")
            _normalize_texts(document)
            return result

        # Find equation items (FORMULA-labeled blocks only)
        equation_items = self._find_equation_items(document)

        # Both names are read by the MFD fallback below, so they must exist
        # even when Docling found no formula items.
        ocr = None
        processed = 0

        if equation_items:
            # Merge adjacent formula fragments
            merged_items, union_bboxes, blank_ids = self._merge_adjacent_formulas(
                equation_items, document
            )

            ocr = _load_latex_ocr()

            if ocr and ocr.available:
                t0 = time.monotonic()
                # Per-equation timeout: cap each pix2tex call to prevent one slow eq from blocking
                per_eq_timeout = float(os.getenv("LONGPARSER_FORMULA_PER_EQ_TIMEOUT", "30"))
                from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeout
                # NOTE: with a single worker, a call that timed out keeps the
                # worker busy, so later submissions wait behind it.
                executor = ThreadPoolExecutor(max_workers=1)

                try:
                    for item, page_no in merged_items:
                        # Circuit breaker: equation count
                        if processed >= config.smart_max_equations:
                            logger.info(f"Circuit breaker: {processed} equations reached limit")
                            break
                        # Circuit breaker: time budget
                        if time.monotonic() - t0 > config.smart_max_ocr_seconds:
                            logger.info(f"Circuit breaker: OCR time budget exceeded "
                                        f"({time.monotonic() - t0:.1f}s > {config.smart_max_ocr_seconds}s)")
                            break

                        crop = self._crop_equation_bbox(
                            document, item, page_no, union_bboxes
                        )
                        if crop is None:
                            continue

                        # Run pix2tex with per-equation timeout
                        try:
                            future = executor.submit(ocr.recognize, crop)
                            latex = future.result(timeout=per_eq_timeout)
                        except FuturesTimeout:
                            logger.info(f"Equation OCR timed out after {per_eq_timeout}s, skipping")
                            continue
                        except Exception as e:
                            logger.debug(f"Equation OCR error: {e}")
                            continue

                        if latex:
                            item.text = f"$${latex}$$"
                            processed += 1
                finally:
                    # Release the pool even if cropping raised mid-loop.
                    executor.shutdown(wait=False)

                # Blank leftover fragments (merged items whose text was absorbed)
                for item, _ in document.iterate_items():
                    if id(item) in blank_ids:
                        item.text = ""

                logger.info(f"Smart mode: OCR'd {processed} equations in "
                            f"{time.monotonic() - t0:.2f}s")
            else:
                logger.info("LaTeX-OCR not available. Using Unicode normalization only.")
        else:
            logger.info("Smart mode: No FORMULA blocks detected by Docling.")

        # ── MFD fallback: scan candidate pages for missed equations ───────────
        try:
            from .latex_ocr import MFDBackend
            mfd = MFDBackend.get()
        except Exception as e:
            # Best effort: the fallback is optional.
            logger.debug(f"MFD backend not available: {e}")
            mfd = None

        # With no Docling formulas the OCR engine has not been requested yet,
        # but that is the case in which the fallback is most useful.
        if mfd and mfd.available and ocr is None and not equation_items:
            ocr = _load_latex_ocr()

        # Items whose text was replaced by LaTeX during the fallback; they
        # are excluded from the final normalization (inline "$...$" output
        # would not be caught by the "$$" prefix check).
        latex_item_ids = set()

        if mfd and mfd.available and ocr and ocr.available:
            t0_mfd = time.monotonic()
            max_ocr_secs = config.smart_max_ocr_seconds

            # iterate_items() yields (item, level), so page membership is
            # taken from each item's provenance entries instead. The index
            # is built once and reused for every page and every box.
            items_by_page = {}
            for item, _ in document.iterate_items():
                item_pages = {
                    getattr(prov, "page_no", None)
                    for prov in getattr(item, "prov", None) or []
                }
                for pno in item_pages:
                    if pno is not None:
                        items_by_page.setdefault(pno, []).append(item)

            for page_no, page_obj in document.pages.items():
                # Budget gate: skip MFD if <60% of budget remains
                elapsed = time.monotonic() - t0_mfd
                if elapsed > max_ocr_secs * 0.4:
                    logger.info("MFD: time budget low, stopping page scan")
                    break

                page_items = items_by_page.get(page_no, [])

                # Count Docling formulas on this page for candidate gating
                docling_formula_count = sum(
                    1 for it in page_items if _label_of(it) in formula_labels
                )

                # Minimal stand-ins exposing .text and .type for _is_mfd_candidate
                page_proxy = [
                    type("_P", (), {"text": it.text, "type": str(getattr(it, "label", ""))})  # type: ignore
                    for it in page_items
                    if getattr(it, "text", None)
                ]

                if not _is_mfd_candidate(page_no, page_proxy, docling_formula_count):
                    continue

                # Page image rendered by Docling; skip the page if missing.
                try:
                    page_img = page_obj.image.pil_image
                except Exception:
                    continue
                if page_img is None:
                    continue

                mfd_boxes = mfd.detect(page_img)
                if not mfd_boxes:
                    continue

                # Scale factors from page units to image pixels.
                img_w, img_h = page_img.size
                page_w = page_obj.size.width
                page_h = page_obj.size.height
                sx = img_w / page_w if page_w else 1.0
                sy = img_h / page_h if page_h else 1.0

                # Pixel-space bboxes of existing Docling FORMULA items on this page
                existing_formula_px = [
                    box
                    for it in page_items
                    if _label_of(it) in formula_labels
                    for box in _boxes_px(it, page_no, page_h, sx, sy)
                ]

                for mbox in mfd_boxes:
                    # Circuit breakers
                    if processed >= config.smart_max_equations:
                        break
                    if time.monotonic() - t0_mfd > max_ocr_secs:
                        break

                    # Skip if already covered by a Docling formula bbox
                    if any(_iou_px(mbox, ex) > 0.5 for ex in existing_formula_px):
                        continue

                    # Crop with 15% padding, clamped to the image.
                    pad_x = (mbox["x1"] - mbox["x0"]) * 0.15
                    pad_y = (mbox["y1"] - mbox["y0"]) * 0.15
                    cx0 = max(0, mbox["x0"] - pad_x)
                    cy0 = max(0, mbox["y0"] - pad_y)
                    cx1 = min(img_w, mbox["x1"] + pad_x)
                    cy1 = min(img_h, mbox["y1"] + pad_y)
                    if (cx1 - cx0) < 64 or (cy1 - cy0) < 64:
                        continue
                    crop = page_img.crop((int(cx0), int(cy0), int(cx1), int(cy1)))

                    # One failing crop must not abort the whole extraction;
                    # this mirrors the handling in the first OCR loop.
                    try:
                        latex = ocr.recognize(crop)
                    except Exception as e:
                        logger.debug(f"Equation OCR error: {e}")
                        continue
                    if not latex:
                        continue

                    processed += 1
                    delim = "$$" if mbox.get("type") == "isolated" else "$"
                    latex_text = f"{delim}{latex}{delim}"

                    # Replace-first: find an overlapping garbled non-formula item
                    replaced = False
                    for item in page_items:
                        text = getattr(item, "text", None)
                        if not text or _label_of(item) in formula_labels:
                            continue
                        if not _MATH_RE.search(text):
                            continue
                        overlaps = any(
                            _iou_px(item_px, mbox) > 0.5
                            for item_px in _boxes_px(item, page_no, page_h, sx, sy)
                        )
                        if not overlaps:
                            continue

                        item.text = latex_text
                        # Update label to formula so downstream sees it correctly
                        try:
                            item.label = type(item.label)("formula")
                        except Exception as e:
                            logger.debug(f"MFD: could not relabel item as formula: {e}")
                        latex_item_ids.add(id(item))
                        replaced = True
                        logger.debug(f"MFD: replaced garbled block on page {page_no}")
                        break

                    if not replaced:
                        logger.debug(f"MFD: no overlapping garbled block found on page {page_no}; "
                                     f"new equation injected as standalone")
                        # NOTE(review): the item is built without self_ref/orig
                        # and appended to `texts` only - confirm that it is
                        # accepted by the model and that it reaches
                        # iterate_items(); a document-level add_* API may be
                        # what is intended here.
                        try:
                            new_item = TextItem(
                                label=DocItemLabel.FORMULA,
                                text=latex_text,
                                prov=[],
                            )
                            document.texts.append(new_item)
                        except Exception as e:
                            logger.debug(f"MFD: could not inject new item: {e}")

            logger.info(f"MFD fallback finished. Total OCR'd: {processed}")

        # Normalize remaining text (items not replaced with LaTeX)
        _normalize_texts(document, skip_display_math=True, skip_ids=latex_item_ids)

        return result

    except Exception as e:
        logger.error(f"Docling extraction failed: {e}")
        raise
|
|
568
|
+
|
|
569
|
+
def _cluster_font_sizes(self, heights: List[float], tolerance: float = 0.15) -> List[List[float]]:
|
|
570
|
+
"""
|
|
571
|
+
Cluster heading bbox heights into distinct font-size groups.
|
|
572
|
+
|
|
573
|
+
Uses relative tolerance: two heights belong to the same cluster
|
|
574
|
+
if they are within `tolerance` (15%) of the cluster's mean height.
|
|
575
|
+
|
|
576
|
+
Returns:
|
|
577
|
+
List of clusters, sorted from largest mean height to smallest.
|
|
578
|
+
Each cluster is a list of heights that belong to it.
|
|
579
|
+
"""
|
|
580
|
+
if not heights:
|
|
581
|
+
return []
|
|
582
|
+
|
|
583
|
+
sorted_heights = sorted(set(heights), reverse=True)
|
|
584
|
+
clusters = []
|
|
585
|
+
|
|
586
|
+
for h in sorted_heights:
|
|
587
|
+
placed = False
|
|
588
|
+
for cluster in clusters:
|
|
589
|
+
cluster_mean = sum(cluster) / len(cluster)
|
|
590
|
+
# Relative difference check
|
|
591
|
+
if abs(h - cluster_mean) / max(cluster_mean, 0.1) <= tolerance:
|
|
592
|
+
cluster.append(h)
|
|
593
|
+
placed = True
|
|
594
|
+
break
|
|
595
|
+
if not placed:
|
|
596
|
+
clusters.append([h])
|
|
597
|
+
|
|
598
|
+
# Sort clusters by mean height descending (largest font first)
|
|
599
|
+
clusters.sort(key=lambda c: sum(c) / len(c), reverse=True)
|
|
600
|
+
return clusters
|
|
601
|
+
|
|
602
|
+
@staticmethod
def _extract_marker(text: str) -> Optional[str]:
    """
    Return the leading marker of a heading, if it has one.

    The heading is stripped of surrounding whitespace and matched against
    ``_MARKER_RE``; the first capture group (e.g. "I.", "A.", "1.", "2.3")
    is the marker.

    Returns:
        The marker string, or None when the pattern does not match.
    """
    found = _MARKER_RE.match(text.strip())
    if found is None:
        return None
    return found.group(1)
|
|
615
|
+
|
|
616
|
+
@staticmethod
|
|
617
|
+
def _classify_marker_type(marker: str) -> str:
|
|
618
|
+
"""
|
|
619
|
+
Classify a marker using strict numbering patterns.
|
|
620
|
+
|
|
621
|
+
Returns:
|
|
622
|
+
'numeric' for 1, 1.1, 1.1.1
|
|
623
|
+
'alpha' for A, B, A.1
|
|
624
|
+
'roman' for I, II, IV
|
|
625
|
+
'other' for bullets or non-structural markers
|
|
626
|
+
"""
|
|
627
|
+
if not marker:
|
|
628
|
+
return 'other'
|
|
629
|
+
|
|
630
|
+
marker = marker.strip()
|
|
631
|
+
|
|
632
|
+
# Strict Numeric: 1. or 1.1 or 1.1.1
|
|
633
|
+
if re.match(r'^\d+(\.\d+)*\.?$', marker):
|
|
634
|
+
return 'numeric'
|
|
635
|
+
|
|
636
|
+
# Strict Roman: I. or IV. (common uppercase roman)
|
|
637
|
+
if re.match(r'^[IVX]+\.?$', marker):
|
|
638
|
+
return 'roman'
|
|
639
|
+
|
|
640
|
+
# Strict Alpha: A. or A.1
|
|
641
|
+
if re.match(r'^[A-Z](\.[0-9]+)*\.?$', marker):
|
|
642
|
+
return 'alpha'
|
|
643
|
+
|
|
644
|
+
return 'other'
|
|
645
|
+
|
|
646
|
+
def _sub_cluster_by_markers(
    self,
    texts_in_cluster: List[str],
    base_level: int,
) -> Dict[str, int]:
    """
    Sub-differentiate headings within the same font-size cluster
    using their leading markers.

    Algorithm:
    1. Extract each heading's marker and classify it with
       ``_classify_marker_type``. Only marker types that occur at least
       twice take part; if fewer than two such types exist, every
       heading keeps ``base_level``.
    2. For each participating type, compute the average gap between the
       positions of consecutive headings of that type. Parent sections
       have LARGER gaps because child sections fill the space between
       them.
    3. Rank types by average gap, largest first. Ties are broken by which
       type appears first in the cluster, so the outcome is the same on
       every run. The type ranked ``i`` gets level ``base_level + i``.

    Args:
        texts_in_cluster: Heading texts in this font-size cluster.
        base_level: Level assigned by font-size clustering.

    Returns:
        Dict mapping heading text -> adjusted heading level. Headings
        without a participating marker type keep ``base_level``.
    """
    if len(texts_in_cluster) <= 1:
        return {t: base_level for t in texts_in_cluster}

    # Step 1: classify every heading and remember where each type occurs.
    text_info = []  # [(text, mtype)]
    type_positions: Dict[str, List[int]] = {}
    for idx, text in enumerate(texts_in_cluster):
        marker = self._extract_marker(text)
        mtype = self._classify_marker_type(marker) if marker else None
        text_info.append((text, mtype))
        if mtype:
            type_positions.setdefault(mtype, []).append(idx)

    type_counts = {mtype: len(pos) for mtype, pos in type_positions.items()}

    # Need at least 2 distinct types with 2+ headings each
    active_positions = {
        mtype: pos for mtype, pos in type_positions.items() if len(pos) >= 2
    }
    if len(active_positions) <= 1:
        return {t: base_level for t in texts_in_cluster}

    # Step 2: average gap between consecutive headings of each type.
    # Every participating type has at least two positions, so the list of
    # gaps is never empty.
    type_avg_span = {}
    for mtype, positions in active_positions.items():
        gaps = [later - earlier for earlier, later in zip(positions, positions[1:])]
        type_avg_span[mtype] = sum(gaps) / len(gaps)

    # Step 3: largest span = parent. The first position is a deterministic
    # tie-break; positions are unique, so the ordering is total.
    sorted_types = sorted(
        active_positions,
        key=lambda t: (-type_avg_span[t], active_positions[t][0]),
    )

    # Assign sub-levels
    type_to_sublevel = {
        mtype: base_level + rank for rank, mtype in enumerate(sorted_types)
    }

    result = {
        text: type_to_sublevel.get(mtype, base_level)
        for text, mtype in text_info
    }

    logger.debug(
        f"Sub-clustered {len(texts_in_cluster)} headings at level {base_level}: "
        f"types={dict(type_counts)}, spans={type_avg_span}, "
        f"sub-levels={type_to_sublevel}"
    )

    return result
|
|
734
|
+
|
|
735
|
+
def _build_hierarchy_map(self, docling_doc) -> Tuple[Dict[str, List[str]], Dict[str, int]]:
    """
    Build two mappings for a Docling document:
        1. item self_ref -> heading path (from HierarchicalChunker)
        2. heading text  -> heading level (font-size + marker analysis)

    Heading levels are inferred in two phases:
        Phase 1: Font-size clustering — headings are grouped by the height
                 of their first provenance bbox via ``_cluster_font_sizes``;
                 clusters are then processed in ascending cluster index.
        Phase 2: Marker-pattern analysis — within each cluster,
                 ``_sub_cluster_by_markers`` may split headings into
                 sub-levels based on their structural prefixes.

    "Late-arrival" rule: a heading whose marker type is 'other', whose text
    is not one of a few standard section titles, and which appears after the
    first numeric/alpha/roman-marked heading is demoted to level -1
    (rendered as a paragraph by ``_determine_block_type``).

    Args:
        docling_doc: Docling document exposing ``iterate_items()``.

    Returns:
        Tuple of (ref_to_path, heading_to_level). Both are empty when the
        document contains no section headers with text.
    """
    ref_to_path = {}
    heading_to_level = {}

    # --- Step 1: Collect heading texts and bbox heights ---
    heading_heights = {}  # text -> height (first occurrence wins)
    heading_order = []    # unique heading texts in document order

    for item, _level in docling_doc.iterate_items():
        if not isinstance(item, SectionHeaderItem):
            continue
        text = getattr(item, 'text', '')
        if not text:
            continue

        height = 0.0
        prov = getattr(item, 'prov', [])
        if prov and len(prov) > 0:
            bbox = getattr(prov[0], 'bbox', None)
            if bbox:
                height = abs(getattr(bbox, 't', 0) - getattr(bbox, 'b', 0))

        if text not in heading_heights:
            heading_heights[text] = height
            heading_order.append(text)

    if not heading_heights:
        logger.info("No section headers found in document")
        return ref_to_path, heading_to_level

    # heading_order holds unique texts, so a position map is an exact,
    # O(1) replacement for repeated heading_order.index(text) calls.
    heading_position = {text: idx for idx, text in enumerate(heading_order)}

    # --- Step 2: Font-size clustering ---
    all_heights = list(heading_heights.values())
    clusters = self._cluster_font_sizes(all_heights)

    # Build height -> cluster index
    height_to_cidx = {}
    for idx, cluster in enumerate(clusters):
        for h in cluster:
            height_to_cidx[h] = idx

    # Group heading texts by font-size cluster (document order preserved)
    cluster_texts = {}
    for text in heading_order:
        cidx = height_to_cidx.get(heading_heights[text], 0)
        cluster_texts.setdefault(cidx, []).append(text)

    # --- Step 3: Marker-pattern sub-clustering & Late-Arrival Logic ---

    # 3a. Find the first "Strong" (Numbered) heading in the entire document
    first_strong_index = float('inf')
    for idx, text in enumerate(heading_order):
        marker = self._extract_marker(text)
        mtype = self._classify_marker_type(marker) if marker else 'other'
        if mtype in ('numeric', 'alpha', 'roman'):
            first_strong_index = idx
            break

    logger.info(f"First strong heading index: {first_strong_index if first_strong_index != float('inf') else 'None'}")

    # Unnumbered titles that are never demoted by the late-arrival rule.
    standard_titles = {
        "introduction", "abstract", "background", "objective",
        "conclusion", "references", "appendix"
    }

    # 3b. Assign levels with Late-Arrival check
    current_level = 1

    for cidx in sorted(cluster_texts.keys()):
        valid_texts = []
        demoted_texts = []

        for text in cluster_texts[cidx]:
            # Global index in the document
            g_idx = heading_position[text]

            marker = self._extract_marker(text)
            mtype = self._classify_marker_type(marker) if marker else 'other'

            # Late Arrival Rule:
            # If Unnumbered AND appears AFTER the first strong heading -> Demote
            if mtype == 'other':
                is_standard = text.strip().lower() in standard_titles
                if not is_standard and g_idx > first_strong_index:
                    demoted_texts.append(text)
                    continue

            valid_texts.append(text)

        # Apply levels to valid texts; the next cluster starts below the
        # deepest sub-level used by this one.
        if valid_texts:
            sub_levels = self._sub_cluster_by_markers(valid_texts, base_level=current_level)
            heading_to_level.update(sub_levels)
            max_sub = max(sub_levels.values()) if sub_levels else current_level
            current_level = max_sub + 1

        # Apply demotion (-1 -> Paragraph)
        for t in demoted_texts:
            heading_to_level[t] = -1

    # Log results
    level_counts = {}
    demoted_count = 0
    for lvl in heading_to_level.values():
        if lvl == -1:
            demoted_count += 1
        else:
            level_counts[lvl] = level_counts.get(lvl, 0) + 1

    cluster_info = ", ".join(
        f"h{i+1}={sum(c)/len(c):.1f}px ({len(c)} headings)"
        for i, c in enumerate(clusters)
    )
    logger.info(
        f"Heading levels analyzed: {len(heading_to_level)} total. "
        f"Valid levels={dict(sorted(level_counts.items()))}, "
        f"Demoted (Text)={demoted_count} "
        f"[clusters: {cluster_info}]"
    )

    # --- Step 4: Build ref_to_path from HierarchicalChunker ---
    # Best-effort: a chunker failure leaves ref_to_path (partially) empty
    # but never aborts extraction.
    try:
        chunks = list(self._chunker.chunk(docling_doc))
        for chunk in chunks:
            heading_path = []
            if hasattr(chunk, 'meta') and chunk.meta:
                if hasattr(chunk.meta, 'headings') and chunk.meta.headings:
                    heading_path = list(chunk.meta.headings)

                if hasattr(chunk.meta, 'doc_items') and chunk.meta.doc_items:
                    for item in chunk.meta.doc_items:
                        ref = getattr(item, 'self_ref', None)
                        if ref:
                            ref_to_path[ref] = heading_path
    except Exception as e:
        logger.warning(f"HierarchicalChunker failed, hierarchy paths will be empty: {e}")

    return ref_to_path, heading_to_level
|
|
892
|
+
|
|
893
|
+
def _get_page_dimensions(self, docling_doc) -> Dict[int, Tuple[float, float]]:
|
|
894
|
+
"""
|
|
895
|
+
Extract actual page dimensions from Docling document.
|
|
896
|
+
|
|
897
|
+
Returns:
|
|
898
|
+
Dict mapping page_no (0-based) -> (width, height)
|
|
899
|
+
"""
|
|
900
|
+
dims = {}
|
|
901
|
+
if hasattr(docling_doc, 'pages') and docling_doc.pages:
|
|
902
|
+
for page_no, page in docling_doc.pages.items():
|
|
903
|
+
width, height = 612.0, 792.0 # Fallback to US Letter
|
|
904
|
+
if hasattr(page, 'size') and page.size:
|
|
905
|
+
width = float(page.size.width) if hasattr(page.size, 'width') else 612.0
|
|
906
|
+
height = float(page.size.height) if hasattr(page.size, 'height') else 792.0
|
|
907
|
+
dims[page_no - 1] = (width, height) # Convert to 0-based
|
|
908
|
+
return dims
|
|
909
|
+
|
|
910
|
+
def _extract_bbox(self, prov) -> BoundingBox:
    """
    Convert the ``bbox`` of a provenance entry into a BoundingBox.

    Two bbox shapes are understood: an object with ``l``/``t``/``r``/``b``
    attributes, or a list/tuple with at least four numbers. Anything else
    (including a missing or empty bbox) yields an all-zero BoundingBox.
    """
    raw = getattr(prov, 'bbox', None) if prov else None

    coords = None
    if raw:
        if hasattr(raw, 'l'):
            # Attribute-style box: left/top/right/bottom.
            coords = (float(raw.l), float(raw.t), float(raw.r), float(raw.b))
        elif isinstance(raw, (list, tuple)) and len(raw) >= 4:
            # Sequence-style box: first four entries in x0, y0, x1, y1 order.
            coords = (float(raw[0]), float(raw[1]), float(raw[2]), float(raw[3]))

    if coords is None:
        return BoundingBox(x0=0, y0=0, x1=0, y1=0)

    left, top, right, bottom = coords
    return BoundingBox(x0=left, y0=top, x1=right, y1=bottom)
|
|
931
|
+
|
|
932
|
+
def _get_item_provenance(self, item) -> Tuple[int, BoundingBox]:
    """
    Return the page index and bounding box of a Docling item.

    Only the first provenance entry is consulted. Items without
    provenance report page 0 and an all-zero BoundingBox.

    Returns:
        Tuple of (page_number_0based, BoundingBox)
    """
    page_idx = 0
    box = BoundingBox(x0=0, y0=0, x1=0, y1=0)

    entries = getattr(item, 'prov', None)
    if not entries:
        return page_idx, box

    for entry in entries:
        if hasattr(entry, 'page_no'):
            # Docling page numbers start at 1.
            page_idx = entry.page_no - 1
        box = self._extract_bbox(entry)
        break  # first provenance entry only

    return page_idx, box
|
|
950
|
+
|
|
951
|
+
def _determine_block_type(self, item, level: int, heading_to_level: Dict[str, int] = None) -> Tuple[BlockType, Optional[int]]:
    """
    Classify a Docling item into a BlockType, with a heading level
    where applicable.

    Classification order:
      1. SectionHeaderItem / TitleItem -> HEADING. The level comes from
         ``heading_to_level`` (keyed by the item's text) when present,
         otherwise ``max(1, level)``. A mapped level of -1 means the
         heading was demoted and is reported as PARAGRAPH.
      2. TableItem, ListItem, PictureItem -> TABLE, LIST_ITEM, FIGURE.
      3. Substring checks on the lower-cased ``item.label``.
      4. Everything else -> PARAGRAPH.

    Returns:
        Tuple of (BlockType, heading_level_or_None)
    """
    # PPTX slide titles arrive as TitleItem; the TitleItem name is
    # compared against None before use, so it may be unset.
    heading_like = isinstance(item, SectionHeaderItem) or (
        TitleItem is not None and isinstance(item, TitleItem)
    )
    if heading_like:
        heading_text = getattr(item, 'text', '')
        if heading_to_level and heading_text in heading_to_level:
            mapped = heading_to_level[heading_text]
            if mapped == -1:
                # Demoted heading: treat as body text.
                return BlockType.PARAGRAPH, None
            return BlockType.HEADING, mapped
        return BlockType.HEADING, max(1, level)

    # Remaining structural types, checked in a fixed order.
    for item_cls, mapped_type in (
        (TableItem, BlockType.TABLE),
        (ListItem, BlockType.LIST_ITEM),
        (PictureItem, BlockType.FIGURE),
    ):
        if isinstance(item, item_cls):
            return mapped_type, None

    # Label-based fallback for finer-grained types.
    label = getattr(item, 'label', None)
    if label:
        name = label.lower() if isinstance(label, str) else str(label).lower()

        if 'caption' in name:
            return BlockType.CAPTION, None
        if 'footer' in name or 'footnote' in name:
            return BlockType.FOOTER, None
        # A label containing both 'section' and 'header' is not a page header.
        if 'header' in name and 'section' not in name:
            return BlockType.HEADER, None
        if 'equation' in name or 'formula' in name:
            return BlockType.EQUATION, None
        if 'code' in name:
            return BlockType.CODE, None
        if 'title' in name:
            return BlockType.HEADING, max(1, level)

    return BlockType.PARAGRAPH, None
|
|
1019
|
+
|
|
1020
|
+
def _get_item_text(self, item, docling_doc=None) -> str:
    """
    Extract text from a Docling item.

    Resolution order:
      1. TableItem: ``export_to_markdown(doc=docling_doc)``.
      2. Non-empty ``item.text``.
      3. ``item.export_to_markdown()`` without a document.
      4. Empty string.

    Export failures are best-effort: they are logged at debug level and
    the next strategy is tried, so this method never raises because of a
    failing exporter.

    Args:
        item: Docling document item.
        docling_doc: Owning document, passed to the table exporter.

    Returns:
        The extracted text, or "" when nothing could be extracted.
    """
    # For tables, prefer markdown with doc context for proper rendering
    if isinstance(item, TableItem) and hasattr(item, 'export_to_markdown'):
        try:
            return item.export_to_markdown(doc=docling_doc)
        except Exception as e:
            logger.debug(f"Table markdown export failed, falling back to item text: {e}")

    if hasattr(item, 'text') and item.text:
        return item.text

    if hasattr(item, 'export_to_markdown'):
        try:
            return item.export_to_markdown()
        except Exception as e:
            logger.debug(f"Markdown export failed, returning empty text: {e}")

    return ""
|
|
1036
|
+
|
|
1037
|
+
def _get_item_confidence(self, item) -> float:
|
|
1038
|
+
"""Extract confidence from a Docling item, defaulting to 1.0."""
|
|
1039
|
+
if hasattr(item, 'confidence') and item.confidence is not None:
|
|
1040
|
+
return float(item.confidence)
|
|
1041
|
+
return 1.0
|
|
1042
|
+
|
|
1043
|
+
def _build_pptx_text_map(self, file_path: Path) -> Dict[int, Dict[str, PptxParaInfo]]:
    """
    Use python-pptx to build a per-slide map of text -> paragraph info.

    After all slides are read, any non-title, non-subtitle, non-footer
    text that occurs on more than half of the slides (only for decks with
    at least 3 slides) is re-marked as footer so it can be filtered out
    during block conversion.

    Args:
        file_path: Path to the presentation file.

    Returns:
        Dict[slide_idx (0-based), Dict[normalized_text, PptxParaInfo]].
        Empty when python-pptx is not installed or the file cannot be
        opened; both cases are logged as warnings.
    """
    try:
        from pptx import Presentation
        from pptx.enum.shapes import PP_PLACEHOLDER_TYPE as PP_PLACEHOLDER
    except ImportError:
        logger.warning("python-pptx not installed, cannot build PPTX indent map")
        return {}

    try:
        prs = Presentation(str(file_path))
    except Exception as e:
        logger.warning(f"Failed to open PPTX with python-pptx: {e}")
        return {}

    pptx_map: Dict[int, Dict[str, PptxParaInfo]] = {}

    for slide_idx, slide in enumerate(prs.slides):
        slide_map: Dict[str, PptxParaInfo] = {}
        found_title = False

        # On the first slide, a real SUBTITLE placeholder makes the
        # positional subtitle heuristic unnecessary.
        has_subtitle_placeholder = False
        if slide_idx == 0:
            for s in slide.shapes:
                if not s.is_placeholder:
                    continue
                try:
                    if s.placeholder_format.type == PP_PLACEHOLDER.SUBTITLE:
                        has_subtitle_placeholder = True
                        break
                except Exception:
                    # Best-effort probe: a shape whose placeholder format
                    # cannot be read is simply not a subtitle.
                    pass

        for shape in slide.shapes:
            found_title = self._extract_pptx_shape_info(
                shape, slide_map, slide_idx=slide_idx, found_title=found_title,
                has_subtitle_placeholder=has_subtitle_placeholder,
            )

        pptx_map[slide_idx] = slide_map

    # Post-processing: detect repeated text across slides (footer/header noise)
    # Text appearing on >50% of slides is likely a repeated footer/header element
    num_slides = len(pptx_map)
    if num_slides >= 3:  # Only apply for presentations with enough slides
        text_slide_count: Dict[str, int] = {}
        for slide_map in pptx_map.values():
            for text, info in slide_map.items():
                if not info.is_title and not info.is_subtitle and not info.is_footer:
                    text_slide_count[text] = text_slide_count.get(text, 0) + 1

        threshold = num_slides * 0.5
        repeated_texts = {t for t, count in text_slide_count.items() if count > threshold}

        if repeated_texts:
            logger.info(f"Detected {len(repeated_texts)} repeated footer/header texts across slides")
            for slide_map in pptx_map.values():
                for text in repeated_texts:
                    if text in slide_map:
                        slide_map[text] = PptxParaInfo(
                            indent_level=0, is_title=False, is_subtitle=False,
                            is_list=False, bullet_type='None', is_footer=True,
                        )

    total_entries = sum(len(m) for m in pptx_map.values())
    logger.info(f"Built PPTX text map: {len(pptx_map)} slides, {total_entries} text entries")
    return pptx_map
|
|
1122
|
+
|
|
1123
|
+
def _extract_pptx_shape_info(self, shape, slide_map: Dict[str, PptxParaInfo],
                             slide_idx: int = 0, found_title: bool = False,
                             has_subtitle_placeholder: bool = False) -> bool:
    """Record paragraph info for one shape into ``slide_map``.

    Group shapes are walked recursively. Text is whitespace-normalized
    before being used as a key, and only the first occurrence of a given
    text per slide is stored.

    Returns the updated ``found_title`` flag: True once a title
    placeholder has been seen (used for positional subtitle detection).
    """
    try:
        from pptx.enum.shapes import PP_PLACEHOLDER_TYPE as PP_PLACEHOLDER
        from pptx.enum.shapes import MSO_SHAPE_TYPE
    except ImportError:
        return found_title

    # Groups: descend into each member, threading found_title through.
    if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
        for member in shape.shapes:
            found_title = self._extract_pptx_shape_info(
                member, slide_map, slide_idx=slide_idx, found_title=found_title,
                has_subtitle_placeholder=has_subtitle_placeholder,
            )
        return found_title

    if not hasattr(shape, 'text_frame') or not shape.has_text_frame:
        return found_title

    # Classify the shape by its placeholder type, if it has one.
    is_title_shape = is_subtitle_shape = is_footer_shape = False
    if shape.is_placeholder:
        try:
            kind = shape.placeholder_format.type
            if kind in (PP_PLACEHOLDER.TITLE, PP_PLACEHOLDER.CENTER_TITLE):
                is_title_shape = True
            elif kind == PP_PLACEHOLDER.SUBTITLE:
                is_subtitle_shape = True
            elif kind in (PP_PLACEHOLDER.DATE, PP_PLACEHOLDER.FOOTER, PP_PLACEHOLDER.SLIDE_NUMBER):
                is_footer_shape = True
        except Exception:
            pass

    if is_footer_shape:
        # Footer/date/slide-number text is recorded (flagged as footer)
        # so block conversion can filter it out later.
        for para in shape.text_frame.paragraphs:
            stripped = para.text.strip()
            if not stripped:
                continue
            key = ' '.join(stripped.split())
            if key not in slide_map:
                slide_map[key] = PptxParaInfo(
                    indent_level=0, is_title=False, is_subtitle=False,
                    is_list=False, bullet_type='None', is_footer=True,
                )
        return found_title

    # Positional subtitle: on slide 0, a non-placeholder text shape seen
    # after the title, but only when the slide has no real SUBTITLE
    # placeholder.
    is_subtitle_by_position = (
        slide_idx == 0 and found_title and not is_title_shape
        and not shape.is_placeholder and not has_subtitle_placeholder
    )

    drawingml_ns = {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}
    first_para_in_shape = True
    for para in shape.text_frame.paragraphs:
        stripped = para.text.strip()
        if not stripped:
            continue

        indent_level = para.level if para.level is not None else 0

        # Bullet / numbered-list detection from the paragraph XML.
        xml_elem = para._element
        if xml_elem.find('.//a:buChar', namespaces=drawingml_ns) is not None:
            is_list, bullet_type = True, 'Bullet'
        elif xml_elem.find('.//a:buAutoNum', namespaces=drawingml_ns) is not None:
            is_list, bullet_type = True, 'Numbered'
        elif indent_level > 0 and not is_title_shape:
            is_list, bullet_type = True, 'None'
        else:
            is_list, bullet_type = False, 'None'

        # Only the FIRST paragraph of a positional subtitle shape counts.
        mark_subtitle = is_subtitle_shape or (is_subtitle_by_position and first_para_in_shape)

        # Titles and subtitles are never indented list entries.
        if is_title_shape or mark_subtitle:
            indent_level = 0
            is_list = False
            bullet_type = 'None'

        # Normalize whitespace so the key can be matched against block text.
        key = ' '.join(stripped.split())
        if key not in slide_map:
            slide_map[key] = PptxParaInfo(
                indent_level=indent_level,
                is_title=is_title_shape,
                is_subtitle=mark_subtitle,
                is_list=is_list,
                bullet_type=bullet_type,
            )

        first_para_in_shape = False

    return True if is_title_shape else found_title
|
|
1243
|
+
|
|
1244
|
+
def extract(
    self,
    file_path: Path,
    config: ProcessingConfig,
    page_numbers: Optional[List[int]] = None,
) -> Tuple[Document, ExtractionMetadata]:
    """
    Extract document using Docling.

    Uses iterate_items() for reading-order block extraction
    and HierarchicalChunker for heading hierarchy paths.
    For PPTX files, uses python-pptx directly for indent levels and
    assigns every section header / title to heading level 2.

    Args:
        file_path: Path to document file
        config: Processing configuration
        page_numbers: Optional specific pages to keep; compared against
            ``Page.page_number`` of the converted pages.

    Returns:
        Tuple of (Document, ExtractionMetadata). ``total_pages`` in the
        document metadata counts the pages remaining after filtering.
    """
    file_path = Path(file_path)
    is_pptx = file_path.suffix.lower() in ('.pptx', '.ppt')

    # Calculate file hash in fixed-size chunks so large documents are not
    # loaded into memory just to be fingerprinted.
    digest = hashlib.md5()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            digest.update(chunk)
    file_hash = digest.hexdigest()

    # Get conversion result (cached or new)
    result = self._run_docling(file_path, config)
    docling_doc = result.document

    # Build PPTX-specific indent map if applicable
    pptx_text_map = None
    if is_pptx:
        pptx_text_map = self._build_pptx_text_map(file_path)
        # For PPTX: skip font-size clustering, use simple heading levels
        # All slide titles become h2 (since they're all peer-level slides)
        heading_to_level = {}
        for item, _level in docling_doc.iterate_items():
            is_heading_item = isinstance(item, SectionHeaderItem) or (
                TitleItem is not None and isinstance(item, TitleItem)
            )
            if not is_heading_item:
                continue
            text = getattr(item, 'text', '')
            if text:
                heading_to_level[text] = 2  # All PPTX titles = h2
        hierarchy_map = {}
        logger.info(f"PPTX mode: assigned {len(heading_to_level)} headings to level 2")
    else:
        # Standard PDF/DOCX path: use font-size clustering
        hierarchy_map, heading_to_level = self._build_hierarchy_map(docling_doc)

    logger.info(f"Built hierarchy map with {len(hierarchy_map)} item mappings")

    # Get actual page dimensions
    page_dims = self._get_page_dimensions(docling_doc)

    # Convert to our Document format using iterate_items()
    pages = self._convert_to_pages(
        docling_doc,
        hierarchy_map,
        heading_to_level,
        page_dims,
        file_path,
        file_hash,
        exclude_headers_footers=config.exclude_page_headers_footers,
        pptx_text_map=pptx_text_map,
    )

    # Filter pages if specific ones requested
    if page_numbers is not None:
        pages = [p for p in pages if p.page_number in page_numbers]

    # Build document
    doc = Document(
        metadata=DocumentMetadata(
            source_file=str(file_path),
            file_hash=file_hash,
            total_pages=len(pages),
        ),
        pages=pages,
    )

    # Extraction metadata
    strategy_desc = "PPTX mode (python-pptx indent map)" if is_pptx else "PDF/DOCX mode (font-size clustering)"
    meta = ExtractionMetadata(
        strategy_used="docling",
        ocr_backend_used="tesseract_cli",
        reasons=[f"Used Docling with Tesseract CLI OCR, iterate_items() for block extraction. {strategy_desc}"],
    )

    total_blocks = sum(len(p.blocks) for p in pages)
    logger.info(f"Extracted {len(pages)} pages, {total_blocks} blocks")

    return doc, meta
|
|
1341
|
+
|
|
1342
|
+
def _build_table_from_item(self, item, docling_doc=None) -> Optional[Table]:
    """
    Convert Docling TableItem.data into our Table schema.

    Uses table_cells with their row/col offsets; spans are computed as
    end offset minus start offset. Falls back to export_to_dataframe()
    if direct conversion fails or yields no cells; in that fallback the
    DataFrame's column names become row 0.

    Args:
        item: Candidate Docling item.
        docling_doc: Owning document, passed to ``export_to_dataframe``.

    Returns:
        A Table, or None when the item is not a TableItem, has no data,
        has zero rows/columns, or both conversion strategies fail.
    """
    if not isinstance(item, TableItem) or not hasattr(item, 'data'):
        return None

    table_data = item.data
    # A TableItem may carry no data at all; treat it like an empty table
    # instead of failing on the attribute access below.
    if table_data is None:
        return None
    if table_data.num_rows == 0 or table_data.num_cols == 0:
        return None

    try:
        cells = [
            TableCell(
                r0=dcell.start_row_offset_idx,
                c0=dcell.start_col_offset_idx,
                rspan=dcell.end_row_offset_idx - dcell.start_row_offset_idx,
                cspan=dcell.end_col_offset_idx - dcell.start_col_offset_idx,
                text=dcell.text,
            )
            for dcell in table_data.table_cells
        ]

        if cells:
            return Table(
                n_rows=table_data.num_rows,
                n_cols=table_data.num_cols,
                cells=cells,
            )
    except Exception as e:
        logger.warning(f"Direct table cell conversion failed: {e}")

    # Fallback: use export_to_dataframe()
    try:
        import pandas as pd
        df = item.export_to_dataframe(doc=docling_doc)
        if df is not None and not df.empty:
            n_rows = len(df) + 1  # +1 for header row
            n_cols = len(df.columns)
            cells = []
            # Header row
            for c_idx, col_name in enumerate(df.columns):
                cells.append(TableCell(
                    r0=0, c0=c_idx, rspan=1, cspan=1,
                    text=str(col_name),
                ))
            # Data rows (shifted down by one to leave room for the header)
            for r_idx, (_, row) in enumerate(df.iterrows(), start=1):
                for c_idx, val in enumerate(row):
                    cells.append(TableCell(
                        r0=r_idx, c0=c_idx, rspan=1, cspan=1,
                        text=str(val) if pd.notna(val) else "",
                    ))
            return Table(
                n_rows=n_rows,
                n_cols=n_cols,
                cells=cells,
            )
    except Exception as e:
        logger.warning(f"DataFrame fallback also failed: {e}")

    return None
|
|
1406
|
+
|
|
1407
|
+
def _convert_to_pages(
    self,
    docling_doc,
    hierarchy_map: Dict[str, List[str]],
    heading_to_level: Dict[str, int],
    page_dims: Dict[int, Tuple[float, float]],
    file_path: Path,
    file_hash: str,
    exclude_headers_footers: bool = True,
    pptx_text_map: Optional[Dict[int, Dict[str, 'PptxParaInfo']]] = None,
) -> List[Page]:
    """
    Convert Docling document to our Page format using iterate_items().

    Items referenced from table cells are skipped so table content is
    not duplicated as standalone blocks. Items with no extractable text
    are skipped. Equation text is wrapped in ⟦EQUATION⟧ markers.

    When pptx_text_map is provided it is consulted per slide (keyed by
    the same 0-based page index used for pages) to set indent_level,
    drop footer text, and promote subtitles to level-3 headings; slide
    number patterns and single-character non-heading text are dropped.

    Args:
        docling_doc: Docling document to convert.
        hierarchy_map: item self_ref -> heading path.
        heading_to_level: heading text -> heading level.
        page_dims: 0-based page index -> (width, height); pages missing
            from it use 612.0 x 792.0.
        file_path: Source file, recorded in block provenance.
        file_hash: Accepted for interface compatibility; not used here.
        exclude_headers_footers: Drop HEADER/FOOTER blocks when True.
        pptx_text_map: Optional per-slide text map for PPTX input.

    Returns:
        Pages sorted by page number, with block order_index restarting
        at 0 on every page.
    """
    pages_dict: Dict[int, Page] = {}
    block_idx = 0

    # Gap #1: Collect all self_refs that belong to table children
    # so we can skip them when they appear as standalone items.
    table_child_refs: set = set()
    for item, _level in docling_doc.iterate_items():
        if not isinstance(item, TableItem):
            continue
        if hasattr(item, 'data') and item.data:
            for dcell in item.data.table_cells:
                if hasattr(dcell, 'ref') and dcell.ref:
                    ref = getattr(dcell.ref, 'cref', getattr(dcell.ref, 'self_ref', None))
                    if ref:
                        table_child_refs.add(ref)

    # iterate_items() provides (item, level) in reading order
    for item, level in docling_doc.iterate_items():
        # Gap #1: Skip items that are children of a table
        item_ref = getattr(item, 'self_ref', None)
        if item_ref and item_ref in table_child_refs:
            continue

        # Get page (0-based) and bbox from provenance
        page_num, bbox = self._get_item_provenance(item)

        # Determine block type and heading level
        block_type, heading_level = self._determine_block_type(item, level, heading_to_level)

        # Filter headers and footers if requested
        if exclude_headers_footers and block_type in (BlockType.HEADER, BlockType.FOOTER):
            continue

        # Get text (prefer markdown for equations to get LaTeX)
        if block_type == BlockType.EQUATION and hasattr(item, 'export_to_markdown'):
            try:
                text = item.export_to_markdown()
            except Exception:
                text = self._get_item_text(item, docling_doc)
        else:
            text = self._get_item_text(item, docling_doc)

        if not text:
            continue

        # Wrap equations with markers
        if block_type == BlockType.EQUATION:
            text = f"⟦EQUATION⟧\n{text.strip()}\n⟦/EQUATION⟧"

        # Get hierarchy path from chunker map
        hierarchy_path = hierarchy_map.get(item_ref, [])

        # Get native confidence
        item_confidence = self._get_item_confidence(item)

        # Create page if needed, with actual dimensions
        if page_num not in pages_dict:
            width, height = page_dims.get(page_num, (612.0, 792.0))
            pages_dict[page_num] = Page(
                page_number=page_num,
                width=width,
                height=height,
                blocks=[],
                profile=PageProfile(page_number=page_num),
            )

        # Build block
        table_obj = None
        if block_type == BlockType.TABLE:
            table_obj = self._build_table_from_item(item, docling_doc)
            if table_obj:
                logger.info(
                    f"  Populated Block.table: {table_obj.n_rows} rows × "
                    f"{table_obj.n_cols} cols, {len(table_obj.cells)} cells"
                )

        # Determine indent_level and filter footers from PPTX text map
        indent_level = 0
        if pptx_text_map is not None:
            norm_block_text = ' '.join(text.strip().split())
            # page_num is already 0-based (see _get_item_provenance) and
            # pptx_text_map is keyed by 0-based slide index, so the keys
            # line up directly. Looking up page_num - 1 would match every
            # slide after the first against the previous slide's text.
            slide_map = pptx_text_map.get(page_num, {})
            pptx_info = slide_map.get(norm_block_text)
            if pptx_info:
                if pptx_info.is_footer:
                    # Skip footer/date/slide-number content
                    continue
                indent_level = pptx_info.indent_level
                # Promote subtitle to heading level 3
                if pptx_info.is_subtitle and block_type != BlockType.HEADING:
                    block_type = BlockType.HEADING
                    heading_level = 3

            # Filter slide number patterns (e.g., "1 / 22", "12/22")
            if re.match(r'^\d+\s*/\s*\d+$', norm_block_text):
                continue
            # Filter single-character noise (common Beamer artifact)
            if len(norm_block_text) <= 1 and block_type not in (BlockType.HEADING,):
                continue

        block = Block(
            type=block_type,
            text=text,
            order_index=block_idx,
            heading_level=heading_level,
            indent_level=indent_level,
            hierarchy_path=hierarchy_path,
            provenance=Provenance(
                source_file=str(file_path),
                page_number=page_num,
                bbox=bbox,
                extractor=ExtractorType.DOCLING,
                extractor_version=self.version,
            ),
            confidence=Confidence(
                overall=item_confidence,
                text_confidence=item_confidence,
                layout_confidence=item_confidence,
            ),
            table=table_obj,
        )

        pages_dict[page_num].blocks.append(block)
        block_idx += 1

    # Sort pages by page number and reindex blocks
    pages = sorted(pages_dict.values(), key=lambda p: p.page_number)
    for page in pages:
        for i, block in enumerate(page.blocks):
            block.order_index = i

    return pages
|
|
1563
|
+
|
|
1564
|
+
def extract_page(
    self,
    file_path: Path,
    page_number: int,
    config: ProcessingConfig,
) -> Page:
    """Extract one page of a document.

    Delegates to ``self.extract`` with ``page_numbers=[page_number]`` and
    returns the first page of the document it produces.

    Args:
        file_path: Document to read.
        page_number: Page to extract, forwarded unchanged to ``extract``.
        config: Processing configuration forwarded to ``extract``.

    Raises:
        ValueError: If the extracted document contains no pages.
    """
    document, _ = self.extract(file_path, config, page_numbers=[page_number])
    if not document.pages:
        raise ValueError(f"Page {page_number} not found in {file_path}")
    return document.pages[0]
|
|
1575
|
+
|
|
1576
|
+
def get_hierarchy(
    self,
    file_path: Path,
    config: ProcessingConfig,
) -> List[HierarchyChunk]:
    """Get document hierarchy using HierarchicalChunker.

    Runs the Docling conversion through ``self._run_docling`` and feeds the
    resulting document to ``self._chunker``.

    Args:
        file_path: Document to convert (coerced to ``Path``).
        config: Processing configuration forwarded to ``_run_docling``.

    Returns:
        One ``HierarchyChunk`` per chunk, in chunker order. ``heading_path``
        is the chunk's heading list, ``level`` is its length, and
        ``page_number`` is ``page_no - 1`` of the first provenance entry
        found among the chunk's doc items (0 when none carries a page).
    """
    file_path = Path(file_path)

    # Conversion result comes from _run_docling (any caching happens there).
    result = self._run_docling(file_path, config)
    chunks = list(self._chunker.chunk(result.document))

    hierarchy_chunks = []
    for idx, chunk in enumerate(chunks):
        heading_path = []
        page_num = 0

        meta = getattr(chunk, 'meta', None)
        if meta:
            if hasattr(meta, 'headings'):
                heading_path = list(meta.headings or [])

            # Take the FIRST provenance that carries a page number. The
            # previous nested loops only broke out of the inner loop, so
            # later doc items kept overwriting the value and a chunk was
            # reported on the page of its last item instead of its first.
            page_num = next(
                (
                    prov.page_no - 1
                    for item in (getattr(meta, 'doc_items', None) or [])
                    for prov in (getattr(item, 'prov', None) or [])
                    if hasattr(prov, 'page_no')
                ),
                0,
            )

        hierarchy_chunks.append(HierarchyChunk(
            text=chunk.text,
            heading_path=heading_path,
            level=len(heading_path),
            page_number=page_num,
            order_index=idx,
        ))

    return hierarchy_chunks
|
|
1617
|
+
|
|
1618
|
+
def to_markdown(self, doc: Document) -> str:
    """Render every block of ``doc`` as Markdown text.

    Blocks are visited page by page in stored order. Each block produces
    one rendered line followed by an empty line; the lines are joined with
    newlines.

    * Headings with a truthy ``heading_level`` become ``#``-prefixed lines
      (at most six ``#``).
    * List items become ``- text`` prefixed by one space per indent level.
    * Page headers are emitted verbatim.
    * Everything else is emitted as-is, except that the first ``#`` of
      non-heading text starting with ``#`` is backslash-escaped.
    """
    rendered_lines = []

    for page in doc.pages:
        for block in page.blocks:
            kind = block.type

            if kind == BlockType.HEADING and block.heading_level:
                hashes = "#" * min(block.heading_level, 6)
                rendered = f"{hashes} {block.text}"
            elif kind == BlockType.LIST_ITEM:
                rendered = " " * block.indent_level + f"- {block.text}"
            elif kind == BlockType.HEADER:
                # Running page header: no markup added.
                rendered = block.text
            else:
                rendered = block.text
                # Keep code comments and similar text from being parsed as
                # Markdown headings.
                if rendered.lstrip().startswith('#') and kind != BlockType.HEADING:
                    rendered = rendered.replace('#', '\\#', 1)

            rendered_lines.extend((rendered, ""))

    return "\n".join(rendered_lines)
|
|
1646
|
+
|
|
1647
|
+
def _sanitize_filename(self, name: str) -> str:
|
|
1648
|
+
"""Sanitize string for filename."""
|
|
1649
|
+
return "".join(c for c in name if c.isalnum() or c in ('-', '_')).strip()
|
|
1650
|
+
|
|
1651
|
+
def save_images(self, output_dir: Path) -> List[Path]:
    """Write page, figure and table images of the last conversion as PNGs.

    Reads ``self._last_result``. When it is falsy a warning is logged and
    nothing is written.

    Args:
        output_dir: Destination directory; created (with parents) if needed.

    Returns:
        Paths of the files that were saved successfully. Individual save
        failures are logged and skipped; an unexpected failure while walking
        the document is logged and whatever was saved so far is returned.
    """
    if not self._last_result:
        logger.warning("No conversion result available to save images from")
        return []
    result = self._last_result

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    saved_paths = []

    def _write_png(image, destination):
        # Record the path only after the save succeeded.
        image.save(destination, format="PNG")
        saved_paths.append(destination)

    try:
        # Full-page renders: page_<n>.png
        if hasattr(result.document, 'pages'):
            for page_no, page in result.document.pages.items():
                has_image = (
                    hasattr(page, 'image')
                    and page.image
                    and hasattr(page.image, 'pil_image')
                )
                if not has_image:
                    continue
                try:
                    _write_png(page.image.pil_image, output_dir / f"page_{page_no}.png")
                except Exception as e:
                    logger.warning(f"Failed to save page image {page_no}: {e}")

        # Figure and table crops, named after the element's self_ref.
        for element, _level in result.document.iterate_items():
            if isinstance(element, PictureItem):
                try:
                    picture = element.get_image(result.document)
                    if picture:
                        stem = (
                            self._sanitize_filename(element.self_ref)
                            or f"picture_{uuid.uuid4().hex[:8]}"
                        )
                        _write_png(picture, output_dir / f"figure_{stem}.png")
                except Exception as e:
                    logger.warning(f"Failed to save figure image: {e}")

            if isinstance(element, TableItem):
                try:
                    table_image = element.get_image(result.document)
                    if table_image:
                        stem = (
                            self._sanitize_filename(element.self_ref)
                            or f"table_{uuid.uuid4().hex[:8]}"
                        )
                        _write_png(table_image, output_dir / f"table_{stem}.png")
                except Exception as e:
                    logger.warning(f"Failed to save table image: {e}")

    except Exception as e:
        logger.error(f"Failed to save images: {e}")

    return saved_paths
|
|
1715
|
+
|
|
1716
|
+
# ------------------------------------------------------------------
|
|
1717
|
+
# LaTeX-OCR helpers (PDF smart mode)
|
|
1718
|
+
# ------------------------------------------------------------------
|
|
1719
|
+
|
|
1720
|
+
def _find_equation_items(self, doc) -> List[tuple]:
|
|
1721
|
+
"""Find FORMULA-labeled items. Returns [(item, page_no), ...]."""
|
|
1722
|
+
equation_items = []
|
|
1723
|
+
for item, _ in doc.iterate_items():
|
|
1724
|
+
label = getattr(item, "label", None)
|
|
1725
|
+
if label is None:
|
|
1726
|
+
continue
|
|
1727
|
+
label_str = str(label).lower()
|
|
1728
|
+
if "formula" in label_str or "equation" in label_str:
|
|
1729
|
+
# Get page number from provenance
|
|
1730
|
+
page_no = 1
|
|
1731
|
+
if hasattr(item, "prov") and item.prov:
|
|
1732
|
+
page_no = item.prov[0].page_no
|
|
1733
|
+
equation_items.append((item, page_no))
|
|
1734
|
+
return equation_items
|
|
1735
|
+
|
|
1736
|
+
def _merge_adjacent_formulas(self, items: List[tuple], doc) -> tuple:
|
|
1737
|
+
"""Merge vertically adjacent FORMULA bboxes in pixel space.
|
|
1738
|
+
|
|
1739
|
+
Returns:
|
|
1740
|
+
merged_items: list of (item, page_no)
|
|
1741
|
+
union_bboxes: dict of id(item) -> (x0, y0, x1, y1) in pixels
|
|
1742
|
+
blank_ids: set of id(item) for leftover fragments to blank
|
|
1743
|
+
"""
|
|
1744
|
+
union_bboxes: Dict[int, tuple] = {}
|
|
1745
|
+
blank_ids: set = set()
|
|
1746
|
+
|
|
1747
|
+
if len(items) < 2:
|
|
1748
|
+
return items, union_bboxes, blank_ids
|
|
1749
|
+
|
|
1750
|
+
# Convert to pixel-space, matching prov by page_no
|
|
1751
|
+
pixel_items = []
|
|
1752
|
+
for item, page_no in items:
|
|
1753
|
+
# Find provenance matching this page
|
|
1754
|
+
prov = None
|
|
1755
|
+
for p in getattr(item, 'prov', []):
|
|
1756
|
+
if getattr(p, 'page_no', None) == page_no:
|
|
1757
|
+
prov = p
|
|
1758
|
+
break
|
|
1759
|
+
if not prov or not prov.bbox:
|
|
1760
|
+
continue
|
|
1761
|
+
|
|
1762
|
+
page = doc.pages.get(page_no)
|
|
1763
|
+
if page is None or not hasattr(page, 'image') or page.image is None:
|
|
1764
|
+
continue
|
|
1765
|
+
|
|
1766
|
+
try:
|
|
1767
|
+
pil_img = page.image.pil_image
|
|
1768
|
+
img_w, img_h = pil_img.size
|
|
1769
|
+
page_w = page.size.width
|
|
1770
|
+
page_h = page.size.height
|
|
1771
|
+
|
|
1772
|
+
tl = prov.bbox.to_top_left_origin(page_h)
|
|
1773
|
+
sx, sy = img_w / page_w, img_h / page_h
|
|
1774
|
+
px_bbox = (tl.l * sx, tl.t * sy, tl.r * sx, tl.b * sy)
|
|
1775
|
+
pixel_items.append((item, page_no, px_bbox))
|
|
1776
|
+
except Exception as e:
|
|
1777
|
+
logger.debug(f"Skipping formula merge for item: {e}")
|
|
1778
|
+
continue
|
|
1779
|
+
|
|
1780
|
+
if len(pixel_items) < 2:
|
|
1781
|
+
return items, union_bboxes, blank_ids
|
|
1782
|
+
|
|
1783
|
+
GAP_PX = 20
|
|
1784
|
+
H_OVERLAP_MIN = 0.3
|
|
1785
|
+
OVERLAP_Y_ALLOW = 5
|
|
1786
|
+
|
|
1787
|
+
groups = [[pixel_items[0]]]
|
|
1788
|
+
for entry in pixel_items[1:]:
|
|
1789
|
+
_, pg, (x0, y0, x1, y1) = entry
|
|
1790
|
+
prev = groups[-1][-1]
|
|
1791
|
+
_, prev_pg, (px0, py0, px1, py1) = prev
|
|
1792
|
+
|
|
1793
|
+
# Directed gap (y increases downward in pixel space)
|
|
1794
|
+
gap = y0 - py1
|
|
1795
|
+
if pg != prev_pg or gap < -OVERLAP_Y_ALLOW or gap > GAP_PX:
|
|
1796
|
+
groups.append([entry])
|
|
1797
|
+
continue
|
|
1798
|
+
|
|
1799
|
+
h_overlap = max(0, min(x1, px1) - max(x0, px0))
|
|
1800
|
+
h_extent = max(x1, px1) - min(x0, px0)
|
|
1801
|
+
if h_extent > 0 and h_overlap / h_extent >= H_OVERLAP_MIN:
|
|
1802
|
+
groups[-1].append(entry)
|
|
1803
|
+
else:
|
|
1804
|
+
groups.append([entry])
|
|
1805
|
+
|
|
1806
|
+
merged = []
|
|
1807
|
+
for group in groups:
|
|
1808
|
+
anchor_item, anchor_pg, _ = group[0]
|
|
1809
|
+
if len(group) == 1:
|
|
1810
|
+
merged.append((anchor_item, anchor_pg))
|
|
1811
|
+
else:
|
|
1812
|
+
# Compute union bbox in pixel space
|
|
1813
|
+
ux0 = min(e[2][0] for e in group)
|
|
1814
|
+
uy0 = min(e[2][1] for e in group)
|
|
1815
|
+
ux1 = max(e[2][2] for e in group)
|
|
1816
|
+
uy1 = max(e[2][3] for e in group)
|
|
1817
|
+
union_bboxes[id(anchor_item)] = (ux0, uy0, ux1, uy1)
|
|
1818
|
+
merged.append((anchor_item, anchor_pg))
|
|
1819
|
+
# Mark non-anchor items for blanking
|
|
1820
|
+
for e in group[1:]:
|
|
1821
|
+
blank_ids.add(id(e[0]))
|
|
1822
|
+
|
|
1823
|
+
return merged, union_bboxes, blank_ids
|
|
1824
|
+
|
|
1825
|
+
def _crop_equation_bbox(self, doc, item, page_no: int,
|
|
1826
|
+
union_bboxes: Dict[int, tuple] = None):
|
|
1827
|
+
"""Crop equation image from page. Returns PIL Image or None."""
|
|
1828
|
+
page = doc.pages.get(page_no)
|
|
1829
|
+
if page is None or not hasattr(page, 'image') or page.image is None:
|
|
1830
|
+
return None
|
|
1831
|
+
|
|
1832
|
+
try:
|
|
1833
|
+
pil_img = page.image.pil_image
|
|
1834
|
+
img_w, img_h = pil_img.size
|
|
1835
|
+
except Exception:
|
|
1836
|
+
return None
|
|
1837
|
+
|
|
1838
|
+
# Check for merged union bbox first
|
|
1839
|
+
if union_bboxes and id(item) in union_bboxes:
|
|
1840
|
+
x0, y0, x1, y1 = union_bboxes[id(item)]
|
|
1841
|
+
else:
|
|
1842
|
+
# Standard provenance → pixel transform
|
|
1843
|
+
prov = None
|
|
1844
|
+
for p in getattr(item, 'prov', []):
|
|
1845
|
+
if getattr(p, 'page_no', None) == page_no:
|
|
1846
|
+
prov = p
|
|
1847
|
+
break
|
|
1848
|
+
if not prov or not prov.bbox:
|
|
1849
|
+
return None
|
|
1850
|
+
|
|
1851
|
+
page_w = page.size.width
|
|
1852
|
+
page_h = page.size.height
|
|
1853
|
+
|
|
1854
|
+
tl = prov.bbox.to_top_left_origin(page_h)
|
|
1855
|
+
sx, sy = img_w / page_w, img_h / page_h
|
|
1856
|
+
x0, y0 = tl.l * sx, tl.t * sy
|
|
1857
|
+
x1, y1 = tl.r * sx, tl.b * sy
|
|
1858
|
+
|
|
1859
|
+
# Rotation/sanity: coords must be within image
|
|
1860
|
+
if x0 < 0 or y0 < 0 or x1 > img_w or y1 > img_h:
|
|
1861
|
+
logger.debug(f"BBox outside image bounds on page {page_no}, skipping")
|
|
1862
|
+
return None
|
|
1863
|
+
|
|
1864
|
+
# 15% padding + clamp
|
|
1865
|
+
pad_x = (x1 - x0) * 0.15
|
|
1866
|
+
pad_y = (y1 - y0) * 0.15
|
|
1867
|
+
x0, y0 = max(0, x0 - pad_x), max(0, y0 - pad_y)
|
|
1868
|
+
x1, y1 = min(img_w, x1 + pad_x), min(img_h, y1 + pad_y)
|
|
1869
|
+
|
|
1870
|
+
# Minimum 64px crop
|
|
1871
|
+
if (x1 - x0) < 64 or (y1 - y0) < 64:
|
|
1872
|
+
logger.debug(f"Crop too small ({x1-x0:.0f}×{y1-y0:.0f}px) on page {page_no}")
|
|
1873
|
+
return None
|
|
1874
|
+
|
|
1875
|
+
return pil_img.crop((int(x0), int(y0), int(x1), int(y1)))
|
|
1876
|
+
|
|
1877
|
+
# ------------------------------------------------------------------
|
|
1878
|
+
# DOCX/PPTX equation extraction
|
|
1879
|
+
# ------------------------------------------------------------------
|
|
1880
|
+
|
|
1881
|
+
def _extract_docx_equations(self, file_path: Path) -> List[str]:
|
|
1882
|
+
"""Extract OMML equations from DOCX as LaTeX strings."""
|
|
1883
|
+
try:
|
|
1884
|
+
from docxlatex import Document as DocxLatexDoc
|
|
1885
|
+
doc = DocxLatexDoc(str(file_path))
|
|
1886
|
+
equations = doc.get_equations()
|
|
1887
|
+
return [self._normalize_latex(eq) for eq in equations if eq.strip()]
|
|
1888
|
+
except ImportError:
|
|
1889
|
+
logger.warning("docxlatex not installed. Skipping DOCX equation extraction.")
|
|
1890
|
+
return []
|
|
1891
|
+
except Exception as e:
|
|
1892
|
+
logger.warning(f"DOCX equation extraction failed: {e}")
|
|
1893
|
+
return []
|
|
1894
|
+
|
|
1895
|
+
def _extract_pptx_equations(self, file_path: Path) -> List[str]:
|
|
1896
|
+
"""Scan PPTX slide XML for <m:oMath> nodes."""
|
|
1897
|
+
import zipfile
|
|
1898
|
+
try:
|
|
1899
|
+
import defusedxml.ElementTree as ET
|
|
1900
|
+
except ImportError:
|
|
1901
|
+
logger.warning("defusedxml not installed. Skipping PPTX equation extraction.")
|
|
1902
|
+
return []
|
|
1903
|
+
|
|
1904
|
+
equations = []
|
|
1905
|
+
MATH_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math"
|
|
1906
|
+
MAX_SLIDES = 50
|
|
1907
|
+
MAX_BYTES_PER_SLIDE = 10 * 1024 * 1024
|
|
1908
|
+
MAX_TOTAL_BYTES = 100 * 1024 * 1024
|
|
1909
|
+
MAX_COMPRESSION_RATIO = 100
|
|
1910
|
+
MAX_ENTRIES = 500
|
|
1911
|
+
|
|
1912
|
+
try:
|
|
1913
|
+
with zipfile.ZipFile(str(file_path)) as z:
|
|
1914
|
+
# Zip entry count cap
|
|
1915
|
+
if len(z.infolist()) > MAX_ENTRIES:
|
|
1916
|
+
logger.warning(f"PPTX has {len(z.infolist())} entries (>{MAX_ENTRIES}). Skipping.")
|
|
1917
|
+
return []
|
|
1918
|
+
|
|
1919
|
+
slide_files = sorted([
|
|
1920
|
+
n for n in z.namelist()
|
|
1921
|
+
if n.startswith("ppt/slides/slide") and n.endswith(".xml")
|
|
1922
|
+
])[:MAX_SLIDES]
|
|
1923
|
+
|
|
1924
|
+
total_bytes = 0
|
|
1925
|
+
for name in slide_files:
|
|
1926
|
+
info = z.getinfo(name)
|
|
1927
|
+
if info.file_size > MAX_BYTES_PER_SLIDE:
|
|
1928
|
+
logger.debug(f"Skipping {name}: too large")
|
|
1929
|
+
continue
|
|
1930
|
+
if info.compress_size > 0 and info.file_size / info.compress_size > MAX_COMPRESSION_RATIO:
|
|
1931
|
+
logger.debug(f"Skipping {name}: suspicious compression ratio")
|
|
1932
|
+
continue
|
|
1933
|
+
total_bytes += info.file_size
|
|
1934
|
+
if total_bytes > MAX_TOTAL_BYTES:
|
|
1935
|
+
logger.warning("PPTX total uncompressed bytes exceeded cap")
|
|
1936
|
+
break
|
|
1937
|
+
|
|
1938
|
+
try:
|
|
1939
|
+
tree = ET.parse(z.open(name))
|
|
1940
|
+
for omath in tree.iter(f"{{{MATH_NS}}}oMath"):
|
|
1941
|
+
# Extract text content from m:t elements
|
|
1942
|
+
texts = []
|
|
1943
|
+
for t_elem in omath.iter(f"{{{MATH_NS}}}t"):
|
|
1944
|
+
if t_elem.text:
|
|
1945
|
+
texts.append(t_elem.text)
|
|
1946
|
+
if texts:
|
|
1947
|
+
raw = " ".join(texts)
|
|
1948
|
+
equations.append(self._normalize_latex(raw))
|
|
1949
|
+
except Exception as e:
|
|
1950
|
+
logger.debug(f"Failed to parse {name}: {e}")
|
|
1951
|
+
except Exception as e:
|
|
1952
|
+
logger.warning(f"PPTX equation extraction failed: {e}")
|
|
1953
|
+
|
|
1954
|
+
return equations
|
|
1955
|
+
|
|
1956
|
+
def _normalize_latex(self, latex: str) -> str:
|
|
1957
|
+
"""Fix whitespace artifacts in converted LaTeX."""
|
|
1958
|
+
if not latex:
|
|
1959
|
+
return latex
|
|
1960
|
+
# Collapse broken control sequences: \f r a c → \frac
|
|
1961
|
+
prev = None
|
|
1962
|
+
while prev != latex:
|
|
1963
|
+
prev = latex
|
|
1964
|
+
latex = re.sub(r'\\([a-zA-Z]+)\s+([a-zA-Z])', r'\\\1\2', latex)
|
|
1965
|
+
# Trim repeated spaces
|
|
1966
|
+
latex = re.sub(r'\s+', ' ', latex).strip()
|
|
1967
|
+
return latex
|
|
1968
|
+
|
|
1969
|
+
def _normalize_unicode_math(self, text: str) -> str:
|
|
1970
|
+
"""
|
|
1971
|
+
Convert Unicode math symbols to LaTeX-lite notation.
|
|
1972
|
+
Only applies if text does NOT look like it already has LaTeX formatting.
|
|
1973
|
+
"""
|
|
1974
|
+
if not text:
|
|
1975
|
+
return text
|
|
1976
|
+
|
|
1977
|
+
# Scope check: Don't touch if it looks like LaTeX already
|
|
1978
|
+
if "$" in text or "\\" in text:
|
|
1979
|
+
return text
|
|
1980
|
+
|
|
1981
|
+
# Common math symbols
|
|
1982
|
+
replacements = [
|
|
1983
|
+
(r"²", "^2"), (r"³", "^3"),
|
|
1984
|
+
(r"₁", "_1"), (r"₂", "_2"), (r"₃", "_3"), (r"ᵢ", "_i"), (r"ⱼ", "_j"), (r"ₙ", "_n"),
|
|
1985
|
+
(r"∑", r"\\sum"), (r"∫", r"\\int"), (r"∞", r"\\infty"),
|
|
1986
|
+
(r"√", r"\\sqrt"), (r"∂", r"\\partial"), (r"∇", r"\\nabla"),
|
|
1987
|
+
(r"≈", r"\\approx"), (r"≠", r"\\neq"), (r"≤", r"\\leq"), (r"≥", r"\\geq"),
|
|
1988
|
+
(r"α", r"\\alpha"), (r"β", r"\\beta"), (r"γ", r"\\gamma"), (r"θ", r"\\theta"),
|
|
1989
|
+
(r"π", r"\\pi"), (r"µ", r"\\mu"), (r"σ", r"\\sigma"), (r"Ω", r"\\Omega"),
|
|
1990
|
+
(r"∈", r"\\in"), (r"∀", r"\\forall"), (r"∃", r"\\exists"),
|
|
1991
|
+
(r"→", r"\\to"), (r"⇒", r"\\implies"), (r"±", r"\\pm"),
|
|
1992
|
+
]
|
|
1993
|
+
|
|
1994
|
+
normalized = text
|
|
1995
|
+
for char, latex in replacements:
|
|
1996
|
+
normalized = normalized.replace(char, latex)
|
|
1997
|
+
|
|
1998
|
+
return normalized
|
|
1999
|
+
|
|
2000
|
+
def _detect_math_heavy_pages(self, doc, threshold: int = 3) -> List[int]:
|
|
2001
|
+
"""
|
|
2002
|
+
Identify pages that contain significant math content.
|
|
2003
|
+
Returns a list of 1-based page numbers.
|
|
2004
|
+
"""
|
|
2005
|
+
math_pages = set()
|
|
2006
|
+
math_symbols = set("∑∫√∂∇≈≤≥∞αβγθπµσΩ∈∀∃⇒±")
|
|
2007
|
+
|
|
2008
|
+
# Efficient pass: Iterate all items once
|
|
2009
|
+
page_math_scores = {} # page_no -> score
|
|
2010
|
+
|
|
2011
|
+
for item, _ in doc.iterate_items():
|
|
2012
|
+
# Get page number (1-based)
|
|
2013
|
+
page_no = 1
|
|
2014
|
+
if hasattr(item, "prov") and item.prov:
|
|
2015
|
+
# prov is a list of Provenance items
|
|
2016
|
+
page_no = item.prov[0].page_no
|
|
2017
|
+
|
|
2018
|
+
# Check for Formula label
|
|
2019
|
+
# Docling label enum or string: "formula", "equation"
|
|
2020
|
+
label = getattr(item, "label", "").lower() if hasattr(item, "label") else ""
|
|
2021
|
+
if "formula" in label or "equation" in label:
|
|
2022
|
+
page_math_scores[page_no] = page_math_scores.get(page_no, 0) + 10 # High score for explicit label
|
|
2023
|
+
|
|
2024
|
+
# Check text content
|
|
2025
|
+
text = getattr(item, "text", "")
|
|
2026
|
+
if text:
|
|
2027
|
+
# Unicode density check
|
|
2028
|
+
symbol_count = sum(1 for char in text if char in math_symbols)
|
|
2029
|
+
|
|
2030
|
+
# Superscript/Subscript check
|
|
2031
|
+
# ranges: super (²³¹⁰...): \u00B2, \u00B3, \u00B9, \u2070-\u207F
|
|
2032
|
+
# sub (₀₁...): \u2080-\u209C
|
|
2033
|
+
sub_super_count = 0
|
|
2034
|
+
for char in text:
|
|
2035
|
+
code = ord(char)
|
|
2036
|
+
if (0x2070 <= code <= 0x207F) or (0x2080 <= code <= 0x209C) or code in [0xB2, 0xB3, 0xB9]:
|
|
2037
|
+
sub_super_count += 1
|
|
2038
|
+
|
|
2039
|
+
page_math_scores[page_no] = page_math_scores.get(page_no, 0) + symbol_count + (sub_super_count * 0.5)
|
|
2040
|
+
|
|
2041
|
+
# Filter pages exceeding threshold
|
|
2042
|
+
for page_no, score in page_math_scores.items():
|
|
2043
|
+
if score >= threshold:
|
|
2044
|
+
math_pages.add(page_no)
|
|
2045
|
+
|
|
2046
|
+
return sorted(list(math_pages))
|
|
2047
|
+
|
|
2048
|
+
def _is_enriched_page_valid(self, doc, page_no: int) -> bool:
|
|
2049
|
+
"""
|
|
2050
|
+
Check if an enriched page has valid output (detect garbled text).
|
|
2051
|
+
"""
|
|
2052
|
+
# Get text for specific page from the doc
|
|
2053
|
+
page_text = ""
|
|
2054
|
+
# iterate_items(page_no) is supported
|
|
2055
|
+
for item, _ in doc.iterate_items(page_no=page_no):
|
|
2056
|
+
page_text += getattr(item, "text", "") + " "
|
|
2057
|
+
|
|
2058
|
+
if not page_text.strip():
|
|
2059
|
+
return True # Empty page is "valid" in the sense of not garbled
|
|
2060
|
+
|
|
2061
|
+
# Check for garble markers
|
|
2062
|
+
if "/C0" in page_text or "/C1" in page_text:
|
|
2063
|
+
return False
|
|
2064
|
+
|
|
2065
|
+
return True
|