thinkpdf 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdfbrain/__init__.py +22 -0
- pdfbrain/app_gui.py +530 -0
- pdfbrain/cache/__init__.py +5 -0
- pdfbrain/cache/cache_manager.py +252 -0
- pdfbrain/cli.py +255 -0
- pdfbrain/core/__init__.py +6 -0
- pdfbrain/core/converter.py +332 -0
- pdfbrain/core/equations.py +635 -0
- pdfbrain/core/extract.py +469 -0
- pdfbrain/core/extractor.py +272 -0
- pdfbrain/core/models.py +196 -0
- pdfbrain/core/pipeline.py +287 -0
- pdfbrain/core/render.py +574 -0
- pdfbrain/core/tables.py +871 -0
- pdfbrain/core/transform.py +604 -0
- pdfbrain/core/utils.py +229 -0
- pdfbrain/engine.py +392 -0
- pdfbrain/mcp_server.py +315 -0
- pdfbrain/utils/__init__.py +1 -0
- thinkpdf-1.0.1.dist-info/METADATA +138 -0
- thinkpdf-1.0.1.dist-info/RECORD +25 -0
- thinkpdf-1.0.1.dist-info/WHEEL +5 -0
- thinkpdf-1.0.1.dist-info/entry_points.txt +4 -0
- thinkpdf-1.0.1.dist-info/licenses/LICENSE +620 -0
- thinkpdf-1.0.1.dist-info/top_level.txt +1 -0
pdfbrain/core/transform.py
@@ -0,0 +1,604 @@
"""Text shaping & heuristics for pdfmd.
|
|
2
|
+
|
|
3
|
+
This module transforms `PageText` structures prior to Markdown rendering.
|
|
4
|
+
It is *format-agnostic*: it never emits Markdown. The goal is to clean and
|
|
5
|
+
annotate the intermediate model so the renderer can stay simple and
|
|
6
|
+
predictable.
|
|
7
|
+
|
|
8
|
+
Included heuristics:
|
|
9
|
+
- Detect and remove repeating headers/footers across pages.
|
|
10
|
+
- Strip obvious drop caps (oversized first letter at paragraph start).
|
|
11
|
+
- Merge bullet-only lines with following text lines for better list detection.
|
|
12
|
+
- Detect and annotate tables with normalized rectangular grids.
|
|
13
|
+
- Detect and annotate mathematical expressions and equations.
|
|
14
|
+
- Compute body-size baselines used for heading promotion (by size).
|
|
15
|
+
- Provide ALL-CAPS helpers used by the renderer for heading promotion.
|
|
16
|
+
|
|
17
|
+
Transform functions return new `PageText` instances (immutability by copy), so
|
|
18
|
+
upstream stages can compare before and after if needed.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from collections import Counter
|
|
22
|
+
from dataclasses import replace
|
|
23
|
+
from typing import List, Optional, Tuple
|
|
24
|
+
import re
|
|
25
|
+
|
|
26
|
+
from .models import PageText, Block, Line, Span, Options
|
|
27
|
+
from .tables import detect_tables_on_page
|
|
28
|
+
from .equations import annotate_math_on_page
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# --------------------------- CAPS heuristics ---------------------------


def is_all_caps_line(s: str) -> bool:
    """Return True if every alphabetic character in the line is uppercase.

    Digits and punctuation are ignored; whitespace is stripped at both ends.
    """
    s = s.strip()
    if not s:
        return False

    letters = [ch for ch in s if ch.isalpha()]
    if not letters:
        return False

    return all(ch.isupper() for ch in letters)


def is_mostly_caps(s: str, threshold: float = 0.7) -> bool:
    """Return True if a line is mostly capitalized alphabetic characters.

    We count alphabetic characters only and consider the line "mostly caps"
    if the fraction of uppercase letters is >= `threshold`.
    """
    s = s.strip()
    if not s:
        return False

    letters = [ch for ch in s if ch.isalpha()]
    if not letters:
        return False
    return sum(1 for ch in letters if ch.isupper()) / len(letters) >= threshold

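# Quick illustration of the two helpers (values follow directly from the
# definitions above):
#
#   is_all_caps_line("SECTION 2.1")      -> True   (digits/punctuation ignored)
#   is_all_caps_line("Section 2.1")      -> False
#   is_mostly_caps("MOSTLY CAPS text")   -> True   (10/14 letters uppercase)
#   is_mostly_caps("lowercase line")     -> False
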
# --------------------------- Basic line helpers ---------------------------


def _line_text(line: Line) -> str:
    """Join all span texts in a line and strip outer whitespace."""
    return "".join(sp.text for sp in line.spans).strip()


def _first_nonblank_line_text(page: PageText) -> str:
    """Return the text of the first non-empty line on a page."""
    for blk in page.blocks:
        for ln in blk.lines:
            t = _line_text(ln)
            if t:
                return t
    return ""


def _last_nonblank_line_text(page: PageText) -> str:
    """Return the text of the last non-empty line on a page."""
    for blk in reversed(page.blocks):
        for ln in reversed(blk.lines):
            t = _line_text(ln)
            if t:
                return t
    return ""

# ------------------------- Header/footer detection -------------------------


# Fraction of pages whose first/last non-empty line must share the same
# normalized text for it to count as a repeating header/footer.
_HEADER_SIMILARITY_THRESHOLD = 0.8
_FOOTER_SIMILARITY_THRESHOLD = 0.8


def _normalized_text(s: str) -> str:
    """Normalize text for header/footer comparison.

    This strips surrounding whitespace, collapses internal whitespace,
    and lowercases the result.
    """
    s = s.strip()
    if not s:
        return ""
    return re.sub(r"\s+", " ", s).lower()

def _similarity(a: str, b: str) -> float:
    """Return a crude similarity score between 0 and 1 for two strings.

    We compute a token-based Jaccard similarity over words, guarding
    explicitly against empty token sets to avoid zero-division.
    """
    na = _normalized_text(a)
    nb = _normalized_text(b)
    if not na or not nb:
        return 0.0
    sa = set(na.split())
    sb = set(nb.split())
    inter = len(sa & sb)
    union = len(sa | sb)
    if union == 0:
        return 0.0
    return inter / union

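# Illustrative values (token Jaccard, computed from the definition above):
#
#   _similarity("ACME Corp  Annual Report", "acme corp annual report") -> 1.0
#   _similarity("Chapter One", "Chapter Two")                          -> 1/3
#   _similarity("", "anything")                                        -> 0.0
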
def detect_repeating_edges(
    pages: List[PageText],
) -> Tuple[Optional[str], Optional[str]]:
    """Detect repeating header and footer strings across pages.

    We look at the first and last non-empty line of each page and compute a
    majority candidate by normalized text. If the fraction of pages sharing
    the same normalized header/footer is at or above the threshold, we
    return the canonical string as the detected header/footer.

    Returns:
        (header, footer) where each is either a string or None if no stable
        pattern could be found.
    """
    if not pages:
        return None, None

    header_candidates: List[str] = []
    footer_candidates: List[str] = []

    for p in pages:
        h = _first_nonblank_line_text(p)
        f = _last_nonblank_line_text(p)
        if h:
            header_candidates.append(h)
        if f:
            footer_candidates.append(f)

    if len(header_candidates) < 2 and len(footer_candidates) < 2:
        return None, None

    def _majority(candidates: List[str], threshold: float) -> Optional[str]:
        if not candidates:
            return None

        normalized = [_normalized_text(c) for c in candidates if c.strip()]
        if not normalized:
            return None

        counts = Counter(normalized)
        most_common, freq = counts.most_common(1)[0]
        frac = freq / len(normalized)
        if frac < threshold:
            return None

        # Return one original candidate that matches the normalized winner.
        for c in candidates:
            if _normalized_text(c) == most_common:
                return c
        return None

    header = _majority(header_candidates, _HEADER_SIMILARITY_THRESHOLD)
    footer = _majority(footer_candidates, _FOOTER_SIMILARITY_THRESHOLD)
    return header, footer

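# Majority-vote sketch: if the first lines of four pages are
#   ["ACME Report", "ACME Report", "ACME Report", "Introduction"]
# the normalized winner "acme report" covers 3/4 = 0.75 of the pages, which
# is below the 0.8 threshold, so no header is reported; on 4/4 pages it
# would be returned verbatim as "ACME Report".
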
# We also apply a couple of pattern-based cleanups for typical page numbers.

# Dash-decorated page numbers such as "- - 1" or "---- 3 ----": leading
# dashes (with optional interior spaces) are required, trailing dashes
# are optional.
_FOOTER_DASH_PATTERN = re.compile(r"^(?:-\s*)+\d+(?:\s*-+)*$")
_FOOTER_PAGENUM_PATTERN = re.compile(r"^\d+$")
_FOOTER_PAGE_LABEL_PATTERN = re.compile(r"^page\s+\d+$", re.IGNORECASE)


def _is_footer_noise(text: str) -> bool:
    """Heuristic for noisy footer or header artifacts at the bottom of a page.

    Examples:
        "- - 1"
        "- - 2"
        "Page 2"
        "---- 3 ----"
    """
    s = text.strip()
    if not s:
        return False
    if _FOOTER_DASH_PATTERN.match(s):
        return True
    if _FOOTER_PAGENUM_PATTERN.match(s):
        return True
    if _FOOTER_PAGE_LABEL_PATTERN.match(s):
        return True
    return False

def remove_header_footer(
    pages: List[PageText], header: Optional[str], footer: Optional[str]
) -> List[PageText]:
    """Return copies of pages with matching header or footer lines removed.

    We compare the joined text of each line to the detected strings and also
    apply some light pattern-based cleanup for common footer artifacts like
    "- - 1" or "---- 7 ----".
    """
    if not pages:
        return pages

    header_norm = _normalized_text(header) if header else ""
    footer_norm = _normalized_text(footer) if footer else ""

    out_pages: List[PageText] = []

    for p in pages:
        new_blocks: List[Block] = []

        for blk in p.blocks:
            new_lines: List[Line] = []
            for ln in blk.lines:
                text = _line_text(ln)
                norm = _normalized_text(text)

                # Strip header if it matches (or is very close).
                if header and norm and _similarity(norm, header_norm) >= 0.95:
                    continue

                # Strip footer if it matches (or is noise).
                if footer and norm and _similarity(norm, footer_norm) >= 0.95:
                    continue
                if _is_footer_noise(text):
                    continue

                new_lines.append(ln)

            if new_lines:
                new_blocks.append(replace(blk, lines=new_lines))

        out_pages.append(replace(p, blocks=new_blocks))

    return out_pages

# ------------------------------- Drop caps -------------------------------


def strip_drop_caps_in_page(page: PageText) -> PageText:
    """Strip obvious decorative drop caps from lines in a page.

    Heuristic: if the first non-empty span of a line is a single alphabetic
    character whose font size is at least 1.5x the median size of the rest
    of the line, we treat it as a decorative drop cap and remove it.
    """
    new_blocks: List[Block] = []

    for blk in page.blocks:
        lines = blk.lines
        if not lines:
            new_blocks.append(blk)
            continue

        modified = False
        new_lines: List[Line] = []

        for ln in lines:
            spans = ln.spans
            if not spans:
                new_lines.append(ln)
                continue

            # Find first non-empty span.
            first_idx = None
            for i, sp in enumerate(spans):
                if sp.text.strip():
                    first_idx = i
                    break

            if first_idx is None:
                new_lines.append(ln)
                continue

            first = spans[first_idx]
            rest = spans[first_idx + 1 :]

            if (
                len(first.text.strip()) == 1
                and first.text.strip().isalpha()
                and first.size > 0
                and rest
            ):
                # Compute median size of rest-of-line.
                sizes = [sp.size for sp in rest if sp.size > 0]
                if sizes:
                    sizes_sorted = sorted(sizes)
                    mid = len(sizes_sorted) // 2
                    if len(sizes_sorted) % 2 == 1:
                        median = sizes_sorted[mid]
                    else:
                        median = 0.5 * (
                            sizes_sorted[mid - 1] + sizes_sorted[mid]
                        )

                    if first.size >= 1.5 * median:
                        # Drop cap detected: remove this span.
                        new_spans = spans[:first_idx] + rest
                        new_ln = replace(ln, spans=new_spans)
                        new_lines.append(new_ln)
                        modified = True
                        continue

            new_lines.append(ln)

        if modified:
            new_blocks.append(replace(blk, lines=new_lines))
        else:
            new_blocks.append(blk)

    return replace(page, blocks=new_blocks)

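# Worked example of the size test: for a line with spans whose (text, size)
# pairs are ("T", 30.0) and ("he first paragraph ...", 12.0), the
# rest-of-line median is 12.0 and 30.0 >= 1.5 * 12.0, so the oversized "T"
# span is removed as decoration and the line keeps only the body-sized spans.
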
def strip_drop_caps(pages: List[PageText]) -> List[PageText]:
    """Apply `strip_drop_caps_in_page` to all pages."""
    return [strip_drop_caps_in_page(p) for p in pages]

# --------------------------- Bullet line merging ---------------------------


# A line consisting of a single bullet glyph or dash plus optional whitespace.
_BULLET_ONLY_PATTERN = re.compile(r"^[•◦·\-—–]\s*$")

def _merge_bullet_lines_in_page(page: PageText) -> PageText:
    """Merge bullet-only lines with their following text lines.

    Many PDFs encode bullets as one line containing only "•" and the actual
    item text on the next line:

        •

        This is the first bullet item.

    We instead want a single logical line that starts with "• " followed
    by the item text.
    """
    new_blocks: List[Block] = []

    for blk in page.blocks:
        lines = blk.lines
        if not lines:
            new_blocks.append(blk)
            continue

        merged_lines: List[Line] = []
        i = 0
        n = len(lines)

        while i < n:
            ln = lines[i]
            text = _line_text(ln)

            if (
                _BULLET_ONLY_PATTERN.match(text)
                and i + 1 < n
                and _line_text(lines[i + 1])
            ):
                # Bullet-only line followed by a non-empty line.
                bullet_span = ln.spans[0] if ln.spans else None
                nxt = lines[i + 1]
                if bullet_span is None:
                    # Fallback: just keep the next line as-is.
                    merged_lines.append(nxt)
                    i += 2
                    continue

                # Prepend bullet span text + a space to the next line's first span.
                nxt_spans = list(nxt.spans)
                if nxt_spans:
                    first_span = nxt_spans[0]
                    # Preserve style of the next line; only modify text.
                    bullet_text = bullet_span.text.strip() or "•"
                    new_text = f"{bullet_text} {first_span.text.lstrip()}"
                    nxt_spans[0] = replace(first_span, text=new_text)
                else:
                    # No spans? Use the bullet span as a single-span line.
                    nxt_spans = [bullet_span]

                # The bullet glyph is already prepended to the first span, so
                # the merged line uses only `nxt_spans`; re-adding `ln.spans`
                # here would duplicate the bullet.
                merged_ln = replace(nxt, spans=nxt_spans)
                merged_lines.append(merged_ln)
                i += 2
                continue

            merged_lines.append(ln)
            i += 1

        new_blocks.append(replace(blk, lines=merged_lines))

    return replace(page, blocks=new_blocks)

def merge_bullet_lines(pages: List[PageText]) -> List[PageText]:
    """Apply `_merge_bullet_lines_in_page` to all pages."""
    return [_merge_bullet_lines_in_page(p) for p in pages]

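# Before/after sketch of the merge (spans shown as their text only):
#
#   before:  Line("•"), Line("First item text")
#   after:   Line("• First item text")
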
# ------------------------------ Body sizes ------------------------------


def estimate_body_size(pages: List[PageText]) -> List[float]:
    """Estimate a body text font size per page.

    We collect all non-empty span sizes on each page and take the median.
    If a page has no spans with a positive size, we fall back to 11.0.
    """
    body_sizes: List[float] = []

    for p in pages:
        sizes = [
            sp.size
            for blk in p.blocks
            for ln in blk.lines
            for sp in ln.spans
            if sp.size > 0 and (sp.text or "").strip()
        ]

        if not sizes:
            body_sizes.append(11.0)
            continue

        sizes_sorted = sorted(sizes)
        mid = len(sizes_sorted) // 2
        if len(sizes_sorted) % 2 == 1:
            median = sizes_sorted[mid]
        else:
            median = 0.5 * (sizes_sorted[mid - 1] + sizes_sorted[mid])

        body_sizes.append(median)

    return body_sizes

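# Example: a page whose non-empty spans have sizes [9, 11, 11, 11, 24]
# (a caption, body text, and a heading) yields a median of 11.0; the
# renderer can later promote the 24pt line to a heading relative to that
# baseline.
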
# --------------------------- Table detection & annotation ---------------------------


def _annotate_tables_on_page(page: PageText, debug: bool = False) -> PageText:
    """Detect tables and annotate blocks with table metadata.

    For each detected table, the corresponding block(s) get dynamic attributes:
    - is_table: bool = True
    - table_grid: List[List[str]] = normalized rectangular grid
    - table_type: str = detection method used
    - table_score: float = confidence score

    Note: the attributes are set on the existing blocks in place, a
    deliberate exception to the module's copy-on-transform convention.

    Args:
        page: PageText to analyze
        debug: If True, log detection details

    Returns:
        New PageText with annotated blocks
    """
    detections = detect_tables_on_page(page, debug=debug)

    if not detections:
        return page

    # Build a mapping of block_index -> TableDetection
    table_map = {det.block_index: det for det in detections}

    new_blocks: List[Block] = []

    for idx, blk in enumerate(page.blocks):
        if idx in table_map:
            det = table_map[idx]

            # Normalize grid to ensure rectangular structure
            # (`default=0` guards against an empty grid).
            max_cols = max((len(row) for row in det.grid), default=0)
            normalized_grid = []
            for row in det.grid:
                if len(row) < max_cols:
                    # Pad short rows with empty strings
                    row = row + [''] * (max_cols - len(row))
                normalized_grid.append(row)

            # Attach table metadata as dynamic attributes
            setattr(blk, "is_table", True)
            setattr(blk, "table_grid", normalized_grid)
            setattr(blk, "table_type", det.detection_type)
            setattr(blk, "table_score", det.score)

            if debug:
                try:
                    from .utils import log
                    log(f"[transform] Annotated block {idx} as table "
                        f"({det.detection_type}, {det.n_rows}x{det.n_cols}, "
                        f"score={det.score:.2f})")
                except ImportError:
                    pass

        new_blocks.append(blk)

    return replace(page, blocks=new_blocks)

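# Grid normalization sketch: a ragged detection grid
#   [["Name", "Qty", "Price"], ["Apples", "3"]]
# is padded to
#   [["Name", "Qty", "Price"], ["Apples", "3", ""]]
# so every row has max_cols cells before rendering.
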
def annotate_tables(pages: List[PageText], debug: bool = False) -> List[PageText]:
    """Detect and annotate tables across all pages.

    Args:
        pages: List of PageText objects to process
        debug: If True, enable debug logging for table detection

    Returns:
        List of PageText objects with table-annotated blocks
    """
    return [_annotate_tables_on_page(p, debug=debug) for p in pages]

# --------------------------- Main transform API ---------------------------


def transform_pages(
    pages: List[PageText],
    options: Options,
    debug_tables: bool = False,
) -> Tuple[List[PageText], Optional[str], Optional[str], List[float]]:
    """Run the standard transform pipeline.

    Pipeline stages:
    1. Strip decorative drop caps
    2. Detect and remove repeating headers/footers (if enabled)
    3. Merge bullet-only lines with following text
    4. Detect and annotate tables
    5. Detect and annotate mathematical expressions
    6. Compute body font size baselines

    Args:
        pages: Raw extracted PageText objects
        options: Transformation options (from models.Options)
        debug_tables: Enable debug logging for table detection

    Returns:
        Tuple of:
        - pages_t: Transformed pages with annotations
        - header: Detected repeating header string (or None)
        - footer: Detected repeating footer string (or None)
        - body_sizes: Per-page body font size baselines
    """
    # 1. Strip decorative drop caps.
    pages_t = strip_drop_caps(pages)

    # 2. Detect repeating header or footer and remove them (if enabled).
    header: Optional[str] = None
    footer: Optional[str] = None

    if options.remove_headers_footers:
        header, footer = detect_repeating_edges(pages_t)
        pages_t = remove_header_footer(pages_t, header, footer)

    # 3. Merge bullet-only lines with following text lines for list detection.
    pages_t = merge_bullet_lines(pages_t)

    # 4. Detect simple text tables and annotate blocks.
    pages_t = annotate_tables(pages_t, debug=debug_tables)

    # 5. Detect and annotate math equations and expressions (in place).
    for page in pages_t:
        annotate_math_on_page(page)

    # 6. Compute per-page body font size baselines for heading promotion.
    body_sizes = estimate_body_size(pages_t)

    return pages_t, header, footer, body_sizes

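# Typical call site (a minimal sketch; `extract_pages` and `render_markdown`
# are stand-ins for the extractor/renderer stages, and `Options()` is assumed
# to construct with usable defaults -- none of these names are verified here):
#
#   pages = extract_pages(doc)                 # List[PageText]
#   pages_t, header, footer, body_sizes = transform_pages(pages, Options())
#   md = render_markdown(pages_t, body_sizes)  # renderer consumes annotations
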
__all__ = [
    "is_all_caps_line",
    "is_mostly_caps",
    "detect_repeating_edges",
    "remove_header_footer",
    "strip_drop_caps_in_page",
    "strip_drop_caps",
    "merge_bullet_lines",
    "estimate_body_size",
    "annotate_tables",
    "transform_pages",
]