thinkpdf 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,604 @@
1
+ """Text shaping & heuristics for pdfmd.
2
+
3
+ This module transforms `PageText` structures prior to Markdown rendering.
4
+ It is *format-agnostic*: it never emits Markdown. The goal is to clean and
5
+ annotate the intermediate model so the renderer can stay simple and
6
+ predictable.
7
+
8
+ Included heuristics:
9
+ - Detect and remove repeating headers/footers across pages.
10
+ - Strip obvious drop caps (oversized first letter at paragraph start).
11
+ - Merge bullet-only lines with following text lines for better list detection.
12
+ - Detect and annotate tables with normalized rectangular grids.
13
+ - Detect and annotate mathematical expressions and equations.
14
+ - Compute body-size baselines used for heading promotion (by size).
15
+ - Provide ALL-CAPS helpers used by the renderer for heading promotion.
16
+
17
+ Transform functions return new `PageText` instances (immutability by copy), so
18
+ upstream stages can compare before and after if needed.
19
+ """
20
+
21
import re
import statistics

from collections import Counter
from dataclasses import replace
from typing import List, Optional, Tuple

from .models import PageText, Block, Line, Span, Options
from .tables import detect_tables_on_page
from .equations import annotate_math_on_page
29
+
30
+
31
+ # --------------------------- CAPS heuristics ---------------------------
32
+
33
+
34
def is_all_caps_line(s: str) -> bool:
    """Return True when every alphabetic character in the line is uppercase.

    Digits, punctuation, and surrounding whitespace are ignored. A line
    containing no alphabetic characters at all is not considered all-caps.
    """
    stripped = s.strip()
    if not stripped:
        return False

    has_letter = False
    for ch in stripped:
        if not ch.isalpha():
            continue
        has_letter = True
        # One non-uppercase letter disqualifies the whole line.
        if not ch.isupper():
            return False
    return has_letter
48
+
49
+
50
def is_mostly_caps(s: str, threshold: float = 0.7) -> bool:
    """Return True when the line's letters are mostly uppercase.

    Only alphabetic characters are counted; the line qualifies when the
    uppercase fraction is at least `threshold`. Lines with no letters
    (or only whitespace) never qualify.
    """
    stripped = s.strip()
    if not stripped:
        return False

    upper_count = 0
    letter_count = 0
    for ch in stripped:
        if ch.isalpha():
            letter_count += 1
            if ch.isupper():
                upper_count += 1

    if letter_count == 0:
        return False
    return upper_count / letter_count >= threshold
64
+
65
+
66
+ # --------------------------- Basic line helpers ---------------------------
67
+
68
+
69
+ def _line_text(line: Line) -> str:
70
+ """Join all span texts in a line and strip outer whitespace."""
71
+ return "".join(sp.text for sp in line.spans).strip()
72
+
73
+
74
+ def _first_nonblank_line_text(page: PageText) -> str:
75
+ """Return the text of the first non empty line on a page."""
76
+ for blk in page.blocks:
77
+ for ln in blk.lines:
78
+ t = _line_text(ln)
79
+ if t:
80
+ return t
81
+ return ""
82
+
83
+
84
+ def _last_nonblank_line_text(page: PageText) -> str:
85
+ """Return the text of the last non empty line on a page."""
86
+ for blk in reversed(page.blocks):
87
+ for ln in reversed(blk.lines):
88
+ t = _line_text(ln)
89
+ if t:
90
+ return t
91
+ return ""
92
+
93
+
94
+ # ------------------------- Header/footer detection -------------------------
95
+
96
+
97
# Minimum fraction of pages that must share the same normalized first/last
# line for it to count as a repeating header/footer. NOTE(review): despite
# the names, these are majority-vote fractions used in
# detect_repeating_edges, not pairwise string-similarity scores.
_HEADER_SIMILARITY_THRESHOLD = 0.8
_FOOTER_SIMILARITY_THRESHOLD = 0.8
99
+
100
+
101
+ def _normalized_text(s: str) -> str:
102
+ """Normalize text for header/footer comparison.
103
+
104
+ This strips surrounding whitespace, collapses internal whitespace,
105
+ and lowercases the result.
106
+ """
107
+ s = s.strip()
108
+ if not s:
109
+ return ""
110
+ return re.sub(r"\s+", " ", s).lower()
111
+
112
+
113
def _similarity(a: str, b: str) -> float:
    """Return a crude similarity score in [0, 1] for two strings.

    The score is the Jaccard similarity of the two word sets after
    normalization (case-folded, whitespace-collapsed). If either string
    normalizes to empty, the score is 0.0.

    Fixes: the previous docstring claimed a smoothing factor that never
    existed, and the code carried an unreachable ``union == 0`` branch
    (both word sets are guaranteed non-empty past the early return).
    """
    na = _normalized_text(a)
    nb = _normalized_text(b)
    if not na or not nb:
        return 0.0
    words_a = set(na.split())
    words_b = set(nb.split())
    # Both sets are non-empty here, so the union can never be empty.
    return len(words_a & words_b) / len(words_a | words_b)
130
+
131
+
132
def detect_repeating_edges(
    pages: List[PageText],
) -> Tuple[Optional[str], Optional[str]]:
    """Detect repeating header and footer strings across pages.

    The first and last non-blank line of every page are collected, and a
    candidate wins when its normalized form accounts for at least the
    configured fraction of all candidates — a majority vote over
    normalized text, not a pairwise similarity comparison.

    Returns:
        ``(header, footer)``: the original text of each winning line, or
        ``None`` for an edge with no stable pattern.
    """
    if not pages:
        return None, None

    heads = [t for t in (_first_nonblank_line_text(p) for p in pages) if t]
    feet = [t for t in (_last_nonblank_line_text(p) for p in pages) if t]

    # With fewer than two samples on both edges there is nothing to vote on.
    if len(heads) < 2 and len(feet) < 2:
        return None, None

    def _winner(candidates: List[str], threshold: float) -> Optional[str]:
        normalized = [_normalized_text(c) for c in candidates if c.strip()]
        if not normalized:
            return None
        top, freq = Counter(normalized).most_common(1)[0]
        if freq / len(normalized) < threshold:
            return None
        # Hand back an original (un-normalized) candidate for the winner.
        return next(
            (c for c in candidates if _normalized_text(c) == top), None
        )

    return (
        _winner(heads, _HEADER_SIMILARITY_THRESHOLD),
        _winner(feet, _FOOTER_SIMILARITY_THRESHOLD),
    )
186
+
187
+
188
+ # We also apply a couple of pattern-based cleanups for typical page numbers.
189
+
190
+ _FOOTER_DASH_PATTERN = re.compile(r"^-+\s*\d+\s*-+$")
191
+ _FOOTER_PAGENUM_PATTERN = re.compile(r"^\d+$")
192
+ _FOOTER_PAGE_LABEL_PATTERN = re.compile(r"^page\s+\d+$", re.IGNORECASE)
193
+
194
+
195
def _is_footer_noise(text: str) -> bool:
    """Return True for lines that look like bare page-number artifacts.

    Matching examples: "7", "Page 2", "---- 3 ----".
    """
    candidate = text.strip()
    if not candidate:
        return False
    return any(
        pattern.match(candidate)
        for pattern in (
            _FOOTER_DASH_PATTERN,
            _FOOTER_PAGENUM_PATTERN,
            _FOOTER_PAGE_LABEL_PATTERN,
        )
    )
214
+
215
+
216
def remove_header_footer(
    pages: List[PageText], header: Optional[str], footer: Optional[str]
) -> List[PageText]:
    """Return page copies with detected header/footer lines filtered out.

    A line is dropped when its normalized text is nearly identical
    (word-set similarity >= 0.95) to the detected header or footer, or
    when it matches a page-number noise pattern (the noise check runs even
    if no header/footer was detected). Blocks left with no lines are
    dropped entirely; pages themselves are always kept.
    """
    if not pages:
        return pages

    header_norm = _normalized_text(header) if header else ""
    footer_norm = _normalized_text(footer) if footer else ""

    def _keep(line: Line) -> bool:
        text = _line_text(line)
        norm = _normalized_text(text)
        if header and norm and _similarity(norm, header_norm) >= 0.95:
            return False
        if footer and norm and _similarity(norm, footer_norm) >= 0.95:
            return False
        return not _is_footer_noise(text)

    result: List[PageText] = []
    for page in pages:
        kept_blocks: List[Block] = []
        for block in page.blocks:
            kept_lines = [ln for ln in block.lines if _keep(ln)]
            if kept_lines:
                kept_blocks.append(replace(block, lines=kept_lines))
        result.append(replace(page, blocks=kept_blocks))
    return result
260
+
261
+
262
+ # ------------------------------- Drop caps -------------------------------
263
+
264
+
265
def _drop_cap_index(spans):
    """Return the index of a leading drop-cap span in *spans*, or None.

    A drop cap is the first non-empty span when it holds a single
    alphabetic character with a positive size at least 1.5x the median of
    the positive sizes of the remaining spans on the line.
    """
    first_idx = next(
        (i for i, sp in enumerate(spans) if sp.text.strip()), None
    )
    if first_idx is None:
        return None

    first = spans[first_idx]
    rest = spans[first_idx + 1:]
    ch = first.text.strip()
    if not (len(ch) == 1 and ch.isalpha() and first.size > 0 and rest):
        return None

    sizes = [sp.size for sp in rest if sp.size > 0]
    if not sizes:
        return None

    # statistics.median reproduces the previous manual midpoint logic.
    if first.size >= 1.5 * statistics.median(sizes):
        return first_idx
    return None


def strip_drop_caps_in_page(page: "PageText") -> "PageText":
    """Strip obvious decorative drop caps from the start of lines.

    Lines whose leading span is recognized as a drop cap (see
    ``_drop_cap_index``) have that span removed; all other lines and
    blocks are returned unchanged. A new ``PageText`` is returned; blocks
    without modifications are reused as-is.
    """
    new_blocks: List[Block] = []

    for blk in page.blocks:
        if not blk.lines:
            new_blocks.append(blk)
            continue

        modified = False
        new_lines: List[Line] = []

        for ln in blk.lines:
            idx = _drop_cap_index(ln.spans) if ln.spans else None
            if idx is None:
                new_lines.append(ln)
                continue
            # Drop-cap detected: remove that span, keep everything else.
            new_lines.append(
                replace(ln, spans=ln.spans[:idx] + ln.spans[idx + 1:])
            )
            modified = True

        new_blocks.append(replace(blk, lines=new_lines) if modified else blk)

    return replace(page, blocks=new_blocks)
337
+
338
+
339
def strip_drop_caps(pages: List[PageText]) -> List[PageText]:
    """Run drop-cap stripping over every page in *pages*."""
    return list(map(strip_drop_caps_in_page, pages))
342
+
343
+
344
+ # --------------------------- Bullet line merging ---------------------------
345
+
346
+
347
+ _BULLET_ONLY_PATTERN = re.compile(r"^[•◦◦·\-—–]\s*$")
348
+
349
+
350
def _merge_bullet_lines_in_page(page: PageText) -> PageText:
    """Merge bullet-only lines with the text line that follows them.

    Many PDFs emit the bullet glyph as its own line, with the item text on
    the next line. This merges each such pair into one logical line whose
    first span starts with "<bullet> <item text>", preserving the style of
    the following line's spans.

    Bug fix: the merged line previously contained the original bullet
    span(s) *and* the next line's first span with the bullet text already
    prepended, so the rendered line started with a doubled bullet
    ("• • item"). The merged line now holds only the (modified) next-line
    spans, so the bullet appears exactly once. The empty-next-spans
    fallback likewise no longer duplicates the bullet span.
    """
    new_blocks: List[Block] = []

    for blk in page.blocks:
        lines = blk.lines
        if not lines:
            new_blocks.append(blk)
            continue

        merged_lines: List[Line] = []
        i = 0
        n = len(lines)

        while i < n:
            ln = lines[i]
            text = _line_text(ln)

            if (
                _BULLET_ONLY_PATTERN.match(text)
                and i + 1 < n
                and _line_text(lines[i + 1])
            ):
                # Bullet-only line followed by a non-empty line.
                bullet_span = ln.spans[0] if ln.spans else None
                nxt = lines[i + 1]
                if bullet_span is None:
                    # Fallback: just keep the next line as-is.
                    merged_lines.append(nxt)
                    i += 2
                    continue

                nxt_spans = list(nxt.spans)
                if nxt_spans:
                    first_span = nxt_spans[0]
                    # Preserve the next line's style; only modify the text.
                    bullet_text = bullet_span.text.strip() or "•"
                    new_text = f"{bullet_text} {first_span.text.lstrip()}"
                    nxt_spans[0] = replace(first_span, text=new_text)
                else:
                    # No spans? Use the bullet span as a single-span line.
                    nxt_spans = [bullet_span]

                # Only the (modified) next-line spans go into the merged
                # line, so the bullet is emitted exactly once.
                merged_lines.append(replace(nxt, spans=nxt_spans))
                i += 2
                continue

            merged_lines.append(ln)
            i += 1

        new_blocks.append(replace(blk, lines=merged_lines))

    return replace(page, blocks=new_blocks)
418
+
419
+
420
def merge_bullet_lines(pages: List[PageText]) -> List[PageText]:
    """Run bullet-line merging over every page in *pages*."""
    return list(map(_merge_bullet_lines_in_page, pages))
423
+
424
+
425
+ # ------------------------------ Body sizes ------------------------------
426
+
427
+
428
def estimate_body_size(pages: "List[PageText]") -> List[float]:
    """Estimate the body-text font size for each page.

    The estimate is the median of all positive span sizes whose text is
    non-blank; pages with no usable spans fall back to 11.0 pt.

    Improvement: the hand-rolled sort/midpoint median is replaced with
    ``statistics.median``, which computes the same value (middle element
    for odd counts, mean of the two middle elements for even counts).
    """
    body_sizes: List[float] = []

    for page in pages:
        sizes = [
            sp.size
            for blk in page.blocks
            for ln in blk.lines
            for sp in ln.spans
            if sp.size > 0 and (sp.text or "").strip()
        ]
        body_sizes.append(statistics.median(sizes) if sizes else 11.0)

    return body_sizes
459
+
460
+
461
+ # --------------------------- Table detection & annotation ---------------------------
462
+
463
+
464
def _annotate_tables_on_page(page: PageText, debug: bool = False) -> PageText:
    """Detect tables on a page and annotate the matching blocks.

    Each block recognized as a table gains dynamic attributes:
      - ``is_table`` (bool): always True when set
      - ``table_grid`` (List[List[str]]): rectangular cell grid
      - ``table_type`` (str): the detection method used
      - ``table_score`` (float): detection confidence

    Args:
        page: PageText to analyze.
        debug: If True, log detection details.

    Returns:
        New PageText whose block list contains the annotated blocks.

    NOTE(review): the annotations are applied with ``setattr`` on the
    *existing* Block objects, so the input page's blocks are mutated in
    place even though a new PageText is returned — this departs from the
    module's copy-on-write convention. Confirm Block is neither frozen nor
    slots-only, or the setattr calls will raise.
    """
    detections = detect_tables_on_page(page, debug=debug)

    # Nothing detected: hand the page back untouched.
    if not detections:
        return page

    # Map block index -> TableDetection for O(1) lookup while iterating.
    table_map = {det.block_index: det for det in detections}

    new_blocks: List[Block] = []

    for idx, blk in enumerate(page.blocks):
        if idx in table_map:
            det = table_map[idx]

            # Normalize the grid so every row has the same column count.
            max_cols = max(len(row) for row in det.grid)
            normalized_grid = []
            for row in det.grid:
                if len(row) < max_cols:
                    # Pad short rows with empty strings.
                    row = row + [''] * (max_cols - len(row))
                normalized_grid.append(row)

            # Attach table metadata as dynamic attributes (mutates blk).
            setattr(blk, "is_table", True)
            setattr(blk, "table_grid", normalized_grid)
            setattr(blk, "table_type", det.detection_type)
            setattr(blk, "table_score", det.score)

            if debug:
                try:
                    from .utils import log
                    log(f"[transform] Annotated block {idx} as table "
                        f"({det.detection_type}, {det.n_rows}x{det.n_cols}, "
                        f"score={det.score:.2f})")
                except ImportError:
                    # Logging is best-effort; a missing util is not fatal.
                    pass

        new_blocks.append(blk)

    return replace(page, blocks=new_blocks)
521
+
522
+
523
def annotate_tables(pages: List[PageText], debug: bool = False) -> List[PageText]:
    """Detect and annotate tables on every page.

    Args:
        pages: Pages to process.
        debug: Forwarded to the per-page table detector for logging.

    Returns:
        Pages whose detected table blocks carry table metadata.
    """
    return [_annotate_tables_on_page(page, debug=debug) for page in pages]
534
+
535
+
536
+ # --------------------------- Main transform API ---------------------------
537
+
538
+
539
def transform_pages(
    pages: List[PageText],
    options: Options,
    debug_tables: bool = False,
) -> Tuple[List[PageText], Optional[str], Optional[str], List[float]]:
    """Run the standard transform pipeline over extracted pages.

    Stages, in order:
      1. drop-cap stripping,
      2. optional repeating header/footer detection and removal,
      3. bullet-line merging,
      4. table detection and annotation,
      5. math annotation,
      6. body font-size estimation.

    Args:
        pages: Raw extracted PageText objects.
        options: Transform options; ``remove_headers_footers`` gates stage 2.
        debug_tables: Forwarded to the table detector for logging.

    Returns:
        ``(pages_t, header, footer, body_sizes)`` — the transformed pages,
        the detected repeating header/footer strings (or ``None``), and one
        body-size baseline per page.
    """
    # Stage 1: strip decorative drop caps.
    transformed = strip_drop_caps(pages)

    # Stage 2: detect and remove repeating headers/footers when enabled.
    header: Optional[str] = None
    footer: Optional[str] = None
    if options.remove_headers_footers:
        header, footer = detect_repeating_edges(transformed)
        transformed = remove_header_footer(transformed, header, footer)

    # Stage 3: merge bullet-only lines with their item text.
    transformed = merge_bullet_lines(transformed)

    # Stage 4: detect simple text tables and annotate the blocks.
    transformed = annotate_tables(transformed, debug=debug_tables)

    # Stage 5: math annotation operates on each page in place.
    for page in transformed:
        annotate_math_on_page(page)

    # Stage 6: per-page body font-size baselines for heading promotion.
    body_sizes = estimate_body_size(transformed)

    return transformed, header, footer, body_sizes
591
+
592
+
593
# Explicit public API for star-imports and documentation tooling; keep in
# sync with the definitions above.
__all__ = [
    "is_all_caps_line",
    "is_mostly_caps",
    "detect_repeating_edges",
    "remove_header_footer",
    "strip_drop_caps_in_page",
    "strip_drop_caps",
    "merge_bullet_lines",
    "estimate_body_size",
    "annotate_tables",
    "transform_pages",
]