thinkpdf 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,635 @@
1
+ from __future__ import annotations
2
+
3
+ """
4
+ Math detection and LaTeX-style normalization for pdfmd.
5
+
6
+ This module works purely on the intermediate text model defined in
7
+ models.PageText / Block / Line / Span. It does **not** depend on any
8
+ PDF geometry and intentionally avoids heavy dependencies.
9
+
10
+ Goals
11
+ -----
12
+
13
+ 1. Detect lines that *behave like* mathematical content:
14
+ - Display equations (standalone lines).
15
+ - Inline-style math hiding inside text lines.
16
+ - Existing LaTeX math that should be preserved.
17
+
18
+ 2. Normalize math text toward LaTeX-friendly syntax:
19
+ - Map Unicode Greek letters to \\alpha, \\beta, ...
20
+ - Map ≤, ≥, ≠, ∞, ∑, ∫, √, ×, ·, etc. to LaTeX commands.
21
+ - Map superscript/subscript digits (x², a₁₀) to x^{2}, a_{10}.
22
+ - Avoid Markdown escaping here — math should be passed as-is,
23
+ then wrapped by the renderer using `$...$` or `$$...$$`.
24
+
25
+ 3. Stay conservative:
26
+ - Prefer to miss ambiguous prose rather than misclassify it as math.
27
+ - Treat a line as a display equation only if its "math density"
28
+ and structure strongly suggest it.
29
+
30
+ Integration Sketch
31
+ ------------------
32
+
33
+ Typical integration in `transform.py` might look like:
34
+
35
+ from .equations import annotate_math_on_page
36
+
37
+ for page in pages:
38
+ annotate_math_on_page(page)
39
+
40
+ Then, in `render.py`, inside `_block_to_lines`, you can check each Line
41
+ **before** escaping Markdown:
42
+
43
+ if getattr(line, "is_math", False):
44
+ tex = getattr(line, "math_tex", "").strip()
45
+ kind = getattr(line, "math_kind", "display")
46
+ if tex:
47
+ if kind == "display":
48
+ md_lines.append(f"$$\\n{tex}\\n$$")
49
+ else:
50
+ md_lines.append(f"${tex}$")
51
+ continue # IMPORTANT: skip normal escaping / processing
52
+
53
+ This module intentionally stops short of that Markdown wrapping so you
54
+ can tune the behaviour per project.
55
+ """
56
+
57
+ from dataclasses import dataclass
58
+ from typing import List, Optional, Tuple
59
+
60
+ import re
61
+
62
+ from .models import PageText, Block, Line
63
+
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # Data structures
67
+ # ---------------------------------------------------------------------------
68
+
69
+
70
@dataclass
class MathDetection:
    """
    One detected math line, addressed by its position on the page.

    Fields
    ------
    block_index:
        Position of the owning Block in ``PageText.blocks``.
    line_index:
        Position of the Line in that block's ``lines``.
    kind:
        ``"display"`` or ``"inline"``.
    raw:
        The line text exactly as joined from its spans.
    tex:
        The LaTeX-oriented rewrite of ``raw``. No ``$`` / ``$$`` wrapping
        is added here; wrapping is left to the renderer.
    """

    # Where the line lives on its page.
    block_index: int
    line_index: int
    # "display" | "inline"
    kind: str
    # Source text and its normalized counterpart.
    raw: str
    tex: str
95
+
96
+
97
+ # ---------------------------------------------------------------------------
98
+ # Unicode → LaTeX maps
99
+ # ---------------------------------------------------------------------------
100
+
101
+ # Common Greek letters used in math.
102
+ _GREEK_MAP = {
103
+ "α": r"\alpha",
104
+ "β": r"\beta",
105
+ "γ": r"\gamma",
106
+ "δ": r"\delta",
107
+ "ε": r"\epsilon",
108
+ "ζ": r"\zeta",
109
+ "η": r"\eta",
110
+ "θ": r"\theta",
111
+ "ι": r"\iota",
112
+ "κ": r"\kappa",
113
+ "λ": r"\lambda",
114
+ "μ": r"\mu",
115
+ "ν": r"\nu",
116
+ "ξ": r"\xi",
117
+ "ο": r"o",
118
+ "π": r"\pi",
119
+ "ρ": r"\rho",
120
+ "σ": r"\sigma",
121
+ "τ": r"\tau",
122
+ "υ": r"\upsilon",
123
+ "φ": r"\phi",
124
+ "χ": r"\chi",
125
+ "ψ": r"\psi",
126
+ "ω": r"\omega",
127
+ "Γ": r"\Gamma",
128
+ "Δ": r"\Delta",
129
+ "Θ": r"\Theta",
130
+ "Λ": r"\Lambda",
131
+ "Ξ": r"\Xi",
132
+ "Π": r"\Pi",
133
+ "Σ": r"\Sigma",
134
+ "Υ": r"\Upsilon",
135
+ "Φ": r"\Phi",
136
+ "Ψ": r"\Psi",
137
+ "Ω": r"\Omega",
138
+ }
139
+
140
+ # Superscript and subscript digits / operators used in math.
141
+ _SUPERSCRIPT_MAP = {
142
+ "⁰": "0",
143
+ "¹": "1",
144
+ "²": "2",
145
+ "³": "3",
146
+ "⁴": "4",
147
+ "⁵": "5",
148
+ "⁶": "6",
149
+ "⁷": "7",
150
+ "⁸": "8",
151
+ "⁹": "9",
152
+ "⁺": "+",
153
+ "⁻": "-",
154
+ "⁼": "=",
155
+ "⁽": "(",
156
+ "⁾": ")",
157
+ }
158
+
159
+ _SUBSCRIPT_MAP = {
160
+ "₀": "0",
161
+ "₁": "1",
162
+ "₂": "2",
163
+ "₃": "3",
164
+ "₄": "4",
165
+ "₅": "5",
166
+ "₆": "6",
167
+ "₇": "7",
168
+ "₈": "8",
169
+ "₉": "9",
170
+ "₊": "+",
171
+ "₋": "-",
172
+ "₌": "=",
173
+ "₍": "(",
174
+ "₎": ")",
175
+ }
176
+
177
+ # Misc Unicode math symbols to LaTeX.
178
+ _UNICODE_MATH_MAP = {
179
+ "≤": r"\leq",
180
+ "≥": r"\geq",
181
+ "≠": r"\ne",
182
+ "≈": r"\approx",
183
+ "≃": r"\simeq",
184
+ "≡": r"\equiv",
185
+ "∞": r"\infty",
186
+ "∑": r"\sum",
187
+ "∏": r"\prod",
188
+ "∫": r"\int",
189
+ "∮": r"\oint",
190
+ "√": r"\sqrt",
191
+ "∂": r"\partial",
192
+ "∇": r"\nabla",
193
+ "∈": r"\in",
194
+ "∉": r"\notin",
195
+ "⊂": r"\subset",
196
+ "⊆": r"\subseteq",
197
+ "⊃": r"\supset",
198
+ "⊇": r"\supseteq",
199
+ "⋂": r"\cap",
200
+ "⋃": r"\cup",
201
+ "∧": r"\wedge",
202
+ "∨": r"\vee",
203
+ "¬": r"\neg",
204
+ "⇒": r"\Rightarrow",
205
+ "→": r"\to",
206
+ "←": r"\leftarrow",
207
+ "⇔": r"\Leftrightarrow",
208
+ "↦": r"\mapsto",
209
+ "⊕": r"\oplus",
210
+ "⊗": r"\otimes",
211
+ "⊙": r"\odot",
212
+ "±": r"\pm",
213
+ "∓": r"\mp",
214
+ "×": r"\times",
215
+ "·": r"\cdot",
216
+ "∝": r"\propto",
217
+ }
218
+
219
+ # Characters that make a line "mathy".
220
+ _MATH_OPERATORS = set("=<>+-*/^_")
221
+ _MATH_PARENS = set("()[]{}")
222
+
223
+
224
+ # ---------------------------------------------------------------------------
225
+ # Core helpers
226
+ # ---------------------------------------------------------------------------
227
+
228
+
229
+ def _line_text(line: Line) -> str:
230
+ """Join spans into a raw line string."""
231
+ return "".join(sp.text or "" for sp in line.spans).rstrip("\n")
232
+
233
+
234
+ def _normalize_unicode_math(text: str) -> str:
235
+ """
236
+ Map Unicode Greek / superscripts / subscripts / math symbols to LaTeX-ish.
237
+
238
+ We intentionally do **not** escape for Markdown here. The renderer should
239
+ decide whether to escape or bypass escaping for math segments.
240
+
241
+ Examples:
242
+ "α + β²" -> "\\alpha + \\beta^2"
243
+ "x₁₀²" -> "x_{10}^2"
244
+ """
245
+ if not text:
246
+ return text
247
+
248
+ out: List[str] = []
249
+ i = 0
250
+ n = len(text)
251
+
252
+ while i < n:
253
+ ch = text[i]
254
+
255
+ # Greek letters
256
+ if ch in _GREEK_MAP:
257
+ out.append(_GREEK_MAP[ch])
258
+ i += 1
259
+ continue
260
+
261
+ # Superscripts → collect contiguous run → ^{...} or ^x
262
+ if ch in _SUPERSCRIPT_MAP:
263
+ sup_chars: List[str] = []
264
+ while i < n and text[i] in _SUPERSCRIPT_MAP:
265
+ sup_chars.append(_SUPERSCRIPT_MAP[text[i]])
266
+ i += 1
267
+ sup_text = "".join(sup_chars)
268
+ if len(sup_text) > 1:
269
+ out.append(f"^{{{sup_text}}}")
270
+ else:
271
+ out.append("^" + sup_text)
272
+ continue
273
+
274
+ # Subscripts → collect contiguous run → _{...} or _x
275
+ if ch in _SUBSCRIPT_MAP:
276
+ sub_chars: List[str] = []
277
+ while i < n and text[i] in _SUBSCRIPT_MAP:
278
+ sub_chars.append(_SUBSCRIPT_MAP[text[i]])
279
+ i += 1
280
+ sub_text = "".join(sub_chars)
281
+ if len(sub_text) > 1:
282
+ out.append(f"_{{{sub_text}}}")
283
+ else:
284
+ out.append("_" + sub_text)
285
+ continue
286
+
287
+ # Generic math symbols
288
+ if ch in _UNICODE_MATH_MAP:
289
+ out.append(_UNICODE_MATH_MAP[ch])
290
+ i += 1
291
+ continue
292
+
293
+ # Normal character
294
+ out.append(ch)
295
+ i += 1
296
+
297
+ return "".join(out)
298
+
299
+
300
+ _LATEX_MATH_HINT_RE = re.compile(
301
+ r"(\$[^$]+\$|\\\(|\\\[|\\begin\{(equation|align|gather|multline)\})"
302
+ )
303
+
304
+
305
+ def _contains_explicit_latex(text: str) -> bool:
306
+ """Detect if the line already contains LaTeX math delimiters."""
307
+ if "$" in text:
308
+ return True
309
+ if "\\(" in text or "\\)" in text:
310
+ return True
311
+ if "\\[" in text or "\\]" in text:
312
+ return True
313
+ if "\\begin{" in text and "}" in text:
314
+ return True
315
+ return bool(_LATEX_MATH_HINT_RE.search(text))
316
+
317
+
318
+ def _math_density(text: str) -> float:
319
+ """
320
+ Return a crude "mathiness" score between 0 and 1.
321
+
322
+ We count digits, math operators, parentheses, Greek, and known
323
+ Unicode math symbols as "math characters".
324
+ """
325
+ non_space = [c for c in text if not c.isspace()]
326
+ if not non_space:
327
+ return 0.0
328
+
329
+ math_chars = 0
330
+ for c in non_space:
331
+ if c.isdigit():
332
+ math_chars += 1
333
+ continue
334
+ if c in _MATH_OPERATORS or c in _MATH_PARENS:
335
+ math_chars += 1
336
+ continue
337
+ if c in _GREEK_MAP or c in _UNICODE_MATH_MAP:
338
+ math_chars += 1
339
+ continue
340
+
341
+ return math_chars / float(len(non_space))
342
+
343
+
344
+ _EQ_OPERATOR_RE = re.compile(r"(=|≤|≥|≠|≈|≃|⇒|→|⇔|↦)")
345
+
346
+
347
+ def _looks_like_equation(text: str) -> bool:
348
+ """
349
+ Heuristic: does this line look like a standalone equation?
350
+
351
+ Signals:
352
+ - Contains an equality/comparison symbol.
353
+ - Has reasonably high math density.
354
+ - Not clearly a sentence (few long words, no trailing full stop).
355
+ """
356
+ s = text.strip()
357
+ if not s:
358
+ return False
359
+
360
+ if not _EQ_OPERATOR_RE.search(s):
361
+ # Many equations have '=', '<=', '>=', etc.
362
+ return False
363
+
364
+ density = _math_density(s)
365
+ if density < 0.4:
366
+ return False
367
+
368
+ # Avoid obvious prose: many words + period at the end.
369
+ words = s.split()
370
+ if len(words) >= 7 and s.endswith("."):
371
+ return False
372
+
373
+ return True
374
+
375
+
376
+ def _looks_math_heavy_inline(text: str) -> bool:
377
+ """
378
+ Heuristic for a line that is mostly prose but contains math segments.
379
+
380
+ We look for:
381
+ - Non-trivial math density.
382
+ - Presence of typical math operators or Greek letters.
383
+ """
384
+ s = text.strip()
385
+ if not s:
386
+ return False
387
+
388
+ density = _math_density(s)
389
+ if density < 0.25:
390
+ return False
391
+
392
+ if any(ch in s for ch in "=<>±×÷") or any(ch in s for ch in _GREEK_MAP.keys()):
393
+ return True
394
+
395
+ return False
396
+
397
+
398
+ def _split_inline_math_segments(text: str) -> List[Tuple[int, int]]:
399
+ """
400
+ Very lightweight segmentation into "mathy" spans inside a line.
401
+
402
+ Returns a list of (start, end) indices for substrings that appear
403
+ math-heavy relative to their surroundings.
404
+
405
+ This is intentionally simple: we scan for runs containing at least
406
+ one operator and at least one digit or Greek letter.
407
+ """
408
+ spans: List[Tuple[int, int]] = []
409
+ n = len(text)
410
+ i = 0
411
+
412
+ while i < n:
413
+ # Skip whitespace
414
+ while i < n and text[i].isspace():
415
+ i += 1
416
+ start = i
417
+
418
+ has_op = False
419
+ has_digit_or_greek = False
420
+
421
+ while i < n and not text[i].isspace():
422
+ ch = text[i]
423
+ if ch in _MATH_OPERATORS or ch in _UNICODE_MATH_MAP or ch in _EQ_OPERATOR_RE.pattern:
424
+ has_op = True
425
+ if ch.isdigit() or ch in _GREEK_MAP:
426
+ has_digit_or_greek = True
427
+ i += 1
428
+
429
+ end = i
430
+ if end > start and has_op and has_digit_or_greek:
431
+ spans.append((start, end))
432
+
433
+ # Move past any trailing whitespace
434
+ while i < n and text[i].isspace():
435
+ i += 1
436
+
437
+ return spans
438
+
439
+
440
+ def _is_display_candidate(text: str) -> bool:
441
+ """
442
+ Decide whether a line should be treated as a display equation.
443
+
444
+ A line is a display candidate if:
445
+ - It "looks like" an equation, OR
446
+ - It contains explicit LaTeX math and is short.
447
+ """
448
+ s = text.strip()
449
+ if not s:
450
+ return False
451
+
452
+ if _looks_like_equation(s):
453
+ return True
454
+
455
+ if _contains_explicit_latex(s):
456
+ # If the whole line is relatively short and math-heavy, prefer display.
457
+ if len(s) <= 80 and _math_density(s) >= 0.35:
458
+ return True
459
+
460
+ return False
461
+
462
+
463
+ def _non_empty_line_texts(block: Block) -> List[str]:
464
+ """Utility mirror of tables._non_empty_line_texts for reuse if needed."""
465
+ texts: List[str] = []
466
+ for ln in block.lines:
467
+ t = _line_text(ln)
468
+ if t.strip():
469
+ texts.append(t)
470
+ return texts
471
+
472
+
473
+ # ---------------------------------------------------------------------------
474
+ # Public API
475
+ # ---------------------------------------------------------------------------
476
+
477
+
478
def convert_math_text(text: str) -> str:
    """
    Rewrite a math string into a more LaTeX-friendly form.

    Unicode Greek letters, superscripts, subscripts and math symbols are
    converted via ``_normalize_unicode_math``; all other characters,
    including LaTeX commands already present, pass through untouched.
    No Markdown escaping is applied — that is the renderer's job.

    Falsy input (e.g. an empty string) is returned as-is. The function can
    be applied to whole equations and to inline fragments alike.
    """
    return _normalize_unicode_math(text) if text else text
492
+
493
+
494
def detect_math_on_page(page: PageText) -> List[MathDetection]:
    """
    Find the math-like lines of one page.

    Every non-blank line is classified by the first rule that applies:
      1. It contains explicit LaTeX math → flagged; "display" if it is also
         a display candidate, otherwise "inline". The whole line is
         normalized.
      2. It is a display candidate → "display", whole line normalized.
      3. It is math-heavy prose → "inline". Only the math segments found
         by ``_split_inline_math_segments`` are normalized and the
         surrounding prose is copied verbatim; if no segment is found the
         whole line is normalized instead.

    Lines matching none of the rules produce no detection.

    Returns:
        Detections in page order (block by block, line by line), with
        indices relative to ``page.blocks`` and each block's ``lines``.
    """
    found: List[MathDetection] = []

    for block_no, block in enumerate(page.blocks):
        for line_no, line in enumerate(block.lines):
            raw = _line_text(line)
            if not raw.strip():
                continue

            if _contains_explicit_latex(raw):
                # Rule 1: trust the author's markup and just normalize.
                kind = "display" if _is_display_candidate(raw) else "inline"
                tex = convert_math_text(raw)
            elif _is_display_candidate(raw):
                # Rule 2: standalone equation-style line.
                kind = "display"
                tex = convert_math_text(raw)
            elif _looks_math_heavy_inline(raw):
                # Rule 3: prose with math-heavy segments.
                kind = "inline"
                segments = _split_inline_math_segments(raw)
                if segments:
                    # Stitch the line back together, normalizing only the
                    # math segments.
                    pieces: List[str] = []
                    cursor = 0
                    for seg_start, seg_end in segments:
                        pieces.append(raw[cursor:seg_start])
                        pieces.append(convert_math_text(raw[seg_start:seg_end]))
                        cursor = seg_end
                    pieces.append(raw[cursor:])
                    tex = "".join(pieces)
                else:
                    tex = convert_math_text(raw)
            else:
                continue

            found.append(
                MathDetection(
                    block_index=block_no,
                    line_index=line_no,
                    kind=kind,
                    raw=raw,
                    tex=tex,
                )
            )

    return found
573
+
574
+
575
def detect_math(pages: List[PageText]) -> List[MathDetection]:
    """
    Run ``detect_math_on_page`` over every page and concatenate the results.

    The indices stored in each detection stay relative to the page it came
    from; nothing is remapped, so callers that need to locate a line must
    keep track of pages themselves.
    """
    return [
        detection
        for page in pages
        for detection in detect_math_on_page(page)
    ]
588
+
589
+
590
def annotate_math_on_page(page: PageText) -> List[MathDetection]:
    """
    Detect math on *page* and mark the affected Line objects in place.

    For every detection whose line index is valid for its block, three
    attributes are set on the Line:
        is_math   = True
        math_kind = "display" | "inline"
        math_tex  = the normalized LaTeX-style text

    Returns:
        The full list of detections produced by ``detect_math_on_page``.
    """
    found = detect_math_on_page(page)

    for hit in found:
        lines = page.blocks[hit.block_index].lines
        # Ignore detections that point outside the block's line list.
        if 0 <= hit.line_index < len(lines):
            target = lines[hit.line_index]
            target.is_math = True
            target.math_kind = hit.kind
            target.math_tex = hit.tex

    return found
614
+
615
+
616
def annotate_math(pages: List[PageText]) -> List[MathDetection]:
    """
    Apply ``annotate_math_on_page`` to each page in order.

    Returns the detections from all pages as one flat list.
    """
    return [
        detection
        for page in pages
        for detection in annotate_math_on_page(page)
    ]
626
+
627
+
628
# Names exported by a star-import of this module: the detection record,
# the text normalizer, and the per-page / multi-page detect and annotate
# entry points.
__all__ = [
    "MathDetection", "convert_math_text",
    "detect_math_on_page", "detect_math",
    "annotate_math_on_page", "annotate_math",
]