thinkpdf 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,574 @@
1
+ """Markdown rendering for pdfmd.
2
+
3
+ This module converts transformed `PageText` structures into Markdown.
4
+ It assumes header/footer removal and drop-cap stripping have already been run
5
+ (see `transform.py`).
6
+
7
+ Main entry: `render_document(pages, options, body_sizes=None, progress_cb=None)`
8
+
9
+ Key behaviours:
10
+ - Applies heading promotion via font size and optional CAPS heuristics.
11
+ - Normalizes bullets and numbered lists to proper Markdown formats.
12
+ - Repairs hyphenation and unwraps hard line breaks into paragraphs.
13
+ - Optionally inserts `---` page break markers between pages.
14
+ - Defragments short orphan lines into their preceding paragraphs.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import re
20
+ from statistics import median
21
+ from typing import Callable, List, Optional
22
+
23
+ from .models import Block, Line, PageText, Options
24
+ from .utils import normalize_punctuation, linkify_urls, escape_markdown
25
+ from .transform import is_all_caps_line, is_mostly_caps
26
+
27
+
28
+ # ---------------------------------------------------------------------------
29
+ # Inline helpers
30
+ # ---------------------------------------------------------------------------
31
+
32
+
33
+ def _wrap_inline(text: str, bold: bool, italic: bool) -> str:
34
+ """Wrap text with Markdown inline markers for bold/italic.
35
+
36
+ Rules:
37
+ - bold + italic: ***text***
38
+ - bold only: **text**
39
+ - italic only: *text*
40
+ - neither: text
41
+ """
42
+ if not text:
43
+ return text
44
+
45
+ if bold and italic:
46
+ return f"***{text}***"
47
+ if bold:
48
+ return f"**{text}**"
49
+ if italic:
50
+ return f"*{text}*"
51
+ return text
52
+
53
+
54
+ # ---------------------------------------------------------------------------
55
+ # Line / paragraph shaping
56
+ # ---------------------------------------------------------------------------
57
+
58
+
59
+ def _fix_hyphenation(text: str) -> str:
60
+ """Repair line-wrap hyphenation.
61
+
62
+ Typical case in PDFs:
63
+ 'hy-\nphen' → 'hyphen'
64
+
65
+ We only remove hyphen + newline when it is clearly a wrap.
66
+ """
67
+ return re.sub(r"-\n(\s*)", r"\1", text)
68
+
69
+
70
+ def _unwrap_hard_breaks(lines: List[str]) -> str:
71
+ """Merge wrapped lines into paragraphs. Blank lines remain paragraph breaks.
72
+
73
+ Rules:
74
+ - Consecutive non-blank lines are joined with spaces.
75
+ - Blank lines are preserved as paragraph separators.
76
+ - Lines ending with two spaces `" "` are treated as explicit hard breaks
77
+ (Markdown convention) and terminate the paragraph.
78
+ """
79
+ out: List[str] = []
80
+ buf: List[str] = []
81
+
82
+ def flush() -> None:
83
+ if buf:
84
+ out.append(" ".join(buf).strip())
85
+ buf.clear()
86
+
87
+ for raw in lines:
88
+ line = raw.rstrip("\n")
89
+ if not line.strip():
90
+ flush()
91
+ out.append("")
92
+ continue
93
+
94
+ # Explicit hard break: keep line as-is and terminate paragraph buffer.
95
+ if line.endswith(" "):
96
+ buf.append(line)
97
+ flush()
98
+ continue
99
+
100
+ buf.append(line)
101
+
102
+ flush()
103
+ return "\n".join(out)
104
+
105
+
106
+ def _defragment_orphans(md: str, max_len: int = 45) -> str:
107
+ """Merge short, isolated lines back into the previous paragraph.
108
+
109
+ This operates on the final Markdown string, post-assembly.
110
+
111
+ Heuristic:
112
+ - If a non-heading line is:
113
+ * sandwiched between blank lines
114
+ * short (<= max_len chars)
115
+ * not already a list item,
116
+ then we append it to the previous non-blank line.
117
+ """
118
+ lines = md.splitlines()
119
+ res: List[str] = []
120
+ i = 0
121
+
122
+ while i < len(lines):
123
+ line = lines[i]
124
+
125
+ if (
126
+ i > 0
127
+ and i < len(lines) - 1
128
+ and not lines[i - 1].strip()
129
+ and not lines[i + 1].strip()
130
+ and 0 < len(line.strip()) <= max_len
131
+ and not line.strip().startswith("#")
132
+ ):
133
+ # Attach orphan to the previous non-blank line
134
+ j = len(res) - 1
135
+ while j >= 0 and not res[j].strip():
136
+ j -= 1
137
+ if j >= 0:
138
+ res[j] = (res[j].rstrip() + " " + line.strip()).strip()
139
+ i += 2
140
+ continue
141
+
142
+ res.append(line)
143
+ i += 1
144
+
145
+ return "\n".join(res)
146
+
147
+
148
+ # ---------------------------------------------------------------------------
149
+ # Safe joining & footer detection reuse
150
+ # ---------------------------------------------------------------------------
151
+
152
+
153
+ def _safe_join_texts(parts: List[str]) -> str:
154
+ """Join adjacent span texts, avoiding accidental double spaces."""
155
+ if not parts:
156
+ return ""
157
+ out = [parts[0]]
158
+ for p in parts[1:]:
159
+ if not p:
160
+ continue
161
+ if out[-1].endswith(" ") or p.startswith(" "):
162
+ out.append(p)
163
+ else:
164
+ out.append(" " + p)
165
+ return "".join(out)
166
+
167
+
168
+ # Reuse footer noise heuristic from transform-like logic here for line-level cleanup.
169
+ _FOOTER_DASH_PATTERN = re.compile(r"^[-–—]\s*[-–—]?\s*\d*\s*$")
170
+ _FOOTER_PAGENUM_PATTERN = re.compile(r"^\d+\s*$")
171
+ _FOOTER_PAGE_LABEL_PATTERN = re.compile(r"^Page\s+\d+\s*$", re.IGNORECASE)
172
+
173
+
174
+ def _is_footer_noise(text: str) -> bool:
175
+ s = text.strip()
176
+ if not s:
177
+ return False
178
+ if _FOOTER_DASH_PATTERN.match(s):
179
+ return True
180
+ if _FOOTER_PAGENUM_PATTERN.match(s):
181
+ return True
182
+ if _FOOTER_PAGE_LABEL_PATTERN.match(s):
183
+ return True
184
+ return False
185
+
186
+
187
+ # ---------------------------------------------------------------------------
188
+ # List normalisation
189
+ # ---------------------------------------------------------------------------
190
+
191
+
192
+ def _normalize_list_line(ln: str) -> str:
193
+ """Normalize various bullet/numbered prefixes into Markdown list syntax."""
194
+ s = ln.lstrip()
195
+ # Bullet-like prefixes
196
+ if re.match(r"^[•○◦·\-–—]\s+", s):
197
+ s = re.sub(r"^[•○◦·\-–—]\s+", "- ", s)
198
+ return s
199
+
200
+ # Numbered: "1. text" or "1) text"
201
+ m_num = re.match(r"^(\d+)[\.\)]\s+", s)
202
+ if m_num:
203
+ num = m_num.group(1)
204
+ s = re.sub(r"^\d+[\.\)]\s+", f"{num}. ", s)
205
+ return s
206
+
207
+ # Lettered outlines: "A. text" or "a) text" → bullet
208
+ if re.match(r"^[A-Za-z][\.\)]\s+", s):
209
+ s = re.sub(r"^[A-Za-z][\.\)]\s+", "- ", s)
210
+ return s
211
+
212
+ return ln.strip()
213
+
214
+
215
+ # ---------------------------------------------------------------------------
216
+ # Table rendering
217
+ # ---------------------------------------------------------------------------
218
+
219
+
220
+ def _infer_column_alignments(grid: List[List[str]]) -> List[str]:
221
+ """Return alignment hints per column: 'left', 'right', or 'center'.
222
+
223
+ Heuristic:
224
+ - Look at all body rows (skip header).
225
+ - If >70% of non-empty cells in a column are numeric-like (including
226
+ currency / percentages), mark that column as 'right'.
227
+ - Otherwise default to 'left'.
228
+ """
229
+ if not grid or len(grid) < 2:
230
+ return []
231
+
232
+ n_cols = len(grid[0])
233
+ alignments: List[str] = []
234
+
235
+ for col_idx in range(n_cols):
236
+ numeric_count = 0
237
+ total_count = 0
238
+
239
+ for row in grid[1:]: # Skip header row
240
+ if col_idx >= len(row):
241
+ continue
242
+ cell = (row[col_idx] or "").strip()
243
+ if not cell:
244
+ continue
245
+ total_count += 1
246
+
247
+ # Check if numeric-ish (supports commas, $, %, and simple negatives).
248
+ clean = (
249
+ cell.replace(",", "")
250
+ .replace("$", "")
251
+ .replace("%", "")
252
+ .replace("(", "")
253
+ .replace(")", "")
254
+ )
255
+ clean = clean.strip()
256
+ if clean.startswith("+") or clean.startswith("-"):
257
+ clean = clean[1:].strip()
258
+ try:
259
+ float(clean)
260
+ numeric_count += 1
261
+ except ValueError:
262
+ pass
263
+
264
+ if total_count == 0:
265
+ alignments.append("left")
266
+ elif numeric_count / total_count > 0.7:
267
+ alignments.append("right")
268
+ else:
269
+ alignments.append("left")
270
+
271
+ return alignments
272
+
273
+
274
def _render_table_block(block: Block) -> List[str]:
    """Render a table-annotated block as a Markdown pipe table.

    Reads `block.table_grid`, expected to be a list of rows of strings. The
    first row is treated as the header. Shorter rows are padded with empty
    cells up to the widest row; content is never truncated.

    Each cell is stripped, collapsed onto a single line, passed through
    punctuation normalisation and Markdown escaping, and has any pipe
    characters backslash-escaped so the table stays valid.

    Returns:
        The header line, the alignment separator line, the body lines and a
        trailing blank line; or an empty list when there is no grid or the
        grid has no columns.
    """
    grid = getattr(block, "table_grid", None)
    if not grid:
        return []

    n_cols = max((len(row) for row in grid), default=0)
    if n_cols == 0:
        return []

    # Copy each row into a list (rows may be tuples) and pad to full width.
    norm_rows: List[List[str]] = [
        list(row) + [""] * (n_cols - len(row)) for row in grid
    ]
    header, body = norm_rows[0], norm_rows[1:]

    def fmt_cell(text: str) -> str:
        raw = (text or "").strip()
        # Treat lone pipes as border artifacts from old-style tables.
        if raw in {"|", "||", "¦"}:
            raw = ""

        # A pipe-table row must occupy exactly one physical line; an embedded
        # line break inside a cell would split the row and end the table.
        raw = re.sub(r"\s*[\r\n]+\s*", " ", raw)

        # Normalise punctuation and escape Markdown specials.
        raw = normalize_punctuation(raw)
        raw = escape_markdown(raw)

        # Critical: escape any remaining pipe characters so Markdown does
        # not misinterpret them as column separators.
        return raw.replace("|", "\\|")

    def fmt_row(cells: List[str]) -> str:
        return "| " + " | ".join(fmt_cell(cell) for cell in cells) + " |"

    # Infer alignments from the data; fall back to left on any mismatch.
    alignments = _infer_column_alignments(norm_rows)
    if not alignments or len(alignments) != n_cols:
        alignments = ["left"] * n_cols

    separator_by_alignment = {"right": "---:", "center": ":---:"}
    separator_cells = [
        separator_by_alignment.get(align, ":---")  # explicit left marker
        for align in alignments
    ]
    separator_line = "| " + " | ".join(separator_cells) + " |"

    lines: List[str] = [fmt_row(header), separator_line]
    lines.extend(fmt_row(row) for row in body)
    lines.append("")  # blank line after table
    return lines
349
+
350
+
351
+ # ---------------------------------------------------------------------------
352
+ # Block → Markdown lines
353
+ # ---------------------------------------------------------------------------
354
+
355
+
356
def _lines_to_paragraph(rendered_lines: List[str]) -> str:
    """Shape rendered block lines into paragraph/list Markdown text.

    Applies, in order: hyphenation repair, removal of footer-noise lines,
    list-prefix normalisation, hard-break unwrapping, punctuation
    normalisation and URL linkification.
    """
    text = _fix_hyphenation("\n".join(rendered_lines))

    shaped: List[str] = []
    for ln in text.splitlines():
        if not ln.strip():
            shaped.append("")
        elif not _is_footer_noise(ln):
            shaped.append(_normalize_list_line(ln))

    para = _unwrap_hard_breaks(shaped)
    para = normalize_punctuation(para)
    return linkify_urls(para)


def _block_to_lines(
    block: Block,
    body_size: float,
    caps_to_headings: bool,
    heading_size_ratio: float,
) -> List[str]:
    """Convert a Block into a list of Markdown lines.

    Two parallel views of the block are built:
    - raw lines: plain text (no Markdown), used for heading detection
    - rendered lines: text with inline styling (bold/italic), used for output

    A block is promoted to a heading when the median of its per-line median
    span sizes reaches `body_size * heading_size_ratio`, or, if
    `caps_to_headings` is set, when its flattened raw text is all-caps or
    mostly-caps. Only the first line becomes the heading; remaining lines are
    rendered as paragraph/list text below it.

    Promotion is skipped when the first line is a display equation or when
    the heading text is empty after trimming, so no `# ` stub or heading-ised
    equation is emitted. An inline-math first line is used as heading text
    without Markdown escaping, so its LaTeX stays intact.

    Args:
        block: the block to render; table-annotated blocks are delegated to
            the table renderer.
        body_size: baseline body font size for the page.
        caps_to_headings: enable the CAPS heading heuristic.
        heading_size_ratio: size multiple at which a block counts as heading.

    Returns:
        Markdown lines for the block, ending with a blank line, or an empty
        list when the block has no visible text.
    """
    # Tables skip the paragraph / heading heuristics entirely.
    if getattr(block, "is_table", False) and getattr(block, "table_grid", None) is not None:
        return _render_table_block(block)

    rendered_lines: List[str] = []
    raw_lines: List[str] = []
    line_kinds: List[str] = []  # "text", "inline" or "display", per kept line
    line_sizes: List[float] = []

    for line in block.lines:
        spans = line.spans
        span_sizes = [float(sp.size) for sp in spans if getattr(sp, "size", 0.0)]

        if getattr(line, "is_math", False):
            # Math-aware path: these attributes are set dynamically upstream.
            tex = (getattr(line, "math_tex", "") or "").strip()
            if not tex:
                # Fallback: join raw span text.
                tex = "".join(sp.text or "" for sp in spans)

            if getattr(line, "math_kind", "display") == "display":
                kind = "display"
                joined_fmt = f"$$\n{tex}\n$$"
                # Display math does not contribute to heading sizing.
                sizes: List[float] = []
            else:
                kind = "inline"
                # Kept as-is (no escape_markdown) so LaTeX commands survive.
                joined_fmt = tex
                sizes = span_sizes
            joined_raw = tex
        else:
            # Normal text line: escape Markdown and apply inline bold/italic.
            kind = "text"
            texts_raw = [sp.text or "" for sp in spans]
            texts_fmt = [
                _wrap_inline(escape_markdown(raw_text), sp.bold, sp.italic)
                for raw_text, sp in zip(texts_raw, spans)
            ]
            joined_fmt = _safe_join_texts(texts_fmt)
            joined_raw = _safe_join_texts(texts_raw)
            sizes = span_sizes

        if joined_fmt.strip():
            rendered_lines.append(joined_fmt)
            raw_lines.append(joined_raw)
            line_kinds.append(kind)
            if sizes:
                line_sizes.append(median(sizes))

    if not rendered_lines:
        return []

    avg_line_size = median(line_sizes) if line_sizes else body_size

    # Use RAW text (no ** or *) for heading heuristics.
    block_text_flat = " ".join(raw_lines).strip()

    heading_by_size = avg_line_size >= body_size * heading_size_ratio
    heading_by_caps = caps_to_headings and (
        is_all_caps_line(block_text_flat) or is_mostly_caps(block_text_flat)
    )

    first_kind = line_kinds[0]
    if (heading_by_size or heading_by_caps) and first_kind != "display":
        # Heading text comes from the first RAW line only. Math text was
        # deliberately never escaped, so do not escape it here either.
        heading_raw = raw_lines[0]
        heading_text = heading_raw if first_kind == "inline" else escape_markdown(heading_raw)
        heading_text = re.sub(r"\s+", " ", heading_text).strip(" -:–—")
        heading_text = normalize_punctuation(heading_text)

        if heading_text:
            # H1 if much larger than body or if CAPS; otherwise H2.
            level = 1 if (avg_line_size >= body_size * 1.6) or heading_by_caps else 2
            heading_line = f"{'#' * level} {heading_text}"

            # No additional text: just heading + blank line.
            if len(rendered_lines) == 1:
                return [heading_line, ""]

            # Remaining lines are rendered as normal paragraph/list text.
            tail = _lines_to_paragraph(rendered_lines[1:])
            out: List[str] = [heading_line, ""]
            if tail.strip():
                out.append(tail)
                out.append("")
            return out
        # Heading text was only separators/whitespace: render as a paragraph.

    # ----------------- Normal paragraph path ----------------------------
    return [_lines_to_paragraph(rendered_lines), ""]
508
+
509
+
510
+ # ---------------------------------------------------------------------------
511
+ # Document render
512
+ # ---------------------------------------------------------------------------
513
+
514
DefProgress = Optional[Callable[[int, int], None]]


def render_document(
    pages: List[PageText],
    options: Options,
    body_sizes: Optional[List[float]] = None,
    progress_cb: DefProgress = None,
) -> str:
    """Render transformed pages to a Markdown string.

    Args:
        pages: transformed PageText pages
        options: rendering options; this function reads `caps_to_headings`,
            `heading_size_ratio`, `insert_page_breaks`, `defragment_short`
            and `orphan_max_len`.
        body_sizes: optional per-page body-size baselines. Pages without an
            entry fall back to 11.0.
        progress_cb: optional progress callback, called as (done, total)
            after each page.

    Returns:
        The Markdown document, always terminated by exactly one newline.
    """
    md_lines: List[str] = []
    total = len(pages)

    for i, page in enumerate(pages):
        body = body_sizes[i] if body_sizes and i < len(body_sizes) else 11.0

        for blk in page.blocks:
            if blk.is_empty():
                continue
            md_lines.extend(
                _block_to_lines(
                    blk,
                    body_size=body,
                    caps_to_headings=options.caps_to_headings,
                    heading_size_ratio=options.heading_size_ratio,
                )
            )

        if options.insert_page_breaks and i < total - 1:
            md_lines.extend(["---", ""])  # page rule

        if progress_cb:
            progress_cb(i + 1, total)

    md = "\n".join(md_lines)
    # Collapse excessive blank lines
    md = re.sub(r"\n{3,}", "\n\n", md).strip() + "\n"

    if options.defragment_short:
        md = _defragment_orphans(md, max_len=options.orphan_max_len)

    # Strip common footer artefacts like trailing "- - 1" or "- -" at the end
    # of a line. Only horizontal whitespace is consumed, so the match cannot
    # swallow neighbouring newlines and fuse paragraphs, and a line made up
    # solely of three or more dashes is skipped because that is the page rule
    # inserted above.
    md = re.sub(
        r"^(?!-{3,}[ \t]*$)(.*?)[ \t]*-+[ \t]*-+[ \t]*\d*[ \t]*$",
        r"\1",
        md,
        flags=re.MULTILINE,
    )

    # Tighten spaces before punctuation. Restricted to spaces/tabs that follow
    # visible text on the same line: this never joins two lines, and leaves
    # table cell padding after a pipe alone.
    md = re.sub(r"(?<=[^\s|])[ \t]+([,.;:?!])", r"\1", md)

    # Removing artefacts can leave blank-line runs behind, and the orphan pass
    # drops the final newline; normalise both once more.
    md = re.sub(r"\n{3,}", "\n\n", md).strip() + "\n"

    return md
570
+
571
+
572
# Public API: only the document-level entry point is exported; the
# underscore-prefixed helpers in this module are internal.
__all__ = [
    "render_document",
]