thinkpdf 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,287 @@
1
+ """End-to-end conversion pipeline for pdfmd.
2
+
3
+ Public API:
4
+ pdf_to_markdown(input_pdf: str, output_md: str, options: Options,
5
+ progress_cb: callable|None = None, log_cb: callable|None = None,
6
+ pdf_password: str|None = None, debug_tables: bool = False)
7
+
8
+ Stages:
9
+ 1) Extract → PageText pages (native or OCR depending on Options)
10
+ 2) Transform → clean/annotate pages (drop caps, header/footer removal, table detection)
11
+ 3) Render → Markdown
12
+ 4) Optional: export images to _assets/ and append simple references
13
+
14
+ Notes:
15
+ - `progress_cb` receives (done, total) at a few milestones; GUI can map this
16
+ to a determinate bar.
17
+ - Image references use forward slashes in Markdown (portable across OSes),
18
+ while all file I/O uses Path/os to be cross-platform safe.
19
+ - Password handling is secure: never logged, never persisted, only used in-memory.
20
+ - Table detection can be debugged with debug_tables=True flag.
21
+ """
22
+ from __future__ import annotations
23
+
24
+ from pathlib import Path
25
+ from typing import Callable, Optional, List, Dict
26
+ import os
27
+
28
+ try:
29
+ import fitz # PyMuPDF
30
+ except Exception:
31
+ fitz = None
32
+
33
+ from .models import Options
34
+ from .extract import extract_pages, _open_pdf_with_password
35
+ from .transform import transform_pages
36
+ from .render import render_document
37
+ from .utils import log as default_log
38
+
39
+
40
# Optional progress callback, invoked as progress_cb(done, total).
# None means the caller did not supply one.
DefProgress = Optional[Callable[[int, int], None]]
# Optional logging callback, invoked with a single message string.
# None means the caller did not supply one.
DefLogger = Optional[Callable[[str], None]]
42
+
43
+
44
def _append_image_refs(md: str, page_to_relpaths: Dict[int, List[str]]) -> str:
    """Append per-page image references to the end of a Markdown document.

    For every page that has at least one image, a bold heading
    ``**Images from page N:**`` (N is 1-based) is emitted, followed by one
    bullet per image of the form ``- ![pN-i](path)`` and a blank line.

    Link destinations are percent-encoded (``/`` is preserved) so that paths
    containing spaces, parentheses or other reserved characters still form a
    valid Markdown link. Paths made only of ASCII letters, digits and
    ``_ . - ~ /`` are emitted unchanged.

    Args:
        md: Markdown content.
        page_to_relpaths: Mapping of 0-based page index → list of relative
            image paths (forward-slash separated).

    Returns:
        ``md`` unchanged when the mapping is empty; otherwise the
        right-stripped Markdown, a blank line, the image sections, and
        exactly one trailing newline.
    """
    # Function-scope import: the module header does not import urllib.
    from urllib.parse import quote

    if not page_to_relpaths:
        return md

    lines: List[str] = [md.rstrip(), ""]

    # Sort so the sections follow document order regardless of dict order.
    for pno in sorted(page_to_relpaths):
        paths = page_to_relpaths[pno]
        if not paths:
            continue
        lines.append(f"**Images from page {pno + 1}:**")
        lines.extend(
            f"- ![p{pno + 1}-{i}]({quote(rel, safe='/')})"
            for i, rel in enumerate(paths, start=1)
        )
        lines.append("")

    # Collapse the trailing blank line(s) into a single final newline.
    return "\n".join(lines).rstrip() + "\n"
69
+
70
+
71
def _export_images(
    pdf_path: str,
    output_md: str,
    options: Options,
    log_cb: DefLogger = None,
    pdf_password: Optional[str] = None,
) -> Dict[int, List[str]]:
    """Export embedded images to an ``<stem>_assets`` folder next to output_md.

    Each image is written as ``img_<page:03d>_<index:02d>.png`` (both 1-based)
    inside a sibling directory named after the Markdown file's stem plus
    ``_assets``. When ``options.preview_only`` is set, only the first three
    pages are scanned.

    The password is only handed to the shared open helper; this function
    neither logs nor stores it.

    Args:
        pdf_path: Path to input PDF.
        output_md: Path to output Markdown file (used to locate the assets dir).
        options: Conversion options; ``export_images`` and ``preview_only``
            are read here.
        log_cb: Optional logging callback.
        pdf_password: Optional PDF password.

    Returns:
        Mapping of 0-based page index → list of relative image paths using
        forward slashes. Pages without images are omitted. An empty dict is
        returned when export is disabled, PyMuPDF is missing, or the document
        cannot be opened.
    """
    if not options.export_images:
        return {}

    if fitz is None:
        if log_cb:
            log_cb("[pipeline] PyMuPDF is not available; cannot export images.")
        return {}

    try:
        # Reuse the central password-aware open helper so behavior matches extract.py
        doc = _open_pdf_with_password(pdf_path, pdf_password)
    except Exception as e:
        # Failing to open the document skips image export instead of raising.
        if log_cb:
            log_cb(f"[pipeline] Could not export images: {e}")
        return {}

    try:
        out_path = Path(output_md)
        assets_dir = out_path.with_name(out_path.stem + "_assets")
        assets_dir.mkdir(parents=True, exist_ok=True)

        mapping: Dict[int, List[str]] = {}
        page_count = doc.page_count
        limit = page_count if not options.preview_only else min(3, page_count)

        for pno in range(limit):
            page = doc.load_page(pno)
            images = page.get_images(full=True)
            rels: List[str] = []

            for idx, img in enumerate(images, start=1):
                xref = img[0]
                pix = fitz.Pixmap(doc, xref)

                # Convert CMYK to RGB if needed. `pix.n` includes the alpha
                # channel, so subtract it: more than 3 colour components means
                # CMYK, with or without alpha. PNG output needs gray or RGB.
                if pix.n - pix.alpha > 3:
                    pix = fitz.Pixmap(fitz.csRGB, pix)

                fname = assets_dir / f"img_{pno + 1:03d}_{idx:02d}.png"
                pix.save(str(fname))

                # Markdown wants forward slashes for portability
                rel = assets_dir.name + "/" + fname.name
                rels.append(rel)

            if rels:
                mapping[pno] = rels

        if log_cb and mapping:
            log_cb(f"[pipeline] Exported images to folder: {assets_dir}")

        return mapping

    finally:
        # Always release the document handle, even if an image fails to save.
        doc.close()
150
+
151
+
152
def pdf_to_markdown(
    input_pdf: str,
    output_md: str,
    options: Options,
    progress_cb: DefProgress = None,
    log_cb: DefLogger = None,
    pdf_password: Optional[str] = None,
    debug_tables: bool = False,
) -> None:
    """Convert a PDF to Markdown using the full pdfmd pipeline.

    Runs the stages in order: extract pages, transform them, render Markdown,
    optionally export images and append references to them, then write the
    result to ``output_md`` as UTF-8.

    Args:
        input_pdf: Path to input PDF file.
        output_md: Path where Markdown output will be written. Missing parent
            directories are created.
        options: Conversion options; ``export_images`` is read here, the rest
            is forwarded to the stage functions.
        progress_cb: Optional callback for progress updates, called as
            ``(done, 100)``: 0–30 during extraction, then 30, 60, 80, 90, 100.
        log_cb: Optional callback for log messages; the package default
            logger is used when None.
        pdf_password: Optional password for encrypted PDFs.
        debug_tables: Forwarded to the transform stage as ``debug_tables``.

    Raises:
        RuntimeError: If PyMuPDF is not installed.
        ValueError: If extraction produced no pages.
        Exception: Anything raised by the extraction, transformation,
            rendering, image-export or file-writing steps propagates.

    Side Effects:
        - Writes the Markdown file to ``output_md``.
        - May create an ``<stem>_assets`` folder if export_images is enabled.
        - Calls progress_cb and log_cb if provided.

    Security Notes:
        - pdf_password is only forwarded to the extraction and image-export
          helpers; this function never logs or stores it.
        - The output file is written as plain text.
    """
    if log_cb is None:
        log_cb = default_log

    if fitz is None:
        raise RuntimeError("PyMuPDF (fitz) is not installed. Install with: pip install pymupdf")

    # --- Stage 1: Extract ---
    if log_cb:
        log_cb("[pipeline] Extracting text…")

    # Map page-level progress into the [0, 30] range of a 0 to 100 scale.
    def _stage1_progress(done_pages: int, total_pages: int) -> None:
        if progress_cb and total_pages > 0:
            pct = int(done_pages * 30 / total_pages)
            progress_cb(pct, 100)

    pages = extract_pages(
        input_pdf,
        options,
        progress_cb=_stage1_progress,
        pdf_password=pdf_password,
    )

    if not pages:
        raise ValueError("PDF extraction produced no pages")

    if progress_cb:
        progress_cb(30, 100)

    # --- Stage 2: Transform ---
    if log_cb:
        log_cb("[pipeline] Transforming pages…")

    pages_t, header, footer, body_sizes = transform_pages(
        pages,
        options,
        debug_tables=debug_tables,
    )

    if log_cb and (header or footer):
        log_cb(f"[pipeline] Removed repeating edges → header={header!r}, footer={footer!r}")

    if progress_cb:
        progress_cb(60, 100)

    # --- Stage 3: Render ---
    if log_cb:
        log_cb("[pipeline] Rendering Markdown…")

    md = render_document(
        pages_t,
        options,
        body_sizes=body_sizes,
    )

    if progress_cb:
        progress_cb(80, 100)

    # --- Stage 4: Optional image export ---
    if options.export_images:
        if log_cb:
            log_cb("[pipeline] Exporting images…")

        page_to_rel = _export_images(
            input_pdf,
            output_md,
            options,
            log_cb=log_cb,
            pdf_password=pdf_password,
        )

        if page_to_rel:
            md = _append_image_refs(md, page_to_rel)

    if progress_cb:
        progress_cb(90, 100)

    # --- Write output ---
    if log_cb:
        log_cb("[pipeline] Writing output file…")

    try:
        out_path = Path(output_md)
        # Image export creates the parent directory as a side effect of making
        # the assets folder. Create it here too, so the same output path also
        # works when export_images is off.
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(md, encoding="utf-8")
    except Exception as e:
        # Log for the caller's UI, then re-raise so the failure is not hidden.
        if log_cb:
            log_cb(f"[pipeline] Error writing output: {e}")
        raise

    if progress_cb:
        progress_cb(100, 100)

    if log_cb:
        log_cb(f"[pipeline] Saved → {output_md}")
283
+
284
+
285
# Only the pipeline entry point is exported by `from <module> import *`.
__all__ = ["pdf_to_markdown"]