thinkpdf 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdfbrain/__init__.py +22 -0
- pdfbrain/app_gui.py +530 -0
- pdfbrain/cache/__init__.py +5 -0
- pdfbrain/cache/cache_manager.py +252 -0
- pdfbrain/cli.py +255 -0
- pdfbrain/core/__init__.py +6 -0
- pdfbrain/core/converter.py +332 -0
- pdfbrain/core/equations.py +635 -0
- pdfbrain/core/extract.py +469 -0
- pdfbrain/core/extractor.py +272 -0
- pdfbrain/core/models.py +196 -0
- pdfbrain/core/pipeline.py +287 -0
- pdfbrain/core/render.py +574 -0
- pdfbrain/core/tables.py +871 -0
- pdfbrain/core/transform.py +604 -0
- pdfbrain/core/utils.py +229 -0
- pdfbrain/engine.py +392 -0
- pdfbrain/mcp_server.py +315 -0
- pdfbrain/utils/__init__.py +1 -0
- thinkpdf-1.0.1.dist-info/METADATA +138 -0
- thinkpdf-1.0.1.dist-info/RECORD +25 -0
- thinkpdf-1.0.1.dist-info/WHEEL +5 -0
- thinkpdf-1.0.1.dist-info/entry_points.txt +4 -0
- thinkpdf-1.0.1.dist-info/licenses/LICENSE +620 -0
- thinkpdf-1.0.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
"""End-to-end conversion pipeline for pdfmd.
|
|
2
|
+
|
|
3
|
+
Public API:
|
|
4
|
+
pdf_to_markdown(input_pdf: str, output_md: str, options: Options,
|
|
5
|
+
progress_cb: callable|None = None, log_cb: callable|None = None,
|
|
6
|
+
pdf_password: str|None = None, debug_tables: bool = False)
|
|
7
|
+
|
|
8
|
+
Stages:
|
|
9
|
+
1) Extract → PageText pages (native or OCR depending on Options)
|
|
10
|
+
2) Transform → clean/annotate pages (drop caps, header/footer removal, table detection)
|
|
11
|
+
3) Render → Markdown
|
|
12
|
+
4) Optional: export images to _assets/ and append simple references
|
|
13
|
+
|
|
14
|
+
Notes:
|
|
15
|
+
- `progress_cb` receives (done, total) at a few milestones; GUI can map this
|
|
16
|
+
to a determinate bar.
|
|
17
|
+
- Image references use forward slashes in Markdown (portable across OSes),
|
|
18
|
+
while all file I/O uses Path/os to be cross-platform safe.
|
|
19
|
+
- Password handling is secure: never logged, never persisted, only used in-memory.
|
|
20
|
+
- Table detection can be debugged with debug_tables=True flag.
|
|
21
|
+
"""
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
from typing import Callable, Optional, List, Dict
|
|
26
|
+
import os
|
|
27
|
+
|
|
28
|
+
try:
|
|
29
|
+
import fitz # PyMuPDF
|
|
30
|
+
except Exception:
|
|
31
|
+
fitz = None
|
|
32
|
+
|
|
33
|
+
from .models import Options
|
|
34
|
+
from .extract import extract_pages, _open_pdf_with_password
|
|
35
|
+
from .transform import transform_pages
|
|
36
|
+
from .render import render_document
|
|
37
|
+
from .utils import log as default_log
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# Type of the optional progress callback: invoked as callback(done, total); None means no progress reporting.
DefProgress = Optional[Callable[[int, int], None]]
|
|
41
|
+
# Type of the optional logging callback: invoked with a single message string.
DefLogger = Optional[Callable[[str], None]]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _append_image_refs(md: str, page_to_relpaths: Dict[int, List[str]]) -> str:
|
|
45
|
+
"""Append image references to the end of the Markdown document.
|
|
46
|
+
|
|
47
|
+
Args:
|
|
48
|
+
md: Markdown content
|
|
49
|
+
page_to_relpaths: Mapping of page_index → list of relative image paths
|
|
50
|
+
|
|
51
|
+
Returns:
|
|
52
|
+
Markdown with image references appended
|
|
53
|
+
"""
|
|
54
|
+
if not page_to_relpaths:
|
|
55
|
+
return md
|
|
56
|
+
|
|
57
|
+
lines: List[str] = [md.rstrip(), ""]
|
|
58
|
+
|
|
59
|
+
for pno in sorted(page_to_relpaths):
|
|
60
|
+
paths = page_to_relpaths[pno]
|
|
61
|
+
if not paths:
|
|
62
|
+
continue
|
|
63
|
+
lines.append(f"**Images from page {pno + 1}:**")
|
|
64
|
+
for i, rel in enumerate(paths, start=1):
|
|
65
|
+
lines.append(f"- ")
|
|
66
|
+
lines.append("")
|
|
67
|
+
|
|
68
|
+
return "\n".join(lines).rstrip() + "\n"
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _export_images(
    pdf_path: str,
    output_md: str,
    options: Options,
    log_cb: DefLogger = None,
    pdf_password: Optional[str] = None,
) -> Dict[int, List[str]]:
    """Export images to an _assets folder next to output_md and return relative paths.

    Images are written as PNG files named ``img_<page>_<index>.png`` (both
    1-based) into ``<output stem>_assets`` beside the Markdown file. When
    ``options.preview_only`` is set, only the first three pages are scanned.

    For password-protected PDFs, the password is used only to open the
    document in memory. This function never logs or writes it.

    Args:
        pdf_path: Path to input PDF
        output_md: Path to output Markdown file
        options: Conversion options (reads ``export_images`` and ``preview_only``)
        log_cb: Optional logging callback
        pdf_password: Optional PDF password (ephemeral, in-memory only)

    Returns:
        Dictionary mapping 0-based page indices to lists of relative image
        paths (forward-slash separated). Pages without images are omitted.
        An empty dict is returned when image export is disabled, PyMuPDF is
        unavailable, or the document cannot be opened.
    """
    if not options.export_images:
        return {}

    if fitz is None:
        if log_cb:
            log_cb("[pipeline] PyMuPDF is not available; cannot export images.")
        return {}

    try:
        # Reuse the central password-aware open helper so behavior matches extract.py
        doc = _open_pdf_with_password(pdf_path, pdf_password)
    except Exception as e:
        # Image export is optional: a document that cannot be opened here is
        # reported and skipped rather than aborting the conversion.
        if log_cb:
            log_cb(f"[pipeline] Could not export images: {e}")
        return {}

    try:
        out_path = Path(output_md)
        assets_dir = out_path.with_name(out_path.stem + "_assets")
        assets_dir.mkdir(parents=True, exist_ok=True)

        mapping: Dict[int, List[str]] = {}
        page_count = doc.page_count
        limit = page_count if not options.preview_only else min(3, page_count)

        for pno in range(limit):
            page = doc.load_page(pno)
            images = page.get_images(full=True)
            rels: List[str] = []

            for idx, img in enumerate(images, start=1):
                xref = img[0]
                pix = fitz.Pixmap(doc, xref)

                # Convert CMYK to RGB if needed. The alpha channel is
                # subtracted so that CMYK *without* alpha (n == 4) is also
                # converted; PNG output needs a gray or RGB pixmap.
                if pix.n - pix.alpha >= 4:
                    pix = fitz.Pixmap(fitz.csRGB, pix)

                fname = assets_dir / f"img_{pno + 1:03d}_{idx:02d}.png"
                pix.save(str(fname))

                # Markdown wants forward slashes for portability
                rel = assets_dir.name + "/" + fname.name
                rels.append(rel)

            if rels:
                mapping[pno] = rels

        if log_cb and mapping:
            log_cb(f"[pipeline] Exported images to folder: {assets_dir}")

        return mapping

    finally:
        # Always release the document handle, including on errors above.
        doc.close()
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def pdf_to_markdown(
    input_pdf: str,
    output_md: str,
    options: Options,
    progress_cb: DefProgress = None,
    log_cb: DefLogger = None,
    pdf_password: Optional[str] = None,
    debug_tables: bool = False,
) -> None:
    """Run the complete PDF → Markdown conversion and write the result to disk.

    The stages run in order: extract pages, transform them, render Markdown,
    optionally export images and append references to them, then write the
    output file.

    Args:
        input_pdf: Path to input PDF file
        output_md: Path where Markdown output will be written
        options: Conversion options
        progress_cb: Optional callback invoked as (done, 100) at milestones:
            0–30 during extraction, then 60, 80, (90 when images are
            exported) and 100
        log_cb: Optional callback for log messages; the package default
            logger is used when omitted
        pdf_password: Optional password for encrypted PDFs (ephemeral)
        debug_tables: Forwarded to the transform stage for table debugging

    Raises:
        RuntimeError: If PyMuPDF is not installed
        ValueError: If extraction produced no pages
        Any exception raised by the extraction, transformation, rendering,
        image export or file-writing steps is propagated.

    Side Effects:
        - Writes Markdown file to output_md (UTF-8)
        - May create an _assets folder if export_images is enabled
        - Calls progress_cb and log_cb if provided

    Security Notes:
        - pdf_password is only forwarded to the extraction and image-export
          helpers; this function never logs or writes it
        - Output files are written unencrypted
    """
    logger = default_log if log_cb is None else log_cb

    def _say(message: str) -> None:
        # Emit a log line only when a usable logger exists.
        if logger:
            logger(message)

    def _report(percent: int) -> None:
        # Progress is always expressed on a fixed 0 to 100 scale.
        if progress_cb:
            progress_cb(percent, 100)

    def _extraction_progress(done_pages: int, total_pages: int) -> None:
        # Squeeze page-level progress into the first 30% of the bar.
        if progress_cb and total_pages > 0:
            progress_cb(int(done_pages * 30 / total_pages), 100)

    if fitz is None:
        raise RuntimeError("PyMuPDF (fitz) is not installed. Install with: pip install pymupdf")

    # --- Stage 1: Extract ---
    _say("[pipeline] Extracting text…")
    pages = extract_pages(
        input_pdf,
        options,
        progress_cb=_extraction_progress,
        pdf_password=pdf_password,
    )
    if not pages:
        raise ValueError("PDF extraction produced no pages")
    _report(30)

    # --- Stage 2: Transform ---
    _say("[pipeline] Transforming pages…")
    transformed = transform_pages(pages, options, debug_tables=debug_tables)
    pages_t, header, footer, body_sizes = transformed
    if logger and (header or footer):
        logger(f"[pipeline] Removed repeating edges → header={header!r}, footer={footer!r}")
    _report(60)

    # --- Stage 3: Render ---
    _say("[pipeline] Rendering Markdown…")
    md = render_document(pages_t, options, body_sizes=body_sizes)
    _report(80)

    # --- Stage 4: Optional image export ---
    if options.export_images:
        _say("[pipeline] Exporting images…")
        page_to_rel = _export_images(
            input_pdf,
            output_md,
            options,
            log_cb=logger,
            pdf_password=pdf_password,
        )
        if page_to_rel:
            md = _append_image_refs(md, page_to_rel)
        _report(90)

    # --- Write output ---
    _say("[pipeline] Writing output file…")
    target = Path(output_md)
    try:
        target.write_text(md, encoding="utf-8")
    except Exception as exc:
        # Log for the caller's benefit, then let the original error surface.
        if logger:
            logger(f"[pipeline] Error writing output: {exc}")
        raise

    _report(100)
    _say(f"[pipeline] Saved → {output_md}")
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
# Public API of this module: only the end-to-end conversion entry point.
__all__ = ["pdf_to_markdown"]
|