thinkpdf 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,469 @@
1
+ """Text extraction layer for pdfmd.
2
+
3
+ This module provides a single public function `extract_pages()` that returns a
4
+ list of `PageText` objects for the given PDF. It supports three modes:
5
+
6
+ - Native (PyMuPDF): fast, faithful when the PDF contains real text.
7
+ - OCR via Tesseract (optional): render each page → run pytesseract.
8
+ - OCR via OCRmyPDF (optional): pre-process the whole PDF with `ocrmypdf`, then
9
+ run the native extractor on the OCR'ed PDF. Useful for scanned PDFs while
10
+ preserving layout and selectable text.
11
+
12
+ The chosen path is controlled by `Options.ocr_mode`:
13
+ "off" | "auto" | "tesseract" | "ocrmypdf".
14
+ When set to "auto", a quick probe examines the first few pages and switches to
15
+ OCR if the doc appears scanned.
16
+
17
+ The module also contains helper functions for OCR probing, Tesseract/ocrmypdf
18
+ availability checks, and a small wrapper around temporary files.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import io
24
+ import os
25
+ import shutil
26
+ import subprocess
27
+ import tempfile
28
+ from typing import Callable, List, Optional
29
+
30
+ try:
31
+ import fitz # type: ignore
32
+ except Exception: # pragma: no cover - optional dependency
33
+ fitz = None # type: ignore
34
+
35
+ try:
36
+ import pytesseract # type: ignore
37
+ _HAS_TESS = True
38
+ except Exception: # pragma: no cover - optional dependency
39
+ pytesseract = None # type: ignore
40
+ _HAS_TESS = False
41
+
42
+ try:
43
+ from PIL import Image # type: ignore
44
+ _HAS_PIL = True
45
+ except Exception: # pragma: no cover - optional dependency
46
+ Image = None # type: ignore
47
+ _HAS_PIL = False
48
+
49
+ from .models import PageText, Options
50
+ from .utils import log
51
+
52
+
53
+ # ---------------------- Secure PDF open helpers -----------------------
54
+
55
+
56
def _open_pdf_with_password(pdf_path: str, pdf_password: Optional[str]):
    """Open *pdf_path* with PyMuPDF, authenticating when the file requires it.

    Password handling is centralized here so that the password is only ever
    handed to ``doc.authenticate``; this helper never logs or stores it.

    Args:
        pdf_path: Path of the PDF to open.
        pdf_password: Password to try when the document reports
            ``needs_pass``. It is not used for documents that do not.

    Returns:
        The opened (and, if necessary, authenticated) document. The caller is
        responsible for closing it.

    Raises:
        RuntimeError: PyMuPDF is not installed, a password is required but
            none was supplied, or authentication failed.
        Exception: Anything raised by ``fitz.open`` propagates unchanged.
    """
    if fitz is None:  # pragma: no cover - guarded earlier
        raise RuntimeError("PyMuPDF (fitz) is not installed. Install with: pip install pymupdf")

    # Errors from opening the file are left to the caller; wrapping them in a
    # try/except that merely re-raises would add nothing.
    doc = fitz.open(pdf_path)

    if not getattr(doc, "needs_pass", False):
        return doc

    if not pdf_password:
        doc.close()
        # Message is intentionally simple so other layers can pattern match.
        raise RuntimeError("Password required to open this PDF.")

    try:
        authenticated = bool(doc.authenticate(pdf_password))
    except Exception as exc:
        doc.close()
        raise RuntimeError("Incorrect PDF password or cannot decrypt.") from exc

    if not authenticated:
        doc.close()
        raise RuntimeError("Incorrect PDF password or cannot decrypt.")

    return doc
91
+
92
+
93
def _prepare_ocr_input(pdf_path: str, pdf_password: Optional[str], tmpdir: str) -> str:
    """Pick the file OCRmyPDF should read, decrypting to *tmpdir* if needed.

    The password is never passed to the external command (it could show up in
    process listings). A document that reports ``needs_pass`` is instead
    opened and authenticated in-process, and its pages are copied into a new
    PDF saved as ``decrypted_input.pdf`` inside *tmpdir*.

    Args:
        pdf_path: Path of the source PDF.
        pdf_password: Password used to open a protected PDF, if any.
        tmpdir: Existing directory that receives the decrypted copy; removing
            it is the caller's job.

    Returns:
        *pdf_path* itself when the document does not report ``needs_pass``,
        otherwise the path of the decrypted copy inside *tmpdir*.
    """
    # Opening also validates the password when one is required.
    source = _open_pdf_with_password(pdf_path, pdf_password)
    try:
        if not getattr(source, "needs_pass", False):
            # Nothing to decrypt: OCRmyPDF can read the original directly.
            return pdf_path

        plain_copy = fitz.open()  # new empty document
        decrypted_path = os.path.join(tmpdir, "decrypted_input.pdf")
        try:
            plain_copy.insert_pdf(source)
            plain_copy.save(decrypted_path)
        finally:
            plain_copy.close()
        return decrypted_path
    finally:
        source.close()
129
+
130
+
131
+ # --------------------------- Public entry point ---------------------------
132
+
133
+ DefProgress = Optional[Callable[[int, int], None]]
134
+
135
+
136
def extract_pages(
    pdf_path: str,
    options: Options,
    progress_cb: DefProgress = None,
    pdf_password: Optional[str] = None,
) -> List[PageText]:
    """Extract the PDF's pages as ``PageText`` using the configured OCR mode.

    ``options.ocr_mode`` (case-insensitive; empty/``None`` means ``"off"``)
    selects the path:

    * ``"off"``: native PyMuPDF extraction.
    * ``"auto"``: probe the document; if it looks scanned, use Tesseract when
      pytesseract, Pillow and the binary are available, else OCRmyPDF when it
      is available, else log a warning and extract natively.
    * ``"tesseract"`` / ``"ocrmypdf"``: use that backend, or fail if its
      prerequisites are missing.

    Args:
        pdf_path: Path of the PDF to read.
        options: Extraction options; ``ocr_mode`` is read here.
        progress_cb: Optional callback invoked as ``(done_pages, total_pages)``.
        pdf_password: Password for protected PDFs, if any.

    Returns:
        One ``PageText`` per extracted page.

    Raises:
        RuntimeError: PyMuPDF is missing, or the explicitly selected OCR
            backend is unavailable.
        ValueError: ``ocr_mode`` is not one of the supported values.
    """
    if fitz is None:
        raise RuntimeError("PyMuPDF (fitz) is not installed. Install with: pip install pymupdf")

    mode = (options.ocr_mode or "off").lower()
    # Every backend takes the same four positional arguments.
    call_args = (pdf_path, options, progress_cb, pdf_password)

    if mode == "off":
        return _extract_native(*call_args)

    if mode == "auto":
        if not _needs_ocr_probe(pdf_path, pdf_password):
            # Document has real text: native path.
            return _extract_native(*call_args)

        log("[extract] Auto: scanned PDF detected.")
        if _HAS_TESS and _HAS_PIL and _tesseract_available():
            log("[extract] Using Tesseract OCR...")
            return _extract_tesseract(*call_args)
        if _which("ocrmypdf") and _tesseract_available():
            log("[extract] Using OCRmyPDF...")
            return _extract_ocrmypdf_then_native(*call_args)

        for message in (
            "[extract] WARNING: Scanned PDF detected but no OCR available!",
            "[extract] Install Tesseract from: https://github.com/UB-Mannheim/tesseract/wiki",
            "[extract] Then run: pip install pytesseract pillow",
            "[extract] Falling back to native extraction (may produce poor results).",
        ):
            log(message)
        return _extract_native(*call_args)

    if mode == "tesseract":
        python_deps_present = _HAS_TESS and _HAS_PIL
        if not python_deps_present:
            raise RuntimeError(
                "OCR mode 'tesseract' selected but pytesseract/Pillow are not available.\n"
                "Install with: pip install pytesseract pillow\n"
                "And install Tesseract from: https://github.com/UB-Mannheim/tesseract/wiki"
            )
        if not _tesseract_available():
            raise RuntimeError(
                "OCR mode 'tesseract' selected but Tesseract binary is not available on PATH.\n"
                "Install Tesseract from: https://github.com/UB-Mannheim/tesseract/wiki"
            )
        return _extract_tesseract(*call_args)

    if mode == "ocrmypdf":
        if not _tesseract_available():
            raise RuntimeError(
                "OCR mode 'ocrmypdf' selected but Tesseract is not available on PATH.\n"
                "Install Tesseract from: https://github.com/UB-Mannheim/tesseract/wiki"
            )
        if not _which("ocrmypdf"):
            raise RuntimeError(
                "OCR mode 'ocrmypdf' selected but ocrmypdf is not installed.\n"
                "Install with: pip install ocrmypdf"
            )
        return _extract_ocrmypdf_then_native(*call_args)

    raise ValueError(f"Unknown ocr_mode: {mode!r}")
200
+
201
+
202
+ # ------------------------ Native PyMuPDF extraction ----------------------
203
+
204
+
205
def _extract_native(
    pdf_path: str,
    options: Options,
    progress_cb: DefProgress,
    pdf_password: Optional[str] = None,
) -> List[PageText]:
    """Build ``PageText`` objects from PyMuPDF's ``"dict"`` text output.

    Args:
        pdf_path: Path of the PDF to read.
        options: Extraction options; when ``preview_only`` is truthy at most
            the first three pages are read.
        progress_cb: Optional callback invoked after each page as
            ``(pages_done, page_count)``; the second value is the document's
            full page count even in preview mode.
        pdf_password: Password for protected PDFs, if any.

    Returns:
        One ``PageText`` per page read, in page order.

    Raises:
        ValueError: The document has no pages.
    """
    document = _open_pdf_with_password(pdf_path, pdf_password)
    try:
        page_total = document.page_count
        if page_total == 0:
            raise ValueError("PDF has no pages")

        if options.preview_only:
            page_limit = min(3, page_total)
        else:
            page_limit = page_total

        pages: List[PageText] = []
        for index in range(page_limit):
            layout = document.load_page(index).get_text("dict")
            pages.append(PageText.from_pymupdf(layout))

            if progress_cb:
                progress_cb(index + 1, page_total)

        return pages
    finally:
        document.close()
233
+
234
+
235
+ # ------------------------ Tesseract-based OCR path -----------------------
236
+
237
+
238
def _extract_tesseract(
    pdf_path: str,
    options: Options,
    progress_cb: DefProgress,
    pdf_password: Optional[str] = None,
) -> List[PageText]:
    """OCR each page: render it to PNG, run Tesseract, wrap the result.

    Pages are rendered at 200 DPI when ``options.preview_only`` is truthy
    (and only the first three pages are processed), otherwise at 300 DPI.

    Args:
        pdf_path: Path of the PDF to read.
        options: Extraction options; ``preview_only`` is read here.
        progress_cb: Optional callback invoked after each page as
            ``(pages_done, page_count)``.
        pdf_password: Password for protected PDFs, if any.

    Returns:
        One ``PageText`` per processed page, built from
        ``pytesseract.image_to_data`` output.

    Raises:
        RuntimeError: pytesseract/Pillow are unavailable, or the rendered
            pixmap has no ``tobytes`` method.
        ValueError: The document has no pages.
    """
    if not _HAS_TESS or not _HAS_PIL:  # pragma: no cover - guarded earlier
        raise RuntimeError("Tesseract/Pillow not available")

    document = _open_pdf_with_password(pdf_path, pdf_password)
    try:
        page_total = document.page_count
        if page_total == 0:
            raise ValueError("PDF has no pages")

        preview = options.preview_only
        page_limit = min(3, page_total) if preview else page_total
        # Lower resolution in preview mode saves memory/time.
        render_dpi = 200 if preview else 300

        pages: List[PageText] = []
        for index in range(page_limit):
            pixmap = document.load_page(index).get_pixmap(dpi=render_dpi)
            if not hasattr(pixmap, "tobytes"):
                raise RuntimeError("Unexpected: pixmap missing tobytes()")

            image = Image.open(io.BytesIO(pixmap.tobytes("png")))

            # Word/line level layout data, returned as a dict of lists.
            ocr_data = pytesseract.image_to_data(
                image, output_type=pytesseract.Output.DICT
            )
            pages.append(PageText.from_tesseract_data(ocr_data))

            if progress_cb:
                progress_cb(index + 1, page_total)

        return pages
    finally:
        document.close()
282
+
283
+
284
+ # ------------------------ OCRmyPDF + native path -------------------------
285
+
286
+
287
def _first_output_line(raw: Optional[bytes]) -> str:
    """Return the first decoded line of captured process output, or ""."""
    if not raw:
        return ""
    # Undecodable bytes are dropped, so the decoded text may have no lines
    # at all even though *raw* is non-empty.
    lines = raw.decode(errors="ignore").splitlines()
    return lines[0] if lines else ""


def _extract_ocrmypdf_then_native(
    pdf_path: str,
    options: Options,
    progress_cb: DefProgress,
    pdf_password: Optional[str] = None,
) -> List[PageText]:
    """Run OCRmyPDF into a temporary directory, then extract natively.

    The ``ocrmypdf`` executable is invoked with ``--force-ocr`` and a
    10-minute timeout. Its output PDF is written inside a temporary directory
    and read with ``_extract_native`` before that directory is removed.

    Args:
        pdf_path: Path of the PDF to OCR.
        options: Extraction options, forwarded to ``_extract_native``.
        progress_cb: Optional progress callback, forwarded to
            ``_extract_native``.
        pdf_password: Password for protected PDFs; used only in-process to
            produce a decrypted input copy, never passed on the command line.

    Returns:
        The pages extracted from the OCR'ed PDF.

    Raises:
        RuntimeError: ``ocrmypdf`` is not on PATH.
        subprocess.TimeoutExpired: OCRmyPDF did not finish within 600 seconds.
        subprocess.CalledProcessError: OCRmyPDF exited with a non-zero status.
    """
    ocrmypdf_bin = _which("ocrmypdf")
    if not ocrmypdf_bin:
        raise RuntimeError("ocrmypdf not found on PATH")

    # The temporary directory holds the OCR output (and a decrypted input
    # copy, when one is needed); both vanish when the block exits.
    with tempfile.TemporaryDirectory(prefix="pdfmd_") as tmp:
        out_pdf = os.path.join(tmp, "ocr.pdf")
        input_for_ocr = _prepare_ocr_input(pdf_path, pdf_password, tmp)

        # --force-ocr ensures OCR even if text exists (it conflicts with
        # --skip-text, which is therefore not passed).
        cmd = [ocrmypdf_bin, "--force-ocr", input_for_ocr, out_pdf]

        log("[extract] Running OCRmyPDF (this may take a while)...")
        try:
            result = subprocess.run(
                cmd,
                check=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                timeout=600,  # 10 minutes, to prevent hanging
            )
        except subprocess.TimeoutExpired:
            log("[extract] ERROR: ocrmypdf timed out after 10 minutes.")
            raise
        except subprocess.CalledProcessError as e:
            log(f"[extract] ERROR: ocrmypdf failed with return code {e.returncode}.")
            if e.stdout:
                log("[extract] stdout (truncated):")
                log("[extract] " + _first_output_line(e.stdout))
            if e.stderr:
                log("[extract] stderr (truncated):")
                log("[extract] " + _first_output_line(e.stderr))
            raise

        # Log only the first line of each stream to avoid spamming the log.
        if result.stdout:
            log("[extract] ocrmypdf output (truncated):")
            log("[extract] " + _first_output_line(result.stdout))
        if result.stderr:
            log("[extract] ocrmypdf stderr (first line):")
            log("[extract] " + _first_output_line(result.stderr))

        # The OCR output is never password protected.
        return _extract_native(out_pdf, options, progress_cb, None)
348
+
349
+
350
+ # ----------------------- OCR probe and helpers ----------------------------
351
+
352
+
353
def _needs_ocr_probe(
    pdf_path: str,
    pdf_password: Optional[str] = None,
    pages_to_check: int = 3,
) -> bool:
    """Heuristically decide whether the PDF looks scanned and needs OCR.

    The first *pages_to_check* pages are inspected. The document is treated
    as scanned when at least one image is displayed over more than 30% of
    its page and the pages carry very little extractable text (fewer than
    100 characters in total, or fewer than 50 per page on average).

    Args:
        pdf_path: Path of the PDF to probe.
        pdf_password: Password for protected PDFs, if any.
        pages_to_check: Maximum number of leading pages to inspect.

    Returns:
        True if the document appears scanned. False otherwise, including
        when the document cannot be opened or has no pages.
    """
    try:
        doc = _open_pdf_with_password(pdf_path, pdf_password)
    except Exception:
        # Best-effort probe: an unopenable file is reported as "no OCR".
        return False

    try:
        if doc.page_count == 0:
            return False

        total = min(pages_to_check, doc.page_count)
        text_chars = 0
        scanned_indicators = 0

        for i in range(total):
            page = doc.load_page(i)
            text_chars += len(page.get_text("text").strip())

            page_rect = page.rect
            page_area = page_rect.width * page_rect.height

            for img_info in page.get_images(full=True):
                try:
                    # Placement rectangles are in page coordinates, so their
                    # area is directly comparable with the page area. (The
                    # bitmap's pixel dimensions are not: they scale with
                    # scan resolution, not with displayed size.)
                    placements = page.get_image_rects(img_info)
                except Exception:
                    continue
                if any(r.width * r.height > 0.3 * page_area for r in placements):
                    scanned_indicators += 1

        # Very low text and presence of large images suggests scanned.
        if text_chars < 100 and scanned_indicators > 0:
            return True

        # Also treat very low text density as scanned.
        avg_text_per_page = text_chars / max(total, 1)
        if avg_text_per_page < 50 and scanned_indicators > 0:
            return True

        return False
    finally:
        doc.close()
414
+
415
+
416
def _tesseract_available() -> bool:
    """Check whether the ``tesseract`` binary on PATH can be executed.

    Only the executable is probed (``tesseract --version`` with a 5-second
    timeout). Whether the ``pytesseract`` Python package is importable is a
    separate question, and the tools that merely need the binary (such as
    OCRmyPDF) must not be blocked by its absence.

    Returns:
        True if the command ran and exited with status 0, False if it could
        not be started, timed out, or exited with a non-zero status.
    """
    try:
        result = subprocess.run(
            ["tesseract", "--version"],
            capture_output=True,
            timeout=5,
        )
    except (OSError, subprocess.SubprocessError):
        # OSError: binary missing or not executable.
        # SubprocessError: includes TimeoutExpired.
        return False
    return result.returncode == 0
434
+
435
+
436
def _which(cmd: str) -> Optional[str]:
    """Locate the executable *cmd*, returning its path or ``None``.

    ``shutil.which`` is consulted first. If it finds nothing, every entry of
    the ``PATH`` environment variable is scanned for an executable file named
    *cmd*; on Windows (``os.name == "nt"``) each ``PATHEXT`` suffix is tried
    as well.
    """
    located = shutil.which(cmd)
    if located:
        return located

    # Manual scan. The empty suffix stands for the bare command name.
    suffixes = [""]
    if os.name == "nt":
        for raw_ext in os.environ.get("PATHEXT", "").split(os.pathsep):
            ext = raw_ext.strip()
            if ext:
                suffixes.append(ext)

    for directory in os.environ.get("PATH", "").split(os.pathsep):
        base = os.path.join(directory, cmd)
        for suffix in suffixes:
            candidate = base + suffix
            if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
                return candidate

    return None
465
+
466
+
467
+ __all__ = [
468
+ "extract_pages",
469
+ ]