thinkpdf 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,272 @@
1
+ """
2
+ PDF Extractor - Core extraction engine using PyMuPDF.
3
+
4
+ This module handles the low-level extraction of content from PDF files,
5
+ including text, images, tables, and metadata.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import hashlib
11
+ from dataclasses import dataclass, field
12
+ from pathlib import Path
13
+ from typing import List, Optional, Dict, Any, Callable
14
+ import fitz # PyMuPDF
15
+
16
+
17
@dataclass
class TextSpan:
    """One run of text plus the styling values recorded for it.

    ``flags`` holds the raw style bit field; the two properties below decode
    the bold and italic bits from it.
    """

    text: str
    font: str = ""
    size: float = 0.0
    flags: int = 0  # raw style bit field (bold, italic, etc.)
    color: int = 0

    @property
    def is_bold(self) -> bool:
        """True when bit 4 (value 16) of ``flags`` is set."""
        return (self.flags & 0b10000) != 0

    @property
    def is_italic(self) -> bool:
        """True when bit 1 (value 2) of ``flags`` is set."""
        return (self.flags & 0b00010) != 0
33
+
34
+
35
@dataclass
class TextLine:
    """A single line of text, stored as its ordered spans and a bounding box."""

    spans: List[TextSpan] = field(default_factory=list)
    bbox: tuple = (0, 0, 0, 0)

    @property
    def text(self) -> str:
        """The text of every span concatenated in order, with no separator."""
        pieces = [span.text for span in self.spans]
        return "".join(pieces)
44
+
45
+
46
@dataclass
class TextBlock:
    """A block of text made of one or more lines, with a bounding box."""

    lines: List[TextLine] = field(default_factory=list)
    bbox: tuple = (0, 0, 0, 0)
    block_type: str = "text"  # text, image, table

    @property
    def text(self) -> str:
        """The text of every line, one line per row (newline-separated)."""
        rows = [line.text for line in self.lines]
        return "\n".join(rows)
56
+
57
+
58
@dataclass
class PageContent:
    """Everything extracted from one page: text blocks, image records, size."""

    page_number: int
    blocks: List[TextBlock] = field(default_factory=list)
    images: List[Dict[str, Any]] = field(default_factory=list)
    width: float = 0.0
    height: float = 0.0

    @property
    def text(self) -> str:
        """The text of every block, with a blank line between blocks."""
        paragraphs = [block.text for block in self.blocks]
        return "\n\n".join(paragraphs)
70
+
71
+
72
@dataclass
class DocumentContent:
    """The extracted pages of a document plus its metadata and file hash."""

    pages: List[PageContent] = field(default_factory=list)
    metadata: Dict[str, Any] = field(default_factory=dict)
    file_hash: str = ""

    @property
    def text(self) -> str:
        """The text of every page, with a blank line between pages."""
        per_page = [page.text for page in self.pages]
        return "\n\n".join(per_page)

    @property
    def page_count(self) -> int:
        """Number of pages held in ``pages``."""
        count = len(self.pages)
        return count
86
+
87
+
88
class PDFExtractor:
    """
    Extract content from PDF files using PyMuPDF.

    Features:
    - Text extraction that keeps per-span font, size, flags and color
    - Optional image-block geometry (enabled with ``extract_images``)
    - Support for password-protected PDFs
    - Progress callbacks for large documents
    """

    def __init__(
        self,
        extract_images: bool = False,
        preserve_formatting: bool = True,
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ):
        """
        Configure the extractor.

        Args:
            extract_images: When True, image blocks are recorded for each page.
            preserve_formatting: Stored on the instance; no method of this
                class consults it.
            progress_callback: Optional ``callback(done, total)`` invoked
                after each page has been processed.
        """
        self.extract_images = extract_images
        self.preserve_formatting = preserve_formatting
        self.progress_callback = progress_callback

    def extract(
        self,
        pdf_path: str | Path,
        password: Optional[str] = None,
        page_range: Optional[tuple[int, int]] = None,
    ) -> DocumentContent:
        """
        Extract content from a PDF file.

        Args:
            pdf_path: Path to the PDF file
            password: Optional password for encrypted PDFs
            page_range: Optional (start, end) page range (0-indexed, inclusive).
                Both ends are clamped to the document; a range whose start
                lies after its end yields no pages.

        Returns:
            DocumentContent with extracted pages and metadata

        Raises:
            ValueError: If the PDF is encrypted and no password was given,
                or the given password is rejected.
        """
        pdf_path = Path(pdf_path)

        # Calculate file hash for caching
        file_hash = self._calculate_hash(pdf_path)

        doc = fitz.open(pdf_path)
        # Everything after open() runs under try/finally so the document
        # handle is released even when extraction fails part-way.
        try:
            if doc.is_encrypted:
                if not password:
                    raise ValueError("PDF is encrypted and no password provided")
                # authenticate() reports failure by returning 0 rather than
                # raising, so the result has to be checked explicitly.
                if not doc.authenticate(password):
                    raise ValueError("Incorrect password for encrypted PDF")

            # Determine page range (inclusive on both ends)
            last_page = doc.page_count - 1
            start_page, end_page = 0, last_page
            if page_range:
                start_page = max(0, page_range[0])
                end_page = min(last_page, page_range[1])

            # Extract metadata; tolerate a document that exposes none.
            source_meta = doc.metadata or {}
            metadata = {
                "title": source_meta.get("title", ""),
                "author": source_meta.get("author", ""),
                "subject": source_meta.get("subject", ""),
                "creator": source_meta.get("creator", ""),
                "page_count": doc.page_count,
                "file_path": str(pdf_path),
            }

            # Extract pages
            pages = []
            total_pages = end_page - start_page + 1
            page_numbers = range(start_page, end_page + 1)
            for done, page_num in enumerate(page_numbers, start=1):
                pages.append(self._extract_page(doc[page_num], page_num))

                if self.progress_callback:
                    self.progress_callback(done, total_pages)
        finally:
            doc.close()

        return DocumentContent(
            pages=pages,
            metadata=metadata,
            file_hash=file_hash,
        )

    def _extract_page(self, page: fitz.Page, page_number: int) -> PageContent:
        """
        Extract content from a single page.

        Text blocks whose text is only whitespace are dropped. Image blocks
        are recorded only when ``extract_images`` is set.
        """
        blocks = []
        images = []

        # Get page dimensions
        rect = page.rect
        width, height = rect.width, rect.height

        # Extract text blocks with detailed info
        text_dict = page.get_text("dict", flags=fitz.TEXT_PRESERVE_WHITESPACE)

        for block in text_dict.get("blocks", []):
            kind = block.get("type")
            if kind == 0:  # Text block
                text_block = self._parse_text_block(block)
                if text_block.text.strip():
                    blocks.append(text_block)
            elif kind == 1 and self.extract_images:  # Image block
                image_info = self._extract_image_info(block, page)
                if image_info:
                    images.append(image_info)

        return PageContent(
            page_number=page_number,
            blocks=blocks,
            images=images,
            width=width,
            height=height,
        )

    def _parse_text_block(self, block: dict) -> TextBlock:
        """Parse a PyMuPDF text block into our format."""
        lines = []

        for line in block.get("lines", []):
            spans = [
                TextSpan(
                    text=span.get("text", ""),
                    font=span.get("font", ""),
                    size=span.get("size", 0.0),
                    flags=span.get("flags", 0),
                    color=span.get("color", 0),
                )
                for span in line.get("spans", [])
            ]

            # A line without spans carries no text and is left out.
            if spans:
                lines.append(
                    TextLine(spans=spans, bbox=line.get("bbox", (0, 0, 0, 0)))
                )

        return TextBlock(
            lines=lines,
            bbox=block.get("bbox", (0, 0, 0, 0)),
            block_type="text",
        )

    def _extract_image_info(self, block: dict, page: fitz.Page) -> Optional[Dict[str, Any]]:
        """
        Extract image information from a block.

        Only geometry is recorded: the bounding box, the width and height
        derived from it, and the page number.

        Returns:
            The info dict, or None when the block's data cannot be read.
        """
        # Deliberately best-effort: one malformed image block must not abort
        # extraction of the rest of the page, so any failure maps to None.
        try:
            bbox = block.get("bbox", (0, 0, 0, 0))
            return {
                "bbox": bbox,
                "width": bbox[2] - bbox[0],
                "height": bbox[3] - bbox[1],
                "page": page.number,
            }
        except Exception:
            return None

    def _calculate_hash(self, pdf_path: Path) -> str:
        """Calculate SHA256 hash of the PDF file, reading it in 8 KiB chunks."""
        sha256 = hashlib.sha256()
        with open(pdf_path, "rb") as f:
            for chunk in iter(lambda: f.read(8192), b""):
                sha256.update(chunk)
        return sha256.hexdigest()
252
+
253
+
254
+ # Convenience function
255
def extract_pdf(
    pdf_path: str | Path,
    password: Optional[str] = None,
    extract_images: bool = False,
) -> DocumentContent:
    """
    One-call shortcut: build a PDFExtractor and run it on a single file.

    Args:
        pdf_path: Path to PDF file
        password: Optional password
        extract_images: Whether to extract image info

    Returns:
        DocumentContent with extracted content
    """
    return PDFExtractor(extract_images=extract_images).extract(
        pdf_path,
        password=password,
    )
@@ -0,0 +1,196 @@
1
+ """Core data models for pdfmd.
2
+
3
+ This module defines lightweight, serializable structures that represent the
4
+ intermediate text model we pass through the pipeline:
5
+
6
+ - Span: A run of text with uniform styling.
7
+ - Line: A sequence of spans that appear on the same baseline.
8
+ - Block: A group of lines (roughly a paragraph or heading candidate).
9
+ - PageText: All text blocks for a page.
10
+ - Options: User-configurable knobs used by extract/transform/render stages.
11
+
12
+ We provide static constructors to build PageText from:
13
+ • PyMuPDF ("dict" output)
14
+ • Tesseract (pytesseract.image_to_data dict)
15
+
16
+ These constructors keep *only* the essentials the rest of the pipeline needs:
17
+ text runs and coarse style hints (approx size, bold, italic). Layout geometry is
18
+ not preserved beyond what helps basic heuristics.
19
+ """
20
+ from __future__ import annotations
21
+
22
+ from dataclasses import dataclass, field
23
+ from typing import List, Dict, Any, Iterable, Optional, Literal
24
+
25
+
26
+ # ---------------------------- Text structures ----------------------------
27
@dataclass
class Span:
    """A run of text with uniform styling."""

    text: str
    size: float = 0.0      # font size hint; defaults to 0.0
    bold: bool = False     # bold style hint
    italic: bool = False   # italic style hint
33
+
34
+
35
@dataclass
class Line:
    """A sequence of spans that appear on the same baseline."""

    spans: List[Span] = field(default_factory=list)

    def text(self) -> str:
        """Return the span texts concatenated in order, with no separator."""
        parts = [piece.text for piece in self.spans]
        return "".join(parts)
41
+
42
+
43
@dataclass
class Block:
    """A group of lines (roughly a paragraph or heading candidate)."""

    lines: List[Line] = field(default_factory=list)

    def is_empty(self) -> bool:
        """Return True when no span in any line has non-whitespace text."""
        has_visible_text = any(
            span.text.strip()
            for row in self.lines
            for span in row.spans
        )
        return not has_visible_text
52
+
53
+
54
@dataclass
class PageText:
    """All text blocks for a page.

    Besides direct construction, two static constructors build a PageText
    from PyMuPDF "dict" output or from a pytesseract ``image_to_data`` dict.
    """

    blocks: List[Block] = field(default_factory=list)

    # ------------------------ PyMuPDF constructor ------------------------
    @staticmethod
    def from_pymupdf(page_dict: Dict[str, Any]) -> "PageText":
        """Build a PageText from fitz.Page.get_text("dict").

        We extract spans (text, size, bold/italic hints) and group them into
        lines and blocks following the original dict structure. Blocks
        without a "lines" key, spans with empty text, and lines or blocks
        left with nothing in them are omitted.
        """
        def span_style(span: Dict[str, Any]) -> tuple[float, bool, bool, str]:
            # ``or`` fallbacks also cover keys that are present but None.
            txt = span.get("text", "") or ""
            size = float(span.get("size", 0.0) or 0.0)
            flags = int(span.get("flags", 0) or 0)
            font = str(span.get("font", "") or "").lower()
            # Heuristics similar to PyMuPDF semantics: trust the flag bits,
            # and also accept a style keyword embedded in the font name.
            is_bold = bool(flags & 16) or any(k in font for k in ("bold", "black", "heavy", "semibold"))
            is_italic = bool(flags & 2) or any(k in font for k in ("italic", "oblique"))
            return size, is_bold, is_italic, txt

        blocks: List[Block] = []
        for b in page_dict.get("blocks", []) or []:
            if "lines" not in b:
                # skip images and non-text blocks here
                continue
            lines: List[Line] = []
            for ln in b.get("lines", []) or []:
                spans: List[Span] = []
                for sp in ln.get("spans", []) or []:
                    size, bold, italic, txt = span_style(sp)
                    if not txt:
                        continue
                    spans.append(Span(text=txt, size=size, bold=bold, italic=italic))
                if spans:
                    lines.append(Line(spans=spans))
            if lines:
                blocks.append(Block(lines=lines))
        return PageText(blocks=blocks)

    # ------------------------- Tesseract constructor -------------------------
    @staticmethod
    def from_tesseract_data(data: Dict[str, List[Any]]) -> "PageText":
        """Build PageText from pytesseract.image_to_data() result.

        The data dict contains parallel lists for keys: level, page_num, block_num,
        par_num, line_num, word_num, left, top, width, height, conf, text.

        We group by (block_num, line_num). We do not try to infer bold/italic.
        A crude font-size proxy uses the median of word heights in a line.
        Entries whose text is empty or whitespace-only are ignored. A missing
        numeric column is treated as all zeros.
        """
        texts = data.get("text", [])
        n = len(texts)
        if n == 0:
            return PageText()

        def column(name: str) -> List[Any]:
            # Resolve each parallel list exactly once. Building the zero
            # fallback per word (as a ``dict.get`` default inside the loop)
            # would allocate an n-element list for every entry.
            return data[name] if name in data else [0] * n

        block_nums = column("block_num")
        line_nums = column("line_num")
        tops = column("top")
        lefts = column("left")
        heights = column("height")

        # Group indices by (block_num, line_num)
        groups: Dict[tuple[int, int], List[int]] = {}
        for i, raw in enumerate(texts):
            if not (raw or "").strip():
                continue
            key = (int(block_nums[i] or 0), int(line_nums[i] or 0))
            groups.setdefault(key, []).append(i)

        # Sort groups by block, then line order (topmost word of each line)
        def group_top(idx_list: List[int]) -> int:
            return min((int(tops[i] or 0) for i in idx_list), default=0)

        ordered_keys = sorted(groups, key=lambda k: (k[0], group_top(groups[k])))

        blocks: List[Block] = []
        cur_block_key: Optional[int] = None
        cur_block_lines: List[Line] = []

        for key in ordered_keys:
            bno = key[0]
            idxs = groups[key]
            # estimate size by median of heights in this line
            size_est = float(median_safe([int(heights[i] or 0) for i in idxs]))
            # assemble spans in reading order (left coordinate if present)
            idxs_sorted = sorted(idxs, key=lambda i: int(lefts[i] or 0))
            # NOTE(review): each word becomes its own Span and no separator
            # is inserted, so Line.text() returns the words run together --
            # confirm that downstream code adds the spacing.
            line = Line(spans=[Span(text=str(texts[i]), size=size_est) for i in idxs_sorted])

            # Keys are ordered by block number, so a change of block number
            # means the previous block is complete and can be flushed.
            if cur_block_key is not None and bno != cur_block_key:
                blocks.append(Block(lines=cur_block_lines))
                cur_block_lines = []
            cur_block_key = bno
            cur_block_lines.append(line)

        if cur_block_lines:
            blocks.append(Block(lines=cur_block_lines))

        return PageText(blocks=blocks)
156
+
157
+
158
+ # ------------------------------ Options ------------------------------
159
@dataclass
class Options:
    """User-configurable knobs used by the extract/transform/render stages."""

    # --- Extraction / OCR ---
    ocr_mode: Literal["off", "auto", "tesseract", "ocrmypdf"] = "off"
    preview_only: bool = False

    # --- Transform heuristics ---
    caps_to_headings: bool = True
    defragment_short: bool = True
    heading_size_ratio: float = 1.15
    orphan_max_len: int = 45
    remove_headers_footers: bool = True

    # --- Rendering / output ---
    insert_page_breaks: bool = False
    export_images: bool = False
175
+
176
+
177
+ # ------------------------------ Utilities ------------------------------
178
def median_safe(vals: Iterable[int | float]) -> float:
    """Return the median of *vals* as a float, or 0.0 when *vals* is empty.

    With an even number of values the two middle ones are averaged.
    """
    ordered = sorted(float(v) for v in vals)
    count = len(ordered)
    if count == 0:
        return 0.0
    mid, is_odd = divmod(count, 2)
    if is_odd:
        return ordered[mid]
    return (ordered[mid - 1] + ordered[mid]) / 2.0
187
+
188
+
189
# Names exported by ``from <module> import *``.
__all__ = ["Span", "Line", "Block", "PageText", "Options", "median_safe"]