thinkpdf 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdfbrain/__init__.py +22 -0
- pdfbrain/app_gui.py +530 -0
- pdfbrain/cache/__init__.py +5 -0
- pdfbrain/cache/cache_manager.py +252 -0
- pdfbrain/cli.py +255 -0
- pdfbrain/core/__init__.py +6 -0
- pdfbrain/core/converter.py +332 -0
- pdfbrain/core/equations.py +635 -0
- pdfbrain/core/extract.py +469 -0
- pdfbrain/core/extractor.py +272 -0
- pdfbrain/core/models.py +196 -0
- pdfbrain/core/pipeline.py +287 -0
- pdfbrain/core/render.py +574 -0
- pdfbrain/core/tables.py +871 -0
- pdfbrain/core/transform.py +604 -0
- pdfbrain/core/utils.py +229 -0
- pdfbrain/engine.py +392 -0
- pdfbrain/mcp_server.py +315 -0
- pdfbrain/utils/__init__.py +1 -0
- thinkpdf-1.0.1.dist-info/METADATA +138 -0
- thinkpdf-1.0.1.dist-info/RECORD +25 -0
- thinkpdf-1.0.1.dist-info/WHEEL +5 -0
- thinkpdf-1.0.1.dist-info/entry_points.txt +4 -0
- thinkpdf-1.0.1.dist-info/licenses/LICENSE +620 -0
- thinkpdf-1.0.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PDF to Markdown Converter - Core conversion engine.
|
|
3
|
+
|
|
4
|
+
This module converts extracted PDF content to Markdown format,
|
|
5
|
+
with support for tables, equations, code blocks, and more.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import List, Optional, Dict, Any, Callable
|
|
14
|
+
|
|
15
|
+
from .extractor import (
|
|
16
|
+
PDFExtractor,
|
|
17
|
+
DocumentContent,
|
|
18
|
+
PageContent,
|
|
19
|
+
TextBlock,
|
|
20
|
+
TextLine,
|
|
21
|
+
TextSpan,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class ConversionOptions:
    """Settings that control a PDF to Markdown conversion.

    Every field has a default, so ``ConversionOptions()`` is a valid
    configuration. Field order is part of the positional constructor
    signature and must not be changed.
    """

    # --- Quality -----------------------------------------------------------
    # One of "fast", "balanced", "maximum".
    quality: str = "balanced"

    # --- Content -----------------------------------------------------------
    # Keep bold/italic emphasis in the generated Markdown.
    preserve_formatting: bool = True
    detect_tables: bool = True
    detect_equations: bool = True
    detect_code_blocks: bool = True

    # --- Header/footer removal ----------------------------------------------
    remove_headers: bool = True
    remove_footers: bool = True

    # --- Heading detection --------------------------------------------------
    # A block whose font size is at least this multiple of the document's
    # base font size is treated as a heading.
    heading_ratio: float = 1.15

    # --- OCR ----------------------------------------------------------------
    # One of "off", "auto", "force".
    ocr_mode: str = "auto"

    # --- Images -------------------------------------------------------------
    export_images: bool = False
    image_output_dir: Optional[str] = None

    # --- LLM validation (optional) -------------------------------------------
    use_llm: bool = False
    llm_service: Optional[str] = None
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass
class ConversionResult:
    """Outcome of one PDF to Markdown conversion.

    Only ``markdown`` is required; each container field defaults to a
    fresh empty object per instance.
    """

    # The generated Markdown text.
    markdown: str
    # Document metadata as provided by the extraction step.
    metadata: Dict[str, Any] = field(default_factory=dict)
    # Image entries associated with the conversion.
    images: List[str] = field(default_factory=list)
    # Human-readable warnings collected during conversion.
    warnings: List[str] = field(default_factory=list)
    # Summary figures about the conversion (e.g. counts).
    stats: Dict[str, Any] = field(default_factory=dict)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class PDFConverter:
    """
    Convert PDF documents to Markdown.

    What the methods of this class do:
    - Heading detection based on a block's font size relative to the
      document's median font size
    - Bold/italic preservation
    - Whitespace clean-up of the generated Markdown

    Extraction itself is delegated to ``PDFExtractor``. The table,
    equation, code-block, header/footer, OCR and LLM options on
    ``ConversionOptions`` are carried on ``self.options`` but are not read
    by the methods of this class.

    Progress is always reported to ``progress_callback`` as
    ``(done, 100)``: extraction covers 0-50, page conversion covers 50-100.
    """

    # Base font size assumed when a document has no spans with a positive size.
    _DEFAULT_BASE_FONT_SIZE = 12.0

    # (minimum size ratio, heading level), checked from largest to smallest.
    _HEADING_LEVELS = ((2.0, 1), (1.6, 2), (1.4, 3), (1.2, 4))

    # Level used for headings below every threshold in _HEADING_LEVELS.
    _SMALLEST_HEADING_LEVEL = 5

    # Level used when a block has no usable font sizes.
    _FALLBACK_HEADING_LEVEL = 2

    def __init__(
        self,
        options: Optional[ConversionOptions] = None,
        progress_callback: Optional[Callable[[int, int], None]] = None,
        log_callback: Optional[Callable[[str], None]] = None,
    ):
        """
        Create a converter.

        Args:
            options: Conversion settings; defaults to ``ConversionOptions()``
            progress_callback: Called as ``callback(done, total)``
            log_callback: Called with each status message
        """
        self.options = options or ConversionOptions()
        self.progress_callback = progress_callback
        self.log_callback = log_callback

        # Track font sizes for heading detection
        self._font_sizes: List[float] = []
        self._base_font_size: float = self._DEFAULT_BASE_FONT_SIZE

    def convert(
        self,
        pdf_path: str | Path,
        output_path: Optional[str | Path] = None,
        password: Optional[str] = None,
    ) -> ConversionResult:
        """
        Convert a PDF file to Markdown.

        Args:
            pdf_path: Path to the PDF file
            output_path: Optional path to save the Markdown file; parent
                directories are created as needed
            password: Optional password for encrypted PDFs

        Returns:
            ConversionResult with markdown content, metadata and stats
            (``page_count``, ``word_count``, ``character_count``)
        """
        pdf_path = Path(pdf_path)

        self._log(f"Starting conversion: {pdf_path.name}")

        # Step 1: Extract content
        self._log("Extracting content...")
        extractor = PDFExtractor(
            extract_images=self.options.export_images,
            preserve_formatting=self.options.preserve_formatting,
            progress_callback=self._extraction_progress,
        )
        document = extractor.extract(pdf_path, password=password)

        # Step 2: Analyze document
        self._log("Analyzing document structure...")
        self._analyze_fonts(document)

        # Step 3: Convert pages
        self._log("Converting to Markdown...")
        total_pages = len(document.pages)
        markdown_parts = []
        for page_number, page in enumerate(document.pages, start=1):
            markdown_parts.append(self._convert_page(page))
            # Second half of the overall progress, same scale as extraction.
            self._report_progress(page_number, total_pages, offset=50)

        # Step 4: Combine and clean up
        markdown = self._clean_markdown("\n\n".join(markdown_parts))

        # Step 5: Save if output path provided
        if output_path:
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(markdown, encoding="utf-8")
            self._log(f"Saved to: {output_path}")

        result = ConversionResult(
            markdown=markdown,
            metadata=document.metadata,
            stats={
                "page_count": document.page_count,
                "word_count": len(markdown.split()),
                "character_count": len(markdown),
            },
        )

        self._log("Conversion complete!")
        return result

    def _analyze_fonts(self, document: DocumentContent) -> None:
        """Record the document's font sizes and pick the median as base size."""
        sizes = sorted(
            span.size
            for page in document.pages
            for block in page.blocks
            for line in block.lines
            for span in line.spans
            if span.size > 0
        )

        # Always overwrite, so a reused converter never keeps the previous
        # document's base size when this document has no sized spans.
        self._font_sizes = sizes
        if sizes:
            # Upper median of the observed sizes.
            self._base_font_size = sizes[len(sizes) // 2]
        else:
            self._base_font_size = self._DEFAULT_BASE_FONT_SIZE

    def _convert_page(self, page: PageContent) -> str:
        """Convert a single page to Markdown, one paragraph per non-empty block."""
        converted = (self._convert_block(block) for block in page.blocks)
        return "\n\n".join(text for text in converted if text.strip())

    def _convert_block(self, block: TextBlock) -> str:
        """Convert a text block to Markdown, prefixing headings with ``#``."""
        converted = (self._convert_line(line) for line in block.lines)
        body = " ".join(text for text in converted if text.strip())

        if body and self._is_heading(block):
            return "#" * self._get_heading_level(block) + " " + body
        return body

    def _convert_line(self, line: TextLine) -> str:
        """Convert a text line to Markdown with bold/italic formatting."""
        parts = []

        for span in line.spans:
            text = span.text
            if text.strip() and self.options.preserve_formatting:
                text = self._wrap_emphasis(text, self._emphasis_marker(span))
            parts.append(text)

        return "".join(parts)

    @staticmethod
    def _emphasis_marker(span: TextSpan) -> str:
        """Return the Markdown emphasis delimiter for a span ("" if plain)."""
        if span.is_bold and span.is_italic:
            return "***"
        if span.is_bold:
            return "**"
        if span.is_italic:
            return "*"
        return ""

    @staticmethod
    def _wrap_emphasis(text: str, marker: str) -> str:
        """Wrap the non-blank core of ``text`` in ``marker``.

        Surrounding whitespace stays outside the delimiters, so neighbouring
        spans are not glued together ("bold " + "text" -> "**bold** text").
        """
        if not marker:
            return text
        core = text.strip()
        lead_len = len(text) - len(text.lstrip())
        leading = text[:lead_len]
        trailing = text[lead_len + len(core):]
        return f"{leading}{marker}{core}{marker}{trailing}"

    def _size_ratio(self, block: TextBlock) -> Optional[float]:
        """Return the block's mean font size divided by the base font size.

        Returns None when the block has no spans with a positive size, and
        1.0 when the base font size is not positive.
        """
        sizes = [
            span.size
            for line in block.lines
            for span in line.spans
            if span.size > 0
        ]
        if not sizes:
            return None
        if self._base_font_size <= 0:
            return 1.0
        return (sum(sizes) / len(sizes)) / self._base_font_size

    def _is_heading(self, block: TextBlock) -> bool:
        """Determine if a block should be formatted as a heading."""
        ratio = self._size_ratio(block)
        return ratio is not None and ratio >= self.options.heading_ratio

    def _get_heading_level(self, block: TextBlock) -> int:
        """Determine the heading level (1-5) based on font size."""
        ratio = self._size_ratio(block)
        if ratio is None:
            return self._FALLBACK_HEADING_LEVEL

        for minimum_ratio, level in self._HEADING_LEVELS:
            if ratio >= minimum_ratio:
                return level
        return self._SMALLEST_HEADING_LEVEL

    def _clean_markdown(self, markdown: str) -> str:
        """Clean up the generated Markdown."""
        # Remove trailing whitespace first, so whitespace-only lines count
        # as blank lines in the collapsing steps below.
        markdown = "\n".join(line.rstrip() for line in markdown.split("\n"))

        # Remove excessive blank lines
        markdown = re.sub(r"\n{4,}", "\n\n\n", markdown)

        # Fix spacing around headings
        markdown = re.sub(r"(#{1,6} .+)\n{3,}", r"\1\n\n", markdown)

        # Ensure single newline at end
        return markdown.strip() + "\n"

    def _report_progress(self, done: int, total: int, offset: int) -> None:
        """Report one phase's progress as a 50-point share of 100.

        Args:
            done: Units finished in the current phase
            total: Units in the current phase; a non-positive total is
                reported as a finished phase instead of dividing by zero
            offset: Points already earned by earlier phases (0 or 50)
        """
        if not self.progress_callback:
            return
        share = done * 50 // total if total > 0 else 50
        self.progress_callback(offset + share, 100)

    def _extraction_progress(self, done: int, total: int) -> None:
        """Progress callback for extraction phase (first half of overall)."""
        self._report_progress(done, total, offset=0)

    def _log(self, message: str) -> None:
        """Log a message if callback is set."""
        if self.log_callback:
            self.log_callback(message)
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def convert_pdf_to_markdown(
    pdf_path: str | Path,
    output_path: Optional[str | Path] = None,
    quality: str = "balanced",
    password: Optional[str] = None,
) -> str:
    """
    Quick conversion of PDF to Markdown.

    Builds a ``PDFConverter`` with default options (apart from ``quality``)
    and no progress or log callbacks, then runs a single conversion.

    Args:
        pdf_path: Path to PDF file
        output_path: Optional path to save output
        quality: "fast", "balanced", or "maximum"
        password: Optional password for encrypted PDFs; forwarded to
            ``PDFConverter.convert``

    Returns:
        Markdown string
    """
    converter = PDFConverter(options=ConversionOptions(quality=quality))
    result = converter.convert(
        pdf_path,
        output_path=output_path,
        password=password,
    )
    return result.markdown
|