thinkpdf 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,332 @@
1
+ """
2
+ PDF to Markdown Converter - Core conversion engine.
3
+
4
+ This module converts extracted PDF content to Markdown format,
5
+ with support for tables, equations, code blocks, and more.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import re
11
+ from dataclasses import dataclass, field
12
+ from pathlib import Path
13
+ from typing import List, Optional, Dict, Any, Callable
14
+
15
+ from .extractor import (
16
+ PDFExtractor,
17
+ DocumentContent,
18
+ PageContent,
19
+ TextBlock,
20
+ TextLine,
21
+ TextSpan,
22
+ )
23
+
24
+
25
@dataclass
class ConversionOptions:
    """Settings for a PDF to Markdown conversion run.

    Attributes:
        quality: Quality setting; the documented values are "fast",
            "balanced" and "maximum".
        preserve_formatting: Content option, enabled by default.
        detect_tables: Content option, enabled by default.
        detect_equations: Content option, enabled by default.
        detect_code_blocks: Content option, enabled by default.
        remove_headers: Header removal switch, enabled by default.
        remove_footers: Footer removal switch, enabled by default.
        heading_ratio: Font size ratio at which text is considered a heading.
        ocr_mode: OCR setting; the documented values are "off", "auto"
            and "force".
        export_images: Image export switch, disabled by default.
        image_output_dir: Optional directory for images.
        use_llm: Optional LLM validation switch, disabled by default.
        llm_service: Optional LLM service identifier.
    """

    # --- quality ---
    quality: str = "balanced"

    # --- content ---
    preserve_formatting: bool = True
    detect_tables: bool = True
    detect_equations: bool = True
    detect_code_blocks: bool = True

    # --- header / footer removal ---
    remove_headers: bool = True
    remove_footers: bool = True

    # --- heading detection ---
    heading_ratio: float = 1.15

    # --- OCR ---
    ocr_mode: str = "auto"

    # --- images ---
    export_images: bool = False
    image_output_dir: Optional[str] = None

    # --- optional LLM validation ---
    use_llm: bool = False
    llm_service: Optional[str] = None
55
+
56
+
57
@dataclass
class ConversionResult:
    """Outcome of a PDF to Markdown conversion.

    Attributes:
        markdown: The generated Markdown text (required).
        metadata: Metadata mapping; a fresh empty dict by default.
        images: List of image entries; a fresh empty list by default.
        warnings: List of warning messages; a fresh empty list by default.
        stats: Statistics mapping; a fresh empty dict by default.
    """

    markdown: str
    # Each container default uses a factory so instances never share state.
    metadata: Dict[str, Any] = field(default_factory=dict)
    images: List[str] = field(default_factory=list)
    warnings: List[str] = field(default_factory=list)
    stats: Dict[str, Any] = field(default_factory=dict)
65
+
66
+
67
class PDFConverter:
    """
    Convert PDF documents to Markdown.

    Extraction is delegated to ``PDFExtractor``; this class turns the
    extracted pages, blocks, lines and spans into Markdown text.

    Implemented here:
    - Heading detection based on font size relative to the document's
      median font size
    - Bold/italic preservation (when ``options.preserve_formatting`` is set)
    - Blank-line and trailing-whitespace clean-up of the generated Markdown
    - Optional progress and log callbacks

    NOTE(review): other ``ConversionOptions`` fields (table/equation/code
    detection, header/footer removal, OCR, LLM) are not read by the code in
    this class.
    """

    # Base font size assumed until a document with sized spans is analysed.
    _DEFAULT_BASE_FONT_SIZE: float = 12.0

    # Four or more consecutive newlines, i.e. three or more blank lines.
    _EXCESS_NEWLINES_RE = re.compile(r"\n{4,}")
    # A heading at the start of a line followed by two or more blank lines.
    # Anchored so that text such as "C# lang" in mid-line is not mistaken
    # for a heading.
    _HEADING_GAP_RE = re.compile(r"^(#{1,6} .+)\n{3,}", re.MULTILINE)

    def __init__(
        self,
        options: Optional[ConversionOptions] = None,
        progress_callback: Optional[Callable[[int, int], None]] = None,
        log_callback: Optional[Callable[[str], None]] = None,
    ):
        """
        Create a converter.

        Args:
            options: Conversion settings; a default ``ConversionOptions`` is
                created when omitted.
            progress_callback: Optional callable invoked as
                ``callback(done, total)``. This class always reports on a
                scale of 100: extraction covers 0-50, page conversion 50-100.
            log_callback: Optional callable invoked with each log message.
        """
        self.options = options or ConversionOptions()
        self.progress_callback = progress_callback
        self.log_callback = log_callback

        # Track font sizes for heading detection
        self._font_sizes: List[float] = []
        self._base_font_size: float = self._DEFAULT_BASE_FONT_SIZE

    def convert(
        self,
        pdf_path: str | Path,
        output_path: Optional[str | Path] = None,
        password: Optional[str] = None,
    ) -> ConversionResult:
        """
        Convert a PDF file to Markdown.

        Args:
            pdf_path: Path to the PDF file
            output_path: Optional path to save the Markdown file; parent
                directories are created as needed
            password: Optional password for encrypted PDFs

        Returns:
            ConversionResult with markdown content, the document metadata
            and page/word/character statistics
        """
        pdf_path = Path(pdf_path)

        self._log(f"Starting conversion: {pdf_path.name}")

        # Step 1: Extract content
        self._log("Extracting content...")
        extractor = PDFExtractor(
            extract_images=self.options.export_images,
            preserve_formatting=self.options.preserve_formatting,
            progress_callback=self._extraction_progress,
        )
        document = extractor.extract(pdf_path, password=password)

        # Step 2: Analyze document
        self._log("Analyzing document structure...")
        self._analyze_fonts(document)

        # Step 3: Convert pages
        self._log("Converting to Markdown...")
        total_pages = len(document.pages)
        markdown_parts = []
        for page_number, page in enumerate(document.pages, start=1):
            markdown_parts.append(self._convert_page(page))
            # Reported on the same 0-100 scale as the extraction phase so
            # that progress never moves backwards.
            self._conversion_progress(page_number, total_pages)

        # Step 4: Combine and clean up
        markdown = self._clean_markdown("\n\n".join(markdown_parts))

        # Step 5: Save if output path provided
        if output_path:
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(markdown, encoding="utf-8")
            self._log(f"Saved to: {output_path}")

        # Build result
        result = ConversionResult(
            markdown=markdown,
            metadata=document.metadata,
            stats={
                "page_count": document.page_count,
                "word_count": len(markdown.split()),
                "character_count": len(markdown),
            },
        )

        self._log("Conversion complete!")
        return result

    @staticmethod
    def _positive_span_sizes(block: TextBlock) -> List[float]:
        """Return the font sizes of all spans in ``block`` that are > 0."""
        return [
            span.size
            for line in block.lines
            for span in line.spans
            if span.size > 0
        ]

    def _analyze_fonts(self, document: DocumentContent) -> None:
        """
        Record the document's font sizes and pick the base font size.

        The base size is the upper median of all positive span sizes. State
        from a previously converted document is reset first, so a document
        without any sized spans falls back to the default base size instead
        of inheriting the previous document's value.
        """
        sizes: List[float] = []
        for page in document.pages:
            for block in page.blocks:
                sizes.extend(self._positive_span_sizes(block))
        sizes.sort()

        self._font_sizes = sizes
        if sizes:
            # Use median as base font size
            self._base_font_size = sizes[len(sizes) // 2]
        else:
            self._base_font_size = self._DEFAULT_BASE_FONT_SIZE

    def _convert_page(self, page: PageContent) -> str:
        """Convert a single page to Markdown, skipping blocks with no text."""
        converted = (self._convert_block(block) for block in page.blocks)
        return "\n\n".join(md for md in converted if md.strip())

    def _convert_block(self, block: TextBlock) -> str:
        """
        Convert a text block to Markdown.

        Non-blank lines are joined with single spaces; if the block's font
        size qualifies it as a heading, the text is prefixed with the
        matching number of ``#`` characters.
        """
        converted = (self._convert_line(line) for line in block.lines)
        lines_md = [md for md in converted if md.strip()]
        text = " ".join(lines_md)

        # Check if this should be a heading
        if lines_md and self._is_heading(block):
            return "#" * self._get_heading_level(block) + " " + text

        return text

    @staticmethod
    def _emphasis_marker(span: TextSpan) -> str:
        """Return the Markdown emphasis delimiter for ``span`` ("" if none)."""
        if span.is_bold and span.is_italic:
            return "***"
        if span.is_bold:
            return "**"
        if span.is_italic:
            return "*"
        return ""

    @staticmethod
    def _wrap_emphasis(text: str, marker: str) -> str:
        """
        Wrap the non-whitespace core of ``text`` in ``marker``.

        Leading and trailing whitespace is kept outside the delimiters so
        that neighbouring spans stay separated ("**Hello** world" rather
        than "**Hello**world").
        """
        core = text.strip()
        leading = text[: len(text) - len(text.lstrip())]
        trailing = text[len(text.rstrip()):]
        return f"{leading}{marker}{core}{marker}{trailing}"

    def _convert_line(self, line: TextLine) -> str:
        """
        Convert a text line to Markdown with formatting.

        Whitespace-only spans are passed through unchanged. Bold/italic
        markers are applied only when ``options.preserve_formatting`` is set.
        """
        parts = []

        for span in line.spans:
            text = span.text

            if text.strip() and self.options.preserve_formatting:
                marker = self._emphasis_marker(span)
                if marker:
                    text = self._wrap_emphasis(text, marker)

            parts.append(text)

        return "".join(parts)

    def _size_ratio(self, block: TextBlock) -> Optional[float]:
        """
        Return the block's average font size divided by the base font size.

        Returns ``None`` when the block has no span with a positive size,
        and 1.0 when the base font size is not positive.
        """
        sizes = self._positive_span_sizes(block)
        if not sizes:
            return None
        if self._base_font_size <= 0:
            return 1.0
        return (sum(sizes) / len(sizes)) / self._base_font_size

    def _is_heading(self, block: TextBlock) -> bool:
        """Return True if the block's size ratio reaches ``heading_ratio``."""
        ratio = self._size_ratio(block)
        if ratio is None:
            return False
        return ratio >= self.options.heading_ratio

    def _get_heading_level(self, block: TextBlock) -> int:
        """
        Determine the heading level based on font size.

        Returns a level from 1 (ratio >= 2.0) to 5 (ratio < 1.2); blocks
        without any sized span get level 2.
        """
        ratio = self._size_ratio(block)
        if ratio is None:
            return 2

        # Map ratio to heading level
        if ratio >= 2.0:
            return 1
        if ratio >= 1.6:
            return 2
        if ratio >= 1.4:
            return 3
        if ratio >= 1.2:
            return 4
        return 5

    def _clean_markdown(self, markdown: str) -> str:
        """
        Clean up the generated Markdown.

        Trailing whitespace is removed first so that whitespace-only lines
        count as blank lines when runs of blank lines are collapsed.
        The result always ends with exactly one newline.
        """
        # Remove trailing whitespace
        markdown = "\n".join(line.rstrip() for line in markdown.split("\n"))

        # Remove excessive blank lines (keep at most two)
        markdown = self._EXCESS_NEWLINES_RE.sub("\n\n\n", markdown)

        # Fix spacing around headings (exactly one blank line after)
        markdown = self._HEADING_GAP_RE.sub(r"\1\n\n", markdown)

        # Ensure single newline at end
        return markdown.strip() + "\n"

    def _extraction_progress(self, done: int, total: int) -> None:
        """
        Progress callback for extraction phase.

        Maps extraction progress to the first half (0-50 of 100) of overall
        progress. Nothing is reported when ``total`` is not positive, which
        avoids dividing by zero.
        """
        if self.progress_callback and total > 0:
            self.progress_callback(done * 50 // total, 100)

    def _conversion_progress(self, done: int, total: int) -> None:
        """
        Report page-conversion progress.

        Maps conversion progress to the second half (50-100 of 100) of
        overall progress. Nothing is reported when ``total`` is not positive.
        """
        if self.progress_callback and total > 0:
            self.progress_callback(50 + done * 50 // total, 100)

    def _log(self, message: str) -> None:
        """Log a message if callback is set."""
        if self.log_callback:
            self.log_callback(message)
311
+
312
+
313
def convert_pdf_to_markdown(
    pdf_path: str | Path,
    output_path: Optional[str | Path] = None,
    quality: str = "balanced",
    password: Optional[str] = None,
) -> str:
    """
    Quick conversion of PDF to Markdown.

    Builds a ``PDFConverter`` with default options (apart from ``quality``)
    and returns only the Markdown text of the result.

    Args:
        pdf_path: Path to PDF file
        output_path: Optional path to save output
        quality: "fast", "balanced", or "maximum"
        password: Optional password for encrypted PDFs; forwarded to
            ``PDFConverter.convert``

    Returns:
        Markdown string
    """
    converter = PDFConverter(options=ConversionOptions(quality=quality))
    result = converter.convert(
        pdf_path,
        output_path=output_path,
        password=password,
    )
    return result.markdown