thinkpdf 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pdfbrain/core/utils.py ADDED
@@ -0,0 +1,229 @@
1
+ """Utility helpers for pdfmd.
2
+
3
+ Provides small, side-effect-free helpers used across modules:
4
+ - OS-aware path display for GUI/CLI logs.
5
+ - Simple logging and progress callbacks.
6
+ - Generic regex/text helpers.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import os
12
+ import sys
13
+ import re
14
+ from pathlib import Path
15
+ from typing import Callable, Optional
16
+
17
+
18
+ # ---------------------------------------------------------------------------
19
+ # OS / PATH HELPERS
20
+ # ---------------------------------------------------------------------------
21
+
22
+
23
def is_windows() -> bool:
    """Report whether the current interpreter is running on Windows.

    Either signal is sufficient: ``os.name`` equal to ``"nt"``, or a
    ``sys.platform`` value that starts with ``"win"`` (case-insensitive).
    """
    if os.name == "nt":
        return True
    platform_name = sys.platform.lower()
    return platform_name.startswith("win")
26
+
27
+
28
def os_display_path(p: os.PathLike | str) -> str:
    """Render a path for display using the host OS's separator.

    The path is converted with ``str()``. On Windows every forward slash
    becomes a backslash; on any other platform every backslash becomes a
    forward slash. An empty string is returned unchanged.
    """
    shown = str(p)
    if not shown:
        return shown

    # (separator to replace, separator to use instead)
    if is_windows():
        foreign, native = "/", "\\"
    else:
        foreign, native = "\\", "/"
    return shown.replace(foreign, native)
45
+
46
+
47
def safe_join(*parts: str | os.PathLike) -> str:
    """Join path segments, ignoring segments equal to None, "" or ".".

    Returns an empty string when no segment survives the filter;
    otherwise the remaining segments are joined with ``pathlib.Path``
    and returned as a string.
    """
    kept = []
    for part in parts:
        if part in (None, "", "."):
            continue
        kept.append(str(part))

    if kept:
        head, *tail = kept
        return str(Path(head).joinpath(*tail))
    return ""
53
+
54
+
55
+ # ---------------------------------------------------------------------------
56
+ # LOGGING / PROGRESS
57
+ # ---------------------------------------------------------------------------
58
+
59
+ # These are deliberately simple so they work in CLI and GUI contexts.
60
+
61
+
62
def log(message: str) -> None:
    """Write ``message`` to stderr as one ``[pdf_to_md]``-prefixed line.

    The message is converted with ``str()`` and the stream is flushed
    immediately after the write.
    """
    stream = sys.stderr
    stream.write("[pdf_to_md] " + str(message) + "\n")
    stream.flush()
67
+
68
+
69
def progress(done: int, total: int) -> None:
    """Textual progress reporter usable as a progress callback.

    Writes a ``[pdf_to_md] Progress: done/total (pct%)`` line to stderr
    terminated by a carriage return, so successive calls overwrite each
    other. Once ``done`` reaches ``total`` a newline is written as well.
    A non-positive ``total`` is reported as 0.0%.
    """
    percent = 0.0 if total <= 0 else (done / total) * 100.0

    err = sys.stderr
    err.write(f"[pdf_to_md] Progress: {done}/{total} ({percent:.1f}%)\r")
    err.flush()

    finished = done >= total
    if finished:
        err.write("\n")
        err.flush()
83
+
84
+
85
def clear_console() -> None:
    """Best-effort clear of the terminal screen.

    Runs ``cls`` on Windows and ``clear`` elsewhere through ``os.system``.
    Any exception is swallowed on purpose: this is purely cosmetic and
    must never abort the caller.
    """
    try:
        command = "cls" if is_windows() else "clear"
        os.system(command)
    except Exception:
        # Cosmetic operation only; ignore failures.
        pass
95
+
96
+
97
def print_error(message: str) -> None:
    """Write ``message`` to stderr as one ``[pdf_to_md:ERROR]`` line and flush."""
    stream = sys.stderr
    line = "[pdf_to_md:ERROR] {}\n".format(message)
    stream.write(line)
    stream.flush()
101
+
102
+
103
+ # ---------------------------------------------------------------------------
104
+ # TEXT NORMALIZATION
105
+ # ---------------------------------------------------------------------------
106
+
107
+
108
_PUNCT_MAP = {
    # Quotes
    "\u2018": "'",  # left single quotation mark
    "\u2019": "'",  # right single quotation mark
    "\u201c": '"',  # left double quotation mark
    "\u201d": '"',  # right double quotation mark
    # Dashes
    "\u2013": "-",  # en dash
    "\u2014": "-",  # em dash
    # Ellipsis
    "\u2026": "...",  # horizontal ellipsis
}

# Translation table built once at import time so normalization is a single
# C-level pass over the text instead of a Python-level loop per character.
_PUNCT_TABLE = str.maketrans(_PUNCT_MAP)


def normalize_punctuation(text: str) -> str:
    """Normalize common Unicode punctuation to simpler ASCII forms.

    Curly quotes become straight quotes, en/em dashes become ``-`` and the
    single-character ellipsis becomes ``...`` (see ``_PUNCT_MAP``). All
    other characters are left untouched. This keeps the output predictable
    in Markdown and text editors.

    Args:
        text: Text to normalize. Falsy values (``""``, ``None``) are
            returned unchanged.

    Returns:
        The text with every character listed in ``_PUNCT_MAP`` replaced.
    """
    if not text:
        return text
    return text.translate(_PUNCT_TABLE)
133
+
134
+
135
_URL_RE = re.compile(
    r"(?P<url>(https?://[^\s<>]+|www\.[^\s<>]+))",
    re.IGNORECASE,
)

# Characters that normally belong to the surrounding sentence rather than
# to the URL when they appear at the very end of a match.
_URL_TRAILING_PUNCT = ".,;:!?\"')]}"


def _split_trailing_punct(url: str) -> tuple:
    """Split ``url`` into (core, tail), where tail is trailing punctuation.

    A closing parenthesis is kept as part of the URL when it balances an
    opening parenthesis inside the URL, e.g. ``.../Foo_(bar)``.
    """
    end = len(url)
    while end > 0 and url[end - 1] in _URL_TRAILING_PUNCT:
        if url[end - 1] == ")" and url.count("(", 0, end) >= url.count(")", 0, end):
            break
        end -= 1
    return url[:end], url[end:]


def linkify_urls(text: str) -> str:
    """Wrap bare URLs in Markdown autolink form.

    Example:
        'See https://example.com' -> 'See <https://example.com>'

    Behaviour:
      - URLs already written as ``<url>`` are left alone. The pattern never
        captures the angle brackets themselves, so the surrounding text is
        inspected instead.
      - URLs that are the target of a Markdown link (``[label](url)``) are
        left alone so the link is not corrupted.
      - Trailing sentence punctuation (``.``, ``,``, ``)`` ...) stays outside
        the generated link.
      - ``www.`` URLs get an ``https://`` scheme prepended.

    Args:
        text: Text to scan. Falsy values are returned unchanged.

    Returns:
        The text with bare URLs wrapped in ``<...>``.
    """
    if not text:
        return text

    def _repl(match: re.Match) -> str:
        url = match.group("url")
        source = match.string
        start, end = match.span("url")

        # Already an autolink: <url>
        if source[max(0, start - 1):start] == "<" and source[end:end + 1] == ">":
            return url
        # Target of an inline Markdown link: [label](url)
        if source[max(0, start - 2):start] == "](":
            return url

        core, tail = _split_trailing_punct(url)
        # If stripping punctuation left nothing URL-like (e.g. "www.."),
        # leave the original text untouched.
        if not _URL_RE.fullmatch(core):
            return url

        # If it's a www. URL, add scheme for safety
        if core.lower().startswith("www."):
            core = f"https://{core}"
        return f"<{core}>{tail}"

    return _URL_RE.sub(_repl, text)
159
+
160
+
161
+ # ---------------------------------------------------------------------------
162
+ # MARKDOWN ESCAPING
163
+ # ---------------------------------------------------------------------------
164
+
165
+
166
def escape_markdown(text: str) -> str:
    """Backslash-escape the small set of characters that break Markdown.

    Escaped characters: backslash, backtick, asterisk, underscore, curly
    braces, square brackets, angle brackets and the pipe.

    Deliberately NOT escaped: periods, parentheses, hyphens, '#' and '!'.
    Escaping those produced noisy output such as '\\.' and '\\(' throughout
    the text.

    This runs on raw PDF span text BEFORE the renderer adds **bold** or
    *italic* markers, so those markers are never escaped here.

    Falsy input is returned unchanged.
    """
    if not text:
        return text

    risky = frozenset("\\`*_{}[]<>|")
    return "".join("\\" + ch if ch in risky else ch for ch in text)
196
+
197
+
198
+ # ---------------------------------------------------------------------------
199
+ # MISC
200
+ # ---------------------------------------------------------------------------
201
+
202
+
203
def truncate(text: str, max_len: int = 120) -> str:
    """Truncate a string for logging/debug, keeping the beginning.

    The result is never longer than ``max_len`` characters. When text has
    to be cut and there is room for it, the last three characters of the
    result are ``...``.

    Example:
        truncate("abcdef", 4) -> "a..."

    Args:
        text: Value to shorten; converted with ``str()``.
        max_len: Maximum length of the result. Values of 3 or less leave
            no room for the ellipsis, so the text is simply cut. Zero or
            negative values yield an empty string.

    Returns:
        The (possibly shortened) string.
    """
    s = str(text)
    if max_len <= 0:
        # A negative bound would otherwise slice from the end (s[:-1]) and
        # return more than max_len characters.
        return ""
    if len(s) <= max_len:
        return s
    if max_len <= 3:
        return s[:max_len]
    return s[: max_len - 3] + "..."
215
+
216
+
217
# Public API of this module, grouped by the section each helper lives in.
__all__ = [
    # OS / path helpers
    "is_windows",
    "os_display_path",
    "safe_join",
    # Logging / progress
    "log",
    "progress",
    "clear_console",
    "print_error",
    # Text normalization
    "normalize_punctuation",
    "linkify_urls",
    # Markdown escaping
    "escape_markdown",
    # Misc
    "truncate",
]
pdfbrain/engine.py ADDED
@@ -0,0 +1,392 @@
1
+ """
2
+ thinkpdf Engine v2 - Powered by IBM Docling for maximum quality.
3
+
4
+ Features:
5
+ - 97%+ accuracy on tables (TableFormer AI)
6
+ - Advanced layout analysis (DocLayNet)
7
+ - LaTeX equation support
8
+ - Multi-format: PDF, DOCX, PPTX, HTML
9
+ - Smart caching to avoid reprocessing
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import hashlib
15
+ import json
16
+ import os
17
+ from pathlib import Path
18
+ from typing import Optional, Union, Callable
19
+ from datetime import datetime
20
+
21
+ # Try to import Docling
22
+ try:
23
+ from docling.document_converter import DocumentConverter
24
+ from docling.datamodel.base_models import InputFormat
25
+ from docling.datamodel.pipeline_options import PdfPipelineOptions
26
+ from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
27
+ HAS_DOCLING = True
28
+ except ImportError:
29
+ HAS_DOCLING = False
30
+ DocumentConverter = None
31
+
32
+ # Fallback to pdfmd pipeline
33
+ try:
34
+ from .core.pipeline import pdf_to_markdown as pdfmd_convert
35
+ from .core.models import Options as PdfmdOptions
36
+ HAS_PDFMD = True
37
+ except ImportError:
38
+ HAS_PDFMD = False
39
+
40
+ # PyMuPDF for analysis
41
+ try:
42
+ import fitz # PyMuPDF
43
+ HAS_PYMUPDF = True
44
+ except ImportError:
45
+ HAS_PYMUPDF = False
46
+
47
+
48
def analyze_pdf_complexity(file_path: Path) -> dict:
    """
    Analyze a PDF to determine the optimal conversion engine.

    Only the first pages (at most 5) are sampled, for speed.

    Args:
        file_path: Path of the PDF to inspect.

    Returns:
        On success, a dict with:
        - page_count: number of pages
        - has_images: True if the sampled pages contain images
        - is_scanned: True if the PDF appears to be scanned
          (little text, at least one image)
        - has_tables: True if the sampled pages likely contain tables
        - avg_chars_per_page: average text length of the sampled pages
        - recommended_engine: 'docling' or 'pdfmd'

        If PyMuPDF is not installed, only ``recommended_engine`` ('docling')
        is returned. If the analysis fails, ``recommended_engine`` is
        'docling' and ``error`` holds the exception message. This function
        never raises.
    """
    if not HAS_PYMUPDF:
        return {"recommended_engine": "docling"}  # Default to best quality

    try:
        doc = fitz.open(str(file_path))
        try:
            page_count = len(doc)

            total_text_chars = 0
            total_images = 0
            table_indicators = 0

            # Analyze first few pages (max 5 for speed)
            sample_pages = min(5, page_count)

            for i in range(sample_pages):
                page = doc[i]

                # Count text
                total_text_chars += len(page.get_text())

                # Count images
                total_images += len(page.get_images())

                # Table indicator 1: many drawings containing line ("l") or
                # rectangle ("re") items, which suggests ruled grids.
                line_drawings = sum(
                    1
                    for drawing in page.get_drawings()
                    if any(
                        item[0] in ("l", "re")
                        for item in (drawing.get("items") or [])
                    )
                )
                if line_drawings > 10:
                    table_indicators += 1

                # Table indicator 2: many separate blocks on one page might
                # indicate columns/tables.
                blocks = page.get_text("dict")["blocks"]
                if len(blocks) > 5:
                    table_indicators += 1
        finally:
            # Release the document even when a page fails to parse.
            doc.close()

        # Determine characteristics
        avg_chars_per_page = total_text_chars / sample_pages if sample_pages > 0 else 0
        avg_images_per_page = total_images / sample_pages if sample_pages > 0 else 0

        is_scanned = avg_chars_per_page < 100 and avg_images_per_page > 0
        has_tables = table_indicators >= 2
        has_images = total_images > 0

        # Decision logic
        if is_scanned or has_tables:
            # Scanned PDFs need OCR and tables need TableFormer: use Docling.
            recommended_engine = "docling"
        elif page_count > 50 or page_count <= 10:
            # Large or small simple documents: use pdfmd for speed.
            recommended_engine = "pdfmd"
        else:
            # Mid-sized documents default to Docling for quality.
            recommended_engine = "docling"

        return {
            "page_count": page_count,
            "has_images": has_images,
            "is_scanned": is_scanned,
            "has_tables": has_tables,
            "avg_chars_per_page": int(avg_chars_per_page),
            "recommended_engine": recommended_engine,
        }

    except Exception as e:
        # Analysis is advisory only; fall back to the highest-quality engine.
        return {"recommended_engine": "docling", "error": str(e)}
137
+
138
+
139
class thinkpdfEngine:
    """
    Unified PDF to Markdown conversion engine.

    Uses:
    - Docling (IBM) for maximum quality when available
    - pdfmd as fallback for simpler documents

    Results are cached on disk. Cache files are named after a hash of the
    source file's content, and an entry is reused only while the source
    file's modification time matches the one recorded at conversion time.
    """

    # Trailer appended to generated Markdown (an HTML comment, so it does
    # not show up in rendered output).
    _WATERMARK = "<!-- Generated by thinkpdf engine -->"

    def __init__(
        self,
        cache_dir: Optional[Union[str, Path]] = None,
        use_cache: bool = True,
        engine: str = "auto",  # "auto", "docling", "pdfmd"
    ):
        """
        Create an engine.

        Args:
            cache_dir: Directory for cached conversions. Defaults to
                ``~/.thinkpdf/cache``. The directory is created if missing.
            use_cache: Whether to read from and write to the cache.
            engine: "auto", "docling" or "pdfmd".
        """
        # Accept plain strings as well as Path objects.
        self.cache_dir = Path(cache_dir) if cache_dir else Path.home() / ".thinkpdf" / "cache"
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.use_cache = use_cache
        self.engine = engine
        self.docling_pipeline = None
        self.docling_model = None
        self._docling_converter = None  # Set by _init_docling()

        # Branding / Credit (Protection)
        print("🧠 thinkpdf 1.0 - Hybrid PDF Engine | https://github.com/thinkpdf/thinkpdf")
        # Initialize Docling converter if available
        if HAS_DOCLING and engine in ("auto", "docling"):
            self._init_docling()

    def _init_docling(self):
        """Initialize the Docling converter with OCR and table structure enabled.

        On any failure a warning is printed and the converter is left as
        ``None`` so that ``convert`` can fall back to pdfmd.
        """
        try:
            pipeline_options = PdfPipelineOptions()
            pipeline_options.do_ocr = True
            pipeline_options.do_table_structure = True

            # Hand the options to the converter; previously they were built
            # and then discarded.
            format_options = {}
            try:
                from docling.document_converter import PdfFormatOption

                format_options[InputFormat.PDF] = PdfFormatOption(
                    pipeline_options=pipeline_options
                )
            except ImportError:
                # NOTE(review): presumably only very old Docling releases lack
                # PdfFormatOption; fall back to converter defaults there.
                pass

            self._docling_converter = DocumentConverter(
                allowed_formats=[
                    InputFormat.PDF,
                    InputFormat.DOCX,
                    InputFormat.PPTX,
                    InputFormat.HTML,
                    InputFormat.IMAGE,
                ],
                format_options=format_options or None,
            )
        except Exception as e:
            print(f"Warning: Could not initialize Docling: {e}")
            self._docling_converter = None

    def get_file_hash(self, file_path: Path) -> str:
        """Return the first 16 hex digits of the file's SHA256 (cache key)."""
        hasher = hashlib.sha256()
        with open(file_path, "rb") as f:
            # Read in 64 KiB chunks so large files are not loaded at once.
            for chunk in iter(lambda: f.read(65536), b""):
                hasher.update(chunk)
        return hasher.hexdigest()[:16]

    def get_cached(self, file_path: Path) -> Optional[str]:
        """Return the cached Markdown for ``file_path``, or None on a miss.

        A cache entry is valid only if both the Markdown and metadata files
        exist and the recorded ``source_mtime`` equals the file's current
        modification time. Unreadable or corrupt entries count as a miss.
        """
        if not self.use_cache:
            return None

        file_hash = self.get_file_hash(file_path)
        cache_file = self.cache_dir / f"{file_hash}.md"
        meta_file = self.cache_dir / f"{file_hash}.json"

        if not (cache_file.exists() and meta_file.exists()):
            return None

        try:
            with open(meta_file, "r", encoding="utf-8") as f:
                meta = json.load(f)

            # Check if source file was modified
            if meta.get("source_mtime") == file_path.stat().st_mtime:
                return cache_file.read_text(encoding="utf-8")
        except (OSError, ValueError, AttributeError):
            # I/O error, invalid JSON/encoding, or metadata that is not a
            # dict: ignore the entry and reconvert.
            pass

        return None

    def cache_result(self, file_path: Path, markdown: str):
        """Store ``markdown`` and its metadata in the cache for ``file_path``."""
        if not self.use_cache:
            return

        file_hash = self.get_file_hash(file_path)
        cache_file = self.cache_dir / f"{file_hash}.md"
        meta_file = self.cache_dir / f"{file_hash}.json"

        cache_file.write_text(markdown, encoding="utf-8")

        meta = {
            "source_file": str(file_path),
            "source_mtime": file_path.stat().st_mtime,
            "converted_at": datetime.now().isoformat(),
            "engine": self.engine,
        }
        with open(meta_file, "w", encoding="utf-8") as f:
            json.dump(meta, f)

    @staticmethod
    def _write_output(output_path: Union[str, Path], markdown: str) -> None:
        """Write ``markdown`` to ``output_path``, creating parent directories."""
        out_path = Path(output_path)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(markdown, encoding="utf-8")

    def convert(
        self,
        input_path: Union[str, Path],
        output_path: Optional[Union[str, Path]] = None,
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> str:
        """
        Convert a document to Markdown.

        Args:
            input_path: Path to PDF, DOCX, PPTX, or HTML file
            output_path: Optional path to save markdown; parent directories
                are created as needed
            progress_callback: Optional callback for progress updates

        Returns:
            Markdown content as string

        Raises:
            FileNotFoundError: If ``input_path`` does not exist.
            RuntimeError: If no conversion engine can handle the file.
        """
        input_path = Path(input_path)

        if not input_path.exists():
            raise FileNotFoundError(f"File not found: {input_path}")

        # Check cache first
        cached = self.get_cached(input_path)
        if cached:
            if output_path:
                # Same write path as a fresh conversion, so missing parent
                # directories are created here too.
                self._write_output(output_path, cached)
            return cached

        # Convert using best available engine
        use_engine = self.engine
        is_pdf = input_path.suffix.lower() == ".pdf"

        # Auto-detect best engine for PDFs
        if use_engine == "auto" and is_pdf:
            analysis = analyze_pdf_complexity(input_path)
            use_engine = analysis.get("recommended_engine", "docling")

        if use_engine == "docling" and self._docling_converter:
            markdown = self._convert_with_docling(input_path, progress_callback)
        elif use_engine == "pdfmd" and HAS_PDFMD and is_pdf:
            markdown = self._convert_with_pdfmd(input_path, progress_callback)
        elif self._docling_converter:
            # Fallback to Docling (e.g. for non-PDF formats)
            markdown = self._convert_with_docling(input_path, progress_callback)
        elif HAS_PDFMD and is_pdf:
            markdown = self._convert_with_pdfmd(input_path, progress_callback)
        else:
            raise RuntimeError("No conversion engine available. Install docling or pdfmd.")

        # Validate output (warning only; the empty result is still returned)
        if not markdown.strip():
            print("Warning: Conversion produced empty output")

        # Append the provenance trailer once. The check compares against the
        # trailer itself so that already-stamped text is not stamped again.
        if markdown and not markdown.endswith(self._WATERMARK):
            markdown += "\n\n" + self._WATERMARK

        # Cache result
        self.cache_result(input_path, markdown)

        # Save to file if requested
        if output_path:
            self._write_output(output_path, markdown)

        return markdown

    def _convert_with_docling(
        self,
        input_path: Path,
        progress_callback: Optional[Callable] = None,
    ) -> str:
        """Convert using IBM Docling (highest quality).

        Progress is reported coarsely: 0 before conversion, 50 after it and
        100 after the Markdown export.
        """
        if progress_callback:
            progress_callback(0, 100)

        result = self._docling_converter.convert(str(input_path))

        if progress_callback:
            progress_callback(50, 100)

        # Export to markdown
        markdown = result.document.export_to_markdown()

        if progress_callback:
            progress_callback(100, 100)

        return markdown

    def _convert_with_pdfmd(
        self,
        input_path: Path,
        progress_callback: Optional[Callable] = None,
    ) -> str:
        """Convert using pdfmd (fallback).

        pdfmd writes to a file, so a temporary ``.md`` file is used and
        removed afterwards, even if the conversion fails.
        """
        import tempfile

        # Only the name is needed; close the handle right away so pdfmd can
        # open the path itself.
        with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as tmp:
            tmp_path = tmp.name

        try:
            options = PdfmdOptions()
            pdfmd_convert(
                input_pdf=str(input_path),
                output_md=tmp_path,
                options=options,
                progress_cb=progress_callback,
            )

            return Path(tmp_path).read_text(encoding="utf-8")
        finally:
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)

    def get_document_info(self, input_path: Union[str, Path]) -> dict:
        """Return basic file facts: name, absolute path, size, extension,
        content hash and which conversion backends are importable."""
        input_path = Path(input_path)
        size_bytes = input_path.stat().st_size  # stat once, reuse below

        return {
            "filename": input_path.name,
            "path": str(input_path.absolute()),
            "size_bytes": size_bytes,
            "size_mb": round(size_bytes / (1024 * 1024), 2),
            "extension": input_path.suffix.lower(),
            "file_hash": self.get_file_hash(input_path),
            "has_docling": HAS_DOCLING,
            "has_pdfmd": HAS_PDFMD,
        }
373
+
374
+
375
# Module-wide engine instance, created lazily on first use.
_engine: Optional[thinkpdfEngine] = None


def get_engine() -> thinkpdfEngine:
    """Return the shared engine, constructing it with defaults on first call."""
    global _engine
    if _engine is not None:
        return _engine
    _engine = thinkpdfEngine()
    return _engine
385
+
386
+
387
def convert(
    input_path: Union[str, Path],
    output_path: Optional[Union[str, Path]] = None,
) -> str:
    """Convert ``input_path`` to Markdown using the shared engine.

    Convenience wrapper: forwards both arguments to the ``convert`` method
    of the instance returned by ``get_engine()`` and returns its result.
    """
    shared_engine = get_engine()
    return shared_engine.convert(input_path, output_path)