thinkpdf 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdfbrain/__init__.py +22 -0
- pdfbrain/app_gui.py +530 -0
- pdfbrain/cache/__init__.py +5 -0
- pdfbrain/cache/cache_manager.py +252 -0
- pdfbrain/cli.py +255 -0
- pdfbrain/core/__init__.py +6 -0
- pdfbrain/core/converter.py +332 -0
- pdfbrain/core/equations.py +635 -0
- pdfbrain/core/extract.py +469 -0
- pdfbrain/core/extractor.py +272 -0
- pdfbrain/core/models.py +196 -0
- pdfbrain/core/pipeline.py +287 -0
- pdfbrain/core/render.py +574 -0
- pdfbrain/core/tables.py +871 -0
- pdfbrain/core/transform.py +604 -0
- pdfbrain/core/utils.py +229 -0
- pdfbrain/engine.py +392 -0
- pdfbrain/mcp_server.py +315 -0
- pdfbrain/utils/__init__.py +1 -0
- thinkpdf-1.0.1.dist-info/METADATA +138 -0
- thinkpdf-1.0.1.dist-info/RECORD +25 -0
- thinkpdf-1.0.1.dist-info/WHEEL +5 -0
- thinkpdf-1.0.1.dist-info/entry_points.txt +4 -0
- thinkpdf-1.0.1.dist-info/licenses/LICENSE +620 -0
- thinkpdf-1.0.1.dist-info/top_level.txt +1 -0
pdfbrain/core/utils.py
ADDED
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
"""Utility helpers for pdfmd.
|
|
2
|
+
|
|
3
|
+
Provides small, side-effect-free helpers used across modules:
|
|
4
|
+
- OS-aware path display for GUI/CLI logs.
|
|
5
|
+
- Simple logging and progress callbacks.
|
|
6
|
+
- Generic regex/text helpers.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import os
|
|
12
|
+
import sys
|
|
13
|
+
import re
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Callable, Optional
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# ---------------------------------------------------------------------------
|
|
19
|
+
# OS / PATH HELPERS
|
|
20
|
+
# ---------------------------------------------------------------------------
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def is_windows() -> bool:
    """Report whether the interpreter is running on a Windows host.

    Either signal is sufficient: ``os.name`` being ``"nt"`` or
    ``sys.platform`` starting with ``"win"`` (compared case-insensitively).
    """
    if os.name == "nt":
        return True
    platform_name = sys.platform.lower()
    return platform_name.startswith("win")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def os_display_path(p: os.PathLike | str) -> str:
    """Render a path for display using the host OS's separator.

    The argument is converted with ``str()``. On Windows every forward
    slash becomes a backslash; elsewhere every backslash becomes a forward
    slash. An empty string is returned unchanged.
    """
    shown = str(p)
    if shown:
        # Pick which separator to replace and what to replace it with.
        foreign, native = ("/", "\\") if is_windows() else ("\\", "/")
        shown = shown.replace(foreign, native)
    return shown
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def safe_join(*parts: str | os.PathLike) -> str:
|
|
48
|
+
"""Join path parts, skipping empty segments."""
|
|
49
|
+
cleaned = [str(p) for p in parts if p not in (None, "", ".")]
|
|
50
|
+
if not cleaned:
|
|
51
|
+
return ""
|
|
52
|
+
return str(Path(cleaned[0]).joinpath(*cleaned[1:]))
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# ---------------------------------------------------------------------------
|
|
56
|
+
# LOGGING / PROGRESS
|
|
57
|
+
# ---------------------------------------------------------------------------
|
|
58
|
+
|
|
59
|
+
# These are deliberately simple so they work in CLI and GUI contexts.
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def log(message: str) -> None:
    """Write *message* to stderr as one ``[pdf_to_md]``-prefixed line.

    The message is passed through ``str()`` first, and the stream is
    flushed immediately after the write.
    """
    stream = sys.stderr
    stream.write("[pdf_to_md] " + str(message) + "\n")
    stream.flush()
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def progress(done: int, total: int) -> None:
    """Textual progress callback that rewrites a single stderr line.

    Suitable for passing into long-running operations. The percentage is
    reported as 0.0 when *total* is not positive. Each update ends with a
    carriage return so the next one overwrites it; once *done* reaches
    *total*, a newline is emitted to finish the line.
    """
    percent = (done / total) * 100.0 if total > 0 else 0.0

    err = sys.stderr
    err.write(f"[pdf_to_md] Progress: {done}/{total} ({percent:.1f}%)\r")
    err.flush()

    if done >= total:
        err.write("\n")
        err.flush()
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def clear_console() -> None:
    """Best-effort terminal clear: runs ``cls`` on Windows, ``clear`` elsewhere."""
    try:
        os.system("cls" if is_windows() else "clear")
    except Exception:
        # Purely cosmetic; a failure here must never take the program down.
        pass
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def print_error(message: str) -> None:
    """Write *message* to stderr as one ``[pdf_to_md:ERROR]``-prefixed line and flush."""
    stream = sys.stderr
    stream.write("[pdf_to_md:ERROR] {}\n".format(message))
    stream.flush()
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
# ---------------------------------------------------------------------------
|
|
104
|
+
# TEXT NORMALIZATION
|
|
105
|
+
# ---------------------------------------------------------------------------
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
_PUNCT_MAP = {
    # Quotes
    "\u2018": "'",  # ‘
    "\u2019": "'",  # ’
    "\u201c": '"',  # “
    "\u201d": '"',  # ”
    # Dashes
    "\u2013": "-",  # –
    "\u2014": "-",  # —
    # Ellipsis
    "\u2026": "...",  # …
}

# Built once at import time so each call is a single C-level pass over the
# text instead of a Python loop with one dict lookup per character.
_PUNCT_TABLE = str.maketrans(_PUNCT_MAP)


def normalize_punctuation(text: str) -> str:
    """Normalize common Unicode punctuation to simpler ASCII forms.

    Curly single/double quotes become straight quotes, en/em dashes become
    a hyphen, and the single-character ellipsis becomes three periods. All
    other characters pass through untouched. This keeps the output
    predictable in Markdown and text editors.

    Args:
        text: The string to normalize. Falsy values (``""``, ``None``) are
            returned unchanged.

    Returns:
        The normalized string.
    """
    if not text:
        return text
    return text.translate(_PUNCT_TABLE)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
_URL_RE = re.compile(
    r"(?P<url>(https?://[^\s<>]+|www\.[^\s<>]+))",
    re.IGNORECASE,
)

# Characters that usually end the surrounding sentence rather than the URL.
_URL_TRAILING_PUNCT = ".,;:!?\"'"


def linkify_urls(text: str) -> str:
    """Wrap bare URLs in Markdown autolink form.

    Example:
        'See https://example.com' → 'See <https://example.com>'

    Behaviour:
    - ``http://``/``https://`` URLs are wrapped as-is; ``www.`` URLs get an
      ``https://`` scheme prepended.
    - A URL that is already enclosed in ``<...>`` is left untouched.
    - Trailing sentence punctuation and an unbalanced closing parenthesis
      are kept outside the link.

    Args:
        text: Input text. Falsy values are returned unchanged.

    Returns:
        The text with bare URLs wrapped in angle brackets.
    """
    if not text:
        return text

    def _repl(match: re.Match) -> str:
        url = match.group("url")
        start, end = match.span("url")
        source = match.string

        # The pattern can never consume '<' or '>', so an existing wrapper
        # has to be detected from the characters around the match.
        if start > 0 and source[start - 1] == "<" and source[end:end + 1] == ">":
            return url

        # Peel off punctuation that belongs to the sentence, not the URL.
        core = url.rstrip(_URL_TRAILING_PUNCT)
        while core.endswith(")") and core.count(")") > core.count("("):
            core = core[:-1].rstrip(_URL_TRAILING_PUNCT)

        # If trimming left nothing URL-like, do not touch the text at all.
        if not _URL_RE.fullmatch(core):
            return url
        tail = url[len(core):]

        # If it's a www. URL, add a scheme so the link is absolute.
        if core.lower().startswith("www."):
            return f"<https://{core}>{tail}"
        return f"<{core}>{tail}"

    return _URL_RE.sub(_repl, text)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
# ---------------------------------------------------------------------------
|
|
162
|
+
# MARKDOWN ESCAPING
|
|
163
|
+
# ---------------------------------------------------------------------------
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
# Characters that get a backslash prefix:
#   - backslash itself
#   - backtick (inline code)
#   - asterisk and underscore (emphasis)
#   - curly braces, square brackets, angle brackets, and pipe (tables/links)
# Intentionally NOT included:  .  ( )  -  #  !
# Built once at import time; str.translate then does the escaping in one pass.
_MD_ESCAPE_TABLE = str.maketrans({ch: "\\" + ch for ch in "\\`*_{}[]<>|"})


def escape_markdown(text: str) -> str:
    """Escape only the minimal set of characters that break Markdown.

    IMPORTANT:
    - Periods, parentheses, hyphens, '#' and '!' are NOT escaped.
    - This avoids the old behaviour that put a backslash before every
      period and parenthesis.
    - Only characters that are truly dangerous inside plain text are
      escaped: backslash, backtick, '*', '_', '{', '}', '[', ']', '<',
      '>' and '|'.

    This function is called on raw PDF span text BEFORE **bold** or
    *italic* markers are added in the renderer.

    Args:
        text: Raw text. Falsy values are returned unchanged.

    Returns:
        The text with each special character preceded by a backslash.
    """
    if not text:
        return text
    return text.translate(_MD_ESCAPE_TABLE)
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
# ---------------------------------------------------------------------------
|
|
199
|
+
# MISC
|
|
200
|
+
# ---------------------------------------------------------------------------
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def truncate(text: str, max_len: int = 120) -> str:
    """Truncate a string for logging/debug, keeping its beginning.

    When the text is longer than *max_len*, the result is the leading part
    of the text followed by ``"..."``, with a total length of exactly
    *max_len*. If *max_len* is 3 or less there is no room for the ellipsis,
    so the text is simply cut. A zero or negative *max_len* yields ``""``.

    Example:
        truncate("abcdef", 4) → "a..."

    Args:
        text: Value to shorten; it is converted with ``str()`` first.
        max_len: Maximum length of the returned string.

    Returns:
        A string no longer than ``max(max_len, 0)`` characters.
    """
    s = str(text)
    if max_len <= 0:
        # A negative slice bound would count from the end and return too much.
        return ""
    if len(s) <= max_len:
        return s
    if max_len <= 3:
        return s[:max_len]
    return s[: max_len - 3] + "..."
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
# Public API of this module: the names exported by ``from ... import *``.
__all__ = [
    "os_display_path",
    "safe_join",
    "log",
    "progress",
    "normalize_punctuation",
    "linkify_urls",
    "escape_markdown",
    "truncate",
    "is_windows",
    "clear_console",
    "print_error",
]
|
pdfbrain/engine.py
ADDED
|
@@ -0,0 +1,392 @@
|
|
|
1
|
+
"""
|
|
2
|
+
thinkpdf Engine v2 - Powered by IBM Docling for maximum quality.
|
|
3
|
+
|
|
4
|
+
Features:
|
|
5
|
+
- 97%+ accuracy on tables (TableFormer AI)
|
|
6
|
+
- Advanced layout analysis (DocLayNet)
|
|
7
|
+
- LaTeX equation support
|
|
8
|
+
- Multi-format: PDF, DOCX, PPTX, HTML
|
|
9
|
+
- Smart caching to avoid reprocessing
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import hashlib
|
|
15
|
+
import json
|
|
16
|
+
import os
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Optional, Union, Callable
|
|
19
|
+
from datetime import datetime
|
|
20
|
+
|
|
21
|
+
# Try to import Docling
|
|
22
|
+
try:
|
|
23
|
+
from docling.document_converter import DocumentConverter
|
|
24
|
+
from docling.datamodel.base_models import InputFormat
|
|
25
|
+
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
|
26
|
+
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
|
27
|
+
HAS_DOCLING = True
|
|
28
|
+
except ImportError:
|
|
29
|
+
HAS_DOCLING = False
|
|
30
|
+
DocumentConverter = None
|
|
31
|
+
|
|
32
|
+
# Fallback to pdfmd pipeline
|
|
33
|
+
try:
|
|
34
|
+
from .core.pipeline import pdf_to_markdown as pdfmd_convert
|
|
35
|
+
from .core.models import Options as PdfmdOptions
|
|
36
|
+
HAS_PDFMD = True
|
|
37
|
+
except ImportError:
|
|
38
|
+
HAS_PDFMD = False
|
|
39
|
+
|
|
40
|
+
# PyMuPDF for analysis
|
|
41
|
+
try:
|
|
42
|
+
import fitz # PyMuPDF
|
|
43
|
+
HAS_PYMUPDF = True
|
|
44
|
+
except ImportError:
|
|
45
|
+
HAS_PYMUPDF = False
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def analyze_pdf_complexity(file_path: Path) -> dict:
    """
    Analyze a PDF to determine the optimal conversion engine.

    Only the first pages (at most 5) are sampled, for speed.

    Args:
        file_path: Path of the PDF to inspect.

    Returns:
        On success, a dict with:
        - page_count: number of pages
        - has_images: True if the sampled pages contain images
        - is_scanned: True if the PDF appears to be scanned
          (fewer than 100 characters per sampled page, and images present)
        - has_tables: True if at least two table indicators were found
        - avg_chars_per_page: average text length over the sampled pages
        - recommended_engine: 'docling' or 'pdfmd'

        If PyMuPDF is not installed, only ``recommended_engine`` ('docling')
        is returned. If analysis fails, the dict holds ``recommended_engine``
        ('docling') and ``error`` with the exception message.
    """
    if not HAS_PYMUPDF:
        return {"recommended_engine": "docling"}  # Default to best quality

    try:
        doc = fitz.open(str(file_path))
        try:
            page_count = len(doc)

            total_text_chars = 0
            total_images = 0
            table_indicators = 0

            # Analyze first few pages (max 5 for speed)
            sample_pages = min(5, page_count)

            for i in range(sample_pages):
                page = doc[i]

                total_text_chars += len(page.get_text())
                total_images += len(page.get_images())

                # Table indicator 1: many vector drawings containing line
                # ("l") or rectangle ("re") items, which may be a grid.
                line_drawings = sum(
                    1
                    for drawing in page.get_drawings()
                    if any(
                        item[0] in ("l", "re")
                        for item in (drawing.get("items") or ())
                    )
                )
                if line_drawings > 10:
                    table_indicators += 1

                # Table indicator 2: many separate blocks on one page
                # might indicate columns/tables.
                blocks = page.get_text("dict")["blocks"]
                if len(blocks) > 5:
                    table_indicators += 1
        finally:
            # Release the document even when a page-level call raises.
            doc.close()

        # Determine characteristics
        avg_chars_per_page = total_text_chars / sample_pages if sample_pages > 0 else 0
        avg_images_per_page = total_images / sample_pages if sample_pages > 0 else 0

        is_scanned = avg_chars_per_page < 100 and avg_images_per_page > 0
        has_tables = table_indicators >= 2
        has_images = total_images > 0

        # Decision logic
        if is_scanned or has_tables:
            # Scanned PDFs need OCR and tables need TableFormer - use Docling
            recommended_engine = "docling"
        elif page_count > 50 or page_count <= 10:
            # Large or small simple documents - use pdfmd for speed
            recommended_engine = "pdfmd"
        else:
            # Default to Docling for quality
            recommended_engine = "docling"

        return {
            "page_count": page_count,
            "has_images": has_images,
            "is_scanned": is_scanned,
            "has_tables": has_tables,
            "avg_chars_per_page": int(avg_chars_per_page),
            "recommended_engine": recommended_engine,
        }

    except Exception as e:
        # Analysis is advisory only; fall back to the highest-quality engine.
        return {"recommended_engine": "docling", "error": str(e)}
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
class thinkpdfEngine:
    """
    Unified document-to-Markdown conversion engine.

    Uses:
    - Docling (IBM) for maximum quality when available
    - pdfmd as fallback for simpler PDF documents

    Results are cached on disk, keyed by a hash of the input file's content.
    """

    # Trailer appended to every non-empty conversion result. It is an HTML
    # comment, so it is invisible when rendered but present in the raw file.
    _WATERMARK = "<!-- Generated by thinkpdf engine -->"

    def __init__(
        self,
        cache_dir: Optional[Path] = None,
        use_cache: bool = True,
        engine: str = "auto",  # "auto", "docling", "pdfmd"
    ):
        """Create an engine.

        Args:
            cache_dir: Directory for cached conversions; defaults to
                ``~/.thinkpdf/cache``. The directory is created if missing.
            use_cache: Whether to read and write the on-disk cache.
            engine: ``"auto"``, ``"docling"`` or ``"pdfmd"``.
        """
        # Path(...) so that a plain string is accepted as well as a Path.
        self.cache_dir = Path(cache_dir) if cache_dir else Path.home() / ".thinkpdf" / "cache"
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.use_cache = use_cache
        self.engine = engine
        self.docling_pipeline = None
        self.docling_model = None
        self._docling_converter = None  # Set by _init_docling() when Docling loads

        # Branding / credit banner
        print("🧠 thinkpdf 1.0 - Hybrid PDF Engine | https://github.com/thinkpdf/thinkpdf")
        # Initialize Docling converter if available
        if HAS_DOCLING and engine in ("auto", "docling"):
            self._init_docling()

    def _init_docling(self):
        """Initialize the Docling converter with OCR and table structure enabled.

        Any failure is reported as a warning and leaves the converter unset,
        so ``convert`` can fall back to another engine.
        """
        try:
            # Imported here because it is needed only to attach the pipeline
            # options to the PDF format.
            from docling.document_converter import PdfFormatOption

            pipeline_options = PdfPipelineOptions()
            pipeline_options.do_ocr = True
            pipeline_options.do_table_structure = True

            self._docling_converter = DocumentConverter(
                allowed_formats=[
                    InputFormat.PDF,
                    InputFormat.DOCX,
                    InputFormat.PPTX,
                    InputFormat.HTML,
                    InputFormat.IMAGE,
                ],
                # Without this the options built above would never be used.
                format_options={
                    InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
                },
            )
        except Exception as e:
            print(f"Warning: Could not initialize Docling: {e}")
            self._docling_converter = None

    def get_file_hash(self, file_path: Path) -> str:
        """Return the first 16 hex characters of the file's SHA-256 digest.

        The file is read in 64 KiB chunks so large inputs are not loaded
        into memory at once.
        """
        hasher = hashlib.sha256()
        with open(file_path, "rb") as f:
            for chunk in iter(lambda: f.read(65536), b""):
                hasher.update(chunk)
        return hasher.hexdigest()[:16]

    def _cache_paths(self, file_path: Path):
        """Return the ``(markdown, metadata)`` cache file paths for *file_path*."""
        file_hash = self.get_file_hash(file_path)
        return (
            self.cache_dir / f"{file_hash}.md",
            self.cache_dir / f"{file_hash}.json",
        )

    def get_cached(self, file_path: Path) -> Optional[str]:
        """Return the cached Markdown for *file_path*, or None on a miss.

        A hit requires caching to be enabled, both cache files to exist, and
        the recorded source modification time to equal the file's current
        one. Unreadable or corrupt cache entries count as a miss.
        """
        if not self.use_cache:
            return None

        cache_file, meta_file = self._cache_paths(file_path)
        if not (cache_file.exists() and meta_file.exists()):
            return None

        try:
            with open(meta_file, "r", encoding="utf-8") as f:
                meta = json.load(f)

            # Check if source file was modified
            if meta.get("source_mtime") == file_path.stat().st_mtime:
                return cache_file.read_text(encoding="utf-8")
        except (OSError, ValueError, AttributeError):
            # I/O failure, invalid JSON/encoding, or metadata that is not a dict.
            return None

        return None

    def cache_result(self, file_path: Path, markdown: str):
        """Store *markdown* in the cache for *file_path* (no-op if caching is off).

        Caching is best-effort: an I/O failure is reported as a warning
        rather than raised, so a successful conversion is not lost.
        """
        if not self.use_cache:
            return

        try:
            cache_file, meta_file = self._cache_paths(file_path)
            cache_file.write_text(markdown, encoding="utf-8")

            meta = {
                "source_file": str(file_path),
                "source_mtime": file_path.stat().st_mtime,
                "converted_at": datetime.now().isoformat(),
                "engine": self.engine,
            }
            with open(meta_file, "w", encoding="utf-8") as f:
                json.dump(meta, f)
        except OSError as e:
            print(f"Warning: Could not write cache entry: {e}")

    @staticmethod
    def _write_output(output_path: Union[str, Path], markdown: str) -> None:
        """Write *markdown* to *output_path*, creating parent directories."""
        out_path = Path(output_path)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(markdown, encoding="utf-8")

    def convert(
        self,
        input_path: Union[str, Path],
        output_path: Optional[Union[str, Path]] = None,
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> str:
        """
        Convert a document to Markdown.

        Args:
            input_path: Path to PDF, DOCX, PPTX, or HTML file
            output_path: Optional path to save markdown
            progress_callback: Optional callback for progress updates

        Returns:
            Markdown content as string

        Raises:
            FileNotFoundError: If *input_path* does not exist.
            RuntimeError: If no conversion engine can handle the file.
        """
        input_path = Path(input_path)

        if not input_path.exists():
            raise FileNotFoundError(f"File not found: {input_path}")

        # Check cache first
        cached = self.get_cached(input_path)
        if cached:
            if output_path:
                # Same directory-creating write as the uncached path below.
                self._write_output(output_path, cached)
            return cached

        is_pdf = input_path.suffix.lower() == ".pdf"
        use_engine = self.engine

        # Auto-detect best engine for PDFs
        if use_engine == "auto" and is_pdf:
            analysis = analyze_pdf_complexity(input_path)
            use_engine = analysis.get("recommended_engine", "docling")

        if use_engine == "docling" and self._docling_converter:
            markdown = self._convert_with_docling(input_path, progress_callback)
        elif use_engine == "pdfmd" and HAS_PDFMD and is_pdf:
            markdown = self._convert_with_pdfmd(input_path, progress_callback)
        elif self._docling_converter:
            # Preferred engine unavailable or non-PDF input: fall back to Docling
            markdown = self._convert_with_docling(input_path, progress_callback)
        elif HAS_PDFMD and is_pdf:
            markdown = self._convert_with_pdfmd(input_path, progress_callback)
        else:
            raise RuntimeError("No conversion engine available. Install docling or pdfmd.")

        # Validate output
        if not markdown.strip():
            print("Warning: Conversion produced empty output")

        # Append the provenance trailer once; empty output is left empty.
        if markdown and not markdown.rstrip().endswith(self._WATERMARK):
            markdown += "\n\n" + self._WATERMARK

        # Cache result
        self.cache_result(input_path, markdown)

        # Save to file if requested
        if output_path:
            self._write_output(output_path, markdown)

        return markdown

    def _convert_with_docling(
        self,
        input_path: Path,
        progress_callback: Optional[Callable] = None,
    ) -> str:
        """Convert using IBM Docling (highest quality).

        Progress is reported coarsely as 0, 50 and 100 out of 100.
        """
        if progress_callback:
            progress_callback(0, 100)

        result = self._docling_converter.convert(str(input_path))

        if progress_callback:
            progress_callback(50, 100)

        # Export to markdown
        markdown = result.document.export_to_markdown()

        if progress_callback:
            progress_callback(100, 100)

        return markdown

    def _convert_with_pdfmd(
        self,
        input_path: Path,
        progress_callback: Optional[Callable] = None,
    ) -> str:
        """Convert using pdfmd (fallback).

        The pdfmd pipeline writes to a file, so a temporary ``.md`` file is
        used as its output, read back, and always removed afterwards.
        """
        import tempfile

        # delete=False: only the name is needed; the pipeline reopens the path.
        with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as tmp:
            tmp_path = tmp.name

        try:
            options = PdfmdOptions()
            pdfmd_convert(
                input_pdf=str(input_path),
                output_md=tmp_path,
                options=options,
                progress_cb=progress_callback,
            )

            return Path(tmp_path).read_text(encoding="utf-8")
        finally:
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)

    def get_document_info(self, input_path: Union[str, Path]) -> dict:
        """Return basic facts about a document.

        The dict holds the file name, absolute path, size in bytes and in
        MiB (rounded to 2 decimals), lower-cased extension, content hash,
        and which engines are importable.
        """
        input_path = Path(input_path)
        size_bytes = input_path.stat().st_size  # stat once, reuse for both sizes

        return {
            "filename": input_path.name,
            "path": str(input_path.absolute()),
            "size_bytes": size_bytes,
            "size_mb": round(size_bytes / (1024 * 1024), 2),
            "extension": input_path.suffix.lower(),
            "file_hash": self.get_file_hash(input_path),
            "has_docling": HAS_DOCLING,
            "has_pdfmd": HAS_PDFMD,
        }
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
# Global instance for convenience
|
|
376
|
+
_engine: Optional[thinkpdfEngine] = None


def get_engine() -> thinkpdfEngine:
    """Return the shared module-level engine, building it on first use."""
    global _engine
    if _engine is not None:
        return _engine
    _engine = thinkpdfEngine()
    return _engine
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
def convert(
    input_path: Union[str, Path],
    output_path: Optional[Union[str, Path]] = None,
) -> str:
    """Convert *input_path* to Markdown using the shared engine.

    Convenience wrapper: forwards both arguments to the ``convert`` method
    of the engine returned by ``get_engine()`` and returns its result.
    """
    shared_engine = get_engine()
    return shared_engine.convert(input_path, output_path)
|