thinkpdf 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,252 @@
1
+ """
2
+ Cache Manager - Intelligent caching for PDF conversions.
3
+
4
+ Avoids re-processing PDFs that haven't changed, saving time
5
+ and computational resources.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import hashlib
11
+ import json
12
+ import os
13
+ from dataclasses import dataclass, asdict
14
+ from datetime import datetime
15
+ from pathlib import Path
16
+ from typing import Optional, Dict, Any
17
+
18
+
19
@dataclass
class CacheEntry:
    """Record describing one cached conversion result."""

    # SHA256 digest of the source PDF at conversion time.
    file_hash: str
    # Absolute path of the source PDF.
    file_path: str
    # Size of the source PDF in bytes.
    file_size: int
    # ISO-8601 timestamp of when the conversion ran.
    conversion_time: str
    # Hash of the conversion options ("" when none were supplied).
    options_hash: str
    # Location of the rendered markdown file on disk.
    markdown_path: str
    # Arbitrary extra metadata stored alongside the entry.
    metadata: Dict[str, Any]

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this entry into a plain, JSON-friendly dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "CacheEntry":
        """Rebuild an entry from a dict produced by :meth:`to_dict`."""
        return cls(**data)
36
+
37
+
38
+ class CacheManager:
39
+ """
40
+ Manage cached PDF conversions.
41
+
42
+ Features:
43
+ - SHA256-based file hashing
44
+ - Options-aware caching (different options = different cache)
45
+ - Automatic cache invalidation on file changes
46
+ - Configurable cache location
47
+ - Cache size limits
48
+ """
49
+
50
+ DEFAULT_CACHE_DIR = Path.home() / ".thinkpdf" / "cache"
51
+ INDEX_FILE = "cache_index.json"
52
+ MAX_CACHE_SIZE_MB = 500
53
+
54
+ def __init__(self, cache_dir: Optional[Path] = None):
55
+ self.cache_dir = cache_dir or self.DEFAULT_CACHE_DIR
56
+ self.cache_dir.mkdir(parents=True, exist_ok=True)
57
+ self.index_path = self.cache_dir / self.INDEX_FILE
58
+ self._index: Dict[str, CacheEntry] = {}
59
+ self._load_index()
60
+
61
+ def get_cached(
62
+ self,
63
+ pdf_path: str | Path,
64
+ options_hash: Optional[str] = None,
65
+ ) -> Optional[str]:
66
+ """
67
+ Get cached markdown if available.
68
+
69
+ Args:
70
+ pdf_path: Path to the PDF file
71
+ options_hash: Hash of conversion options
72
+
73
+ Returns:
74
+ Cached markdown content or None if not cached
75
+ """
76
+ pdf_path = Path(pdf_path)
77
+ cache_key = self._make_cache_key(pdf_path, options_hash)
78
+
79
+ if cache_key not in self._index:
80
+ return None
81
+
82
+ entry = self._index[cache_key]
83
+
84
+ # Verify file hasn't changed
85
+ current_hash = self._hash_file(pdf_path)
86
+ if current_hash != entry.file_hash:
87
+ # File changed, invalidate cache
88
+ self._remove_entry(cache_key)
89
+ return None
90
+
91
+ # Read cached markdown
92
+ md_path = Path(entry.markdown_path)
93
+ if not md_path.exists():
94
+ self._remove_entry(cache_key)
95
+ return None
96
+
97
+ return md_path.read_text(encoding="utf-8")
98
+
99
+ def cache(
100
+ self,
101
+ pdf_path: str | Path,
102
+ markdown: str,
103
+ options_hash: Optional[str] = None,
104
+ metadata: Optional[Dict[str, Any]] = None,
105
+ ) -> None:
106
+ """
107
+ Cache a conversion result.
108
+
109
+ Args:
110
+ pdf_path: Path to the original PDF
111
+ markdown: Converted markdown content
112
+ options_hash: Hash of conversion options
113
+ metadata: Optional metadata to store
114
+ """
115
+ pdf_path = Path(pdf_path)
116
+ cache_key = self._make_cache_key(pdf_path, options_hash)
117
+
118
+ # Save markdown file
119
+ md_filename = f"{cache_key}.md"
120
+ md_path = self.cache_dir / md_filename
121
+ md_path.write_text(markdown, encoding="utf-8")
122
+
123
+ # Create cache entry
124
+ entry = CacheEntry(
125
+ file_hash=self._hash_file(pdf_path),
126
+ file_path=str(pdf_path.absolute()),
127
+ file_size=pdf_path.stat().st_size,
128
+ conversion_time=datetime.now().isoformat(),
129
+ options_hash=options_hash or "",
130
+ markdown_path=str(md_path),
131
+ metadata=metadata or {},
132
+ )
133
+
134
+ self._index[cache_key] = entry
135
+ self._save_index()
136
+
137
+ # Check cache size
138
+ self._enforce_size_limit()
139
+
140
+ def invalidate(self, pdf_path: str | Path) -> None:
141
+ """Invalidate all cache entries for a PDF file."""
142
+ pdf_path = Path(pdf_path)
143
+ keys_to_remove = [
144
+ key for key, entry in self._index.items()
145
+ if Path(entry.file_path) == pdf_path.absolute()
146
+ ]
147
+
148
+ for key in keys_to_remove:
149
+ self._remove_entry(key)
150
+
151
+ self._save_index()
152
+
153
+ def clear(self) -> None:
154
+ """Clear all cached entries."""
155
+ for key in list(self._index.keys()):
156
+ self._remove_entry(key)
157
+
158
+ self._index.clear()
159
+ self._save_index()
160
+
161
+ def get_stats(self) -> Dict[str, Any]:
162
+ """Get cache statistics."""
163
+ total_size = sum(
164
+ Path(entry.markdown_path).stat().st_size
165
+ for entry in self._index.values()
166
+ if Path(entry.markdown_path).exists()
167
+ )
168
+
169
+ return {
170
+ "entries": len(self._index),
171
+ "total_size_bytes": total_size,
172
+ "total_size_mb": total_size / (1024 * 1024),
173
+ "cache_dir": str(self.cache_dir),
174
+ }
175
+
176
+ def _make_cache_key(self, pdf_path: Path, options_hash: Optional[str]) -> str:
177
+ """Create a unique cache key for a PDF + options combination."""
178
+ key_str = str(pdf_path.absolute())
179
+ if options_hash:
180
+ key_str += f":{options_hash}"
181
+
182
+ return hashlib.sha256(key_str.encode()).hexdigest()[:16]
183
+
184
+ def _hash_file(self, file_path: Path) -> str:
185
+ """Calculate SHA256 hash of a file."""
186
+ sha256 = hashlib.sha256()
187
+ with open(file_path, "rb") as f:
188
+ for chunk in iter(lambda: f.read(8192), b""):
189
+ sha256.update(chunk)
190
+ return sha256.hexdigest()
191
+
192
+ def _remove_entry(self, cache_key: str) -> None:
193
+ """Remove a cache entry and its markdown file."""
194
+ if cache_key in self._index:
195
+ entry = self._index[cache_key]
196
+ md_path = Path(entry.markdown_path)
197
+ if md_path.exists():
198
+ md_path.unlink()
199
+ del self._index[cache_key]
200
+
201
+ def _load_index(self) -> None:
202
+ """Load the cache index from disk."""
203
+ if self.index_path.exists():
204
+ try:
205
+ data = json.loads(self.index_path.read_text(encoding="utf-8"))
206
+ self._index = {
207
+ key: CacheEntry.from_dict(value)
208
+ for key, value in data.items()
209
+ }
210
+ except (json.JSONDecodeError, KeyError):
211
+ self._index = {}
212
+
213
+ def _save_index(self) -> None:
214
+ """Save the cache index to disk."""
215
+ data = {
216
+ key: entry.to_dict()
217
+ for key, entry in self._index.items()
218
+ }
219
+ self.index_path.write_text(
220
+ json.dumps(data, indent=2),
221
+ encoding="utf-8",
222
+ )
223
+
224
+ def _enforce_size_limit(self) -> None:
225
+ """Remove oldest entries if cache exceeds size limit."""
226
+ max_size_bytes = self.MAX_CACHE_SIZE_MB * 1024 * 1024
227
+
228
+ # Calculate current size
229
+ entries_with_size = []
230
+ for key, entry in self._index.items():
231
+ md_path = Path(entry.markdown_path)
232
+ if md_path.exists():
233
+ size = md_path.stat().st_size
234
+ entries_with_size.append((key, entry.conversion_time, size))
235
+
236
+ total_size = sum(e[2] for e in entries_with_size)
237
+
238
+ if total_size <= max_size_bytes:
239
+ return
240
+
241
+ # Sort by conversion time (oldest first)
242
+ entries_with_size.sort(key=lambda x: x[1])
243
+
244
+ # Remove oldest until under limit
245
+ for key, _, size in entries_with_size:
246
+ if total_size <= max_size_bytes:
247
+ break
248
+
249
+ self._remove_entry(key)
250
+ total_size -= size
251
+
252
+ self._save_index()
pdfbrain/cli.py ADDED
@@ -0,0 +1,255 @@
1
+ """
2
+ thinkpdf CLI Pro - Uses the full pdfmd pipeline for best quality conversion.
3
+
4
+ This CLI uses the advanced modules from pdfmd for:
5
+ - Table detection and reconstruction
6
+ - Equation/LaTeX detection
7
+ - Header/footer removal
8
+ - Smart paragraph merging
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import argparse
14
+ import sys
15
+ from pathlib import Path
16
+ from typing import List, Optional
17
+
18
+ # Use the pdfmd pipeline
19
+ from .core.pipeline import pdf_to_markdown
20
+ from .core.models import Options
21
+ from .cache.cache_manager import CacheManager
22
+
23
+
24
def create_parser() -> argparse.ArgumentParser:
    """Build and return the thinkpdf command-line argument parser."""
    usage_examples = """
Examples:
  thinkpdf document.pdf              # Convert single file
  thinkpdf document.pdf -o output.md # Specify output path
  thinkpdf folder/ --batch           # Convert all PDFs in folder
    """
    p = argparse.ArgumentParser(
        prog="thinkpdf",
        description="thinkpdf Pro - The Ultimate PDF to Markdown Converter",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # Positional: the PDF (or, with --batch, the folder) to process.
    p.add_argument("input", help="PDF file or folder to convert")

    # Where the markdown ends up; defaults to alongside the input.
    p.add_argument(
        "-o", "--output",
        help="Output markdown file or folder",
        default=None,
    )

    # Behavior toggles.
    p.add_argument(
        "--batch",
        action="store_true",
        help="Batch convert all PDFs in a folder",
    )
    p.add_argument(
        "--no-cache",
        action="store_true",
        help="Skip cache and force re-conversion",
    )
    p.add_argument(
        "--ocr",
        choices=["off", "auto", "force"],
        default="auto",
        help="OCR mode (default: auto)",
    )
    p.add_argument(
        "--export-images",
        action="store_true",
        help="Export images to _assets folder",
    )
    p.add_argument(
        "--password",
        help="Password for encrypted PDFs",
        default=None,
    )

    # Diagnostics.
    p.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Verbose output",
    )
    p.add_argument(
        "--version",
        action="version",
        version="thinkpdf Pro 1.1.0",
    )

    return p
93
+
94
+
95
def convert_single_file(
    input_path: Path,
    output_path: Optional[Path],
    options: Options,
    use_cache: bool,
    password: Optional[str],
    verbose: bool,
) -> bool:
    """Convert a single PDF file using the full pipeline.

    Args:
        input_path: The PDF to convert.
        output_path: Destination markdown file; defaults to the input
            path with a ``.md`` suffix.
        options: pdfmd pipeline options.
        use_cache: Serve/record results through the conversion cache.
        password: Password for encrypted PDFs, if any.
        verbose: Emit per-step log and progress output.

    Returns:
        True on success, False if the conversion raised.
    """

    def log(msg: str) -> None:
        if verbose:
            print(f"  {msg}")

    def progress(done: int, total: int) -> None:
        if verbose:
            # Fix: guard against total == 0 (e.g. zero-page documents),
            # which previously raised ZeroDivisionError.
            pct = done * 100 // total if total else 100
            print(f"  Progress: {pct}%", end="\r")

    # Default the output next to the input.
    if output_path is None:
        output_path = input_path.with_suffix(".md")

    print(f"[PDF] Converting: {input_path.name}")

    cache = CacheManager() if use_cache else None

    if cache:
        # NOTE(review): the cache key ignores conversion options — results
        # produced with different Options are served interchangeably.
        # Consider passing an options hash to get_cached/cache; confirm intent.
        cached = cache.get_cached(input_path)
        # Fix: test against None, not truthiness — a legitimately empty
        # cached markdown previously looked like a cache miss and forced
        # re-conversion every run.
        if cached is not None:
            output_path.write_text(cached, encoding="utf-8")
            print(f"  [CACHE] Loaded from cache -> {output_path.name}")
            return True

    # Convert using the pdfmd pipeline.
    try:
        pdf_to_markdown(
            input_pdf=str(input_path),
            output_md=str(output_path),
            options=options,
            progress_cb=progress if verbose else None,
            log_cb=log,
            pdf_password=password,
        )

        # Read the result back so it can be cached.
        markdown = output_path.read_text(encoding="utf-8")

        if cache:
            cache.cache(input_path, markdown)

        word_count = len(markdown.split())
        print(f"  [OK] Converted -> {output_path.name}")
        print(f"       {word_count} words")

        return True

    except Exception as e:
        # Broad catch is deliberate: batch runs must keep going past a
        # single bad PDF. The failure is reported via the return value.
        print(f"  [ERROR] {e}")
        return False
157
+
158
+
159
def convert_batch(
    input_dir: Path,
    output_dir: Optional[Path],
    options: Options,
    use_cache: bool,
    password: Optional[str],
    verbose: bool,
) -> int:
    """Convert every ``*.pdf`` directly inside *input_dir*.

    Returns the number of files converted successfully.
    """

    pdf_files = list(input_dir.glob("*.pdf"))

    if not pdf_files:
        print(f"No PDF files found in: {input_dir}")
        return 0

    print(f"[BATCH] Converting {len(pdf_files)} files from: {input_dir}")

    # Outputs land next to the inputs unless a separate folder was given.
    if output_dir is None:
        target_dir = input_dir
    else:
        target_dir = Path(output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)

    converted = 0
    for pdf_file in pdf_files:
        destination = target_dir / pdf_file.with_suffix(".md").name
        ok = convert_single_file(
            pdf_file,
            destination,
            options,
            use_cache,
            password,
            verbose,
        )
        if ok:
            converted += 1

    print(f"\n[DONE] Converted {converted}/{len(pdf_files)} files")
    return converted
200
+
201
+
202
def main(args: Optional[List[str]] = None) -> int:
    """CLI entry point.

    Args:
        args: Argument list (useful for tests); defaults to ``sys.argv[1:]``.

    Returns:
        Process exit code — 0 on success, 1 on failure.
    """
    parser = create_parser()
    parsed = parser.parse_args(args)

    input_path = Path(parsed.input)

    if not input_path.exists():
        print(f"[ERROR] Input not found: {input_path}")
        return 1

    # Map CLI flags onto the pdfmd pipeline options.
    options = Options(
        ocr_mode=parsed.ocr,
        export_images=parsed.export_images,
    )

    use_cache = not parsed.no_cache

    # A directory input implies batch mode even without --batch.
    if input_path.is_dir() or parsed.batch:
        if not input_path.is_dir():
            # Fix: was an f-string with no placeholders (useless f prefix).
            print("[ERROR] --batch requires a directory")
            return 1

        output_dir = Path(parsed.output) if parsed.output else None

        success = convert_batch(
            input_path,
            output_dir,
            options,
            use_cache,
            parsed.password,
            parsed.verbose,
        )

        # NOTE(review): an empty folder yields success == 0 and exit code 1
        # even though nothing failed — confirm this is the intended contract.
        return 0 if success > 0 else 1
    else:
        output_path = Path(parsed.output) if parsed.output else None

        success = convert_single_file(
            input_path,
            output_path,
            options,
            use_cache,
            parsed.password,
            parsed.verbose,
        )

        return 0 if success else 1
252
+
253
+
254
if __name__ == "__main__":
    # Propagate main()'s exit code to the shell.
    raise SystemExit(main())
@@ -0,0 +1,6 @@
1
+ """Core extraction and conversion modules."""
2
+
3
+ from .extractor import PDFExtractor
4
+ from .converter import PDFConverter
5
+
6
+ __all__ = ["PDFExtractor", "PDFConverter"]