thinkpdf 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,252 @@
1
+ """
2
+ Cache Manager - Intelligent caching for PDF conversions.
3
+
4
+ Avoids re-processing PDFs that haven't changed, saving time
5
+ and computational resources.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import hashlib
11
+ import json
12
+ import os
13
+ from dataclasses import dataclass, asdict
14
+ from datetime import datetime
15
+ from pathlib import Path
16
+ from typing import Optional, Dict, Any
17
+
18
+
19
@dataclass
class CacheEntry:
    """Record describing one cached conversion result."""

    # SHA256 digest of the source PDF at conversion time.
    file_hash: str
    # Absolute path of the source PDF.
    file_path: str
    # Size of the source PDF in bytes.
    file_size: int
    # ISO-8601 timestamp of when the conversion ran.
    conversion_time: str
    # Hash of the conversion options ("" when none were supplied).
    options_hash: str
    # Location of the rendered markdown file on disk.
    markdown_path: str
    # Arbitrary extra metadata stored alongside the entry.
    metadata: Dict[str, Any]

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this entry into a plain, JSON-friendly dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "CacheEntry":
        """Rebuild an entry from a dict produced by :meth:`to_dict`."""
        return cls(**data)
36
+
37
+
38
+ class CacheManager:
39
+ """
40
+ Manage cached PDF conversions.
41
+
42
+ Features:
43
+ - SHA256-based file hashing
44
+ - Options-aware caching (different options = different cache)
45
+ - Automatic cache invalidation on file changes
46
+ - Configurable cache location
47
+ - Cache size limits
48
+ """
49
+
50
+ DEFAULT_CACHE_DIR = Path.home() / ".thinkpdf" / "cache"
51
+ INDEX_FILE = "cache_index.json"
52
+ MAX_CACHE_SIZE_MB = 500
53
+
54
+ def __init__(self, cache_dir: Optional[Path] = None):
55
+ self.cache_dir = cache_dir or self.DEFAULT_CACHE_DIR
56
+ self.cache_dir.mkdir(parents=True, exist_ok=True)
57
+ self.index_path = self.cache_dir / self.INDEX_FILE
58
+ self._index: Dict[str, CacheEntry] = {}
59
+ self._load_index()
60
+
61
+ def get_cached(
62
+ self,
63
+ pdf_path: str | Path,
64
+ options_hash: Optional[str] = None,
65
+ ) -> Optional[str]:
66
+ """
67
+ Get cached markdown if available.
68
+
69
+ Args:
70
+ pdf_path: Path to the PDF file
71
+ options_hash: Hash of conversion options
72
+
73
+ Returns:
74
+ Cached markdown content or None if not cached
75
+ """
76
+ pdf_path = Path(pdf_path)
77
+ cache_key = self._make_cache_key(pdf_path, options_hash)
78
+
79
+ if cache_key not in self._index:
80
+ return None
81
+
82
+ entry = self._index[cache_key]
83
+
84
+ # Verify file hasn't changed
85
+ current_hash = self._hash_file(pdf_path)
86
+ if current_hash != entry.file_hash:
87
+ # File changed, invalidate cache
88
+ self._remove_entry(cache_key)
89
+ return None
90
+
91
+ # Read cached markdown
92
+ md_path = Path(entry.markdown_path)
93
+ if not md_path.exists():
94
+ self._remove_entry(cache_key)
95
+ return None
96
+
97
+ return md_path.read_text(encoding="utf-8")
98
+
99
+ def cache(
100
+ self,
101
+ pdf_path: str | Path,
102
+ markdown: str,
103
+ options_hash: Optional[str] = None,
104
+ metadata: Optional[Dict[str, Any]] = None,
105
+ ) -> None:
106
+ """
107
+ Cache a conversion result.
108
+
109
+ Args:
110
+ pdf_path: Path to the original PDF
111
+ markdown: Converted markdown content
112
+ options_hash: Hash of conversion options
113
+ metadata: Optional metadata to store
114
+ """
115
+ pdf_path = Path(pdf_path)
116
+ cache_key = self._make_cache_key(pdf_path, options_hash)
117
+
118
+ # Save markdown file
119
+ md_filename = f"{cache_key}.md"
120
+ md_path = self.cache_dir / md_filename
121
+ md_path.write_text(markdown, encoding="utf-8")
122
+
123
+ # Create cache entry
124
+ entry = CacheEntry(
125
+ file_hash=self._hash_file(pdf_path),
126
+ file_path=str(pdf_path.absolute()),
127
+ file_size=pdf_path.stat().st_size,
128
+ conversion_time=datetime.now().isoformat(),
129
+ options_hash=options_hash or "",
130
+ markdown_path=str(md_path),
131
+ metadata=metadata or {},
132
+ )
133
+
134
+ self._index[cache_key] = entry
135
+ self._save_index()
136
+
137
+ # Check cache size
138
+ self._enforce_size_limit()
139
+
140
+ def invalidate(self, pdf_path: str | Path) -> None:
141
+ """Invalidate all cache entries for a PDF file."""
142
+ pdf_path = Path(pdf_path)
143
+ keys_to_remove = [
144
+ key for key, entry in self._index.items()
145
+ if Path(entry.file_path) == pdf_path.absolute()
146
+ ]
147
+
148
+ for key in keys_to_remove:
149
+ self._remove_entry(key)
150
+
151
+ self._save_index()
152
+
153
+ def clear(self) -> None:
154
+ """Clear all cached entries."""
155
+ for key in list(self._index.keys()):
156
+ self._remove_entry(key)
157
+
158
+ self._index.clear()
159
+ self._save_index()
160
+
161
+ def get_stats(self) -> Dict[str, Any]:
162
+ """Get cache statistics."""
163
+ total_size = sum(
164
+ Path(entry.markdown_path).stat().st_size
165
+ for entry in self._index.values()
166
+ if Path(entry.markdown_path).exists()
167
+ )
168
+
169
+ return {
170
+ "entries": len(self._index),
171
+ "total_size_bytes": total_size,
172
+ "total_size_mb": total_size / (1024 * 1024),
173
+ "cache_dir": str(self.cache_dir),
174
+ }
175
+
176
+ def _make_cache_key(self, pdf_path: Path, options_hash: Optional[str]) -> str:
177
+ """Create a unique cache key for a PDF + options combination."""
178
+ key_str = str(pdf_path.absolute())
179
+ if options_hash:
180
+ key_str += f":{options_hash}"
181
+
182
+ return hashlib.sha256(key_str.encode()).hexdigest()[:16]
183
+
184
+ def _hash_file(self, file_path: Path) -> str:
185
+ """Calculate SHA256 hash of a file."""
186
+ sha256 = hashlib.sha256()
187
+ with open(file_path, "rb") as f:
188
+ for chunk in iter(lambda: f.read(8192), b""):
189
+ sha256.update(chunk)
190
+ return sha256.hexdigest()
191
+
192
+ def _remove_entry(self, cache_key: str) -> None:
193
+ """Remove a cache entry and its markdown file."""
194
+ if cache_key in self._index:
195
+ entry = self._index[cache_key]
196
+ md_path = Path(entry.markdown_path)
197
+ if md_path.exists():
198
+ md_path.unlink()
199
+ del self._index[cache_key]
200
+
201
+ def _load_index(self) -> None:
202
+ """Load the cache index from disk."""
203
+ if self.index_path.exists():
204
+ try:
205
+ data = json.loads(self.index_path.read_text(encoding="utf-8"))
206
+ self._index = {
207
+ key: CacheEntry.from_dict(value)
208
+ for key, value in data.items()
209
+ }
210
+ except (json.JSONDecodeError, KeyError):
211
+ self._index = {}
212
+
213
+ def _save_index(self) -> None:
214
+ """Save the cache index to disk."""
215
+ data = {
216
+ key: entry.to_dict()
217
+ for key, entry in self._index.items()
218
+ }
219
+ self.index_path.write_text(
220
+ json.dumps(data, indent=2),
221
+ encoding="utf-8",
222
+ )
223
+
224
+ def _enforce_size_limit(self) -> None:
225
+ """Remove oldest entries if cache exceeds size limit."""
226
+ max_size_bytes = self.MAX_CACHE_SIZE_MB * 1024 * 1024
227
+
228
+ # Calculate current size
229
+ entries_with_size = []
230
+ for key, entry in self._index.items():
231
+ md_path = Path(entry.markdown_path)
232
+ if md_path.exists():
233
+ size = md_path.stat().st_size
234
+ entries_with_size.append((key, entry.conversion_time, size))
235
+
236
+ total_size = sum(e[2] for e in entries_with_size)
237
+
238
+ if total_size <= max_size_bytes:
239
+ return
240
+
241
+ # Sort by conversion time (oldest first)
242
+ entries_with_size.sort(key=lambda x: x[1])
243
+
244
+ # Remove oldest until under limit
245
+ for key, _, size in entries_with_size:
246
+ if total_size <= max_size_bytes:
247
+ break
248
+
249
+ self._remove_entry(key)
250
+ total_size -= size
251
+
252
+ self._save_index()
pdfbrain/cli.py ADDED
@@ -0,0 +1,255 @@
1
+ """
2
+ thinkpdf CLI Pro - Uses the full pdfmd pipeline for best quality conversion.
3
+
4
+ This CLI uses the advanced modules from pdfmd for:
5
+ - Table detection and reconstruction
6
+ - Equation/LaTeX detection
7
+ - Header/footer removal
8
+ - Smart paragraph merging
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import argparse
14
+ import sys
15
+ from pathlib import Path
16
+ from typing import List, Optional
17
+
18
+ # Use the pdfmd pipeline
19
+ from .core.pipeline import pdf_to_markdown
20
+ from .core.models import Options
21
+ from .cache.cache_manager import CacheManager
22
+
23
+
24
def create_parser() -> argparse.ArgumentParser:
    """Build and return the thinkpdf command-line argument parser."""
    usage_examples = """
Examples:
  thinkpdf document.pdf              # Convert single file
  thinkpdf document.pdf -o output.md # Specify output path
  thinkpdf folder/ --batch           # Convert all PDFs in folder
    """
    p = argparse.ArgumentParser(
        prog="thinkpdf",
        description="thinkpdf Pro - The Ultimate PDF to Markdown Converter",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # Positional: the PDF (or, with --batch, the folder) to process.
    p.add_argument("input", help="PDF file or folder to convert")

    # Where the markdown ends up; defaults to alongside the input.
    p.add_argument(
        "-o", "--output",
        help="Output markdown file or folder",
        default=None,
    )

    # Behavior toggles.
    p.add_argument(
        "--batch",
        action="store_true",
        help="Batch convert all PDFs in a folder",
    )
    p.add_argument(
        "--no-cache",
        action="store_true",
        help="Skip cache and force re-conversion",
    )
    p.add_argument(
        "--ocr",
        choices=["off", "auto", "force"],
        default="auto",
        help="OCR mode (default: auto)",
    )
    p.add_argument(
        "--export-images",
        action="store_true",
        help="Export images to _assets folder",
    )
    p.add_argument(
        "--password",
        help="Password for encrypted PDFs",
        default=None,
    )

    # Diagnostics.
    p.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Verbose output",
    )
    p.add_argument(
        "--version",
        action="version",
        version="thinkpdf Pro 1.1.0",
    )

    return p
93
+
94
+
95
def convert_single_file(
    input_path: Path,
    output_path: Optional[Path],
    options: Options,
    use_cache: bool,
    password: Optional[str],
    verbose: bool,
) -> bool:
    """Convert a single PDF file using the full pipeline.

    Args:
        input_path: The PDF to convert.
        output_path: Destination markdown file; defaults to the input
            path with a ``.md`` suffix.
        options: pdfmd pipeline options.
        use_cache: Serve/record results through the conversion cache.
        password: Password for encrypted PDFs, if any.
        verbose: Emit per-step log and progress output.

    Returns:
        True on success, False if the conversion raised.
    """

    def log(msg: str) -> None:
        if verbose:
            print(f"  {msg}")

    def progress(done: int, total: int) -> None:
        if verbose:
            # Fix: guard against total == 0 (e.g. zero-page documents),
            # which previously raised ZeroDivisionError.
            pct = done * 100 // total if total else 100
            print(f"  Progress: {pct}%", end="\r")

    # Default the output next to the input.
    if output_path is None:
        output_path = input_path.with_suffix(".md")

    print(f"[PDF] Converting: {input_path.name}")

    cache = CacheManager() if use_cache else None

    if cache:
        # NOTE(review): the cache key ignores conversion options — results
        # produced with different Options are served interchangeably.
        # Consider passing an options hash to get_cached/cache; confirm intent.
        cached = cache.get_cached(input_path)
        # Fix: test against None, not truthiness — a legitimately empty
        # cached markdown previously looked like a cache miss and forced
        # re-conversion every run.
        if cached is not None:
            output_path.write_text(cached, encoding="utf-8")
            print(f"  [CACHE] Loaded from cache -> {output_path.name}")
            return True

    # Convert using the pdfmd pipeline.
    try:
        pdf_to_markdown(
            input_pdf=str(input_path),
            output_md=str(output_path),
            options=options,
            progress_cb=progress if verbose else None,
            log_cb=log,
            pdf_password=password,
        )

        # Read the result back so it can be cached.
        markdown = output_path.read_text(encoding="utf-8")

        if cache:
            cache.cache(input_path, markdown)

        word_count = len(markdown.split())
        print(f"  [OK] Converted -> {output_path.name}")
        print(f"       {word_count} words")

        return True

    except Exception as e:
        # Broad catch is deliberate: batch runs must keep going past a
        # single bad PDF. The failure is reported via the return value.
        print(f"  [ERROR] {e}")
        return False
157
+
158
+
159
def convert_batch(
    input_dir: Path,
    output_dir: Optional[Path],
    options: Options,
    use_cache: bool,
    password: Optional[str],
    verbose: bool,
) -> int:
    """Convert every ``*.pdf`` directly inside *input_dir*.

    Returns the number of files converted successfully.
    """

    pdf_files = list(input_dir.glob("*.pdf"))

    if not pdf_files:
        print(f"No PDF files found in: {input_dir}")
        return 0

    print(f"[BATCH] Converting {len(pdf_files)} files from: {input_dir}")

    # Outputs land next to the inputs unless a separate folder was given.
    if output_dir is None:
        target_dir = input_dir
    else:
        target_dir = Path(output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)

    converted = 0
    for pdf_file in pdf_files:
        destination = target_dir / pdf_file.with_suffix(".md").name
        ok = convert_single_file(
            pdf_file,
            destination,
            options,
            use_cache,
            password,
            verbose,
        )
        if ok:
            converted += 1

    print(f"\n[DONE] Converted {converted}/{len(pdf_files)} files")
    return converted
200
+
201
+
202
def main(args: Optional[List[str]] = None) -> int:
    """CLI entry point.

    Args:
        args: Argument list (useful for tests); defaults to ``sys.argv[1:]``.

    Returns:
        Process exit code — 0 on success, 1 on failure.
    """
    parser = create_parser()
    parsed = parser.parse_args(args)

    input_path = Path(parsed.input)

    if not input_path.exists():
        print(f"[ERROR] Input not found: {input_path}")
        return 1

    # Map CLI flags onto the pdfmd pipeline options.
    options = Options(
        ocr_mode=parsed.ocr,
        export_images=parsed.export_images,
    )

    use_cache = not parsed.no_cache

    # A directory input implies batch mode even without --batch.
    if input_path.is_dir() or parsed.batch:
        if not input_path.is_dir():
            # Fix: was an f-string with no placeholders (useless f prefix).
            print("[ERROR] --batch requires a directory")
            return 1

        output_dir = Path(parsed.output) if parsed.output else None

        success = convert_batch(
            input_path,
            output_dir,
            options,
            use_cache,
            parsed.password,
            parsed.verbose,
        )

        # NOTE(review): an empty folder yields success == 0 and exit code 1
        # even though nothing failed — confirm this is the intended contract.
        return 0 if success > 0 else 1
    else:
        output_path = Path(parsed.output) if parsed.output else None

        success = convert_single_file(
            input_path,
            output_path,
            options,
            use_cache,
            parsed.password,
            parsed.verbose,
        )

        return 0 if success else 1
252
+
253
+
254
if __name__ == "__main__":
    # Propagate main()'s exit code to the shell.
    raise SystemExit(main())
@@ -0,0 +1,6 @@
1
+ """Core extraction and conversion modules."""
2
+
3
+ from .extractor import PDFExtractor
4
+ from .converter import PDFConverter
5
+
6
+ __all__ = ["PDFExtractor", "PDFConverter"]