PyPI - thinkpdf - Versions diffs - 1.0.1__py3-none-any.whl - Mend

thinkpdf 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

pdfbrain/__init__.py +22 -0
pdfbrain/app_gui.py +530 -0
pdfbrain/cache/__init__.py +5 -0
pdfbrain/cache/cache_manager.py +252 -0
pdfbrain/cli.py +255 -0
pdfbrain/core/__init__.py +6 -0
pdfbrain/core/converter.py +332 -0
pdfbrain/core/equations.py +635 -0
pdfbrain/core/extract.py +469 -0
pdfbrain/core/extractor.py +272 -0
pdfbrain/core/models.py +196 -0
pdfbrain/core/pipeline.py +287 -0
pdfbrain/core/render.py +574 -0
pdfbrain/core/tables.py +871 -0
pdfbrain/core/transform.py +604 -0
pdfbrain/core/utils.py +229 -0
pdfbrain/engine.py +392 -0
pdfbrain/mcp_server.py +315 -0
pdfbrain/utils/__init__.py +1 -0
thinkpdf-1.0.1.dist-info/METADATA +138 -0
thinkpdf-1.0.1.dist-info/RECORD +25 -0
thinkpdf-1.0.1.dist-info/WHEEL +5 -0
thinkpdf-1.0.1.dist-info/entry_points.txt +4 -0
thinkpdf-1.0.1.dist-info/licenses/LICENSE +620 -0
thinkpdf-1.0.1.dist-info/top_level.txt +1 -0

pdfbrain/core/extractor.py ADDED Viewed

@@ -0,0 +1,272 @@
+"""
+PDF Extractor - Core extraction engine using PyMuPDF.
+This module handles the low-level extraction of content from PDF files,
+including text, images, tables, and metadata.
+"""
+from __future__ import annotations
+import hashlib
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import List, Optional, Dict, Any, Callable
+import fitz  # PyMuPDF
+@dataclass
+class TextSpan:
+    """A run of text together with the style attributes recorded for it.
+    ``flags`` is a bit field: ``is_bold`` tests bit 4 (value 16) and
+    ``is_italic`` tests bit 1 (value 2).
+    """
+    text: str
+    font: str = ""
+    size: float = 0.0
+    flags: int = 0  # style bit field, decoded by is_bold / is_italic
+    color: int = 0
+    @property
+    def is_bold(self) -> bool:
+        """Whether bit 4 of ``flags`` is set."""
+        return (self.flags & 0x10) != 0
+    @property
+    def is_italic(self) -> bool:
+        """Whether bit 1 of ``flags`` is set."""
+        return (self.flags & 0x02) != 0
+@dataclass
+class TextLine:
+    """One line of text: an ordered list of spans plus the line's bounding box."""
+    spans: List[TextSpan] = field(default_factory=list)
+    bbox: tuple = (0, 0, 0, 0)
+    @property
+    def text(self) -> str:
+        """The span texts concatenated in order, with no separator."""
+        pieces = [piece.text for piece in self.spans]
+        return "".join(pieces)
+@dataclass
+class TextBlock:
+    """A group of lines sharing one bounding box and a block-type label."""
+    lines: List[TextLine] = field(default_factory=list)
+    bbox: tuple = (0, 0, 0, 0)
+    block_type: str = "text"  # text, image, table
+    @property
+    def text(self) -> str:
+        """The text of every line, one line per row (newline-separated)."""
+        rows = [row.text for row in self.lines]
+        return "\n".join(rows)
+@dataclass
+class PageContent:
+    """Everything collected from one page: text blocks, image records and page size."""
+    page_number: int
+    blocks: List[TextBlock] = field(default_factory=list)
+    images: List[Dict[str, Any]] = field(default_factory=list)
+    width: float = 0.0
+    height: float = 0.0
+    @property
+    def text(self) -> str:
+        """The text of every block, separated by one blank line."""
+        paragraphs = [blk.text for blk in self.blocks]
+        return "\n\n".join(paragraphs)
+@dataclass
+class DocumentContent:
+    """The result of extracting a document: its pages, metadata and file hash."""
+    pages: List[PageContent] = field(default_factory=list)
+    metadata: Dict[str, Any] = field(default_factory=dict)
+    file_hash: str = ""
+    @property
+    def text(self) -> str:
+        """The text of every page, separated by one blank line."""
+        page_texts = [pg.text for pg in self.pages]
+        return "\n\n".join(page_texts)
+    @property
+    def page_count(self) -> int:
+        """How many pages are held in ``pages``."""
+        count = len(self.pages)
+        return count
+class PDFExtractor:
+    """
+    Extract content from PDF files using PyMuPDF.
+    Features:
+    - Fast text extraction with formatting preservation
+    - Image extraction with metadata
+    - Support for password-protected PDFs
+    - Progress callbacks for large documents
+    """
+    def __init__(
+        self,
+        extract_images: bool = False,
+        preserve_formatting: bool = True,
+        progress_callback: Optional[Callable[[int, int], None]] = None,
+    ):
+        """
+        Configure the extractor.
+        Args:
+            extract_images: Also record placement info for image blocks
+            preserve_formatting: Stored on the instance; no method of this class reads it
+            progress_callback: Called as callback(pages_done, total_pages) after each page
+        """
+        self.extract_images = extract_images
+        self.preserve_formatting = preserve_formatting
+        self.progress_callback = progress_callback
+    def extract(
+        self,
+        pdf_path: str | Path,
+        password: Optional[str] = None,
+        page_range: Optional[tuple[int, int]] = None,
+    ) -> DocumentContent:
+        """
+        Extract content from a PDF file.
+        Args:
+            pdf_path: Path to the PDF file
+            password: Optional password for encrypted PDFs
+            page_range: Optional (start, end) page range (0-indexed, inclusive);
+                it is clamped to the pages that exist in the document
+        Returns:
+            DocumentContent with extracted pages and metadata
+        Raises:
+            ValueError: If the PDF is encrypted and the password is missing or rejected
+        """
+        pdf_path = Path(pdf_path)
+        # Calculate file hash for caching
+        file_hash = self._calculate_hash(pdf_path)
+        # The context manager closes the document on every exit path,
+        # including the ValueError raised for encrypted files below.
+        with fitz.open(pdf_path) as doc:
+            if doc.is_encrypted:
+                if not password:
+                    raise ValueError("PDF is encrypted and no password provided")
+                # authenticate() returns a falsy value when the password is rejected
+                if not doc.authenticate(password):
+                    raise ValueError("Incorrect password for encrypted PDF")
+            # Determine page range
+            last_page = doc.page_count - 1
+            start_page, end_page = 0, last_page
+            if page_range:
+                start_page = max(0, page_range[0])
+                end_page = min(last_page, page_range[1])
+            # Extract metadata (guard against a document that reports none)
+            doc_meta = doc.metadata or {}
+            metadata = {
+                "title": doc_meta.get("title", ""),
+                "author": doc_meta.get("author", ""),
+                "subject": doc_meta.get("subject", ""),
+                "creator": doc_meta.get("creator", ""),
+                "page_count": doc.page_count,
+                "file_path": str(pdf_path),
+            }
+            # Extract pages
+            pages = []
+            total_pages = end_page - start_page + 1
+            for done, page_num in enumerate(range(start_page, end_page + 1), start=1):
+                pages.append(self._extract_page(doc[page_num], page_num))
+                if self.progress_callback:
+                    self.progress_callback(done, total_pages)
+        return DocumentContent(
+            pages=pages,
+            metadata=metadata,
+            file_hash=file_hash,
+        )
+    def _extract_page(self, page: fitz.Page, page_number: int) -> PageContent:
+        """Extract text blocks (and image info when enabled) from a single page."""
+        blocks = []
+        images = []
+        # Get page dimensions
+        rect = page.rect
+        width, height = rect.width, rect.height
+        # Extract text blocks with detailed info
+        text_dict = page.get_text("dict", flags=fitz.TEXT_PRESERVE_WHITESPACE)
+        for block in text_dict.get("blocks", []):
+            kind = block.get("type")
+            if kind == 0:  # Text block
+                text_block = self._parse_text_block(block)
+                # Blocks holding only whitespace are dropped
+                if text_block.text.strip():
+                    blocks.append(text_block)
+            elif kind == 1 and self.extract_images:  # Image block
+                image_info = self._extract_image_info(block, page)
+                if image_info:
+                    images.append(image_info)
+        return PageContent(
+            page_number=page_number,
+            blocks=blocks,
+            images=images,
+            width=width,
+            height=height,
+        )
+    def _parse_text_block(self, block: dict) -> TextBlock:
+        """Parse a PyMuPDF text block into our format.
+        Lines that contain no spans are omitted from the result.
+        """
+        lines = []
+        for line in block.get("lines", []):
+            spans = [
+                TextSpan(
+                    text=span.get("text", ""),
+                    font=span.get("font", ""),
+                    size=span.get("size", 0.0),
+                    flags=span.get("flags", 0),
+                    color=span.get("color", 0),
+                )
+                for span in line.get("spans", [])
+            ]
+            if spans:
+                lines.append(TextLine(spans=spans, bbox=line.get("bbox", (0, 0, 0, 0))))
+        return TextBlock(lines=lines, bbox=block.get("bbox", (0, 0, 0, 0)), block_type="text")
+    def _extract_image_info(self, block: dict, page: fitz.Page) -> Optional[Dict[str, Any]]:
+        """Extract image information from a block.
+        Returns:
+            A dict with the block's bbox, its width and height derived from the
+            bbox, and the page number; None when the block data is malformed.
+        """
+        try:
+            bbox = block.get("bbox", (0, 0, 0, 0))
+            return {
+                "bbox": bbox,
+                "width": bbox[2] - bbox[0],
+                "height": bbox[3] - bbox[1],
+                "page": page.number,
+            }
+        except (AttributeError, IndexError, KeyError, TypeError):
+            # Best effort: a malformed block is skipped rather than aborting the page
+            return None
+    def _calculate_hash(self, pdf_path: Path) -> str:
+        """Calculate SHA256 hash of the PDF file, reading it in 8 KiB chunks."""
+        sha256 = hashlib.sha256()
+        with open(pdf_path, "rb") as f:
+            for chunk in iter(lambda: f.read(8192), b""):
+                sha256.update(chunk)
+        return sha256.hexdigest()
+# Convenience function
+def extract_pdf(
+    pdf_path: str | Path,
+    password: Optional[str] = None,
+    extract_images: bool = False,
+) -> DocumentContent:
+    """
+    One-call shortcut: build a PDFExtractor and run it on a single file.
+    Args:
+        pdf_path: Location of the PDF to read
+        password: Password to use if the PDF is encrypted
+        extract_images: Forwarded to PDFExtractor(extract_images=...)
+    Returns:
+        The DocumentContent produced by PDFExtractor.extract
+    """
+    return PDFExtractor(extract_images=extract_images).extract(
+        pdf_path,
+        password=password,
+    )

pdfbrain/core/models.py ADDED Viewed

@@ -0,0 +1,196 @@
+"""Core data models for pdfmd.
+This module defines lightweight, serializable structures that represent the
+intermediate text model we pass through the pipeline:
+- Span:    A run of text with uniform styling.
+- Line:    A sequence of spans that appear on the same baseline.
+- Block:   A group of lines (roughly a paragraph or heading candidate).
+- PageText:All text blocks for a page.
+- Options: User-configurable knobs used by extract/transform/render stages.
+We provide static constructors to build PageText from:
+  • PyMuPDF ("dict" output)
+  • Tesseract (pytesseract.image_to_data dict)
+These constructors keep *only* the essentials the rest of the pipeline needs:
+text runs and coarse style hints (approx size, bold, italic). Layout geometry is
+not preserved beyond what helps basic heuristics.
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import List, Dict, Any, Iterable, Optional, Literal
+# ---------------------------- Text structures ----------------------------
+@dataclass
+class Span:
+    """A run of text with uniform styling, reduced to coarse style hints."""
+    text: str
+    size: float = 0.0  # approximate size; 0.0 when the source supplied none
+    bold: bool = False
+    italic: bool = False
+@dataclass
+class Line:
+    """A sequence of spans that belong to the same line."""
+    spans: List[Span] = field(default_factory=list)
+    def text(self) -> str:
+        """Return the span texts concatenated in order, with no separator."""
+        parts = [span.text for span in self.spans]
+        return "".join(parts)
+@dataclass
+class Block:
+    """A group of lines (roughly a paragraph or heading candidate)."""
+    lines: List[Line] = field(default_factory=list)
+    def is_empty(self) -> bool:
+        """Return True when no span in any line has non-whitespace text."""
+        has_visible_text = any(
+            span.text.strip()
+            for line in self.lines
+            for span in line.spans
+        )
+        return not has_visible_text
+@dataclass
+class PageText:
+    """All text blocks for one page, plus constructors for PyMuPDF and Tesseract data."""
+    blocks: List[Block] = field(default_factory=list)
+    # ------------------------ PyMuPDF constructor ------------------------
+    @staticmethod
+    def from_pymupdf(page_dict: Dict[str, Any]) -> "PageText":
+        """Build a PageText from fitz.Page.get_text("dict").
+        Spans keep their text, size and bold/italic hints and are grouped into
+        lines and blocks following the structure of the input dict. Spans with
+        empty text, lines without spans and blocks without lines are dropped.
+        """
+        # A span counts as bold/italic when its flag bit is set OR the
+        # lower-cased font name contains one of these words.
+        bold_words = ("bold", "black", "heavy", "semibold")
+        italic_words = ("italic", "oblique")
+        blocks: List[Block] = []
+        for raw_block in page_dict.get("blocks", []) or []:
+            if "lines" not in raw_block:
+                # skip images and non-text blocks here
+                continue
+            lines: List[Line] = []
+            for raw_line in raw_block.get("lines", []) or []:
+                spans: List[Span] = []
+                for raw_span in raw_line.get("spans", []) or []:
+                    txt = raw_span.get("text", "") or ""
+                    size = float(raw_span.get("size", 0.0) or 0.0)
+                    flags = int(raw_span.get("flags", 0) or 0)
+                    font = str(raw_span.get("font", "") or "").lower()
+                    if not txt:
+                        continue
+                    bold = bool(flags & 16) or any(w in font for w in bold_words)
+                    italic = bool(flags & 2) or any(w in font for w in italic_words)
+                    spans.append(Span(text=txt, size=size, bold=bold, italic=italic))
+                if spans:
+                    lines.append(Line(spans=spans))
+            if lines:
+                blocks.append(Block(lines=lines))
+        return PageText(blocks=blocks)
+    # ------------------------- Tesseract constructor -------------------------
+    @staticmethod
+    def from_tesseract_data(data: Dict[str, List[Any]]) -> "PageText":
+        """Build PageText from pytesseract.image_to_data() result.
+        The data dict contains parallel lists for keys: level, page_num, block_num,
+        par_num, line_num, word_num, left, top, width, height, conf, text.
+        Words are grouped by (block_num, line_num); bold/italic are not inferred.
+        A crude font-size proxy uses the median of word heights in a line.
+        Words whose text is empty or whitespace are ignored, and any of the
+        block_num/line_num/top/left/height columns that is missing is treated
+        as all zeros.
+        """
+        texts = data.get("text", [])
+        n = len(texts)
+        if n == 0:
+            return PageText()
+        # Resolve every column once. Looking them up per word with a fresh
+        # [0]*n default would allocate an n-element list on every access.
+        zeros = [0] * n
+        block_nums = data.get("block_num", zeros)
+        line_nums = data.get("line_num", zeros)
+        tops = data.get("top", zeros)
+        lefts = data.get("left", zeros)
+        heights = data.get("height", zeros)
+        def cell(column: List[Any], i: int) -> int:
+            # Falsy cells (None, "", 0) count as 0
+            return int(column[i] or 0)
+        # Group word indices by (block_num, line_num)
+        groups: Dict[tuple[int, int], List[int]] = {}
+        for i, raw in enumerate(texts):
+            if not (raw or "").strip():
+                continue
+            groups.setdefault((cell(block_nums, i), cell(line_nums, i)), []).append(i)
+        # Order lines by block number, then by the topmost word of each line
+        ordered_keys = sorted(
+            groups,
+            key=lambda k: (k[0], min(cell(tops, i) for i in groups[k])),
+        )
+        blocks: List[Block] = []
+        cur_block_key: Optional[int] = None
+        cur_block_lines: List[Line] = []
+        for key in ordered_keys:
+            bno = key[0]
+            idxs = groups[key]
+            # estimate size by median of heights in this line
+            size_est = float(median_safe([cell(heights, i) for i in idxs]))
+            # assemble spans in reading order (left coordinate if present)
+            idxs_sorted = sorted(idxs, key=lambda i: cell(lefts, i))
+            line = Line(spans=[Span(text=str(texts[i]), size=size_est) for i in idxs_sorted])
+            if cur_block_lines and bno != cur_block_key:
+                # block number changed: flush the lines gathered so far
+                blocks.append(Block(lines=cur_block_lines))
+                cur_block_lines = []
+            cur_block_key = bno
+            cur_block_lines.append(line)
+        if cur_block_lines:
+            blocks.append(Block(lines=cur_block_lines))
+        return PageText(blocks=blocks)
+# ------------------------------ Options ------------------------------
+@dataclass
+class Options:
+    """User-configurable knobs used by the extract/transform/render stages.
+    Every field has a default, so ``Options()`` is a valid configuration.
+    NOTE(review): the stages that read these fields are not in this module;
+    confirm the exact meaning of each knob against its consumer.
+    """
+    # Extraction / OCR
+    ocr_mode: Literal["off", "auto", "tesseract", "ocrmypdf"] = "off"  # "off" by default
+    preview_only: bool = False
+    # Transform heuristics
+    caps_to_headings: bool = True
+    defragment_short: bool = True
+    heading_size_ratio: float = 1.15  # presumably a size ratio threshold for headings -- TODO confirm
+    orphan_max_len: int = 45  # presumably a character count -- TODO confirm
+    remove_headers_footers: bool = True
+    # Rendering / output
+    insert_page_breaks: bool = False
+    export_images: bool = False
+# ------------------------------ Utilities ------------------------------
+def median_safe(vals: Iterable[int | float]) -> float:
+    """Return the median of ``vals`` as a float, or 0.0 when ``vals`` is empty.
+    For an even number of values the result is the mean of the two middle ones.
+    """
+    ordered = sorted(float(v) for v in vals)
+    count = len(ordered)
+    if count == 0:
+        return 0.0
+    mid = count // 2
+    if count % 2 == 1:
+        return ordered[mid]
+    lower, upper = ordered[mid - 1], ordered[mid]
+    return (lower + upper) / 2.0
+# Names exported by ``from <this module> import *``.
+__all__ = [
+    "Span",
+    "Line",
+    "Block",
+    "PageText",
+    "Options",
+    "median_safe",
+]