PyPI - rc-docparser - Versions diffs - 0.2.0__py3-none-any.whl - Mend

rc-docparser 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

docparser/__init__.py +87 -0
docparser/cli.py +209 -0
docparser/common.py +163 -0
docparser/csvtab.py +131 -0
docparser/docx.py +488 -0
docparser/epub.py +349 -0
docparser/html.py +322 -0
docparser/image.py +343 -0
docparser/localvlm.py +103 -0
docparser/ocr.py +68 -0
docparser/orchestrator.py +304 -0
docparser/pdf.py +430 -0
docparser/pdf_backends.py +89 -0
docparser/pptx.py +332 -0
docparser/py.typed +0 -0
docparser/text.py +189 -0
docparser/xlsx.py +319 -0
rc_docparser-0.2.0.dist-info/METADATA +344 -0
rc_docparser-0.2.0.dist-info/RECORD +22 -0
rc_docparser-0.2.0.dist-info/WHEEL +4 -0
rc_docparser-0.2.0.dist-info/entry_points.txt +2 -0
rc_docparser-0.2.0.dist-info/licenses/LICENSE +21 -0

docparser/pdf.py ADDED Viewed

@@ -0,0 +1,430 @@
+"""PDF parser using PyMuPDF, with optional high-fidelity backends.
+The builtin engine extracts text page-by-page (preserving reading order via
+PyMuPDF's "blocks" API) and embedded raster images, using a best-effort
+heading classifier based on font sizing. On top of that it offers:
+- ``backend=``     route conversion to a third-party engine (pymupdf4llm /
+  docling / marker) for higher-fidelity Markdown (see
+  :mod:`docparser.pdf_backends`).
+- ``ocr=``         ``"off" | "auto" | "force"`` - OCR scanned/low-text pages via
+  the ``[ocr]`` extra (see :mod:`docparser.ocr`).
+- ``extract_tables`` use ``pdfplumber`` (the ``[tables]`` extra) to emit real
+  table blocks instead of flattened text.
+Requires the ``[pdf]`` extra: ``pip install 'rc-docparser[pdf]'``.
+"""
+from __future__ import annotations
+import re
+from collections.abc import Callable
+from pathlib import Path
+from typing import Any
+from .common import (
+    WorkspaceLayout,
+    bytes_sha1,
+    file_sha1,
+    truncate,
+    utc_now_iso,
+    write_json,
+    write_text,
+)
+# Matches text that starts like a figure/table caption: a label ("figure",
+# "fig", "fig.", "table", "scheme", "chart" or "diagram", case-insensitive),
+# an optional ":", "." or "-", a number with optional trailing punctuation,
+# and then at least one whitespace character before the caption body.
+CAPTION_RE = re.compile(
+    r"^\s*(figure|fig\.?|table|scheme|chart|diagram)\s*[:.\-]?\s*\d+[.\:\-]?\s+",
+    re.IGNORECASE,
+)
+# A page with fewer than this many extractable characters is treated as
+# "scanned" by ``ocr="auto"``.
+OCR_AUTO_MIN_CHARS = 80
+def _percentile(values: list[float], pct: float) -> float:
+    if not values:
+        return 0.0
+    s = sorted(values)
+    k = max(0, min(len(s) - 1, round(pct * (len(s) - 1))))
+    return s[k]
+def _render_table_md(rows: list[list[str]]) -> list[str]:
+    if not rows:
+        return []
+    ncols = max(len(r) for r in rows)
+    out = []
+    header = rows[0]
+    header_cells = [
+        (header[c] if c < len(header) else "").replace("|", "\\|").replace("\n", " ")
+        for c in range(ncols)
+    ]
+    out.append("| " + " | ".join(header_cells) + " |")
+    out.append("| " + " | ".join(["---"] * ncols) + " |")
+    for row in rows[1:]:
+        cells = [
+            (row[c] if c < len(row) else "").replace("|", "\\|").replace("\n", " ")
+            for c in range(ncols)
+        ]
+        out.append("| " + " | ".join(cells) + " |")
+    out.append("")
+    return out
+def parse_pdf(
+    source: Path | str,
+    layout: WorkspaceLayout | None = None,
+    *,
+    captioner: Callable[..., dict[str, Any]] | None = None,
+    write_outputs: bool = True,
+    extract_images: bool = True,
+    backend: str = "builtin",
+    ocr: str = "off",
+    extract_tables: bool = False,
+) -> dict[str, Any]:
+    """Parse a PDF into Markdown + JSON.
+    Parameters
+    ----------
+    extract_images : bool
+        If False, skip raster image extraction (faster for text-only docs).
+    backend : str
+        ``"builtin"`` (default) uses PyMuPDF heuristics. ``"pymupdf4llm"``,
+        ``"docling"``, or ``"marker"`` route to the corresponding extra for
+        higher-fidelity Markdown; images are still extracted via PyMuPDF.
+    ocr : str
+        ``"off"`` (default), ``"auto"`` (OCR only low-text pages), or
+        ``"force"`` (OCR every page). Requires the ``[ocr]`` extra.
+    extract_tables : bool
+        If True, extract tables with ``pdfplumber`` (the ``[tables]`` extra)
+        and emit ``table`` blocks.
+    """
+    try:
+        import fitz  # type: ignore  # PyMuPDF
+    except ImportError as exc:  # pragma: no cover
+        raise ImportError(
+            "docparser.pdf.parse_pdf requires the [pdf] extra. "
+            "Install with: pip install 'docparser[pdf]'"
+        ) from exc
+    if ocr not in {"off", "auto", "force"}:
+        raise ValueError(f"ocr must be 'off', 'auto', or 'force'; got {ocr!r}")
+    source = Path(source)
+    layout = layout or WorkspaceLayout()
+    real_source = source.resolve()
+    out_dir = layout.parsed_dir_for(source)
+    asset_dir = layout.assets_dir_for(source)
+    if write_outputs:
+        out_dir.mkdir(parents=True, exist_ok=True)
+        if extract_images:
+            asset_dir.mkdir(parents=True, exist_ok=True)
+    doc = fitz.open(str(real_source))
+    blocks_payload: list[dict[str, Any]] = []
+    images_payload: list[dict[str, Any]] = []
+    image_caption_results: dict[str, dict[str, Any]] = {}
+    section_stack: list[str] = []
+    state = {"image_seq": 0}
+    md_lines: list[str] = [
+        f"# {source.stem}",
+        "",
+        f"> Source: `{source.name}` \u00b7 sha1 `{file_sha1(real_source)[:12]}` "
+        f"\u00b7 parsed `{utc_now_iso()}` \u00b7 pages: {doc.page_count} \u00b7 backend: {backend}",
+        "",
+    ]
+    # ---- shared image extraction (used by every backend) -------------------
+    def process_page_images(page, page_idx: int) -> None:
+        for img_info in page.get_images(full=True) or []:
+            xref = img_info[0]
+            try:
+                img_dict = doc.extract_image(xref)
+            except Exception:
+                continue
+            if not img_dict:
+                continue
+            blob = img_dict.get("image")
+            ext = (img_dict.get("ext") or "png").lower()
+            if not blob:
+                continue
+            state["image_seq"] += 1
+            image_seq = state["image_seq"]
+            sha = bytes_sha1(blob)
+            asset_name = f"img-{image_seq:03d}-{sha[:10]}.{ext}"
+            asset_path = asset_dir / asset_name
+            if write_outputs and not asset_path.exists():
+                asset_path.write_bytes(blob)
+            ctx_before = next(
+                (
+                    b["text"]
+                    for b in reversed(blocks_payload)
+                    if b.get("page") == page_idx + 1
+                    and b.get("kind") in {"paragraph", "caption", "heading"}
+                ),
+                "",
+            )
+            cap = None
+            if captioner is not None:
+                try:
+                    cap = captioner(
+                        image_bytes=blob,
+                        mime=f"image/{'jpeg' if ext == 'jpg' else ext}",
+                        doc_name=f"{source.name} :: page {page_idx + 1}",
+                        nearby_caption="",
+                        context=ctx_before,
+                    )
+                except Exception as exc:
+                    cap = {"error": str(exc)}
+            image_caption_results[str(image_seq)] = cap or {}
+            rel = layout.relpath_from_parsed(asset_path, source)
+            alt = (cap or {}).get("caption") or f"page-{page_idx + 1}-image-{image_seq}"
+            md_lines.append(f"![{alt}]({rel})")
+            if cap and cap.get("description"):
+                md_lines.append("")
+                md_lines.append(f"<!-- vlm: {cap.get('model','')} -->")
+                md_lines.append(f"> **VLM caption.** {cap.get('caption','')}")
+                md_lines.append(">")
+                md_lines.append(f"> {cap.get('description','')}")
+                if cap.get("visible_text"):
+                    md_lines.append(">")
+                    vt = cap["visible_text"].replace("\n", "\n> ")
+                    md_lines.append(f"> *Visible text:* {vt}")
+                if cap.get("tags"):
+                    md_lines.append(">")
+                    md_lines.append("> *Tags:* " + ", ".join(cap["tags"]))
+            md_lines.append("")
+            images_payload.append(
+                {
+                    "seq": image_seq,
+                    "page": page_idx + 1,
+                    "xref": xref,
+                    "ext": ext,
+                    "sha1": sha,
+                    "asset_path": rel,
+                    "context_before": truncate(ctx_before, 600),
+                    "section_path": list(section_stack),
+                    "semantic": cap,
+                }
+            )
+    def page_tables(page_idx: int) -> list[list[list[str]]]:
+        try:
+            import pdfplumber  # type: ignore
+        except ImportError as exc:  # pragma: no cover - optional dep
+            raise ImportError(
+                "extract_tables=True requires the [tables] extra. "
+                "Install with: pip install 'docparser[tables]'"
+            ) from exc
+        out: list[list[list[str]]] = []
+        with pdfplumber.open(str(real_source)) as pdf:
+            if page_idx >= len(pdf.pages):
+                return out
+            for tbl in pdf.pages[page_idx].extract_tables() or []:
+                norm = [[(c or "").strip() for c in row] for row in tbl]
+                if any(any(c for c in row) for row in norm):
+                    out.append(norm)
+        return out
+    if backend == "builtin":
+        # First pass: collect line font sizes to set heading thresholds.
+        all_sizes: list[float] = []
+        for page in doc:
+            td = page.get_text("dict")
+            for block in td.get("blocks", []) or []:
+                if block.get("type") != 0:
+                    continue
+                for line in block.get("lines", []) or []:
+                    for span in line.get("spans", []) or []:
+                        sz = float(span.get("size") or 0.0)
+                        if sz > 0:
+                            all_sizes.append(sz)
+        body_size = _percentile(all_sizes, 0.5) if all_sizes else 11.0
+        h_thresh = body_size * 1.2
+        for page_idx, page in enumerate(doc):
+            page_text_chars = 0
+            td = page.get_text("dict")
+            for b_idx, block in enumerate(td.get("blocks", []) or []):
+                if block.get("type") != 0:  # only text blocks here
+                    continue
+                lines = block.get("lines", []) or []
+                if not lines:
+                    continue
+                line_texts: list[str] = []
+                max_span_size = 0.0
+                any_bold = False
+                for line in lines:
+                    spans = line.get("spans", []) or []
+                    line_text = "".join(span.get("text", "") for span in spans).strip()
+                    if not line_text:
+                        continue
+                    line_texts.append(line_text)
+                    for span in spans:
+                        sz = float(span.get("size") or 0.0)
+                        if sz > max_span_size:
+                            max_span_size = sz
+                        if int(span.get("flags") or 0) & 16:
+                            any_bold = True
+                text = "\n".join(line_texts).strip()
+                if not text:
+                    continue
+                page_text_chars += len(text)
+                location = f"page[{page_idx + 1}].block[{b_idx}]"
+                is_heading = (
+                    max_span_size >= h_thresh
+                    and len(text) <= 140
+                    and not text.endswith(".")
+                )
+                if is_heading:
+                    if max_span_size >= body_size * 1.6:
+                        level = 1
+                    elif max_span_size >= body_size * 1.35:
+                        level = 2
+                    else:
+                        level = 3
+                    while len(section_stack) >= level:
+                        section_stack.pop()
+                    section_stack.append(text)
+                    blocks_payload.append(
+                        {
+                            "kind": "heading",
+                            "level": level,
+                            "text": text,
+                            "size": max_span_size,
+                            "bold": any_bold,
+                            "page": page_idx + 1,
+                            "location": location,
+                            "section_path": list(section_stack),
+                        }
+                    )
+                    md_lines.append(f"{'#' * (level + 1)} {text}")
+                    md_lines.append("")
+                else:
+                    kind = "caption" if CAPTION_RE.match(text) else "paragraph"
+                    blocks_payload.append(
+                        {
+                            "kind": kind,
+                            "text": text,
+                            "size": max_span_size,
+                            "bold": any_bold,
+                            "page": page_idx + 1,
+                            "location": location,
+                            "section_path": list(section_stack),
+                        }
+                    )
+                    md_lines.append(f"*{text}*" if kind == "caption" else text)
+                    md_lines.append("")
+            # ---- OCR fallback for scanned / low-text pages ----------------
+            if ocr == "force" or (ocr == "auto" and page_text_chars < OCR_AUTO_MIN_CHARS):
+                from .ocr import ocr_pdf_page
+                ocr_text = ocr_pdf_page(page)
+                if ocr_text:
+                    blocks_payload.append(
+                        {
+                            "kind": "paragraph",
+                            "text": ocr_text,
+                            "page": page_idx + 1,
+                            "location": f"page[{page_idx + 1}].ocr",
+                            "ocr": True,
+                            "section_path": list(section_stack),
+                        }
+                    )
+                    md_lines.append(ocr_text)
+                    md_lines.append("")
+            # ---- tables ---------------------------------------------------
+            if extract_tables:
+                for t_idx, rows in enumerate(page_tables(page_idx)):
+                    blocks_payload.append(
+                        {
+                            "kind": "table",
+                            "rows": rows,
+                            "page": page_idx + 1,
+                            "location": f"page[{page_idx + 1}].table[{t_idx}]",
+                            "section_path": list(section_stack),
+                        }
+                    )
+                    md_lines.extend(_render_table_md(rows))
+            # ---- images ---------------------------------------------------
+            if extract_images:
+                process_page_images(page, page_idx)
+    else:
+        # External high-fidelity backend: Markdown + derived blocks.
+        from .pdf_backends import run_backend
+        result = run_backend(backend, real_source)
+        for b in result["blocks"]:
+            b.setdefault("page", None)
+            blocks_payload.append(b)
+        if result["markdown"].strip():
+            md_lines.append(result["markdown"].strip())
+            md_lines.append("")
+        if extract_tables:
+            md_lines.append("## Tables")
+            md_lines.append("")
+            for page_idx in range(doc.page_count):
+                for t_idx, rows in enumerate(page_tables(page_idx)):
+                    blocks_payload.append(
+                        {
+                            "kind": "table",
+                            "rows": rows,
+                            "page": page_idx + 1,
+                            "location": f"page[{page_idx + 1}].table[{t_idx}]",
+                            "section_path": [],
+                        }
+                    )
+                    md_lines.extend(_render_table_md(rows))
+        if extract_images:
+            md_lines.append("## Extracted images")
+            md_lines.append("")
+            for page_idx, page in enumerate(doc):
+                process_page_images(page, page_idx)
+    md_text = "\n".join(md_lines).rstrip() + "\n"
+    json_payload = {
+        "source": {
+            "filename": source.name,
+            "absolute_path": str(real_source),
+            "sha1": file_sha1(real_source),
+            "size_bytes": real_source.stat().st_size,
+            "kind": "pdf",
+        },
+        "parsed_at": utc_now_iso(),
+        "backend": backend,
+        "ocr": ocr,
+        "n_pages": doc.page_count,
+        "blocks": blocks_payload,
+        "images": images_payload,
+        "stats": {
+            "n_blocks": len(blocks_payload),
+            "n_headings": sum(1 for b in blocks_payload if b["kind"] == "heading"),
+            "n_paragraphs": sum(1 for b in blocks_payload if b["kind"] == "paragraph"),
+            "n_captions": sum(1 for b in blocks_payload if b["kind"] == "caption"),
+            "n_tables": sum(1 for b in blocks_payload if b["kind"] == "table"),
+            "n_ocr_blocks": sum(1 for b in blocks_payload if b.get("ocr")),
+            "n_images": len(images_payload),
+            "n_captioned_images": sum(
+                1
+                for v in image_caption_results.values()
+                if v and not v.get("error") and v.get("caption")
+            ),
+        },
+    }
+    if write_outputs:
+        write_text(out_dir / "document.md", md_text)
+        write_json(out_dir / "document.json", json_payload)
+    doc.close()
+    return json_payload

docparser/pdf_backends.py ADDED Viewed

@@ -0,0 +1,89 @@
+"""Pluggable high-fidelity PDF -> Markdown backends.
+The builtin PyMuPDF parser in :mod:`docparser.pdf` is fast and dependency-light
+but uses heuristics for layout. For higher fidelity (tables, multi-column,
+formulas) callers can route to a third-party backend. Each backend converts a
+PDF to Markdown; this module normalizes that Markdown into docparser's block
+schema via :func:`docparser.text._blocks_from_markdown`.
+All backends are optional extras and lazily imported:
+- ``pymupdf4llm`` -> ``pip install 'rc-docparser[pymupdf4llm]'`` (note: AGPL/commercial)
+- ``docling``     -> ``pip install 'rc-docparser[docling]'`` (MIT)
+- ``marker``      -> ``pip install 'rc-docparser[marker]'`` (GPL-3.0)
+"""
+from __future__ import annotations
+from pathlib import Path
+from typing import Any
+from .text import _blocks_from_markdown
+# Backend names quoted in the "unknown PDF backend" error raised by
+# ``run_backend``. "builtin" is listed here but has no entry in
+# ``_BACKEND_FUNCS``, so ``run_backend`` itself does not accept it.
+AVAILABLE_BACKENDS = ("builtin", "pymupdf4llm", "docling", "marker")
+def _markdown_pymupdf4llm(path: Path) -> str:
+    try:
+        import pymupdf4llm  # type: ignore
+    except ImportError as exc:  # pragma: no cover - optional dep
+        raise ImportError(
+            "backend='pymupdf4llm' requires the [pymupdf4llm] extra. "
+            "Install with: pip install 'docparser[pymupdf4llm]'"
+        ) from exc
+    return pymupdf4llm.to_markdown(str(path))
+def _markdown_docling(path: Path) -> str:
+    try:
+        from docling.document_converter import DocumentConverter  # type: ignore
+    except ImportError as exc:  # pragma: no cover - optional dep
+        raise ImportError(
+            "backend='docling' requires the [docling] extra. "
+            "Install with: pip install 'docparser[docling]'"
+        ) from exc
+    converter = DocumentConverter()
+    result = converter.convert(str(path))
+    return result.document.export_to_markdown()
+def _markdown_marker(path: Path) -> str:
+    try:
+        from marker.config.parser import ConfigParser  # type: ignore
+        from marker.converters.pdf import PdfConverter  # type: ignore
+        from marker.models import create_model_dict  # type: ignore
+        from marker.output import text_from_rendered  # type: ignore
+    except ImportError as exc:  # pragma: no cover - optional dep
+        raise ImportError(
+            "backend='marker' requires the [marker] extra. "
+            "Install with: pip install 'docparser[marker]'"
+        ) from exc
+    config_parser = ConfigParser({"output_format": "markdown"})
+    converter = PdfConverter(
+        config=config_parser.generate_config_dict(),
+        artifact_dict=create_model_dict(),
+    )
+    rendered = converter(str(path))
+    text, _, _ = text_from_rendered(rendered)
+    return text
+# Dispatch table used by ``run_backend``: backend name -> function that takes
+# the PDF path and returns its Markdown text.
+_BACKEND_FUNCS = {
+    "pymupdf4llm": _markdown_pymupdf4llm,
+    "docling": _markdown_docling,
+    "marker": _markdown_marker,
+}
+def run_backend(backend: str, source: Path) -> dict[str, Any]:
+    """Run an external PDF backend and return normalized output.
+    Returns a dict with ``markdown`` (str) and ``blocks`` (list of typed
+    block dicts derived from the Markdown).
+    """
+    if backend not in _BACKEND_FUNCS:
+        raise ValueError(
+            f"unknown PDF backend {backend!r}; expected one of {AVAILABLE_BACKENDS}"
+        )
+    markdown = _BACKEND_FUNCS[backend](Path(source))
+    blocks = _blocks_from_markdown(markdown or "")
+    return {"markdown": markdown or "", "blocks": blocks}