PyPI - rc-docparser - Versions diffs - 0.2.0__py3-none-any.whl - Mend

rc-docparser 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

docparser/__init__.py +87 -0
docparser/cli.py +209 -0
docparser/common.py +163 -0
docparser/csvtab.py +131 -0
docparser/docx.py +488 -0
docparser/epub.py +349 -0
docparser/html.py +322 -0
docparser/image.py +343 -0
docparser/localvlm.py +103 -0
docparser/ocr.py +68 -0
docparser/orchestrator.py +304 -0
docparser/pdf.py +430 -0
docparser/pdf_backends.py +89 -0
docparser/pptx.py +332 -0
docparser/py.typed +0 -0
docparser/text.py +189 -0
docparser/xlsx.py +319 -0
rc_docparser-0.2.0.dist-info/METADATA +344 -0
rc_docparser-0.2.0.dist-info/RECORD +22 -0
rc_docparser-0.2.0.dist-info/WHEEL +4 -0
rc_docparser-0.2.0.dist-info/entry_points.txt +2 -0
rc_docparser-0.2.0.dist-info/licenses/LICENSE +21 -0

docparser/docx.py ADDED Viewed

@@ -0,0 +1,488 @@
+"""DOCX parser: emits Markdown + JSON + extracted images.
+Walks the document body in document order so the generated Markdown faithfully
+reflects the source layout. Each embedded image is written to
+``layout.assets_dir_for(source)`` and (optionally) captioned via a callable
+``captioner`` (typically :func:`docparser.image.caption_image` adapted by an
+orchestrator).
+"""
+from __future__ import annotations
+import re
+from collections.abc import Callable, Iterator
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+from docx import Document
+from docx.document import Document as DocxDocument
+from docx.oxml.ns import qn
+from docx.table import Table, _Cell
+from docx.text.paragraph import Paragraph
+from .common import (
+    WorkspaceLayout,
+    bytes_sha1,
+    file_sha1,
+    truncate,
+    utc_now_iso,
+    write_json,
+    write_text,
+)
+# Matches the leading label of a figure/table caption such as "Figure 3: ..."
+# or "Fig. 2 - ...": a keyword, an optional separator, a number, an optional
+# trailing punctuation mark, then at least one whitespace character. Because
+# of the trailing ``\s+``, a bare label with nothing after the number does
+# not match.
+CAPTION_RE = re.compile(
+    r"^\s*(figure|fig\.?|table|scheme|chart|diagram)\s*[:.\-]?\s*\d+[.\:\-]?\s+",
+    re.IGNORECASE,
+)
+# Paragraph style names treated as captions; ``_is_caption`` compares them
+# against the lower-cased style name.
+CAPTION_STYLE_NAMES = {"caption", "figure caption", "table caption"}
+@dataclass
+class ExtractedImage:
+    seq: int
+    rel_id: str
+    filename: str
+    content_type: str
+    blob: bytes
+    sha1: str
+    ext: str
+    asset_path: Path
+    nearby_caption: str = ""
+    context_before: str = ""
+    context_after: str = ""
+    section_path: list[str] = field(default_factory=list)
+    location: str = ""
+    def to_dict(self, source: Path, layout: WorkspaceLayout) -> dict[str, Any]:
+        return {
+            "seq": self.seq,
+            "filename": self.filename,
+            "content_type": self.content_type,
+            "ext": self.ext,
+            "sha1": self.sha1,
+            "asset_path": layout.relpath_from_parsed(self.asset_path, source),
+            "nearby_caption": self.nearby_caption,
+            "context_before": truncate(self.context_before, 600),
+            "context_after": truncate(self.context_after, 600),
+            "section_path": self.section_path,
+            "location": self.location,
+        }
+def _ext_for(content_type: str, filename: str) -> str:
+    mapping = {
+        "image/png": "png",
+        "image/jpeg": "jpg",
+        "image/jpg": "jpg",
+        "image/gif": "gif",
+        "image/bmp": "bmp",
+        "image/tiff": "tiff",
+        "image/webp": "webp",
+        "image/svg+xml": "svg",
+        "image/x-emf": "emf",
+        "image/x-wmf": "wmf",
+    }
+    if content_type in mapping:
+        return mapping[content_type]
+    suf = Path(filename).suffix.lstrip(".").lower()
+    return suf or "bin"
+def _image_blip_ids(elem) -> list[str]:
+    ns = {
+        "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
+        "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
+    }
+    ids: list[str] = []
+    for blip in elem.iter(f"{{{ns['a']}}}blip"):
+        rid = blip.get(f"{{{ns['r']}}}embed")
+        if rid:
+            ids.append(rid)
+    return ids
+def _iter_block_items(parent) -> Iterator[Paragraph | Table]:
+    if isinstance(parent, DocxDocument):
+        body = parent.element.body
+    elif isinstance(parent, _Cell):
+        body = parent._tc
+    else:
+        body = getattr(parent, "_element", parent)
+    for child in body.iterchildren():
+        if child.tag == qn("w:p"):
+            yield Paragraph(child, parent)
+        elif child.tag == qn("w:tbl"):
+            yield Table(child, parent)
+def _para_style(p: Paragraph) -> str:
+    try:
+        style = p.style
+        return ((style.name if style is not None else "") or "").strip()
+    except Exception:
+        return ""
+def _heading_level(style_name: str) -> int | None:
+    if not style_name:
+        return None
+    s = style_name.lower()
+    if s == "title":
+        return 1
+    m = re.match(r"heading\s*(\d+)", s)
+    if m:
+        return min(6, max(1, int(m.group(1))))
+    return None
+def _is_caption(p: Paragraph) -> bool:
+    style = _para_style(p).lower()
+    if style in CAPTION_STYLE_NAMES:
+        return True
+    text = (p.text or "").strip()
+    if CAPTION_RE.match(text):
+        return True
+    return False
+@dataclass
+class _Block:
+    """One item of the parsed document stream, kept in document order."""
+    # Block type; the values used in this module are "heading", "paragraph",
+    # "caption", "image" and "table".
+    kind: str
+    # Kind-specific data, e.g. text/style/section_path, table rows, or the
+    # "image_key" that looks the image up in the parser's image table.
+    payload: dict[str, Any]
+def parse_docx(
+    source: Path | str,
+    layout: WorkspaceLayout | None = None,
+    *,
+    captioner: Callable[..., dict[str, Any]] | None = None,
+    write_outputs: bool = True,
+) -> dict[str, Any]:
+    """Parse a DOCX file into Markdown + JSON + images.
+    Parameters
+    ----------
+    source : Path
+        Path to the .docx file (or a symlink to one).
+    layout : WorkspaceLayout, optional
+        Where to write outputs. Defaults to ``WorkspaceLayout()`` (cwd-relative
+        ``data/parsed``, ``data/assets``, ``.cache``).
+    captioner : callable, optional
+        ``captioner(image_bytes, mime, doc_name, nearby_caption, context) -> dict``.
+        Used to caption every extracted image. If ``None``, no VLM call is made.
+    write_outputs : bool
+        If False, only return the parsed structure (handy for tests).
+    """
+    source = Path(source)
+    layout = layout or WorkspaceLayout()
+    real_source = source.resolve()
+    doc = Document(str(real_source))
+    out_dir = layout.parsed_dir_for(source)
+    asset_dir = layout.assets_dir_for(source)
+    if write_outputs:
+        out_dir.mkdir(parents=True, exist_ok=True)
+        asset_dir.mkdir(parents=True, exist_ok=True)
+    image_parts = doc.part.related_parts
+    rid_to_image: dict[str, ExtractedImage] = {}
+    blocks: list[_Block] = []
+    seq = 0
+    section_stack: list[str] = []
+    def _emit_image_for(elem, location: str) -> None:
+        nonlocal seq
+        for rid in _image_blip_ids(elem):
+            part = image_parts.get(rid)
+            if part is None:
+                continue
+            try:
+                blob = part.blob
+            except Exception:
+                continue
+            ct = getattr(part, "content_type", "") or ""
+            filename = Path(getattr(part, "partname", f"image_{rid}")).name
+            sha = bytes_sha1(blob)
+            ext = _ext_for(ct, filename)
+            seq += 1
+            asset_name = f"img-{seq:03d}-{sha[:10]}.{ext}"
+            asset_path = asset_dir / asset_name
+            if write_outputs and not asset_path.exists():
+                asset_path.write_bytes(blob)
+            img = ExtractedImage(
+                seq=seq,
+                rel_id=rid,
+                filename=filename,
+                content_type=ct,
+                blob=blob,
+                sha1=sha,
+                ext=ext,
+                asset_path=asset_path,
+                section_path=list(section_stack),
+                location=location,
+            )
+            key = f"{location}::{seq}::{rid}"
+            rid_to_image[key] = img
+            blocks.append(_Block("image", {"image_key": key}))
+    def handle_paragraph(p: Paragraph, location: str) -> None:
+        nonlocal section_stack
+        style = _para_style(p)
+        text = (p.text or "").strip()
+        level = _heading_level(style)
+        _emit_image_for(p._element, location)
+        if level is not None and text:
+            while len(section_stack) >= level:
+                section_stack.pop()
+            section_stack.append(text)
+            blocks.append(
+                _Block(
+                    "heading",
+                    {
+                        "level": level,
+                        "text": text,
+                        "style": style,
+                        "section_path": list(section_stack),
+                    },
+                )
+            )
+            return
+        if not text:
+            return
+        kind = "caption" if _is_caption(p) else "paragraph"
+        list_level: int | None = None
+        try:
+            num_pr = p._element.find(qn("w:pPr") + "/" + qn("w:numPr"))
+            if num_pr is not None:
+                ilvl = num_pr.find(qn("w:ilvl"))
+                if ilvl is not None:
+                    list_level = int(ilvl.get(qn("w:val"), "0"))
+        except Exception:
+            list_level = None
+        blocks.append(
+            _Block(
+                kind,
+                {
+                    "text": text,
+                    "style": style,
+                    "list_level": list_level,
+                    "section_path": list(section_stack),
+                },
+            )
+        )
+    def handle_table(tbl: Table, location: str) -> None:
+        rows: list[list[dict[str, Any]]] = []
+        for r_idx, row in enumerate(tbl.rows):
+            row_payload: list[dict[str, Any]] = []
+            for c_idx, cell in enumerate(row.cells):
+                cell_text_parts: list[str] = []
+                for sub in _iter_block_items(cell):
+                    if isinstance(sub, Paragraph):
+                        _emit_image_for(
+                            sub._element, f"{location}.cell[{r_idx},{c_idx}]"
+                        )
+                        if (sub.text or "").strip():
+                            cell_text_parts.append(sub.text.strip())
+                    elif isinstance(sub, Table):
+                        for nrow in sub.rows:
+                            for ncell in nrow.cells:
+                                t = (ncell.text or "").strip()
+                                if t:
+                                    cell_text_parts.append(t)
+                row_payload.append({"text": "\n".join(cell_text_parts).strip()})
+            rows.append(row_payload)
+        blocks.append(
+            _Block(
+                "table",
+                {"rows": rows, "section_path": list(section_stack), "location": location},
+            )
+        )
+    for idx, item in enumerate(_iter_block_items(doc)):
+        loc = f"body[{idx}]"
+        if isinstance(item, Paragraph):
+            handle_paragraph(item, loc)
+        elif isinstance(item, Table):
+            handle_table(item, loc)
+    # context association ---------------------------------------------------
+    for i, b in enumerate(blocks):
+        if b.kind != "image":
+            continue
+        img = rid_to_image[b.payload["image_key"]]
+        for j in range(i + 1, min(i + 4, len(blocks))):
+            nb = blocks[j]
+            if nb.kind == "caption":
+                img.nearby_caption = nb.payload.get("text", "")
+                break
+            if nb.kind == "paragraph" and CAPTION_RE.match(nb.payload.get("text", "") or ""):
+                img.nearby_caption = nb.payload.get("text", "")
+                break
+            if nb.kind == "image":
+                break
+        before_parts: list[str] = []
+        for j in range(i - 1, max(-1, i - 5), -1):
+            nb = blocks[j]
+            if nb.kind in {"paragraph", "caption", "heading"}:
+                before_parts.append(nb.payload.get("text", ""))
+                if len(before_parts) >= 2:
+                    break
+            if nb.kind == "image":
+                break
+        img.context_before = " \u00b6 ".join(reversed(before_parts)).strip()
+        after_parts: list[str] = []
+        skipped_caption = False
+        for j in range(i + 1, min(i + 6, len(blocks))):
+            nb = blocks[j]
+            if nb.kind == "image":
+                break
+            if nb.kind == "caption" and not skipped_caption:
+                skipped_caption = True
+                continue
+            if nb.kind in {"paragraph", "heading"}:
+                after_parts.append(nb.payload.get("text", ""))
+                if len(after_parts) >= 2:
+                    break
+        img.context_after = " \u00b6 ".join(after_parts).strip()
+    # captioning ------------------------------------------------------------
+    image_caption_results: dict[str, dict[str, Any]] = {}
+    if captioner is not None:
+        for key, img in rid_to_image.items():
+            mime = img.content_type or "image/png"
+            try:
+                result = captioner(
+                    image_bytes=img.blob,
+                    mime=mime,
+                    doc_name=source.name,
+                    nearby_caption=img.nearby_caption,
+                    context=(img.context_before + " \u00b6 " + img.context_after).strip(),
+                )
+            except Exception as exc:
+                result = {"error": f"captioner exception: {exc}"}
+            image_caption_results[key] = result
+    # markdown rendering ----------------------------------------------------
+    md_lines: list[str] = []
+    md_lines.append(f"# {source.stem}")
+    md_lines.append("")
+    md_lines.append(
+        f"> Source: `{source.name}` \u00b7 sha1 `{file_sha1(real_source)[:12]}` "
+        f"\u00b7 parsed `{utc_now_iso()}`"
+    )
+    md_lines.append("")
+    for b in blocks:
+        if b.kind == "heading":
+            level = b.payload["level"]
+            md_lines.append(f"{'#' * (level + 1)} {b.payload['text']}")
+            md_lines.append("")
+        elif b.kind == "paragraph":
+            text = b.payload.get("text", "")
+            list_level = b.payload.get("list_level")
+            if list_level is not None:
+                indent = "  " * list_level
+                md_lines.append(f"{indent}- {text}")
+            else:
+                md_lines.append(text)
+            md_lines.append("")
+        elif b.kind == "caption":
+            md_lines.append(f"*{b.payload.get('text', '')}*")
+            md_lines.append("")
+        elif b.kind == "image":
+            key = b.payload["image_key"]
+            img = rid_to_image[key]
+            rel = layout.relpath_from_parsed(img.asset_path, source)
+            cap_data = image_caption_results.get(key) or {}
+            short = cap_data.get("caption") or img.nearby_caption or img.filename
+            alt = (short or "figure").replace("\n", " ").replace("|", "/")
+            md_lines.append(f"![{alt}]({rel})")
+            if img.nearby_caption:
+                md_lines.append(f"*{img.nearby_caption}*")
+            if cap_data.get("description"):
+                md_lines.append("")
+                md_lines.append(f"<!-- vlm: {cap_data.get('model','')} -->")
+                md_lines.append(f"> **VLM caption.** {cap_data['caption']}")
+                md_lines.append(">")
+                md_lines.append(f"> {cap_data['description']}")
+                if cap_data.get("visible_text"):
+                    md_lines.append(">")
+                    vt = cap_data["visible_text"].replace("\n", "\n> ")
+                    md_lines.append(f"> *Visible text:* {vt}")
+                if cap_data.get("tags"):
+                    md_lines.append(">")
+                    md_lines.append("> *Tags:* " + ", ".join(cap_data["tags"]))
+                if cap_data.get("domain_relevance"):
+                    md_lines.append(">")
+                    md_lines.append(f"> *Relevance:* {cap_data['domain_relevance']}")
+            md_lines.append("")
+        elif b.kind == "table":
+            rows = b.payload["rows"]
+            if not rows:
+                continue
+            ncols = max(len(r) for r in rows)
+            header = rows[0]
+            header_cells = [
+                (header[c]["text"] if c < len(header) else "")
+                .replace("|", "\\|")
+                .replace("\n", " ")
+                for c in range(ncols)
+            ]
+            md_lines.append("| " + " | ".join(header_cells or [""]) + " |")
+            md_lines.append("| " + " | ".join(["---"] * ncols) + " |")
+            for row in rows[1:]:
+                cells = [
+                    (row[c]["text"] if c < len(row) else "")
+                    .replace("|", "\\|")
+                    .replace("\n", " ")
+                    for c in range(ncols)
+                ]
+                md_lines.append("| " + " | ".join(cells) + " |")
+            md_lines.append("")
+    md_text = "\n".join(md_lines).rstrip() + "\n"
+    images_json = []
+    for key, img in rid_to_image.items():
+        d = img.to_dict(source, layout)
+        d["semantic"] = image_caption_results.get(key)
+        images_json.append(d)
+    images_json.sort(key=lambda d: d["seq"])
+    json_payload = {
+        "source": {
+            "filename": source.name,
+            "absolute_path": str(real_source),
+            "sha1": file_sha1(real_source),
+            "size_bytes": real_source.stat().st_size,
+            "kind": "docx",
+        },
+        "parsed_at": utc_now_iso(),
+        "blocks": [{"kind": b.kind, **b.payload} for b in blocks],
+        "images": images_json,
+        "stats": {
+            "n_blocks": len(blocks),
+            "n_headings": sum(1 for b in blocks if b.kind == "heading"),
+            "n_paragraphs": sum(1 for b in blocks if b.kind == "paragraph"),
+            "n_tables": sum(1 for b in blocks if b.kind == "table"),
+            "n_images": len(rid_to_image),
+            "n_captioned_images": sum(
+                1
+                for v in image_caption_results.values()
+                if v and not v.get("error") and v.get("caption")
+            ),
+        },
+    }
+    if write_outputs:
+        write_text(out_dir / "document.md", md_text)
+        write_json(out_dir / "document.json", json_payload)
+    return json_payload