PyPI - rc-docparser - Versions diffs - 0.2.0__py3-none-any.whl - Mend

rc-docparser 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

docparser/__init__.py +87 -0
docparser/cli.py +209 -0
docparser/common.py +163 -0
docparser/csvtab.py +131 -0
docparser/docx.py +488 -0
docparser/epub.py +349 -0
docparser/html.py +322 -0
docparser/image.py +343 -0
docparser/localvlm.py +103 -0
docparser/ocr.py +68 -0
docparser/orchestrator.py +304 -0
docparser/pdf.py +430 -0
docparser/pdf_backends.py +89 -0
docparser/pptx.py +332 -0
docparser/py.typed +0 -0
docparser/text.py +189 -0
docparser/xlsx.py +319 -0
rc_docparser-0.2.0.dist-info/METADATA +344 -0
rc_docparser-0.2.0.dist-info/RECORD +22 -0
rc_docparser-0.2.0.dist-info/WHEEL +4 -0
rc_docparser-0.2.0.dist-info/entry_points.txt +2 -0
rc_docparser-0.2.0.dist-info/licenses/LICENSE +21 -0

docparser/pptx.py ADDED Viewed

@@ -0,0 +1,332 @@
+"""PPTX parser: emits Markdown + JSON + extracted images.
+Walks slides in presentation order. Each slide becomes a level-2 heading
+(``Slide N`` plus the slide title when present); text frames, tables, and
+pictures are emitted in shape order. Speaker notes are captured per slide.
+Embedded pictures are written to ``layout.assets_dir_for(source)`` and may be
+captioned via a ``captioner`` callable (same contract as the other parsers).
+Requires the ``[pptx]`` extra: ``pip install 'docparser[pptx]'``.
+"""
+from __future__ import annotations
+from collections.abc import Callable
+from pathlib import Path
+from typing import Any
+from .common import (
+    WorkspaceLayout,
+    bytes_sha1,
+    file_sha1,
+    truncate,
+    utc_now_iso,
+    write_json,
+    write_text,
+)
+def _import_pptx():
+    try:
+        from pptx import Presentation  # type: ignore
+        from pptx.enum.shapes import MSO_SHAPE_TYPE  # type: ignore
+        from pptx.util import Emu  # type: ignore  # noqa: F401
+    except ImportError as exc:  # pragma: no cover - optional dep guard
+        raise ImportError(
+            "docparser.pptx.parse_pptx requires the [pptx] extra. "
+            "Install with: pip install 'docparser[pptx]'"
+        ) from exc
+    return Presentation, MSO_SHAPE_TYPE
+def _ext_for(content_type: str, blob_name: str) -> str:
+    mapping = {
+        "image/png": "png",
+        "image/jpeg": "jpg",
+        "image/jpg": "jpg",
+        "image/gif": "gif",
+        "image/bmp": "bmp",
+        "image/tiff": "tiff",
+        "image/webp": "webp",
+        "image/svg+xml": "svg",
+        "image/x-emf": "emf",
+        "image/x-wmf": "wmf",
+    }
+    if content_type in mapping:
+        return mapping[content_type]
+    suf = Path(blob_name or "").suffix.lstrip(".").lower()
+    return suf or "png"
+def parse_pptx(
+    source: Path | str,
+    layout: WorkspaceLayout | None = None,
+    *,
+    captioner: Callable[..., dict[str, Any]] | None = None,
+    write_outputs: bool = True,
+) -> dict[str, Any]:
+    """Parse a PPTX presentation into Markdown + JSON + images.
+    See :func:`docparser.parse_docx` for the parameter conventions.
+    """
+    Presentation, MSO_SHAPE_TYPE = _import_pptx()
+    source = Path(source)
+    layout = layout or WorkspaceLayout()
+    real_source = source.resolve()
+    prs = Presentation(str(real_source))
+    out_dir = layout.parsed_dir_for(source)
+    asset_dir = layout.assets_dir_for(source)
+    if write_outputs:
+        out_dir.mkdir(parents=True, exist_ok=True)
+        asset_dir.mkdir(parents=True, exist_ok=True)
+    blocks: list[dict[str, Any]] = []
+    images: list[dict[str, Any]] = []
+    image_caption_results: dict[str, dict[str, Any]] = {}
+    image_seq = 0
+    section_stack: list[str] = []
+    def _shape_sort_key(shape) -> tuple[int, int]:
+        top = getattr(shape, "top", None)
+        left = getattr(shape, "left", None)
+        return (top if top is not None else 1 << 60, left if left is not None else 1 << 60)
+    def _emit_table(shape, slide_no: int, section_path: list[str]) -> None:
+        table = shape.table
+        rows: list[list[str]] = []
+        for row in table.rows:
+            rows.append([cell.text.strip() for cell in row.cells])
+        if rows:
+            blocks.append(
+                {
+                    "kind": "table",
+                    "rows": [[{"text": c} for c in r] for r in rows],
+                    "slide": slide_no,
+                    "section_path": section_path,
+                }
+            )
+    def _emit_picture(shape, slide_no: int, section_path: list[str], ctx_before: str) -> None:
+        nonlocal image_seq
+        try:
+            image = shape.image
+        except Exception:
+            return
+        blob = image.blob
+        if not blob:
+            return
+        image_seq += 1
+        sha = bytes_sha1(blob)
+        ext = _ext_for(getattr(image, "content_type", "") or "", getattr(image, "filename", "") or "")
+        asset_name = f"img-{image_seq:03d}-{sha[:10]}.{ext}"
+        asset_path = asset_dir / asset_name
+        if write_outputs and not asset_path.exists():
+            asset_path.write_bytes(blob)
+        rel = layout.relpath_from_parsed(asset_path, source)
+        key = str(image_seq)
+        cap = None
+        if captioner is not None:
+            try:
+                cap = captioner(
+                    image_bytes=blob,
+                    mime=getattr(image, "content_type", "") or f"image/{ext}",
+                    doc_name=f"{source.name} :: slide {slide_no}",
+                    nearby_caption="",
+                    context=ctx_before,
+                )
+            except Exception as exc:
+                cap = {"error": str(exc)}
+        image_caption_results[key] = cap or {}
+        images.append(
+            {
+                "seq": image_seq,
+                "slide": slide_no,
+                "sha1": sha,
+                "ext": ext,
+                "asset_path": rel,
+                "context_before": truncate(ctx_before, 600),
+                "section_path": section_path,
+                "semantic": cap,
+            }
+        )
+        blocks.append(
+            {
+                "kind": "image",
+                "image_seq": image_seq,
+                "asset_path": rel,
+                "slide": slide_no,
+                "section_path": section_path,
+            }
+        )
+    for slide_idx, slide in enumerate(prs.slides):
+        slide_no = slide_idx + 1
+        title_text = ""
+        try:
+            if slide.shapes.title is not None and slide.shapes.title.has_text_frame:
+                title_text = (slide.shapes.title.text or "").strip()
+        except Exception:
+            title_text = ""
+        section_stack = [f"Slide {slide_no}" + (f": {title_text}" if title_text else "")]
+        heading_text = section_stack[0]
+        blocks.append(
+            {
+                "kind": "heading",
+                "level": 1,
+                "text": heading_text,
+                "slide": slide_no,
+                "section_path": list(section_stack),
+            }
+        )
+        recent_text = heading_text
+        title_id = id(slide.shapes.title) if slide.shapes.title is not None else None
+        for shape in sorted(slide.shapes, key=_shape_sort_key):
+            if title_id is not None and id(shape) == title_id:
+                continue
+            stype = getattr(shape, "shape_type", None)
+            if shape.has_table:
+                _emit_table(shape, slide_no, list(section_stack))
+                continue
+            if stype == MSO_SHAPE_TYPE.PICTURE or getattr(shape, "shape_type", None) == 13:
+                _emit_picture(shape, slide_no, list(section_stack), recent_text)
+                continue
+            if shape.has_text_frame:
+                tf = shape.text_frame
+                for para in tf.paragraphs:
+                    text = "".join(run.text for run in para.runs).strip()
+                    if not text and para.text:
+                        text = para.text.strip()
+                    if not text:
+                        continue
+                    level = getattr(para, "level", 0) or 0
+                    blocks.append(
+                        {
+                            "kind": "paragraph",
+                            "text": text,
+                            "list_level": level if level > 0 else None,
+                            "slide": slide_no,
+                            "section_path": list(section_stack),
+                        }
+                    )
+                    recent_text = text
+        # speaker notes
+        notes_text = ""
+        try:
+            if slide.has_notes_slide:
+                notes_text = (slide.notes_slide.notes_text_frame.text or "").strip()
+        except Exception:
+            notes_text = ""
+        if notes_text:
+            blocks.append(
+                {
+                    "kind": "notes",
+                    "text": notes_text,
+                    "slide": slide_no,
+                    "section_path": list(section_stack),
+                }
+            )
+    # markdown rendering ----------------------------------------------------
+    md_lines: list[str] = [
+        f"# {source.stem}",
+        "",
+        f"> Source: `{source.name}` \u00b7 sha1 `{file_sha1(real_source)[:12]}` "
+        f"\u00b7 parsed `{utc_now_iso()}` \u00b7 slides: {len(prs.slides._sldIdLst)}",
+        "",
+    ]
+    for b in blocks:
+        kind = b["kind"]
+        if kind == "heading":
+            md_lines.append(f"## {b['text']}")
+            md_lines.append("")
+        elif kind == "paragraph":
+            list_level = b.get("list_level")
+            if list_level:
+                md_lines.append("  " * (list_level - 1) + f"- {b['text']}")
+            else:
+                md_lines.append(b["text"])
+            md_lines.append("")
+        elif kind == "notes":
+            md_lines.append(f"> **Notes.** {b['text']}")
+            md_lines.append("")
+        elif kind == "image":
+            key = str(b["image_seq"])
+            cap = image_caption_results.get(key) or {}
+            alt = cap.get("caption") or f"slide-{b['slide']}-image-{b['image_seq']}"
+            md_lines.append(f"![{alt}]({b['asset_path']})")
+            if cap.get("description"):
+                md_lines.append("")
+                md_lines.append(f"<!-- vlm: {cap.get('model','')} -->")
+                md_lines.append(f"> **VLM caption.** {cap.get('caption','')}")
+                md_lines.append(">")
+                md_lines.append(f"> {cap.get('description','')}")
+                if cap.get("visible_text"):
+                    md_lines.append(">")
+                    vt = cap["visible_text"].replace("\n", "\n> ")
+                    md_lines.append(f"> *Visible text:* {vt}")
+                if cap.get("tags"):
+                    md_lines.append(">")
+                    md_lines.append("> *Tags:* " + ", ".join(cap["tags"]))
+            md_lines.append("")
+        elif kind == "table":
+            rows = b["rows"]
+            if not rows:
+                continue
+            ncols = max(len(r) for r in rows)
+            header = rows[0]
+            header_cells = [
+                (header[c]["text"] if c < len(header) else "").replace("|", "\\|").replace("\n", " ")
+                for c in range(ncols)
+            ]
+            md_lines.append("| " + " | ".join(header_cells) + " |")
+            md_lines.append("| " + " | ".join(["---"] * ncols) + " |")
+            for row in rows[1:]:
+                cells = [
+                    (row[c]["text"] if c < len(row) else "").replace("|", "\\|").replace("\n", " ")
+                    for c in range(ncols)
+                ]
+                md_lines.append("| " + " | ".join(cells) + " |")
+            md_lines.append("")
+    md_text = "\n".join(md_lines).rstrip() + "\n"
+    json_payload = {
+        "source": {
+            "filename": source.name,
+            "absolute_path": str(real_source),
+            "sha1": file_sha1(real_source),
+            "size_bytes": real_source.stat().st_size,
+            "kind": "pptx",
+        },
+        "parsed_at": utc_now_iso(),
+        "n_slides": len(prs.slides._sldIdLst),
+        "blocks": blocks,
+        "images": images,
+        "stats": {
+            "n_blocks": len(blocks),
+            "n_slides": len(prs.slides._sldIdLst),
+            "n_headings": sum(1 for b in blocks if b["kind"] == "heading"),
+            "n_paragraphs": sum(1 for b in blocks if b["kind"] == "paragraph"),
+            "n_tables": sum(1 for b in blocks if b["kind"] == "table"),
+            "n_images": len(images),
+            "n_notes": sum(1 for b in blocks if b["kind"] == "notes"),
+            "n_captioned_images": sum(
+                1
+                for v in image_caption_results.values()
+                if v and not v.get("error") and v.get("caption")
+            ),
+        },
+    }
+    if write_outputs:
+        write_text(out_dir / "document.md", md_text)
+        write_json(out_dir / "document.json", json_payload)
+    return json_payload

docparser/py.typed ADDED Viewed

File without changes

docparser/text.py ADDED Viewed

@@ -0,0 +1,189 @@
+"""Plain-text and Markdown parser (core, no extra dependencies).
+Handles ``.txt`` and ``.md`` / ``.markdown`` files. Markdown is passed through
+to ``document.md`` verbatim while a lightweight block model (headings, list
+items, code fences, paragraphs) is emitted into ``document.json`` so downstream
+RAG layers get structure. Plain text is split into paragraphs on blank lines.
+"""
+from __future__ import annotations
+import re
+from collections.abc import Callable
+from pathlib import Path
+from typing import Any
+from .common import (
+    WorkspaceLayout,
+    file_sha1,
+    utc_now_iso,
+    write_json,
+    write_text,
+)
+_ATX_RE = re.compile(r"^(#{1,6})\s+(.*)$")
+_LIST_RE = re.compile(r"^\s*([-*+]|\d+[.)])\s+(.*)$")
+def _blocks_from_markdown(text: str) -> list[dict[str, Any]]:
+    blocks: list[dict[str, Any]] = []
+    section_stack: list[str] = []
+    in_code = False
+    code_lang = ""
+    code_buf: list[str] = []
+    para_buf: list[str] = []
+    def flush_para() -> None:
+        if para_buf:
+            joined = " ".join(p.strip() for p in para_buf if p.strip()).strip()
+            if joined:
+                blocks.append(
+                    {"kind": "paragraph", "text": joined, "section_path": list(section_stack)}
+                )
+            para_buf.clear()
+    for raw_line in text.splitlines():
+        line = raw_line.rstrip("\n")
+        fence = line.strip()
+        if fence.startswith("```") or fence.startswith("~~~"):
+            if in_code:
+                blocks.append(
+                    {
+                        "kind": "code",
+                        "language": code_lang,
+                        "text": "\n".join(code_buf),
+                        "section_path": list(section_stack),
+                    }
+                )
+                code_buf = []
+                in_code = False
+                code_lang = ""
+            else:
+                flush_para()
+                in_code = True
+                code_lang = fence[3:].strip()
+            continue
+        if in_code:
+            code_buf.append(line)
+            continue
+        m = _ATX_RE.match(line)
+        if m:
+            flush_para()
+            level = len(m.group(1))
+            htext = m.group(2).strip()
+            while len(section_stack) >= level:
+                section_stack.pop()
+            section_stack.append(htext)
+            blocks.append(
+                {
+                    "kind": "heading",
+                    "level": level,
+                    "text": htext,
+                    "section_path": list(section_stack),
+                }
+            )
+            continue
+        lm = _LIST_RE.match(line)
+        if lm:
+            flush_para()
+            blocks.append(
+                {
+                    "kind": "list_item",
+                    "text": lm.group(2).strip(),
+                    "section_path": list(section_stack),
+                }
+            )
+            continue
+        if not line.strip():
+            flush_para()
+            continue
+        para_buf.append(line)
+    if in_code and code_buf:
+        blocks.append(
+            {
+                "kind": "code",
+                "language": code_lang,
+                "text": "\n".join(code_buf),
+                "section_path": list(section_stack),
+            }
+        )
+    flush_para()
+    return blocks
+def _blocks_from_plaintext(text: str) -> list[dict[str, Any]]:
+    blocks: list[dict[str, Any]] = []
+    for chunk in re.split(r"\n\s*\n", text):
+        para = " ".join(line.strip() for line in chunk.splitlines() if line.strip()).strip()
+        if para:
+            blocks.append({"kind": "paragraph", "text": para, "section_path": []})
+    return blocks
+def parse_text(
+    source: Path | str,
+    layout: WorkspaceLayout | None = None,
+    *,
+    captioner: Callable[..., dict[str, Any]] | None = None,
+    write_outputs: bool = True,
+) -> dict[str, Any]:
+    """Parse a ``.txt`` / ``.md`` file into Markdown + JSON.
+    The ``captioner`` argument is accepted for API symmetry but unused (plain
+    text has no embedded images).
+    """
+    _ = captioner
+    source = Path(source)
+    layout = layout or WorkspaceLayout()
+    real_source = source.resolve()
+    raw = real_source.read_text(encoding="utf-8", errors="replace")
+    is_markdown = source.suffix.lower() in {".md", ".markdown", ".mdown", ".mkd"}
+    blocks = _blocks_from_markdown(raw) if is_markdown else _blocks_from_plaintext(raw)
+    out_dir = layout.parsed_dir_for(source)
+    if write_outputs:
+        out_dir.mkdir(parents=True, exist_ok=True)
+    header = (
+        f"> Source: `{source.name}` \u00b7 sha1 `{file_sha1(real_source)[:12]}` "
+        f"\u00b7 parsed `{utc_now_iso()}`"
+    )
+    if is_markdown:
+        md_text = f"{header}\n\n{raw.strip()}\n"
+    else:
+        md_lines = [f"# {source.stem}", "", header, ""]
+        for b in blocks:
+            md_lines.append(b["text"])
+            md_lines.append("")
+        md_text = "\n".join(md_lines).rstrip() + "\n"
+    json_payload = {
+        "source": {
+            "filename": source.name,
+            "absolute_path": str(real_source),
+            "sha1": file_sha1(real_source),
+            "size_bytes": real_source.stat().st_size,
+            "kind": "markdown" if is_markdown else "text",
+        },
+        "parsed_at": utc_now_iso(),
+        "blocks": blocks,
+        "stats": {
+            "n_blocks": len(blocks),
+            "n_headings": sum(1 for b in blocks if b["kind"] == "heading"),
+            "n_paragraphs": sum(1 for b in blocks if b["kind"] == "paragraph"),
+            "n_list_items": sum(1 for b in blocks if b["kind"] == "list_item"),
+            "n_code_blocks": sum(1 for b in blocks if b["kind"] == "code"),
+            "n_chars": len(raw),
+        },
+    }
+    if write_outputs:
+        write_text(out_dir / "document.md", md_text)
+        write_json(out_dir / "document.json", json_payload)
+    return json_payload