PyPI - rc-docparser - Versions diffs - 0.2.0__py3-none-any.whl - Mend

rc-docparser 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

docparser/__init__.py +87 -0
docparser/cli.py +209 -0
docparser/common.py +163 -0
docparser/csvtab.py +131 -0
docparser/docx.py +488 -0
docparser/epub.py +349 -0
docparser/html.py +322 -0
docparser/image.py +343 -0
docparser/localvlm.py +103 -0
docparser/ocr.py +68 -0
docparser/orchestrator.py +304 -0
docparser/pdf.py +430 -0
docparser/pdf_backends.py +89 -0
docparser/pptx.py +332 -0
docparser/py.typed +0 -0
docparser/text.py +189 -0
docparser/xlsx.py +319 -0
rc_docparser-0.2.0.dist-info/METADATA +344 -0
rc_docparser-0.2.0.dist-info/RECORD +22 -0
rc_docparser-0.2.0.dist-info/WHEEL +4 -0
rc_docparser-0.2.0.dist-info/entry_points.txt +2 -0
rc_docparser-0.2.0.dist-info/licenses/LICENSE +21 -0

docparser/image.py ADDED Viewed

@@ -0,0 +1,343 @@
+"""Semantic image captioner via OpenAI-compatible VLM providers.
+Works with any OpenAI-compatible ``/chat/completions`` endpoint. Providers are
+selected via the ``provider`` argument (or the ``DOCPARSER_VLM_PROVIDER`` env
+var) and resolve sensible defaults for base URL, API-key env var, and model:
+- ``openrouter`` (default) - OpenRouter, ``OPENROUTER_API_KEY``
+- ``openai``               - OpenAI, ``OPENAI_API_KEY``
+- ``gemini``               - Google Gemini (OpenAI-compatible), ``GEMINI_API_KEY``
+- ``local``                - any local server (Ollama / vLLM / LM Studio),
+  ``DOCPARSER_VLM_BASE_URL`` (default ``http://localhost:11434/v1``)
+Caches results by ``SHA-1(image_bytes) x model`` so re-runs are free. The cache
+directory is taken from a ``WorkspaceLayout`` (when supplied) or from
+``~/.cache/docparser/vlm/`` by default.
+This module requires the optional ``[vlm]`` extra (``requests``). For a fully
+local, network-free captioner see :mod:`docparser.localvlm` (``[localvlm]``).
+"""
+from __future__ import annotations
+import base64
+import json
+import os
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+from .common import WorkspaceLayout, bytes_sha1
+try:
+    import requests  # type: ignore
+except ImportError:  # pragma: no cover - optional dep
+    requests = None  # type: ignore[assignment]
+# OpenRouter-flavoured module defaults. The first three repeat the values of
+# the "openrouter" entry in PROVIDERS below.
+DEFAULT_MODEL_ENV = "OPENROUTER_VLM_MODEL"
+DEFAULT_MODEL = "anthropic/claude-sonnet-4"
+DEFAULT_BASE_URL = "https://openrouter.ai/api/v1"
+# Fallbacks for the ``HTTP-Referer`` / ``X-Title`` request headers that
+# caption_image() sends when neither an argument nor the OPENROUTER_REFERER /
+# OPENROUTER_TITLE environment variables supply a value.
+DEFAULT_REFERER = "https://github.com/Research-Commons/docparser"
+DEFAULT_TITLE = "docparser"
+# Provider chosen when neither the ``provider`` argument nor the
+# DOCPARSER_VLM_PROVIDER environment variable names one.
+DEFAULT_PROVIDER = "openrouter"
+# Provider presets: defaults for base URL, API-key env var, and model. Any can
+# be overridden by explicit arguments or environment variables.
+# Keys of each preset:
+#   base_url      - endpoint root; "/chat/completions" is appended to it
+#   api_key_env   - name of the env var read for the bearer token
+#   model_env     - name of the env var read for the model id
+#   default_model - model id used when neither argument nor env var is set
+PROVIDERS: dict[str, dict[str, str]] = {
+    "openrouter": {
+        "base_url": "https://openrouter.ai/api/v1",
+        "api_key_env": "OPENROUTER_API_KEY",
+        "model_env": "OPENROUTER_VLM_MODEL",
+        "default_model": "anthropic/claude-sonnet-4",
+    },
+    "openai": {
+        "base_url": "https://api.openai.com/v1",
+        "api_key_env": "OPENAI_API_KEY",
+        "model_env": "OPENAI_VLM_MODEL",
+        "default_model": "gpt-4o-mini",
+    },
+    "gemini": {
+        "base_url": "https://generativelanguage.googleapis.com/v1beta/openai",
+        "api_key_env": "GEMINI_API_KEY",
+        "model_env": "GEMINI_VLM_MODEL",
+        "default_model": "gemini-1.5-flash",
+    },
+    "local": {
+        "base_url": "http://localhost:11434/v1",
+        "api_key_env": "DOCPARSER_VLM_API_KEY",
+        "model_env": "DOCPARSER_VLM_MODEL",
+        "default_model": "llava",
+    },
+}
+def _resolve_provider(provider: str | None) -> tuple[str, dict[str, str]]:
+    """Pick the provider preset to use.
+    Precedence: the explicit ``provider`` argument, then the
+    ``DOCPARSER_VLM_PROVIDER`` environment variable, then ``DEFAULT_PROVIDER``.
+    The chosen name is lower-cased before it is looked up in ``PROVIDERS``.
+    Returns:
+        ``(name, preset)`` where ``preset`` is the matching ``PROVIDERS`` entry.
+    Raises:
+        ValueError: the resolved name is not a key of ``PROVIDERS``.
+    """
+    requested = provider
+    if not requested:
+        requested = os.environ.get("DOCPARSER_VLM_PROVIDER")
+    if not requested:
+        requested = DEFAULT_PROVIDER
+    key = requested.lower()
+    preset = PROVIDERS.get(key)
+    if preset is None:
+        raise ValueError(
+            f"unknown VLM provider {key!r}; expected one of {sorted(PROVIDERS)}"
+        )
+    return key, preset
+# System prompt sent as the "system" message by caption_image(). It asks the
+# model for a bare JSON object with exactly the keys that _coerce() reads.
+# Adjacent literals are concatenated, so a continuation fragment must not start
+# with indentation: there is no newline before it and the spaces would end up
+# in the middle of the sentence.
+DEFAULT_SYSTEM_PROMPT = (
+    "You are a meticulous research assistant building a structured corpus from "
+    "documents. Given an image taken from a document, return a strict JSON "
+    "object with the following keys:\n"
+    "  caption: one concise sentence (<=25 words) suitable as a figure caption.\n"
+    "  description: 2-5 sentence paragraph describing what the image shows.\n"
+    "  visible_text: any text that appears in the image, transcribed verbatim. "
+    "Empty string if none.\n"
+    "  tags: 3-8 short lowercase keywords.\n"
+    "  image_kind: one of [diagram, plot, screenshot, photo, equation, table, "
+    "handwriting, ui, other].\n"
+    "  domain_relevance: one sentence linking the image to the document's "
+    "apparent topic; empty string if not applicable.\n"
+    "Return ONLY the JSON object, no markdown fences, no commentary."
+)
+# User-message template. caption_image() fills it with ``str.format`` using the
+# keyword fields ``doc_name``, ``nearby_caption`` and ``context``, so a custom
+# template must escape any literal braces as ``{{`` / ``}}``.
+DEFAULT_USER_PROMPT_TEMPLATE = (
+    "Document: {doc_name}\n"
+    "Nearby caption text (may be empty or noisy): {nearby_caption}\n"
+    "Surrounding context excerpt: {context}\n\n"
+    "Now analyze the attached image and respond with the JSON object."
+)
+@dataclass
+class VLMResult:
+    """Normalized caption record for one image.
+    ``raw`` holds the unprocessed record the fields were derived from,
+    ``cached`` tells whether the record came from the on-disk cache, and
+    ``error`` is ``None`` unless captioning failed.
+    """
+    caption: str
+    description: str
+    visible_text: str
+    tags: list[str]
+    image_kind: str
+    domain_relevance: str
+    raw: dict[str, Any]
+    model: str
+    cached: bool = False
+    error: str | None = None
+    def to_dict(self) -> dict[str, Any]:
+        """Return every field except ``raw`` as a plain dict, in a fixed order."""
+        exported = (
+            "caption",
+            "description",
+            "visible_text",
+            "tags",
+            "image_kind",
+            "domain_relevance",
+            "model",
+            "cached",
+            "error",
+        )
+        return {field_name: getattr(self, field_name) for field_name in exported}
+def _default_cache_root() -> Path:
+    """Return the fallback cache root, ``<home>/.cache/docparser``."""
+    home_dir = Path.home()
+    return home_dir.joinpath(".cache", "docparser")
+def _cache_path(image_sha1: str, model: str, cache_root: Path) -> Path:
+    """Build the cache file path ``<cache_root>/vlm/<model>/<sha1>.json``.
+    Every ``/`` in ``model`` is replaced by ``__`` so that the model id maps
+    to a single directory name.
+    """
+    model_dir = "__".join(model.split("/"))
+    return cache_root.joinpath("vlm", model_dir, f"{image_sha1}.json")
+def _load_cached(image_sha1: str, model: str, cache_root: Path) -> dict[str, Any] | None:
+    """Return the cached record for ``image_sha1`` x ``model``, or ``None``.
+    The cache is only an optimization, so every way a cache entry can be
+    unusable is reported as a miss (``None``) instead of an exception: the
+    file is missing or unreadable, it is not valid UTF-8, it is not valid
+    JSON, or the JSON document is not an object.
+    """
+    p = _cache_path(image_sha1, model, cache_root)
+    try:
+        # Read directly instead of checking exists() first: the file may
+        # disappear between the check and the read.
+        data = json.loads(p.read_text(encoding="utf-8"))
+    except (OSError, ValueError):
+        # OSError: missing/unreadable file. ValueError covers both
+        # json.JSONDecodeError and UnicodeDecodeError.
+        return None
+    # Callers use dict methods on the record; anything else is corrupt.
+    return data if isinstance(data, dict) else None
+def _save_cache(image_sha1: str, model: str, cache_root: Path, payload: dict[str, Any]) -> None:
+    """Write ``payload`` as the cache record for ``image_sha1`` x ``model``.
+    The JSON is written to a temporary sibling file and then moved over the
+    final path, so a reader never observes a half-written record and an
+    interrupted run cannot leave a truncated cache entry behind.
+    Raises:
+        OSError: the cache directory or file cannot be written.
+        TypeError: ``payload`` is not JSON-serializable.
+    """
+    p = _cache_path(image_sha1, model, cache_root)
+    # Serialize first so a non-serializable payload creates no file at all.
+    text = json.dumps(payload, indent=2, ensure_ascii=False)
+    p.parent.mkdir(parents=True, exist_ok=True)
+    # The process id keeps concurrent writers from sharing a temp file.
+    tmp = p.with_name(f"{p.name}.{os.getpid()}.tmp")
+    try:
+        tmp.write_text(text, encoding="utf-8")
+        tmp.replace(p)
+    finally:
+        # After a successful replace the temp file is gone; this only cleans
+        # up when the write or the move failed.
+        if tmp.exists():
+            tmp.unlink()
+def _coerce(
+    payload: dict[str, Any],
+    *,
+    raw: dict[str, Any],
+    model: str,
+    cached: bool,
+    error: str | None = None,
+) -> VLMResult:
+    """Turn a loosely-typed model payload into a ``VLMResult``.
+    The payload comes from model output, so it is normalized defensively:
+    - a payload that is not a dict is treated as empty;
+    - missing or ``null`` text fields become ``""`` (``image_kind`` becomes
+      ``"other"``) instead of the literal string ``"None"``;
+    - ``tags`` may be a list/tuple or a single comma-separated string; tags
+      are stripped, lower-cased, and empty or ``null`` entries are dropped.
+    ``raw``, ``model``, ``cached`` and ``error`` are stored unchanged.
+    """
+    if not isinstance(payload, dict):
+        payload = {}
+    def _text(key: str, default: str = "") -> str:
+        # Stringify one field; None and blank values fall back to the default.
+        value = payload.get(key)
+        if value is None:
+            return default
+        return str(value).strip() or default
+    raw_tags = payload.get("tags") or []
+    if isinstance(raw_tags, str):
+        # Iterating a string would yield single characters.
+        raw_tags = raw_tags.split(",")
+    elif not isinstance(raw_tags, (list, tuple)):
+        raw_tags = []
+    tags = [
+        str(t).strip().lower()
+        for t in raw_tags
+        if t is not None and str(t).strip()
+    ]
+    return VLMResult(
+        caption=_text("caption"),
+        description=_text("description"),
+        visible_text=_text("visible_text"),
+        tags=tags,
+        image_kind=_text("image_kind", "other"),
+        domain_relevance=_text("domain_relevance"),
+        raw=raw,
+        model=model,
+        cached=cached,
+        error=error,
+    )
+def _extract_json(text: str) -> dict[str, Any]:
+    """Extract the JSON object from a model reply.
+    Two candidates are tried in order and the first that parses to a JSON
+    *object* wins:
+    1. the reply with a surrounding Markdown code fence removed;
+    2. the span from the first ``{`` to the last ``}`` of the reply, which
+       covers replies with commentary, a one-line fence, or an object wrapped
+       in an array.
+    Raises:
+        ValueError: ``text`` is not a string, or no candidate parses to a
+            JSON object (arrays, numbers and strings are rejected because
+            callers need a dict).
+    """
+    if not isinstance(text, str):
+        raise ValueError(f"Could not parse JSON from VLM response: {text!r}")
+    stripped = text.strip()
+    text = stripped
+    if text.startswith("```"):
+        # Drop the opening fence line and, when present, the closing one.
+        lines = text.splitlines()[1:]
+        if lines and lines[-1].startswith("```"):
+            lines = lines[:-1]
+        text = "\n".join(lines).strip()
+    candidates = [text]
+    start = stripped.find("{")
+    end = stripped.rfind("}")
+    if start != -1 and end > start:
+        braced = stripped[start : end + 1]
+        if braced != text:
+            candidates.append(braced)
+    for candidate in candidates:
+        try:
+            parsed = json.loads(candidate)
+        except json.JSONDecodeError:
+            continue
+        if isinstance(parsed, dict):
+            return parsed
+    raise ValueError(f"Could not parse JSON from VLM response: {text[:400]}")
+def caption_image(
+    image_bytes: bytes,
+    *,
+    mime: str = "image/png",
+    doc_name: str = "",
+    nearby_caption: str = "",
+    context: str = "",
+    provider: str | None = None,
+    model: str | None = None,
+    api_key: str | None = None,
+    base_url: str | None = None,
+    referer: str | None = None,
+    title: str | None = None,
+    system_prompt: str | None = None,
+    user_prompt_template: str | None = None,
+    layout: WorkspaceLayout | None = None,
+    cache_root: Path | None = None,
+    max_retries: int = 3,
+    timeout: int = 90,
+) -> VLMResult:
+    """Caption a single image via an OpenAI-compatible VLM, with on-disk caching.
+    The ``provider`` argument selects a preset (see :data:`PROVIDERS`);
+    ``model``, ``api_key``, and ``base_url`` override the preset when given.
+    Each setting resolves as: explicit argument, then environment variable,
+    then preset default. A base-URL environment override exists only for
+    ``openrouter`` (``OPENROUTER_BASE_URL``) and ``local``
+    (``DOCPARSER_VLM_BASE_URL``).
+    Cache key: ``SHA1(image_bytes) x model``. The cache root is ``cache_root``
+    when given, else ``layout.cache_dir``, else ``~/.cache/docparser``. A cache
+    hit is returned before the API key is looked at, so cached images need no
+    credentials. Writing the cache is best-effort: a failed write never
+    discards a caption that was already obtained.
+    An API key is required for every provider except ``local``; local servers
+    are called without an ``Authorization`` header when no key is configured.
+    Network errors, HTTP 429 and HTTP 5xx are retried up to ``max_retries``
+    times with exponential back-off; other failures stop immediately.
+    Returns:
+        A ``VLMResult``. Failures are reported through ``VLMResult.error``
+        (with empty caption fields) rather than raised.
+    Raises:
+        ImportError: the ``requests`` package is not installed.
+        ValueError: the provider name is unknown.
+    Network calls require the ``[vlm]`` extra (``requests``).
+    """
+    if requests is None:  # pragma: no cover - optional dep guard
+        raise ImportError(
+            "docparser.image.caption_image requires the [vlm] extra. "
+            "Install with: pip install 'docparser[vlm]'"
+        )
+    provider_name, preset = _resolve_provider(provider)
+    model = model or os.environ.get(preset["model_env"]) or preset["default_model"]
+    api_key = api_key or os.environ.get(preset["api_key_env"])
+    # Only these providers have a base-URL environment override.
+    base_url_env = {
+        "openrouter": "OPENROUTER_BASE_URL",
+        "local": "DOCPARSER_VLM_BASE_URL",
+    }.get(provider_name)
+    base_url = (
+        base_url
+        or (os.environ.get(base_url_env) if base_url_env else None)
+        or preset["base_url"]
+    )
+    referer = referer or os.environ.get("OPENROUTER_REFERER", DEFAULT_REFERER)
+    title = title or os.environ.get("OPENROUTER_TITLE", DEFAULT_TITLE)
+    system_prompt = system_prompt or DEFAULT_SYSTEM_PROMPT
+    user_prompt_template = user_prompt_template or DEFAULT_USER_PROMPT_TEMPLATE
+    if cache_root is None:
+        cache_root = layout.cache_dir if layout is not None else _default_cache_root()
+    sha = bytes_sha1(image_bytes)
+    cached = _load_cached(sha, model, cache_root)
+    if isinstance(cached, dict):
+        cached_payload = cached.get("payload")
+        return _coerce(
+            cached_payload if isinstance(cached_payload, dict) else {},
+            raw=cached,
+            model=model,
+            cached=True,
+        )
+    # Local servers (Ollama, vLLM, LM Studio) normally need no credentials.
+    if not api_key and provider_name != "local":
+        return _coerce(
+            {},
+            raw={},
+            model=model,
+            cached=False,
+            error=f"{preset['api_key_env']} not set; skipping VLM call.",
+        )
+    b64 = base64.b64encode(image_bytes).decode("ascii")
+    user_msg = user_prompt_template.format(
+        doc_name=doc_name or "(unknown)",
+        nearby_caption=(nearby_caption or "").strip()[:500] or "(none)",
+        context=(context or "").strip()[:1500] or "(none)",
+    )
+    body = {
+        "model": model,
+        "messages": [
+            {"role": "system", "content": system_prompt},
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": user_msg},
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:{mime};base64,{b64}"},
+                    },
+                ],
+            },
+        ],
+        "temperature": 0.2,
+        "max_tokens": 1800,
+        "response_format": {"type": "json_object"},
+    }
+    headers = {
+        "Content-Type": "application/json",
+        "HTTP-Referer": referer,
+        "X-Title": title,
+    }
+    if api_key:
+        headers["Authorization"] = f"Bearer {api_key}"
+    # Tolerate a trailing slash in a user-supplied base URL.
+    url = f"{base_url.rstrip('/')}/chat/completions"
+    last_err: str | None = None
+    for attempt in range(1, max_retries + 1):
+        # Backing off after the final attempt would only delay the error.
+        is_last_attempt = attempt >= max_retries
+        try:
+            resp = requests.post(url, headers=headers, json=body, timeout=timeout)  # type: ignore[arg-type]
+        except requests.RequestException as exc:
+            last_err = f"network error: {exc}"
+            if not is_last_attempt:
+                time.sleep(min(2**attempt, 15))
+            continue
+        if resp.status_code == 429 or resp.status_code >= 500:
+            last_err = f"http {resp.status_code}: {resp.text[:300]}"
+            if not is_last_attempt:
+                time.sleep(min(2**attempt, 20))
+            continue
+        if resp.status_code != 200:
+            last_err = f"http {resp.status_code}: {resp.text[:600]}"
+            break
+        try:
+            data = resp.json()
+        except ValueError:
+            # A 200 with a non-JSON body (e.g. a proxy's HTML page).
+            last_err = f"invalid JSON response body: {resp.text[:400]}"
+            break
+        try:
+            content = data["choices"][0]["message"]["content"]
+        except (KeyError, IndexError, TypeError):
+            last_err = f"unexpected response shape: {json.dumps(data)[:400]}"
+            break
+        if isinstance(content, list):
+            # Some servers return the message as a list of content parts.
+            content = "".join(
+                str(part.get("text") or "") for part in content if isinstance(part, dict)
+            )
+        if not isinstance(content, str):
+            last_err = f"unexpected message content: {content!r}"[:400]
+            break
+        try:
+            payload = _extract_json(content)
+        except ValueError as exc:
+            last_err = str(exc)
+            break
+        if not isinstance(payload, dict):
+            last_err = f"VLM response JSON is not an object: {content[:400]}"
+            break
+        record = {
+            "payload": payload,
+            "model": model,
+            "image_sha1": sha,
+            "doc_name": doc_name,
+            "nearby_caption": nearby_caption,
+            "raw_choice": content,
+        }
+        try:
+            _save_cache(sha, model, cache_root, record)
+        except OSError:
+            # Deliberately best-effort: an unwritable cache must not throw
+            # away a caption that has already been fetched.
+            pass
+        return _coerce(payload, raw=record, model=model, cached=False)
+    return _coerce({}, raw={}, model=model, cached=False, error=last_err or "unknown")

docparser/localvlm.py ADDED Viewed

@@ -0,0 +1,103 @@
+"""Fully-local, network-free image captioning via Hugging Face ``transformers``.
+This is a lighter-weight alternative to the API-based captioner in
+:mod:`docparser.image` for environments without internet access. It produces a
+short caption (and, where the model supports it, a description) using a local
+image-to-text model such as BLIP.
+It honors the same ``VLMResult`` shape and the same ``SHA1(image) x model``
+on-disk cache as the API captioner, so output is interchangeable.
+Requires the ``[localvlm]`` extra: ``pip install 'docparser[localvlm]'``.
+"""
+from __future__ import annotations
+import io
+import json
+import os
+from functools import lru_cache
+from pathlib import Path
+from .common import WorkspaceLayout, bytes_sha1
+from .image import VLMResult, _cache_path, _coerce, _load_cached
+# Model id used by caption_image_local() when neither the ``model`` argument
+# nor the environment variable named by LOCAL_MODEL_ENV is set.
+DEFAULT_LOCAL_MODEL = "Salesforce/blip-image-captioning-large"
+# Name of the environment variable that overrides the default model id.
+LOCAL_MODEL_ENV = "DOCPARSER_LOCAL_VLM_MODEL"
+@lru_cache(maxsize=2)
+def _load_pipeline(model: str):
+    """Build the ``image-to-text`` transformers pipeline for ``model``.
+    Results are memoized per model id (at most two entries).
+    Raises:
+        ImportError: ``transformers`` cannot be imported.
+    """
+    try:
+        from transformers import pipeline  # type: ignore
+    except ImportError as missing:  # pragma: no cover - optional dep
+        raise ImportError(
+            "docparser.localvlm requires the [localvlm] extra. "
+            "Install with: pip install 'docparser[localvlm]'"
+        ) from missing
+    else:
+        return pipeline("image-to-text", model=model)
+def _save_cache(image_sha1: str, model: str, cache_root: Path, payload: dict) -> None:
+    """Write ``payload`` as the cache record for ``image_sha1`` x ``model``.
+    Delegates to :func:`docparser.image._save_cache` so that the local and the
+    API captioner always write cache entries the same way instead of keeping
+    two copies of the same code in step.
+    """
+    # Imported under an alias inside the function so this module-level name
+    # is not shadowed by the helper it forwards to.
+    from .image import _save_cache as _shared_save_cache
+    _shared_save_cache(image_sha1, model, cache_root, payload)
+def caption_image_local(
+    image_bytes: bytes,
+    *,
+    mime: str = "image/png",
+    doc_name: str = "",
+    nearby_caption: str = "",
+    context: str = "",
+    model: str | None = None,
+    layout: WorkspaceLayout | None = None,
+    cache_root: Path | None = None,
+    max_new_tokens: int = 60,
+) -> VLMResult:
+    """Caption an image with a local transformers image-to-text model.
+    ``mime``, ``nearby_caption`` and ``context`` are accepted only for
+    signature parity with :func:`docparser.image.caption_image` and are not
+    used; ``doc_name`` is stored in the cache record. The model id resolves as
+    ``model`` argument, then the ``DOCPARSER_LOCAL_VLM_MODEL`` environment
+    variable, then ``DEFAULT_LOCAL_MODEL``.
+    The generated text is used for both ``caption`` and ``description``.
+    Results are cached per ``SHA1(image) x model``; failures and empty
+    captions are never cached, so a later run can retry them.
+    Returns:
+        A ``VLMResult``. An undecodable image, a failing model call, or an
+        empty caption is reported through ``VLMResult.error``.
+    Raises:
+        ImportError: Pillow or ``transformers`` is not installed.
+    """
+    _ = (mime, nearby_caption, context)
+    model = model or os.environ.get(LOCAL_MODEL_ENV) or DEFAULT_LOCAL_MODEL
+    if cache_root is None:
+        cache_root = layout.cache_dir if layout is not None else (Path.home() / ".cache" / "docparser")
+    sha = bytes_sha1(image_bytes)
+    cached = _load_cached(sha, model, cache_root)
+    if isinstance(cached, dict):
+        cached_payload = cached.get("payload")
+        return _coerce(
+            cached_payload if isinstance(cached_payload, dict) else {},
+            raw=cached,
+            model=model,
+            cached=True,
+        )
+    try:
+        from PIL import Image  # type: ignore
+    except ImportError as exc:  # pragma: no cover - optional dep
+        raise ImportError(
+            "docparser.localvlm requires Pillow (bundled with the [localvlm] extra)."
+        ) from exc
+    pipe = _load_pipeline(model)
+    try:
+        # Decoding is inside the guard too: corrupt image bytes should give an
+        # error result like any other per-image failure, not an exception.
+        img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
+        out = pipe(img, max_new_tokens=max_new_tokens)
+    except Exception as exc:  # pragma: no cover - runtime/model failure
+        return _coerce({}, raw={}, model=model, cached=False, error=f"local vlm error: {exc}")
+    caption = ""
+    if isinstance(out, list) and out and isinstance(out[0], dict):
+        caption = str(out[0].get("generated_text") or "").strip()
+    if not caption:
+        # Caching an empty caption would pin the failure for every later run.
+        return _coerce(
+            {},
+            raw={},
+            model=model,
+            cached=False,
+            error="local vlm error: model returned no caption",
+        )
+    payload = {
+        "caption": caption,
+        "description": caption,
+        "visible_text": "",
+        "tags": [],
+        "image_kind": "other",
+        "domain_relevance": "",
+    }
+    record = {"payload": payload, "model": model, "image_sha1": sha, "doc_name": doc_name}
+    try:
+        _save_cache(sha, model, cache_root, record)
+    except OSError:
+        # Deliberately best-effort: an unwritable cache must not discard a
+        # caption that has already been computed.
+        pass
+    return _coerce(payload, raw=record, model=model, cached=False)

docparser/ocr.py ADDED Viewed

@@ -0,0 +1,68 @@
+"""OCR helpers for scanned PDFs and images.
+Uses ``rapidocr-onnxruntime`` by default: a pure-pip ONNX OCR engine that needs
+no system binaries (unlike Tesseract). The engine is created once and reused.
+Requires the ``[ocr]`` extra: ``pip install 'docparser[ocr]'``.
+"""
+from __future__ import annotations
+import io
+from functools import lru_cache
+from typing import Any
+# Message of the ImportError raised when an OCR dependency cannot be imported.
+_NO_OCR_MSG = (
+    "OCR requires the [ocr] extra. Install with: pip install 'docparser[ocr]'"
+)
+@lru_cache(maxsize=1)
+def _engine():
+    """Create the ``RapidOCR`` engine on first use and reuse it afterwards.
+    Raises:
+        ImportError: ``rapidocr_onnxruntime`` cannot be imported.
+    """
+    try:
+        from rapidocr_onnxruntime import RapidOCR  # type: ignore
+    except ImportError as missing:  # pragma: no cover - optional dep
+        raise ImportError(_NO_OCR_MSG) from missing
+    else:
+        return RapidOCR()
+def ocr_available() -> bool:
+    """Report whether ``rapidocr_onnxruntime`` can be imported."""
+    try:
+        import rapidocr_onnxruntime  # type: ignore  # noqa: F401
+    except ImportError:
+        importable = False
+    else:
+        importable = True
+    return importable
+def ocr_image_bytes(blob: bytes) -> str:
+    """Run OCR over raw image bytes and return the text, one line per item.
+    The image is decoded with Pillow, converted to RGB and passed to the
+    engine as a NumPy array. Returns ``""`` when the engine reports nothing.
+    Raises:
+        ImportError: NumPy, Pillow or the OCR engine cannot be imported.
+    """
+    try:
+        import numpy as np  # type: ignore
+        from PIL import Image  # type: ignore
+    except ImportError as missing:  # pragma: no cover - optional dep
+        raise ImportError(_NO_OCR_MSG) from missing
+    recognizer = _engine()
+    rgb = Image.open(io.BytesIO(blob)).convert("RGB")
+    detections, _ = recognizer(np.array(rgb))
+    if not detections:
+        return ""
+    # rapidocr returns [box, text, score]; entries of any other shape are skipped.
+    texts: list[str] = [
+        str(entry[1])
+        for entry in detections
+        if isinstance(entry, (list, tuple)) and len(entry) >= 2
+    ]
+    return "\n".join(texts).strip()
+def ocr_pdf_page(page: Any, *, dpi: int = 200) -> str:
+    """Render a PyMuPDF page to a raster image and OCR it.
+    Args:
+        page: object with a PyMuPDF-style ``get_pixmap(matrix=...)`` method.
+        dpi: render resolution; the zoom factor is ``dpi / 72`` on both axes.
+    Returns:
+        The recognized text of the rendered page.
+    Raises:
+        ValueError: ``dpi`` is not positive.
+        ImportError: PyMuPDF (``fitz``) or an OCR dependency is missing.
+    """
+    if dpi <= 0:
+        raise ValueError(f"dpi must be positive, got {dpi!r}")
+    try:
+        import fitz  # type: ignore
+    except ImportError as exc:  # pragma: no cover
+        raise ImportError(
+            "OCR of PDF pages requires the [pdf] extra (PyMuPDF)."
+        ) from exc
+    zoom = dpi / 72.0
+    # Use the module bound by the import above rather than importing it again.
+    pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
+    return ocr_image_bytes(pix.tobytes("png"))