rc-docparser 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docparser/image.py ADDED
@@ -0,0 +1,343 @@
1
+ """Semantic image captioner via OpenAI-compatible VLM providers.
2
+
3
+ Works with any OpenAI-compatible ``/chat/completions`` endpoint. Providers are
4
+ selected via the ``provider`` argument (or the ``DOCPARSER_VLM_PROVIDER`` env
5
+ var) and resolve sensible defaults for base URL, API-key env var, and model:
6
+
7
+ - ``openrouter`` (default) - OpenRouter, ``OPENROUTER_API_KEY``
8
+ - ``openai`` - OpenAI, ``OPENAI_API_KEY``
9
+ - ``gemini`` - Google Gemini (OpenAI-compatible), ``GEMINI_API_KEY``
10
+ - ``local`` - any local server (Ollama / vLLM / LM Studio),
11
+ ``DOCPARSER_VLM_BASE_URL`` (default ``http://localhost:11434/v1``)
12
+
13
+ Caches results by ``SHA-1(image_bytes) x model`` so re-runs are free. The cache
14
+ directory is taken from a ``WorkspaceLayout`` (when supplied) or from
15
+ ``~/.cache/docparser/vlm/`` by default.
16
+
17
+ This module requires the optional ``[vlm]`` extra (``requests``). For a fully
18
+ local, network-free captioner see :mod:`docparser.localvlm` (``[localvlm]``).
19
+ """
20
+ from __future__ import annotations
21
+
22
+ import base64
23
+ import json
24
+ import os
25
+ import time
26
+ from dataclasses import dataclass
27
+ from pathlib import Path
28
+ from typing import Any
29
+
30
+ from .common import WorkspaceLayout, bytes_sha1
31
+
32
+ try:
33
+ import requests # type: ignore
34
+ except ImportError: # pragma: no cover - optional dep
35
+ requests = None # type: ignore[assignment]
36
+
37
# Defaults for OpenRouter, which is also the provider used when none is named.
DEFAULT_PROVIDER = "openrouter"
DEFAULT_MODEL_ENV = "OPENROUTER_VLM_MODEL"
DEFAULT_MODEL = "anthropic/claude-sonnet-4"
DEFAULT_BASE_URL = "https://openrouter.ai/api/v1"
DEFAULT_REFERER = "https://github.com/Research-Commons/docparser"
DEFAULT_TITLE = "docparser"


# Provider presets: defaults for base URL, API-key env var, and model. Any can
# be overridden by explicit arguments or environment variables.
# The OpenRouter entry is built from the DEFAULT_* constants above so the two
# can never drift apart.
PROVIDERS: dict[str, dict[str, str]] = {
    DEFAULT_PROVIDER: dict(
        base_url=DEFAULT_BASE_URL,
        api_key_env="OPENROUTER_API_KEY",
        model_env=DEFAULT_MODEL_ENV,
        default_model=DEFAULT_MODEL,
    ),
    "openai": dict(
        base_url="https://api.openai.com/v1",
        api_key_env="OPENAI_API_KEY",
        model_env="OPENAI_VLM_MODEL",
        default_model="gpt-4o-mini",
    ),
    "gemini": dict(
        base_url="https://generativelanguage.googleapis.com/v1beta/openai",
        api_key_env="GEMINI_API_KEY",
        model_env="GEMINI_VLM_MODEL",
        default_model="gemini-1.5-flash",
    ),
    "local": dict(
        base_url="http://localhost:11434/v1",
        api_key_env="DOCPARSER_VLM_API_KEY",
        model_env="DOCPARSER_VLM_MODEL",
        default_model="llava",
    ),
}
73
+
74
+
75
def _resolve_provider(provider: str | None) -> tuple[str, dict[str, str]]:
    """Pick the provider preset to use.

    Precedence: the explicit ``provider`` argument, then the
    ``DOCPARSER_VLM_PROVIDER`` environment variable, then
    ``DEFAULT_PROVIDER``. The name is lower-cased before lookup.

    Returns the normalized provider name together with its preset mapping.
    Raises ``ValueError`` when the name is not a key of ``PROVIDERS``.
    """
    requested = provider or os.environ.get("DOCPARSER_VLM_PROVIDER") or DEFAULT_PROVIDER
    key = requested.lower()
    preset = PROVIDERS.get(key)
    if preset is None:
        raise ValueError(
            f"unknown VLM provider {key!r}; expected one of {sorted(PROVIDERS)}"
        )
    return key, preset
82
+
83
# System prompt used when the caller supplies none. It instructs the model to
# answer with a single JSON object carrying exactly these keys: caption,
# description, visible_text, tags, image_kind, domain_relevance.
# NOTE: this text is sent verbatim to the model; edit with care.
DEFAULT_SYSTEM_PROMPT = (
    "You are a meticulous research assistant building a structured corpus from "
    "documents. Given an image taken from a document, return a strict JSON "
    "object with the following keys:\n"
    " caption: one concise sentence (<=25 words) suitable as a figure caption.\n"
    " description: 2-5 sentence paragraph describing what the image shows.\n"
    " visible_text: any text that appears in the image, transcribed verbatim. "
    " Empty string if none.\n"
    " tags: 3-8 short lowercase keywords.\n"
    " image_kind: one of [diagram, plot, screenshot, photo, equation, table, "
    " handwriting, ui, other].\n"
    " domain_relevance: one sentence linking the image to the document's "
    " apparent topic; empty string if not applicable.\n"
    "Return ONLY the JSON object, no markdown fences, no commentary."
)

# User-message template used when the caller supplies none. It is filled with
# ``str.format`` and must keep the ``{doc_name}``, ``{nearby_caption}`` and
# ``{context}`` placeholders.
DEFAULT_USER_PROMPT_TEMPLATE = (
    "Document: {doc_name}\n"
    "Nearby caption text (may be empty or noisy): {nearby_caption}\n"
    "Surrounding context excerpt: {context}\n\n"
    "Now analyze the attached image and respond with the JSON object."
)
105
+
106
+
107
@dataclass
class VLMResult:
    """Normalized outcome of captioning one image.

    ``raw`` holds the full record the result was built from, ``cached`` tells
    whether it came from the on-disk cache, and ``error`` is ``None`` unless
    the captioning attempt failed.
    """

    caption: str
    description: str
    visible_text: str
    tags: list[str]
    image_kind: str
    domain_relevance: str
    raw: dict[str, Any]
    model: str
    cached: bool = False
    error: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return the result as a plain dict; ``raw`` is deliberately left out."""
        exported = (
            "caption",
            "description",
            "visible_text",
            "tags",
            "image_kind",
            "domain_relevance",
            "model",
            "cached",
            "error",
        )
        # getattr keeps the original objects (no copies), like direct access.
        return {field_name: getattr(self, field_name) for field_name in exported}
132
+
133
+
134
def _default_cache_root() -> Path:
    """Return the per-user fallback cache directory, ``~/.cache/docparser``."""
    return Path.home().joinpath(".cache", "docparser")
136
+
137
+
138
def _cache_path(image_sha1: str, model: str, cache_root: Path) -> Path:
    """Return the cache file for one ``(image, model)`` pair.

    Layout: ``<cache_root>/vlm/<safe model name>/<image_sha1>.json``.

    The model name becomes a single directory component. ``/`` is mapped to
    ``__`` (as before); other characters that are path separators or invalid
    in Windows file names (``\\ : * ? " < > |``) are mapped to ``_`` so that
    tags such as ``llava:13b`` work on every platform. A name consisting only
    of dots/spaces (e.g. ``..``) is replaced by ``_`` so the path can never
    climb out of the ``vlm`` directory.
    """
    unsafe = str.maketrans({ch: "_" for ch in '\\:*?"<>|'})
    safe_model = model.replace("/", "__").translate(unsafe)
    if not safe_model.strip(". "):
        safe_model = "_"
    return cache_root / "vlm" / safe_model / f"{image_sha1}.json"
141
+
142
+
143
def _load_cached(image_sha1: str, model: str, cache_root: Path) -> dict[str, Any] | None:
    """Load the cached record for ``(image_sha1, model)``, if there is a usable one.

    The cache is best-effort: a missing, unreadable, non-UTF-8, malformed, or
    non-object JSON file is treated as a cache miss and ``None`` is returned,
    so a damaged cache entry can never make captioning fail.
    """
    p = _cache_path(image_sha1, model, cache_root)
    try:
        # Read directly instead of checking exists() first: avoids a race with
        # concurrent writers/cleaners. FileNotFoundError is an OSError.
        record = json.loads(p.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return None
    # Callers use ``record.get(...)``; anything but a JSON object is unusable.
    return record if isinstance(record, dict) else None
151
+
152
+
153
def _save_cache(image_sha1: str, model: str, cache_root: Path, payload: dict[str, Any]) -> None:
    """Write ``payload`` as pretty-printed UTF-8 JSON to the cache file.

    The parent directories of the cache file are created when missing.
    """
    target = _cache_path(image_sha1, model, cache_root)
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    target.write_text(serialized, encoding="utf-8")
157
+
158
+
159
def _coerce(
    payload: dict[str, Any],
    *,
    raw: dict[str, Any],
    model: str,
    cached: bool,
    error: str | None = None,
) -> VLMResult:
    """Build a :class:`VLMResult` from a loosely-typed model payload.

    The payload comes from model output (or an old cache file), so nothing
    about its shape is trusted:

    - a payload that is not a dict is treated as empty;
    - missing or ``null`` text fields become ``""`` (not the string ``"None"``);
    - ``tags`` may be a list/tuple, or a single comma-separated string; any
      other type yields no tags. Tags are stripped, lower-cased, and empty
      entries are dropped;
    - ``image_kind`` falls back to ``"other"`` when missing or blank.

    ``raw``, ``model``, ``cached`` and ``error`` are passed through unchanged.
    """
    if not isinstance(payload, dict):
        payload = {}

    def text(key: str) -> str:
        value = payload.get(key)
        # JSON null must not turn into the literal text "None".
        return "" if value is None else str(value).strip()

    raw_tags = payload.get("tags")
    if isinstance(raw_tags, str):
        # Some models return "a, b, c" instead of a JSON array; iterating the
        # string directly would produce one tag per character.
        raw_tags = raw_tags.split(",")
    elif not isinstance(raw_tags, (list, tuple)):
        raw_tags = []
    normalized = (str(item).strip().lower() for item in raw_tags if item is not None)

    return VLMResult(
        caption=text("caption"),
        description=text("description"),
        visible_text=text("visible_text"),
        tags=[tag for tag in normalized if tag],
        image_kind=text("image_kind") or "other",
        domain_relevance=text("domain_relevance"),
        raw=raw,
        model=model,
        cached=cached,
        error=error,
    )
179
+
180
+
181
def _extract_json(text: str) -> dict[str, Any]:
    """Extract the JSON object from a model reply.

    Handles replies wrapped in a Markdown code fence and replies with extra
    prose around the object (the outermost ``{...}`` span is tried).

    Returns the parsed JSON object. Raises ``ValueError`` when ``text`` is not
    a string, or when no JSON *object* can be parsed from it; valid JSON of
    another type (array, number, string, ...) is rejected because callers
    need a mapping.
    """
    if not isinstance(text, str):
        raise ValueError(f"Could not parse JSON from VLM response: {text!r}"[:450])

    text = text.strip()
    if text.startswith("```"):
        # Drop the opening fence line (e.g. "```json") and, if present, the
        # closing fence line.
        lines = text.splitlines()[1:]
        if lines and lines[-1].startswith("```"):
            lines = lines[:-1]
        text = "\n".join(lines).strip()

    # Try the whole text first, then the outermost brace-delimited span.
    candidates = [text]
    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end > start:
        candidates.append(text[start : end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed

    raise ValueError(f"Could not parse JSON from VLM response: {text[:400]}")
201
+
202
+
203
def caption_image(
    image_bytes: bytes,
    *,
    mime: str = "image/png",
    doc_name: str = "",
    nearby_caption: str = "",
    context: str = "",
    provider: str | None = None,
    model: str | None = None,
    api_key: str | None = None,
    base_url: str | None = None,
    referer: str | None = None,
    title: str | None = None,
    system_prompt: str | None = None,
    user_prompt_template: str | None = None,
    layout: WorkspaceLayout | None = None,
    cache_root: Path | None = None,
    max_retries: int = 3,
    timeout: int = 90,
) -> VLMResult:
    """Caption a single image via an OpenAI-compatible VLM, with on-disk caching.

    The ``provider`` argument selects a preset (see :data:`PROVIDERS`);
    ``model``, ``api_key``, and ``base_url`` override the preset when given.
    ``OPENROUTER_BASE_URL`` (openrouter) and ``DOCPARSER_VLM_BASE_URL``
    (local) override the preset base URL via the environment.

    Cache key: ``SHA1(image_bytes) x model``. Cached at
    ``<cache_root>/vlm/<model>/<sha>.json`` where ``cache_root`` is the
    explicit argument, else ``layout.cache_dir``, else ``~/.cache/docparser``.
    Only successful results are cached.

    An API key is required for every provider except ``local``, where the
    ``Authorization`` header is simply omitted when no key is configured.

    Requests answered with HTTP 429 or 5xx, and network errors, are retried
    up to ``max_retries`` times with exponential back-off; any other failure
    ends the attempt immediately.

    Returns a :class:`VLMResult`. Failures (missing key, HTTP errors,
    unparseable replies) are reported through ``VLMResult.error`` rather than
    raised.

    Raises ``ImportError`` when ``requests`` (the ``[vlm]`` extra) is not
    installed and ``ValueError`` for an unknown provider name.
    """
    if requests is None:  # pragma: no cover - optional dep guard
        raise ImportError(
            "docparser.image.caption_image requires the [vlm] extra. "
            "Install with: pip install 'docparser[vlm]'"
        )

    provider_name, preset = _resolve_provider(provider)
    model = model or os.environ.get(preset["model_env"]) or preset["default_model"]
    api_key = api_key or os.environ.get(preset["api_key_env"])

    # Only these two providers have an environment override for the base URL.
    base_url_env = {
        "openrouter": "OPENROUTER_BASE_URL",
        "local": "DOCPARSER_VLM_BASE_URL",
    }.get(provider_name)
    env_base_url = os.environ.get(base_url_env) if base_url_env else None
    # Strip trailing slashes so the endpoint never contains "//chat/...".
    base_url = (base_url or env_base_url or preset["base_url"]).rstrip("/")

    referer = referer or os.environ.get("OPENROUTER_REFERER", DEFAULT_REFERER)
    title = title or os.environ.get("OPENROUTER_TITLE", DEFAULT_TITLE)
    system_prompt = system_prompt or DEFAULT_SYSTEM_PROMPT
    user_prompt_template = user_prompt_template or DEFAULT_USER_PROMPT_TEMPLATE

    if cache_root is None:
        cache_root = layout.cache_dir if layout is not None else _default_cache_root()

    sha = bytes_sha1(image_bytes)
    cached = _load_cached(sha, model, cache_root)
    if isinstance(cached, dict):
        return _coerce(cached.get("payload", {}), raw=cached, model=model, cached=True)

    # Local servers (Ollama / vLLM / LM Studio) normally run without auth, so
    # a key is only mandatory for the hosted providers.
    if not api_key and provider_name != "local":
        return _coerce(
            {},
            raw={},
            model=model,
            cached=False,
            error=f"{preset['api_key_env']} not set; skipping VLM call.",
        )

    b64 = base64.b64encode(image_bytes).decode("ascii")
    user_msg = user_prompt_template.format(
        doc_name=doc_name or "(unknown)",
        nearby_caption=(nearby_caption or "").strip()[:500] or "(none)",
        context=(context or "").strip()[:1500] or "(none)",
    )
    body = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": user_msg},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:{mime};base64,{b64}"},
                    },
                ],
            },
        ],
        "temperature": 0.2,
        "max_tokens": 1800,
        "response_format": {"type": "json_object"},
    }
    headers = {
        "Content-Type": "application/json",
        "HTTP-Referer": referer,
        "X-Title": title,
    }
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"
    url = f"{base_url}/chat/completions"

    last_err: str | None = None
    for attempt in range(1, max_retries + 1):
        # Sleeping after the final attempt would only delay the error result.
        retry_left = attempt < max_retries
        try:
            resp = requests.post(url, headers=headers, json=body, timeout=timeout)  # type: ignore[arg-type]
        except requests.RequestException as exc:
            last_err = f"network error: {exc}"
            if retry_left:
                time.sleep(min(2**attempt, 15))
            continue
        if resp.status_code == 429 or resp.status_code >= 500:
            # Rate limiting and server-side failures are worth retrying.
            last_err = f"http {resp.status_code}: {resp.text[:300]}"
            if retry_left:
                time.sleep(min(2**attempt, 20))
            continue
        if resp.status_code != 200:
            last_err = f"http {resp.status_code}: {resp.text[:600]}"
            break
        try:
            data = resp.json()
        except ValueError as exc:
            # A 200 response with a non-JSON body (e.g. an HTML error page).
            last_err = f"invalid JSON in response body: {exc}"
            break
        try:
            content = data["choices"][0]["message"]["content"]
        except (KeyError, IndexError, TypeError):
            last_err = f"unexpected response shape: {json.dumps(data)[:400]}"
            break
        if isinstance(content, list):
            # Some providers return the message as a list of typed parts.
            content = "".join(
                str(part.get("text") or "") for part in content if isinstance(part, dict)
            )
        if not isinstance(content, str):
            last_err = f"unexpected message content: {content!r}"[:400]
            break
        try:
            payload = _extract_json(content)
        except ValueError as exc:
            last_err = str(exc)
            break
        if not isinstance(payload, dict):
            last_err = f"VLM response is not a JSON object: {content[:400]}"
            break
        record = {
            "payload": payload,
            "model": model,
            "image_sha1": sha,
            "doc_name": doc_name,
            "nearby_caption": nearby_caption,
            "raw_choice": content,
        }
        _save_cache(sha, model, cache_root, record)
        return _coerce(payload, raw=record, model=model, cached=False)

    return _coerce({}, raw={}, model=model, cached=False, error=last_err or "unknown")
docparser/localvlm.py ADDED
@@ -0,0 +1,103 @@
1
+ """Fully-local, network-free image captioning via Hugging Face ``transformers``.
2
+
3
+ This is a lighter-weight alternative to the API-based captioner in
4
+ :mod:`docparser.image` for environments without internet access. It produces a
5
+ short caption (and, where the model supports it, a description) using a local
6
+ image-to-text model such as BLIP.
7
+
8
+ It honors the same ``VLMResult`` shape and the same ``SHA1(image) x model``
9
+ on-disk cache as the API captioner, so output is interchangeable.
10
+
11
+ Requires the ``[localvlm]`` extra: ``pip install 'docparser[localvlm]'``.
12
+ """
13
+ from __future__ import annotations
14
+
15
+ import io
16
+ import json
17
+ import os
18
+ from functools import lru_cache
19
+ from pathlib import Path
20
+
21
+ from .common import WorkspaceLayout, bytes_sha1
22
+ from .image import VLMResult, _cache_path, _coerce, _load_cached
23
+
24
# Model identifier used when neither the ``model`` argument nor the
# environment variable below provides one.
DEFAULT_LOCAL_MODEL = "Salesforce/blip-image-captioning-large"
# Name of the environment variable that overrides the default local model.
LOCAL_MODEL_ENV = "DOCPARSER_LOCAL_VLM_MODEL"
26
+
27
+
28
@lru_cache(maxsize=2)
def _load_pipeline(model: str):
    """Create the ``image-to-text`` pipeline for ``model``.

    Results are memoized (at most two models are kept) so repeated calls with
    the same model name reuse the already-built pipeline. Raises
    ``ImportError`` with an install hint when ``transformers`` is missing.
    """
    try:
        from transformers import pipeline as build_pipeline  # type: ignore
    except ImportError as exc:  # pragma: no cover - optional dep
        hint = (
            "docparser.localvlm requires the [localvlm] extra. "
            "Install with: pip install 'docparser[localvlm]'"
        )
        raise ImportError(hint) from exc
    return build_pipeline("image-to-text", model=model)
38
+
39
+
40
def _save_cache(image_sha1: str, model: str, cache_root: Path, payload: dict) -> None:
    """Store ``payload`` as indented UTF-8 JSON in the cache file for this image/model.

    Missing parent directories are created first.
    """
    destination = _cache_path(image_sha1, model, cache_root)
    destination.parent.mkdir(parents=True, exist_ok=True)
    text = json.dumps(payload, indent=2, ensure_ascii=False)
    destination.write_text(text, encoding="utf-8")
44
+
45
+
46
def caption_image_local(
    image_bytes: bytes,
    *,
    mime: str = "image/png",
    doc_name: str = "",
    nearby_caption: str = "",
    context: str = "",
    model: str | None = None,
    layout: WorkspaceLayout | None = None,
    cache_root: Path | None = None,
    max_new_tokens: int = 60,
) -> VLMResult:
    """Caption an image with a local transformers image-to-text model.

    ``mime``, ``nearby_caption`` and ``context`` are unused; they are accepted
    for signature parity with :func:`docparser.image.caption_image`.
    ``doc_name`` is only stored in the cache record.

    The model is taken from ``model``, else the ``DOCPARSER_LOCAL_VLM_MODEL``
    environment variable, else ``DEFAULT_LOCAL_MODEL``. Results are cached at
    ``<cache_root>/vlm/<model>/<sha>.json`` where ``cache_root`` is the
    explicit argument, else ``layout.cache_dir``, else ``~/.cache/docparser``.

    Returns a :class:`VLMResult`. Runtime failures (undecodable image, model
    error, unexpected pipeline output) are reported through
    ``VLMResult.error`` and are never cached.

    Raises ``ImportError`` when Pillow or ``transformers`` is not installed.
    """
    _ = (mime, nearby_caption, context)
    model = model or os.environ.get(LOCAL_MODEL_ENV) or DEFAULT_LOCAL_MODEL

    if cache_root is None:
        cache_root = layout.cache_dir if layout is not None else (Path.home() / ".cache" / "docparser")

    sha = bytes_sha1(image_bytes)
    cached = _load_cached(sha, model, cache_root)
    if isinstance(cached, dict):
        return _coerce(cached.get("payload", {}), raw=cached, model=model, cached=True)

    try:
        from PIL import Image  # type: ignore
    except ImportError as exc:  # pragma: no cover - optional dep
        raise ImportError(
            "docparser.localvlm requires Pillow (bundled with the [localvlm] extra)."
        ) from exc

    pipe = _load_pipeline(model)

    # Report a corrupt/unsupported image the same way as a model failure
    # instead of letting the decoder exception escape.
    try:
        img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    except (OSError, ValueError, Image.DecompressionBombError) as exc:
        return _coerce(
            {}, raw={}, model=model, cached=False,
            error=f"local vlm error: could not decode image: {exc}",
        )

    try:
        out = pipe(img, max_new_tokens=max_new_tokens)
    except Exception as exc:  # pragma: no cover - runtime/model failure
        return _coerce({}, raw={}, model=model, cached=False, error=f"local vlm error: {exc}")

    # Expected shape: [{"generated_text": "..."}]. Anything else is an error;
    # caching it would pin an empty caption for this image forever.
    if not (isinstance(out, list) and out and isinstance(out[0], dict)):
        return _coerce(
            {}, raw={}, model=model, cached=False,
            error=f"local vlm error: unexpected pipeline output: {out!r}"[:400],
        )
    caption = str(out[0].get("generated_text") or "").strip()

    # The local model only yields a caption; the remaining fields get neutral
    # values so the result has the same shape as the API captioner's.
    payload = {
        "caption": caption,
        "description": caption,
        "visible_text": "",
        "tags": [],
        "image_kind": "other",
        "domain_relevance": "",
    }
    record = {"payload": payload, "model": model, "image_sha1": sha, "doc_name": doc_name}
    _save_cache(sha, model, cache_root, record)
    return _coerce(payload, raw=record, model=model, cached=False)
docparser/ocr.py ADDED
@@ -0,0 +1,68 @@
1
+ """OCR helpers for scanned PDFs and images.
2
+
3
+ Uses ``rapidocr-onnxruntime`` by default: a pure-pip ONNX OCR engine that needs
4
+ no system binaries (unlike Tesseract). The engine is created once and reused.
5
+
6
+ Requires the ``[ocr]`` extra: ``pip install 'docparser[ocr]'``.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import io
11
+ from functools import lru_cache
12
+ from typing import Any
13
+
14
# Shared ImportError message raised when an OCR dependency cannot be imported.
_NO_OCR_MSG = (
    "OCR requires the [ocr] extra. Install with: pip install 'docparser[ocr]'"
)
17
+
18
+
19
@lru_cache(maxsize=1)
def _engine():
    """Return the shared ``RapidOCR`` instance, creating it on first use.

    Raises ``ImportError`` with an install hint when ``rapidocr_onnxruntime``
    is not installed.
    """
    try:
        from rapidocr_onnxruntime import RapidOCR as engine_cls  # type: ignore
    except ImportError as exc:  # pragma: no cover - optional dep
        raise ImportError(_NO_OCR_MSG) from exc
    return engine_cls()
26
+
27
+
28
def ocr_available() -> bool:
    """Tell whether the ``rapidocr_onnxruntime`` package can be imported."""
    try:
        import rapidocr_onnxruntime  # type: ignore # noqa: F401
    except ImportError:
        importable = False
    else:
        importable = True
    return importable
34
+
35
+
36
def ocr_image_bytes(blob: bytes) -> str:
    """Run OCR on encoded image bytes and return the text, one entry per line.

    Returns ``""`` when the engine reports no result. Raises ``ImportError``
    when numpy, Pillow, or the OCR engine is not installed.
    """
    try:
        import numpy as np  # type: ignore
        from PIL import Image  # type: ignore
    except ImportError as exc:  # pragma: no cover - optional dep
        raise ImportError(_NO_OCR_MSG) from exc

    recognizer = _engine()
    pixels = np.array(Image.open(io.BytesIO(blob)).convert("RGB"))
    detections, _ = recognizer(pixels)
    if not detections:
        return ""
    # rapidocr returns [box, text, score]; keep the text of well-formed items.
    texts = [
        str(entry[1])
        for entry in detections
        if isinstance(entry, (list, tuple)) and len(entry) >= 2
    ]
    return "\n".join(texts).strip()
56
+
57
+
58
def ocr_pdf_page(page: Any, *, dpi: int = 200) -> str:
    """Rasterize a PyMuPDF page at ``dpi`` and return its OCR text.

    Raises ``ImportError`` when PyMuPDF (``fitz``) is not installed.
    """
    try:
        import fitz  # type: ignore
    except ImportError as exc:  # pragma: no cover
        raise ImportError(
            "OCR of PDF pages requires the [pdf] extra (PyMuPDF)."
        ) from exc
    # The scale factor is relative to 72 dpi.
    scale = dpi / 72.0
    pixmap = page.get_pixmap(matrix=fitz.Matrix(scale, scale))
    png_bytes = pixmap.tobytes("png")
    return ocr_image_bytes(png_bytes)