rc-docparser 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docparser/pdf.py ADDED
@@ -0,0 +1,430 @@
1
+ """PDF parser using PyMuPDF, with optional high-fidelity backends.
2
+
3
+ The builtin engine extracts text page-by-page (preserving reading order via
4
+ PyMuPDF's "blocks" API) and embedded raster images, using a best-effort
5
+ heading classifier based on font sizing. On top of that it offers:
6
+
7
+ - ``backend=`` route conversion to a third-party engine (pymupdf4llm /
8
+ docling / marker) for higher-fidelity Markdown (see
9
+ :mod:`docparser.pdf_backends`).
10
+ - ``ocr=`` ``"off" | "auto" | "force"`` - OCR scanned/low-text pages via
11
+ the ``[ocr]`` extra (see :mod:`docparser.ocr`).
12
+ - ``extract_tables`` use ``pdfplumber`` (the ``[tables]`` extra) to emit real
13
+ table blocks instead of flattened text.
14
+
15
+ Requires the ``[pdf]`` extra: ``pip install 'docparser[pdf]'``.
16
+ """
17
+ from __future__ import annotations
18
+
19
+ import re
20
+ from collections.abc import Callable
21
+ from pathlib import Path
22
+ from typing import Any
23
+
24
+ from .common import (
25
+ WorkspaceLayout,
26
+ bytes_sha1,
27
+ file_sha1,
28
+ truncate,
29
+ utc_now_iso,
30
+ write_json,
31
+ write_text,
32
+ )
33
+
34
# Recognises the leading label of a figure/table style caption, e.g.
# "Figure 1. " or "fig. 2: " (case-insensitive). The label number must be
# followed by whitespace, so a bare "Figure 1" does not match.
CAPTION_RE = re.compile(
    r"^\s*(figure|fig\.?|table|scheme|chart|diagram)\s*[:.\-]?\s*\d+[.\:\-]?\s+",
    flags=re.IGNORECASE,
)

# ``ocr="auto"`` treats a page as "scanned" when its extractable text is
# shorter than this many characters.
OCR_AUTO_MIN_CHARS = 80
42
+
43
+
44
+ def _percentile(values: list[float], pct: float) -> float:
45
+ if not values:
46
+ return 0.0
47
+ s = sorted(values)
48
+ k = max(0, min(len(s) - 1, round(pct * (len(s) - 1))))
49
+ return s[k]
50
+
51
+
52
+ def _render_table_md(rows: list[list[str]]) -> list[str]:
53
+ if not rows:
54
+ return []
55
+ ncols = max(len(r) for r in rows)
56
+ out = []
57
+ header = rows[0]
58
+ header_cells = [
59
+ (header[c] if c < len(header) else "").replace("|", "\\|").replace("\n", " ")
60
+ for c in range(ncols)
61
+ ]
62
+ out.append("| " + " | ".join(header_cells) + " |")
63
+ out.append("| " + " | ".join(["---"] * ncols) + " |")
64
+ for row in rows[1:]:
65
+ cells = [
66
+ (row[c] if c < len(row) else "").replace("|", "\\|").replace("\n", " ")
67
+ for c in range(ncols)
68
+ ]
69
+ out.append("| " + " | ".join(cells) + " |")
70
+ out.append("")
71
+ return out
72
+
73
+
74
def parse_pdf(
    source: Path | str,
    layout: WorkspaceLayout | None = None,
    *,
    captioner: Callable[..., dict[str, Any]] | None = None,
    write_outputs: bool = True,
    extract_images: bool = True,
    backend: str = "builtin",
    ocr: str = "off",
    extract_tables: bool = False,
) -> dict[str, Any]:
    """Parse a PDF into Markdown + JSON.

    With the builtin backend, text blocks are classified per page as
    headings, captions or paragraphs. A block is a heading when its largest
    span is at least 1.2x the median span size of the document, it is at most
    140 characters long and it does not end with a period; 1.35x and 1.6x
    select heading levels 2 and 1. Blocks matching ``CAPTION_RE`` are
    captions. External backends supply the Markdown and blocks instead.

    Parameters
    ----------
    source : Path | str
        Path of the PDF. It is resolved before being opened.
    layout : WorkspaceLayout | None
        Supplies the parsed-output directory, the asset directory and the
        relative asset paths. A default ``WorkspaceLayout()`` is used when
        omitted.
    captioner : callable | None
        Optional callable invoked once per extracted image with the keyword
        arguments ``image_bytes``, ``mime``, ``doc_name``, ``nearby_caption``
        and ``context``. Its result is read as a dict with the optional keys
        ``caption``, ``description``, ``model``, ``visible_text`` and
        ``tags``. An exception raised by the captioner is recorded as
        ``{"error": ...}`` for that image instead of aborting the parse.
    write_outputs : bool
        If True, create the output directories and write the image assets,
        ``document.md`` and ``document.json``. If False nothing is written;
        the payload is still returned.
    extract_images : bool
        If False, skip raster image extraction (faster for text-only docs).
    backend : str
        ``"builtin"`` (default) uses PyMuPDF heuristics. ``"pymupdf4llm"``,
        ``"docling"``, or ``"marker"`` route to the corresponding extra for
        higher-fidelity Markdown; images are still extracted via PyMuPDF.
    ocr : str
        ``"off"`` (default), ``"auto"`` (OCR only low-text pages), or
        ``"force"`` (OCR every page). Requires the ``[ocr]`` extra. OCR is
        only applied by the builtin backend.
    extract_tables : bool
        If True, extract tables with ``pdfplumber`` (the ``[tables]`` extra)
        and emit ``table`` blocks.

    Returns
    -------
    dict[str, Any]
        The JSON payload with the keys ``source``, ``parsed_at``,
        ``backend``, ``ocr``, ``n_pages``, ``blocks``, ``images`` and
        ``stats``.

    Raises
    ------
    ImportError
        If PyMuPDF is missing, or ``extract_tables`` is set and pdfplumber is
        missing.
    ValueError
        If ``ocr`` is not one of the accepted values, or the external backend
        runner rejects ``backend``.
    """
    try:
        import fitz  # type: ignore  # PyMuPDF
    except ImportError as exc:  # pragma: no cover
        raise ImportError(
            "docparser.pdf.parse_pdf requires the [pdf] extra. "
            "Install with: pip install 'docparser[pdf]'"
        ) from exc

    if ocr not in {"off", "auto", "force"}:
        raise ValueError(f"ocr must be 'off', 'auto', or 'force'; got {ocr!r}")

    source = Path(source)
    layout = layout or WorkspaceLayout()
    real_source = source.resolve()
    out_dir = layout.parsed_dir_for(source)
    asset_dir = layout.assets_dir_for(source)
    if write_outputs:
        out_dir.mkdir(parents=True, exist_ok=True)
        if extract_images:
            asset_dir.mkdir(parents=True, exist_ok=True)

    blocks_payload: list[dict[str, Any]] = []
    images_payload: list[dict[str, Any]] = []
    image_caption_results: dict[str, dict[str, Any]] = {}
    section_stack: list[str] = []
    state = {"image_seq": 0}  # mutable counter shared with the closure below
    md_lines: list[str] = []

    # The helpers below are closures over ``doc``; they are only called
    # inside the try block, after the document has been opened.

    # ---- shared image extraction (used by every backend) -------------------
    def process_page_images(page, page_idx: int) -> None:
        for img_info in page.get_images(full=True) or []:
            xref = img_info[0]
            try:
                img_dict = doc.extract_image(xref)
            except Exception:
                # Best effort: an image that cannot be extracted is skipped.
                continue
            if not img_dict:
                continue
            blob = img_dict.get("image")
            ext = (img_dict.get("ext") or "png").lower()
            if not blob:
                continue
            state["image_seq"] += 1
            image_seq = state["image_seq"]
            sha = bytes_sha1(blob)
            asset_name = f"img-{image_seq:03d}-{sha[:10]}.{ext}"
            asset_path = asset_dir / asset_name
            if write_outputs and not asset_path.exists():
                asset_path.write_bytes(blob)

            # Most recent text block already emitted for this page.
            ctx_before = next(
                (
                    b["text"]
                    for b in reversed(blocks_payload)
                    if b.get("page") == page_idx + 1
                    and b.get("kind") in {"paragraph", "caption", "heading"}
                ),
                "",
            )
            cap = None
            if captioner is not None:
                try:
                    cap = captioner(
                        image_bytes=blob,
                        mime=f"image/{'jpeg' if ext == 'jpg' else ext}",
                        doc_name=f"{source.name} :: page {page_idx + 1}",
                        nearby_caption="",
                        context=ctx_before,
                    )
                except Exception as exc:
                    # A failing captioner must not abort the whole parse.
                    cap = {"error": str(exc)}
                image_caption_results[str(image_seq)] = cap or {}

            rel = layout.relpath_from_parsed(asset_path, source)
            alt = (cap or {}).get("caption") or f"page-{page_idx + 1}-image-{image_seq}"
            md_lines.append(f"![{alt}]({rel})")
            if cap and cap.get("description"):
                md_lines.append("")
                md_lines.append(f"<!-- vlm: {cap.get('model','')} -->")
                md_lines.append(f"> **VLM caption.** {cap.get('caption','')}")
                md_lines.append(">")
                md_lines.append(f"> {cap.get('description','')}")
                if cap.get("visible_text"):
                    md_lines.append(">")
                    vt = cap["visible_text"].replace("\n", "\n> ")
                    md_lines.append(f"> *Visible text:* {vt}")
                if cap.get("tags"):
                    md_lines.append(">")
                    # str() keeps a non-string tag from raising in join().
                    md_lines.append("> *Tags:* " + ", ".join(str(t) for t in cap["tags"]))
            md_lines.append("")
            images_payload.append(
                {
                    "seq": image_seq,
                    "page": page_idx + 1,
                    "xref": xref,
                    "ext": ext,
                    "sha1": sha,
                    "asset_path": rel,
                    "context_before": truncate(ctx_before, 600),
                    "section_path": list(section_stack),
                    "semantic": cap,
                }
            )

    def page_tables(page_idx: int) -> list[list[list[str]]]:
        try:
            import pdfplumber  # type: ignore
        except ImportError as exc:  # pragma: no cover - optional dep
            raise ImportError(
                "extract_tables=True requires the [tables] extra. "
                "Install with: pip install 'docparser[tables]'"
            ) from exc
        out: list[list[list[str]]] = []
        # NOTE(review): the PDF is reopened on every call (once per page).
        with pdfplumber.open(str(real_source)) as pdf:
            if page_idx >= len(pdf.pages):
                return out
            for tbl in pdf.pages[page_idx].extract_tables() or []:
                norm = [[(c or "").strip() for c in row] for row in tbl]
                # Drop tables whose cells are all empty.
                if any(any(c for c in row) for row in norm):
                    out.append(norm)
        return out

    doc = fitz.open(str(real_source))
    try:
        # Hash the source once and reuse it for the Markdown header and the
        # JSON payload.
        source_sha1 = file_sha1(real_source)

        md_lines.extend(
            [
                f"# {source.stem}",
                "",
                f"> Source: `{source.name}` \u00b7 sha1 `{source_sha1[:12]}` "
                f"\u00b7 parsed `{utc_now_iso()}` \u00b7 pages: {doc.page_count} \u00b7 backend: {backend}",
                "",
            ]
        )

        if backend == "builtin":
            # First pass: collect line font sizes to set heading thresholds.
            all_sizes: list[float] = []
            for page in doc:
                td = page.get_text("dict")
                for block in td.get("blocks", []) or []:
                    if block.get("type") != 0:
                        continue
                    for line in block.get("lines", []) or []:
                        for span in line.get("spans", []) or []:
                            sz = float(span.get("size") or 0.0)
                            if sz > 0:
                                all_sizes.append(sz)
            body_size = _percentile(all_sizes, 0.5) if all_sizes else 11.0
            h_thresh = body_size * 1.2

            for page_idx, page in enumerate(doc):
                page_text_chars = 0
                td = page.get_text("dict")
                for b_idx, block in enumerate(td.get("blocks", []) or []):
                    if block.get("type") != 0:  # only text blocks here
                        continue
                    lines = block.get("lines", []) or []
                    if not lines:
                        continue
                    line_texts: list[str] = []
                    max_span_size = 0.0
                    any_bold = False
                    for line in lines:
                        spans = line.get("spans", []) or []
                        line_text = "".join(span.get("text", "") for span in spans).strip()
                        if not line_text:
                            continue
                        line_texts.append(line_text)
                        for span in spans:
                            sz = float(span.get("size") or 0.0)
                            if sz > max_span_size:
                                max_span_size = sz
                            # Flag value 16 in the span flags is read as bold.
                            if int(span.get("flags") or 0) & 16:
                                any_bold = True
                    text = "\n".join(line_texts).strip()
                    if not text:
                        continue
                    page_text_chars += len(text)
                    location = f"page[{page_idx + 1}].block[{b_idx}]"
                    is_heading = (
                        max_span_size >= h_thresh
                        and len(text) <= 140
                        and not text.endswith(".")
                    )
                    if is_heading:
                        if max_span_size >= body_size * 1.6:
                            level = 1
                        elif max_span_size >= body_size * 1.35:
                            level = 2
                        else:
                            level = 3
                        # Leave only the ancestors of the new heading.
                        while len(section_stack) >= level:
                            section_stack.pop()
                        section_stack.append(text)
                        blocks_payload.append(
                            {
                                "kind": "heading",
                                "level": level,
                                "text": text,
                                "size": max_span_size,
                                "bold": any_bold,
                                "page": page_idx + 1,
                                "location": location,
                                "section_path": list(section_stack),
                            }
                        )
                        # One extra '#' because the document title is the H1.
                        md_lines.append(f"{'#' * (level + 1)} {text}")
                        md_lines.append("")
                    else:
                        kind = "caption" if CAPTION_RE.match(text) else "paragraph"
                        blocks_payload.append(
                            {
                                "kind": kind,
                                "text": text,
                                "size": max_span_size,
                                "bold": any_bold,
                                "page": page_idx + 1,
                                "location": location,
                                "section_path": list(section_stack),
                            }
                        )
                        md_lines.append(f"*{text}*" if kind == "caption" else text)
                        md_lines.append("")

                # ---- OCR fallback for scanned / low-text pages ------------
                if ocr == "force" or (ocr == "auto" and page_text_chars < OCR_AUTO_MIN_CHARS):
                    from .ocr import ocr_pdf_page

                    ocr_text = ocr_pdf_page(page)
                    if ocr_text:
                        blocks_payload.append(
                            {
                                "kind": "paragraph",
                                "text": ocr_text,
                                "page": page_idx + 1,
                                "location": f"page[{page_idx + 1}].ocr",
                                "ocr": True,
                                "section_path": list(section_stack),
                            }
                        )
                        md_lines.append(ocr_text)
                        md_lines.append("")

                # ---- tables -----------------------------------------------
                if extract_tables:
                    for t_idx, rows in enumerate(page_tables(page_idx)):
                        blocks_payload.append(
                            {
                                "kind": "table",
                                "rows": rows,
                                "page": page_idx + 1,
                                "location": f"page[{page_idx + 1}].table[{t_idx}]",
                                "section_path": list(section_stack),
                            }
                        )
                        md_lines.extend(_render_table_md(rows))

                # ---- images -----------------------------------------------
                if extract_images:
                    process_page_images(page, page_idx)
        else:
            # External high-fidelity backend: Markdown + derived blocks.
            from .pdf_backends import run_backend

            result = run_backend(backend, real_source)
            for b in result["blocks"]:
                b.setdefault("page", None)
                blocks_payload.append(b)
            if result["markdown"].strip():
                md_lines.append(result["markdown"].strip())
                md_lines.append("")

            if extract_tables:
                md_lines.append("## Tables")
                md_lines.append("")
                for page_idx in range(doc.page_count):
                    for t_idx, rows in enumerate(page_tables(page_idx)):
                        blocks_payload.append(
                            {
                                "kind": "table",
                                "rows": rows,
                                "page": page_idx + 1,
                                "location": f"page[{page_idx + 1}].table[{t_idx}]",
                                "section_path": [],
                            }
                        )
                        md_lines.extend(_render_table_md(rows))

            if extract_images:
                md_lines.append("## Extracted images")
                md_lines.append("")
                for page_idx, page in enumerate(doc):
                    process_page_images(page, page_idx)

        md_text = "\n".join(md_lines).rstrip() + "\n"

        json_payload = {
            "source": {
                "filename": source.name,
                "absolute_path": str(real_source),
                "sha1": source_sha1,
                "size_bytes": real_source.stat().st_size,
                "kind": "pdf",
            },
            "parsed_at": utc_now_iso(),
            "backend": backend,
            "ocr": ocr,
            "n_pages": doc.page_count,
            "blocks": blocks_payload,
            "images": images_payload,
            "stats": {
                "n_blocks": len(blocks_payload),
                "n_headings": sum(1 for b in blocks_payload if b["kind"] == "heading"),
                "n_paragraphs": sum(1 for b in blocks_payload if b["kind"] == "paragraph"),
                "n_captions": sum(1 for b in blocks_payload if b["kind"] == "caption"),
                "n_tables": sum(1 for b in blocks_payload if b["kind"] == "table"),
                "n_ocr_blocks": sum(1 for b in blocks_payload if b.get("ocr")),
                "n_images": len(images_payload),
                "n_captioned_images": sum(
                    1
                    for v in image_caption_results.values()
                    if v and not v.get("error") and v.get("caption")
                ),
            },
        }

        if write_outputs:
            write_text(out_dir / "document.md", md_text)
            write_json(out_dir / "document.json", json_payload)

        return json_payload
    finally:
        # Close the document on every exit path, including failures raised
        # by optional dependencies, OCR, a backend or the output writers.
        doc.close()
@@ -0,0 +1,89 @@
1
+ """Pluggable high-fidelity PDF -> Markdown backends.
2
+
3
+ The builtin PyMuPDF parser in :mod:`docparser.pdf` is fast and dependency-light
4
+ but uses heuristics for layout. For higher fidelity (tables, multi-column,
5
+ formulas) callers can route to a third-party backend. Each backend converts a
6
+ PDF to Markdown; this module normalizes that Markdown into docparser's block
7
+ schema via :func:`docparser.text._blocks_from_markdown`.
8
+
9
+ All backends are optional extras and lazily imported:
10
+
11
+ - ``pymupdf4llm`` -> ``pip install 'docparser[pymupdf4llm]'`` (note: AGPL/commercial)
12
+ - ``docling`` -> ``pip install 'docparser[docling]'`` (MIT)
13
+ - ``marker`` -> ``pip install 'docparser[marker]'`` (GPL-3.0)
14
+ """
15
+ from __future__ import annotations
16
+
17
+ from pathlib import Path
18
+ from typing import Any
19
+
20
+ from .text import _blocks_from_markdown
21
+
22
# Backend names quoted in the "unknown PDF backend" error message.
AVAILABLE_BACKENDS = (
    "builtin",
    "pymupdf4llm",
    "docling",
    "marker",
)
23
+
24
+
25
+ def _markdown_pymupdf4llm(path: Path) -> str:
26
+ try:
27
+ import pymupdf4llm # type: ignore
28
+ except ImportError as exc: # pragma: no cover - optional dep
29
+ raise ImportError(
30
+ "backend='pymupdf4llm' requires the [pymupdf4llm] extra. "
31
+ "Install with: pip install 'docparser[pymupdf4llm]'"
32
+ ) from exc
33
+ return pymupdf4llm.to_markdown(str(path))
34
+
35
+
36
+ def _markdown_docling(path: Path) -> str:
37
+ try:
38
+ from docling.document_converter import DocumentConverter # type: ignore
39
+ except ImportError as exc: # pragma: no cover - optional dep
40
+ raise ImportError(
41
+ "backend='docling' requires the [docling] extra. "
42
+ "Install with: pip install 'docparser[docling]'"
43
+ ) from exc
44
+ converter = DocumentConverter()
45
+ result = converter.convert(str(path))
46
+ return result.document.export_to_markdown()
47
+
48
+
49
+ def _markdown_marker(path: Path) -> str:
50
+ try:
51
+ from marker.config.parser import ConfigParser # type: ignore
52
+ from marker.converters.pdf import PdfConverter # type: ignore
53
+ from marker.models import create_model_dict # type: ignore
54
+ from marker.output import text_from_rendered # type: ignore
55
+ except ImportError as exc: # pragma: no cover - optional dep
56
+ raise ImportError(
57
+ "backend='marker' requires the [marker] extra. "
58
+ "Install with: pip install 'docparser[marker]'"
59
+ ) from exc
60
+ config_parser = ConfigParser({"output_format": "markdown"})
61
+ converter = PdfConverter(
62
+ config=config_parser.generate_config_dict(),
63
+ artifact_dict=create_model_dict(),
64
+ )
65
+ rendered = converter(str(path))
66
+ text, _, _ = text_from_rendered(rendered)
67
+ return text
68
+
69
+
70
# Dispatch table: external backend name -> function turning a PDF path into
# Markdown. There is no "builtin" entry here.
_BACKEND_FUNCS = dict(
    pymupdf4llm=_markdown_pymupdf4llm,
    docling=_markdown_docling,
    marker=_markdown_marker,
)
75
+
76
+
77
def run_backend(backend: str, source: Path) -> dict[str, Any]:
    """Run an external PDF backend and return normalized output.

    Parameters
    ----------
    backend : str
        Name of an external backend registered in ``_BACKEND_FUNCS``.
        ``"builtin"`` is not an external backend and is rejected here.
    source : Path
        Path of the PDF to convert.

    Returns
    -------
    dict[str, Any]
        A dict with ``markdown`` (str, empty when the backend produced
        nothing) and ``blocks`` (list of typed block dicts derived from the
        Markdown).

    Raises
    ------
    ValueError
        If *backend* is ``"builtin"`` or not a registered backend name.
    ImportError
        If the optional dependency of the selected backend is missing.
    """
    if backend == "builtin":
        # The generic message below lists "builtin" as an expected value,
        # which would contradict rejecting it, so report this case clearly.
        raise ValueError(
            "PDF backend 'builtin' is implemented by docparser.pdf.parse_pdf; "
            "run_backend only runs the external backends"
        )
    if backend not in _BACKEND_FUNCS:
        raise ValueError(
            f"unknown PDF backend {backend!r}; expected one of {AVAILABLE_BACKENDS}"
        )
    # Normalize a falsy backend result to an empty string once.
    markdown = _BACKEND_FUNCS[backend](Path(source)) or ""
    return {"markdown": markdown, "blocks": _blocks_from_markdown(markdown)}