rc-docparser 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docparser/xlsx.py ADDED
@@ -0,0 +1,319 @@
1
+ """XLSX parser: emits Markdown + JSON for every sheet/row/column.
2
+
3
+ Preserves cell value, openpyxl ``data_type``, formula (separate pass with
4
+ ``data_only=False``), hyperlinks, comments, merged ranges, frozen panes, and
5
+ embedded images.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import datetime as _dt
10
+ import io
11
+ from collections.abc import Callable
12
+ from pathlib import Path
13
+ from typing import Any
14
+
15
+ from openpyxl import load_workbook
16
+ from openpyxl.cell.cell import Cell
17
+ from openpyxl.utils import get_column_letter
18
+ from openpyxl.worksheet.worksheet import Worksheet
19
+
20
+ from .common import (
21
+ WorkspaceLayout,
22
+ bytes_sha1,
23
+ file_sha1,
24
+ truncate,
25
+ utc_now_iso,
26
+ write_json,
27
+ write_text,
28
+ )
29
+
30
+
31
def _value_to_jsonable(v: Any) -> Any:
    """Coerce a cell value into something ``json`` can serialise.

    ``None``, booleans, numbers and strings pass through untouched. Dates,
    datetimes and times become ISO-8601 strings, timedeltas become a number of
    seconds, and any other object falls back to ``str(v)``.
    """
    # Values that are already JSON-native are returned as-is.
    if v is None or isinstance(v, (bool, int, float, str)):
        return v
    if isinstance(v, _dt.timedelta):
        return v.total_seconds()
    if isinstance(v, (_dt.datetime, _dt.date, _dt.time)):
        return v.isoformat()
    return str(v)
43
+
44
+
45
def _cell_record(cell: Cell, formulas_ws: Worksheet | None) -> dict[str, Any]:
    """Build the JSON-ready record for a single worksheet cell.

    Args:
        cell: Cell taken from the workbook loaded with ``data_only=True``. It
            supplies the coordinate, value, data type, number format,
            hyperlink and comment.
        formulas_ws: The matching sheet from the ``data_only=False`` load, used
            to look up the formula behind the cell, or ``None`` to skip the
            formula lookup.

    Returns:
        A dict that always has ``addr``, ``row``, ``col``, ``col_letter``,
        ``value`` and ``data_type``. ``number_format`` is added when it is set
        and not ``"General"``; ``hyperlink``, ``formula`` and ``comment`` are
        added only when present.
    """
    rec: dict[str, Any] = {
        "addr": cell.coordinate,
        "row": cell.row,
        "col": cell.column,
        "col_letter": get_column_letter(cell.column),
        "value": _value_to_jsonable(cell.value),
        "data_type": cell.data_type,
    }

    number_format = cell.number_format
    if number_format and number_format != "General":
        rec["number_format"] = number_format

    if cell.hyperlink is not None:
        target = getattr(cell.hyperlink, "target", None)
        if target:
            rec["hyperlink"] = target

    if formulas_ws is not None:
        try:
            f_val = formulas_ws[cell.coordinate].value
            # Array formulas are loaded by openpyxl as an object that carries
            # the formula string on ``.text`` instead of as a plain ``str``;
            # unwrap it so those formulas are not silently dropped.
            if not isinstance(f_val, str):
                f_val = getattr(f_val, "text", None)
            if isinstance(f_val, str) and f_val.startswith("="):
                rec["formula"] = f_val
        except Exception:
            # Formula capture is deliberately best-effort: a failed lookup
            # must not prevent the cell's value from being recorded.
            pass

    if cell.comment is not None:
        try:
            rec["comment"] = str(cell.comment.text)
        except Exception:
            # Same best-effort policy as for formulas.
            pass

    return rec
74
+
75
+
76
def _rewind(stream: Any) -> None:
    """Move *stream* back to offset 0, ignoring streams that cannot seek."""
    try:
        stream.seek(0)
    except Exception:
        pass


def _extract_images(
    ws: Worksheet,
    asset_dir: Path,
    layout: WorkspaceLayout,
    source: Path,
    start_seq: int,
    write_outputs: bool,
) -> list[dict[str, Any]]:
    """Collect the images embedded in *ws* and optionally write them to disk.

    Args:
        ws: Worksheet whose ``_images`` list is scanned.
        asset_dir: Directory that receives the extracted image files.
        layout: Workspace layout, used to compute the asset path recorded in
            each image entry.
        source: Path of the workbook being parsed.
        start_seq: Last image sequence number already used; numbering
            continues from ``start_seq + 1``.
        write_outputs: When false nothing is written, but records are still
            returned.

    Returns:
        One dict per extracted image with ``seq``, ``sha1``, ``ext``,
        ``asset_path``, ``abs_asset_path``, ``sheet``, ``anchor_cell`` and the
        raw bytes under ``blob``. Images that yield no data, or that raise
        while being processed, are skipped.
    """
    images: list[dict[str, Any]] = []
    seq = start_seq
    for img in list(getattr(ws, "_images", []) or []):
        try:
            ref = img.ref
            data: bytes | None = None
            if hasattr(ref, "read"):
                # Rewind BEFORE reading: if the stream position was advanced
                # earlier (e.g. while the image header was inspected), a plain
                # read() would return truncated bytes and a wrong hash.
                _rewind(ref)
                data = ref.read()
                # Leave the stream at the start for any later consumer.
                _rewind(ref)
            elif isinstance(ref, (bytes, bytearray)):
                data = bytes(ref)
            else:
                pil_img = getattr(img, "image", None)
                if pil_img is not None:
                    buf = io.BytesIO()
                    pil_img.save(buf, format=pil_img.format or "PNG")
                    data = buf.getvalue()
            if not data:
                continue

            seq += 1
            sha = bytes_sha1(data)
            ext = (Path(getattr(img, "path", "") or "").suffix.lstrip(".") or "png").lower()
            asset_name = f"img-{ws.title.replace('/', '_')}-{seq:03d}-{sha[:10]}.{ext}"
            asset_path = asset_dir / asset_name
            if write_outputs and not asset_path.exists():
                asset_path.write_bytes(data)

            # The anchor's ``_from`` marker is zero-based; convert it to an
            # A1-style address.
            anchor = getattr(img, "anchor", None)
            anchor_cell = None
            try:
                if anchor and hasattr(anchor, "_from"):
                    anchor_cell = (
                        f"{get_column_letter(anchor._from.col + 1)}{anchor._from.row + 1}"
                    )
            except Exception:
                anchor_cell = None

            images.append(
                {
                    "seq": seq,
                    "sha1": sha,
                    "ext": ext,
                    "asset_path": layout.relpath_from_parsed(asset_path, source),
                    "abs_asset_path": str(asset_path),
                    "sheet": ws.title,
                    "anchor_cell": anchor_cell,
                    "blob": data,
                }
            )
        except Exception:
            # Best-effort: one unreadable image must not abort the whole sheet.
            continue
    return images
132
+
133
+
134
def _md_cell(v: Any) -> str:
    """Render a cell value as text that is safe inside a Markdown table cell.

    ``None`` becomes an empty string and booleans become ``TRUE``/``FALSE``.
    Everything else is stringified, has pipes escaped and line breaks
    flattened, and is then passed through ``truncate`` with a limit of 400.
    """
    if v is None:
        return ""
    if isinstance(v, bool):
        if v:
            return "TRUE"
        return "FALSE"

    text = str(v)
    # Applied in order: escape the column delimiter, then flatten line breaks.
    for old, new in (("|", "\\|"), ("\r", " "), ("\n", " <br> ")):
        text = text.replace(old, new)
    return truncate(text, 400)
142
+
143
+
144
def parse_xlsx(
    source: Path | str,
    layout: WorkspaceLayout | None = None,
    *,
    captioner: Callable[..., dict[str, Any]] | None = None,
    write_outputs: bool = True,
) -> dict[str, Any]:
    """Parse an XLSX workbook into Markdown + JSON.

    See :func:`docparser.parse_docx` for parameter conventions.

    Args:
        source: Path to the workbook.
        layout: Workspace layout that decides where output goes; a default
            ``WorkspaceLayout()`` is used when omitted.
        captioner: Optional callable invoked once per embedded image with the
            keyword arguments ``image_bytes``, ``mime``, ``doc_name``,
            ``nearby_caption`` and ``context``. Its result is stored under the
            image's ``semantic`` key; if it raises, ``{"error": ...}`` is
            stored instead.
        write_outputs: When true, ``document.md`` and ``document.json`` are
            written to the parsed directory and images to the assets directory.

    Returns:
        The JSON payload: ``source``, ``parsed_at``, ``sheet_names``,
        ``sheets`` and ``stats``.
    """
    source = Path(source)
    layout = layout or WorkspaceLayout()
    real_source = source.resolve()

    # Two loads of the same file: one exposes cached values, the other exposes
    # the formulas behind them.
    wb_values = load_workbook(filename=str(real_source), data_only=True, read_only=False)
    wb_formulas = load_workbook(filename=str(real_source), data_only=False, read_only=False)

    out_dir = layout.parsed_dir_for(source)
    asset_dir = layout.assets_dir_for(source)
    if write_outputs:
        out_dir.mkdir(parents=True, exist_ok=True)
        asset_dir.mkdir(parents=True, exist_ok=True)

    # Hash the file and take the timestamp once so the Markdown header and the
    # JSON payload always agree (and the file is not hashed twice).
    source_sha1 = file_sha1(real_source)
    parsed_at = utc_now_iso()

    sheets_payload: list[dict[str, Any]] = []
    md_lines: list[str] = [f"# {source.stem}", ""]
    md_lines.append(
        f"> Source: `{source.name}` \u00b7 sha1 `{source_sha1[:12]}` "
        f"\u00b7 parsed `{parsed_at}` \u00b7 sheets: {len(wb_values.sheetnames)}"
    )
    md_lines.append("")
    md_lines.append("## Sheets")
    md_lines.append("")
    for s in wb_values.sheetnames:
        md_lines.append(f"- [{s}](#sheet-{s.lower().replace(' ', '-')})")
    md_lines.append("")

    image_seq = 0
    for sheet_name in wb_values.sheetnames:
        ws = wb_values[sheet_name]

        # ``sheetnames`` also lists chart sheets, which have no cell grid
        # (no ``max_row`` / ``iter_rows``). Record an empty placeholder so the
        # payload stays aligned with ``sheet_names`` instead of crashing.
        if not hasattr(ws, "iter_rows"):
            sheets_payload.append(
                {
                    "name": sheet_name,
                    "max_row": 0,
                    "max_col": 0,
                    "frozen_panes": None,
                    "merged_ranges": [],
                    "n_nonempty_cells": 0,
                    "rows": [],
                    "images": [],
                }
            )
            md_lines.append(f"## Sheet: {sheet_name}")
            md_lines.append("")
            md_lines.append("_(chart sheet: no cell data)_")
            md_lines.append("")
            continue

        ws_f = wb_formulas[sheet_name] if sheet_name in wb_formulas.sheetnames else None
        max_row = ws.max_row or 0
        max_col = ws.max_column or 0

        rows_records: list[list[dict[str, Any]]] = []
        any_value = False
        for row in ws.iter_rows(min_row=1, max_row=max_row, max_col=max_col):
            row_recs = []
            for cell in row:
                rec = _cell_record(cell, ws_f)
                row_recs.append(rec)
                if rec["value"] not in (None, "") or rec.get("formula"):
                    any_value = True
            rows_records.append(row_recs)

        merged = [str(r) for r in (ws.merged_cells.ranges or [])]
        frozen = ws.freeze_panes
        sheet_images = _extract_images(ws, asset_dir, layout, source, image_seq, write_outputs)
        # Image numbering is continuous across sheets.
        image_seq = max([image_seq] + [im["seq"] for im in sheet_images])

        sheet_payload = {
            "name": sheet_name,
            "max_row": max_row,
            "max_col": max_col,
            "frozen_panes": frozen,
            "merged_ranges": merged,
            "n_nonempty_cells": sum(
                1 for row in rows_records for c in row if c["value"] not in (None, "")
            ),
            "rows": rows_records,
            "images": [{k: v for k, v in im.items() if k != "blob"} for im in sheet_images],
        }
        sheets_payload.append(sheet_payload)

        md_lines.append(f"## Sheet: {sheet_name}")
        md_lines.append("")
        md_lines.append(
            f"_dimensions:_ {max_row} rows \u00d7 {max_col} cols "
            f"\u00b7 _frozen:_ `{frozen or 'none'}` "
            f"\u00b7 _merged ranges:_ {len(merged)} "
            f"\u00b7 _images:_ {len(sheet_images)}"
        )
        md_lines.append("")
        if not any_value and not sheet_images:
            md_lines.append("_(empty sheet)_")
            md_lines.append("")
            continue

        # Header heuristic: the first row holding any non-empty value.
        header_idx = 0
        for i, row in enumerate(rows_records):
            if any(c["value"] not in (None, "") for c in row):
                header_idx = i
                break
        header = rows_records[header_idx] if rows_records else []
        body = rows_records[header_idx + 1 :]

        ncols = max_col
        header_cells = [
            _md_cell(header[c]["value"] if c < len(header) else "") for c in range(ncols)
        ]
        if not any(h.strip() for h in header_cells):
            # No usable header text: fall back to column letters.
            header_cells = [get_column_letter(c + 1) for c in range(ncols)]
        md_lines.append("| " + " | ".join(header_cells) + " |")
        md_lines.append("| " + " | ".join(["---"] * ncols) + " |")
        for row in body:
            cells = []
            for c in range(ncols):
                v = row[c]["value"] if c < len(row) else ""
                f = row[c].get("formula") if c < len(row) else None
                # Show the formula only when there is no cached value.
                cells.append(_md_cell(f if f and (v is None or v == "") else v))
            md_lines.append("| " + " | ".join(cells) + " |")
        md_lines.append("")

        if sheet_images:
            md_lines.append(f"### Images in `{sheet_name}`")
            md_lines.append("")
            for im in sheet_images:
                rel = im["asset_path"]
                cap = None
                if captioner is not None:
                    try:
                        cap = captioner(
                            image_bytes=im["blob"],
                            mime="image/" + (im["ext"] if im["ext"] != "jpg" else "jpeg"),
                            doc_name=f"{source.name} :: sheet {sheet_name}",
                            nearby_caption=f"Anchor cell: {im.get('anchor_cell') or 'unknown'}",
                            context=f"Workbook image inside sheet '{sheet_name}'.",
                        )
                    except Exception as exc:
                        cap = {"error": str(exc)}
                im["semantic"] = cap
                alt = (cap or {}).get("caption") or f"image-{im['seq']}"
                md_lines.append(f"![{alt}]({rel})")
                if cap and cap.get("description"):
                    md_lines.append("")
                    md_lines.append(f"> **VLM caption.** {cap.get('caption','')}")
                    md_lines.append(">")
                    md_lines.append(f"> {cap.get('description','')}")
                    if cap.get("visible_text"):
                        md_lines.append(">")
                        # Coerce to str: the captioner's output is not
                        # validated and rendering runs outside its try block.
                        vt = str(cap["visible_text"]).replace("\n", "\n> ")
                        md_lines.append(f"> *Visible text:* {vt}")
                    if cap.get("tags"):
                        md_lines.append(">")
                        md_lines.append("> *Tags:* " + ", ".join(str(t) for t in cap["tags"]))
                md_lines.append("")
            # Refresh the payload so it includes the ``semantic`` results while
            # still leaving the raw image bytes out of the JSON.
            sheet_payload["images"] = [
                {k: v for k, v in im.items() if k != "blob"} for im in sheet_images
            ]

    md_text = "\n".join(md_lines).rstrip() + "\n"

    json_payload = {
        "source": {
            "filename": source.name,
            "absolute_path": str(real_source),
            "sha1": source_sha1,
            "size_bytes": real_source.stat().st_size,
            "kind": "xlsx",
        },
        "parsed_at": parsed_at,
        "sheet_names": list(wb_values.sheetnames),
        "sheets": sheets_payload,
        "stats": {
            "n_sheets": len(sheets_payload),
            "n_rows_total": sum(s["max_row"] for s in sheets_payload),
            "n_nonempty_cells_total": sum(s["n_nonempty_cells"] for s in sheets_payload),
            "n_images_total": sum(len(s["images"]) for s in sheets_payload),
        },
    }

    if write_outputs:
        write_text(out_dir / "document.md", md_text)
        write_json(out_dir / "document.json", json_payload)

    return json_payload
@@ -0,0 +1,344 @@
1
+ Metadata-Version: 2.4
2
+ Name: rc-docparser
3
+ Version: 0.2.0
4
+ Summary: Convert research literature (.docx, .xlsx, .pdf, .html, .pptx, .epub, .txt, .md, .csv) into structured Markdown + JSON corpora, with optional VLM image semantic captioning.
5
+ Project-URL: Homepage, https://github.com/Research-Commons/docparser
6
+ Project-URL: Repository, https://github.com/Research-Commons/docparser
7
+ Project-URL: Issues, https://github.com/Research-Commons/docparser/issues
8
+ Project-URL: Changelog, https://github.com/Research-Commons/docparser/blob/main/CHANGELOG.md
9
+ Author-email: Research Commons <shubhankitsingh@researchcommons.ai>
10
+ License: MIT License
11
+
12
+ Copyright (c) 2026 Research Commons
13
+
14
+ Permission is hereby granted, free of charge, to any person obtaining a copy
15
+ of this software and associated documentation files (the "Software"), to deal
16
+ in the Software without restriction, including without limitation the rights
17
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
18
+ copies of the Software, and to permit persons to whom the Software is
19
+ furnished to do so, subject to the following conditions:
20
+
21
+ The above copyright notice and this permission notice shall be included in all
22
+ copies or substantial portions of the Software.
23
+
24
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30
+ SOFTWARE.
31
+ License-File: LICENSE
32
+ Keywords: corpus,csv,docx,epub,html,literature,markdown,ocr,parser,pdf,pptx,rag,vlm,xlsx
33
+ Classifier: Development Status :: 4 - Beta
34
+ Classifier: Intended Audience :: Developers
35
+ Classifier: Intended Audience :: Science/Research
36
+ Classifier: License :: OSI Approved :: MIT License
37
+ Classifier: Operating System :: OS Independent
38
+ Classifier: Programming Language :: Python :: 3
39
+ Classifier: Programming Language :: Python :: 3.10
40
+ Classifier: Programming Language :: Python :: 3.11
41
+ Classifier: Programming Language :: Python :: 3.12
42
+ Classifier: Topic :: Scientific/Engineering
43
+ Classifier: Topic :: Text Processing :: Markup
44
+ Requires-Python: >=3.10
45
+ Requires-Dist: lxml>=5.3.0
46
+ Requires-Dist: openpyxl>=3.1.5
47
+ Requires-Dist: pillow>=10.4.0
48
+ Requires-Dist: python-docx>=1.1.2
49
+ Requires-Dist: python-dotenv>=1.0.1
50
+ Requires-Dist: pyyaml>=6.0.2
51
+ Requires-Dist: tqdm>=4.66.5
52
+ Provides-Extra: all
53
+ Requires-Dist: beautifulsoup4>=4.12.0; extra == 'all'
54
+ Requires-Dist: ebooklib>=0.18; extra == 'all'
55
+ Requires-Dist: numpy>=1.24.0; extra == 'all'
56
+ Requires-Dist: pdfplumber>=0.11.0; extra == 'all'
57
+ Requires-Dist: pymupdf>=1.24.0; extra == 'all'
58
+ Requires-Dist: python-pptx>=1.0.0; extra == 'all'
59
+ Requires-Dist: rapidocr-onnxruntime>=1.3.0; extra == 'all'
60
+ Requires-Dist: requests>=2.32.3; extra == 'all'
61
+ Requires-Dist: trafilatura>=1.12.0; extra == 'all'
62
+ Provides-Extra: dev
63
+ Requires-Dist: build>=1.2.0; extra == 'dev'
64
+ Requires-Dist: ebooklib>=0.18; extra == 'dev'
65
+ Requires-Dist: mypy>=1.10.0; extra == 'dev'
66
+ Requires-Dist: pandas>=2.2.0; extra == 'dev'
67
+ Requires-Dist: pdfplumber>=0.11.0; extra == 'dev'
68
+ Requires-Dist: pytest-cov>=5.0; extra == 'dev'
69
+ Requires-Dist: pytest>=8.0; extra == 'dev'
70
+ Requires-Dist: python-pptx>=1.0.0; extra == 'dev'
71
+ Requires-Dist: ruff>=0.6.0; extra == 'dev'
72
+ Requires-Dist: twine>=5.1.0; extra == 'dev'
73
+ Provides-Extra: docling
74
+ Requires-Dist: docling>=2.0.0; extra == 'docling'
75
+ Provides-Extra: epub
76
+ Requires-Dist: beautifulsoup4>=4.12.0; extra == 'epub'
77
+ Requires-Dist: ebooklib>=0.18; extra == 'epub'
78
+ Provides-Extra: html
79
+ Requires-Dist: beautifulsoup4>=4.12.0; extra == 'html'
80
+ Requires-Dist: trafilatura>=1.12.0; extra == 'html'
81
+ Provides-Extra: localvlm
82
+ Requires-Dist: pillow>=10.4.0; extra == 'localvlm'
83
+ Requires-Dist: torch>=2.2.0; extra == 'localvlm'
84
+ Requires-Dist: transformers>=4.40.0; extra == 'localvlm'
85
+ Provides-Extra: marker
86
+ Requires-Dist: marker-pdf>=1.0.0; extra == 'marker'
87
+ Provides-Extra: ocr
88
+ Requires-Dist: numpy>=1.24.0; extra == 'ocr'
89
+ Requires-Dist: rapidocr-onnxruntime>=1.3.0; extra == 'ocr'
90
+ Provides-Extra: pdf
91
+ Requires-Dist: pymupdf>=1.24.0; extra == 'pdf'
92
+ Provides-Extra: pptx
93
+ Requires-Dist: python-pptx>=1.0.0; extra == 'pptx'
94
+ Provides-Extra: pymupdf4llm
95
+ Requires-Dist: pymupdf4llm>=0.0.17; extra == 'pymupdf4llm'
96
+ Provides-Extra: tables
97
+ Requires-Dist: pdfplumber>=0.11.0; extra == 'tables'
98
+ Provides-Extra: vlm
99
+ Requires-Dist: requests>=2.32.3; extra == 'vlm'
100
+ Description-Content-Type: text/markdown
101
+
102
+ # docparser
103
+
104
+ Convert research literature (`.docx`, `.xlsx`, `.pdf`, `.html`, `.pptx`,
105
+ `.epub`, `.txt`, `.md`, `.csv`) into a clean, reproducible **Markdown + JSON**
106
+ corpus, with optional **vision-language captioning** of every embedded figure
107
+ via OpenRouter, OpenAI, Gemini, a local server, or a fully-local model.
108
+
109
+ ```text
110
+                              ┌────────────────┐
111
+ data/raw/*.docx              │ docparser      │        data/parsed/<slug>/
112
+ data/raw/*.xlsx              │ - parse_docx   │          document.md
113
+ data/raw/*.pdf        ─────► │ - parse_xlsx   │ ────►    document.json
114
+ data/raw/*.html              │ - parse_pdf    │
115
+ data/raw/*.pptx              │ - parse_html   │        data/assets/<slug>/
116
+ data/raw/*.epub              │ - parse_pptx   │          img-*.png
117
+ data/raw/*.txt|md|csv        │ - parse_epub   │
118
+                              │ - VLM caption  │
119
+                              └────────────────┘
120
+ ```
121
+
122
+ ## Install
123
+
124
+ ```bash
125
+ pip install rc-docparser # core: docx + xlsx + txt/md + csv/tsv
126
+ pip install 'rc-docparser[pdf]' # + PyMuPDF for PDFs
127
+ pip install 'rc-docparser[html]' # + trafilatura + bs4 for HTML
128
+ pip install 'rc-docparser[pptx]' # + python-pptx for PowerPoint
129
+ pip install 'rc-docparser[epub]' # + EbookLib + bs4 for EPUB
130
+ pip install 'rc-docparser[vlm]' # + requests for API VLM captions
131
+ pip install 'rc-docparser[all]' # everything above (recommended)
132
+ ```
133
+
134
+ Higher-fidelity / heavier features are separate opt-in extras (so the core
135
+ install stays small and MIT):
136
+
137
+ ```bash
138
+ pip install 'rc-docparser[tables]' # + pdfplumber for PDF table extraction
139
+ pip install 'rc-docparser[ocr]' # + rapidocr-onnxruntime for scanned PDFs
140
+ pip install 'rc-docparser[pymupdf4llm]' # PyMuPDF4LLM PDF backend (AGPL/commercial)
141
+ pip install 'rc-docparser[docling]' # IBM Docling PDF backend (MIT)
142
+ pip install 'rc-docparser[marker]' # Datalab Marker PDF backend (GPL-3.0)
143
+ pip install 'rc-docparser[localvlm]' # transformers/torch local captioning
144
+ ```
145
+
146
+ `docparser` requires Python 3.10+.
147
+
148
+ ## Quick start (library)
149
+
150
+ ```python
151
+ from docparser import WorkspaceLayout, run_all
152
+
153
+ layout = WorkspaceLayout.under("./project") # data/raw, data/parsed, data/assets, .cache
154
+ layout.ensure()
155
+
156
+ run_all(layout, use_vlm=False) # parse everything in data/raw
157
+ ```
158
+
159
+ For a single file:
160
+
161
+ ```python
162
+ from docparser import parse_path, WorkspaceLayout
163
+
164
+ layout = WorkspaceLayout.under(".")
165
+ payload = parse_path("paper.pdf", layout)
166
+ print(payload["stats"])
167
+ ```
168
+
169
+ ## Quick start (CLI)
170
+
171
+ ```bash
172
+ # parse a single file
173
+ docparser parse paper.pdf --workspace ./out --no-vlm
174
+
175
+ # walk a whole directory
176
+ docparser parse-all --workspace ./project --no-vlm
177
+
178
+ # enable VLM captioning (requires OPENROUTER_API_KEY in env or .env)
179
+ export OPENROUTER_API_KEY=sk-or-v1-...
180
+ docparser parse-all --workspace ./project --max-images 50
181
+
182
+ # higher-fidelity PDF: pick a backend, OCR scanned pages, extract tables
183
+ docparser parse paper.pdf --pdf-backend docling --ocr auto --pdf-tables --no-vlm
184
+
185
+ # caption with a different provider
186
+ docparser parse-all --workspace ./project --vlm-provider openai --vlm-model gpt-4o-mini
187
+
188
+ docparser version
189
+ ```
190
+
191
+ ## What gets captured
192
+
193
+ ### `.docx`
194
+ - Walks the document body in **document order** (paragraphs + tables + drawings).
195
+ - Preserves heading hierarchy (`section_path`) on every block.
196
+ - Extracts every embedded image to `data/assets/<slug>/` with a stable
197
+ `img-<seq>-<sha10>.<ext>` name.
198
+ - Detects figure/table captions (style `Caption` or text matching
199
+ `Figure 1: …` / `Fig. 1.` / `Table 1.`) and associates the caption with the
200
+ preceding image.
201
+ - Captures `context_before` and `context_after` for every image so the VLM has
202
+ document-grounded context.
203
+
204
+ ### `.xlsx`
205
+ - Iterates **every sheet, every row, every column**.
206
+ - For each cell stores: address, row/col indices, value, openpyxl `data_type`,
207
+ `number_format`, `hyperlink`, `comment`, and the **formula** (from a second
208
+ pass with `data_only=False`).
209
+ - Stores `merged_ranges`, `frozen_panes`, and any embedded images.
210
+ - Markdown rendering uses the first non-empty row as a header heuristic and
211
+ preserves multi-line cells with `<br>`.
212
+
213
+ ### `.pdf`
214
+ - Page-by-page text extraction in reading order via PyMuPDF's blocks API.
215
+ - Best-effort heading detection from font size (≥120% of the body-text median
216
+ promotes a line to a heading; bold flag tracked).
217
+ - Embedded raster images extracted via `doc.extract_image(xref)`.
218
+ - **Pluggable backends** (`backend="pymupdf4llm" | "docling" | "marker"`) route
219
+ conversion to a higher-fidelity engine; their Markdown is normalized into the
220
+ same block schema. Images are still extracted via PyMuPDF.
221
+ - **OCR** (`ocr="auto" | "force"`, `[ocr]` extra) recognizes text on scanned /
222
+ low-text pages; OCR'd blocks carry `"ocr": true`.
223
+ - **Tables** (`extract_tables=True`, `[tables]` extra) emit real `table` blocks
224
+ via `pdfplumber`.
225
+
226
+ ### `.html`
227
+ - Article-grade body extraction via `trafilatura`.
228
+ - Plus a **structural** BeautifulSoup walk that emits typed blocks
229
+ (`heading` / `paragraph` / `list` / `table` / `image`) so downstream RAG
230
+ layers can rely on the JSON.
231
+
232
+ ### `.pptx`
233
+ - Walks slides in presentation order; each slide becomes a section.
234
+ - Emits per-slide headings (slide title), bulleted text frames (with list
235
+ level), tables, pictures, and **speaker notes**.
236
+ - Embedded pictures extracted and optionally captioned.
237
+
238
+ ### `.epub`
239
+ - Walks the spine in reading order; per-chapter BeautifulSoup structural walk.
240
+ - Captures metadata (title/author/language), headings, paragraphs, lists,
241
+ tables, and embedded images (resolved from the EPUB image manifest).
242
+
243
+ ### `.txt` / `.md` and `.csv` / `.tsv` (core, no extras)
244
+ - Plain text is split into paragraph blocks; Markdown is passed through and
245
+ also decomposed into heading / list / code / paragraph blocks.
246
+ - CSV/TSV: delimiter sniffing, header detection, a Markdown table, and one JSON
247
+ record per row.
248
+
249
+ ### Images (`[vlm]` extra)
250
+ Each image is sent to a vision-language model (default provider OpenRouter,
251
+ model `anthropic/claude-sonnet-4`) along with its surrounding caption +
252
+ context. Any OpenAI-compatible provider works via `--vlm-provider`
253
+ (`openrouter` / `openai` / `gemini` / `local`), or use a fully-local
254
+ `transformers` model with `--vlm-provider transformers` (`[localvlm]` extra).
255
+ The model returns a strict JSON object:
256
+
257
+ ```json
258
+ {
259
+ "caption": "one-sentence figure caption",
260
+ "description": "2–5 sentence paragraph",
261
+ "visible_text": "OCR-style transcription",
262
+ "tags": ["world-model", "diagram", "..."],
263
+ "image_kind": "diagram | plot | screenshot | photo | equation | table | ...",
264
+ "domain_relevance": "how this relates to the document's topic"
265
+ }
266
+ ```
267
+
268
+ Results are cached on disk at `<cache_dir>/vlm/<model>/<sha1>.json`, keyed by
269
+ **SHA-1 of the image bytes × model**, so re-runs are free until the source
270
+ image bytes change.
271
+
272
+ ## Configuration (`.env`)
273
+
274
+ | Var | Default | Purpose |
275
+ | --- | --- | --- |
276
+ | `DOCPARSER_VLM_PROVIDER` | `openrouter` | `openrouter` / `openai` / `gemini` / `local` |
277
+ | `OPENROUTER_API_KEY` | _required for OpenRouter_ | OpenRouter key (`sk-or-...`) |
278
+ | `OPENROUTER_VLM_MODEL` | `anthropic/claude-sonnet-4` | any vision-capable OpenRouter model |
279
+ | `OPENROUTER_BASE_URL` | `https://openrouter.ai/api/v1` | override for a proxy |
280
+ | `OPENROUTER_REFERER` / `OPENROUTER_TITLE` | repo URL / `docparser` | OpenRouter attribution headers |
281
+ | `OPENAI_API_KEY` / `OPENAI_VLM_MODEL` | _required for OpenAI_ / `gpt-4o-mini` | OpenAI provider |
282
+ | `GEMINI_API_KEY` / `GEMINI_VLM_MODEL` | _required for Gemini_ / `gemini-1.5-flash` | Gemini provider |
283
+ | `DOCPARSER_VLM_BASE_URL` | `http://localhost:11434/v1` | base URL for the `local` provider |
284
+ | `DOCPARSER_VLM_API_KEY` / `DOCPARSER_VLM_MODEL` | — / `llava` | key + model for the `local` provider |
285
+ | `DOCPARSER_LOCAL_VLM_MODEL` | `Salesforce/blip-image-captioning-large` | model for the `transformers` backend |
286
+
287
+ ## API reference (highlights)
288
+
289
+ - `WorkspaceLayout(raw_dir, parsed_dir, assets_dir, cache_dir)` —
290
+ dataclass describing where parser output lives. Use `.under(root)` for the
291
+ default `data/raw + data/parsed + data/assets + .cache` layout under a root.
292
+ - `parse_docx(source, layout=None, *, captioner=None, write_outputs=True)` →
293
+ payload dict.
294
+ - `parse_xlsx(source, layout=None, *, captioner=None, write_outputs=True)` →
295
+ payload dict.
296
+ - `parse_pdf(source, layout=None, *, captioner=None, write_outputs=True, extract_images=True, backend="builtin", ocr="off", extract_tables=False)` →
297
+ payload dict. (requires `[pdf]`; backends/OCR/tables require their extras)
298
+ - `parse_html(source, layout=None, *, captioner=None, write_outputs=True, use_trafilatura=True)` →
299
+ payload dict. `source` may be a path or `http(s)://` URL. (requires `[html]`)
300
+ - `parse_pptx(source, layout=None, *, captioner=None, write_outputs=True)` →
301
+ payload dict. (requires `[pptx]`)
302
+ - `parse_epub(source, layout=None, *, captioner=None, write_outputs=True)` →
303
+ payload dict. (requires `[epub]`)
304
+ - `parse_text(source, layout=None, ...)` / `parse_csv(source, layout=None, ...)` —
305
+ core parsers for `.txt`/`.md` and `.csv`/`.tsv`.
306
+ - `parse_path(source, layout=None, **kwargs)` — dispatches by extension;
307
+ PDF-only kwargs (`backend`, `ocr`, `extract_tables`) are forwarded to PDFs.
308
+ - `run_all(layout, *, use_vlm=True, only=None, max_images=None, continue_on_error=False, vlm_provider=None, vlm_model=None, pdf_backend="builtin", ocr="off", extract_tables=False)` —
309
+ walks `layout.raw_dir`, parses everything supported, writes a top-level
310
+ `CORPUS.md` and `data/parsed/corpus.json`.
311
+ - `caption_image(image_bytes, *, mime, doc_name, nearby_caption, context, provider=None, model=None, layout=None, ...)` →
312
+ `VLMResult`. (requires `[vlm]`)
313
+
314
+ ## Development
315
+
316
+ ```bash
317
+ git clone https://github.com/Research-Commons/docparser
318
+ cd docparser
319
+ python -m venv .venv && source .venv/bin/activate
320
+ pip install -e ".[all,dev]"
321
+ pytest -ra
322
+ ruff check src tests
323
+ mypy
324
+ python -m build # produces dist/*.whl + *.tar.gz
325
+ twine check dist/*
326
+ ```
327
+
328
+ ### Publishing
329
+
330
+ CI runs lint + mypy + tests on Python 3.10-3.12 and builds the distribution on
331
+ every push/PR (`.github/workflows/ci.yml`). Pushing a version tag (e.g.
332
+ `v0.2.0`) triggers `.github/workflows/publish.yml`, which builds and uploads to
333
+ PyPI via **Trusted Publishing** (OIDC, no stored token) — configure a trusted
334
+ publisher for the project on PyPI first. To publish manually instead:
335
+
336
+ ```bash
337
+ python -m build
338
+ twine check dist/*
339
+ twine upload dist/*
340
+ ```
341
+
342
+ ## License
343
+
344
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,22 @@
1
+ docparser/__init__.py,sha256=RII8KjcE7EzeBPthWp7-8cYiih9v782lGmctSc12zlU,2307
2
+ docparser/cli.py,sha256=eDyGm1loOaut-JZh-85PU171_hKlb6XFc_UBnYu1030,6531
3
+ docparser/common.py,sha256=o7tt8OQWAGQnhFqqkq2D0wNx5eV_cNeN7WMUNVQkDbU,5231
4
+ docparser/csvtab.py,sha256=nmzWlNNqGYBrTwgJHk5Kcx0U55Fd3_EuVD-PjBt7Ecs,3960
5
+ docparser/docx.py,sha256=oSkwdp4XNY2ZIEaMOilDhg0fiYrpAto5FTJP_FIlclY,17088
6
+ docparser/epub.py,sha256=rIBiIBCMIgyo__uR2D0bMn2QEuBayoMYK4eheI2dsZY,12608
7
+ docparser/html.py,sha256=SV_zc2wdXjPkKhSdz4mBKg-c8AAyf7W6SEV2o0Zhh0w,11376
8
+ docparser/image.py,sha256=ZZk5UchqX3GtcB2U0rPGCnyL2NSx6Jmf-5k0UFtqROw,12392
9
+ docparser/localvlm.py,sha256=LAcD79mEqDBBoyc7v0Ic-W5SexqZlN4ulxMv7f9Rphg,3757
10
+ docparser/ocr.py,sha256=mWMPOx0eKm3VrDLDhZ7hQcNzj5RotxtamKVbxi9WpoA,2103
11
+ docparser/orchestrator.py,sha256=5Or11Ye3yVE4rOGA9MEviWT2QOIWXvWg-1RhkwdvbhA,9813
12
+ docparser/pdf.py,sha256=d3Zyj-zpY7EFs1sWvzDXQshQXAJGkjCIIBKkf1yCuRs,16698
13
+ docparser/pdf_backends.py,sha256=5P1PrEAGb8UZpYWfyi7D0WYHo-oWed7S3By1CTe1n2Q,3348
14
+ docparser/pptx.py,sha256=8c-aLyqnoYhJj4b_ZvA8NahvwRWPicwV5_VMlrCMNQI,12020
15
+ docparser/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
+ docparser/text.py,sha256=r3u-bk09jAfpvYq7yuM7yYOLRjY1ChA0sDi619pzZiE,5930
17
+ docparser/xlsx.py,sha256=6s-dtbyN92qFRLT3dKi_1HVuUGzCa8IyZ1RJLwwNDP8,11758
18
+ rc_docparser-0.2.0.dist-info/METADATA,sha256=IFGuaKCRvT8VZoYfo1UPWrSss_JmwFdAd0sMFfNCSCA,15512
19
+ rc_docparser-0.2.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
20
+ rc_docparser-0.2.0.dist-info/entry_points.txt,sha256=VFP7dbtsKVgtvwjo63QThpCN65nmXpp3OIvvokVz39s,49
21
+ rc_docparser-0.2.0.dist-info/licenses/LICENSE,sha256=509nFfR8HUUnDqGdHdYzy9CHL_7cCxKsR7zqZJko0N0,1073
22
+ rc_docparser-0.2.0.dist-info/RECORD,,