rc-docparser 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docparser/__init__.py +87 -0
- docparser/cli.py +209 -0
- docparser/common.py +163 -0
- docparser/csvtab.py +131 -0
- docparser/docx.py +488 -0
- docparser/epub.py +349 -0
- docparser/html.py +322 -0
- docparser/image.py +343 -0
- docparser/localvlm.py +103 -0
- docparser/ocr.py +68 -0
- docparser/orchestrator.py +304 -0
- docparser/pdf.py +430 -0
- docparser/pdf_backends.py +89 -0
- docparser/pptx.py +332 -0
- docparser/py.typed +0 -0
- docparser/text.py +189 -0
- docparser/xlsx.py +319 -0
- rc_docparser-0.2.0.dist-info/METADATA +344 -0
- rc_docparser-0.2.0.dist-info/RECORD +22 -0
- rc_docparser-0.2.0.dist-info/WHEEL +4 -0
- rc_docparser-0.2.0.dist-info/entry_points.txt +2 -0
- rc_docparser-0.2.0.dist-info/licenses/LICENSE +21 -0
docparser/xlsx.py
ADDED
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
"""XLSX parser: emits Markdown + JSON for every sheet/row/column.
|
|
2
|
+
|
|
3
|
+
Preserves cell value, openpyxl ``data_type``, formula (separate pass with
|
|
4
|
+
``data_only=False``), hyperlinks, comments, merged ranges, frozen panes, and
|
|
5
|
+
embedded images.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import datetime as _dt
|
|
10
|
+
import io
|
|
11
|
+
from collections.abc import Callable
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from openpyxl import load_workbook
|
|
16
|
+
from openpyxl.cell.cell import Cell
|
|
17
|
+
from openpyxl.utils import get_column_letter
|
|
18
|
+
from openpyxl.worksheet.worksheet import Worksheet
|
|
19
|
+
|
|
20
|
+
from .common import (
|
|
21
|
+
WorkspaceLayout,
|
|
22
|
+
bytes_sha1,
|
|
23
|
+
file_sha1,
|
|
24
|
+
truncate,
|
|
25
|
+
utc_now_iso,
|
|
26
|
+
write_json,
|
|
27
|
+
write_text,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _value_to_jsonable(v: Any) -> Any:
    """Coerce a cell value into something the JSON payload can hold.

    ``None``, booleans, numbers and strings are returned unchanged.
    Dates, datetimes and times become ISO-8601 strings, a ``timedelta``
    becomes its length in seconds, and anything else falls back to
    ``str(v)``.
    """
    # JSON-native scalars (and None) pass straight through.
    if v is None or isinstance(v, (bool, int, float, str)):
        return v
    if isinstance(v, _dt.timedelta):
        return v.total_seconds()
    # ``datetime`` is a subclass of ``date``, so this covers all three.
    if isinstance(v, (_dt.date, _dt.time)):
        return v.isoformat()
    return str(v)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _cell_record(cell: Cell, formulas_ws: Worksheet | None) -> dict[str, Any]:
    """Build the JSON record for a single worksheet cell.

    Args:
        cell: The cell to describe. Its ``value`` is stored after being
            passed through ``_value_to_jsonable``.
        formulas_ws: The worksheet to look formulas up in (same
            coordinate as ``cell``), or ``None`` to skip formula lookup.

    Returns:
        A dict that always has ``addr``, ``row``, ``col``, ``col_letter``,
        ``value`` and ``data_type``. ``number_format`` (when not
        ``"General"``), ``hyperlink``, ``formula`` and ``comment`` are
        added only when present on the cell.
    """
    rec: dict[str, Any] = {
        "addr": cell.coordinate,
        "row": cell.row,
        "col": cell.column,
        "col_letter": get_column_letter(cell.column),
        "value": _value_to_jsonable(cell.value),
        "data_type": cell.data_type,
    }

    number_format = cell.number_format
    if number_format and number_format != "General":
        rec["number_format"] = number_format

    if cell.hyperlink is not None:
        target = getattr(cell.hyperlink, "target", None)
        if target:
            rec["hyperlink"] = target

    if formulas_ws is not None:
        # Formula lookup is best-effort: a failure here must not lose the
        # rest of the cell record.
        try:
            f_val = formulas_ws[cell.coordinate].value
            if not isinstance(f_val, str):
                # openpyxl represents array formulas as an object carrying
                # the formula in ``.text`` rather than as a plain string;
                # unwrap it so array formulas are not silently dropped.
                f_val = getattr(f_val, "text", None)
            if isinstance(f_val, str) and f_val.startswith("="):
                rec["formula"] = f_val
        except Exception:
            pass

    if cell.comment is not None:
        # Same best-effort policy for comments.
        try:
            rec["comment"] = str(cell.comment.text)
        except Exception:
            pass

    return rec
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _read_image_bytes(img: Any) -> bytes | None:
    """Return the raw bytes behind a worksheet image object, or ``None``."""
    ref = img.ref
    if hasattr(ref, "read"):
        # Rewind BEFORE reading: the stream may already have been read from
        # (e.g. when the image was inspected at load time), in which case
        # reading from the current position yields truncated or empty data.
        try:
            ref.seek(0)
        except Exception:
            pass  # non-seekable stream: read whatever is available
        data = ref.read()
        # Rewind again so later consumers of the same stream still work.
        try:
            ref.seek(0)
        except Exception:
            pass
        return data
    if isinstance(ref, (bytes, bytearray)):
        return bytes(ref)
    # Fall back to re-encoding an attached image object, if there is one.
    pil_img = getattr(img, "image", None)
    if pil_img is None:
        return None
    buf = io.BytesIO()
    pil_img.save(buf, format=pil_img.format or "PNG")
    return buf.getvalue()


def _extract_images(
    ws: Worksheet, asset_dir: Path, layout: WorkspaceLayout, source: Path, start_seq: int, write_outputs: bool
) -> list[dict[str, Any]]:
    """Collect the images embedded in ``ws`` and optionally write them to disk.

    Args:
        ws: Worksheet whose ``_images`` list is scanned.
        asset_dir: Directory that asset files are written into.
        layout: Used to compute the asset path relative to the parsed output.
        source: Source workbook path, forwarded to ``layout``.
        start_seq: Sequence number of the last image already emitted; the
            first image found here gets ``start_seq + 1``.
        write_outputs: When false, nothing is written to ``asset_dir``.

    Returns:
        One dict per extracted image with ``seq``, ``sha1``, ``ext``,
        ``asset_path``, ``abs_asset_path``, ``sheet``, ``anchor_cell`` and
        the raw ``blob`` bytes. Images that cannot be read are skipped.
    """
    images: list[dict[str, Any]] = []
    seq = start_seq
    for img in list(getattr(ws, "_images", []) or []):
        # Extraction is best-effort per image: one bad image must not
        # abort the whole sheet.
        try:
            data = _read_image_bytes(img)
            if not data:
                continue
            seq += 1
            sha = bytes_sha1(data)
            ext = (Path(getattr(img, "path", "") or "").suffix.lstrip(".") or "png").lower()
            asset_name = f"img-{ws.title.replace('/', '_')}-{seq:03d}-{sha[:10]}.{ext}"
            asset_path = asset_dir / asset_name
            if write_outputs and not asset_path.exists():
                asset_path.write_bytes(data)

            # Anchor offsets are zero-based; convert to an A1-style address.
            anchor = getattr(img, "anchor", None)
            anchor_cell = None
            try:
                if anchor and hasattr(anchor, "_from"):
                    anchor_cell = (
                        f"{get_column_letter(anchor._from.col + 1)}{anchor._from.row + 1}"
                    )
            except Exception:
                anchor_cell = None

            images.append(
                {
                    "seq": seq,
                    "sha1": sha,
                    "ext": ext,
                    "asset_path": layout.relpath_from_parsed(asset_path, source),
                    "abs_asset_path": str(asset_path),
                    "sheet": ws.title,
                    "anchor_cell": anchor_cell,
                    "blob": data,
                }
            )
        except Exception:
            continue
    return images
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _md_cell(v: Any) -> str:
    """Format a cell value for use inside a Markdown table cell.

    ``None`` renders as an empty string and booleans as ``TRUE`` /
    ``FALSE``. Any other value is stringified, has its pipes escaped,
    carriage returns replaced by a space and newlines by `` <br> ``, and
    is then passed through ``truncate`` with a limit of 400.
    """
    if v is None:
        return ""
    if isinstance(v, bool):
        if v:
            return "TRUE"
        return "FALSE"

    text = str(v)
    # Applied in this order: escape pipes, then flatten line breaks.
    substitutions = (("|", "\\|"), ("\r", " "), ("\n", " <br> "))
    for needle, replacement in substitutions:
        text = text.replace(needle, replacement)
    return truncate(text, 400)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _render_table_md(rows_records: list[list[dict[str, Any]]], ncols: int) -> list[str]:
    """Render a sheet's cell records as Markdown table lines."""
    # The first row holding any non-empty value is used as the header;
    # if no row qualifies, row 0 is used.
    header_idx = next(
        (
            i
            for i, row in enumerate(rows_records)
            if any(c["value"] not in (None, "") for c in row)
        ),
        0,
    )
    header = rows_records[header_idx] if rows_records else []
    body = rows_records[header_idx + 1 :]

    header_cells = [
        _md_cell(header[c]["value"] if c < len(header) else "") for c in range(ncols)
    ]
    if not any(h.strip() for h in header_cells):
        # No usable header text: fall back to column letters (A, B, ...).
        header_cells = [get_column_letter(c + 1) for c in range(ncols)]

    lines = [
        "| " + " | ".join(header_cells) + " |",
        "| " + " | ".join(["---"] * ncols) + " |",
    ]
    for row in body:
        cells = []
        for c in range(ncols):
            v = row[c]["value"] if c < len(row) else ""
            f = row[c].get("formula") if c < len(row) else None
            # Show the formula only when the cell has no value to show.
            cells.append(_md_cell(f if f and (v is None or v == "") else v))
        lines.append("| " + " | ".join(cells) + " |")
    return lines


def _caption_md(cap: dict[str, Any]) -> list[str]:
    """Render a captioner result as Markdown blockquote lines."""
    lines = [
        "",
        f"> **VLM caption.** {cap.get('caption','')}",
        ">",
        f"> {cap.get('description','')}",
    ]
    if cap.get("visible_text"):
        lines.append(">")
        # Keep multi-line text inside the blockquote.
        vt = cap["visible_text"].replace("\n", "\n> ")
        lines.append(f"> *Visible text:* {vt}")
    if cap.get("tags"):
        lines.append(">")
        lines.append("> *Tags:* " + ", ".join(cap["tags"]))
    return lines


def parse_xlsx(
    source: Path | str,
    layout: WorkspaceLayout | None = None,
    *,
    captioner: Callable[..., dict[str, Any]] | None = None,
    write_outputs: bool = True,
) -> dict[str, Any]:
    """Parse an XLSX workbook into Markdown + JSON.

    See :func:`docparser.parse_docx` for parameter conventions.

    Args:
        source: Path to the workbook.
        layout: Workspace layout used to locate the parsed and asset
            directories; a default ``WorkspaceLayout()`` is used when omitted.
        captioner: Optional callable invoked once per embedded image with the
            keyword arguments ``image_bytes``, ``mime``, ``doc_name``,
            ``nearby_caption`` and ``context``. Its result is stored under the
            image's ``semantic`` key; if it raises, ``{"error": str(exc)}`` is
            stored instead.
        write_outputs: When true, image assets, ``document.md`` and
            ``document.json`` are written; when false nothing is written.

    Returns:
        The JSON payload: ``source``, ``parsed_at``, ``sheet_names``,
        ``sheets`` and ``stats``.
    """
    source = Path(source)
    layout = layout or WorkspaceLayout()
    real_source = source.resolve()

    # Two passes over the same file: cached values, then formula text.
    wb_values = load_workbook(filename=str(real_source), data_only=True, read_only=False)
    wb_formulas = load_workbook(filename=str(real_source), data_only=False, read_only=False)

    out_dir = layout.parsed_dir_for(source)
    asset_dir = layout.assets_dir_for(source)
    if write_outputs:
        out_dir.mkdir(parents=True, exist_ok=True)
        asset_dir.mkdir(parents=True, exist_ok=True)

    # Computed once so the Markdown header and the JSON payload agree.
    source_sha1 = file_sha1(real_source)
    parsed_at = utc_now_iso()

    sheets_payload: list[dict[str, Any]] = []
    md_lines: list[str] = [f"# {source.stem}", ""]
    md_lines.append(
        f"> Source: `{source.name}` \u00b7 sha1 `{source_sha1[:12]}` "
        f"\u00b7 parsed `{parsed_at}` \u00b7 sheets: {len(wb_values.sheetnames)}"
    )
    md_lines.append("")
    md_lines.append("## Sheets")
    md_lines.append("")
    for s in wb_values.sheetnames:
        md_lines.append(f"- [{s}](#sheet-{s.lower().replace(' ', '-')})")
    md_lines.append("")

    image_seq = 0
    for sheet_name in wb_values.sheetnames:
        ws = wb_values[sheet_name]
        ws_f = wb_formulas[sheet_name] if sheet_name in wb_formulas.sheetnames else None
        max_row = ws.max_row or 0
        max_col = ws.max_column or 0

        rows_records: list[list[dict[str, Any]]] = []
        any_value = False
        for row in ws.iter_rows(min_row=1, max_row=max_row, max_col=max_col):
            row_recs = []
            for cell in row:
                rec = _cell_record(cell, ws_f)
                row_recs.append(rec)
                if rec["value"] not in (None, "") or rec.get("formula"):
                    any_value = True
            rows_records.append(row_recs)

        merged = [str(r) for r in (ws.merged_cells.ranges or [])]
        frozen = ws.freeze_panes
        sheet_images = _extract_images(ws, asset_dir, layout, source, image_seq, write_outputs)
        # Image numbering continues across sheets.
        image_seq = max((im["seq"] for im in sheet_images), default=image_seq)

        sheet_payload = {
            "name": sheet_name,
            "max_row": max_row,
            "max_col": max_col,
            "frozen_panes": frozen,
            "merged_ranges": merged,
            "n_nonempty_cells": sum(
                1 for row in rows_records for c in row if c["value"] not in (None, "")
            ),
            "rows": rows_records,
            # Raw image bytes stay out of the JSON payload.
            "images": [{k: v for k, v in im.items() if k != "blob"} for im in sheet_images],
        }
        sheets_payload.append(sheet_payload)

        md_lines.append(f"## Sheet: {sheet_name}")
        md_lines.append("")
        md_lines.append(
            f"_dimensions:_ {max_row} rows \u00d7 {max_col} cols "
            f"\u00b7 _frozen:_ `{frozen or 'none'}` "
            f"\u00b7 _merged ranges:_ {len(merged)} "
            f"\u00b7 _images:_ {len(sheet_images)}"
        )
        md_lines.append("")
        if not any_value and not sheet_images:
            md_lines.append("_(empty sheet)_")
            md_lines.append("")
            continue

        md_lines.extend(_render_table_md(rows_records, max_col))
        md_lines.append("")

        if sheet_images:
            md_lines.append(f"### Images in `{sheet_name}`")
            md_lines.append("")
            for im in sheet_images:
                rel = im["asset_path"]
                cap = None
                if captioner is not None:
                    # A failing captioner is recorded, never fatal.
                    try:
                        cap = captioner(
                            image_bytes=im["blob"],
                            mime="image/" + (im["ext"] if im["ext"] != "jpg" else "jpeg"),
                            doc_name=f"{source.name} :: sheet {sheet_name}",
                            nearby_caption=f"Anchor cell: {im.get('anchor_cell') or 'unknown'}",
                            context=f"Workbook image inside sheet '{sheet_name}'.",
                        )
                    except Exception as exc:
                        cap = {"error": str(exc)}
                    im["semantic"] = cap
                alt = (cap or {}).get("caption") or f"image-{im['seq']}"
                # Emit the actual Markdown image link to the extracted asset.
                md_lines.append(f"![{alt}]({rel})")
                if cap and cap.get("description"):
                    md_lines.extend(_caption_md(cap))
                md_lines.append("")
            # Refresh the payload so it includes the ``semantic`` results.
            sheet_payload["images"] = [
                {k: v for k, v in im.items() if k != "blob"} for im in sheet_images
            ]

    md_text = "\n".join(md_lines).rstrip() + "\n"

    json_payload = {
        "source": {
            "filename": source.name,
            "absolute_path": str(real_source),
            "sha1": source_sha1,
            "size_bytes": real_source.stat().st_size,
            "kind": "xlsx",
        },
        "parsed_at": parsed_at,
        "sheet_names": list(wb_values.sheetnames),
        "sheets": sheets_payload,
        "stats": {
            "n_sheets": len(sheets_payload),
            "n_rows_total": sum(s["max_row"] for s in sheets_payload),
            "n_nonempty_cells_total": sum(s["n_nonempty_cells"] for s in sheets_payload),
            "n_images_total": sum(len(s["images"]) for s in sheets_payload),
        },
    }

    if write_outputs:
        write_text(out_dir / "document.md", md_text)
        write_json(out_dir / "document.json", json_payload)

    return json_payload
|
|
@@ -0,0 +1,344 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: rc-docparser
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Convert research literature (.docx, .xlsx, .pdf, .html, .pptx, .epub, .txt, .md, .csv) into structured Markdown + JSON corpora, with optional VLM image semantic captioning.
|
|
5
|
+
Project-URL: Homepage, https://github.com/Research-Commons/docparser
|
|
6
|
+
Project-URL: Repository, https://github.com/Research-Commons/docparser
|
|
7
|
+
Project-URL: Issues, https://github.com/Research-Commons/docparser/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/Research-Commons/docparser/blob/main/CHANGELOG.md
|
|
9
|
+
Author-email: Research Commons <shubhankitsingh@researchcommons.ai>
|
|
10
|
+
License: MIT License
|
|
11
|
+
|
|
12
|
+
Copyright (c) 2026 Research Commons
|
|
13
|
+
|
|
14
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
15
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
16
|
+
in the Software without restriction, including without limitation the rights
|
|
17
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
18
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
19
|
+
furnished to do so, subject to the following conditions:
|
|
20
|
+
|
|
21
|
+
The above copyright notice and this permission notice shall be included in all
|
|
22
|
+
copies or substantial portions of the Software.
|
|
23
|
+
|
|
24
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
25
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
26
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
27
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
28
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
29
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
30
|
+
SOFTWARE.
|
|
31
|
+
License-File: LICENSE
|
|
32
|
+
Keywords: corpus,csv,docx,epub,html,literature,markdown,ocr,parser,pdf,pptx,rag,vlm,xlsx
|
|
33
|
+
Classifier: Development Status :: 4 - Beta
|
|
34
|
+
Classifier: Intended Audience :: Developers
|
|
35
|
+
Classifier: Intended Audience :: Science/Research
|
|
36
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
37
|
+
Classifier: Operating System :: OS Independent
|
|
38
|
+
Classifier: Programming Language :: Python :: 3
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
41
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
42
|
+
Classifier: Topic :: Scientific/Engineering
|
|
43
|
+
Classifier: Topic :: Text Processing :: Markup
|
|
44
|
+
Requires-Python: >=3.10
|
|
45
|
+
Requires-Dist: lxml>=5.3.0
|
|
46
|
+
Requires-Dist: openpyxl>=3.1.5
|
|
47
|
+
Requires-Dist: pillow>=10.4.0
|
|
48
|
+
Requires-Dist: python-docx>=1.1.2
|
|
49
|
+
Requires-Dist: python-dotenv>=1.0.1
|
|
50
|
+
Requires-Dist: pyyaml>=6.0.2
|
|
51
|
+
Requires-Dist: tqdm>=4.66.5
|
|
52
|
+
Provides-Extra: all
|
|
53
|
+
Requires-Dist: beautifulsoup4>=4.12.0; extra == 'all'
|
|
54
|
+
Requires-Dist: ebooklib>=0.18; extra == 'all'
|
|
55
|
+
Requires-Dist: numpy>=1.24.0; extra == 'all'
|
|
56
|
+
Requires-Dist: pdfplumber>=0.11.0; extra == 'all'
|
|
57
|
+
Requires-Dist: pymupdf>=1.24.0; extra == 'all'
|
|
58
|
+
Requires-Dist: python-pptx>=1.0.0; extra == 'all'
|
|
59
|
+
Requires-Dist: rapidocr-onnxruntime>=1.3.0; extra == 'all'
|
|
60
|
+
Requires-Dist: requests>=2.32.3; extra == 'all'
|
|
61
|
+
Requires-Dist: trafilatura>=1.12.0; extra == 'all'
|
|
62
|
+
Provides-Extra: dev
|
|
63
|
+
Requires-Dist: build>=1.2.0; extra == 'dev'
|
|
64
|
+
Requires-Dist: ebooklib>=0.18; extra == 'dev'
|
|
65
|
+
Requires-Dist: mypy>=1.10.0; extra == 'dev'
|
|
66
|
+
Requires-Dist: pandas>=2.2.0; extra == 'dev'
|
|
67
|
+
Requires-Dist: pdfplumber>=0.11.0; extra == 'dev'
|
|
68
|
+
Requires-Dist: pytest-cov>=5.0; extra == 'dev'
|
|
69
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
70
|
+
Requires-Dist: python-pptx>=1.0.0; extra == 'dev'
|
|
71
|
+
Requires-Dist: ruff>=0.6.0; extra == 'dev'
|
|
72
|
+
Requires-Dist: twine>=5.1.0; extra == 'dev'
|
|
73
|
+
Provides-Extra: docling
|
|
74
|
+
Requires-Dist: docling>=2.0.0; extra == 'docling'
|
|
75
|
+
Provides-Extra: epub
|
|
76
|
+
Requires-Dist: beautifulsoup4>=4.12.0; extra == 'epub'
|
|
77
|
+
Requires-Dist: ebooklib>=0.18; extra == 'epub'
|
|
78
|
+
Provides-Extra: html
|
|
79
|
+
Requires-Dist: beautifulsoup4>=4.12.0; extra == 'html'
|
|
80
|
+
Requires-Dist: trafilatura>=1.12.0; extra == 'html'
|
|
81
|
+
Provides-Extra: localvlm
|
|
82
|
+
Requires-Dist: pillow>=10.4.0; extra == 'localvlm'
|
|
83
|
+
Requires-Dist: torch>=2.2.0; extra == 'localvlm'
|
|
84
|
+
Requires-Dist: transformers>=4.40.0; extra == 'localvlm'
|
|
85
|
+
Provides-Extra: marker
|
|
86
|
+
Requires-Dist: marker-pdf>=1.0.0; extra == 'marker'
|
|
87
|
+
Provides-Extra: ocr
|
|
88
|
+
Requires-Dist: numpy>=1.24.0; extra == 'ocr'
|
|
89
|
+
Requires-Dist: rapidocr-onnxruntime>=1.3.0; extra == 'ocr'
|
|
90
|
+
Provides-Extra: pdf
|
|
91
|
+
Requires-Dist: pymupdf>=1.24.0; extra == 'pdf'
|
|
92
|
+
Provides-Extra: pptx
|
|
93
|
+
Requires-Dist: python-pptx>=1.0.0; extra == 'pptx'
|
|
94
|
+
Provides-Extra: pymupdf4llm
|
|
95
|
+
Requires-Dist: pymupdf4llm>=0.0.17; extra == 'pymupdf4llm'
|
|
96
|
+
Provides-Extra: tables
|
|
97
|
+
Requires-Dist: pdfplumber>=0.11.0; extra == 'tables'
|
|
98
|
+
Provides-Extra: vlm
|
|
99
|
+
Requires-Dist: requests>=2.32.3; extra == 'vlm'
|
|
100
|
+
Description-Content-Type: text/markdown
|
|
101
|
+
|
|
102
|
+
# docparser
|
|
103
|
+
|
|
104
|
+
Convert research literature (`.docx`, `.xlsx`, `.pdf`, `.html`, `.pptx`,
|
|
105
|
+
`.epub`, `.txt`, `.md`, `.csv`) into a clean, reproducible **Markdown + JSON**
|
|
106
|
+
corpus, with optional **vision-language captioning** of every embedded figure
|
|
107
|
+
via OpenRouter, OpenAI, Gemini, a local server, or a fully-local model.
|
|
108
|
+
|
|
109
|
+
```text
|
|
110
|
+
┌────────────────┐
|
|
111
|
+
data/raw/*.docx │ docparser │ data/parsed/<slug>/
|
|
112
|
+
data/raw/*.xlsx │ - parse_docx │ document.md
|
|
113
|
+
data/raw/*.pdf ─────► │ - parse_xlsx │ ────► document.json
|
|
114
|
+
data/raw/*.html │ - parse_pdf │
|
|
115
|
+
data/raw/*.pptx │ - parse_html │ data/assets/<slug>/
|
|
116
|
+
data/raw/*.epub │ - parse_pptx │ img-*.png
|
|
117
|
+
data/raw/*.txt|md|csv │ - parse_epub │
|
|
118
|
+
│ - VLM caption │
|
|
119
|
+
└────────────────┘
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
## Install
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
pip install rc-docparser              # core: docx + xlsx + txt/md + csv/tsv
|
|
126
|
+
pip install 'rc-docparser[pdf]'       # + PyMuPDF for PDFs
|
|
127
|
+
pip install 'rc-docparser[html]'      # + trafilatura + bs4 for HTML
|
|
128
|
+
pip install 'rc-docparser[pptx]'      # + python-pptx for PowerPoint
|
|
129
|
+
pip install 'rc-docparser[epub]'      # + EbookLib + bs4 for EPUB
|
|
130
|
+
pip install 'rc-docparser[vlm]'       # + requests for API VLM captions
|
|
131
|
+
pip install 'rc-docparser[all]'       # everything above (recommended)
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
Higher-fidelity / heavier features are separate opt-in extras (so the core
|
|
135
|
+
install stays small and MIT):
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
pip install 'rc-docparser[tables]'        # + pdfplumber for PDF table extraction
|
|
139
|
+
pip install 'rc-docparser[ocr]'           # + rapidocr-onnxruntime for scanned PDFs
|
|
140
|
+
pip install 'rc-docparser[pymupdf4llm]'   # PyMuPDF4LLM PDF backend (AGPL/commercial)
|
|
141
|
+
pip install 'rc-docparser[docling]'       # IBM Docling PDF backend (MIT)
|
|
142
|
+
pip install 'rc-docparser[marker]'        # Datalab Marker PDF backend (GPL-3.0)
|
|
143
|
+
pip install 'rc-docparser[localvlm]'      # transformers/torch local captioning
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
`docparser` requires Python 3.10+.
|
|
147
|
+
|
|
148
|
+
## Quick start (library)
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
from docparser import WorkspaceLayout, run_all
|
|
152
|
+
|
|
153
|
+
layout = WorkspaceLayout.under("./project") # data/raw, data/parsed, data/assets, .cache
|
|
154
|
+
layout.ensure()
|
|
155
|
+
|
|
156
|
+
run_all(layout, use_vlm=False) # parse everything in data/raw
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
For a single file:
|
|
160
|
+
|
|
161
|
+
```python
|
|
162
|
+
from docparser import parse_path, WorkspaceLayout
|
|
163
|
+
|
|
164
|
+
layout = WorkspaceLayout.under(".")
|
|
165
|
+
payload = parse_path("paper.pdf", layout)
|
|
166
|
+
print(payload["stats"])
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
## Quick start (CLI)
|
|
170
|
+
|
|
171
|
+
```bash
|
|
172
|
+
# parse a single file
|
|
173
|
+
docparser parse paper.pdf --workspace ./out --no-vlm
|
|
174
|
+
|
|
175
|
+
# walk a whole directory
|
|
176
|
+
docparser parse-all --workspace ./project --no-vlm
|
|
177
|
+
|
|
178
|
+
# enable VLM captioning (requires OPENROUTER_API_KEY in env or .env)
|
|
179
|
+
export OPENROUTER_API_KEY=sk-or-v1-...
|
|
180
|
+
docparser parse-all --workspace ./project --max-images 50
|
|
181
|
+
|
|
182
|
+
# higher-fidelity PDF: pick a backend, OCR scanned pages, extract tables
|
|
183
|
+
docparser parse paper.pdf --pdf-backend docling --ocr auto --pdf-tables --no-vlm
|
|
184
|
+
|
|
185
|
+
# caption with a different provider
|
|
186
|
+
docparser parse-all --workspace ./project --vlm-provider openai --vlm-model gpt-4o-mini
|
|
187
|
+
|
|
188
|
+
docparser version
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
## What gets captured
|
|
192
|
+
|
|
193
|
+
### `.docx`
|
|
194
|
+
- Walks the document body in **document order** (paragraphs + tables + drawings).
|
|
195
|
+
- Preserves heading hierarchy (`section_path`) on every block.
|
|
196
|
+
- Extracts every embedded image to `data/assets/<slug>/` with a stable
|
|
197
|
+
`img-<seq>-<sha10>.<ext>` name.
|
|
198
|
+
- Detects figure/table captions (style `Caption` or text matching
|
|
199
|
+
`Figure 1: …` / `Fig. 1.` / `Table 1.`) and associates the caption with the
|
|
200
|
+
preceding image.
|
|
201
|
+
- Captures `context_before` and `context_after` for every image so the VLM has
|
|
202
|
+
document-grounded context.
|
|
203
|
+
|
|
204
|
+
### `.xlsx`
|
|
205
|
+
- Iterates **every sheet, every row, every column**.
|
|
206
|
+
- For each cell stores: address, row/col indices, value, openpyxl `data_type`,
|
|
207
|
+
`number_format`, `hyperlink`, `comment`, and the **formula** (from a second
|
|
208
|
+
pass with `data_only=False`).
|
|
209
|
+
- Stores `merged_ranges`, `frozen_panes`, and any embedded images.
|
|
210
|
+
- Markdown rendering uses the first non-empty row as a header heuristic and
|
|
211
|
+
preserves multi-line cells with `<br>`.
|
|
212
|
+
|
|
213
|
+
### `.pdf`
|
|
214
|
+
- Page-by-page text extraction in reading order via PyMuPDF's blocks API.
|
|
215
|
+
- Best-effort heading detection from font size (≥120% of the body-text median
|
|
216
|
+
promotes a line to a heading; bold flag tracked).
|
|
217
|
+
- Embedded raster images extracted via `doc.extract_image(xref)`.
|
|
218
|
+
- **Pluggable backends** (`backend="pymupdf4llm" | "docling" | "marker"`) route
|
|
219
|
+
conversion to a higher-fidelity engine; their Markdown is normalized into the
|
|
220
|
+
same block schema. Images are still extracted via PyMuPDF.
|
|
221
|
+
- **OCR** (`ocr="auto" | "force"`, `[ocr]` extra) recognizes text on scanned /
|
|
222
|
+
low-text pages; OCR'd blocks carry `"ocr": true`.
|
|
223
|
+
- **Tables** (`extract_tables=True`, `[tables]` extra) emit real `table` blocks
|
|
224
|
+
via `pdfplumber`.
|
|
225
|
+
|
|
226
|
+
### `.html`
|
|
227
|
+
- Article-grade body extraction via `trafilatura`.
|
|
228
|
+
- Plus a **structural** BeautifulSoup walk that emits typed blocks
|
|
229
|
+
(`heading` / `paragraph` / `list` / `table` / `image`) so downstream RAG
|
|
230
|
+
layers can rely on the JSON.
|
|
231
|
+
|
|
232
|
+
### `.pptx`
|
|
233
|
+
- Walks slides in presentation order; each slide becomes a section.
|
|
234
|
+
- Emits per-slide headings (slide title), bulleted text frames (with list
|
|
235
|
+
level), tables, pictures, and **speaker notes**.
|
|
236
|
+
- Embedded pictures extracted and optionally captioned.
|
|
237
|
+
|
|
238
|
+
### `.epub`
|
|
239
|
+
- Walks the spine in reading order; per-chapter BeautifulSoup structural walk.
|
|
240
|
+
- Captures metadata (title/author/language), headings, paragraphs, lists,
|
|
241
|
+
tables, and embedded images (resolved from the EPUB image manifest).
|
|
242
|
+
|
|
243
|
+
### `.txt` / `.md` and `.csv` / `.tsv` (core, no extras)
|
|
244
|
+
- Plain text is split into paragraph blocks; Markdown is passed through and
|
|
245
|
+
also decomposed into heading / list / code / paragraph blocks.
|
|
246
|
+
- CSV/TSV: delimiter sniffing, header detection, a Markdown table, and one JSON
|
|
247
|
+
record per row.
|
|
248
|
+
|
|
249
|
+
### Images (`[vlm]` extra)
|
|
250
|
+
Each image is sent to a vision-language model (default provider OpenRouter,
|
|
251
|
+
model `anthropic/claude-sonnet-4`) along with its surrounding caption +
|
|
252
|
+
context. Any OpenAI-compatible provider works via `--vlm-provider`
|
|
253
|
+
(`openrouter` / `openai` / `gemini` / `local`), or use a fully-local
|
|
254
|
+
`transformers` model with `--vlm-provider transformers` (`[localvlm]` extra).
|
|
255
|
+
The model returns a strict JSON object:
|
|
256
|
+
|
|
257
|
+
```json
|
|
258
|
+
{
|
|
259
|
+
"caption": "one-sentence figure caption",
|
|
260
|
+
"description": "2–5 sentence paragraph",
|
|
261
|
+
"visible_text": "OCR-style transcription",
|
|
262
|
+
"tags": ["world-model", "diagram", "..."],
|
|
263
|
+
"image_kind": "diagram | plot | screenshot | photo | equation | table | ...",
|
|
264
|
+
"domain_relevance": "how this relates to the document's topic"
|
|
265
|
+
}
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
Results are cached on disk at `<cache_dir>/vlm/<model>/<sha1>.json`, keyed by
|
|
269
|
+
**SHA-1 of the image bytes × model**, so re-runs are free until the source
|
|
270
|
+
image bytes change.
|
|
271
|
+
|
|
272
|
+
## Configuration (`.env`)
|
|
273
|
+
|
|
274
|
+
| Var | Default | Purpose |
|
|
275
|
+
| --- | --- | --- |
|
|
276
|
+
| `DOCPARSER_VLM_PROVIDER` | `openrouter` | `openrouter` / `openai` / `gemini` / `local` |
|
|
277
|
+
| `OPENROUTER_API_KEY` | _required for OpenRouter_ | OpenRouter key (`sk-or-...`) |
|
|
278
|
+
| `OPENROUTER_VLM_MODEL` | `anthropic/claude-sonnet-4` | any vision-capable OpenRouter model |
|
|
279
|
+
| `OPENROUTER_BASE_URL` | `https://openrouter.ai/api/v1` | override for a proxy |
|
|
280
|
+
| `OPENROUTER_REFERER` / `OPENROUTER_TITLE` | repo URL / `docparser` | OpenRouter attribution headers |
|
|
281
|
+
| `OPENAI_API_KEY` / `OPENAI_VLM_MODEL` | _required for OpenAI_ / `gpt-4o-mini` | OpenAI provider |
|
|
282
|
+
| `GEMINI_API_KEY` / `GEMINI_VLM_MODEL` | _required for Gemini_ / `gemini-1.5-flash` | Gemini provider |
|
|
283
|
+
| `DOCPARSER_VLM_BASE_URL` | `http://localhost:11434/v1` | base URL for the `local` provider |
|
|
284
|
+
| `DOCPARSER_VLM_API_KEY` / `DOCPARSER_VLM_MODEL` | — / `llava` | key + model for the `local` provider |
|
|
285
|
+
| `DOCPARSER_LOCAL_VLM_MODEL` | `Salesforce/blip-image-captioning-large` | model for the `transformers` backend |
|
|
286
|
+
|
|
287
|
+
## API reference (highlights)
|
|
288
|
+
|
|
289
|
+
- `WorkspaceLayout(raw_dir, parsed_dir, assets_dir, cache_dir)` —
|
|
290
|
+
dataclass describing where parser output lives. Use `.under(root)` for the
|
|
291
|
+
default `data/raw + data/parsed + data/assets + .cache` layout under a root.
|
|
292
|
+
- `parse_docx(source, layout=None, *, captioner=None, write_outputs=True)` →
|
|
293
|
+
payload dict.
|
|
294
|
+
- `parse_xlsx(source, layout=None, *, captioner=None, write_outputs=True)` →
|
|
295
|
+
payload dict.
|
|
296
|
+
- `parse_pdf(source, layout=None, *, captioner=None, write_outputs=True, extract_images=True, backend="builtin", ocr="off", extract_tables=False)` →
|
|
297
|
+
payload dict. (requires `[pdf]`; backends/OCR/tables require their extras)
|
|
298
|
+
- `parse_html(source, layout=None, *, captioner=None, write_outputs=True, use_trafilatura=True)` →
|
|
299
|
+
payload dict. `source` may be a path or `http(s)://` URL. (requires `[html]`)
|
|
300
|
+
- `parse_pptx(source, layout=None, *, captioner=None, write_outputs=True)` →
|
|
301
|
+
payload dict. (requires `[pptx]`)
|
|
302
|
+
- `parse_epub(source, layout=None, *, captioner=None, write_outputs=True)` →
|
|
303
|
+
payload dict. (requires `[epub]`)
|
|
304
|
+
- `parse_text(source, layout=None, ...)` / `parse_csv(source, layout=None, ...)` —
|
|
305
|
+
core parsers for `.txt`/`.md` and `.csv`/`.tsv`.
|
|
306
|
+
- `parse_path(source, layout=None, **kwargs)` — dispatches by extension;
|
|
307
|
+
PDF-only kwargs (`backend`, `ocr`, `extract_tables`) are forwarded to PDFs.
|
|
308
|
+
- `run_all(layout, *, use_vlm=True, only=None, max_images=None, continue_on_error=False, vlm_provider=None, vlm_model=None, pdf_backend="builtin", ocr="off", extract_tables=False)` —
|
|
309
|
+
walks `layout.raw_dir`, parses everything supported, writes a top-level
|
|
310
|
+
`CORPUS.md` and `data/parsed/corpus.json`.
|
|
311
|
+
- `caption_image(image_bytes, *, mime, doc_name, nearby_caption, context, provider=None, model=None, layout=None, ...)` →
|
|
312
|
+
`VLMResult`. (requires `[vlm]`)
|
|
313
|
+
|
|
314
|
+
## Development
|
|
315
|
+
|
|
316
|
+
```bash
|
|
317
|
+
git clone https://github.com/Research-Commons/docparser
|
|
318
|
+
cd docparser
|
|
319
|
+
python -m venv .venv && source .venv/bin/activate
|
|
320
|
+
pip install -e ".[all,dev]"
|
|
321
|
+
pytest -ra
|
|
322
|
+
ruff check src tests
|
|
323
|
+
mypy
|
|
324
|
+
python -m build # produces dist/*.whl + *.tar.gz
|
|
325
|
+
twine check dist/*
|
|
326
|
+
```
|
|
327
|
+
|
|
328
|
+
### Publishing
|
|
329
|
+
|
|
330
|
+
CI runs lint + mypy + tests on Python 3.10-3.12 and builds the distribution on
|
|
331
|
+
every push/PR (`.github/workflows/ci.yml`). Pushing a version tag (e.g.
|
|
332
|
+
`v0.2.0`) triggers `.github/workflows/publish.yml`, which builds and uploads to
|
|
333
|
+
PyPI via **Trusted Publishing** (OIDC, no stored token) — configure a trusted
|
|
334
|
+
publisher for the project on PyPI first. To publish manually instead:
|
|
335
|
+
|
|
336
|
+
```bash
|
|
337
|
+
python -m build
|
|
338
|
+
twine check dist/*
|
|
339
|
+
twine upload dist/*
|
|
340
|
+
```
|
|
341
|
+
|
|
342
|
+
## License
|
|
343
|
+
|
|
344
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
docparser/__init__.py,sha256=RII8KjcE7EzeBPthWp7-8cYiih9v782lGmctSc12zlU,2307
|
|
2
|
+
docparser/cli.py,sha256=eDyGm1loOaut-JZh-85PU171_hKlb6XFc_UBnYu1030,6531
|
|
3
|
+
docparser/common.py,sha256=o7tt8OQWAGQnhFqqkq2D0wNx5eV_cNeN7WMUNVQkDbU,5231
|
|
4
|
+
docparser/csvtab.py,sha256=nmzWlNNqGYBrTwgJHk5Kcx0U55Fd3_EuVD-PjBt7Ecs,3960
|
|
5
|
+
docparser/docx.py,sha256=oSkwdp4XNY2ZIEaMOilDhg0fiYrpAto5FTJP_FIlclY,17088
|
|
6
|
+
docparser/epub.py,sha256=rIBiIBCMIgyo__uR2D0bMn2QEuBayoMYK4eheI2dsZY,12608
|
|
7
|
+
docparser/html.py,sha256=SV_zc2wdXjPkKhSdz4mBKg-c8AAyf7W6SEV2o0Zhh0w,11376
|
|
8
|
+
docparser/image.py,sha256=ZZk5UchqX3GtcB2U0rPGCnyL2NSx6Jmf-5k0UFtqROw,12392
|
|
9
|
+
docparser/localvlm.py,sha256=LAcD79mEqDBBoyc7v0Ic-W5SexqZlN4ulxMv7f9Rphg,3757
|
|
10
|
+
docparser/ocr.py,sha256=mWMPOx0eKm3VrDLDhZ7hQcNzj5RotxtamKVbxi9WpoA,2103
|
|
11
|
+
docparser/orchestrator.py,sha256=5Or11Ye3yVE4rOGA9MEviWT2QOIWXvWg-1RhkwdvbhA,9813
|
|
12
|
+
docparser/pdf.py,sha256=d3Zyj-zpY7EFs1sWvzDXQshQXAJGkjCIIBKkf1yCuRs,16698
|
|
13
|
+
docparser/pdf_backends.py,sha256=5P1PrEAGb8UZpYWfyi7D0WYHo-oWed7S3By1CTe1n2Q,3348
|
|
14
|
+
docparser/pptx.py,sha256=8c-aLyqnoYhJj4b_ZvA8NahvwRWPicwV5_VMlrCMNQI,12020
|
|
15
|
+
docparser/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
|
+
docparser/text.py,sha256=r3u-bk09jAfpvYq7yuM7yYOLRjY1ChA0sDi619pzZiE,5930
|
|
17
|
+
docparser/xlsx.py,sha256=6s-dtbyN92qFRLT3dKi_1HVuUGzCa8IyZ1RJLwwNDP8,11758
|
|
18
|
+
rc_docparser-0.2.0.dist-info/METADATA,sha256=IFGuaKCRvT8VZoYfo1UPWrSss_JmwFdAd0sMFfNCSCA,15512
|
|
19
|
+
rc_docparser-0.2.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
20
|
+
rc_docparser-0.2.0.dist-info/entry_points.txt,sha256=VFP7dbtsKVgtvwjo63QThpCN65nmXpp3OIvvokVz39s,49
|
|
21
|
+
rc_docparser-0.2.0.dist-info/licenses/LICENSE,sha256=509nFfR8HUUnDqGdHdYzy9CHL_7cCxKsR7zqZJko0N0,1073
|
|
22
|
+
rc_docparser-0.2.0.dist-info/RECORD,,
|