rc-docparser 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docparser/__init__.py +87 -0
- docparser/cli.py +209 -0
- docparser/common.py +163 -0
- docparser/csvtab.py +131 -0
- docparser/docx.py +488 -0
- docparser/epub.py +349 -0
- docparser/html.py +322 -0
- docparser/image.py +343 -0
- docparser/localvlm.py +103 -0
- docparser/ocr.py +68 -0
- docparser/orchestrator.py +304 -0
- docparser/pdf.py +430 -0
- docparser/pdf_backends.py +89 -0
- docparser/pptx.py +332 -0
- docparser/py.typed +0 -0
- docparser/text.py +189 -0
- docparser/xlsx.py +319 -0
- rc_docparser-0.2.0.dist-info/METADATA +344 -0
- rc_docparser-0.2.0.dist-info/RECORD +22 -0
- rc_docparser-0.2.0.dist-info/WHEEL +4 -0
- rc_docparser-0.2.0.dist-info/entry_points.txt +2 -0
- rc_docparser-0.2.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
"""Orchestrator: dispatch by file extension and walk a directory.
|
|
2
|
+
|
|
3
|
+
The orchestrator is what most callers use: give it a directory and a
|
|
4
|
+
``WorkspaceLayout`` and it parses every supported file underneath, writing
|
|
5
|
+
``document.md`` + ``document.json`` per file plus a top-level ``corpus.json``
|
|
6
|
+
index.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import time
|
|
11
|
+
import traceback
|
|
12
|
+
from collections.abc import Callable
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
from .common import (
|
|
17
|
+
WorkspaceLayout,
|
|
18
|
+
slugify,
|
|
19
|
+
truncate,
|
|
20
|
+
utc_now_iso,
|
|
21
|
+
write_json,
|
|
22
|
+
write_text,
|
|
23
|
+
)
|
|
24
|
+
from .csvtab import parse_csv
|
|
25
|
+
from .docx import parse_docx
|
|
26
|
+
from .text import parse_text
|
|
27
|
+
from .xlsx import parse_xlsx
|
|
28
|
+
|
|
29
|
+
# Maps a file suffix (lower-case, leading dot included) to the parser "kind"
# that ``_get_parser`` understands.  The table is written grouped by kind;
# the resulting dict has the same keys, values and insertion order as
# listing every suffix individually.
SUPPORTED_EXTENSIONS: dict[str, str] = {
    suffix: kind
    for kind, suffixes in (
        ("docx", (".docx",)),
        ("xlsx", (".xlsx",)),
        ("pdf", (".pdf",)),
        ("html", (".html", ".htm")),
        ("pptx", (".pptx",)),
        ("epub", (".epub",)),
        ("text", (".txt", ".md", ".markdown")),
        ("csv", (".csv", ".tsv")),
    )
    for suffix in suffixes
}
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _get_parser(kind: str) -> Callable[..., dict[str, Any]]:
    """Return the parse function for a parser ``kind``.

    ``kind`` is one of the values of ``SUPPORTED_EXTENSIONS``.  The docx,
    xlsx, text and csv parsers are already imported at module level; the
    pdf, html, pptx and epub parsers are imported only when requested
    (presumably because they need optional dependencies -- confirm against
    the package extras).

    Raises:
        ValueError: if ``kind`` is not a known parser kind.
    """
    # Parsers that are always available.
    if kind in ("docx", "xlsx", "text", "csv"):
        always_loaded = {
            "docx": parse_docx,
            "xlsx": parse_xlsx,
            "text": parse_text,
            "csv": parse_csv,
        }
        return always_loaded[kind]

    # Parsers that are imported on first use.
    if kind == "pdf":
        from .pdf import parse_pdf as parser
    elif kind == "html":
        from .html import parse_html as parser
    elif kind == "pptx":
        from .pptx import parse_pptx as parser
    elif kind == "epub":
        from .epub import parse_epub as parser
    else:
        raise ValueError(f"unsupported kind: {kind}")
    return parser
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def parse_path(
    source: Path | str,
    layout: WorkspaceLayout | None = None,
    *,
    captioner: Callable[..., dict[str, Any]] | None = None,
    write_outputs: bool = True,
    **kwargs: Any,
) -> dict[str, Any]:
    """Parse ``source`` with the parser selected by its file extension.

    The suffix is lower-cased and looked up in ``SUPPORTED_EXTENSIONS``.
    ``layout``, ``captioner`` and ``write_outputs`` are passed to every
    parser.  Any additional keyword arguments (e.g. ``backend``, ``ocr``,
    ``extract_tables``) reach the PDF parser only; for every other kind
    they are dropped.

    Returns:
        Whatever the selected parser returns.

    Raises:
        ValueError: if the extension is not in ``SUPPORTED_EXTENSIONS``.
    """
    path = Path(source)
    ext = path.suffix.lower()
    kind = SUPPORTED_EXTENSIONS.get(ext)
    if kind is None:
        raise ValueError(f"Unsupported extension: {ext}")

    parse = _get_parser(kind)
    # Only the PDF parser accepts the extra options.
    pdf_options = kwargs if kind == "pdf" else {}
    return parse(
        path,
        layout,
        captioner=captioner,
        write_outputs=write_outputs,
        **pdf_options,
    )
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def make_captioner(
    *,
    enabled: bool,
    layout: WorkspaceLayout,
    max_calls: int | None = None,
    log_path: Path | None = None,
    provider: str | None = None,
    model: str | None = None,
) -> Callable[..., dict[str, Any]] | None:
    """Build a captioner closure that wraps the image captioner.

    ``provider`` selects the captioning backend: an API provider preset
    (``openrouter`` / ``openai`` / ``gemini`` / ``local``) handled by
    :func:`docparser.image.caption_image`, or ``transformers`` (alias
    ``localvlm``) for the fully local
    :func:`docparser.localvlm.caption_image_local`.

    Enforces a per-run cap on actual VLM calls and (optionally) appends a TSV
    log of every call.

    Args:
        enabled: when false, no captioner is built and ``None`` is returned.
        layout: forwarded to the backend on every call.
        max_calls: maximum number of non-cached backend results per closure;
            ``None`` means no cap.
        log_path: file to append one tab-separated line per backend call to;
            ``None`` disables logging.
        provider: backend selector (see above); forwarded to the API backend.
        model: forwarded to the API backend only.

    Returns:
        ``None`` when ``enabled`` is false, otherwise a function taking the
        keyword-only arguments ``image_bytes``, ``mime``, ``doc_name``,
        ``nearby_caption`` and ``context`` and returning a caption dict.
    """
    if not enabled:
        return None

    use_local = (provider or "").lower() in {"transformers", "localvlm"}
    _backend: Callable[..., Any]
    if use_local:
        from .localvlm import caption_image_local

        _backend = caption_image_local  # lazy: needs [localvlm]
    else:
        from .image import caption_image

        _backend = caption_image  # lazy import: needs [vlm] extra

    # Mutable cell shared by every invocation of the closure.
    state = {"calls": 0}

    # A tab or line break inside a field would split the record, so they
    # are flattened to spaces before a log line is written.
    tsv_unsafe = str.maketrans("\t\r\n", "   ")

    def _field(value: Any) -> str:
        return str(value).translate(tsv_unsafe)

    def _captioner(
        *, image_bytes, mime, doc_name, nearby_caption, context
    ) -> dict[str, Any]:
        if max_calls is not None and state["calls"] >= max_calls:
            # Budget exhausted: return an empty result without calling the backend.
            return {
                "error": f"max_calls ({max_calls}) reached for this run",
                "caption": "",
                "description": "",
                "visible_text": "",
                "tags": [],
                "image_kind": "other",
                "domain_relevance": "",
                "model": "(skipped)",
                "cached": False,
            }

        call_args = dict(
            image_bytes=image_bytes,
            mime=mime,
            doc_name=doc_name,
            nearby_caption=nearby_caption,
            context=context,
            layout=layout,
        )
        if not use_local:
            # Only the API backend is given provider/model.
            call_args["provider"] = provider
            call_args["model"] = model

        # perf_counter is monotonic, so the measured duration cannot go
        # negative or jump when the system clock is adjusted.
        t0 = time.perf_counter()
        result = _backend(**call_args)
        elapsed = time.perf_counter() - t0

        # Cached results do not count against the budget.
        if not result.cached:
            state["calls"] += 1

        if log_path is not None:
            log_path.parent.mkdir(parents=True, exist_ok=True)
            with log_path.open("a", encoding="utf-8") as fh:
                fh.write(
                    f"{utc_now_iso()}\t{_field(doc_name)}\tcached={result.cached}\t"
                    f"elapsed={elapsed:.2f}s\tmodel={_field(result.model)}\t"
                    f"err={_field(result.error or '')}\t"
                    f"caption={_field(truncate(result.caption, 120))}\n"
                )
        return result.to_dict()

    return _captioner
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def build_corpus_index(
    parsed_runs: list[dict[str, Any]], layout: WorkspaceLayout
) -> tuple[str, dict[str, Any]]:
    """Summarise parsed payloads as a Markdown table and a JSON-ready dict.

    Args:
        parsed_runs: parser payloads.  Each must have a ``"source"`` dict with
            ``"filename"``, ``"kind"`` and ``"sha1"``; ``"stats"`` is optional.
            For ``xlsx`` payloads the captioned-image count is taken from the
            ``"sheets"`` -> ``"images"`` -> ``"semantic"`` -> ``"caption"``
            entries.
        layout: used only for ``layout.parsed_dir_for(...)`` to record each
            source's output directory.

    Returns:
        ``(markdown, corpus)`` where ``markdown`` is the index document as one
        string and ``corpus`` is a dict with ``built_at``, ``n_sources``,
        ``totals`` and ``sources``.
    """
    # One timestamp for both outputs so they always agree.
    built_at = utc_now_iso()
    md = [
        "# Corpus index",
        "",
        f"_Built {built_at} from {len(parsed_runs)} source files._",
        "",
        "| File | Kind | Sections | Images | Captioned | sha1 |",
        "| --- | --- | --- | --- | --- | --- |",
    ]
    total_images = 0
    total_captioned = 0
    sources: list[dict[str, Any]] = []

    for r in parsed_runs:
        src = r["source"]
        stats = r.get("stats", {})
        kind = src["kind"]

        if kind == "xlsx":
            sect = (
                f"{stats.get('n_sheets', 0)} sheets · "
                f"{stats.get('n_nonempty_cells_total', 0)} cells"
            )
            n_imgs = stats.get("n_images_total", 0)
            # Count images whose semantic block carries a non-empty caption.
            n_capt = sum(
                1
                for s in r.get("sheets", [])
                for im in s.get("images", [])
                if (im.get("semantic") or {}).get("caption")
            )
        else:
            sect = (
                f"{stats.get('n_headings', 0)} hdr · "
                f"{stats.get('n_paragraphs', 0)} para · "
                f"{stats.get('n_tables', 0)} tbl"
            )
            n_imgs = stats.get("n_images", 0)
            # HTML rows always report 0 captioned images.
            # NOTE(review): confirm the HTML parser never fills
            # ``n_captioned_images``; if it does, drop this special case.
            n_capt = 0 if kind == "html" else stats.get("n_captioned_images", 0)

        total_images += n_imgs
        total_captioned += n_capt
        md.append(
            f"| `{src['filename']}` | {kind} | {sect} | {n_imgs} | {n_capt} | `{src['sha1'][:10]}` |"
        )
        sources.append(
            {
                "filename": src["filename"],
                "kind": kind,
                "sha1": src["sha1"],
                "stats": stats,
                "parsed_dir": str(layout.parsed_dir_for(Path(src["filename"]))),
            }
        )

    md.append("")
    md.append(f"**Totals:** {total_images} images, {total_captioned} VLM-captioned.")
    md.append("")

    corpus_json = {
        "built_at": built_at,
        "n_sources": len(parsed_runs),
        "totals": {"images": total_images, "captioned_images": total_captioned},
        "sources": sources,
    }
    return "\n".join(md), corpus_json
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def run_all(
    layout: WorkspaceLayout,
    *,
    use_vlm: bool = True,
    only: str | None = None,
    max_images: int | None = None,
    continue_on_error: bool = False,
    log_path: Path | None = None,
    write_corpus: bool = True,
    vlm_provider: str | None = None,
    vlm_model: str | None = None,
    pdf_backend: str = "builtin",
    ocr: str = "off",
    extract_tables: bool = False,
) -> list[dict[str, Any]]:
    """Walk ``layout.raw_dir`` and parse every supported file.

    Only direct children of ``layout.raw_dir`` are considered (no recursion),
    in sorted order, and only those whose lower-cased suffix is in
    ``SUPPORTED_EXTENSIONS``.

    Args:
        layout: workspace to read from and write to; ``layout.ensure()`` is
            called first.
        use_vlm: build an image captioner and hand it to every parser.
        only: case-insensitive substring a file name must contain.
        max_images: cap on non-cached captioner calls for the whole run.
        continue_on_error: when true, a failing file is reported and skipped;
            otherwise its exception is re-raised after being reported.
        log_path: captioner call log; defaults to
            ``layout.cache_dir / "vlm_calls.log"``.
        write_corpus: write the corpus index files after parsing.
        vlm_provider: captioner backend selector.
        vlm_model: captioner model name.
        pdf_backend: forwarded to the PDF parser as ``backend``.
        ocr: forwarded to the PDF parser.
        extract_tables: forwarded to the PDF parser.

    Returns:
        The list of parsed payloads, one per successfully parsed file.  When
        no file matches, ``[]`` is returned and nothing is written.  If
        ``write_corpus`` is true (default), ``corpus.json`` is written inside
        ``layout.parsed_dir`` and ``CORPUS.md`` inside
        ``layout.parsed_dir.parent`` as a side effect.
    """
    layout.ensure()

    needle = only.lower() if only else None
    # Path.is_file() already follows symlinks, so the extra is_symlink() test
    # additionally admits broken links and links to non-files.
    # NOTE(review): confirm that is intended rather than an oversight.
    files = sorted(
        entry
        for entry in layout.raw_dir.iterdir()
        if (entry.is_file() or entry.is_symlink())
        and entry.suffix.lower() in SUPPORTED_EXTENSIONS
        and (needle is None or needle in entry.name.lower())
    )
    if not files:
        return []

    captioner = make_captioner(
        enabled=use_vlm,
        layout=layout,
        max_calls=max_images,
        log_path=log_path or (layout.cache_dir / "vlm_calls.log"),
        provider=vlm_provider,
        model=vlm_model,
    )

    # parse_path forwards these to the PDF parser only.
    pdf_kwargs: dict[str, Any] = {
        "backend": pdf_backend,
        "ocr": ocr,
        "extract_tables": extract_tables,
    }

    parsed_runs: list[dict[str, Any]] = []
    for p in files:
        try:
            # Monotonic clock: the reported duration is immune to system
            # clock adjustments.
            t0 = time.perf_counter()
            payload = parse_path(p, layout, captioner=captioner, **pdf_kwargs)
            elapsed = time.perf_counter() - t0
            print(f"[docparser] {p.name} ok ({elapsed:.1f}s) stats={payload.get('stats', {})}")
            parsed_runs.append(payload)
        except Exception as exc:
            # Report every failure; abort the run unless told to carry on.
            print(f"[docparser] {p.name} FAILED: {exc}")
            traceback.print_exc()
            if not continue_on_error:
                raise

    if write_corpus:
        md, corpus = build_corpus_index(parsed_runs, layout)
        write_text(layout.parsed_dir.parent / "CORPUS.md", md)
        write_json(layout.parsed_dir / "corpus.json", corpus)

    return parsed_runs
|