rc-docparser 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,304 @@
1
+ """Orchestrator: dispatch by file extension and walk a directory.
2
+
3
+ The orchestrator is what most callers use: give it a directory and a
4
+ ``WorkspaceLayout`` and it parses every supported file underneath, writing
5
+ ``document.md`` + ``document.json`` per file plus a top-level ``corpus.json``
6
+ index.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import time
11
+ import traceback
12
+ from collections.abc import Callable
13
+ from pathlib import Path
14
+ from typing import Any
15
+
16
+ from .common import (
17
+ WorkspaceLayout,
18
+ slugify,
19
+ truncate,
20
+ utc_now_iso,
21
+ write_json,
22
+ write_text,
23
+ )
24
+ from .csvtab import parse_csv
25
+ from .docx import parse_docx
26
+ from .text import parse_text
27
+ from .xlsx import parse_xlsx
28
+
29
# Maps a lower-cased file suffix (leading dot included) to the parser kind
# that handles it. Grouped by kind so aliases of one format sit together;
# the resulting insertion order is .docx, .xlsx, .pdf, .html, .htm, .pptx,
# .epub, .txt, .md, .markdown, .csv, .tsv.
SUPPORTED_EXTENSIONS: dict[str, str] = {
    suffix: kind
    for kind, suffixes in (
        ("docx", (".docx",)),
        ("xlsx", (".xlsx",)),
        ("pdf", (".pdf",)),
        ("html", (".html", ".htm")),
        ("pptx", (".pptx",)),
        ("epub", (".epub",)),
        ("text", (".txt", ".md", ".markdown")),
        ("csv", (".csv", ".tsv")),
    )
    for suffix in suffixes
}
43
+
44
+
45
def _get_parser(kind: str) -> Callable[..., dict[str, Any]]:
    """Return the parse function registered for ``kind``.

    The docx/xlsx/text/csv parsers are the ones imported at module load.
    The pdf/html/pptx/epub parsers are imported here, only when requested
    (presumably so their dependencies are needed only when actually used).

    Raises:
        ValueError: if ``kind`` is not one of the known parser kinds.
    """
    # Parsers that are already in scope from the module-level imports.
    if kind in ("docx", "xlsx", "text", "csv"):
        return {
            "docx": parse_docx,
            "xlsx": parse_xlsx,
            "text": parse_text,
            "csv": parse_csv,
        }[kind]

    # Parsers that are imported on demand.
    if kind == "pdf":
        from .pdf import parse_pdf as lazy_parser
    elif kind == "html":
        from .html import parse_html as lazy_parser
    elif kind == "pptx":
        from .pptx import parse_pptx as lazy_parser
    elif kind == "epub":
        from .epub import parse_epub as lazy_parser
    else:
        raise ValueError(f"unsupported kind: {kind}")
    return lazy_parser
71
+
72
+
73
def parse_path(
    source: Path | str,
    layout: WorkspaceLayout | None = None,
    *,
    captioner: Callable[..., dict[str, Any]] | None = None,
    write_outputs: bool = True,
    **kwargs: Any,
) -> dict[str, Any]:
    """Parse ``source`` with the parser selected by its file extension.

    The suffix is lower-cased and looked up in ``SUPPORTED_EXTENSIONS``.
    ``layout``, ``captioner`` and ``write_outputs`` are handed to every
    parser. Any additional keyword arguments (e.g. ``backend``, ``ocr``,
    ``extract_tables``) reach the PDF parser only and are dropped for all
    other kinds.

    Returns:
        Whatever the selected parser returns (a parsed-document payload).

    Raises:
        ValueError: if the extension is not in ``SUPPORTED_EXTENSIONS``.
    """
    path = Path(source)
    extension = path.suffix.lower()
    if extension not in SUPPORTED_EXTENSIONS:
        raise ValueError(f"Unsupported extension: {extension}")

    kind = SUPPORTED_EXTENSIONS[extension]
    parse = _get_parser(kind)
    # Only the PDF parser understands the pass-through options.
    pdf_only_options = kwargs if kind == "pdf" else {}
    return parse(
        path,
        layout,
        captioner=captioner,
        write_outputs=write_outputs,
        **pdf_only_options,
    )
95
+
96
+
97
def make_captioner(
    *,
    enabled: bool,
    layout: WorkspaceLayout,
    max_calls: int | None = None,
    log_path: Path | None = None,
    provider: str | None = None,
    model: str | None = None,
) -> Callable[..., dict[str, Any]] | None:
    """Build a captioner closure that wraps the image captioner.

    ``provider`` selects the captioning backend: ``transformers`` (or its
    alias ``localvlm``, both case-insensitive) picks
    :func:`docparser.localvlm.caption_image_local`; anything else (an API
    provider preset such as ``openrouter`` / ``openai`` / ``gemini`` /
    ``local``, or ``None``) is handled by
    :func:`docparser.image.caption_image`, which also receives ``provider``
    and ``model``.

    Enforces a per-run cap on actual (non-cached) VLM calls and (optionally)
    appends one TSV line per call to ``log_path``.

    Args:
        enabled: When false no closure is built and ``None`` is returned.
        layout: Passed through to the backend on every call.
        max_calls: Maximum number of non-cached backend calls; ``None``
            means unlimited.
        log_path: TSV log file to append to; ``None`` disables logging.
        provider: Backend selector, see above.
        model: Model name forwarded to the API backend only.

    Returns:
        ``None`` when ``enabled`` is false, otherwise a keyword-only callable
        ``(image_bytes, mime, doc_name, nearby_caption, context) -> dict``.
    """
    if not enabled:
        return None

    use_local = (provider or "").lower() in {"transformers", "localvlm"}
    _backend: Callable[..., Any]
    if use_local:
        from .localvlm import caption_image_local

        _backend = caption_image_local  # lazy: needs [localvlm]
    else:
        from .image import caption_image

        _backend = caption_image  # lazy import: needs [vlm] extra

    # Mutable cell shared with the closure: number of non-cached calls so far.
    state = {"calls": 0}

    # Tabs and line breaks inside a value would split a TSV record, so they
    # are flattened to spaces before a field is written to the log.
    tsv_unsafe = str.maketrans({"\t": " ", "\r": " ", "\n": " "})

    def _tsv_field(value: Any) -> str:
        """Render ``value`` so that it stays inside a single TSV cell."""
        return str(value).translate(tsv_unsafe)

    def _captioner(*, image_bytes, mime, doc_name, nearby_caption, context):
        # NOTE: the cap is checked before the backend is consulted, so once it
        # is reached every further image gets this placeholder result.
        if max_calls is not None and state["calls"] >= max_calls:
            return {
                "error": f"max_calls ({max_calls}) reached for this run",
                "caption": "",
                "description": "",
                "visible_text": "",
                "tags": [],
                "image_kind": "other",
                "domain_relevance": "",
                "model": "(skipped)",
                "cached": False,
            }

        call_kwargs: dict[str, Any] = {
            "image_bytes": image_bytes,
            "mime": mime,
            "doc_name": doc_name,
            "nearby_caption": nearby_caption,
            "context": context,
            "layout": layout,
        }
        if not use_local:
            # Only the API backend takes a provider preset / model name.
            call_kwargs["provider"] = provider
            call_kwargs["model"] = model

        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by wall-clock adjustments.
        started = time.perf_counter()
        result = _backend(**call_kwargs)
        elapsed = time.perf_counter() - started

        # Cached results do not count against the per-run cap.
        if not result.cached:
            state["calls"] += 1

        if log_path is not None:
            log_path.parent.mkdir(parents=True, exist_ok=True)
            with log_path.open("a", encoding="utf-8") as fh:
                fh.write(
                    f"{utc_now_iso()}\t{_tsv_field(doc_name)}\tcached={result.cached}\t"
                    f"elapsed={elapsed:.2f}s\tmodel={_tsv_field(result.model)}\t"
                    f"err={_tsv_field(result.error or '')}\t"
                    f"caption={_tsv_field(truncate(result.caption, 120))}\n"
                )
        return result.to_dict()

    return _captioner
172
+
173
+
174
def build_corpus_index(
    parsed_runs: list[dict[str, Any]], layout: WorkspaceLayout
) -> tuple[str, dict[str, Any]]:
    """Summarise parsed payloads as a Markdown table and a JSON-ready dict.

    Each payload must carry ``source`` with ``filename``, ``kind`` and
    ``sha1``; ``stats`` is optional and missing counters default to 0.

    Args:
        parsed_runs: Payloads as returned by the individual parsers.
        layout: Used only for ``parsed_dir_for(filename)`` to record where
            each document's outputs live.

    Returns:
        ``(markdown, corpus)`` where ``markdown`` is the index page text and
        ``corpus`` is a dict with ``built_at``, ``n_sources``, ``totals`` and
        one ``sources`` entry per payload.
    """
    # One timestamp for both outputs so the Markdown page and the JSON index
    # can never disagree about when they were built.
    built_at = utc_now_iso()

    md = [
        "# Corpus index",
        "",
        f"_Built {built_at} from {len(parsed_runs)} source files._",
        "",
        "| File | Kind | Sections | Images | Captioned | sha1 |",
        "| --- | --- | --- | --- | --- | --- |",
    ]
    total_images = 0
    total_captioned = 0
    for r in parsed_runs:
        src = r["source"]
        stats = r.get("stats", {})
        kind = src["kind"]
        if kind == "xlsx":
            sect = f"{stats.get('n_sheets', 0)} sheets · {stats.get('n_nonempty_cells_total', 0)} cells"
            n_imgs = stats.get("n_images_total", 0)
            # Workbooks have no captioned-image counter in stats; count the
            # images whose semantic block carries a non-empty caption.
            n_capt = sum(
                1
                for s in r.get("sheets", [])
                for im in s.get("images", [])
                if (im.get("semantic") or {}).get("caption")
            )
        else:
            sect = (
                f"{stats.get('n_headings', 0)} hdr · "
                f"{stats.get('n_paragraphs', 0)} para · "
                f"{stats.get('n_tables', 0)} tbl"
            )
            n_imgs = stats.get("n_images", 0)
            # HTML rows always report zero captioned images.
            n_capt = 0 if kind == "html" else stats.get("n_captioned_images", 0)
        total_images += n_imgs
        total_captioned += n_capt
        # A literal pipe would end the table cell early, even inside a code
        # span, so it has to be backslash-escaped.
        filename_cell = str(src["filename"]).replace("|", "\\|")
        md.append(
            f"| `{filename_cell}` | {kind} | {sect} | {n_imgs} | {n_capt} | `{src['sha1'][:10]}` |"
        )
    md.append("")
    md.append(f"**Totals:** {total_images} images, {total_captioned} VLM-captioned.")
    md.append("")

    corpus_json = {
        "built_at": built_at,
        "n_sources": len(parsed_runs),
        "totals": {"images": total_images, "captioned_images": total_captioned},
        "sources": [
            {
                "filename": r["source"]["filename"],
                "kind": r["source"]["kind"],
                "sha1": r["source"]["sha1"],
                "stats": r.get("stats", {}),
                "parsed_dir": str(layout.parsed_dir_for(Path(r["source"]["filename"]))),
            }
            for r in parsed_runs
        ],
    }
    return "\n".join(md), corpus_json
239
+
240
+
241
def run_all(
    layout: WorkspaceLayout,
    *,
    use_vlm: bool = True,
    only: str | None = None,
    max_images: int | None = None,
    continue_on_error: bool = False,
    log_path: Path | None = None,
    write_corpus: bool = True,
    vlm_provider: str | None = None,
    vlm_model: str | None = None,
    pdf_backend: str = "builtin",
    ocr: str = "off",
    extract_tables: bool = False,
) -> list[dict[str, Any]]:
    """Parse every supported file directly inside ``layout.raw_dir``.

    Only the top level of ``raw_dir`` is scanned (no recursion). Entries are
    kept when they are regular files, or symlinks that resolve to one, and
    their lower-cased suffix is in ``SUPPORTED_EXTENSIONS``. Dangling symlinks
    and symlinks to directories are skipped. Files are processed in sorted
    path order.

    Args:
        layout: Workspace whose ``raw_dir`` is scanned; ``ensure()`` is
            called on it first.
        use_vlm: Build an image captioner and pass it to the parsers.
        only: Case-insensitive substring a file name must contain.
        max_images: Cap on non-cached captioner calls for the whole run.
        continue_on_error: Keep going after a file fails instead of
            re-raising its exception.
        log_path: Captioner call log; defaults to
            ``layout.cache_dir / "vlm_calls.log"``.
        write_corpus: Write the corpus index files after parsing.
        vlm_provider: Captioner backend selector.
        vlm_model: Captioner model name.
        pdf_backend: Forwarded to ``parse_path`` as ``backend``.
        ocr: Forwarded to ``parse_path`` as ``ocr``.
        extract_tables: Forwarded to ``parse_path`` as ``extract_tables``.

    Returns:
        The payloads of the files that parsed successfully. When no file
        matches, an empty list is returned and no index is written.

    Side effects:
        Prints one status line per file, plus a traceback on failure. If
        ``write_corpus`` is true, writes ``CORPUS.md`` into the parent
        directory of ``layout.parsed_dir`` and ``corpus.json`` into
        ``layout.parsed_dir``.
    """
    layout.ensure()
    # is_file() already follows symlinks, so a valid link to a regular file is
    # kept while dangling links and links to directories (which no parser
    # could open) are filtered out here.
    files = sorted(
        p
        for p in layout.raw_dir.iterdir()
        if p.is_file() and p.suffix.lower() in SUPPORTED_EXTENSIONS
    )
    if only:
        needle = only.lower()
        files = [p for p in files if needle in p.name.lower()]
    if not files:
        return []

    captioner = make_captioner(
        enabled=use_vlm,
        layout=layout,
        max_calls=max_images,
        log_path=log_path or (layout.cache_dir / "vlm_calls.log"),
        provider=vlm_provider,
        model=vlm_model,
    )

    # Passed for every file; parse_path forwards them to the PDF parser only.
    pdf_kwargs: dict[str, Any] = {
        "backend": pdf_backend,
        "ocr": ocr,
        "extract_tables": extract_tables,
    }

    parsed_runs: list[dict[str, Any]] = []
    for p in files:
        try:
            t0 = time.perf_counter()
            payload = parse_path(p, layout, captioner=captioner, **pdf_kwargs)
            elapsed = time.perf_counter() - t0
            print(f"[docparser] {p.name} ok ({elapsed:.1f}s) stats={payload.get('stats', {})}")
            parsed_runs.append(payload)
        except Exception as exc:
            # Top-level boundary: report the failure, then either abort the
            # run or move on to the next file.
            print(f"[docparser] {p.name} FAILED: {exc}")
            traceback.print_exc()
            if not continue_on_error:
                raise

    if write_corpus:
        md, corpus = build_corpus_index(parsed_runs, layout)
        write_text(layout.parsed_dir.parent / "CORPUS.md", md)
        write_json(layout.parsed_dir / "corpus.json", corpus)

    return parsed_runs