rc-docparser 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docparser/__init__.py ADDED
@@ -0,0 +1,87 @@
1
+ """docparser - convert research literature into structured Markdown + JSON corpora.
2
+
3
+ Public API
4
+ ----------
5
+ ``WorkspaceLayout``
6
+ Dataclass describing where raw / parsed / asset / cache files live. All
7
+ parsers accept a layout instead of relying on global paths, so the library
8
+ can be embedded in any project.
9
+ ``parse_docx``, ``parse_xlsx``, ``parse_pdf``, ``parse_html``, ``parse_pptx``, ``parse_epub``, ``parse_text``, ``parse_csv``
10
+ Per-format parsers. Each returns the structured payload and (by default)
11
+ writes ``document.md`` + ``document.json`` plus extracted assets into the
12
+ layout's ``parsed_dir_for(source)`` and ``assets_dir_for(source)``.
13
+ ``parse_path``
14
+ Dispatcher that picks a parser by file extension.
15
+ ``run_all``
16
+ Walk a directory and parse every supported file under it.
17
+ ``caption_image``
18
+ OpenRouter VLM helper, used by all parsers when a captioner is supplied.
19
+ """
20
+ from __future__ import annotations
21
+
22
+ __version__ = "0.2.0"
23
+
24
+ from .common import (
25
+ WorkspaceLayout,
26
+ bytes_sha1,
27
+ file_sha1,
28
+ slugify,
29
+ truncate,
30
+ utc_now_iso,
31
+ write_json,
32
+ write_text,
33
+ )
34
+ from .csvtab import parse_csv
35
+ from .docx import parse_docx
36
+ from .orchestrator import SUPPORTED_EXTENSIONS, parse_path, run_all
37
+ from .text import parse_text
38
+ from .xlsx import parse_xlsx
39
+
40
+ __all__ = [
41
+ "SUPPORTED_EXTENSIONS",
42
+ "WorkspaceLayout",
43
+ "__version__",
44
+ "bytes_sha1",
45
+ "caption_image",
46
+ "file_sha1",
47
+ "parse_csv",
48
+ "parse_docx",
49
+ "parse_epub",
50
+ "parse_html",
51
+ "parse_path",
52
+ "parse_pdf",
53
+ "parse_pptx",
54
+ "parse_text",
55
+ "parse_xlsx",
56
+ "run_all",
57
+ "slugify",
58
+ "truncate",
59
+ "utc_now_iso",
60
+ "write_json",
61
+ "write_text",
62
+ ]
63
+
64
+
65
def __getattr__(name: str):
    """Lazily resolve the optional-extra entry points (module-level ``__getattr__``).

    Only ``caption_image``, ``parse_pdf``, ``parse_html``, ``parse_pptx`` and
    ``parse_epub`` are resolved here; each lives in a sibling submodule that is
    imported on first access, so a missing optional dependency only fails when
    the corresponding feature is actually requested.

    The resolved object is stored in the module globals, so later lookups of
    the same name no longer go through this hook.

    Raises
    ------
    AttributeError
        If ``name`` is not one of the lazily provided attributes.
    ImportError
        Propagated unchanged if importing the backing submodule fails.
    """
    import importlib

    # Public name -> relative submodule that defines an attribute of that name.
    lazy_modules = {
        "caption_image": ".image",
        "parse_pdf": ".pdf",
        "parse_html": ".html",
        "parse_pptx": ".pptx",
        "parse_epub": ".epub",
    }
    submodule = lazy_modules.get(name)
    if submodule is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    value = getattr(importlib.import_module(submodule, __name__), name)
    # Cache so the import machinery is consulted only once per name.
    globals()[name] = value
    return value
docparser/cli.py ADDED
@@ -0,0 +1,209 @@
1
+ """docparser command-line interface.
2
+
3
+ Subcommands
4
+ -----------
5
+ - ``docparser parse <FILE>`` parse a single file
6
+ - ``docparser parse-all <DIR>`` walk a directory and parse everything
7
+ - ``docparser version`` print package version
8
+
9
+ The CLI uses argparse only (no extra deps) so it works whether or not the
10
+ optional extras are installed.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import sys
16
+ from pathlib import Path
17
+
18
+ from . import __version__
19
+ from .common import WorkspaceLayout
20
+ from .orchestrator import SUPPORTED_EXTENSIONS, parse_path, run_all
21
+
22
+
23
def _layout_from_args(args: argparse.Namespace) -> WorkspaceLayout:
    """Build the ``WorkspaceLayout`` described by the parsed CLI options.

    ``--workspace`` (when given) selects a layout rooted under that directory;
    otherwise the default layout is used. Any of the per-directory overrides
    that are present and non-empty then replace the matching attribute.
    """
    workspace = args.workspace
    layout = WorkspaceLayout.under(workspace) if workspace else WorkspaceLayout()
    for attr in ("raw_dir", "parsed_dir", "assets_dir", "cache_dir"):
        # getattr with a default: a namespace may not define every override.
        override = getattr(args, attr, None)
        if override:
            setattr(layout, attr, Path(override))
    return layout
37
+
38
+
39
def _add_layout_args(p: argparse.ArgumentParser) -> None:
    """Register the workspace / directory override options on ``p``."""
    options = (
        ("--workspace", "Convenience: rooted layout under this dir."),
        ("--raw-dir", "Override raw_dir (defaults to data/raw)."),
        ("--parsed-dir", "Override parsed_dir (defaults to data/parsed)."),
        ("--assets-dir", "Override assets_dir (defaults to data/assets)."),
        ("--cache-dir", "Override cache_dir (defaults to .cache)."),
    )
    for flag, help_text in options:
        p.add_argument(flag, help=help_text)
45
+
46
+
47
def _add_vlm_args(p: argparse.ArgumentParser) -> None:
    """Register the image-captioning (VLM) options on ``p``."""
    add = p.add_argument
    add(
        "--no-vlm",
        action="store_true",
        help="Skip vision-language captioning of extracted images.",
    )
    add(
        "--max-images",
        type=int,
        default=None,
        help="Cap total VLM calls per run (cached calls don't count).",
    )
    provider_help = (
        "Captioning backend: openrouter (default), openai, gemini, local, "
        "or transformers (fully local)."
    )
    add("--vlm-provider", default=None, help=provider_help)
    add(
        "--vlm-model",
        default=None,
        help="Override the captioning model for the chosen provider.",
    )
72
+
73
+
74
def _add_pdf_args(p: argparse.ArgumentParser) -> None:
    """Register the PDF-specific options (backend, OCR mode, table extraction) on ``p``."""
    add = p.add_argument
    backends = ["builtin", "pymupdf4llm", "docling", "marker"]
    ocr_modes = ["off", "auto", "force"]
    add(
        "--pdf-backend",
        default="builtin",
        choices=backends,
        help="High-fidelity PDF conversion backend (default: builtin).",
    )
    add(
        "--ocr",
        default="off",
        choices=ocr_modes,
        help="OCR scanned/low-text PDF pages (requires the [ocr] extra).",
    )
    add(
        "--pdf-tables",
        action="store_true",
        help="Extract PDF tables with pdfplumber (requires the [tables] extra).",
    )
92
+
93
+
94
def cmd_parse(args: argparse.Namespace) -> int:
    """Handle ``docparser parse <FILE>``.

    Returns exit status 2 (after printing to stderr) when the source path does
    not exist or its lower-cased extension is not in ``SUPPORTED_EXTENSIONS``;
    otherwise parses the file, prints a short summary and returns 0.
    """
    source_path = Path(args.source)
    if not source_path.exists():
        print(f"error: {source_path} does not exist", file=sys.stderr)
        return 2

    extension = source_path.suffix.lower()
    if extension not in SUPPORTED_EXTENSIONS:
        supported = sorted(SUPPORTED_EXTENSIONS)
        print(
            f"error: unsupported extension {extension!r}; supported: {supported}",
            file=sys.stderr,
        )
        return 2

    layout = _layout_from_args(args)
    layout.ensure()

    if args.no_vlm:
        captioner = None
    else:
        # Imported only when captioning is requested.
        from .orchestrator import make_captioner

        captioner = make_captioner(
            enabled=True,
            layout=layout,
            max_calls=args.max_images,
            provider=args.vlm_provider,
            model=args.vlm_model,
        )

    # The PDF options are forwarded only for .pdf sources.
    if extension == ".pdf":
        extra_kwargs: dict = {
            "backend": args.pdf_backend,
            "ocr": args.ocr,
            "extract_tables": args.pdf_tables,
        }
    else:
        extra_kwargs = {}

    payload = parse_path(source_path, layout, captioner=captioner, **extra_kwargs)
    stats = payload.get("stats", {})
    print(f"[docparser] {source_path.name} stats={stats}")
    markdown_path = layout.parsed_dir_for(source_path) / "document.md"
    print(f"[docparser] wrote {markdown_path}")
    return 0
132
+
133
+
134
def cmd_parse_all(args: argparse.Namespace) -> int:
    """Handle ``docparser parse-all [DIR]``.

    The positional directory, when given, replaces the layout's ``raw_dir``.
    Returns 2 if that directory does not exist, otherwise runs ``run_all`` with
    the CLI options, prints how many sources were parsed and returns 0.
    """
    layout = _layout_from_args(args)
    directory = args.directory
    if directory:
        layout.raw_dir = Path(directory)

    raw_dir = layout.raw_dir
    if not raw_dir.exists():
        print(f"error: {raw_dir} does not exist", file=sys.stderr)
        return 2

    options = {
        "use_vlm": not args.no_vlm,
        "only": args.only,
        "max_images": args.max_images,
        "continue_on_error": args.continue_on_error,
        "vlm_provider": args.vlm_provider,
        "vlm_model": args.vlm_model,
        "pdf_backend": args.pdf_backend,
        "ocr": args.ocr,
        "extract_tables": args.pdf_tables,
    }
    runs = run_all(layout, **options)
    print(f"[docparser] parsed {len(runs)} sources")
    return 0
155
+
156
+
157
def cmd_version(_args: argparse.Namespace) -> int:
    """Handle ``docparser version``: print the package version, return 0."""
    exit_status = 0
    print(__version__)
    return exit_status
160
+
161
+
162
def build_parser() -> argparse.ArgumentParser:
    """Construct the ``docparser`` argument parser with its three subcommands.

    ``parse`` and ``parse-all`` share the layout, VLM and PDF option groups;
    each subcommand stores its handler in ``func`` via ``set_defaults``.
    """
    root = argparse.ArgumentParser(
        prog="docparser",
        description=(
            "Convert .docx/.xlsx/.pdf/.html research literature into "
            "Markdown + JSON corpora."
        ),
    )
    commands = root.add_subparsers(dest="cmd", required=True)
    shared_groups = (_add_layout_args, _add_vlm_args, _add_pdf_args)

    # parse: one file
    single = commands.add_parser("parse", help="Parse a single file.")
    single.add_argument(
        "source",
        help="Path to a supported file (.docx/.xlsx/.pdf/.html/.pptx/.epub/.txt/.md/.csv/.tsv).",
    )
    for register in shared_groups:
        register(single)
    single.set_defaults(func=cmd_parse)

    # parse-all: a whole directory
    batch = commands.add_parser(
        "parse-all", help="Walk a directory and parse every supported file."
    )
    batch.add_argument(
        "directory",
        nargs="?",
        default=None,
        help="Directory to walk. Defaults to the layout's raw_dir.",
    )
    batch.add_argument("--only", default=None, help="Substring filter on filename.")
    batch.add_argument("--continue-on-error", action="store_true")
    for register in shared_groups:
        register(batch)
    batch.set_defaults(func=cmd_parse_all)

    # version
    version = commands.add_parser("version", help="Print package version.")
    version.set_defaults(func=cmd_version)

    return root
200
+
201
+
202
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse ``argv`` and dispatch to the chosen subcommand.

    ``argv=None`` lets argparse read ``sys.argv``. Returns the handler's exit
    status.
    """
    namespace = build_parser().parse_args(argv)
    handler = namespace.func
    return handler(namespace)
206
+
207
+
208
+ if __name__ == "__main__":
209
+ raise SystemExit(main())
docparser/common.py ADDED
@@ -0,0 +1,163 @@
1
+ """Shared utilities and the ``WorkspaceLayout`` dataclass.
2
+
3
+ The original codebase pinned paths to ``REPO_ROOT``. For a library that's the
4
+ wrong shape: callers want to choose where parsed output lands, where extracted
5
+ assets go, and where the VLM cache lives. ``WorkspaceLayout`` captures those
6
+ four directories explicitly and every parser accepts one.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import datetime as _dt
11
+ import hashlib
12
+ import json
13
+ import os
14
+ import re
15
+ import unicodedata
16
+ from collections.abc import Iterable
17
+ from dataclasses import dataclass, field
18
+ from pathlib import Path
19
+ from typing import Any
20
+
21
+ # ---------------------------------------------------------------------------
22
+ # slug / hash helpers (pure)
23
+
24
+
25
# Any run of characters outside [a-z0-9] collapses to a single separator.
_slug_re = re.compile(r"[^a-z0-9]+")


def slugify(text: str, *, max_len: int = 80) -> str:
    """Return an ASCII slug suitable for filenames and IDs.

    The text is NFKD-normalised, stripped of non-ASCII characters, lower-cased,
    and every run of non-alphanumerics becomes a single ``-``. The result is
    cut to ``max_len`` characters and never starts or ends with ``-``.

    Returns ``""`` for ``None`` and ``"untitled"`` when nothing survives the
    normalisation (e.g. empty or fully non-ASCII input).
    """
    if text is None:
        return ""
    norm = unicodedata.normalize("NFKD", str(text))
    norm = norm.encode("ascii", "ignore").decode("ascii").lower()
    norm = _slug_re.sub("-", norm).strip("-")
    # Truncation can land right after a separator ("abc-def"[:4] == "abc-");
    # strip again so the slug never ends with a dangling hyphen.
    return norm[:max_len].rstrip("-") or "untitled"
36
+
37
+
38
def file_sha1(path: Path, *, chunk: int = 1 << 20) -> str:
    """Return the hex SHA-1 of a file's contents, read ``chunk`` bytes at a time."""
    digest = hashlib.sha1()
    with Path(path).open("rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for piece in iter(lambda: stream.read(chunk), b""):
            digest.update(piece)
    return digest.hexdigest()
47
+
48
+
49
def bytes_sha1(data: bytes) -> str:
    """Return the hex SHA-1 digest of an in-memory byte string."""
    digest = hashlib.sha1()
    digest.update(data)
    return digest.hexdigest()
51
+
52
+
53
def utc_now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    now = _dt.datetime.now(tz=_dt.timezone.utc)
    return now.strftime("%Y-%m-%dT%H:%M:%SZ")
55
+
56
+
57
def _json_default(obj: Any) -> Any:
    """Fallback serialiser for values ``json`` cannot encode natively.

    - ``datetime`` / ``date`` / ``time`` (and anything else exposing
      ``isoformat``) become their ISO-8601 string;
    - ``bytes`` are decoded as UTF-8 with undecodable bytes replaced;
    - ``set`` / ``frozenset`` become a sorted list, so output is deterministic;
    - everything else falls back to ``str(obj)``.
    """
    if isinstance(obj, (_dt.datetime, _dt.date, _dt.time)):
        return obj.isoformat()
    if isinstance(obj, bytes):
        return obj.decode("utf-8", "replace")
    if isinstance(obj, (set, frozenset)):
        try:
            return sorted(obj)
        except TypeError:
            # Mixed-type members (e.g. {1, "a"}) are not mutually orderable;
            # order by repr so serialisation still succeeds deterministically.
            return sorted(obj, key=repr)
    if hasattr(obj, "isoformat"):
        return obj.isoformat()
    return str(obj)
67
+
68
+
69
def write_json(path: Path, payload: Any) -> None:
    """Write ``payload`` as indented UTF-8 JSON to ``path``, creating parent dirs.

    Non-ASCII characters are written as-is, values the encoder cannot handle go
    through ``_json_default``, and the file ends with a newline.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w", encoding="utf-8") as out:
        json.dump(
            payload,
            out,
            indent=2,
            ensure_ascii=False,
            default=_json_default,
        )
        out.write("\n")
74
+
75
+
76
def write_text(path: Path, text: str) -> None:
    """Write ``text`` to ``path`` as UTF-8, creating parent directories first."""
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w", encoding="utf-8") as out:
        out.write(text)
80
+
81
+
82
def chunked(seq: Iterable[Any], n: int) -> Iterable[list[Any]]:
    """Yield consecutive lists of up to ``n`` items from ``seq``.

    Every yielded list is a fresh object; the last one may be shorter than
    ``n``. Nothing is yielded for an empty ``seq``.
    """
    batch: list[Any] = []
    for element in seq:
        batch.append(element)
        if len(batch) < n:
            continue
        yield batch
        batch = []
    if batch:
        yield batch
91
+
92
+
93
def truncate(text: str, max_chars: int = 240) -> str:
    """Collapse whitespace in ``text`` and cap it at ``max_chars`` characters.

    Runs of whitespace (including newlines) become single spaces. When the
    collapsed text is longer than ``max_chars`` it is cut and the final kept
    character is replaced by an ellipsis (U+2026), so the result is exactly
    ``max_chars`` long. Falsy input returns ``""``; a non-positive
    ``max_chars`` also returns ``""``.
    """
    if not text:
        return ""
    text = " ".join(text.split())
    if len(text) <= max_chars:
        return text
    if max_chars <= 0:
        # A negative slice bound would otherwise keep almost the whole string.
        return ""
    return text[: max_chars - 1] + "\u2026"
98
+
99
+
100
+ # ---------------------------------------------------------------------------
101
+ # WorkspaceLayout
102
+
103
+
104
@dataclass
class WorkspaceLayout:
    """Directory plan for parser output.

    Parameters
    ----------
    raw_dir : Path
        Where original source files live (or symlinks to them). Walked by
        :func:`docparser.run_all`.
    parsed_dir : Path
        Where ``document.md`` and ``document.json`` are written (one folder
        per source).
    assets_dir : Path
        Where extracted images go (one folder per source).
    cache_dir : Path
        On-disk cache for VLM responses (keyed by image SHA-1) and any other
        caches the parsers want to reuse across runs.

    Notes
    -----
    Per-source folders are named after the slugified file *stem*, so two
    sources that differ only by extension (``report.pdf`` / ``report.docx``)
    map to the same folders.
    """

    raw_dir: Path = field(default_factory=lambda: Path("data/raw"))
    parsed_dir: Path = field(default_factory=lambda: Path("data/parsed"))
    assets_dir: Path = field(default_factory=lambda: Path("data/assets"))
    cache_dir: Path = field(default_factory=lambda: Path(".cache"))

    def __post_init__(self) -> None:
        # Callers may pass plain strings; normalise every directory to Path.
        self.raw_dir = Path(self.raw_dir)
        self.parsed_dir = Path(self.parsed_dir)
        self.assets_dir = Path(self.assets_dir)
        self.cache_dir = Path(self.cache_dir)

    # convenience -----------------------------------------------------------

    @classmethod
    def under(cls, root: Path | str) -> WorkspaceLayout:
        """Build a default layout rooted at ``root``."""
        root = Path(root)
        return cls(
            raw_dir=root / "data" / "raw",
            parsed_dir=root / "data" / "parsed",
            assets_dir=root / "data" / "assets",
            cache_dir=root / ".cache",
        )

    def ensure(self) -> None:
        """Create all four directories (and their parents) if they are missing."""
        for d in (self.raw_dir, self.parsed_dir, self.assets_dir, self.cache_dir):
            d.mkdir(parents=True, exist_ok=True)

    def parsed_dir_for(self, source: Path | str) -> Path:
        """Return the folder under ``parsed_dir`` that holds ``source``'s output."""
        return self.parsed_dir / slugify(Path(source).stem)

    def assets_dir_for(self, source: Path | str) -> Path:
        """Return the folder under ``assets_dir`` that holds ``source``'s assets."""
        return self.assets_dir / slugify(Path(source).stem)

    def relpath_from_parsed(self, target: Path | str, source: Path | str) -> str:
        """Markdown-friendly relative path from a source's parsed dir to ``target``.

        The result always uses forward slashes, so it can be embedded in a
        Markdown link or image reference regardless of the host OS.
        """
        base = self.parsed_dir_for(source)
        try:
            # Fast path: target lives inside the parsed dir itself.
            rel = Path(target).resolve().relative_to(base.resolve())
        except ValueError:
            # Otherwise walk up with ".." components.
            rel = Path(os.path.relpath(target, base))
        return rel.as_posix()
docparser/csvtab.py ADDED
@@ -0,0 +1,131 @@
1
+ """CSV / TSV parser (core, stdlib ``csv``).
2
+
3
+ Sniffs the delimiter, renders a Markdown table, and stores every row as JSON.
4
+ The first row is treated as a header when it looks like one (every cell is
5
+ non-empty and none of them parses as a number); otherwise synthetic ``col1..colN`` headers are used.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import csv
10
+ from collections.abc import Callable
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+ from .common import (
15
+ WorkspaceLayout,
16
+ file_sha1,
17
+ truncate,
18
+ utc_now_iso,
19
+ write_json,
20
+ write_text,
21
+ )
22
+
23
+
24
def _sniff_delimiter(sample: str, suffix: str) -> str:
    """Pick the field delimiter for a CSV/TSV file.

    ``.tsv`` files always use a tab. For anything else ``csv.Sniffer`` chooses
    among ``,``, tab, ``;`` and ``|`` based on ``sample``; if it cannot decide,
    a comma is assumed.
    """
    if suffix == ".tsv":
        return "\t"
    try:
        return csv.Sniffer().sniff(sample, delimiters=",\t;|").delimiter
    except csv.Error:
        # .tsv was handled above, so comma is the only sensible fallback here.
        return ","
32
+
33
+
34
def _looks_like_header(row: list[str]) -> bool:
    """Return True when ``row`` is plausibly a header row.

    A header row is non-empty, has no blank cells, and contains no cell that
    parses as a number (thousands separators ``,`` are ignored when parsing).
    """

    def _is_number(cell: str) -> bool:
        try:
            float(cell.replace(",", ""))
        except ValueError:
            return False
        return True

    if not row:
        return False
    if not all(cell.strip() for cell in row):
        return False
    return not any(_is_number(cell) for cell in row)
44
+
45
+
46
def _md_cell(v: str) -> str:
    """Render a value as a single Markdown table cell (at most 400 characters).

    Pipes are backslash-escaped, carriage returns become spaces and newlines
    become ``<br>`` before the text is passed to ``truncate``.
    """
    escaped = str(v).replace("|", "\\|")
    single_line = escaped.replace("\r", " ").replace("\n", " <br> ")
    return truncate(single_line, 400)
48
+
49
+
50
def parse_csv(
    source: Path | str,
    layout: WorkspaceLayout | None = None,
    *,
    captioner: Callable[..., dict[str, Any]] | None = None,
    write_outputs: bool = True,
) -> dict[str, Any]:
    """Parse a ``.csv`` / ``.tsv`` file into Markdown + JSON.

    Parameters
    ----------
    source
        Path to the delimited text file. It is decoded as UTF-8 (a leading
        BOM is dropped, undecodable bytes are replaced).
    layout
        Output directory plan; a default ``WorkspaceLayout()`` is used when
        omitted.
    captioner
        Ignored by this parser (a CSV has no images).
    write_outputs
        When true, ``document.md`` and ``document.json`` are written into
        ``layout.parsed_dir_for(source)``.

    Returns
    -------
    dict
        The JSON payload: source metadata, delimiter, header, one record per
        body row (keyed by header name) and summary ``stats``.
    """
    import io

    # NOTE(review): presumably accepted so a dispatcher can pass the same
    # keyword arguments to every parser - confirm against the orchestrator.
    _ = captioner
    source = Path(source)
    layout = layout or WorkspaceLayout()
    real_source = source.resolve()
    suffix = source.suffix.lower()

    # utf-8-sig behaves like utf-8 but drops a leading BOM, which would
    # otherwise end up glued to the first header cell.
    text = real_source.read_text(encoding="utf-8-sig", errors="replace")
    delimiter = _sniff_delimiter(text[:4096], suffix)
    # Feed the reader a newline-preserving stream: str.splitlines() would drop
    # the newlines inside quoted multi-line fields and also split on exotic
    # separators (form feed, U+2028, ...) that are ordinary data in CSV.
    reader = csv.reader(io.StringIO(text, newline=""), delimiter=delimiter)
    rows: list[list[str]] = [list(r) for r in reader]

    out_dir = layout.parsed_dir_for(source)
    if write_outputs:
        out_dir.mkdir(parents=True, exist_ok=True)

    # Rows may be ragged; every row is padded to the widest one.
    ncols = max((len(r) for r in rows), default=0)
    has_header = bool(rows) and _looks_like_header(rows[0])
    if has_header:
        header = [c.strip() for c in rows[0]] + [""] * (ncols - len(rows[0]))
        body = rows[1:]
    else:
        header = [f"col{i + 1}" for i in range(ncols)]
        body = rows

    padded_body = [list(r) + [""] * (ncols - len(r)) for r in body]
    # Header cells added by padding are empty; give those a synthetic key.
    keys = [name or f"col{i + 1}" for i, name in enumerate(header)]
    records: list[dict[str, str]] = [dict(zip(keys, padded)) for padded in padded_body]

    # Hash and timestamp once so the Markdown banner and the JSON agree.
    sha1 = file_sha1(real_source)
    parsed_at = utc_now_iso()

    md_lines = [
        f"# {source.stem}",
        "",
        f"> Source: `{source.name}` \u00b7 sha1 `{sha1[:12]}` "
        f"\u00b7 parsed `{parsed_at}` \u00b7 {len(rows)} rows \u00d7 {ncols} cols",
        "",
    ]
    if ncols:
        md_lines.append("| " + " | ".join(_md_cell(h) for h in header) + " |")
        md_lines.append("| " + " | ".join(["---"] * ncols) + " |")
        md_lines.extend(
            "| " + " | ".join(_md_cell(c) for c in padded) + " |"
            for padded in padded_body
        )
        md_lines.append("")
    else:
        md_lines.append("_(empty file)_")
        md_lines.append("")

    md_text = "\n".join(md_lines).rstrip() + "\n"

    json_payload = {
        "source": {
            "filename": source.name,
            "absolute_path": str(real_source),
            "sha1": sha1,
            "size_bytes": real_source.stat().st_size,
            "kind": "csv",
        },
        "parsed_at": parsed_at,
        "delimiter": delimiter,
        "has_header": has_header,
        "header": header,
        "rows": records,
        "stats": {
            "n_rows": len(body),
            "n_cols": ncols,
            "has_header": has_header,
        },
    }

    if write_outputs:
        write_text(out_dir / "document.md", md_text)
        write_json(out_dir / "document.json", json_payload)

    return json_payload