rc-docparser 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docparser/__init__.py +87 -0
- docparser/cli.py +209 -0
- docparser/common.py +163 -0
- docparser/csvtab.py +131 -0
- docparser/docx.py +488 -0
- docparser/epub.py +349 -0
- docparser/html.py +322 -0
- docparser/image.py +343 -0
- docparser/localvlm.py +103 -0
- docparser/ocr.py +68 -0
- docparser/orchestrator.py +304 -0
- docparser/pdf.py +430 -0
- docparser/pdf_backends.py +89 -0
- docparser/pptx.py +332 -0
- docparser/py.typed +0 -0
- docparser/text.py +189 -0
- docparser/xlsx.py +319 -0
- rc_docparser-0.2.0.dist-info/METADATA +344 -0
- rc_docparser-0.2.0.dist-info/RECORD +22 -0
- rc_docparser-0.2.0.dist-info/WHEEL +4 -0
- rc_docparser-0.2.0.dist-info/entry_points.txt +2 -0
- rc_docparser-0.2.0.dist-info/licenses/LICENSE +21 -0
docparser/epub.py
ADDED
|
@@ -0,0 +1,349 @@
|
|
|
1
|
+
"""EPUB parser.
|
|
2
|
+
|
|
3
|
+
Walks the EPUB spine in reading order, runs a BeautifulSoup structural walk
|
|
4
|
+
over each chapter (headings / paragraphs / lists / tables / images), and
|
|
5
|
+
extracts embedded images to the asset directory. Embedded images can be
|
|
6
|
+
captioned via a ``captioner`` callable.
|
|
7
|
+
|
|
8
|
+
Requires the ``[epub]`` extra: ``pip install 'rc-docparser[epub]'`` (which also
|
|
9
|
+
pulls in BeautifulSoup from the ``[html]`` extra).
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import re
|
|
14
|
+
from collections.abc import Callable
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
from .common import (
|
|
19
|
+
WorkspaceLayout,
|
|
20
|
+
bytes_sha1,
|
|
21
|
+
file_sha1,
|
|
22
|
+
truncate,
|
|
23
|
+
utc_now_iso,
|
|
24
|
+
write_json,
|
|
25
|
+
write_text,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
_HEADING_RE = re.compile(r"^h([1-6])$", re.IGNORECASE)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _import_deps():
|
|
32
|
+
try:
|
|
33
|
+
import ebooklib # type: ignore
|
|
34
|
+
from bs4 import BeautifulSoup # type: ignore
|
|
35
|
+
from ebooklib import epub # type: ignore
|
|
36
|
+
except ImportError as exc: # pragma: no cover - optional dep guard
|
|
37
|
+
raise ImportError(
|
|
38
|
+
"docparser.epub.parse_epub requires the [epub] extra. "
|
|
39
|
+
"Install with: pip install 'docparser[epub]'"
|
|
40
|
+
) from exc
|
|
41
|
+
return ebooklib, epub, BeautifulSoup
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _ext_for(media_type: str, name: str) -> str:
|
|
45
|
+
mapping = {
|
|
46
|
+
"image/png": "png",
|
|
47
|
+
"image/jpeg": "jpg",
|
|
48
|
+
"image/gif": "gif",
|
|
49
|
+
"image/svg+xml": "svg",
|
|
50
|
+
"image/webp": "webp",
|
|
51
|
+
}
|
|
52
|
+
if media_type in mapping:
|
|
53
|
+
return mapping[media_type]
|
|
54
|
+
return Path(name or "").suffix.lstrip(".").lower() or "png"
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def parse_epub(
    source: Path | str,
    layout: WorkspaceLayout | None = None,
    *,
    captioner: Callable[..., dict[str, Any]] | None = None,
    write_outputs: bool = True,
) -> dict[str, Any]:
    """Parse an EPUB into Markdown + JSON + images.

    The spine documents are walked in reading order and turned into typed
    blocks (heading / paragraph / list / table / image). Embedded images are
    extracted up front and matched to ``<img>`` / ``<image>`` references by
    base file name.

    Parameters
    ----------
    source : Path | str
        Path to the ``.epub`` file.
    layout : WorkspaceLayout | None
        Supplies the output and asset directories; a default
        ``WorkspaceLayout()`` is created when omitted.
    captioner : Callable[..., dict[str, Any]] | None
        Optional callable invoked once per *referenced* image with the keyword
        arguments ``image_bytes``, ``mime``, ``doc_name``, ``nearby_caption``
        and ``context``. The keys ``caption``, ``description`` and ``tags`` of
        its result are used in the Markdown output. An exception raised by the
        captioner is recorded as ``{"error": str(exc)}`` and does not abort
        the parse.
    write_outputs : bool
        When True (default), image assets are written to the asset directory
        and ``document.md`` / ``document.json`` to the parsed directory.

    Returns
    -------
    dict[str, Any]
        The JSON payload with the keys ``source``, ``parsed_at``,
        ``metadata``, ``blocks``, ``images`` and ``stats``.

    Raises
    ------
    ImportError
        If the optional EPUB dependencies are not installed.
    """
    # Local import: only needed for href matching in ``_resolve_image``.
    from urllib.parse import unquote

    ebooklib, epub, BeautifulSoup = _import_deps()

    source = Path(source)
    layout = layout or WorkspaceLayout()
    real_source = source.resolve()
    book = epub.read_epub(str(real_source))

    out_dir = layout.parsed_dir_for(source)
    asset_dir = layout.assets_dir_for(source)
    if write_outputs:
        out_dir.mkdir(parents=True, exist_ok=True)
        asset_dir.mkdir(parents=True, exist_ok=True)

    # Hash the source and take the timestamp once so that document.md and
    # document.json agree (and the file is only read once for hashing).
    source_sha1 = file_sha1(real_source)
    parsed_at = utc_now_iso()

    # ---- extract embedded images up front, keyed by (normalized) href -----
    image_by_name: dict[str, dict[str, Any]] = {}
    images: list[dict[str, Any]] = []
    image_caption_results: dict[str, dict[str, Any]] = {}
    image_seq = 0
    for item in book.get_items_of_type(ebooklib.ITEM_IMAGE):
        blob = item.get_content()
        if not blob:
            continue
        image_seq += 1
        sha = bytes_sha1(blob)
        media_type = getattr(item, "media_type", "") or ""
        ext = _ext_for(media_type, item.get_name())
        asset_path = asset_dir / f"img-{image_seq:03d}-{sha[:10]}.{ext}"
        if write_outputs and not asset_path.exists():
            asset_path.write_bytes(blob)
        rec = {
            "seq": image_seq,
            "sha1": sha,
            "ext": ext,
            "name": item.get_name(),
            "asset_path": layout.relpath_from_parsed(asset_path, source),
            "blob": blob,  # kept in memory for the captioner; dropped from JSON
            "mime": media_type or f"image/{ext}",
        }
        # NOTE(review): keyed by base name only, so two images with the same
        # file name in different folders overwrite each other (last one wins).
        image_by_name[Path(item.get_name()).name.lower()] = rec
        images.append(rec)

    def _resolve_image(src: str) -> dict[str, Any] | None:
        """Return the extracted image record referenced by ``src``, if any."""
        if not src:
            return None
        # First try the reference exactly as the original lookup did.
        hit = image_by_name.get(Path(src.split("#")[0]).name.lower())
        if hit is not None:
            return hit
        # References are URLs: drop a ``?query`` and percent-decode
        # (``my%20pic.png`` -> ``my pic.png``) before giving up.
        decoded = unquote(src.split("#", 1)[0].split("?", 1)[0])
        return image_by_name.get(Path(decoded).name.lower())

    # ---- metadata ---------------------------------------------------------
    def _meta(field: str) -> str:
        """Return the first Dublin Core value for ``field`` or ``""``."""
        try:
            vals = book.get_metadata("DC", field)
            if vals:
                value = vals[0][0]
                # An empty metadata element yields None; do not turn that into
                # the literal string "None".
                if value is not None:
                    return str(value).strip()
        except Exception:
            # Metadata is optional: a missing namespace or an unexpected shape
            # falls back to "" rather than aborting the parse.
            pass
        return ""

    title = _meta("title") or source.stem
    author = _meta("creator")
    language = _meta("language")

    # ---- structural walk per spine document -------------------------------
    blocks: list[dict[str, Any]] = []
    section_stack: list[str] = []

    def push_heading(level: int, text: str) -> None:
        """Record a heading and update the running section path."""
        # Close every section at the same or a deeper level first.
        while len(section_stack) >= level:
            section_stack.pop()
        section_stack.append(text)
        blocks.append(
            {"kind": "heading", "level": level, "text": text, "section_path": list(section_stack)}
        )

    items = []
    for sid, _linear in book.spine:
        it = book.get_item_with_id(sid)
        if it is not None and it.get_type() == ebooklib.ITEM_DOCUMENT:
            items.append(it)
    if not items:
        # No usable spine: fall back to every document item in the book.
        items = list(book.get_items_of_type(ebooklib.ITEM_DOCUMENT))

    for doc_item in items:
        content = doc_item.get_content()
        if not content:
            continue
        soup = BeautifulSoup(content, "html.parser")
        for tag in soup(["script", "style"]):
            tag.decompose()
        body = soup.body or soup
        # NOTE(review): every descendant is visited, so a <p> inside an <li>
        # or <td>, or a nested list, is emitted both on its own and as part of
        # its container's text. Left unchanged here - confirm whether the
        # duplication is intended before restructuring the walk.
        for el in body.find_all(True, recursive=True):
            name = (el.name or "").lower()
            m = _HEADING_RE.match(name)
            if m:
                text = el.get_text(" ", strip=True)
                if text:
                    push_heading(int(m.group(1)), text)
                continue
            if name == "p":
                text = el.get_text(" ", strip=True)
                if text:
                    blocks.append(
                        {"kind": "paragraph", "text": text, "section_path": list(section_stack)}
                    )
                continue
            if name in {"ul", "ol"}:
                li_items = [
                    li.get_text(" ", strip=True) for li in el.find_all("li", recursive=False)
                ]
                li_items = [i for i in li_items if i]
                if li_items:
                    blocks.append(
                        {
                            "kind": "list",
                            "ordered": name == "ol",
                            "items": li_items,
                            "section_path": list(section_stack),
                        }
                    )
                continue
            if name == "table":
                rows: list[list[str]] = []
                for tr in el.find_all("tr", recursive=True):
                    cells = [
                        td.get_text(" ", strip=True)
                        for td in tr.find_all(["td", "th"], recursive=False)
                    ]
                    if cells:
                        rows.append(cells)
                if rows:
                    blocks.append(
                        {"kind": "table", "rows": rows, "section_path": list(section_stack)}
                    )
                continue
            if name in {"img", "image"}:
                # <img src>, or SVG <image href / xlink:href>.
                src = el.get("src") or el.get("href") or el.get("xlink:href") or ""
                img_rec = _resolve_image(src)
                blocks.append(
                    {
                        "kind": "image",
                        "src": src,
                        "alt": el.get("alt") or "",
                        "image_seq": img_rec["seq"] if img_rec else None,
                        "asset_path": img_rec["asset_path"] if img_rec else None,
                        "section_path": list(section_stack),
                    }
                )
                continue

    # ---- caption images that were actually referenced ---------------------
    referenced_seqs = {
        b["image_seq"] for b in blocks if b.get("kind") == "image" and b.get("image_seq")
    }
    if captioner is not None:
        # Context for an image = the closest preceding heading / paragraph at
        # its first reference.
        ctx_by_seq: dict[int, str] = {}
        last_text = ""
        for b in blocks:
            if b["kind"] in {"paragraph", "heading"}:
                last_text = b.get("text", "")
            elif b["kind"] == "image" and b.get("image_seq"):
                ctx_by_seq.setdefault(b["image_seq"], last_text)
        for rec in images:
            if rec["seq"] not in referenced_seqs:
                continue
            try:
                cap = captioner(
                    image_bytes=rec["blob"],
                    mime=rec["mime"],
                    doc_name=source.name,
                    nearby_caption="",
                    context=ctx_by_seq.get(rec["seq"], ""),
                )
            except Exception as exc:
                # Best effort: one failing caption must not abort the parse.
                cap = {"error": str(exc)}
            image_caption_results[str(rec["seq"])] = cap or {}

    # ---- markdown rendering -----------------------------------------------
    def _table_row(row: list[str], ncols: int) -> str:
        """Render one table row, padding short rows and escaping cells."""
        cells = [
            # A raw "|" or newline inside a cell would break the Markdown table.
            (row[c] if c < len(row) else "").replace("|", "\\|").replace("\n", " ")
            for c in range(ncols)
        ]
        return "| " + " | ".join(cells) + " |"

    md_lines = [
        f"# {title}",
        "",
        f"> Source: `{source.name}` \u00b7 sha1 `{source_sha1[:12]}` "
        f"\u00b7 parsed `{parsed_at}`",
    ]
    meta_parts = []
    if author:
        meta_parts.append(f"author: {author}")
    if language:
        meta_parts.append(f"language: {language}")
    if meta_parts:
        md_lines.append("> " + " \u00b7 ".join(meta_parts))
    md_lines.append("")

    for b in blocks:
        kind = b["kind"]
        if kind == "heading":
            # Shift by one level: "#" is reserved for the document title.
            md_lines.append(f"{'#' * min(6, b['level'] + 1)} {b['text']}")
            md_lines.append("")
        elif kind == "paragraph":
            md_lines.append(b["text"])
            md_lines.append("")
        elif kind == "list":
            ordered = b.get("ordered")
            for idx, entry in enumerate(b["items"], start=1):
                # Honour the recorded ``ordered`` flag instead of always
                # emitting bullets.
                marker = f"{idx}." if ordered else "-"
                md_lines.append(f"{marker} {entry}")
            md_lines.append("")
        elif kind == "table":
            rows = b["rows"]
            ncols = max(len(r) for r in rows)
            md_lines.append(_table_row(rows[0], ncols))
            md_lines.append("| " + " | ".join(["---"] * ncols) + " |")
            for row in rows[1:]:
                md_lines.append(_table_row(row, ncols))
            md_lines.append("")
        elif kind == "image":
            rel = b.get("asset_path") or b.get("src", "")
            cap = image_caption_results.get(str(b.get("image_seq"))) or {}
            alt = cap.get("caption") or b.get("alt") or "image"
            md_lines.append(f"![{alt}]({rel})")
            if cap.get("description"):
                md_lines.append("")
                md_lines.append(f"> **VLM caption.** {cap.get('caption','')}")
                md_lines.append(">")
                md_lines.append(f"> {cap.get('description','')}")
                if cap.get("tags"):
                    md_lines.append(">")
                    md_lines.append("> *Tags:* " + ", ".join(cap["tags"]))
            md_lines.append("")

    md_text = "\n".join(md_lines).rstrip() + "\n"

    images_json = []
    for rec in images:
        # Raw bytes are not JSON-serialisable and already live on disk.
        d = {k: v for k, v in rec.items() if k != "blob"}
        d["semantic"] = image_caption_results.get(str(rec["seq"]))
        d["referenced"] = rec["seq"] in referenced_seqs
        images_json.append(d)

    json_payload = {
        "source": {
            "filename": source.name,
            "absolute_path": str(real_source),
            "sha1": source_sha1,
            "size_bytes": real_source.stat().st_size,
            "kind": "epub",
        },
        "parsed_at": parsed_at,
        "metadata": {"title": title, "author": author, "language": language},
        "blocks": blocks,
        "images": images_json,
        "stats": {
            "n_blocks": len(blocks),
            "n_chapters": len(items),
            "n_headings": sum(1 for b in blocks if b["kind"] == "heading"),
            "n_paragraphs": sum(1 for b in blocks if b["kind"] == "paragraph"),
            "n_lists": sum(1 for b in blocks if b["kind"] == "list"),
            "n_tables": sum(1 for b in blocks if b["kind"] == "table"),
            "n_images": len(images),
            "n_captioned_images": sum(
                1
                for v in image_caption_results.values()
                if v and not v.get("error") and v.get("caption")
            ),
        },
    }

    _ = truncate  # symmetry with other parsers

    if write_outputs:
        write_text(out_dir / "document.md", md_text)
        write_json(out_dir / "document.json", json_payload)

    return json_payload
|
docparser/html.py
ADDED
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
"""HTML parser.
|
|
2
|
+
|
|
3
|
+
Two-tier strategy:
|
|
4
|
+
|
|
5
|
+
1. **Article extraction** via `trafilatura` for the main body (drops nav,
|
|
6
|
+
sidebars, ads).
|
|
7
|
+
2. **Structural fallback** via BeautifulSoup that walks the DOM and emits
|
|
8
|
+
typed blocks (heading/paragraph/list/table/image) when trafilatura returns
|
|
9
|
+
nothing useful or when the caller wants the full structure.
|
|
10
|
+
|
|
11
|
+
Requires the ``[html]`` extra: ``pip install 'rc-docparser[html]'``.
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import re
|
|
16
|
+
from collections.abc import Callable
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Any
|
|
19
|
+
from urllib.parse import urlparse
|
|
20
|
+
|
|
21
|
+
from .common import (
|
|
22
|
+
WorkspaceLayout,
|
|
23
|
+
file_sha1,
|
|
24
|
+
truncate,
|
|
25
|
+
utc_now_iso,
|
|
26
|
+
write_json,
|
|
27
|
+
write_text,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
_HEADING_RE = re.compile(r"^h([1-6])$", re.IGNORECASE)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _import_deps():
|
|
34
|
+
try:
|
|
35
|
+
import trafilatura # type: ignore
|
|
36
|
+
from bs4 import BeautifulSoup # type: ignore
|
|
37
|
+
except ImportError as exc: # pragma: no cover
|
|
38
|
+
raise ImportError(
|
|
39
|
+
"docparser.html.parse_html requires the [html] extra. "
|
|
40
|
+
"Install with: pip install 'docparser[html]'"
|
|
41
|
+
) from exc
|
|
42
|
+
return trafilatura, BeautifulSoup
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def parse_html(
    source: Path | str,
    layout: WorkspaceLayout | None = None,
    *,
    captioner: Callable[..., dict[str, Any]] | None = None,
    write_outputs: bool = True,
    use_trafilatura: bool = True,
) -> dict[str, Any]:
    """Parse an HTML file (or http(s) URL) into Markdown + JSON.

    Parameters
    ----------
    source : Path | str
        Local path or full URL. A ``str`` starting with ``http://`` or
        ``https://`` is fetched via ``trafilatura.fetch_url``; anything else
        is read from disk as UTF-8 (undecodable bytes are replaced).
    layout : WorkspaceLayout | None
        Supplies the output and asset directories; a default
        ``WorkspaceLayout()`` is created when omitted.
    captioner : Callable[..., dict[str, Any]] | None
        Accepted for signature parity with the other parsers; currently
        unused because HTML images are not fetched.
    write_outputs : bool
        When True (default), ``document.md`` and ``document.json`` are written
        to the parsed directory.
    use_trafilatura : bool
        If True (default), also run trafilatura's article and metadata
        extraction. The structural BS4 walk is always run.

    Returns
    -------
    dict[str, Any]
        The JSON payload with the keys ``source``, ``parsed_at``,
        ``metadata``, ``article_markdown``, ``blocks`` and ``stats``.

    Raises
    ------
    ImportError
        If the optional HTML dependencies are not installed.
    RuntimeError
        If ``source`` is a URL and nothing could be fetched.
    """
    trafilatura, BeautifulSoup = _import_deps()

    is_url = isinstance(source, str) and re.match(r"^https?://", source) is not None
    real_source: Path | None
    if is_url:
        import hashlib

        url = str(source)
        downloaded = trafilatura.fetch_url(url)
        if not downloaded:
            raise RuntimeError(f"Could not fetch URL: {url}")
        html_text = downloaded
        # Build a file-like name from host + path so the layout helpers, which
        # take paths, can derive output locations for a URL.
        parsed_url = urlparse(url)
        host = parsed_url.netloc.replace(":", "_")
        path_part = parsed_url.path.strip("/").replace("/", "_") or "index"
        synthetic_name = f"{host}_{path_part}.html"
        layout = layout or WorkspaceLayout()
        out_dir = layout.parsed_dir / layout.parsed_dir_for(Path(synthetic_name)).name
        asset_dir = layout.assets_dir_for(Path(synthetic_name))
        source_filename = synthetic_name
        absolute_path = url
        # There is no file on disk, so size and hash come from the fetched text.
        encoded = html_text.encode("utf-8")
        size_bytes = len(encoded)
        sha = hashlib.sha1(encoded).hexdigest()
        real_source = None
    else:
        real_source = Path(source).resolve()
        html_text = real_source.read_text(encoding="utf-8", errors="replace")
        layout = layout or WorkspaceLayout()
        out_dir = layout.parsed_dir_for(real_source)
        asset_dir = layout.assets_dir_for(real_source)
        source_filename = real_source.name
        absolute_path = str(real_source)
        size_bytes = real_source.stat().st_size
        sha = file_sha1(real_source)

    if write_outputs:
        out_dir.mkdir(parents=True, exist_ok=True)
        asset_dir.mkdir(parents=True, exist_ok=True)

    # 1. Article-grade markdown via trafilatura --------------------------------
    article_md: str | None = None
    article_meta: dict[str, Any] = {}
    if use_trafilatura:
        # Both calls are best effort, and they are guarded separately so that a
        # metadata failure cannot discard an article that extracted fine (and
        # vice versa).
        try:
            article_md = trafilatura.extract(
                html_text,
                output_format="markdown",
                include_comments=False,
                include_tables=True,
                include_links=True,
                with_metadata=False,
            )
        except Exception:
            article_md = None
        try:
            meta = trafilatura.extract_metadata(html_text)
        except Exception:
            meta = None
        if meta is not None:
            article_meta = {
                k: getattr(meta, k, None)
                for k in (
                    "title",
                    "author",
                    "date",
                    "sitename",
                    "url",
                    "description",
                    "categories",
                    "tags",
                )
            }

    # 2. Structural walk via BeautifulSoup --------------------------------------
    soup = BeautifulSoup(html_text, "html.parser")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()

    blocks: list[dict[str, Any]] = []
    section_stack: list[str] = []

    def push_heading(level: int, text: str) -> None:
        """Record a heading and update the running section path."""
        # Close every section at the same or a deeper level first.
        while len(section_stack) >= level:
            section_stack.pop()
        section_stack.append(text)
        blocks.append(
            {
                "kind": "heading",
                "level": level,
                "text": text,
                "section_path": list(section_stack),
            }
        )

    # The <title> acts as an implicit level-1 heading.
    title_tag = soup.find("title")
    if title_tag and title_tag.get_text(strip=True):
        push_heading(1, title_tag.get_text(strip=True))

    # walk body in document order
    # NOTE(review): every descendant is visited, so a <p> inside an <li> or
    # <td>, or a nested list, is emitted both on its own and as part of its
    # container's text. Left unchanged here - confirm whether the duplication
    # is intended before restructuring the walk.
    body = soup.body or soup
    for el in body.find_all(True, recursive=True):
        name = (el.name or "").lower()
        m = _HEADING_RE.match(name)
        if m:
            text = el.get_text(" ", strip=True)
            if text:
                push_heading(int(m.group(1)), text)
            continue
        if name == "p":
            text = el.get_text(" ", strip=True)
            if text:
                blocks.append(
                    {"kind": "paragraph", "text": text, "section_path": list(section_stack)}
                )
            continue
        if name in {"ul", "ol"}:
            items = [li.get_text(" ", strip=True) for li in el.find_all("li", recursive=False)]
            items = [i for i in items if i]
            if items:
                blocks.append(
                    {
                        "kind": "list",
                        "ordered": name == "ol",
                        "items": items,
                        "section_path": list(section_stack),
                    }
                )
            continue
        if name == "table":
            rows: list[list[str]] = []
            for tr in el.find_all("tr", recursive=True):
                cells = [
                    td.get_text(" ", strip=True)
                    for td in tr.find_all(["td", "th"], recursive=False)
                ]
                if cells:
                    rows.append(cells)
            if rows:
                blocks.append(
                    {"kind": "table", "rows": rows, "section_path": list(section_stack)}
                )
            continue
        if name == "img":
            blocks.append(
                {
                    "kind": "image",
                    "src": el.get("src") or "",
                    "alt": el.get("alt") or "",
                    "section_path": list(section_stack),
                }
            )
            continue

    # 3. Markdown rendering ---------------------------------------------------
    def _table_row(row: list[str], ncols: int) -> str:
        """Render one table row, padding short rows and escaping cells."""
        cells = [
            # A raw "|" or newline inside a cell would break the Markdown table.
            (row[c] if c < len(row) else "").replace("|", "\\|").replace("\n", " ")
            for c in range(ncols)
        ]
        return "| " + " | ".join(cells) + " |"

    md_lines: list[str] = [
        f"# {article_meta.get('title') or Path(source_filename).stem}",
        "",
        f"> Source: `{source_filename}` \u00b7 sha1 `{sha[:12]}` \u00b7 parsed `{utc_now_iso()}`",
        "",
    ]
    if article_meta:
        meta_parts = []
        if article_meta.get("author"):
            meta_parts.append(f"author: {article_meta['author']}")
        if article_meta.get("date"):
            meta_parts.append(f"date: {article_meta['date']}")
        if article_meta.get("sitename"):
            meta_parts.append(f"site: {article_meta['sitename']}")
        if article_meta.get("url"):
            meta_parts.append(f"url: {article_meta['url']}")
        if meta_parts:
            md_lines.append("> " + " \u00b7 ".join(meta_parts))
            md_lines.append("")

    if article_md and article_md.strip():
        md_lines.append("## Article (trafilatura)")
        md_lines.append("")
        md_lines.append(article_md.strip())
        md_lines.append("")
        md_lines.append("---")
        md_lines.append("")

    md_lines.append("## Structural extraction")
    md_lines.append("")
    for b in blocks:
        kind = b["kind"]
        if kind == "heading":
            # Shift by two levels: "#" is the document title and "##" the
            # "Structural extraction" section that contains these blocks.
            md_lines.append(f"{'#' * min(6, b['level'] + 2)} {b['text']}")
            md_lines.append("")
        elif kind == "paragraph":
            md_lines.append(b["text"])
            md_lines.append("")
        elif kind == "list":
            ordered = b.get("ordered")
            for idx, entry in enumerate(b["items"], start=1):
                # Honour the recorded ``ordered`` flag instead of always
                # emitting bullets.
                marker = f"{idx}." if ordered else "-"
                md_lines.append(f"{marker} {entry}")
            md_lines.append("")
        elif kind == "table":
            rows = b["rows"]
            ncols = max(len(r) for r in rows)
            md_lines.append(_table_row(rows[0], ncols))
            md_lines.append("| " + " | ".join(["---"] * ncols) + " |")
            for row in rows[1:]:
                md_lines.append(_table_row(row, ncols))
            md_lines.append("")
        elif kind == "image":
            md_lines.append(f"![{b['alt'] or 'image'}]({b['src']})")
            md_lines.append("")

    md_text = "\n".join(md_lines).rstrip() + "\n"

    json_payload = {
        "source": {
            "filename": source_filename,
            "absolute_path": absolute_path,
            "sha1": sha,
            "size_bytes": size_bytes,
            "kind": "html",
        },
        "parsed_at": utc_now_iso(),
        "metadata": article_meta,
        "article_markdown": article_md,
        "blocks": blocks,
        "stats": {
            "n_blocks": len(blocks),
            "n_headings": sum(1 for b in blocks if b["kind"] == "heading"),
            "n_paragraphs": sum(1 for b in blocks if b["kind"] == "paragraph"),
            "n_lists": sum(1 for b in blocks if b["kind"] == "list"),
            "n_tables": sum(1 for b in blocks if b["kind"] == "table"),
            "n_images": sum(1 for b in blocks if b["kind"] == "image"),
            "article_chars": len(article_md or ""),
        },
    }

    # captioner is supported here only for completeness; HTML images are usually
    # remote URLs we don't fetch by default.
    _ = captioner
    _ = truncate  # silence unused in some configurations

    if write_outputs:
        write_text(out_dir / "document.md", md_text)
        write_json(out_dir / "document.json", json_payload)

    return json_payload
|