rc-docparser 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docparser/epub.py ADDED
@@ -0,0 +1,349 @@
1
+ """EPUB parser.
2
+
3
+ Walks the EPUB spine in reading order, runs a BeautifulSoup structural walk
4
+ over each chapter (headings / paragraphs / lists / tables / images), and
5
+ extracts embedded images to the asset directory. Embedded images can be
6
+ captioned via a ``captioner`` callable.
7
+
8
+ Requires the ``[epub]`` extra: ``pip install 'docparser[epub]'`` (which also
9
+ pulls in BeautifulSoup from the ``[html]`` extra).
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import re
14
+ from collections.abc import Callable
15
+ from pathlib import Path
16
+ from typing import Any
17
+
18
+ from .common import (
19
+ WorkspaceLayout,
20
+ bytes_sha1,
21
+ file_sha1,
22
+ truncate,
23
+ utc_now_iso,
24
+ write_json,
25
+ write_text,
26
+ )
27
+
28
+ _HEADING_RE = re.compile(r"^h([1-6])$", re.IGNORECASE)
29
+
30
+
31
+ def _import_deps():
32
+ try:
33
+ import ebooklib # type: ignore
34
+ from bs4 import BeautifulSoup # type: ignore
35
+ from ebooklib import epub # type: ignore
36
+ except ImportError as exc: # pragma: no cover - optional dep guard
37
+ raise ImportError(
38
+ "docparser.epub.parse_epub requires the [epub] extra. "
39
+ "Install with: pip install 'docparser[epub]'"
40
+ ) from exc
41
+ return ebooklib, epub, BeautifulSoup
42
+
43
+
44
+ def _ext_for(media_type: str, name: str) -> str:
45
+ mapping = {
46
+ "image/png": "png",
47
+ "image/jpeg": "jpg",
48
+ "image/gif": "gif",
49
+ "image/svg+xml": "svg",
50
+ "image/webp": "webp",
51
+ }
52
+ if media_type in mapping:
53
+ return mapping[media_type]
54
+ return Path(name or "").suffix.lstrip(".").lower() or "png"
55
+
56
+
57
def parse_epub(
    source: Path | str,
    layout: WorkspaceLayout | None = None,
    *,
    captioner: Callable[..., dict[str, Any]] | None = None,
    write_outputs: bool = True,
) -> dict[str, Any]:
    """Parse an EPUB into Markdown + JSON + images.

    Parameters
    ----------
    source : Path | str
        Path to the EPUB file.
    layout : WorkspaceLayout | None
        Workspace layout used to derive the output and asset directories;
        a default ``WorkspaceLayout()`` is created when omitted.
    captioner : callable, optional
        Called once per embedded image that is referenced from the text,
        with keyword arguments ``image_bytes``, ``mime``, ``doc_name``,
        ``nearby_caption`` and ``context``. Its result is read as a dict
        with optional ``caption``, ``description`` and ``tags`` keys. An
        exception raised by the captioner is recorded as
        ``{"error": str(exc)}`` instead of aborting the parse.
    write_outputs : bool
        When True, image assets, ``document.md`` and ``document.json`` are
        written to disk; when False nothing is written.

    Returns
    -------
    dict
        The JSON payload (source info, metadata, blocks, images, stats).

    Raises
    ------
    ImportError
        If the optional EPUB dependencies are not installed.
    """
    # Local import keeps this block self-contained; only href resolution
    # below needs it.
    from urllib.parse import unquote

    ebooklib, epub, BeautifulSoup = _import_deps()

    source = Path(source)
    layout = layout or WorkspaceLayout()
    real_source = source.resolve()
    book = epub.read_epub(str(real_source))

    out_dir = layout.parsed_dir_for(source)
    asset_dir = layout.assets_dir_for(source)
    if write_outputs:
        out_dir.mkdir(parents=True, exist_ok=True)
        asset_dir.mkdir(parents=True, exist_ok=True)

    # ---- extract embedded images up front, keyed by (normalized) href -----
    image_by_name: dict[str, dict[str, Any]] = {}
    images: list[dict[str, Any]] = []
    image_caption_results: dict[str, dict[str, Any]] = {}
    image_seq = 0
    for item in book.get_items_of_type(ebooklib.ITEM_IMAGE):
        blob = item.get_content()
        if not blob:
            continue
        image_seq += 1
        sha = bytes_sha1(blob)
        ext = _ext_for(getattr(item, "media_type", "") or "", item.get_name())
        asset_name = f"img-{image_seq:03d}-{sha[:10]}.{ext}"
        asset_path = asset_dir / asset_name
        if write_outputs and not asset_path.exists():
            asset_path.write_bytes(blob)
        rel = layout.relpath_from_parsed(asset_path, source)
        rec = {
            "seq": image_seq,
            "sha1": sha,
            "ext": ext,
            "name": item.get_name(),
            "asset_path": rel,
            # Raw bytes are kept only for the captioner; they are stripped
            # again before the record goes into the JSON payload.
            "blob": blob,
            "mime": getattr(item, "media_type", "") or f"image/{ext}",
        }
        # Images are matched by lower-cased base name; a later image with the
        # same base name replaces an earlier one in this lookup.
        key = Path(item.get_name()).name.lower()
        image_by_name[key] = rec
        images.append(rec)

    def _resolve_image(src: str) -> dict[str, Any] | None:
        """Return the extracted-image record that ``src`` points at, if any."""
        if not src:
            return None
        href = src.split("#")[0]
        candidates = [href]
        # Chapter markup may percent-encode the href ("my%20pic.png") while
        # the extracted item name is plain text, so retry with the decoded form.
        decoded = unquote(href)
        if decoded != href:
            candidates.append(decoded)
        for candidate in candidates:
            found = image_by_name.get(Path(candidate).name.lower())
            if found is not None:
                return found
        return None

    # ---- metadata ---------------------------------------------------------
    def _meta(field: str) -> str:
        """Return the first Dublin Core value for ``field``, or ``""``."""
        try:
            vals = book.get_metadata("DC", field)
            if vals:
                return str(vals[0][0])
        except Exception:
            # Deliberately best-effort: missing or malformed metadata must
            # not abort the parse.
            pass
        return ""

    title = _meta("title") or source.stem
    author = _meta("creator")
    language = _meta("language")

    # ---- structural walk per spine document -------------------------------
    blocks: list[dict[str, Any]] = []
    section_stack: list[str] = []

    def push_heading(level: int, text: str) -> None:
        """Record a heading and make it the current section at ``level``."""
        # Drop every open section at this depth or deeper before pushing.
        while len(section_stack) >= level:
            section_stack.pop()
        section_stack.append(text)
        blocks.append(
            {"kind": "heading", "level": level, "text": text, "section_path": list(section_stack)}
        )

    items = []
    for sid, _ in book.spine:
        it = book.get_item_with_id(sid)
        if it is not None and it.get_type() == ebooklib.ITEM_DOCUMENT:
            items.append(it)
    if not items:
        # No usable spine entries: fall back to every document item.
        items = list(book.get_items_of_type(ebooklib.ITEM_DOCUMENT))

    for doc_item in items:
        content = doc_item.get_content()
        if not content:
            continue
        soup = BeautifulSoup(content, "html.parser")
        for tag in soup(["script", "style"]):
            tag.decompose()
        body = soup.body or soup
        for el in body.find_all(True, recursive=True):
            name = (el.name or "").lower()
            m = _HEADING_RE.match(name)
            if m:
                text = el.get_text(" ", strip=True)
                if text:
                    push_heading(int(m.group(1)), text)
            elif name == "p":
                text = el.get_text(" ", strip=True)
                if text:
                    blocks.append(
                        {"kind": "paragraph", "text": text, "section_path": list(section_stack)}
                    )
            elif name in {"ul", "ol"}:
                li_items = [
                    li.get_text(" ", strip=True) for li in el.find_all("li", recursive=False)
                ]
                li_items = [i for i in li_items if i]
                if li_items:
                    blocks.append(
                        {
                            "kind": "list",
                            "ordered": name == "ol",
                            "items": li_items,
                            "section_path": list(section_stack),
                        }
                    )
            elif name == "table":
                rows: list[list[str]] = []
                for tr in el.find_all("tr", recursive=True):
                    cells = [
                        td.get_text(" ", strip=True)
                        for td in tr.find_all(["td", "th"], recursive=False)
                    ]
                    if cells:
                        rows.append(cells)
                if rows:
                    blocks.append(
                        {"kind": "table", "rows": rows, "section_path": list(section_stack)}
                    )
            elif name in {"img", "image"}:
                src = el.get("src") or el.get("href") or el.get("xlink:href") or ""
                img_rec = _resolve_image(src)
                blocks.append(
                    {
                        "kind": "image",
                        "src": src,
                        "alt": el.get("alt") or "",
                        "image_seq": img_rec["seq"] if img_rec else None,
                        "asset_path": img_rec["asset_path"] if img_rec else None,
                        "section_path": list(section_stack),
                    }
                )

    # ---- caption images that were actually referenced ---------------------
    referenced_seqs = {
        b["image_seq"] for b in blocks if b.get("kind") == "image" and b.get("image_seq")
    }
    if captioner is not None:
        # Context for an image is the last heading/paragraph text seen before
        # its first reference.
        ctx_by_seq: dict[int, str] = {}
        last_text = ""
        for b in blocks:
            if b["kind"] in {"paragraph", "heading"}:
                last_text = b.get("text", "")
            elif b["kind"] == "image" and b.get("image_seq"):
                ctx_by_seq.setdefault(b["image_seq"], last_text)
        for rec in images:
            if rec["seq"] not in referenced_seqs:
                continue
            try:
                cap = captioner(
                    image_bytes=rec["blob"],
                    mime=rec["mime"],
                    doc_name=source.name,
                    nearby_caption="",
                    context=ctx_by_seq.get(rec["seq"], ""),
                )
            except Exception as exc:
                cap = {"error": str(exc)}
            image_caption_results[str(rec["seq"])] = cap or {}

    # ---- markdown rendering -----------------------------------------------
    # Hash and timestamp once so the Markdown header and the JSON payload
    # always agree.
    source_sha = file_sha1(real_source)
    parsed_at = utc_now_iso()

    md_lines = [
        f"# {title}",
        "",
        f"> Source: `{source.name}` \u00b7 sha1 `{source_sha[:12]}` "
        f"\u00b7 parsed `{parsed_at}`",
    ]
    meta_parts = []
    if author:
        meta_parts.append(f"author: {author}")
    if language:
        meta_parts.append(f"language: {language}")
    if meta_parts:
        md_lines.append("> " + " \u00b7 ".join(meta_parts))
    md_lines.append("")

    for b in blocks:
        kind = b["kind"]
        if kind == "heading":
            # Shift by one level: "#" is reserved for the document title.
            md_lines.append(f"{'#' * min(6, b['level'] + 1)} {b['text']}")
            md_lines.append("")
        elif kind == "paragraph":
            md_lines.append(b["text"])
            md_lines.append("")
        elif kind == "list":
            for item in b["items"]:
                md_lines.append(f"- {item}")
            md_lines.append("")
        elif kind == "table":
            md_lines.extend(_md_table_lines(b["rows"]))
            md_lines.append("")
        elif kind == "image":
            rel = b.get("asset_path") or b.get("src", "")
            cap = image_caption_results.get(str(b.get("image_seq"))) or {}
            alt = cap.get("caption") or b.get("alt") or "image"
            md_lines.append(f"![{alt}]({rel})")
            if cap.get("description"):
                md_lines.append("")
                md_lines.append(f"> **VLM caption.** {cap.get('caption','')}")
                md_lines.append(">")
                md_lines.append(f"> {cap.get('description','')}")
                if cap.get("tags"):
                    md_lines.append(">")
                    md_lines.append("> *Tags:* " + ", ".join(cap["tags"]))
            md_lines.append("")

    md_text = "\n".join(md_lines).rstrip() + "\n"

    images_json = []
    for rec in images:
        d = {k: v for k, v in rec.items() if k != "blob"}
        d["semantic"] = image_caption_results.get(str(rec["seq"]))
        d["referenced"] = rec["seq"] in referenced_seqs
        images_json.append(d)

    json_payload = {
        "source": {
            "filename": source.name,
            "absolute_path": str(real_source),
            "sha1": source_sha,
            "size_bytes": real_source.stat().st_size,
            "kind": "epub",
        },
        "parsed_at": parsed_at,
        "metadata": {"title": title, "author": author, "language": language},
        "blocks": blocks,
        "images": images_json,
        "stats": {
            "n_blocks": len(blocks),
            "n_chapters": len(items),
            "n_headings": sum(1 for b in blocks if b["kind"] == "heading"),
            "n_paragraphs": sum(1 for b in blocks if b["kind"] == "paragraph"),
            "n_lists": sum(1 for b in blocks if b["kind"] == "list"),
            "n_tables": sum(1 for b in blocks if b["kind"] == "table"),
            "n_images": len(images),
            "n_captioned_images": sum(
                1
                for v in image_caption_results.values()
                if v and not v.get("error") and v.get("caption")
            ),
        },
    }

    _ = truncate  # symmetry with other parsers

    if write_outputs:
        write_text(out_dir / "document.md", md_text)
        write_json(out_dir / "document.json", json_payload)

    return json_payload


def _md_table_lines(rows: list[list[str]]) -> list[str]:
    """Render non-empty ``rows`` as Markdown table lines; the first row is the header."""
    ncols = max(len(r) for r in rows)

    def _row(cells: list[str]) -> str:
        # Pad short rows to the widest row, and neutralise the two characters
        # that would break the table grid: "|" and newlines.
        padded = (cells[c] if c < len(cells) else "" for c in range(ncols))
        return (
            "| "
            + " | ".join(cell.replace("|", "\\|").replace("\n", " ") for cell in padded)
            + " |"
        )

    lines = [_row(rows[0]), "| " + " | ".join(["---"] * ncols) + " |"]
    lines.extend(_row(r) for r in rows[1:])
    return lines
docparser/html.py ADDED
@@ -0,0 +1,322 @@
1
+ """HTML parser.
2
+
3
+ Two-tier strategy:
4
+
5
+ 1. **Article extraction** via `trafilatura` for the main body (drops nav,
6
+ sidebars, ads).
7
+ 2. **Structural fallback** via BeautifulSoup that walks the DOM and emits
8
+ typed blocks (heading/paragraph/list/table/image) when trafilatura returns
9
+ nothing useful or when the caller wants the full structure.
10
+
11
+ Requires the ``[html]`` extra: ``pip install 'docparser[html]'``.
12
+ """
13
+ from __future__ import annotations
14
+
15
+ import re
16
+ from collections.abc import Callable
17
+ from pathlib import Path
18
+ from typing import Any
19
+ from urllib.parse import urlparse
20
+
21
+ from .common import (
22
+ WorkspaceLayout,
23
+ file_sha1,
24
+ truncate,
25
+ utc_now_iso,
26
+ write_json,
27
+ write_text,
28
+ )
29
+
30
+ _HEADING_RE = re.compile(r"^h([1-6])$", re.IGNORECASE)
31
+
32
+
33
+ def _import_deps():
34
+ try:
35
+ import trafilatura # type: ignore
36
+ from bs4 import BeautifulSoup # type: ignore
37
+ except ImportError as exc: # pragma: no cover
38
+ raise ImportError(
39
+ "docparser.html.parse_html requires the [html] extra. "
40
+ "Install with: pip install 'docparser[html]'"
41
+ ) from exc
42
+ return trafilatura, BeautifulSoup
43
+
44
+
45
def parse_html(
    source: Path | str,
    layout: WorkspaceLayout | None = None,
    *,
    captioner: Callable[..., dict[str, Any]] | None = None,
    write_outputs: bool = True,
    use_trafilatura: bool = True,
) -> dict[str, Any]:
    """Parse an HTML file (or http(s) URL) into Markdown + JSON.

    Parameters
    ----------
    source : Path | str
        Local path or full URL. A ``str`` starting with ``http://`` or
        ``https://`` is treated as a URL and fetched via
        ``trafilatura.fetch_url``; anything else is read as a local file.
    layout : WorkspaceLayout | None
        Workspace layout used to derive the output and asset directories;
        a default ``WorkspaceLayout()`` is created when omitted.
    captioner : callable, optional
        Accepted for signature parity with the other parsers; it is not
        called here.
    write_outputs : bool
        When True, ``document.md`` and ``document.json`` are written to the
        output directory; when False nothing is written.
    use_trafilatura : bool
        If True (default), also run trafilatura's article and metadata
        extraction. The structural BS4 walk is always run.

    Returns
    -------
    dict
        The JSON payload (source info, metadata, article markdown, blocks,
        stats).

    Raises
    ------
    ImportError
        If the optional HTML dependencies are not installed.
    RuntimeError
        If ``source`` is a URL and nothing could be fetched.
    """
    # Local import: only the URL branch needs it, and hashlib is not among
    # this module's top-level imports.
    import hashlib

    trafilatura, BeautifulSoup = _import_deps()

    is_url = isinstance(source, str) and re.match(r"^https?://", source) is not None
    if is_url:
        url = str(source)
        downloaded = trafilatura.fetch_url(url)
        if not downloaded:
            raise RuntimeError(f"Could not fetch URL: {url}")
        html_text = downloaded
        # Use the URL's path for layout slugs; fall back to "index" when the
        # path is empty.
        parsed_url = urlparse(url)
        host = parsed_url.netloc.replace(":", "_")
        path_part = parsed_url.path.strip("/").replace("/", "_") or "index"
        synthetic_name = f"{host}_{path_part}.html"
        layout = layout or WorkspaceLayout()
        out_dir = layout.parsed_dir / layout.parsed_dir_for(Path(synthetic_name)).name
        asset_dir = layout.assets_dir_for(Path(synthetic_name))
        source_filename = synthetic_name
        absolute_path = url
        # There is no file on disk, so size and hash come from the fetched
        # text re-encoded as UTF-8 (encoded once, used for both).
        html_bytes = html_text.encode("utf-8")
        size_bytes = len(html_bytes)
        sha = hashlib.sha1(html_bytes).hexdigest()
    else:
        real_source = Path(source).resolve()
        html_text = real_source.read_text(encoding="utf-8", errors="replace")
        layout = layout or WorkspaceLayout()
        out_dir = layout.parsed_dir_for(real_source)
        asset_dir = layout.assets_dir_for(real_source)
        source_filename = real_source.name
        absolute_path = str(real_source)
        size_bytes = real_source.stat().st_size
        sha = file_sha1(real_source)

    if write_outputs:
        out_dir.mkdir(parents=True, exist_ok=True)
        asset_dir.mkdir(parents=True, exist_ok=True)

    # 1. Article-grade markdown via trafilatura --------------------------------
    article_md: str | None = None
    article_meta: dict[str, Any] = {}
    if use_trafilatura:
        try:
            article_md = trafilatura.extract(
                html_text,
                output_format="markdown",
                include_comments=False,
                include_tables=True,
                include_links=True,
                with_metadata=False,
            )
        except Exception:
            # Best-effort: the structural walk below still produces output.
            article_md = None
        # Metadata is extracted under its own guard so that a failure here
        # does not throw away an article that was extracted successfully.
        try:
            meta = trafilatura.extract_metadata(html_text)
        except Exception:
            meta = None
        if meta is not None:
            article_meta = {
                k: getattr(meta, k, None)
                for k in (
                    "title",
                    "author",
                    "date",
                    "sitename",
                    "url",
                    "description",
                    "categories",
                    "tags",
                )
            }

    # 2. Structural walk via BeautifulSoup --------------------------------------
    soup = BeautifulSoup(html_text, "html.parser")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()

    blocks: list[dict[str, Any]] = []
    section_stack: list[str] = []

    def push_heading(level: int, text: str) -> None:
        """Record a heading and make it the current section at ``level``."""
        # Drop every open section at this depth or deeper before pushing.
        while len(section_stack) >= level:
            section_stack.pop()
        section_stack.append(text)
        blocks.append(
            {
                "kind": "heading",
                "level": level,
                "text": text,
                "section_path": list(section_stack),
            }
        )

    # The <title> text, when present, seeds the section path as a level-1 heading.
    title_tag = soup.find("title")
    if title_tag and title_tag.get_text(strip=True):
        push_heading(1, title_tag.get_text(strip=True))

    # walk body in document order
    body = soup.body or soup
    for el in body.find_all(True, recursive=True):
        name = (el.name or "").lower()
        m = _HEADING_RE.match(name)
        if m:
            text = el.get_text(" ", strip=True)
            if text:
                push_heading(int(m.group(1)), text)
        elif name == "p":
            text = el.get_text(" ", strip=True)
            if text:
                blocks.append(
                    {"kind": "paragraph", "text": text, "section_path": list(section_stack)}
                )
        elif name in {"ul", "ol"}:
            items = [li.get_text(" ", strip=True) for li in el.find_all("li", recursive=False)]
            items = [i for i in items if i]
            if items:
                blocks.append(
                    {
                        "kind": "list",
                        "ordered": name == "ol",
                        "items": items,
                        "section_path": list(section_stack),
                    }
                )
        elif name == "table":
            rows: list[list[str]] = []
            for tr in el.find_all("tr", recursive=True):
                cells = [
                    td.get_text(" ", strip=True)
                    for td in tr.find_all(["td", "th"], recursive=False)
                ]
                if cells:
                    rows.append(cells)
            if rows:
                blocks.append(
                    {"kind": "table", "rows": rows, "section_path": list(section_stack)}
                )
        elif name == "img":
            blocks.append(
                {
                    "kind": "image",
                    "src": el.get("src") or "",
                    "alt": el.get("alt") or "",
                    "section_path": list(section_stack),
                }
            )

    # 3. Markdown rendering ---------------------------------------------------
    # One timestamp so the Markdown header and the JSON payload agree.
    parsed_at = utc_now_iso()

    md_lines: list[str] = [
        f"# {article_meta.get('title') or Path(source_filename).stem}",
        "",
        f"> Source: `{source_filename}` \u00b7 sha1 `{sha[:12]}` \u00b7 parsed `{parsed_at}`",
        "",
    ]
    if article_meta:
        meta_parts = []
        if article_meta.get("author"):
            meta_parts.append(f"author: {article_meta['author']}")
        if article_meta.get("date"):
            meta_parts.append(f"date: {article_meta['date']}")
        if article_meta.get("sitename"):
            meta_parts.append(f"site: {article_meta['sitename']}")
        if article_meta.get("url"):
            meta_parts.append(f"url: {article_meta['url']}")
        if meta_parts:
            md_lines.append("> " + " \u00b7 ".join(meta_parts))
            md_lines.append("")

    if article_md and article_md.strip():
        md_lines.append("## Article (trafilatura)")
        md_lines.append("")
        md_lines.append(article_md.strip())
        md_lines.append("")
        md_lines.append("---")
        md_lines.append("")

    md_lines.append("## Structural extraction")
    md_lines.append("")
    for b in blocks:
        kind = b["kind"]
        if kind == "heading":
            # Shift by two levels: "#" is the document title and "##" the
            # section banners emitted above.
            md_lines.append(f"{'#' * min(6, b['level'] + 2)} {b['text']}")
            md_lines.append("")
        elif kind == "paragraph":
            md_lines.append(b["text"])
            md_lines.append("")
        elif kind == "list":
            for item in b["items"]:
                md_lines.append(f"- {item}")
            md_lines.append("")
        elif kind == "table":
            rows = b["rows"]
            ncols = max(len(r) for r in rows)
            for idx, row in enumerate(rows):
                # Pad short rows and neutralise "|" and newlines, which would
                # otherwise break the table grid.
                cells = [
                    (row[c] if c < len(row) else "").replace("|", "\\|").replace("\n", " ")
                    for c in range(ncols)
                ]
                md_lines.append("| " + " | ".join(cells) + " |")
                if idx == 0:
                    # Separator goes right after the header row.
                    md_lines.append("| " + " | ".join(["---"] * ncols) + " |")
            md_lines.append("")
        elif kind == "image":
            md_lines.append(f"![{b.get('alt') or 'image'}]({b.get('src','')})")
            md_lines.append("")

    md_text = "\n".join(md_lines).rstrip() + "\n"

    json_payload = {
        "source": {
            "filename": source_filename,
            "absolute_path": absolute_path,
            "sha1": sha,
            "size_bytes": size_bytes,
            "kind": "html",
        },
        "parsed_at": parsed_at,
        "metadata": article_meta,
        "article_markdown": article_md,
        "blocks": blocks,
        "stats": {
            "n_blocks": len(blocks),
            "n_headings": sum(1 for b in blocks if b["kind"] == "heading"),
            "n_paragraphs": sum(1 for b in blocks if b["kind"] == "paragraph"),
            "n_lists": sum(1 for b in blocks if b["kind"] == "list"),
            "n_tables": sum(1 for b in blocks if b["kind"] == "table"),
            "n_images": sum(1 for b in blocks if b["kind"] == "image"),
            "article_chars": len(article_md or ""),
        },
    }

    # captioner is supported here only for completeness; HTML images are usually
    # remote URLs we don't fetch by default.
    _ = captioner
    _ = truncate  # silence unused in some configurations

    if write_outputs:
        write_text(out_dir / "document.md", md_text)
        write_json(out_dir / "document.json", json_payload)

    return json_payload