rc-docparser 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docparser/docx.py ADDED
@@ -0,0 +1,488 @@
1
+ """DOCX parser: emits Markdown + JSON + extracted images.
2
+
3
+ Walks the document body in document order so the generated Markdown faithfully
4
+ reflects the source layout. Each embedded image is written to
5
+ ``layout.assets_dir_for(source)`` and (optionally) captioned via a callable
6
+ ``captioner`` (typically :func:`docparser.image.caption_image` adapted by an
7
+ orchestrator).
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import re
12
+ from collections.abc import Callable, Iterator
13
+ from dataclasses import dataclass, field
14
+ from pathlib import Path
15
+ from typing import Any
16
+
17
+ from docx import Document
18
+ from docx.document import Document as DocxDocument
19
+ from docx.oxml.ns import qn
20
+ from docx.table import Table, _Cell
21
+ from docx.text.paragraph import Paragraph
22
+
23
+ from .common import (
24
+ WorkspaceLayout,
25
+ bytes_sha1,
26
+ file_sha1,
27
+ truncate,
28
+ utc_now_iso,
29
+ write_json,
30
+ write_text,
31
+ )
32
+
33
# Matches text that *starts* like a figure/table caption: one of the keywords
# below, an optional separator, a number, optional trailing punctuation and then
# at least one whitespace character (so "Figure 3: Overview" matches, while a
# bare "Figure 3" with nothing after it does not). Matching is case-insensitive.
CAPTION_RE = re.compile(
    r"^\s*(figure|fig\.?|table|scheme|chart|diagram)\s*[:.\-]?\s*\d+[.\:\-]?\s+",
    re.IGNORECASE,
)
# Paragraph style names treated as captions. Entries are lower-case because
# ``_is_caption`` lower-cases the paragraph's style name before the lookup.
CAPTION_STYLE_NAMES = {"caption", "figure caption", "table caption"}
38
+
39
+
40
@dataclass
class ExtractedImage:
    """An image pulled out of a DOCX package together with its surrounding text.

    ``blob`` and ``rel_id`` are kept in memory only; they are intentionally not
    part of the serialised form produced by :meth:`to_dict`.
    """

    seq: int  # extraction order assigned by the parser
    rel_id: str  # relationship id the image was referenced by
    filename: str  # file name of the image part inside the package
    content_type: str
    blob: bytes  # raw image bytes
    sha1: str  # digest of ``blob``
    ext: str  # extension used for the written asset
    asset_path: Path  # where the asset is (or would be) written
    nearby_caption: str = ""
    context_before: str = ""
    context_after: str = ""
    section_path: list[str] = field(default_factory=list)
    location: str = ""

    def to_dict(self, source: Path, layout: WorkspaceLayout) -> dict[str, Any]:
        """Return the JSON-friendly description of this image.

        The asset path is made relative via ``layout.relpath_from_parsed`` and
        both context strings are passed through ``truncate(..., 600)``.
        """
        record: dict[str, Any] = {
            name: getattr(self, name)
            for name in ("seq", "filename", "content_type", "ext", "sha1")
        }
        record["asset_path"] = layout.relpath_from_parsed(self.asset_path, source)
        record["nearby_caption"] = self.nearby_caption
        record["context_before"] = truncate(self.context_before, 600)
        record["context_after"] = truncate(self.context_after, 600)
        record["section_path"] = self.section_path
        record["location"] = self.location
        return record
70
+
71
+
72
+ def _ext_for(content_type: str, filename: str) -> str:
73
+ mapping = {
74
+ "image/png": "png",
75
+ "image/jpeg": "jpg",
76
+ "image/jpg": "jpg",
77
+ "image/gif": "gif",
78
+ "image/bmp": "bmp",
79
+ "image/tiff": "tiff",
80
+ "image/webp": "webp",
81
+ "image/svg+xml": "svg",
82
+ "image/x-emf": "emf",
83
+ "image/x-wmf": "wmf",
84
+ }
85
+ if content_type in mapping:
86
+ return mapping[content_type]
87
+ suf = Path(filename).suffix.lstrip(".").lower()
88
+ return suf or "bin"
89
+
90
+
91
+ def _image_blip_ids(elem) -> list[str]:
92
+ ns = {
93
+ "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
94
+ "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
95
+ }
96
+ ids: list[str] = []
97
+ for blip in elem.iter(f"{{{ns['a']}}}blip"):
98
+ rid = blip.get(f"{{{ns['r']}}}embed")
99
+ if rid:
100
+ ids.append(rid)
101
+ return ids
102
+
103
+
104
def _iter_block_items(parent) -> Iterator[Paragraph | Table]:
    """Yield the paragraphs and tables directly inside *parent*, in order.

    *parent* may be a document (its body is walked), a table cell (its ``_tc``
    element is walked) or any other object; in the last case its ``_element``
    attribute is used when present, otherwise the object itself. Children that
    are neither ``w:p`` nor ``w:tbl`` are skipped.
    """
    if isinstance(parent, DocxDocument):
        container = parent.element.body
    elif isinstance(parent, _Cell):
        container = parent._tc
    else:
        container = getattr(parent, "_element", parent)

    paragraph_tag = qn("w:p")
    table_tag = qn("w:tbl")
    for node in container.iterchildren():
        tag = node.tag
        if tag == paragraph_tag:
            yield Paragraph(node, parent)
        elif tag == table_tag:
            yield Table(node, parent)
116
+
117
+
118
+ def _para_style(p: Paragraph) -> str:
119
+ try:
120
+ style = p.style
121
+ return ((style.name if style is not None else "") or "").strip()
122
+ except Exception:
123
+ return ""
124
+
125
+
126
+ def _heading_level(style_name: str) -> int | None:
127
+ if not style_name:
128
+ return None
129
+ s = style_name.lower()
130
+ if s == "title":
131
+ return 1
132
+ m = re.match(r"heading\s*(\d+)", s)
133
+ if m:
134
+ return min(6, max(1, int(m.group(1))))
135
+ return None
136
+
137
+
138
def _is_caption(p: Paragraph) -> bool:
    """Tell whether *p* looks like a figure/table caption.

    A paragraph qualifies either by style (its lower-cased style name is in
    ``CAPTION_STYLE_NAMES``) or by content (its stripped text matches
    ``CAPTION_RE``). The text is only inspected when the style check fails.
    """
    if _para_style(p).lower() in CAPTION_STYLE_NAMES:
        return True
    stripped = (p.text or "").strip()
    return CAPTION_RE.match(stripped) is not None
146
+
147
+
148
@dataclass
class _Block:
    """One item of the parsed document, in document order.

    ``parse_docx`` creates blocks of kind ``"heading"``, ``"paragraph"``,
    ``"caption"``, ``"image"`` and ``"table"``; the payload keys depend on the
    kind and are merged next to ``"kind"`` when the blocks are serialised.
    """

    kind: str  # block type tag, see the class docstring
    payload: dict[str, Any]  # kind-specific data
152
+
153
+
154
# Separator placed between neighbouring text snippets in image context strings.
_CONTEXT_SEP = " \u00b6 "


def _list_level(p: Paragraph) -> int | None:
    """Return the paragraph's explicit numbering level (``w:ilvl``), or ``None``."""
    num_pr = p._element.find(qn("w:pPr") + "/" + qn("w:numPr"))
    if num_pr is None:
        return None
    ilvl = num_pr.find(qn("w:ilvl"))
    if ilvl is None:
        return None
    try:
        return int(ilvl.get(qn("w:val"), "0"))
    except (TypeError, ValueError):
        # A malformed level is treated as "not a list item".
        return None


def _joined(value: Any, sep: str) -> str:
    """Return *value* as text, joining list-like values with *sep*."""
    if isinstance(value, str):
        return value
    if isinstance(value, (list, tuple, set)):
        return sep.join(str(item) for item in value)
    return str(value)


def _attach_image_context(
    blocks: list[_Block], images: dict[str, ExtractedImage]
) -> None:
    """Fill ``nearby_caption``/``context_before``/``context_after`` of each image."""
    for i, block in enumerate(blocks):
        if block.kind != "image":
            continue
        img = images[block.payload["image_key"]]

        # Caption: the first caption block within the next three blocks, unless
        # another image comes first. (Paragraphs matching CAPTION_RE are already
        # classified as "caption" when the block is created, so no separate
        # regex check is needed here.)
        for nb in blocks[i + 1 : i + 4]:
            if nb.kind == "caption":
                img.nearby_caption = nb.payload.get("text", "")
                break
            if nb.kind == "image":
                break

        # Up to two text blocks among the four blocks preceding the image,
        # stopping at the previous image.
        before: list[str] = []
        for nb in reversed(blocks[max(0, i - 4) : i]):
            if nb.kind == "image":
                break
            if nb.kind in {"paragraph", "caption", "heading"}:
                before.append(nb.payload.get("text", ""))
                if len(before) >= 2:
                    break
        img.context_before = _CONTEXT_SEP.join(reversed(before)).strip()

        # Up to two text blocks among the five blocks following the image; the
        # first caption is skipped because it is reported as ``nearby_caption``.
        after: list[str] = []
        skipped_caption = False
        for nb in blocks[i + 1 : i + 6]:
            if nb.kind == "image":
                break
            if nb.kind == "caption" and not skipped_caption:
                skipped_caption = True
                continue
            if nb.kind in {"paragraph", "heading"}:
                after.append(nb.payload.get("text", ""))
                if len(after) >= 2:
                    break
        img.context_after = _CONTEXT_SEP.join(after).strip()


def _md_table_lines(rows: list[list[dict[str, Any]]]) -> list[str]:
    """Render table rows as Markdown lines; the first row becomes the header."""
    if not rows:
        return []
    ncols = max(len(row) for row in rows)

    def cells_of(row: list[dict[str, Any]]) -> list[str]:
        # Pad short rows and keep every cell on one line with escaped pipes.
        return [
            (row[c]["text"] if c < len(row) else "")
            .replace("|", "\\|")
            .replace("\n", " ")
            for c in range(ncols)
        ]

    lines = [
        "| " + " | ".join(cells_of(rows[0]) or [""]) + " |",
        "| " + " | ".join(["---"] * ncols) + " |",
    ]
    lines.extend("| " + " | ".join(cells_of(row)) + " |" for row in rows[1:])
    lines.append("")
    return lines


def _md_image_lines(
    img: ExtractedImage, rel: Any, cap_data: dict[str, Any]
) -> list[str]:
    """Render one image (link, nearby caption and optional VLM details)."""
    caption = cap_data.get("caption")
    short = caption or img.nearby_caption or img.filename
    alt = str(short or "figure").replace("\n", " ").replace("|", "/")
    lines = [f"![{alt}]({rel})"]
    # NOTE(review): the caption paragraph is also rendered as its own block, so
    # it appears twice in the Markdown; kept as-is in case consumers rely on it.
    if img.nearby_caption:
        lines.append(f"*{img.nearby_caption}*")

    description = cap_data.get("description")
    if description:
        lines.append("")
        lines.append(f"<!-- vlm: {cap_data.get('model', '')} -->")
        # A captioner may return a description without a caption; skip the
        # caption line instead of failing on the missing key.
        if caption:
            lines.append(f"> **VLM caption.** {caption}")
            lines.append(">")
        lines.append(f"> {description}")
        visible_text = cap_data.get("visible_text")
        if visible_text:
            lines.append(">")
            quoted = _joined(visible_text, "\n").replace("\n", "\n> ")
            lines.append(f"> *Visible text:* {quoted}")
        tags = cap_data.get("tags")
        if tags:
            lines.append(">")
            lines.append("> *Tags:* " + _joined(tags, ", "))
        relevance = cap_data.get("domain_relevance")
        if relevance:
            lines.append(">")
            lines.append(f"> *Relevance:* {relevance}")
    lines.append("")
    return lines


def parse_docx(
    source: Path | str,
    layout: WorkspaceLayout | None = None,
    *,
    captioner: Callable[..., dict[str, Any]] | None = None,
    write_outputs: bool = True,
) -> dict[str, Any]:
    """Parse a DOCX file into Markdown + JSON + images.

    The document body is walked in order. Headings, paragraphs, captions,
    tables and embedded images become blocks; each image is then associated
    with the text around it, optionally captioned, and everything is rendered
    to ``document.md`` / ``document.json`` in the parsed directory.

    Parameters
    ----------
    source : Path
        Path to the .docx file (or a symlink to one).
    layout : WorkspaceLayout, optional
        Where to write outputs. Defaults to ``WorkspaceLayout()`` (cwd-relative
        ``data/parsed``, ``data/assets``, ``.cache``).
    captioner : callable, optional
        ``captioner(image_bytes, mime, doc_name, nearby_caption, context) -> dict``
        (called with keyword arguments). Used to caption every extracted image.
        If ``None``, no VLM call is made. Exceptions and non-dict results are
        recorded as ``{"error": ...}`` for that image.
    write_outputs : bool
        If False, only return the parsed structure (handy for tests).

    Returns
    -------
    dict
        The JSON payload with the keys ``source``, ``parsed_at``, ``blocks``,
        ``images`` and ``stats``.
    """
    source = Path(source)
    if layout is None:
        layout = WorkspaceLayout()
    real_source = source.resolve()
    doc = Document(str(real_source))

    out_dir = layout.parsed_dir_for(source)
    asset_dir = layout.assets_dir_for(source)
    if write_outputs:
        out_dir.mkdir(parents=True, exist_ok=True)
        asset_dir.mkdir(parents=True, exist_ok=True)

    image_parts = doc.part.related_parts
    images: dict[str, ExtractedImage] = {}
    blocks: list[_Block] = []
    seq = 0
    section_stack: list[str] = []
    # Heading level of each entry in ``section_stack`` (kept in lockstep).
    section_levels: list[int] = []

    def _emit_image_for(elem, location: str) -> None:
        """Extract every embedded image referenced below *elem*."""
        nonlocal seq
        for rid in _image_blip_ids(elem):
            part = image_parts.get(rid)
            if part is None:
                continue
            try:
                blob = part.blob
            except Exception:
                # Best effort: an unreadable part must not abort the parse.
                continue
            ct = getattr(part, "content_type", "") or ""
            filename = Path(getattr(part, "partname", f"image_{rid}")).name
            sha = bytes_sha1(blob)
            ext = _ext_for(ct, filename)
            seq += 1
            asset_path = asset_dir / f"img-{seq:03d}-{sha[:10]}.{ext}"
            if write_outputs and not asset_path.exists():
                asset_path.write_bytes(blob)
            # The same relationship id can be used several times, so the key
            # also carries the location and the sequence number.
            key = f"{location}::{seq}::{rid}"
            images[key] = ExtractedImage(
                seq=seq,
                rel_id=rid,
                filename=filename,
                content_type=ct,
                blob=blob,
                sha1=sha,
                ext=ext,
                asset_path=asset_path,
                section_path=list(section_stack),
                location=location,
            )
            blocks.append(_Block("image", {"image_key": key}))

    def handle_paragraph(p: Paragraph, location: str) -> None:
        """Turn one body paragraph into image/heading/caption/paragraph blocks."""
        style = _para_style(p)
        text = (p.text or "").strip()
        level = _heading_level(style)

        _emit_image_for(p._element, location)

        if level is not None and text:
            # Close every open section at the same or a deeper *level*. Popping
            # by stack length instead would nest sibling headings under each
            # other whenever the document skips a level (e.g. H1 -> H3 -> H3).
            while section_levels and section_levels[-1] >= level:
                section_levels.pop()
                section_stack.pop()
            section_levels.append(level)
            section_stack.append(text)
            blocks.append(
                _Block(
                    "heading",
                    {
                        "level": level,
                        "text": text,
                        "style": style,
                        "section_path": list(section_stack),
                    },
                )
            )
            return

        if not text:
            return

        kind = "caption" if _is_caption(p) else "paragraph"
        blocks.append(
            _Block(
                kind,
                {
                    "text": text,
                    "style": style,
                    "list_level": _list_level(p),
                    "section_path": list(section_stack),
                },
            )
        )

    def handle_table(tbl: Table, location: str) -> None:
        """Collect cell texts (and images inside cells) of one table."""
        rows: list[list[dict[str, Any]]] = []
        # ``row.cells`` repeats a merged cell once per grid position it spans.
        # Remember which underlying ``w:tc`` elements were scanned so that the
        # images of a merged cell are extracted only once. The elements are
        # kept as values so their ids stay unique while the table is processed.
        scanned: dict[int, Any] = {}
        for r_idx, row in enumerate(tbl.rows):
            row_payload: list[dict[str, Any]] = []
            for c_idx, cell in enumerate(row.cells):
                tc = cell._tc
                first_visit = id(tc) not in scanned
                if first_visit:
                    scanned[id(tc)] = tc
                cell_text_parts: list[str] = []
                for sub in _iter_block_items(cell):
                    if isinstance(sub, Paragraph):
                        if first_visit:
                            _emit_image_for(
                                sub._element, f"{location}.cell[{r_idx},{c_idx}]"
                            )
                        sub_text = (sub.text or "").strip()
                        if sub_text:
                            cell_text_parts.append(sub_text)
                    elif isinstance(sub, Table):
                        # Nested tables are flattened to their cell texts.
                        for nrow in sub.rows:
                            for ncell in nrow.cells:
                                nested_text = (ncell.text or "").strip()
                                if nested_text:
                                    cell_text_parts.append(nested_text)
                row_payload.append({"text": "\n".join(cell_text_parts).strip()})
            rows.append(row_payload)
        blocks.append(
            _Block(
                "table",
                {"rows": rows, "section_path": list(section_stack), "location": location},
            )
        )

    for idx, item in enumerate(_iter_block_items(doc)):
        loc = f"body[{idx}]"
        if isinstance(item, Paragraph):
            handle_paragraph(item, loc)
        elif isinstance(item, Table):
            handle_table(item, loc)

    # context association ---------------------------------------------------
    _attach_image_context(blocks, images)

    # captioning ------------------------------------------------------------
    image_caption_results: dict[str, dict[str, Any]] = {}
    if captioner is not None:
        for key, img in images.items():
            # Join only the non-empty sides so that an image without any
            # surrounding text gets an empty context rather than a lone "¶".
            context = _CONTEXT_SEP.join(
                part for part in (img.context_before, img.context_after) if part
            )
            try:
                result = captioner(
                    image_bytes=img.blob,
                    mime=img.content_type or "image/png",
                    doc_name=source.name,
                    nearby_caption=img.nearby_caption,
                    context=context,
                )
            except Exception as exc:
                # Boundary around user code: one failed caption must not lose
                # the whole parse.
                result = {"error": f"captioner exception: {exc}"}
            if not isinstance(result, dict):
                result = {
                    "error": f"captioner returned {type(result).__name__}, expected dict"
                }
            image_caption_results[key] = result

    # Computed once so the Markdown header and the JSON payload always agree.
    source_sha1 = file_sha1(real_source)
    parsed_at = utc_now_iso()

    # markdown rendering ----------------------------------------------------
    md_lines: list[str] = [
        f"# {source.stem}",
        "",
        f"> Source: `{source.name}` \u00b7 sha1 `{source_sha1[:12]}` "
        f"\u00b7 parsed `{parsed_at}`",
        "",
    ]

    for b in blocks:
        payload = b.payload
        if b.kind == "heading":
            # The document title occupies "#", so headings are shifted down by
            # one level; Markdown has no heading deeper than six "#".
            hashes = "#" * min(6, payload["level"] + 1)
            heading_text = payload["text"].replace("\n", " ")
            md_lines.append(f"{hashes} {heading_text}")
            md_lines.append("")
        elif b.kind == "paragraph":
            text = payload.get("text", "")
            list_level = payload.get("list_level")
            if list_level is not None:
                # Two spaces per level: the minimum for a "- " item to nest
                # under its parent in CommonMark.
                indent = "  " * list_level
                md_lines.append(f"{indent}- {text}")
            else:
                md_lines.append(text)
            md_lines.append("")
        elif b.kind == "caption":
            md_lines.append(f"*{payload.get('text', '')}*")
            md_lines.append("")
        elif b.kind == "image":
            key = payload["image_key"]
            img = images[key]
            rel = layout.relpath_from_parsed(img.asset_path, source)
            cap_data = image_caption_results.get(key) or {}
            md_lines.extend(_md_image_lines(img, rel, cap_data))
        elif b.kind == "table":
            md_lines.extend(_md_table_lines(payload["rows"]))

    md_text = "\n".join(md_lines).rstrip() + "\n"

    images_json = []
    for key, img in images.items():
        entry = img.to_dict(source, layout)
        entry["semantic"] = image_caption_results.get(key)
        images_json.append(entry)
    images_json.sort(key=lambda entry: entry["seq"])

    json_payload = {
        "source": {
            "filename": source.name,
            "absolute_path": str(real_source),
            "sha1": source_sha1,
            "size_bytes": real_source.stat().st_size,
            "kind": "docx",
        },
        "parsed_at": parsed_at,
        "blocks": [{"kind": b.kind, **b.payload} for b in blocks],
        "images": images_json,
        "stats": {
            "n_blocks": len(blocks),
            "n_headings": sum(1 for b in blocks if b.kind == "heading"),
            "n_paragraphs": sum(1 for b in blocks if b.kind == "paragraph"),
            "n_tables": sum(1 for b in blocks if b.kind == "table"),
            "n_images": len(images),
            "n_captioned_images": sum(
                1
                for v in image_caption_results.values()
                if v and not v.get("error") and v.get("caption")
            ),
        },
    }

    if write_outputs:
        write_text(out_dir / "document.md", md_text)
        write_json(out_dir / "document.json", json_payload)

    return json_payload