rc-docparser 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docparser/pptx.py ADDED
@@ -0,0 +1,332 @@
1
+ """PPTX parser: emits Markdown + JSON + extracted images.
2
+
3
+ Walks slides in presentation order. Each slide becomes a level-2 heading
4
+ (``Slide N`` plus the slide title when present); text frames, tables, and
5
+ pictures are emitted in shape order. Speaker notes are captured per slide.
6
+ Embedded pictures are written to ``layout.assets_dir_for(source)`` and may be
7
+ captioned via a ``captioner`` callable (same contract as the other parsers).
8
+
9
+ Requires the ``[pptx]`` extra: ``pip install 'docparser[pptx]'``.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ from collections.abc import Callable
14
+ from pathlib import Path
15
+ from typing import Any
16
+
17
+ from .common import (
18
+ WorkspaceLayout,
19
+ bytes_sha1,
20
+ file_sha1,
21
+ truncate,
22
+ utc_now_iso,
23
+ write_json,
24
+ write_text,
25
+ )
26
+
27
+
28
+ def _import_pptx():
29
+ try:
30
+ from pptx import Presentation # type: ignore
31
+ from pptx.enum.shapes import MSO_SHAPE_TYPE # type: ignore
32
+ from pptx.util import Emu # type: ignore # noqa: F401
33
+ except ImportError as exc: # pragma: no cover - optional dep guard
34
+ raise ImportError(
35
+ "docparser.pptx.parse_pptx requires the [pptx] extra. "
36
+ "Install with: pip install 'docparser[pptx]'"
37
+ ) from exc
38
+ return Presentation, MSO_SHAPE_TYPE
39
+
40
+
41
+ def _ext_for(content_type: str, blob_name: str) -> str:
42
+ mapping = {
43
+ "image/png": "png",
44
+ "image/jpeg": "jpg",
45
+ "image/jpg": "jpg",
46
+ "image/gif": "gif",
47
+ "image/bmp": "bmp",
48
+ "image/tiff": "tiff",
49
+ "image/webp": "webp",
50
+ "image/svg+xml": "svg",
51
+ "image/x-emf": "emf",
52
+ "image/x-wmf": "wmf",
53
+ }
54
+ if content_type in mapping:
55
+ return mapping[content_type]
56
+ suf = Path(blob_name or "").suffix.lstrip(".").lower()
57
+ return suf or "png"
58
+
59
+
60
def _blockquote(text: Any) -> str:
    """Prefix continuation lines of *text* so they stay inside a Markdown blockquote."""
    return str(text).replace("\n", "\n> ")


def _render_pptx_markdown(
    blocks: list[dict[str, Any]],
    captions: dict[str, dict[str, Any]],
) -> list[str]:
    """Render parsed PPTX blocks as Markdown lines (without the document header).

    Args:
        blocks: Block dicts as built by :func:`parse_pptx`.
        captions: Captioner results keyed by ``str(image_seq)``.

    Returns:
        Markdown lines; every rendered block is followed by one blank line.
    """

    def table_row(row: list[dict[str, Any]], ncols: int) -> str:
        # Pad short rows and neutralise characters that would break the table.
        cells = [
            (row[c]["text"] if c < len(row) else "").replace("|", "\\|").replace("\n", " ")
            for c in range(ncols)
        ]
        return "| " + " | ".join(cells) + " |"

    lines: list[str] = []
    for block in blocks:
        kind = block["kind"]
        if kind == "heading":
            lines.append(f"## {block['text']}")
        elif kind == "paragraph":
            list_level = block.get("list_level")
            if list_level:
                # Two spaces per level: a nested "- " item must be indented to
                # the parent's content column to nest in CommonMark.
                lines.append("  " * (list_level - 1) + f"- {block['text']}")
            else:
                lines.append(block["text"])
        elif kind == "notes":
            # Multi-paragraph notes must keep the ">" prefix on every line.
            lines.append(f"> **Notes.** {_blockquote(block['text'])}")
        elif kind == "image":
            cap = captions.get(str(block["image_seq"])) or {}
            alt = cap.get("caption") or f"slide-{block['slide']}-image-{block['image_seq']}"
            lines.append(f"![{alt}]({block['asset_path']})")
            if cap.get("description"):
                lines.append("")
                lines.append(f"<!-- vlm: {cap.get('model','')} -->")
                lines.append(f"> **VLM caption.** {_blockquote(cap.get('caption',''))}")
                lines.append(">")
                lines.append(f"> {_blockquote(cap.get('description',''))}")
                if cap.get("visible_text"):
                    lines.append(">")
                    lines.append(f"> *Visible text:* {_blockquote(cap['visible_text'])}")
                if cap.get("tags"):
                    tags = cap["tags"]
                    if isinstance(tags, str):
                        tags = [tags]
                    lines.append(">")
                    lines.append("> *Tags:* " + ", ".join(str(tag) for tag in tags))
        elif kind == "table":
            rows = block["rows"]
            if not rows:
                continue
            ncols = max(len(r) for r in rows)
            lines.append(table_row(rows[0], ncols))
            lines.append("| " + " | ".join(["---"] * ncols) + " |")
            lines.extend(table_row(row, ncols) for row in rows[1:])
        else:
            continue
        lines.append("")
    return lines


def parse_pptx(
    source: Path | str,
    layout: WorkspaceLayout | None = None,
    *,
    captioner: Callable[..., dict[str, Any]] | None = None,
    write_outputs: bool = True,
) -> dict[str, Any]:
    """Parse a PPTX presentation into Markdown + JSON + images.

    See :func:`docparser.parse_docx` for the parameter conventions.

    Args:
        source: Path of the ``.pptx`` file.
        layout: Workspace layout used to locate the output and asset
            directories; a default ``WorkspaceLayout()`` is used when omitted.
        captioner: Optional callable invoked once per extracted picture with
            the keyword arguments ``image_bytes``, ``mime``, ``doc_name``,
            ``nearby_caption`` and ``context``. An exception raised by it is
            recorded as ``{"error": ...}`` for that image instead of aborting.
        write_outputs: When true, the image assets, ``document.md`` and
            ``document.json`` are written to disk.

    Returns:
        The JSON payload: source metadata, ``parsed_at``, ``n_slides``,
        ``blocks``, ``images`` and ``stats``.

    Raises:
        ImportError: when the optional python-pptx dependency is missing.
    """
    Presentation, MSO_SHAPE_TYPE = _import_pptx()

    source = Path(source)
    layout = layout or WorkspaceLayout()
    real_source = source.resolve()
    prs = Presentation(str(real_source))

    out_dir = layout.parsed_dir_for(source)
    asset_dir = layout.assets_dir_for(source)
    if write_outputs:
        out_dir.mkdir(parents=True, exist_ok=True)
        asset_dir.mkdir(parents=True, exist_ok=True)

    blocks: list[dict[str, Any]] = []
    images: list[dict[str, Any]] = []
    image_caption_results: dict[str, dict[str, Any]] = {}
    image_seq = 0
    unplaced = 1 << 60  # sorts shapes without a position after positioned ones

    def _shape_sort_key(shape) -> tuple[int, int]:
        # Order shapes top-to-bottom, then left-to-right.
        top = getattr(shape, "top", None)
        left = getattr(shape, "left", None)
        return (
            top if top is not None else unplaced,
            left if left is not None else unplaced,
        )

    def _shape_type(shape):
        # python-pptx raises NotImplementedError from ``shape_type`` for shapes
        # it cannot classify; treat those as "unknown" rather than aborting.
        try:
            return shape.shape_type
        except (AttributeError, NotImplementedError):
            return None

    def _emit_table(shape, slide_no: int, section_path: list[str]) -> None:
        rows = [[cell.text.strip() for cell in row.cells] for row in shape.table.rows]
        if not rows:
            return
        blocks.append(
            {
                "kind": "table",
                "rows": [[{"text": c} for c in r] for r in rows],
                "slide": slide_no,
                "section_path": section_path,
            }
        )

    def _emit_picture(shape, slide_no: int, section_path: list[str], ctx_before: str) -> None:
        nonlocal image_seq
        # Best effort: a picture whose image cannot be read is skipped.
        try:
            image = shape.image
        except Exception:
            return
        blob = image.blob
        if not blob:
            return
        image_seq += 1
        sha = bytes_sha1(blob)
        content_type = getattr(image, "content_type", "") or ""
        ext = _ext_for(content_type, getattr(image, "filename", "") or "")
        asset_path = asset_dir / f"img-{image_seq:03d}-{sha[:10]}.{ext}"
        if write_outputs and not asset_path.exists():
            asset_path.write_bytes(blob)
        rel = layout.relpath_from_parsed(asset_path, source)

        cap = None
        if captioner is not None:
            try:
                cap = captioner(
                    image_bytes=blob,
                    mime=content_type or f"image/{ext}",
                    doc_name=f"{source.name} :: slide {slide_no}",
                    nearby_caption="",
                    context=ctx_before,
                )
            except Exception as exc:
                cap = {"error": str(exc)}
            image_caption_results[str(image_seq)] = cap or {}

        images.append(
            {
                "seq": image_seq,
                "slide": slide_no,
                "sha1": sha,
                "ext": ext,
                "asset_path": rel,
                "context_before": truncate(ctx_before, 600),
                "section_path": section_path,
                "semantic": cap,
            }
        )
        blocks.append(
            {
                "kind": "image",
                "image_seq": image_seq,
                "asset_path": rel,
                "slide": slide_no,
                "section_path": section_path,
            }
        )

    for slide_no, slide in enumerate(prs.slides, start=1):
        # Read the title placeholder once, inside the guard.
        title_shape = None
        title_text = ""
        try:
            title_shape = slide.shapes.title
            if title_shape is not None and title_shape.has_text_frame:
                title_text = (title_shape.text or "").strip()
        except Exception:
            title_text = ""
        # Shape proxies are re-created on every access, so object identity
        # cannot recognise the title again; ``shape_id`` is stable per slide.
        title_shape_id = getattr(title_shape, "shape_id", None)

        heading_text = f"Slide {slide_no}" + (f": {title_text}" if title_text else "")
        section_path = [heading_text]
        # NOTE(review): the JSON heading level is 1 while the Markdown output
        # renders slide headings as "##" - confirm this is intended.
        blocks.append(
            {
                "kind": "heading",
                "level": 1,
                "text": heading_text,
                "slide": slide_no,
                "section_path": list(section_path),
            }
        )

        recent_text = heading_text  # context handed to the captioner
        for shape in sorted(slide.shapes, key=_shape_sort_key):
            if title_shape_id is not None and getattr(shape, "shape_id", None) == title_shape_id:
                continue  # the title is already part of the heading
            if shape.has_table:
                _emit_table(shape, slide_no, list(section_path))
                continue
            if _shape_type(shape) in (MSO_SHAPE_TYPE.PICTURE, 13):
                _emit_picture(shape, slide_no, list(section_path), recent_text)
                continue
            if not shape.has_text_frame:
                continue
            for para in shape.text_frame.paragraphs:
                text = "".join(run.text for run in para.runs).strip()
                if not text and para.text:
                    text = para.text.strip()
                if not text:
                    continue
                level = getattr(para, "level", 0) or 0
                blocks.append(
                    {
                        "kind": "paragraph",
                        "text": text,
                        "list_level": level if level > 0 else None,
                        "slide": slide_no,
                        "section_path": list(section_path),
                    }
                )
                recent_text = text

        # speaker notes
        notes_text = ""
        try:
            if slide.has_notes_slide:
                notes_text = (slide.notes_slide.notes_text_frame.text or "").strip()
        except Exception:
            notes_text = ""
        if notes_text:
            blocks.append(
                {
                    "kind": "notes",
                    "text": notes_text,
                    "slide": slide_no,
                    "section_path": list(section_path),
                }
            )

    # Computed once so document.md and document.json carry identical values.
    n_slides = len(prs.slides)
    source_sha1 = file_sha1(real_source)
    parsed_at = utc_now_iso()

    # markdown rendering ----------------------------------------------------
    md_lines: list[str] = [
        f"# {source.stem}",
        "",
        f"> Source: `{source.name}` \u00b7 sha1 `{source_sha1[:12]}` "
        f"\u00b7 parsed `{parsed_at}` \u00b7 slides: {n_slides}",
        "",
    ]
    md_lines.extend(_render_pptx_markdown(blocks, image_caption_results))
    md_text = "\n".join(md_lines).rstrip() + "\n"

    def _count(kind: str) -> int:
        return sum(1 for b in blocks if b["kind"] == kind)

    json_payload = {
        "source": {
            "filename": source.name,
            "absolute_path": str(real_source),
            "sha1": source_sha1,
            "size_bytes": real_source.stat().st_size,
            "kind": "pptx",
        },
        "parsed_at": parsed_at,
        "n_slides": n_slides,
        "blocks": blocks,
        "images": images,
        "stats": {
            "n_blocks": len(blocks),
            "n_slides": n_slides,
            "n_headings": _count("heading"),
            "n_paragraphs": _count("paragraph"),
            "n_tables": _count("table"),
            "n_images": len(images),
            "n_notes": _count("notes"),
            "n_captioned_images": sum(
                1
                for v in image_caption_results.values()
                if v and not v.get("error") and v.get("caption")
            ),
        },
    }

    if write_outputs:
        write_text(out_dir / "document.md", md_text)
        write_json(out_dir / "document.json", json_payload)

    return json_payload
docparser/py.typed ADDED
File without changes
docparser/text.py ADDED
@@ -0,0 +1,189 @@
1
+ """Plain-text and Markdown parser (core, no extra dependencies).
2
+
3
+ Handles ``.txt`` and ``.md`` / ``.markdown`` files. Markdown is passed through
4
+ to ``document.md`` verbatim while a lightweight block model (headings, list
5
+ items, code fences, paragraphs) is emitted into ``document.json`` so downstream
6
+ RAG layers get structure. Plain text is split into paragraphs on blank lines.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import re
11
+ from collections.abc import Callable
12
+ from pathlib import Path
13
+ from typing import Any
14
+
15
+ from .common import (
16
+ WorkspaceLayout,
17
+ file_sha1,
18
+ utc_now_iso,
19
+ write_json,
20
+ write_text,
21
+ )
22
+
23
+ _ATX_RE = re.compile(r"^(#{1,6})\s+(.*)$")
24
+ _LIST_RE = re.compile(r"^\s*([-*+]|\d+[.)])\s+(.*)$")
25
+
26
+
27
+ def _blocks_from_markdown(text: str) -> list[dict[str, Any]]:
28
+ blocks: list[dict[str, Any]] = []
29
+ section_stack: list[str] = []
30
+ in_code = False
31
+ code_lang = ""
32
+ code_buf: list[str] = []
33
+ para_buf: list[str] = []
34
+
35
+ def flush_para() -> None:
36
+ if para_buf:
37
+ joined = " ".join(p.strip() for p in para_buf if p.strip()).strip()
38
+ if joined:
39
+ blocks.append(
40
+ {"kind": "paragraph", "text": joined, "section_path": list(section_stack)}
41
+ )
42
+ para_buf.clear()
43
+
44
+ for raw_line in text.splitlines():
45
+ line = raw_line.rstrip("\n")
46
+ fence = line.strip()
47
+ if fence.startswith("```") or fence.startswith("~~~"):
48
+ if in_code:
49
+ blocks.append(
50
+ {
51
+ "kind": "code",
52
+ "language": code_lang,
53
+ "text": "\n".join(code_buf),
54
+ "section_path": list(section_stack),
55
+ }
56
+ )
57
+ code_buf = []
58
+ in_code = False
59
+ code_lang = ""
60
+ else:
61
+ flush_para()
62
+ in_code = True
63
+ code_lang = fence[3:].strip()
64
+ continue
65
+ if in_code:
66
+ code_buf.append(line)
67
+ continue
68
+
69
+ m = _ATX_RE.match(line)
70
+ if m:
71
+ flush_para()
72
+ level = len(m.group(1))
73
+ htext = m.group(2).strip()
74
+ while len(section_stack) >= level:
75
+ section_stack.pop()
76
+ section_stack.append(htext)
77
+ blocks.append(
78
+ {
79
+ "kind": "heading",
80
+ "level": level,
81
+ "text": htext,
82
+ "section_path": list(section_stack),
83
+ }
84
+ )
85
+ continue
86
+
87
+ lm = _LIST_RE.match(line)
88
+ if lm:
89
+ flush_para()
90
+ blocks.append(
91
+ {
92
+ "kind": "list_item",
93
+ "text": lm.group(2).strip(),
94
+ "section_path": list(section_stack),
95
+ }
96
+ )
97
+ continue
98
+
99
+ if not line.strip():
100
+ flush_para()
101
+ continue
102
+
103
+ para_buf.append(line)
104
+
105
+ if in_code and code_buf:
106
+ blocks.append(
107
+ {
108
+ "kind": "code",
109
+ "language": code_lang,
110
+ "text": "\n".join(code_buf),
111
+ "section_path": list(section_stack),
112
+ }
113
+ )
114
+ flush_para()
115
+ return blocks
116
+
117
+
118
+ def _blocks_from_plaintext(text: str) -> list[dict[str, Any]]:
119
+ blocks: list[dict[str, Any]] = []
120
+ for chunk in re.split(r"\n\s*\n", text):
121
+ para = " ".join(line.strip() for line in chunk.splitlines() if line.strip()).strip()
122
+ if para:
123
+ blocks.append({"kind": "paragraph", "text": para, "section_path": []})
124
+ return blocks
125
+
126
+
127
def parse_text(
    source: Path | str,
    layout: WorkspaceLayout | None = None,
    *,
    captioner: Callable[..., dict[str, Any]] | None = None,
    write_outputs: bool = True,
) -> dict[str, Any]:
    """Parse a ``.txt`` / ``.md`` file into Markdown + JSON.

    The ``captioner`` argument is accepted for API symmetry but unused (plain
    text has no embedded images).

    Args:
        source: Path of the text or Markdown file. It is read as UTF-8 with
            undecodable bytes replaced.
        layout: Workspace layout used to locate the output directory; a
            default ``WorkspaceLayout()`` is used when omitted.
        captioner: Ignored.
        write_outputs: When true, ``document.md`` and ``document.json`` are
            written to the output directory.

    Returns:
        The JSON payload: source metadata, ``parsed_at``, ``blocks`` and
        ``stats``.
    """
    _ = captioner
    source = Path(source)
    layout = layout or WorkspaceLayout()
    real_source = source.resolve()
    raw = real_source.read_text(encoding="utf-8", errors="replace")

    # Files with a Markdown suffix are passed through verbatim; anything else
    # is treated as plain text.
    is_markdown = source.suffix.lower() in {".md", ".markdown", ".mdown", ".mkd"}
    blocks = _blocks_from_markdown(raw) if is_markdown else _blocks_from_plaintext(raw)

    out_dir = layout.parsed_dir_for(source)
    if write_outputs:
        out_dir.mkdir(parents=True, exist_ok=True)

    # Computed once so the Markdown header and the JSON payload agree, and the
    # file is only hashed a single time.
    source_sha1 = file_sha1(real_source)
    parsed_at = utc_now_iso()

    header = (
        f"> Source: `{source.name}` \u00b7 sha1 `{source_sha1[:12]}` "
        f"\u00b7 parsed `{parsed_at}`"
    )
    if is_markdown:
        md_text = f"{header}\n\n{raw.strip()}\n"
    else:
        md_lines = [f"# {source.stem}", "", header, ""]
        for b in blocks:
            md_lines.append(b["text"])
            md_lines.append("")
        md_text = "\n".join(md_lines).rstrip() + "\n"

    def _count(kind: str) -> int:
        return sum(1 for b in blocks if b["kind"] == kind)

    json_payload = {
        "source": {
            "filename": source.name,
            "absolute_path": str(real_source),
            "sha1": source_sha1,
            "size_bytes": real_source.stat().st_size,
            "kind": "markdown" if is_markdown else "text",
        },
        "parsed_at": parsed_at,
        "blocks": blocks,
        "stats": {
            "n_blocks": len(blocks),
            "n_headings": _count("heading"),
            "n_paragraphs": _count("paragraph"),
            "n_list_items": _count("list_item"),
            "n_code_blocks": _count("code"),
            "n_chars": len(raw),
        },
    }

    if write_outputs:
        write_text(out_dir / "document.md", md_text)
        write_json(out_dir / "document.json", json_payload)

    return json_payload