markdown-extractor 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
1
+ """markdown-extractor — extract structured sections from Markdown.
2
+
3
+ Public API:
4
+ MDExtractor — entry point for parsing a Markdown document.
5
+ Section — a node in the parsed header tree.
6
+ Block — a node in a section's parsed body block tree.
7
+ """
8
+
9
+ from markdown_extractor.blocks import Block
10
+ from markdown_extractor.extractor import MDExtractor
11
+ from markdown_extractor.section import Section
12
+
13
+ __version__ = "0.1.0"
14
+ __all__ = ["MDExtractor", "Section", "Block", "__version__"]
@@ -0,0 +1,359 @@
1
+ """Body-block parser — turn a section's prose into a tree of blocks.
2
+
3
+ The header parser in :mod:`markdown_extractor.parser` splits a document by header
4
+ *level* and stops there: it never looks inside a section's body. This
5
+ module is the second pass — it walks the lines of a single section and
6
+ produces a small block tree (paragraphs, lists, list items, code blocks,
7
+ blockquotes) so callers can ask for ``Section.to_list()``, embed the
8
+ structure in ``Section.to_dict()``, or render it to HTML.
9
+
10
+ The grammar is intentionally a CommonMark *subset* — enough to do useful
11
+ things with typical README/spec/FAQ-style documents without growing into
12
+ a full Markdown parser. The header parser stays the source of truth for
13
+ the document's outline; this module only sees the body text in between.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import re
19
+ from dataclasses import dataclass, field
20
+ from typing import Iterator, List, Optional
21
+
22
+ from markdown_extractor.text_renderer import strip_inline
23
+
24
+
25
+ # A block is one of:
26
+ # "paragraph" — a run of non-empty, non-structural lines
27
+ # "list" — bullet (-, *, +) container; children are list_items
28
+ # "ordered_list" — numbered (1. 1)) container; children are list_items
29
+ # "list_item" — one item; children are nested lists or paragraphs
30
+ # "code" — fenced block (``` or ~~~); language stored in ``info``
31
+ # "blockquote" — `>`-prefixed run; children are inner blocks
32
@dataclass
class Block:
    """One node of a section body's block tree.

    ``kind`` names the block type, ``text`` holds its raw text, ``children``
    holds nested blocks and ``info`` carries extra detail such as a code
    fence's language.
    """

    kind: str
    text: str = ""
    children: List["Block"] = field(default_factory=list)
    info: str = ""  # code language, list marker style, etc.

    def walk(self) -> Iterator["Block"]:
        """Yield this block, then every descendant, depth-first (pre-order)."""
        yield self
        for descendant_root in self.children:
            yield from descendant_root.walk()

    def to_dict(self) -> dict:
        """Return a plain-dict view; ``info``/``children`` appear only when non-empty."""
        payload: dict = {"kind": self.kind, "text": self.text}
        if self.info:
            payload["info"] = self.info
        if self.children:
            payload["children"] = [child.to_dict() for child in self.children]
        return payload

    @property
    def text_plain(self) -> str:
        """``self.text`` passed through ``strip_inline``.

        Intended to drop inline Markdown markers (``**bold**`` → ``bold``,
        ``[label](url)`` → ``label``). The null sentinel has empty text, so
        it is expected to yield ``""``.
        """
        return strip_inline(self.text)

    def get(self, *indices: int) -> "Block":
        """Follow ``indices`` down through ``children`` without raising.

        ``block.get(1, 0)`` reaches ``block.children[1].children[0]``. When
        the current node is falsy or an index is outside the valid
        (negative-aware) range, the null-Block sentinel is returned instead,
        so chained calls such as ``block.get(99).get(0).text_plain`` stay safe.
        With no indices the block itself is returned.
        """
        current: "Block" = self
        for index in indices:
            if not current:
                return _null_block()
            count = len(current.children)
            if count == 0 or index < -count or index >= count:
                return _null_block()
            current = current.children[index]
        return current

    def __bool__(self) -> bool:
        """Truthy unless ``kind`` is the empty string (the null sentinel's kind)."""
        return self.kind != ""
87
+
88
+
89
# Code fence line: up to three leading spaces, a run of three or more
# backticks or tildes (group 2), then the remainder of the line (group 3).
_FENCE_RE = re.compile(r"^([ ]{0,3})(`{3,}|~{3,})(.*)$")
# Bullet item: optional space/tab indent, one of ``-``, ``*``, ``+``, at least
# one space or tab, then the item text (``rest``).
_BULLET_RE = re.compile(r"^(?P<indent>[ \t]*)(?P<marker>[-*+])[ \t]+(?P<rest>.*)$")
# Ordered item: optional indent, one or more digits, ``.`` or ``)``, at least
# one space or tab, then the item text (``rest``).
_ORDERED_RE = re.compile(r"^(?P<indent>[ \t]*)(?P<num>\d+)(?P<sep>[.)])[ \t]+(?P<rest>.*)$")
# Blockquote line: up to three leading spaces, ``>``, one optional space, then
# the quoted text (``rest``).
_BLOCKQUOTE_RE = re.compile(r"^[ ]{0,3}>[ ]?(?P<rest>.*)$")
93
+
94
+
95
+ def _expand_indent(s: str) -> int:
96
+ """Visual column of the first non-whitespace char (tabs = 4 cols)."""
97
+ col = 0
98
+ for ch in s:
99
+ if ch == " ":
100
+ col += 1
101
+ elif ch == "\t":
102
+ col += 4 - (col % 4)
103
+ else:
104
+ break
105
+ return col
106
+
107
+
108
def parse_blocks(text: str) -> List[Block]:
    """Turn a section's body text into its list of top-level blocks.

    Empty or whitespace-only input yields an empty list; otherwise the text
    is split on ``"\\n"`` and handed to ``_parse`` with a base indent of 0.
    """
    if text and text.strip():
        rows = text.split("\n")
        return _parse(rows, 0, len(rows), base_indent=0)
    return []
114
+
115
+
116
def _parse(lines: List[str], start: int, end: int, base_indent: int) -> List[Block]:
    """Parse ``lines[start:end]`` into sibling blocks.

    Blank lines between blocks are skipped. Parsing stops early at the first
    non-blank line indented less than ``base_indent``. Each remaining line is
    dispatched, in priority order, to the fence, blockquote, list or
    paragraph consumer, which returns the block and the next index to read.
    """
    parsed: List[Block] = []
    pos = start
    while pos < end:
        current = lines[pos]

        if not current.strip():
            # Blank separator between blocks.
            pos += 1
            continue

        if _expand_indent(current) < base_indent:
            break

        fence = _FENCE_RE.match(current)
        if fence:
            node, pos = _consume_fence(lines, pos, end, fence)
        elif _BLOCKQUOTE_RE.match(current):
            node, pos = _consume_blockquote(lines, pos, end)
        else:
            ordered_hit = _ORDERED_RE.match(current)
            if ordered_hit or _BULLET_RE.match(current):
                node, pos = _consume_list(
                    lines, pos, end, base_indent, ordered=ordered_hit is not None
                )
            else:
                node, pos = _consume_paragraph(lines, pos, end, base_indent)
        parsed.append(node)

    return parsed
158
+
159
+
160
def _consume_fence(lines, i, end, m_fence):
    """Consume a fenced code block whose opening line is ``lines[i]``.

    ``m_fence`` is the opening line's match: group 2 is the fence marker and
    group 3 the info string. The block closes at the first line that has up
    to three leading spaces followed by at least as many of the same fence
    character and only trailing spaces/tabs. If none is found the block runs
    to ``end``. Returns ``(code_block, next_index)``.
    """
    opener = m_fence.group(2)
    language = m_fence.group(3).strip()
    closing = re.compile(
        r"^[ ]{0,3}" + re.escape(opener[0]) + r"{" + str(len(opener)) + r",}[ \t]*$"
    )

    cursor = i + 1
    while cursor < end and not closing.match(lines[cursor]):
        cursor += 1
    content = lines[i + 1:cursor]

    # Step over the closing fence when one was actually found.
    resume = cursor + 1 if cursor < end else cursor
    return Block(kind="code", text="\n".join(content), info=language), resume
175
+
176
+
177
def _consume_blockquote(lines, i, end):
    """Consume a run of ``>``-prefixed lines starting at ``lines[i]``.

    The run ends at the first line that is not a blockquote line, whether it
    is blank or not (lazy continuation is not supported in this subset). The
    unprefixed text is re-parsed with ``parse_blocks`` to build the children.
    Returns ``(blockquote_block, next_index)``.
    """
    quoted: List[str] = []
    cursor = i
    while cursor < end:
        hit = _BLOCKQUOTE_RE.match(lines[cursor])
        if hit is None:
            break
        quoted.append(hit.group("rest"))
        cursor += 1

    inner = "\n".join(quoted)
    nested = parse_blocks(inner)
    return Block(kind="blockquote", text=inner, children=nested), cursor
194
+
195
+
196
def _consume_list(lines, i, end, base_indent, ordered: bool):
    """Consume a contiguous list at the indentation of ``lines[i]``.

    Items at the same indent and of the same marker type (bullet vs.
    numbered) attach to this list. More-indented content (a sub-list or an
    indented paragraph) becomes a child of the most recent item. The list
    ends at less-indented content, at a same-indent line that is not a list
    marker, or at a same-indent marker of the other type.

    Args:
        lines: All lines being parsed.
        i: Index of the first list item line.
        end: Exclusive upper bound on line indexes to read.
        base_indent: Accepted for signature parity with the other consumers;
            the list's own indent is taken from ``lines[i]``.
        ordered: ``True`` for a numbered list, ``False`` for a bullet list.

    Returns:
        ``(list_block, next_index)`` where ``next_index`` is the first line
        not consumed by this list.
    """
    list_indent = _expand_indent(lines[i])
    kind = "ordered_list" if ordered else "list"
    items: List[Block] = []
    j = i
    while j < end:
        if not lines[j].strip():
            # Blank lines are allowed inside a list; skip the run and look at
            # the next non-blank line to decide whether we are still in it.
            j += 1
            while j < end and not lines[j].strip():
                j += 1
            if j >= end:
                break
            next_indent = _expand_indent(lines[j])
            if next_indent < list_indent:
                break
            if next_indent == list_indent:
                # Same indent: stay only for a marker of this list's type.
                # Plain text or the other marker type ends the list so the
                # outer parser can start a fresh block.
                pattern = _ORDERED_RE if ordered else _BULLET_RE
                if not pattern.match(lines[j]):
                    break
            # Otherwise fall through: lines[j] is a same-type item or
            # indented continuation content, handled below.

        line = lines[j]
        indent = _expand_indent(line)
        m_b = _BULLET_RE.match(line)
        m_o = _ORDERED_RE.match(line)

        if indent == list_indent and (m_b or m_o):
            # Marker at this list's indent: keep it only if the type matches.
            m = m_o if ordered else m_b
            if m is None:
                break
            items.append(Block(kind="list_item", text=m.group("rest").strip()))
            j += 1
            continue

        if indent > list_indent and items:
            # Continuation / nested content for the most recent item.
            sub_end = _find_block_end(lines, j, end, list_indent)
            sub_lines = lines[j:sub_end]
            # Every non-blank line in the slice is indented deeper than the
            # list marker, so use that as the floor. Using the first
            # continuation line's indent instead would make the nested parse
            # stop at any later, shallower continuation line and silently
            # drop the rest of the slice.
            sub_blocks = _parse(
                sub_lines, 0, len(sub_lines), base_indent=list_indent + 1
            )
            items[-1].children.extend(sub_blocks)
            j = sub_end
            continue

        # Same or lower indent without a matching marker → list ends.
        break

    return Block(kind=kind, text="", children=items), j
277
+
278
+
279
def _find_block_end(lines, start, end, parent_indent):
    """Return the index where content nested under ``parent_indent`` stops.

    That is the first non-blank line indented at or below ``parent_indent``.
    When such a line follows a run of blank lines, the index of the first
    blank line in that run is returned instead. If the scan runs off the
    end, ``end`` is returned.
    """
    pos = start
    while pos < end:
        if lines[pos].strip():
            if _expand_indent(lines[pos]) <= parent_indent:
                return pos
            pos += 1
            continue

        # Blank run: find the next non-blank line before deciding.
        after = pos + 1
        while after < end and not lines[after].strip():
            after += 1
        if after >= end:
            return after
        if _expand_indent(lines[after]) <= parent_indent:
            return pos
        pos = after
    return pos
299
+
300
+
301
def _consume_paragraph(lines, i, end, base_indent):
    """Consume a paragraph starting at ``lines[i]``.

    Lines are collected until a blank line, a line indented less than
    ``base_indent``, or a line that starts a list item, blockquote or code
    fence. Collected lines are stripped and joined with single spaces.
    Returns ``(paragraph_block, next_index)``.
    """
    structural = (_BULLET_RE, _ORDERED_RE, _BLOCKQUOTE_RE, _FENCE_RE)
    pieces: List[str] = []
    cursor = i
    while cursor < end:
        raw = lines[cursor]
        trimmed = raw.strip()
        if not trimmed or _expand_indent(raw) < base_indent:
            break
        if any(pattern.match(raw) for pattern in structural):
            break
        pieces.append(trimmed)
        cursor += 1
    return Block(kind="paragraph", text=" ".join(pieces)), cursor
322
+
323
+
324
def flatten(blocks: List[Block]) -> List[str]:
    """Reduce top-level blocks to a flat list of strings.

    A list or ordered list contributes one string per direct item (the
    item's own text; nested children are ignored). A blockquote contributes
    its text with surrounding whitespace stripped. Every other block
    contributes its text unchanged.
    """
    result: List[str] = []
    for block in blocks:
        if block.kind in ("list", "ordered_list"):
            result.extend(item.text for item in block.children)
        elif block.kind == "blockquote":
            result.append(block.text.strip())
        else:
            result.append(block.text)
    return result
339
+
340
+
341
+ # ---------------------------------------------------------------- null sentinel
342
+
343
_NULL_BLOCK: Optional[Block] = None


def _null_block() -> Block:
    """Return the shared null-Block sentinel, creating it on first use.

    :meth:`Block.get` hands this out for out-of-range indices. The sentinel
    is built with ``kind=""``, ``text=""``, ``children=[]`` and ``info=""``,
    so:

    - ``bool(b)`` is ``False``
    - ``b.text`` → ``""``, ``b.children`` → ``[]``
    - ``b.get(*more)`` → keeps returning this sentinel
    """
    global _NULL_BLOCK
    sentinel = _NULL_BLOCK
    if sentinel is None:
        sentinel = Block(kind="", text="", children=[], info="")
        _NULL_BLOCK = sentinel
    return sentinel
@@ -0,0 +1,193 @@
1
+ """Top-level facade that turns a Markdown string into a :class:`Section` tree."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+ from typing import Iterator, List, Optional, Union
8
+
9
+ from markdown_extractor.blocks import Block
10
+ from markdown_extractor.parser import parse
11
+ from markdown_extractor.section import Section
12
+
13
+
14
class MDExtractor:
    """Facade over a parsed Markdown document and its header tree.

    Sections can be reached with dictionary-style indexing::

        extractor = MDExtractor(text)
        extractor["Section 1"]["Subsection 1.1"]

    Indexing with the empty string gives the synthetic root section that
    stands for the whole document::

        extractor[""]  # whole document, including front matter
    """

    def __init__(self, markdown_content: str) -> None:
        if not isinstance(markdown_content, str):
            raise TypeError(
                "markdown_content must be a str, got "
                f"{type(markdown_content).__name__}"
            )
        self._content = markdown_content
        self._root = self._build_tree()

    # ------------------------------------------------------------------ construction

    @classmethod
    def from_file(
        cls, path: Union[str, Path], encoding: str = "utf-8"
    ) -> "MDExtractor":
        """Build an extractor from the text of the file at ``path``."""
        source = Path(path).read_text(encoding=encoding)
        return cls(source)

    def _build_tree(self) -> Section:
        """Parse the stored content and link its headers into a tree.

        A level-0 root section spans the whole document. Each header becomes
        a child of the nearest preceding header with a strictly lower level.
        """
        headers, lines = parse(self._content)
        total = len(lines)
        root = Section(
            title="",
            level=0,
            line_start=0,
            line_end=total,
            lines=lines,
        )
        open_sections: List[Section] = [root]
        for header in headers:
            # Close every open section at this header's level or deeper; what
            # remains on top is the new section's parent.
            while open_sections[-1].level >= header.level:
                open_sections.pop()
            owner = open_sections[-1]
            node = Section(
                title=header.title,
                level=header.level,
                line_start=header.line,
                parent=owner,
                lines=lines,
            )
            owner.children.append(node)
            open_sections.append(node)
        self._fill_line_ends(root, total)
        return root

    @staticmethod
    def _fill_line_ends(section: Section, doc_end: int) -> None:
        """Assign ``line_end`` across the subtree rooted at ``section``.

        A child ends where its next sibling starts; the last child ends where
        its parent ends. A section with no ``line_end`` yet gets ``doc_end``.
        """
        if section.line_end is None:
            section.line_end = doc_end
        siblings = section.children
        last = len(siblings) - 1
        for pos, child in enumerate(siblings):
            if pos == last:
                child.line_end = section.line_end
            else:
                child.line_end = siblings[pos + 1].line_start
            MDExtractor._fill_line_ends(child, doc_end)

    # ------------------------------------------------------------------ accessors

    @property
    def root(self) -> Section:
        """The synthetic level-0 section that contains every other section."""
        return self._root

    @property
    def content(self) -> str:
        """The Markdown source exactly as it was passed in."""
        return self._content

    def list(self) -> List[str]:
        """Titles of the top-level sections (delegates to the root section)."""
        return self._root.list()

    def get_section(self, *path: str) -> Section:
        """Follow a sequence of titles from the root down to a section."""
        return self._root.get_section(*path)

    def find(self, title: str) -> List[Section]:
        """Return every section titled ``title``, at any depth."""
        return self._root.find(title)

    def walk(self) -> Iterator[Section]:
        """Yield every header section depth-first, leaving out the level-0 root."""
        yield from (node for node in self._root.walk() if node.level > 0)

    def headers(self) -> List[Section]:
        """Every header section as a flat list, in :meth:`walk` order."""
        return list(self.walk())

    def to_dict(self) -> dict:
        """The whole tree as a JSON-friendly dict (the root section's dict)."""
        return self._root.to_dict()

    def to_json(self, **kwargs) -> str:
        """Serialize :meth:`to_dict` with ``json.dumps``, forwarding ``kwargs``."""
        tree = self.to_dict()
        return json.dumps(tree, **kwargs)

    def to_list(self) -> List[str]:
        """Flatten the document's body into one entry per top-level block.

        Delegates to the root section; :meth:`Section.to_list` is the
        per-section equivalent.
        """
        return self._root.to_list()

    def to_text(self) -> str:
        """Render the document body to plain text (Markdown markers stripped).

        Delegates to the root section; :meth:`Section.to_text` is the
        per-section equivalent.
        """
        return self._root.to_text()

    def to_html(
        self, xpath: Optional[str] = None, as_text: bool = False
    ) -> Union[str, List[str]]:
        """Render the whole document's body to HTML.

        Delegates to the root section. :meth:`Section.to_html` documents the
        XPath usage and the ``as_text`` parameter.
        """
        return self._root.to_html(xpath, as_text=as_text)

    def block(self, *indices: int) -> Block:
        """Soft index walk into the document's body blocks.

        Delegates to the root section; see :meth:`Section.block`.
        """
        return self._root.block(*indices)

    def get(self, *path: str) -> Section:
        """Soft path walk on the document — see :meth:`Section.get`.

        Gives back the matched section, or a null section sentinel when any
        title in ``path`` is missing. The null section is falsy and its
        ``to_list``/``to_dict``/``to_json``/``to_html``/``to_text`` methods
        all return empty values, so chained calls stay safe.
        """
        return self._root.get(*path)

    def tree(self) -> str:
        """ASCII tree of the document's header structure."""
        return self._root.tree()

    # ------------------------------------------------------------------ dunder

    def __getitem__(self, key: Union[str, int]) -> Section:
        wants_root = isinstance(key, str) and not key
        return self._root if wants_root else self._root[key]

    def __contains__(self, key: object) -> bool:
        return key in self._root

    def __iter__(self) -> Iterator[Section]:
        return iter(self._root)

    def __len__(self) -> int:
        return len(self._root)

    def __str__(self) -> str:
        return self._content

    def __repr__(self) -> str:
        return f"MDExtractor(headers={len(self.headers())})"