PyPI - markdown-extractor - Versions diffs - 0.1.0__py3-none-any.whl - Mend

markdown-extractor 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

markdown_extractor/__init__.py +14 -0
markdown_extractor/blocks.py +359 -0
markdown_extractor/extractor.py +193 -0
markdown_extractor/html_renderer.py +150 -0
markdown_extractor/parser.py +153 -0
markdown_extractor/py.typed +0 -0
markdown_extractor/section.py +320 -0
markdown_extractor/text_renderer.py +83 -0
markdown_extractor-0.1.0.dist-info/METADATA +676 -0
markdown_extractor-0.1.0.dist-info/RECORD +13 -0
markdown_extractor-0.1.0.dist-info/WHEEL +5 -0
markdown_extractor-0.1.0.dist-info/licenses/LICENSE +201 -0
markdown_extractor-0.1.0.dist-info/top_level.txt +1 -0

markdown_extractor/html_renderer.py ADDED Viewed

@@ -0,0 +1,150 @@
+"""Render the parsed block tree to HTML.
+Pure-Python, zero-dependency renderer for the same Markdown subset that
+:mod:`markdown_extractor.blocks` understands: paragraphs, ordered/unordered
+lists with nesting, code fences, blockquotes — plus the common inline
+constructs (``**bold**``, ``*em*``, ``` `code` ```, ``[text](url)``,
+``![alt](url)``).
+The output is plain HTML5 with no styling. It is intentionally minimal:
+``markdown-extractor``'s job is structural extraction, not pretty rendering.
+"""
+from __future__ import annotations
+import re
+from html import escape
+from typing import Iterable, List
+from markdown_extractor.blocks import Block
+def render(blocks: Iterable[Block]) -> str:
+    """Render ``blocks`` to one HTML fragment.
+    Each block is converted with :func:`_render_block` and the results are
+    joined with newlines; an empty iterable produces the empty string.
+    """
+    fragments = [_render_block(block) for block in blocks]
+    return "\n".join(fragments)
+def _render_block(block: Block) -> str:
+    """Render one block, and recursively its children, to HTML.
+    The element is chosen from ``block.kind``. A kind that is not handled
+    explicitly falls back to a ``<div>`` around the inline-rendered text.
+    """
+    kind = block.kind
+    if kind == "paragraph":
+        return f"<p>{_inline(block.text)}</p>"
+    if kind == "code":
+        # A non-empty info string becomes a ``language-…`` class; the code
+        # text is escaped verbatim and never inline-formatted.
+        attr = ""
+        if block.info:
+            attr = f' class="language-{escape(block.info)}"'
+        return f"<pre><code{attr}>{escape(block.text)}</code></pre>"
+    if kind == "blockquote":
+        # Prefer nested blocks; otherwise wrap the quote's own text.
+        if block.children:
+            quoted = render(block.children)
+        else:
+            quoted = f"<p>{_inline(block.text)}</p>"
+        return f"<blockquote>\n{quoted}\n</blockquote>"
+    if kind == "ordered_list" or kind == "list":
+        tag = "ul" if kind == "list" else "ol"
+        rendered_items = [_render_block(child) for child in block.children]
+        items = "\n".join(rendered_items)
+        return f"<{tag}>\n{items}\n</{tag}>"
+    if kind == "list_item":
+        body = _inline(block.text)
+        if not block.children:
+            return f"<li>{body}</li>"
+        nested = render(block.children)
+        return f"<li>{body}\n{nested}\n</li>"
+    return f"<div>{_inline(block.text)}</div>"
+# ---------------------------------------------------------------- inline
+# Order matters: replace inline code first (so its contents are not further
+# transformed), then images, links, bold, em.
+_CODE_RE = re.compile(r"`([^`]+)`")
+_IMG_RE = re.compile(r"!\[([^\]]*)\]\(([^)\s]+)\)")
+_LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)\s]+)\)")
+_BOLD_RE = re.compile(r"\*\*([^*]+)\*\*")
+_EM_RE = re.compile(r"(?<![*\w])\*([^*\n]+?)\*(?!\w)")
+_EM_UNDER_RE = re.compile(r"(?<![\w_])_([^_\n]+?)_(?!\w)")
+# Marker left in the working string wherever finished HTML was stashed.
+_PLACEHOLDER_RE = re.compile(r"\x00(\d+)\x00")
+def _escape_attr_quotes(value: str) -> str:
+    """Escape quote characters in text whose ``&``, ``<`` and ``>`` are already escaped."""
+    return value.replace('"', "&quot;").replace("'", "&#x27;")
+def _inline(text: str) -> str:
+    """Convert the inline-formatting subset to HTML.
+    Inline ``code`` content is escaped and stashed before any other rule
+    runs, so backticks shield their contents from bold/em/link parsing.
+    The remaining text is HTML-escaped exactly once; images and links are
+    then stashed as well so that underscores or asterisks inside URLs are
+    not mistaken for emphasis markers.
+    Returns ``""`` for empty or ``None`` input.
+    """
+    if not text:
+        return ""
+    # NUL delimits our placeholders. Replace any literal NUL in the input
+    # (CommonMark maps U+0000 to U+FFFD) so user text cannot forge one.
+    text = text.replace("\x00", "\ufffd")
+    placeholders: List[str] = []
+    def stash(html: str) -> str:
+        placeholders.append(html)
+        return f"\x00{len(placeholders) - 1}\x00"
+    def repl_code(m: re.Match) -> str:
+        return stash(f"<code>{escape(m.group(1))}</code>")
+    out = _CODE_RE.sub(repl_code, text)
+    out = escape(out, quote=False)
+    # From here on ``out`` is already escaped for ``&``, ``<`` and ``>``.
+    # Attribute values therefore only need their quotes escaped; running
+    # ``escape`` again would turn ``&amp;`` into ``&amp;amp;``.
+    def repl_img(m: re.Match) -> str:
+        alt = _escape_attr_quotes(m.group(1))
+        src = _escape_attr_quotes(m.group(2))
+        return stash(f'<img src="{src}" alt="{alt}">')
+    def repl_link(m: re.Match) -> str:
+        label = m.group(1)
+        href = _escape_attr_quotes(m.group(2))
+        return stash(f'<a href="{href}">{label}</a>')
+    out = _IMG_RE.sub(repl_img, out)
+    out = _LINK_RE.sub(repl_link, out)
+    out = _BOLD_RE.sub(lambda m: f"<strong>{m.group(1)}</strong>", out)
+    out = _EM_RE.sub(lambda m: f"<em>{m.group(1)}</em>", out)
+    out = _EM_UNDER_RE.sub(lambda m: f"<em>{m.group(1)}</em>", out)
+    # Restore stashed HTML. A stashed link may itself contain placeholders
+    # (inline code or an image inside the label), so expand recursively.
+    # This terminates because an entry can only reference earlier entries.
+    def unstash(m: re.Match) -> str:
+        return _PLACEHOLDER_RE.sub(unstash, placeholders[int(m.group(1))])
+    return _PLACEHOLDER_RE.sub(unstash, out)
+def query_xpath(html: str, xpath: str, as_text: bool = False) -> List[str]:
+    """Run ``xpath`` over ``html`` and return the matched fragments.
+    Requires the ``lxml`` extra (``pip install markdown-extractor[xpath]``).
+    ``html`` is parsed as a fragment wrapped in a synthetic ``<div>``, so
+    the expression is evaluated relative to that wrapper.
+    By default each element match is returned as an HTML string;
+    string/attribute matches are returned as-is.
+    With ``as_text=True``, element matches are flattened to their text
+    content (recursively — so ``<li><strong>Bold</strong> rest</li>``
+    yields ``"Bold rest"``). Use this when you want the data inside the
+    element rather than the markup. Compare to writing ``/text()`` in
+    the XPath itself, which only collects *direct* text children and
+    skips text nested inside inline tags.
+    Scalar expressions such as ``string(//p)`` or ``count(//li)`` yield a
+    one-element list holding the value (numbers and booleans via ``str``).
+    Returns ``[]`` for empty ``html`` without importing ``lxml``.
+    Raises :class:`ModuleNotFoundError` when ``lxml`` is not installed.
+    """
+    if not html:
+        return []
+    try:
+        from lxml import etree, html as lxml_html
+    except ImportError as e:  # pragma: no cover - only hit without lxml
+        raise ModuleNotFoundError(
+            "XPath queries require the 'lxml' package. "
+            "Install with: pip install markdown-extractor[xpath]"
+        ) from e
+    fragment = lxml_html.fragment_fromstring(html, create_parent="div")
+    results = fragment.xpath(xpath)
+    # lxml only returns a list for node-set expressions. Scalar expressions
+    # give a bare str/float/bool; iterating one would split a string into
+    # characters or raise TypeError on a number, so wrap it.
+    if not isinstance(results, list):
+        results = [results]
+    out: List[str] = []
+    for r in results:
+        if isinstance(r, str):
+            out.append(r)
+        elif isinstance(r, etree._Element):
+            if as_text:
+                out.append(r.text_content())
+            else:
+                # ``with_tail=False`` excludes sibling text after the closing
+                # tag — callers asking for an element fragment want just the
+                # element, not its trailing context.
+                out.append(
+                    lxml_html.tostring(r, encoding="unicode", with_tail=False).rstrip()
+                )
+        else:
+            out.append(str(r))
+    return out

markdown_extractor/parser.py ADDED Viewed

@@ -0,0 +1,153 @@
+"""Header detection for Markdown documents.
+The parser walks the document line by line while tracking which "block
+context" each line belongs to. Headers found inside fenced code blocks,
+math blocks, tables, or YAML front matter are intentionally ignored.
+"""
+from __future__ import annotations
+import re
+from typing import List, NamedTuple, Tuple
+class Header(NamedTuple):
+    """A single header occurrence detected in the source document."""
+    # Zero-based index into ``lines`` of the line holding the title.
+    line: int
+    # Number of ``#`` for ATX headers; 1 (``===``) or 2 (``---``) for setext.
+    level: int
+    # Title text with header markers and surrounding whitespace removed.
+    title: str
+# ATX header: optional indentation, 1–6 #s, required space, title, optional
+# trailing #s.  We allow any amount of leading whitespace because the spec
+# explicitly calls for "indentation support".
+_ATX_RE = re.compile(r"^[ \t]*(#{1,6})[ \t]+(.+?)(?:[ \t]+#+[ \t]*)?$")
+# Fenced code block opener: 3+ backticks or 3+ tildes with up to 3 leading
+# spaces (CommonMark restricts the indent of a fence to <4 spaces).
+_FENCE_RE = re.compile(r"^[ ]{0,3}(`{3,}|~{3,})")
+# Setext underlines.
+_SETEXT_H1_RE = re.compile(r"^[ ]{0,3}=+[ \t]*$")
+_SETEXT_H2_RE = re.compile(r"^[ ]{0,3}-+[ \t]*$")
+# Quick check for "looks like a list item" — used to disambiguate setext h2
+# from a regular --- horizontal rule below a paragraph.
+_LIST_RE = re.compile(r"^[ \t]*([-*+]|\d+[.)])[ \t]")
+# Table separator row: |---|---| or :---:|---: etc.
+_TABLE_SEP_RE = re.compile(r"^[ \t]*\|?[ \t]*:?-+:?([ \t]*\|[ \t]*:?-+:?)+[ \t]*\|?[ \t]*$")
+def _closing_fence_re(fence_char: str, fence_len: int) -> re.Pattern[str]:
+    """Build the pattern that closes a fence opened with ``fence_len`` ``fence_char``s.
+    The closer uses the same character, is at least as long as the opener,
+    may be indented by up to three spaces and carries no info string.
+    """
+    return re.compile(
+        r"^[ ]{0,3}" + re.escape(fence_char) + r"{" + str(fence_len) + r",}[ \t]*$"
+    )
+def parse(content: str) -> Tuple[List[Header], List[str]]:
+    """Return ``(headers, lines)`` for ``content``.
+    ``lines`` is the document split on newline characters (newlines
+    stripped) and is shared with :class:`markdown_extractor.section.Section`
+    so each section can rebuild its own slice of the source on demand.
+    ``headers`` lists, in document order, every ATX or setext header found
+    outside YAML front matter, fenced code, ``$$`` math blocks and tables.
+    A setext underline only counts when the line above it was ordinary
+    text; a line already consumed as a fence, math delimiter, table row,
+    front matter, ATX header or another underline is never used as a title.
+    """
+    lines = content.split("\n")
+    headers: List[Header] = []
+    in_code = False
+    closing_re: re.Pattern[str] | None = None
+    in_math = False
+    in_yaml = False
+    yaml_checked = False
+    in_table = False
+    # True when the previous line fell through every structural check below,
+    # i.e. it is plain text that a setext underline may promote to a title.
+    prev_is_text = False
+    for i, line in enumerate(lines):
+        stripped = line.strip()
+        # Capture the flag, then clear it so every early ``continue`` below
+        # leaves the current line marked as "not plain text".
+        title_candidate = prev_is_text
+        prev_is_text = False
+        # ---------- YAML front matter (only at the very top of the file)
+        if not yaml_checked:
+            yaml_checked = True
+            if stripped == "---":
+                in_yaml = True
+                continue
+        if in_yaml:
+            if stripped == "---" or stripped == "...":
+                in_yaml = False
+            continue
+        # ---------- Fenced code blocks
+        if in_code:
+            assert closing_re is not None
+            if closing_re.match(line):
+                in_code = False
+                closing_re = None
+            continue
+        m_fence = _FENCE_RE.match(line)
+        if m_fence:
+            in_code = True
+            marker = m_fence.group(1)
+            closing_re = _closing_fence_re(marker[0], len(marker))
+            in_table = False
+            continue
+        # ---------- Math blocks ($$ ... $$)
+        if stripped == "$$":
+            in_math = not in_math
+            in_table = False
+            continue
+        if in_math:
+            continue
+        # ---------- Tables
+        # A table block runs from the first pipe-line through a blank line.
+        # Headers inside a table are extremely unusual but we still skip
+        # them to honour the spec.
+        if not stripped:
+            in_table = False
+        elif _TABLE_SEP_RE.match(line) or "|" in stripped:
+            # Stay in table mode while we see pipe-lines or separators.
+            if in_table or _TABLE_SEP_RE.match(line) or stripped.startswith("|"):
+                in_table = True
+                # Continue: a pipe-line itself can never be a header.
+                continue
+        if in_table:
+            continue
+        # ---------- ATX headers
+        m_atx = _ATX_RE.match(line)
+        if m_atx:
+            level = len(m_atx.group(1))
+            # The regex has already removed a closing ``#`` run (which must
+            # be preceded by whitespace), so a trailing ``#`` left here is
+            # part of the title, e.g. ``# C#``. A title made only of ``#``
+            # is an empty header with a closing run and is skipped.
+            title = m_atx.group(2).strip()
+            if title.strip("#"):
+                headers.append(Header(line=i, level=level, title=title))
+            continue
+        # ---------- Setext headers (=== / ---)
+        # The underline lives on the *current* line; the title is the
+        # previous line.  We only accept it when the previous line is
+        # genuine paragraph text (not a list item, not blank, and not
+        # already classified as something else).
+        if stripped and title_candidate:
+            prev = lines[i - 1]
+            if not _LIST_RE.match(prev):
+                prev_stripped = prev.strip()
+                if _SETEXT_H1_RE.match(line):
+                    headers.append(Header(line=i - 1, level=1, title=prev_stripped))
+                    continue
+                if _SETEXT_H2_RE.match(line) and prev_stripped != "---":
+                    headers.append(Header(line=i - 1, level=2, title=prev_stripped))
+                    continue
+        # Any non-blank line that reaches this point is ordinary text.
+        prev_is_text = bool(stripped)
+    return headers, lines

markdown_extractor/py.typed ADDED Viewed

File without changes

markdown_extractor/section.py ADDED Viewed

@@ -0,0 +1,320 @@
+"""The :class:`Section` node — one entry in the parsed header tree."""
+from __future__ import annotations
+import json
+from typing import Any, Dict, Iterator, List, Optional, Union
+from markdown_extractor.blocks import Block, _null_block, flatten, parse_blocks
+from markdown_extractor.html_renderer import query_xpath, render
+from markdown_extractor.text_renderer import render_text
+class Section:
+    """One header of a parsed Markdown document together with its body.
+    A section keeps a reference to the shared list of source lines and a
+    ``[line_start, line_end)`` range, and slices that list on access. No
+    section stores its own copy of the text.
+    """
+    __slots__ = (
+        "title",
+        "level",
+        "line_start",
+        "line_end",
+        "parent",
+        "children",
+        "_lines",
+        "_blocks_cache",
+    )
+    def __init__(
+        self,
+        title: str,
+        level: int,
+        line_start: int,
+        line_end: Optional[int] = None,
+        parent: Optional["Section"] = None,
+        lines: Optional[List[str]] = None,
+    ) -> None:
+        self._lines = lines
+        self._blocks_cache: Optional[List[Block]] = None
+        self.title = title
+        self.level = level
+        self.line_start = line_start
+        self.line_end = line_end
+        self.parent = parent
+        self.children: List["Section"] = []
+    # ------------------------------------------------------------------ slices
+    def _slice(self, start: int, end: Optional[int]) -> str:
+        """Join source lines ``[start, end)``; ``""`` when lines or end are unset."""
+        if self._lines is None or self.line_end is None:
+            return ""
+        return "\n".join(self._lines[start:end])
+    def _body_start(self) -> int:
+        """First line after the header; the level-0 root has no header line."""
+        if self.level == 0:
+            return self.line_start
+        return self.line_start + 1
+    @property
+    def content(self) -> str:
+        """Header line and everything below it, subsections included."""
+        return self._slice(self.line_start, self.line_end)
+    @property
+    def body(self) -> str:
+        """Like :attr:`content` but without the header line.
+        The level-0 root has no header line, so for it this equals
+        :attr:`content`.
+        """
+        return self._slice(self._body_start(), self.line_end)
+    @property
+    def text(self) -> str:
+        """Only this section's own prose: no header line, and cut off
+        where the first child section begins."""
+        if self.children:
+            stop = self.children[0].line_start
+        else:
+            stop = self.line_end
+        return self._slice(self._body_start(), stop)
+    # ------------------------------------------------------------------ navigation
+    @property
+    def path(self) -> List[str]:
+        """Ancestor titles followed by this section's title; level-0 nodes are left out."""
+        titles: List[str] = []
+        current: Optional[Section] = self
+        while current is not None and current.level > 0:
+            titles.insert(0, current.title)
+            current = current.parent
+        return titles
+    def list(self) -> List[str]:
+        """Titles of the direct child sections, in order."""
+        return [child.title for child in self.children]
+    def get_section(self, *path: str) -> "Section":
+        """Follow ``path`` one child title at a time, e.g. ``s.get_section("A", "B")``.
+        Strict: a missing title raises :class:`KeyError`. Use :meth:`get`
+        for the variant that returns a null section instead.
+        """
+        current: Section = self
+        for title in path:
+            current = current[title]
+        return current
+    def get(self, *path: str) -> "Section":
+        """Follow ``path`` like :meth:`get_section`, but never raise.
+        As soon as a title is missing (or the walk reaches a falsy
+        section) the shared null section is returned. That sentinel is
+        falsy, so ``if section.get("Foo", "Bar"):`` tests for presence.
+        Use ``[]`` (or :meth:`get_section`) when a miss should fail loudly.
+        """
+        current: Section = self
+        for title in path:
+            if not current:  # already null — keep flowing
+                return _null_section()
+            match = next(
+                (child for child in current.children if child.title == title), None
+            )
+            if match is None:
+                return _null_section()
+            current = match
+        return current
+    def find(self, title: str) -> List["Section"]:
+        """Every descendant titled ``title``, in depth-first order."""
+        matches: List[Section] = []
+        for child in self.children:
+            # ``walk`` yields the child first and then its descendants, which
+            # is the same pre-order a recursive search would visit.
+            matches.extend(node for node in child.walk() if node.title == title)
+        return matches
+    def walk(self) -> Iterator["Section"]:
+        """Yield this section, then each descendant, depth-first."""
+        yield self
+        for child in self.children:
+            yield from child.walk()
+    # ------------------------------------------------------------------ body blocks
+    @property
+    def blocks(self) -> List[Block]:
+        """Block tree parsed from :attr:`text`, computed once and cached.
+        Child sections are not part of this tree; they are reachable
+        through :attr:`children`.
+        """
+        cached = self._blocks_cache
+        if cached is None:
+            cached = parse_blocks(self.text)
+            self._blocks_cache = cached
+        return cached
+    def to_list(self) -> List[str]:
+        """Return :attr:`blocks` flattened to a list of strings.
+        This is the body as data, as opposed to :meth:`list`, which
+        returns the titles of child sections.
+        """
+        return flatten(self.blocks)
+    def block(self, *indices: int) -> Block:
+        """Index into the body block tree without raising.
+        The first index selects from :attr:`blocks` (negative values count
+        from the end); remaining indices are passed to that block's
+        ``get``. With no indices, or a first index out of range, a null
+        block is returned, so chains like ``section.block(99, 0)`` are safe.
+        """
+        if not indices:
+            return _null_block()
+        top = self.blocks
+        first, rest = indices[0], indices[1:]
+        count = len(top)
+        if count == 0 or not (-count <= first < count):
+            return _null_block()
+        return top[first].get(*rest)
+    def to_text(self) -> str:
+        """Render :attr:`blocks` to plain text via ``render_text``."""
+        return render_text(self.blocks)
+    # ------------------------------------------------------------------ serialisation
+    def to_dict(self) -> Dict[str, Any]:
+        """Nested dict with ``title``, ``level``, ``text``, ``blocks`` and ``children``.
+        ``blocks`` holds each body block's ``to_dict()``; ``children``
+        holds each child section converted the same way.
+        """
+        data: Dict[str, Any] = {"title": self.title, "level": self.level}
+        data["text"] = self.text
+        data["blocks"] = [blk.to_dict() for blk in self.blocks]
+        data["children"] = [child.to_dict() for child in self.children]
+        return data
+    def to_json(self, **kwargs: Any) -> str:
+        """Serialise :meth:`to_dict` with ``json.dumps``, forwarding ``kwargs``."""
+        payload = self.to_dict()
+        return json.dumps(payload, **kwargs)
+    def to_html(
+        self, xpath: Optional[str] = None, as_text: bool = False
+    ) -> Union[str, List[str]]:
+        """Render the body blocks as an HTML fragment.
+        With ``xpath=None`` the full HTML string is returned and
+        ``as_text`` is ignored. Otherwise the HTML is passed to
+        ``query_xpath`` together with ``as_text`` and the list it produces
+        is returned. XPath support needs the optional ``lxml`` extra::
+            pip install markdown-extractor[xpath]
+        """
+        rendered = render(self.blocks)
+        if xpath is not None:
+            return query_xpath(rendered, xpath, as_text=as_text)
+        return rendered
+    def tree(self, _indent: int = 0) -> str:
+        """Indented outline of this section and its descendants, one per line."""
+        if self.level == 0:
+            heading = "<root>"
+        else:
+            heading = f"{'#' * self.level} {self.title}"
+        rows = ["  " * _indent + heading]
+        rows.extend(child.tree(_indent + 1) for child in self.children)
+        return "\n".join(rows)
+    # ------------------------------------------------------------------ dunder
+    def __getitem__(self, key: Union[str, int]) -> "Section":
+        # Integers index the children by position; anything else is matched
+        # against child titles, and the first match is returned.
+        if isinstance(key, int):
+            return self.children[key]
+        match = next((child for child in self.children if child.title == key), None)
+        if match is not None:
+            return match
+        raise KeyError(
+            f"Section {key!r} not found. Available children: {self.list()!r}"
+        )
+    def __contains__(self, key: object) -> bool:
+        return isinstance(key, str) and any(
+            child.title == key for child in self.children
+        )
+    def __iter__(self) -> Iterator["Section"]:
+        return iter(self.children)
+    def __len__(self) -> int:
+        return len(self.children)
+    def __str__(self) -> str:
+        return self.content
+    def __bool__(self) -> bool:
+        """Truthy exactly when the section is bound to source lines; the
+        sentinel returned by :meth:`get` on a miss has none and is falsy."""
+        return self._lines is not None
+    def __repr__(self) -> str:
+        if self._lines is not None:
+            return (
+                f"Section(title={self.title!r}, level={self.level}, "
+                f"children={len(self.children)})"
+            )
+        return "Section(<null>)"
+# ---------------------------------------------------------------- null sentinel
+_NULL_SECTION: Optional[Section] = None
+def _null_section() -> Section:
+    """Return the shared null section that :meth:`Section.get` hands out on a miss.
+    The sentinel is created on first use and reused afterwards. It has an
+    empty title, level 0 and no source lines, therefore:
+    - ``bool(s)`` is ``False``
+    - ``content``, ``body`` and ``text`` are ``""``
+    - ``get(*more)`` with a non-empty path keeps returning this sentinel
+    """
+    global _NULL_SECTION
+    sentinel = _NULL_SECTION
+    if sentinel is None:
+        sentinel = Section(
+            title="", level=0, line_start=0, line_end=None, lines=None
+        )
+        _NULL_SECTION = sentinel
+    return sentinel