PyPI - mf2dom - Versions diffs - 0.1.9__py3-none-any.whl - Mend

mf2dom 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

mf2dom/__init__.py +8 -0
mf2dom/classes.py +87 -0
mf2dom/dom.py +133 -0
mf2dom/implied.py +166 -0
mf2dom/parser.py +395 -0
mf2dom/properties.py +257 -0
mf2dom/renderer.py +601 -0
mf2dom/text.py +66 -0
mf2dom/types.py +57 -0
mf2dom/urls.py +31 -0
mf2dom/vcp.py +211 -0
mf2dom-0.1.9.dist-info/METADATA +94 -0
mf2dom-0.1.9.dist-info/RECORD +15 -0
mf2dom-0.1.9.dist-info/WHEEL +4 -0
mf2dom-0.1.9.dist-info/licenses/LICENSE.md +651 -0

mf2dom/parser.py ADDED Viewed

@@ -0,0 +1,395 @@
+"""Microformats2 parser.
+Implements the mf2 parsing algorithm:
+https://microformats.org/wiki/microformats2-parsing
+Entry points:
+- `parse(...)` for synchronous parsing
+- `parse_async(...)` for running parsing off-thread
+"""
+from __future__ import annotations
+import asyncio
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Literal, TypeAlias, cast
+from justhtml import JustHTML
+from .dom import (
+    Element,
+    HasDom,
+    ancestor_elements,
+    get_attr,
+    iter_child_elements,
+    iter_descendant_elements,
+)
+from .implied import implied_name, implied_photo, implied_url
+from .properties import (
+    is_microformat_root,
+    parse_dt,
+    parse_e,
+    parse_p,
+    parse_u,
+    property_classes,
+    root_types,
+)
+from .text import text_content
+from .types import EValue, Mf2Document, Mf2Item
+from .urls import try_urljoin
+if TYPE_CHECKING:  # pragma: no cover
+    from .types import PropertyValue, RelUrl
+@dataclass(slots=True)
+class _ParseContext:
+    """Document-wide values shared by every `_parse_item` call during one parse."""
+    # Base URL passed on to the property parsers and implied-property helpers; may be None.
+    base_url: str | None
+    # `lang` taken from the document's `<html>` element; last-resort language for `e-*` values.
+    document_lang: str | None
+# Inputs accepted by `parse`: markup as text or a bytes-like object, a JustHTML
+# document, or an object exposing the DOM interface (`HasDom`).
+HtmlInput: TypeAlias = str | bytes | bytearray | memoryview | JustHTML | HasDom
+def _first_lang(doc_root: HasDom) -> str | None:
+    """Return the `lang` attribute of the first `<html>` element below `doc_root`.
+    Returns None when no `<html>` element is found; otherwise returns whatever
+    `get_attr` reports for that element's `lang`.
+    """
+    html_elements = (
+        node for node in iter_descendant_elements(doc_root) if node.name.lower() == "html"
+    )
+    first_html = next(html_elements, None)
+    if first_html is None:
+        return None
+    return get_attr(first_html, "lang")
+def _discover_base_url(doc_root: HasDom, *, base_url: str | None) -> str | None:
+    """Return the effective base URL, taking the document's `<base href>` into account.
+    Args:
+        doc_root: Root of the parsed document.
+        base_url: Caller-supplied base URL, or None.
+    Returns:
+        The href of the first `<base>` element that has one, joined against
+        `base_url` when `try_urljoin` can do so; otherwise `base_url` unchanged.
+    """
+    for el in iter_descendant_elements(doc_root):
+        if el.name.lower() != "base":
+            continue
+        href = get_attr(el, "href")
+        if href is None:
+            # The HTML Standard only considers `<base>` elements that carry an `href`
+            # attribute, so e.g. `<base target="_blank">` must not end the search.
+            continue
+        if href:
+            # Absolute replaces; relative is joined against provided URL if any.
+            base_url = try_urljoin(base_url, href) or href
+        # The first `<base>` with an href decides; an empty href leaves `base_url` as given.
+        break
+    return base_url
+def _split_tokens(value: str | None) -> list[str]:
+    """Split a whitespace-separated attribute value into its tokens.
+    A missing or empty value yields an empty list.
+    """
+    # `str.split()` without a separator never produces empty strings.
+    return value.split() if value else []
+# Link attributes that `_parse_rels` copies into a `rel-urls` entry when they are
+# present and non-empty.
+_REL_ATTRS: tuple[Literal["media"], Literal["hreflang"], Literal["type"], Literal["title"]] = (
+    "media",
+    "hreflang",
+    "type",
+    "title",
+)
+def _parse_rels(
+    doc_root: HasDom, *, base_url: str | None
+) -> tuple[dict[str, list[str]], dict[str, RelUrl]]:
+    """Build the `rels` and `rel-urls` maps from `<a>`, `<area>` and `<link>` elements.
+    Only elements with an `href` and at least one `rel` token contribute.
+    Args:
+        doc_root: Root of the parsed document.
+        base_url: Base URL used to resolve each `href`, or None.
+    Returns:
+        A pair `(rels, rel_urls)`: `rels` maps each rel token to the distinct URLs
+        carrying it, and `rel_urls` maps each URL to its rel tokens, link text and
+        the non-empty attributes named in `_REL_ATTRS`. For text and attributes the
+        first element seen for a URL wins.
+    """
+    urls_by_rel: dict[str, list[str]] = {}
+    info_by_url: dict[str, RelUrl] = {}
+    for link in iter_descendant_elements(doc_root):
+        if link.name.lower() not in {"a", "area", "link"}:
+            continue
+        tokens = _split_tokens(get_attr(link, "rel"))
+        raw_href = get_attr(link, "href")
+        if raw_href is None or not tokens:
+            continue
+        target = try_urljoin(base_url, raw_href) or raw_href
+        info = info_by_url.setdefault(target, {"rels": []})
+        known_rels = info["rels"]
+        for token in tokens:
+            urls = urls_by_rel.setdefault(token, [])
+            if target not in urls:
+                urls.append(target)
+            if token not in known_rels:
+                known_rels.append(token)
+        if "text" not in info:
+            info["text"] = text_content(link)
+        for attr in _REL_ATTRS:
+            attr_value = get_attr(link, attr)
+            if attr_value:
+                info.setdefault(attr, attr_value)
+    return urls_by_rel, info_by_url
+def _has_ancestor_microformat_root(el: Element) -> bool:
+    """Report whether any ancestor of `el` is a microformat root."""
+    for ancestor in ancestor_elements(el):
+        if is_microformat_root(ancestor):
+            return True
+    return False
+def _top_level_roots(doc_root: HasDom) -> list[Element]:
+    """List, in traversal order, the microformat roots that have no microformat-root ancestor."""
+    return [
+        candidate
+        for candidate in iter_descendant_elements(doc_root)
+        if is_microformat_root(candidate) and not _has_ancestor_microformat_root(candidate)
+    ]
+def _is_property_for_parent(el: Element) -> bool:
+    """Report whether `el` carries at least one property class."""
+    found_classes = property_classes(el)
+    return len(found_classes) > 0
+def _parse_item(
+    el: Element,
+    ctx: _ParseContext,
+    *,
+    parent_lang: str | None,
+    ignore_root_property_classes: frozenset[str] = frozenset(),
+) -> Mf2Item:
+    """Parse the microformat rooted at `el` into an mf2 item.
+    Walks the subtree below `el`, collecting explicit properties and recursing into
+    nested microformat roots, then adds implied `name`/`photo`/`url` values when
+    the conditions checked at the end of the function hold.
+    Args:
+        el: Root element of the microformat.
+        ctx: Document-wide base URL and language.
+        parent_lang: Language used when `el` has no (non-empty) `lang` attribute.
+        ignore_root_property_classes: Property classes on `el` itself to skip; the
+            nested-root branch of `walk` passes the classes the parent item consumes.
+    Returns:
+        An item with `type` and `properties`, plus `id` and `children` when present.
+    """
+    types = root_types(el)
+    item: Mf2Item = {"type": types, "properties": {}}
+    element_id = get_attr(el, "id")
+    if element_id:
+        item["id"] = element_id
+    root_lang = get_attr(el, "lang") or parent_lang
+    children: list[Mf2Item] = []
+    props: dict[str, list[PropertyValue]] = defaultdict(list)
+    # Most recent date seen by a `dt-*` property; handed to later `parse_dt` calls.
+    default_date: str | None = None
+    # Flags recording what the walk encountered; they gate the implied properties below.
+    has_p = False
+    has_u = False
+    has_e = False
+    has_nested_microformat = False
+    def add_prop(name: str, value: PropertyValue) -> None:
+        """Append `value` to the list stored under property `name`."""
+        props[name].append(value)
+    def simple_value(prop_class: str, target: Element) -> PropertyValue:
+        """Parse `target` as a plain value for `prop_class`, dispatching on the class prefix."""
+        nonlocal default_date
+        if prop_class.startswith("p-"):
+            return parse_p(target, base_url=ctx.base_url)  # pragma: no cover
+        if prop_class.startswith("u-"):
+            return parse_u(target, base_url=ctx.base_url)  # pragma: no cover
+        if prop_class.startswith("dt-"):
+            dt = parse_dt(target, default_date=default_date)
+            if dt.date:
+                default_date = dt.date
+            return dt.value
+        if prop_class.startswith("e-"):
+            return parse_e(
+                target, base_url=ctx.base_url, root_lang=root_lang, document_lang=ctx.document_lang
+            )
+        return ""  # pragma: no cover
+    def embedded_value(prop_class: str, embedded_item: Mf2Item, target: Element) -> PropertyValue:
+        """Choose the `value` to store when the nested item is used as property `prop_class`."""
+        props_obj = embedded_item.get("properties")
+        if not isinstance(props_obj, dict):
+            return simple_value(prop_class, target)  # pragma: no cover
+        def descendant_name_class_info(root: Element) -> tuple[bool, bool]:
+            """Return (has_p_name, has_any_typed_name)."""
+            has_p_name = False
+            has_any_typed_name = False
+            name_tokens = {"p-name", "u-name", "dt-name", "e-name"}
+            # Iterative walk over the descendants of `root` (`root` itself is not inspected).
+            stack: list[Element] = list(iter_child_elements(root))
+            while stack:
+                cur = stack.pop()
+                pcs = property_classes(cur)
+                if pcs:
+                    for pc in pcs:
+                        if pc == "p-name":
+                            has_p_name = True
+                            has_any_typed_name = True
+                            break
+                        if pc in name_tokens:
+                            has_any_typed_name = True
+                    if has_p_name and has_any_typed_name:
+                        return True, True
+                # Do not descend into nested microformat roots.
+                if is_microformat_root(cur):
+                    continue
+                stack.extend(iter_child_elements(cur))
+            return has_p_name, has_any_typed_name
+        if prop_class.startswith("u-"):
+            found_url = False
+            # Prefer the nested item's first `url`, then its first `uid`, when absolute http(s).
+            for key in ("url", "uid"):
+                vals = props_obj.get(key)
+                if not isinstance(vals, list) or not vals:
+                    continue
+                found_url = True
+                candidate = vals[0]
+                url = candidate if isinstance(candidate, str) else str(candidate.get("value", ""))  # type: ignore[union-attr]
+                if url.startswith(("http://", "https://")):
+                    return candidate
+            if found_url:
+                # If we have a URL property but it's not an absolute URL, fall back to plain text.
+                return parse_p(target, base_url=ctx.base_url)
+            # Otherwise, parse the `u-*` value from the element itself (URL join behavior).
+            return parse_u(target, base_url=ctx.base_url)
+        if prop_class.startswith("p-"):
+            vals = props_obj.get("name")
+            if (
+                isinstance(vals, list)
+                and vals
+                and isinstance(vals[0], str)
+                and not vals[0].startswith(("http://", "https://"))
+            ):
+                has_p_name, has_any_typed_name = descendant_name_class_info(target)
+                # Favor the embedded `p-name` value; otherwise only use implied name.
+                if has_p_name or not has_any_typed_name:
+                    return vals[0]
+            return parse_p(target, base_url=ctx.base_url)
+        # `dt-*` and `e-*` classes take the plain parsed value of the element.
+        return simple_value(prop_class, target)
+    def handle_property_class(prop_class: str, target: Element) -> None:
+        """Parse `target` for one property class and record the value under the unprefixed name."""
+        nonlocal default_date
+        nonlocal has_p, has_u, has_e
+        if prop_class.startswith("p-"):
+            has_p = True
+            add_prop(prop_class[2:], parse_p(target, base_url=ctx.base_url))
+        elif prop_class.startswith("u-"):
+            has_u = True
+            add_prop(prop_class[2:], parse_u(target, base_url=ctx.base_url))
+        elif prop_class.startswith("dt-"):
+            dt = parse_dt(target, default_date=default_date)
+            add_prop(prop_class[3:], dt.value)
+            if dt.date:
+                default_date = dt.date
+        elif prop_class.startswith("e-"):
+            has_e = True
+            add_prop(
+                prop_class[2:],
+                parse_e(
+                    target,
+                    base_url=ctx.base_url,
+                    root_lang=root_lang,
+                    document_lang=ctx.document_lang,
+                ),
+            )
+        else:  # pragma: no cover
+            return
+    def walk(node: Element, *, is_root: bool) -> None:
+        """Visit `node` and its subtree, stopping at nested microformat roots."""
+        nonlocal has_e, has_nested_microformat, has_p, has_u
+        if not is_root and is_microformat_root(node):
+            has_nested_microformat = True
+            # The nested item must not re-read the property classes this item consumes.
+            nested = _parse_item(
+                node,
+                ctx,
+                parent_lang=root_lang,
+                ignore_root_property_classes=frozenset(property_classes(node))
+                if _is_property_for_parent(node)
+                else frozenset(),
+            )
+            if _is_property_for_parent(node):
+                for pc in property_classes(node):
+                    if pc.startswith("p-"):
+                        has_p = True
+                    elif pc.startswith("u-"):
+                        has_u = True
+                    elif pc.startswith("e-"):
+                        has_e = True
+                    name = pc.split("-", 1)[1]
+                    # Shallow copy so each property gets its own `value` (and `html`/`lang`) keys.
+                    embedded = cast(Mf2Item, dict(nested))
+                    val = embedded_value(pc, nested, node)
+                    if pc.startswith("e-") and isinstance(val, dict):
+                        e_val = cast(EValue, val)
+                        embedded["value"] = e_val["value"]
+                        embedded["html"] = e_val["html"]
+                        if "lang" in e_val:
+                            embedded["lang"] = e_val["lang"]
+                    else:
+                        embedded["value"] = val
+                    add_prop(name, embedded)
+            else:
+                # A nested root without property classes becomes a child of this item.
+                children.append(nested)
+            return
+        pcs = property_classes(node)
+        if is_root and ignore_root_property_classes:
+            pcs = [pc for pc in pcs if pc not in ignore_root_property_classes]
+        for pc in pcs:
+            handle_property_class(pc, node)
+        for child in iter_child_elements(node):
+            walk(child, is_root=False)
+    walk(el, is_root=True)
+    # Apply implied properties if missing.
+    if "name" not in props and not has_p and not has_e and not has_nested_microformat:
+        props["name"].append(implied_name(el, ctx.base_url))
+    if "photo" not in props and not has_u and not has_nested_microformat:
+        photo = implied_photo(el, ctx.base_url)
+        if photo is not None:
+            props["photo"].append(photo)
+    if "url" not in props and not has_u and not has_nested_microformat:
+        url = implied_url(el, ctx.base_url)
+        if url is not None:
+            props["url"].append(url)
+    # Store a plain dict rather than the defaultdict used while collecting.
+    item["properties"] = dict(props)
+    if children:
+        item["children"] = children
+    return item
+def parse(
+    html: HtmlInput | None,
+    *,
+    base_url: str | None = None,
+    url: str | None = None,
+) -> Mf2Document:
+    """Parse Microformats2 data out of HTML markup or an already-built JustHTML tree.
+    Args:
+        html: HTML markup, a JustHTML instance, or a JustHTML root node.
+        base_url: Base URL for resolving relative URLs. Prefer this parameter.
+        url: Deprecated alias for `base_url`.
+    Returns:
+        A dict containing `items`, `rels`, and `rel-urls`.
+    Raises:
+        ValueError: If `base_url` and `url` are both given and differ.
+    """
+    both_given = base_url is not None and url is not None
+    if both_given and base_url != url:
+        msg = "Provide only one of `base_url` or `url`."
+        raise ValueError(msg)
+    effective_base = url if base_url is None else base_url
+    if isinstance(html, JustHTML):
+        doc_root = cast(HasDom, html.root)
+    elif html is None or isinstance(html, (str, bytes, bytearray, memoryview)):
+        # Raw markup (or nothing at all) is handed to JustHTML for parsing.
+        doc_root = cast(HasDom, JustHTML(html).root)
+    else:
+        # Anything else is treated as a DOM root node.
+        doc_root = cast(HasDom, html)
+    effective_base = _discover_base_url(doc_root, base_url=effective_base)
+    document_lang = _first_lang(doc_root)
+    ctx = _ParseContext(base_url=effective_base, document_lang=document_lang)
+    rels, rel_urls = _parse_rels(doc_root, base_url=effective_base)
+    items = []
+    for root in _top_level_roots(doc_root):
+        items.append(_parse_item(root, ctx, parent_lang=document_lang))
+    return {"items": items, "rels": rels, "rel-urls": rel_urls}
+async def parse_async(
+    html: HtmlInput | None,
+    *,
+    base_url: str | None = None,
+    url: str | None = None,
+) -> Mf2Document:
+    """Run `parse` in a worker thread via `asyncio.to_thread` and return its result.
+    Takes the same arguments as `parse`.
+    """
+    pending = asyncio.to_thread(parse, html, base_url=base_url, url=url)
+    return await pending

mf2dom/properties.py ADDED Viewed

@@ -0,0 +1,257 @@
+"""Property parsing for mf2 (`p-`, `u-`, `dt-`, `e-`) and microformat detection."""
+from __future__ import annotations
+import re
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any
+from justhtml.constants import VOID_ELEMENTS
+from .classes import has_root_class
+from .classes import property_classes as mf2_property_classes
+from .classes import root_types as mf2_root_types
+from .dom import (
+    Element,
+    get_attr,
+    get_classes,
+    is_element,
+    iter_preorder_elements,
+)
+from .text import text_content
+from .urls import parse_srcset, try_urljoin
+from .vcp import _DATE_RE, _DATETIME_RE_COMPILED, _TIME_RE, normalize_datetime
+from .vcp import datetime as vcp_datetime
+from .vcp import text as vcp_text
+if TYPE_CHECKING:  # pragma: no cover
+    from .types import EValue, UrlObject, UrlValue
+@dataclass(slots=True)
+class DtResult:
+    """Result of parsing a `dt-*` property (see `parse_dt`)."""
+    # The property value to emit.
+    value: str
+    # Date part recognised in the value, or None; callers reuse it as `default_date`.
+    date: str | None
+def parse_p(el: Element, *, base_url: str | None) -> str:
+    """Parse a `p-*` property value from `el`.
+    Sources are tried in order: the value-class-pattern text, then a tag-specific
+    attribute (`title` on abbr/link, `value` on data/input, `alt` on img/area),
+    and finally the element's stripped text content.
+    """
+    vcp_value = vcp_text(el)
+    if vcp_value is not None:
+        return vcp_value
+    tag = el.name.lower()
+    # (attribute, tags on which that attribute supplies the value); the tag sets are disjoint.
+    attribute_sources = (
+        ("title", {"abbr", "link"}),
+        ("value", {"data", "input"}),
+        ("alt", {"img", "area"}),
+    )
+    for attr_name, tags in attribute_sources:
+        if tag not in tags:
+            continue
+        attr_value = get_attr(el, attr_name)
+        if attr_value is not None:
+            return attr_value
+    fallback = text_content(el, replace_img=True, img_to_src=False, base_url=base_url)
+    return fallback.strip()
+def _img_value(img: Element, base_url: str | None) -> UrlValue | None:
+    """Return the URL value of an image element, or None when it has no `src`.
+    The result is the resolved `src` string, or an object with `value` plus
+    `alt` and/or `srcset` when the element has an `alt` attribute or a
+    non-empty `srcset`.
+    """
+    src = get_attr(img, "src")
+    if src is None:
+        return None
+    resolved = try_urljoin(base_url, src) or src
+    alt = get_attr(img, "alt")
+    srcset = get_attr(img, "srcset")
+    if alt is None and not srcset:
+        return resolved
+    described: UrlObject = {"value": resolved}
+    if alt is not None:
+        described["alt"] = alt
+    if srcset:
+        described["srcset"] = parse_srcset(srcset, base_url)
+    return described
+def parse_u(el: Element, *, base_url: str | None) -> UrlValue:
+    """Parse a `u-*` property value from `el`.
+    Sources are tried in order: `href` on a/area/link, the image value of an
+    img, `src` on audio/video/source/iframe, `poster` on video, `data` on
+    object, the value-class-pattern text, `title` on abbr, `value` on
+    data/input, and finally the stripped text content. Every string result is
+    passed through `try_urljoin`, keeping the raw string when that yields nothing.
+    """
+    def resolve(raw: str) -> str:
+        return try_urljoin(base_url, raw) or raw
+    def first_attr(sources: tuple[tuple[str, set[str]], ...]) -> str | None:
+        # Return the first listed attribute that applies to this tag and is present.
+        for attr_name, tags in sources:
+            if tag in tags:
+                found = get_attr(el, attr_name)
+                if found is not None:
+                    return found
+        return None
+    tag = el.name.lower()
+    link_target = first_attr((("href", {"a", "area", "link"}),))
+    if link_target is not None:
+        return resolve(link_target)
+    if tag == "img":
+        image = _img_value(el, base_url)
+        if image is not None:
+            return image
+    media_target = first_attr(
+        (
+            ("src", {"audio", "video", "source", "iframe"}),
+            ("poster", {"video"}),
+            ("data", {"object"}),
+        )
+    )
+    if media_target is not None:
+        return resolve(media_target)
+    vcp_value = vcp_text(el)
+    if vcp_value is not None:
+        return resolve(vcp_value)
+    attr_target = first_attr((("title", {"abbr"}), ("value", {"data", "input"})))
+    if attr_target is not None:
+        return resolve(attr_target)
+    return resolve(text_content(el).strip())
+# `_TIME_RE` anchored at the end; `parse_dt` applies it with `.match()`, so the whole
+# stripped value has to match.
+_TIME_ONLY_RE = re.compile(_TIME_RE + "$")
+# Local alias for the precompiled datetime pattern imported from `vcp`.
+_DATETIME_RE = _DATETIME_RE_COMPILED
+def parse_dt(el: Element, *, default_date: str | None) -> DtResult:
+    """Parse a `dt-*` property value from `el`.
+    Args:
+        el: Element carrying the property.
+        default_date: Date to combine with a time-only value, or None.
+    Returns:
+        A `DtResult` holding the value and the date recognised in it, if any.
+    """
+    vcp = vcp_datetime(el, default_date)
+    if vcp is not None:
+        return DtResult(value=vcp[0], date=vcp[1])
+    # Attribute that supplies the value on each supported tag; other tags use their text.
+    attr_by_tag = {
+        "time": "datetime",
+        "ins": "datetime",
+        "del": "datetime",
+        "abbr": "title",
+        "data": "value",
+        "input": "value",
+    }
+    attr_name = attr_by_tag.get(el.name.lower())
+    attr_value = get_attr(el, attr_name) if attr_name is not None else None
+    from_attr = attr_value is not None
+    raw = attr_value if attr_value is not None else text_content(el)
+    stripped = raw.strip()
+    if default_date and _TIME_ONLY_RE.match(stripped):
+        # A bare time borrows the date carried over from an earlier property.
+        combined = f"{default_date} {stripped}"
+        combined_match = _DATETIME_RE.match(combined)
+        return DtResult(
+            value=normalize_datetime(combined, match=combined_match), date=default_date
+        )
+    full_match = _DATETIME_RE.match(stripped)
+    if full_match:
+        normalized = normalize_datetime(stripped, match=full_match)
+        # If normalization didn't change (no AM/PM), preserve original attribute spacing.
+        keep_raw = from_attr and normalized == stripped
+        return DtResult(value=raw if keep_raw else normalized, date=full_match.group("date"))
+    date_only = re.match(_DATE_RE + "$", stripped)
+    return DtResult(
+        value=raw if from_attr else stripped,
+        date=date_only.group(0) if date_only else None,
+    )
+# Attributes whose values `parse_e` resolves against the base URL before serializing.
+_URL_ATTRS_IN_E = ("href", "src", "cite", "data", "poster")
+def _inner_html(el: Element) -> str:
+    """Serialize the children of `el` (not `el` itself) into one HTML string."""
+    pieces = [_serialize_node(child) for child in (el.children or [])]
+    return "".join(pieces)
+def _escape_text(text: str) -> str:
+    """Escape `&`, `<` and `>` so `text` can be emitted as HTML character data."""
+    replacements = str.maketrans({"&": "&amp;", "<": "&lt;", ">": "&gt;"})
+    return text.translate(replacements)
+def _escape_attr_value(value: str) -> str:
+    """Escape `&` and `"` so `value` can sit inside a double-quoted attribute."""
+    replacements = str.maketrans({"&": "&amp;", '"': "&quot;"})
+    return value.translate(replacements)
+def _serialize_element(el: Element) -> str:
+    """Serialize `el`, its attributes and its subtree to HTML.
+    An attribute whose value is None is written as a bare name; elements listed
+    in `VOID_ELEMENTS` get neither children nor an end tag.
+    """
+    tag_name = el.name
+    rendered_attrs = "".join(
+        f" {key}" if value is None else f' {key}="{_escape_attr_value(str(value))}"'
+        for key, value in (el.attrs or {}).items()
+    )
+    open_tag = f"<{tag_name}{rendered_attrs}>"
+    if tag_name.lower() in VOID_ELEMENTS:
+        return open_tag
+    body = "".join(map(_serialize_node, el.children or []))
+    return f"{open_tag}{body}</{tag_name}>"
+def _serialize_node(node: Any) -> str:
+    """Serialize one DOM node to HTML, dispatching on its `name`.
+    Text is escaped, comments are wrapped in `<!--`/`-->`, doctype/document/fragment
+    nodes contribute only their children, `template` nodes and unrecognised
+    nodes produce an empty string, and elements go through `_serialize_element`.
+    """
+    kind = getattr(node, "name", "")
+    if kind == "#text":
+        text = getattr(node, "data", None)
+        return "" if text is None else _escape_text(str(text))
+    if kind == "#comment":
+        comment = getattr(node, "data", "") or ""
+        return f"<!--{comment}-->"
+    if kind in {"!doctype", "#document", "#document-fragment"}:
+        kids = getattr(node, "children", None) or []
+        return "".join(map(_serialize_node, kids))
+    if kind.lower() != "template" and is_element(node):
+        return _serialize_element(node)
+    return ""
+def parse_e(
+    el: Element,
+    *,
+    base_url: str | None,
+    root_lang: str | None,
+    document_lang: str | None,
+) -> EValue:
+    """Parse an `e-*` property into its `value`/`html` (and optional `lang`) form.
+    URL-bearing attributes (see `_URL_ATTRS_IN_E`) are rewritten on a clone of
+    `el`, so the element passed in is not modified.
+    Args:
+        el: Element carrying the property.
+        base_url: Base URL used to resolve URL attributes and image text, or None.
+        root_lang: Language of the enclosing microformat, or None.
+        document_lang: Language of the document, or None.
+    Returns:
+        A dict with the stripped text `value`, the stripped inner `html`, and
+        `lang` when the element, the root or the document provides one.
+    """
+    clone = el.clone_node(deep=True)  # type: ignore[attr-defined]
+    for tag in iter_preorder_elements(clone):
+        for attr in _URL_ATTRS_IN_E:
+            val = get_attr(tag, attr)
+            if val is not None:
+                # Fall back to the original value when the join yields nothing, as the
+                # other `try_urljoin` call sites do; a None here would be serialized
+                # as a bare attribute and the URL would be lost.
+                tag.attrs[attr] = try_urljoin(base_url, val) or val
+    out: EValue = {
+        "value": text_content(el, replace_img=True, base_url=base_url).strip(),
+        "html": "",
+    }
+    lang = get_attr(el, "lang") or root_lang or document_lang
+    if lang:
+        out["lang"] = lang
+    out["html"] = _inner_html(clone).strip()
+    return out
+def is_microformat_root(el: Element) -> bool:
+    """Return the result of `has_root_class` for the element's class list."""
+    class_names = get_classes(el)
+    return has_root_class(class_names)
+def root_types(el: Element) -> list[str]:
+    """Return the root types that `classes.root_types` derives from the element's classes."""
+    class_names = get_classes(el)
+    return mf2_root_types(class_names)
+def property_classes(el: Element) -> list[str]:
+    """Return the property classes that `classes.property_classes` finds on the element."""
+    class_names = get_classes(el)
+    return mf2_property_classes(class_names)