mf2dom 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mf2dom/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ """mf2dom: Microformats2 parsing + rendering using JustHTML."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .parser import parse, parse_async
6
+ from .renderer import render
7
+
8
# Names re-exported as the package's public API.
__all__ = [
    "parse",
    "parse_async",
    "render",
]
mf2dom/classes.py ADDED
@@ -0,0 +1,87 @@
1
+ """Microformats2 class token validation.
2
+
3
+ The official test suite (`microformats-v2-unit/names-*`) defines additional
4
+ constraints beyond "starts with the right prefix". This module encodes those
5
+ rules and provides utilities for extracting valid root and property classes.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import re
11
+
12
# Shapes accepted for a hyphen-separated name segment.
#
# The patterns are anchored with ``\Z`` rather than ``$`` because ``$`` also
# matches just before a trailing newline, and compiled with ``re.ASCII`` so
# that ``\d`` only matches 0-9. Both keep the patterns strict even when they
# are applied to a string that has not been pre-filtered.

# Digits followed by exactly one lowercase letter, e.g. "7t".
_DIGIT_LETTER_RE = re.compile(r"^\d+[a-z]\Z", re.ASCII)
# Digits, then letters, then digits, e.g. "8t8" or "8to8".
_DIGIT_LETTERS_DIGITS_RE = re.compile(r"^\d+[a-z]+\d+\Z", re.ASCII)
# Letters, then digits, then optional trailing letters, e.g. "t11" or "t11t".
_LETTERS_DIGITS_OPTLETTERS_RE = re.compile(r"^([a-z]+)(\d+)([a-z]*)\Z", re.ASCII)

# Every character a name may contain.
_ALLOWED_NAME_CHARS = frozenset("abcdefghijklmnopqrstuvwxyz0123456789-")
# Minimum length of a digit run that is followed by letters ("t11t" vs "car1d").
_MIN_DIGITS_FOR_TRAILING_LETTERS = 2
18
+
19
+
20
def is_valid_mf2_name(name: str) -> bool:
    """Check the *name* part of an mf2 class token (the text after the prefix).

    The rules follow the official microformats2 parsing test suite:
    - only ASCII lowercase letters, digits, and hyphens are allowed
    - no leading or trailing hyphen, and no empty segment ('--')
    - a name made only of digits is invalid ('p-19')
    - an all-digit segment is allowed only in first position ('p-6-test')
    - a segment that starts with a digit must be either
      digits + one letter ('7t') or digits + letters + digits ('8t8', '8to8')
    - a segment that starts with a letter may contain digits; when letters
      follow the digits, the digit run must be at least 2 long
      ('t11t' valid, 'car1d' invalid)
    """
    # Empty names and misplaced hyphens are rejected before any splitting.
    if not name or name.startswith("-") or name.endswith("-") or "--" in name:
        return False
    if not _ALLOWED_NAME_CHARS.issuperset(name):
        return False

    segments = name.split("-")
    # A lone numeric segment would pass the per-segment check, so reject it here.
    if len(segments) == 1 and segments[0].isdigit():
        return False

    for position, segment in enumerate(segments):
        if not _is_valid_name_part(segment, is_first=(position == 0)):
            return False
    return True
46
+
47
+
48
def _is_valid_name_part(part: str, *, is_first: bool) -> bool:
    """Validate one hyphen-separated segment of an mf2 name.

    Args:
        part: The segment text (no hyphens).
        is_first: Whether this is the first segment of the name; an all-digit
            segment is accepted only in that position.

    Returns:
        True if the segment has one of the accepted shapes, else False.
        An empty segment is never valid.
    """
    if not part:
        # Guard the part[0] lookup below; an empty segment is invalid anyway.
        return False
    if part.isdigit():
        return is_first
    if part[0].isdigit():
        # Digit-led segments: "7t" or "8to8" style only.
        return bool(_DIGIT_LETTER_RE.match(part) or _DIGIT_LETTERS_DIGITS_RE.match(part))
    if part.isalpha():
        return True

    match = _LETTERS_DIGITS_OPTLETTERS_RE.match(part)
    if match is None:
        return False

    _letters, digits, trailing = match.groups()
    # Letters after the digits require a digit run of the minimum length.
    return not trailing or len(digits) >= _MIN_DIGITS_FOR_TRAILING_LETTERS
64
+
65
+
66
def is_valid_root_class(token: str) -> bool:
    """Return True when `token` is `h-` followed by a valid mf2 name."""
    if not token.startswith("h-"):
        return False
    return is_valid_mf2_name(token[2:])
68
+
69
+
70
def is_valid_property_class(token: str) -> bool:
    """Return True when `token` is `p-`, `u-`, `e-` or `dt-` plus a valid mf2 name."""
    # The prefixes cannot overlap, so the order of the checks does not matter.
    for prefix in ("p-", "u-", "e-", "dt-"):
        if token.startswith(prefix):
            return is_valid_mf2_name(token[len(prefix):])
    return False
76
+
77
+
78
def root_types(classes: list[str]) -> list[str]:
    """Return the distinct valid root classes found in `classes`, sorted."""
    unique_roots = set(filter(is_valid_root_class, classes))
    return sorted(unique_roots)
80
+
81
+
82
def property_classes(classes: list[str]) -> list[str]:
    """Return the valid property classes from `classes`, keeping order and duplicates."""
    return list(filter(is_valid_property_class, classes))
84
+
85
+
86
def has_root_class(classes: list[str]) -> bool:
    """Return True if at least one token in `classes` is a valid root class."""
    for token in classes:
        if is_valid_root_class(token):
            return True
    return False
mf2dom/dom.py ADDED
@@ -0,0 +1,133 @@
1
+ """DOM helpers for JustHTML nodes.
2
+
3
+ JustHTML exposes a lightweight DOM-like tree. This module provides:
4
+ - Safe element detection
5
+ - Deterministic traversal helpers
6
+ - HTML `class` parsing that follows the HTML spec (ASCII whitespace only)
7
+
8
+ Important: `class` tokens are returned in document order (a list), because some
9
+ microformats parsing rules are order-sensitive.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import re
15
+ from dataclasses import dataclass
16
+ from typing import TYPE_CHECKING, Any, Protocol
17
+
18
+ if TYPE_CHECKING: # pragma: no cover
19
+ from collections.abc import Iterable, Iterator
20
+
21
+
22
class HasDom(Protocol):
    """Structural type for a tree node: a name, a parent link, and children."""

    # Node name as exposed by the tree.
    name: str
    # Parent node, or None when there is none.
    parent: Any | None

    @property
    def children(self) -> list[Any]:  # pragma: no cover
        """Child nodes of this node."""
        ...
28
+
29
+
30
class Element(HasDom, Protocol):
    """Structural type for an element node: a `HasDom` node with attributes."""

    # Attribute name -> value; the value may be None.
    attrs: dict[str, str | None]

    def to_html(
        self,
        *,
        pretty: bool = True,
        indent_size: int = 2,
        indent: int = 0,
    ) -> str:  # pragma: no cover
        """Serialize this element to an HTML string."""
        ...
36
+
37
+
38
def is_element(node: Any) -> bool:
    """Return True if `node` looks like an element.

    A node counts as an element when its `name` is non-empty, does not start
    with `#` or `!`, and its `attrs` attribute is a dict.
    """
    tag = str(getattr(node, "name", ""))
    looks_like_tag = bool(tag) and not tag.startswith(("#", "!"))
    return looks_like_tag and isinstance(getattr(node, "attrs", None), dict)
43
+
44
+
45
def iter_child_nodes(node: HasDom) -> Iterator[Any]:
    """Yield the direct children of `node`, whatever their node type.

    Yields nothing when `node` has no `children` attribute or it is empty.
    """
    kids = getattr(node, "children", None)
    if not kids:
        return
    yield from kids
50
+
51
+
52
def iter_child_elements(node: HasDom) -> Iterator[Element]:
    """Yield the direct children of `node` that are elements other than `<template>`."""
    for kid in iter_child_nodes(node):
        if not is_element(kid):
            continue
        if getattr(kid, "name", "").lower() == "template":
            continue
        yield kid
57
+
58
+
59
def iter_descendants(node: HasDom) -> Iterator[Any]:
    """Yield every descendant of `node` depth-first, in document order.

    `node` itself is not yielded.
    """
    # Explicit stack; children are pushed reversed so the first child pops first.
    pending: list[Any] = list(iter_child_nodes(node))
    pending.reverse()
    while pending:
        current = pending.pop()
        yield current
        if hasattr(current, "children"):
            pending.extend(list(iter_child_nodes(current))[::-1])
67
+
68
+
69
def iter_descendant_elements(node: HasDom) -> Iterator[Element]:
    """Yield descendant elements of `node` in document order, minus `<template>`.

    Only the `<template>` element itself is filtered out; any nodes reachable
    through its `children` are still visited by the underlying traversal.
    """
    for current in iter_descendants(node):
        if not is_element(current):
            continue
        if getattr(current, "name", "").lower() == "template":
            continue
        yield current
74
+
75
+
76
def iter_preorder_elements(root: Element) -> Iterator[Element]:
    """Yield `root` first, then each of its descendant elements in pre-order.

    `root` is yielded unconditionally; descendants are filtered as in
    `iter_descendant_elements`.
    """
    yield root
    descendants = iter_descendant_elements(root)
    yield from descendants
80
+
81
+
82
def get_attr(el: Element, name: str) -> str | None:
    """Look up attribute `name` on `el`; return None when it is absent."""
    attributes = el.attrs
    return attributes.get(name)
85
+
86
+
87
def set_attr(el: Element, name: str, value: str | None) -> None:
    """Store `value` under attribute `name` on `el` (None for boolean attributes)."""
    attributes = el.attrs
    attributes[name] = value
90
+
91
+
92
def get_classes(el: Element) -> list[str]:
    """Split the `class` attribute of `el` into tokens, in document order.

    HTML splits class lists on ASCII whitespace only (space, tab, LF, FF, CR),
    so any other whitespace character stays inside its token. Returns an empty
    list when the attribute is missing, empty, or all whitespace.
    """
    raw = get_attr(el, "class")
    if not raw:
        return []
    # Leading or trailing separators only produce empty strings, which the
    # filter drops. `str.split()` is avoided as it also splits on non-ASCII
    # whitespace.
    return [token for token in _ASCII_WHITESPACE_RE.split(raw) if token]
106
+
107
+
108
# One or more ASCII whitespace characters (space, tab, LF, FF, CR).
_ASCII_WHITESPACE_RE = re.compile(r"[ \t\n\f\r]+")
109
+
110
+
111
def has_any_class(el: Element, names: Iterable[str]) -> bool:
    """Return True if `el` carries at least one of the class tokens in `names`.

    Args:
        el: Element whose `class` attribute is inspected.
        names: Class tokens to look for. A bare string is treated as a single
            name rather than iterated character by character.

    Returns:
        True when any requested name is among the element's class tokens.
    """
    if isinstance(names, str):
        # A str is itself an iterable of one-character strings; wrap it so the
        # whole string is compared.
        names = (names,)
    present = set(get_classes(el))
    return any(name in present for name in names)
114
+
115
+
116
def has_class_prefix(el: Element, prefixes: Iterable[str]) -> bool:
    """Return True if any class token on `el` starts with one of `prefixes`.

    Args:
        el: Element whose `class` attribute is inspected.
        prefixes: Prefixes to test. A bare string is treated as a single
            prefix; without that, `tuple("h-")` would become `("h", "-")` and
            match on single characters.

    Returns:
        True when at least one class token starts with one of the prefixes.
        False when `prefixes` is empty.
    """
    if isinstance(prefixes, str):
        prefix_tuple: tuple[str, ...] = (prefixes,)
    else:
        prefix_tuple = tuple(prefixes)
    # str.startswith accepts a tuple, so each token needs only one call.
    return any(token.startswith(prefix_tuple) for token in get_classes(el))
120
+
121
+
122
def ancestor_elements(el: Element) -> Iterator[Element]:
    """Yield the element ancestors of `el`, nearest first.

    The walk follows `parent` links until one is None or missing; non-element
    ancestors are passed over but do not stop the walk.
    """
    node: Any | None = el.parent
    while node is not None:
        if is_element(node):
            yield node
        node = getattr(node, "parent", None)
129
+
130
+
131
+ @dataclass(frozen=True, slots=True)
132
+ class ValueClassNodes:
133
+ nodes: list[Element]
mf2dom/implied.py ADDED
@@ -0,0 +1,166 @@
1
+ """Implied property parsing (name/photo/url).
2
+
3
+ Implied properties are applied when an mf2 item has no explicit corresponding
4
+ properties, per the mf2 parsing algorithm.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import re
10
+ from typing import TYPE_CHECKING
11
+
12
+ from .classes import has_root_class, is_valid_property_class
13
+ from .dom import Element, get_attr, get_classes, iter_child_elements
14
+ from .text import text_content
15
+ from .urls import parse_srcset, try_urljoin
16
+
17
+ if TYPE_CHECKING: # pragma: no cover
18
+ from .types import UrlObject, UrlValue
19
+
20
+
21
def _is_microformat_root(el: Element) -> bool:
    """Return True if `el` has at least one valid root class token."""
    class_tokens = get_classes(el)
    return has_root_class(class_tokens)
23
+
24
+
25
def _has_property_class(el: Element) -> bool:
    """Return True if `el` has at least one valid property class token."""
    for token in get_classes(el):
        if is_valid_property_class(token):
            return True
    return False
27
+
28
+
29
def _is_implied_candidate(el: Element) -> bool:
    """Return True if `el` is neither a microformat root nor a property element."""
    return not (_is_microformat_root(el) or _has_property_class(el))
31
+
32
+
33
# Any run of whitespace; used to collapse such runs into a single space.
_WHITESPACE_RE = re.compile(r"\s+")
34
+
35
+
36
def implied_name(root: Element, base_url: str | None) -> str:
    """Compute the implied `name` of an item rooted at `root`.

    Sources are tried in this order:
    1. `alt` of `root` itself when it is `<img>`/`<area>`, or `title` when it
       is `<abbr>` (attribute present, even if empty).
    2. A non-empty `alt`/`title` of the only child element, or of the only
       grandchild under an only child, when that element is
       `<img>`/`<area>`/`<abbr>` and no microformat root is involved.
    3. The text content of `root`.

    The result always has whitespace runs collapsed and the ends stripped.
    """

    def collapse(text: str) -> str:
        return _WHITESPACE_RE.sub(" ", text).strip()

    name_tags = {"img", "area", "abbr"}

    root_tag = root.name.lower()
    if root_tag in {"img", "area"}:
        alt = get_attr(root, "alt")
        if alt is not None:
            return collapse(alt)
    if root_tag == "abbr":
        title = get_attr(root, "title")
        if title is not None:
            return collapse(title)

    # Find the single img/area/abbr that may supply the name, if any.
    candidate: Element | None = None
    kids = list(iter_child_elements(root))
    if len(kids) == 1 and not _is_microformat_root(kids[0]):
        child = kids[0]
        if child.name.lower() in name_tags:
            candidate = child
        else:
            grandkids = list(iter_child_elements(child))
            if len(grandkids) == 1:
                grandchild = grandkids[0]
                if grandchild.name.lower() in name_tags and not _is_microformat_root(grandchild):
                    candidate = grandchild

    if candidate is not None:
        candidate_tag = candidate.name.lower()
        if candidate_tag in {"img", "area"}:
            alt = get_attr(candidate, "alt")
            if alt:
                return collapse(alt)
        elif candidate_tag == "abbr":
            title = get_attr(candidate, "title")
            if title:
                return collapse(title)

    return collapse(text_content(root, replace_img=True, img_to_src=False, base_url=base_url))
81
+
82
+
83
def _img_value(img: Element, base_url: str | None) -> UrlValue | None:
    """Build the URL value for an `<img>` element.

    Returns None when there is no `src`. Returns the resolved `src` as a plain
    string when the element has neither an `alt` attribute nor a non-empty
    `srcset`; otherwise returns a dict with `value` plus `alt` and/or `srcset`.
    The raw `src` is used when joining it with `base_url` yields nothing.
    """
    src = get_attr(img, "src")
    if src is None:
        return None

    resolved = try_urljoin(base_url, src) or src
    alt = get_attr(img, "alt")
    srcset = get_attr(img, "srcset")

    if alt is None and not srcset:
        return resolved

    result: UrlObject = {"value": resolved}
    if alt is not None:
        result["alt"] = alt
    if srcset:
        result["srcset"] = parse_srcset(srcset, base_url)
    return result
98
+
99
+
100
def implied_photo(root: Element, base_url: str | None) -> UrlValue | None:
    """Compute the implied `photo` of an item rooted at `root`.

    Uses `root` itself when it is `<img>` or `<object data=...>`. Otherwise
    looks for a lone `<img>` (then a lone `<object>`) among the child
    elements, and failing that among the children of an only child that is
    not a microformat root. Elements that are microformat roots or carry a
    valid `u-*` class are never used. Returns None when nothing qualifies.
    """

    def resolve(url: str) -> str:
        return try_urljoin(base_url, url) or url

    root_tag = root.name.lower()
    if root_tag == "img":
        return _img_value(root, base_url)
    if root_tag == "object":
        data = get_attr(root, "data")
        if data is not None:
            return resolve(data)

    def has_u_property(el: Element) -> bool:
        return any(
            token.startswith("u-") and is_valid_property_class(token) for token in get_classes(el)
        )

    def sole_of_type(children: list[Element], tag: str) -> Element | None:
        # The only `tag` element among `children`, if it is usable.
        matches = [c for c in children if c.name.lower() == tag]
        if len(matches) != 1:
            return None
        only = matches[0]
        if _is_microformat_root(only) or has_u_property(only):
            return None
        return only

    def photo_child(children: list[Element]) -> Element | None:
        # <img> takes precedence over <object>.
        for tag in ("img", "object"):
            found = sole_of_type(children, tag)
            if found is not None:
                return found
        return None

    children = list(iter_child_elements(root))
    candidate = photo_child(children)
    if candidate is None and len(children) == 1 and not _is_microformat_root(children[0]):
        candidate = photo_child(list(iter_child_elements(children[0])))

    if candidate is None:
        return None
    if candidate.name.lower() == "img":
        return _img_value(candidate, base_url)

    data = get_attr(candidate, "data")
    return None if data is None else resolve(data)
135
+
136
+
137
def implied_url(root: Element, base_url: str | None) -> str | None:
    """Compute the implied `url` of an item rooted at `root`.

    Uses the `href` of `root` itself when it is `<a>` or `<area>`. Otherwise
    looks for a lone `<a>` (then a lone `<area>`) among the child elements,
    and failing that among the children of an only child that is not a
    microformat root. Elements that are microformat roots or carry a valid
    `u-*` class are never used. Returns None when nothing qualifies or the
    chosen element has no `href`.
    """

    def resolve(url: str) -> str:
        return try_urljoin(base_url, url) or url

    if root.name.lower() in {"a", "area"}:
        href = get_attr(root, "href")
        if href is not None:
            return resolve(href)

    def has_u_property(el: Element) -> bool:
        return any(
            token.startswith("u-") and is_valid_property_class(token) for token in get_classes(el)
        )

    def sole_of_type(children: list[Element], tag: str) -> Element | None:
        # The only `tag` element among `children`, if it is usable.
        matches = [c for c in children if c.name.lower() == tag]
        if len(matches) != 1:
            return None
        only = matches[0]
        if _is_microformat_root(only) or has_u_property(only):
            return None
        return only

    def url_child(children: list[Element]) -> Element | None:
        # <a> takes precedence over <area>.
        for tag in ("a", "area"):
            found = sole_of_type(children, tag)
            if found is not None:
                return found
        return None

    children = list(iter_child_elements(root))
    candidate = url_child(children)
    if candidate is None and len(children) == 1 and not _is_microformat_root(children[0]):
        candidate = url_child(list(iter_child_elements(children[0])))
    if candidate is None:
        return None

    href = get_attr(candidate, "href")
    return None if href is None else resolve(href)