mf2dom 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mf2dom/__init__.py +8 -0
- mf2dom/classes.py +87 -0
- mf2dom/dom.py +133 -0
- mf2dom/implied.py +166 -0
- mf2dom/parser.py +395 -0
- mf2dom/properties.py +257 -0
- mf2dom/renderer.py +601 -0
- mf2dom/text.py +66 -0
- mf2dom/types.py +57 -0
- mf2dom/urls.py +31 -0
- mf2dom/vcp.py +211 -0
- mf2dom-0.1.9.dist-info/METADATA +94 -0
- mf2dom-0.1.9.dist-info/RECORD +15 -0
- mf2dom-0.1.9.dist-info/WHEEL +4 -0
- mf2dom-0.1.9.dist-info/licenses/LICENSE.md +651 -0
mf2dom/__init__.py
ADDED
mf2dom/classes.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""Microformats2 class token validation.
|
|
2
|
+
|
|
3
|
+
The official test suite (`microformats-v2-unit/names-*`) defines additional
|
|
4
|
+
constraints beyond "starts with the right prefix". This module encodes those
|
|
5
|
+
rules and provides utilities for extracting valid root and property classes.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
|
|
12
|
+
# Segment shapes accepted for a single hyphen-delimited part of an mf2 name.
# Digit-led segment ending in exactly one letter, e.g. "7t".
_DIGIT_LETTER_RE = re.compile(r"^\d+[a-z]$")
# Digit-led segment shaped digits+letters+digits, e.g. "8t8" or "8to8".
_DIGIT_LETTERS_DIGITS_RE = re.compile(r"^\d+[a-z]+\d+$")
# Letter-led segment: letters, one digit run, then optional trailing letters.
_LETTERS_DIGITS_OPTLETTERS_RE = re.compile(r"^([a-z]+)(\d+)([a-z]*)$")

# Every character a name may contain: ASCII lowercase letters, digits, hyphen.
_ALLOWED_NAME_CHARS = frozenset("abcdefghijklmnopqrstuvwxyz" "0123456789" "-")
# Shortest digit run after which a letter-led segment may end in letters.
_MIN_DIGITS_FOR_TRAILING_LETTERS = 2
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def is_valid_mf2_name(name: str) -> bool:
    """Return whether `name` is an acceptable mf2 name (the part after the prefix).

    The rules mirror the official microformats2 parsing test suite:

    - only ASCII lowercase letters, digits and hyphens may appear
    - the name may not start or end with a hyphen, nor contain '--'
    - a name made only of digits is rejected ('p-19')
    - an all-digit segment is accepted only in first position ('p-6-test')
    - a segment that starts with a digit must be digits plus one letter ('7t')
      or digits+letters+digits ('8t8', '8to8')
    - a segment that starts with a letter may contain digits; when letters
      follow the digits, the digit run needs at least 2 digits
      ('t11t' valid, 'car1d' invalid)
    """
    if not name:
        return False
    if name.startswith("-") or name.endswith("-") or "--" in name:
        return False
    if not _ALLOWED_NAME_CHARS.issuperset(name):
        return False

    segments = name.split("-")
    # A lone all-digit segment means the whole name is numeric.
    if len(segments) == 1 and segments[0].isdigit():
        return False

    for position, segment in enumerate(segments):
        if not _is_valid_name_part(segment, is_first=(position == 0)):
            return False
    return True
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _is_valid_name_part(part: str, *, is_first: bool) -> bool:
    """Check one hyphen-delimited segment of an mf2 name.

    Args:
        part: A single segment of the name (the text between hyphens).
        is_first: Whether this segment is the first one in the name.

    Returns:
        True if the segment has one of the accepted shapes, False otherwise.
    """
    # An empty segment is never valid. Guarding here also keeps `part[0]`
    # below from raising IndexError when this helper is called directly.
    if not part:
        return False
    if part.isdigit():
        # All-digit segments are accepted only in the leading position.
        return is_first
    if part[0].isdigit():
        # Digit-led: digits + one letter, or digits + letters + digits.
        return bool(_DIGIT_LETTER_RE.match(part) or _DIGIT_LETTERS_DIGITS_RE.match(part))
    if part.isalpha():
        return True

    match = _LETTERS_DIGITS_OPTLETTERS_RE.match(part)
    if not match:
        return False

    _letters, digits, trailing = match.groups()
    if trailing == "":
        return True
    # Letters after the digit run require a sufficiently long digit run.
    return len(digits) >= _MIN_DIGITS_FOR_TRAILING_LETTERS
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def is_valid_root_class(token: str) -> bool:
    """Return whether `token` is an `h-` class whose remainder is a valid mf2 name."""
    if not token.startswith("h-"):
        return False
    return is_valid_mf2_name(token[2:])
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def is_valid_property_class(token: str) -> bool:
    """Return whether `token` is a `p-`, `u-`, `e-` or `dt-` class with a valid mf2 name."""
    # The prefixes are mutually exclusive, so the first one that matches decides.
    for prefix in ("p-", "u-", "e-", "dt-"):
        if token.startswith(prefix):
            return is_valid_mf2_name(token[len(prefix):])
    return False
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def root_types(classes: list[str]) -> list[str]:
    """Return the distinct valid root (`h-*`) classes from `classes`, sorted."""
    unique_roots = set(filter(is_valid_root_class, classes))
    return sorted(unique_roots)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def property_classes(classes: list[str]) -> list[str]:
    """Return the valid property classes from `classes`, keeping order and duplicates."""
    return list(filter(is_valid_property_class, classes))
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def has_root_class(classes: list[str]) -> bool:
    """Return True as soon as any token in `classes` is a valid root class."""
    for token in classes:
        if is_valid_root_class(token):
            return True
    return False
|
mf2dom/dom.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
"""DOM helpers for JustHTML nodes.
|
|
2
|
+
|
|
3
|
+
JustHTML exposes a lightweight DOM-like tree. This module provides:
|
|
4
|
+
- Safe element detection
|
|
5
|
+
- Deterministic traversal helpers
|
|
6
|
+
- HTML `class` parsing that follows the HTML spec (ASCII whitespace only)
|
|
7
|
+
|
|
8
|
+
Important: `class` tokens are returned in document order (a list), because some
|
|
9
|
+
microformats parsing rules are order-sensitive.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import re
|
|
15
|
+
from dataclasses import dataclass
|
|
16
|
+
from typing import TYPE_CHECKING, Any, Protocol
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING: # pragma: no cover
|
|
19
|
+
from collections.abc import Iterable, Iterator
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class HasDom(Protocol):
    """Structural type for a tree node: a name, a parent link and a child list."""

    name: str
    parent: Any | None

    @property
    def children(self) -> list[Any]: ...  # pragma: no cover
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class Element(HasDom, Protocol):
    """Structural type for an element node: a `HasDom` node with `attrs` and `to_html`."""

    # Attribute name -> value; the value may be None.
    attrs: dict[str, str | None]

    def to_html(
        self,
        *,
        pretty: bool = True,
        indent_size: int = 2,
        indent: int = 0,
    ) -> str: ...  # pragma: no cover
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def is_element(node: Any) -> bool:
    """Return whether `node` looks like an element.

    A node counts as an element when its `name` is non-empty, does not begin
    with `#` or `!`, and its `attrs` attribute is a dict.
    """
    tag = str(getattr(node, "name", ""))
    if tag == "":
        return False
    if tag.startswith(("#", "!")):
        return False
    attributes = getattr(node, "attrs", None)
    return isinstance(attributes, dict)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def iter_child_nodes(node: HasDom) -> Iterator[Any]:
    """Yield each child of `node`, whatever its kind (text, comment, doctype, element).

    Yields nothing when `node` has no `children` attribute or it is empty.
    """
    kids = getattr(node, "children", None)
    if not kids:
        return
    yield from kids
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def iter_child_elements(node: HasDom) -> Iterator[Element]:
    """Yield the element children of `node`, leaving out any `<template>` child."""
    for candidate in iter_child_nodes(node):
        if not is_element(candidate):
            continue
        if getattr(candidate, "name", "").lower() == "template":
            continue
        yield candidate
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def iter_descendants(node: HasDom) -> Iterator[Any]:
    """Yield every descendant of `node` depth-first, in document order."""
    # Children are pushed reversed so that popping visits them first-to-last.
    pending: list[Any] = list(iter_child_nodes(node))
    pending.reverse()
    while pending:
        current = pending.pop()
        yield current
        if hasattr(current, "children"):
            pending.extend(list(iter_child_nodes(current))[::-1])
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def iter_descendant_elements(node: HasDom) -> Iterator[Element]:
    """Yield descendant elements in document order, skipping `<template>` subtrees.

    A `<template>` element is not yielded, and neither is anything nested
    under it: its subtree is never entered. Non-element nodes are not
    yielded, but any children they expose are still traversed.
    """
    # Children are pushed reversed so that popping visits them first-to-last.
    pending: list[Any] = list(iter_child_nodes(node))[::-1]
    while pending:
        current = pending.pop()
        if is_element(current):
            if getattr(current, "name", "").lower() == "template":
                # Prune the whole template subtree rather than only the
                # template node, so nested elements are not reported.
                continue
            yield current
        pending.extend(list(iter_child_nodes(current))[::-1])
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def iter_preorder_elements(root: Element) -> Iterator[Element]:
    """Yield `root` first, followed by each of its descendant elements."""
    yield root
    for descendant in iter_descendant_elements(root):
        yield descendant
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def get_attr(el: Element, name: str) -> str | None:
    """Look up attribute `name` on `el`; the result is None when it is absent."""
    attributes = el.attrs
    return attributes.get(name)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def set_attr(el: Element, name: str, value: str | None) -> None:
    """Store `value` under attribute `name` on `el` (None marks a boolean attribute)."""
    attributes = el.attrs
    attributes[name] = value
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def get_classes(el: Element) -> list[str]:
    """Return the tokens of the `class` attribute of `el`, in document order.

    Following HTML, the value is split on ASCII whitespace only (space, tab,
    LF, FF, CR); any non-ASCII whitespace character stays inside its token.
    A missing or empty attribute gives an empty list.
    """
    value = get_attr(el, "class")
    if not value:
        return []
    # Trim ASCII whitespace only; str.strip() with no argument would also
    # remove non-ASCII whitespace.
    trimmed = value.strip(" \t\n\f\r")
    if trimmed == "":
        return []
    tokens = _ASCII_WHITESPACE_RE.split(trimmed)
    return [token for token in tokens if token]
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# One or more HTML ASCII whitespace characters (space, tab, LF, FF, CR);
# `get_classes` splits the `class` attribute on this pattern.
_ASCII_WHITESPACE_RE = re.compile(r"[ \t\n\f\r]+")
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def has_any_class(el: Element, names: Iterable[str]) -> bool:
    """Return True if at least one of `names` is a class token of `el`."""
    present = set(get_classes(el))
    for wanted in names:
        if wanted in present:
            return True
    return False
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def has_class_prefix(el: Element, prefixes: Iterable[str]) -> bool:
    """Return True if some class token of `el` begins with any of `prefixes`."""
    # str.startswith accepts a tuple, so materialize the iterable once.
    wanted = tuple(prefixes)
    for token in get_classes(el):
        if token.startswith(wanted):
            return True
    return False
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def ancestor_elements(el: Element) -> Iterator[Element]:
    """Yield the element ancestors of `el`, nearest first.

    Non-element ancestors are stepped over; the walk ends at the first node
    whose `parent` is None or missing.
    """
    node: Any | None = el.parent
    while True:
        if node is None:
            return
        if is_element(node):
            yield node
        node = getattr(node, "parent", None)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
@dataclass(frozen=True, slots=True)
|
|
132
|
+
class ValueClassNodes:
|
|
133
|
+
nodes: list[Element]
|
mf2dom/implied.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""Implied property parsing (name/photo/url).
|
|
2
|
+
|
|
3
|
+
Implied properties are applied when an mf2 item has no explicit corresponding
|
|
4
|
+
properties, per the mf2 parsing algorithm.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
from typing import TYPE_CHECKING
|
|
11
|
+
|
|
12
|
+
from .classes import has_root_class, is_valid_property_class
|
|
13
|
+
from .dom import Element, get_attr, get_classes, iter_child_elements
|
|
14
|
+
from .text import text_content
|
|
15
|
+
from .urls import parse_srcset, try_urljoin
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING: # pragma: no cover
|
|
18
|
+
from .types import UrlObject, UrlValue
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _is_microformat_root(el: Element) -> bool:
    """Return whether `el` carries at least one valid root (`h-*`) class."""
    tokens = get_classes(el)
    return has_root_class(tokens)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _has_property_class(el: Element) -> bool:
    """Return whether `el` carries at least one valid property class."""
    for token in get_classes(el):
        if is_valid_property_class(token):
            return True
    return False
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _is_implied_candidate(el: Element) -> bool:
    """Return True when `el` is neither a microformat root nor a property element."""
    return not (_is_microformat_root(el) or _has_property_class(el))
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Any run of whitespace; `implied_name` collapses each run to a single space.
_WHITESPACE_RE = re.compile(r"\s+")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def implied_name(root: Element, base_url: str | None) -> str:
    """Compute the implied `name` of the item rooted at `root`.

    Sources are tried in this order, and the chosen text always has its
    whitespace runs collapsed to single spaces and its ends trimmed:

    1. `alt` of `root` when it is an `img`/`area`, or `title` when it is an
       `abbr` (an empty value is accepted here).
    2. A non-empty `alt`/`title` of the only element child of `root`, or of
       that child's only element child, provided the element used is an
       `img`/`area`/`abbr` and neither it nor the intermediate child is a
       microformat root.
    3. The text content of `root`.
    """
    # Which attribute supplies the name for each special tag.
    attr_for_tag = {"img": "alt", "area": "alt", "abbr": "title"}

    def collapse(text: str) -> str:
        return _WHITESPACE_RE.sub(" ", text).strip()

    own_attr = attr_for_tag.get(root.name.lower())
    if own_attr is not None:
        own_value = get_attr(root, own_attr)
        if own_value is not None:
            return collapse(own_value)

    target: Element | None = None
    kids = list(iter_child_elements(root))
    if len(kids) == 1 and not _is_microformat_root(kids[0]):
        target = kids[0]
        if target.name.lower() not in attr_for_tag:
            # The only child is a wrapper: look one level deeper.
            target = None
            grandkids = list(iter_child_elements(kids[0]))
            if len(grandkids) == 1:
                inner = grandkids[0]
                if inner.name.lower() in attr_for_tag and not _is_microformat_root(inner):
                    target = inner

    if target is not None:
        nested_value = get_attr(target, attr_for_tag[target.name.lower()])
        # Unlike the root case, an empty nested value is ignored.
        if nested_value is not None and nested_value != "":
            return collapse(nested_value)

    return collapse(text_content(root, replace_img=True, img_to_src=False, base_url=base_url))
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _img_value(img: Element, base_url: str | None) -> UrlValue | None:
    """Build the value for an image element from its `src`, `alt` and `srcset`.

    Returns None when `img` has no `src`. Otherwise the `src` is joined with
    `base_url` (the raw `src` is kept if the join gives nothing). The result
    is that URL alone, or a dict with `value` plus `alt` and/or `srcset` when
    an `alt` attribute is present or `srcset` is non-empty.
    """
    source = get_attr(img, "src")
    if source is None:
        return None
    resolved = try_urljoin(base_url, source) or source

    alt_text = get_attr(img, "alt")
    srcset_attr = get_attr(img, "srcset")
    if alt_text is None and not srcset_attr:
        return resolved

    result: UrlObject = {"value": resolved}
    if alt_text is not None:
        result["alt"] = alt_text
    if srcset_attr:
        result["srcset"] = parse_srcset(srcset_attr, base_url)
    return result
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def implied_photo(root: Element, base_url: str | None) -> UrlValue | None:
    """Compute the implied `photo` of the item rooted at `root`.

    An `img` root gives its image value directly. An `object` root with a
    `data` attribute gives that URL joined with `base_url`. Otherwise the
    element children of `root` are searched for exactly one `img` (then
    exactly one `object`) that is not a microformat root and has no valid
    `u-*` class. If nothing is found and `root` has a single element child
    that is not a microformat root, the same search runs on that child's
    element children. Returns None when no source is found.
    """
    root_tag = root.name.lower()
    if root_tag == "img":
        return _img_value(root, base_url)
    if root_tag == "object":
        own_data = get_attr(root, "data")
        if own_data is not None:
            return try_urljoin(base_url, own_data) or own_data

    def carries_u_property(el: Element) -> bool:
        for cls in get_classes(el):
            if cls.startswith("u-") and is_valid_property_class(cls):
                return True
        return False

    def pick(elements: list[Element]) -> Element | None:
        # `img` takes precedence over `object`; each must be unique among `elements`.
        for tag in ("img", "object"):
            matches = [e for e in elements if e.name.lower() == tag]
            if len(matches) != 1:
                continue
            only = matches[0]
            if _is_microformat_root(only) or carries_u_property(only):
                continue
            return only
        return None

    kids = list(iter_child_elements(root))
    chosen = pick(kids)
    if chosen is None and len(kids) == 1 and not _is_microformat_root(kids[0]):
        chosen = pick(list(iter_child_elements(kids[0])))
    if chosen is None:
        return None

    if chosen.name.lower() == "img":
        return _img_value(chosen, base_url)
    nested_data = get_attr(chosen, "data")
    if nested_data is None:
        return None
    return try_urljoin(base_url, nested_data) or nested_data
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def implied_url(root: Element, base_url: str | None) -> str | None:
    """Compute the implied `url` of the item rooted at `root`.

    An `a` or `area` root with an `href` gives that URL joined with
    `base_url`. Otherwise the element children of `root` are searched for
    exactly one `a` (then exactly one `area`) that is not a microformat root
    and has no valid `u-*` class. If nothing is found and `root` has a single
    element child that is not a microformat root, the same search runs on
    that child's element children. Returns None when no `href` is found.
    """
    if root.name.lower() in {"a", "area"}:
        own_href = get_attr(root, "href")
        if own_href is not None:
            return try_urljoin(base_url, own_href) or own_href

    def carries_u_property(el: Element) -> bool:
        for cls in get_classes(el):
            if cls.startswith("u-") and is_valid_property_class(cls):
                return True
        return False

    def pick(elements: list[Element]) -> Element | None:
        # `a` takes precedence over `area`; each must be unique among `elements`.
        for tag in ("a", "area"):
            matches = [e for e in elements if e.name.lower() == tag]
            if len(matches) != 1:
                continue
            only = matches[0]
            if _is_microformat_root(only) or carries_u_property(only):
                continue
            return only
        return None

    kids = list(iter_child_elements(root))
    chosen = pick(kids)
    if chosen is None and len(kids) == 1 and not _is_microformat_root(kids[0]):
        chosen = pick(list(iter_child_elements(kids[0])))
    if chosen is None:
        return None

    nested_href = get_attr(chosen, "href")
    if nested_href is None:
        return None
    return try_urljoin(base_url, nested_href) or nested_href
|