mf2dom 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mf2dom/__init__.py +8 -0
- mf2dom/classes.py +87 -0
- mf2dom/dom.py +133 -0
- mf2dom/implied.py +166 -0
- mf2dom/parser.py +395 -0
- mf2dom/properties.py +257 -0
- mf2dom/renderer.py +601 -0
- mf2dom/text.py +66 -0
- mf2dom/types.py +57 -0
- mf2dom/urls.py +31 -0
- mf2dom/vcp.py +211 -0
- mf2dom-0.1.9.dist-info/METADATA +94 -0
- mf2dom-0.1.9.dist-info/RECORD +15 -0
- mf2dom-0.1.9.dist-info/WHEEL +4 -0
- mf2dom-0.1.9.dist-info/licenses/LICENSE.md +651 -0
mf2dom/types.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Type definitions for mf2dom.
|
|
2
|
+
|
|
3
|
+
These TypedDicts model the JSON output defined by the Microformats2 parsing
|
|
4
|
+
specification and the official test suite.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import NotRequired, TypeAlias, TypedDict
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class UrlObject(TypedDict):
|
|
13
|
+
value: str
|
|
14
|
+
alt: NotRequired[str]
|
|
15
|
+
srcset: NotRequired[dict[str, str]]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class EValue(TypedDict, total=False):
    """Property value carrying both a plain-text and an HTML rendering.

    Declared with ``total=False``, so every key is optional.
    NOTE(review): presumably the shape of an ``e-*`` property value -- confirm
    against the parser.
    """

    # Plain-text form of the value.
    value: str
    # HTML form of the value.
    html: str
    # Language tag associated with the value, when known.
    lang: str
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# A primitive property value is represented as a plain string.
PropertyPrimitive: TypeAlias = str
|
|
25
|
+
# A URL value is either a bare URL string or a structured ``UrlObject``.
UrlValue: TypeAlias = str | UrlObject
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class Mf2Item(TypedDict, total=False):
    """A single microformat item in the mf2 JSON output.

    Declared with ``total=False``, so every key is optional at the type level.
    Items nest: ``children`` holds further items, and an item may itself occur
    as a ``PropertyValue`` inside another item's ``properties``.
    """

    # Microformat type names for this item.
    type: list[str]
    # Property name -> list of values found for that property.
    properties: dict[str, list[PropertyValue]]
    # Identifier for the item, when present.
    id: str
    # Nested items.
    children: list[Mf2Item]
    # The keys below also appear on ``EValue``; presumably they are set when
    # the item is embedded as a property value -- confirm against the parser.
    value: PropertyValue
    html: str
    lang: str
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Any value that may appear in an item's ``properties`` lists: a plain string,
# a structured URL, a text+HTML value, or a nested item.
PropertyValue: TypeAlias = PropertyPrimitive | UrlObject | EValue | Mf2Item
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class RelUrl(TypedDict, total=False):
    """Metadata recorded for one URL in the document's ``rel-urls`` mapping.

    Declared with ``total=False``, so every key is optional.
    """

    # Rel values associated with the URL.
    rels: list[str]
    # The remaining keys are plain strings; presumably they mirror the link's
    # text content and its same-named HTML attributes -- confirm against the parser.
    text: str
    media: str
    hreflang: str
    type: str
    title: str
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# Top-level parse result. The functional ``TypedDict`` form is required here
# because the key "rel-urls" is not a valid Python identifier and therefore
# cannot be declared with class syntax.
Mf2Document = TypedDict(
    "Mf2Document",
    {
        # Top-level items of the document.
        "items": list[Mf2Item],
        # Maps a string key to a list of strings.
        "rels": dict[str, list[str]],
        # Maps a string key to its ``RelUrl`` metadata.
        "rel-urls": dict[str, RelUrl],
    },
)
|
mf2dom/urls.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""URL utilities (joining and `srcset` parsing)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from urllib.parse import urljoin
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def try_urljoin(base: str | None, url: str | None, *, allow_fragments: bool = True) -> str | None:
|
|
10
|
+
if url is None:
|
|
11
|
+
return None
|
|
12
|
+
if url.startswith(("https://", "http://")):
|
|
13
|
+
return url
|
|
14
|
+
if not base:
|
|
15
|
+
return url
|
|
16
|
+
try:
|
|
17
|
+
return urljoin(base, url, allow_fragments=allow_fragments)
|
|
18
|
+
except ValueError:
|
|
19
|
+
return url
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# One srcset candidate: a run of non-whitespace (group 1, the URL, which may
# still carry a trailing comma), an optional descriptor made of digits/dots
# followed by "x" or "w" (group 2), then an optional comma separator.
_SRCSET_RE = re.compile(r"(\S+)\s*([\d.]+[xw])?\s*,?\s*", re.MULTILINE)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def parse_srcset(srcset: str, base_url: str | None) -> dict[str, str]:
    """Parse a ``srcset`` attribute into a descriptor -> URL mapping.

    Args:
        srcset: Raw attribute value.
        base_url: Base used to resolve each candidate URL via ``try_urljoin``.

    Returns:
        A dict keyed by descriptor (e.g. ``"2x"``, ``"480w"``). Candidates
        without a descriptor are stored under ``"1x"``. When a descriptor
        repeats, the first candidate wins.
    """
    by_descriptor: dict[str, str] = {}
    for candidate in _SRCSET_RE.finditer(srcset):
        # Unmatched optional groups become "" (same as ``findall``).
        raw_url, descriptor = candidate.groups(default="")
        key = descriptor if descriptor else "1x"
        if key in by_descriptor:
            continue
        # The URL group is greedy on non-whitespace and may swallow the comma.
        cleaned = raw_url.strip(",")
        by_descriptor[key] = try_urljoin(base_url, cleaned) or cleaned
    return by_descriptor
|
mf2dom/vcp.py
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
"""Value Class Pattern (VCP) parsing.
|
|
2
|
+
|
|
3
|
+
Implements the mf2 Value Class Pattern for `p-*`, `u-*`, and `dt-*` properties.
|
|
4
|
+
See: https://microformats.org/wiki/value-class-pattern
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
from typing import TYPE_CHECKING
|
|
11
|
+
|
|
12
|
+
from .classes import has_root_class, is_valid_property_class
|
|
13
|
+
from .dom import (
|
|
14
|
+
Element,
|
|
15
|
+
get_attr,
|
|
16
|
+
get_classes,
|
|
17
|
+
iter_child_elements,
|
|
18
|
+
)
|
|
19
|
+
from .text import text_content
|
|
20
|
+
|
|
21
|
+
if TYPE_CHECKING: # pragma: no cover
|
|
22
|
+
from collections.abc import Iterator
|
|
23
|
+
|
|
24
|
+
# Regex fragments for date/time value parsing. They are composed into larger
# patterns below, so they are kept as raw strings rather than compiled objects.

# Calendar date (YYYY-MM-DD) or ordinal date (YYYY-DDD).
_DATE_RE = r"(\d{4}-\d{2}-\d{2})|(\d{4}-\d{3})"
# ":SS" with an optional fractional part (the fraction is not in a named group).
_SEC_RE = r"(:(?P<second>\d{2})(\.\d+)?)"
# Hour with optional ":MM" and optional ":SS".
_RAWTIME_RE = rf"(?P<hour>\d{{1,2}})(:(?P<minute>\d{{2}}){_SEC_RE}?)?"
_AMPM_RE = r"am|pm|a\.m\.|p\.m\.|AM|PM|A\.M\.|P\.M\."
# NOTE(review): the trailing "?" makes ``\d{2}`` lazy, not optional, so a
# numeric offset needs two minute digits. Left unchanged on purpose.
_TIMEZONE_RE = r"Z|[+-]\d{1,2}:?\d{2}?"
_TIME_RE = rf"(?P<rawtime>{_RAWTIME_RE})( ?(?P<ampm>{_AMPM_RE}))?( ?(?P<tz>{_TIMEZONE_RE}))?"
_DATETIME_RE = rf"(?P<date>{_DATE_RE})(?P<separator>[Tt ])(?P<time>{_TIME_RE})"

# Anchored, compiled forms. Each fragment is wrapped in a non-capturing group
# before "$" is appended: ``_DATE_RE`` and ``_TIMEZONE_RE`` contain a top-level
# "|", so a bare ``fragment + "$"`` would anchor only the last alternative and
# let inputs such as "2020-01-02junk" or "Zebra" match. The wrapper adds no
# capture group, so group numbers and names are unchanged.
_TIME_RE_COMPILED = re.compile(rf"(?:{_TIME_RE})$")
_DATE_RE_COMPILED = re.compile(rf"(?:{_DATE_RE})$")
_TZ_ONLY_RE_COMPILED = re.compile(rf"(?:{_TIMEZONE_RE})$")
_DATETIME_RE_COMPILED = re.compile(rf"(?:{_DATETIME_RE})$")

# Used for 12-hour -> 24-hour conversion.
_HOURS_IN_HALF_DAY = 12
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _is_value_node(el: Element) -> bool:
    """Return True when ``el`` carries the ``value`` or ``value-title`` class."""
    return not {"value", "value-title"}.isdisjoint(get_classes(el))
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _is_property_node(el: Element) -> bool:
    """Return True when at least one class on ``el`` is a valid property class."""
    return any(map(is_valid_property_class, get_classes(el)))
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _is_microformat_root(el: Element) -> bool:
    """Return True when the classes on ``el`` include a microformat root class."""
    class_names = get_classes(el)
    return has_root_class(class_names)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _iter_value_nodes(root: Element) -> Iterator[Element]:
    """Yield the value nodes beneath ``root`` in document order.

    ``root`` itself is never yielded. A value node is yielded and not descended
    into. Any other element that is a property or a microformat root is skipped
    together with its subtree; all remaining elements are traversed.
    """
    # Explicit LIFO stack; children are pushed reversed so that popping
    # visits them first-to-last.
    pending: list[Element] = [*iter_child_elements(root)][::-1]
    while pending:
        node = pending.pop()
        if _is_value_node(node):
            yield node
        elif not (_is_property_node(node) or _is_microformat_root(node)):
            pending.extend([*iter_child_elements(node)][::-1])
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def text(root: Element) -> str | None:
    """Build the text value of ``root`` from its value-class-pattern nodes.

    Each value node contributes one piece:

    * ``value-title`` nodes contribute their ``title`` attribute, or nothing
      when the attribute is missing;
    * ``img``/``area`` use ``alt``, ``data``/``input`` use ``value``, ``abbr``
      uses ``title`` -- falling back to the text content when that attribute
      is missing;
    * any other element contributes its text content.

    Returns:
        The pieces concatenated without a separator, or ``None`` when no piece
        was collected.
    """
    # Attribute consulted first for the special-cased tags.
    preferred_attr = {
        "img": "alt",
        "area": "alt",
        "data": "value",
        "input": "value",
        "abbr": "title",
    }

    pieces: list[str] = []
    for node in _iter_value_nodes(root):
        if "value-title" in set(get_classes(node)):
            explicit = get_attr(node, "title")
            if explicit is not None:
                pieces.append(explicit)
            continue

        attr_name = preferred_attr.get(node.name.lower())
        attr_value = None if attr_name is None else get_attr(node, attr_name)
        # Only a missing attribute triggers the fallback; an empty string is kept.
        pieces.append(text_content(node) if attr_value is None else attr_value)

    return "".join(pieces) if pieces else None
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def datetime(root: Element, default_date: str | None) -> tuple[str, str | None] | None:
    """Assemble a datetime value from the value-class-pattern nodes of ``root``.

    Args:
        root: Element whose value nodes are inspected.
        default_date: Date used when only a time was found.

    Returns:
        ``None`` when there are no usable value nodes, or when neither a date
        nor a time was recognised. Otherwise ``(value, date)``, where ``value``
        is the assembled (and, if it forms a full datetime, normalized) string
        and ``date`` is the date part used, which may be ``None``.
    """
    # Attribute consulted before falling back to the text content, per tag.
    attr_for_tag = {
        "img": "alt",
        "area": "alt",
        "data": "value",
        "input": "value",
        "abbr": "title",
        "del": "datetime",
        "ins": "datetime",
        "time": "datetime",
    }
    time_like_tags = {"del", "ins", "time"}

    # Phase 1: collect (stripped string, came-from-time/ins/del) pairs.
    collected: list[tuple[str, bool]] = []
    for node in _iter_value_nodes(root):
        if "value-title" in set(get_classes(node)):
            # value-title nodes only ever contribute their title attribute.
            raw = get_attr(node, "title")
            is_time_el = False
        else:
            tag = node.name.lower()
            attr_name = attr_for_tag.get(tag)
            raw = (get_attr(node, attr_name) if attr_name else None) or text_content(node)
            is_time_el = tag in time_like_tags
        if raw:
            collected.append((raw.strip(), is_time_el))

    if not collected:
        return None

    # Phase 2: classify each part; the first date, time and timezone win.
    date_part: str | None = None
    time_part: str | None = None
    tz_part: str | None = None
    time_part_from_time_el = False

    for part, from_time_el in collected:
        if full := _DATETIME_RE_COMPILED.match(part):
            nothing_seen_yet = date_part is None and time_part is None and tz_part is None
            if nothing_seen_yet:
                # A complete datetime up front short-circuits everything else.
                return normalize_datetime(part, match=full), full.group("date")
            continue

        if date_part is None and _DATE_RE_COMPILED.match(part):
            date_part = part
            continue

        if time_part is None and (time_match := _TIME_RE_COMPILED.match(part)):
            time_part = part
            time_part_from_time_el = from_time_el
            embedded_tz = time_match.group("tz")
            if tz_part is None and embedded_tz:
                tz_part = embedded_tz
            continue

        if tz_part is None and _TZ_ONLY_RE_COMPILED.match(part):
            tz_part = part

    # Phase 3: combine the parts.
    if date_part is None:
        if time_part is None:
            return None
        date_part = default_date

    if date_part and time_part:
        value = f"{date_part} {time_part}"
    else:
        value = date_part or time_part or ""

    if tz_part and time_part and tz_part not in value:
        value += tz_part

    if time_part_from_time_el:
        # In the official test suite, timezones with a colon originating from
        # <time>/<ins>/<del> value nodes are normalized to the compact form
        # (e.g. -08:00 => -0800).
        value = re.sub(r"([+-]\d{1,2}):(\d{2})$", r"\1\2", value)

    if combined := _DATETIME_RE_COMPILED.match(value):
        value = normalize_datetime(value, match=combined)
    return value, date_part
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def normalize_datetime(dtstr: str, *, match: re.Match[str] | None = None) -> str:
    """Convert a 12-hour datetime string to 24-hour form.

    Args:
        dtstr: The datetime string.
        match: Optional pre-computed match for ``dtstr``; when omitted the
            string is matched against the module's datetime pattern.

    Returns:
        ``dtstr`` unchanged when it does not match or carries no AM/PM marker.
        Otherwise ``<date><separator>HH:MM[:SS][tz]`` with the hour converted
        to 24-hour form, missing minutes filled in as ``00``, and the AM/PM
        marker removed.
    """
    parsed = match or _DATETIME_RE_COMPILED.match(dtstr)
    if not parsed:
        return dtstr

    datestr, separator, hour, minute, second, ampm, tz = parsed.group(
        "date", "separator", "hour", "minute", "second", "ampm", "tz"
    )

    # Only normalize when AM/PM is present.
    if not ampm:
        return dtstr

    meridiem = ampm.lower()
    hour_int = int(hour)
    if meridiem.startswith("a"):
        # 12 AM is midnight.
        if hour_int == _HOURS_IN_HALF_DAY:
            hour_int = 0
    elif meridiem.startswith("p") and hour_int < _HOURS_IN_HALF_DAY:
        hour_int += _HOURS_IN_HALF_DAY

    clock = [f"{hour_int:02d}", minute or "00"]
    if second is not None:
        clock.append(second)
    time_out = ":".join(clock)
    tz_out = tz or ""

    return f"{datestr}{separator}{time_out}{tz_out}"
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mf2dom
|
|
3
|
+
Version: 0.1.9
|
|
4
|
+
Summary: Microformats2 (mf2) parser and renderer powered by JustHTML.
|
|
5
|
+
Author-email: Beto Dealmeida <contact@robida.net>
|
|
6
|
+
License-File: LICENSE.md
|
|
7
|
+
Requires-Python: >=3.11
|
|
8
|
+
Requires-Dist: justhtml>=0.12.0
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
|
|
11
|
+
# mf2dom
|
|
12
|
+
|
|
13
|
+
Microformats2 (mf2) parser and deterministic renderer powered by JustHTML.
|
|
14
|
+
|
|
15
|
+
`mf2dom` focuses on:
|
|
16
|
+
- Correct mf2 parsing (validated against the official `microformats-tests` suite)
|
|
17
|
+
- Deterministic HTML rendering and stable round-trips (`HTML -> mf2 -> HTML`)
|
|
18
|
+
- A small runtime surface area (no network I/O, no BeautifulSoup)
|
|
19
|
+
|
|
20
|
+
## Installation
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
pip install mf2dom
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
Requires Python 3.11+.
|
|
27
|
+
|
|
28
|
+
## Quickstart
|
|
29
|
+
|
|
30
|
+
Parse mf2 JSON from HTML:
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
import mf2dom
|
|
34
|
+
|
|
35
|
+
html = '<a class="h-card u-url p-name" href="/me">Alice</a>'
|
|
36
|
+
doc = mf2dom.parse(html, base_url="https://example.com/")
|
|
37
|
+
print(doc["items"])
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
The parsed document is a dict with `items`, `rels`, and `rel-urls` keys (mf2 JSON shape).
|
|
41
|
+
|
|
42
|
+
Render mf2 JSON back into canonical HTML:
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
html2 = mf2dom.render(doc)
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Async parsing (offloads to a thread):
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
doc = await mf2dom.parse_async(html, base_url="https://example.com/")
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## API
|
|
55
|
+
|
|
56
|
+
- `mf2dom.parse(html, *, base_url=None, url=None) -> dict`
|
|
57
|
+
- `html` can be a string/bytes, a `justhtml.JustHTML` instance, or a JustHTML root node.
|
|
58
|
+
- `base_url` controls resolution of relative URLs (preferred). `url` is a deprecated alias.
|
|
59
|
+
- `mf2dom.parse_async(...)` is `parse(...)` via `asyncio.to_thread(...)`.
|
|
60
|
+
- `mf2dom.render(doc) -> str` renders a deterministic HTML representation of an mf2 document.
|
|
61
|
+
|
|
62
|
+
## Why mf2dom vs mf2py?
|
|
63
|
+
|
|
64
|
+
Both libraries parse microformats, but they optimize for different use cases:
|
|
65
|
+
|
|
66
|
+
- Choose `mf2dom` if you need deterministic rendering, stable round-trips, and a smaller/no-network
|
|
67
|
+
runtime surface (useful for normalization, caching, and “canonical mf2 HTML” fixtures).
|
|
68
|
+
- Choose `mf2py` if you need URL fetching, microformats1 compatibility, metaformats support, or
|
|
69
|
+
wider Python version support.
|
|
70
|
+
|
|
71
|
+
## Testing & correctness
|
|
72
|
+
|
|
73
|
+
- Official parsing fixtures: `tests/test_official_microformats_suite.py` runs the upstream
|
|
74
|
+
`microformats-tests` JSON fixtures.
|
|
75
|
+
- Coverage gate: `pyproject.toml` enforces 100% branch coverage.
|
|
76
|
+
|
|
77
|
+
To run the official fixture suite locally, check out `microformats-tests` as a sibling directory:
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
git clone https://github.com/microformats/microformats-tests ../microformats-tests
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Development (uv)
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
uv sync --group dev
|
|
87
|
+
uv run pytest
|
|
88
|
+
uv run coverage run -m pytest && uv run coverage report
|
|
89
|
+
uv run pre-commit install
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## License
|
|
93
|
+
|
|
94
|
+
AGPL 3
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
mf2dom/__init__.py,sha256=YmTk1FZ8B2u6J2LfZETnxxuNWa88pCcpYDXdBaaFlt0,215
|
|
2
|
+
mf2dom/classes.py,sha256=39YrnFFTh1onGs_EIRkV-EFFEBm2atfpcfbzDydMtps,2921
|
|
3
|
+
mf2dom/dom.py,sha256=GgKSjwcrWrn8Dow-SuDf295ZgwopvhyZMkAMcshQACc,4061
|
|
4
|
+
mf2dom/implied.py,sha256=P8HrXX1JLNzZGl2psstd4qnfAHYuPHjsG-qUD4mjqnc,5893
|
|
5
|
+
mf2dom/parser.py,sha256=dZTzs2Q-dKM_FuA5jnLd6-VNlpTLJyYGnkgQwJvaQnc,13372
|
|
6
|
+
mf2dom/properties.py,sha256=BFtVsntKzN8yu52bjjjSTPwK674E48wfRneY314k41w,7629
|
|
7
|
+
mf2dom/renderer.py,sha256=UkWYEpJeRbxF-83bnqsNPTgTETpdaUaU1SoUSSmICSs,19261
|
|
8
|
+
mf2dom/text.py,sha256=g15f58Oh-tSUrOSCNK661YeGLTaamttlNrrgixF8g2I,1827
|
|
9
|
+
mf2dom/types.py,sha256=mKRe9UaVVV0HlMJRefUSbHCsbFBvIKo6xlv7eLhRSuk,1100
|
|
10
|
+
mf2dom/urls.py,sha256=VeyRxYNl3CyZu-dxLIWf7b1GF0Bol4yEVTOLHDS9F_M,895
|
|
11
|
+
mf2dom/vcp.py,sha256=d8nH7ORA1P5lJe-HwdQInmmIZPHEjTAuT_6KPHEvbqg,7001
|
|
12
|
+
mf2dom-0.1.9.dist-info/METADATA,sha256=jj5scPTQIunaAPOmXtYVnNLDQJzxSsyn6Z0IlVrq344,2643
|
|
13
|
+
mf2dom-0.1.9.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
14
|
+
mf2dom-0.1.9.dist-info/licenses/LICENSE.md,sha256=MqCnOBu8uXsEOzRZWh9EBVfVz-kE9NkXcLCrtGXo2yU,34354
|
|
15
|
+
mf2dom-0.1.9.dist-info/RECORD,,
|