mf2dom 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mf2dom/types.py ADDED
@@ -0,0 +1,57 @@
1
+ """Type definitions for mf2dom.
2
+
3
+ These TypedDicts model the JSON output defined by the Microformats2 parsing
4
+ specification and the official test suite.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import NotRequired, TypeAlias, TypedDict
10
+
11
+
12
class UrlObject(TypedDict):
    """Structured URL value: a required ``value`` plus optional ``alt`` and ``srcset``."""

    # Required key (the class uses TypedDict's default total=True).
    value: str
    # Optional alternative text.
    # NOTE(review): presumably the `alt` of an <img>; confirm against the parser.
    alt: NotRequired[str]
    # Optional str -> str mapping. `parse_srcset` in mf2dom/urls.py returns dicts keyed by
    # descriptor ("1x", "2x", "480w", ...) — presumably the source of this field; confirm.
    srcset: NotRequired[dict[str, str]]
16
+
17
+
18
class EValue(TypedDict, total=False):
    """Dict with optional ``value``, ``html`` and ``lang`` string keys.

    Declared with ``total=False``, so a type checker treats every key as optional.
    NOTE(review): the name suggests this models the parsed value of an ``e-*``
    property — confirm against the parser.
    """

    value: str
    html: str
    lang: str
22
+
23
+
24
# A primitive property value is a plain string.
PropertyPrimitive: TypeAlias = str
# A URL value is either a bare string or a structured `UrlObject`.
UrlValue: TypeAlias = str | UrlObject
26
+
27
+
28
class Mf2Item(TypedDict, total=False):
    """One item of the mf2 JSON output; every key is optional (``total=False``).

    The type is recursive: ``children`` holds further ``Mf2Item`` dicts, and the
    values inside ``properties`` may themselves be ``Mf2Item`` dicts.
    """

    type: list[str]
    # Property name -> list of values. `PropertyValue` is defined later in this module; the
    # forward reference works because `from __future__ import annotations` defers evaluation.
    properties: dict[str, list[PropertyValue]]
    id: str
    children: list[Mf2Item]
    # NOTE(review): `value`, `html` and `lang` presumably only appear when the item is
    # embedded as a property value — confirm against the parser.
    value: PropertyValue
    html: str
    lang: str
36
+
37
+
38
# Every shape a single property value can take: a plain string, a `UrlObject`,
# an `EValue`, or a nested `Mf2Item`.
PropertyValue: TypeAlias = PropertyPrimitive | UrlObject | EValue | Mf2Item
39
+
40
+
41
class RelUrl(TypedDict, total=False):
    """Value type of the document's ``rel-urls`` mapping; every key is optional.

    NOTE(review): apart from ``rels``, the keys presumably mirror the link
    element's text and its ``media``/``hreflang``/``type``/``title``
    attributes — confirm against the parser.
    """

    rels: list[str]
    text: str
    media: str
    hreflang: str
    type: str
    title: str
48
+
49
+
50
# Top-level mf2 JSON document. The functional `TypedDict(...)` form is used because
# "rel-urls" is not a valid Python identifier and cannot be declared with class syntax.
Mf2Document = TypedDict(
    "Mf2Document",
    {
        "items": list[Mf2Item],
        "rels": dict[str, list[str]],
        "rel-urls": dict[str, RelUrl],
    },
)
mf2dom/urls.py ADDED
@@ -0,0 +1,31 @@
1
+ """URL utilities (joining and `srcset` parsing)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from urllib.parse import urljoin
7
+
8
+
9
def try_urljoin(base: str | None, url: str | None, *, allow_fragments: bool = True) -> str | None:
    """Resolve ``url`` against ``base`` without ever raising.

    Args:
        base: Base URL; when falsy, ``url`` is returned unchanged.
        url: URL to resolve; ``None`` yields ``None``.
        allow_fragments: Forwarded to ``urllib.parse.urljoin``.

    Returns:
        ``None`` when ``url`` is ``None``; ``url`` itself when it already starts
        with ``http://`` or ``https://``, when there is no base, or when
        ``urljoin`` raises ``ValueError``; otherwise the joined URL.
    """
    is_absolute_http = url is not None and url.startswith(("https://", "http://"))
    if url is None or is_absolute_http or not base:
        return url

    try:
        joined = urljoin(base, url, allow_fragments=allow_fragments)
    except ValueError:
        # Unparseable input: fall back to the URL exactly as given.
        return url
    else:
        return joined
20
+
21
+
22
# One srcset candidate: a whitespace-delimited URL token, an optional density/width
# descriptor ("2x", "1.5x", "480w") and an optional separating comma. The pattern has
# no `^`/`$` anchors, so no `re.MULTILINE` flag is needed.
_SRCSET_RE = re.compile(r"(\S+)\s*([\d.]+[xw])?\s*,?\s*")


def parse_srcset(srcset: str, base_url: str | None) -> dict[str, str]:
    """Parse an HTML ``srcset`` attribute into a ``descriptor -> URL`` mapping.

    Args:
        srcset: Raw ``srcset`` attribute value.
        base_url: Base used to resolve each candidate URL via ``try_urljoin``.

    Returns:
        Dict keyed by descriptor. Candidates without a descriptor are keyed
        ``"1x"``; when a descriptor repeats, the first candidate wins.
    """
    sources: dict[str, str] = {}
    for raw_url, descriptor in _SRCSET_RE.findall(srcset):
        # `\S+` swallows a comma that directly follows the URL token; drop it once here.
        url = raw_url.strip(",")
        if not url:
            # A stray or leading comma is not a candidate. Joining an empty string
            # against the base would record the base URL itself as an image source.
            continue
        key = descriptor or "1x"
        if key not in sources:
            sources[key] = try_urljoin(base_url, url) or url
    return sources
mf2dom/vcp.py ADDED
@@ -0,0 +1,211 @@
1
+ """Value Class Pattern (VCP) parsing.
2
+
3
+ Implements the mf2 Value Class Pattern for `p-*`, `u-*`, and `dt-*` properties.
4
+ See: https://microformats.org/wiki/value-class-pattern
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import re
10
+ from typing import TYPE_CHECKING
11
+
12
+ from .classes import has_root_class, is_valid_property_class
13
+ from .dom import (
14
+ Element,
15
+ get_attr,
16
+ get_classes,
17
+ iter_child_elements,
18
+ )
19
+ from .text import text_content
20
+
21
+ if TYPE_CHECKING: # pragma: no cover
22
+ from collections.abc import Iterator
23
+
24
# Calendar date (YYYY-MM-DD) or ordinal date (YYYY-DDD).
_DATE_RE = r"(\d{4}-\d{2}-\d{2})|(\d{4}-\d{3})"
# Seconds, with an optional fractional part that is matched but not captured by name.
_SEC_RE = r"(:(?P<second>\d{2})(\.\d+)?)"
_RAWTIME_RE = rf"(?P<hour>\d{{1,2}})(:(?P<minute>\d{{2}}){_SEC_RE}?)?"
_AMPM_RE = r"am|pm|a\.m\.|p\.m\.|AM|PM|A\.M\.|P\.M\."
# NOTE(review): the trailing `?` in `\d{2}?` makes the quantifier lazy, not optional, so
# hour-only offsets such as "+08" do not match. Left unchanged here; confirm whether
# hour-only offsets should be accepted before altering it.
_TIMEZONE_RE = r"Z|[+-]\d{1,2}:?\d{2}?"
_TIME_RE = rf"(?P<rawtime>{_RAWTIME_RE})( ?(?P<ampm>{_AMPM_RE}))?( ?(?P<tz>{_TIMEZONE_RE}))?"
_DATETIME_RE = rf"(?P<date>{_DATE_RE})(?P<separator>[Tt ])(?P<time>{_TIME_RE})"

# Whole-string matchers (used with `.match`, which anchors the start). Each pattern is
# wrapped in a non-capturing group before `$` is appended: `_DATE_RE` and `_TIMEZONE_RE`
# contain top-level alternation, and a bare `A|B$` anchors only `B`, which would accept
# trailing garbage after `A` (e.g. "2020-01-01junk" as a date, "Zulu" as a timezone).
# Non-capturing groups leave group numbers and names unchanged.
_TIME_RE_COMPILED = re.compile(rf"(?:{_TIME_RE})$")
_DATE_RE_COMPILED = re.compile(rf"(?:{_DATE_RE})$")
_TZ_ONLY_RE_COMPILED = re.compile(rf"(?:{_TIMEZONE_RE})$")
_DATETIME_RE_COMPILED = re.compile(rf"(?:{_DATETIME_RE})$")

# Offset used to convert 12-hour clock values to 24-hour values.
_HOURS_IN_HALF_DAY = 12
38
+
39
+
40
def _is_value_node(el: Element) -> bool:
    """Return True when ``el`` carries the ``value`` or ``value-title`` class."""
    return not {"value", "value-title"}.isdisjoint(get_classes(el))
43
+
44
+
45
def _is_property_node(el: Element) -> bool:
    """Return True when any class on ``el`` passes ``is_valid_property_class``."""
    for class_name in get_classes(el):
        if is_valid_property_class(class_name):
            return True
    return False
47
+
48
+
49
def _is_microformat_root(el: Element) -> bool:
    """Return whether ``has_root_class`` accepts the classes found on ``el``."""
    class_names = get_classes(el)
    return has_root_class(class_names)
51
+
52
+
53
def _iter_value_nodes(root: Element) -> Iterator[Element]:
    """Yield the value nodes below ``root`` (never ``root`` itself) in document order.

    A value node is yielded and not descended into. Any other node that is a
    property or a microformat root is skipped together with its subtree; every
    remaining node has its children searched.
    """
    # Explicit LIFO stack; children are pushed reversed so pops come out in document order.
    pending: list[Element] = [*iter_child_elements(root)][::-1]
    while pending:
        node = pending.pop()
        if _is_value_node(node):
            yield node
        elif not (_is_property_node(node) or _is_microformat_root(node)):
            pending += [*iter_child_elements(node)][::-1]
65
+
66
+
67
def text(root: Element) -> str | None:
    """Concatenate the value-class-pattern text found below ``root``.

    Each value node contributes one string:

    * ``value-title`` nodes contribute their ``title`` attribute, or nothing
      when it is absent;
    * ``img``/``area`` contribute ``alt``, ``data``/``input`` contribute
      ``value``, ``abbr`` contributes ``title`` — each falling back to the
      node's text content when the attribute is absent;
    * any other element contributes its text content.

    Returns:
        The contributions joined without a separator, or ``None`` when no
        value node contributed anything.
    """
    # Attribute that takes precedence over text content, per tag name.
    preferred_attr = {
        "img": "alt",
        "area": "alt",
        "data": "value",
        "input": "value",
        "abbr": "title",
    }

    pieces: list[str] = []
    for node in _iter_value_nodes(root):
        if "value-title" in set(get_classes(node)):
            title_attr = get_attr(node, "title")
            if title_attr is not None:
                pieces.append(title_attr)
            continue

        attr_name = preferred_attr.get(node.name.lower())
        attr_value = get_attr(node, attr_name) if attr_name is not None else None
        pieces.append(attr_value if attr_value is not None else text_content(node))

    return "".join(pieces) if pieces else None
96
+
97
+
98
def datetime(root: Element, default_date: str | None) -> tuple[str, str | None] | None:
    """Assemble a datetime from the value nodes below ``root``.

    Args:
        root: Element whose value nodes are inspected.
        default_date: Date used when the value nodes supply a time but no date.

    Returns:
        ``None`` when there are no usable value nodes or neither a date nor a
        time was found; otherwise ``(value, date)``, where ``value`` is the
        assembled string and ``date`` is the date portion (it may be
        ``default_date``, and therefore ``None``).
    """
    # Attribute consulted first for each tag; text content is the fallback.
    attr_for_tag = {
        "img": "alt",
        "area": "alt",
        "data": "value",
        "input": "value",
        "abbr": "title",
        "del": "datetime",
        "ins": "datetime",
        "time": "datetime",
    }
    time_like_tags = {"del", "ins", "time"}

    # (stripped text, came from a <time>/<ins>/<del> element)
    collected: list[tuple[str, bool]] = []
    for node in _iter_value_nodes(root):
        if "value-title" in set(get_classes(node)):
            vt_title = get_attr(node, "title")
            if vt_title:
                collected.append((vt_title.strip(), False))
            continue

        tag_name = node.name.lower()
        attr_name = attr_for_tag.get(tag_name)
        raw = get_attr(node, attr_name) if attr_name is not None else None
        raw = raw or text_content(node)
        if raw:
            collected.append((raw.strip(), tag_name in time_like_tags))

    if not collected:
        return None

    found_date: str | None = None
    found_time: str | None = None
    time_from_time_el = False
    found_tz: str | None = None

    for chunk, is_time_el in collected:
        full = _DATETIME_RE_COMPILED.match(chunk)
        if full is not None:
            # A complete datetime only counts when nothing has been collected yet.
            if found_date is None and found_time is None and found_tz is None:
                return normalize_datetime(chunk, match=full), full.group("date")
            continue

        if found_date is None and _DATE_RE_COMPILED.match(chunk):
            found_date = chunk
            continue

        as_time = _TIME_RE_COMPILED.match(chunk)
        if found_time is None and as_time:
            found_time = chunk
            time_from_time_el = is_time_el
            embedded_tz = as_time.group("tz")
            if found_tz is None and embedded_tz:
                found_tz = embedded_tz
            continue

        if found_tz is None and _TZ_ONLY_RE_COMPILED.match(chunk):
            found_tz = chunk

    if found_date is None:
        if found_time is None:
            return None
        found_date = default_date

    if found_date and found_time:
        combined = f"{found_date} {found_time}"
    else:
        combined = found_date or found_time or ""

    if found_tz and found_time and found_tz not in combined:
        combined += found_tz

    if time_from_time_el:
        # In the official test suite, timezones with a colon originating from <time>/<ins>/<del>
        # value nodes are normalized to the compact form (e.g. -08:00 => -0800).
        combined = re.sub(r"([+-]\d{1,2}):(\d{2})$", r"\1\2", combined)

    reparsed = _DATETIME_RE_COMPILED.match(combined)
    if reparsed:
        combined = normalize_datetime(combined, match=reparsed)
    return combined, found_date
181
+
182
+
183
def normalize_datetime(dtstr: str, *, match: re.Match[str] | None = None) -> str:
    """Convert a 12-hour (AM/PM) datetime string to 24-hour form.

    Args:
        dtstr: Datetime string to normalize.
        match: Optional pre-computed match providing the ``date``,
            ``separator``, ``hour``, ``minute``, ``second``, ``ampm`` and ``tz``
            groups; when omitted, ``dtstr`` is matched against the module's
            datetime pattern.

    Returns:
        ``dtstr`` unchanged when it does not match or has no AM/PM marker;
        otherwise ``<date><separator>HH:MM[:SS]<tz>`` with the hour converted
        and missing minutes filled in as ``00``.
    """
    parsed = match or _DATETIME_RE_COMPILED.match(dtstr)
    if not parsed:
        return dtstr

    datestr, separator, hour, minute, second, ampm, tz = parsed.group(
        "date", "separator", "hour", "minute", "second", "ampm", "tz"
    )

    # Only normalize when AM/PM is present.
    if not ampm:
        return dtstr

    hour_24 = int(hour)
    marker = ampm.lower()[:1]
    if marker == "a":
        # 12 AM is midnight.
        if hour_24 == _HOURS_IN_HALF_DAY:
            hour_24 = 0
    elif marker == "p" and hour_24 < _HOURS_IN_HALF_DAY:
        hour_24 += _HOURS_IN_HALF_DAY

    clock_fields = [f"{hour_24:02d}", minute or "00"]
    if second is not None:
        clock_fields.append(second)

    return "".join((datestr, separator, ":".join(clock_fields), tz or ""))
@@ -0,0 +1,94 @@
1
+ Metadata-Version: 2.4
2
+ Name: mf2dom
3
+ Version: 0.1.9
4
+ Summary: Microformats2 (mf2) parser and renderer powered by JustHTML.
5
+ Author-email: Beto Dealmeida <contact@robida.net>
6
+ License-File: LICENSE.md
7
+ Requires-Python: >=3.11
8
+ Requires-Dist: justhtml>=0.12.0
9
+ Description-Content-Type: text/markdown
10
+
11
+ # mf2dom
12
+
13
+ Microformats2 (mf2) parser and deterministic renderer powered by JustHTML.
14
+
15
+ `mf2dom` focuses on:
16
+ - Correct mf2 parsing (validated against the official `microformats-tests` suite)
17
+ - Deterministic HTML rendering and stable round-trips (`HTML -> mf2 -> HTML`)
18
+ - A small runtime surface area (no network I/O, no BeautifulSoup)
19
+
20
+ ## Installation
21
+
22
+ ```bash
23
+ pip install mf2dom
24
+ ```
25
+
26
+ Requires Python 3.11+.
27
+
28
+ ## Quickstart
29
+
30
+ Parse HTML into mf2 JSON:
31
+
32
+ ```python
33
+ import mf2dom
34
+
35
+ html = '<a class="h-card u-url p-name" href="/me">Alice</a>'
36
+ doc = mf2dom.parse(html, base_url="https://example.com/")
37
+ print(doc["items"])
38
+ ```
39
+
40
+ The parsed document is a dict with `items`, `rels`, and `rel-urls` keys (mf2 JSON shape).
41
+
42
+ Render mf2 JSON back into canonical HTML:
43
+
44
+ ```python
45
+ html2 = mf2dom.render(doc)
46
+ ```
47
+
48
+ Async parsing (offloads to a thread):
49
+
50
+ ```python
51
+ doc = await mf2dom.parse_async(html, base_url="https://example.com/")
52
+ ```
53
+
54
+ ## API
55
+
56
+ - `mf2dom.parse(html, *, base_url=None, url=None) -> dict`
57
+ - `html` can be a string/bytes, a `justhtml.JustHTML` instance, or a JustHTML root node.
58
+ - `base_url` controls resolution of relative URLs (preferred). `url` is a deprecated alias.
59
+ - `mf2dom.parse_async(...)` runs `parse(...)` in a worker thread via `asyncio.to_thread(...)`.
60
+ - `mf2dom.render(doc) -> str` renders a deterministic HTML representation of an mf2 document.
61
+
62
+ ## Why mf2dom vs mf2py?
63
+
64
+ Both libraries parse microformats, but they optimize for different use cases:
65
+
66
+ - Choose `mf2dom` if you need deterministic rendering, stable round-trips, and a smaller/no-network
67
+ runtime surface (useful for normalization, caching, and “canonical mf2 HTML” fixtures).
68
+ - Choose `mf2py` if you need URL fetching, microformats1 compatibility, metaformats support, or
69
+ wider Python version support.
70
+
71
+ ## Testing & correctness
72
+
73
+ - Official parsing fixtures: `tests/test_official_microformats_suite.py` runs the upstream
74
+ `microformats-tests` JSON fixtures.
75
+ - Coverage gate: `pyproject.toml` enforces 100% branch coverage.
76
+
77
+ To run the official fixture suite locally, check out `microformats-tests` as a sibling directory:
78
+
79
+ ```bash
80
+ git clone https://github.com/microformats/microformats-tests ../microformats-tests
81
+ ```
82
+
83
+ ## Development (uv)
84
+
85
+ ```bash
86
+ uv sync --group dev
87
+ uv run pytest
88
+ uv run coverage run -m pytest && uv run coverage report
89
+ uv run pre-commit install
90
+ ```
91
+
92
+ ## License
93
+
94
+ AGPL 3
@@ -0,0 +1,15 @@
1
+ mf2dom/__init__.py,sha256=YmTk1FZ8B2u6J2LfZETnxxuNWa88pCcpYDXdBaaFlt0,215
2
+ mf2dom/classes.py,sha256=39YrnFFTh1onGs_EIRkV-EFFEBm2atfpcfbzDydMtps,2921
3
+ mf2dom/dom.py,sha256=GgKSjwcrWrn8Dow-SuDf295ZgwopvhyZMkAMcshQACc,4061
4
+ mf2dom/implied.py,sha256=P8HrXX1JLNzZGl2psstd4qnfAHYuPHjsG-qUD4mjqnc,5893
5
+ mf2dom/parser.py,sha256=dZTzs2Q-dKM_FuA5jnLd6-VNlpTLJyYGnkgQwJvaQnc,13372
6
+ mf2dom/properties.py,sha256=BFtVsntKzN8yu52bjjjSTPwK674E48wfRneY314k41w,7629
7
+ mf2dom/renderer.py,sha256=UkWYEpJeRbxF-83bnqsNPTgTETpdaUaU1SoUSSmICSs,19261
8
+ mf2dom/text.py,sha256=g15f58Oh-tSUrOSCNK661YeGLTaamttlNrrgixF8g2I,1827
9
+ mf2dom/types.py,sha256=mKRe9UaVVV0HlMJRefUSbHCsbFBvIKo6xlv7eLhRSuk,1100
10
+ mf2dom/urls.py,sha256=VeyRxYNl3CyZu-dxLIWf7b1GF0Bol4yEVTOLHDS9F_M,895
11
+ mf2dom/vcp.py,sha256=d8nH7ORA1P5lJe-HwdQInmmIZPHEjTAuT_6KPHEvbqg,7001
12
+ mf2dom-0.1.9.dist-info/METADATA,sha256=jj5scPTQIunaAPOmXtYVnNLDQJzxSsyn6Z0IlVrq344,2643
13
+ mf2dom-0.1.9.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
14
+ mf2dom-0.1.9.dist-info/licenses/LICENSE.md,sha256=MqCnOBu8uXsEOzRZWh9EBVfVz-kE9NkXcLCrtGXo2yU,34354
15
+ mf2dom-0.1.9.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any