mf2dom 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mf2dom/parser.py ADDED
@@ -0,0 +1,395 @@
1
+ """Microformats2 parser.
2
+
3
+ Implements the mf2 parsing algorithm:
4
+ https://microformats.org/wiki/microformats2-parsing
5
+
6
+ Entry points:
7
+ - `parse(...)` for synchronous parsing
8
+ - `parse_async(...)` for running parsing off-thread
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import asyncio
14
+ from collections import defaultdict
15
+ from dataclasses import dataclass
16
+ from typing import TYPE_CHECKING, Literal, TypeAlias, cast
17
+
18
+ from justhtml import JustHTML
19
+
20
+ from .dom import (
21
+ Element,
22
+ HasDom,
23
+ ancestor_elements,
24
+ get_attr,
25
+ iter_child_elements,
26
+ iter_descendant_elements,
27
+ )
28
+ from .implied import implied_name, implied_photo, implied_url
29
+ from .properties import (
30
+ is_microformat_root,
31
+ parse_dt,
32
+ parse_e,
33
+ parse_p,
34
+ parse_u,
35
+ property_classes,
36
+ root_types,
37
+ )
38
+ from .text import text_content
39
+ from .types import EValue, Mf2Document, Mf2Item
40
+ from .urls import try_urljoin
41
+
42
+ if TYPE_CHECKING: # pragma: no cover
43
+ from .types import PropertyValue, RelUrl
44
+
45
+
46
+ @dataclass(slots=True)
47
+ class _ParseContext:
48
+ base_url: str | None
49
+ document_lang: str | None
50
+
51
+
52
# Everything `parse()` accepts as its document argument: raw markup (text or
# bytes-like), a JustHTML document, or an object exposing the DOM interface.
HtmlInput: TypeAlias = (
    str | bytes | bytearray | memoryview | JustHTML | HasDom
)
53
+
54
+
55
def _first_lang(doc_root: HasDom) -> str | None:
    """Return the `lang` attribute of the first `<html>` element, or None.

    None is returned both when there is no `<html>` element and when the
    first one has no `lang` attribute.
    """
    html_element = next(
        (node for node in iter_descendant_elements(doc_root) if node.name.lower() == "html"),
        None,
    )
    if html_element is None:
        return None
    return get_attr(html_element, "lang")
60
+
61
+
62
def _discover_base_url(doc_root: HasDom, *, base_url: str | None) -> str | None:
    """Return the effective base URL after honouring the first `<base>` element.

    Only the first `<base>` in document order is considered. If it has no
    usable `href`, the caller-supplied `base_url` is returned unchanged.
    """
    first_base = next(
        (node for node in iter_descendant_elements(doc_root) if node.name.lower() == "base"),
        None,
    )
    if first_base is None:
        return base_url

    href = get_attr(first_base, "href")
    if not href:
        return base_url

    # An absolute href replaces the supplied URL; a relative one is joined
    # against it. Fall back to the raw href when joining yields nothing.
    return try_urljoin(base_url, href) or href
74
+
75
+
76
+ def _split_tokens(value: str | None) -> list[str]:
77
+ if not value:
78
+ return []
79
+ return [t for t in value.split() if t]
80
+
81
+
82
+ _REL_ATTRS: tuple[Literal["media"], Literal["hreflang"], Literal["type"], Literal["title"]] = (
83
+ "media",
84
+ "hreflang",
85
+ "type",
86
+ "title",
87
+ )
88
+
89
+
90
def _parse_rels(
    doc_root: HasDom, *, base_url: str | None
) -> tuple[dict[str, list[str]], dict[str, RelUrl]]:
    """Collect the `rels` and `rel-urls` maps from `<a>`, `<area>` and `<link>`.

    Returns:
        A pair `(rels, rel_urls)`. `rels` maps each rel token to the distinct
        resolved URLs carrying it, in document order. `rel_urls` maps each
        resolved URL to its rel tokens plus the text and optional attributes
        of the first element that supplied them.
    """
    link_tags = {"a", "area", "link"}
    urls_by_rel: dict[str, list[str]] = defaultdict(list)
    info_by_url: dict[str, RelUrl] = {}

    for el in iter_descendant_elements(doc_root):
        if el.name.lower() not in link_tags:
            continue
        tokens = _split_tokens(get_attr(el, "rel"))
        if not tokens:
            continue
        href = get_attr(el, "href")
        if href is None:
            continue
        target = try_urljoin(base_url, href) or href

        info = info_by_url.setdefault(target, {"rels": []})
        known_rels = info["rels"]
        for token in tokens:
            if target not in urls_by_rel[token]:
                urls_by_rel[token].append(target)
            if token not in known_rels:
                known_rels.append(token)

        # The first element seen for a URL wins for text and attributes.
        if "text" not in info:
            info["text"] = text_content(el)
        for attr in _REL_ATTRS:
            attr_value = get_attr(el, attr)
            if attr_value is not None and attr_value != "":
                info.setdefault(attr, attr_value)

    return dict(urls_by_rel), info_by_url
129
+
130
+
131
def _has_ancestor_microformat_root(el: Element) -> bool:
    """Return True when any ancestor of `el` is a microformat root."""
    for ancestor in ancestor_elements(el):
        if is_microformat_root(ancestor):
            return True
    return False
133
+
134
+
135
def _top_level_roots(doc_root: HasDom) -> list[Element]:
    """Return the microformat roots that are not nested inside another root.

    Elements are returned in the order `iter_descendant_elements` yields them.
    """
    return [
        candidate
        for candidate in iter_descendant_elements(doc_root)
        if is_microformat_root(candidate) and not _has_ancestor_microformat_root(candidate)
    ]
144
+
145
+
146
def _is_property_for_parent(el: Element) -> bool:
    """Return True when `el` carries at least one mf2 property class."""
    return True if property_classes(el) else False
148
+
149
+
150
def _parse_item(
    el: Element,
    ctx: _ParseContext,
    *,
    parent_lang: str | None,
    ignore_root_property_classes: frozenset[str] = frozenset(),
) -> Mf2Item:
    """Parse one microformat root element into an mf2 item.

    Args:
        el: The root element (it carries the `h-*` classes).
        ctx: Document-wide base URL and language.
        parent_lang: Language inherited from the enclosing item or document,
            used when `el` has no `lang` attribute of its own.
        ignore_root_property_classes: Property classes on `el` itself that
            belong to the parent item and must not become properties here.

    Returns:
        The item with `type` and `properties`, plus `id` when the element has
        a non-empty id and `children` when nested roots without property
        classes were found.
    """
    item: Mf2Item = {"type": root_types(el), "properties": {}}
    if element_id := get_attr(el, "id"):
        item["id"] = element_id

    root_lang = get_attr(el, "lang") or parent_lang
    props: dict[str, list[PropertyValue]] = defaultdict(list)
    children: list[Mf2Item] = []
    # Date of the most recent dt-* value that had one; passed to later
    # dt-* parsing as the default date.
    default_date: str | None = None
    # Explicit property kinds seen so far; they suppress implied properties.
    seen_p = False
    seen_u = False
    seen_e = False
    seen_nested_root = False

    def add_prop(name: str, value: PropertyValue) -> None:
        props[name].append(value)

    def note_prefix(prop_class: str) -> None:
        """Record that a p-/u-/e- property was encountered (dt- is not tracked)."""
        nonlocal seen_p, seen_u, seen_e
        if prop_class.startswith("p-"):
            seen_p = True
        elif prop_class.startswith("u-"):
            seen_u = True
        elif prop_class.startswith("e-"):
            seen_e = True

    def simple_value(prop_class: str, target: Element) -> PropertyValue:
        """Parse `target` according to the prefix of `prop_class`."""
        nonlocal default_date
        if prop_class.startswith("p-"):
            return parse_p(target, base_url=ctx.base_url)
        if prop_class.startswith("u-"):
            return parse_u(target, base_url=ctx.base_url)
        if prop_class.startswith("dt-"):
            parsed = parse_dt(target, default_date=default_date)
            if parsed.date:
                default_date = parsed.date
            return parsed.value
        if prop_class.startswith("e-"):
            return parse_e(
                target,
                base_url=ctx.base_url,
                root_lang=root_lang,
                document_lang=ctx.document_lang,
            )
        return ""  # pragma: no cover

    def descendant_name_class_info(root: Element) -> tuple[bool, bool]:
        """Return (has_p_name, has_any_typed_name) for the descendants of `root`.

        Nested microformat roots are inspected themselves but not descended into.
        """
        other_name_tokens = {"u-name", "dt-name", "e-name"}
        found_typed_name = False
        pending: list[Element] = list(iter_child_elements(root))
        while pending:
            current = pending.pop()
            classes = property_classes(current)
            if classes:
                if "p-name" in classes:
                    # A p-name anywhere settles both answers.
                    return True, True
                if any(pc in other_name_tokens for pc in classes):
                    found_typed_name = True
            if not is_microformat_root(current):
                pending.extend(iter_child_elements(current))
        return False, found_typed_name

    def embedded_value(prop_class: str, embedded_item: Mf2Item, target: Element) -> PropertyValue:
        """Compute the `value` of a nested item that is also a property."""
        nested_props = embedded_item.get("properties")
        if not isinstance(nested_props, dict):
            return simple_value(prop_class, target)  # pragma: no cover

        if prop_class.startswith("u-"):
            saw_url_prop = False
            for key in ("url", "uid"):
                vals = nested_props.get(key)
                if not isinstance(vals, list) or not vals:
                    continue
                saw_url_prop = True
                first = vals[0]
                url = first if isinstance(first, str) else str(first.get("value", ""))  # type: ignore[union-attr]
                if url.startswith(("http://", "https://")):
                    return first
            if saw_url_prop:
                # If we have a URL property but it's not an absolute URL, fall back to plain text.
                return parse_p(target, base_url=ctx.base_url)
            # Otherwise, parse the `u-*` value from the element itself (URL join behavior).
            return parse_u(target, base_url=ctx.base_url)

        if prop_class.startswith("p-"):
            names = nested_props.get("name")
            if (
                isinstance(names, list)
                and names
                and isinstance(names[0], str)
                and not names[0].startswith(("http://", "https://"))
            ):
                has_p_name, has_any_typed_name = descendant_name_class_info(target)
                # Favor the embedded `p-name` value; otherwise only use implied name.
                if has_p_name or not has_any_typed_name:
                    return names[0]
            return parse_p(target, base_url=ctx.base_url)

        return simple_value(prop_class, target)

    def handle_property_class(prop_class: str, target: Element) -> None:
        """Parse one plain property class on `target` and store its value."""
        if not prop_class.startswith(("p-", "u-", "dt-", "e-")):  # pragma: no cover
            return
        note_prefix(prop_class)
        add_prop(prop_class.split("-", 1)[1], simple_value(prop_class, target))

    def walk(node: Element, *, is_root: bool) -> None:
        nonlocal seen_nested_root
        if not is_root and is_microformat_root(node):
            seen_nested_root = True
            node_props = property_classes(node)
            nested = _parse_item(
                node,
                ctx,
                parent_lang=root_lang,
                ignore_root_property_classes=frozenset(node_props),
            )
            if not node_props:
                # A nested root without property classes is a plain child item.
                children.append(nested)
                return
            for pc in node_props:
                note_prefix(pc)
                # Each property gets its own shallow copy so that `value`
                # (and html/lang) can differ per property class.
                embedded = cast(Mf2Item, dict(nested))
                val = embedded_value(pc, nested, node)
                if pc.startswith("e-") and isinstance(val, dict):
                    e_val = cast(EValue, val)
                    embedded["value"] = e_val["value"]
                    embedded["html"] = e_val["html"]
                    if "lang" in e_val:
                        embedded["lang"] = e_val["lang"]
                else:
                    embedded["value"] = val
                add_prop(pc.split("-", 1)[1], embedded)
            return

        pcs = property_classes(node)
        if is_root and ignore_root_property_classes:
            pcs = [pc for pc in pcs if pc not in ignore_root_property_classes]
        for pc in pcs:
            handle_property_class(pc, node)

        for child in iter_child_elements(node):
            walk(child, is_root=False)

    walk(el, is_root=True)

    # Apply implied properties if missing.
    if not seen_nested_root:
        if "name" not in props and not seen_p and not seen_e:
            props["name"].append(implied_name(el, ctx.base_url))
        if not seen_u:
            if "photo" not in props:
                photo = implied_photo(el, ctx.base_url)
                if photo is not None:
                    props["photo"].append(photo)
            if "url" not in props:
                url = implied_url(el, ctx.base_url)
                if url is not None:
                    props["url"].append(url)

    item["properties"] = dict(props)
    if children:
        item["children"] = children
    return item
348
+
349
+
350
def parse(
    html: HtmlInput | None,
    *,
    base_url: str | None = None,
    url: str | None = None,
) -> Mf2Document:
    """Parse Microformats2 JSON from HTML or a JustHTML document.

    Args:
        html: HTML markup, a JustHTML instance, or a JustHTML root node.
        base_url: Base URL for resolving relative URLs. Prefer this parameter.
        url: Deprecated alias for `base_url`.

    Returns:
        A dict containing `items`, `rels`, and `rel-urls`.

    Raises:
        ValueError: If `base_url` and `url` are both given and differ.
    """
    if base_url is None:
        base_url = url
    elif url is not None and url != base_url:
        msg = "Provide only one of `base_url` or `url`."
        raise ValueError(msg)

    if isinstance(html, JustHTML):
        dom = html.root
    elif html is None or isinstance(html, (str, bytes, bytearray, memoryview)):
        dom = JustHTML(html).root
    else:
        # Anything else is taken to already be a DOM root.
        dom = html
    doc_root = cast(HasDom, dom)

    effective_base = _discover_base_url(doc_root, base_url=base_url)
    document_lang = _first_lang(doc_root)
    ctx = _ParseContext(base_url=effective_base, document_lang=document_lang)

    rels, rel_urls = _parse_rels(doc_root, base_url=effective_base)
    items = []
    for root in _top_level_roots(doc_root):
        items.append(_parse_item(root, ctx, parent_lang=document_lang))
    return {"items": items, "rels": rels, "rel-urls": rel_urls}
387
+
388
+
389
async def parse_async(
    html: HtmlInput | None,
    *,
    base_url: str | None = None,
    url: str | None = None,
) -> Mf2Document:
    """Run `parse` via `asyncio.to_thread` and return its result.

    Takes the same arguments as `parse`.
    """
    document = await asyncio.to_thread(parse, html, base_url=base_url, url=url)
    return document
mf2dom/properties.py ADDED
@@ -0,0 +1,257 @@
1
+ """Property parsing for mf2 (`p-`, `u-`, `dt-`, `e-`) and microformat detection."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from dataclasses import dataclass
7
+ from typing import TYPE_CHECKING, Any
8
+
9
+ from justhtml.constants import VOID_ELEMENTS
10
+
11
+ from .classes import has_root_class
12
+ from .classes import property_classes as mf2_property_classes
13
+ from .classes import root_types as mf2_root_types
14
+ from .dom import (
15
+ Element,
16
+ get_attr,
17
+ get_classes,
18
+ is_element,
19
+ iter_preorder_elements,
20
+ )
21
+ from .text import text_content
22
+ from .urls import parse_srcset, try_urljoin
23
+ from .vcp import _DATE_RE, _DATETIME_RE_COMPILED, _TIME_RE, normalize_datetime
24
+ from .vcp import datetime as vcp_datetime
25
+ from .vcp import text as vcp_text
26
+
27
+ if TYPE_CHECKING: # pragma: no cover
28
+ from .types import EValue, UrlObject, UrlValue
29
+
30
+
31
+ @dataclass(slots=True)
32
+ class DtResult:
33
+ value: str
34
+ date: str | None
35
+
36
+
37
def parse_p(el: Element, *, base_url: str | None) -> str:
    """Parse a `p-*` (plain text) property value from `el`.

    Precedence: value-class-pattern text, then a tag-specific attribute
    (`title` on abbr/link, `value` on data/input, `alt` on img/area), then
    the element's stripped text content with images replaced.
    """
    if (vcp := vcp_text(el)) is not None:
        return vcp

    tag = el.name.lower()
    # The tag groups are disjoint, so at most one attribute applies.
    attribute_sources = (
        ("title", {"abbr", "link"}),
        ("value", {"data", "input"}),
        ("alt", {"img", "area"}),
    )
    for attr_name, tags in attribute_sources:
        if tag in tags:
            attr_value = get_attr(el, attr_name)
            if attr_value is not None:
                return attr_value

    return text_content(el, replace_img=True, img_to_src=False, base_url=base_url).strip()
55
+
56
+
57
def _img_value(img: Element, base_url: str | None) -> UrlValue | None:
    """Return the URL value for an `<img>`, or None when it has no `src`.

    The result is a plain URL string unless the image has an `alt` attribute
    or a non-empty `srcset`, in which case it is a dict with `value` plus
    `alt` and/or `srcset`.
    """
    src = get_attr(img, "src")
    if src is None:
        return None

    resolved = try_urljoin(base_url, src) or src
    alt = get_attr(img, "alt")
    srcset = get_attr(img, "srcset")
    if alt is None and not srcset:
        return resolved

    rich: UrlObject = {"value": resolved}
    if alt is not None:
        rich["alt"] = alt
    if srcset:
        rich["srcset"] = parse_srcset(srcset, base_url)
    return rich
72
+
73
+
74
def parse_u(el: Element, *, base_url: str | None) -> UrlValue:
    """Parse a `u-*` (URL) property value from `el`.

    Sources are tried in order: `href` on a/area/link; the `<img>` value;
    `src` on audio/video/source/iframe; `poster` on video; `data` on object;
    value-class-pattern text; `title` on abbr; `value` on data/input; and
    finally the stripped text content. String results are resolved against
    `base_url`, keeping the raw value when resolution yields nothing.
    """

    def resolve(raw: str) -> str:
        return try_urljoin(base_url, raw) or raw

    tag = el.name.lower()

    if tag in {"a", "area", "link"}:
        href = get_attr(el, "href")
        if href is not None:
            return resolve(href)

    if tag == "img":
        img = _img_value(el, base_url)
        if img is not None:
            return img

    # Order matters for <video>: `src` is preferred over `poster`.
    media_sources = (
        ("src", {"audio", "video", "source", "iframe"}),
        ("poster", {"video"}),
        ("data", {"object"}),
    )
    for attr_name, tags in media_sources:
        if tag in tags:
            raw = get_attr(el, attr_name)
            if raw is not None:
                return resolve(raw)

    vcp = vcp_text(el)
    if vcp is not None:
        return resolve(vcp)

    if tag == "abbr":
        title = get_attr(el, "title")
        if title is not None:
            return resolve(title)

    if tag in {"data", "input"}:
        value = get_attr(el, "value")
        if value is not None:
            return resolve(value)

    return resolve(text_content(el).strip())
113
+
114
+
115
# The shared time pattern with an end anchor appended, used to recognise
# values that consist of a time only.
_TIME_ONLY_RE = re.compile(f"{_TIME_RE}$")
# Local alias for the precompiled date-time pattern imported from `.vcp`.
_DATETIME_RE = _DATETIME_RE_COMPILED
117
+
118
+
119
def parse_dt(el: Element, *, default_date: str | None) -> DtResult:
    """Parse a `dt-*` (date-time) property value from `el`.

    Args:
        el: Element carrying the `dt-*` class.
        default_date: Date used to complete a time-only value, if available.

    Returns:
        The value to emit and the date found in it (None when no date was
        recognised).
    """
    vcp = vcp_datetime(el, default_date)
    if vcp is not None:
        return DtResult(value=vcp[0], date=vcp[1])

    # Pick the attribute that may carry the machine-readable value for this tag.
    tag = el.name.lower()
    if tag in {"time", "ins", "del"}:
        attr_name = "datetime"
    elif tag == "abbr":
        attr_name = "title"
    elif tag in {"data", "input"}:
        attr_name = "value"
    else:
        attr_name = None

    attr_value = None if attr_name is None else get_attr(el, attr_name)
    from_attr = attr_value is not None
    prop_value = attr_value if from_attr else text_content(el)
    stripped = prop_value.strip()

    # A bare time is completed with the default date when one is available.
    if default_date and _TIME_ONLY_RE.match(stripped):
        combined = f"{default_date} {stripped}"
        combined_match = _DATETIME_RE.match(combined)
        return DtResult(
            value=normalize_datetime(combined, match=combined_match),
            date=default_date,
        )

    match = _DATETIME_RE.match(stripped)
    if match is None:
        date_only = re.match(_DATE_RE + "$", stripped)
        return DtResult(
            value=(prop_value if from_attr else stripped),
            date=(date_only.group(0) if date_only else None),
        )

    normalized = normalize_datetime(stripped, match=match)
    # If normalization didn't change (no AM/PM), preserve original attribute spacing.
    keep_raw = from_attr and normalized == stripped
    return DtResult(value=prop_value if keep_raw else normalized, date=match.group("date"))
171
+
172
+
173
+ _URL_ATTRS_IN_E = ("href", "src", "cite", "data", "poster")
174
+
175
+
176
+ def _inner_html(el: Element) -> str:
177
+ return "".join(_serialize_node(child) for child in el.children or [])
178
+
179
+
180
+ def _escape_text(text: str) -> str:
181
+ return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
182
+
183
+
184
+ def _escape_attr_value(value: str) -> str:
185
+ return value.replace("&", "&amp;").replace('"', "&quot;")
186
+
187
+
188
def _serialize_element(el: Element) -> str:
    """Serialize `el`, its attributes and its subtree to HTML markup.

    Attributes whose value is None are written as bare names. Elements whose
    lower-cased name is in `VOID_ELEMENTS` get no children and no end tag.
    """
    name = el.name
    attr_markup = "".join(
        f" {key}" if value is None else f' {key}="{_escape_attr_value(str(value))}"'
        for key, value in (el.attrs or {}).items()
    )
    start = f"<{name}{attr_markup}>"
    if name.lower() in VOID_ELEMENTS:
        return start
    inner = "".join(_serialize_node(child) for child in el.children or [])
    return f"{start}{inner}</{name}>"
202
+
203
+
204
+ def _serialize_node(node: Any) -> str:
205
+ name = getattr(node, "name", "")
206
+ if name == "#text":
207
+ data = getattr(node, "data", None)
208
+ return _escape_text(str(data)) if data is not None else ""
209
+ if name == "#comment":
210
+ data = getattr(node, "data", "") or ""
211
+ return f"<!--{data}-->"
212
+ if name in {"!doctype", "#document", "#document-fragment"}:
213
+ return "".join(_serialize_node(child) for child in getattr(node, "children", None) or [])
214
+ if name.lower() == "template":
215
+ return ""
216
+ if is_element(node):
217
+ return _serialize_element(node)
218
+ return ""
219
+
220
+
221
def parse_e(
    el: Element,
    *,
    base_url: str | None,
    root_lang: str | None,
    document_lang: str | None,
) -> EValue:
    """Parse an `e-*` (embedded markup) property value from `el`.

    Args:
        el: Element carrying the `e-*` class. It is not modified; URL
            rewriting happens on a deep clone.
        base_url: Base URL used to resolve URL-bearing attributes.
        root_lang: Language of the enclosing item, used when `el` has no
            `lang` attribute.
        document_lang: Document language, the last fallback.

    Returns:
        A dict with `value` (stripped text content of `el`), `html` (the
        stripped serialized children of the clone) and, when a language is
        known, `lang`.
    """
    clone = el.clone_node(deep=True)  # type: ignore[attr-defined]
    for tag in iter_preorder_elements(clone):
        for attr in _URL_ATTRS_IN_E:
            val = get_attr(tag, attr)
            if val is not None:
                # Keep the original value when resolution yields nothing, as
                # the other `try_urljoin` call sites in this module do.
                # Otherwise the attribute would be overwritten with None and
                # serialized as a bare attribute, losing the URL.
                # NOTE(review): assumes try_urljoin can return a falsy value
                # for unresolvable input — confirm against `.urls`.
                tag.attrs[attr] = try_urljoin(base_url, val) or val

    # `html` is inserted up front so the key order stays value, html, lang.
    out: EValue = {
        "value": text_content(el, replace_img=True, base_url=base_url).strip(),
        "html": "",
    }

    lang = get_attr(el, "lang") or root_lang or document_lang
    if lang:
        out["lang"] = lang

    out["html"] = _inner_html(clone).strip()
    return out
246
+
247
+
248
def is_microformat_root(el: Element) -> bool:
    """Return whether the classes of `el` include a microformat root class."""
    class_names = get_classes(el)
    return has_root_class(class_names)
250
+
251
+
252
def root_types(el: Element) -> list[str]:
    """Return the microformat root types derived from the classes of `el`."""
    class_names = get_classes(el)
    return mf2_root_types(class_names)
254
+
255
+
256
def property_classes(el: Element) -> list[str]:
    """Return the mf2 property classes derived from the classes of `el`."""
    class_names = get_classes(el)
    return mf2_property_classes(class_names)