aimd-html 0.9.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
aimd_html/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """Defuddle-backed HTML extraction helpers."""
2
+
3
+ from .defuddle import DefuddleResult, extract_html_with_defuddle
4
+
5
+ __all__ = ["DefuddleResult", "extract_html_with_defuddle"]
aimd_html/defuddle.py ADDED
@@ -0,0 +1,58 @@
1
+ """Small Python wrapper around the Defuddle CLI.
2
+
3
+ Defuddle is distributed as a Node/TypeScript package. This wrapper keeps the
4
+ Python package boundary explicit and invokes ``npx defuddle`` when callers opt
5
+ into HTML extraction.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass
11
+ import json
12
+ from pathlib import Path
13
+ import subprocess
14
+
15
+
16
+ @dataclass(slots=True, frozen=True)
17
+ class DefuddleResult:
18
+ """Parsed HTML content and metadata returned by Defuddle."""
19
+
20
+ content: str
21
+ title: str | None = None
22
+ author: str | None = None
23
+ source: str | None = None
24
+
25
+
26
class _DefuddleOutputError(RuntimeError, ValueError):
    """Defuddle exited successfully but printed output that cannot be used.

    Also derives from ``ValueError`` so that callers which already catch the
    ``json.JSONDecodeError`` (a ``ValueError``) raised for malformed output
    keep working.
    """


def extract_html_with_defuddle(
    source: str | Path,
    *,
    markdown: bool = True,
    npx_command: str = "npx",
    timeout: float | None = None,
) -> DefuddleResult:
    """Extract readable content from a URL or local HTML file via Defuddle.

    Runs ``<npx_command> defuddle parse <source> --json`` and converts the
    JSON object printed on stdout into a :class:`DefuddleResult`. Nothing is
    written to the child's stdin, so piped HTML is not supported.

    Args:
        source: URL or local file path handed to ``defuddle parse``.
        markdown: When true, ``--markdown`` is appended to the command.
        npx_command: Executable used to launch Defuddle. It is resolved
            against ``PATH`` first so platform shims (e.g. ``npx.cmd``) are
            found; if resolution fails the value is used unchanged.
        timeout: Optional limit in seconds for the child process. ``None``
            (the default) waits indefinitely.

    Returns:
        A ``DefuddleResult`` whose ``content`` is the payload's
        ``contentMarkdown`` or ``content`` value (empty string if both are
        missing or empty), with ``title``, ``author`` and ``source`` (falling
        back to ``url``) copied from the payload.

    Raises:
        RuntimeError: The command exited with a non-zero status, or it
            succeeded but stdout was not a JSON object.
        OSError: The executable could not be started (typically
            ``FileNotFoundError`` when Node.js/npm is not installed).
        subprocess.TimeoutExpired: ``timeout`` elapsed before the command
            finished.
    """
    import shutil

    target = str(source)
    if target.startswith("-"):
        # A leading dash can never be a URL, so this is a relative file name.
        # Prefix it so the CLI's option parser does not treat it as a flag.
        target = f"./{target}"

    executable = shutil.which(npx_command) or npx_command
    command = [executable, "defuddle", "parse", target, "--json"]
    if markdown:
        command.append("--markdown")

    completed = subprocess.run(
        command,
        capture_output=True,
        # Decode explicitly as UTF-8 instead of relying on the locale
        # encoding that ``text=True`` would pick.
        encoding="utf-8",
        errors="replace",
        timeout=timeout,
        check=False,
    )
    if completed.returncode != 0:
        stderr = completed.stderr.strip() or completed.stdout.strip()
        raise RuntimeError(f"defuddle failed: {stderr}")

    try:
        payload = json.loads(completed.stdout)
    except json.JSONDecodeError as err:
        raise _DefuddleOutputError(
            f"defuddle returned invalid JSON: {err}"
        ) from err
    if not isinstance(payload, dict):
        raise _DefuddleOutputError(
            "defuddle returned unexpected JSON: expected an object, "
            f"got {type(payload).__name__}"
        )

    content = payload.get("contentMarkdown") or payload.get("content") or ""
    return DefuddleResult(
        content=content,
        title=payload.get("title"),
        author=payload.get("author"),
        source=payload.get("source") or payload.get("url"),
    )
@@ -0,0 +1,7 @@
1
+ Metadata-Version: 2.3
2
+ Name: aimd-html
3
+ Version: 0.9.2
4
+ Summary: Defuddle-backed HTML extraction helpers for aimd workflows.
5
+ Author: Shu Li
6
+ Author-email: Shu Li <zetarylee@gmail.com>
7
+ Requires-Python: >=3.10, <3.13
@@ -0,0 +1,5 @@
1
+ aimd_html/__init__.py,sha256=CdQEPlBc7E_fAOmf5QWw5rEgJHxs-fdzYH0Ty7K78w0,173
2
+ aimd_html/defuddle.py,sha256=x4DKvyqFWMUrQiemaqOID8-TpC58BlYVElsMTlhXlnY,1661
3
+ aimd_html-0.9.2.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
4
+ aimd_html-0.9.2.dist-info/METADATA,sha256=JGTGx-qSU97oHJb67ZwEogPPK2TwkSQ0zjv-zZ4HXSk,211
5
+ aimd_html-0.9.2.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: uv 0.8.24
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any