aimd-html 0.9.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aimd_html/__init__.py +5 -0
- aimd_html/defuddle.py +58 -0
- aimd_html-0.9.2.dist-info/METADATA +7 -0
- aimd_html-0.9.2.dist-info/RECORD +5 -0
- aimd_html-0.9.2.dist-info/WHEEL +4 -0
aimd_html/__init__.py
ADDED
aimd_html/defuddle.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Small Python wrapper around the Defuddle CLI.
|
|
2
|
+
|
|
3
|
+
Defuddle is distributed as a Node/TypeScript package. This wrapper keeps the
|
|
4
|
+
Python package boundary explicit and invokes ``npx defuddle`` when callers opt
|
|
5
|
+
into HTML extraction.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
import json
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
import subprocess
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass(slots=True, frozen=True)
|
|
17
|
+
class DefuddleResult:
|
|
18
|
+
"""Parsed HTML content and metadata returned by Defuddle."""
|
|
19
|
+
|
|
20
|
+
content: str
|
|
21
|
+
title: str | None = None
|
|
22
|
+
author: str | None = None
|
|
23
|
+
source: str | None = None
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def extract_html_with_defuddle(
    source: str | Path,
    *,
    markdown: bool = True,
    npx_command: str = "npx",
    timeout: float | None = None,
) -> DefuddleResult:
    """Extract readable content from a URL or local HTML file via Defuddle.

    Runs ``<npx_command> defuddle parse <source> --json`` (adding
    ``--markdown`` when requested) and maps the JSON object printed on
    stdout onto a ``DefuddleResult``. Requires Node.js/npm at runtime.

    Args:
        source: URL or local file path; passed to the CLI as ``str(source)``.
        markdown: When true, append ``--markdown`` to the command line.
        npx_command: Executable used to launch Defuddle.
        timeout: Seconds to wait for the process to finish. ``None`` (the
            default) waits indefinitely.

    Returns:
        A ``DefuddleResult`` whose ``content`` is the payload's
        ``contentMarkdown`` if non-empty, else its ``content``, else ``""``;
        ``source`` falls back from the payload's ``source`` to its ``url``.

    Raises:
        RuntimeError: The process exited with a non-zero status, or its
            output was valid JSON but not a JSON object.
        FileNotFoundError: The launcher executable could not be found.
        json.JSONDecodeError: Stdout of a successful run was not valid JSON.
        subprocess.TimeoutExpired: ``timeout`` elapsed before completion.
    """
    import shutil  # local import: the only name this block adds to the file

    # Resolve the launcher through PATH (and PATHEXT on Windows, where npx is
    # typically installed as ``npx.cmd`` and a bare "npx" cannot be started
    # without a shell). Fall back to the raw name so a missing executable
    # still surfaces as the usual FileNotFoundError from subprocess.
    launcher = shutil.which(npx_command) or npx_command

    command = [launcher, "defuddle", "parse", str(source), "--json"]
    if markdown:
        command.append("--markdown")

    completed = subprocess.run(
        command,
        capture_output=True,
        # Decode as UTF-8 explicitly instead of relying on the locale
        # encoding, which is not UTF-8 on every platform.
        encoding="utf-8",
        # A stray undecodable byte in diagnostics must not mask the real
        # failure with a UnicodeDecodeError.
        errors="replace",
        check=False,
        timeout=timeout,
    )
    if completed.returncode != 0:
        # Prefer stderr; some failures are only reported on stdout.
        stderr = completed.stderr.strip() or completed.stdout.strip()
        raise RuntimeError(f"defuddle failed: {stderr}")

    payload = json.loads(completed.stdout)
    if not isinstance(payload, dict):
        # ``null``, a list, or a scalar would otherwise crash on ``.get``.
        raise RuntimeError(
            "defuddle failed: expected a JSON object, got "
            f"{type(payload).__name__}"
        )

    content = payload.get("contentMarkdown") or payload.get("content") or ""
    return DefuddleResult(
        content=content,
        title=payload.get("title"),
        author=payload.get("author"),
        source=payload.get("source") or payload.get("url"),
    )
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
aimd_html/__init__.py,sha256=CdQEPlBc7E_fAOmf5QWw5rEgJHxs-fdzYH0Ty7K78w0,173
|
|
2
|
+
aimd_html/defuddle.py,sha256=x4DKvyqFWMUrQiemaqOID8-TpC58BlYVElsMTlhXlnY,1661
|
|
3
|
+
aimd_html-0.9.2.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
|
|
4
|
+
aimd_html-0.9.2.dist-info/METADATA,sha256=JGTGx-qSU97oHJb67ZwEogPPK2TwkSQ0zjv-zZ4HXSk,211
|
|
5
|
+
aimd_html-0.9.2.dist-info/RECORD,,
|