htmlquill 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- htmlquill/__init__.py +11 -0
- htmlquill/_version.py +24 -0
- htmlquill/analyse.py +75 -0
- htmlquill/auth.py +268 -0
- htmlquill/challenge.py +49 -0
- htmlquill/clean.py +136 -0
- htmlquill/cli.py +259 -0
- htmlquill/commands/__init__.py +1 -0
- htmlquill/commands/analyse.py +122 -0
- htmlquill/commands/auth.py +138 -0
- htmlquill/commands/config.py +175 -0
- htmlquill/commands/convert.py +207 -0
- htmlquill/commands/doctor.py +54 -0
- htmlquill/commands/helpers.py +42 -0
- htmlquill/commands/preview.py +60 -0
- htmlquill/config.py +458 -0
- htmlquill/core.py +213 -0
- htmlquill/doctor.py +287 -0
- htmlquill/fetch.py +417 -0
- htmlquill/filenames.py +98 -0
- htmlquill/paths.py +31 -0
- htmlquill/preview.py +45 -0
- htmlquill/py.typed +0 -0
- htmlquill/render.py +650 -0
- htmlquill/urls.py +12 -0
- htmlquill-0.1.0.dist-info/METADATA +254 -0
- htmlquill-0.1.0.dist-info/RECORD +31 -0
- htmlquill-0.1.0.dist-info/WHEEL +5 -0
- htmlquill-0.1.0.dist-info/entry_points.txt +2 -0
- htmlquill-0.1.0.dist-info/licenses/LICENSE +21 -0
- htmlquill-0.1.0.dist-info/top_level.txt +1 -0
htmlquill/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""htmlquill — HTML to Markdown converter."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
try:
|
|
6
|
+
from htmlquill._version import __version__, version
|
|
7
|
+
except ImportError:
|
|
8
|
+
__version__ = version = "0.0.0+unknown"
|
|
9
|
+
from htmlquill.core import html_to_markdown, url_to_markdown
|
|
10
|
+
|
|
11
|
+
__all__ = ["__version__", "version", "html_to_markdown", "url_to_markdown"]
|
htmlquill/_version.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# file generated by vcs-versioning
|
|
2
|
+
# don't change, don't track in version control
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
__all__ = [
|
|
6
|
+
"__version__",
|
|
7
|
+
"__version_tuple__",
|
|
8
|
+
"version",
|
|
9
|
+
"version_tuple",
|
|
10
|
+
"__commit_id__",
|
|
11
|
+
"commit_id",
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
version: str
|
|
15
|
+
__version__: str
|
|
16
|
+
__version_tuple__: tuple[int | str, ...]
|
|
17
|
+
version_tuple: tuple[int | str, ...]
|
|
18
|
+
commit_id: str | None
|
|
19
|
+
__commit_id__: str | None
|
|
20
|
+
|
|
21
|
+
__version__ = version = '0.1.0'
|
|
22
|
+
__version_tuple__ = version_tuple = (0, 1, 0)
|
|
23
|
+
|
|
24
|
+
__commit_id__ = commit_id = None
|
htmlquill/analyse.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""Markdown analysis helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import math
|
|
6
|
+
import re
|
|
7
|
+
from dataclasses import asdict, dataclass
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(frozen=True)
class MarkdownStats:
    """Immutable bundle of counters describing one Markdown document.

    Instances are produced by ``count_markdown_stats``; every field is a
    plain value so the whole record can be serialised via :meth:`to_dict`.
    """

    lines: int  # number of lines in the raw text
    nonblank_lines: int  # lines that contain non-whitespace characters
    chars: int  # length of the raw text
    words: int  # word-like tokens
    headings: int  # total across all heading levels
    headings_by_level: dict[str, int]  # keyed "h1" .. "h6"
    code_blocks: int  # fenced code blocks
    inline_code_spans: int  # single-backtick spans
    images: int  # ``![alt](target)`` occurrences
    links: int  # ``[text](target)`` occurrences that are not images
    tables: int  # table separator rows
    blockquotes: int  # lines starting with ``>``
    list_items: int  # bullet or numbered list entries
    frontmatter: bool  # whether the text starts with a ``---`` line
    estimated_reading_minutes: int

    def to_dict(self) -> dict[str, object]:
        """Return the statistics as a plain dictionary (via ``dataclasses.asdict``)."""
        return asdict(self)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# ATX heading: one to six '#' at the start of a line, whitespace, then text.
# NOTE(review): ``\s+`` also matches a newline, so a lone '#' line followed by
# a non-empty line is matched as well.
_HEADING_RE = re.compile(r"^(#{1,6})\s+\S", re.MULTILINE)
# Backtick fence opened at the start of the text or right after a newline; the
# block runs lazily to the next line starting with ``` or to the end of text.
_FENCED_CODE_RE = re.compile(r"(^|\n)```.*?\n.*?(\n```|$)", re.DOTALL)
# Inline image: ![alt](target); alt text may be empty.
_IMAGE_RE = re.compile(r"!\[[^\]]*\]\([^)]+\)")
# Inline link: [text](target) that is not preceded by '!' (i.e. not an image).
_LINK_RE = re.compile(r"(?<!!)\[[^\]]+\]\([^)]+\)")
# Single-backtick code span on one line, not adjacent to further backticks.
_INLINE_CODE_RE = re.compile(r"(?<!`)`[^`\n]+`(?!`)")
# Bullet ('-', '*', '+') or numbered ('1.' / '1)') list entry with content.
_LIST_ITEM_RE = re.compile(r"^\s*(?:[-*+]|\d+[.)])\s+\S", re.MULTILINE)
# A line whose first non-whitespace character is '>'.
_BLOCKQUOTE_RE = re.compile(r"^\s*>\s?", re.MULTILINE)
# Table separator row such as "| --- | :---: |"; the repeated group requires at
# least two dash cells. No MULTILINE flag: it is matched against single lines.
_TABLE_SEPARATOR_RE = re.compile(r"^\s*\|?\s*:?-{3,}:?\s*(\|\s*:?-{3,}:?\s*)+\|?\s*$")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _strip_fenced_code(markdown: str) -> str:
    """Return *markdown* with every fenced code block collapsed to a newline."""
    # Each match of the fence pattern (including its leading newline, if any)
    # is replaced by a single "\n" so surrounding lines stay separated.
    return re.sub(_FENCED_CODE_RE, "\n", markdown)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def count_markdown_stats(markdown: str) -> MarkdownStats:
    """Compute structural statistics for a Markdown document.

    ``lines``, ``nonblank_lines`` and ``chars`` describe the raw text, and
    ``code_blocks`` counts fenced blocks in the raw text.  Every other
    structural counter (words, headings, inline code, images, links, tables,
    blockquotes, list items) is computed on the text with fenced code blocks
    removed, so that code such as a ``# comment`` line or a ``- key`` YAML
    entry inside a fence is not reported as Markdown structure.

    Parameters
    ----------
    markdown
        The Markdown source text.

    Returns
    -------
    MarkdownStats
        The collected counters.  ``estimated_reading_minutes`` assumes 220
        words per minute, rounded up, and is ``0`` for a text without words.
    """
    lines = markdown.splitlines()
    nonblank = [line for line in lines if line.strip()]

    # Everything structural is measured on the prose, not on code samples.
    prose = _strip_fenced_code(markdown)
    prose_lines = prose.splitlines()

    headings_by_level = {f"h{level}": 0 for level in range(1, 7)}
    for match in _HEADING_RE.finditer(prose):
        headings_by_level[f"h{len(match.group(1))}"] += 1

    words = len(re.findall(r"\b[\w'-]+\b", prose))

    # A separator row on the very first line has no header row above it, so
    # the first line is never considered.
    tables = sum(1 for line in prose_lines[1:] if _TABLE_SEPARATOR_RE.match(line))

    return MarkdownStats(
        lines=len(lines),
        nonblank_lines=len(nonblank),
        chars=len(markdown),
        words=words,
        headings=sum(headings_by_level.values()),
        headings_by_level=headings_by_level,
        code_blocks=sum(1 for _ in _FENCED_CODE_RE.finditer(markdown)),
        inline_code_spans=len(_INLINE_CODE_RE.findall(prose)),
        images=len(_IMAGE_RE.findall(prose)),
        links=len(_LINK_RE.findall(prose)),
        tables=tables,
        blockquotes=len(_BLOCKQUOTE_RE.findall(prose)),
        list_items=len(_LIST_ITEM_RE.findall(prose)),
        # Accept both LF and CRLF line endings after the opening delimiter.
        frontmatter=markdown.startswith(("---\n", "---\r\n")),
        # ceil() already yields 0 for no words and at least 1 otherwise.
        estimated_reading_minutes=math.ceil(words / 220),
    )
|
htmlquill/auth.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
"""Authentication/session state helpers for htmlquill."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
import stat
|
|
8
|
+
import warnings
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from htmlquill.paths import default_auth_path, env_flag
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass(frozen=True)
class CookieConfig:
    """Immutable description of one cookie entry of an auth profile."""

    name: str
    value: str
    domain: str | None = None  # optional; None when not given
    path: str | None = None  # optional; None when not given
    secure: bool = False
    http_only: bool = False
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass(frozen=True)
class AuthProfile:
    """Immutable, named auth profile."""

    name: str
    kind: str
    cookies: tuple[CookieConfig, ...] = ()  # empty tuple when the profile has no cookies
    playwright_storage_state: Path | None = None  # optional path
    chromium_user_data_dir: Path | None = None  # optional path
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass(frozen=True)
class AuthStore:
    """Immutable collection of auth profiles keyed by profile name."""

    version: int = 1
    profiles: dict[str, AuthProfile] = field(default_factory=dict)
    source_path: Path | None = None  # file the store was loaded from, if any
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass(frozen=True)
class ResolvedAuth:
    """Immutable auth values resolved for one run; every field is optional."""

    profile_name: str | None = None
    cookies: list[dict[str, object]] | None = None  # None when there are no cookies
    playwright_storage_state: str | None = None  # path rendered as a string
    chromium_user_data_dir: str | None = None  # path rendered as a string
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# Use shared helpers from htmlquill.paths
# ``_env_flag`` is a module-private alias for the imported ``env_flag``.
_env_flag = env_flag
# Self-assignment: rebinds the imported name to itself and has no runtime
# effect; presumably kept to mark the name as part of this module's API.
default_auth_path = default_auth_path
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def auth_enabled_for_run(no_auth: bool) -> bool:
    """Tell whether auth loading is enabled for the current invocation.

    Auth is disabled when *no_auth* is truthy; otherwise the result is the
    negation of the ``HTMLQUILL_NO_AUTH`` flag reported by ``_env_flag``.
    """
    # ``or`` short-circuits, so the flag is only consulted when *no_auth*
    # did not already disable auth.
    return not (no_auth or _env_flag("HTMLQUILL_NO_AUTH"))
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def resolve_auth_path(
    *,
    explicit_auth_path: str | Path | None,
    config_auth_path: str | None,
    config_dir: Path | None,
) -> Path:
    """Pick the auth file location.

    Precedence, first hit wins:

    1. *explicit_auth_path* (when not ``None``),
    2. the ``HTMLQUILL_AUTH`` environment variable (when non-empty),
    3. *config_auth_path* (when non-empty); a relative value is resolved
       against *config_dir* if one is given,
    4. ``default_auth_path(config_dir)``.

    ``~`` is expanded for the first three sources.
    """
    if explicit_auth_path is not None:
        return Path(explicit_auth_path).expanduser()

    from_env = os.environ.get("HTMLQUILL_AUTH")
    if from_env:
        return Path(from_env).expanduser()

    if not config_auth_path:
        return default_auth_path(config_dir)

    candidate = Path(config_auth_path).expanduser()
    if config_dir is not None and not candidate.is_absolute():
        # Relative entries are anchored at the configuration directory.
        return (config_dir / candidate).resolve()
    return candidate
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _warn_or_fail_on_permissions(path: Path, *, strict_permissions: bool) -> None:
    """Complain when *path* is accessible by group or others.

    Does nothing when ``os.name`` is ``"nt"``.  Otherwise, if any of the
    group/other permission bits are set, raises ``PermissionError`` when
    *strict_permissions* is true and emits a warning when it is false.
    ``path.stat()`` is called unguarded, so its errors propagate.
    """
    if os.name == "nt":
        return

    mode = stat.S_IMODE(path.stat().st_mode)
    if not mode & 0o077:
        # Only owner bits are set: nothing to report.
        return

    msg = (
        f"auth file {path} is group/world accessible (mode {oct(mode)}); "
        "recommended mode is 0o600"
    )
    if strict_permissions:
        raise PermissionError(msg)
    warnings.warn(msg, stacklevel=2)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _parse_cookie(index: int, raw: Any) -> CookieConfig:
    """Validate one raw cookie entry and convert it to a ``CookieConfig``.

    *index* is only used to build error messages.  ``name`` and ``value``
    must be strings; ``domain`` and ``path`` must be strings or absent/null.
    ``secure`` and ``httpOnly`` (falling back to ``http_only``) are coerced
    with ``bool()``, so any truthy value -- including a non-empty string such
    as ``"false"`` -- enables the flag.

    Raises
    ------
    ValueError
        If *raw* is not a dict or a field has the wrong type.
    """
    if not isinstance(raw, dict):
        raise ValueError(f"cookies[{index}] must be an object")

    name, value = raw.get("name"), raw.get("value")
    if not (isinstance(name, str) and isinstance(value, str)):
        raise ValueError(f"cookies[{index}] requires string 'name' and 'value'")

    domain = raw.get("domain")
    if not (domain is None or isinstance(domain, str)):
        raise ValueError(f"cookies[{index}].domain must be a string")

    path = raw.get("path")
    if not (path is None or isinstance(path, str)):
        raise ValueError(f"cookies[{index}].path must be a string")

    # The camelCase key wins; the snake_case key is only a fallback.
    snake_case_http_only = raw.get("http_only", False)
    return CookieConfig(
        name=name,
        value=value,
        domain=domain,
        path=path,
        secure=bool(raw.get("secure", False)),
        http_only=bool(raw.get("httpOnly", snake_case_http_only)),
    )
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _expand_profile_path(raw_path: Any, *, base_dir: Path) -> Path | None:
    """Turn a raw profile path value into a ``Path``.

    ``None`` and the empty string yield ``None``.  ``~`` is expanded; an
    absolute result is returned as is, a relative one is resolved against
    *base_dir*.

    Raises
    ------
    ValueError
        If *raw_path* is neither ``None`` nor a string.
    """
    if raw_path is None or raw_path == "":
        return None
    if not isinstance(raw_path, str):
        raise ValueError("profile path values must be strings")

    candidate = Path(raw_path).expanduser()
    return candidate if candidate.is_absolute() else (base_dir / candidate).resolve()
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _parse_profile(name: str, raw: Any, *, base_dir: Path) -> AuthProfile:
    """Validate one raw profile entry and convert it to an ``AuthProfile``.

    ``kind`` defaults to ``"cookies"`` and ``cookies`` to an empty list.
    The two optional path fields are expanded relative to *base_dir* by
    ``_expand_profile_path``.

    Raises
    ------
    ValueError
        If *raw*, ``kind`` or ``cookies`` has the wrong type, or a nested
        cookie or path value is invalid.
    """
    if not isinstance(raw, dict):
        raise ValueError(f"profiles.{name} must be an object")

    kind = raw.get("kind", "cookies")
    if not isinstance(kind, str):
        raise ValueError(f"profiles.{name}.kind must be a string")

    raw_cookies = raw.get("cookies", [])
    if not isinstance(raw_cookies, list):
        raise ValueError(f"profiles.{name}.cookies must be an array")
    parsed_cookies = tuple(
        [_parse_cookie(position, entry) for position, entry in enumerate(raw_cookies)]
    )

    storage_state = _expand_profile_path(
        raw.get("playwright_storage_state"), base_dir=base_dir
    )
    user_data_dir = _expand_profile_path(
        raw.get("chromium_user_data_dir"), base_dir=base_dir
    )

    return AuthProfile(
        name=name,
        kind=kind,
        cookies=parsed_cookies,
        playwright_storage_state=storage_state,
        chromium_user_data_dir=user_data_dir,
    )
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def load_auth(path: Path, *, strict_permissions: bool = True) -> AuthStore:
    """Load and validate the auth JSON file at *path*.

    The permission check runs first and outside the ``try`` block, so
    whatever ``_warn_or_fail_on_permissions`` raises propagates unchanged.
    Relative paths inside profiles are resolved against the directory that
    contains the auth file.

    Raises
    ------
    ValueError
        If the file cannot be read or parsed, or its structure is invalid.
    """
    expanded = path.expanduser()
    _warn_or_fail_on_permissions(expanded, strict_permissions=strict_permissions)

    try:
        text = expanded.read_text(encoding="utf-8")
    except OSError as exc:
        raise ValueError(f"failed to read auth file {expanded}: {exc}") from exc
    try:
        payload = json.loads(text)
    except json.JSONDecodeError as exc:
        raise ValueError(f"failed to parse auth file {expanded}: {exc}") from exc

    if not isinstance(payload, dict):
        raise ValueError("auth JSON root must be an object")

    version = payload.get("version", 1)
    if not isinstance(version, int):
        raise ValueError("auth version must be an integer")

    profiles_raw = payload.get("profiles", {})
    if not isinstance(profiles_raw, dict):
        raise ValueError("auth profiles must be an object")

    base_dir = expanded.parent
    profiles: dict[str, AuthProfile] = {}
    for profile_name, raw in profiles_raw.items():
        profiles[profile_name] = _parse_profile(profile_name, raw, base_dir=base_dir)

    return AuthStore(version=version, profiles=profiles, source_path=expanded)
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def resolve_auth_profile(auth_store: AuthStore, name: str | None) -> AuthProfile | None:
    """Look up the profile called *name* in *auth_store*.

    Returns ``None`` when *name* is ``None``.

    Raises
    ------
    ValueError
        If no profile with that name exists; the message lists the known
        profile names in sorted order, or ``(none)``.
    """
    if name is None:
        return None

    known = auth_store.profiles
    if name in known:
        return known[name]

    available = ", ".join(sorted(known)) or "(none)"
    raise ValueError(f"auth profile {name!r} not found; available: {available}")
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def resolve_auth(
    auth_store: AuthStore | None,
    *,
    profile_name: str | None,
) -> ResolvedAuth:
    """Build the concrete auth values for an optional profile.

    Returns an empty ``ResolvedAuth`` when there is no store or no profile
    name.  Otherwise the profile's cookies become a list of dicts (``None``
    when the profile has no cookies) and its paths are rendered as strings.

    Raises
    ------
    ValueError
        Propagated from ``resolve_auth_profile`` for an unknown profile.
    """
    if auth_store is None:
        return ResolvedAuth(profile_name=None)

    profile = resolve_auth_profile(auth_store, profile_name)
    if profile is None:
        return ResolvedAuth(profile_name=None)

    # An empty list collapses to None, matching "no cookies configured".
    cookie_payload: list[dict[str, object]] | None = [
        {
            "name": cookie.name,
            "value": cookie.value,
            "domain": cookie.domain,
            "path": cookie.path,
            "secure": cookie.secure,
            "httpOnly": cookie.http_only,
        }
        for cookie in profile.cookies
    ] or None

    storage_state = profile.playwright_storage_state
    user_data_dir = profile.chromium_user_data_dir
    return ResolvedAuth(
        profile_name=profile.name,
        cookies=cookie_payload,
        playwright_storage_state=None if storage_state is None else str(storage_state),
        chromium_user_data_dir=None if user_data_dir is None else str(user_data_dir),
    )
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def redacted_auth_dict(resolved: ResolvedAuth) -> dict[str, object]:
    """Describe *resolved* without exposing cookie contents.

    Cookies are replaced by the marker ``"<redacted>"`` (or ``None`` when
    there are none) plus a count; the profile name and both path strings are
    passed through unchanged.  Intended for ``--print-config`` output.
    """
    cookies_count = len(resolved.cookies) if resolved.cookies else 0

    summary: dict[str, object] = {"profile": resolved.profile_name}
    summary["cookies"] = "<redacted>" if cookies_count else None
    summary["cookies_count"] = cookies_count
    summary["playwright_storage_state"] = resolved.playwright_storage_state
    summary["chromium_user_data_dir"] = resolved.chromium_user_data_dir
    return summary
|
htmlquill/challenge.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Challenge/interstitial page detection helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Sequence
|
|
6
|
+
|
|
7
|
+
# Phrases whose presence marks a page as a challenge/interstitial page.
# ``is_challenge_page`` lower-cases both sides and tests plain substring
# containment, so casing here is cosmetic.
# NOTE(review): "blocked by network security" is a substring of the entry
# before it, which makes that longer entry redundant; short phrases such as
# "Just a moment" may also occur in ordinary article text -- confirm the
# false-positive rate is acceptable.
DEFAULT_CHALLENGE_MARKERS: tuple[str, ...] = (
    "Performing security verification",
    "Please wait for verification",
    "security service to protect against malicious bots",
    "verifies you are not a bot",
    "Checking if the site connection is secure",
    "Just a moment",
    "js_challenge",
    "You've been blocked by network security",
    "blocked by network security",
    "If you think you've been blocked by mistake, file a ticket",
    "Please try to login with your Reddit account",
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ChallengePageError(RuntimeError):
    """Raised when fetched HTML appears to be a challenge page.

    Raised by ``assert_not_challenge_page``; being a ``RuntimeError``
    subclass, it is also caught by handlers for ``RuntimeError``.
    """
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def is_challenge_page(
    html: str,
    url: str | None = None,
    markers: Sequence[str] = DEFAULT_CHALLENGE_MARKERS,
) -> bool:
    """Report whether *html* contains any of the challenge *markers*.

    The comparison is a case-insensitive substring test.  *url* is accepted
    but not consulted.
    """
    del url  # reserved for future URL-specific marker behavior
    haystack = html.lower()
    for marker in markers:
        if marker.lower() in haystack:
            return True
    return False
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def assert_not_challenge_page(
    html: str,
    *,
    url: str | None = None,
    markers: Sequence[str] = DEFAULT_CHALLENGE_MARKERS,
) -> None:
    """Fail with :class:`ChallengePageError` if *html* looks like an interstitial.

    Detection is delegated to ``is_challenge_page`` with the same *url* and
    *markers*; the function returns ``None`` when nothing is detected.
    """
    if not is_challenge_page(html, url=url, markers=markers):
        return
    raise ChallengePageError(
        "received a security verification page instead of article HTML"
    )
|
htmlquill/clean.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""HTML parsing and cleanup for htmlquill."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from bs4 import BeautifulSoup, Tag
|
|
6
|
+
|
|
7
|
+
# CSS selectors whose matches are removed outright by ``parse_and_clean``.
DROP_SELECTORS = [
    "script",
    "style",
    "noscript",
    "template",
    "nav",
    "footer",
    "[hidden]",
    '[aria-hidden="true"]',
]

# Text that is dropped when a text node, once stripped, equals it exactly.
NOISE_TEXTS = {
    "Press enter or click to view image in full size",
}

# Substrings of an anchor's href that mark it as a sign-in/registration
# action link; matching anchors are removed by ``_drop_action_controls``.
ACTION_HREF_PARTS = (
    "/m/signin",
    "operation=register",
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _drop_action_controls(root: BeautifulSoup | Tag) -> None:
    """Remove action control elements from the DOM, in place.

    Two kinds of anchors are dropped:

    * anchors whose ``href`` contains any of ``ACTION_HREF_PARTS``;
    * anchors with ``role="button"`` that contain neither an ``<img>`` nor
      any non-whitespace text.

    Parameters
    ----------
    root
        The tree to clean; it is mutated.
    """
    # Snapshot the anchors first because the tree is mutated during the walk.
    for anchor in list(root.find_all("a")):
        # Decomposing an anchor also destroys anchors nested inside it, yet
        # those are still in the snapshot.  Skip them using the same
        # ``attrs is None`` guard that ``parse_and_clean`` applies in its
        # inline-style loop, instead of calling ``.get`` on a dead node.
        if anchor.attrs is None:
            continue

        href = str(anchor.get("href", ""))
        if any(part in href for part in ACTION_HREF_PARTS):
            anchor.decompose()
            continue

        if anchor.get("role") != "button":
            continue
        # Button-like anchor: keep it only if it shows an image or some text.
        has_img = anchor.find("img") is not None
        if not has_img and not anchor.get_text(strip=True):
            anchor.decompose()
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _drop_noise_text_nodes(root: BeautifulSoup | Tag) -> None:
    """Remove text nodes whose stripped content is one of ``NOISE_TEXTS``.

    When the parent element's whole text is itself noise, the parent is
    removed; otherwise only the text node is detached.  *root* is mutated.
    """
    for node in list(root.find_all(string=True)):
        if str(node).strip() not in NOISE_TEXTS:
            continue

        holder = node.parent
        holder_is_only_noise = (
            holder is not None
            and isinstance(holder, Tag)
            and holder.get_text(strip=True) in NOISE_TEXTS
        )
        if holder_is_only_noise:
            holder.decompose()
        else:
            node.extract()
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _drop_non_figure_svgs(root: BeautifulSoup | Tag) -> None:
    """Remove every ``<svg>`` that has no ``<figure>`` ancestor; mutates *root*."""
    orphaned = [
        svg for svg in list(root.find_all("svg")) if svg.find_parent("figure") is None
    ]
    for svg in orphaned:
        svg.decompose()
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _content_score(node: Tag) -> int:
    """Score a candidate content root by content density.

    The score is ``3`` per ``<p>``, ``2`` per heading (``h1``-``h6``), plus
    one point per 100 characters of stripped text, minus ``5`` per anchor
    whose ``href`` contains any of ``ACTION_HREF_PARTS``.

    Parameters
    ----------
    node
        The candidate element.

    Returns
    -------
    int
        The score; higher means more article-like.  It can be negative.
    """
    paragraphs = len(node.find_all("p"))
    headings = len(node.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]))
    text_len = len(node.get_text(strip=True))

    # Use the shared constant so that scoring and removal of action links
    # (``_drop_action_controls``) can never disagree on what counts as one.
    action_links = sum(
        1
        for a in node.find_all("a")
        if any(part in str(a.get("href", "")) for part in ACTION_HREF_PARTS)
    )

    return paragraphs * 3 + headings * 2 + text_len // 100 - action_links * 5
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def parse_and_clean(html: str) -> BeautifulSoup | Tag:
    """Parse *html*, strip non-content nodes and return the content root.

    Cleanup happens in this order: elements matching ``DROP_SELECTORS``,
    ``<svg>`` outside ``<figure>``, elements hidden through an inline
    ``display:none`` / ``visibility:hidden`` style, action controls, and
    known noise text.

    Every ``<article>``, ``<main>`` and ``[role=main]`` element is then
    scored with ``_content_score``; the highest score wins and ties go to the
    earliest candidate (articles are listed first).  Without any candidate
    the ``<body>`` is returned, or the whole document if there is no body.

    Parameters
    ----------
    html
        Raw HTML string to parse.

    Returns
    -------
    BeautifulSoup | Tag
        The cleaned root element ready for rendering.
    """
    soup = BeautifulSoup(html, "html.parser")

    for selector in DROP_SELECTORS:
        for unwanted in soup.select(selector):
            unwanted.decompose()

    _drop_non_figure_svgs(soup)

    hiding_rules = ("display:none", "visibility:hidden")
    for styled in list(soup.find_all(style=True)):
        attributes = styled.attrs
        if attributes is None:
            # Already destroyed together with a hidden ancestor.
            continue
        compact = str(attributes.get("style", "")).replace(" ", "").lower()
        if any(rule in compact for rule in hiding_rules):
            styled.decompose()

    # Drop action controls and noise text
    _drop_action_controls(soup)
    _drop_noise_text_nodes(soup)

    # Gather content candidates and score them
    candidates: list[Tag] = [*soup.find_all("article"), *soup.find_all("main")]
    for landmark in soup.select("[role=main]"):
        if landmark not in candidates:
            candidates.append(landmark)

    if not candidates:
        return soup.body or soup
    return max(candidates, key=_content_score)
|