htmlquill 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
htmlquill/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ """htmlquill — HTML to Markdown converter."""
2
+
3
+ from __future__ import annotations
4
+
5
+ try:
6
+ from htmlquill._version import __version__, version
7
+ except ImportError:
8
+ __version__ = version = "0.0.0+unknown"
9
+ from htmlquill.core import html_to_markdown, url_to_markdown
10
+
11
+ __all__ = ["__version__", "version", "html_to_markdown", "url_to_markdown"]
htmlquill/_version.py ADDED
@@ -0,0 +1,24 @@
1
+ # file generated by vcs-versioning
2
+ # don't change, don't track in version control
3
+ from __future__ import annotations
4
+
5
+ __all__ = [
6
+ "__version__",
7
+ "__version_tuple__",
8
+ "version",
9
+ "version_tuple",
10
+ "__commit_id__",
11
+ "commit_id",
12
+ ]
13
+
14
+ version: str
15
+ __version__: str
16
+ __version_tuple__: tuple[int | str, ...]
17
+ version_tuple: tuple[int | str, ...]
18
+ commit_id: str | None
19
+ __commit_id__: str | None
20
+
21
+ __version__ = version = '0.1.0'
22
+ __version_tuple__ = version_tuple = (0, 1, 0)
23
+
24
+ __commit_id__ = commit_id = None
htmlquill/analyse.py ADDED
@@ -0,0 +1,75 @@
1
+ """Markdown analysis helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import math
6
+ import re
7
+ from dataclasses import asdict, dataclass
8
+
9
+
10
@dataclass(frozen=True)
class MarkdownStats:
    """Immutable bundle of counts describing one Markdown document."""

    # Size of the document.
    lines: int
    nonblank_lines: int
    chars: int
    words: int

    # Heading totals: overall, and per level keyed "h1" .. "h6".
    headings: int
    headings_by_level: dict[str, int]

    # Counts of individual Markdown constructs.
    code_blocks: int
    inline_code_spans: int
    images: int
    links: int
    tables: int
    blockquotes: int
    list_items: int

    # Document-level flags and derived values.
    frontmatter: bool
    estimated_reading_minutes: int

    def to_dict(self) -> dict[str, object]:
        """Return every field as a plain ``dict`` (via ``dataclasses.asdict``)."""
        as_mapping = asdict(self)
        return as_mapping
30
+
31
+
32
+ _HEADING_RE = re.compile(r"^(#{1,6})\s+\S", re.MULTILINE)
33
+ _FENCED_CODE_RE = re.compile(r"(^|\n)```.*?\n.*?(\n```|$)", re.DOTALL)
34
+ _IMAGE_RE = re.compile(r"!\[[^\]]*\]\([^)]+\)")
35
+ _LINK_RE = re.compile(r"(?<!!)\[[^\]]+\]\([^)]+\)")
36
+ _INLINE_CODE_RE = re.compile(r"(?<!`)`[^`\n]+`(?!`)")
37
+ _LIST_ITEM_RE = re.compile(r"^\s*(?:[-*+]|\d+[.)])\s+\S", re.MULTILINE)
38
+ _BLOCKQUOTE_RE = re.compile(r"^\s*>\s?", re.MULTILINE)
39
+ _TABLE_SEPARATOR_RE = re.compile(r"^\s*\|?\s*:?-{3,}:?\s*(\|\s*:?-{3,}:?\s*)+\|?\s*$")
40
+
41
+
42
def _strip_fenced_code(markdown: str) -> str:
    """Return *markdown* with each fenced code block collapsed to one newline."""
    return re.sub(_FENCED_CODE_RE, "\n", markdown)
44
+
45
+
46
def count_markdown_stats(markdown: str) -> MarkdownStats:
    """Compute structural statistics for a Markdown document.

    ``lines``, ``nonblank_lines``, ``chars``, ``code_blocks`` and
    ``frontmatter`` are measured on the raw text.  Every other count (words,
    headings, inline code spans, images, links, tables, blockquotes and list
    items) is measured after fenced code blocks have been removed, so code
    such as ``# comment`` or ``- flag`` inside a fence is not reported as a
    heading or list item.

    Parameters
    ----------
    markdown
        Markdown source text.

    Returns
    -------
    MarkdownStats
        The collected counts.  ``estimated_reading_minutes`` is ``0`` when no
        words are found, otherwise ``ceil(words / 220)`` with a floor of 1.
    """
    lines = markdown.splitlines()
    nonblank_lines = sum(1 for line in lines if line.strip())

    # Fence content is literal text, not document structure: strip it once and
    # run every structural pattern against the remainder.
    prose = _strip_fenced_code(markdown)

    headings_by_level = {f"h{level}": 0 for level in range(1, 7)}
    for match in _HEADING_RE.finditer(prose):
        headings_by_level[f"h{len(match.group(1))}"] += 1

    words = len(re.findall(r"\b[\w'-]+\b", prose))

    # The first line is never counted, presumably because a delimiter row
    # needs a header row above it.
    tables = sum(
        1 for line in prose.splitlines()[1:] if _TABLE_SEPARATOR_RE.match(line)
    )

    return MarkdownStats(
        lines=len(lines),
        nonblank_lines=nonblank_lines,
        chars=len(markdown),
        words=words,
        headings=sum(headings_by_level.values()),
        headings_by_level=headings_by_level,
        code_blocks=sum(1 for _ in _FENCED_CODE_RE.finditer(markdown)),
        inline_code_spans=len(_INLINE_CODE_RE.findall(prose)),
        images=len(_IMAGE_RE.findall(prose)),
        links=len(_LINK_RE.findall(prose)),
        tables=tables,
        blockquotes=len(_BLOCKQUOTE_RE.findall(prose)),
        list_items=len(_LIST_ITEM_RE.findall(prose)),
        frontmatter=markdown.startswith("---\n"),
        estimated_reading_minutes=max(1, math.ceil(words / 220)) if words else 0,
    )
htmlquill/auth.py ADDED
@@ -0,0 +1,268 @@
1
+ """Authentication/session state helpers for htmlquill."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import os
7
+ import stat
8
+ import warnings
9
+ from dataclasses import dataclass, field
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+ from htmlquill.paths import default_auth_path, env_flag
14
+
15
+
16
@dataclass(frozen=True)
class CookieConfig:
    """Immutable description of a single cookie from an auth profile."""

    # Required identity of the cookie.
    name: str
    value: str

    # Optional scoping; ``None`` means the value was not supplied.
    domain: str | None = None
    path: str | None = None

    # Cookie flags; both default to off.
    secure: bool = False
    http_only: bool = False
24
+
25
+
26
@dataclass(frozen=True)
class AuthProfile:
    """Immutable, parsed form of one named profile from the auth file."""

    name: str
    kind: str

    # Cookies declared by the profile; empty when none are configured.
    cookies: tuple[CookieConfig, ...] = ()

    # Optional browser-state locations; ``None`` when not configured.
    playwright_storage_state: Path | None = None
    chromium_user_data_dir: Path | None = None
33
+
34
+
35
@dataclass(frozen=True)
class AuthStore:
    """Immutable container for a loaded auth file."""

    # Schema version; defaults to 1.
    version: int = 1

    # Profiles keyed by name; each instance gets its own fresh dict by default.
    profiles: dict[str, AuthProfile] = field(default_factory=dict)

    # File the store was loaded from, when known.
    source_path: Path | None = None
40
+
41
+
42
@dataclass(frozen=True)
class ResolvedAuth:
    """Immutable auth values resolved for one fetch; every field is optional."""

    # Name of the profile the values came from, if any.
    profile_name: str | None = None

    # Cookies as plain dicts, or ``None`` when there are none.
    cookies: list[dict[str, object]] | None = None

    # Browser-state locations rendered as strings.
    playwright_storage_state: str | None = None
    chromium_user_data_dir: str | None = None
48
+
49
+
50
# Shared helpers come from htmlquill.paths; bind them under the names this
# module uses.
_env_flag = env_flag
# Self-assignment: rebinds the imported name to itself, leaving it unchanged.
default_auth_path = default_auth_path
53
+
54
+
55
def auth_enabled_for_run(no_auth: bool) -> bool:
    """Return whether auth loading is enabled for this invocation.

    Auth is disabled when *no_auth* is true or when the ``HTMLQUILL_NO_AUTH``
    environment flag is set; the flag is only consulted when *no_auth* is
    false.
    """
    return not no_auth and not _env_flag("HTMLQUILL_NO_AUTH")
61
+
62
+
63
def resolve_auth_path(
    *,
    explicit_auth_path: str | Path | None,
    config_auth_path: str | None,
    config_dir: Path | None,
) -> Path:
    """Pick the auth file location.

    Sources are tried in order: *explicit_auth_path*, the ``HTMLQUILL_AUTH``
    environment variable, *config_auth_path*, then the package default.  A
    relative *config_auth_path* is resolved against *config_dir* when one is
    given; explicit and environment paths only have ``~`` expanded.
    """
    if explicit_auth_path is not None:
        return Path(explicit_auth_path).expanduser()

    from_env = os.environ.get("HTMLQUILL_AUTH")
    if from_env:
        return Path(from_env).expanduser()

    if not config_auth_path:
        return default_auth_path(config_dir)

    candidate = Path(config_auth_path).expanduser()
    if config_dir is not None and not candidate.is_absolute():
        return (config_dir / candidate).resolve()
    return candidate
85
+
86
+
87
+ def _warn_or_fail_on_permissions(path: Path, *, strict_permissions: bool) -> None:
88
+ if os.name == "nt":
89
+ return
90
+ mode = stat.S_IMODE(path.stat().st_mode)
91
+ if mode & 0o077:
92
+ msg = (
93
+ f"auth file {path} is group/world accessible (mode {oct(mode)}); "
94
+ "recommended mode is 0o600"
95
+ )
96
+ if strict_permissions:
97
+ raise PermissionError(msg)
98
+ warnings.warn(msg, stacklevel=2)
99
+
100
+
101
+ def _parse_cookie(index: int, raw: Any) -> CookieConfig:
102
+ if not isinstance(raw, dict):
103
+ raise ValueError(f"cookies[{index}] must be an object")
104
+
105
+ name = raw.get("name")
106
+ value = raw.get("value")
107
+ if not isinstance(name, str) or not isinstance(value, str):
108
+ raise ValueError(f"cookies[{index}] requires string 'name' and 'value'")
109
+
110
+ domain = raw.get("domain")
111
+ path = raw.get("path")
112
+ secure = bool(raw.get("secure", False))
113
+ http_only = bool(raw.get("httpOnly", raw.get("http_only", False)))
114
+
115
+ if domain is not None and not isinstance(domain, str):
116
+ raise ValueError(f"cookies[{index}].domain must be a string")
117
+ if path is not None and not isinstance(path, str):
118
+ raise ValueError(f"cookies[{index}].path must be a string")
119
+
120
+ return CookieConfig(
121
+ name=name,
122
+ value=value,
123
+ domain=domain,
124
+ path=path,
125
+ secure=secure,
126
+ http_only=http_only,
127
+ )
128
+
129
+
130
+ def _expand_profile_path(raw_path: Any, *, base_dir: Path) -> Path | None:
131
+ if raw_path in (None, ""):
132
+ return None
133
+ if not isinstance(raw_path, str):
134
+ raise ValueError("profile path values must be strings")
135
+ parsed = Path(raw_path).expanduser()
136
+ if parsed.is_absolute():
137
+ return parsed
138
+ return (base_dir / parsed).resolve()
139
+
140
+
141
+ def _parse_profile(name: str, raw: Any, *, base_dir: Path) -> AuthProfile:
142
+ if not isinstance(raw, dict):
143
+ raise ValueError(f"profiles.{name} must be an object")
144
+
145
+ kind = raw.get("kind", "cookies")
146
+ if not isinstance(kind, str):
147
+ raise ValueError(f"profiles.{name}.kind must be a string")
148
+
149
+ cookies_data = raw.get("cookies", [])
150
+ if not isinstance(cookies_data, list):
151
+ raise ValueError(f"profiles.{name}.cookies must be an array")
152
+ cookies = tuple(_parse_cookie(i, c) for i, c in enumerate(cookies_data))
153
+
154
+ playwright_storage_state = _expand_profile_path(
155
+ raw.get("playwright_storage_state"), base_dir=base_dir
156
+ )
157
+ chromium_user_data_dir = _expand_profile_path(
158
+ raw.get("chromium_user_data_dir"), base_dir=base_dir
159
+ )
160
+
161
+ return AuthProfile(
162
+ name=name,
163
+ kind=kind,
164
+ cookies=cookies,
165
+ playwright_storage_state=playwright_storage_state,
166
+ chromium_user_data_dir=chromium_user_data_dir,
167
+ )
168
+
169
+
170
def load_auth(path: Path, *, strict_permissions: bool = True) -> AuthStore:
    """Load auth JSON from *path*.

    Parameters
    ----------
    path
        Location of the auth file; ``~`` is expanded.
    strict_permissions
        Passed to the permission check: when true, a group/world-accessible
        file is an error; when false it only produces a warning.

    Returns
    -------
    AuthStore
        Parsed store whose ``source_path`` is the expanded path.  Relative
        paths inside profiles are resolved against the file's directory.

    Raises
    ------
    PermissionError
        From the permission check, when *strict_permissions* is true and the
        file is group/world accessible.
    OSError
        From the permission check on non-Windows platforms, if the file
        cannot be stat'ed (for example because it does not exist).
    ValueError
        If the file cannot be read, decoded or parsed, or its content does
        not have the expected structure.
    """
    expanded = path.expanduser()
    # Kept outside the ``try`` below so that its PermissionError (an OSError
    # subclass) is not rewrapped as ValueError.
    _warn_or_fail_on_permissions(expanded, strict_permissions=strict_permissions)

    try:
        payload = json.loads(expanded.read_text(encoding="utf-8"))
    except OSError as exc:
        raise ValueError(f"failed to read auth file {expanded}: {exc}") from exc
    except UnicodeDecodeError as exc:
        # Still a ValueError for callers, but now names the offending file.
        raise ValueError(f"failed to decode auth file {expanded}: {exc}") from exc
    except json.JSONDecodeError as exc:
        raise ValueError(f"failed to parse auth file {expanded}: {exc}") from exc

    if not isinstance(payload, dict):
        raise ValueError("auth JSON root must be an object")

    version = payload.get("version", 1)
    # bool is a subclass of int, so JSON ``true``/``false`` must be rejected
    # explicitly.
    if isinstance(version, bool) or not isinstance(version, int):
        raise ValueError("auth version must be an integer")

    profiles_raw = payload.get("profiles", {})
    if not isinstance(profiles_raw, dict):
        raise ValueError("auth profiles must be an object")

    base_dir = expanded.parent
    profiles = {
        profile_name: _parse_profile(profile_name, raw, base_dir=base_dir)
        for profile_name, raw in profiles_raw.items()
    }

    return AuthStore(version=version, profiles=profiles, source_path=expanded)
201
+
202
+
203
def resolve_auth_profile(auth_store: AuthStore, name: str | None) -> AuthProfile | None:
    """Look up the profile called *name* in *auth_store*.

    Returns ``None`` when *name* is ``None``.

    Raises
    ------
    ValueError
        If *name* is not a known profile; the message lists the available
        profile names in sorted order.
    """
    if name is None:
        return None

    known = auth_store.profiles
    if name in known:
        return known[name]

    available = ", ".join(sorted(known)) or "(none)"
    raise ValueError(f"auth profile {name!r} not found; available: {available}")
212
+
213
+
214
def resolve_auth(
    auth_store: AuthStore | None,
    *,
    profile_name: str | None,
) -> ResolvedAuth:
    """Resolve concrete fetch auth values for an optional auth profile.

    Returns an empty ``ResolvedAuth`` when there is no store or no profile
    name.  Otherwise cookies become plain dicts (``None`` if the profile has
    none) and configured paths are rendered as strings.

    Raises
    ------
    ValueError
        If *profile_name* is given but not present in *auth_store*.
    """
    profile = (
        None if auth_store is None else resolve_auth_profile(auth_store, profile_name)
    )
    if profile is None:
        return ResolvedAuth(profile_name=None)

    cookie_payload: list[dict[str, object]] | None = None
    if profile.cookies:
        cookie_payload = []
        for cookie in profile.cookies:
            cookie_payload.append(
                {
                    "name": cookie.name,
                    "value": cookie.value,
                    "domain": cookie.domain,
                    "path": cookie.path,
                    "secure": cookie.secure,
                    "httpOnly": cookie.http_only,
                }
            )

    storage_state = profile.playwright_storage_state
    user_data_dir = profile.chromium_user_data_dir
    return ResolvedAuth(
        profile_name=profile.name,
        cookies=cookie_payload,
        playwright_storage_state=None if storage_state is None else str(storage_state),
        chromium_user_data_dir=None if user_data_dir is None else str(user_data_dir),
    )
256
+
257
+
258
def redacted_auth_dict(resolved: ResolvedAuth) -> dict[str, object]:
    """Return a redacted dict suitable for ``--print-config`` output.

    Cookie contents are replaced by the marker ``"<redacted>"`` plus a count;
    the profile name and the two path values are included unchanged.
    """
    cookies_count = len(resolved.cookies) if resolved.cookies else 0
    cookies_marker = "<redacted>" if cookies_count else None
    return {
        "profile": resolved.profile_name,
        "cookies": cookies_marker,
        "cookies_count": cookies_count,
        "playwright_storage_state": resolved.playwright_storage_state,
        "chromium_user_data_dir": resolved.chromium_user_data_dir,
    }
htmlquill/challenge.py ADDED
@@ -0,0 +1,49 @@
1
+ """Challenge/interstitial page detection helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Sequence
6
+
7
# Phrases whose presence in fetched HTML marks it as a challenge/interstitial
# page rather than real content.
DEFAULT_CHALLENGE_MARKERS: tuple[str, ...] = (
    # Verification interstitials.
    "Performing security verification",
    "Please wait for verification",
    "security service to protect against malicious bots",
    "verifies you are not a bot",
    "Checking if the site connection is secure",
    "Just a moment",
    "js_challenge",
    # Block pages.
    "You've been blocked by network security",
    "blocked by network security",
    "If you think you've been blocked by mistake, file a ticket",
    "Please try to login with your Reddit account",
)
20
+
21
+
22
class ChallengePageError(RuntimeError):
    """Signal that fetched HTML looks like a challenge page, not content."""
24
+
25
+
26
def is_challenge_page(
    html: str,
    url: str | None = None,
    markers: Sequence[str] = DEFAULT_CHALLENGE_MARKERS,
) -> bool:
    """Return ``True`` if *html* contains known challenge page markers.

    Parameters
    ----------
    html
        Fetched page source to inspect.
    url
        Currently unused; reserved for future URL-specific marker behavior.
    markers
        Phrases to search for.  Matching is a case-insensitive substring
        test; empty markers are ignored.
    """
    del url  # reserved for future URL-specific marker behavior
    haystack = html.lower()
    # An empty string is a substring of every string, so a blank marker would
    # otherwise flag every page as a challenge page.
    return any(marker.lower() in haystack for marker in markers if marker)
36
+
37
+
38
def assert_not_challenge_page(
    html: str,
    *,
    url: str | None = None,
    markers: Sequence[str] = DEFAULT_CHALLENGE_MARKERS,
) -> None:
    """Raise :class:`ChallengePageError` when *html* looks like an interstitial.

    Delegates detection to :func:`is_challenge_page` with the same *url* and
    *markers*; returns ``None`` when no marker is found.
    """
    looks_blocked = is_challenge_page(html, url=url, markers=markers)
    if not looks_blocked:
        return
    raise ChallengePageError(
        "received a security verification page instead of article HTML"
    )
htmlquill/clean.py ADDED
@@ -0,0 +1,136 @@
1
+ """HTML parsing and cleanup for htmlquill."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from bs4 import BeautifulSoup, Tag
6
+
7
# CSS selectors for elements removed outright before content selection.
DROP_SELECTORS = [
    # Non-content markup.
    "script",
    "style",
    "noscript",
    "template",
    # Page chrome.
    "nav",
    "footer",
    # Explicitly hidden elements.
    "[hidden]",
    '[aria-hidden="true"]',
]

# Text that is removed when a text node consists solely of it.
NOISE_TEXTS = {
    "Press enter or click to view image in full size",
}

# Substrings of an ``href`` that mark an anchor as a sign-in/register action.
ACTION_HREF_PARTS = (
    "/m/signin",
    "operation=register",
)
26
+
27
+
28
def _drop_action_controls(root: BeautifulSoup | Tag) -> None:
    """Remove action anchors from the tree under *root*, in place.

    Two kinds of ``<a>`` are dropped: those whose ``href`` contains any of
    ``ACTION_HREF_PARTS``, and ``role="button"`` anchors that have neither an
    ``<img>`` descendant nor any text.
    """
    # Snapshot the anchors first: the tree is mutated while we walk them.
    for anchor in list(root.find_all("a")):
        target = str(anchor.get("href", ""))
        points_to_action = any(part in target for part in ACTION_HREF_PARTS)
        if points_to_action:
            anchor.decompose()
            continue

        if anchor.get("role") != "button":
            continue
        # An empty button-like anchor carries no content worth rendering.
        if anchor.find("img") is None and not anchor.get_text(strip=True):
            anchor.decompose()
41
+
42
+
43
def _drop_noise_text_nodes(root: BeautifulSoup | Tag) -> None:
    """Remove text nodes matching ``NOISE_TEXTS`` from the tree, in place.

    When the parent element contains nothing but noise text, the whole parent
    is removed; otherwise only the matching text node is.
    """
    # Snapshot the text nodes first: the tree is mutated while we walk them.
    for node in list(root.find_all(string=True)):
        if str(node).strip() not in NOISE_TEXTS:
            continue

        parent = node.parent
        if isinstance(parent, Tag) and parent.get_text(strip=True) in NOISE_TEXTS:
            parent.decompose()
        else:
            node.extract()
57
+
58
+
59
def _drop_non_figure_svgs(root: BeautifulSoup | Tag) -> None:
    """Remove every ``<svg>`` that has no ``<figure>`` ancestor, in place."""
    # Snapshot the matches first: the tree is mutated while we walk them.
    for svg in list(root.find_all("svg")):
        inside_figure = svg.find_parent("figure") is not None
        if not inside_figure:
            svg.decompose()
64
+
65
+
66
def _content_score(node: Tag) -> int:
    """Score a candidate content root by content density.

    The score is 3 per ``<p>``, 2 per ``<h1>``..``<h6>``, 1 per 100 characters
    of stripped text, minus 5 per anchor whose ``href`` contains any of
    ``ACTION_HREF_PARTS``.

    Parameters
    ----------
    node
        Candidate element to score.

    Returns
    -------
    int
        The score; it can be negative for action-link-heavy nodes.
    """
    paragraphs = len(node.find_all("p"))
    headings = len(node.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]))
    text_len = len(node.get_text(strip=True))

    # Use the shared constant so scoring and action-link removal always agree
    # on what counts as an action link.
    action_links = sum(
        1
        for a in node.find_all("a")
        if any(part in str(a.get("href", "")) for part in ACTION_HREF_PARTS)
    )

    return paragraphs * 3 + headings * 2 + text_len // 100 - action_links * 5
81
+
82
+
83
def parse_and_clean(html: str) -> BeautifulSoup | Tag:
    """Parse HTML, strip non-content nodes and return the best content root.

    Elements matching ``DROP_SELECTORS``, SVGs outside a ``<figure>``,
    elements hidden through an inline ``display:none`` or
    ``visibility:hidden`` style, action anchors and known noise text are
    removed.  Every ``<article>``, ``<main>`` and ``[role=main]`` element is
    then scored with ``_content_score`` and the highest-scoring one is
    returned; on a tie the earliest candidate wins (articles first, then
    ``<main>``, then ``[role=main]``).  Without any candidate the result is
    ``<body>``, or the whole document when there is no body.

    Parameters
    ----------
    html
        Raw HTML string to parse.

    Returns
    -------
    BeautifulSoup | Tag
        The cleaned root element ready for rendering.
    """
    soup = BeautifulSoup(html, "html.parser")

    for selector in DROP_SELECTORS:
        for unwanted in soup.select(selector):
            unwanted.decompose()

    _drop_non_figure_svgs(soup)

    # Snapshot the styled nodes first: the tree is mutated while we walk them.
    for styled in list(soup.find_all(style=True)):
        attributes = styled.attrs
        # NOTE(review): presumably guards nodes already destroyed together
        # with a decomposed ancestor - confirm.
        if attributes is None:
            continue
        # Only literal spaces are removed, so declarations separated by tabs
        # or newlines are not detected.
        compact = str(attributes.get("style", "")).replace(" ", "").lower()
        if "display:none" in compact or "visibility:hidden" in compact:
            styled.decompose()

    _drop_action_controls(soup)
    _drop_noise_text_nodes(soup)

    # Candidate order matters: ``max`` keeps the first of equally scored tags.
    candidates: list[Tag] = [*soup.find_all("article"), *soup.find_all("main")]
    for tag in soup.select("[role=main]"):
        if tag not in candidates:
            candidates.append(tag)

    if not candidates:
        return soup.body or soup
    return max(candidates, key=_content_score)