android-watcher 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. android_watcher/__init__.py +10 -0
  2. android_watcher/catalog/__init__.py +32 -0
  3. android_watcher/catalog/catalog.toml +531 -0
  4. android_watcher/cli.py +161 -0
  5. android_watcher/config.py +262 -0
  6. android_watcher/detect/__init__.py +1 -0
  7. android_watcher/detect/_normalize.py +192 -0
  8. android_watcher/detect/android_sitemap.py +540 -0
  9. android_watcher/detect/base.py +14 -0
  10. android_watcher/detect/content.py +99 -0
  11. android_watcher/detect/feed.py +135 -0
  12. android_watcher/detect/sitemap.py +203 -0
  13. android_watcher/doctor.py +125 -0
  14. android_watcher/fetch.py +162 -0
  15. android_watcher/group.py +79 -0
  16. android_watcher/lock.py +32 -0
  17. android_watcher/models.py +156 -0
  18. android_watcher/notify/__init__.py +1 -0
  19. android_watcher/notify/base.py +21 -0
  20. android_watcher/notify/email.py +52 -0
  21. android_watcher/notify/html.py +114 -0
  22. android_watcher/notify/render.py +239 -0
  23. android_watcher/notify/slack.py +124 -0
  24. android_watcher/notify/telegram.py +46 -0
  25. android_watcher/rank.py +84 -0
  26. android_watcher/registry.py +38 -0
  27. android_watcher/run.py +283 -0
  28. android_watcher/schedule.py +488 -0
  29. android_watcher/seed/__init__.py +45 -0
  30. android_watcher/seed/seed.sql.gz +0 -0
  31. android_watcher/store.py +492 -0
  32. android_watcher/triage/__init__.py +1 -0
  33. android_watcher/triage/base.py +25 -0
  34. android_watcher/triage/claude_cli.py +185 -0
  35. android_watcher/triage/noop.py +24 -0
  36. android_watcher/tui/__init__.py +1 -0
  37. android_watcher/tui/app.py +163 -0
  38. android_watcher/tui/configio.py +215 -0
  39. android_watcher/tui/screens.py +927 -0
  40. android_watcher-1.0.0.dist-info/METADATA +310 -0
  41. android_watcher-1.0.0.dist-info/RECORD +44 -0
  42. android_watcher-1.0.0.dist-info/WHEEL +4 -0
  43. android_watcher-1.0.0.dist-info/entry_points.txt +2 -0
  44. android_watcher-1.0.0.dist-info/licenses/LICENSE +21 -0
android_watcher/cli.py ADDED
@@ -0,0 +1,161 @@
1
+ """android-watcher CLI: argparse router."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import sys
7
+
8
+ from . import __version__
9
+ from .catalog import load_catalog
10
+ from .config import load_config
11
+ from .doctor import run_doctor
12
+ from .models import AlreadyRunning, ConfigError
13
+ from .notify.render import render_email
14
+ from .run import configure_file_logging, run_once
15
+ from .schedule import install_schedule, remove_schedule, schedule_status
16
+ from .tui.app import AndroidWatcher
17
+ from .tui.configio import load_or_default
18
+
19
+
20
def _cmd_catalog(args: argparse.Namespace) -> int:
    """Print one line per source returned by ``load_catalog()``.

    Each line shows the enabled flag, the source id, its detector and its
    category. ``args`` is not read; it is accepted so every sub-command
    handler shares the same signature. Always returns 0.
    """
    for entry in load_catalog():
        marker = "off"
        if entry.enabled:
            marker = "on "
        print(f"[{marker}] {entry.id:<28} {entry.detector:<16} {entry.category}")
    return 0
25
+
26
+
27
def _cmd_run(args: argparse.Namespace) -> int:
    """Load the configuration and execute one cycle through ``run_once``.

    Returns 1 when the configuration cannot be loaded. Returns 0 otherwise,
    including when ``run_once`` raises ``AlreadyRunning``.
    """
    try:
        cfg = load_config(args.config)
    except ConfigError as err:
        print(f"android-watcher: configuration error: {err}", file=sys.stderr)
        return 1

    log_target = configure_file_logging()
    print(f"android-watcher: running… (progress logged to {log_target})", file=sys.stderr)

    try:
        result = run_once(cfg, force=args.force)
    except AlreadyRunning:
        # A concurrent run is not treated as a failure.
        print("android-watcher: another run is in progress; exiting.")
    else:
        print(f"android-watcher: {result.change_count()} change(s) delivered.")
    return 0
42
+
43
+
44
def _cmd_test(args: argparse.Namespace) -> int:
    """Dry-run one cycle and print the plaintext digest to stdout.

    The doctor checks run first. Only failures of the ``ai-backend`` check or
    of checks whose name starts with ``channel`` affect the exit code; they
    are listed after the digest. Returns 1 on a configuration error or when
    any such check failed, otherwise 0.
    """
    try:
        cfg = load_config(args.config)
    except ConfigError as err:
        print(f"android-watcher: configuration error: {err}", file=sys.stderr)
        return 1

    configure_file_logging()

    blocking = []
    for check in run_doctor(cfg):
        if check.ok:
            continue
        if check.name == "ai-backend" or check.name.startswith("channel"):
            blocking.append(check)

    report = run_once(cfg, dry_run=True)
    _unused, body = render_email(report)
    print(body)

    for check in blocking:
        print(f"FAIL {check.name}: {check.detail}")
    return 1 if blocking else 0
63
+
64
+
65
def _cmd_doctor(args: argparse.Namespace) -> int:
    """Run every doctor check and print one status line per check.

    Returns 1 on a configuration error or if at least one check is not ok,
    otherwise 0.
    """
    try:
        cfg = load_config(args.config)
    except ConfigError as err:
        print(f"android-watcher: configuration error: {err}", file=sys.stderr)
        return 1

    print("Running checks (verifying the sitemap may take up to ~30s)…", file=sys.stderr)

    exit_code = 0
    for check in run_doctor(cfg):
        if check.ok:
            label = "OK "
        else:
            label = "FAIL"
            exit_code = 1
        print(f"{label} {check.name}: {check.detail}")
    return exit_code
80
+
81
+
82
def _cmd_schedule(args: argparse.Namespace) -> int:
    """Install, remove, or report on the scheduled job.

    ``args.action`` is one of ``install``, ``remove`` or ``status`` (the
    parser restricts the choices). For ``install`` the config comes from
    ``args.config`` when given, otherwise from ``load_or_default()``.

    Returns 0 on success. Returns 1 when the configuration cannot be loaded
    for ``install`` or when the ``status`` check is not ok.
    """
    if args.action == "install":
        # Report a bad config the same way run/test/doctor do instead of
        # letting ConfigError escape as a traceback.
        try:
            config = load_config(args.config) if args.config else load_or_default()[0]
        except ConfigError as exc:
            print(f"android-watcher: configuration error: {exc}", file=sys.stderr)
            return 1
        install_schedule(config)
        return 0
    if args.action == "remove":
        remove_schedule()
        return 0
    # status
    check = schedule_status()
    status = "OK " if check.ok else "FAIL"
    print(f"{status} {check.name}: {check.detail}")
    return 0 if check.ok else 1
95
+
96
+
97
def _cmd_tui(args: argparse.Namespace) -> int:
    """Launch the TUI; start in first-run mode when setup is incomplete.

    Setup counts as incomplete when no config file existed or when none of
    the email, slack or telegram channels is enabled. If the app's ``run()``
    returns a string it is printed. Always returns 0. ``args`` is not read.
    """
    cfg, file_present = load_or_default()
    if not file_present:
        # Fresh install: decide without touching the config fields.
        needs_wizard = True
    else:
        has_channel = cfg.email.enabled or cfg.slack.enabled or cfg.telegram.enabled
        needs_wizard = not has_channel

    outcome = AndroidWatcher(config=cfg, first_run=needs_wizard).run()
    if isinstance(outcome, str):
        print(outcome)
    return 0
109
+
110
+
111
def _build_parser() -> argparse.ArgumentParser:
    """Build the top-level parser with its sub-commands and handlers.

    With no sub-command the handler defaults to ``_cmd_tui``. Each
    sub-command sets ``func`` to its own ``_cmd_*`` handler, which ``main``
    then calls.
    """
    root = argparse.ArgumentParser(
        prog="android-watcher",
        description="Watch official Google Android sites and deliver a ranked digest.",
    )
    root.add_argument("--version", action="version", version=f"android-watcher {__version__}")
    root.add_argument(
        "--config",
        default=None,
        metavar="PATH",
        help="Path to config file (default: platform config dir).",
    )
    root.set_defaults(func=_cmd_tui)

    commands = root.add_subparsers(dest="command")

    def register(name, handler, help_text):
        # Create the sub-parser and bind its handler in one step.
        child = commands.add_parser(name, help=help_text)
        child.set_defaults(func=handler)
        return child

    run_cmd = register("run", _cmd_run, "Run one detection-triage-notify cycle.")
    run_cmd.add_argument(
        "--force",
        action="store_true",
        help="Run even if the last cycle is not yet due.",
    )

    register("test", _cmd_test, "Dry-run and render the current digest to stdout.")
    register("catalog", _cmd_catalog, "List configured sources from the catalog.")
    register("doctor", _cmd_doctor, "Run health checks.")

    schedule_cmd = register("schedule", _cmd_schedule, "Manage the native scheduled job.")
    schedule_cmd.add_argument(
        "action",
        choices=["install", "remove", "status"],
        help="Schedule action to perform.",
    )

    return root
156
+
157
+
158
def main(argv: list[str] | None = None) -> int:
    """Parse *argv* and dispatch to the selected handler.

    Passing ``None`` lets argparse read ``sys.argv``. Returns the handler's
    exit code.
    """
    namespace = _build_parser().parse_args(argv)
    handler = namespace.func
    return handler(namespace)
@@ -0,0 +1,262 @@
1
+ """Config dataclasses, path helpers, TOML loader, and env interpolation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import re
7
+ import tomllib
8
+ from dataclasses import dataclass, field
9
+ from typing import Any, Literal
10
+
11
+ import platformdirs
12
+
13
+ from .models import ConfigError, Source # ConfigError defined once in models.py
14
+
15
+ __all__ = [
16
+ "AIConfig",
17
+ "Config",
18
+ "ConfigError",
19
+ "DigestConfig",
20
+ "EmailChannel",
21
+ "ScheduleConfig",
22
+ "SlackChannel",
23
+ "TelegramChannel",
24
+ "config_path",
25
+ "data_path",
26
+ "db_path",
27
+ "load_config",
28
+ "log_path",
29
+ ]
30
+
31
+ APP_NAME = "android-watcher"
32
+ _ENV_RE = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}")
33
+ _VALID_INTERVALS = {"hourly", "daily", "weekly", "cron"}
34
+ _VALID_AI_MODES = {"claude_cli", "off"}
35
+ _VALID_EMPTY = {"send", "skip"}
36
+
37
+
38
@dataclass
class ScheduleConfig:
    """Schedule settings read from the ``[schedule]`` table."""

    interval: Literal["hourly", "daily", "weekly", "cron"] = "daily"
    # One or more HH:MM times, comma-separated.
    at: str = "09:00"
    # Weekly only: comma-separated weekday abbreviations (mon..sun).
    days: str = "mon"
    cron: str = ""
44
+
45
+
46
@dataclass
class AIConfig:
    """AI settings read from the ``[ai]`` table."""

    # Either "claude_cli" or "off".
    mode: Literal["claude_cli", "off"] = "claude_cli"
    model: str = "claude-sonnet-4-6"
50
+
51
+
52
@dataclass
class DigestConfig:
    """Digest settings read from the ``[digest]`` table."""

    max_items: int = 10
    # Either "send" or "skip".
    empty: Literal["send", "skip"] = "send"
56
+
57
+
58
@dataclass
class EmailChannel:
    """Email channel settings read from ``[channels.email]``."""

    enabled: bool = False
    smtp_host: str = ""
    smtp_port: int = 465
    username: str = ""
    password: str = ""
    # Maps to TOML key "from".
    sender: str = ""
    # Maps to TOML key "to".
    recipient: str = ""
67
+
68
+
69
@dataclass
class SlackChannel:
    """Slack channel settings read from ``[channels.slack]``."""

    enabled: bool = False
    # Secret; supports ${ENV_VAR}.
    bot_token: str = ""
    channel: str = ""
74
+
75
+
76
@dataclass
class TelegramChannel:
    """Telegram channel settings read from ``[channels.telegram]``."""

    enabled: bool = False
    # Secret; supports ${ENV_VAR}.
    bot_token: str = ""
    chat_id: str = ""
81
+
82
+
83
@dataclass
class Config:
    """The fully loaded configuration, as assembled by ``load_config``."""

    schedule: ScheduleConfig
    ai: AIConfig
    digest: DigestConfig
    sort: dict[str, int]
    email: EmailChannel
    slack: SlackChannel
    telegram: TelegramChannel
    custom_sources: list[Source]
    # A fresh empty set per instance when not supplied.
    enabled_source_ids: set[str] = field(default_factory=set)
94
+
95
+
96
def config_path() -> str:
    """Return the path of ``config.toml`` inside the platform user-config dir."""
    base_dir = platformdirs.user_config_dir(APP_NAME)
    return os.path.join(base_dir, "config.toml")
98
+
99
+
100
def data_path() -> str:
    """Return the platform user-data directory for this application."""
    data_dir = platformdirs.user_data_dir(APP_NAME)
    return data_dir
102
+
103
+
104
def db_path() -> str:
    """Return the path of ``state.db`` inside the data directory."""
    parent = data_path()
    return os.path.join(parent, "state.db")
106
+
107
+
108
def log_path() -> str:
    """Return the run-log path.

    On macOS this is ``~/Library/Logs/android-watcher.log``; elsewhere the
    file lives in the platform user-log directory.
    """
    import sys  # noqa: PLC0415 - localized; keeps module import graph lean

    filename = "android-watcher.log"
    if sys.platform != "darwin":
        return os.path.join(platformdirs.user_log_dir(APP_NAME), filename)
    return os.path.join(os.path.expanduser("~/Library/Logs"), filename)
115
+
116
+
117
+ def _interpolate(value: str, *, expand: bool) -> str:
118
+ """Resolve ${ENV} references in a secret-bearing field.
119
+
120
+ When expand=False, leave the literal untouched and never raise (the TUI
121
+ editor re-saves config without baking secrets into the file).
122
+ """
123
+ if not expand:
124
+ return value
125
+
126
+ def repl(match: re.Match[str]) -> str:
127
+ name = match.group(1)
128
+ if name not in os.environ:
129
+ raise ConfigError(f"config references undefined env var ${{{name}}}")
130
+ return os.environ[name]
131
+
132
+ return _ENV_RE.sub(repl, value)
133
+
134
+
135
def load_config(path: str | None = None, *, expand: bool = True) -> Config:
    """Load config from *path* (defaults to the platform config path).

    A missing file yields all defaults. Interpolation of ``${ENV_VAR}``
    applies only to secret-bearing fields (email password, slack bot_token,
    telegram bot_token) and only when *expand* is true.

    Raises:
        ConfigError: the file is not valid TOML (including a file that is not
            valid UTF-8), a section fails its own validation, or slack is
            enabled without both bot_token and channel.
    """
    target = path or config_path()
    try:
        with open(target, "rb") as fh:
            raw = tomllib.load(fh)
    except FileNotFoundError:
        raw = {}
    except (tomllib.TOMLDecodeError, UnicodeDecodeError) as exc:
        # tomllib.load decodes the bytes as UTF-8 before parsing, so a badly
        # encoded file raises UnicodeDecodeError rather than TOMLDecodeError.
        raise ConfigError(f"invalid TOML in {target}: {exc}") from exc

    # Interpolation is scoped to secret-bearing fields only (see _load_email /
    # _load_slack / _load_telegram). Every other string, including URLs, is
    # passed through verbatim so a literal "${" is never an error.
    schedule = _load_schedule(raw.get("schedule", {}))
    ai = _load_ai(raw.get("ai", {}))
    digest = _load_digest(raw.get("digest", {}))
    sort = _load_sort(raw.get("sort", {}))
    channels = raw.get("channels", {})
    email = _load_email(channels.get("email", {}), expand=expand)
    slack = _load_slack(channels.get("slack", {}), expand=expand)
    telegram = _load_telegram(channels.get("telegram", {}), expand=expand)
    custom_sources = [_load_source(e) for e in raw.get("custom_source", [])]
    enabled = set(raw.get("enabled_sources", []))

    if slack.enabled and not (slack.bot_token and slack.channel):
        raise ConfigError("slack channel enabled but bot_token and channel are required")

    return Config(
        schedule=schedule,
        ai=ai,
        digest=digest,
        sort=sort,
        email=email,
        slack=slack,
        telegram=telegram,
        custom_sources=custom_sources,
        enabled_source_ids=enabled,
    )
180
+
181
+
182
def _load_schedule(d: dict[str, Any]) -> ScheduleConfig:
    """Build a ``ScheduleConfig`` from the ``[schedule]`` table.

    Raises:
        ConfigError: the interval is not a recognised value, the interval is
            ``cron`` with no cron expression, or a cron expression is given
            with a different interval.
    """
    chosen = d.get("interval", "daily")
    if chosen not in _VALID_INTERVALS:
        raise ConfigError(
            f"schedule.interval must be one of {sorted(_VALID_INTERVALS)}, got {chosen!r}"
        )

    expression = d.get("cron", "")
    wants_cron = chosen == "cron"
    if wants_cron and not expression:
        raise ConfigError("schedule.interval = 'cron' requires a non-empty schedule.cron")
    if expression and not wants_cron:
        raise ConfigError(
            f"schedule.cron is set but interval is {chosen!r}; "
            "set interval = 'cron' or clear cron"
        )

    return ScheduleConfig(
        interval=chosen,
        at=d.get("at", "09:00"),
        days=d.get("days", "mon"),
        cron=expression,
    )
199
+
200
+
201
def _load_ai(d: dict[str, Any]) -> AIConfig:
    """Build an ``AIConfig`` from the ``[ai]`` table.

    Raises:
        ConfigError: ``mode`` is not one of the accepted values.
    """
    selected = d.get("mode", "claude_cli")
    if selected in _VALID_AI_MODES:
        return AIConfig(mode=selected, model=d.get("model", "claude-sonnet-4-6"))
    raise ConfigError(f"ai.mode must be one of {sorted(_VALID_AI_MODES)}, got {selected!r}")
206
+
207
+
208
def _load_digest(d: dict[str, Any]) -> DigestConfig:
    """Build a ``DigestConfig`` from the ``[digest]`` table.

    Raises:
        ConfigError: ``empty`` is not one of the accepted values, or
            ``max_items`` cannot be converted to an integer.
    """
    empty = d.get("empty", "send")
    if empty not in _VALID_EMPTY:
        raise ConfigError(f"digest.empty must be one of {sorted(_VALID_EMPTY)}, got {empty!r}")

    raw_max = d.get("max_items", 10)
    try:
        max_items = int(raw_max)
    except (TypeError, ValueError) as exc:
        # Surface a bad value as ConfigError, which the CLI reports cleanly,
        # rather than leaking a bare ValueError/TypeError.
        raise ConfigError(f"digest.max_items must be an integer, got {raw_max!r}") from exc

    return DigestConfig(
        max_items=max_items,
        empty=empty,
    )
216
+
217
+
218
+ def _load_sort(d: dict[str, Any]) -> dict[str, int]:
219
+ return {str(k): int(v) for k, v in d.items()}
220
+
221
+
222
def _load_email(d: dict[str, Any], *, expand: bool) -> EmailChannel:
    """Build an ``EmailChannel`` from the ``[channels.email]`` table.

    The TOML keys ``from`` and ``to`` map to ``sender`` and ``recipient``.
    Only ``password`` goes through ``${ENV_VAR}`` interpolation.

    Raises:
        ConfigError: ``smtp_port`` cannot be converted to an integer, or
            interpolation of ``password`` fails.
    """
    raw_port = d.get("smtp_port", 465)
    try:
        smtp_port = int(raw_port)
    except (TypeError, ValueError) as exc:
        # Surface a bad value as ConfigError rather than a bare ValueError.
        raise ConfigError(
            f"channels.email.smtp_port must be an integer, got {raw_port!r}"
        ) from exc

    return EmailChannel(
        enabled=bool(d.get("enabled", False)),
        smtp_host=d.get("smtp_host", ""),
        smtp_port=smtp_port,
        username=d.get("username", ""),
        password=_interpolate(d.get("password", ""), expand=expand),  # secret
        sender=d.get("from", ""),
        recipient=d.get("to", ""),
    )
232
+
233
+
234
def _load_slack(d: dict[str, Any], *, expand: bool) -> SlackChannel:
    """Build a ``SlackChannel`` from the ``[channels.slack]`` table.

    Only ``bot_token`` goes through ``${ENV_VAR}`` interpolation.
    """
    is_enabled = bool(d.get("enabled", False))
    token = _interpolate(d.get("bot_token", ""), expand=expand)  # secret
    return SlackChannel(enabled=is_enabled, bot_token=token, channel=d.get("channel", ""))
240
+
241
+
242
def _load_telegram(d: dict[str, Any], *, expand: bool) -> TelegramChannel:
    """Build a ``TelegramChannel`` from the ``[channels.telegram]`` table.

    Only ``bot_token`` goes through ``${ENV_VAR}`` interpolation.
    """
    is_enabled = bool(d.get("enabled", False))
    token = _interpolate(d.get("bot_token", ""), expand=expand)  # secret
    return TelegramChannel(enabled=is_enabled, bot_token=token, chat_id=d.get("chat_id", ""))
248
+
249
+
250
def _load_source(e: dict[str, Any]) -> Source:
    """Build a ``Source`` from one ``[[custom_source]]`` entry.

    ``id``, ``name``, ``category``, ``detector`` and ``url`` are required;
    the remaining fields fall back to defaults.

    Raises:
        ConfigError: one or more required keys are missing.
    """
    required = ("id", "name", "category", "detector", "url")
    missing = [key for key in required if key not in e]
    if missing:
        # Report a config problem as ConfigError instead of a bare KeyError.
        raise ConfigError(
            f"custom_source entry is missing required key(s): {', '.join(missing)}"
        )
    return Source(
        id=e["id"],
        name=e["name"],
        category=e["category"],
        detector=e["detector"],
        url=e["url"],
        enabled=e.get("enabled", True),
        path_prefix=e.get("path_prefix", ""),
        feed_url=e.get("feed_url", ""),
        content_selector=e.get("content_selector", ""),
        default_weight=e.get("default_weight", 0),
    )
@@ -0,0 +1 @@
1
+ from . import android_sitemap, content, feed, sitemap # noqa: F401
@@ -0,0 +1,192 @@
1
+ """Content normalisation helpers shared across detectors.
2
+
3
+ Uses only stdlib (``html.parser``) — no third-party HTML library required.
4
+ Later detectors (sitemap content-confirm, etc.) import from here.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import hashlib
10
+ import re
11
+ from html.parser import HTMLParser
12
+
13
+ # A page whose normalised main-text falls below this character count is treated
14
+ # as a client-side render shell. The content detector refuses to baseline it
15
+ # and emits no Change; ``doctor`` surfaces the condition separately.
16
+ EMPTY_RENDER_THRESHOLD: int = 50
17
+
18
+ # Tags whose text content we discard entirely.
19
+ _SKIP_TAGS: frozenset[str] = frozenset({"script", "style", "noscript", "template"})
20
+
21
+
22
+ class _TextExtractor(HTMLParser):
23
+ """Walk HTML and collect visible text, discarding script/style."""
24
+
25
+ def __init__(self) -> None:
26
+ super().__init__(convert_charrefs=True)
27
+ self._skip_depth: int = 0
28
+ self._parts: list[str] = []
29
+
30
+ def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
31
+ if tag in _SKIP_TAGS:
32
+ self._skip_depth += 1
33
+
34
+ def handle_endtag(self, tag: str) -> None:
35
+ if tag in _SKIP_TAGS and self._skip_depth > 0:
36
+ self._skip_depth -= 1
37
+
38
+ def handle_data(self, data: str) -> None:
39
+ if self._skip_depth == 0:
40
+ stripped = data.strip()
41
+ if stripped:
42
+ self._parts.append(stripped)
43
+
44
+ def text(self) -> str:
45
+ return " ".join(self._parts)
46
+
47
+
48
+ class _SelectorExtractor(HTMLParser):
49
+ """Extract the inner HTML of the first element matching a simple selector.
50
+
51
+ Supported selector forms (the subset needed by this project):
52
+ - ``#id`` — matches the first element with that id attribute.
53
+ - ``tag`` — matches the first element with that tag name.
54
+ - ``""`` — no selector; returns the full HTML unchanged (caller should
55
+ pass the raw HTML back to ``_TextExtractor``).
56
+ """
57
+
58
+ def __init__(self, selector: str) -> None:
59
+ super().__init__(convert_charrefs=False)
60
+ self._selector = selector.strip()
61
+ self._match_id: str | None = None
62
+ self._match_tag: str | None = None
63
+
64
+ if self._selector.startswith("#"):
65
+ self._match_id = self._selector[1:]
66
+ elif self._selector:
67
+ self._match_tag = self._selector
68
+
69
+ self._depth: int = 0 # nesting depth inside the matched element
70
+ self._capturing: bool = False
71
+ self._found: bool = False
72
+ self._raw_parts: list[str] = []
73
+
74
+ def _matches(self, tag: str, attrs: list[tuple[str, str | None]]) -> bool:
75
+ if self._match_id is not None:
76
+ attr_dict = dict(attrs)
77
+ return attr_dict.get("id") == self._match_id
78
+ if self._match_tag is not None:
79
+ return tag == self._match_tag
80
+ return False
81
+
82
+ def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
83
+ if self._found:
84
+ return
85
+ if self._capturing:
86
+ self._depth += 1
87
+ # Re-serialise the opening tag so inner elements are kept.
88
+ attr_str = "".join(f' {k}="{v}"' if v is not None else f" {k}" for k, v in attrs)
89
+ self._raw_parts.append(f"<{tag}{attr_str}>")
90
+ return
91
+ if self._matches(tag, attrs):
92
+ self._capturing = True
93
+ self._depth = 1
94
+
95
+ def handle_endtag(self, tag: str) -> None:
96
+ if not self._capturing or self._found:
97
+ return
98
+ self._depth -= 1
99
+ if self._depth == 0:
100
+ self._capturing = False
101
+ self._found = True
102
+ else:
103
+ self._raw_parts.append(f"</{tag}>")
104
+
105
+ def handle_data(self, data: str) -> None:
106
+ if self._capturing and not self._found:
107
+ self._raw_parts.append(data)
108
+
109
+ def handle_entityref(self, name: str) -> None: # type: ignore[override]
110
+ if self._capturing and not self._found:
111
+ self._raw_parts.append(f"&{name};")
112
+
113
+ def handle_charref(self, name: str) -> None: # type: ignore[override]
114
+ if self._capturing and not self._found:
115
+ self._raw_parts.append(f"&#{name};")
116
+
117
+ def fragment(self) -> str | None:
118
+ """Return the captured inner HTML, or None if no match was found."""
119
+ return "".join(self._raw_parts) if self._found else None
120
+
121
+
122
def extract_main(html: str, selector: str = "") -> str:
    """Return the inner HTML selected by *selector*, else *html* itself.

    Selector forms: ``#id``, ``tagname``, or ``""`` (whole document). The
    full *html* is returned when the selector is empty, matches nothing, or
    yields an empty fragment, so the caller always has something to
    normalise.
    """
    if selector:
        finder = _SelectorExtractor(selector)
        finder.feed(html)
        captured = finder.fragment()
        if captured:
            return captured
    return html
134
+
135
+
136
def normalize_text(html_fragment: str) -> str:
    """Strip all markup and attributes; collapse whitespace to single spaces.

    Two fragments that differ only in CSS classes, random data attributes, or
    extra whitespace will produce the same normalised string and therefore the
    same :func:`content_hash`.
    """
    parser = _TextExtractor()
    parser.feed(html_fragment)
    # With convert_charrefs=True the parser holds back trailing text that may
    # end in a partial character reference (e.g. "... R&D") until it is told
    # the input is complete. Without close() that tail is silently dropped.
    parser.close()
    text = parser.text()
    # Collapse any remaining internal whitespace runs.
    return re.sub(r"\s+", " ", text).strip()
148
+
149
+
150
def content_hash(text: str) -> str:
    """Return the hex SHA-256 digest of *text* encoded as UTF-8."""
    digest = hashlib.sha256()
    digest.update(text.encode())
    return digest.hexdigest()
153
+
154
+
155
+ class _TitleExtractor(HTMLParser):
156
+ """Capture the text inside the first <title> element."""
157
+
158
+ def __init__(self) -> None:
159
+ super().__init__(convert_charrefs=True)
160
+ self._in = False
161
+ self._done = False
162
+ self._parts: list[str] = []
163
+
164
+ def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
165
+ if tag == "title" and not self._done:
166
+ self._in = True
167
+
168
+ def handle_endtag(self, tag: str) -> None:
169
+ if tag == "title" and self._in:
170
+ self._in = False
171
+ self._done = True
172
+
173
+ def handle_data(self, data: str) -> None:
174
+ if self._in:
175
+ self._parts.append(data)
176
+
177
+ def title(self) -> str:
178
+ return re.sub(r"\s+", " ", "".join(self._parts)).strip()
179
+
180
+
181
def extract_title(html: str) -> str:
    """Return the page's <title> text (whitespace-collapsed), or "" if absent.

    Any exception raised while parsing is treated as "no title" and yields
    "", so a parse failure never propagates to the caller.
    """
    finder = _TitleExtractor()
    try:
        finder.feed(html)
    except Exception:  # noqa: BLE001 - binary/garbage HTML => no title
        return ""
    else:
        return finder.title()