android-watcher 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- android_watcher/__init__.py +10 -0
- android_watcher/catalog/__init__.py +32 -0
- android_watcher/catalog/catalog.toml +531 -0
- android_watcher/cli.py +161 -0
- android_watcher/config.py +262 -0
- android_watcher/detect/__init__.py +1 -0
- android_watcher/detect/_normalize.py +192 -0
- android_watcher/detect/android_sitemap.py +540 -0
- android_watcher/detect/base.py +14 -0
- android_watcher/detect/content.py +99 -0
- android_watcher/detect/feed.py +135 -0
- android_watcher/detect/sitemap.py +203 -0
- android_watcher/doctor.py +125 -0
- android_watcher/fetch.py +162 -0
- android_watcher/group.py +79 -0
- android_watcher/lock.py +32 -0
- android_watcher/models.py +156 -0
- android_watcher/notify/__init__.py +1 -0
- android_watcher/notify/base.py +21 -0
- android_watcher/notify/email.py +52 -0
- android_watcher/notify/html.py +114 -0
- android_watcher/notify/render.py +239 -0
- android_watcher/notify/slack.py +124 -0
- android_watcher/notify/telegram.py +46 -0
- android_watcher/rank.py +84 -0
- android_watcher/registry.py +38 -0
- android_watcher/run.py +283 -0
- android_watcher/schedule.py +488 -0
- android_watcher/seed/__init__.py +45 -0
- android_watcher/seed/seed.sql.gz +0 -0
- android_watcher/store.py +492 -0
- android_watcher/triage/__init__.py +1 -0
- android_watcher/triage/base.py +25 -0
- android_watcher/triage/claude_cli.py +185 -0
- android_watcher/triage/noop.py +24 -0
- android_watcher/tui/__init__.py +1 -0
- android_watcher/tui/app.py +163 -0
- android_watcher/tui/configio.py +215 -0
- android_watcher/tui/screens.py +927 -0
- android_watcher-1.0.0.dist-info/METADATA +310 -0
- android_watcher-1.0.0.dist-info/RECORD +44 -0
- android_watcher-1.0.0.dist-info/WHEEL +4 -0
- android_watcher-1.0.0.dist-info/entry_points.txt +2 -0
- android_watcher-1.0.0.dist-info/licenses/LICENSE +21 -0
android_watcher/cli.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
"""android-watcher CLI: argparse router."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
from . import __version__
|
|
9
|
+
from .catalog import load_catalog
|
|
10
|
+
from .config import load_config
|
|
11
|
+
from .doctor import run_doctor
|
|
12
|
+
from .models import AlreadyRunning, ConfigError
|
|
13
|
+
from .notify.render import render_email
|
|
14
|
+
from .run import configure_file_logging, run_once
|
|
15
|
+
from .schedule import install_schedule, remove_schedule, schedule_status
|
|
16
|
+
from .tui.app import AndroidWatcher
|
|
17
|
+
from .tui.configio import load_or_default
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _cmd_catalog(args: argparse.Namespace) -> int:
    """Print one line per catalog source and return exit status 0.

    Each line shows an on/off flag followed by the source's id, detector and
    category. *args* is accepted for handler-signature uniformity and unused.
    """
    for entry in load_catalog():
        flag = "off"
        if entry.enabled:
            flag = "on "
        print(f"[{flag}] {entry.id:<28} {entry.detector:<16} {entry.category}")
    return 0
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _cmd_run(args: argparse.Namespace) -> int:
    """Handle ``run``: load the config and execute one cycle.

    Returns 1 when the configuration cannot be loaded, otherwise 0 — including
    when another run already holds the lock (``AlreadyRunning``).
    """
    try:
        cfg = load_config(args.config)
    except ConfigError as err:
        print(f"android-watcher: configuration error: {err}", file=sys.stderr)
        return 1

    log_target = configure_file_logging()
    print(f"android-watcher: running… (progress logged to {log_target})", file=sys.stderr)

    try:
        outcome = run_once(cfg, force=args.force)
    except AlreadyRunning:
        print("android-watcher: another run is in progress; exiting.")
    else:
        # Only reached when run_once returned normally.
        print(f"android-watcher: {outcome.change_count()} change(s) delivered.")
    return 0
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _cmd_test(args: argparse.Namespace) -> int:
    """Handle ``test``: dry-run one cycle and print the rendered digest.

    Doctor checks run first; only failures of the ``ai-backend`` check or of
    checks whose name starts with ``channel`` affect the exit status. The
    digest is printed regardless, followed by any such failures.

    Returns 1 on configuration error or blocking check failure, else 0.
    """
    try:
        cfg = load_config(args.config)
    except ConfigError as err:
        print(f"android-watcher: configuration error: {err}", file=sys.stderr)
        return 1

    configure_file_logging()

    blocking = []
    for check in run_doctor(cfg):
        if check.ok:
            continue
        if check.name == "ai-backend" or check.name.startswith("channel"):
            blocking.append(check)

    preview = run_once(cfg, dry_run=True)
    _unused, body = render_email(preview)
    print(body)

    if not blocking:
        return 0
    for check in blocking:
        print(f"FAIL {check.name}: {check.detail}")
    return 1
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _cmd_doctor(args: argparse.Namespace) -> int:
    """Handle ``doctor``: run every health check and print one line per check.

    Returns 1 when the configuration cannot be loaded or when at least one
    check is not ok; otherwise 0.
    """
    try:
        cfg = load_config(args.config)
    except ConfigError as err:
        print(f"android-watcher: configuration error: {err}", file=sys.stderr)
        return 1

    print("Running checks (verifying the sitemap may take up to ~30s)…", file=sys.stderr)

    failures = 0
    for check in run_doctor(cfg):
        if check.ok:
            label = "OK "
        else:
            label = "FAIL"
            failures += 1
        print(f"{label} {check.name}: {check.detail}")

    if failures:
        return 1
    return 0
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _cmd_schedule(args: argparse.Namespace) -> int:
    """Handle ``schedule install|remove|status``.

    ``install`` loads the config from ``--config`` when given, otherwise from
    ``load_or_default()``, and installs the scheduled job. ``remove`` removes
    it. Any other action (``status``) prints the schedule check.

    Returns 0 on success; 1 when the configuration cannot be loaded or when
    the status check is not ok.
    """
    if args.action == "install":
        # Report a broken config the same way run/test/doctor do instead of
        # letting ConfigError escape as a traceback.
        try:
            config = load_config(args.config) if args.config else load_or_default()[0]
        except ConfigError as exc:
            print(f"android-watcher: configuration error: {exc}", file=sys.stderr)
            return 1
        install_schedule(config)
        return 0
    if args.action == "remove":
        remove_schedule()
        return 0
    # status
    check = schedule_status()
    status = "OK " if check.ok else "FAIL"
    print(f"{status} {check.name}: {check.detail}")
    return 0 if check.ok else 1
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _cmd_tui(args: argparse.Namespace) -> int:
    """Launch the TUI (the default command) and return 0.

    The wizard starts from the beginning when no config file existed, or when
    none of the email/slack/telegram channels is enabled — configuration is
    incomplete until a delivery channel is set. A string result from the app
    is printed.
    """
    config, existed = load_or_default()
    if not existed:
        # Fresh install: decide before touching any config fields.
        first_run = True
    else:
        has_channel = config.email.enabled or config.slack.enabled or config.telegram.enabled
        first_run = not has_channel

    app = AndroidWatcher(config=config, first_run=first_run)
    outcome = app.run()
    if isinstance(outcome, str):
        print(outcome)
    return 0
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _build_parser() -> argparse.ArgumentParser:
    """Build the top-level parser with its sub-commands.

    With no sub-command the handler defaults to the TUI; every sub-command
    stores its own handler under ``func``.
    """
    root = argparse.ArgumentParser(
        prog="android-watcher",
        description="Watch official Google Android sites and deliver a ranked digest.",
    )
    root.add_argument("--version", action="version", version=f"android-watcher {__version__}")
    root.add_argument(
        "--config",
        default=None,
        metavar="PATH",
        help="Path to config file (default: platform config dir).",
    )
    root.set_defaults(func=_cmd_tui)

    commands = root.add_subparsers(dest="command")

    def register(name, handler, help_text):
        # Create the sub-parser and bind its handler in one step.
        child = commands.add_parser(name, help=help_text)
        child.set_defaults(func=handler)
        return child

    run_cmd = register("run", _cmd_run, "Run one detection-triage-notify cycle.")
    run_cmd.add_argument(
        "--force",
        action="store_true",
        help="Run even if the last cycle is not yet due.",
    )

    register("test", _cmd_test, "Dry-run and render the current digest to stdout.")
    register("catalog", _cmd_catalog, "List configured sources from the catalog.")
    register("doctor", _cmd_doctor, "Run health checks.")

    schedule_cmd = register("schedule", _cmd_schedule, "Manage the native scheduled job.")
    schedule_cmd.add_argument(
        "action",
        choices=["install", "remove", "status"],
        help="Schedule action to perform.",
    )

    return root
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse *argv* and dispatch to the selected handler.

    Returns the handler's integer exit status.
    """
    namespace = _build_parser().parse_args(argv)
    handler = namespace.func
    return handler(namespace)
|
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
"""Config dataclasses, path helpers, TOML loader, and env interpolation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
import tomllib
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
from typing import Any, Literal
|
|
10
|
+
|
|
11
|
+
import platformdirs
|
|
12
|
+
|
|
13
|
+
from .models import ConfigError, Source # ConfigError defined once in models.py
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"AIConfig",
|
|
17
|
+
"Config",
|
|
18
|
+
"ConfigError",
|
|
19
|
+
"DigestConfig",
|
|
20
|
+
"EmailChannel",
|
|
21
|
+
"ScheduleConfig",
|
|
22
|
+
"SlackChannel",
|
|
23
|
+
"TelegramChannel",
|
|
24
|
+
"config_path",
|
|
25
|
+
"data_path",
|
|
26
|
+
"db_path",
|
|
27
|
+
"load_config",
|
|
28
|
+
"log_path",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
APP_NAME = "android-watcher"
|
|
32
|
+
_ENV_RE = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}")
|
|
33
|
+
_VALID_INTERVALS = {"hourly", "daily", "weekly", "cron"}
|
|
34
|
+
_VALID_AI_MODES = {"claude_cli", "off"}
|
|
35
|
+
_VALID_EMPTY = {"send", "skip"}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class ScheduleConfig:
    """Settings from the ``[schedule]`` table."""

    interval: Literal["hourly", "daily", "weekly", "cron"] = "daily"
    # One or more HH:MM times, comma-separated.
    at: str = "09:00"
    # Weekly only: comma-separated weekday abbreviations (mon..sun).
    days: str = "mon"
    cron: str = ""
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class AIConfig:
    """Settings from the ``[ai]`` table: backend mode and model name."""

    mode: Literal["claude_cli", "off"] = "claude_cli"
    model: str = "claude-sonnet-4-6"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass
class DigestConfig:
    """Settings from the ``[digest]`` table."""

    max_items: int = 10
    empty: Literal["send", "skip"] = "send"
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass
class EmailChannel:
    """Settings from the ``[channels.email]`` table."""

    enabled: bool = False
    smtp_host: str = ""
    smtp_port: int = 465
    username: str = ""
    password: str = ""
    # Maps to TOML key "from".
    sender: str = ""
    # Maps to TOML key "to".
    recipient: str = ""
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@dataclass
class SlackChannel:
    """Settings from the ``[channels.slack]`` table."""

    enabled: bool = False
    # Secret; supports ${ENV_VAR}.
    bot_token: str = ""
    channel: str = ""
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@dataclass
class TelegramChannel:
    """Settings from the ``[channels.telegram]`` table."""

    enabled: bool = False
    # Secret; supports ${ENV_VAR}.
    bot_token: str = ""
    chat_id: str = ""
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@dataclass
class Config:
    """Aggregate of every configuration section.

    All fields are required except ``enabled_source_ids``, which defaults to
    a fresh empty set per instance.
    """

    schedule: ScheduleConfig
    ai: AIConfig
    digest: DigestConfig
    sort: dict[str, int]
    email: EmailChannel
    slack: SlackChannel
    telegram: TelegramChannel
    custom_sources: list[Source]
    enabled_source_ids: set[str] = field(default_factory=set)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def config_path() -> str:
    """Return the path of ``config.toml`` inside the platform config dir."""
    base_dir = platformdirs.user_config_dir(APP_NAME)
    return os.path.join(base_dir, "config.toml")
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def data_path() -> str:
    """Return the platform user-data directory for this application."""
    directory = platformdirs.user_data_dir(APP_NAME)
    return directory
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def db_path() -> str:
    """Return the path of ``state.db`` inside the data directory."""
    data_dir = data_path()
    return os.path.join(data_dir, "state.db")
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def log_path() -> str:
    """Return the run-log path.

    On macOS this is ``~/Library/Logs/android-watcher.log``; elsewhere the
    file lives in the platform log directory.
    """
    import sys  # noqa: PLC0415 - localized; keeps module import graph lean

    if sys.platform == "darwin":
        directory = os.path.expanduser("~/Library/Logs")
    else:
        directory = platformdirs.user_log_dir(APP_NAME)
    return os.path.join(directory, "android-watcher.log")
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _interpolate(value: str, *, expand: bool) -> str:
    """Substitute ``${NAME}`` references in *value* from the environment.

    With ``expand=False`` the string is returned untouched and nothing is
    raised, so the TUI editor can re-save a config without baking secrets
    into the file.

    Raises:
        ConfigError: a referenced variable is not set (only when expanding).
    """
    if expand:

        def lookup(found: re.Match[str]) -> str:
            var = found.group(1)
            resolved = os.environ.get(var)
            if resolved is None:
                raise ConfigError(f"config references undefined env var ${{{var}}}")
            return resolved

        return _ENV_RE.sub(lookup, value)
    return value
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def load_config(path: str | None = None, *, expand: bool = True) -> Config:
    """Read the TOML config at *path* (default: the platform config path).

    A missing file yields an all-defaults config. ``${ENV_VAR}`` interpolation
    is applied only to the secret-bearing fields (email password, slack and
    telegram bot_token), and only when *expand* is true; every other string,
    URLs included, passes through verbatim so a literal "${" is never an error.

    Raises:
        ConfigError: invalid TOML, a contradictory schedule, or slack enabled
            without both bot_token and channel.
    """
    target = path or config_path()
    try:
        with open(target, "rb") as handle:
            document = tomllib.load(handle)
    except FileNotFoundError:
        document = {}
    except tomllib.TOMLDecodeError as exc:
        raise ConfigError(f"invalid TOML in {target}: {exc}") from exc

    # Sections are loaded in a fixed order, so the first invalid one reports.
    schedule_cfg = _load_schedule(document.get("schedule", {}))
    ai_cfg = _load_ai(document.get("ai", {}))
    digest_cfg = _load_digest(document.get("digest", {}))
    sort_cfg = _load_sort(document.get("sort", {}))

    channel_tables = document.get("channels", {})
    email_cfg = _load_email(channel_tables.get("email", {}), expand=expand)
    slack_cfg = _load_slack(channel_tables.get("slack", {}), expand=expand)
    telegram_cfg = _load_telegram(channel_tables.get("telegram", {}), expand=expand)

    sources = []
    for entry in document.get("custom_source", []):
        sources.append(_load_source(entry))
    enabled_ids = set(document.get("enabled_sources", []))

    if slack_cfg.enabled and (not slack_cfg.bot_token or not slack_cfg.channel):
        raise ConfigError("slack channel enabled but bot_token and channel are required")

    return Config(
        schedule=schedule_cfg,
        ai=ai_cfg,
        digest=digest_cfg,
        sort=sort_cfg,
        email=email_cfg,
        slack=slack_cfg,
        telegram=telegram_cfg,
        custom_sources=sources,
        enabled_source_ids=enabled_ids,
    )
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _load_schedule(d: dict[str, Any]) -> ScheduleConfig:
    """Build a ScheduleConfig from the ``[schedule]`` table.

    Raises:
        ConfigError: unknown interval, ``interval = 'cron'`` with no cron
            expression, or a cron expression with a non-cron interval.
    """
    interval = d.get("interval", "daily")
    if interval not in _VALID_INTERVALS:
        raise ConfigError(
            f"schedule.interval must be one of {sorted(_VALID_INTERVALS)}, got {interval!r}"
        )

    cron_expr = d.get("cron", "")
    wants_cron = interval == "cron"
    if wants_cron and not cron_expr:
        raise ConfigError("schedule.interval = 'cron' requires a non-empty schedule.cron")
    if cron_expr and not wants_cron:
        raise ConfigError(
            f"schedule.cron is set but interval is {interval!r}; "
            "set interval = 'cron' or clear cron"
        )

    at_times = d.get("at", "09:00")
    weekdays = d.get("days", "mon")
    return ScheduleConfig(interval=interval, at=at_times, days=weekdays, cron=cron_expr)
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def _load_ai(d: dict[str, Any]) -> AIConfig:
    """Build an AIConfig from the ``[ai]`` table.

    Raises:
        ConfigError: ``mode`` is not one of the valid AI modes.
    """
    mode = d.get("mode", "claude_cli")
    if mode in _VALID_AI_MODES:
        model_name = d.get("model", "claude-sonnet-4-6")
        return AIConfig(mode=mode, model=model_name)
    raise ConfigError(f"ai.mode must be one of {sorted(_VALID_AI_MODES)}, got {mode!r}")
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def _load_digest(d: dict[str, Any]) -> DigestConfig:
    """Build a DigestConfig from the ``[digest]`` table.

    Raises:
        ConfigError: ``empty`` is not a valid choice, or ``max_items`` cannot
            be converted to an integer.
    """
    empty = d.get("empty", "send")
    if empty not in _VALID_EMPTY:
        raise ConfigError(f"digest.empty must be one of {sorted(_VALID_EMPTY)}, got {empty!r}")

    raw_max = d.get("max_items", 10)
    try:
        max_items = int(raw_max)
    except (TypeError, ValueError) as exc:
        # Surface bad values as ConfigError so the CLI reports them instead of
        # crashing with a bare ValueError/TypeError traceback.
        raise ConfigError(f"digest.max_items must be an integer, got {raw_max!r}") from exc

    return DigestConfig(
        max_items=max_items,
        empty=empty,
    )
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def _load_sort(d: dict[str, Any]) -> dict[str, int]:
    """Convert the ``[sort]`` table into a ``{str: int}`` mapping.

    Raises:
        ConfigError: a value cannot be converted to an integer.
    """
    weights: dict[str, int] = {}
    for key, value in d.items():
        try:
            weights[str(key)] = int(value)
        except (TypeError, ValueError) as exc:
            # Name the offending key; a bare ValueError would bypass the
            # CLI's ConfigError reporting.
            raise ConfigError(f"sort.{key} must be an integer, got {value!r}") from exc
    return weights
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _load_email(d: dict[str, Any], *, expand: bool) -> EmailChannel:
    """Build an EmailChannel from the ``[channels.email]`` table.

    The TOML keys ``from`` and ``to`` map to ``sender`` and ``recipient``.
    Only ``password`` goes through ``${ENV_VAR}`` interpolation.

    Raises:
        ConfigError: ``smtp_port`` cannot be converted to an integer, or the
            password references an undefined env var (when *expand* is true).
    """
    raw_port = d.get("smtp_port", 465)
    try:
        smtp_port = int(raw_port)
    except (TypeError, ValueError) as exc:
        # Keep invalid ports inside the ConfigError contract the CLI handles.
        raise ConfigError(
            f"channels.email.smtp_port must be an integer, got {raw_port!r}"
        ) from exc

    return EmailChannel(
        enabled=bool(d.get("enabled", False)),
        smtp_host=d.get("smtp_host", ""),
        smtp_port=smtp_port,
        username=d.get("username", ""),
        password=_interpolate(d.get("password", ""), expand=expand),  # secret
        sender=d.get("from", ""),
        recipient=d.get("to", ""),
    )
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def _load_slack(d: dict[str, Any], *, expand: bool) -> SlackChannel:
    """Build a SlackChannel from the ``[channels.slack]`` table.

    Only ``bot_token`` (a secret) goes through ``${ENV_VAR}`` interpolation.
    """
    is_enabled = bool(d.get("enabled", False))
    token = _interpolate(d.get("bot_token", ""), expand=expand)  # secret
    target_channel = d.get("channel", "")
    return SlackChannel(enabled=is_enabled, bot_token=token, channel=target_channel)
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def _load_telegram(d: dict[str, Any], *, expand: bool) -> TelegramChannel:
    """Build a TelegramChannel from the ``[channels.telegram]`` table.

    Only ``bot_token`` (a secret) goes through ``${ENV_VAR}`` interpolation.
    """
    is_enabled = bool(d.get("enabled", False))
    token = _interpolate(d.get("bot_token", ""), expand=expand)  # secret
    chat = d.get("chat_id", "")
    return TelegramChannel(enabled=is_enabled, bot_token=token, chat_id=chat)
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def _load_source(e: dict[str, Any]) -> Source:
    """Build a Source from one ``[[custom_source]]`` entry.

    ``id``, ``name``, ``category``, ``detector`` and ``url`` are required;
    the remaining keys fall back to defaults.

    Raises:
        ConfigError: one or more required keys are missing.
    """
    required = ("id", "name", "category", "detector", "url")
    missing = [key for key in required if key not in e]
    if missing:
        # A raw KeyError would bypass the CLI's ConfigError handling and hide
        # which entry is incomplete.
        raise ConfigError(
            f"custom_source entry is missing required key(s): {', '.join(missing)}"
        )
    return Source(
        id=e["id"],
        name=e["name"],
        category=e["category"],
        detector=e["detector"],
        url=e["url"],
        enabled=e.get("enabled", True),
        path_prefix=e.get("path_prefix", ""),
        feed_url=e.get("feed_url", ""),
        content_selector=e.get("content_selector", ""),
        default_weight=e.get("default_weight", 0),
    )
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from . import android_sitemap, content, feed, sitemap # noqa: F401
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
"""Content normalisation helpers shared across detectors.
|
|
2
|
+
|
|
3
|
+
Uses only stdlib (``html.parser``) — no third-party HTML library required.
|
|
4
|
+
Later detectors (sitemap content-confirm, etc.) import from here.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import hashlib
|
|
10
|
+
import re
|
|
11
|
+
from html.parser import HTMLParser
|
|
12
|
+
|
|
13
|
+
# A page whose normalised main-text falls below this character count is treated
|
|
14
|
+
# as a client-side render shell. The content detector refuses to baseline it
|
|
15
|
+
# and emits no Change; ``doctor`` surfaces the condition separately.
|
|
16
|
+
EMPTY_RENDER_THRESHOLD: int = 50
|
|
17
|
+
|
|
18
|
+
# Tags whose text content we discard entirely.
|
|
19
|
+
_SKIP_TAGS: frozenset[str] = frozenset({"script", "style", "noscript", "template"})
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class _TextExtractor(HTMLParser):
|
|
23
|
+
"""Walk HTML and collect visible text, discarding script/style."""
|
|
24
|
+
|
|
25
|
+
def __init__(self) -> None:
|
|
26
|
+
super().__init__(convert_charrefs=True)
|
|
27
|
+
self._skip_depth: int = 0
|
|
28
|
+
self._parts: list[str] = []
|
|
29
|
+
|
|
30
|
+
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
|
|
31
|
+
if tag in _SKIP_TAGS:
|
|
32
|
+
self._skip_depth += 1
|
|
33
|
+
|
|
34
|
+
def handle_endtag(self, tag: str) -> None:
|
|
35
|
+
if tag in _SKIP_TAGS and self._skip_depth > 0:
|
|
36
|
+
self._skip_depth -= 1
|
|
37
|
+
|
|
38
|
+
def handle_data(self, data: str) -> None:
|
|
39
|
+
if self._skip_depth == 0:
|
|
40
|
+
stripped = data.strip()
|
|
41
|
+
if stripped:
|
|
42
|
+
self._parts.append(stripped)
|
|
43
|
+
|
|
44
|
+
def text(self) -> str:
|
|
45
|
+
return " ".join(self._parts)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class _SelectorExtractor(HTMLParser):
|
|
49
|
+
"""Extract the inner HTML of the first element matching a simple selector.
|
|
50
|
+
|
|
51
|
+
Supported selector forms (the subset needed by this project):
|
|
52
|
+
- ``#id`` — matches the first element with that id attribute.
|
|
53
|
+
- ``tag`` — matches the first element with that tag name.
|
|
54
|
+
- ``""`` — no selector; returns the full HTML unchanged (caller should
|
|
55
|
+
pass the raw HTML back to ``_TextExtractor``).
|
|
56
|
+
"""
|
|
57
|
+
|
|
58
|
+
def __init__(self, selector: str) -> None:
|
|
59
|
+
super().__init__(convert_charrefs=False)
|
|
60
|
+
self._selector = selector.strip()
|
|
61
|
+
self._match_id: str | None = None
|
|
62
|
+
self._match_tag: str | None = None
|
|
63
|
+
|
|
64
|
+
if self._selector.startswith("#"):
|
|
65
|
+
self._match_id = self._selector[1:]
|
|
66
|
+
elif self._selector:
|
|
67
|
+
self._match_tag = self._selector
|
|
68
|
+
|
|
69
|
+
self._depth: int = 0 # nesting depth inside the matched element
|
|
70
|
+
self._capturing: bool = False
|
|
71
|
+
self._found: bool = False
|
|
72
|
+
self._raw_parts: list[str] = []
|
|
73
|
+
|
|
74
|
+
def _matches(self, tag: str, attrs: list[tuple[str, str | None]]) -> bool:
|
|
75
|
+
if self._match_id is not None:
|
|
76
|
+
attr_dict = dict(attrs)
|
|
77
|
+
return attr_dict.get("id") == self._match_id
|
|
78
|
+
if self._match_tag is not None:
|
|
79
|
+
return tag == self._match_tag
|
|
80
|
+
return False
|
|
81
|
+
|
|
82
|
+
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
|
|
83
|
+
if self._found:
|
|
84
|
+
return
|
|
85
|
+
if self._capturing:
|
|
86
|
+
self._depth += 1
|
|
87
|
+
# Re-serialise the opening tag so inner elements are kept.
|
|
88
|
+
attr_str = "".join(f' {k}="{v}"' if v is not None else f" {k}" for k, v in attrs)
|
|
89
|
+
self._raw_parts.append(f"<{tag}{attr_str}>")
|
|
90
|
+
return
|
|
91
|
+
if self._matches(tag, attrs):
|
|
92
|
+
self._capturing = True
|
|
93
|
+
self._depth = 1
|
|
94
|
+
|
|
95
|
+
def handle_endtag(self, tag: str) -> None:
|
|
96
|
+
if not self._capturing or self._found:
|
|
97
|
+
return
|
|
98
|
+
self._depth -= 1
|
|
99
|
+
if self._depth == 0:
|
|
100
|
+
self._capturing = False
|
|
101
|
+
self._found = True
|
|
102
|
+
else:
|
|
103
|
+
self._raw_parts.append(f"</{tag}>")
|
|
104
|
+
|
|
105
|
+
def handle_data(self, data: str) -> None:
|
|
106
|
+
if self._capturing and not self._found:
|
|
107
|
+
self._raw_parts.append(data)
|
|
108
|
+
|
|
109
|
+
def handle_entityref(self, name: str) -> None: # type: ignore[override]
|
|
110
|
+
if self._capturing and not self._found:
|
|
111
|
+
self._raw_parts.append(f"&{name};")
|
|
112
|
+
|
|
113
|
+
def handle_charref(self, name: str) -> None: # type: ignore[override]
|
|
114
|
+
if self._capturing and not self._found:
|
|
115
|
+
self._raw_parts.append(f"&#{name};")
|
|
116
|
+
|
|
117
|
+
def fragment(self) -> str | None:
|
|
118
|
+
"""Return the captured inner HTML, or None if no match was found."""
|
|
119
|
+
return "".join(self._raw_parts) if self._found else None
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def extract_main(html: str, selector: str = "") -> str:
    """Return the inner HTML addressed by *selector*, else *html* itself.

    Selector forms: ``#id``, ``tagname``, or ``""`` (whole document). The
    full *html* is returned when the selector is empty, matches nothing, or
    matches an element with empty inner HTML, so the caller always gets
    something to normalise.
    """
    if selector:
        finder = _SelectorExtractor(selector)
        finder.feed(html)
        captured = finder.fragment()
        if captured:
            return captured
    return html
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def normalize_text(html_fragment: str) -> str:
    """Reduce an HTML fragment to its visible text with collapsed whitespace.

    Markup and attributes are dropped, so fragments that differ only in CSS
    classes, data attributes or extra whitespace normalise to the same
    string — and therefore to the same :func:`content_hash`.
    """
    extractor = _TextExtractor()
    extractor.feed(html_fragment)
    visible = extractor.text()
    # Collapse any remaining internal whitespace runs.
    collapsed = re.sub(r"\s+", " ", visible)
    return collapsed.strip()
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def content_hash(text: str) -> str:
    """Return the hex SHA-256 digest of *text* (encoded with str.encode())."""
    digest = hashlib.sha256(text.encode())
    return digest.hexdigest()
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
class _TitleExtractor(HTMLParser):
|
|
156
|
+
"""Capture the text inside the first <title> element."""
|
|
157
|
+
|
|
158
|
+
def __init__(self) -> None:
|
|
159
|
+
super().__init__(convert_charrefs=True)
|
|
160
|
+
self._in = False
|
|
161
|
+
self._done = False
|
|
162
|
+
self._parts: list[str] = []
|
|
163
|
+
|
|
164
|
+
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
|
|
165
|
+
if tag == "title" and not self._done:
|
|
166
|
+
self._in = True
|
|
167
|
+
|
|
168
|
+
def handle_endtag(self, tag: str) -> None:
|
|
169
|
+
if tag == "title" and self._in:
|
|
170
|
+
self._in = False
|
|
171
|
+
self._done = True
|
|
172
|
+
|
|
173
|
+
def handle_data(self, data: str) -> None:
|
|
174
|
+
if self._in:
|
|
175
|
+
self._parts.append(data)
|
|
176
|
+
|
|
177
|
+
def title(self) -> str:
|
|
178
|
+
return re.sub(r"\s+", " ", "".join(self._parts)).strip()
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def extract_title(html: str) -> str:
    """Return the page's ``<title>`` text (whitespace-collapsed), or "".

    An absent title gives "". Any exception raised while feeding the parser
    (e.g. on binary or garbage payloads) is treated as "no title" and is not
    propagated.
    """
    extractor = _TitleExtractor()
    try:
        extractor.feed(html)
    except Exception:  # noqa: BLE001 - binary/garbage HTML => no title
        return ""
    else:
        return extractor.title()
|