android-watcher 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- android_watcher/__init__.py +10 -0
- android_watcher/catalog/__init__.py +32 -0
- android_watcher/catalog/catalog.toml +531 -0
- android_watcher/cli.py +161 -0
- android_watcher/config.py +262 -0
- android_watcher/detect/__init__.py +1 -0
- android_watcher/detect/_normalize.py +192 -0
- android_watcher/detect/android_sitemap.py +540 -0
- android_watcher/detect/base.py +14 -0
- android_watcher/detect/content.py +99 -0
- android_watcher/detect/feed.py +135 -0
- android_watcher/detect/sitemap.py +203 -0
- android_watcher/doctor.py +125 -0
- android_watcher/fetch.py +162 -0
- android_watcher/group.py +79 -0
- android_watcher/lock.py +32 -0
- android_watcher/models.py +156 -0
- android_watcher/notify/__init__.py +1 -0
- android_watcher/notify/base.py +21 -0
- android_watcher/notify/email.py +52 -0
- android_watcher/notify/html.py +114 -0
- android_watcher/notify/render.py +239 -0
- android_watcher/notify/slack.py +124 -0
- android_watcher/notify/telegram.py +46 -0
- android_watcher/rank.py +84 -0
- android_watcher/registry.py +38 -0
- android_watcher/run.py +283 -0
- android_watcher/schedule.py +488 -0
- android_watcher/seed/__init__.py +45 -0
- android_watcher/seed/seed.sql.gz +0 -0
- android_watcher/store.py +492 -0
- android_watcher/triage/__init__.py +1 -0
- android_watcher/triage/base.py +25 -0
- android_watcher/triage/claude_cli.py +185 -0
- android_watcher/triage/noop.py +24 -0
- android_watcher/tui/__init__.py +1 -0
- android_watcher/tui/app.py +163 -0
- android_watcher/tui/configio.py +215 -0
- android_watcher/tui/screens.py +927 -0
- android_watcher-1.0.0.dist-info/METADATA +310 -0
- android_watcher-1.0.0.dist-info/RECORD +44 -0
- android_watcher-1.0.0.dist-info/WHEEL +4 -0
- android_watcher-1.0.0.dist-info/entry_points.txt +2 -0
- android_watcher-1.0.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
from urllib.parse import urlsplit, urlunsplit
|
|
5
|
+
|
|
6
|
+
from defusedxml import ElementTree as ET
|
|
7
|
+
|
|
8
|
+
from ..models import Change, Source
|
|
9
|
+
from .base import DETECTORS
|
|
10
|
+
|
|
11
|
+
_ATOM = "{http://www.w3.org/2005/Atom}"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _normalize_link(link: str) -> str:
    """Reduce *link* to ``scheme://host/path`` so it can serve as an identity.

    The query string and fragment are dropped and trailing slashes on the path
    are removed (a bare host keeps ``/``), so cosmetic URL variations map to
    the same key.

    Returns ``""`` when the input carries no scheme, host or path at all
    (empty, whitespace-only, or just a query/fragment). Without that guard
    every link-less item would normalize to ``"/"`` and all of them would
    share a single identity instead of being recognisable as "no link".
    """
    parts = urlsplit(link.strip())
    if not (parts.scheme or parts.netloc or parts.path):
        return ""
    # Strip query and fragment; keep scheme/host/path; drop trailing slash on path.
    path = parts.path.rstrip("/") or "/"
    return urlunsplit((parts.scheme, parts.netloc, path, "", ""))
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _hash(title: str, summary: str) -> str:
    """Return the SHA-256 hex digest of an item's stripped title and summary.

    The two fields are joined with a NUL byte so that ("ab", "") and
    ("a", "b") do not produce the same digest.
    """
    payload = b"\x00".join((title.strip().encode(), summary.strip().encode()))
    return hashlib.sha256(payload).hexdigest()
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _text(el: ET.Element | None) -> str:
    """Return the stripped text of *el*, or ``""`` for a missing/empty element."""
    if el is None:
        return ""
    raw = el.text
    return raw.strip() if raw else ""
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _atom_entry_link(entry) -> str:
    """Return the href of an Atom entry's main link, or ``""`` if it has none.

    An entry may carry several <link> elements (self, replies, enclosure, ...).
    The "alternate" link is the one that points at the item itself, and a link
    without a ``rel`` attribute counts as alternate. If no alternate link is
    present, the first link is used so single-link entries behave as before.
    """
    links = entry.findall(f"{_ATOM}link")
    for link_el in links:
        if link_el.get("rel", "alternate") == "alternate":
            return link_el.get("href", "").strip()
    return links[0].get("href", "").strip() if links else ""


def _parse_items(xml: str) -> list[dict]:
    """Parse an Atom or RSS document into a flat list of item dicts.

    Each dict has the keys ``id_raw`` (Atom <id> / RSS <guid> text),
    ``id_is_permalink`` (whether that id may be used as an identity),
    ``link``, ``title`` and ``summary``. Atom entries directly under the root
    come first, followed by RSS <item> elements of the <channel>.
    """
    root = ET.fromstring(xml)
    items: list[dict] = []

    # Atom
    for entry in root.findall(f"{_ATOM}entry"):
        id_raw = _text(entry.find(f"{_ATOM}id"))
        # <summary> is preferred; <content> is only a fallback when it is absent.
        summary_el = entry.find(f"{_ATOM}summary")
        if summary_el is None:
            summary_el = entry.find(f"{_ATOM}content")
        # Atom <id> is always treated as a permalink identity (opaque IRI).
        items.append(
            {
                "id_raw": id_raw,
                "id_is_permalink": bool(id_raw),
                "link": _atom_entry_link(entry),
                "title": _text(entry.find(f"{_ATOM}title")),
                "summary": _text(summary_el),
            }
        )

    # RSS (root tag is <rss> or <channel> is a child)
    channel = root.find("channel")
    if channel is None and root.tag == "channel":
        channel = root
    if channel is not None:
        for item in channel.findall("item"):
            guid_el = item.find("guid")
            guid_raw = _text(guid_el)
            # A guid is a permalink unless it is explicitly marked
            # isPermaLink="false"; an empty guid is never one.
            is_permalink = (
                guid_el is not None
                and guid_el.get("isPermaLink", "true").lower() != "false"
                and bool(guid_raw)
            )
            items.append(
                {
                    "id_raw": guid_raw,
                    "id_is_permalink": is_permalink,
                    "link": _text(item.find("link")),
                    "title": _text(item.find("title")),
                    "summary": _text(item.find("description")),
                }
            )

    return items
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _identity(item: dict) -> str:
    """Return the stable key under which a parsed feed item is tracked.

    A permalink id/guid wins and is returned verbatim: an Atom <id> is an
    opaque IRI (often a ``tag:`` URI) and must not be URL-normalized. Anything
    else, including a raw guid that is not a permalink (Medium/Blogger reuse
    those), falls back to the normalized link URL.
    """
    permalink_id = item["id_raw"] if item["id_is_permalink"] else ""
    if permalink_id:
        return permalink_id
    return _normalize_link(item["link"])
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
@DETECTORS.register("feed")
class FeedDetector:
    """Detector registered as ``"feed"``: reports new and edited feed items."""

    async def detect(self, source: Source, store, fetcher) -> list[Change]:
        """Fetch the source's feed and return one Change per new or edited item.

        An item whose identity has no stored hash yields ``change_kind="new"``;
        one whose stored hash differs from the current title+summary hash
        yields ``"updated"``. In both cases the stored hash is refreshed.
        Items with an unchanged hash, or with no identity, are skipped.
        """
        res = await fetcher.fetch(source.feed_url or source.url)
        if res.not_modified or not res.text:
            return []

        found: list[Change] = []
        for entry in _parse_items(res.text):
            key = _identity(entry)
            if not key:
                continue
            digest = _hash(entry["title"], entry["summary"])
            previous = store.seen_feed_item(source.id, key)
            if previous == digest:
                # Already seen with identical content: nothing to report.
                continue
            found.append(
                Change(
                    source_id=source.id,
                    url=entry["link"] or key,
                    change_kind="new" if previous is None else "updated",
                    title=entry["title"],
                    raw_diff=f"{entry['title']}\n\n{entry['summary']}".strip()[:500],
                    fetched_hash=digest,
                )
            )
            store.upsert_seen_feed_item(source.id, key, digest)
        return found
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
"""Generic sitemap detector: candidate-then-confirm.
|
|
2
|
+
|
|
3
|
+
A <lastmod> bump in the sitemap is a *candidate* — it is never recorded as a
|
|
4
|
+
Change on its own. The page is fetched and its normalized content is hashed;
|
|
5
|
+
only a real content-hash move produces a Change.
|
|
6
|
+
|
|
7
|
+
If the confirm fetch returns 304 (ETag/If-Modified-Since unchanged), the
|
|
8
|
+
sitemap lastmod is bumped in the snapshot (so the candidate quiesces next run)
|
|
9
|
+
but no Change is emitted.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import logging
|
|
15
|
+
from urllib.parse import urlsplit
|
|
16
|
+
|
|
17
|
+
import defusedxml.ElementTree as ET
|
|
18
|
+
|
|
19
|
+
from ..models import Change, Source
|
|
20
|
+
from ._normalize import (
|
|
21
|
+
EMPTY_RENDER_THRESHOLD,
|
|
22
|
+
content_hash,
|
|
23
|
+
extract_main,
|
|
24
|
+
extract_title,
|
|
25
|
+
normalize_text,
|
|
26
|
+
)
|
|
27
|
+
from .base import DETECTORS
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
_SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _parse_urlset(xml_text: str) -> list[tuple[str, str]]:
    """Extract ``(loc, lastmod)`` pairs from a sitemap <urlset> document.

    <url> elements without a non-blank <loc> are dropped. A missing or empty
    <lastmod> is reported as an empty string.
    """
    url_tag = f"{{{_SITEMAP_NS}}}url"
    loc_tag = f"{{{_SITEMAP_NS}}}loc"
    lastmod_tag = f"{{{_SITEMAP_NS}}}lastmod"

    pairs: list[tuple[str, str]] = []
    for node in ET.fromstring(xml_text).findall(url_tag):
        loc_el = node.find(loc_tag)
        lastmod_el = node.find(lastmod_tag)
        loc = "" if loc_el is None else (loc_el.text or "").strip()
        if not loc:
            continue
        lastmod = "" if lastmod_el is None else (lastmod_el.text or "").strip()
        pairs.append((loc, lastmod))
    return pairs
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _matches_prefix(loc: str, prefix: str) -> bool:
    """Tell whether the path of URL *loc* starts with *prefix*.

    An empty prefix matches every URL.
    """
    return not prefix or urlsplit(loc).path.startswith(prefix)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
async def confirm_candidate(
    source: Source,
    store: object,
    fetcher: object,
    loc: str,
    lastmod: str,
    *,
    emit_new: bool = False,
) -> Change | None:
    """Confirm one candidate URL without ever raising.

    Delegates to ``_confirm_candidate``. Any ``Exception`` it raises, such as
    a robots-blocked fetch (``Disallowed``), a binary masquerading as HTML that
    makes the stdlib HTML parser throw, or a transport error, is logged as a
    warning and turned into ``None``. One bad URL therefore cannot abort a
    source's whole detection run.
    """
    try:
        change = await _confirm_candidate(
            source, store, fetcher, loc, lastmod, emit_new=emit_new
        )
    except Exception as exc:  # noqa: BLE001 - per-URL isolation is the point
        logger.warning("confirm_candidate skipped %r: %s", loc, exc)
        change = None
    return change
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
async def _confirm_candidate(
    source: Source,
    store: object,
    fetcher: object,
    loc: str,
    lastmod: str,
    *,
    emit_new: bool = False,
) -> Change | None:
    """Fetch *loc*, hash its normalized content, and report only real content moves.

    Outcomes:
    - Server answers 304: if a snapshot exists, its lastmod is bumped to
      *lastmod* (hash and excerpt untouched) so the candidate quiesces; None.
    - Extracted text shorter than ``EMPTY_RENDER_THRESHOLD``: warn about a
      JS-shell page, store nothing, None.
    - First content capture (no snapshot, or a snapshot with an empty
      content_hash): the snapshot is stored; returns
      ``Change(change_kind="new")`` only when ``emit_new`` is set, else None.
    - Hash equal to the stored one: snapshot refreshed, None.
    - Hash differs from the stored one: ``Change(change_kind="updated")``.
    """
    res = await fetcher.fetch(loc, conditional=True)  # type: ignore[union-attr]
    snap = store.get_snapshot(source.id, loc)  # type: ignore[union-attr]

    if res.not_modified:
        # Content is unchanged despite the lastmod advance; remember the new
        # lastmod so this URL is not a candidate again on the next run.
        if snap is not None:
            store.upsert_snapshot(  # type: ignore[union-attr]
                source.id,
                loc,
                signal_type="sitemap",
                content_hash=snap.content_hash,
                lastmod=lastmod,
                excerpt=snap.excerpt,
            )
        return None

    text = normalize_text(extract_main(res.text, source.content_selector))
    text_len = len(text)
    if text_len < EMPTY_RENDER_THRESHOLD:
        logger.warning(
            "sitemap detector: page at %r returned a JS-shell (text length %d < %d) — "
            "skipping baseline/change; doctor will surface this",
            loc,
            text_len,
            EMPTY_RENDER_THRESHOLD,
        )
        return None

    new_hash = content_hash(text)
    # Name the change after the page's own <title>; the source name is only a
    # fallback, so a digest does not show the same heading for every page.
    title = extract_title(res.text) or source.name
    excerpt = text[:500]

    # Decide this BEFORE overwriting the snapshot: a never-seen URL, or one
    # baselined fetch-free (empty content_hash), has no prior hash to compare.
    first_capture = snap is None or not snap.content_hash
    store.upsert_snapshot(  # type: ignore[union-attr]
        source.id,
        loc,
        signal_type="sitemap",
        content_hash=new_hash,
        lastmod=lastmod,
        excerpt=excerpt,
    )

    if first_capture:
        # Silent baseline, unless the caller flagged a genuinely new URL.
        kind = "new" if emit_new else None
    elif snap.content_hash == new_hash:
        kind = None  # content re-confirmed identical
    else:
        kind = "updated"

    if kind is None:
        return None
    return Change(
        source_id=source.id,
        url=loc,
        change_kind=kind,
        title=title,
        raw_diff=excerpt,
        fetched_hash=new_hash,
    )
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
@DETECTORS.register("sitemap")
class SitemapDetector:
    """Detector registered as ``"sitemap"``: lastmod bumps are candidates, content hashes confirm."""

    async def detect(self, source: Source, store: object, fetcher: object) -> list[Change]:
        """Scan the source's sitemap and return the confirmed changes.

        Entries are filtered by ``source.path_prefix``. An entry whose stored
        snapshot already carries the same non-empty lastmod is skipped; every
        other entry goes through ``confirm_candidate``.
        """
        res = await fetcher.fetch(source.url, conditional=True)  # type: ignore[union-attr]
        if res.not_modified or not res.text:
            return []

        prefix = source.path_prefix
        candidates = [
            entry for entry in _parse_urlset(res.text) if _matches_prefix(entry[0], prefix)
        ]

        if prefix and not candidates:
            logger.warning(
                "sitemap detector: no URLs matched path_prefix %r for source %r — "
                "check the prefix or the sitemap URL",
                prefix,
                source.id,
            )
            return []

        found: list[Change] = []
        for loc, lastmod in candidates:
            snap = store.get_snapshot(source.id, loc)  # type: ignore[union-attr]
            # A present and unchanged lastmod is not even a candidate.
            unchanged = snap is not None and lastmod and snap.lastmod == lastmod
            if unchanged:
                continue
            change = await confirm_candidate(source, store, fetcher, loc, lastmod)
            if change is not None:
                found.append(change)

        return found
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""Doctor health checks.
|
|
2
|
+
|
|
3
|
+
``run_doctor(config)`` returns a list of ``Check`` objects covering:
|
|
4
|
+
- sitemap path-prefix resolution for each android_sitemap source
|
|
5
|
+
- AI backend availability
|
|
6
|
+
- schedule status (soft dependency on the schedule module, imported lazily)
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import asyncio
|
|
12
|
+
import shutil
|
|
13
|
+
|
|
14
|
+
from android_watcher import __version__
|
|
15
|
+
from android_watcher.config import Config, db_path
|
|
16
|
+
from android_watcher.detect.android_sitemap import (
|
|
17
|
+
INDEX_URL,
|
|
18
|
+
_index_url_for,
|
|
19
|
+
load_sitemap,
|
|
20
|
+
prefix_count,
|
|
21
|
+
)
|
|
22
|
+
from android_watcher.fetch import USER_AGENT, Fetcher
|
|
23
|
+
from android_watcher.models import Check
|
|
24
|
+
from android_watcher.run import resolve_sources
|
|
25
|
+
from android_watcher.store import Store
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _check_ai(config: Config) -> Check:
    """Report whether the AI backend is usable.

    Passes when AI mode is ``"off"`` or when a ``claude`` executable is found
    on PATH; fails otherwise.
    """
    if config.ai.mode == "off":
        return Check("ai-backend", True, "AI disabled")

    claude = shutil.which("claude")
    found = bool(claude)
    detail = f"claude found at {claude}" if found else "claude not found on PATH"
    return Check("ai-backend", found, detail)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _check_seed() -> Check:
    """Report the imported baseline seed date and snapshot count, if any.

    Always returns a passing check; only the message varies with whether any
    snapshots exist and whether a seed date is recorded.
    """
    store = Store(db_path())
    try:
        # migrate() sits inside the try so the store is closed even when the
        # migration itself fails.
        store.migrate()
        count = store.snapshot_count()
        date = store.seed_date()
    finally:
        store.close()
    if count == 0:
        return Check("seed", True, "no baseline yet; first run will establish one")
    if date:
        return Check("seed", True, f"baseline seeded {date} ({count} snapshots)")
    return Check("seed", True, f"baseline established ({count} snapshots)")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _check_schedule() -> Check:
    """Return the schedule module's status check, or a failing check if it cannot be imported.

    The import is deliberately lazy: the schedule module is a soft dependency.
    """
    try:
        from android_watcher.schedule import schedule_status  # noqa: PLC0415
    except ImportError:
        status = Check("schedule", False, "schedule module unavailable")
    else:
        status = schedule_status()
    return status
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _load_sitemap_entries(index_url: str = INDEX_URL) -> list[tuple[str, str]]:
    """Fetch one host's sitemap once and return the flat entry list.

    Extracted so tests can patch this function directly instead of having to
    wire up a real Store + Fetcher + asyncio event loop.

    Raises whatever ``load_sitemap`` raises, plus the timeout error from
    ``asyncio.wait_for`` when loading takes longer than 30 seconds. The
    fetcher and the store are both closed on every exit path.
    """

    async def _load() -> list[tuple[str, str]]:
        store = Store(db_path())
        try:
            store.migrate()
            fetcher = Fetcher(store, user_agent=USER_AGENT.format(version=__version__))
            try:
                # Time-box it: a sitemap can be large (~300 MB uncached), so doctor
                # reports a slow/unavailable sitemap rather than appearing to hang.
                return await asyncio.wait_for(load_sitemap(fetcher, index_url), timeout=30)
            finally:
                await fetcher.close()
        finally:
            # The store was previously left open; release it like _check_seed does.
            store.close()

    return asyncio.run(_load())
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _check_prefixes(config: Config) -> list[Check]:
    """Check that each android_sitemap source's path prefix still matches sitemap URLs.

    Sources are grouped by sitemap index URL so each sitemap is loaded once.
    A load failure yields one ``sitemap:<host>`` check for the group, and no
    per-source checks: a timeout is soft (passing), any other error fails.
    Otherwise each source gets a ``prefix:...`` check that fails when its
    prefix matches zero sitemap URLs.
    """
    targets = [s for s in resolve_sources(config) if s.detector == "android_sitemap"]
    if not targets:
        return []

    by_host: dict[str, list] = {}
    for s in targets:
        by_host.setdefault(_index_url_for(s), []).append(s)

    checks: list[Check] = []
    for index_url, srcs in by_host.items():
        host = index_url.split("/sitemap.xml")[0]
        try:
            entries = _load_sitemap_entries(index_url)
        except (TimeoutError, asyncio.TimeoutError):
            # Before Python 3.11 asyncio.wait_for raises asyncio.TimeoutError,
            # which is not the builtin TimeoutError; name both so a slow
            # sitemap stays a soft check on every supported version.
            checks.append(Check(f"sitemap:{host}", True, "fetch slow; run once to cache"))
            continue
        except Exception as exc:  # noqa: BLE001 - any fetch/parse failure is a soft check
            checks.append(Check(f"sitemap:{host}", False, f"unavailable; run once first ({exc})"))
            continue
        if not entries:
            # NOTE(review): the message assumes an empty list means the sitemap
            # was served from cache (304); confirm against load_sitemap.
            checks.append(Check(f"sitemap:{host}", True, "cached (304); not re-verified"))
            continue
        for s in srcs:
            if not s.path_prefix:
                checks.append(Check(f"prefix:{s.id}", True, f"watches host ({len(entries)} URLs)"))
                continue
            count = prefix_count(entries, s.path_prefix)
            if count == 0:
                checks.append(
                    Check(f"prefix:{s.path_prefix}", False, "stale prefix: 0 sitemap URLs match")
                )
            else:
                checks.append(Check(f"prefix:{s.path_prefix}", True, f"resolves ({count} URLs)"))
    return checks
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def run_doctor(config: Config) -> list[Check]:
    """Run every health check and return the results in a fixed order.

    Order: sitemap prefix checks, then seed, AI backend, schedule.
    """
    return [
        *_check_prefixes(config),
        _check_seed(),
        _check_ai(config),
        _check_schedule(),
    ]
|
android_watcher/fetch.py
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
"""Async HTTP fetch layer for android-watcher.
|
|
2
|
+
|
|
3
|
+
Provides ``Fetcher``, a concurrency-limited async client that:
|
|
4
|
+
- Sets a descriptive User-Agent.
|
|
5
|
+
- Honors robots.txt per host (urllib.robotparser).
|
|
6
|
+
- Applies a per-host crawl delay.
|
|
7
|
+
- Retries with exponential backoff on 5xx / transport errors.
|
|
8
|
+
- Supports conditional GET via Store.http_cache_get/put.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import asyncio
|
|
14
|
+
import logging
|
|
15
|
+
import urllib.robotparser
|
|
16
|
+
from urllib.parse import urlsplit
|
|
17
|
+
|
|
18
|
+
import httpx
|
|
19
|
+
|
|
20
|
+
from .models import Disallowed, FetchResult
|
|
21
|
+
from .store import Store
|
|
22
|
+
|
|
23
|
+
log = logging.getLogger("android_watcher.fetch")
|
|
24
|
+
|
|
25
|
+
USER_AGENT = "android-watcher/{version}"
|
|
26
|
+
|
|
27
|
+
MAX_RETRIES = 4
|
|
28
|
+
BACKOFF_BASE = 0.5
|
|
29
|
+
# Per-request httpx timeouts (connect/read/write/pool). read is the gap between
|
|
30
|
+
# bytes, not total, so a large steady download is fine; a stalled one trips it.
|
|
31
|
+
_TIMEOUT = httpx.Timeout(connect=10.0, read=30.0, write=30.0, pool=10.0)
|
|
32
|
+
# Hard ceiling on a single fetch including retries + backoff, so one bad URL can
|
|
33
|
+
# never hang a run indefinitely.
|
|
34
|
+
FETCH_DEADLINE = 120.0
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class Fetcher:
    """Concurrency-limited async HTTP client.

    Each fetch checks robots.txt for the host, waits out the per-host crawl
    delay, retries 5xx responses and transport errors with exponential
    backoff, and can issue conditional GETs using validators kept in the
    store's HTTP cache.
    """

    def __init__(
        self,
        store: Store,
        *,
        user_agent: str,
        concurrency: int = 4,
        crawl_delay: float = 0.5,
    ):
        """Create the client.

        Args:
            store: provides ``http_cache_get`` / ``http_cache_put`` for validators.
            user_agent: sent with requests and matched against robots.txt rules.
            concurrency: maximum number of fetches in flight at once.
            crawl_delay: default seconds between requests to one host, used
                when robots.txt gives no crawl delay for this user agent.
        """
        self._store = store
        self._user_agent = user_agent
        self._crawl_delay = crawl_delay
        self._sem = asyncio.Semaphore(concurrency)
        self._client = httpx.AsyncClient(
            headers={"User-Agent": user_agent},
            follow_redirects=True,
            timeout=_TIMEOUT,
        )
        # host root -> parsed robots.txt, or None meaning "allow everything".
        self._robots: dict[str, urllib.robotparser.RobotFileParser | None] = {}
        # host root -> loop time at which the most recent request was scheduled.
        self._last_fetch: dict[str, float] = {}

    async def fetch(self, url: str, *, conditional: bool = False) -> FetchResult:
        """Fetch *url* and return the outcome as a ``FetchResult``.

        With ``conditional=True`` any cached ETag / Last-Modified is sent as
        If-None-Match / If-Modified-Since, and validators from the response
        are stored back. A 304 response comes back as a result with empty
        text and ``not_modified=True``.

        Raises:
            Disallowed: robots.txt forbids this URL for our user agent.
            The ``asyncio.wait_for`` timeout error when the request, including
            retries and backoff, exceeds ``FETCH_DEADLINE``.
            The last transport error when every retry failed at transport level.
        """
        if not await self._robots_ok(url):
            raise Disallowed(url)

        headers: dict[str, str] = {}
        if conditional:
            etag, last_modified = self._store.http_cache_get(url)
            if etag:
                headers["If-None-Match"] = etag
            if last_modified:
                headers["If-Modified-Since"] = last_modified

        async with self._sem:
            await self._respect_crawl_delay(url)
            log.info("downloading %s", url)
            resp = await asyncio.wait_for(
                self._get_with_backoff(url, headers), timeout=FETCH_DEADLINE
            )

        if resp.status_code == 304:
            return FetchResult(url=url, status=304, text="", not_modified=True)

        etag = resp.headers.get("ETag", "")
        last_modified = resp.headers.get("Last-Modified", "")
        # Only persist validators when at least one is non-empty; never clobber
        # an existing cache entry with ("","") on a validator-less 200.
        if conditional and (etag or last_modified):
            self._store.http_cache_put(url, etag, last_modified)

        return FetchResult(
            url=url,
            status=resp.status_code,
            text=resp.text,
            etag=etag,
            last_modified=last_modified,
        )

    async def close(self) -> None:
        """Close the underlying HTTP client."""
        await self._client.aclose()

    async def _get_with_backoff(self, url: str, headers: dict[str, str]) -> httpx.Response:
        """GET *url*, retrying up to ``MAX_RETRIES`` times on 5xx or transport errors.

        Any response below 500 is returned at once. After the final attempt
        the last 5xx response is returned as-is, or the last transport error
        is re-raised. Sleeps ``BACKOFF_BASE * 2**attempt`` seconds between tries.
        """
        # Merge User-Agent into per-request headers so it is sent even when
        # tests inject a bare _client that has no default headers set.
        request_headers = {"User-Agent": self._user_agent, **headers}
        last_exc: Exception | None = None
        for attempt in range(MAX_RETRIES):
            try:
                resp = await self._client.get(url, headers=request_headers)
            except (httpx.TransportError, httpx.TimeoutException) as exc:
                last_exc = exc
                resp = None
            if resp is not None and resp.status_code < 500:
                return resp
            if attempt == MAX_RETRIES - 1:
                if resp is not None:
                    return resp
                raise last_exc  # type: ignore[misc]
            await asyncio.sleep(BACKOFF_BASE * (2**attempt))
        raise last_exc  # unreachable

    async def _respect_crawl_delay(self, url: str) -> None:
        """Sleep until this host's next request slot, then return.

        The slot is reserved in ``_last_fetch`` BEFORE sleeping. If it were
        recorded only after the sleep, several concurrent requests to one host
        would all read the same timestamp, sleep the same interval and then
        fire together, defeating the delay. With the reservation each waiter
        is pushed one delay further out. There is no await between reading
        and writing the slot, so the update is atomic on the event loop.
        """
        host = _host_root(url)
        delay = self._crawl_delay_for(url)
        # We are inside a coroutine, so the running loop is the right clock.
        loop = asyncio.get_running_loop()
        now = loop.time()
        last = self._last_fetch.get(host)
        start_at = now if last is None else max(now, last + delay)
        self._last_fetch[host] = start_at
        if start_at > now:
            await asyncio.sleep(start_at - now)

    def _crawl_delay_for(self, url: str) -> float:
        """Return the robots.txt crawl delay for the URL's host, else the default."""
        rp = self._robots.get(_host_root(url))
        if rp is not None:
            cd = rp.crawl_delay(self._user_agent)
            if cd is not None:
                return float(cd)
        return self._crawl_delay

    async def _robots_ok(self, url: str) -> bool:
        """Return whether robots.txt allows fetching *url*.

        The host's robots.txt is fetched and parsed on first use and cached
        for the lifetime of this Fetcher. A robots.txt that is missing,
        forbidden or unreachable is treated as allowing everything.
        """
        host = _host_root(url)
        if host not in self._robots:
            rp: urllib.robotparser.RobotFileParser | None = urllib.robotparser.RobotFileParser()
            try:
                # Fetch via the timed httpx client. urllib's RobotFileParser.read()
                # uses urlopen with NO timeout and can hang a run forever if a host
                # stalls; the shared client carries a 30s timeout instead.
                resp = await self._client.get(f"{host}/robots.txt")
                if resp.status_code >= 400:
                    rp = None  # treat missing/forbidden robots as "allow"
                else:
                    rp.parse(resp.text.splitlines())
            except (httpx.HTTPError, httpx.InvalidURL):
                rp = None  # robots unavailable => allow
            self._robots[host] = rp
        rp = self._robots[host]
        if rp is None:
            return True
        return rp.can_fetch(self._user_agent, url)
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _host_root(url: str) -> str:
    """Return the ``scheme://netloc`` part of *url* (no path, query or fragment)."""
    scheme, netloc = urlsplit(url)[:2]
    return scheme + "://" + netloc
|
android_watcher/group.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Group ledger changes into DigestGroups: model group_key first, heuristic fallback."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from .config import Config
|
|
8
|
+
from .models import Change, DigestGroup, Source
|
|
9
|
+
|
|
10
|
+
_WORD = re.compile(r"[a-z0-9]+")
|
|
11
|
+
_DIGITS = re.compile(r"\d")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _title_case(text: str) -> str:
    """Upper-case the first character of every space-separated word.

    The rest of each word is left untouched, so existing capitals survive
    (GKI stays GKI rather than becoming Gki). Runs of spaces are preserved.
    """
    pieces: list[str] = []
    for word in text.split(" "):
        # Slicing keeps empty words (from consecutive spaces) empty.
        pieces.append(word[:1].upper() + word[1:])
    return " ".join(pieces)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def heuristic_prefix(title: str) -> str:
    """Derive a coarse grouping key for a change that has no model group_key.

    The title is lower-cased and split into alphanumeric words. Leading words
    are kept until the first word containing a digit (so 'Android 13 release
    builds' and 'Android 14 release builds' share a key), with at most four
    words kept so unrelated titles do not over-merge. If no word is kept, the
    stripped, lower-cased title itself is the key.
    """
    lead: list[str] = []
    for token in _WORD.findall(title.lower()):
        if _DIGITS.search(token) is not None:
            break
        lead.append(token)
        if len(lead) >= 4:
            break
    if not lead:
        return title.strip().lower()
    return " ".join(lead)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def group_changes(
    changes: list[Change], sources: dict[str, Source], config: Config
) -> list[DigestGroup]:
    """Bucket changes into DigestGroups, one per (source, group key) pair.

    The group key is the change's model-provided ``group_key`` when set,
    otherwise ``heuristic_prefix`` of its title. Within a group, members are
    ordered newest first and the first member supplies the source id and
    change kind. The group score is the highest member score.
    """
    # Imported here, not at module level, to avoid a rank <-> group import
    # cycle: rank imports group_changes at top level.
    from .rank import _score

    def _first_set(members: list[Change], attr: str):
        """Return the first truthy value of *attr* among *members*, else None."""
        for member in members:
            value = getattr(member, attr)
            if value:
                return value
        return None

    buckets: dict[str, list[Change]] = {}
    for change in changes:
        sub_key = change.group_key or heuristic_prefix(change.title)
        buckets.setdefault(f"{change.source_id}::{sub_key}", []).append(change)

    groups: list[DigestGroup] = []
    for key, members in buckets.items():
        members.sort(key=lambda c: (c.detected_at, c.id or 0), reverse=True)
        newest = members[0]
        source = sources.get(newest.source_id)

        # Summary: the model's group summary if any member has one, else the
        # first available per-change description.
        summary = _first_set(members, "group_summary") or _first_set(members, "description")
        # Heading: the model's group title, else a member's page title, else
        # the newest member's URL.
        raw_title = _first_set(members, "group_title") or _first_set(members, "title")
        if raw_title:
            title = _title_case(raw_title)
        else:
            title = newest.url

        groups.append(
            DigestGroup(
                key=key,
                title=title,
                summary=summary,
                category=source.category if source else "",
                source_id=newest.source_id,
                change_kind=newest.change_kind,
                members=members,
                score=max(_score(m, source, config) for m in members),
            )
        )
    return groups
|