android-watcher 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. android_watcher/__init__.py +10 -0
  2. android_watcher/catalog/__init__.py +32 -0
  3. android_watcher/catalog/catalog.toml +531 -0
  4. android_watcher/cli.py +161 -0
  5. android_watcher/config.py +262 -0
  6. android_watcher/detect/__init__.py +1 -0
  7. android_watcher/detect/_normalize.py +192 -0
  8. android_watcher/detect/android_sitemap.py +540 -0
  9. android_watcher/detect/base.py +14 -0
  10. android_watcher/detect/content.py +99 -0
  11. android_watcher/detect/feed.py +135 -0
  12. android_watcher/detect/sitemap.py +203 -0
  13. android_watcher/doctor.py +125 -0
  14. android_watcher/fetch.py +162 -0
  15. android_watcher/group.py +79 -0
  16. android_watcher/lock.py +32 -0
  17. android_watcher/models.py +156 -0
  18. android_watcher/notify/__init__.py +1 -0
  19. android_watcher/notify/base.py +21 -0
  20. android_watcher/notify/email.py +52 -0
  21. android_watcher/notify/html.py +114 -0
  22. android_watcher/notify/render.py +239 -0
  23. android_watcher/notify/slack.py +124 -0
  24. android_watcher/notify/telegram.py +46 -0
  25. android_watcher/rank.py +84 -0
  26. android_watcher/registry.py +38 -0
  27. android_watcher/run.py +283 -0
  28. android_watcher/schedule.py +488 -0
  29. android_watcher/seed/__init__.py +45 -0
  30. android_watcher/seed/seed.sql.gz +0 -0
  31. android_watcher/store.py +492 -0
  32. android_watcher/triage/__init__.py +1 -0
  33. android_watcher/triage/base.py +25 -0
  34. android_watcher/triage/claude_cli.py +185 -0
  35. android_watcher/triage/noop.py +24 -0
  36. android_watcher/tui/__init__.py +1 -0
  37. android_watcher/tui/app.py +163 -0
  38. android_watcher/tui/configio.py +215 -0
  39. android_watcher/tui/screens.py +927 -0
  40. android_watcher-1.0.0.dist-info/METADATA +310 -0
  41. android_watcher-1.0.0.dist-info/RECORD +44 -0
  42. android_watcher-1.0.0.dist-info/WHEEL +4 -0
  43. android_watcher-1.0.0.dist-info/entry_points.txt +2 -0
  44. android_watcher-1.0.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,135 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ from urllib.parse import urlsplit, urlunsplit
5
+
6
+ from defusedxml import ElementTree as ET
7
+
8
+ from ..models import Change, Source
9
+ from .base import DETECTORS
10
+
11
+ _ATOM = "{http://www.w3.org/2005/Atom}"
12
+
13
+
14
+ def _normalize_link(link: str) -> str:
15
+ parts = urlsplit(link.strip())
16
+ # Strip query and fragment; keep scheme/host/path; drop trailing slash on path.
17
+ path = parts.path.rstrip("/") or "/"
18
+ return urlunsplit((parts.scheme, parts.netloc, path, "", ""))
19
+
20
+
21
+ def _hash(title: str, summary: str) -> str:
22
+ h = hashlib.sha256()
23
+ h.update(title.strip().encode())
24
+ h.update(b"\x00")
25
+ h.update(summary.strip().encode())
26
+ return h.hexdigest()
27
+
28
+
29
+ def _text(el: ET.Element | None) -> str:
30
+ return (el.text or "").strip() if el is not None else ""
31
+
32
+
33
def _parse_items(xml: str) -> list[dict]:
    """Parse an Atom or RSS document into a flat list of item dicts.

    Every dict carries the keys ``id_raw``, ``id_is_permalink``, ``link``,
    ``title`` and ``summary``. Atom entries are collected first, then RSS
    items; the two loops are independent, so a document that matches neither
    shape simply yields an empty list.
    """
    root = ET.fromstring(xml)
    items: list[dict] = []

    # Atom
    for entry in root.findall(f"{_ATOM}entry"):
        # An entry may carry several <link> elements (self, edit, replies, ...).
        # The page itself is rel="alternate", which is also what an absent rel
        # means. Fall back to the first link only when no alternate exists.
        link_els = entry.findall(f"{_ATOM}link")
        link_el = next(
            (el for el in link_els if el.get("rel", "alternate") == "alternate"),
            link_els[0] if link_els else None,
        )
        link = (link_el.get("href", "") if link_el is not None else "").strip()
        id_raw = _text(entry.find(f"{_ATOM}id"))
        title = _text(entry.find(f"{_ATOM}title"))
        summary_el = entry.find(f"{_ATOM}summary")
        if summary_el is None:
            summary_el = entry.find(f"{_ATOM}content")
        summary = _text(summary_el)
        # Atom <id> is always treated as a permalink identity (opaque IRI).
        items.append(
            {
                "id_raw": id_raw,
                "id_is_permalink": bool(id_raw),
                "link": link,
                "title": title,
                "summary": summary,
            }
        )

    # RSS (root tag is <rss> or <channel> is a child)
    channel = root.find("channel")
    if channel is None and root.tag == "channel":
        channel = root
    if channel is not None:
        for item in channel.findall("item"):
            guid_el = item.find("guid")
            guid_raw = _text(guid_el)
            # isPermaLink defaults to true when the attribute is absent; an
            # empty guid is never usable as an identity.
            is_permalink = (
                guid_el is not None
                and guid_el.get("isPermaLink", "true").lower() != "false"
                and bool(guid_raw)
            )
            link = _text(item.find("link"))
            title = _text(item.find("title"))
            summary = _text(item.find("description"))
            items.append(
                {
                    "id_raw": guid_raw,
                    "id_is_permalink": is_permalink,
                    "link": link,
                    "title": title,
                    "summary": summary,
                }
            )

    return items
85
+
86
+
87
+ def _identity(item: dict) -> str:
88
+ # Prefer a permalink id/guid; use it VERBATIM (an Atom <id> is an opaque IRI,
89
+ # often a tag: URI that must not be URL-normalized). Only the link-URL
90
+ # fallback is normalized. Never trust a non-permalink raw guid alone
91
+ # (Medium/Blogger reuse them).
92
+ if item["id_is_permalink"] and item["id_raw"]:
93
+ return item["id_raw"]
94
+ return _normalize_link(item["link"])
95
+
96
+
97
@DETECTORS.register("feed")
class FeedDetector:
    """Detector for RSS/Atom feeds, registered under the name "feed"."""

    async def detect(self, source: Source, store, fetcher) -> list[Change]:
        """Fetch the source's feed and report items that are new or changed.

        An item is "new" when the store has no record of its identity, and
        "updated" when the stored hash differs from the current title+summary
        hash. In both cases the store is updated with the current hash. Items
        with an empty identity, or whose hash is unchanged, are skipped.
        """
        res = await fetcher.fetch(source.feed_url or source.url)
        if res.not_modified or not res.text:
            return []

        changes: list[Change] = []
        for item in _parse_items(res.text):
            identity = _identity(item)
            if not identity:
                continue
            content_hash = _hash(item["title"], item["summary"])
            prior = store.seen_feed_item(source.id, identity)
            if prior is None:
                kind = "new"
            elif prior != content_hash:
                kind = "updated"
            else:
                continue  # already recorded with identical content
            changes.append(
                Change(
                    source_id=source.id,
                    url=item["link"] or identity,
                    change_kind=kind,
                    title=item["title"],
                    raw_diff=f"{item['title']}\n\n{item['summary']}".strip()[:500],
                    fetched_hash=content_hash,
                )
            )
            store.upsert_seen_feed_item(source.id, identity, content_hash)
        return changes
@@ -0,0 +1,203 @@
1
+ """Generic sitemap detector: candidate-then-confirm.
2
+
3
+ A <lastmod> bump in the sitemap is a *candidate* — it is never recorded as a
4
+ Change on its own. The page is fetched and its normalized content is hashed;
5
+ only a real content-hash move produces a Change.
6
+
7
+ If the confirm fetch returns 304 (ETag/If-Modified-Since unchanged), the
8
+ sitemap lastmod is bumped in the snapshot (so the candidate quiesces next run)
9
+ but no Change is emitted.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import logging
15
+ from urllib.parse import urlsplit
16
+
17
+ import defusedxml.ElementTree as ET
18
+
19
+ from ..models import Change, Source
20
+ from ._normalize import (
21
+ EMPTY_RENDER_THRESHOLD,
22
+ content_hash,
23
+ extract_main,
24
+ extract_title,
25
+ normalize_text,
26
+ )
27
+ from .base import DETECTORS
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+ _SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"
32
+
33
+
34
def _parse_urlset(xml_text: str) -> list[tuple[str, str]]:
    """Extract ``(loc, lastmod)`` pairs from a sitemap ``<urlset>`` document.

    ``<url>`` elements without a non-blank ``<loc>`` are dropped. A missing or
    empty ``<lastmod>`` is reported as "".
    """
    ns = f"{{{_SITEMAP_NS}}}"
    pairs: list[tuple[str, str]] = []
    for node in ET.fromstring(xml_text).findall(f"{ns}url"):
        loc_node = node.find(f"{ns}loc")
        loc = (loc_node.text or "").strip() if loc_node is not None else ""
        if not loc:
            continue
        lastmod_node = node.find(f"{ns}lastmod")
        if lastmod_node is None:
            lastmod = ""
        else:
            lastmod = (lastmod_node.text or "").strip()
        pairs.append((loc, lastmod))
    return pairs
50
+
51
+
52
+ def _matches_prefix(loc: str, prefix: str) -> bool:
53
+ if not prefix:
54
+ return True
55
+ return urlsplit(loc).path.startswith(prefix)
56
+
57
+
58
async def confirm_candidate(
    source: Source,
    store: object,
    fetcher: object,
    loc: str,
    lastmod: str,
    *,
    emit_new: bool = False,
) -> Change | None:
    """Confirm one candidate URL without ever raising.

    Delegates to ``_confirm_candidate``. Any exception it raises — a
    robots-blocked URL (``Disallowed``), a binary masquerading as HTML (the
    stdlib HTML parser throws on it), a transport error — is logged as a
    warning and turned into ``None``, so one bad URL cannot abort a source's
    whole detection run.
    """
    try:
        change = await _confirm_candidate(
            source, store, fetcher, loc, lastmod, emit_new=emit_new
        )
    except Exception as exc:  # noqa: BLE001 - per-URL isolation is the point
        logger.warning("confirm_candidate skipped %r: %s", loc, exc)
        change = None
    return change
76
+
77
+
78
async def _confirm_candidate(
    source: Source,
    store: object,
    fetcher: object,
    loc: str,
    lastmod: str,
    *,
    emit_new: bool = False,
) -> Change | None:
    """Fetch *loc*, hash normalized content, return a Change only on a real move.

    Contract (pinned):
    - First content capture (never seen, or fetch-free baseline with an empty
      content_hash): baseline silently, return None — UNLESS ``emit_new`` is set
      (a genuinely new URL discovered after a baseline exists), in which case the
      captured content is returned as Change(change_kind="new").
    - Identical re-confirm (hash unchanged): return None.
    - Content-hash move: return Change(change_kind="updated").
    - 304 from server: persist new lastmod (quiesce), return None.
    - HTTP error status (>= 400): nothing is stored, return None — an error
      page is not the page's content.
    - Extracted text shorter than EMPTY_RENDER_THRESHOLD: nothing is stored,
      return None.
    """
    res = await fetcher.fetch(loc, conditional=True)  # type: ignore[union-attr]
    snap = store.get_snapshot(source.id, loc)  # type: ignore[union-attr]

    if res.not_modified:
        # Server confirms content is unchanged despite a lastmod advance.
        # Persist the new lastmod so this candidate won't re-fire next run.
        if snap is not None:
            store.upsert_snapshot(  # type: ignore[union-attr]
                source.id,
                loc,
                signal_type="sitemap",
                content_hash=snap.content_hash,
                lastmod=lastmod,
                excerpt=snap.excerpt,
            )
        return None

    # An error response still carries a body (the server's error page). Hashing
    # it would overwrite the real snapshot and report a bogus "updated" change,
    # then a second one when the page recovers. The snapshot is left untouched
    # so the URL stays a candidate for the next run.
    # NOTE(review): getattr tolerates result objects without a ``status`` field
    # (e.g. test doubles) — confirm every fetch result actually carries one.
    status = getattr(res, "status", 200)
    if status >= 400:
        logger.warning(
            "sitemap detector: page at %r returned HTTP %d — skipping baseline/change",
            loc,
            status,
        )
        return None

    text = normalize_text(extract_main(res.text, source.content_selector))

    if len(text) < EMPTY_RENDER_THRESHOLD:
        logger.warning(
            "sitemap detector: page at %r returned a JS-shell (text length %d < %d) — "
            "skipping baseline/change; doctor will surface this",
            loc,
            len(text),
            EMPTY_RENDER_THRESHOLD,
        )
        return None

    new_hash = content_hash(text)
    # The page's own <title> names the change; the source name is only a fallback
    # (so a digest never reads "Android Open Source Project" for every page).
    title = extract_title(res.text) or source.name

    # First content capture is silent: either the URL was never seen, or it was
    # baselined fetch-free on a prior run (empty content_hash) and this is its
    # first real fetch. A Change requires a genuine prior content hash that
    # moved — a lastmod bump alone never counts.
    first_capture = snap is None or not snap.content_hash
    store.upsert_snapshot(  # type: ignore[union-attr]
        source.id,
        loc,
        signal_type="sitemap",
        content_hash=new_hash,
        lastmod=lastmod,
        excerpt=text[:500],
    )

    if first_capture:
        if emit_new:
            # Genuinely new URL after baseline: report its first capture as "new".
            return Change(
                source_id=source.id,
                url=loc,
                change_kind="new",
                title=title,
                raw_diff=text[:500],
                fetched_hash=new_hash,
            )
        return None  # baseline silently

    if snap.content_hash == new_hash:
        return None  # content re-confirmed identical

    return Change(
        source_id=source.id,
        url=loc,
        change_kind="updated",
        title=title,
        raw_diff=text[:500],
        fetched_hash=new_hash,
    )
170
+
171
+
172
@DETECTORS.register("sitemap")
class SitemapDetector:
    """Generic sitemap detector, registered under the name "sitemap"."""

    async def detect(self, source: Source, store: object, fetcher: object) -> list[Change]:
        """Fetch the sitemap at ``source.url`` and confirm every candidate URL.

        A URL is a candidate unless it already has a snapshot whose lastmod
        equals a non-empty lastmod in the sitemap. Returns the Changes produced
        by confirming the candidates. Returns an empty list when the sitemap is
        unmodified or empty, or when a configured path prefix matches nothing.
        """
        res = await fetcher.fetch(source.url, conditional=True)  # type: ignore[union-attr]
        if res.not_modified or not res.text:
            return []

        matched = [
            entry
            for entry in _parse_urlset(res.text)
            if _matches_prefix(entry[0], source.path_prefix)
        ]

        if source.path_prefix and not matched:
            logger.warning(
                "sitemap detector: no URLs matched path_prefix %r for source %r — "
                "check the prefix or the sitemap URL",
                source.path_prefix,
                source.id,
            )
            return []

        changes: list[Change] = []
        for loc, lastmod in matched:
            snap = store.get_snapshot(source.id, loc)  # type: ignore[union-attr]
            # A present, unchanged lastmod means this URL is not even a candidate.
            lastmod_unchanged = snap is not None and lastmod and snap.lastmod == lastmod
            if lastmod_unchanged:
                continue
            change = await confirm_candidate(source, store, fetcher, loc, lastmod)
            if change is None:
                continue
            changes.append(change)

        return changes
@@ -0,0 +1,125 @@
1
+ """Doctor health checks.
2
+
3
+ ``run_doctor(config)`` returns a list of ``Check`` objects covering:
4
+ - sitemap path-prefix resolution for each android_sitemap source
5
+ - AI backend availability
6
+ - schedule status (soft dependency on the schedule module, imported lazily)
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import asyncio
12
+ import shutil
13
+
14
+ from android_watcher import __version__
15
+ from android_watcher.config import Config, db_path
16
+ from android_watcher.detect.android_sitemap import (
17
+ INDEX_URL,
18
+ _index_url_for,
19
+ load_sitemap,
20
+ prefix_count,
21
+ )
22
+ from android_watcher.fetch import USER_AGENT, Fetcher
23
+ from android_watcher.models import Check
24
+ from android_watcher.run import resolve_sources
25
+ from android_watcher.store import Store
26
+
27
+
28
def _check_ai(config: Config) -> Check:
    """Report whether the AI backend is usable.

    Passes when AI mode is "off" or when a ``claude`` executable is found on
    PATH; fails otherwise.
    """
    if config.ai.mode == "off":
        return Check("ai-backend", True, "AI disabled")
    claude_path = shutil.which("claude")
    if not claude_path:
        return Check("ai-backend", False, "claude not found on PATH")
    return Check("ai-backend", True, f"claude found at {claude_path}")
35
+
36
+
37
def _check_seed() -> Check:
    """Report the imported baseline seed date and snapshot count, if any.

    Always returns a passing Check; only the message varies with whether any
    snapshots exist and whether a seed date is recorded.
    """
    store = Store(db_path())
    try:
        # migrate() runs inside the try so the store is closed even when the
        # migration itself fails.
        store.migrate()
        count = store.snapshot_count()
        date = store.seed_date()
    finally:
        store.close()
    if count == 0:
        return Check("seed", True, "no baseline yet; first run will establish one")
    if date:
        return Check("seed", True, f"baseline seeded {date} ({count} snapshots)")
    return Check("seed", True, f"baseline established ({count} snapshots)")
51
+
52
+
53
def _check_schedule() -> Check:
    """Return the schedule module's status check.

    The schedule module is imported lazily; if the import fails, a failing
    Check is returned instead of raising.
    """
    try:
        from android_watcher.schedule import schedule_status  # noqa: PLC0415
    except ImportError:
        return Check("schedule", False, "schedule module unavailable")
    else:
        return schedule_status()
59
+
60
+
61
def _load_sitemap_entries(index_url: str = INDEX_URL) -> list[tuple[str, str]]:
    """Fetch one host's sitemap once and return the flat entry list.

    Extracted so tests can patch this function directly instead of having to
    wire up a real Store + Fetcher + asyncio event loop.

    Runs its own event loop via ``asyncio.run``. The load is capped at 30
    seconds; the timeout raised by ``asyncio.wait_for`` propagates to the
    caller.
    """

    async def _load() -> list[tuple[str, str]]:
        store = Store(db_path())
        try:
            store.migrate()
            fetcher = Fetcher(store, user_agent=USER_AGENT.format(version=__version__))
            try:
                # Time-box it: a sitemap can be large (~300 MB uncached), so doctor
                # reports a slow/unavailable sitemap rather than appearing to hang.
                return await asyncio.wait_for(load_sitemap(fetcher, index_url), timeout=30)
            finally:
                await fetcher.close()
        finally:
            # Close the store after the fetcher that uses it, on every path
            # (success, timeout, failed migration).
            store.close()

    return asyncio.run(_load())
80
+
81
+
82
def _check_prefixes(config: Config) -> list[Check]:
    """Check that each android_sitemap source's path prefix still resolves.

    Sources are grouped by sitemap index URL so each host's sitemap is loaded
    once. Per host:
    - a timeout is a passing "fetch slow" check,
    - any other load failure is a failing "unavailable" check,
    - an empty entry list is a passing "cached (304)" check,
    - otherwise one check per source: passing when the source has no prefix or
      its prefix matches at least one URL, failing when it matches none.

    Returns an empty list when no android_sitemap sources are configured.
    """
    targets = [s for s in resolve_sources(config) if s.detector == "android_sitemap"]
    if not targets:
        return []

    by_host: dict[str, list] = {}
    for s in targets:
        by_host.setdefault(_index_url_for(s), []).append(s)

    checks: list[Check] = []
    for index_url, srcs in by_host.items():
        host = index_url.split("/sitemap.xml")[0]
        try:
            entries = _load_sitemap_entries(index_url)
        except (TimeoutError, asyncio.TimeoutError):
            # asyncio.TimeoutError only became an alias of the builtin
            # TimeoutError in Python 3.11; name both so older interpreters
            # also report a slow sitemap as "slow", not as a failure.
            checks.append(Check(f"sitemap:{host}", True, "fetch slow; run once to cache"))
            continue
        except Exception as exc:  # noqa: BLE001 - any fetch/parse failure is a soft check
            checks.append(Check(f"sitemap:{host}", False, f"unavailable; run once first ({exc})"))
            continue
        if not entries:
            checks.append(Check(f"sitemap:{host}", True, "cached (304); not re-verified"))
            continue
        for s in srcs:
            if not s.path_prefix:
                checks.append(Check(f"prefix:{s.id}", True, f"watches host ({len(entries)} URLs)"))
                continue
            count = prefix_count(entries, s.path_prefix)
            if count == 0:
                checks.append(
                    Check(f"prefix:{s.path_prefix}", False, "stale prefix: 0 sitemap URLs match")
                )
            else:
                checks.append(Check(f"prefix:{s.path_prefix}", True, f"resolves ({count} URLs)"))
    return checks
117
+
118
+
119
def run_doctor(config: Config) -> list[Check]:
    """Run every health check and return the results in a fixed order:
    sitemap/prefix checks, then seed, AI backend and schedule."""
    return [
        *_check_prefixes(config),
        _check_seed(),
        _check_ai(config),
        _check_schedule(),
    ]
@@ -0,0 +1,162 @@
1
+ """Async HTTP fetch layer for android-watcher.
2
+
3
+ Provides ``Fetcher``, a concurrency-limited async client that:
4
+ - Sets a descriptive User-Agent.
5
+ - Honors robots.txt per host (urllib.robotparser).
6
+ - Applies a per-host crawl delay.
7
+ - Retries with exponential backoff on 5xx / transport errors.
8
+ - Supports conditional GET via Store.http_cache_get/put.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import asyncio
14
+ import logging
15
+ import urllib.robotparser
16
+ from urllib.parse import urlsplit
17
+
18
+ import httpx
19
+
20
+ from .models import Disallowed, FetchResult
21
+ from .store import Store
22
+
23
+ log = logging.getLogger("android_watcher.fetch")
24
+
25
+ USER_AGENT = "android-watcher/{version}"
26
+
27
+ MAX_RETRIES = 4
28
+ BACKOFF_BASE = 0.5
29
+ # Per-request httpx timeouts (connect/read/write/pool). read is the gap between
30
+ # bytes, not total, so a large steady download is fine; a stalled one trips it.
31
+ _TIMEOUT = httpx.Timeout(connect=10.0, read=30.0, write=30.0, pool=10.0)
32
+ # Hard ceiling on a single fetch including retries + backoff, so one bad URL can
33
+ # never hang a run indefinitely.
34
+ FETCH_DEADLINE = 120.0
35
+
36
+
37
class Fetcher:
    """Concurrency-limited async HTTP client.

    Checks robots.txt per host, spaces requests to the same host by a crawl
    delay, retries 5xx responses and transport errors with exponential
    backoff, and supports conditional GET using validators kept in the store.
    """

    def __init__(
        self,
        store: Store,
        *,
        user_agent: str,
        concurrency: int = 4,
        crawl_delay: float = 0.5,
    ):
        """Create the shared HTTP client.

        ``concurrency`` bounds the number of in-flight fetches; ``crawl_delay``
        is the default per-host spacing in seconds, used when robots.txt does
        not specify one.
        """
        self._store = store
        self._user_agent = user_agent
        self._crawl_delay = crawl_delay
        self._sem = asyncio.Semaphore(concurrency)
        self._client = httpx.AsyncClient(
            headers={"User-Agent": user_agent},
            follow_redirects=True,
            timeout=_TIMEOUT,
        )
        # Host root -> parsed robots.txt, or None when robots is unavailable
        # (treated as "allow everything").
        self._robots: dict[str, urllib.robotparser.RobotFileParser | None] = {}
        # Host root -> event-loop time of the latest request slot handed out.
        self._last_fetch: dict[str, float] = {}

    async def fetch(self, url: str, *, conditional: bool = False) -> FetchResult:
        """Fetch *url* and return a ``FetchResult``.

        With ``conditional=True`` the stored ETag / Last-Modified validators
        are sent, and validators from the response are stored. A 304 response
        yields a result with empty text and ``not_modified=True``.

        Raises ``Disallowed`` when robots.txt forbids the URL. The request,
        including retries and backoff, is capped at ``FETCH_DEADLINE`` seconds
        by ``asyncio.wait_for``.
        """
        if not await self._robots_ok(url):
            raise Disallowed(url)

        headers: dict[str, str] = {}
        if conditional:
            etag, last_modified = self._store.http_cache_get(url)
            if etag:
                headers["If-None-Match"] = etag
            if last_modified:
                headers["If-Modified-Since"] = last_modified

        async with self._sem:
            await self._respect_crawl_delay(url)
            log.info("downloading %s", url)
            resp = await asyncio.wait_for(
                self._get_with_backoff(url, headers), timeout=FETCH_DEADLINE
            )

        if resp.status_code == 304:
            return FetchResult(url=url, status=304, text="", not_modified=True)

        etag = resp.headers.get("ETag", "")
        last_modified = resp.headers.get("Last-Modified", "")
        # Only persist validators when at least one is non-empty; never clobber
        # an existing cache entry with ("","") on a validator-less 200.
        if conditional and (etag or last_modified):
            self._store.http_cache_put(url, etag, last_modified)

        return FetchResult(
            url=url,
            status=resp.status_code,
            text=resp.text,
            etag=etag,
            last_modified=last_modified,
        )

    async def close(self) -> None:
        """Close the underlying HTTP client."""
        await self._client.aclose()

    async def _get_with_backoff(self, url: str, headers: dict[str, str]) -> httpx.Response:
        """GET *url*, retrying up to ``MAX_RETRIES`` times.

        Any response below 500 is returned immediately. A 5xx response or a
        transport/timeout error is retried after ``BACKOFF_BASE * 2**attempt``
        seconds. On the final attempt the 5xx response is returned, or the last
        transport error is re-raised.
        """
        # Merge User-Agent into per-request headers so it is sent even when
        # tests inject a bare _client that has no default headers set.
        request_headers = {"User-Agent": self._user_agent, **headers}
        last_exc: Exception | None = None
        for attempt in range(MAX_RETRIES):
            try:
                resp = await self._client.get(url, headers=request_headers)
            except (httpx.TransportError, httpx.TimeoutException) as exc:
                last_exc = exc
                resp = None
            if resp is not None and resp.status_code < 500:
                return resp
            if attempt == MAX_RETRIES - 1:
                if resp is not None:
                    return resp
                raise last_exc  # type: ignore[misc]
            await asyncio.sleep(BACKOFF_BASE * (2**attempt))
        raise last_exc  # unreachable

    async def _respect_crawl_delay(self, url: str) -> None:
        """Sleep until this request's slot for the URL's host.

        The slot is reserved BEFORE sleeping. There is no await between
        reading and writing ``_last_fetch``, so concurrent fetches to one host
        each get a distinct slot one crawl delay apart, instead of all reading
        the same timestamp, sleeping the same time and firing together.
        """
        host = _host_root(url)
        delay = self._crawl_delay_for(url)
        now = asyncio.get_running_loop().time()
        last = self._last_fetch.get(host)
        slot = now if last is None else max(now, last + delay)
        self._last_fetch[host] = slot
        if slot > now:
            await asyncio.sleep(slot - now)

    def _crawl_delay_for(self, url: str) -> float:
        """Return the host's robots.txt crawl delay for our user agent, else the default."""
        rp = self._robots.get(_host_root(url))
        if rp is not None:
            cd = rp.crawl_delay(self._user_agent)
            if cd is not None:
                return float(cd)
        return self._crawl_delay

    async def _robots_ok(self, url: str) -> bool:
        """Return whether robots.txt allows fetching *url*.

        robots.txt is fetched once per host and cached. A 4xx/5xx response or
        an HTTP error while fetching it is treated as "allow".
        """
        host = _host_root(url)
        if host not in self._robots:
            rp: urllib.robotparser.RobotFileParser | None = urllib.robotparser.RobotFileParser()
            try:
                # Fetch via the timed httpx client. urllib's RobotFileParser.read()
                # uses urlopen with NO timeout and can hang a run forever if a host
                # stalls; the shared client carries a 30s timeout instead.
                resp = await self._client.get(f"{host}/robots.txt")
                if resp.status_code >= 400:
                    rp = None  # treat missing/forbidden robots as "allow"
                else:
                    rp.parse(resp.text.splitlines())
            except (httpx.HTTPError, httpx.InvalidURL):
                rp = None  # robots unavailable => allow
            self._robots[host] = rp
        rp = self._robots[host]
        if rp is None:
            return True
        return rp.can_fetch(self._user_agent, url)
158
+
159
+
160
+ def _host_root(url: str) -> str:
161
+ parts = urlsplit(url)
162
+ return f"{parts.scheme}://{parts.netloc}"
@@ -0,0 +1,79 @@
1
+ """Group ledger changes into DigestGroups: model group_key first, heuristic fallback."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+
7
+ from .config import Config
8
+ from .models import Change, DigestGroup, Source
9
+
10
+ _WORD = re.compile(r"[a-z0-9]+")
11
+ _DIGITS = re.compile(r"\d")
12
+
13
+
14
+ def _title_case(text: str) -> str:
15
+ """Capitalize the first letter of each word, preserving existing capitals so
16
+ acronyms survive (GKI stays GKI, OS stays OS, not Gki/Os)."""
17
+ return " ".join(w[:1].upper() + w[1:] if w else w for w in text.split(" "))
18
+
19
+
20
def heuristic_prefix(title: str) -> str:
    """Derive a coarse grouping key for a change that has no model group_key.

    The title is lower-cased and split into alphanumeric words. Leading words
    are kept until the first word containing a digit (so 'Android 13 release
    builds' and 'Android 14 release builds' collide), and at most four words
    are kept so unrelated titles do not over-merge. If no word survives, the
    stripped, lower-cased title is returned instead.
    """
    kept: list[str] = []
    for word in _WORD.findall(title.lower()):
        if len(kept) >= 4 or _DIGITS.search(word):
            break
        kept.append(word)
    if not kept:
        return title.strip().lower()
    return " ".join(kept)
36
+
37
+
38
def group_changes(
    changes: list[Change], sources: dict[str, Source], config: Config
) -> list[DigestGroup]:
    """Bucket changes into DigestGroups, one per (source, group key).

    The group key is the change's model-provided ``group_key`` when present,
    else ``heuristic_prefix`` of its title. Within a group, members are ordered
    newest first. The first member supplies the source, source_id and
    change_kind, and the group score is the highest member score.
    """
    # Lazy import avoids a rank<->group module cycle: rank.py will import
    # group_changes at the top level (Task 4), so a top-level import of _score
    # here would create rank -> group -> rank with _score not yet defined.
    from .rank import _score

    def _first_truthy(values):
        # First truthy value of an iterable, or None when there is none.
        return next((v for v in values if v), None)

    buckets: dict[str, list[Change]] = {}
    for change in changes:
        subkey = change.group_key or heuristic_prefix(change.title)
        bucket_key = f"{change.source_id}::{subkey}"
        if bucket_key not in buckets:
            buckets[bucket_key] = []
        buckets[bucket_key].append(change)

    groups: list[DigestGroup] = []
    for key, members in buckets.items():
        members.sort(key=lambda c: (c.detected_at, c.id or 0), reverse=True)
        lead = members[0]
        source = sources.get(lead.source_id)
        # Summary: prefer the model's group summary; else a member's own
        # one-line description (so every group shows a sentence).
        summary = _first_truthy(m.group_summary for m in members) or _first_truthy(
            m.description for m in members
        )
        # Heading: prefer the model's group headline; else a member's page
        # title; else fall back to the lead member's URL.
        raw_title = _first_truthy(m.group_title for m in members) or _first_truthy(
            m.title for m in members
        )
        title = _title_case(raw_title) if raw_title else lead.url
        score = max(_score(m, source, config) for m in members)
        groups.append(
            DigestGroup(
                key=key,
                title=title,
                summary=summary,
                category=source.category if source else "",
                source_id=lead.source_id,
                change_kind=lead.change_kind,
                members=members,
                score=score,
            )
        )
    return groups