android-watcher 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. android_watcher/__init__.py +10 -0
  2. android_watcher/catalog/__init__.py +32 -0
  3. android_watcher/catalog/catalog.toml +531 -0
  4. android_watcher/cli.py +161 -0
  5. android_watcher/config.py +262 -0
  6. android_watcher/detect/__init__.py +1 -0
  7. android_watcher/detect/_normalize.py +192 -0
  8. android_watcher/detect/android_sitemap.py +540 -0
  9. android_watcher/detect/base.py +14 -0
  10. android_watcher/detect/content.py +99 -0
  11. android_watcher/detect/feed.py +135 -0
  12. android_watcher/detect/sitemap.py +203 -0
  13. android_watcher/doctor.py +125 -0
  14. android_watcher/fetch.py +162 -0
  15. android_watcher/group.py +79 -0
  16. android_watcher/lock.py +32 -0
  17. android_watcher/models.py +156 -0
  18. android_watcher/notify/__init__.py +1 -0
  19. android_watcher/notify/base.py +21 -0
  20. android_watcher/notify/email.py +52 -0
  21. android_watcher/notify/html.py +114 -0
  22. android_watcher/notify/render.py +239 -0
  23. android_watcher/notify/slack.py +124 -0
  24. android_watcher/notify/telegram.py +46 -0
  25. android_watcher/rank.py +84 -0
  26. android_watcher/registry.py +38 -0
  27. android_watcher/run.py +283 -0
  28. android_watcher/schedule.py +488 -0
  29. android_watcher/seed/__init__.py +45 -0
  30. android_watcher/seed/seed.sql.gz +0 -0
  31. android_watcher/store.py +492 -0
  32. android_watcher/triage/__init__.py +1 -0
  33. android_watcher/triage/base.py +25 -0
  34. android_watcher/triage/claude_cli.py +185 -0
  35. android_watcher/triage/noop.py +24 -0
  36. android_watcher/tui/__init__.py +1 -0
  37. android_watcher/tui/app.py +163 -0
  38. android_watcher/tui/configio.py +215 -0
  39. android_watcher/tui/screens.py +927 -0
  40. android_watcher-1.0.0.dist-info/METADATA +310 -0
  41. android_watcher-1.0.0.dist-info/RECORD +44 -0
  42. android_watcher-1.0.0.dist-info/WHEEL +4 -0
  43. android_watcher-1.0.0.dist-info/entry_points.txt +2 -0
  44. android_watcher-1.0.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,540 @@
1
+ """Host-agnostic sitemap detector.
2
+
3
+ Parses a host's sitemap (a ``<sitemapindex>`` of shards, or a single
4
+ ``<urlset>``) once per run via a run-scoped cache held on the Fetcher instance
5
+ and keyed by the sitemap-index URL (derived from each source's host). Several
6
+ sources on the same host share one download; different hosts each get their own.
7
+ Originally specific to developer.android.com (hence the registered name), it now
8
+ also serves source.android.com, developers.google.com, kotlinlang.org, etc.
9
+
10
+ English only: locale-prefixed URLs (/fr/...) and ``?hl=<non-en>`` variants are
11
+ dropped at parse time, so only canonical English pages are ever watched.
12
+
13
+ Per-source filtering (applied against the shared, cached entry list):
14
+ - ``path_prefix`` — include only URLs under this path ("" = whole host).
15
+ - ``exclude_prefixes``— drop URLs under any of these paths.
16
+ - ``require_segment`` — keep only URLs whose path has a matching segment
17
+ (``seg == s`` or ``seg startswith s-``); e.g. "android".
18
+ - ``reference_mode`` — keep | drop | index_only for /reference docs. index_only
19
+ keeps only index/summary pages (Kotlin-preferred), so
20
+ the huge per-symbol class/function reference is dropped.
21
+ - most-specific-prefix-wins: a URL under a nested same-host source's longer
22
+ prefix belongs to that source.
23
+ - version-dedup: URLs differing only by a dotted version segment (9.4) or a
24
+ ?version=/?api= query collapse to the latest; bare-integer paths like
25
+ /about/versions/14 are untouched (distinct releases).
26
+
27
+ Baseline / change semantics:
28
+ - First sight of a brand-new URL is baselined fetch-free (sitemap lastmod, empty
29
+ content_hash, no fetch) when the source has no baseline yet. Once a baseline
30
+ exists, a never-seen URL is content-confirmed and reported as Change("new").
31
+ - An already-baselined URL whose lastmod moves is content-confirmed; a real
32
+ content move is Change("updated"). lastmod alone never emits a Change.
33
+
34
+ Public API (consumed by doctor, the seed builder, and the catalog verify-script):
35
+ - ``load_sitemap(fetcher, index_url=INDEX_URL)`` — flat [(loc, lastmod), ...].
36
+ - ``prefix_count(entries, prefix)`` — count matching URLs (0 => stale prefix).
37
+ - ``make_shard_cache()`` — factory; ``ShardCache.load(fetcher, index_url)``.
38
+ - ``baseline_all(source, store, fetcher)`` — full-content baseline (seed builder).
39
+ """
40
+
41
+ from __future__ import annotations
42
+
43
+ import asyncio
44
+ import io
45
+ import logging
46
+ import re
47
+ import time
48
+ from urllib.parse import parse_qs, urlsplit
49
+
50
+ from defusedxml.ElementTree import iterparse
51
+
52
+ from ..catalog import load_catalog
53
+ from ..models import Change, Source
54
+ from .base import DETECTORS
55
+ from .sitemap import confirm_candidate
56
+
57
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# XML namespace of the sitemap schema in ElementTree's "{uri}tag" notation, and
# the default sitemap-index URL used when a caller does not supply one.
_SM = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
INDEX_URL = "https://developer.android.com/sitemap.xml"

# Leading locale path segment (/fr/..., /pt-br/...) — matched against an explicit
# set rather than a 2-letter regex so real sections like /tv, /xr, /ai survive.
_LOCALES = frozenset(
    {
        "ar", "bn", "de", "en", "es", "es-419", "fa",
        "fr", "he", "hi", "id", "in", "it", "iw",
        "ja", "ko", "ms", "pl", "pt", "pt-br", "ru",
        "th", "tr", "uk", "vi", "zh-cn", "zh-hk", "zh-tw",
    }
)

# Reserved reference index/summary leaf names (everything else under /reference
# is a per-symbol class/function page). Derived empirically from the sitemap.
_REFERENCE_INDEX_LEAVES = frozenset(
    {
        "package-summary",
        "packages",
        "classes",
        "composables",
        "modifiers",
    }
)

# A dotted version segment (9.4, 8.13.0). Requires a dot, so bare integers like
# /about/versions/14 (distinct Android releases) are NOT treated as versions.
_VERSION_RE = re.compile(r"^\d+(?:\.\d+)+$")

# Non-HTML asset extensions. Sitemaps list PDFs, images, archives, etc.; fetching
# them as text feeds binary to the HTML parser (it throws) and pollutes baselines
# with binary "excerpts" (the source of NUL bytes). Never watch these.
# Kept as a tuple because it is handed directly to ``str.endswith``.
_BINARY_EXT = (
    # documents and images
    ".pdf", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp", ".ico", ".bmp",
    # video
    ".mp4", ".webm", ".mov",
    # archives and packages
    ".zip", ".tar", ".gz", ".tgz", ".jar", ".aar", ".apk", ".aab",
    # fonts
    ".woff", ".woff2", ".ttf", ".otf", ".eot",
    # web assets
    ".css", ".js", ".wasm",
    # executables and disk images
    ".bin", ".dmg", ".exe",
)

# >= this many prefix URLs sharing one NEW lastmod => treat as a section
# regeneration and collapse to a single candidate.
BULK_COLLAPSE_THRESHOLD = 3
147
+
148
+
149
def _is_localized(loc: str) -> bool:
    """Report whether *loc* points at a translated page.

    A URL counts as a translation when its first path segment is one of the
    known locale codes, or when its ``hl`` query parameter is present with a
    first value other than ``"en"``.
    """
    url = urlsplit(loc)
    # "/fr/guide" splits into ["", "fr", "guide"]: index 1 is the first segment.
    pieces = url.path.split("/", 2)
    first_segment = pieces[1] if len(pieces) > 1 else None
    if first_segment in _LOCALES:
        return True
    requested = parse_qs(url.query).get("hl")
    if not requested:
        return False
    return requested[0] != "en"
158
+
159
+
160
def _iter_shard(xml: str):
    """Yield ``(loc, lastmod)`` for each non-localized ``<url>`` in a shard.

    The document is streamed with ``iterparse`` and each ``<url>`` element is
    cleared once handled, so a large shard is never held as a full tree. No
    path filtering is done here: the parsed list is shared by every source on
    the host and filtered per source afterwards.
    """
    loc_tag = f"{_SM}loc"
    lastmod_tag = f"{_SM}lastmod"
    url_tag = f"{_SM}url"

    pending_loc = pending_lastmod = ""
    for _event, node in iterparse(io.StringIO(xml), events=("end",)):
        name = node.tag
        if name == loc_tag:
            pending_loc = (node.text or "").strip()
            continue
        if name == lastmod_tag:
            pending_lastmod = (node.text or "").strip()
            continue
        if name != url_tag:
            continue
        # End of a <url>: emit what its children collected, then reset.
        if pending_loc and not _is_localized(pending_loc):
            yield pending_loc, pending_lastmod
        pending_loc = pending_lastmod = ""
        node.clear()
179
+
180
+
181
def _index_url_for(source: Source) -> str:
    """Build ``<scheme>://<host>/sitemap.xml`` from the source's own URL."""
    parts = urlsplit(source.url)
    return parts.scheme + "://" + parts.netloc + "/sitemap.xml"
185
+
186
+
187
class ShardCache:
    """Run-scoped parse of one host's sitemap (index-of-shards or single urlset).

    ``entries`` is only ever extended with a *complete* result. If a fetch or
    parse fails part-way, nothing is published and ``_loaded`` stays False, so
    a later ``load()`` retries from a clean state instead of appending the same
    URLs a second time.
    """

    def __init__(self) -> None:
        self._loaded = False
        self._lock = asyncio.Lock()  # serializes concurrent population
        self.entries: list[tuple[str, str]] = []  # flat (loc, lastmod)

    async def load(self, fetcher, index_url: str = INDEX_URL) -> list[tuple[str, str]]:
        """Fetch and parse the sitemap at *index_url* once; return the entries.

        Args:
            fetcher: object exposing ``await fetch(url, conditional=True)`` whose
                result has ``text`` and ``not_modified`` attributes.
            index_url: sitemap URL; may serve a ``<sitemapindex>`` or a bare
                ``<urlset>``.

        Returns:
            The shared ``entries`` list of ``(loc, lastmod)`` tuples. It is empty
            when the index fetch reports not-modified or returns no text.
        """
        # CONCURRENCY: many sources on a host call load() concurrently in one run.
        # Guard population with an asyncio.Lock so the FIRST caller fetches once and
        # the rest await the in-flight load. Double-checked: fast path skips the lock.
        if self._loaded:
            return self.entries
        async with self._lock:
            if self._loaded:
                return self.entries
            t0 = time.monotonic()
            idx = await fetcher.fetch(index_url, conditional=True)
            text = idx.text if not idx.not_modified else ""
            if text and "<sitemapindex" in text[:500]:
                await self._load_index(fetcher, text)
            elif text:
                # A bare <urlset> served directly at the sitemap URL (single shard).
                # Parse fully before publishing so a parse error leaves entries intact.
                parsed = list(_iter_shard(text))
                self.entries.extend(parsed)
                logger.info(
                    "sitemap: %d urls (single urlset) in %.1fs",
                    len(self.entries),
                    time.monotonic() - t0,
                )
            self._loaded = True
            return self.entries

    async def _load_index(self, fetcher, index_text: str) -> None:
        """Fetch every shard listed in *index_text* concurrently and publish
        their URLs to ``entries`` only once all shards were handled.

        Shards answering not-modified or with empty text contribute no URLs.
        If any shard fetch or parse raises, the remaining fetch tasks are
        cancelled and reaped, ``entries`` is left untouched, and the exception
        propagates to the caller.
        """
        shard_locs: list[str] = []
        for _e, el in iterparse(io.StringIO(index_text), events=("end",)):
            if el.tag == f"{_SM}loc" and el.text:
                shard_locs.append(el.text.strip())
            if el.tag == f"{_SM}sitemap":
                el.clear()
        total = len(shard_locs)
        logger.info("sitemap index: %d shard(s)", total)
        t = time.monotonic()

        async def _fetch(n: int, url: str):
            ts = time.monotonic()
            res = await fetcher.fetch(url, conditional=True)
            return n, res, time.monotonic() - ts

        collected: list[tuple[str, str]] = []
        tasks = [asyncio.create_task(_fetch(n, u)) for n, u in enumerate(shard_locs, 1)]
        try:
            for fut in asyncio.as_completed(tasks):
                n, res, dt = await fut
                if res.not_modified or not res.text:
                    logger.info(" shard %d/%d: 304/empty in %.1fs", n, total, dt)
                    continue
                shard_urls = list(_iter_shard(res.text))
                collected.extend(shard_urls)
                logger.info(
                    " shard %d/%d: %dKB, %d urls, %.1fs",
                    n,
                    total,
                    len(res.text) // 1024,
                    len(shard_urls),
                    dt,
                )
        finally:
            # On the success path every task is already done and this is a no-op.
            # On failure, stop the remaining downloads and retrieve their results
            # so no task is left running or with an unretrieved exception.
            pending = [task for task in tasks if not task.done()]
            for task in pending:
                task.cancel()
            if pending:
                await asyncio.gather(*pending, return_exceptions=True)

        self.entries.extend(collected)
        logger.info(
            "sitemap loaded: %d urls from %d shard(s) in %.1fs",
            len(self.entries),
            total,
            time.monotonic() - t,
        )

    def stale_prefix(self, prefix: str) -> bool:
        """True when no loaded sitemap URL falls under *prefix*."""
        return prefix_count(self.entries, prefix) == 0
261
+
262
+
263
+ # ---- public API -------------------------------------------------------------
264
+
265
+
266
def make_shard_cache() -> ShardCache:
    """Factory returning a fresh, not-yet-loaded ``ShardCache``."""
    cache = ShardCache()
    return cache
268
+
269
+
270
async def load_sitemap(fetcher, index_url: str = INDEX_URL) -> list[tuple[str, str]]:
    """Return the flat ``(loc, lastmod)`` list for *index_url*.

    The parse is done by the ``ShardCache`` stored on *fetcher* for this index
    URL, so repeated calls with the same fetcher share one load.
    """
    cache = _cache_for(fetcher, index_url)
    entries = await cache.load(fetcher, index_url)
    return entries
273
+
274
+
275
def prefix_count(entries: list[tuple[str, str]], prefix: str) -> int:
    """Count the sitemap URLs whose path starts with *prefix*.

    An empty *prefix* matches everything, so the total number of entries is
    returned. A result of 0 for a non-empty prefix signals a stale prefix.
    """
    if prefix:
        matches = 0
        for loc, _lastmod in entries:
            if urlsplit(loc).path.startswith(prefix):
                matches += 1
        return matches
    return len(entries)
280
+
281
+
282
def _cache_for(fetcher, index_url: str = INDEX_URL) -> ShardCache:
    """Return the ``ShardCache`` kept on *fetcher* for *index_url*.

    The caches live in a dict stored as ``fetcher._shard_caches``; both the
    dict and the per-URL cache are created on first use.
    """
    registry = getattr(fetcher, "_shard_caches", None)
    if registry is None:
        registry = fetcher._shard_caches = {}
    existing = registry.get(index_url)
    if existing is not None:
        return existing
    created = registry[index_url] = make_shard_cache()
    return created
292
+
293
+
294
def _representative(locs: list[str]) -> str:
    """Choose the cluster's canonical URL: the one with the shortest path.

    When several URLs tie, the first of them in *locs* is returned.
    """

    def path_length(url: str) -> int:
        return len(urlsplit(url).path)

    return min(locs, key=path_length)
297
+
298
+
299
def _descendant_prefixes(source: Source) -> set[str]:
    """Collect the path prefixes of other catalog sources nested under *source*.

    A catalog source qualifies when it uses the ``android_sitemap`` detector,
    is enabled, is not *source* itself, resolves to the same sitemap-index URL,
    and has a non-empty prefix that is strictly longer than — and starts with —
    this source's prefix (most-specific-prefix-wins).
    """
    own_index = _index_url_for(source)
    own_prefix = source.path_prefix
    nested: set[str] = set()
    for candidate in load_catalog():
        if candidate.detector != "android_sitemap" or not candidate.enabled:
            continue
        if candidate.id == source.id:
            continue
        prefix = candidate.path_prefix
        if not prefix:
            continue
        if _index_url_for(candidate) != own_index:
            continue
        if len(prefix) > len(own_prefix) and prefix.startswith(own_prefix):
            nested.add(prefix)
    return nested
316
+
317
+
318
def _is_reference(path: str) -> bool:
    """True when any segment of *path* is exactly ``reference``."""
    segments = path.strip("/").split("/")
    return any(segment == "reference" for segment in segments)
320
+
321
+
322
def _kotlin_twin(path: str) -> str | None:
    """Return the Kotlin counterpart of a Java reference path.

    ``kotlin`` is inserted directly after the first ``reference`` segment.
    ``None`` is returned when the path has no ``reference`` segment or when
    that segment is already followed by ``kotlin``.
    """
    segs = path.strip("/").split("/")
    try:
        at = segs.index("reference")
    except ValueError:
        return None
    # A one-element slice avoids an explicit bounds check on the next segment.
    if segs[at + 1 : at + 2] == ["kotlin"]:
        return None
    head, tail = segs[: at + 1], segs[at + 1 :]
    return "/" + "/".join([*head, "kotlin", *tail])
332
+
333
+
334
def _watched(source: Source, loc: str, all_paths: set[str]) -> bool:
    """Decide whether *loc* survives this source's filters.

    Checks, in order: binary-asset extension, ``path_prefix`` inclusion,
    ``exclude_prefixes``, ``require_segment`` and — for paths containing a
    ``reference`` segment — ``reference_mode``. *all_paths* is the set of every
    sitemap path on the host, used to look up Kotlin twins.
    """
    path = urlsplit(loc).path
    if path.lower().endswith(_BINARY_EXT):
        return False  # PDFs, images, archives, etc. are not text pages

    prefix = source.path_prefix
    if prefix and not path.startswith(prefix):
        return False

    for excluded in source.exclude_prefixes:
        if path.startswith(excluded):
            return False

    required = source.require_segment
    if required:
        dashed = required + "-"
        segments = path.strip("/").split("/")
        if not any(seg == required or seg.startswith(dashed) for seg in segments):
            return False

    if not _is_reference(path):
        return True

    mode = source.reference_mode
    if mode == "drop":
        return False
    if mode != "index_only":
        return True

    # index_only: keep only index/summary leaves ...
    leaf = path.rstrip("/").split("/")[-1]
    if leaf not in _REFERENCE_INDEX_LEAVES:
        return False
    # ... and, Kotlin-preferred, drop the Java page if a Kotlin twin exists.
    twin = _kotlin_twin(path)
    return twin is None or twin not in all_paths
359
+
360
+
361
def _version_key(loc: str):
    """Derive the version-dedup key for *loc*.

    Returns:
        ``(canonical_group, version_tuple)`` when *loc* carries a version, else
        ``(None, None)``.

        - Path form: the first path segment matching ``_VERSION_RE`` (a dotted
          version) is replaced by ``*`` to form the group; the query string is
          kept as part of the group.
        - Query form: the first of ``version``, ``api``, ``apilevel`` whose
          first value is a well-formed number (``14``, ``9.4``, ``8.13.0``)
          forms the group ``<path>|<key>``.
    """
    parts = urlsplit(loc)
    segs = parts.path.strip("/").split("/")
    for i, s in enumerate(segs):
        if _VERSION_RE.match(s):
            canon = "/" + "/".join(segs[:i] + ["*"] + segs[i + 1 :]) + "?" + parts.query
            return canon, tuple(int(x) for x in s.split("."))
    q = parse_qs(parts.query)
    for key in ("version", "api", "apilevel"):
        vals = q.get(key)
        # Require digit groups separated by single dots. A looser "digits and
        # dots" check lets values such as "1..2", "1." or ".5" through, and the
        # int() conversion below then raises ValueError on a URL that came
        # from a remote sitemap.
        if vals and re.fullmatch(r"\d+(?:\.\d+)*", vals[0]):
            base = parts.path + "|" + key
            return base, tuple(int(x) for x in vals[0].split("."))
    return None, None
378
+
379
+
380
def _dedup_versions(items: list[tuple[str, str]]) -> list[tuple[str, str]]:
    """Keep only the highest version within each version group.

    Entries for which ``_version_key`` finds no version are returned first, in
    their original order, followed by one winner per group in the order the
    groups were first encountered.
    """
    latest: dict[str, tuple[tuple[int, ...], str, str]] = {}
    unversioned: list[tuple[str, str]] = []
    for loc, lastmod in items:
        group, version = _version_key(loc)
        if group is None:
            unversioned.append((loc, lastmod))
        elif group not in latest or version > latest[group][0]:
            latest[group] = (version, loc, lastmod)
    winners = [(loc, lastmod) for _version, loc, lastmod in latest.values()]
    return unversioned + winners
394
+
395
+
396
def _gather_watched(source: Source, entries: list[tuple[str, str]]) -> list[tuple[str, str]]:
    """Reduce the host-wide sitemap entries to the ones *source* watches.

    Drops URLs that fall under a nested same-host source's prefix, applies the
    source's own filters, then collapses version variants.
    """
    all_paths = {urlsplit(loc).path for loc, _ in entries}
    # str.startswith accepts a tuple; an empty tuple never matches.
    nested = tuple(_descendant_prefixes(source))
    kept: list[tuple[str, str]] = []
    for entry in entries:
        loc = entry[0]
        if urlsplit(loc).path.startswith(nested):
            continue
        if _watched(source, loc, all_paths):
            kept.append((loc, entry[1]))
    return _dedup_versions(kept)
409
+
410
+
411
async def baseline_all(source: Source, store, fetcher) -> int:
    """Content-baseline every watched URL of *source* (used by the seed builder).

    Each URL that has no snapshot yet, or whose snapshot has an empty
    ``content_hash``, is passed to ``confirm_candidate``; already-hashed URLs
    are skipped, which makes the operation resumable. All confirmations are
    started as concurrent tasks and progress is logged every 100 completions
    and at the end.

    Returns:
        The number of URLs that were queued for baselining (0 if none).
    """
    entries = await load_sitemap(fetcher, _index_url_for(source))

    todo: list[tuple[str, str]] = []
    for loc, lastmod in _gather_watched(source, entries):
        snap = store.get_snapshot(source.id, loc)
        if snap is not None and snap.content_hash:
            continue  # already baselined with real content
        todo.append((loc, lastmod))

    total = len(todo)
    if total == 0:
        return 0

    logger.info("%s: baselining %d URL(s)…", source.id, total)
    tasks = []
    for loc, lastmod in todo:
        coro = confirm_candidate(source, store, fetcher, loc, lastmod)
        tasks.append(asyncio.create_task(coro))

    for done, fut in enumerate(asyncio.as_completed(tasks), 1):
        await fut
        if done == total or done % 100 == 0:
            logger.info("%s: baselined %d/%d", source.id, done, total)
    return total
440
+
441
+
442
@DETECTORS.register("android_sitemap")
class AndroidSitemapDetector:
    """Sitemap-driven change detector, registered as ``android_sitemap``."""

    async def detect(self, source: Source, store, fetcher) -> list[Change]:
        """Compare the host sitemap against stored snapshots for *source*.

        Args:
            source: the watched source (supplies ``id``, ``url``, ``path_prefix``
                and the filter settings used by ``_gather_watched``).
            store: snapshot store exposing ``source_has_snapshots``,
                ``get_snapshot`` and ``upsert_snapshot``.
            fetcher: fetcher passed through to ``load_sitemap`` and
                ``confirm_candidate``.

        Returns:
            The non-``None`` results of ``confirm_candidate`` for never-seen URLs
            (when the source already has a baseline) and for already-seen URLs
            whose sitemap ``lastmod`` differs from the snapshot. URLs seen for
            the first time on a source with no baseline are recorded silently.
        """
        entries = await load_sitemap(fetcher, _index_url_for(source))

        # Stale-prefix health check: a non-empty prefix that matches nothing is a
        # config problem, not a change. Log and return []; doctor surfaces it.
        if source.path_prefix and prefix_count(entries, source.path_prefix) == 0:
            logger.warning(
                "android_sitemap: path_prefix %r matches zero sitemap URLs for %r",
                source.path_prefix,
                source.id,
            )
            return []

        watched = _gather_watched(source, entries)
        has_baseline = store.source_has_snapshots(source.id)

        # Partition: brand-new URLs (genuinely new vs first-ever baseline) and
        # already-seen URLs whose lastmod moved.
        new_pages: list[tuple[str, str]] = []  # new after baseline -> Change("new")
        fetchfree: list[tuple[str, str]] = []  # first-ever baseline -> silent
        candidates: list[tuple[str, str]] = []  # lastmod moved -> Change("updated")
        for loc, lastmod in watched:
            snap = store.get_snapshot(source.id, loc)
            if snap is None:
                (new_pages if has_baseline else fetchfree).append((loc, lastmod))
                continue
            if lastmod and snap.lastmod == lastmod:
                continue
            candidates.append((loc, lastmod))

        # First-ever baseline: record lastmod with empty hash, no fetch.
        for loc, lastmod in fetchfree:
            store.upsert_snapshot(
                source.id, loc, signal_type="sitemap", content_hash="", lastmod=lastmod, excerpt=""
            )
        if fetchfree:
            logger.info("%s: %d new URL(s) baselined fetch-free", source.id, len(fetchfree))

        changes: list[Change] = []

        # Genuinely new URLs (after baseline): confirm-fetch + report as "new".
        if new_pages:
            logger.info("%s: %d new page(s) -> confirm…", source.id, len(new_pages))
            tasks = [
                asyncio.create_task(
                    confirm_candidate(source, store, fetcher, loc, lastmod, emit_new=True)
                )
                for loc, lastmod in new_pages
            ]
            for fut in asyncio.as_completed(tasks):
                ch = await fut
                if ch is not None:
                    changes.append(ch)

        if not candidates:
            return changes

        # Updated URLs: cluster by lastmod, collapse bulk section regenerations.
        by_lastmod: dict[str, list[str]] = {}
        for loc, lastmod in candidates:
            by_lastmod.setdefault(lastmod, []).append(loc)

        def _is_bulk(cluster_lastmod: str, cluster_locs: list[str]) -> bool:
            # Only URLs sharing a real, NEW lastmod indicate a section
            # regeneration. URLs with no lastmod at all also share the key ""
            # but have nothing in common; collapsing them would mean only the
            # shortest-path URL is ever content-checked.
            return bool(cluster_lastmod) and len(cluster_locs) >= BULK_COLLAPSE_THRESHOLD

        fetches = sum(
            1 if _is_bulk(lastmod, locs) else len(locs) for lastmod, locs in by_lastmod.items()
        )
        logger.info(
            "%s: %d candidate(s), %d cluster(s) -> ~%d confirm-fetch(es)…",
            source.id,
            len(candidates),
            len(by_lastmod),
            fetches,
        )
        for lastmod, locs in by_lastmod.items():
            if _is_bulk(lastmod, locs):
                rep = _representative(locs)
                change = await confirm_candidate(source, store, fetcher, rep, lastmod)
                # The other cluster members are not fetched: advance their
                # lastmod and keep whatever hash/excerpt they already had.
                for loc in locs:
                    if loc != rep:
                        snap = store.get_snapshot(source.id, loc)
                        prior = snap.content_hash if snap is not None else ""
                        excerpt = snap.excerpt if snap is not None else ""
                        store.upsert_snapshot(
                            source.id,
                            loc,
                            signal_type="sitemap",
                            content_hash=prior,
                            lastmod=lastmod,
                            excerpt=excerpt,
                        )
                if change is not None:
                    changes.append(change)
            else:
                for loc in locs:
                    change = await confirm_candidate(source, store, fetcher, loc, lastmod)
                    if change is not None:
                        changes.append(change)

        return changes
@@ -0,0 +1,14 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Protocol, runtime_checkable
4
+
5
+ from ..models import Change, Source
6
+ from ..registry import Registry
7
+
8
+
9
@runtime_checkable
class Detector(Protocol):
    """Structural interface for detectors: anything with a ``detect`` method."""

    async def detect(self, source: Source, store, fetcher) -> list[Change]:
        """Inspect *source* and return the list of changes found."""
        ...
12
+
13
+
14
# Registry of detector implementations. Detector classes add themselves with
# the ``@DETECTORS.register("<name>")`` class decorator.
DETECTORS: Registry[Detector] = Registry("detector")
@@ -0,0 +1,99 @@
1
+ """Content-hash detector.
2
+
3
+ Fetches a page, extracts and normalises the selected region, and emits an
4
+ "updated" Change when the normalised text moves. Cosmetic edits (CSS classes,
5
+ whitespace, attributes) leave the normalised text — and therefore the hash —
6
+ unchanged, so they never raise a signal.
7
+
8
+ Empty-render guard: a page whose normalised main-text falls below
9
+ ``EMPTY_RENDER_THRESHOLD`` is a client-rendered shell, a HEALTH condition rather
10
+ than a change. The detector refuses to baseline it and emits no Change; it logs
11
+ a warning and ``doctor`` surfaces the condition.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import difflib
17
+ import logging
18
+
19
+ from ..models import Change, Source
20
+ from ._normalize import (
21
+ EMPTY_RENDER_THRESHOLD,
22
+ content_hash,
23
+ extract_main,
24
+ extract_title,
25
+ normalize_text,
26
+ )
27
+ from .base import DETECTORS
28
+
29
+ log = logging.getLogger(__name__)
30
+
31
+
32
@DETECTORS.register("content")
class ContentDetector:
    """Content-hash detector, registered as ``content``."""

    async def detect(self, source: Source, store, fetcher) -> list[Change]:
        """Fetch ``source.url`` and report an "updated" change when the hash of
        its normalised main text differs from the stored snapshot.

        Returns an empty list when the fetch is not-modified or empty, when the
        page renders below ``EMPTY_RENDER_THRESHOLD`` (nothing is stored), on
        first sight (the snapshot is stored silently), or when the hash is
        unchanged.
        """
        res = await fetcher.fetch(source.url, conditional=True)
        if res.not_modified or not res.text:
            return []

        text = normalize_text(extract_main(res.text, source.content_selector))
        rendered = len(text)

        if rendered < EMPTY_RENDER_THRESHOLD:
            # Health, not changes: a JS shell renders almost no text. Do NOT
            # baseline (so we never lock in an empty hash) and do NOT emit a
            # Change. doctor's content-render check surfaces the condition.
            log.warning(
                "content detector: %s rendered empty (%d chars < %d) — "
                "likely a JS shell; not baselined",
                source.url,
                rendered,
                EMPTY_RENDER_THRESHOLD,
            )
            return []

        new_hash = content_hash(text)

        def record_snapshot() -> None:
            # Same write for the first-sight baseline and for an update.
            store.upsert_snapshot(
                source.id,
                source.url,
                signal_type="content",
                content_hash=new_hash,
                lastmod="",
                excerpt=text[:500],
            )

        snap = store.get_snapshot(source.id, source.url)

        if snap is None:
            # First sight: baseline silently (snapshot only, no Change).
            record_snapshot()
            return []

        if snap.content_hash == new_hash:
            return []

        # Build the diff from the stored excerpt before the snapshot is replaced.
        diff_lines = difflib.unified_diff(
            snap.excerpt.splitlines(),
            text.splitlines(),
            fromfile="before",
            tofile="after",
            lineterm="",
        )
        raw_diff = "\n".join(diff_lines)

        record_snapshot()

        updated = Change(
            source_id=source.id,
            url=source.url,
            change_kind="updated",
            title=extract_title(res.text) or source.name,
            raw_diff=raw_diff[:2000],
            fetched_hash=new_hash,
        )
        return [updated]