android-watcher 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- android_watcher/__init__.py +10 -0
- android_watcher/catalog/__init__.py +32 -0
- android_watcher/catalog/catalog.toml +531 -0
- android_watcher/cli.py +161 -0
- android_watcher/config.py +262 -0
- android_watcher/detect/__init__.py +1 -0
- android_watcher/detect/_normalize.py +192 -0
- android_watcher/detect/android_sitemap.py +540 -0
- android_watcher/detect/base.py +14 -0
- android_watcher/detect/content.py +99 -0
- android_watcher/detect/feed.py +135 -0
- android_watcher/detect/sitemap.py +203 -0
- android_watcher/doctor.py +125 -0
- android_watcher/fetch.py +162 -0
- android_watcher/group.py +79 -0
- android_watcher/lock.py +32 -0
- android_watcher/models.py +156 -0
- android_watcher/notify/__init__.py +1 -0
- android_watcher/notify/base.py +21 -0
- android_watcher/notify/email.py +52 -0
- android_watcher/notify/html.py +114 -0
- android_watcher/notify/render.py +239 -0
- android_watcher/notify/slack.py +124 -0
- android_watcher/notify/telegram.py +46 -0
- android_watcher/rank.py +84 -0
- android_watcher/registry.py +38 -0
- android_watcher/run.py +283 -0
- android_watcher/schedule.py +488 -0
- android_watcher/seed/__init__.py +45 -0
- android_watcher/seed/seed.sql.gz +0 -0
- android_watcher/store.py +492 -0
- android_watcher/triage/__init__.py +1 -0
- android_watcher/triage/base.py +25 -0
- android_watcher/triage/claude_cli.py +185 -0
- android_watcher/triage/noop.py +24 -0
- android_watcher/tui/__init__.py +1 -0
- android_watcher/tui/app.py +163 -0
- android_watcher/tui/configio.py +215 -0
- android_watcher/tui/screens.py +927 -0
- android_watcher-1.0.0.dist-info/METADATA +310 -0
- android_watcher-1.0.0.dist-info/RECORD +44 -0
- android_watcher-1.0.0.dist-info/WHEEL +4 -0
- android_watcher-1.0.0.dist-info/entry_points.txt +2 -0
- android_watcher-1.0.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,540 @@
|
|
|
1
|
+
"""Host-agnostic sitemap detector.
|
|
2
|
+
|
|
3
|
+
Parses a host's sitemap (a ``<sitemapindex>`` of shards, or a single
|
|
4
|
+
``<urlset>``) once per run via a run-scoped cache held on the Fetcher instance
|
|
5
|
+
and keyed by the sitemap-index URL (derived from each source's host). Several
|
|
6
|
+
sources on the same host share one download; different hosts each get their own.
|
|
7
|
+
Originally specific to developer.android.com (hence the registered name), it now
|
|
8
|
+
also serves source.android.com, developers.google.com, kotlinlang.org, etc.
|
|
9
|
+
|
|
10
|
+
English only: locale-prefixed URLs (/fr/...) and ``?hl=<non-en>`` variants are
|
|
11
|
+
dropped at parse time, so only canonical English pages are ever watched.
|
|
12
|
+
|
|
13
|
+
Per-source filtering (applied against the shared, cached entry list):
|
|
14
|
+
- ``path_prefix`` — include only URLs under this path ("" = whole host).
|
|
15
|
+
- ``exclude_prefixes``— drop URLs under any of these paths.
|
|
16
|
+
- ``require_segment`` — keep only URLs whose path has a matching segment
|
|
17
|
+
(``seg == s`` or ``seg startswith s-``); e.g. "android".
|
|
18
|
+
- ``reference_mode`` — keep | drop | index_only for /reference docs. index_only
|
|
19
|
+
keeps only index/summary pages (Kotlin-preferred), so
|
|
20
|
+
the huge per-symbol class/function reference is dropped.
|
|
21
|
+
- most-specific-prefix-wins: a URL under a nested same-host source's longer
|
|
22
|
+
prefix belongs to that source.
|
|
23
|
+
- version-dedup: URLs differing only by a dotted version segment (9.4) or a
|
|
24
|
+
?version=/?api=/?apilevel= query collapse to the latest; bare-integer paths like
|
|
25
|
+
/about/versions/14 are untouched (distinct releases).
|
|
26
|
+
|
|
27
|
+
Baseline / change semantics:
|
|
28
|
+
- First sight of a brand-new URL is baselined fetch-free (sitemap lastmod, empty
|
|
29
|
+
content_hash, no fetch) when the source has no baseline yet. Once a baseline
|
|
30
|
+
exists, a never-seen URL is content-confirmed and reported as Change("new").
|
|
31
|
+
- An already-baselined URL whose lastmod moves is content-confirmed; a real
|
|
32
|
+
content move is Change("updated"). lastmod alone never emits a Change.
|
|
33
|
+
|
|
34
|
+
Public API (consumed by doctor, the seed builder, and the catalog verify-script):
|
|
35
|
+
- ``load_sitemap(fetcher, index_url=INDEX_URL)`` — flat [(loc, lastmod), ...].
|
|
36
|
+
- ``prefix_count(entries, prefix)`` — count matching URLs (0 => stale prefix).
|
|
37
|
+
- ``make_shard_cache()`` — factory; ``ShardCache.load(fetcher, index_url)``.
|
|
38
|
+
- ``baseline_all(source, store, fetcher)`` — full-content baseline (seed builder).
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
from __future__ import annotations
|
|
42
|
+
|
|
43
|
+
import asyncio
|
|
44
|
+
import io
|
|
45
|
+
import logging
|
|
46
|
+
import re
|
|
47
|
+
import time
|
|
48
|
+
from urllib.parse import parse_qs, urlsplit
|
|
49
|
+
|
|
50
|
+
from defusedxml.ElementTree import iterparse
|
|
51
|
+
|
|
52
|
+
from ..catalog import load_catalog
|
|
53
|
+
from ..models import Change, Source
|
|
54
|
+
from .base import DETECTORS
|
|
55
|
+
from .sitemap import confirm_candidate
|
|
56
|
+
|
|
57
|
+
logger = logging.getLogger(__name__)
|
|
58
|
+
|
|
59
|
+
_SM = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
|
|
60
|
+
INDEX_URL = "https://developer.android.com/sitemap.xml"
|
|
61
|
+
|
|
62
|
+
# Leading locale path segment (/fr/..., /pt-br/...) — matched against an explicit
|
|
63
|
+
# set rather than a 2-letter regex so real sections like /tv, /xr, /ai survive.
|
|
64
|
+
_LOCALES = frozenset(
|
|
65
|
+
{
|
|
66
|
+
"ar",
|
|
67
|
+
"bn",
|
|
68
|
+
"de",
|
|
69
|
+
"en",
|
|
70
|
+
"es",
|
|
71
|
+
"es-419",
|
|
72
|
+
"fa",
|
|
73
|
+
"fr",
|
|
74
|
+
"he",
|
|
75
|
+
"hi",
|
|
76
|
+
"id",
|
|
77
|
+
"in",
|
|
78
|
+
"it",
|
|
79
|
+
"iw",
|
|
80
|
+
"ja",
|
|
81
|
+
"ko",
|
|
82
|
+
"ms",
|
|
83
|
+
"pl",
|
|
84
|
+
"pt",
|
|
85
|
+
"pt-br",
|
|
86
|
+
"ru",
|
|
87
|
+
"th",
|
|
88
|
+
"tr",
|
|
89
|
+
"uk",
|
|
90
|
+
"vi",
|
|
91
|
+
"zh-cn",
|
|
92
|
+
"zh-hk",
|
|
93
|
+
"zh-tw",
|
|
94
|
+
}
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
# Reserved reference index/summary leaf names (everything else under /reference
|
|
98
|
+
# is a per-symbol class/function page). Derived empirically from the sitemap.
|
|
99
|
+
_REFERENCE_INDEX_LEAVES = frozenset(
|
|
100
|
+
{"package-summary", "packages", "classes", "composables", "modifiers"}
|
|
101
|
+
)
|
|
102
|
+
|
|
103
|
+
# A dotted version segment (9.4, 8.13.0). Requires a dot, so bare integers like
|
|
104
|
+
# /about/versions/14 (distinct Android releases) are NOT treated as versions.
|
|
105
|
+
_VERSION_RE = re.compile(r"^\d+(?:\.\d+)+$")
|
|
106
|
+
|
|
107
|
+
# Non-HTML asset extensions. Sitemaps list PDFs, images, archives, etc.; fetching
|
|
108
|
+
# them as text feeds binary to the HTML parser (it throws) and pollutes baselines
|
|
109
|
+
# with binary "excerpts" (the source of NUL bytes). Never watch these.
|
|
110
|
+
_BINARY_EXT = (
|
|
111
|
+
".pdf",
|
|
112
|
+
".png",
|
|
113
|
+
".jpg",
|
|
114
|
+
".jpeg",
|
|
115
|
+
".gif",
|
|
116
|
+
".svg",
|
|
117
|
+
".webp",
|
|
118
|
+
".ico",
|
|
119
|
+
".bmp",
|
|
120
|
+
".mp4",
|
|
121
|
+
".webm",
|
|
122
|
+
".mov",
|
|
123
|
+
".zip",
|
|
124
|
+
".tar",
|
|
125
|
+
".gz",
|
|
126
|
+
".tgz",
|
|
127
|
+
".jar",
|
|
128
|
+
".aar",
|
|
129
|
+
".apk",
|
|
130
|
+
".aab",
|
|
131
|
+
".woff",
|
|
132
|
+
".woff2",
|
|
133
|
+
".ttf",
|
|
134
|
+
".otf",
|
|
135
|
+
".eot",
|
|
136
|
+
".css",
|
|
137
|
+
".js",
|
|
138
|
+
".wasm",
|
|
139
|
+
".bin",
|
|
140
|
+
".dmg",
|
|
141
|
+
".exe",
|
|
142
|
+
)
|
|
143
|
+
|
|
144
|
+
# >= this many prefix URLs sharing one NEW lastmod => treat as a section
|
|
145
|
+
# regeneration and collapse to a single candidate.
|
|
146
|
+
BULK_COLLAPSE_THRESHOLD = 3
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _is_localized(loc: str) -> bool:
|
|
150
|
+
"""True if *loc* is a translation: a leading locale path segment, or an
|
|
151
|
+
`hl` query parameter requesting any language other than English."""
|
|
152
|
+
parts = urlsplit(loc)
|
|
153
|
+
seg = parts.path.split("/", 2)
|
|
154
|
+
if len(seg) > 1 and seg[1] in _LOCALES:
|
|
155
|
+
return True
|
|
156
|
+
hl = parse_qs(parts.query).get("hl")
|
|
157
|
+
return bool(hl) and hl[0] != "en"
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _iter_shard(xml: str):
    """Yield ``(loc, lastmod)`` for each non-localized ``<url>`` in *xml*.

    The document is parsed incrementally on element-end events. The text of
    the most recent ``<loc>`` and ``<lastmod>`` is remembered until the
    enclosing ``<url>`` closes; at that point the pair is yielded (unless the
    loc is empty or localized), the remembered values are reset, and the
    ``<url>`` element is cleared. No path filtering happens here — callers
    filter the resulting pairs per source.
    """
    loc_tag = f"{_SM}loc"
    lastmod_tag = f"{_SM}lastmod"
    url_tag = f"{_SM}url"

    current_loc = current_lastmod = ""
    for _, node in iterparse(io.StringIO(xml), events=("end",)):
        name = node.tag
        if name == loc_tag:
            current_loc = (node.text or "").strip()
            continue
        if name == lastmod_tag:
            current_lastmod = (node.text or "").strip()
            continue
        if name != url_tag:
            continue
        # End of one <url> record: emit it, then start the next with a clean slate.
        if current_loc and not _is_localized(current_loc):
            yield current_loc, current_lastmod
        current_loc = current_lastmod = ""
        node.clear()
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _index_url_for(source: Source) -> str:
|
|
182
|
+
"""Derive the sitemap-index URL from a source's host: <scheme>://<host>/sitemap.xml."""
|
|
183
|
+
p = urlsplit(source.url)
|
|
184
|
+
return f"{p.scheme}://{p.netloc}/sitemap.xml"
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
class ShardCache:
    """Run-scoped parse of one host's sitemap (index-of-shards or single urlset).

    ``entries`` holds the flat ``(loc, lastmod)`` pairs produced by
    ``_iter_shard``. ``load`` fills it once; later calls return the same list.
    """

    def __init__(self) -> None:
        self._loaded = False
        self._lock = asyncio.Lock()  # serializes concurrent population
        self.entries: list[tuple[str, str]] = []  # flat (loc, lastmod)

    async def load(self, fetcher, index_url: str = INDEX_URL) -> list[tuple[str, str]]:
        """Fetch and parse the sitemap at *index_url*, at most once.

        The response is treated as a ``<sitemapindex>`` when that tag appears
        in its first 500 characters (each listed shard is then fetched), and
        as a bare ``<urlset>`` otherwise. A not-modified or empty response
        leaves ``entries`` empty but still marks the cache as loaded.

        Returns ``self.entries``. Any exception from the fetcher or the parser
        propagates; the cache then stays unloaded so a later call retries.
        """
        # CONCURRENCY: many sources on a host call load() concurrently in one run.
        # Guard population with an asyncio.Lock so the FIRST caller fetches once and
        # the rest await the in-flight load. Double-checked: fast path skips the lock.
        if self._loaded:
            return self.entries
        async with self._lock:
            if self._loaded:
                return self.entries
            # A previous attempt may have raised after some shards were already
            # appended. Drop those leftovers so a retry cannot duplicate URLs.
            self.entries.clear()
            t0 = time.monotonic()
            idx = await fetcher.fetch(index_url, conditional=True)
            text = idx.text if not idx.not_modified else ""
            if text and "<sitemapindex" in text[:500]:
                await self._load_index(fetcher, text)
            elif text:
                # A bare <urlset> served directly at the sitemap URL (single shard).
                self.entries.extend(_iter_shard(text))
                logger.info(
                    "sitemap: %d urls (single urlset) in %.1fs",
                    len(self.entries),
                    time.monotonic() - t0,
                )
            self._loaded = True
            return self.entries

    async def _load_index(self, fetcher, index_text: str) -> None:
        """Fetch every shard listed in *index_text* and append its URLs.

        Shards are fetched as concurrent tasks and parsed in completion order,
        so the order of ``entries`` follows completion, not index order.
        Shards that come back not-modified or empty are logged and skipped.
        """
        shard_locs: list[str] = []
        for _e, el in iterparse(io.StringIO(index_text), events=("end",)):
            if el.tag == f"{_SM}loc" and el.text:
                shard_locs.append(el.text.strip())
            if el.tag == f"{_SM}sitemap":
                el.clear()
        total = len(shard_locs)
        logger.info("sitemap index: %d shard(s)", total)
        t = time.monotonic()

        async def _fetch(n: int, url: str):
            # Returns the shard's 1-based index alongside the response so the
            # log line can identify it even though completion order is arbitrary.
            ts = time.monotonic()
            res = await fetcher.fetch(url, conditional=True)
            return n, res, time.monotonic() - ts

        tasks = [asyncio.create_task(_fetch(n, u)) for n, u in enumerate(shard_locs, 1)]
        try:
            for fut in asyncio.as_completed(tasks):
                n, res, dt = await fut
                if res.not_modified or not res.text:
                    logger.info("  shard %d/%d: 304/empty in %.1fs", n, total, dt)
                    continue
                before = len(self.entries)
                self.entries.extend(_iter_shard(res.text))
                logger.info(
                    "  shard %d/%d: %dKB, %d urls, %.1fs",
                    n,
                    total,
                    len(res.text) // 1024,
                    len(self.entries) - before,
                    dt,
                )
        finally:
            # If a fetch or parse failed (or this coroutine was cancelled), do
            # not leave the remaining shard downloads running in the background.
            # On the success path every task is already done, so this is a no-op.
            for task in tasks:
                if not task.done():
                    task.cancel()
        logger.info(
            "sitemap loaded: %d urls from %d shard(s) in %.1fs",
            len(self.entries),
            total,
            time.monotonic() - t,
        )

    def stale_prefix(self, prefix: str) -> bool:
        """True when no cached URL falls under *prefix* (per ``prefix_count``)."""
        return prefix_count(self.entries, prefix) == 0
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
# ---- public API -------------------------------------------------------------
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def make_shard_cache() -> ShardCache:
    """Factory: return a new, empty ``ShardCache``."""
    cache = ShardCache()
    return cache
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
async def load_sitemap(fetcher, index_url: str = INDEX_URL) -> list[tuple[str, str]]:
    """Return the ``(loc, lastmod)`` entries for *index_url*.

    The work is delegated to the ``ShardCache`` that ``_cache_for`` keeps on
    *fetcher* for this index URL, so repeated calls share one parse.
    """
    cache = _cache_for(fetcher, index_url)
    entries = await cache.load(fetcher, index_url)
    return entries
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def prefix_count(entries: list[tuple[str, str]], prefix: str) -> int:
    """Count the entries whose URL path starts with *prefix*.

    An empty prefix matches everything, so the total number of entries is
    returned. A result of 0 for a non-empty prefix signals a stale prefix.
    """
    if not prefix:
        return len(entries)
    matches = 0
    for url, _ in entries:
        if urlsplit(url).path.startswith(prefix):
            matches += 1
    return matches
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def _cache_for(fetcher, index_url: str = INDEX_URL) -> ShardCache:
    """Return the ``ShardCache`` stored on *fetcher* for *index_url*.

    Caches live in a dict kept on the fetcher as ``_shard_caches``, keyed by
    index URL. Both the dict and the per-URL cache are created on first use.
    """
    registry = getattr(fetcher, "_shard_caches", None)
    if registry is None:
        registry = fetcher._shard_caches = {}
    found = registry.get(index_url)
    if found is None:
        found = registry[index_url] = make_shard_cache()
    return found
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def _representative(locs: list[str]) -> str:
|
|
295
|
+
"""Pick the shortest-path URL for a cluster as its canonical representative."""
|
|
296
|
+
return min(locs, key=lambda loc: len(urlsplit(loc).path))
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def _descendant_prefixes(source: Source) -> set[str]:
    """Collect path prefixes of other catalog sources nested under *source*.

    A catalog source contributes its prefix when it uses the
    ``android_sitemap`` detector, is enabled, is not *source* itself, has a
    non-empty prefix, resolves to the same sitemap-index URL, and its prefix
    is strictly longer than — and starts with — this source's prefix
    (most-specific-prefix-wins).
    """
    own_index = _index_url_for(source)
    nested: set[str] = set()
    for other in load_catalog():
        if other.detector != "android_sitemap" or not other.enabled:
            continue
        if other.id == source.id:
            continue
        candidate = other.path_prefix
        if not candidate:
            continue
        if _index_url_for(other) != own_index:
            continue
        own_prefix = source.path_prefix
        if len(candidate) > len(own_prefix) and candidate.startswith(own_prefix):
            nested.add(candidate)
    return nested
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
def _is_reference(path: str) -> bool:
|
|
319
|
+
return "reference" in path.strip("/").split("/")
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def _kotlin_twin(path: str) -> str | None:
|
|
323
|
+
"""The Kotlin-variant path for a Java reference URL, or None if already Kotlin
|
|
324
|
+
(or not a reference URL). Inserts 'kotlin' right after the 'reference' segment."""
|
|
325
|
+
segs = path.strip("/").split("/")
|
|
326
|
+
if "reference" not in segs:
|
|
327
|
+
return None
|
|
328
|
+
i = segs.index("reference")
|
|
329
|
+
if i + 1 < len(segs) and segs[i + 1] == "kotlin":
|
|
330
|
+
return None
|
|
331
|
+
return "/" + "/".join(segs[: i + 1] + ["kotlin"] + segs[i + 1 :])
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def _watched(source: Source, loc: str, all_paths: set[str]) -> bool:
    """Decide whether *loc* survives this source's filters.

    Checks run in this order, and the first failing one rejects the URL:

    1. the lower-cased path must not end in a ``_BINARY_EXT`` extension;
    2. the path must start with ``source.path_prefix`` (when one is set);
    3. the path must not start with any of ``source.exclude_prefixes``;
    4. when ``source.require_segment`` is set, some path segment must equal
       it or start with it followed by ``-``;
    5. for paths containing a ``reference`` segment, ``source.reference_mode``
       applies: ``drop`` rejects; ``index_only`` keeps only leaves listed in
       ``_REFERENCE_INDEX_LEAVES`` and rejects a page whose Kotlin twin is in
       *all_paths*; any other mode keeps the page.
    """
    url_path = urlsplit(loc).path
    if url_path.lower().endswith(_BINARY_EXT):
        return False  # PDFs, images, archives, etc. are not text pages

    prefix = source.path_prefix
    if prefix and not url_path.startswith(prefix):
        return False

    for excluded in source.exclude_prefixes:
        if url_path.startswith(excluded):
            return False

    wanted = source.require_segment
    if wanted:
        dashed = wanted + "-"
        for segment in url_path.strip("/").split("/"):
            if segment == wanted or segment.startswith(dashed):
                break
        else:
            return False  # no segment matched the required one

    if not _is_reference(url_path):
        return True

    mode = source.reference_mode
    if mode == "drop":
        return False
    if mode != "index_only":
        return True

    leaf = url_path.rstrip("/").rsplit("/", 1)[-1]
    if leaf not in _REFERENCE_INDEX_LEAVES:
        return False
    # Kotlin-preferred: drop the Java page when its Kotlin twin is also listed.
    twin = _kotlin_twin(url_path)
    return twin is None or twin not in all_paths
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
def _version_key(loc: str):
    """Return ``(canonical-group, version-tuple)`` for a versioned URL.

    Two forms are recognised, checked in this order:

    * a path segment matching ``_VERSION_RE`` (dotted, e.g. ``9.4``): the group
      is the path with the first such segment replaced by ``*``, followed by
      ``?`` and the original query string;
    * a ``version``, ``api`` or ``apilevel`` query parameter whose first value
      is numeric (optionally dotted): the group is ``<path>|<key>``.

    The version is a tuple of ints so versions compare numerically. URLs with
    no detectable version — including query values that look numeric but do
    not parse, such as ``1.`` or ``..`` — yield ``(None, None)``.
    """
    parts = urlsplit(loc)
    segs = parts.path.strip("/").split("/")
    for i, s in enumerate(segs):
        if _VERSION_RE.match(s):
            canon = "/" + "/".join(segs[:i] + ["*"] + segs[i + 1 :]) + "?" + parts.query
            return canon, tuple(int(x) for x in s.split("."))
    q = parse_qs(parts.query)
    for key in ("version", "api", "apilevel"):
        vals = q.get(key)
        if not vals or not vals[0].replace(".", "").isdigit():
            continue
        try:
            version = tuple(int(x) for x in vals[0].split("."))
        except ValueError:
            # The isdigit() gate lets through values with an empty component
            # ("1.", ".5", "1..2") and digit-like characters int() rejects
            # (e.g. superscripts). Treat those as unversioned, not as a crash.
            continue
        return parts.path + "|" + key, version
    return None, None
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
def _dedup_versions(items: list[tuple[str, str]]) -> list[tuple[str, str]]:
    """Keep only the highest version of each versioned URL group.

    Items for which ``_version_key`` finds no version are returned first, in
    their original order. They are followed by one winner per version group,
    ordered by when each group was first seen. On equal versions the earlier
    item is kept.
    """
    unversioned: list[tuple[str, str]] = []
    latest: dict[str, tuple[tuple[int, ...], str, str]] = {}
    for loc, lastmod in items:
        group, version = _version_key(loc)
        if group is None:
            unversioned.append((loc, lastmod))
        elif group not in latest or version > latest[group][0]:
            latest[group] = (version, loc, lastmod)
    winners = [(loc, lastmod) for _version, loc, lastmod in latest.values()]
    return unversioned + winners
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
def _gather_watched(source: Source, entries: list[tuple[str, str]]) -> list[tuple[str, str]]:
    """Reduce the host-wide *entries* to the URLs this source watches.

    URLs under a nested sibling source's prefix are dropped first
    (most-specific-prefix-wins), then the source's own filters are applied
    via ``_watched``, and finally versioned duplicates are collapsed.
    """
    all_paths = {urlsplit(url).path for url, _ in entries}
    # str.startswith accepts a tuple; an empty tuple never matches.
    nested = tuple(_descendant_prefixes(source))
    kept = [
        (loc, lastmod)
        for loc, lastmod in entries
        if not urlsplit(loc).path.startswith(nested) and _watched(source, loc, all_paths)
    ]
    return _dedup_versions(kept)
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
async def baseline_all(source: Source, store, fetcher) -> int:
    """Full-content baseline for the seed builder.

    Every watched URL of *source* that does not yet have a snapshot with a
    non-empty ``content_hash`` is passed to ``confirm_candidate``; URLs that
    already have one are skipped, which makes the operation resumable. All
    confirmations are started as concurrent tasks, and progress is logged
    every 100 completions and at the end.

    Returns the number of URLs submitted for baselining (0 when nothing was
    pending).
    """
    entries = await load_sitemap(fetcher, _index_url_for(source))

    pending: list[tuple[str, str]] = []
    for loc, lastmod in _gather_watched(source, entries):
        existing = store.get_snapshot(source.id, loc)
        if existing is not None and existing.content_hash:
            continue  # already baselined with real content
        pending.append((loc, lastmod))

    total = len(pending)
    if total == 0:
        return 0

    logger.info("%s: baselining %d URL(s)…", source.id, total)
    tasks = []
    for loc, lastmod in pending:
        tasks.append(asyncio.create_task(confirm_candidate(source, store, fetcher, loc, lastmod)))

    for done, finished in enumerate(asyncio.as_completed(tasks), 1):
        await finished
        if done % 100 == 0 or done == total:
            logger.info("%s: baselined %d/%d", source.id, done, total)
    return total
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
@DETECTORS.register("android_sitemap")
class AndroidSitemapDetector:
    """Sitemap-driven detector registered under the name ``android_sitemap``."""

    async def detect(self, source: Source, store, fetcher) -> list[Change]:
        """Compare the host sitemap against stored snapshots for *source*.

        Watched URLs are split three ways:

        * no snapshot and the source has no snapshots at all -> recorded
          without a fetch (empty content hash), no Change;
        * no snapshot but the source already has snapshots -> passed to
          ``confirm_candidate`` with ``emit_new=True``;
        * snapshot exists and the sitemap lastmod is empty or differs ->
          passed to ``confirm_candidate``, with bulk same-lastmod clusters
          collapsed to one representative.

        Returns the non-None results of ``confirm_candidate``. Returns ``[]``
        early when a non-empty ``path_prefix`` matches no sitemap URL.
        """
        entries = await load_sitemap(fetcher, _index_url_for(source))

        # Stale-prefix health check: a non-empty prefix that matches nothing is a
        # config problem, not a change. Log and return []; doctor surfaces it.
        if source.path_prefix and prefix_count(entries, source.path_prefix) == 0:
            logger.warning(
                "android_sitemap: path_prefix %r matches zero sitemap URLs for %r",
                source.path_prefix,
                source.id,
            )
            return []

        watched = _gather_watched(source, entries)
        has_baseline = store.source_has_snapshots(source.id)

        # Partition: brand-new URLs (genuinely new vs first-ever baseline) and
        # already-seen URLs whose lastmod moved.
        new_pages: list[tuple[str, str]] = []  # new after baseline -> Change("new")
        fetchfree: list[tuple[str, str]] = []  # first-ever baseline -> silent
        candidates: list[tuple[str, str]] = []  # lastmod moved -> Change("updated")
        for loc, lastmod in watched:
            snap = store.get_snapshot(source.id, loc)
            if snap is None:
                (new_pages if has_baseline else fetchfree).append((loc, lastmod))
                continue
            # Only a non-empty, unchanged lastmod lets a URL be skipped; an
            # entry with no lastmod is therefore always treated as a candidate.
            if lastmod and snap.lastmod == lastmod:
                continue
            candidates.append((loc, lastmod))

        # First-ever baseline: record lastmod with empty hash, no fetch.
        for loc, lastmod in fetchfree:
            store.upsert_snapshot(
                source.id, loc, signal_type="sitemap", content_hash="", lastmod=lastmod, excerpt=""
            )
        if fetchfree:
            logger.info("%s: %d new URL(s) baselined fetch-free", source.id, len(fetchfree))

        changes: list[Change] = []

        # Genuinely new URLs (after baseline): confirm-fetch + report as "new".
        if new_pages:
            logger.info("%s: %d new page(s) -> confirm…", source.id, len(new_pages))
            tasks = [
                asyncio.create_task(
                    confirm_candidate(source, store, fetcher, loc, lastmod, emit_new=True)
                )
                for loc, lastmod in new_pages
            ]
            # Results are collected in completion order, not sitemap order.
            for fut in asyncio.as_completed(tasks):
                ch = await fut
                if ch is not None:
                    changes.append(ch)

        if not candidates:
            return changes

        # Updated URLs: cluster by lastmod, collapse bulk section regenerations.
        by_lastmod: dict[str, list[str]] = {}
        for loc, lastmod in candidates:
            by_lastmod.setdefault(lastmod, []).append(loc)
        # Estimate for the log line only: one fetch per collapsed cluster,
        # one per URL otherwise.
        fetches = sum(
            1 if len(locs) >= BULK_COLLAPSE_THRESHOLD else len(locs) for locs in by_lastmod.values()
        )
        logger.info(
            "%s: %d candidate(s), %d cluster(s) -> ~%d confirm-fetch(es)…",
            source.id,
            len(candidates),
            len(by_lastmod),
            fetches,
        )
        for lastmod, locs in by_lastmod.items():
            if len(locs) >= BULK_COLLAPSE_THRESHOLD:
                # Bulk cluster: confirm only the shortest-path URL.
                rep = _representative(locs)
                change = await confirm_candidate(source, store, fetcher, rep, lastmod)
                # The other URLs in the cluster are not fetched: their stored
                # lastmod is advanced while hash and excerpt are carried over,
                # so the same lastmod does not re-trigger them next run.
                for loc in locs:
                    if loc != rep:
                        snap = store.get_snapshot(source.id, loc)
                        prior = snap.content_hash if snap is not None else ""
                        excerpt = snap.excerpt if snap is not None else ""
                        store.upsert_snapshot(
                            source.id,
                            loc,
                            signal_type="sitemap",
                            content_hash=prior,
                            lastmod=lastmod,
                            excerpt=excerpt,
                        )
                if change is not None:
                    changes.append(change)
            else:
                # Small cluster: confirm each URL individually, one at a time.
                for loc in locs:
                    change = await confirm_candidate(source, store, fetcher, loc, lastmod)
                    if change is not None:
                        changes.append(change)

        return changes
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Protocol, runtime_checkable
|
|
4
|
+
|
|
5
|
+
from ..models import Change, Source
|
|
6
|
+
from ..registry import Registry
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@runtime_checkable
class Detector(Protocol):
    """Structural interface every detector satisfies: one async ``detect`` method.

    ``runtime_checkable`` allows ``isinstance(obj, Detector)`` checks, which
    only verify that a ``detect`` attribute exists, not its signature.
    """

    async def detect(self, source: Source, store, fetcher) -> list[Change]:
        """Inspect *source* using *store* and *fetcher*; return a list of Changes."""
        ...
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Registry of detector implementations; detector classes add themselves with
# the ``@DETECTORS.register("<name>")`` decorator.
DETECTORS: Registry[Detector] = Registry("detector")
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""Content-hash detector.
|
|
2
|
+
|
|
3
|
+
Fetches a page, extracts and normalises the selected region, and emits an
|
|
4
|
+
"updated" Change when the normalised text moves. Cosmetic edits (CSS classes,
|
|
5
|
+
whitespace, attributes) leave the normalised text — and therefore the hash —
|
|
6
|
+
unchanged, so they never raise a signal.
|
|
7
|
+
|
|
8
|
+
Empty-render guard: a page whose normalised main-text falls below
|
|
9
|
+
``EMPTY_RENDER_THRESHOLD`` is a client-rendered shell, a HEALTH condition rather
|
|
10
|
+
than a change. The detector refuses to baseline it and emits no Change; it logs
|
|
11
|
+
a warning and ``doctor`` surfaces the condition.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import difflib
|
|
17
|
+
import logging
|
|
18
|
+
|
|
19
|
+
from ..models import Change, Source
|
|
20
|
+
from ._normalize import (
|
|
21
|
+
EMPTY_RENDER_THRESHOLD,
|
|
22
|
+
content_hash,
|
|
23
|
+
extract_main,
|
|
24
|
+
extract_title,
|
|
25
|
+
normalize_text,
|
|
26
|
+
)
|
|
27
|
+
from .base import DETECTORS
|
|
28
|
+
|
|
29
|
+
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@DETECTORS.register("content")
class ContentDetector:
    """Detector that hashes the normalised main text of a single page."""

    async def detect(self, source: Source, store, fetcher) -> list[Change]:
        """Fetch ``source.url`` and report an "updated" Change if its text hash moved.

        Returns ``[]`` when the response is not-modified or empty, when the
        normalised text is shorter than ``EMPTY_RENDER_THRESHOLD`` (logged,
        never baselined), on first sight (snapshot stored silently), or when
        the hash matches the stored snapshot. Otherwise the snapshot is
        refreshed and a single Change is returned.
        """
        response = await fetcher.fetch(source.url, conditional=True)
        if response.not_modified or not response.text:
            return []

        main_text = normalize_text(extract_main(response.text, source.content_selector))

        if len(main_text) < EMPTY_RENDER_THRESHOLD:
            # Health, not changes: a JS shell renders almost no text. Do NOT
            # baseline (so we never lock in an empty hash) and do NOT emit a
            # Change. doctor's content-render check surfaces the condition.
            log.warning(
                "content detector: %s rendered empty (%d chars < %d) — "
                "likely a JS shell; not baselined",
                source.url,
                len(main_text),
                EMPTY_RENDER_THRESHOLD,
            )
            return []

        digest = content_hash(main_text)
        previous = store.get_snapshot(source.id, source.url)

        def remember() -> None:
            # Persist the current hash plus the first 500 characters of text.
            store.upsert_snapshot(
                source.id,
                source.url,
                signal_type="content",
                content_hash=digest,
                lastmod="",
                excerpt=main_text[:500],
            )

        if previous is None:
            # First sight: baseline silently (snapshot only, no Change).
            remember()
            return []

        if previous.content_hash == digest:
            return []

        # The "before" side is the stored excerpt; the "after" side is the
        # full freshly normalised text.
        diff_lines = difflib.unified_diff(
            previous.excerpt.splitlines(),
            main_text.splitlines(),
            fromfile="before",
            tofile="after",
            lineterm="",
        )
        raw_diff = "\n".join(diff_lines)

        remember()
        updated = Change(
            source_id=source.id,
            url=source.url,
            change_kind="updated",
            title=extract_title(response.text) or source.name,
            raw_diff=raw_diff[:2000],
            fetched_hash=digest,
        )
        return [updated]
|