pagedigest 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pagedigest/__init__.py ADDED
@@ -0,0 +1,31 @@
1
+ from .core import (
2
+ audit,
3
+ check_site,
4
+ diff,
5
+ fetch,
6
+ fetch_manifest_url,
7
+ format_state_header,
8
+ identity_digest,
9
+ live_manifest_url,
10
+ manifest_url,
11
+ parse_state_header,
12
+ resolve_url_key,
13
+ validate_manifest,
14
+ verify_live,
15
+ )
16
+
17
# Public API re-exported from ``pagedigest.core`` (alphabetical order).
__all__ = [
    "audit",
    "check_site",
    "diff",
    "fetch",
    "fetch_manifest_url",
    "format_state_header",
    "identity_digest",
    "live_manifest_url",
    "manifest_url",
    "parse_state_header",
    "resolve_url_key",
    "validate_manifest",
    "verify_live",
]
pagedigest/cli.py ADDED
@@ -0,0 +1,113 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import sys
5
+ from collections.abc import Sequence
6
+ from typing import TextIO
7
+
8
+ from .core import DEFAULT_MAX_AUDIT_BYTES, DEFAULT_MAX_MANIFEST_BYTES, verify_live
9
+
10
+
11
def _add_verify_live_args(parser: argparse.ArgumentParser) -> None:
    """Register the ``verify-live`` positional argument and options on *parser*.

    Shared by the standalone entry point and the ``verify-live`` subcommand so
    both expose the same flags and defaults.
    """
    parser.add_argument("base_url", help="Site base URL, e.g. https://example.com")
    parser.add_argument(
        "--manifest-url",
        help="Override manifest URL (defaults to /.well-known/pagedigest.json)",
    )

    # Every remaining option is an integer with a default; declare them as data.
    integer_options = (
        ("--sample-size", 25, "Number of digest entries to sample"),
        ("--seed", 42, "Random seed for deterministic sampling"),
        ("--timeout", 15, "HTTP timeout seconds"),
        ("--max-bytes", DEFAULT_MAX_AUDIT_BYTES, "Abort identity fetches larger than this many bytes"),
        (
            "--manifest-max-bytes",
            DEFAULT_MAX_MANIFEST_BYTES,
            "Abort manifest fetches larger than this many bytes",
        ),
    )
    for flag, default, help_text in integer_options:
        parser.add_argument(flag, type=int, default=default, help=help_text)
42
+
43
+
44
def _run_verify_live(args: argparse.Namespace, stdout: TextIO, stderr: TextIO) -> int:
    """Run live verification for parsed CLI *args* and report the outcome.

    Returns the process exit code: ``1`` for invalid arguments or a manifest
    that could not be fetched/validated, ``2`` when at least one sampled digest
    mismatched, and ``0`` otherwise (inconclusive entries do not fail the run).
    """
    preconditions = (
        (args.sample_size >= 0, "sample-size must be non-negative"),
        (args.timeout > 0, "timeout must be positive"),
        (args.max_bytes > 0, "max-bytes must be positive"),
        (args.manifest_max_bytes > 0, "manifest-max-bytes must be positive"),
    )
    for satisfied, complaint in preconditions:
        if not satisfied:
            print(complaint, file=stderr)
            return 1

    report = verify_live(
        args.base_url,
        manifest_url_override=args.manifest_url,
        sample_size=args.sample_size,
        seed=args.seed,
        timeout=args.timeout,
        max_bytes=args.max_bytes,
        manifest_max_bytes=args.manifest_max_bytes,
    )

    print(f"manifest: {report.get('manifest_url')}", file=stdout)
    if not report.get("ok"):
        error = report.get("error") or "unknown-error"
        status_code = report.get("status_code")
        suffix = "" if status_code is None else f":{status_code}"
        print(f"error: {error}{suffix}", file=stderr)
        return 1

    # Summary counters first, then one line per entry that did not match.
    print(f"sampled: {report['sampled']}", file=stdout)
    print(f"match: {report['match_count']}", file=stdout)
    print(f"mismatch: {report['mismatch_count']}", file=stdout)
    print(f"inconclusive: {report['inconclusive_count']}", file=stdout)

    for item in report["results"]:
        if item["status"] == "match":
            continue
        print(f"- {item['status']}: {item['url']} ({item['detail']})", file=stdout)

    return 2 if report["mismatch_count"] > 0 else 0
88
+
89
+
90
def verify_live_main(
    argv: Sequence[str] | None = None, stdout: TextIO | None = None, stderr: TextIO | None = None
) -> int:
    """Standalone entry point that exposes ``verify-live`` without a subcommand.

    *argv* defaults to the process arguments; *stdout* / *stderr* default to
    the corresponding ``sys`` streams. Returns the exit code.
    """
    parser = argparse.ArgumentParser(description="Verify pagedigest digest values over the wire")
    _add_verify_live_args(parser)
    namespace = parser.parse_args(argv)
    out_stream = stdout or sys.stdout
    err_stream = stderr or sys.stderr
    return _run_verify_live(namespace, out_stream, err_stream)
96
+
97
+
98
def main(argv: Sequence[str] | None = None, stdout: TextIO | None = None, stderr: TextIO | None = None) -> int:
    """Entry point for the ``pagedigest`` command.

    Dispatches the ``verify-live`` subcommand; when no subcommand is given the
    help text is written to *stderr* and ``1`` is returned.
    """
    parser = argparse.ArgumentParser(prog="pagedigest")
    subcommands = parser.add_subparsers(dest="command")
    _add_verify_live_args(
        subcommands.add_parser("verify-live", help="Verify manifest digests against live responses")
    )

    args = parser.parse_args(argv)
    if args.command != "verify-live":
        parser.print_help(stderr or sys.stderr)
        return 1
    return _run_verify_live(args, stdout or sys.stdout, stderr or sys.stderr)
110
+
111
+
112
# Allow ``python -m pagedigest.cli``; the return value of main() is the exit code.
if __name__ == "__main__":
    sys.exit(main())
pagedigest/core.py ADDED
@@ -0,0 +1,767 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import json
5
+ import random
6
+ import re
7
+ from dataclasses import dataclass
8
+ from datetime import datetime, timezone
9
+ from typing import Any
10
+ from urllib.parse import urljoin, urlsplit, urlunsplit
11
+
12
+ import requests
13
+
14
# Well-known path of the manifest, relative to the site origin.
MANIFEST_PATH = "/.well-known/pagedigest.json"
# Default caps (10 MiB) on manifest downloads and on audited page bodies.
DEFAULT_MAX_MANIFEST_BYTES = 10 * 1024 * 1024
DEFAULT_MAX_AUDIT_BYTES = 10 * 1024 * 1024
# URL keys start with "/" and carry no fragment; per-character validation is
# done separately by the URL-key validator.
URL_KEY_PATTERN = re.compile(r"^/([^#]*)?$")
# The patterns below end in ``\Z`` instead of ``$``: ``$`` also matches just
# before a trailing newline, so ``pattern.match("...\n")`` would succeed.
# They also spell digits as ``[0-9]`` because ``\d`` matches any Unicode
# decimal digit on ``str`` patterns.
DIGEST_PATTERN = re.compile(r"^sha256:[a-f0-9]{64}\Z")
TIMESTAMP_PATTERN = re.compile(
    r"^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}(?:\.[0-9]+)?(?:Z|\+00:00)\Z"
)
STATE_HEADER_PATTERN = re.compile(r'^site_rev=(0|[1-9][0-9]*)(?:; manifest="([^"\\\r\n]+)")?\Z')
# RFC 3986 "unreserved" characters, allowed unencoded in URL keys.
UNRESERVED = set("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~")
22
+
23
+
24
@dataclass
class FetchResult:
    """Outcome of a single manifest fetch attempt."""

    # True for a validated 200 response or a 304 Not Modified.
    ok: bool
    # HTTP status of the response; None when no response was obtained.
    status_code: int | None
    # Parsed manifest; populated only for a validated 200 response.
    manifest: dict[str, Any] | None
    # Cache validators copied from the response headers, when present.
    etag: str | None
    last_modified: str | None
    # Error code or exception text on failure; None on success.
    error: str | None
32
+
33
+
34
@dataclass
class LiveAuditItem:
    """One sampled manifest entry together with its live-audit verdict."""

    # Manifest key exactly as published (e.g. "/about").
    url_key: str
    # Absolute URL the key resolved to, or the raw key when resolution failed.
    url: str
    # Audit verdict: "match", "mismatch" or "inconclusive".
    status: str
    # Human-readable explanation of the verdict.
    detail: str
40
+
41
+
42
+ def _is_non_negative_int(value: Any) -> bool:
43
+ return type(value) is int and value >= 0
44
+
45
+
46
def format_state_header(site_rev: int, manifest: str | None = None) -> str:
    """Build the value of the optional ``PageDigest-State`` request header.

    ``site_rev`` must be a plain non-negative ``int`` (``bool`` is rejected).
    ``manifest``, when given, must be a string starting with ``/`` that holds
    no ``#``, double quote, backslash, CR or LF.

    Raises:
        ValueError: ``invalid-site-rev`` or ``invalid-state-manifest``.
    """
    if type(site_rev) is not int or site_rev < 0:
        raise ValueError("invalid-site-rev")

    header = f"site_rev={site_rev}"
    if manifest is None:
        return header

    forbidden = ("#", '"', "\\", "\r", "\n")
    well_formed = (
        isinstance(manifest, str)
        and manifest.startswith("/")
        and not any(token in manifest for token in forbidden)
    )
    if not well_formed:
        raise ValueError("invalid-state-manifest")
    return header + f'; manifest="{manifest}"'
61
+
62
+
63
def parse_state_header(value: str) -> dict[str, Any]:
    """Parse a ``PageDigest-State`` header using the strict v1 syntax.

    Returns a dict with ``site_rev`` (int) and, when the header carries one,
    ``manifest`` (str).

    Raises:
        ValueError: ``invalid-state-header`` when the value is not a string or
            does not match the grammar; ``invalid-state-manifest`` when the
            manifest path does not start with ``/`` or contains ``#``.
    """
    match = STATE_HEADER_PATTERN.fullmatch(value) if isinstance(value, str) else None
    # ``\d`` in a str pattern matches any Unicode decimal digit and ``int()``
    # would happily convert it; the strict syntax only allows ASCII digits.
    if match is None or not match.group(1).isascii():
        raise ValueError("invalid-state-header")

    manifest = match.group(2)
    if manifest is not None and (not manifest.startswith("/") or "#" in manifest):
        raise ValueError("invalid-state-manifest")

    parsed: dict[str, Any] = {"site_rev": int(match.group(1))}
    if manifest is not None:
        parsed["manifest"] = manifest
    return parsed
74
+
75
+
76
+ def _valid_percent_encoding(value: str, index: int) -> bool:
77
+ if index + 2 >= len(value):
78
+ return False
79
+ return all(ch in "0123456789ABCDEFabcdef" for ch in value[index + 1 : index + 3])
80
+
81
+
82
def _validate_url_key(key: Any) -> str | None:
    """Validate a manifest URL key; return an error code, or None when valid.

    A key must be a string matching ``URL_KEY_PATTERN``, contain no spaces,
    and consist only of unreserved characters, the permitted delimiters
    listed below, and well-formed ``%XX`` escapes.
    """
    if not isinstance(key, str):
        return "invalid-url-key-type"
    if URL_KEY_PATTERN.match(key) is None:
        return "invalid-url-key-pattern"
    if " " in key:
        return "invalid-url-key-space"

    # Delimiters that may appear literally in addition to UNRESERVED.
    delimiters = frozenset("/?&=:@!$'()*+,;")
    position, end = 0, len(key)
    while position < end:
        char = key[position]
        if char == "%":
            if not _valid_percent_encoding(key, position):
                return "invalid-url-key-encoding"
            position += 3  # skip the percent sign and both hex digits
        elif ord(char) > 127 or (char not in UNRESERVED and char not in delimiters):
            return "invalid-url-key-unencoded"
        else:
            position += 1
    return None
120
+
121
+
122
def resolve_url_key(base_url: str, url_key: str) -> str:
    """Turn a manifest key into an absolute URL on the origin of *base_url*.

    Raises:
        ValueError: the URL-key validation code when *url_key* is malformed,
            ``invalid-base-url`` when *base_url* is not an http(s) URL with a
            host, or ``url-key-origin-escape`` when the resolved URL leaves
            the base scheme/host (e.g. a ``//other-host/...`` key).
    """
    key_error = _validate_url_key(url_key)
    if key_error is not None:
        raise ValueError(key_error)

    parts = urlsplit(base_url)
    base_scheme = parts.scheme.lower()
    if base_scheme not in {"http", "https"} or not parts.netloc:
        raise ValueError("invalid-base-url")

    # Resolve against the origin root so any path on base_url is ignored.
    root = urlunsplit((parts.scheme, parts.netloc, "/", "", ""))
    absolute = urljoin(root, url_key)
    landed = urlsplit(absolute)
    same_origin = (
        landed.scheme.lower() == base_scheme
        and landed.netloc.lower() == parts.netloc.lower()
    )
    if not same_origin:
        raise ValueError("url-key-origin-escape")
    return absolute
137
+
138
+
139
def manifest_url(base_url: str) -> str:
    """Build the well-known manifest URL at the origin of *base_url*.

    Any path, query or fragment on *base_url* is discarded.

    Raises:
        ValueError: ``invalid-base-url`` when *base_url* is not an http(s) URL
            with a host.
    """
    parts = urlsplit(base_url)
    is_http = parts.scheme.lower() in {"http", "https"}
    if not (is_http and parts.netloc):
        raise ValueError("invalid-base-url")
    return urlunsplit((parts.scheme, parts.netloc, MANIFEST_PATH, "", ""))
145
+
146
+
147
def live_manifest_url(base_url: str, manifest_url_override: str | None = None) -> str:
    """Choose the manifest URL for live verification.

    A non-empty *manifest_url_override* is returned unchanged after checking
    it is an http(s) URL with a host; otherwise the well-known URL derived
    from *base_url* is used.

    Raises:
        ValueError: ``invalid-manifest-url`` for a bad override, or whatever
            ``manifest_url`` raises for a bad *base_url*.
    """
    if not manifest_url_override:
        return manifest_url(base_url)

    override = urlsplit(manifest_url_override)
    if override.scheme.lower() not in {"http", "https"} or not override.netloc:
        raise ValueError("invalid-manifest-url")
    return manifest_url_override
155
+
156
+
157
def _validate_timestamp(value: Any, field: str) -> str | None:
    """Validate a UTC timestamp; return ``invalid-<field>`` or None when valid.

    The value must match ``TIMESTAMP_PATTERN`` (``YYYY-MM-DDTHH:MM:SS``, an
    optional fractional part, then ``Z`` or ``+00:00``) and name a real
    calendar date and time.
    """
    error = f"invalid-{field}"
    # fullmatch + isascii: reject a trailing newline and non-ASCII digits
    # regardless of how the pattern itself is anchored.
    if not isinstance(value, str) or not value.isascii() or TIMESTAMP_PATTERN.fullmatch(value) is None:
        return error
    try:
        # The pattern already guarantees a digits-only fraction and a UTC
        # designator, so only the first 19 characters need calendar checks.
        # Parsing the whole string would fail on Python < 3.11 for fractions
        # that are not exactly 3 or 6 digits long.
        datetime.fromisoformat(value[:19])
    except ValueError:
        return error
    return None
167
+
168
+
169
+ def _validate_coverage(coverage: Any) -> str | None:
170
+ if not isinstance(coverage, dict):
171
+ return "invalid-coverage-type"
172
+
173
+ mode = coverage.get("mode")
174
+ if mode == "complete":
175
+ return None
176
+ if mode == "prefixes":
177
+ prefixes = coverage.get("prefixes")
178
+ if not isinstance(prefixes, list) or not prefixes:
179
+ return "invalid-coverage-prefixes"
180
+ for prefix in prefixes:
181
+ if not isinstance(prefix, str) or not prefix.startswith("/"):
182
+ return "invalid-coverage-prefix"
183
+ return None
184
+ return "invalid-coverage-mode"
185
+
186
+
187
def validate_manifest(manifest: dict[str, Any]) -> str | None:
    """Check a parsed manifest's structure; return an error code or None.

    Checks, in order: required top-level fields, ``version`` (must be the
    integer 1), ``generated`` timestamp, ``site_rev``, ``entries`` type,
    optional ``coverage``, then every entry (URL key, ``rev``, optional
    ``digest`` and ``modified``). The first failure is reported.
    """
    for field in ("version", "generated", "site_rev", "entries"):
        if field not in manifest:
            return f"missing-{field}"

    version = manifest.get("version")
    # bool is an int subclass and ``True == 1``; a JSON ``true`` is not a
    # version number, so exclude it explicitly.
    if isinstance(version, bool) or version != 1:
        return "unsupported-version"

    if (err := _validate_timestamp(manifest.get("generated"), "generated")) is not None:
        return err

    if not _is_non_negative_int(manifest.get("site_rev")):
        return "invalid-site-rev"

    entries = manifest.get("entries")
    if not isinstance(entries, dict):
        return "invalid-entries"

    if "coverage" in manifest:
        if (err := _validate_coverage(manifest.get("coverage"))) is not None:
            return err

    for url_key, entry in entries.items():
        if (err := _validate_url_key(url_key)) is not None:
            return err
        if not isinstance(entry, dict):
            return "invalid-entry-type"
        if not _is_non_negative_int(entry.get("rev")):
            return "invalid-rev"
        if "digest" in entry:
            digest = entry.get("digest")
            # fullmatch: ``$`` with match() would accept "sha256:<hex>\n".
            if not isinstance(digest, str) or DIGEST_PATTERN.fullmatch(digest) is None:
                return "invalid-digest"
        if "modified" in entry:
            if (err := _validate_timestamp(entry.get("modified"), "modified")) is not None:
                return err

    return None
226
+
227
+
228
+ def _read_manifest_body(response: requests.Response, max_bytes: int) -> tuple[bytes | None, str | None]:
229
+ content_length = response.headers.get("Content-Length")
230
+ if content_length is not None:
231
+ try:
232
+ if int(content_length) > max_bytes:
233
+ return None, "manifest-too-large"
234
+ except ValueError:
235
+ return None, "invalid-content-length"
236
+
237
+ chunks: list[bytes] = []
238
+ total = 0
239
+ for chunk in response.iter_content(chunk_size=65536):
240
+ if not chunk:
241
+ continue
242
+ total += len(chunk)
243
+ if total > max_bytes:
244
+ return None, "manifest-too-large"
245
+ chunks.append(chunk)
246
+ return b"".join(chunks), None
247
+
248
+
249
def identity_digest(
    response: requests.Response, max_bytes: int = DEFAULT_MAX_AUDIT_BYTES
) -> tuple[str | None, str | None]:
    """Hash a streamed response body with SHA-256 without buffering it.

    Returns ``("sha256:<hex>", None)`` on success, or ``(None, error_code)``
    with ``body-too-large`` / ``invalid-content-length`` when the body cannot
    be hashed within *max_bytes*. ``response`` must have been requested with
    ``stream=True``; closing it remains the caller's job.
    """
    declared = response.headers.get("Content-Length")
    if declared is not None:
        try:
            oversized = int(declared) > max_bytes
        except ValueError:
            return None, "invalid-content-length"
        if oversized:
            return None, "body-too-large"

    sha = hashlib.sha256()
    seen = 0
    for piece in response.iter_content(chunk_size=65536):
        if piece:
            seen += len(piece)
            # Enforce the cap while streaming; Content-Length may be absent.
            if seen > max_bytes:
                return None, "body-too-large"
            sha.update(piece)
    return f"sha256:{sha.hexdigest()}", None
277
+
278
+
279
def fetch(
    base_url: str,
    timeout: int = 10,
    etag: str | None = None,
    last_modified: str | None = None,
    session: requests.Session | None = None,
    max_bytes: int = DEFAULT_MAX_MANIFEST_BYTES,
) -> FetchResult:
    """Fetch the well-known manifest for *base_url* with graceful fallback.

    ``etag`` / ``last_modified`` are sent as ``If-None-Match`` /
    ``If-Modified-Since``. Failures are reported through ``FetchResult.error``
    rather than raised, including network errors that occur while the body is
    being streamed. A 304 yields ``ok=True`` with ``manifest=None``.

    When *session* is omitted a temporary ``requests.Session`` is created and
    closed before returning; a caller-supplied session is never closed here.
    """
    if session is not None:
        return _fetch_with_session(session, base_url, timeout, etag, last_modified, max_bytes)
    with requests.Session() as owned_session:
        return _fetch_with_session(owned_session, base_url, timeout, etag, last_modified, max_bytes)


def _fetch_with_session(
    s: requests.Session,
    base_url: str,
    timeout: int,
    etag: str | None,
    last_modified: str | None,
    max_bytes: int,
) -> FetchResult:
    """Issue the conditional manifest request on ``s`` and classify the response."""
    headers: dict[str, str] = {}
    if etag:
        headers["If-None-Match"] = etag
    if last_modified:
        headers["If-Modified-Since"] = last_modified

    try:
        url = manifest_url(base_url)
        r = s.get(url, headers=headers, timeout=timeout, stream=True)
    except (requests.RequestException, ValueError) as exc:
        return FetchResult(False, None, None, None, None, str(exc))

    def outcome(ok: bool, manifest: dict[str, Any] | None, error: str | None) -> FetchResult:
        # Every result after this point carries the response status and validators.
        return FetchResult(
            ok, r.status_code, manifest, r.headers.get("ETag"), r.headers.get("Last-Modified"), error
        )

    try:
        if r.status_code == 304:
            return outcome(True, None, None)
        if r.status_code != 200:
            return outcome(False, None, "manifest-unavailable")

        try:
            body, size_error = _read_manifest_body(r, max_bytes)
        except requests.RequestException as exc:
            # With stream=True the body is read here, after get() has
            # returned, so connection and timeout errors can surface now.
            return outcome(False, None, str(exc))
        if size_error is not None:
            return outcome(False, None, size_error)

        assert body is not None
        try:
            manifest = json.loads(body)
        except ValueError:
            return outcome(False, None, "invalid-json")

        if not isinstance(manifest, dict):
            return outcome(False, None, "invalid-manifest-type")

        validation_error = validate_manifest(manifest)
        if validation_error is not None:
            return outcome(False, None, validation_error)

        return outcome(True, manifest, None)
    finally:
        # Tolerate response stand-ins that have no close().
        close = getattr(r, "close", None)
        if callable(close):
            close()
351
+
352
+
353
def fetch_manifest_url(
    url: str,
    timeout: int = 10,
    session: requests.Session | None = None,
    max_bytes: int = DEFAULT_MAX_MANIFEST_BYTES,
) -> FetchResult:
    """Fetch and validate a manifest at an explicit URL.

    Useful for deployment gates that verify a non-default or pre-production
    manifest while still applying normal validation and the size limit.
    Failures are reported through ``FetchResult.error`` rather than raised,
    including a URL that cannot be parsed and network errors that occur while
    the body is being streamed.

    When *session* is omitted a temporary ``requests.Session`` is created and
    closed before returning; a caller-supplied session is never closed here.
    """
    try:
        parsed = urlsplit(url)
    except ValueError:
        # urlsplit raises for some malformed inputs (e.g. a bad IPv6 literal).
        return FetchResult(False, None, None, None, None, "invalid-manifest-url")
    if parsed.scheme.lower() not in {"http", "https"} or not parsed.netloc:
        return FetchResult(False, None, None, None, None, "invalid-manifest-url")

    if session is not None:
        return _fetch_manifest_url_with_session(session, url, timeout, max_bytes)
    with requests.Session() as owned_session:
        return _fetch_manifest_url_with_session(owned_session, url, timeout, max_bytes)


def _fetch_manifest_url_with_session(
    s: requests.Session, url: str, timeout: int, max_bytes: int
) -> FetchResult:
    """GET ``url`` on ``s`` and turn the response into a ``FetchResult``."""
    try:
        r = s.get(url, timeout=timeout, stream=True)
    except requests.RequestException as exc:
        return FetchResult(False, None, None, None, None, str(exc))

    def outcome(ok: bool, manifest: dict[str, Any] | None, error: str | None) -> FetchResult:
        # Every result after this point carries the response status and validators.
        return FetchResult(
            ok, r.status_code, manifest, r.headers.get("ETag"), r.headers.get("Last-Modified"), error
        )

    try:
        if r.status_code != 200:
            return outcome(False, None, "manifest-unavailable")

        try:
            body, size_error = _read_manifest_body(r, max_bytes)
        except requests.RequestException as exc:
            # With stream=True the body is read here, after get() has
            # returned, so connection and timeout errors can surface now.
            return outcome(False, None, str(exc))
        if size_error is not None:
            return outcome(False, None, size_error)

        assert body is not None
        try:
            manifest = json.loads(body)
        except ValueError:
            return outcome(False, None, "invalid-json")

        if not isinstance(manifest, dict):
            return outcome(False, None, "invalid-manifest-type")

        validation_error = validate_manifest(manifest)
        if validation_error is not None:
            return outcome(False, None, validation_error)

        return outcome(True, manifest, None)
    finally:
        # Tolerate response stand-ins that have no close().
        close = getattr(r, "close", None)
        if callable(close):
            close()
420
+
421
+
422
def diff(
    manifest: dict[str, Any],
    cached_site_rev: int | None,
    cached_revs: dict[str, int] | None,
) -> dict[str, Any]:
    """Classify manifest entries against previously cached revisions.

    The result always has ``site_changed``, ``changed``, ``new``,
    ``unchanged``, ``removed``, ``anomalies`` and ``site_rev``. A ``site_rev``
    lower than the cached one short-circuits with a ``site_anomaly``; an equal
    ``site_rev`` reports every entry as unchanged without looking at per-URL
    revisions. ``removed`` is only reported when coverage mode is
    ``"complete"``.
    """
    known_revs = cached_revs or {}
    current_site_rev = manifest["site_rev"]
    entries: dict[str, Any] = manifest["entries"]
    coverage_mode = (manifest.get("coverage") or {}).get("mode")

    if cached_site_rev is not None:
        if current_site_rev < cached_site_rev:
            # The site revision went backwards: report the anomaly only.
            regression = {
                "reason": "site-rev-decrease",
                "cached": cached_site_rev,
                "manifest": current_site_rev,
            }
            return {
                "site_changed": False,
                "changed": [],
                "new": [],
                "unchanged": [],
                "removed": [],
                "anomalies": [regression],
                "site_anomaly": "site-rev-decrease",
                "site_rev": current_site_rev,
            }
        if current_site_rev == cached_site_rev:
            return {
                "site_changed": False,
                "changed": [],
                "new": [],
                "unchanged": sorted(entries.keys()),
                "removed": [],
                "anomalies": [],
                "site_rev": current_site_rev,
            }

    buckets: dict[str, list[str]] = {"changed": [], "new": [], "unchanged": []}
    anomalies: list[dict[str, Any]] = []
    for url_key, entry in entries.items():
        rev = entry.get("rev")
        prev = known_revs.get(url_key)
        if prev is None:
            buckets["new"].append(url_key)
        elif rev > prev:
            buckets["changed"].append(url_key)
        elif rev == prev:
            buckets["unchanged"].append(url_key)
        else:
            anomalies.append({"url": url_key, "reason": "rev-decrease", "cached": prev, "manifest": rev})

    # A missing key only means "removed" when the manifest claims to be complete.
    removed = sorted(set(known_revs) - set(entries))
    if coverage_mode != "complete":
        removed = []

    return {
        "site_changed": cached_site_rev is None or current_site_rev != cached_site_rev,
        "changed": sorted(buckets["changed"]),
        "new": sorted(buckets["new"]),
        "unchanged": sorted(buckets["unchanged"]),
        "removed": removed,
        "anomalies": anomalies,
        "site_rev": current_site_rev,
    }
492
+
493
+
494
def audit(
    base_url: str,
    url_key: str,
    expected_digest: str,
    timeout: int = 10,
    session: requests.Session | None = None,
    max_bytes: int = DEFAULT_MAX_AUDIT_BYTES,
) -> dict[str, Any]:
    """Audit one digest claim with an identity-encoding fetch.

    The page is requested with ``Accept-Encoding: identity`` and redirects
    disabled; the body is streamed and capped at ``max_bytes`` so a publisher
    cannot exhaust auditor memory with an oversized page.

    Returns a dict whose ``result`` is ``"match"``, ``"mismatch"`` or
    ``"inconclusive"`` (with a ``reason``). Network failures, including those
    that occur while the body is being streamed, are reported as inconclusive
    rather than raised.

    When *session* is omitted a temporary ``requests.Session`` is created and
    closed before returning; a caller-supplied session is never closed here.
    """
    try:
        url = resolve_url_key(base_url, url_key)
    except ValueError as exc:
        return {"result": "inconclusive", "reason": str(exc)}

    if session is not None:
        return _audit_resolved_url(session, url, expected_digest, timeout, max_bytes)
    with requests.Session() as owned_session:
        return _audit_resolved_url(owned_session, url, expected_digest, timeout, max_bytes)


def _audit_resolved_url(
    s: requests.Session, url: str, expected_digest: str, timeout: int, max_bytes: int
) -> dict[str, Any]:
    """Fetch ``url`` on ``s`` and compare its body digest with ``expected_digest``."""
    try:
        r = s.get(
            url,
            headers={"Accept-Encoding": "identity"},
            timeout=timeout,
            allow_redirects=False,
            stream=True,
        )
    except requests.RequestException as exc:
        return {"result": "inconclusive", "reason": "network-error", "error": str(exc)}

    try:
        if 300 <= r.status_code < 400:
            return {"result": "inconclusive", "reason": "redirect", "status_code": r.status_code}
        if r.status_code < 200 or r.status_code >= 300:
            return {"result": "inconclusive", "reason": "non-success", "status_code": r.status_code}

        try:
            computed, size_error = identity_digest(r, max_bytes)
        except requests.RequestException as exc:
            # With stream=True the body is read here, after get() has
            # returned, so connection and timeout errors can surface now.
            return {"result": "inconclusive", "reason": "network-error", "error": str(exc)}
        if size_error is not None:
            return {"result": "inconclusive", "reason": size_error}
        if computed == expected_digest:
            return {"result": "match", "computed": computed}
        return {"result": "mismatch", "computed": computed, "expected": expected_digest}
    finally:
        # Tolerate response stand-ins that have no close().
        close = getattr(r, "close", None)
        if callable(close):
            close()
539
+
540
+
541
+ def _audit_detail(outcome: dict[str, Any]) -> str:
542
+ result = outcome.get("result")
543
+ if result == "match":
544
+ return "ok"
545
+ if result == "mismatch":
546
+ return f"expected={outcome.get('expected')} computed={outcome.get('computed')}"
547
+ reason = str(outcome.get("reason", "unknown"))
548
+ if "status_code" in outcome:
549
+ return f"{reason}:{outcome['status_code']}"
550
+ if "error" in outcome:
551
+ return f"{reason}: {outcome['error']}"
552
+ return reason
553
+
554
+
555
def verify_live(
    base_url: str,
    manifest_url_override: str | None = None,
    sample_size: int = 25,
    seed: int = 42,
    timeout: int = 15,
    max_bytes: int = DEFAULT_MAX_AUDIT_BYTES,
    manifest_max_bytes: int = DEFAULT_MAX_MANIFEST_BYTES,
    session: requests.Session | None = None,
) -> dict[str, Any]:
    """Verify sampled manifest digests against live identity-encoded responses.

    The return value is shaped for CLIs and deployment gates: ``ok``,
    ``manifest_url``, the ``sampled`` / ``match_count`` / ``mismatch_count`` /
    ``inconclusive_count`` counters and a ``results`` list; failures add
    ``error`` (and ``status_code`` for manifest fetch failures).
    ``mismatch_count`` is the hard failure signal; inconclusive entries are
    reported without failing because redirects, network failures, or
    intentionally large bodies can be deployment-environment dependent.

    A negative ``sample_size`` is reported as ``invalid-sample-size`` before
    any request is made. When *session* is omitted a temporary
    ``requests.Session`` is created and closed before returning; a
    caller-supplied session is never closed here.
    """
    try:
        manifest_location = live_manifest_url(base_url, manifest_url_override)
    except ValueError as exc:
        return _live_report(False, manifest_url_override, error=str(exc))

    if sample_size < 0:
        # random.sample() would raise ValueError only after the manifest had
        # been downloaded; fail fast with a structured error instead.
        return _live_report(False, manifest_location, error="invalid-sample-size")

    if session is not None:
        return _verify_live_with_session(
            session, base_url, manifest_location, sample_size, seed, timeout, max_bytes, manifest_max_bytes
        )
    with requests.Session() as owned_session:
        return _verify_live_with_session(
            owned_session, base_url, manifest_location, sample_size, seed, timeout, max_bytes, manifest_max_bytes
        )


def _live_report(ok: bool, location: str | None, **extra: Any) -> dict[str, Any]:
    """Build a result dict with zeroed counters; ``extra`` keys follow ``manifest_url``."""
    return {
        "ok": ok,
        "manifest_url": location,
        **extra,
        "sampled": 0,
        "match_count": 0,
        "mismatch_count": 0,
        "inconclusive_count": 0,
        "results": [],
    }


def _verify_live_with_session(
    s: requests.Session,
    base_url: str,
    manifest_location: str,
    sample_size: int,
    seed: int,
    timeout: int,
    max_bytes: int,
    manifest_max_bytes: int,
) -> dict[str, Any]:
    """Fetch the manifest on ``s``, sample its digest entries and audit each one."""
    fetched = fetch_manifest_url(manifest_location, timeout=timeout, session=s, max_bytes=manifest_max_bytes)
    if not fetched.ok:
        return _live_report(False, manifest_location, error=fetched.error, status_code=fetched.status_code)

    assert fetched.manifest is not None
    # Only entries that publish a digest can be audited.
    digest_entries: list[tuple[str, str]] = []
    for url_key, entry in fetched.manifest["entries"].items():
        if not isinstance(entry, dict):
            continue
        digest = entry.get("digest")
        if isinstance(url_key, str) and isinstance(digest, str):
            digest_entries.append((url_key, digest))

    if not digest_entries:
        return _live_report(True, manifest_location)

    # A seeded generator keeps the sample reproducible between runs.
    picker = random.Random(seed)
    sample_count = min(sample_size, len(digest_entries))
    sample = picker.sample(digest_entries, sample_count)

    results: list[LiveAuditItem] = []
    for url_key, expected_digest in sample:
        try:
            resolved = resolve_url_key(base_url, url_key)
        except ValueError:
            # Keep the raw key for display; audit() reports the reason.
            resolved = url_key
        outcome = audit(base_url, url_key, expected_digest, timeout=timeout, session=s, max_bytes=max_bytes)
        results.append(
            LiveAuditItem(
                url_key=url_key,
                url=resolved,
                status=str(outcome.get("result", "inconclusive")),
                detail=_audit_detail(outcome),
            )
        )

    return {
        "ok": True,
        "manifest_url": manifest_location,
        "sampled": sample_count,
        "match_count": sum(item.status == "match" for item in results),
        "mismatch_count": sum(item.status == "mismatch" for item in results),
        "inconclusive_count": sum(item.status == "inconclusive" for item in results),
        "results": [
            {
                "url_key": item.url_key,
                "url": item.url,
                "status": item.status,
                "detail": item.detail,
            }
            for item in results
        ],
    }
660
+
661
+
662
def _sample_audit_candidates(
    manifest: dict[str, Any],
    unchanged: list[str],
    sample_audit_rate: float,
    rng: random.Random | None = None,
) -> list[dict[str, str]]:
    """Pick a random subset of unchanged entries to spot-check by digest.

    Only unchanged entries that publish a well-formed digest are eligible.
    A rate ``<= 0`` selects nothing, a rate ``>= 1`` selects every eligible
    entry, and anything in between selects ``int(len(pool) * rate)`` entries
    but at least one. Each candidate is ``{"url": <key>, "digest": <digest>}``.
    """
    if sample_audit_rate <= 0:
        return []

    entries = manifest["entries"]
    pool: list[dict[str, str]] = []
    for url_key in unchanged:
        entry = entries.get(url_key)
        if not isinstance(entry, dict):
            continue
        digest = entry.get("digest")
        # fullmatch: ``$`` with match() would accept "sha256:<hex>\n".
        if isinstance(digest, str) and DIGEST_PATTERN.fullmatch(digest) is not None:
            pool.append({"url": url_key, "digest": digest})

    if not pool:
        return []

    if sample_audit_rate < 1:
        sample_size = max(1, int(len(pool) * sample_audit_rate))
    else:
        sample_size = len(pool)
    sample_size = min(sample_size, len(pool))
    picker = rng or random.Random()
    return picker.sample(pool, sample_size)
688
+
689
+
690
def check_site(
    base_url: str,
    cached_site_rev: int | None,
    cached_revs: dict[str, int] | None,
    timeout: int = 10,
    etag: str | None = None,
    last_modified: str | None = None,
    sample_audit_rate: float = 0.0,
    session: requests.Session | None = None,
    max_bytes: int = DEFAULT_MAX_MANIFEST_BYTES,
    rng: random.Random | None = None,
) -> dict[str, Any]:
    """Fetch the manifest, diff it against cached state and plan sampled audits.

    The result always has a ``fallback`` flag. ``fallback=True`` means the
    caller should crawl the whole site (fetch failure or anomaly);
    ``not_modified=True`` marks a 304; otherwise the ``diff`` decisions are
    returned together with ``audit_candidates``.

    Any anomaly (a ``site_rev`` decrease or even a single per-URL ``rev``
    decrease) triggers the whole-site fallback. This is deliberately stricter
    than SPEC §5.2.2, which scopes an isolated per-URL decrease to that URL;
    callers needing finer-grained handling should call ``fetch`` and ``diff``
    directly and apply their own policy.
    """
    fetched = fetch(
        base_url,
        timeout=timeout,
        etag=etag,
        last_modified=last_modified,
        session=session,
        max_bytes=max_bytes,
    )
    # Cache validators are echoed back on every outcome.
    validators = {"etag": fetched.etag, "last_modified": fetched.last_modified}

    if not fetched.ok:
        return {
            "fallback": True,
            "error": fetched.error,
            "status_code": fetched.status_code,
            **validators,
        }

    if fetched.status_code == 304:
        return {"fallback": False, "not_modified": True, "changed": [], **validators}

    manifest = fetched.manifest
    assert manifest is not None
    decisions = diff(manifest, cached_site_rev, cached_revs)

    site_anomaly = decisions.get("site_anomaly")
    if site_anomaly or decisions.get("anomalies"):
        return {
            "fallback": True,
            "error": site_anomaly or "manifest-anomaly",
            "status_code": fetched.status_code,
            **validators,
            "manifest": manifest,
            "anomalies": decisions.get("anomalies", []),
        }

    candidates = _sample_audit_candidates(
        manifest,
        decisions["unchanged"],
        sample_audit_rate,
        rng=rng,
    )
    decisions.update(
        {
            "fallback": False,
            "manifest": manifest,
            **validators,
            "audit_candidates": candidates,
            "sample_audit_rate": sample_audit_rate,
        }
    )
    return decisions
@@ -0,0 +1,87 @@
1
+ Metadata-Version: 2.4
2
+ Name: pagedigest
3
+ Version: 0.1.0
4
+ Summary: Minimal pagedigest consumer reference implementation
5
+ License-Expression: MIT
6
+ Project-URL: Homepage, https://pagedigest.org
7
+ Project-URL: Repository, https://github.com/maxwellsantoro/pagedigest
8
+ Requires-Python: >=3.9
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: requests>=2.31.0
11
+ Provides-Extra: dev
12
+ Requires-Dist: jsonschema>=4.23.0; extra == "dev"
13
+ Requires-Dist: ruff>=0.8.0; extra == "dev"
14
+
15
+ # Python Consumer (Minimal Reference)
16
+
17
+ ## Install (from repo)
18
+
19
+ ```bash
20
+ cd implementations/python-consumer
21
+ uv sync
22
+ uv pip install -e .
23
+ ```
24
+
25
+ Requires Python ≥3.9. Runtime dependency: `requests`.
26
+
27
+ ## API
28
+
29
+ - `fetch` — fetch and validate manifest; graceful fallback on errors
30
+ - `diff` — compare manifest to cached `site_rev` / per-URL `rev`
31
+ - `audit` — identity-encoding digest check (streams the body with a size cap)
32
+ - `check_site` — `fetch` + `diff` + optional sampled audit plan
33
+ - `verify_live` — fetch a manifest and sample live identity-encoded responses for digest verification
34
+ - `identity_digest` — stream-hash a `stream=True` response body, capped at `max_bytes`, for custom audit pipelines
35
+ - `validate_manifest`, `resolve_url_key`, `manifest_url` — validation and URL helpers
36
+ - `format_state_header`, `parse_state_header` — strict optional `PageDigest-State` helpers
37
+
38
+ ## CLI
39
+
40
+ ```bash
41
+ pagedigest verify-live https://example.com --sample-size 25
42
+ ```
43
+
44
+ `verify-live` exits `2` on digest mismatches so it can act as a deployment gate.
45
+ Redirects, network errors, and body-size caps are reported as inconclusive.
46
+
47
+ ## Example
48
+
49
+ ```python
50
+ from pagedigest import check_site
51
+
52
+ decision = check_site(
53
+ "https://example.com",
54
+ cached_site_rev=12,
55
+ cached_revs={"/": 3, "/about": 1},
56
+ sample_audit_rate=0.01,
57
+ )
58
+ ```
59
+
60
+ After a successful manifest check, an integration may make its observed state
61
+ visible on subsequent page requests:
62
+
63
+ ```python
64
+ from pagedigest import format_state_header
65
+
66
+ headers = {
67
+ "PageDigest-State": format_state_header(
68
+ decision["manifest"]["site_rev"],
69
+ "/.well-known/pagedigest.json",
70
+ )
71
+ }
72
+ ```
73
+
74
+ This is a corroborating observation signal, not authentication. See
75
+ [SPEC.md §5.4](../../SPEC.md#54-optional-cooperation-request-header).
76
+
77
+ Conformance fixtures: `tests/test_vectors.py` exercises `../../test-vectors/`.
78
+
79
+ ## Persistent cache example
80
+
81
+ ```bash
82
+ uv run python examples/cache_persistence.py https://example.com ./pagedigest-cache.json
83
+ ```
84
+
85
+ The example stores `site_rev`, per-URL `rev`, `ETag`, and `Last-Modified`
86
+ between runs. It prints page fetch decisions so crawler/indexer integrations can
87
+ replace the `print` calls with their own fetch pipeline.
@@ -0,0 +1,8 @@
1
+ pagedigest/__init__.py,sha256=5_7rVcczpeFmn4beRjhCESCNxWJFzXqDhso3Noum988,551
2
+ pagedigest/cli.py,sha256=DZMfvAXG3IWSs9mpmUPQbunNUgz21m3-qRrZlDsuMuk,3884
3
+ pagedigest/core.py,sha256=E7zM2p0lXe1M7fB7pip_o456NAtbmyEZhbELIjZJFmA,25637
4
+ pagedigest-0.1.0.dist-info/METADATA,sha256=ERIRyWoki40NMOMmi-shF-_1Ny_n9bXUa9R3rPMXrbU,2684
5
+ pagedigest-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
6
+ pagedigest-0.1.0.dist-info/entry_points.txt,sha256=12ut79n1k1-sslMBvqtpAxpB7PasYAUagDg703aW1Fs,51
7
+ pagedigest-0.1.0.dist-info/top_level.txt,sha256=EknBwadpgRqtK-Ke13-rltM3m-j7z6Kb05E-pU9ATtg,11
8
+ pagedigest-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ pagedigest = pagedigest.cli:main
@@ -0,0 +1 @@
1
+ pagedigest