pagedigest 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pagedigest/__init__.py +31 -0
- pagedigest/cli.py +113 -0
- pagedigest/core.py +767 -0
- pagedigest-0.1.0.dist-info/METADATA +87 -0
- pagedigest-0.1.0.dist-info/RECORD +8 -0
- pagedigest-0.1.0.dist-info/WHEEL +5 -0
- pagedigest-0.1.0.dist-info/entry_points.txt +2 -0
- pagedigest-0.1.0.dist-info/top_level.txt +1 -0
pagedigest/__init__.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from .core import (
|
|
2
|
+
audit,
|
|
3
|
+
check_site,
|
|
4
|
+
diff,
|
|
5
|
+
fetch,
|
|
6
|
+
fetch_manifest_url,
|
|
7
|
+
format_state_header,
|
|
8
|
+
identity_digest,
|
|
9
|
+
live_manifest_url,
|
|
10
|
+
manifest_url,
|
|
11
|
+
parse_state_header,
|
|
12
|
+
resolve_url_key,
|
|
13
|
+
validate_manifest,
|
|
14
|
+
verify_live,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
# Public API of the package, re-exported from ``.core`` and listed
# alphabetically so it lines up with the import statement above.
__all__ = [
    "audit",
    "check_site",
    "diff",
    "fetch",
    "fetch_manifest_url",
    "format_state_header",
    "identity_digest",
    "live_manifest_url",
    "manifest_url",
    "parse_state_header",
    "resolve_url_key",
    "validate_manifest",
    "verify_live",
]
|
pagedigest/cli.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import sys
|
|
5
|
+
from collections.abc import Sequence
|
|
6
|
+
from typing import TextIO
|
|
7
|
+
|
|
8
|
+
from .core import DEFAULT_MAX_AUDIT_BYTES, DEFAULT_MAX_MANIFEST_BYTES, verify_live
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _add_verify_live_args(parser: argparse.ArgumentParser) -> None:
    """Register the live-verification positional and option arguments on *parser*."""
    parser.add_argument("base_url", help="Site base URL, e.g. https://example.com")
    parser.add_argument(
        "--manifest-url",
        help="Override manifest URL (defaults to /.well-known/pagedigest.json)",
    )

    # Every remaining option is an integer with a default; a table keeps the
    # flag, its default and its help text side by side.
    int_options = (
        ("--sample-size", 25, "Number of digest entries to sample"),
        ("--seed", 42, "Random seed for deterministic sampling"),
        ("--timeout", 15, "HTTP timeout seconds"),
        ("--max-bytes", DEFAULT_MAX_AUDIT_BYTES, "Abort identity fetches larger than this many bytes"),
        (
            "--manifest-max-bytes",
            DEFAULT_MAX_MANIFEST_BYTES,
            "Abort manifest fetches larger than this many bytes",
        ),
    )
    for flag, default, help_text in int_options:
        parser.add_argument(flag, type=int, default=default, help=help_text)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _run_verify_live(args: argparse.Namespace, stdout: TextIO, stderr: TextIO) -> int:
    """Run live verification for parsed *args* and write a report to the streams.

    Returns:
        1 when an argument is out of range or the verification result is not
        ``ok``, 2 when at least one sampled entry mismatched, 0 otherwise.
    """
    # (attribute, whether zero is acceptable, message printed on violation)
    range_checks = (
        ("sample_size", True, "sample-size must be non-negative"),
        ("timeout", False, "timeout must be positive"),
        ("max_bytes", False, "max-bytes must be positive"),
        ("manifest_max_bytes", False, "manifest-max-bytes must be positive"),
    )
    for attribute, zero_allowed, message in range_checks:
        supplied = getattr(args, attribute)
        out_of_range = supplied < 0 if zero_allowed else supplied <= 0
        if out_of_range:
            print(message, file=stderr)
            return 1

    report = verify_live(
        args.base_url,
        manifest_url_override=args.manifest_url,
        sample_size=args.sample_size,
        seed=args.seed,
        timeout=args.timeout,
        max_bytes=args.max_bytes,
        manifest_max_bytes=args.manifest_max_bytes,
    )

    print(f"manifest: {report.get('manifest_url')}", file=stdout)
    if not report.get("ok"):
        error = report.get("error") or "unknown-error"
        status_code = report.get("status_code")
        suffix = "" if status_code is None else f":{status_code}"
        print(f"error: {error}{suffix}", file=stderr)
        return 1

    summary_lines = (
        ("sampled", "sampled"),
        ("match", "match_count"),
        ("mismatch", "mismatch_count"),
        ("inconclusive", "inconclusive_count"),
    )
    for label, key in summary_lines:
        print(f"{label}: {report[key]}", file=stdout)

    # Only entries that did not match are listed individually.
    for item in report["results"]:
        if item["status"] == "match":
            continue
        print(f"- {item['status']}: {item['url']} ({item['detail']})", file=stdout)

    return 2 if report["mismatch_count"] > 0 else 0
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def verify_live_main(
    argv: Sequence[str] | None = None, stdout: TextIO | None = None, stderr: TextIO | None = None
) -> int:
    """Standalone entry point for live verification; returns the exit code.

    ``stdout`` / ``stderr`` fall back to the process streams when not given.
    """
    parser = argparse.ArgumentParser(description="Verify pagedigest digest values over the wire")
    _add_verify_live_args(parser)
    namespace = parser.parse_args(argv)
    out_stream = stdout or sys.stdout
    err_stream = stderr or sys.stderr
    return _run_verify_live(namespace, out_stream, err_stream)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def main(argv: Sequence[str] | None = None, stdout: TextIO | None = None, stderr: TextIO | None = None) -> int:
    """Entry point for the ``pagedigest`` command with its ``verify-live`` subcommand.

    Returns the exit code of the subcommand, or 1 after printing help to
    ``stderr`` when no subcommand was selected.
    """
    parser = argparse.ArgumentParser(prog="pagedigest")
    subcommands = parser.add_subparsers(dest="command")
    _add_verify_live_args(
        subcommands.add_parser("verify-live", help="Verify manifest digests against live responses")
    )

    args = parser.parse_args(argv)
    err_stream = stderr or sys.stderr
    if args.command != "verify-live":
        parser.print_help(err_stream)
        return 1
    return _run_verify_live(args, stdout or sys.stdout, err_stream)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
# When executed as a script, exit with the status code returned by main().
if __name__ == "__main__":
    raise SystemExit(main())
|
pagedigest/core.py
ADDED
|
@@ -0,0 +1,767 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import json
|
|
5
|
+
import random
|
|
6
|
+
import re
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from datetime import datetime, timezone
|
|
9
|
+
from typing import Any
|
|
10
|
+
from urllib.parse import urljoin, urlsplit, urlunsplit
|
|
11
|
+
|
|
12
|
+
import requests
|
|
13
|
+
|
|
14
|
+
# Path of the manifest relative to the site origin (used by manifest_url()).
MANIFEST_PATH = "/.well-known/pagedigest.json"
# Default byte caps (10 MiB each) applied to manifest downloads and to page
# bodies hashed during audits.
DEFAULT_MAX_MANIFEST_BYTES = 10 * 1024 * 1024
DEFAULT_MAX_AUDIT_BYTES = 10 * 1024 * 1024
# A URL key starts with "/" and contains no "#" character.
URL_KEY_PATTERN = re.compile(r"^/([^#]*)?$")
# Digest form: "sha256:" followed by 64 lowercase hexadecimal characters.
DIGEST_PATTERN = re.compile(r"^sha256:[a-f0-9]{64}$")
# Timestamp form: YYYY-MM-DDTHH:MM:SS, optional fractional seconds, then "Z" or "+00:00".
TIMESTAMP_PATTERN = re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|\+00:00)$")
# PageDigest-State value: "site_rev=" plus a decimal integer without leading
# zeros, optionally followed by '; manifest="..."' whose value contains no
# double quote, backslash, CR or LF.
STATE_HEADER_PATTERN = re.compile(r'^site_rev=(0|[1-9]\d*)(?:; manifest="([^"\\\r\n]+)")?$')
# ASCII letters, digits and "-._~" (the RFC 3986 unreserved set); these may
# appear literally in a URL key.
UNRESERVED = set("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class FetchResult:
    """Outcome of a manifest fetch, as returned by ``fetch`` and ``fetch_manifest_url``."""

    # True when a valid manifest was retrieved, or when the server answered 304.
    ok: bool
    # HTTP status of the response; None when no response was obtained.
    status_code: int | None
    # Parsed and validated manifest on a 200 success, otherwise None.
    manifest: dict[str, Any] | None
    # Value of the response's ETag header, if any.
    etag: str | None
    # Value of the response's Last-Modified header, if any.
    last_modified: str | None
    # Error code or exception text on failure; None on success.
    error: str | None
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class LiveAuditItem:
    """Result of auditing one sampled manifest entry during ``verify_live``."""

    # Manifest entry key that was audited.
    url_key: str
    # Absolute URL resolved from the key; the key itself when resolution failed.
    url: str
    # Audit result: "match", "mismatch" or "inconclusive".
    status: str
    # Human-readable detail string produced by _audit_detail().
    detail: str
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _is_non_negative_int(value: Any) -> bool:
    """Return True only for an exact ``int`` (``bool`` and other subclasses excluded) that is >= 0."""
    if type(value) is not int:
        return False
    return value >= 0
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def format_state_header(site_rev: int, manifest: str | None = None) -> str:
    """Build the value of the optional PageDigest-State request header.

    Args:
        site_rev: Non-negative integer site revision.
        manifest: Optional manifest path; must start with ``/`` and contain
            no ``#``, double quote, backslash, CR or LF.

    Raises:
        ValueError: ``invalid-site-rev`` or ``invalid-state-manifest``.
    """
    if not _is_non_negative_int(site_rev):
        raise ValueError("invalid-site-rev")

    header = f"site_rev={site_rev}"
    if manifest is None:
        return header

    forbidden = ("#", '"', "\\", "\r", "\n")
    well_formed = (
        isinstance(manifest, str)
        and manifest.startswith("/")
        and not any(token in manifest for token in forbidden)
    )
    if not well_formed:
        raise ValueError("invalid-state-manifest")
    return f'{header}; manifest="{manifest}"'
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def parse_state_header(value: str) -> dict[str, Any]:
    """Parse a PageDigest-State header value using the strict v1 optional-client syntax.

    Returns:
        A dict with ``site_rev`` (int) and, when present in the header,
        ``manifest`` (str).

    Raises:
        ValueError: ``invalid-state-header`` when the value does not match the
            header syntax, ``invalid-state-manifest`` when the manifest part
            does not start with ``/`` or contains ``#``.
    """
    if not isinstance(value, str):
        raise ValueError("invalid-state-header")
    parsed = STATE_HEADER_PATTERN.fullmatch(value)
    if parsed is None:
        raise ValueError("invalid-state-header")

    rev_text, manifest = parsed.groups()
    if manifest is not None and not (manifest.startswith("/") and "#" not in manifest):
        raise ValueError("invalid-state-manifest")

    state: dict[str, Any] = {"site_rev": int(rev_text)}
    if manifest is not None:
        state["manifest"] = manifest
    return state
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _valid_percent_encoding(value: str, index: int) -> bool:
    """Return True when two hexadecimal digits follow position *index* in *value*."""
    # There must be two more characters after ``index``.
    if len(value) <= index + 2:
        return False
    hex_pair = value[index + 1 : index + 3]
    return not any(ch not in "0123456789ABCDEFabcdef" for ch in hex_pair)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _validate_url_key(key: Any) -> str | None:
    """Return an error code for a malformed manifest URL key, or None when it is valid.

    A valid key is a string that matches ``URL_KEY_PATTERN``, contains no
    space, and consists only of well-formed percent escapes, unreserved
    characters and the punctuation ``/ ? & = : @ ! $ ' ( ) * + , ;``.
    """
    if not isinstance(key, str):
        return "invalid-url-key-type"
    if URL_KEY_PATTERN.match(key) is None:
        return "invalid-url-key-pattern"
    if " " in key:
        return "invalid-url-key-space"

    literal_ok = UNRESERVED | set("/?&=:@!$'()*+,;")
    position, end = 0, len(key)
    while position < end:
        current = key[position]
        if current == "%":
            if not _valid_percent_encoding(key, position):
                return "invalid-url-key-encoding"
            position += 3  # skip the "%" and its two hex digits
        elif current in literal_ok:
            position += 1
        else:
            # Covers non-ASCII characters as well as disallowed ASCII ones.
            return "invalid-url-key-unencoded"
    return None
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def resolve_url_key(base_url: str, url_key: str) -> str:
    """Resolve a manifest key against the origin of *base_url*.

    Raises:
        ValueError: with the key validation code when *url_key* is malformed,
            ``invalid-base-url`` when *base_url* is not an http(s) URL with a
            host, or ``url-key-origin-escape`` when the resolved URL's scheme
            or netloc differs from the base's.
    """
    problem = _validate_url_key(url_key)
    if problem is not None:
        raise ValueError(problem)

    base = urlsplit(base_url)
    base_scheme = base.scheme.lower()
    if not base.netloc or base_scheme not in {"http", "https"}:
        raise ValueError("invalid-base-url")

    # Resolve against the origin root so any path on base_url is ignored.
    absolute = urljoin(urlunsplit((base.scheme, base.netloc, "/", "", "")), url_key)
    target = urlsplit(absolute)
    same_origin = target.scheme.lower() == base_scheme and target.netloc.lower() == base.netloc.lower()
    if not same_origin:
        raise ValueError("url-key-origin-escape")
    return absolute
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def manifest_url(base_url: str) -> str:
    """Return the origin-root pagedigest manifest URL for a site/page URL.

    Raises:
        ValueError: ``invalid-base-url`` when *base_url* is not an http(s)
            URL with a network location.
    """
    parts = urlsplit(base_url)
    is_http = parts.scheme.lower() in {"http", "https"}
    if not (is_http and parts.netloc):
        raise ValueError("invalid-base-url")
    # Path, query and fragment of base_url are discarded.
    return urlunsplit((parts.scheme, parts.netloc, MANIFEST_PATH, "", ""))
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def live_manifest_url(base_url: str, manifest_url_override: str | None = None) -> str:
    """Return the manifest URL used by live verification.

    A non-empty override is returned unchanged once it has been checked to be
    an http(s) URL with a network location; otherwise the default manifest
    URL for *base_url* is used.

    Raises:
        ValueError: ``invalid-manifest-url`` for a malformed override.
    """
    if not manifest_url_override:
        return manifest_url(base_url)

    override = urlsplit(manifest_url_override)
    if override.scheme.lower() in {"http", "https"} and override.netloc:
        return manifest_url_override
    raise ValueError("invalid-manifest-url")
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _validate_timestamp(value: Any, field: str) -> str | None:
    """Return ``invalid-<field>`` unless *value* is a well-formed UTC timestamp string."""
    failure = f"invalid-{field}"
    if not isinstance(value, str):
        return failure
    if TIMESTAMP_PATTERN.match(value) is None:
        return failure

    try:
        # Rewrite the "Z" suffix as an explicit offset before parsing.
        moment = datetime.fromisoformat(value.replace("Z", "+00:00"))
    except ValueError:
        return failure

    if moment.utcoffset() == timezone.utc.utcoffset(moment):
        return None
    return failure
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _validate_coverage(coverage: Any) -> str | None:
    """Return an error code for a malformed ``coverage`` object, or None when valid.

    Accepted shapes are ``{"mode": "complete"}`` and ``{"mode": "prefixes",
    "prefixes": [...]}`` with a non-empty list of strings starting with ``/``.
    """
    if not isinstance(coverage, dict):
        return "invalid-coverage-type"

    mode = coverage.get("mode")
    if mode == "complete":
        return None
    if mode != "prefixes":
        return "invalid-coverage-mode"

    prefixes = coverage.get("prefixes")
    if not (isinstance(prefixes, list) and prefixes):
        return "invalid-coverage-prefixes"
    if all(isinstance(prefix, str) and prefix.startswith("/") for prefix in prefixes):
        return None
    return "invalid-coverage-prefix"
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def validate_manifest(manifest: dict[str, Any]) -> str | None:
    """Return an error code when the manifest is structurally invalid.

    Checks, in order: the argument is a dict; the required fields ``version``,
    ``generated``, ``site_rev`` and ``entries`` are present; ``version`` is
    the integer 1; ``generated`` is a valid timestamp; ``site_rev`` is a
    non-negative integer; ``entries`` is a dict; the optional ``coverage``
    object is well formed; and every entry has a valid URL key, a
    non-negative integer ``rev`` and, when present, a valid ``digest`` and
    ``modified`` timestamp.

    Returns:
        ``None`` when the manifest is valid, otherwise the code of the first
        problem found.
    """
    # Report a non-dict argument with a code instead of letting the ``in``
    # checks below raise TypeError.
    if not isinstance(manifest, dict):
        return "invalid-manifest-type"

    for field in ("version", "generated", "site_rev", "entries"):
        if field not in manifest:
            return f"missing-{field}"

    version = manifest["version"]
    # ``True == 1`` and ``1.0 == 1`` in Python, so the exact type is checked
    # as well, mirroring how ``site_rev`` and ``rev`` are validated.
    if type(version) is not int or version != 1:
        return "unsupported-version"

    if (err := _validate_timestamp(manifest["generated"], "generated")) is not None:
        return err

    if not _is_non_negative_int(manifest["site_rev"]):
        return "invalid-site-rev"

    entries = manifest["entries"]
    if not isinstance(entries, dict):
        return "invalid-entries"

    if "coverage" in manifest:
        if (err := _validate_coverage(manifest["coverage"])) is not None:
            return err

    for url_key, entry in entries.items():
        if (err := _validate_url_key(url_key)) is not None:
            return err
        if not isinstance(entry, dict):
            return "invalid-entry-type"
        if not _is_non_negative_int(entry.get("rev")):
            return "invalid-rev"
        if "digest" in entry:
            digest = entry["digest"]
            # fullmatch: with match(), the pattern's "$" would also accept a
            # digest followed by a trailing newline.
            if not isinstance(digest, str) or DIGEST_PATTERN.fullmatch(digest) is None:
                return "invalid-digest"
        if "modified" in entry:
            if (err := _validate_timestamp(entry["modified"], "modified")) is not None:
                return err

    return None
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def _read_manifest_body(response: requests.Response, max_bytes: int) -> tuple[bytes | None, str | None]:
    """Read a streamed response body, refusing anything larger than *max_bytes*.

    Returns:
        ``(body, None)`` on success, or ``(None, error_code)`` with
        ``manifest-too-large`` or ``invalid-content-length``.
    """
    declared = response.headers.get("Content-Length")
    if declared is not None:
        try:
            declared_size = int(declared)
        except ValueError:
            return None, "invalid-content-length"
        # Reject early when the server already announces an oversized body.
        if declared_size > max_bytes:
            return None, "manifest-too-large"

    pieces: list[bytes] = []
    received = 0
    for piece in response.iter_content(chunk_size=65536):
        if piece:
            received += len(piece)
            # Enforced while streaming because Content-Length may be absent.
            if received > max_bytes:
                return None, "manifest-too-large"
            pieces.append(piece)
    return b"".join(pieces), None
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def identity_digest(
    response: requests.Response, max_bytes: int = DEFAULT_MAX_AUDIT_BYTES
) -> tuple[str | None, str | None]:
    """Stream-hash a response body, capped at ``max_bytes``.

    Returns ``(digest, None)`` where digest is ``sha256:<hex>``, or
    ``(None, error_code)`` (``body-too-large`` / ``invalid-content-length``)
    when the body cannot be safely hashed. ``response`` must have been fetched
    with ``stream=True``; the caller owns closing it.
    """
    declared = response.headers.get("Content-Length")
    if declared is not None:
        try:
            declared_size = int(declared)
        except ValueError:
            return None, "invalid-content-length"
        if declared_size > max_bytes:
            return None, "body-too-large"

    sha = hashlib.sha256()
    received = 0
    for piece in response.iter_content(chunk_size=65536):
        if piece:
            received += len(piece)
            # Enforced while streaming because Content-Length may be absent.
            if received > max_bytes:
                return None, "body-too-large"
            sha.update(piece)
    return f"sha256:{sha.hexdigest()}", None
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
def fetch(
    base_url: str,
    timeout: int = 10,
    etag: str | None = None,
    last_modified: str | None = None,
    session: requests.Session | None = None,
    max_bytes: int = DEFAULT_MAX_MANIFEST_BYTES,
) -> FetchResult:
    """Fetch the pagedigest manifest with graceful fallback semantics.

    Every failure is reported through the returned ``FetchResult`` rather
    than raised. This includes network errors that occur while the body is
    being streamed and JSON nested too deeply to parse.

    Args:
        base_url: Site or page URL; the manifest is fetched from its origin.
        timeout: Timeout passed to the HTTP request.
        etag: Sent as ``If-None-Match`` when given.
        last_modified: Sent as ``If-Modified-Since`` when given.
        session: Session to use. When omitted, a temporary session is created
            and closed before returning.
        max_bytes: Maximum accepted manifest size in bytes.

    Returns:
        ``FetchResult`` with ``ok=True`` for a validated 200 response or a
        304, otherwise ``ok=False`` and an error code or exception text.
    """
    headers: dict[str, str] = {}
    if etag:
        headers["If-None-Match"] = etag
    if last_modified:
        headers["If-Modified-Since"] = last_modified

    owns_session = session is None
    s = requests.Session() if session is None else session
    try:
        try:
            url = manifest_url(base_url)
            r = s.get(url, headers=headers, timeout=timeout, stream=True)
        except (requests.RequestException, ValueError) as exc:
            return FetchResult(False, None, None, None, None, str(exc))

        def outcome(ok: bool, manifest: dict[str, Any] | None, error: str | None) -> FetchResult:
            # All results built after a response arrived share status and validators.
            return FetchResult(
                ok, r.status_code, manifest, r.headers.get("ETag"), r.headers.get("Last-Modified"), error
            )

        try:
            if r.status_code == 304:
                return outcome(True, None, None)
            if r.status_code != 200:
                return outcome(False, None, "manifest-unavailable")

            try:
                body, size_error = _read_manifest_body(r, max_bytes)
            except requests.RequestException as exc:
                # The connection can still fail while the streamed body is read.
                return outcome(False, None, str(exc))
            if size_error is not None:
                return outcome(False, None, size_error)

            assert body is not None  # narrowing for type checkers
            try:
                manifest = json.loads(body)
            except (ValueError, RecursionError):
                # RecursionError: pathologically nested JSON from the publisher.
                return outcome(False, None, "invalid-json")

            if not isinstance(manifest, dict):
                return outcome(False, None, "invalid-manifest-type")
            if (validation_error := validate_manifest(manifest)) is not None:
                return outcome(False, None, validation_error)
            return outcome(True, manifest, None)
        finally:
            close = getattr(r, "close", None)
            if callable(close):
                close()
    finally:
        # Only close a session created here; a caller-supplied one stays open.
        if owns_session:
            s.close()
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
def fetch_manifest_url(
    url: str,
    timeout: int = 10,
    session: requests.Session | None = None,
    max_bytes: int = DEFAULT_MAX_MANIFEST_BYTES,
) -> FetchResult:
    """Fetch and validate a concrete manifest URL.

    This is useful for deployment gates that need to verify a non-default or
    pre-production manifest URL while still applying normal PageDigest
    validation and response-size limits.

    Network failures are reported through the returned ``FetchResult``,
    including ones raised while the body is being streamed, as is JSON nested
    too deeply to parse.

    Args:
        url: Absolute http(s) manifest URL.
        timeout: Timeout passed to the HTTP request.
        session: Session to use. When omitted, a temporary session is created
            and closed before returning.
        max_bytes: Maximum accepted manifest size in bytes.

    Returns:
        ``FetchResult`` with ``ok=True`` and the manifest for a validated 200
        response, otherwise ``ok=False`` and an error code or exception text.
    """
    parsed = urlsplit(url)
    if parsed.scheme.lower() not in {"http", "https"} or not parsed.netloc:
        return FetchResult(False, None, None, None, None, "invalid-manifest-url")

    owns_session = session is None
    s = requests.Session() if session is None else session
    try:
        try:
            r = s.get(url, timeout=timeout, stream=True)
        except requests.RequestException as exc:
            return FetchResult(False, None, None, None, None, str(exc))

        def outcome(ok: bool, manifest: dict[str, Any] | None, error: str | None) -> FetchResult:
            # All results built after a response arrived share status and validators.
            return FetchResult(
                ok, r.status_code, manifest, r.headers.get("ETag"), r.headers.get("Last-Modified"), error
            )

        try:
            if r.status_code != 200:
                return outcome(False, None, "manifest-unavailable")

            try:
                body, size_error = _read_manifest_body(r, max_bytes)
            except requests.RequestException as exc:
                # The connection can still fail while the streamed body is read.
                return outcome(False, None, str(exc))
            if size_error is not None:
                return outcome(False, None, size_error)

            assert body is not None  # narrowing for type checkers
            try:
                manifest = json.loads(body)
            except (ValueError, RecursionError):
                # RecursionError: pathologically nested JSON from the publisher.
                return outcome(False, None, "invalid-json")

            if not isinstance(manifest, dict):
                return outcome(False, None, "invalid-manifest-type")
            if (validation_error := validate_manifest(manifest)) is not None:
                return outcome(False, None, validation_error)
            return outcome(True, manifest, None)
        finally:
            close = getattr(r, "close", None)
            if callable(close):
                close()
    finally:
        # Only close a session created here; a caller-supplied one stays open.
        if owns_session:
            s.close()
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
def diff(
    manifest: dict[str, Any],
    cached_site_rev: int | None,
    cached_revs: dict[str, int] | None,
) -> dict[str, Any]:
    """Compare a manifest against cached state and return crawl decisions.

    Returns a dict with ``site_changed``, the sorted URL-key lists ``changed``,
    ``new``, ``unchanged`` and ``removed``, a list of ``anomalies`` and the
    manifest's ``site_rev``. A site revision lower than the cached one yields
    only a ``site-rev-decrease`` anomaly (plus ``site_anomaly``); an equal one
    reports every entry as unchanged without looking at per-URL revisions.
    ``removed`` is only populated when the coverage mode is ``complete``.
    """
    known_revs = cached_revs or {}
    site_rev = manifest["site_rev"]
    entries: dict[str, Any] = manifest["entries"]
    coverage_mode = (manifest.get("coverage") or {}).get("mode")

    if cached_site_rev is not None:
        if site_rev < cached_site_rev:
            rollback = {
                "reason": "site-rev-decrease",
                "cached": cached_site_rev,
                "manifest": site_rev,
            }
            return {
                "site_changed": False,
                "changed": [],
                "new": [],
                "unchanged": [],
                "removed": [],
                "anomalies": [rollback],
                "site_anomaly": "site-rev-decrease",
                "site_rev": site_rev,
            }
        if site_rev == cached_site_rev:
            return {
                "site_changed": False,
                "changed": [],
                "new": [],
                "unchanged": sorted(entries.keys()),
                "removed": [],
                "anomalies": [],
                "site_rev": site_rev,
            }

    changed: list[str] = []
    new: list[str] = []
    unchanged: list[str] = []
    anomalies: list[dict[str, Any]] = []

    for url_key, entry in entries.items():
        rev = entry.get("rev")
        prev = known_revs.get(url_key)
        if prev is None:
            new.append(url_key)
            continue
        if rev > prev:
            changed.append(url_key)
        elif rev == prev:
            unchanged.append(url_key)
        else:
            anomalies.append({"url": url_key, "reason": "rev-decrease", "cached": prev, "manifest": rev})

    removed = sorted(set(known_revs).difference(entries))
    if coverage_mode != "complete":
        # Without complete coverage a missing key does not imply removal.
        removed = []

    return {
        "site_changed": cached_site_rev is None or site_rev != cached_site_rev,
        "changed": sorted(changed),
        "new": sorted(new),
        "unchanged": sorted(unchanged),
        "removed": removed,
        "anomalies": anomalies,
        "site_rev": site_rev,
    }
|
|
492
|
+
|
|
493
|
+
|
|
494
|
+
def audit(
    base_url: str,
    url_key: str,
    expected_digest: str,
    timeout: int = 10,
    session: requests.Session | None = None,
    max_bytes: int = DEFAULT_MAX_AUDIT_BYTES,
) -> dict[str, Any]:
    """Audit a digest claim using identity-encoding fetch semantics.

    The response body is streamed and capped at ``max_bytes`` so a publisher
    cannot exhaust auditor memory with an oversized page. Network failures
    are reported as ``inconclusive`` with reason ``network-error``, whether
    they occur while opening the connection or while streaming the body.

    Args:
        base_url: Site URL whose origin the key is resolved against.
        url_key: Manifest entry key to fetch.
        expected_digest: Digest claimed by the manifest (``sha256:<hex>``).
        timeout: Timeout passed to the HTTP request.
        session: Session to use. When omitted, a temporary session is created
            and closed before returning.
        max_bytes: Maximum body size that will be hashed.

    Returns:
        A dict whose ``result`` is ``match``, ``mismatch`` or
        ``inconclusive``. Inconclusive results carry a ``reason`` and, where
        applicable, ``status_code`` or ``error``.
    """
    try:
        url = resolve_url_key(base_url, url_key)
    except ValueError as exc:
        return {"result": "inconclusive", "reason": str(exc)}

    owns_session = session is None
    s = requests.Session() if session is None else session
    try:
        try:
            r = s.get(
                url,
                headers={"Accept-Encoding": "identity"},
                timeout=timeout,
                allow_redirects=False,
                stream=True,
            )
        except requests.RequestException as exc:
            return {"result": "inconclusive", "reason": "network-error", "error": str(exc)}

        try:
            if 300 <= r.status_code < 400:
                return {"result": "inconclusive", "reason": "redirect", "status_code": r.status_code}
            if r.status_code < 200 or r.status_code >= 300:
                return {"result": "inconclusive", "reason": "non-success", "status_code": r.status_code}

            try:
                computed, size_error = identity_digest(r, max_bytes)
            except requests.RequestException as exc:
                # With stream=True the body is read here, so the connection
                # can still fail at this point.
                return {"result": "inconclusive", "reason": "network-error", "error": str(exc)}
            if size_error is not None:
                return {"result": "inconclusive", "reason": size_error}
            if computed == expected_digest:
                return {"result": "match", "computed": computed}
            return {"result": "mismatch", "computed": computed, "expected": expected_digest}
        finally:
            close = getattr(r, "close", None)
            if callable(close):
                close()
    finally:
        # Only close a session created here; a caller-supplied one stays open.
        if owns_session:
            s.close()
|
|
539
|
+
|
|
540
|
+
|
|
541
|
+
def _audit_detail(outcome: dict[str, Any]) -> str:
    """Render an ``audit`` outcome dict as a short human-readable detail string."""
    verdict = outcome.get("result")
    if verdict == "match":
        return "ok"
    if verdict == "mismatch":
        return f"expected={outcome.get('expected')} computed={outcome.get('computed')}"

    reason = str(outcome.get("reason", "unknown"))
    # A status code takes precedence over an error message when both are present.
    for key, template in (("status_code", "{}:{}"), ("error", "{}: {}")):
        if key in outcome:
            return template.format(reason, outcome[key])
    return reason
|
|
553
|
+
|
|
554
|
+
|
|
555
|
+
def verify_live(
    base_url: str,
    manifest_url_override: str | None = None,
    sample_size: int = 25,
    seed: int = 42,
    timeout: int = 15,
    max_bytes: int = DEFAULT_MAX_AUDIT_BYTES,
    manifest_max_bytes: int = DEFAULT_MAX_MANIFEST_BYTES,
    session: requests.Session | None = None,
) -> dict[str, Any]:
    """Verify sampled manifest digests against live identity-encoded responses.

    The return value is shaped for CLIs and deployment gates. ``mismatch_count``
    is the hard failure signal; inconclusive entries are reported without
    failing because redirects, network failures, or intentional large bodies can
    be deployment-environment dependent.

    Args:
        base_url: Site URL; entry keys are resolved against its origin.
        manifest_url_override: Manifest URL to use instead of the default.
        sample_size: Maximum number of digest-bearing entries to audit. A
            negative value yields an ``invalid-sample-size`` error result.
        seed: Seed for the sampling random generator.
        timeout: Timeout passed to every HTTP request.
        max_bytes: Maximum page body size hashed per entry.
        manifest_max_bytes: Maximum accepted manifest size.
        session: Session to use. When omitted, a temporary session is created
            and closed before returning.

    Returns:
        A dict with ``ok``, ``manifest_url``, ``sampled``, ``match_count``,
        ``mismatch_count``, ``inconclusive_count`` and ``results``. When
        ``ok`` is false it also carries ``error`` and, after a manifest
        request, ``status_code``.
    """

    def empty_report(ok: bool, location: str | None, **extra: Any) -> dict[str, Any]:
        # Result with zero audited entries; ``extra`` carries error details.
        return {
            "ok": ok,
            "manifest_url": location,
            **extra,
            "sampled": 0,
            "match_count": 0,
            "mismatch_count": 0,
            "inconclusive_count": 0,
            "results": [],
        }

    try:
        manifest_location = live_manifest_url(base_url, manifest_url_override)
    except ValueError as exc:
        return empty_report(False, manifest_url_override, error=str(exc))

    # random.sample() rejects a negative size with a ValueError; report it in
    # the function's normal error shape before any network traffic happens.
    if sample_size < 0:
        return empty_report(False, manifest_location, error="invalid-sample-size")

    owns_session = session is None
    s = requests.Session() if session is None else session
    try:
        try:
            fetched = fetch_manifest_url(
                manifest_location, timeout=timeout, session=s, max_bytes=manifest_max_bytes
            )
        except requests.RequestException as exc:
            return empty_report(False, manifest_location, error=str(exc), status_code=None)
        if not fetched.ok:
            return empty_report(
                False, manifest_location, error=fetched.error, status_code=fetched.status_code
            )

        assert fetched.manifest is not None  # narrowing for type checkers
        # Only entries that publish a digest can be audited.
        digest_entries: list[tuple[str, str]] = []
        for url_key, entry in fetched.manifest["entries"].items():
            if not isinstance(entry, dict):
                continue
            digest = entry.get("digest")
            if isinstance(url_key, str) and isinstance(digest, str):
                digest_entries.append((url_key, digest))

        if not digest_entries:
            return empty_report(True, manifest_location)

        sample_count = min(sample_size, len(digest_entries))
        sample = random.Random(seed).sample(digest_entries, sample_count)

        results: list[LiveAuditItem] = []
        for url_key, expected_digest in sample:
            try:
                resolved = resolve_url_key(base_url, url_key)
            except ValueError:
                # Keep the raw key for reporting; audit() flags it as inconclusive.
                resolved = url_key
            try:
                outcome = audit(
                    base_url, url_key, expected_digest, timeout=timeout, session=s, max_bytes=max_bytes
                )
            except requests.RequestException as exc:
                # One failing entry must not abort the whole verification run.
                outcome = {"result": "inconclusive", "reason": "network-error", "error": str(exc)}
            results.append(
                LiveAuditItem(
                    url_key=url_key,
                    url=resolved,
                    status=str(outcome.get("result", "inconclusive")),
                    detail=_audit_detail(outcome),
                )
            )
    finally:
        # Only close a session created here; a caller-supplied one stays open.
        if owns_session:
            s.close()

    return {
        "ok": True,
        "manifest_url": manifest_location,
        "sampled": sample_count,
        "match_count": sum(item.status == "match" for item in results),
        "mismatch_count": sum(item.status == "mismatch" for item in results),
        "inconclusive_count": sum(item.status == "inconclusive" for item in results),
        "results": [
            {
                "url_key": item.url_key,
                "url": item.url,
                "status": item.status,
                "detail": item.detail,
            }
            for item in results
        ],
    }
|
|
660
|
+
|
|
661
|
+
|
|
662
|
+
def _sample_audit_candidates(
|
|
663
|
+
manifest: dict[str, Any],
|
|
664
|
+
unchanged: list[str],
|
|
665
|
+
sample_audit_rate: float,
|
|
666
|
+
rng: random.Random | None = None,
|
|
667
|
+
) -> list[dict[str, str]]:
|
|
668
|
+
if sample_audit_rate <= 0:
|
|
669
|
+
return []
|
|
670
|
+
|
|
671
|
+
entries = manifest["entries"]
|
|
672
|
+
pool: list[dict[str, str]] = []
|
|
673
|
+
for url_key in unchanged:
|
|
674
|
+
entry = entries.get(url_key)
|
|
675
|
+
if not isinstance(entry, dict):
|
|
676
|
+
continue
|
|
677
|
+
digest = entry.get("digest")
|
|
678
|
+
if isinstance(digest, str) and DIGEST_PATTERN.match(digest):
|
|
679
|
+
pool.append({"url": url_key, "digest": digest})
|
|
680
|
+
|
|
681
|
+
if not pool:
|
|
682
|
+
return []
|
|
683
|
+
|
|
684
|
+
sample_size = max(1, int(len(pool) * sample_audit_rate)) if sample_audit_rate < 1 else len(pool)
|
|
685
|
+
sample_size = min(sample_size, len(pool))
|
|
686
|
+
picker = rng or random.Random()
|
|
687
|
+
return picker.sample(pool, sample_size)
|
|
688
|
+
|
|
689
|
+
|
|
690
|
+
def check_site(
    base_url: str,
    cached_site_rev: int | None,
    cached_revs: dict[str, int] | None,
    timeout: int = 10,
    etag: str | None = None,
    last_modified: str | None = None,
    sample_audit_rate: float = 0.0,
    session: requests.Session | None = None,
    max_bytes: int = DEFAULT_MAX_MANIFEST_BYTES,
    rng: random.Random | None = None,
) -> dict[str, Any]:
    """Fetch the manifest, diff it against cached state, and plan sampled audits.

    This is the high-level convenience entry point combining ``fetch``,
    ``diff`` and an optional sampled audit plan.

    Fallback policy: any anomaly reported by ``diff`` (a ``site_rev``
    decrease, or even a single per-URL ``rev`` decrease) results in a
    whole-site fallback. That is intentionally stricter than SPEC §5.2.2,
    which scopes an isolated per-URL decrease to that URL only. Callers that
    want finer-grained handling should call ``fetch`` and ``diff`` themselves
    and apply their own scoping policy.

    Args:
        base_url: Site base URL passed through to ``fetch``.
        cached_site_rev: Previously stored ``site_rev``, or ``None``.
        cached_revs: Previously stored per-URL ``rev`` values, or ``None``.
        timeout: Timeout forwarded to ``fetch``.
        etag: Cached ``ETag`` forwarded to ``fetch``.
        last_modified: Cached ``Last-Modified`` forwarded to ``fetch``.
        sample_audit_rate: Fraction of unchanged URLs to put in the audit
            plan; forwarded to ``_sample_audit_candidates``.
        session: Optional ``requests.Session`` forwarded to ``fetch``.
        max_bytes: Size cap forwarded to ``fetch``.
        rng: Optional random source used when sampling audit candidates.

    Returns:
        A dict whose shape depends on the outcome:

        * fetch failure: ``fallback=True`` with ``error``, ``status_code``,
          ``etag`` and ``last_modified``;
        * HTTP 304: ``fallback=False``, ``not_modified=True``, an empty
          ``changed`` list, plus ``etag`` and ``last_modified``;
        * anomaly: ``fallback=True`` with ``error``, ``status_code``,
          ``etag``, ``last_modified``, ``manifest`` and ``anomalies``;
        * otherwise: the ``diff`` result extended with ``fallback=False``,
          ``manifest``, ``etag``, ``last_modified``, ``audit_candidates`` and
          ``sample_audit_rate``.
    """
    fetched = fetch(
        base_url,
        timeout=timeout,
        etag=etag,
        last_modified=last_modified,
        session=session,
        max_bytes=max_bytes,
    )

    # The manifest could not be obtained: tell the caller to fall back.
    if not fetched.ok:
        return {
            "fallback": True,
            "error": fetched.error,
            "status_code": fetched.status_code,
            "etag": fetched.etag,
            "last_modified": fetched.last_modified,
        }

    # Conditional request hit: nothing changed since the cached validators.
    if fetched.status_code == 304:
        return {
            "fallback": False,
            "not_modified": True,
            "changed": [],
            "etag": fetched.etag,
            "last_modified": fetched.last_modified,
        }

    assert fetched.manifest is not None
    plan = diff(fetched.manifest, cached_site_rev, cached_revs)

    # Any site-level or per-URL anomaly escalates to a whole-site fallback.
    site_anomaly = plan.get("site_anomaly")
    if site_anomaly or plan.get("anomalies"):
        return {
            "fallback": True,
            "error": site_anomaly or "manifest-anomaly",
            "status_code": fetched.status_code,
            "etag": fetched.etag,
            "last_modified": fetched.last_modified,
            "manifest": fetched.manifest,
            "anomalies": plan.get("anomalies", []),
        }

    audit_candidates = _sample_audit_candidates(
        fetched.manifest,
        plan["unchanged"],
        sample_audit_rate,
        rng=rng,
    )

    # Extend the diff result in place with fetch metadata and the audit plan.
    plan["fallback"] = False
    plan["manifest"] = fetched.manifest
    plan["etag"] = fetched.etag
    plan["last_modified"] = fetched.last_modified
    plan["audit_candidates"] = audit_candidates
    plan["sample_audit_rate"] = sample_audit_rate
    return plan
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pagedigest
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Minimal pagedigest consumer reference implementation
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Project-URL: Homepage, https://pagedigest.org
|
|
7
|
+
Project-URL: Repository, https://github.com/maxwellsantoro/pagedigest
|
|
8
|
+
Requires-Python: >=3.9
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
Requires-Dist: requests>=2.31.0
|
|
11
|
+
Provides-Extra: dev
|
|
12
|
+
Requires-Dist: jsonschema>=4.23.0; extra == "dev"
|
|
13
|
+
Requires-Dist: ruff>=0.8.0; extra == "dev"
|
|
14
|
+
|
|
15
|
+
# Python Consumer (Minimal Reference)
|
|
16
|
+
|
|
17
|
+
## Install (from repo)
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
cd implementations/python-consumer
|
|
21
|
+
uv sync
|
|
22
|
+
uv pip install -e .
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Requires Python ≥3.9. Runtime dependency: `requests`.
|
|
26
|
+
|
|
27
|
+
## API
|
|
28
|
+
|
|
29
|
+
- `fetch` — fetch and validate manifest; graceful fallback on errors
|
|
30
|
+
- `diff` — compare manifest to cached `site_rev` / per-URL `rev`
|
|
31
|
+
- `audit` — identity-encoding digest check (streams the body with a size cap)
|
|
32
|
+
- `check_site` — `fetch` + `diff` + optional sampled audit plan
|
|
33
|
+
- `verify_live` — fetch a manifest and sample live identity-encoded responses for digest verification
|
|
34
|
+
- `identity_digest` — stream-hash a `stream=True` response body, capped at `max_bytes`, for custom audit pipelines
|
|
35
|
+
- `validate_manifest`, `resolve_url_key`, `manifest_url` — validation and URL helpers
|
|
36
|
+
- `format_state_header`, `parse_state_header` — strict optional `PageDigest-State` helpers
|
|
37
|
+
|
|
38
|
+
## CLI
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pagedigest verify-live https://example.com --sample-size 25
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
`verify-live` exits `2` on digest mismatches so it can act as a deployment gate.
|
|
45
|
+
Redirects, network errors, and body-size caps are reported as inconclusive.
|
|
46
|
+
|
|
47
|
+
## Example
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
from pagedigest import check_site
|
|
51
|
+
|
|
52
|
+
decision = check_site(
|
|
53
|
+
"https://example.com",
|
|
54
|
+
cached_site_rev=12,
|
|
55
|
+
cached_revs={"/": 3, "/about": 1},
|
|
56
|
+
sample_audit_rate=0.01,
|
|
57
|
+
)
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
After a successful manifest check, an integration may make its observed state
|
|
61
|
+
visible on subsequent page requests:
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
from pagedigest import format_state_header
|
|
65
|
+
|
|
66
|
+
headers = {
|
|
67
|
+
"PageDigest-State": format_state_header(
|
|
68
|
+
decision["manifest"]["site_rev"],
|
|
69
|
+
"/.well-known/pagedigest.json",
|
|
70
|
+
)
|
|
71
|
+
}
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
This is a corroborating observation signal, not authentication. See
|
|
75
|
+
[SPEC.md §5.4](../../SPEC.md#54-optional-cooperation-request-header).
|
|
76
|
+
|
|
77
|
+
Conformance fixtures: `tests/test_vectors.py` exercises `../../test-vectors/`.
|
|
78
|
+
|
|
79
|
+
## Persistent cache example
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
uv run python examples/cache_persistence.py https://example.com ./pagedigest-cache.json
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
The example stores `site_rev`, per-URL `rev`, `ETag`, and `Last-Modified`
|
|
86
|
+
between runs. It prints page fetch decisions so crawler/indexer integrations can
|
|
87
|
+
replace the `print` calls with their own fetch pipeline.
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
pagedigest/__init__.py,sha256=5_7rVcczpeFmn4beRjhCESCNxWJFzXqDhso3Noum988,551
|
|
2
|
+
pagedigest/cli.py,sha256=DZMfvAXG3IWSs9mpmUPQbunNUgz21m3-qRrZlDsuMuk,3884
|
|
3
|
+
pagedigest/core.py,sha256=E7zM2p0lXe1M7fB7pip_o456NAtbmyEZhbELIjZJFmA,25637
|
|
4
|
+
pagedigest-0.1.0.dist-info/METADATA,sha256=ERIRyWoki40NMOMmi-shF-_1Ny_n9bXUa9R3rPMXrbU,2684
|
|
5
|
+
pagedigest-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
6
|
+
pagedigest-0.1.0.dist-info/entry_points.txt,sha256=12ut79n1k1-sslMBvqtpAxpB7PasYAUagDg703aW1Fs,51
|
|
7
|
+
pagedigest-0.1.0.dist-info/top_level.txt,sha256=EknBwadpgRqtK-Ke13-rltM3m-j7z6Kb05E-pU9ATtg,11
|
|
8
|
+
pagedigest-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pagedigest
|