pagedigest 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pagedigest-0.1.0/PKG-INFO +87 -0
- pagedigest-0.1.0/README.md +73 -0
- pagedigest-0.1.0/pagedigest/__init__.py +31 -0
- pagedigest-0.1.0/pagedigest/cli.py +113 -0
- pagedigest-0.1.0/pagedigest/core.py +767 -0
- pagedigest-0.1.0/pagedigest.egg-info/PKG-INFO +87 -0
- pagedigest-0.1.0/pagedigest.egg-info/SOURCES.txt +15 -0
- pagedigest-0.1.0/pagedigest.egg-info/dependency_links.txt +1 -0
- pagedigest-0.1.0/pagedigest.egg-info/entry_points.txt +2 -0
- pagedigest-0.1.0/pagedigest.egg-info/requires.txt +5 -0
- pagedigest-0.1.0/pagedigest.egg-info/top_level.txt +1 -0
- pagedigest-0.1.0/pyproject.toml +33 -0
- pagedigest-0.1.0/setup.cfg +4 -0
- pagedigest-0.1.0/tests/test_cli.py +75 -0
- pagedigest-0.1.0/tests/test_content_hygiene.py +71 -0
- pagedigest-0.1.0/tests/test_core.py +557 -0
- pagedigest-0.1.0/tests/test_vectors.py +106 -0
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pagedigest
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Minimal pagedigest consumer reference implementation
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Project-URL: Homepage, https://pagedigest.org
|
|
7
|
+
Project-URL: Repository, https://github.com/maxwellsantoro/pagedigest
|
|
8
|
+
Requires-Python: >=3.9
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
Requires-Dist: requests>=2.31.0
|
|
11
|
+
Provides-Extra: dev
|
|
12
|
+
Requires-Dist: jsonschema>=4.23.0; extra == "dev"
|
|
13
|
+
Requires-Dist: ruff>=0.8.0; extra == "dev"
|
|
14
|
+
|
|
15
|
+
# Python Consumer (Minimal Reference)
|
|
16
|
+
|
|
17
|
+
## Install (from repo)
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
cd implementations/python-consumer
|
|
21
|
+
uv sync
|
|
22
|
+
uv pip install -e .
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Requires Python ≥3.9. Runtime dependency: `requests`.
|
|
26
|
+
|
|
27
|
+
## API
|
|
28
|
+
|
|
29
|
+
- `fetch` — fetch and validate manifest; graceful fallback on errors
|
|
30
|
+
- `diff` — compare manifest to cached `site_rev` / per-URL `rev`
|
|
31
|
+
- `audit` — identity-encoding digest check (streams the body with a size cap)
|
|
32
|
+
- `check_site` — `fetch` + `diff` + optional sampled audit plan
|
|
33
|
+
- `verify_live` — fetch a manifest and sample live identity-encoded responses for digest verification
|
|
34
|
+
- `identity_digest` — stream-hash a `stream=True` response body, capped at `max_bytes`, for custom audit pipelines
|
|
35
|
+
- `validate_manifest`, `resolve_url_key`, `manifest_url` — validation and URL helpers
|
|
36
|
+
- `format_state_header`, `parse_state_header` — strict optional `PageDigest-State` helpers
|
|
37
|
+
|
|
38
|
+
## CLI
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pagedigest verify-live https://example.com --sample-size 25
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
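`verify-live` exits `2` on digest mismatches so it can act as a deployment gate; it exits `1` when a numeric option is out of range or the manifest check does not succeed, and `0` otherwise (including when samples are only inconclusive).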
|
|
45
|
+
Redirects, network errors, and body-size caps are reported as inconclusive.
|
|
46
|
+
|
|
47
|
+
## Example
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
from pagedigest import check_site
|
|
51
|
+
|
|
52
|
+
decision = check_site(
|
|
53
|
+
"https://example.com",
|
|
54
|
+
cached_site_rev=12,
|
|
55
|
+
cached_revs={"/": 3, "/about": 1},
|
|
56
|
+
sample_audit_rate=0.01,
|
|
57
|
+
)
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
After a successful manifest check, an integration may make its observed state
|
|
61
|
+
visible on subsequent page requests:
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
from pagedigest import format_state_header
|
|
65
|
+
|
|
66
|
+
headers = {
|
|
67
|
+
"PageDigest-State": format_state_header(
|
|
68
|
+
decision["manifest"]["site_rev"],
|
|
69
|
+
"/.well-known/pagedigest.json",
|
|
70
|
+
)
|
|
71
|
+
}
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
This is a corroborating observation signal, not authentication. See
|
|
75
|
+
[SPEC.md §5.4](../../SPEC.md#54-optional-cooperation-request-header).
|
|
76
|
+
|
|
77
|
+
Conformance fixtures: `tests/test_vectors.py` exercises `../../test-vectors/`.
|
|
78
|
+
|
|
79
|
+
## Persistent cache example
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
uv run python examples/cache_persistence.py https://example.com ./pagedigest-cache.json
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
The example stores `site_rev`, per-URL `rev`, `ETag`, and `Last-Modified`
|
|
86
|
+
between runs. It prints page fetch decisions so crawler/indexer integrations can
|
|
87
|
+
replace the `print` calls with their own fetch pipeline.
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# Python Consumer (Minimal Reference)
|
|
2
|
+
|
|
3
|
+
## Install (from repo)
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
cd implementations/python-consumer
|
|
7
|
+
uv sync
|
|
8
|
+
uv pip install -e .
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Requires Python ≥3.9. Runtime dependency: `requests`.
|
|
12
|
+
|
|
13
|
+
## API
|
|
14
|
+
|
|
15
|
+
- `fetch` — fetch and validate manifest; graceful fallback on errors
|
|
16
|
+
- `diff` — compare manifest to cached `site_rev` / per-URL `rev`
|
|
17
|
+
- `audit` — identity-encoding digest check (streams the body with a size cap)
|
|
18
|
+
- `check_site` — `fetch` + `diff` + optional sampled audit plan
|
|
19
|
+
- `verify_live` — fetch a manifest and sample live identity-encoded responses for digest verification
|
|
20
|
+
- `identity_digest` — stream-hash a `stream=True` response body, capped at `max_bytes`, for custom audit pipelines
|
|
21
|
+
- `validate_manifest`, `resolve_url_key`, `manifest_url` — validation and URL helpers
|
|
22
|
+
- `format_state_header`, `parse_state_header` — strict optional `PageDigest-State` helpers
|
|
23
|
+
|
|
24
|
+
## CLI
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pagedigest verify-live https://example.com --sample-size 25
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
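`verify-live` exits `2` on digest mismatches so it can act as a deployment gate; it exits `1` when a numeric option is out of range or the manifest check does not succeed, and `0` otherwise (including when samples are only inconclusive).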
|
|
31
|
+
Redirects, network errors, and body-size caps are reported as inconclusive.
|
|
32
|
+
|
|
33
|
+
## Example
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
from pagedigest import check_site
|
|
37
|
+
|
|
38
|
+
decision = check_site(
|
|
39
|
+
"https://example.com",
|
|
40
|
+
cached_site_rev=12,
|
|
41
|
+
cached_revs={"/": 3, "/about": 1},
|
|
42
|
+
sample_audit_rate=0.01,
|
|
43
|
+
)
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
After a successful manifest check, an integration may make its observed state
|
|
47
|
+
visible on subsequent page requests:
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
from pagedigest import format_state_header
|
|
51
|
+
|
|
52
|
+
headers = {
|
|
53
|
+
"PageDigest-State": format_state_header(
|
|
54
|
+
decision["manifest"]["site_rev"],
|
|
55
|
+
"/.well-known/pagedigest.json",
|
|
56
|
+
)
|
|
57
|
+
}
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
This is a corroborating observation signal, not authentication. See
|
|
61
|
+
[SPEC.md §5.4](../../SPEC.md#54-optional-cooperation-request-header).
|
|
62
|
+
|
|
63
|
+
Conformance fixtures: `tests/test_vectors.py` exercises `../../test-vectors/`.
|
|
64
|
+
|
|
65
|
+
## Persistent cache example
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
uv run python examples/cache_persistence.py https://example.com ./pagedigest-cache.json
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
The example stores `site_rev`, per-URL `rev`, `ETag`, and `Last-Modified`
|
|
72
|
+
between runs. It prints page fetch decisions so crawler/indexer integrations can
|
|
73
|
+
replace the `print` calls with their own fetch pipeline.
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from .core import (
|
|
2
|
+
audit,
|
|
3
|
+
check_site,
|
|
4
|
+
diff,
|
|
5
|
+
fetch,
|
|
6
|
+
fetch_manifest_url,
|
|
7
|
+
format_state_header,
|
|
8
|
+
identity_digest,
|
|
9
|
+
live_manifest_url,
|
|
10
|
+
manifest_url,
|
|
11
|
+
parse_state_header,
|
|
12
|
+
resolve_url_key,
|
|
13
|
+
validate_manifest,
|
|
14
|
+
verify_live,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
# Names re-exported from ``.core``; this list is what ``from pagedigest import *``
# exposes. Order is kept as originally published.
__all__ = [
    "fetch",
    "fetch_manifest_url",
    "format_state_header",
    "diff",
    "audit",
    "check_site",
    "identity_digest",
    "live_manifest_url",
    "manifest_url",
    "parse_state_header",
    "resolve_url_key",
    "validate_manifest",
    "verify_live",
]
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import sys
|
|
5
|
+
from collections.abc import Sequence
|
|
6
|
+
from typing import TextIO
|
|
7
|
+
|
|
8
|
+
from .core import DEFAULT_MAX_AUDIT_BYTES, DEFAULT_MAX_MANIFEST_BYTES, verify_live
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _add_verify_live_args(parser: argparse.ArgumentParser) -> None:
    """Register the ``verify-live`` positional and optional arguments on *parser*.

    Used by both ``verify_live_main`` and ``main`` so the two entry points
    accept exactly the same options.
    """
    parser.add_argument("base_url", help="Site base URL, e.g. https://example.com")
    parser.add_argument(
        "--manifest-url",
        help="Override manifest URL (defaults to /.well-known/pagedigest.json)",
    )

    # Every remaining option is an integer with a default; registering them
    # from a table keeps flag, default and help text side by side. The order
    # here is the order shown in ``--help``.
    integer_options = (
        ("--sample-size", 25, "Number of digest entries to sample"),
        ("--seed", 42, "Random seed for deterministic sampling"),
        ("--timeout", 15, "HTTP timeout seconds"),
        (
            "--max-bytes",
            DEFAULT_MAX_AUDIT_BYTES,
            "Abort identity fetches larger than this many bytes",
        ),
        (
            "--manifest-max-bytes",
            DEFAULT_MAX_MANIFEST_BYTES,
            "Abort manifest fetches larger than this many bytes",
        ),
    )
    for flag, default_value, help_text in integer_options:
        parser.add_argument(flag, type=int, default=default_value, help=help_text)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _run_verify_live(args: argparse.Namespace, stdout: TextIO, stderr: TextIO) -> int:
    """Run ``verify_live`` with the parsed options and print a report.

    Args:
        args: Namespace produced by a parser set up with ``_add_verify_live_args``.
        stdout: Stream receiving the manifest URL, counters and per-URL lines.
        stderr: Stream receiving option and manifest error messages.

    Returns:
        ``1`` if an option is out of range or the result is not ``ok``,
        ``2`` if at least one sampled entry mismatched, otherwise ``0``.
    """
    # (attribute, out-of-range predicate, message), checked in order. The
    # attribute is read lazily so the first failing check wins, exactly as a
    # chain of ``if`` statements would.
    range_checks = (
        ("sample_size", lambda value: value < 0, "sample-size must be non-negative"),
        ("timeout", lambda value: value <= 0, "timeout must be positive"),
        ("max_bytes", lambda value: value <= 0, "max-bytes must be positive"),
        ("manifest_max_bytes", lambda value: value <= 0, "manifest-max-bytes must be positive"),
    )
    for attribute, out_of_range, message in range_checks:
        if out_of_range(getattr(args, attribute)):
            print(message, file=stderr)
            return 1

    outcome = verify_live(
        args.base_url,
        manifest_url_override=args.manifest_url,
        sample_size=args.sample_size,
        seed=args.seed,
        timeout=args.timeout,
        max_bytes=args.max_bytes,
        manifest_max_bytes=args.manifest_max_bytes,
    )

    # The manifest URL is printed even when the check failed.
    print(f"manifest: {outcome.get('manifest_url')}", file=stdout)

    if not outcome.get("ok"):
        reason = outcome.get("error") or "unknown-error"
        http_status = outcome.get("status_code")
        if http_status is None:
            print(f"error: {reason}", file=stderr)
        else:
            print(f"error: {reason}:{http_status}", file=stderr)
        return 1

    # Output label -> key in the result mapping, printed in this order.
    counters = (
        ("sampled", "sampled"),
        ("match", "match_count"),
        ("mismatch", "mismatch_count"),
        ("inconclusive", "inconclusive_count"),
    )
    for label, key in counters:
        print(f"{label}: {outcome[key]}", file=stdout)

    # Only entries that did not match get a detail line.
    for entry in outcome["results"]:
        if entry["status"] == "match":
            continue
        print(f"- {entry['status']}: {entry['url']} ({entry['detail']})", file=stdout)

    return 2 if outcome["mismatch_count"] > 0 else 0
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def verify_live_main(
    argv: Sequence[str] | None = None,
    stdout: TextIO | None = None,
    stderr: TextIO | None = None,
) -> int:
    """Parse *argv* as ``verify-live`` options (no subcommand) and run the check.

    Args:
        argv: Argument list handed to ``ArgumentParser.parse_args``; ``None``
            lets argparse read the process arguments.
        stdout: Report stream; falls back to ``sys.stdout`` when falsy.
        stderr: Error stream; falls back to ``sys.stderr`` when falsy.

    Returns:
        The exit status produced by ``_run_verify_live``.
    """
    cli = argparse.ArgumentParser(description="Verify pagedigest digest values over the wire")
    _add_verify_live_args(cli)
    namespace = cli.parse_args(argv)
    out_stream = stdout or sys.stdout
    err_stream = stderr or sys.stderr
    return _run_verify_live(namespace, out_stream, err_stream)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def main(
    argv: Sequence[str] | None = None,
    stdout: TextIO | None = None,
    stderr: TextIO | None = None,
) -> int:
    """Dispatch the ``pagedigest`` command line to its subcommand.

    Args:
        argv: Argument list handed to ``ArgumentParser.parse_args``; ``None``
            lets argparse read the process arguments.
        stdout: Report stream; falls back to ``sys.stdout`` when falsy.
        stderr: Error stream; falls back to ``sys.stderr`` when falsy.

    Returns:
        The status from ``_run_verify_live`` for ``verify-live``; ``1`` after
        printing help to the error stream when no subcommand was given.
    """
    root = argparse.ArgumentParser(prog="pagedigest")
    commands = root.add_subparsers(dest="command")
    _add_verify_live_args(
        commands.add_parser("verify-live", help="Verify manifest digests against live responses")
    )

    parsed = root.parse_args(argv)
    if parsed.command != "verify-live":
        # No subcommand supplied: show usage on the error stream and fail.
        root.print_help(stderr or sys.stderr)
        return 1

    return _run_verify_live(parsed, stdout or sys.stdout, stderr or sys.stderr)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
# Running the module as a script: hand main()'s return value to the
# interpreter as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|