compile-pdf-core 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- compile_pdf_core/__init__.py +14 -0
- compile_pdf_core/api/__init__.py +0 -0
- compile_pdf_core/api/auth.py +109 -0
- compile_pdf_core/api/middleware.py +73 -0
- compile_pdf_core/cache.py +141 -0
- compile_pdf_core/lineage/__init__.py +0 -0
- compile_pdf_core/lineage/store.py +373 -0
- compile_pdf_core/queue_status.py +94 -0
- compile_pdf_core/retention/__init__.py +51 -0
- compile_pdf_core/retention/api.py +57 -0
- compile_pdf_core/retention/consent.py +84 -0
- compile_pdf_core/retention/store.py +223 -0
- compile_pdf_core/tasks.py +212 -0
- compile_pdf_core/version.py +53 -0
- compile_pdf_core-0.1.0.dist-info/METADATA +35 -0
- compile_pdf_core-0.1.0.dist-info/RECORD +17 -0
- compile_pdf_core-0.1.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""compile-pdf-core — shared infrastructure for the CompilePDF producer family.
|
|
2
|
+
|
|
3
|
+
Provides lineage storage, retention consent, cache key computation,
|
|
4
|
+
Celery task wrappers, queue-depth resolution, and API auth/middleware
|
|
5
|
+
used by every CompilePDF producer (trap, impose, marks, rewrite).
|
|
6
|
+
|
|
7
|
+
Producers import from this package rather than duplicating infra.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from compile_pdf_core.version import VERSION
|
|
11
|
+
|
|
12
|
+
__version__ = VERSION
|
|
13
|
+
|
|
14
|
+
__all__ = ["VERSION", "__version__"]
|
|
File without changes
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""Authentication modes for compile-pdf API.
|
|
2
|
+
|
|
3
|
+
Lifts the codex_pdf.api.auth surface verbatim per spec §1.10 — five modes
|
|
4
|
+
selected via ``COMPILE_AUTH_MODE`` (comma-separated subset of
|
|
5
|
+
``none``, ``bearer``, ``api-key``, ``internal``, ``basic``).
|
|
6
|
+
|
|
7
|
+
Reuse rationale: codex's auth surface is already proven against the same
|
|
8
|
+
threat model (internal calls + public-facing marketing demos). Lifting it
|
|
9
|
+
verbatim minimizes new attack surface and keeps operator muscle memory
|
|
10
|
+
uniform across codex/compile.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import os
|
|
16
|
+
import secrets
|
|
17
|
+
from collections.abc import Iterable
|
|
18
|
+
|
|
19
|
+
from fastapi import HTTPException, Request, status
|
|
20
|
+
|
|
21
|
+
ALL_MODES = frozenset({"none", "bearer", "api-key", "internal", "basic"})
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def get_active_modes() -> frozenset[str]:
    """Resolve the enabled authentication modes from ``COMPILE_AUTH_MODE``.

    The variable holds a comma-separated list. Each entry is trimmed and
    lower-cased; blank entries are ignored. When the variable is unset or
    contains only whitespace, the result is ``{"none"}``.

    Returns:
        The requested modes as a frozenset.

    Raises:
        RuntimeError: if any requested mode is not a member of ``ALL_MODES``.
    """
    configured = os.environ.get("COMPILE_AUTH_MODE", "").strip()
    if configured == "":
        return frozenset({"none"})

    requested: set[str] = set()
    for piece in configured.split(","):
        name = piece.strip()
        if name:
            requested.add(name.lower())

    invalid = requested - ALL_MODES
    if not invalid:
        return frozenset(requested)

    # Fail loudly on typos rather than silently running with fewer modes.
    raise RuntimeError(
        f"COMPILE_AUTH_MODE contains unknown modes: {sorted(invalid)} "
        f"(valid: {sorted(ALL_MODES)})"
    )
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _check_bearer(authorization: str | None) -> bool:
    """Check an ``Authorization: Bearer <token>`` value against ``COMPILE_BEARER_TOKEN``.

    Returns ``False`` when the environment token is unset/empty, when no
    header value was supplied, or when the value does not start with the
    ``bearer `` scheme (matched case-insensitively). Otherwise the presented
    token, with surrounding whitespace removed, is compared to the configured
    one using a constant-time comparison.
    """
    configured = os.environ.get("COMPILE_BEARER_TOKEN", "")
    if not (configured and authorization):
        return False

    scheme = "Bearer "
    if not authorization.lower().startswith(scheme.lower()):
        return False

    candidate = authorization[len(scheme) :].strip()
    return secrets.compare_digest(candidate.encode(), configured.encode())
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _check_api_key(api_key: str | None) -> bool:
    """Check a presented API key against ``COMPILE_API_KEY``.

    Returns ``False`` if either the configured key or the presented key is
    missing/empty; otherwise the two are compared in constant time.
    """
    configured = os.environ.get("COMPILE_API_KEY", "")
    if configured and api_key:
        return secrets.compare_digest(api_key.encode(), configured.encode())
    return False
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _check_internal(internal_token: str | None) -> bool:
    """Check a presented internal token against ``COMPILE_INTERNAL_TOKEN``.

    Returns ``False`` if either the configured token or the presented token
    is missing/empty; otherwise the two are compared in constant time.
    """
    configured = os.environ.get("COMPILE_INTERNAL_TOKEN", "")
    if configured and internal_token:
        return secrets.compare_digest(internal_token.encode(), configured.encode())
    return False
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _check_basic(authorization: str | None) -> bool:
    """Check an ``Authorization: Basic <base64>`` value against the environment.

    Basic auth is only considered when ``COMPILE_BASIC_AUTH_ENABLED`` is one
    of ``1``/``true``/``yes`` (case-insensitive) and both
    ``COMPILE_BASIC_AUTH_USER`` and ``COMPILE_BASIC_AUTH_PASS`` are non-empty.

    Args:
        authorization: Raw ``Authorization`` header value, or ``None``.

    Returns:
        ``True`` only if the header uses the ``basic`` scheme
        (case-insensitive), its payload is valid base64 that decodes as UTF-8
        to ``user:password``, and both parts match the configured values.
        Any malformed input yields ``False`` rather than an exception.
    """
    import base64

    enabled = os.environ.get("COMPILE_BASIC_AUTH_ENABLED", "").lower()
    if enabled not in {"1", "true", "yes"}:
        return False

    expected_user = os.environ.get("COMPILE_BASIC_AUTH_USER", "")
    expected_pass = os.environ.get("COMPILE_BASIC_AUTH_PASS", "")
    if not expected_user or not expected_pass or not authorization:
        return False
    if not authorization.lower().startswith("basic "):
        return False

    try:
        decoded = base64.b64decode(authorization[len("Basic ") :]).decode("utf-8")
    except ValueError:
        # binascii.Error (bad base64) and UnicodeDecodeError (bad UTF-8) are
        # both ValueError subclasses; anything else is a real bug and should
        # surface instead of being reported as a failed login.
        return False

    # Split on the first colon only: the password itself may contain colons.
    user, separator, pwd = decoded.partition(":")
    if not separator:
        return False

    # Evaluate BOTH comparisons before combining them. Chaining them with a
    # short-circuiting ``and`` would skip the password comparison whenever the
    # username is wrong, letting response timing reveal valid usernames.
    user_ok = secrets.compare_digest(user.encode(), expected_user.encode())
    pass_ok = secrets.compare_digest(pwd.encode(), expected_pass.encode())
    return user_ok and pass_ok
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def authenticate(request: Request, _modes: Iterable[str] | None = None) -> str:
    """FastAPI dependency that enforces the configured authentication modes.

    Args:
        request: Incoming request; only its headers are consulted.
        _modes: Explicit set of modes to enforce. When ``None`` the modes are
            read from the environment via ``get_active_modes()``.

    Returns:
        The name of the first mode that accepted the request. If ``none`` is
        among the active modes the request is accepted immediately as
        ``"none"`` without inspecting any header.

    Raises:
        HTTPException: 401 with a ``WWW-Authenticate: Bearer`` header when no
            active mode accepts the request.
    """
    active = get_active_modes() if _modes is None else frozenset(_modes)
    if "none" in active:
        return "none"

    # (mode name, header carrying the credential, validator), in the order
    # the modes are tried.
    attempts = (
        ("bearer", "Authorization", _check_bearer),
        ("api-key", "X-Compile-Key", _check_api_key),
        ("internal", "X-Compile-Internal", _check_internal),
        ("basic", "Authorization", _check_basic),
    )
    for mode, header_name, validator in attempts:
        if mode in active and validator(request.headers.get(header_name)):
            return mode

    raise HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="authentication required",
        headers={"WWW-Authenticate": "Bearer"},
    )
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""Request-id middleware mirroring codex-pdf 1.5's planned shape.
|
|
2
|
+
|
|
3
|
+
Per spec §0 + IMPL-PLAN Phase 0 deliverable 0.2: every request gets a
|
|
4
|
+
correlation ID that flows from upstream callers through compile to codex
|
|
5
|
+
and back. ``X-Compile-Request-Id`` is the canonical header; missing IDs
|
|
6
|
+
are generated fresh; the value is echoed in the response header and added
|
|
7
|
+
to structured-log records.
|
|
8
|
+
|
|
9
|
+
The same middleware also stamps ``X-Compile-Instance-Id`` on responses so
|
|
10
|
+
operators can identify which replica answered the request during
|
|
11
|
+
multi-instance rollouts.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import os
|
|
17
|
+
import secrets
|
|
18
|
+
import socket
|
|
19
|
+
|
|
20
|
+
import structlog
|
|
21
|
+
from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint
|
|
22
|
+
from starlette.requests import Request
|
|
23
|
+
from starlette.responses import Response
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _resolve_instance_id() -> str:
    """Return the identifier used to label this process instance.

    A non-blank ``COMPILE_INSTANCE_ID`` environment variable (whitespace
    trimmed) takes precedence. Otherwise the machine hostname is used, or the
    literal ``"unknown"`` if the hostname is empty.
    """
    override = os.environ.get("COMPILE_INSTANCE_ID", "").strip()
    return override if override else (socket.gethostname() or "unknown")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
INSTANCE_ID = _resolve_instance_id()
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class RequestIdMiddleware(BaseHTTPMiddleware):
    """Attach a correlation ID to every request.

    The ID is taken from the ``X-Compile-Request-Id`` request header, or
    generated when that header is absent or empty. It is stored on
    ``request.state.request_id``, bound into the structlog context for the
    duration of the request, and echoed back in the response headers together
    with ``X-Compile-Instance-Id``.
    """

    async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response:
        """Bind logging context, run the downstream handler, stamp the response.

        Args:
            request: The incoming request.
            call_next: Callable that invokes the rest of the application.

        Returns:
            The downstream response with the request-id and instance-id
            headers set.
        """
        # Starlette header lookups are case-insensitive, so a single lookup
        # matches the header regardless of how the client cased it.
        request_id = request.headers.get("X-Compile-Request-Id") or secrets.token_hex(8)
        request.state.request_id = request_id

        log_context = {
            "request_id": request_id,
            "instance_id": INSTANCE_ID,
            "method": request.method,
            "path": request.url.path,
        }
        # Also carry any upstream codex request-id so the full
        # lint→compile→codex chain is queryable in logs.
        upstream_codex_request_id = request.headers.get("X-Codex-Request-Id")
        if upstream_codex_request_id:
            log_context["upstream_codex_request_id"] = upstream_codex_request_id

        # Start from a clean slate so values bound while handling an earlier
        # request cannot appear on this request's log lines.
        structlog.contextvars.clear_contextvars()
        structlog.contextvars.bind_contextvars(**log_context)

        response = await call_next(request)
        response.headers["X-Compile-Request-Id"] = request_id
        response.headers["X-Compile-Instance-Id"] = INSTANCE_ID
        return response
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""Cache key composition + plan canonicalization.
|
|
2
|
+
|
|
3
|
+
Per spec §1.6 + §1.6a — cache key components, hashed in the fixed order listed below (not strictly alphabetical; the order is part of the key format, so
|
|
4
|
+
the digest is reproducible across language implementations):
|
|
5
|
+
|
|
6
|
+
1. ``codex_document_schema_version``
|
|
7
|
+
2. ``codex_pdf_package_version``
|
|
8
|
+
3. ``color_schema_version`` (codex_pdf.color.COLOR_SCHEMA_VERSION)
|
|
9
|
+
4. ``geom_schema_version`` (codex_pdf.geom.GEOM_SCHEMA_VERSION)
|
|
10
|
+
5. ``compile_version``
|
|
11
|
+
6. ``producer`` (rewrite | marks | impose | trap)
|
|
12
|
+
7. ``sha256(canonical_plan)``
|
|
13
|
+
8. ``sha256(input_bytes)``
|
|
14
|
+
|
|
15
|
+
A Codex section bump auto-invalidates affected cached outputs (load-bearing
|
|
16
|
+
operational property).
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import hashlib
|
|
22
|
+
import json
|
|
23
|
+
from collections.abc import Mapping
|
|
24
|
+
from decimal import ROUND_HALF_EVEN, Decimal
|
|
25
|
+
from typing import Any
|
|
26
|
+
|
|
27
|
+
from compile_pdf_core.version import VERSION as COMPILE_VERSION
|
|
28
|
+
|
|
29
|
+
_PLAN_CANONICAL_NUMBER_QUANTIZE = Decimal("1E-12")
|
|
30
|
+
"""Numeric precision used during canonicalization. 12 decimal places is enough
|
|
31
|
+
to disambiguate prepress measurements (which rarely exceed ~5 decimals)
|
|
32
|
+
without introducing float drift across Python/JS/Go implementations."""
|
|
33
|
+
|
|
34
|
+
_DROPPED_KEYS = frozenset({"comment", "notes", "_dev_meta"})
|
|
35
|
+
"""Keys stripped from canonical plan before hashing.
|
|
36
|
+
Operators can decorate plans with these keys for human readability
|
|
37
|
+
without the markings affecting the cache key."""
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def canonicalize_plan(plan: Mapping[str, Any] | list[Any] | str | int | float | bool | None) -> Any:
|
|
41
|
+
"""Return a canonical, sortable, drop-null-decorated copy of a plan.
|
|
42
|
+
|
|
43
|
+
Canonicalization steps (per spec §2.2):
|
|
44
|
+
|
|
45
|
+
1. Sort all dict keys recursively.
|
|
46
|
+
2. Normalize numbers to fixed-decimal (round-half-even) so different
|
|
47
|
+
JSON serializers produce identical byte sequences.
|
|
48
|
+
3. Strip ``comment`` / ``notes`` / ``_dev_meta`` keys.
|
|
49
|
+
4. Drop ``None`` values (treat as absent).
|
|
50
|
+
|
|
51
|
+
Used by :func:`compute_cache_key`. Pure function; no I/O.
|
|
52
|
+
"""
|
|
53
|
+
if plan is None:
|
|
54
|
+
return None
|
|
55
|
+
if isinstance(plan, bool):
|
|
56
|
+
# bool must be checked before int (bool is a subclass of int in Python).
|
|
57
|
+
return plan
|
|
58
|
+
if isinstance(plan, int):
|
|
59
|
+
return plan
|
|
60
|
+
if isinstance(plan, float):
|
|
61
|
+
# Round-half-even via Decimal so the digest is portable.
|
|
62
|
+
quantized = (
|
|
63
|
+
Decimal(repr(plan))
|
|
64
|
+
.quantize(_PLAN_CANONICAL_NUMBER_QUANTIZE, rounding=ROUND_HALF_EVEN)
|
|
65
|
+
.normalize()
|
|
66
|
+
)
|
|
67
|
+
as_str = format(quantized, "f")
|
|
68
|
+
# Re-parse so e.g. "1.0" stays a number in JSON, not a string.
|
|
69
|
+
try:
|
|
70
|
+
int_val = int(as_str)
|
|
71
|
+
if "." not in as_str:
|
|
72
|
+
return int_val
|
|
73
|
+
except ValueError:
|
|
74
|
+
pass
|
|
75
|
+
return float(as_str)
|
|
76
|
+
if isinstance(plan, str):
|
|
77
|
+
return plan
|
|
78
|
+
if isinstance(plan, list):
|
|
79
|
+
return [canonicalize_plan(item) for item in plan]
|
|
80
|
+
if isinstance(plan, Mapping):
|
|
81
|
+
return {
|
|
82
|
+
key: canonicalize_plan(value)
|
|
83
|
+
for key, value in sorted(plan.items())
|
|
84
|
+
if key not in _DROPPED_KEYS and value is not None
|
|
85
|
+
}
|
|
86
|
+
raise TypeError(f"Unsupported plan element type: {type(plan)!r}")
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def hash_canonical_plan(plan: Mapping[str, Any]) -> str:
    """Return the hex-encoded SHA-256 digest of a plan's canonical form.

    The plan is first passed through :func:`canonicalize_plan`, then
    serialized as compact JSON (``,`` / ``:`` separators, non-ASCII characters
    emitted as-is) and hashed as UTF-8 bytes. Keys are not re-sorted during
    serialization; the order produced by canonicalization is what gets hashed.
    """
    payload = json.dumps(
        canonicalize_plan(plan),
        ensure_ascii=False,
        separators=(",", ":"),
    ).encode("utf-8")
    hasher = hashlib.sha256()
    hasher.update(payload)
    return hasher.hexdigest()
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def compute_cache_key(
    *,
    producer: str,
    input_sha256: str,
    canonical_plan_sha256: str,
    codex_pdf_package_version: str,
    color_schema_version: str,
    geom_schema_version: str,
    codex_document_schema_version: str,
    compile_version: str = COMPILE_VERSION,
) -> str:
    """Compose the per-job cache key.

    Returns the hex-encoded SHA-256 of the eight components joined with ``|``
    in this fixed order:

    1. ``codex_document_schema_version`` — top-level codex-document schema
    2. ``codex_pdf_package_version`` — catches Codex bug fixes without schema bump
    3. ``color_schema_version`` — invalidates on /v1/color/* changes
    4. ``geom_schema_version`` — invalidates on /v1/geom/* changes
    5. ``compile_version`` — captures Compile engine changes
    6. ``producer`` — distinguishes the four producer endpoints
    7. ``canonical_plan_sha256`` — plan hashed via :func:`hash_canonical_plan`
    8. ``input_sha256`` — sha256 of the raw input PDF bytes

    NOTE(review): this order is not alphabetical by component name. It is
    part of the key format — reordering changes every digest — so confirm
    against spec §1.6a before touching it.
    """
    ordered_components = (
        codex_document_schema_version,
        codex_pdf_package_version,
        color_schema_version,
        geom_schema_version,
        compile_version,
        producer,
        canonical_plan_sha256,
        input_sha256,
    )
    material = "|".join(ordered_components).encode("utf-8")
    return hashlib.sha256(material).hexdigest()
|
|
File without changes
|