codex-pdf 1.7.2__tar.gz → 1.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/CLAUDE.md +1 -1
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/PKG-INFO +3 -1
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/clients/ts/package.json +1 -1
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/codex-edge/README.md +1 -1
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/codex-edge/wrangler.toml +1 -1
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/pyproject.toml +4 -1
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/api/main.py +109 -3
- codex_pdf-1.8.0/src/codex_pdf/api/retention.py +312 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/version.py +1 -1
- codex_pdf-1.8.0/tests/test_retention_consent.py +94 -0
- codex_pdf-1.8.0/tests/test_retention_integration.py +166 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/uv.lock +85 -2
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/.cursor/rules/service-ownership.mdc +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/.github/workflows/ci.yml +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/.gitignore +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/.windsurf/rules/service-ownership.md +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/CONTRIBUTING.md +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/Dockerfile +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/LICENSE +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/Procfile +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/README.md +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/SECURITY.md +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/clients/ts/README.md +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/clients/ts/package-lock.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/clients/ts/src/color.ts +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/clients/ts/src/index.test.ts +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/clients/ts/src/index.ts +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/clients/ts/tsconfig.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/codex-edge/.github/workflows/deploy.yml +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/codex-edge/package-lock.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/codex-edge/package.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/codex-edge/src/cache_key.ts +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/codex-edge/src/env.ts +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/codex-edge/src/handlers/extract.ts +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/codex-edge/src/handlers/probe.ts +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/codex-edge/src/index.ts +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/codex-edge/src/sse_tee.ts +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/codex-edge/test/cache_key.test.ts +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/codex-edge/tsconfig.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/docs/architecture.md +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/docs/cli.md +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/docs/contract.md +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/docs/deploy.md +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/docs/operations/codex-change-ripple.md +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/docs/operations/marketing-deploy-template.md +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/docs/parity.md +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/docs/preflight-ingest.md +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/docs/service-ownership-contract.md +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/railway.speculator.toml +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/railway.toml +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/reports/audit/mislocated-closure.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/reports/audit/produce_surface.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/reports/dieline_calibration_report.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/reports/parity/codex_deep.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/reports/parity/codex_inventory.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/reports/parity/codex_summary.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/reports/parity/criterion4_parser_surface.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/reports/parity/pdfx4_deep.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/reports/parity/pdfx4_inventory.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/reports/parity/pdfx4_summary.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/reports/parity/render_baseline.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/reports/parity/viewer_essentials.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/schemas/CHANGELOG.md +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/schemas/v1/codex-annotation.schema.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/schemas/v1/codex-box.schema.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/schemas/v1/codex-color-space.schema.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/schemas/v1/codex-document.schema.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/schemas/v1/codex-font.schema.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/schemas/v1/codex-form-xobject.schema.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/schemas/v1/codex-image.schema.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/schemas/v1/codex-issue.schema.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/schemas/v1/codex-ocg.schema.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/schemas/v1/codex-output-intent.schema.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/schemas/v1/codex-page-object.schema.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/schemas/v1/codex-page.schema.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/schemas/v1/codex-preflight-report.schema.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/schemas/v1/codex-source.schema.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/schemas/v1/codex-spot-colorant.schema.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/schemas/v1/codex-transparency-tree.schema.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/schemas/v1/codex-trap-evidence.schema.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/schemas/v1/codex-warning.schema.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/schemas/v1/probe.schema.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/scripts/calibrate_dieline_heuristics.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/scripts/parity_viewer_essentials.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/scripts/produce_surface_audit.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/__init__.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/api/__init__.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/api/auth.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/api/blob_store.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/api/cache.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/api/url_ingest.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/api/warmup.pdf +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/api/warmup.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/cli.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/client/__init__.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/client/http_client.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/color/__init__.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/color/alt_space.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/color/color_math.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/color/curated.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/color/data/pantone_reference.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/color/normalize.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/color/pantone.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/color/resolver.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/eval/__init__.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/eval/ps_type4.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/extract/__init__.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/extract/annotations.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/extract/color.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/extract/common.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/extract/content_inventory.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/extract/dieline_detector.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/extract/document.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/extract/fonts.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/extract/forms.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/extract/images.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/extract/ocg.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/extract/probe.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/extract/signals.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/extract/structure.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/extract/summary.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/extract/transparency.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/extract/trapping.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/geom/__init__.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/geom/box.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/geom/matrix.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/geom/path.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/geom/tile.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/geom/units.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/models/__init__.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/models/v1.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/parity.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/preflight_ingest/__init__.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/preflight_ingest/adapters.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/render/__init__.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/render/_common.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/render/content_stream.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/render/layer.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/render/page.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/render/separations.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/schema.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/speculator/__init__.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/speculator/__main__.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/src/codex_pdf/speculator/consumer.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/conftest.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/fixtures/conforming/minimal.pdf +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/fixtures/generate_fixtures.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/fixtures/violating/no_output_intent.pdf +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/fixtures/violating/no_trim_box.pdf +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/fixtures/violating/no_xmp.pdf +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/fixtures/violating/pdf_1_4.pdf +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/golden/1.0.0/reference.json +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/test_alt_space.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/test_api.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/test_cache.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/test_cli_contract.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/test_client_routing.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/test_color.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/test_extract_analysis_signals.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/test_extract_structural.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/test_geom.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/test_golden.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/test_golden_corpus.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/test_models.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/test_parity.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/test_pdf_sha256.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/test_preflight_ingest.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/test_produce_surface_audit.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/test_schema.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/test_schemas_all.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/test_speculator.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/test_summary_dieline.py +0 -0
- {codex_pdf-1.7.2 → codex_pdf-1.8.0}/tests/test_summary_spot_colors.py +0 -0
|
@@ -25,7 +25,7 @@ For new products (Forge, Trap, Impose, Marks, etc.), map capabilities to one own
|
|
|
25
25
|
|
|
26
26
|
When work spans layers, define a contract seam and keep logic in its owner service.
|
|
27
27
|
|
|
28
|
-
## Deployed surface (1.7.2)
|
|
28
|
+
## Deployed surface (1.8.0)
|
|
29
29
|
|
|
30
30
|
Codex now runs as **three services** in production. They share the
|
|
31
31
|
same content-addressed cache key format
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: codex-pdf
|
|
3
|
-
Version: 1.7.2
|
|
3
|
+
Version: 1.8.0
|
|
4
4
|
Summary: Authoritative, versioned PDF facts contract for Think Neverland tools.
|
|
5
5
|
Author-email: Think Neverland <dev@thinkneverland.com>
|
|
6
6
|
License-Expression: AGPL-3.0-or-later
|
|
@@ -19,6 +19,8 @@ Provides-Extra: geom
|
|
|
19
19
|
Requires-Dist: pyclipr>=0.1.8; extra == 'geom'
|
|
20
20
|
Provides-Extra: redis
|
|
21
21
|
Requires-Dist: redis>=5.0; extra == 'redis'
|
|
22
|
+
Provides-Extra: retain
|
|
23
|
+
Requires-Dist: boto3>=1.34; extra == 'retain'
|
|
22
24
|
Description-Content-Type: text/markdown
|
|
23
25
|
|
|
24
26
|
---
|
|
@@ -10,7 +10,7 @@ the Railway codex-pdf service.
|
|
|
10
10
|
- **Account**: `99aa3f9229469650a746a7d39ac58448` (`Quincy@thinkneverland.com's Account`)
|
|
11
11
|
- **KV namespace `CACHE`**: `89a21ce1937046018a3d9d38f4e763ff` (preview `a4856d6f3b244087b907c189c2a2277d`)
|
|
12
12
|
- **Origin** (`CODEX_ORIGIN_URL`): `https://codex-pdf-lint-sidecar-production.up.railway.app`
|
|
13
|
-
- **Codex version pinned**: `1.7.2` (`CODEX_VERSION` var — bump on origin release)
|
|
13
|
+
- **Codex version pinned**: `1.8.0` (`CODEX_VERSION` var — bump on origin release)
|
|
14
14
|
- **TTLs**: probe 24 h, Phase 1 24 h, Phase 2 7 d
|
|
15
15
|
|
|
16
16
|
## What it caches
|
|
@@ -23,7 +23,7 @@ CODEX_ORIGIN_URL = "https://codex-pdf-lint-sidecar-production.up.railway.app"
|
|
|
23
23
|
# Codex package VERSION at deploy time. MUST match the origin's
|
|
24
24
|
# `codex_pdf.version.VERSION` so KV keys line up. Bump on every
|
|
25
25
|
# origin release.
|
|
26
|
-
CODEX_VERSION = "1.7.2"
|
|
26
|
+
CODEX_VERSION = "1.8.0"
|
|
27
27
|
# TTL (seconds) for cached SSE event payloads.
|
|
28
28
|
PROBE_TTL = "86400" # 24 h — small payload, refresh daily
|
|
29
29
|
PHASE1_TTL = "86400" # 24 h — matches origin Redis TTL
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "codex-pdf"
|
|
7
|
-
version = "1.7.2"
|
|
7
|
+
version = "1.8.0"
|
|
8
8
|
description = "Authoritative, versioned PDF facts contract for Think Neverland tools."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.12"
|
|
@@ -29,6 +29,9 @@ redis = [
|
|
|
29
29
|
geom = [
|
|
30
30
|
"pyclipr>=0.1.8",
|
|
31
31
|
]
|
|
32
|
+
retain = [
|
|
33
|
+
"boto3>=1.34",
|
|
34
|
+
]
|
|
32
35
|
|
|
33
36
|
[project.scripts]
|
|
34
37
|
codex-pdf = "codex_pdf.cli:main"
|
|
@@ -64,6 +64,11 @@ from starlette.middleware.base import BaseHTTPMiddleware
|
|
|
64
64
|
from codex_pdf.api.auth import authenticate
|
|
65
65
|
from codex_pdf.api.blob_store import make_blob_store
|
|
66
66
|
from codex_pdf.api.cache import cache_key, make_cache
|
|
67
|
+
from codex_pdf.api.retention import (
|
|
68
|
+
make_retention_store,
|
|
69
|
+
normalise_tenant,
|
|
70
|
+
parse_retention_consent,
|
|
71
|
+
)
|
|
67
72
|
from codex_pdf.api.warmup import warmup_worker
|
|
68
73
|
from codex_pdf.api.url_ingest import fetch_pdf_from_url
|
|
69
74
|
from codex_pdf.color import (
|
|
@@ -163,6 +168,7 @@ def _record(endpoint: str, status_code: int, duration: float) -> None:
|
|
|
163
168
|
|
|
164
169
|
_cache = make_cache()
|
|
165
170
|
_blob_store = make_blob_store()
|
|
171
|
+
_retention_store = make_retention_store()
|
|
166
172
|
|
|
167
173
|
|
|
168
174
|
def _repo_root() -> Path:
|
|
@@ -790,7 +796,11 @@ def _pre_render_bg(raw: bytes) -> None:
|
|
|
790
796
|
|
|
791
797
|
|
|
792
798
|
async def _extract_impl(
|
|
793
|
-
request: Request,
|
|
799
|
+
request: Request,
|
|
800
|
+
pdf: UploadFile | None,
|
|
801
|
+
*,
|
|
802
|
+
endpoint_label: str,
|
|
803
|
+
retain_form_value: str | None = None,
|
|
794
804
|
) -> JSONResponse:
|
|
795
805
|
started = time.perf_counter()
|
|
796
806
|
try:
|
|
@@ -805,6 +815,7 @@ async def _extract_impl(
|
|
|
805
815
|
payload["pdf_sha256"] = sha
|
|
806
816
|
# Warm the page-1 render cache in the background — don't block the response.
|
|
807
817
|
asyncio.ensure_future(loop.run_in_executor(None, _pre_render_bg, raw))
|
|
818
|
+
await _maybe_retain(request, raw, sha, payload, retain_form_value)
|
|
808
819
|
_record(endpoint_label, 200, time.perf_counter() - started)
|
|
809
820
|
return JSONResponse(payload)
|
|
810
821
|
except HTTPException as exc:
|
|
@@ -819,21 +830,116 @@ async def _extract_impl(
|
|
|
819
830
|
) from exc
|
|
820
831
|
|
|
821
832
|
|
|
833
|
+
async def _maybe_retain(
    request: Request,
    raw: bytes,
    sha: str,
    payload: dict[str, Any],
    retain_form_value: str | None,
) -> None:
    """Audit-log the retention-consent decision; persist the upload if opted in.

    The consent decision is reconciled from the ``retain_for_training`` form
    value and the ``x-compile-retain-for-training`` header, and is logged on
    every call — including the "off" path — so that a declined or absent
    consent is observable too.

    Persistence only runs when consent is affirmative AND a retention store
    is configured. The store's blocking ``put`` runs in the default executor.
    Any exception raised while persisting is logged and swallowed on purpose:
    the caller already has its extract result, so a storage failure must not
    turn into a failed request.

    Args:
        request: Incoming request; only its headers are read here.
        raw: The uploaded PDF bytes.
        sha: Hex SHA-256 of ``raw``; only the first 16 characters are logged.
        payload: The extract response body to store next to the PDF.
        retain_form_value: Raw value of the ``retain_for_training`` form field.
    """
    decision = parse_retention_consent(
        retain_form_value,
        request.headers.get("x-compile-retain-for-training"),
    )
    tenant = normalise_tenant(request.headers.get("x-codex-tenant"))
    request_id = request.headers.get("x-codex-request-id") or ""
    # Snapshot the module-level store once so the None check below and the
    # call made later on the worker thread are guaranteed to see the same
    # object, even if the global is rebound in between.
    store = _retention_store
    logger.info(
        "extract_consent decision=%s source=%s mismatch=%s tenant=%s sha=%s "
        "request_id=%s retention_configured=%s",
        decision.consent,
        decision.source,
        decision.mismatch,
        tenant,
        sha[:16],
        request_id,
        store is not None,
    )
    if not decision.consent or store is None:
        return
    try:
        # We are inside a coroutine, so a loop is running by definition;
        # get_running_loop() is the supported way to obtain it.
        loop = asyncio.get_running_loop()
        keys = await loop.run_in_executor(
            None,
            lambda: store.put(
                pdf_bytes=raw,
                extract_payload=payload,
                request_id=request_id,
                tenant=tenant,
                sha256=sha,
                codex_version=VERSION,
                consent_source=decision.source,
            ),
        )
        logger.info(
            "extract_retained sha=%s tenant=%s keys=%s",
            sha[:16],
            tenant,
            list(keys.values()),
        )
    except Exception:
        # Deliberate best-effort boundary: log with traceback, never raise.
        logger.exception("retention persist failed sha=%s tenant=%s", sha[:16], tenant)
|
|
890
|
+
|
|
891
|
+
|
|
822
892
|
@app.post("/extract", include_in_schema=False, dependencies=[Depends(authenticate)])
|
|
823
893
|
async def extract_root_endpoint(
|
|
824
894
|
request: Request,
|
|
825
895
|
pdf: UploadFile | None = File(default=None),
|
|
896
|
+
retain_for_training: str | None = Form(default=None),
|
|
826
897
|
) -> JSONResponse:
|
|
827
|
-
return await _extract_impl(
|
|
898
|
+
return await _extract_impl(
|
|
899
|
+
request, pdf, endpoint_label="extract", retain_form_value=retain_for_training
|
|
900
|
+
)
|
|
828
901
|
|
|
829
902
|
|
|
830
903
|
@app.post("/v1/extract", dependencies=[Depends(authenticate)])
|
|
831
904
|
async def extract_endpoint(
|
|
832
905
|
request: Request,
|
|
833
906
|
pdf: UploadFile | None = File(default=None),
|
|
907
|
+
retain_for_training: str | None = Form(default=None),
|
|
834
908
|
) -> JSONResponse:
|
|
835
909
|
"""Extract a CodexDocument from an uploaded PDF or remote URL."""
|
|
836
|
-
return await _extract_impl(
|
|
910
|
+
return await _extract_impl(
|
|
911
|
+
request, pdf, endpoint_label="extract", retain_form_value=retain_for_training
|
|
912
|
+
)
|
|
913
|
+
|
|
914
|
+
|
|
915
|
+
# JSON request body accepted by the retention-delete endpoint.
# (Documented with comments rather than a class docstring so the generated
# schema description of the model is left untouched.)
class _RetentionDeleteRequest(BaseModel):
    sha256: str = Field(
        ...,
        description="Lower-case hex SHA-256 of the PDF to erase.",
    )
|
|
917
|
+
|
|
918
|
+
|
|
919
|
+
@app.post("/v1/retention/delete", dependencies=[Depends(authenticate)])
async def retention_delete_endpoint(payload: _RetentionDeleteRequest) -> JSONResponse:
    """Erase every retained object for ``sha256`` (DSAR endpoint).

    Responses:
        * 503 when retention is not configured (operator hasn't set
          ``CODEX_RETAIN_BUCKET``).
        * 400 when the store's ``delete`` raises ``ValueError`` (a malformed
          sha256); the exception text becomes the response detail.
        * 200 with ``{"sha256": ..., "deleted": <count>}`` otherwise.
          ``deleted=0`` when no objects matched — DSAR replay is idempotent,
          callers don't need to distinguish "never stored" from "already
          erased".

    The submitted digest is stripped and lower-cased before use, and the
    blocking store call runs in the default executor.

    NOTE(review): deletion is keyed by sha256 only and this handler does not
    read ``X-Codex-Tenant`` — confirm that erasing across tenants is intended.
    """
    # Snapshot the module-level store so the None check and the executor call
    # operate on the same object.
    store = _retention_store
    if store is None:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="retention not configured (CODEX_RETAIN_BUCKET unset)",
        )
    sha = payload.sha256.strip().lower()
    # Inside a coroutine a loop is always running; get_running_loop() is the
    # supported accessor.
    loop = asyncio.get_running_loop()
    try:
        deleted = await loop.run_in_executor(None, store.delete, sha)
    except ValueError as exc:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST, detail=str(exc)
        ) from exc
    logger.info("retention_delete sha=%s deleted=%d", sha[:16], deleted)
    return JSONResponse({"sha256": sha, "deleted": deleted})
|
|
837
943
|
|
|
838
944
|
|
|
839
945
|
async def _read_probe_pdf(request: Request, pdf: UploadFile | None) -> bytes:
|
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
"""Opt-in PDF retention for the marketing demo.
|
|
2
|
+
|
|
3
|
+
When a request to ``/v1/extract`` carries an explicit "yes, retain for
|
|
4
|
+
training" signal, the codex sidecar persists the input PDF, the
|
|
5
|
+
extract response, and a tiny metadata object to S3-compatible storage
|
|
6
|
+
under a hive-partitioned key. Everything else (no flag, ``false``
|
|
7
|
+
flag, storage unconfigured) is a no-op — the bytes leave memory the
|
|
8
|
+
moment the response ships, exactly like before.
|
|
9
|
+
|
|
10
|
+
Object key layout::
|
|
11
|
+
|
|
12
|
+
{prefix}/tenant={tenant}/dt={YYYY-MM-DD}/sha256={hex64}/document.pdf
|
|
13
|
+
{prefix}/tenant={tenant}/dt={YYYY-MM-DD}/sha256={hex64}/extract.json
|
|
14
|
+
{prefix}/tenant={tenant}/dt={YYYY-MM-DD}/sha256={hex64}/meta.json
|
|
15
|
+
|
|
16
|
+
Hive partitioning makes the bucket Athena/Glue-queryable later
|
|
17
|
+
without a migration. ``dt=`` makes an S3 Lifecycle rule trivial
|
|
18
|
+
(operator-owned — the app does *not* try to manage lifecycle
|
|
19
|
+
policies). ``sha256=`` dedupes idempotent re-uploads of the same
|
|
20
|
+
file on the same day.
|
|
21
|
+
|
|
22
|
+
``CODEX_RETAIN_TTL_DAYS`` is informational: the app writes the
|
|
23
|
+
declared retention window into ``meta.json`` and the audit log, but
|
|
24
|
+
expiry is enforced by the bucket's lifecycle rule the operator
|
|
25
|
+
configures out-of-band.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
from __future__ import annotations
|
|
29
|
+
|
|
30
|
+
import json
|
|
31
|
+
import logging
|
|
32
|
+
import os
|
|
33
|
+
import re
|
|
34
|
+
from dataclasses import dataclass
|
|
35
|
+
from datetime import datetime, timezone
|
|
36
|
+
from typing import Any, Protocol
|
|
37
|
+
|
|
38
|
+
logger = logging.getLogger(__name__)
|
|
39
|
+
|
|
40
|
+
_TRUE_TOKENS = frozenset({"true", "1", "yes", "on"})
|
|
41
|
+
_TENANT_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,62}$")
|
|
42
|
+
_SHA256_RE = re.compile(r"^[0-9a-f]{64}$")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# ---------------------------------------------------------------------------
|
|
46
|
+
# Consent parsing
|
|
47
|
+
# ---------------------------------------------------------------------------
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass(frozen=True)
class ConsentDecision:
    """Immutable result of reconciling the form and header consent signals."""

    # Whether retention was consented to.
    consent: bool
    # Which signal decided the outcome: "form" | "header" | "both" | "none".
    source: str
    # True when both signals were present and form != header.
    mismatch: bool
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _truthy(value: str | None) -> bool | None:
    """Classify a raw consent value as yes, no, or absent.

    The value is whitespace-stripped and lower-cased first.

    Returns:
        ``None`` when ``value`` is ``None`` or blank after stripping (the
        signal is absent), ``True`` when the token is one of
        ``_TRUE_TOKENS``, and ``False`` for any other non-blank token —
        ``"false"``, ``"0"``, ``"no"``, ``"off"`` and garbage alike.

    The three-valued result lets the caller tell "user said no" from
    "user said nothing" when reconciling form against header.
    """
    token = "" if value is None else value.strip().lower()
    return token in _TRUE_TOKENS if token else None
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def parse_retention_consent(
    form_value: str | None, header_value: str | None
) -> ConsentDecision:
    """Combine the form-field and header consent signals into one decision.

    Precedence, as implemented below:

    * neither signal present  -> no consent, source ``"none"``;
    * only the header present -> the header decides, source ``"header"``;
    * only the form present   -> the form decides, source ``"form"``;
    * both present and true   -> consent, source ``"both"``;
    * both present otherwise  -> the form decides, source ``"form"``, and
      ``mismatch`` records whether the two disagreed.

    The form field therefore always wins over the header; the mismatch flag
    exists so the audit log can surface the disagreement without overriding
    what the form said.
    """
    form_flag = _truthy(form_value)
    header_flag = _truthy(header_value)

    if form_flag is None:
        if header_flag is None:
            return ConsentDecision(consent=False, source="none", mismatch=False)
        return ConsentDecision(consent=header_flag, source="header", mismatch=False)

    if header_flag is None:
        return ConsentDecision(consent=form_flag, source="form", mismatch=False)

    if form_flag and header_flag:
        return ConsentDecision(consent=True, source="both", mismatch=False)

    return ConsentDecision(
        consent=form_flag,
        source="form",
        mismatch=form_flag != header_flag,
    )
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def normalise_tenant(raw: str | None) -> str:
    """Return a validated tenant slug, or ``"default"`` when unusable.

    The header value is stripped and lower-cased, then checked against
    ``_TENANT_RE``. A missing or blank value falls back silently; a value
    that fails the pattern falls back too, with a warning logged that
    includes the raw value. Falling back instead of rejecting means an
    upstream typo cannot break the user-facing extract.
    """
    candidate = (raw or "").strip().lower()
    if not candidate:
        return "default"
    if _TENANT_RE.match(candidate) is None:
        logger.warning("retention tenant header rejected raw=%r → default", raw)
        return "default"
    return candidate
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# ---------------------------------------------------------------------------
|
|
120
|
+
# Storage
|
|
121
|
+
# ---------------------------------------------------------------------------
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
@dataclass(frozen=True)
class RetentionConfig:
    """Immutable retention-storage settings read from ``CODEX_RETAIN_*`` env vars."""

    bucket: str
    prefix: str
    ttl_days: int
    endpoint_url: str | None
    region: str
    access_key_id: str | None
    secret_access_key: str | None

    @classmethod
    def from_env(cls) -> RetentionConfig | None:
        """Build the config from the environment.

        Returns:
            ``None`` when ``CODEX_RETAIN_BUCKET`` is unset or blank,
            otherwise a populated config.

        Every string setting is whitespace-stripped, and a blank value is
        treated exactly like an unset one:

        * ``CODEX_RETAIN_PREFIX`` additionally loses leading/trailing ``/``.
        * ``CODEX_RETAIN_REGION`` defaults to ``us-east-1``.
        * endpoint URL and both credentials become ``None``.
        * ``CODEX_RETAIN_TTL_DAYS`` defaults to 90; a non-integer value logs
          a warning and also falls back to 90.
        """

        def env(name: str) -> str:
            # Unset and blank variables collapse to the same empty string.
            return (os.environ.get(name) or "").strip()

        bucket = env("CODEX_RETAIN_BUCKET")
        if not bucket:
            return None
        try:
            ttl_days = int(os.environ.get("CODEX_RETAIN_TTL_DAYS", "90"))
        except ValueError:
            logger.warning("CODEX_RETAIN_TTL_DAYS is not an int → defaulting to 90")
            ttl_days = 90
        return cls(
            bucket=bucket,
            prefix=env("CODEX_RETAIN_PREFIX").strip("/"),
            ttl_days=ttl_days,
            endpoint_url=env("CODEX_RETAIN_ENDPOINT_URL") or None,
            # Apply the default AFTER stripping so a whitespace-only value
            # cannot yield an empty region.
            region=env("CODEX_RETAIN_REGION") or "us-east-1",
            access_key_id=env("CODEX_RETAIN_ACCESS_KEY_ID") or None,
            secret_access_key=env("CODEX_RETAIN_SECRET_ACCESS_KEY") or None,
        )
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
class _S3Client(Protocol):
|
|
158
|
+
def put_object(self, **kwargs: Any) -> Any: ...
|
|
159
|
+
def list_objects_v2(self, **kwargs: Any) -> Any: ...
|
|
160
|
+
def delete_objects(self, **kwargs: Any) -> Any: ...
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _utc_date() -> str:
|
|
164
|
+
return datetime.now(timezone.utc).date().isoformat()
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _utc_ts() -> str:
|
|
168
|
+
return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _object_key(prefix: str, tenant: str, dt: str, sha: str, suffix: str) -> str:
|
|
172
|
+
parts = [
|
|
173
|
+
p for p in (prefix, f"tenant={tenant}", f"dt={dt}", f"sha256={sha}", suffix) if p
|
|
174
|
+
]
|
|
175
|
+
return "/".join(parts)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
class RetentionStore:
    """Three-object-per-event S3 writer.

    Each retained event is stored as ``document.pdf``, ``extract.json`` and
    ``meta.json`` under one shared
    ``[<prefix>/]tenant=<tenant>/dt=<date>/sha256=<sha>/`` key prefix.

    The S3 client is constructor-injected so tests substitute a
    ``MagicMock`` without depending on boto3 (or moto). Production
    instantiation goes through ``make_retention_store()``.
    """

    # S3 ``DeleteObjects`` accepts at most 1000 keys per request.
    _DELETE_BATCH_SIZE = 1000

    def __init__(self, config: RetentionConfig, client: _S3Client) -> None:
        """Bind the store to its configuration and S3 client."""
        self._config = config
        self._client = client

    @property
    def config(self) -> RetentionConfig:
        """The configuration this store was constructed with."""
        return self._config

    def put(
        self,
        *,
        pdf_bytes: bytes,
        extract_payload: dict[str, Any],
        request_id: str,
        tenant: str,
        sha256: str,
        codex_version: str,
        consent_source: str,
    ) -> dict[str, str]:
        """Write ``document.pdf``, ``extract.json``, ``meta.json``.

        Both JSON bodies are serialised before the first upload, so a payload
        that cannot be encoded fails before anything reaches the bucket
        instead of leaving a PDF stored without its extract and metadata.

        Args:
            pdf_bytes: Raw PDF content, stored as ``document.pdf``.
            extract_payload: Extraction result, stored as compact JSON with
                sorted keys in ``extract.json``.
            request_id: Request identifier recorded in ``meta.json``.
            tenant: Tenant name used for the ``tenant=`` key partition.
            sha256: Content hash used for the ``sha256=`` key partition.
            codex_version: Version string recorded in ``meta.json``.
            consent_source: Origin of the consent, recorded in ``meta.json``.

        Returns:
            The three object keys for the audit log, under ``"pdf"``,
            ``"extract"`` and ``"meta"``.

        Raises:
            TypeError: If ``extract_payload`` is not JSON-serialisable.
            ValueError: If ``extract_payload`` contains a circular reference.
        """
        dt = _utc_date()
        meta: dict[str, Any] = {
            "request_id": request_id,
            "ts": _utc_ts(),
            "sha256": sha256,
            "content_length": len(pdf_bytes),
            "tenant": tenant,
            "codex_version": codex_version,
            "consent_source": consent_source,
            "retention_window_days": self._config.ttl_days,
        }

        # Encode first, upload second: see the docstring for why.
        extract_body = json.dumps(
            extract_payload, sort_keys=True, separators=(",", ":")
        ).encode("utf-8")
        meta_body = json.dumps(meta, sort_keys=True, separators=(",", ":")).encode("utf-8")

        pdf_key = _object_key(self._config.prefix, tenant, dt, sha256, "document.pdf")
        extract_key = _object_key(self._config.prefix, tenant, dt, sha256, "extract.json")
        meta_key = _object_key(self._config.prefix, tenant, dt, sha256, "meta.json")

        # Upload order is unchanged: PDF, then extract, then meta.
        uploads = (
            (pdf_key, pdf_bytes, "application/pdf"),
            (extract_key, extract_body, "application/json"),
            (meta_key, meta_body, "application/json"),
        )
        for key, body, content_type in uploads:
            self._client.put_object(
                Bucket=self._config.bucket,
                Key=key,
                Body=body,
                ContentType=content_type,
            )
        return {"pdf": pdf_key, "extract": extract_key, "meta": meta_key}

    def delete(self, sha256: str) -> int:
        """Erase every object with ``sha256={sha}/`` in its key.

        Scans every ``dt=`` / ``tenant=`` partition. S3 has no native
        sha-suffix search, but a 90-day window with a couple of
        tenants is small enough that the linear scan is fine — and
        avoids a parallel index we'd have to keep consistent.

        Args:
            sha256: Content hash whose retained objects should be removed.

        Returns:
            The number of objects actually deleted. Keys that the delete
            response reports under ``Errors`` are logged and not counted.

        Raises:
            ValueError: If ``sha256`` does not match ``_SHA256_RE``.
        """
        if not _SHA256_RE.match(sha256):
            raise ValueError(f"invalid sha256: {sha256!r}")

        needle = f"/sha256={sha256}/"
        list_prefix = f"{self._config.prefix}/" if self._config.prefix else ""
        matches: list[dict[str, str]] = []
        token: str | None = None
        while True:
            kwargs: dict[str, Any] = {
                "Bucket": self._config.bucket,
                "Prefix": list_prefix,
            }
            if token:
                kwargs["ContinuationToken"] = token
            resp = self._client.list_objects_v2(**kwargs)
            for entry in resp.get("Contents", []) or []:
                key = entry.get("Key", "")
                if needle in key:
                    matches.append({"Key": key})
            if not resp.get("IsTruncated"):
                break
            token = resp.get("NextContinuationToken")
            if not token:
                # Truncated page without a token: stop rather than loop forever.
                break

        deleted = 0
        for start in range(0, len(matches), self._DELETE_BATCH_SIZE):
            batch = matches[start : start + self._DELETE_BATCH_SIZE]
            result = self._client.delete_objects(
                Bucket=self._config.bucket, Delete={"Objects": batch}
            )
            # A successful DeleteObjects call can still list per-key failures
            # under "Errors"; those objects remain in the bucket. Responses
            # that are not plain dicts (e.g. test doubles) carry no error list.
            errors = result.get("Errors") if isinstance(result, dict) else None
            failed = len(errors) if isinstance(errors, list) else 0
            if failed:
                logger.warning(
                    "retention delete for sha256=%s: %d of %d objects could not be deleted",
                    sha256,
                    failed,
                    len(batch),
                )
            deleted += len(batch) - failed
        return deleted
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def make_retention_store() -> RetentionStore | None:
    """Build a production ``RetentionStore`` from env, or return ``None``.

    ``None`` is the explicit "feature off" sentinel. The boto3 import
    is deferred so the base wheel doesn't pull boto3 in unless the
    operator opts in via ``CODEX_RETAIN_BUCKET``.

    Returns:
        A store wired to a boto3 S3 client, or ``None`` when
        ``RetentionConfig.from_env()`` yields no configuration or boto3 is
        not installed.
    """
    config = RetentionConfig.from_env()
    if config is None:
        return None
    try:
        import boto3  # type: ignore[import-not-found]
    except ImportError:
        logger.warning(
            "CODEX_RETAIN_BUCKET set but boto3 is not installed — "
            "install codex-pdf[retain] to enable retention. Falling back to disabled."
        )
        return None
    client_kwargs: dict[str, Any] = {"region_name": config.region}
    if config.endpoint_url:
        client_kwargs["endpoint_url"] = config.endpoint_url
    if config.access_key_id and config.secret_access_key:
        client_kwargs["aws_access_key_id"] = config.access_key_id
        client_kwargs["aws_secret_access_key"] = config.secret_access_key
    elif config.access_key_id or config.secret_access_key:
        # Explicit credentials are only passed as a pair. Surface a
        # half-configured pair instead of silently dropping it, because the
        # client will then resolve credentials from boto3's defaults.
        logger.warning(
            "Only one of CODEX_RETAIN_ACCESS_KEY_ID / CODEX_RETAIN_SECRET_ACCESS_KEY "
            "is set — ignoring it and using boto3's default credential resolution."
        )
    client = boto3.client("s3", **client_kwargs)
    return RetentionStore(config, client)
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""Unit tests for the retention consent parser.
|
|
2
|
+
|
|
3
|
+
The marketing demo sends both a form field (``retain_for_training``)
|
|
4
|
+
and a header (``X-Compile-Retain-For-Training``). These tests pin the
|
|
5
|
+
full truth matrix: missing/present × every token variant × form-vs-
|
|
6
|
+
header reconciliation. Anything outside the documented true tokens
|
|
7
|
+
must be false — a typo like ``"yes please"`` is not an opt-in.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import pytest
|
|
13
|
+
|
|
14
|
+
from codex_pdf.api.retention import (
|
|
15
|
+
ConsentDecision,
|
|
16
|
+
normalise_tenant,
|
|
17
|
+
parse_retention_consent,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@pytest.mark.parametrize(
    "form,header,want_consent,want_source,want_mismatch",
    [
        # No usable value on either channel (absent, empty or blank) → no consent.
        (None, None, False, "none", False),
        ("", None, False, "none", False),
        (None, "", False, "none", False),
        (" ", " ", False, "none", False),
        # Header alone: the documented true tokens opt in, anything else does not.
        (None, "true", True, "header", False),
        (None, "TRUE", True, "header", False),
        (None, "1", True, "header", False),
        (None, "yes", True, "header", False),
        (None, "on", True, "header", False),
        (None, "false", False, "header", False),
        (None, "0", False, "header", False),
        (None, "no", False, "header", False),
        (None, "off", False, "header", False),
        (None, "maybe", False, "header", False),
        # Form field alone: same token rules, source reported as "form".
        ("true", None, True, "form", False),
        ("TRUE", None, True, "form", False),
        ("1", None, True, "form", False),
        ("yes", None, True, "form", False),
        ("on", None, True, "form", False),
        ("false", None, False, "form", False),
        ("0", None, False, "form", False),
        ("no", None, False, "form", False),
        ("off", None, False, "form", False),
        # Both channels agree: a shared yes reports "both", a shared no reports "form".
        ("true", "true", True, "both", False),
        ("1", "yes", True, "both", False),
        ("false", "false", False, "form", False),
        # Channels disagree: the form value decides and the mismatch is flagged.
        ("true", "false", True, "form", True),
        ("false", "true", False, "form", True),
        ("yes", "no", True, "form", True),
        # Surrounding whitespace and letter case are ignored.
        (" Yes ", None, True, "form", False),
        (None, " ON\n", True, "header", False),
    ],
)
def test_parse_retention_consent_matrix(
    form: str | None,
    header: str | None,
    want_consent: bool,
    want_source: str,
    want_mismatch: bool,
) -> None:
    """Check one row of the consent truth table against the parser."""
    decision = parse_retention_consent(form, header)
    expected = ConsentDecision(
        consent=want_consent, source=want_source, mismatch=want_mismatch
    )
    assert decision == expected
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@pytest.mark.parametrize(
    "raw,want",
    [
        # Missing, empty or blank input maps to the default tenant.
        (None, "default"),
        ("", "default"),
        (" ", "default"),
        # Accepted names come back lower-cased.
        ("compile-marketing", "compile-marketing"),
        ("COMPILE-Marketing", "compile-marketing"),
        ("acme42", "acme42"),
        # Rejected names (underscore, space, leading dash, 64 characters,
        # slash) fall back to "default" rather than raising.
        ("under_score", "default"),
        ("has space", "default"),
        ("-leadingdash", "default"),
        ("a" * 64, "default"),
        ("a/b", "default"),
    ],
)
def test_normalise_tenant(raw: str | None, want: str) -> None:
    """Check that one raw tenant value normalises to the expected name."""
    normalised = normalise_tenant(raw)
    assert normalised == want
|