codex-pdf 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. codex_pdf-0.1.1/.github/workflows/ci.yml +35 -0
  2. codex_pdf-0.1.1/.gitignore +11 -0
  3. codex_pdf-0.1.1/LICENSE +7 -0
  4. codex_pdf-0.1.1/PKG-INFO +84 -0
  5. codex_pdf-0.1.1/README.md +70 -0
  6. codex_pdf-0.1.1/docs/architecture.md +35 -0
  7. codex_pdf-0.1.1/docs/backward-compatibility.md +24 -0
  8. codex_pdf-0.1.1/docs/cleanup-stop-gates.md +25 -0
  9. codex_pdf-0.1.1/docs/cli.md +44 -0
  10. codex_pdf-0.1.1/docs/contract.md +36 -0
  11. codex_pdf-0.1.1/docs/discovery-audit.md +61 -0
  12. codex_pdf-0.1.1/docs/migration-plan.md +32 -0
  13. codex_pdf-0.1.1/docs/parity.md +36 -0
  14. codex_pdf-0.1.1/docs/preflight-ingest.md +33 -0
  15. codex_pdf-0.1.1/pyproject.toml +41 -0
  16. codex_pdf-0.1.1/reports/parity/viewer_essentials.json +195 -0
  17. codex_pdf-0.1.1/schemas/CHANGELOG.md +19 -0
  18. codex_pdf-0.1.1/schemas/v1/codex-annotation.schema.json +16 -0
  19. codex_pdf-0.1.1/schemas/v1/codex-box.schema.json +14 -0
  20. codex_pdf-0.1.1/schemas/v1/codex-color-space.schema.json +35 -0
  21. codex_pdf-0.1.1/schemas/v1/codex-document.schema.json +50 -0
  22. codex_pdf-0.1.1/schemas/v1/codex-font.schema.json +18 -0
  23. codex_pdf-0.1.1/schemas/v1/codex-form-xobject.schema.json +13 -0
  24. codex_pdf-0.1.1/schemas/v1/codex-image.schema.json +28 -0
  25. codex_pdf-0.1.1/schemas/v1/codex-issue.schema.json +17 -0
  26. codex_pdf-0.1.1/schemas/v1/codex-ocg.schema.json +15 -0
  27. codex_pdf-0.1.1/schemas/v1/codex-output-intent.schema.json +12 -0
  28. codex_pdf-0.1.1/schemas/v1/codex-page-object.schema.json +20 -0
  29. codex_pdf-0.1.1/schemas/v1/codex-page.schema.json +36 -0
  30. codex_pdf-0.1.1/schemas/v1/codex-preflight-report.schema.json +16 -0
  31. codex_pdf-0.1.1/schemas/v1/codex-source.schema.json +12 -0
  32. codex_pdf-0.1.1/schemas/v1/codex-spot-colorant.schema.json +13 -0
  33. codex_pdf-0.1.1/schemas/v1/codex-transparency-tree.schema.json +22 -0
  34. codex_pdf-0.1.1/schemas/v1/codex-trap-evidence.schema.json +25 -0
  35. codex_pdf-0.1.1/schemas/v1/codex-warning.schema.json +13 -0
  36. codex_pdf-0.1.1/scripts/parity_viewer_essentials.py +41 -0
  37. codex_pdf-0.1.1/src/codex_pdf/__init__.py +6 -0
  38. codex_pdf-0.1.1/src/codex_pdf/cli.py +125 -0
  39. codex_pdf-0.1.1/src/codex_pdf/extract/__init__.py +5 -0
  40. codex_pdf-0.1.1/src/codex_pdf/extract/annotations.py +39 -0
  41. codex_pdf-0.1.1/src/codex_pdf/extract/color.py +99 -0
  42. codex_pdf-0.1.1/src/codex_pdf/extract/common.py +31 -0
  43. codex_pdf-0.1.1/src/codex_pdf/extract/content_inventory.py +68 -0
  44. codex_pdf-0.1.1/src/codex_pdf/extract/document.py +97 -0
  45. codex_pdf-0.1.1/src/codex_pdf/extract/fonts.py +50 -0
  46. codex_pdf-0.1.1/src/codex_pdf/extract/forms.py +46 -0
  47. codex_pdf-0.1.1/src/codex_pdf/extract/images.py +45 -0
  48. codex_pdf-0.1.1/src/codex_pdf/extract/ocg.py +65 -0
  49. codex_pdf-0.1.1/src/codex_pdf/extract/structure.py +57 -0
  50. codex_pdf-0.1.1/src/codex_pdf/extract/transparency.py +31 -0
  51. codex_pdf-0.1.1/src/codex_pdf/extract/trapping.py +46 -0
  52. codex_pdf-0.1.1/src/codex_pdf/models/__init__.py +5 -0
  53. codex_pdf-0.1.1/src/codex_pdf/models/v1.py +285 -0
  54. codex_pdf-0.1.1/src/codex_pdf/parity.py +298 -0
  55. codex_pdf-0.1.1/src/codex_pdf/preflight_ingest/__init__.py +21 -0
  56. codex_pdf-0.1.1/src/codex_pdf/preflight_ingest/adapters.py +219 -0
  57. codex_pdf-0.1.1/src/codex_pdf/schema.py +20 -0
  58. codex_pdf-0.1.1/src/codex_pdf/version.py +3 -0
  59. codex_pdf-0.1.1/tests/golden/1.0.0/reference.json +67 -0
  60. codex_pdf-0.1.1/tests/test_extract_structural.py +25 -0
  61. codex_pdf-0.1.1/tests/test_golden.py +13 -0
  62. codex_pdf-0.1.1/tests/test_golden_corpus.py +30 -0
  63. codex_pdf-0.1.1/tests/test_models.py +18 -0
  64. codex_pdf-0.1.1/tests/test_parity.py +115 -0
  65. codex_pdf-0.1.1/tests/test_preflight_ingest.py +42 -0
  66. codex_pdf-0.1.1/tests/test_schema.py +11 -0
  67. codex_pdf-0.1.1/tests/test_schemas_all.py +11 -0
  68. codex_pdf-0.1.1/uv.lock +618 -0
@@ -0,0 +1,35 @@
1
+ name: ci
2
+
3
+ on:
4
+ pull_request:
5
+ push:
6
+ branches: [main]
7
+
8
+ jobs:
9
+ test:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+ - uses: actions/setup-python@v5
14
+ with:
15
+ python-version: "3.12"
16
+ - name: Install
17
+ run: |
18
+ python -m pip install --upgrade pip
19
+ pip install -e . pytest
20
+ - name: Test
21
+ run: pytest
22
+ - name: Parity Gate (Deep Profile)
23
+ run: |
24
+ python - <<'PY'
25
+ import fitz
26
+ from pathlib import Path
27
+ root = Path("tmp-parity")
28
+ root.mkdir(exist_ok=True)
29
+ path = root / "fixture.pdf"
30
+ doc = fitz.open()
31
+ page = doc.new_page(width=612, height=792)
32
+ page.insert_text((72, 72), "codex parity fixture")
33
+ doc.save(path)
34
+ PY
35
+ codex-pdf parity --fixtures-root tmp-parity --profile deep --max-files 5
@@ -0,0 +1,11 @@
1
+ .venv/
2
+ __pycache__/
3
+ .pytest_cache/
4
+ .ruff_cache/
5
+ .mypy_cache/
6
+ *.pyc
7
+ dist/
8
+ build/
9
+ *.egg-info/
10
+ coverage.xml
11
+ htmlcov/
@@ -0,0 +1,7 @@
1
+ SPDX-License-Identifier: AGPL-3.0-or-later
2
+
3
+ codexPDF is distributed under the GNU Affero General Public License v3.0
4
+ or (at your option) any later version.
5
+
6
+ You should have received a copy of the GNU Affero General Public License
7
+ along with this program. If not, see https://www.gnu.org/licenses/.
@@ -0,0 +1,84 @@
1
+ Metadata-Version: 2.4
2
+ Name: codex-pdf
3
+ Version: 0.1.1
4
+ Summary: Authoritative, versioned PDF facts contract for Think Neverland tools.
5
+ Author-email: Think Neverland <dev@thinkneverland.com>
6
+ License-Expression: AGPL-3.0-or-later
7
+ License-File: LICENSE
8
+ Requires-Python: >=3.12
9
+ Requires-Dist: jsonschema>=4.23
10
+ Requires-Dist: pikepdf>=9.0
11
+ Requires-Dist: pydantic>=2.8
12
+ Requires-Dist: pymupdf>=1.24
13
+ Description-Content-Type: text/markdown
14
+
15
+ ---
16
+ title: "Overview"
17
+ description: "Authoritative read-only PDF facts engine for Think Neverland tools. Versioned contract, schema-validated output, and consumer-agnostic extraction."
18
+ group: "Getting started"
19
+ order: 1
20
+ slug: "overview"
21
+ ---
22
+
23
+ # codexPDF
24
+
25
+ `codexPDF` is Think Neverland's authoritative, read-only PDF facts reference.
26
+
27
+ Other engines consult `codexPDF` for canonical document facts instead of
28
+ re-parsing PDFs independently. The contract is versioned and schema-validated.
29
+
30
+ ## Status
31
+
32
+ Current baseline includes:
33
+
34
+ - Python package (`codex_pdf`) with typed models
35
+ - CLI (`codex-pdf extract|schema|validate|probe|parity`)
36
+ - Versioned schemas in `schemas/v1/`
37
+ - Golden output harness under `tests/golden/`
38
+
39
+ ## Quickstart
40
+
41
+ ```bash
42
+ uv sync
43
+ uv run codex-pdf probe input.pdf --json
44
+ uv run codex-pdf extract input.pdf --pretty > out.json
45
+ uv run codex-pdf validate out.json
46
+ uv run codex-pdf parity --fixtures-root tests/fixtures --profile summary --max-files 5
47
+ uv run codex-pdf parity --fixtures-root tests/fixtures --profile inventory --max-files 5
48
+ uv run codex-pdf parity --fixtures-root tests/fixtures --profile deep --max-files 5
49
+ ```
50
+
51
+ Optional external baseline comparison (consumer-specific adapter provided at runtime):
52
+
53
+ ```bash
54
+ uv run codex-pdf parity \
55
+ --fixtures-root /path/to/pdfs \
56
+ --profile summary \
57
+ --baseline-command "<your_command_with_{pdf}_placeholder>"
58
+ ```
59
+
60
+ ## Contract
61
+
62
+ The public API is the JSON contract rooted at `CodexDocument`.
63
+
64
+ - Schema path: `schemas/v1/codex-document.schema.json`
65
+ - Runtime model: `codex_pdf.models.v1.CodexDocument`
66
+ - Stability policy: SemVer (`major` for breaking contract changes)
67
+
68
+ ## Documentation
69
+
70
+ | Topic | Doc |
71
+ | --- | --- |
72
+ | Architecture and boundaries | [docs/architecture.md](./docs/architecture.md) |
73
+ | CLI commands and usage patterns | [docs/cli.md](./docs/cli.md) |
74
+ | Contract and schema versioning | [docs/contract.md](./docs/contract.md) |
75
+ | Parity profiles and baselines | [docs/parity.md](./docs/parity.md) |
76
+ | Preflight ingest adapters | [docs/preflight-ingest.md](./docs/preflight-ingest.md) |
77
+ | Migration sequencing | [docs/migration-plan.md](./docs/migration-plan.md) |
78
+ | Legacy discovery audit | [docs/discovery-audit.md](./docs/discovery-audit.md) |
79
+ | Backward compatibility requirements | [docs/backward-compatibility.md](./docs/backward-compatibility.md) |
80
+ | Cleanup stop-gates policy | [docs/cleanup-stop-gates.md](./docs/cleanup-stop-gates.md) |
81
+
82
+ ## License
83
+
84
+ AGPL-3.0-or-later.
@@ -0,0 +1,70 @@
1
+ ---
2
+ title: "Overview"
3
+ description: "Authoritative read-only PDF facts engine for Think Neverland tools. Versioned contract, schema-validated output, and consumer-agnostic extraction."
4
+ group: "Getting started"
5
+ order: 1
6
+ slug: "overview"
7
+ ---
8
+
9
+ # codexPDF
10
+
11
+ `codexPDF` is Think Neverland's authoritative, read-only PDF facts reference.
12
+
13
+ Other engines consult `codexPDF` for canonical document facts instead of
14
+ re-parsing PDFs independently. The contract is versioned and schema-validated.
15
+
16
+ ## Status
17
+
18
+ Current baseline includes:
19
+
20
+ - Python package (`codex_pdf`) with typed models
21
+ - CLI (`codex-pdf extract|schema|validate|probe|parity`)
22
+ - Versioned schemas in `schemas/v1/`
23
+ - Golden output harness under `tests/golden/`
24
+
25
+ ## Quickstart
26
+
27
+ ```bash
28
+ uv sync
29
+ uv run codex-pdf probe input.pdf --json
30
+ uv run codex-pdf extract input.pdf --pretty > out.json
31
+ uv run codex-pdf validate out.json
32
+ uv run codex-pdf parity --fixtures-root tests/fixtures --profile summary --max-files 5
33
+ uv run codex-pdf parity --fixtures-root tests/fixtures --profile inventory --max-files 5
34
+ uv run codex-pdf parity --fixtures-root tests/fixtures --profile deep --max-files 5
35
+ ```
36
+
37
+ Optional external baseline comparison (consumer-specific adapter provided at runtime):
38
+
39
+ ```bash
40
+ uv run codex-pdf parity \
41
+ --fixtures-root /path/to/pdfs \
42
+ --profile summary \
43
+ --baseline-command "<your_command_with_{pdf}_placeholder>"
44
+ ```
45
+
46
+ ## Contract
47
+
48
+ The public API is the JSON contract rooted at `CodexDocument`.
49
+
50
+ - Schema path: `schemas/v1/codex-document.schema.json`
51
+ - Runtime model: `codex_pdf.models.v1.CodexDocument`
52
+ - Stability policy: SemVer (`major` for breaking contract changes)
53
+
54
+ ## Documentation
55
+
56
+ | Topic | Doc |
57
+ | --- | --- |
58
+ | Architecture and boundaries | [docs/architecture.md](./docs/architecture.md) |
59
+ | CLI commands and usage patterns | [docs/cli.md](./docs/cli.md) |
60
+ | Contract and schema versioning | [docs/contract.md](./docs/contract.md) |
61
+ | Parity profiles and baselines | [docs/parity.md](./docs/parity.md) |
62
+ | Preflight ingest adapters | [docs/preflight-ingest.md](./docs/preflight-ingest.md) |
63
+ | Migration sequencing | [docs/migration-plan.md](./docs/migration-plan.md) |
64
+ | Legacy discovery audit | [docs/discovery-audit.md](./docs/discovery-audit.md) |
65
+ | Backward compatibility requirements | [docs/backward-compatibility.md](./docs/backward-compatibility.md) |
66
+ | Cleanup stop-gates policy | [docs/cleanup-stop-gates.md](./docs/cleanup-stop-gates.md) |
67
+
68
+ ## License
69
+
70
+ AGPL-3.0-or-later.
@@ -0,0 +1,35 @@
1
+ ---
2
+ title: "Architecture"
3
+ description: "codexPDF boundaries, extraction pipeline shape, and the contract-first model used by downstream tools."
4
+ group: "Getting started"
5
+ order: 2
6
+ ---
7
+
8
+ # Architecture
9
+
10
+ `codexPDF` is a contract-first facts engine for PDF documents.
11
+
12
+ ## Boundary
13
+
14
+ - Read-only extraction only.
15
+ - No rendering, layout, mutation, or rule adjudication.
16
+ - Consumer-agnostic output: same contract regardless of caller.
17
+
18
+ ## Pipeline
19
+
20
+ 1. Input PDF bytes are loaded by the extractor layer.
21
+ 2. Domain extractors populate `CodexDocument` fields (pages, boxes, fonts,
22
+ images, color spaces, OCG/layers, annotations, transparency, trapping).
23
+ 3. Output is serialized as JSON against published schema definitions in
24
+ `schemas/v1/`.
25
+
26
+ ## Primary contract
27
+
28
+ - Runtime model: `codex_pdf.models.v1.CodexDocument`
29
+ - Schema: `schemas/v1/codex-document.schema.json`
30
+ - Version marker: `schema_version` field in payload
31
+
32
+ ## Consumer relationship
33
+
34
+ Downstream engines should treat codex output as the source of truth for
35
+ document facts and keep any product-specific behavior in adapter layers.
@@ -0,0 +1,24 @@
1
+ ---
2
+ title: "Backward Compatibility"
3
+ description: "Consumer payload compatibility expectations during codexPDF rollout and cutover."
4
+ group: "Project"
5
+ order: 8
6
+ ---
7
+
8
+ # Backward Compatibility Requirements
9
+
10
+ During migration, preserve the existing consumer payloads while introducing
11
+ codex-backed data:
12
+
13
+ - `lint-pdf` viewer essentials shape:
14
+ - `pdf_version`, `page_count`, `is_encrypted`, `pages`, `info_dict`
15
+ - findings payload fields consumed by `lint-pdf-ui`:
16
+ - `inspection_id`, `severity`, `page_num`, `bbox`, `message`, `details`
17
+ - `loupe-pdf/types` public contracts:
18
+ - `PageInfo`, `LayerInfo`, `ViewerConfig`, `ColorSample`
19
+
20
+ Compatibility strategy:
21
+
22
+ 1. Keep existing endpoints unchanged.
23
+ 2. Introduce codex fields as additive (`codex_*`) metadata.
24
+ 3. Maintain feature-flagged fallback to legacy parser paths.
@@ -0,0 +1,25 @@
1
+ ---
2
+ title: "Cleanup Stop Gates"
3
+ description: "Release gates required before downstream parser deletion and hard cutover enforcement."
4
+ group: "Project"
5
+ order: 9
6
+ ---
7
+
8
+ # STOP-Gated Cleanup Policy
9
+
10
+ No deletions of parse code in downstream repositories are permitted until all
11
+ conditions below pass and are approved.
12
+
13
+ ## Required gates
14
+
15
+ 1. Dual-run parity report on reference corpus is green.
16
+ 2. Contract schema remains backward-compatible for pinned consumers.
17
+ 3. Latest shipping release candidates of `lint-pdf` and `loupe-pdf` pass CI with codex enabled.
18
+ 4. Explicit go/no-go approval recorded for each repository cleanup PR.
19
+
20
+ ## Candidate cleanup targets (future)
21
+
22
+ - `lint-pdf`: direct parser/semantic extraction branches replaced by codex adapter.
23
+ - `loupe-pdf`: byte-scan spot extraction where codex provides canonical data.
24
+ - `lint-pdf-ui`: legacy field alias handling after all APIs converge.
25
+ - `assay-pdf`: none required; codex remains an optional shell-out integration.
@@ -0,0 +1,44 @@
1
+ ---
2
+ title: "CLI"
3
+ description: "Command reference for extract, schema, validate, probe, and parity workflows."
4
+ group: "Getting started"
5
+ order: 3
6
+ ---
7
+
8
+ # CLI
9
+
10
+ `codex-pdf` exposes a contract-oriented CLI.
11
+
12
+ ## Commands
13
+
14
+ - `extract <input_pdf>` — emit full `CodexDocument` JSON.
15
+ - `schema` — print schema JSON (published or runtime-generated).
16
+ - `validate <codex_json>` — validate output against published schema.
17
+ - `probe <input_pdf>` — return lightweight metadata summary.
18
+ - `parity` — compare codex projections against baseline projections.
19
+
20
+ ## Common usage
21
+
22
+ ```bash
23
+ uv run codex-pdf extract input.pdf --pretty > out.json
24
+ uv run codex-pdf validate out.json
25
+ uv run codex-pdf probe input.pdf --json
26
+ ```
27
+
28
+ ## Parity usage
29
+
30
+ ```bash
31
+ uv run codex-pdf parity \
32
+ --fixtures-root tests/fixtures \
33
+ --profile deep \
34
+ --max-files 10
35
+ ```
36
+
37
+ Baseline command mode:
38
+
39
+ ```bash
40
+ uv run codex-pdf parity \
41
+ --fixtures-root /path/to/pdfs \
42
+ --profile summary \
43
+ --baseline-command "<command with {pdf} placeholder>"
44
+ ```
@@ -0,0 +1,36 @@
1
+ ---
2
+ title: "Contract and Schemas"
3
+ description: "CodexDocument model, schema publishing approach, and compatibility policy."
4
+ group: "Reference"
5
+ order: 4
6
+ ---
7
+
8
+ # Contract and Schemas
9
+
10
+ The codex public contract is rooted at `CodexDocument`.
11
+
12
+ ## Runtime model
13
+
14
+ - Python model: `codex_pdf.models.v1.CodexDocument`
15
+ - Child types: page boxes, inventories, fonts, images, color spaces, OCGs,
16
+ annotations, preflight reports, and warnings
17
+
18
+ ## Published schemas
19
+
20
+ - Schema root: `schemas/v1/codex-document.schema.json`
21
+ - Child schemas: `schemas/v1/codex-*.schema.json`
22
+ - Changelog: `schemas/CHANGELOG.md`
23
+
24
+ ## Versioning policy
25
+
26
+ - `schema_version` in payload tracks contract version.
27
+ - Breaking contract changes increment major version.
28
+ - Backward-compatible additions increment the minor version; backward-compatible fixes increment the patch version.
29
+
30
+ ## Validation
31
+
32
+ Use the CLI validator:
33
+
34
+ ```bash
35
+ uv run codex-pdf validate out.json
36
+ ```
@@ -0,0 +1,61 @@
1
+ ---
2
+ title: "Discovery Audit"
3
+ description: "Initial cross-repo parsing inventory used to design codexPDF migration boundaries and ownership."
4
+ group: "Project"
5
+ order: 6
6
+ ---
7
+
8
+ # Discovery Audit
9
+
10
+ This audit captures parse-related extraction surfaces before migration.
11
+
12
+ ## lint-pdf
13
+
14
+ - `src/lintpdf/parser/pikepdf_adapter.py`
15
+ - `PikePDFAdapter.open`, `parse_content_stream`, `get_stream_data`
16
+ - **Disposition:** move/rewrite into codex extractor core.
17
+ - `src/lintpdf/semantic/builder.py`
18
+ - `SemanticModelBuilder.build`
19
+ - **Disposition:** rewrite into codex semantic inventory builder.
20
+ - `src/lintpdf/semantic/interpreter.py`
21
+ - `ContentStreamInterpreter.interpret`
22
+ - **Disposition:** rewrite with codex event inventory contracts.
23
+ - `src/lintpdf/imports/*.py`
24
+ - `PitStopXmlParser`, `Callas*Parser`, `AcrobatXmlParser`
25
+ - **Disposition:** adapt into codex preflight ingest adapters.
26
+
27
+ ## lint-pdf-ui
28
+
29
+ - `packages/viewer-shared/src/PdfViewer.tsx`
30
+ - `mergeConfig` JSON normalization for viewer payloads.
31
+ - **Disposition:** stay in UI; add codex compatibility fields.
32
+ - `packages/viewer-shared/src/lintpdf/sources/finding-overlay.ts`
33
+ - finding-to-overlay translation.
34
+ - **Disposition:** stay in UI boundary adapter.
35
+
36
+ ## loupe-pdf
37
+
38
+ - `browser/index.ts`
39
+ - `extractOcgIds`, `detectSpotInksFromPdfBytes`
40
+ - **Disposition:** move facts extraction to codex, keep rendering in loupe.
41
+ - `fallback-pdfjs/index.ts`
42
+ - fallback page/layer extraction.
43
+ - **Disposition:** keep as visualization fallback path.
44
+ - `types/index.ts`
45
+ - core viewer contracts.
46
+ - **Disposition:** extend for codex document transport shape.
47
+
48
+ ## assay-pdf
49
+
50
+ - `src/assay_pdf/spec/parser.py`
51
+ - GWG workbook parsing.
52
+ - **Disposition:** stay in assay.
53
+ - `src/assay_pdf/harness/runners/*.py`
54
+ - engine output parsing.
55
+ - **Disposition:** add codex CLI shell-out runner; keep MIT boundary.
56
+
57
+ ## Overlap highlights
58
+
59
+ - Duplicate parse facts for OCG/layers across `lint-pdf` and `loupe-pdf`.
60
+ - Spot/separation extraction differs in completeness.
61
+ - UI contract naming drift risk (`findings_source` vs historical `preflight_source`).
@@ -0,0 +1,32 @@
1
+ ---
2
+ title: "Migration Plan"
3
+ description: "Phased rollout plan for moving PDF fact extraction from downstream engines into codexPDF."
4
+ group: "Project"
5
+ order: 7
6
+ ---
7
+
8
+ # Migration Plan
9
+
10
+ ## Phase 0 (this repo)
11
+
12
+ 1. Publish `CodexDocument` contract and schema.
13
+ 2. Ship CLI (`extract`, `schema`, `validate`, `probe`).
14
+ 3. Commit golden output harness.
15
+
16
+ ## Phase 1 (non-destructive adapters)
17
+
18
+ 1. `lint-pdf`: optional codex-backed extraction path behind feature flag.
19
+ 2. `lint-pdf-ui`: accept codex payload metadata fields while preserving existing config.
20
+ 3. `loupe-pdf`: add contract type for codex-fed metadata facts.
21
+ 4. `assay-pdf`: add codex subprocess runner (`codex-pdf extract`) with no in-process import.
22
+
23
+ ## Phase 2 (parity hardening)
24
+
25
+ 1. Run dual-path comparisons on reference corpus.
26
+ 2. Resolve mismatches in page boxes, OCG inventory, and spot metadata.
27
+ 3. Freeze schema at `1.0.0` once parity gates pass.
28
+
29
+ ## Phase 3 (STOP-gated cleanup)
30
+
31
+ - No parser deletion until explicit approval.
32
+ - Submit one deletion proposal per repo with before/after API compatibility proof.
@@ -0,0 +1,36 @@
1
+ ---
2
+ title: "Parity"
3
+ description: "Projection-based parity checks used to compare codex output with external baselines."
4
+ group: "Reference"
5
+ order: 5
6
+ ---
7
+
8
+ # Parity
9
+
10
+ Parity compares projections of codex output against the corresponding baseline projections and reports any differences.
11
+
12
+ ## Profiles
13
+
14
+ - `summary` — core viewer essentials (version, page count, encryption, boxes)
15
+ - `inventory` — aggregate and per-page inventory counts
16
+ - `deep` — expanded conformance/trapping/count snapshots
17
+
18
+ ## Output
19
+
20
+ Parity writes a JSON report with:
21
+
22
+ - profile
23
+ - fixture set
24
+ - per-file case results
25
+ - diff list per case
26
+ - total diff count
27
+
28
+ ## Typical workflow
29
+
30
+ ```bash
31
+ uv run codex-pdf parity --fixtures-root tests/fixtures --profile summary
32
+ uv run codex-pdf parity --fixtures-root tests/fixtures --profile inventory
33
+ uv run codex-pdf parity --fixtures-root tests/fixtures --profile deep
34
+ ```
35
+
36
+ In CI, pass `--fail-on-diff` so that any reported diff fails the job and the parity run acts as a gate.
@@ -0,0 +1,33 @@
1
+ ---
2
+ title: "Preflight Ingest"
3
+ description: "Adapters that normalize external preflight reports into codex issue payloads."
4
+ group: "Reference"
5
+ order: 6
6
+ ---
7
+
8
+ # Preflight Ingest
9
+
10
+ codex includes report adapters to normalize external findings into a
11
+ single issue model.
12
+
13
+ ## Supported formats
14
+
15
+ - `lintpdf_json`
16
+ - `callas_json`
17
+ - `callas_xml`
18
+ - `pitstop_xml`
19
+ - `acrobat_xml`
20
+
21
+ ## Normalized output
22
+
23
+ All adapters emit `CodexPreflightReport` with:
24
+
25
+ - `source_engine`
26
+ - `ingest_format`
27
+ - normalized `issues` list (`CodexIssue`)
28
+ - optional ingest warnings
29
+
30
+ ## Entry point
31
+
32
+ - Adapter module: `codex_pdf.preflight_ingest.adapters`
33
+ - Dispatcher: `parse_preflight_report(content, fmt)`
@@ -0,0 +1,41 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "codex-pdf"
7
+ version = "0.1.1"
8
+ description = "Authoritative, versioned PDF facts contract for Think Neverland tools."
9
+ readme = "README.md"
10
+ requires-python = ">=3.12"
11
+ license = "AGPL-3.0-or-later"
12
+ authors = [{ name = "Think Neverland", email = "dev@thinkneverland.com" }]
13
+ dependencies = [
14
+ "pydantic>=2.8",
15
+ "jsonschema>=4.23",
16
+ "PyMuPDF>=1.24",
17
+ "pikepdf>=9.0",
18
+ ]
19
+
20
+ [project.scripts]
21
+ codex-pdf = "codex_pdf.cli:main"
22
+
23
+ [tool.hatch.build.targets.wheel]
24
+ packages = ["src/codex_pdf"]
25
+
26
+ [tool.pytest.ini_options]
27
+ testpaths = ["tests"]
28
+ addopts = ["-ra", "--strict-markers", "--strict-config"]
29
+
30
+ [tool.ruff]
31
+ target-version = "py312"
32
+ line-length = 100
33
+
34
+ [tool.ruff.lint]
35
+ select = ["E", "F", "I", "UP", "B", "SIM", "RUF"]
36
+ ignore = ["E501"]
37
+
38
+ [dependency-groups]
39
+ dev = [
40
+ "pytest>=9.0.3",
41
+ ]