codex-pdf 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codex_pdf-0.1.1/.github/workflows/ci.yml +35 -0
- codex_pdf-0.1.1/.gitignore +11 -0
- codex_pdf-0.1.1/LICENSE +7 -0
- codex_pdf-0.1.1/PKG-INFO +84 -0
- codex_pdf-0.1.1/README.md +70 -0
- codex_pdf-0.1.1/docs/architecture.md +35 -0
- codex_pdf-0.1.1/docs/backward-compatibility.md +24 -0
- codex_pdf-0.1.1/docs/cleanup-stop-gates.md +25 -0
- codex_pdf-0.1.1/docs/cli.md +44 -0
- codex_pdf-0.1.1/docs/contract.md +36 -0
- codex_pdf-0.1.1/docs/discovery-audit.md +61 -0
- codex_pdf-0.1.1/docs/migration-plan.md +32 -0
- codex_pdf-0.1.1/docs/parity.md +36 -0
- codex_pdf-0.1.1/docs/preflight-ingest.md +33 -0
- codex_pdf-0.1.1/pyproject.toml +41 -0
- codex_pdf-0.1.1/reports/parity/viewer_essentials.json +195 -0
- codex_pdf-0.1.1/schemas/CHANGELOG.md +19 -0
- codex_pdf-0.1.1/schemas/v1/codex-annotation.schema.json +16 -0
- codex_pdf-0.1.1/schemas/v1/codex-box.schema.json +14 -0
- codex_pdf-0.1.1/schemas/v1/codex-color-space.schema.json +35 -0
- codex_pdf-0.1.1/schemas/v1/codex-document.schema.json +50 -0
- codex_pdf-0.1.1/schemas/v1/codex-font.schema.json +18 -0
- codex_pdf-0.1.1/schemas/v1/codex-form-xobject.schema.json +13 -0
- codex_pdf-0.1.1/schemas/v1/codex-image.schema.json +28 -0
- codex_pdf-0.1.1/schemas/v1/codex-issue.schema.json +17 -0
- codex_pdf-0.1.1/schemas/v1/codex-ocg.schema.json +15 -0
- codex_pdf-0.1.1/schemas/v1/codex-output-intent.schema.json +12 -0
- codex_pdf-0.1.1/schemas/v1/codex-page-object.schema.json +20 -0
- codex_pdf-0.1.1/schemas/v1/codex-page.schema.json +36 -0
- codex_pdf-0.1.1/schemas/v1/codex-preflight-report.schema.json +16 -0
- codex_pdf-0.1.1/schemas/v1/codex-source.schema.json +12 -0
- codex_pdf-0.1.1/schemas/v1/codex-spot-colorant.schema.json +13 -0
- codex_pdf-0.1.1/schemas/v1/codex-transparency-tree.schema.json +22 -0
- codex_pdf-0.1.1/schemas/v1/codex-trap-evidence.schema.json +25 -0
- codex_pdf-0.1.1/schemas/v1/codex-warning.schema.json +13 -0
- codex_pdf-0.1.1/scripts/parity_viewer_essentials.py +41 -0
- codex_pdf-0.1.1/src/codex_pdf/__init__.py +6 -0
- codex_pdf-0.1.1/src/codex_pdf/cli.py +125 -0
- codex_pdf-0.1.1/src/codex_pdf/extract/__init__.py +5 -0
- codex_pdf-0.1.1/src/codex_pdf/extract/annotations.py +39 -0
- codex_pdf-0.1.1/src/codex_pdf/extract/color.py +99 -0
- codex_pdf-0.1.1/src/codex_pdf/extract/common.py +31 -0
- codex_pdf-0.1.1/src/codex_pdf/extract/content_inventory.py +68 -0
- codex_pdf-0.1.1/src/codex_pdf/extract/document.py +97 -0
- codex_pdf-0.1.1/src/codex_pdf/extract/fonts.py +50 -0
- codex_pdf-0.1.1/src/codex_pdf/extract/forms.py +46 -0
- codex_pdf-0.1.1/src/codex_pdf/extract/images.py +45 -0
- codex_pdf-0.1.1/src/codex_pdf/extract/ocg.py +65 -0
- codex_pdf-0.1.1/src/codex_pdf/extract/structure.py +57 -0
- codex_pdf-0.1.1/src/codex_pdf/extract/transparency.py +31 -0
- codex_pdf-0.1.1/src/codex_pdf/extract/trapping.py +46 -0
- codex_pdf-0.1.1/src/codex_pdf/models/__init__.py +5 -0
- codex_pdf-0.1.1/src/codex_pdf/models/v1.py +285 -0
- codex_pdf-0.1.1/src/codex_pdf/parity.py +298 -0
- codex_pdf-0.1.1/src/codex_pdf/preflight_ingest/__init__.py +21 -0
- codex_pdf-0.1.1/src/codex_pdf/preflight_ingest/adapters.py +219 -0
- codex_pdf-0.1.1/src/codex_pdf/schema.py +20 -0
- codex_pdf-0.1.1/src/codex_pdf/version.py +3 -0
- codex_pdf-0.1.1/tests/golden/1.0.0/reference.json +67 -0
- codex_pdf-0.1.1/tests/test_extract_structural.py +25 -0
- codex_pdf-0.1.1/tests/test_golden.py +13 -0
- codex_pdf-0.1.1/tests/test_golden_corpus.py +30 -0
- codex_pdf-0.1.1/tests/test_models.py +18 -0
- codex_pdf-0.1.1/tests/test_parity.py +115 -0
- codex_pdf-0.1.1/tests/test_preflight_ingest.py +42 -0
- codex_pdf-0.1.1/tests/test_schema.py +11 -0
- codex_pdf-0.1.1/tests/test_schemas_all.py +11 -0
- codex_pdf-0.1.1/uv.lock +618 -0
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
name: ci
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
pull_request:
|
|
5
|
+
push:
|
|
6
|
+
branches: [main]
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
test:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
steps:
|
|
12
|
+
- uses: actions/checkout@v4
|
|
13
|
+
- uses: actions/setup-python@v5
|
|
14
|
+
with:
|
|
15
|
+
python-version: "3.12"
|
|
16
|
+
- name: Install
|
|
17
|
+
run: |
|
|
18
|
+
python -m pip install --upgrade pip
|
|
19
|
+
pip install -e . pytest
|
|
20
|
+
- name: Test
|
|
21
|
+
run: pytest
|
|
22
|
+
- name: Parity Gate (Deep Profile)
|
|
23
|
+
run: |
|
|
24
|
+
python - <<'PY'
|
|
25
|
+
import fitz
|
|
26
|
+
from pathlib import Path
|
|
27
|
+
root = Path("tmp-parity")
|
|
28
|
+
root.mkdir(exist_ok=True)
|
|
29
|
+
path = root / "fixture.pdf"
|
|
30
|
+
doc = fitz.open()
|
|
31
|
+
page = doc.new_page(width=612, height=792)
|
|
32
|
+
page.insert_text((72, 72), "codex parity fixture")
|
|
33
|
+
doc.save(path)
|
|
34
|
+
PY
|
|
35
|
+
codex-pdf parity --fixtures-root tmp-parity --profile deep --max-files 5
|
codex_pdf-0.1.1/LICENSE
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
|
|
3
|
+
codexPDF is distributed under the GNU Affero General Public License v3.0
|
|
4
|
+
or (at your option) any later version.
|
|
5
|
+
|
|
6
|
+
You should have received a copy of the GNU Affero General Public License
|
|
7
|
+
along with this program. If not, see https://www.gnu.org/licenses/.
|
codex_pdf-0.1.1/PKG-INFO
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: codex-pdf
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Authoritative, versioned PDF facts contract for Think Neverland tools.
|
|
5
|
+
Author-email: Think Neverland <dev@thinkneverland.com>
|
|
6
|
+
License-Expression: AGPL-3.0-or-later
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Requires-Python: >=3.12
|
|
9
|
+
Requires-Dist: jsonschema>=4.23
|
|
10
|
+
Requires-Dist: pikepdf>=9.0
|
|
11
|
+
Requires-Dist: pydantic>=2.8
|
|
12
|
+
Requires-Dist: pymupdf>=1.24
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
title: "Overview"
|
|
17
|
+
description: "Authoritative read-only PDF facts engine for Think Neverland tools. Versioned contract, schema-validated output, and consumer-agnostic extraction."
|
|
18
|
+
group: "Getting started"
|
|
19
|
+
order: 1
|
|
20
|
+
slug: "overview"
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
# codexPDF
|
|
24
|
+
|
|
25
|
+
`codexPDF` is Think Neverland's authoritative, read-only PDF facts reference.
|
|
26
|
+
|
|
27
|
+
Other engines consult `codexPDF` for canonical document facts instead of
|
|
28
|
+
re-parsing PDFs independently. The contract is versioned and schema-validated.
|
|
29
|
+
|
|
30
|
+
## Status
|
|
31
|
+
|
|
32
|
+
Current baseline includes:
|
|
33
|
+
|
|
34
|
+
- Python package (`codex_pdf`) with typed models
|
|
35
|
+
- CLI (`codex-pdf extract|schema|validate|probe|parity`)
|
|
36
|
+
- Versioned schemas in `schemas/v1/`
|
|
37
|
+
- Golden output harness under `tests/golden/`
|
|
38
|
+
|
|
39
|
+
## Quickstart
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
uv sync
|
|
43
|
+
uv run codex-pdf probe input.pdf --json
|
|
44
|
+
uv run codex-pdf extract input.pdf --pretty > out.json
|
|
45
|
+
uv run codex-pdf validate out.json
|
|
46
|
+
uv run codex-pdf parity --fixtures-root tests/fixtures --profile summary --max-files 5
|
|
47
|
+
uv run codex-pdf parity --fixtures-root tests/fixtures --profile inventory --max-files 5
|
|
48
|
+
uv run codex-pdf parity --fixtures-root tests/fixtures --profile deep --max-files 5
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Optional external baseline comparison (consumer-specific adapter provided at runtime):
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
uv run codex-pdf parity \
|
|
55
|
+
--fixtures-root /path/to/pdfs \
|
|
56
|
+
--profile summary \
|
|
57
|
+
--baseline-command "<your_command_with_{pdf}_placeholder>"
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Contract
|
|
61
|
+
|
|
62
|
+
The public API is the JSON contract rooted at `CodexDocument`.
|
|
63
|
+
|
|
64
|
+
- Schema path: `schemas/v1/codex-document.schema.json`
|
|
65
|
+
- Runtime model: `codex_pdf.models.v1.CodexDocument`
|
|
66
|
+
- Stability policy: SemVer (`major` for breaking contract changes)
|
|
67
|
+
|
|
68
|
+
## Documentation
|
|
69
|
+
|
|
70
|
+
| Topic | Doc |
|
|
71
|
+
| --- | --- |
|
|
72
|
+
| Architecture and boundaries | [docs/architecture.md](./docs/architecture.md) |
|
|
73
|
+
| CLI commands and usage patterns | [docs/cli.md](./docs/cli.md) |
|
|
74
|
+
| Contract and schema versioning | [docs/contract.md](./docs/contract.md) |
|
|
75
|
+
| Parity profiles and baselines | [docs/parity.md](./docs/parity.md) |
|
|
76
|
+
| Preflight ingest adapters | [docs/preflight-ingest.md](./docs/preflight-ingest.md) |
|
|
77
|
+
| Migration sequencing | [docs/migration-plan.md](./docs/migration-plan.md) |
|
|
78
|
+
| Legacy discovery audit | [docs/discovery-audit.md](./docs/discovery-audit.md) |
|
|
79
|
+
| Backward compatibility requirements | [docs/backward-compatibility.md](./docs/backward-compatibility.md) |
|
|
80
|
+
| Cleanup stop-gates policy | [docs/cleanup-stop-gates.md](./docs/cleanup-stop-gates.md) |
|
|
81
|
+
|
|
82
|
+
## License
|
|
83
|
+
|
|
84
|
+
AGPL-3.0-or-later.
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: "Overview"
|
|
3
|
+
description: "Authoritative read-only PDF facts engine for Think Neverland tools. Versioned contract, schema-validated output, and consumer-agnostic extraction."
|
|
4
|
+
group: "Getting started"
|
|
5
|
+
order: 1
|
|
6
|
+
slug: "overview"
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# codexPDF
|
|
10
|
+
|
|
11
|
+
`codexPDF` is Think Neverland's authoritative, read-only PDF facts reference.
|
|
12
|
+
|
|
13
|
+
Other engines consult `codexPDF` for canonical document facts instead of
|
|
14
|
+
re-parsing PDFs independently. The contract is versioned and schema-validated.
|
|
15
|
+
|
|
16
|
+
## Status
|
|
17
|
+
|
|
18
|
+
Current baseline includes:
|
|
19
|
+
|
|
20
|
+
- Python package (`codex_pdf`) with typed models
|
|
21
|
+
- CLI (`codex-pdf extract|schema|validate|probe|parity`)
|
|
22
|
+
- Versioned schemas in `schemas/v1/`
|
|
23
|
+
- Golden output harness under `tests/golden/`
|
|
24
|
+
|
|
25
|
+
## Quickstart
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
uv sync
|
|
29
|
+
uv run codex-pdf probe input.pdf --json
|
|
30
|
+
uv run codex-pdf extract input.pdf --pretty > out.json
|
|
31
|
+
uv run codex-pdf validate out.json
|
|
32
|
+
uv run codex-pdf parity --fixtures-root tests/fixtures --profile summary --max-files 5
|
|
33
|
+
uv run codex-pdf parity --fixtures-root tests/fixtures --profile inventory --max-files 5
|
|
34
|
+
uv run codex-pdf parity --fixtures-root tests/fixtures --profile deep --max-files 5
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Optional external baseline comparison (consumer-specific adapter provided at runtime):
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
uv run codex-pdf parity \
|
|
41
|
+
--fixtures-root /path/to/pdfs \
|
|
42
|
+
--profile summary \
|
|
43
|
+
--baseline-command "<your_command_with_{pdf}_placeholder>"
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Contract
|
|
47
|
+
|
|
48
|
+
The public API is the JSON contract rooted at `CodexDocument`.
|
|
49
|
+
|
|
50
|
+
- Schema path: `schemas/v1/codex-document.schema.json`
|
|
51
|
+
- Runtime model: `codex_pdf.models.v1.CodexDocument`
|
|
52
|
+
- Stability policy: SemVer (`major` for breaking contract changes)
|
|
53
|
+
|
|
54
|
+
## Documentation
|
|
55
|
+
|
|
56
|
+
| Topic | Doc |
|
|
57
|
+
| --- | --- |
|
|
58
|
+
| Architecture and boundaries | [docs/architecture.md](./docs/architecture.md) |
|
|
59
|
+
| CLI commands and usage patterns | [docs/cli.md](./docs/cli.md) |
|
|
60
|
+
| Contract and schema versioning | [docs/contract.md](./docs/contract.md) |
|
|
61
|
+
| Parity profiles and baselines | [docs/parity.md](./docs/parity.md) |
|
|
62
|
+
| Preflight ingest adapters | [docs/preflight-ingest.md](./docs/preflight-ingest.md) |
|
|
63
|
+
| Migration sequencing | [docs/migration-plan.md](./docs/migration-plan.md) |
|
|
64
|
+
| Legacy discovery audit | [docs/discovery-audit.md](./docs/discovery-audit.md) |
|
|
65
|
+
| Backward compatibility requirements | [docs/backward-compatibility.md](./docs/backward-compatibility.md) |
|
|
66
|
+
| Cleanup stop-gates policy | [docs/cleanup-stop-gates.md](./docs/cleanup-stop-gates.md) |
|
|
67
|
+
|
|
68
|
+
## License
|
|
69
|
+
|
|
70
|
+
AGPL-3.0-or-later.
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: "Architecture"
|
|
3
|
+
description: "codexPDF boundaries, extraction pipeline shape, and the contract-first model used by downstream tools."
|
|
4
|
+
group: "Getting started"
|
|
5
|
+
order: 2
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# Architecture
|
|
9
|
+
|
|
10
|
+
`codexPDF` is a contract-first facts engine for PDF documents.
|
|
11
|
+
|
|
12
|
+
## Boundary
|
|
13
|
+
|
|
14
|
+
- Extraction is strictly read-only.
|
|
15
|
+
- No rendering, layout, mutation, or rule adjudication.
|
|
16
|
+
- Consumer-agnostic output: same contract regardless of caller.
|
|
17
|
+
|
|
18
|
+
## Pipeline
|
|
19
|
+
|
|
20
|
+
1. Input PDF bytes are loaded by the extractor layer.
|
|
21
|
+
2. Domain extractors populate `CodexDocument` fields (pages, boxes, fonts,
|
|
22
|
+
images, color spaces, OCG/layers, annotations, transparency, trapping).
|
|
23
|
+
3. Output is serialized as JSON against published schema definitions in
|
|
24
|
+
`schemas/v1/`.
|
|
25
|
+
|
|
26
|
+
## Primary contract
|
|
27
|
+
|
|
28
|
+
- Runtime model: `codex_pdf.models.v1.CodexDocument`
|
|
29
|
+
- Schema: `schemas/v1/codex-document.schema.json`
|
|
30
|
+
- Version marker: `schema_version` field in payload
|
|
31
|
+
|
|
32
|
+
## Consumer relationship
|
|
33
|
+
|
|
34
|
+
Downstream engines should treat codex output as the source of truth for
|
|
35
|
+
document facts and keep any product-specific behavior in adapter layers.
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: "Backward Compatibility"
|
|
3
|
+
description: "Consumer payload compatibility expectations during codexPDF rollout and cutover."
|
|
4
|
+
group: "Project"
|
|
5
|
+
order: 8
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# Backward Compatibility Requirements
|
|
9
|
+
|
|
10
|
+
During migration, preserve the existing consumer payloads while introducing
|
|
11
|
+
codex-backed data:
|
|
12
|
+
|
|
13
|
+
- `lint-pdf` viewer essentials shape:
|
|
14
|
+
- `pdf_version`, `page_count`, `is_encrypted`, `pages`, `info_dict`
|
|
15
|
+
- findings payload fields consumed by `lint-pdf-ui`:
|
|
16
|
+
- `inspection_id`, `severity`, `page_num`, `bbox`, `message`, `details`
|
|
17
|
+
- `loupe-pdf/types` public contracts:
|
|
18
|
+
- `PageInfo`, `LayerInfo`, `ViewerConfig`, `ColorSample`
|
|
19
|
+
|
|
20
|
+
Compatibility strategy:
|
|
21
|
+
|
|
22
|
+
1. Keep existing endpoints unchanged.
|
|
23
|
+
2. Introduce codex fields as additive (`codex_*`) metadata.
|
|
24
|
+
3. Maintain feature-flagged fallback to legacy parser paths.
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: "Cleanup Stop Gates"
|
|
3
|
+
description: "Release gates required before downstream parser deletion and hard cutover enforcement."
|
|
4
|
+
group: "Project"
|
|
5
|
+
order: 9
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# STOP-Gated Cleanup Policy
|
|
9
|
+
|
|
10
|
+
No deletions of parse code in downstream repositories are permitted until all
|
|
11
|
+
conditions below pass and are approved.
|
|
12
|
+
|
|
13
|
+
## Required gates
|
|
14
|
+
|
|
15
|
+
1. Dual-run parity report on reference corpus is green.
|
|
16
|
+
2. Contract schema remains backward-compatible for pinned consumers.
|
|
17
|
+
3. Latest shipping release candidates of `lint-pdf` and `loupe-pdf` pass CI with codex enabled.
|
|
18
|
+
4. Explicit go/no-go approval recorded for each repository cleanup PR.
|
|
19
|
+
|
|
20
|
+
## Candidate cleanup targets (future)
|
|
21
|
+
|
|
22
|
+
- `lint-pdf`: direct parser/semantic extraction branches replaced by codex adapter.
|
|
23
|
+
- `loupe-pdf`: byte-scan spot extraction where codex provides canonical data.
|
|
24
|
+
- `lint-pdf-ui`: legacy field alias handling after all APIs converge.
|
|
25
|
+
- `assay-pdf`: none required; codex remains optional shell-out integration.
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: "CLI"
|
|
3
|
+
description: "Command reference for extract, schema, validate, probe, and parity workflows."
|
|
4
|
+
group: "Getting started"
|
|
5
|
+
order: 3
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# CLI
|
|
9
|
+
|
|
10
|
+
`codex-pdf` exposes a contract-oriented CLI.
|
|
11
|
+
|
|
12
|
+
## Commands
|
|
13
|
+
|
|
14
|
+
- `extract <input_pdf>` — emit full `CodexDocument` JSON.
|
|
15
|
+
- `schema` — print schema JSON (published or runtime-generated).
|
|
16
|
+
- `validate <codex_json>` — validate output against published schema.
|
|
17
|
+
- `probe <input_pdf>` — return lightweight metadata summary.
|
|
18
|
+
- `parity` — compare codex projections against baseline projections.
|
|
19
|
+
|
|
20
|
+
## Common usage
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
uv run codex-pdf extract input.pdf --pretty > out.json
|
|
24
|
+
uv run codex-pdf validate out.json
|
|
25
|
+
uv run codex-pdf probe input.pdf --json
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Parity usage
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
uv run codex-pdf parity \
|
|
32
|
+
--fixtures-root tests/fixtures \
|
|
33
|
+
--profile deep \
|
|
34
|
+
--max-files 10
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Baseline command mode:
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
uv run codex-pdf parity \
|
|
41
|
+
--fixtures-root /path/to/pdfs \
|
|
42
|
+
--profile summary \
|
|
43
|
+
--baseline-command "<command with {pdf} placeholder>"
|
|
44
|
+
```
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: "Contract and Schemas"
|
|
3
|
+
description: "CodexDocument model, schema publishing approach, and compatibility policy."
|
|
4
|
+
group: "Reference"
|
|
5
|
+
order: 4
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# Contract and Schemas
|
|
9
|
+
|
|
10
|
+
The codex public contract is rooted at `CodexDocument`.
|
|
11
|
+
|
|
12
|
+
## Runtime model
|
|
13
|
+
|
|
14
|
+
- Python model: `codex_pdf.models.v1.CodexDocument`
|
|
15
|
+
- Child types: page boxes, inventories, fonts, images, color spaces, OCGs,
|
|
16
|
+
annotations, preflight reports, and warnings
|
|
17
|
+
|
|
18
|
+
## Published schemas
|
|
19
|
+
|
|
20
|
+
- Schema root: `schemas/v1/codex-document.schema.json`
|
|
21
|
+
- Child schemas: `schemas/v1/codex-*.schema.json`
|
|
22
|
+
- Changelog: `schemas/CHANGELOG.md`
|
|
23
|
+
|
|
24
|
+
## Versioning policy
|
|
25
|
+
|
|
26
|
+
- `schema_version` in payload tracks contract version.
|
|
27
|
+
- Breaking contract changes increment major version.
|
|
28
|
+
- Backward-compatible additions increment the minor version; backward-compatible fixes increment the patch version.
|
|
29
|
+
|
|
30
|
+
## Validation
|
|
31
|
+
|
|
32
|
+
Use the CLI validator:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
uv run codex-pdf validate out.json
|
|
36
|
+
```
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: "Discovery Audit"
|
|
3
|
+
description: "Initial cross-repo parsing inventory used to design codexPDF migration boundaries and ownership."
|
|
4
|
+
group: "Project"
|
|
5
|
+
order: 6
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# Discovery Audit
|
|
9
|
+
|
|
10
|
+
This audit captures parse-related extraction surfaces before migration.
|
|
11
|
+
|
|
12
|
+
## lint-pdf
|
|
13
|
+
|
|
14
|
+
- `src/lintpdf/parser/pikepdf_adapter.py`
|
|
15
|
+
- `PikePDFAdapter.open`, `parse_content_stream`, `get_stream_data`
|
|
16
|
+
- **Disposition:** move/rewrite into codex extractor core.
|
|
17
|
+
- `src/lintpdf/semantic/builder.py`
|
|
18
|
+
- `SemanticModelBuilder.build`
|
|
19
|
+
- **Disposition:** rewrite into codex semantic inventory builder.
|
|
20
|
+
- `src/lintpdf/semantic/interpreter.py`
|
|
21
|
+
- `ContentStreamInterpreter.interpret`
|
|
22
|
+
- **Disposition:** rewrite with codex event inventory contracts.
|
|
23
|
+
- `src/lintpdf/imports/*.py`
|
|
24
|
+
- `PitStopXmlParser`, `Callas*Parser`, `AcrobatXmlParser`
|
|
25
|
+
- **Disposition:** adapt into codex preflight ingest adapters.
|
|
26
|
+
|
|
27
|
+
## lint-pdf-ui
|
|
28
|
+
|
|
29
|
+
- `packages/viewer-shared/src/PdfViewer.tsx`
|
|
30
|
+
- `mergeConfig` JSON normalization for viewer payloads.
|
|
31
|
+
- **Disposition:** stay in UI; add codex compatibility fields.
|
|
32
|
+
- `packages/viewer-shared/src/lintpdf/sources/finding-overlay.ts`
|
|
33
|
+
- finding-to-overlay translation.
|
|
34
|
+
- **Disposition:** stay in UI boundary adapter.
|
|
35
|
+
|
|
36
|
+
## loupe-pdf
|
|
37
|
+
|
|
38
|
+
- `browser/index.ts`
|
|
39
|
+
- `extractOcgIds`, `detectSpotInksFromPdfBytes`
|
|
40
|
+
- **Disposition:** move facts extraction to codex, keep rendering in loupe.
|
|
41
|
+
- `fallback-pdfjs/index.ts`
|
|
42
|
+
- fallback page/layer extraction.
|
|
43
|
+
- **Disposition:** keep as visualization fallback path.
|
|
44
|
+
- `types/index.ts`
|
|
45
|
+
- core viewer contracts.
|
|
46
|
+
- **Disposition:** extend for codex document transport shape.
|
|
47
|
+
|
|
48
|
+
## assay-pdf
|
|
49
|
+
|
|
50
|
+
- `src/assay_pdf/spec/parser.py`
|
|
51
|
+
- GWG workbook parsing.
|
|
52
|
+
- **Disposition:** stay in assay.
|
|
53
|
+
- `src/assay_pdf/harness/runners/*.py`
|
|
54
|
+
- engine output parsing.
|
|
55
|
+
- **Disposition:** add codex CLI shell-out runner; keep MIT boundary.
|
|
56
|
+
|
|
57
|
+
## Overlap highlights
|
|
58
|
+
|
|
59
|
+
- Duplicate parse facts for OCG/layers across `lint-pdf` and `loupe-pdf`.
|
|
60
|
+
- Spot/separation extraction differs in completeness.
|
|
61
|
+
- UI contract naming drift risk (`findings_source` vs historical `preflight_source`).
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: "Migration Plan"
|
|
3
|
+
description: "Phased rollout plan for moving PDF fact extraction from downstream engines into codexPDF."
|
|
4
|
+
group: "Project"
|
|
5
|
+
order: 7
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# Migration Plan
|
|
9
|
+
|
|
10
|
+
## Phase 0 (this repo)
|
|
11
|
+
|
|
12
|
+
1. Publish `CodexDocument` contract and schema.
|
|
13
|
+
2. Ship CLI (`extract`, `schema`, `validate`, `probe`, `parity`).
|
|
14
|
+
3. Commit golden output harness.
|
|
15
|
+
|
|
16
|
+
## Phase 1 (non-destructive adapters)
|
|
17
|
+
|
|
18
|
+
1. `lint-pdf`: optional codex-backed extraction path behind feature flag.
|
|
19
|
+
2. `lint-pdf-ui`: accept codex payload metadata fields while preserving existing config.
|
|
20
|
+
3. `loupe-pdf`: add contract type for codex-fed metadata facts.
|
|
21
|
+
4. `assay-pdf`: add codex subprocess runner (`codex-pdf extract`) with no in-process import.
|
|
22
|
+
|
|
23
|
+
## Phase 2 (parity hardening)
|
|
24
|
+
|
|
25
|
+
1. Run dual-path comparisons on reference corpus.
|
|
26
|
+
2. Resolve mismatches in page boxes, OCG inventory, and spot metadata.
|
|
27
|
+
3. Freeze schema at `1.0.0` once parity gates pass.
|
|
28
|
+
|
|
29
|
+
## Phase 3 (STOP-gated cleanup)
|
|
30
|
+
|
|
31
|
+
- No parser deletion until explicit approval.
|
|
32
|
+
- Submit one deletion proposal per repo with before/after API compatibility proof.
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: "Parity"
|
|
3
|
+
description: "Projection-based parity checks used to compare codex output with external baselines."
|
|
4
|
+
group: "Reference"
|
|
5
|
+
order: 5
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# Parity
|
|
9
|
+
|
|
10
|
+
Parity verifies that codex output matches an expected projection shape.
|
|
11
|
+
|
|
12
|
+
## Profiles
|
|
13
|
+
|
|
14
|
+
- `summary` — core viewer essentials (version, page count, encryption, boxes)
|
|
15
|
+
- `inventory` — aggregate and per-page inventory counts
|
|
16
|
+
- `deep` — expanded conformance/trapping/count snapshots
|
|
17
|
+
|
|
18
|
+
## Output
|
|
19
|
+
|
|
20
|
+
Parity writes a JSON report with:
|
|
21
|
+
|
|
22
|
+
- profile
|
|
23
|
+
- fixture set
|
|
24
|
+
- per-file case results
|
|
25
|
+
- diff list per case
|
|
26
|
+
- total diff count
|
|
27
|
+
|
|
28
|
+
## Typical workflow
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
uv run codex-pdf parity --fixtures-root tests/fixtures --profile summary
|
|
32
|
+
uv run codex-pdf parity --fixtures-root tests/fixtures --profile inventory
|
|
33
|
+
uv run codex-pdf parity --fixtures-root tests/fixtures --profile deep
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
Use `--fail-on-diff` in CI for gating.
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: "Preflight Ingest"
|
|
3
|
+
description: "Adapters that normalize external preflight reports into codex issue payloads."
|
|
4
|
+
group: "Reference"
|
|
5
|
+
order: 6
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# Preflight Ingest
|
|
9
|
+
|
|
10
|
+
`codexPDF` includes report adapters to normalize external findings into a
|
|
11
|
+
single issue model.
|
|
12
|
+
|
|
13
|
+
## Supported formats
|
|
14
|
+
|
|
15
|
+
- `lintpdf_json`
|
|
16
|
+
- `callas_json`
|
|
17
|
+
- `callas_xml`
|
|
18
|
+
- `pitstop_xml`
|
|
19
|
+
- `acrobat_xml`
|
|
20
|
+
|
|
21
|
+
## Normalized output
|
|
22
|
+
|
|
23
|
+
All adapters emit `CodexPreflightReport` with:
|
|
24
|
+
|
|
25
|
+
- `source_engine`
|
|
26
|
+
- `ingest_format`
|
|
27
|
+
- normalized `issues` list (`CodexIssue`)
|
|
28
|
+
- optional ingest warnings
|
|
29
|
+
|
|
30
|
+
## Entry point
|
|
31
|
+
|
|
32
|
+
- Adapter module: `codex_pdf.preflight_ingest.adapters`
|
|
33
|
+
- Dispatcher: `parse_preflight_report(content, fmt)`
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "codex-pdf"
|
|
7
|
+
version = "0.1.1"
|
|
8
|
+
description = "Authoritative, versioned PDF facts contract for Think Neverland tools."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.12"
|
|
11
|
+
license = "AGPL-3.0-or-later"
|
|
12
|
+
authors = [{ name = "Think Neverland", email = "dev@thinkneverland.com" }]
|
|
13
|
+
dependencies = [
|
|
14
|
+
"pydantic>=2.8",
|
|
15
|
+
"jsonschema>=4.23",
|
|
16
|
+
"PyMuPDF>=1.24",
|
|
17
|
+
"pikepdf>=9.0",
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
[project.scripts]
|
|
21
|
+
codex-pdf = "codex_pdf.cli:main"
|
|
22
|
+
|
|
23
|
+
[tool.hatch.build.targets.wheel]
|
|
24
|
+
packages = ["src/codex_pdf"]
|
|
25
|
+
|
|
26
|
+
[tool.pytest.ini_options]
|
|
27
|
+
testpaths = ["tests"]
|
|
28
|
+
addopts = ["-ra", "--strict-markers", "--strict-config"]
|
|
29
|
+
|
|
30
|
+
[tool.ruff]
|
|
31
|
+
target-version = "py312"
|
|
32
|
+
line-length = 100
|
|
33
|
+
|
|
34
|
+
[tool.ruff.lint]
|
|
35
|
+
select = ["E", "F", "I", "UP", "B", "SIM", "RUF"]
|
|
36
|
+
ignore = ["E501"]
|
|
37
|
+
|
|
38
|
+
[dependency-groups]
|
|
39
|
+
dev = [
|
|
40
|
+
"pytest>=9.0.3",
|
|
41
|
+
]
|