codex-pdf 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codex_pdf/__init__.py +6 -0
- codex_pdf/cli.py +125 -0
- codex_pdf/extract/__init__.py +5 -0
- codex_pdf/extract/annotations.py +39 -0
- codex_pdf/extract/color.py +99 -0
- codex_pdf/extract/common.py +31 -0
- codex_pdf/extract/content_inventory.py +68 -0
- codex_pdf/extract/document.py +97 -0
- codex_pdf/extract/fonts.py +50 -0
- codex_pdf/extract/forms.py +46 -0
- codex_pdf/extract/images.py +45 -0
- codex_pdf/extract/ocg.py +65 -0
- codex_pdf/extract/structure.py +57 -0
- codex_pdf/extract/transparency.py +31 -0
- codex_pdf/extract/trapping.py +46 -0
- codex_pdf/models/__init__.py +5 -0
- codex_pdf/models/v1.py +285 -0
- codex_pdf/parity.py +298 -0
- codex_pdf/preflight_ingest/__init__.py +21 -0
- codex_pdf/preflight_ingest/adapters.py +219 -0
- codex_pdf/schema.py +20 -0
- codex_pdf/version.py +3 -0
- codex_pdf-0.1.1.dist-info/METADATA +84 -0
- codex_pdf-0.1.1.dist-info/RECORD +27 -0
- codex_pdf-0.1.1.dist-info/WHEEL +4 -0
- codex_pdf-0.1.1.dist-info/entry_points.txt +2 -0
- codex_pdf-0.1.1.dist-info/licenses/LICENSE +7 -0
codex_pdf/__init__.py
ADDED
codex_pdf/cli.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""codex-pdf CLI."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import json
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from jsonschema import validate
|
|
10
|
+
|
|
11
|
+
from codex_pdf.extract import extract_from_path
|
|
12
|
+
from codex_pdf.parity import run_parity_from_namespace
|
|
13
|
+
from codex_pdf.schema import codex_document_schema, load_published_schema
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _repo_root() -> Path:
|
|
17
|
+
return Path(__file__).resolve().parent.parent.parent
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def cmd_extract(args: argparse.Namespace) -> int:
    """Extract a CodexDocument from ``args.input_pdf`` and print it as JSON.

    ``--pretty`` selects indented, key-sorted output; otherwise the JSON is
    emitted in its most compact form.
    """
    document = extract_from_path(Path(args.input_pdf))
    dumped = document.model_dump(mode="json")
    if args.pretty:
        text = json.dumps(dumped, indent=2, sort_keys=True)
    else:
        text = json.dumps(dumped, separators=(",", ":"))
    print(text)
    return 0
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def cmd_schema(args: argparse.Namespace) -> int:
    """Print the CodexDocument JSON Schema.

    ``--published`` loads the schema file shipped in the repository; without
    it the schema is generated from the current models.
    """
    if args.published:
        schema = load_published_schema(_repo_root())
    else:
        schema = codex_document_schema()
    print(json.dumps(schema, indent=2, sort_keys=True))
    return 0
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def cmd_validate(args: argparse.Namespace) -> int:
    """Validate a codex JSON file against the published schema.

    Prints ``valid`` on success; jsonschema raises on failure.
    """
    document = json.loads(Path(args.codex_json).read_text(encoding="utf-8"))
    validate(document, load_published_schema(_repo_root()))
    print("valid")
    return 0
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def cmd_probe(args: argparse.Namespace) -> int:
    """Print a fast metadata summary for ``args.input_pdf``.

    ``--json`` emits the full summary as indented JSON; otherwise a single
    human-readable line with version, page count and encryption status.
    """
    doc = extract_from_path(Path(args.input_pdf))
    summary = {
        "pdf_version": doc.pdf_version,
        "page_count": len(doc.pages),
        "is_encrypted": doc.is_encrypted,
        "output_intents": [intent.model_dump(mode="json") for intent in doc.output_intents],
        "document_id": doc.document_id,
    }
    if args.json:
        print(json.dumps(summary, indent=2, sort_keys=True))
        return 0
    print(
        "pdf_version={pdf_version} "
        "page_count={page_count} "
        "is_encrypted={is_encrypted}".format(**summary)
    )
    return 0
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def cmd_parity(args: argparse.Namespace) -> int:
    """Run parity checks by delegating to the parity module with the repo root."""
    return run_parity_from_namespace(args, _repo_root())
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the top-level ``codex-pdf`` parser with all subcommands.

    Each subcommand stores its handler via ``set_defaults(func=...)`` so that
    ``main`` can dispatch with ``args.func(args)``.
    """
    root = argparse.ArgumentParser(prog="codex-pdf")
    commands = root.add_subparsers(dest="command", required=True)

    p_extract = commands.add_parser("extract", help="Extract a CodexDocument from a PDF.")
    p_extract.add_argument("input_pdf")
    p_extract.add_argument("--pretty", action="store_true")
    p_extract.set_defaults(func=cmd_extract)

    p_schema = commands.add_parser("schema", help="Print JSON Schema for CodexDocument.")
    p_schema.add_argument("--version", default="1")
    p_schema.add_argument("--name", default="codex-document")
    p_schema.add_argument("--published", action="store_true")
    p_schema.set_defaults(func=cmd_schema)

    p_validate = commands.add_parser("validate", help="Validate codex JSON against published schema.")
    p_validate.add_argument("codex_json")
    p_validate.set_defaults(func=cmd_validate)

    p_probe = commands.add_parser("probe", help="Fast metadata probe.")
    p_probe.add_argument("input_pdf")
    p_probe.add_argument("--json", action="store_true")
    p_probe.set_defaults(func=cmd_probe)

    p_parity = commands.add_parser("parity", help="Run consumer-agnostic parity projection checks.")
    p_parity.add_argument("--profile", choices=["summary", "inventory", "deep"], default="summary")
    p_parity.add_argument("--fixtures-root", required=True, help="Path to fixture corpus root.")
    # NOTE: the default output path is resolved at parser-build time.
    p_parity.add_argument(
        "--output",
        default=str(_repo_root() / "reports" / "parity" / "viewer_essentials.json"),
        help="Path to write parity JSON report.",
    )
    p_parity.add_argument("--max-files", type=int, default=10, help="Limit number of PDFs.")
    p_parity.add_argument(
        "--baseline-command",
        default=None,
        help=(
            "Optional shell command template that prints JSON projection to stdout. "
            "Use {pdf} placeholder for input path."
        ),
    )
    p_parity.add_argument("--fail-on-diff", action="store_true", help="Return non-zero if any diff exists.")
    p_parity.set_defaults(func=cmd_parity)

    return root
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def main() -> int:
    """CLI entry point: parse argv and dispatch to the selected subcommand."""
    args = build_parser().parse_args()
    return args.func(args)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
# Script entry: exit with the status code returned by main().
if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Annotation extraction."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from codex_pdf.extract.common import safe_box
|
|
8
|
+
from codex_pdf.models.v1 import CodexAnnotation
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def extract_annotations_fitz(doc: Any) -> list[CodexAnnotation]:
    """Collect annotations from every page of a fitz document.

    Best effort: a page that raises during traversal is skipped entirely.
    """
    collected: list[CodexAnnotation] = []
    for page_num, page in enumerate(doc, start=1):
        try:
            page_annots = page.annots()
            if not page_annots:
                continue
            for annot in page_annots:
                rect = getattr(annot, "rect", None)
                # fitz exposes the subtype as a (code, name) tuple.
                type_field = getattr(annot, "type", None)
                subtype_name = None
                if isinstance(type_field, tuple) and len(type_field) > 1:
                    subtype_name = str(type_field[1])
                info = getattr(annot, "info", {})
                contents = info.get("content") if isinstance(info, dict) else None
                collected.append(
                    CodexAnnotation(
                        # NOTE: the id counter is cumulative across the whole
                        # document, not per page.
                        annotation_id=f"p{page_num}-a{len(collected) + 1}",
                        subtype=subtype_name,
                        page_num=page_num,
                        rect=None if rect is None else safe_box(rect),
                        contents=contents,
                        has_appearance_stream=False,
                    )
                )
        except Exception:
            continue
    return collected
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""Color space and output intent extraction (pikepdf fallback)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from io import BytesIO
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from codex_pdf.extract.common import obj_id, pdf_name
|
|
9
|
+
from codex_pdf.models.v1 import CodexColorSpace, CodexOutputIntent, CodexSpotColorant
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def extract_color_space(value: Any, cs_id: str) -> CodexColorSpace | None:
    """Map a raw PDF color-space entry onto a CodexColorSpace, best effort.

    ``value`` may be a name object (e.g. ``/DeviceRGB``) or an array form such
    as ``[/Separation name alternate tintTransform]``. Returns ``None`` for
    anything unrecognized, and also on any error — the whole body is guarded.
    """
    try:
        if str(value).startswith("/"):
            # Plain name object: accept only the known device/CIE families.
            family = pdf_name(value)
            if family in {
                "DeviceGray",
                "DeviceRGB",
                "DeviceCMYK",
                "Pattern",
                "Lab",
                "CalRGB",
                "CalGray",
                "Indexed",
            }:
                return CodexColorSpace(id=cs_id, family=family, canonical={"raw": str(value)})

        # NOTE(review): pikepdf array objects may not subclass ``list``; if a
        # raw pikepdf value reaches here this branch could be skipped — confirm
        # what type the callers actually pass.
        if isinstance(value, list) and len(value) > 0:
            first = pdf_name(value[0]) or "ICCBased"
            if first == "Separation":
                # [/Separation name alternateSpace tintTransform]
                spot_name = pdf_name(value[1]) if len(value) > 1 else "Unknown"
                alt = pdf_name(value[2]) if len(value) > 2 else None
                return CodexColorSpace(
                    id=cs_id,
                    family="Separation",
                    canonical={"raw": [str(v) for v in value]},
                    alternate_space_id=alt,
                    spot_colorants=[CodexSpotColorant(name=spot_name or "Unknown", alternate_space_id=alt)],
                )
            if first == "DeviceN":
                # [/DeviceN names alternateSpace tintTransform]; the reserved
                # /All and /None colorant names are not spot colorants.
                names = value[1] if len(value) > 1 else []
                alt = pdf_name(value[2]) if len(value) > 2 else None
                spots: list[CodexSpotColorant] = []
                if isinstance(names, list):
                    for n in names:
                        if pdf_name(n) and pdf_name(n) not in {"All", "None"}:
                            spots.append(CodexSpotColorant(name=pdf_name(n) or "Unknown", alternate_space_id=alt))
                return CodexColorSpace(
                    id=cs_id,
                    family="DeviceN",
                    canonical={"raw": [str(v) for v in value]},
                    alternate_space_id=alt,
                    spot_colorants=spots,
                )
            if first in {"ICCBased", "Lab", "CalRGB", "CalGray", "Indexed", "Pattern"}:
                return CodexColorSpace(id=cs_id, family=first, canonical={"raw": [str(v) for v in value]})
    except Exception:
        return None
    return None
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def extract_color_world_pikepdf(pdf_bytes: bytes) -> tuple[list[CodexOutputIntent], list[CodexColorSpace]]:
    """Extract output intents and page color spaces using pikepdf.

    Best effort: returns whatever was gathered before any failure; a missing
    pikepdf install or an unopenable PDF yields two empty lists.
    """
    output_intents: list[CodexOutputIntent] = []
    color_spaces: list[CodexColorSpace] = []
    try:
        import pikepdf

        with pikepdf.open(BytesIO(pdf_bytes)) as pdf:
            root = pdf.Root
            # Document-level /OutputIntents array (absent in most PDFs).
            out_arr = root.get("/OutputIntents", [])
            for idx, oi in enumerate(out_arr):
                output_intents.append(
                    CodexOutputIntent(
                        subtype=pdf_name(oi.get("/S")) if hasattr(oi, "get") else None,
                        output_condition_identifier=str(oi.get("/OutputConditionIdentifier"))
                        if hasattr(oi, "get") and oi.get("/OutputConditionIdentifier") is not None
                        else None,
                        profile_id=obj_id(oi.get("/DestOutputProfile"), f"outputintent-{idx}")
                        if hasattr(oi, "get")
                        else None,
                    )
                )

            # Walk each page's /Resources -> /ColorSpace dictionary.
            # De-duplication is by resource NAME only: the same name on two
            # pages is reported once even if it maps to different objects.
            cs_seen: set[str] = set()
            for page in pdf.pages:
                resources = page.obj.get("/Resources", {}) if hasattr(page.obj, "get") else {}
                cs_dict = resources.get("/ColorSpace", {}) if hasattr(resources, "get") else {}
                if hasattr(cs_dict, "items"):
                    for cs_name, cs_val in cs_dict.items():
                        cs_id = str(cs_name)
                        if cs_id in cs_seen:
                            continue
                        cs_seen.add(cs_id)
                        cs = extract_color_space(cs_val, cs_id)
                        if cs is not None:
                            color_spaces.append(cs)
    except Exception:
        pass
    return output_intents, color_spaces
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Shared extraction helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from codex_pdf.models.v1 import CodexBBox
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def safe_box(rect: object) -> CodexBBox:
    """Build a CodexBBox from any rect-like object; missing coords become 0.0."""
    coords = {axis: float(getattr(rect, axis, 0.0)) for axis in ("x0", "y0", "x1", "y1")}
    return CodexBBox(**coords)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def pdf_name(value: Any) -> str | None:
    """Render a PDF name object as plain text, dropping the leading slash.

    ``None`` passes through; any other value is stringified first.
    """
    if value is None:
        return None
    text = str(value)
    return text[1:] if text.startswith("/") else text
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def obj_id(value: Any, fallback: str) -> str:
    """Return a stable id for an indirect object.

    Uses ``obj-<number>`` when the value exposes a non-empty ``objgen`` tuple
    (pikepdf indirect objects do); otherwise returns *fallback*.
    """
    objgen = getattr(value, "objgen", None)
    if not (isinstance(objgen, tuple) and objgen):
        return fallback
    return f"obj-{objgen[0]}"
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""Content inventory extraction (object-level placeholders)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from codex_pdf.models.v1 import (
|
|
8
|
+
CodexColorUsage,
|
|
9
|
+
CodexGraphicsStateSnapshot,
|
|
10
|
+
CodexPageObject,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def extract_page_inventory_fitz(doc: Any) -> dict[int, list[CodexPageObject]]:
    """Return page_num -> content object inventory.

    This approximates object classes from currently exposed fitz APIs and
    serves as codex's canonical object list until deeper content-stream
    parser stages are plugged in.
    """
    kind_by_block_type = {0: "text", 1: "raster"}
    inventories: dict[int, list[CodexPageObject]] = {}
    for page_num, page in enumerate(doc, start=1):
        page_objects: list[CodexPageObject] = []

        # Text-page blocks: type 0 -> text span, type 1 -> raster image.
        # Numbering counts every block, including unclassified ones.
        try:
            blocks = page.get_text("dict").get("blocks", [])
            for block_idx, block in enumerate(blocks, start=1):
                kind = kind_by_block_type.get(block.get("type"))
                if kind is None:
                    continue
                page_objects.append(
                    CodexPageObject(
                        object_id=f"p{page_num}-{kind}-{block_idx}",
                        kind=kind,
                        graphics_state=CodexGraphicsStateSnapshot(),
                        color_usage=CodexColorUsage(),
                    )
                )
        except Exception:
            pass

        # Every drawing reported by fitz becomes one vector object.
        try:
            for draw_idx, _ in enumerate(page.get_drawings() or [], start=1):
                page_objects.append(
                    CodexPageObject(
                        object_id=f"p{page_num}-vector-{draw_idx}",
                        kind="vector",
                        graphics_state=CodexGraphicsStateSnapshot(),
                        color_usage=CodexColorUsage(),
                    )
                )
        except Exception:
            pass

        inventories[page_num] = page_objects
    return inventories
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""Document extraction entrypoints."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from codex_pdf.models.v1 import CodexDocument, CodexInfoDict, CodexSourceRef, CodexXmpPacket
|
|
9
|
+
from codex_pdf.version import __version__
|
|
10
|
+
from codex_pdf.extract.annotations import extract_annotations_fitz
|
|
11
|
+
from codex_pdf.extract.color import extract_color_world_pikepdf
|
|
12
|
+
from codex_pdf.extract.content_inventory import extract_page_inventory_fitz
|
|
13
|
+
from codex_pdf.extract.fonts import extract_fonts_fitz
|
|
14
|
+
from codex_pdf.extract.forms import extract_forms_pikepdf
|
|
15
|
+
from codex_pdf.extract.images import extract_images_fitz
|
|
16
|
+
from codex_pdf.extract.ocg import extract_ocgs_pikepdf
|
|
17
|
+
from codex_pdf.extract.structure import (
|
|
18
|
+
conformance_claims_from_metadata,
|
|
19
|
+
extract_structure_fitz,
|
|
20
|
+
)
|
|
21
|
+
from codex_pdf.extract.trapping import derive_trapped_flag, extract_trap_evidence
|
|
22
|
+
from codex_pdf.extract.transparency import extract_transparency_fitz
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def extract_document(pdf_bytes: bytes, *, source_uri: str | None = None) -> CodexDocument:
    """Extract a baseline CodexDocument from raw PDF bytes.

    Two-backend strategy: PyMuPDF (fitz) supplies structure, fonts, images,
    annotations, per-page inventory and transparency; pikepdf supplies color,
    OCG and form-XObject data. A failure of the fitz pass degrades to the
    defaults below instead of raising, so a document is always returned.
    """
    # The SHA-256 of the raw bytes doubles as the document id.
    digest = hashlib.sha256(pdf_bytes).hexdigest()
    # Skeleton defaults, used whenever the fitz pass fails.
    pages = []
    fonts = []
    images = []
    annotations = []
    output_intents = []
    color_spaces = []
    ocgs = []
    form_xobjects = []
    info = CodexInfoDict()
    xmp = CodexXmpPacket(present=False)
    pdf_version = "unknown"
    is_encrypted = False
    trapped_flag = None

    try:
        import fitz

        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        pdf_version, is_encrypted, info, xmp, pages = extract_structure_fitz(doc)
        fonts = extract_fonts_fitz(doc)
        images = extract_images_fitz(doc)
        annotations = extract_annotations_fitz(doc)

        # Attach per-page inventories and transparency trees onto the page
        # records produced by the structure pass (keyed by 1-based page_num).
        page_inventory = extract_page_inventory_fitz(doc)
        transparency = extract_transparency_fitz(doc)
        for page in pages:
            page.inventory = page_inventory.get(page.page_num, [])
            if 0 < page.page_num <= len(transparency):
                page.transparency_tree = transparency[page.page_num - 1]

        trapped_flag = derive_trapped_flag(doc)
    except Exception:
        # Fall back to skeleton with minimal metadata.
        pass

    # Structural fallback extraction not exposed through PyMuPDF APIs.
    # These run even when the fitz pass succeeded.
    output_intents, color_spaces = extract_color_world_pikepdf(pdf_bytes)
    ocgs = extract_ocgs_pikepdf(pdf_bytes)
    form_xobjects = extract_forms_pikepdf(pdf_bytes)
    trap_evidence = extract_trap_evidence(
        trapped_flag=trapped_flag,
        ocg_names=[x.name for x in ocgs],
        annotation_subtypes=[x.subtype or "" for x in annotations],
    )

    return CodexDocument(
        codex_version=__version__,
        document_id=digest,
        source=CodexSourceRef(uri=source_uri, sha256=digest, size_bytes=len(pdf_bytes)),
        pdf_version=pdf_version,
        is_encrypted=is_encrypted,
        conformance=conformance_claims_from_metadata(info, xmp),
        info=info,
        xmp=xmp,
        trapped_flag=trapped_flag,
        output_intents=output_intents,
        color_spaces=color_spaces,
        fonts=fonts,
        images=images,
        ocgs=ocgs,
        form_xobjects=form_xobjects,
        trap_evidence=trap_evidence,
        annotations=annotations,
        pages=pages,
    )
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def extract_from_path(path: Path) -> CodexDocument:
    """Read *path* and extract a CodexDocument, recording the path as source URI."""
    return extract_document(path.read_bytes(), source_uri=str(path))
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Font extraction."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from codex_pdf.models.v1 import CodexFont
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _outline_type(subtype: str) -> str:
|
|
11
|
+
if "TrueType" in subtype:
|
|
12
|
+
return "TrueType"
|
|
13
|
+
if "CFF" in subtype or "Type1C" in subtype:
|
|
14
|
+
return "CFF"
|
|
15
|
+
if "Type1" in subtype:
|
|
16
|
+
return "Type1"
|
|
17
|
+
if "Type3" in subtype:
|
|
18
|
+
return "Type3"
|
|
19
|
+
if "CID" in subtype:
|
|
20
|
+
return "CID"
|
|
21
|
+
return "unknown"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def extract_fonts_fitz(doc: Any) -> list[CodexFont]:
    """Collect the fonts referenced by each page of a fitz document.

    Fonts are de-duplicated by their resource key (``font_id``); each entry
    records every 1-based page index that references it, in encounter order.
    Pages that raise during traversal are skipped (best effort).

    Embedding status is inferred only from the subset-naming convention
    (``ABCDEF+Name``), so it is "subset" or "unknown" — never "full".
    """
    fonts: list[CodexFont] = []
    # Index by font_id so lookups are O(1) instead of scanning the list
    # for every font reference on every page.
    by_key: dict[str, CodexFont] = {}
    for page_idx, page in enumerate(doc, start=1):
        try:
            for font in page.get_fonts(full=True):
                font_key = str(font[0]) if len(font) > 0 else f"page{page_idx}-font"
                base_name = str(font[3]) if len(font) > 3 else None
                subtype = str(font[2]) if len(font) > 2 else "unknown"
                existing = by_key.get(font_key)
                if existing is None:
                    # A "+" in the base name marks a subset font.
                    embedded = "subset" if base_name and "+" in base_name else "unknown"
                    entry = CodexFont(
                        font_id=font_key,
                        base_name=base_name,
                        subtype=subtype,
                        outline_type=_outline_type(subtype),
                        embedded=embedded,  # best-effort from naming convention.
                        missing_glyphs_detected=False,
                        page_refs=[page_idx],
                    )
                    by_key[font_key] = entry
                    fonts.append(entry)
                elif page_idx not in existing.page_refs:
                    existing.page_refs.append(page_idx)
        except Exception:
            continue
    return fonts
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Form XObject extraction."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from io import BytesIO
|
|
6
|
+
|
|
7
|
+
from codex_pdf.extract.common import obj_id, pdf_name
|
|
8
|
+
from codex_pdf.models.v1 import CodexFormXObject
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def extract_forms_pikepdf(pdf_bytes: bytes) -> list[CodexFormXObject]:
    """List Form XObjects per page via pikepdf (best effort; errors yield []).

    De-duplicates by object id across pages. Each form records the ids of its
    directly nested XObjects, falling back to its own resource name when it
    has none.
    """
    form_xobjects: list[CodexFormXObject] = []
    try:
        import pikepdf

        with pikepdf.open(BytesIO(pdf_bytes)) as pdf:
            seen: set[str] = set()
            for page_idx, page in enumerate(pdf.pages, start=1):
                resources = page.obj.get("/Resources", {}) if hasattr(page.obj, "get") else {}
                xobj_dict = resources.get("/XObject", {}) if hasattr(resources, "get") else {}
                if not hasattr(xobj_dict, "items"):
                    continue
                for x_name, x_obj in xobj_dict.items():
                    subtype = pdf_name(x_obj.get("/Subtype")) if hasattr(x_obj, "get") else None
                    if subtype != "Form":
                        # Image and other XObject subtypes are handled elsewhere.
                        continue
                    x_id = obj_id(x_obj, f"p{page_idx}-{x_name}")
                    if x_id in seen:
                        continue
                    seen.add(x_id)
                    # Only one level of nesting is recorded: direct children.
                    child_refs: list[str] = []
                    child_res = x_obj.get("/Resources", {}) if hasattr(x_obj, "get") else {}
                    child_xobj = child_res.get("/XObject", {}) if hasattr(child_res, "get") else {}
                    if hasattr(child_xobj, "items"):
                        for child_name, child_obj in child_xobj.items():
                            child_refs.append(obj_id(child_obj, str(child_name)))
                    form_xobjects.append(
                        CodexFormXObject(
                            object_id=x_id,
                            parent_object_id=None,
                            resource_refs=child_refs or [str(x_name)],
                        )
                    )
    except Exception:
        pass
    return form_xobjects
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Image extraction."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from codex_pdf.models.v1 import CodexImage, CodexResolution
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _estimate_dpi(width_px: int, height_px: int, page_width_pts: float, page_height_pts: float) -> CodexResolution:
    """Estimate an image's DPI assuming it spans the full page.

    Uses 72 points per inch; page dimensions are floored at 0.001 inch to
    avoid division by zero on degenerate pages.
    """
    inches_wide = max(page_width_pts / 72.0, 0.001)
    inches_high = max(page_height_pts / 72.0, 0.001)
    return CodexResolution(x_dpi=width_px / inches_wide, y_dpi=height_px / inches_high)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def extract_images_fitz(doc: Any) -> list[CodexImage]:
    """List raster images per page from fitz's ``get_images`` tuples.

    Pages that raise during traversal are skipped (best effort). Resolution
    is estimated as if each image covered its whole page.
    """
    images: list[CodexImage] = []
    for page_num, page in enumerate(doc, start=1):
        page_w = float(getattr(page.rect, "width", 0.0))
        page_h = float(getattr(page.rect, "height", 0.0))
        try:
            for entry in page.get_images(full=True):
                n = len(entry)
                xref = entry[0] if n > 0 else -1
                width = int(entry[2]) if n > 2 else 0
                height = int(entry[3]) if n > 3 else 0
                images.append(
                    CodexImage(
                        image_id=f"p{page_num}-x{xref}",
                        page_num=page_num,
                        width_px=width,
                        height_px=height,
                        bits_per_component=int(entry[4]) if n > 4 else None,
                        color_space_id=str(entry[5]) if n > 5 else None,
                        compression=str(entry[8]) if n > 8 and entry[8] is not None else None,
                        # entry[1] is the smask xref; nonzero means a soft mask exists.
                        soft_mask=bool(entry[1]) if n > 1 else False,
                        effective_resolution_dpi=_estimate_dpi(width, height, page_w, page_h),
                    )
                )
        except Exception:
            continue
    return images
|
codex_pdf/extract/ocg.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Optional content group extraction."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from io import BytesIO
|
|
6
|
+
|
|
7
|
+
from codex_pdf.extract.common import obj_id, pdf_name
|
|
8
|
+
from codex_pdf.models.v1 import CodexOCG
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
_STEP_HINTS = {
|
|
12
|
+
"trap": "Trap",
|
|
13
|
+
"white": "White",
|
|
14
|
+
"varnish": "Varnish",
|
|
15
|
+
"cut": "Cutting",
|
|
16
|
+
"fold": "Folding",
|
|
17
|
+
"dieline": "Dieline",
|
|
18
|
+
"emboss": "Emboss",
|
|
19
|
+
"bleed": "Bleed",
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _processing_step(name: str) -> str | None:
|
|
24
|
+
lower = name.lower()
|
|
25
|
+
for key, value in _STEP_HINTS.items():
|
|
26
|
+
if key in lower:
|
|
27
|
+
return value
|
|
28
|
+
return None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def extract_ocgs_pikepdf(pdf_bytes: bytes) -> list[CodexOCG]:
    """Extract optional content groups (layers) via pikepdf (best effort).

    Default visibility comes from the default configuration's ``/D /OFF``
    list; the ISO 19593 processing step is guessed from the layer name.
    Errors (including pikepdf being unavailable) yield an empty list.
    """
    ocgs: list[CodexOCG] = []
    try:
        import pikepdf

        with pikepdf.open(BytesIO(pdf_bytes)) as pdf:
            root = pdf.Root
            oc_props = root.get("/OCProperties", {})
            # Ids of groups switched off in the default configuration.
            off_set: set[str] = set()
            default_cfg = oc_props.get("/D", {}) if hasattr(oc_props, "get") else {}
            off_list = default_cfg.get("/OFF", []) if hasattr(default_cfg, "get") else []
            for item in off_list:
                off_set.add(obj_id(item, str(item)))
            ocg_arr = oc_props.get("/OCGs", []) if hasattr(oc_props, "get") else []
            for idx, ocg in enumerate(ocg_arr):
                ocg_id = obj_id(ocg, f"ocg-{idx}")
                # /Intent may be a single name or an array of names.
                # NOTE(review): pikepdf array objects may not subclass ``list``;
                # if so the isinstance branch is skipped and the scalar path
                # runs instead — confirm against pikepdf's object model.
                intent_raw = ocg.get("/Intent", []) if hasattr(ocg, "get") else []
                intents: list[str] = []
                if isinstance(intent_raw, list):
                    intents = [pdf_name(x) or str(x) for x in intent_raw]
                elif intent_raw is not None:
                    intents = [pdf_name(intent_raw) or str(intent_raw)]
                name = str(ocg.get("/Name")) if hasattr(ocg, "get") and ocg.get("/Name") else ocg_id
                ocgs.append(
                    CodexOCG(
                        ocg_id=ocg_id,
                        name=name,
                        default_visible=ocg_id not in off_set,
                        intent=intents,
                        iso19593_processing_step=_processing_step(name),
                    )
                )
    except Exception:
        pass
    return ocgs
|