codex-pdf 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codex_pdf/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """codexPDF public package surface."""
2
+
3
+ from codex_pdf.models.v1 import CodexDocument
4
+ from codex_pdf.version import __version__
5
+
6
# Names exported by ``from codex_pdf import *``.
__all__ = [
    "CodexDocument",
    "__version__",
]
codex_pdf/cli.py ADDED
@@ -0,0 +1,125 @@
1
+ """codex-pdf CLI."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+ from pathlib import Path
8
+
9
+ from jsonschema import validate
10
+
11
+ from codex_pdf.extract import extract_from_path
12
+ from codex_pdf.parity import run_parity_from_namespace
13
+ from codex_pdf.schema import codex_document_schema, load_published_schema
14
+
15
+
16
def _repo_root() -> Path:
    """Return the directory three levels above this module's resolved path."""
    location = Path(__file__).resolve()
    # Walk up three times: module file -> its directory -> parent -> grandparent.
    for _ in range(3):
        location = location.parent
    return location
18
+
19
+
20
def cmd_extract(args: argparse.Namespace) -> int:
    """Extract the PDF at ``args.input_pdf`` and print the result as JSON.

    With ``args.pretty`` set the JSON is indented and key-sorted; otherwise
    it is printed in compact form. Always returns 0.
    """
    document = extract_from_path(Path(args.input_pdf))
    data = document.model_dump(mode="json")
    if args.pretty:
        dump_options = {"indent": 2, "sort_keys": True}
    else:
        dump_options = {"separators": (",", ":")}
    print(json.dumps(data, **dump_options))
    return 0
28
+
29
+
30
def cmd_schema(args: argparse.Namespace) -> int:
    """Print a JSON Schema, indented and key-sorted, and return 0.

    ``args.published`` selects the schema returned by
    ``load_published_schema`` for the repository root; otherwise the schema
    comes from ``codex_document_schema()``.
    """
    # NOTE(review): only ``args.published`` is consulted here; any other
    # attributes on the namespace are ignored by this handler.
    schema = load_published_schema(_repo_root()) if args.published else codex_document_schema()
    rendered = json.dumps(schema, indent=2, sort_keys=True)
    print(rendered)
    return 0
37
+
38
+
39
def cmd_validate(args: argparse.Namespace) -> int:
    """Validate the JSON file at ``args.codex_json`` against the published schema.

    Prints ``valid`` and returns 0 when validation passes. Read, decode and
    validation errors are not caught here and propagate to the caller.
    """
    raw_text = Path(args.codex_json).read_text(encoding="utf-8")
    instance = json.loads(raw_text)
    published = load_published_schema(_repo_root())
    validate(instance, published)
    print("valid")
    return 0
45
+
46
+
47
def cmd_probe(args: argparse.Namespace) -> int:
    """Print a short metadata summary for the PDF at ``args.input_pdf``.

    The document is extracted with ``extract_from_path`` and reduced to a
    small dictionary. With ``args.json`` set the dictionary is printed as
    indented, key-sorted JSON; otherwise a single ``key=value`` line with the
    PDF version, page count and encryption flag is printed. Always returns 0.
    """
    document = extract_from_path(Path(args.input_pdf))
    summary = dict(
        pdf_version=document.pdf_version,
        page_count=len(document.pages),
        is_encrypted=document.is_encrypted,
        output_intents=[intent.model_dump(mode="json") for intent in document.output_intents],
        document_id=document.document_id,
    )
    if not args.json:
        # The plain-text form shows only these three fields, space-separated.
        shown = ("pdf_version", "page_count", "is_encrypted")
        print(" ".join(f"{field}={summary[field]}" for field in shown))
        return 0
    print(json.dumps(summary, indent=2, sort_keys=True))
    return 0
65
+
66
+
67
def cmd_parity(args: argparse.Namespace) -> int:
    """Run the parity check for ``args`` against the repository root and return its result."""
    root = _repo_root()
    return run_parity_from_namespace(args, root)
69
+
70
+
71
def build_parser() -> argparse.ArgumentParser:
    """Build the ``codex-pdf`` argument parser.

    A sub-command is required. Each sub-parser stores its handler under the
    ``func`` default so the caller can dispatch with ``args.func(args)``.
    """
    root_parser = argparse.ArgumentParser(prog="codex-pdf")
    commands = root_parser.add_subparsers(dest="command", required=True)

    # extract: PDF -> CodexDocument JSON.
    extract_cmd = commands.add_parser("extract", help="Extract a CodexDocument from a PDF.")
    extract_cmd.add_argument("input_pdf")
    extract_cmd.add_argument("--pretty", action="store_true")
    extract_cmd.set_defaults(func=cmd_extract)

    # schema: print the JSON Schema.
    schema_cmd = commands.add_parser("schema", help="Print JSON Schema for CodexDocument.")
    schema_cmd.add_argument("--version", default="1")
    schema_cmd.add_argument("--name", default="codex-document")
    schema_cmd.add_argument("--published", action="store_true")
    schema_cmd.set_defaults(func=cmd_schema)

    # validate: check a JSON file against the published schema.
    validate_cmd = commands.add_parser(
        "validate",
        help="Validate codex JSON against published schema.",
    )
    validate_cmd.add_argument("codex_json")
    validate_cmd.set_defaults(func=cmd_validate)

    # probe: short metadata summary.
    probe_cmd = commands.add_parser("probe", help="Fast metadata probe.")
    probe_cmd.add_argument("input_pdf")
    probe_cmd.add_argument("--json", action="store_true")
    probe_cmd.set_defaults(func=cmd_probe)

    # parity: projection checks over a fixture corpus.
    parity_cmd = commands.add_parser(
        "parity",
        help="Run consumer-agnostic parity projection checks.",
    )
    parity_cmd.add_argument(
        "--profile",
        choices=["summary", "inventory", "deep"],
        default="summary",
    )
    parity_cmd.add_argument("--fixtures-root", required=True, help="Path to fixture corpus root.")
    default_report = _repo_root() / "reports" / "parity" / "viewer_essentials.json"
    parity_cmd.add_argument(
        "--output",
        default=str(default_report),
        help="Path to write parity JSON report.",
    )
    parity_cmd.add_argument("--max-files", type=int, default=10, help="Limit number of PDFs.")
    parity_cmd.add_argument(
        "--baseline-command",
        default=None,
        help=(
            "Optional shell command template that prints JSON projection to stdout. "
            "Use {pdf} placeholder for input path."
        ),
    )
    parity_cmd.add_argument(
        "--fail-on-diff",
        action="store_true",
        help="Return non-zero if any diff exists.",
    )
    parity_cmd.set_defaults(func=cmd_parity)

    return root_parser
116
+
117
+
118
def main(argv: list[str] | None = None) -> int:
    """Parse command-line arguments and run the selected sub-command.

    Args:
        argv: Arguments to parse, without the program name. ``None`` (the
            default) lets argparse read ``sys.argv[1:]``, which is what the
            zero-argument form of this function always did, so existing
            callers and console-script entry points are unaffected.

    Returns:
        The integer status returned by the sub-command handler.
    """
    args = build_parser().parse_args(argv)
    return args.func(args)
122
+
123
+
124
if __name__ == "__main__":
    # Use the handler's return value as the process exit status.
    status = main()
    raise SystemExit(status)
@@ -0,0 +1,5 @@
1
+ """Extraction interfaces."""
2
+
3
+ from codex_pdf.extract.document import extract_document, extract_from_path
4
+
5
# Names exported by ``from codex_pdf.extract import *``.
__all__ = [
    "extract_document",
    "extract_from_path",
]
@@ -0,0 +1,39 @@
1
+ """Annotation extraction."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from codex_pdf.extract.common import safe_box
8
+ from codex_pdf.models.v1 import CodexAnnotation
9
+
10
+
11
def extract_annotations_fitz(doc: Any) -> list[CodexAnnotation]:
    """Collect one CodexAnnotation per annotation found on the pages of ``doc``.

    Pages are numbered from 1. Annotation ids use a counter that runs across
    the whole document, not per page. Any exception raised while handling a
    page abandons the rest of that page and moves on to the next one;
    annotations already collected are kept.
    """
    collected: list[CodexAnnotation] = []
    for page_num, page in enumerate(doc, start=1):
        try:
            page_annots = page.annots()
            if page_annots:
                for annot in page_annots:
                    rect = getattr(annot, "rect", None)
                    bbox = None if rect is None else safe_box(rect)

                    # ``type`` is used only when it is a tuple with a second item.
                    type_info = getattr(annot, "type", None)
                    if isinstance(type_info, tuple) and len(type_info) > 1:
                        subtype_name = str(type_info[1])
                    else:
                        subtype_name = None

                    info = getattr(annot, "info", {})
                    contents = info.get("content") if isinstance(info, dict) else None

                    collected.append(
                        CodexAnnotation(
                            annotation_id=f"p{page_num}-a{len(collected) + 1}",
                            subtype=subtype_name,
                            page_num=page_num,
                            rect=bbox,
                            contents=contents,
                            has_appearance_stream=False,
                        )
                    )
        except Exception:
            continue
    return collected
@@ -0,0 +1,99 @@
1
+ """Color space and output intent extraction (pikepdf fallback)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from io import BytesIO
6
+ from typing import Any
7
+
8
+ from codex_pdf.extract.common import obj_id, pdf_name
9
+ from codex_pdf.models.v1 import CodexColorSpace, CodexOutputIntent, CodexSpotColorant
10
+
11
+
12
def _array_items(value: Any) -> list[Any] | None:
    """Return the elements of an array-like PDF value, or ``None`` if it is not one.

    Python lists and tuples are accepted directly. ``pikepdf.Array`` is
    accepted as well: it is not a ``list`` subclass, so an
    ``isinstance(value, list)`` check alone never matches arrays read from a
    real PDF. pikepdf is imported lazily and treated as optional.
    """
    if isinstance(value, (list, tuple)):
        return list(value)
    try:
        import pikepdf
    except ImportError:
        return None
    if isinstance(value, pikepdf.Array):
        return list(value)
    return None


def extract_color_space(value: Any, cs_id: str) -> CodexColorSpace | None:
    """Translate one ``/ColorSpace`` resource value into a CodexColorSpace.

    Args:
        value: Either a name-like value (its ``str()`` starts with ``/``) or
            an array-like value whose first element names the family.
        cs_id: Identifier stored on the returned color space.

    Returns:
        A CodexColorSpace for recognised families, or ``None`` when the value
        is empty, unrecognised, or raises while being inspected.
    """
    try:
        items = _array_items(value)

        if items is None:
            # Name form, e.g. ``/DeviceRGB``.
            if str(value).startswith("/"):
                family = pdf_name(value)
                if family in {
                    "DeviceGray",
                    "DeviceRGB",
                    "DeviceCMYK",
                    "Pattern",
                    "Lab",
                    "CalRGB",
                    "CalGray",
                    "Indexed",
                }:
                    return CodexColorSpace(id=cs_id, family=family, canonical={"raw": str(value)})
            return None

        if not items:
            return None

        # Array form: the first element names the family.
        first = pdf_name(items[0]) or "ICCBased"
        if first == "Separation":
            spot_name = pdf_name(items[1]) if len(items) > 1 else "Unknown"
            alt = pdf_name(items[2]) if len(items) > 2 else None
            return CodexColorSpace(
                id=cs_id,
                family="Separation",
                canonical={"raw": [str(v) for v in items]},
                alternate_space_id=alt,
                spot_colorants=[CodexSpotColorant(name=spot_name or "Unknown", alternate_space_id=alt)],
            )
        if first == "DeviceN":
            # The colorant names are themselves an array inside the array.
            names = _array_items(items[1]) if len(items) > 1 else []
            alt = pdf_name(items[2]) if len(items) > 2 else None
            spots: list[CodexSpotColorant] = []
            for entry in names or []:
                colorant = pdf_name(entry)
                # "All" and "None" are not recorded as spot colorants.
                if colorant and colorant not in {"All", "None"}:
                    spots.append(CodexSpotColorant(name=colorant, alternate_space_id=alt))
            return CodexColorSpace(
                id=cs_id,
                family="DeviceN",
                canonical={"raw": [str(v) for v in items]},
                alternate_space_id=alt,
                spot_colorants=spots,
            )
        if first in {"ICCBased", "Lab", "CalRGB", "CalGray", "Indexed", "Pattern"}:
            return CodexColorSpace(id=cs_id, family=first, canonical={"raw": [str(v) for v in items]})
    except Exception:
        # Best effort: a value that cannot be inspected yields no color space.
        return None
    return None
60
+
61
+
62
def extract_color_world_pikepdf(pdf_bytes: bytes) -> tuple[list[CodexOutputIntent], list[CodexColorSpace]]:
    """Read output intents and page-level color spaces from ``pdf_bytes`` with pikepdf.

    Output intents come from the catalog's ``/OutputIntents`` entry. Color
    spaces come from each page's own ``/Resources`` -> ``/ColorSpace``
    dictionary and are de-duplicated by resource name across pages. Any
    exception, including pikepdf being unavailable, stops the scan, and
    whatever was gathered up to that point is returned.
    """
    intents: list[CodexOutputIntent] = []
    spaces: list[CodexColorSpace] = []
    try:
        import pikepdf

        with pikepdf.open(BytesIO(pdf_bytes)) as pdf:
            catalog = pdf.Root
            for position, entry in enumerate(catalog.get("/OutputIntents", [])):
                if hasattr(entry, "get"):
                    subtype = pdf_name(entry.get("/S"))
                    identifier = entry.get("/OutputConditionIdentifier")
                    condition = None if identifier is None else str(identifier)
                    profile = obj_id(entry.get("/DestOutputProfile"), f"outputintent-{position}")
                else:
                    subtype = None
                    condition = None
                    profile = None
                intents.append(
                    CodexOutputIntent(
                        subtype=subtype,
                        output_condition_identifier=condition,
                        profile_id=profile,
                    )
                )

            known_ids: set[str] = set()
            for page in pdf.pages:
                page_obj = page.obj
                resources = page_obj.get("/Resources", {}) if hasattr(page_obj, "get") else {}
                cs_dict = resources.get("/ColorSpace", {}) if hasattr(resources, "get") else {}
                if not hasattr(cs_dict, "items"):
                    continue
                for resource_name, resource_value in cs_dict.items():
                    cs_id = str(resource_name)
                    # The first page to use a given resource name wins.
                    if cs_id in known_ids:
                        continue
                    known_ids.add(cs_id)
                    parsed = extract_color_space(resource_value, cs_id)
                    if parsed is not None:
                        spaces.append(parsed)
    except Exception:
        pass
    return intents, spaces
@@ -0,0 +1,31 @@
1
+ """Shared extraction helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from codex_pdf.models.v1 import CodexBBox
8
+
9
+
10
def safe_box(rect: object) -> CodexBBox:
    """Build a CodexBBox from the ``x0``/``y0``/``x1``/``y1`` attributes of ``rect``.

    Each attribute is converted with ``float``; an attribute that is missing
    is read as ``0.0``.
    """
    x0, y0, x1, y1 = (float(getattr(rect, corner, 0.0)) for corner in ("x0", "y0", "x1", "y1"))
    return CodexBBox(x0=x0, y0=y0, x1=x1, y1=y1)
16
+
17
+
18
def pdf_name(value: Any) -> str | None:
    """Return ``str(value)`` without one leading ``/``; ``None`` stays ``None``."""
    if value is None:
        return None
    rendered = str(value)
    # Only a single leading slash is removed.
    return rendered[1:] if rendered.startswith("/") else rendered
25
+
26
+
27
def obj_id(value: Any, fallback: str) -> str:
    """Return a stable id for an indirect PDF object, or ``fallback``.

    Args:
        value: Any object; its ``objgen`` attribute is consulted when present.
        fallback: Returned when ``value`` does not identify an indirect object.

    Returns:
        ``"obj-<number>"`` when ``value.objgen`` is a tuple whose first item
        is a non-zero object number; otherwise ``fallback``.
    """
    objgen = getattr(value, "objgen", None)
    if isinstance(objgen, tuple) and len(objgen) >= 1:
        number = objgen[0]
        # pikepdf reports (0, 0) for direct objects. Treating that as an id
        # would give every direct object the same "obj-0" key, so object
        # number 0 falls through to the caller's fallback instead.
        if number:
            return f"obj-{number}"
    return fallback
@@ -0,0 +1,68 @@
1
+ """Content inventory extraction (object-level placeholders)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from codex_pdf.models.v1 import (
8
+ CodexColorUsage,
9
+ CodexGraphicsStateSnapshot,
10
+ CodexPageObject,
11
+ )
12
+
13
+
14
def extract_page_inventory_fitz(doc: Any) -> dict[int, list[CodexPageObject]]:
    """Map each 1-based page number to a placeholder list of its content objects.

    Text-dict blocks of type 0 become ``text`` objects and blocks of type 1
    become ``raster`` objects, numbered by block position. Every entry
    returned by ``page.get_drawings()`` becomes a ``vector`` object. All
    objects carry default graphics-state and color-usage values. A failure in
    either pass is swallowed, keeping what that pass had already added.
    """

    def _placeholder(page_number: int, kind: str, ordinal: int) -> CodexPageObject:
        return CodexPageObject(
            object_id=f"p{page_number}-{kind}-{ordinal}",
            kind=kind,
            graphics_state=CodexGraphicsStateSnapshot(),
            color_usage=CodexColorUsage(),
        )

    by_page: dict[int, list[CodexPageObject]] = {}
    for page_num, page in enumerate(doc, start=1):
        found: list[CodexPageObject] = []

        # Pass 1: text and image blocks from the text-page dictionary.
        try:
            text_dict = page.get_text("dict")
            for ordinal, block in enumerate(text_dict.get("blocks", []), start=1):
                block_type = block.get("type")
                if block_type == 0:
                    kind = "text"
                elif block_type == 1:
                    kind = "raster"
                else:
                    # Other block types still consume an ordinal but add nothing.
                    continue
                found.append(_placeholder(page_num, kind, ordinal))
        except Exception:
            pass

        # Pass 2: one vector object per drawing.
        try:
            for ordinal, _drawing in enumerate(page.get_drawings() or [], start=1):
                found.append(_placeholder(page_num, "vector", ordinal))
        except Exception:
            pass

        by_page[page_num] = found
    return by_page
@@ -0,0 +1,97 @@
1
+ """Document extraction entrypoints."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ from pathlib import Path
7
+
8
+ from codex_pdf.models.v1 import CodexDocument, CodexInfoDict, CodexSourceRef, CodexXmpPacket
9
+ from codex_pdf.version import __version__
10
+ from codex_pdf.extract.annotations import extract_annotations_fitz
11
+ from codex_pdf.extract.color import extract_color_world_pikepdf
12
+ from codex_pdf.extract.content_inventory import extract_page_inventory_fitz
13
+ from codex_pdf.extract.fonts import extract_fonts_fitz
14
+ from codex_pdf.extract.forms import extract_forms_pikepdf
15
+ from codex_pdf.extract.images import extract_images_fitz
16
+ from codex_pdf.extract.ocg import extract_ocgs_pikepdf
17
+ from codex_pdf.extract.structure import (
18
+ conformance_claims_from_metadata,
19
+ extract_structure_fitz,
20
+ )
21
+ from codex_pdf.extract.trapping import derive_trapped_flag, extract_trap_evidence
22
+ from codex_pdf.extract.transparency import extract_transparency_fitz
23
+
24
+
25
def extract_document(pdf_bytes: bytes, *, source_uri: str | None = None) -> CodexDocument:
    """Extract a baseline CodexDocument from raw PDF bytes.

    Args:
        pdf_bytes: The complete PDF file contents.
        source_uri: Optional origin recorded on the document's source reference.

    Returns:
        A CodexDocument whose id is the SHA-256 hex digest of ``pdf_bytes``.
        Extraction is best effort: if the PyMuPDF pass fails at any point,
        the fields it had not yet filled keep the defaults set below.
    """
    digest = hashlib.sha256(pdf_bytes).hexdigest()

    # Defaults that survive when PyMuPDF is unavailable or fails part-way.
    pages = []
    fonts = []
    images = []
    annotations = []
    info = CodexInfoDict()
    xmp = CodexXmpPacket(present=False)
    pdf_version = "unknown"
    is_encrypted = False
    trapped_flag = None

    try:
        import fitz

        # The context manager closes the document even when an extractor
        # raises; previously the handle was never released.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            pdf_version, is_encrypted, info, xmp, pages = extract_structure_fitz(doc)
            fonts = extract_fonts_fitz(doc)
            images = extract_images_fitz(doc)
            annotations = extract_annotations_fitz(doc)

            page_inventory = extract_page_inventory_fitz(doc)
            transparency = extract_transparency_fitz(doc)
            for page in pages:
                page.inventory = page_inventory.get(page.page_num, [])
                # ``transparency`` is indexed by zero-based page position.
                if 0 < page.page_num <= len(transparency):
                    page.transparency_tree = transparency[page.page_num - 1]

            trapped_flag = derive_trapped_flag(doc)
    except Exception:
        # Fall back to skeleton with minimal metadata.
        pass

    # Structural fallback extraction not exposed through PyMuPDF APIs.
    output_intents, color_spaces = extract_color_world_pikepdf(pdf_bytes)
    ocgs = extract_ocgs_pikepdf(pdf_bytes)
    form_xobjects = extract_forms_pikepdf(pdf_bytes)
    trap_evidence = extract_trap_evidence(
        trapped_flag=trapped_flag,
        ocg_names=[x.name for x in ocgs],
        annotation_subtypes=[x.subtype or "" for x in annotations],
    )

    return CodexDocument(
        codex_version=__version__,
        document_id=digest,
        source=CodexSourceRef(uri=source_uri, sha256=digest, size_bytes=len(pdf_bytes)),
        pdf_version=pdf_version,
        is_encrypted=is_encrypted,
        conformance=conformance_claims_from_metadata(info, xmp),
        info=info,
        xmp=xmp,
        trapped_flag=trapped_flag,
        output_intents=output_intents,
        color_spaces=color_spaces,
        fonts=fonts,
        images=images,
        ocgs=ocgs,
        form_xobjects=form_xobjects,
        trap_evidence=trap_evidence,
        annotations=annotations,
        pages=pages,
    )
93
+
94
+
95
def extract_from_path(path: Path) -> CodexDocument:
    """Read the file at ``path`` and extract it, recording ``str(path)`` as its source URI."""
    return extract_document(path.read_bytes(), source_uri=str(path))
@@ -0,0 +1,50 @@
1
+ """Font extraction."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from codex_pdf.models.v1 import CodexFont
8
+
9
+
10
def _outline_type(subtype: str) -> str:
    """Classify a font subtype string by the first matching substring rule.

    Returns one of ``TrueType``, ``CFF``, ``Type1``, ``Type3``, ``CID`` or
    ``unknown``.
    """
    # Order matters: "Type1C" has to be tested before the shorter "Type1".
    rules = (
        (("TrueType",), "TrueType"),
        (("CFF", "Type1C"), "CFF"),
        (("Type1",), "Type1"),
        (("Type3",), "Type3"),
        (("CID",), "CID"),
    )
    for needles, label in rules:
        if any(needle in subtype for needle in needles):
            return label
    return "unknown"
22
+
23
+
24
def extract_fonts_fitz(doc: Any) -> list[CodexFont]:
    """Collect one CodexFont per distinct font key across the pages of ``doc``.

    Each entry from ``page.get_fonts(full=True)`` is read positionally: item 0
    is the key, item 2 the subtype and item 3 the base name. A font seen on
    several pages is stored once, with every 1-based page number appended to
    its ``page_refs``. Any exception while reading a page skips the rest of
    that page.

    Returns:
        Fonts in first-seen order.
    """
    fonts: list[CodexFont] = []
    # Index by font id so repeat sightings are O(1) instead of rescanning
    # ``fonts`` for every entry on every page.
    by_id: dict[str, CodexFont] = {}
    for idx, page in enumerate(doc, start=1):
        try:
            for font in page.get_fonts(full=True):
                font_key = str(font[0]) if len(font) > 0 else f"page{idx}-font"
                base_name = str(font[3]) if len(font) > 3 else None
                subtype = str(font[2]) if len(font) > 2 else "unknown"
                existing = by_id.get(font_key)
                if existing is None:
                    # Best effort from the naming convention: a "+" in the
                    # base name is taken to mean a subset font.
                    embedded = "subset" if base_name and "+" in base_name else "unknown"
                    record = CodexFont(
                        font_id=font_key,
                        base_name=base_name,
                        subtype=subtype,
                        outline_type=_outline_type(subtype),
                        embedded=embedded,
                        missing_glyphs_detected=False,
                        page_refs=[idx],
                    )
                    fonts.append(record)
                    by_id[font_key] = record
                elif idx not in existing.page_refs:
                    existing.page_refs.append(idx)
        except Exception:
            continue
    return fonts
@@ -0,0 +1,46 @@
1
+ """Form XObject extraction."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from io import BytesIO
6
+
7
+ from codex_pdf.extract.common import obj_id, pdf_name
8
+ from codex_pdf.models.v1 import CodexFormXObject
9
+
10
+
11
def extract_forms_pikepdf(pdf_bytes: bytes) -> list[CodexFormXObject]:
    """List the Form XObjects referenced from each page's own resources.

    Only entries of a page's ``/Resources`` -> ``/XObject`` dictionary whose
    ``/Subtype`` is ``Form`` are reported, once per object id. Each form's
    ``resource_refs`` holds the ids of the XObjects in its own resources, or
    the form's resource name when it has none. Any exception, including
    pikepdf being unavailable, stops the scan and returns what was found.
    """
    found: list[CodexFormXObject] = []
    try:
        import pikepdf

        with pikepdf.open(BytesIO(pdf_bytes)) as pdf:
            emitted_ids: set[str] = set()
            for page_idx, page in enumerate(pdf.pages, start=1):
                page_obj = page.obj
                resources = page_obj.get("/Resources", {}) if hasattr(page_obj, "get") else {}
                xobjects = resources.get("/XObject", {}) if hasattr(resources, "get") else {}
                if not hasattr(xobjects, "items"):
                    continue
                for x_name, x_obj in xobjects.items():
                    can_get = hasattr(x_obj, "get")
                    subtype = pdf_name(x_obj.get("/Subtype")) if can_get else None
                    if subtype != "Form":
                        continue
                    x_id = obj_id(x_obj, f"p{page_idx}-{x_name}")
                    if x_id in emitted_ids:
                        continue
                    emitted_ids.add(x_id)

                    # XObjects nested one level down, inside the form itself.
                    nested_res = x_obj.get("/Resources", {}) if can_get else {}
                    nested = nested_res.get("/XObject", {}) if hasattr(nested_res, "get") else {}
                    child_refs: list[str] = []
                    if hasattr(nested, "items"):
                        child_refs = [
                            obj_id(child_obj, str(child_name))
                            for child_name, child_obj in nested.items()
                        ]
                    found.append(
                        CodexFormXObject(
                            object_id=x_id,
                            parent_object_id=None,
                            resource_refs=child_refs or [str(x_name)],
                        )
                    )
    except Exception:
        pass
    return found
@@ -0,0 +1,45 @@
1
+ """Image extraction."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from codex_pdf.models.v1 import CodexImage, CodexResolution
8
+
9
+
10
def _estimate_dpi(width_px: int, height_px: int, page_width_pts: float, page_height_pts: float) -> CodexResolution:
    """Estimate resolution as pixel size divided by the page size in inches.

    The page dimensions are converted from points at 72 per inch and floored
    at 0.001 inch so a zero-sized page cannot cause a division by zero.
    """
    points_per_inch = 72.0
    smallest_inches = 0.001
    page_width_in = max(page_width_pts / points_per_inch, smallest_inches)
    page_height_in = max(page_height_pts / points_per_inch, smallest_inches)
    return CodexResolution(x_dpi=width_px / page_width_in, y_dpi=height_px / page_height_in)
14
+
15
+
16
def extract_images_fitz(doc: Any) -> list[CodexImage]:
    """Collect one CodexImage per entry of ``page.get_images(full=True)``.

    Entries are read positionally: 0 is the xref, 1 the soft-mask marker,
    2 and 3 the pixel width and height, 4 the bits per component, 5 the
    color space name and 8 the filter. Positions missing from a short entry
    fall back to defaults. Resolution is estimated against the whole page
    size. Any exception while reading a page skips the rest of that page.
    """
    results: list[CodexImage] = []
    for page_num, page in enumerate(doc, start=1):
        page_w = float(getattr(page.rect, "width", 0.0))
        page_h = float(getattr(page.rect, "height", 0.0))
        try:
            for entry in page.get_images(full=True):
                size = len(entry)
                xref = entry[0] if size > 0 else -1
                width = int(entry[2]) if size > 2 else 0
                height = int(entry[3]) if size > 3 else 0
                bpc = int(entry[4]) if size > 4 else None
                cs_name = str(entry[5]) if size > 5 else None
                if size > 8 and entry[8] is not None:
                    filters = str(entry[8])
                else:
                    filters = None
                smask = bool(entry[1]) if size > 1 else False
                results.append(
                    CodexImage(
                        image_id=f"p{page_num}-x{xref}",
                        page_num=page_num,
                        width_px=width,
                        height_px=height,
                        bits_per_component=bpc,
                        color_space_id=cs_name,
                        compression=filters,
                        soft_mask=smask,
                        effective_resolution_dpi=_estimate_dpi(width, height, page_w, page_h),
                    )
                )
        except Exception:
            continue
    return results
@@ -0,0 +1,65 @@
1
+ """Optional content group extraction."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from io import BytesIO
6
+
7
+ from codex_pdf.extract.common import obj_id, pdf_name
8
+ from codex_pdf.models.v1 import CodexOCG
9
+
10
+
11
# Lower-case substring -> processing-step label. Insertion order decides
# which label wins when a layer name contains more than one hint.
_STEP_HINTS = {
    "trap": "Trap",
    "white": "White",
    "varnish": "Varnish",
    "cut": "Cutting",
    "fold": "Folding",
    "dieline": "Dieline",
    "emboss": "Emboss",
    "bleed": "Bleed",
}


def _processing_step(name: str) -> str | None:
    """Return the label of the first hint contained in ``name`` (case-insensitive), else ``None``."""
    lowered = name.lower()
    return next((label for hint, label in _STEP_HINTS.items() if hint in lowered), None)
29
+
30
+
31
def extract_ocgs_pikepdf(pdf_bytes: bytes) -> list[CodexOCG]:
    """List the optional content groups declared in the catalog's ``/OCProperties``.

    A group is reported as hidden by default when its id appears in the
    default configuration's ``/OFF`` list. ``/Intent`` may be a single name
    or an array of names; both are normalised to a list of strings. Any
    exception, including pikepdf being unavailable, stops the scan and
    returns what was gathered so far.
    """
    ocgs: list[CodexOCG] = []
    try:
        import pikepdf

        with pikepdf.open(BytesIO(pdf_bytes)) as pdf:
            root = pdf.Root
            oc_props = root.get("/OCProperties", {})
            has_props = hasattr(oc_props, "get")

            # Ids of the groups switched off in the default configuration.
            off_set: set[str] = set()
            default_cfg = oc_props.get("/D", {}) if has_props else {}
            off_list = default_cfg.get("/OFF", []) if hasattr(default_cfg, "get") else []
            for item in off_list:
                off_set.add(obj_id(item, str(item)))

            ocg_arr = oc_props.get("/OCGs", []) if has_props else []
            for idx, ocg in enumerate(ocg_arr):
                ocg_id = obj_id(ocg, f"ocg-{idx}")
                intent_raw = ocg.get("/Intent", []) if hasattr(ocg, "get") else []
                intents: list[str] = []
                # An array-valued /Intent arrives as pikepdf.Array, which is
                # not a ``list`` subclass; without this check the whole array
                # would be stringified into a single bogus intent.
                if isinstance(intent_raw, (list, tuple, pikepdf.Array)):
                    intents = [pdf_name(x) or str(x) for x in intent_raw]
                elif intent_raw is not None:
                    intents = [pdf_name(intent_raw) or str(intent_raw)]
                # A missing or empty /Name falls back to the group id.
                name = str(ocg.get("/Name")) if hasattr(ocg, "get") and ocg.get("/Name") else ocg_id
                ocgs.append(
                    CodexOCG(
                        ocg_id=ocg_id,
                        name=name,
                        default_visible=ocg_id not in off_set,
                        intent=intents,
                        iso19593_processing_step=_processing_step(name),
                    )
                )
    except Exception:
        # Best effort: return whatever was collected before the failure.
        pass
    return ocgs