codex-pdf 1.4.1__tar.gz → 1.4.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/PKG-INFO +1 -1
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/pyproject.toml +1 -1
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/schemas/v1/codex-document.schema.json +34 -1
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/extract/signals.py +5 -9
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/extract/summary.py +164 -1
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/models/v1.py +16 -1
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/version.py +1 -1
- codex_pdf-1.4.2/tests/test_summary_dieline.py +80 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/uv.lock +1 -1
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/.cursor/rules/service-ownership.mdc +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/.github/workflows/ci.yml +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/.gitignore +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/.windsurf/rules/service-ownership.md +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/APPROVALS.md +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/CLAUDE.md +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/Dockerfile +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/LICENSE +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/Procfile +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/README.md +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/clients/ts/README.md +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/clients/ts/package-lock.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/clients/ts/package.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/clients/ts/src/color.ts +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/clients/ts/src/index.test.ts +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/clients/ts/src/index.ts +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/clients/ts/tsconfig.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/docs/architecture.md +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/docs/backward-compatibility.md +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/docs/cleanup-stop-gates.md +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/docs/cli.md +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/docs/contract.md +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/docs/deploy.md +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/docs/discovery-audit.md +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/docs/migration-plan.md +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/docs/operations/codex-change-ripple.md +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/docs/operations/marketing-deploy-template.md +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/docs/parity.md +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/docs/preflight-ingest.md +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/docs/release-1.0.0.md +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/docs/service-ownership-contract.md +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/railway.toml +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/reports/audit/mislocated-closure.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/reports/audit/produce_surface.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/reports/parity/codex_deep.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/reports/parity/codex_inventory.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/reports/parity/codex_summary.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/reports/parity/criterion4_parser_surface.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/reports/parity/pdfx4_deep.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/reports/parity/pdfx4_inventory.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/reports/parity/pdfx4_summary.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/reports/parity/render_baseline.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/reports/parity/viewer_essentials.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/schemas/CHANGELOG.md +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/schemas/v1/codex-annotation.schema.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/schemas/v1/codex-box.schema.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/schemas/v1/codex-color-space.schema.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/schemas/v1/codex-font.schema.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/schemas/v1/codex-form-xobject.schema.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/schemas/v1/codex-image.schema.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/schemas/v1/codex-issue.schema.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/schemas/v1/codex-ocg.schema.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/schemas/v1/codex-output-intent.schema.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/schemas/v1/codex-page-object.schema.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/schemas/v1/codex-page.schema.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/schemas/v1/codex-preflight-report.schema.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/schemas/v1/codex-source.schema.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/schemas/v1/codex-spot-colorant.schema.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/schemas/v1/codex-transparency-tree.schema.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/schemas/v1/codex-trap-evidence.schema.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/schemas/v1/codex-warning.schema.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/scripts/parity_viewer_essentials.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/scripts/produce_surface_audit.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/__init__.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/api/__init__.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/api/auth.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/api/cache.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/api/main.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/api/url_ingest.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/cli.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/client/__init__.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/client/http_client.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/color/__init__.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/color/color_math.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/color/curated.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/color/data/pantone_reference.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/color/normalize.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/color/pantone.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/color/resolver.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/eval/__init__.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/eval/ps_type4.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/extract/__init__.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/extract/annotations.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/extract/color.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/extract/common.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/extract/content_inventory.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/extract/document.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/extract/fonts.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/extract/forms.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/extract/images.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/extract/ocg.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/extract/structure.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/extract/transparency.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/extract/trapping.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/geom/__init__.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/geom/box.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/geom/matrix.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/geom/path.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/geom/tile.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/geom/units.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/models/__init__.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/parity.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/preflight_ingest/__init__.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/preflight_ingest/adapters.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/render/__init__.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/render/_common.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/render/content_stream.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/render/layer.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/render/page.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/render/separations.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/src/codex_pdf/schema.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/tests/conftest.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/tests/fixtures/conforming/minimal.pdf +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/tests/fixtures/generate_fixtures.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/tests/fixtures/violating/no_output_intent.pdf +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/tests/fixtures/violating/no_trim_box.pdf +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/tests/fixtures/violating/no_xmp.pdf +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/tests/fixtures/violating/pdf_1_4.pdf +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/tests/golden/1.0.0/reference.json +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/tests/test_api.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/tests/test_cache.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/tests/test_cli_contract.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/tests/test_color.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/tests/test_extract_analysis_signals.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/tests/test_extract_structural.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/tests/test_geom.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/tests/test_golden.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/tests/test_golden_corpus.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/tests/test_models.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/tests/test_parity.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/tests/test_preflight_ingest.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/tests/test_produce_surface_audit.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/tests/test_schema.py +0 -0
- {codex_pdf-1.4.1 → codex_pdf-1.4.2}/tests/test_schemas_all.py +0 -0
|
@@ -1468,7 +1468,8 @@
|
|
|
1468
1468
|
"enum": [
|
|
1469
1469
|
"ocg_name",
|
|
1470
1470
|
"ocg_processing_step",
|
|
1471
|
-
"trap_layer"
|
|
1471
|
+
"trap_layer",
|
|
1472
|
+
"analysis_signal"
|
|
1472
1473
|
],
|
|
1473
1474
|
"title": "Source",
|
|
1474
1475
|
"type": "string"
|
|
@@ -1496,6 +1497,31 @@
|
|
|
1496
1497
|
],
|
|
1497
1498
|
"default": null,
|
|
1498
1499
|
"title": "Processing Step"
|
|
1500
|
+
},
|
|
1501
|
+
"confidence": {
|
|
1502
|
+
"default": 0.5,
|
|
1503
|
+
"maximum": 1.0,
|
|
1504
|
+
"minimum": 0.0,
|
|
1505
|
+
"title": "Confidence",
|
|
1506
|
+
"type": "number"
|
|
1507
|
+
},
|
|
1508
|
+
"reason_codes": {
|
|
1509
|
+
"items": {
|
|
1510
|
+
"enum": [
|
|
1511
|
+
"name_keyword",
|
|
1512
|
+
"iso19593_processing_step",
|
|
1513
|
+
"trap_layer_keyword",
|
|
1514
|
+
"analysis_ocg_marked_keyword",
|
|
1515
|
+
"analysis_dash_pattern",
|
|
1516
|
+
"analysis_thin_stroke",
|
|
1517
|
+
"analysis_stroke_dominant",
|
|
1518
|
+
"analysis_dense_path_network",
|
|
1519
|
+
"analysis_low_fill_ratio"
|
|
1520
|
+
],
|
|
1521
|
+
"type": "string"
|
|
1522
|
+
},
|
|
1523
|
+
"title": "Reason Codes",
|
|
1524
|
+
"type": "array"
|
|
1499
1525
|
}
|
|
1500
1526
|
},
|
|
1501
1527
|
"required": [
|
|
@@ -1519,6 +1545,13 @@
|
|
|
1519
1545
|
"title": "Candidates",
|
|
1520
1546
|
"type": "array"
|
|
1521
1547
|
},
|
|
1548
|
+
"overall_confidence": {
|
|
1549
|
+
"default": 0.0,
|
|
1550
|
+
"maximum": 1.0,
|
|
1551
|
+
"minimum": 0.0,
|
|
1552
|
+
"title": "Overall Confidence",
|
|
1553
|
+
"type": "number"
|
|
1554
|
+
},
|
|
1522
1555
|
"trapped_flag": {
|
|
1523
1556
|
"anyOf": [
|
|
1524
1557
|
{
|
|
@@ -18,20 +18,16 @@ def extract_analysis_signals_pikepdf(pdf_bytes: bytes) -> dict[str, Any]:
|
|
|
18
18
|
with pikepdf.open(BytesIO(pdf_bytes)) as pdf:
|
|
19
19
|
out["spot_names"] = _collect_spot_names(pdf)
|
|
20
20
|
out["layer_names"] = _collect_layer_names(pdf)
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
21
|
+
for idx, page in enumerate(pdf.pages, start=1):
|
|
22
|
+
page_signals = _extract_page_signals(page)
|
|
23
|
+
if page_signals:
|
|
24
|
+
out[f"page_{idx}"] = page_signals
|
|
24
25
|
except Exception:
|
|
25
26
|
return {}
|
|
26
27
|
return out
|
|
27
28
|
|
|
28
29
|
|
|
29
|
-
def
|
|
30
|
-
try:
|
|
31
|
-
page = pdf.pages[0]
|
|
32
|
-
except Exception:
|
|
33
|
-
return {}
|
|
34
|
-
|
|
30
|
+
def _extract_page_signals(page: Any) -> dict[str, Any]:
|
|
35
31
|
resources = page.get("/Resources") if hasattr(page, "get") else None
|
|
36
32
|
cs_dict = resources.get("/ColorSpace") if resources and hasattr(resources, "get") else None
|
|
37
33
|
props_dict = resources.get("/Properties") if resources and hasattr(resources, "get") else None
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import re
|
|
6
|
+
from typing import Any
|
|
6
7
|
|
|
7
8
|
from codex_pdf.models.v1 import (
|
|
8
9
|
CodexDocument,
|
|
@@ -104,6 +105,8 @@ def _dieline_candidates(doc: CodexDocument) -> CodexSummaryDielineMetrics:
|
|
|
104
105
|
source: str,
|
|
105
106
|
ocg_id: str | None = None,
|
|
106
107
|
processing_step: str | None = None,
|
|
108
|
+
confidence: float = 0.5,
|
|
109
|
+
reason_codes: list[str] | None = None,
|
|
107
110
|
) -> None:
|
|
108
111
|
trimmed = name.strip()
|
|
109
112
|
if not trimmed:
|
|
@@ -118,12 +121,20 @@ def _dieline_candidates(doc: CodexDocument) -> CodexSummaryDielineMetrics:
|
|
|
118
121
|
source=source, # type: ignore[arg-type]
|
|
119
122
|
ocg_id=ocg_id,
|
|
120
123
|
processing_step=processing_step,
|
|
124
|
+
confidence=max(0.0, min(1.0, confidence)),
|
|
125
|
+
reason_codes=sorted(set(reason_codes or [])), # type: ignore[arg-type]
|
|
121
126
|
)
|
|
122
127
|
)
|
|
123
128
|
|
|
124
129
|
for ocg in doc.ocgs:
|
|
125
130
|
if _DIELINE_PATTERN.search(ocg.name):
|
|
126
|
-
_add(
|
|
131
|
+
_add(
|
|
132
|
+
name=ocg.name,
|
|
133
|
+
source="ocg_name",
|
|
134
|
+
ocg_id=ocg.ocg_id,
|
|
135
|
+
confidence=0.95,
|
|
136
|
+
reason_codes=["name_keyword"],
|
|
137
|
+
)
|
|
127
138
|
if ocg.iso19593_processing_step and _DIELINE_PATTERN.search(
|
|
128
139
|
ocg.iso19593_processing_step
|
|
129
140
|
):
|
|
@@ -132,6 +143,8 @@ def _dieline_candidates(doc: CodexDocument) -> CodexSummaryDielineMetrics:
|
|
|
132
143
|
source="ocg_processing_step",
|
|
133
144
|
ocg_id=ocg.ocg_id,
|
|
134
145
|
processing_step=ocg.iso19593_processing_step,
|
|
146
|
+
confidence=0.98,
|
|
147
|
+
reason_codes=["iso19593_processing_step"],
|
|
135
148
|
)
|
|
136
149
|
|
|
137
150
|
for layer in doc.trap_evidence.trap_layers:
|
|
@@ -142,15 +155,165 @@ def _dieline_candidates(doc: CodexDocument) -> CodexSummaryDielineMetrics:
|
|
|
142
155
|
source="trap_layer",
|
|
143
156
|
ocg_id=layer.ocg_id,
|
|
144
157
|
processing_step=layer.processing_step,
|
|
158
|
+
confidence=0.9,
|
|
159
|
+
reason_codes=["trap_layer_keyword"],
|
|
160
|
+
)
|
|
161
|
+
|
|
162
|
+
for page_num, page_signal in _iter_page_signals(doc.analysis):
|
|
163
|
+
signal_hits = _dieline_signal_candidates(page_num, page_signal)
|
|
164
|
+
for hit_name, hit_confidence, hit_reasons in signal_hits:
|
|
165
|
+
_add(
|
|
166
|
+
name=hit_name,
|
|
167
|
+
source="analysis_signal",
|
|
168
|
+
confidence=hit_confidence,
|
|
169
|
+
reason_codes=hit_reasons,
|
|
145
170
|
)
|
|
146
171
|
|
|
147
172
|
return CodexSummaryDielineMetrics(
|
|
148
173
|
count=len(candidates),
|
|
149
174
|
candidates=candidates,
|
|
175
|
+
overall_confidence=max((c.confidence for c in candidates), default=0.0),
|
|
150
176
|
trapped_flag=doc.trapped_flag,
|
|
151
177
|
)
|
|
152
178
|
|
|
153
179
|
|
|
180
|
+
def _iter_page_signals(analysis: dict[str, Any]) -> list[tuple[int, dict[str, Any]]]:
|
|
181
|
+
out: list[tuple[int, dict[str, Any]]] = []
|
|
182
|
+
for key, value in analysis.items():
|
|
183
|
+
if not isinstance(value, dict):
|
|
184
|
+
continue
|
|
185
|
+
if key.startswith("page_"):
|
|
186
|
+
suffix = key.removeprefix("page_")
|
|
187
|
+
if suffix.isdigit():
|
|
188
|
+
out.append((int(suffix), value))
|
|
189
|
+
return sorted(out, key=lambda item: item[0])
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def _as_float(value: Any) -> float | None:
|
|
193
|
+
if isinstance(value, (int, float)):
|
|
194
|
+
return float(value)
|
|
195
|
+
if isinstance(value, str):
|
|
196
|
+
try:
|
|
197
|
+
return float(value)
|
|
198
|
+
except ValueError:
|
|
199
|
+
return None
|
|
200
|
+
return None
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def _dieline_signal_candidates(
|
|
204
|
+
page_num: int,
|
|
205
|
+
page_signal: dict[str, Any],
|
|
206
|
+
) -> list[tuple[str, float, list[str]]]:
|
|
207
|
+
content_ops = page_signal.get("content_ops")
|
|
208
|
+
if not isinstance(content_ops, list):
|
|
209
|
+
return []
|
|
210
|
+
|
|
211
|
+
path_ops = 0
|
|
212
|
+
stroke_ops = 0
|
|
213
|
+
fill_ops = 0
|
|
214
|
+
dash_ops = 0
|
|
215
|
+
thin_stroke_ops = 0
|
|
216
|
+
ocg_marked_content: set[str] = set()
|
|
217
|
+
|
|
218
|
+
prop_to_ocg_name = page_signal.get("prop_to_ocg_name")
|
|
219
|
+
ocg_map = prop_to_ocg_name if isinstance(prop_to_ocg_name, dict) else {}
|
|
220
|
+
|
|
221
|
+
for entry in content_ops:
|
|
222
|
+
if not isinstance(entry, dict):
|
|
223
|
+
continue
|
|
224
|
+
op = str(entry.get("op") or "").strip()
|
|
225
|
+
operands = entry.get("operands")
|
|
226
|
+
operands_list = operands if isinstance(operands, list) else []
|
|
227
|
+
|
|
228
|
+
if op in {"m", "l", "c", "v", "y", "re", "h"}:
|
|
229
|
+
path_ops += 1
|
|
230
|
+
elif op in {"S", "s"}:
|
|
231
|
+
stroke_ops += 1
|
|
232
|
+
elif op in {"f", "f*", "F", "B", "B*", "b", "b*"}:
|
|
233
|
+
fill_ops += 1
|
|
234
|
+
elif op == "d":
|
|
235
|
+
# Dashed strokes are a strong fold/crease indicator in packaging art.
|
|
236
|
+
if operands_list and isinstance(operands_list[0], list):
|
|
237
|
+
if any((_as_float(x) or 0.0) > 0.0 for x in operands_list[0]):
|
|
238
|
+
dash_ops += 1
|
|
239
|
+
elif op == "w" and operands_list:
|
|
240
|
+
width = _as_float(operands_list[0])
|
|
241
|
+
if width is not None and width <= 1.0:
|
|
242
|
+
thin_stroke_ops += 1
|
|
243
|
+
elif op == "BDC" and len(operands_list) >= 2:
|
|
244
|
+
maybe_type = str(operands_list[0]).lstrip("/")
|
|
245
|
+
prop_name = str(operands_list[1]).lstrip("/")
|
|
246
|
+
if maybe_type == "OC":
|
|
247
|
+
mapped = ocg_map.get(prop_name)
|
|
248
|
+
if mapped and _DIELINE_PATTERN.search(str(mapped)):
|
|
249
|
+
ocg_marked_content.add(str(mapped))
|
|
250
|
+
|
|
251
|
+
hits: list[tuple[str, float, list[str]]] = []
|
|
252
|
+
for name in sorted(ocg_marked_content):
|
|
253
|
+
hits.append(
|
|
254
|
+
(
|
|
255
|
+
f"{name} (page {page_num}, oc-marked)",
|
|
256
|
+
0.92,
|
|
257
|
+
["analysis_ocg_marked_keyword"],
|
|
258
|
+
)
|
|
259
|
+
)
|
|
260
|
+
|
|
261
|
+
# Structural heuristics for files with missing spot/color semantics.
|
|
262
|
+
fold_like = dash_ops >= 2 and stroke_ops >= 4
|
|
263
|
+
dieline_like = (
|
|
264
|
+
stroke_ops >= 8
|
|
265
|
+
and path_ops >= 24
|
|
266
|
+
and fill_ops <= max(1, stroke_ops // 4)
|
|
267
|
+
and (thin_stroke_ops >= 2 or dash_ops >= 1)
|
|
268
|
+
)
|
|
269
|
+
|
|
270
|
+
if fold_like:
|
|
271
|
+
fold_reasons: list[str] = []
|
|
272
|
+
fold_confidence = 0.45
|
|
273
|
+
if dash_ops >= 2:
|
|
274
|
+
fold_reasons.append("analysis_dash_pattern")
|
|
275
|
+
fold_confidence += 0.2
|
|
276
|
+
if stroke_ops >= 4:
|
|
277
|
+
fold_reasons.append("analysis_stroke_dominant")
|
|
278
|
+
fold_confidence += 0.15
|
|
279
|
+
if thin_stroke_ops >= 1:
|
|
280
|
+
fold_reasons.append("analysis_thin_stroke")
|
|
281
|
+
fold_confidence += 0.1
|
|
282
|
+
hits.append(
|
|
283
|
+
(
|
|
284
|
+
f"foldline-like vector strokes (page {page_num})",
|
|
285
|
+
min(0.9, fold_confidence),
|
|
286
|
+
fold_reasons,
|
|
287
|
+
)
|
|
288
|
+
)
|
|
289
|
+
if dieline_like:
|
|
290
|
+
dieline_reasons: list[str] = []
|
|
291
|
+
dieline_confidence = 0.5
|
|
292
|
+
if path_ops >= 24:
|
|
293
|
+
dieline_reasons.append("analysis_dense_path_network")
|
|
294
|
+
dieline_confidence += 0.2
|
|
295
|
+
if stroke_ops >= 8:
|
|
296
|
+
dieline_reasons.append("analysis_stroke_dominant")
|
|
297
|
+
dieline_confidence += 0.15
|
|
298
|
+
if fill_ops <= max(1, stroke_ops // 4):
|
|
299
|
+
dieline_reasons.append("analysis_low_fill_ratio")
|
|
300
|
+
dieline_confidence += 0.1
|
|
301
|
+
if thin_stroke_ops >= 2:
|
|
302
|
+
dieline_reasons.append("analysis_thin_stroke")
|
|
303
|
+
dieline_confidence += 0.05
|
|
304
|
+
if dash_ops >= 1:
|
|
305
|
+
dieline_reasons.append("analysis_dash_pattern")
|
|
306
|
+
dieline_confidence += 0.05
|
|
307
|
+
hits.append(
|
|
308
|
+
(
|
|
309
|
+
f"dieline-like vector path network (page {page_num})",
|
|
310
|
+
min(0.95, dieline_confidence),
|
|
311
|
+
dieline_reasons,
|
|
312
|
+
)
|
|
313
|
+
)
|
|
314
|
+
return hits
|
|
315
|
+
|
|
316
|
+
|
|
154
317
|
def _image_metrics(doc: CodexDocument) -> CodexSummaryImageMetrics:
|
|
155
318
|
dpi_values: list[float] = []
|
|
156
319
|
below_300 = 0
|
|
@@ -328,14 +328,29 @@ class CodexSummarySpotColorMetrics(BaseModel):
|
|
|
328
328
|
|
|
329
329
|
class CodexSummaryDielineCandidate(BaseModel):
    # One dieline candidate reported in the document summary.
    # Documented with `#` comments only, so the generated JSON schema gains no
    # description text.

    # Human-readable label of the candidate.
    name: str
    # Which detector produced the hit; "analysis_signal" marks hits derived
    # from per-page analysis signals rather than OCG / trap-layer names.
    source: Literal["ocg_name", "ocg_processing_step", "trap_layer", "analysis_signal"]
    # Optional OCG identifier and processing-step string attached to the hit.
    ocg_id: str | None = None
    processing_step: str | None = None
    # Detector confidence, validated to lie in [0.0, 1.0]; defaults to 0.5.
    confidence: float = Field(default=0.5, ge=0.0, le=1.0)
    # Closed set of codes naming the evidence behind the hit; empty by default.
    reason_codes: list[
        Literal[
            "name_keyword",
            "iso19593_processing_step",
            "trap_layer_keyword",
            "analysis_ocg_marked_keyword",
            "analysis_dash_pattern",
            "analysis_thin_stroke",
            "analysis_stroke_dominant",
            "analysis_dense_path_network",
            "analysis_low_fill_ratio",
        ]
    ] = Field(default_factory=list)
|
|
334
348
|
|
|
335
349
|
|
|
336
350
|
class CodexSummaryDielineMetrics(BaseModel):
    # Dieline section of the document summary.

    # Number of entries in `candidates`.
    count: int = 0
    # The individual dieline candidates; empty by default.
    candidates: list[CodexSummaryDielineCandidate] = Field(default_factory=list)
    # Aggregate confidence, validated to lie in [0.0, 1.0]; 0.0 by default.
    overall_confidence: float = Field(default=0.0, ge=0.0, le=1.0)
    # Trapped flag carried over from the document, when one is available.
    trapped_flag: Literal["True", "False", "Unknown"] | None = None
|
|
340
355
|
|
|
341
356
|
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
from codex_pdf.extract.summary import build_document_summary
|
|
2
|
+
from codex_pdf.models.v1 import CodexDocument, CodexOCG, CodexSourceRef
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def _doc_with_analysis(analysis: dict) -> CodexDocument:
    """Build a minimal ``CodexDocument`` fixture carrying the given analysis dict."""
    fixture_source = CodexSourceRef(
        uri="fixture.pdf",
        sha256="deadbeef",
        size_bytes=1234,
    )
    fields = {
        "codex_version": "1.4.0",
        "document_id": "deadbeef",
        "source": fixture_source,
        "analysis": analysis,
    }
    return CodexDocument(**fields)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def test_dieline_summary_detects_name_based_candidates() -> None:
    """An OCG named "Dieline" must surface as a confident ``ocg_name`` candidate."""
    document = _doc_with_analysis({})
    document.ocgs = [CodexOCG(ocg_id="oc1", name="Dieline")]

    dieline = build_document_summary(document).dieline

    assert dieline.count >= 1
    name_hits = [cand for cand in dieline.candidates if cand.source == "ocg_name"]
    assert name_hits
    for cand in name_hits:
        assert cand.confidence >= 0.9
        assert "name_keyword" in cand.reason_codes
    assert dieline.overall_confidence >= 0.9
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_dieline_summary_detects_structural_candidates_without_spot_names() -> None:
    """Operator counts alone must yield both a fold-line and a dieline candidate."""
    # No spot names / no OCG labels. Detection should still find likely linework.
    content_ops = [
        op
        for _ in range(32)
        for op in (
            {"op": "m", "operands": [0, 0]},
            {"op": "l", "operands": [100, 0]},
        )
    ]
    content_ops += [{"op": "S", "operands": []} for _ in range(12)]
    content_ops += [
        {"op": "w", "operands": [0.5]},
        {"op": "d", "operands": [[2, 2], 0]},
        {"op": "d", "operands": [[3, 3], 0]},
    ]

    document = _doc_with_analysis({"page_1": {"content_ops": content_ops}})
    metrics = build_document_summary(document).dieline

    from_signals = [c for c in metrics.candidates if c.source == "analysis_signal"]
    assert from_signals

    all_names = [c.name for c in metrics.candidates]
    for fragment in ("dieline-like", "foldline-like"):
        assert any(fragment in candidate_name for candidate_name in all_names)

    fold_hit = next(c for c in from_signals if "foldline-like" in c.name)
    assert fold_hit.confidence >= 0.7
    assert "analysis_dash_pattern" in fold_hit.reason_codes

    dieline_hit = next(c for c in from_signals if "dieline-like" in c.name)
    assert dieline_hit.confidence >= 0.8
    assert "analysis_dense_path_network" in dieline_hit.reason_codes

    assert metrics.overall_confidence >= dieline_hit.confidence
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def test_dieline_summary_uses_non_first_page_analysis() -> None:
    """Signals stored under ``page_2`` must produce a candidate labelled page 2."""
    content_ops = [{"op": "re", "operands": [0, 0, 10, 10]} for _ in range(24)]
    content_ops += [{"op": "S", "operands": []} for _ in range(10)]
    content_ops += [
        {"op": "w", "operands": [0.75]},
        {"op": "d", "operands": [[1, 1], 0]},
        {"op": "d", "operands": [[1, 2], 0]},
    ]

    document = _doc_with_analysis({"page_2": {"content_ops": content_ops}})
    candidates = build_document_summary(document).dieline.candidates

    second_page_hit = next(cand for cand in candidates if "page 2" in cand.name)
    assert second_page_hit.source == "analysis_signal"
    assert second_page_hit.reason_codes
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def test_dieline_summary_overall_confidence_defaults_to_zero() -> None:
    """A document with no dieline evidence reports zero candidates and 0.0 confidence."""
    empty_document = _doc_with_analysis({})
    dieline = build_document_summary(empty_document).dieline
    assert dieline.count == 0
    assert dieline.overall_confidence == 0.0
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|