pdfa-parser 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdfa_parser/__init__.py +58 -0
- pdfa_parser/__main__.py +5 -0
- pdfa_parser/_bootstrap.py +44 -0
- pdfa_parser/_test_runner.py +15 -0
- pdfa_parser/data/PDFA_def.ps +91 -0
- pdfa_parser/data/srgb.icc +0 -0
- pdfa_parser/dependencies/__init__.py +25 -0
- pdfa_parser/dependencies/_base.py +204 -0
- pdfa_parser/dependencies/_ghostscript.py +142 -0
- pdfa_parser/dependencies/_jre.py +105 -0
- pdfa_parser/dependencies/_manager.py +57 -0
- pdfa_parser/dependencies/_verapdf.py +139 -0
- pdfa_parser/implementations/__init__.py +4 -0
- pdfa_parser/implementations/ghostscript_adapter.py +20 -0
- pdfa_parser/implementations/verapdf_adapter.py +69 -0
- pdfa_parser/interfaces/__init__.py +4 -0
- pdfa_parser/interfaces/base_adapter.py +95 -0
- pdfa_parser/interfaces/binary_executer.py +41 -0
- pdfa_parser/main.py +64 -0
- pdfa_parser/pdf_parser.py +304 -0
- pdfa_parser/py.typed +0 -0
- pdfa_parser/settings.py +47 -0
- pdfa_parser-0.1.0.dist-info/METADATA +252 -0
- pdfa_parser-0.1.0.dist-info/RECORD +27 -0
- pdfa_parser-0.1.0.dist-info/WHEEL +4 -0
- pdfa_parser-0.1.0.dist-info/entry_points.txt +3 -0
- pdfa_parser-0.1.0.dist-info/licenses/LICENSE +674 -0
pdfa_parser/__init__.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""pdfa-parser – Convert PDFs to PDF/A and validate compliance.
|
|
2
|
+
|
|
3
|
+
Quick-start::
|
|
4
|
+
|
|
5
|
+
from pdfa_parser import PdfParser, create_parser
|
|
6
|
+
|
|
7
|
+
# one-liner: uses default GhostScript + VeraPDF adapters
|
|
8
|
+
parser = create_parser()
|
|
9
|
+
parser.convert("input.pdf", "output.pdf")
|
|
10
|
+
result = parser.validate("output.pdf")
|
|
11
|
+
print(result.compliant)
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from pdfa_parser.dependencies import DependencyManager
|
|
15
|
+
from pdfa_parser.implementations.ghostscript_adapter import GhostScriptAdapter
|
|
16
|
+
from pdfa_parser.implementations.verapdf_adapter import VeraPDFAdapter
|
|
17
|
+
from pdfa_parser.interfaces.base_adapter import IBaseAdapter
|
|
18
|
+
from pdfa_parser.interfaces.binary_executer import BinaryExecuter
|
|
19
|
+
from pdfa_parser.pdf_parser import PdfParser, ValidationResult
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def create_parser(
    *,
    pdfa_level: int = 2,
    with_verapdf: bool = True,
    extra_gs_args: tuple[str, ...] = (),
) -> PdfParser:
    """Build a :class:`PdfParser` wired to the default adapters.

    Args:
        pdfa_level: PDF/A conformance level (1, 2 or 3).
        with_verapdf: When ``True`` (the default) a VeraPDF executor is
            attached so that :meth:`PdfParser.validate` is available.
        extra_gs_args: Extra CLI flags appended to every GhostScript call.

    Returns:
        A ready-to-use :class:`PdfParser` instance.
    """
    ghostscript_executer = BinaryExecuter(GhostScriptAdapter())

    # VeraPDF is optional; when disabled the parser is handed ``None``.
    verapdf_executer = None
    if with_verapdf:
        verapdf_executer = BinaryExecuter(VeraPDFAdapter())

    return PdfParser(
        gs_executer=ghostscript_executer,
        verapdf_executer=verapdf_executer,
        pdfa_level=pdfa_level,
        extra_gs_args=extra_gs_args,
    )
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
__all__ = [
|
|
50
|
+
"DependencyManager",
|
|
51
|
+
"PdfParser",
|
|
52
|
+
"ValidationResult",
|
|
53
|
+
"GhostScriptAdapter",
|
|
54
|
+
"VeraPDFAdapter",
|
|
55
|
+
"IBaseAdapter",
|
|
56
|
+
"BinaryExecuter",
|
|
57
|
+
"create_parser",
|
|
58
|
+
]
|
pdfa_parser/__main__.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Bootstrap compatibility shim.
|
|
2
|
+
|
|
3
|
+
This module re-exports symbols from the new :mod:`pdfa_parser.dependencies`
|
|
4
|
+
package so that existing code importing from ``_bootstrap`` continues to work.
|
|
5
|
+
|
|
6
|
+
.. deprecated::
|
|
7
|
+
Import from :mod:`pdfa_parser.dependencies` instead.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from pdfa_parser.dependencies import (
|
|
13
|
+
BIN_DIR,
|
|
14
|
+
GS_DIR,
|
|
15
|
+
JRE_DIR,
|
|
16
|
+
VERAPDF_DIR,
|
|
17
|
+
DependencyManager,
|
|
18
|
+
)
|
|
19
|
+
from pdfa_parser.dependencies._base import OS
|
|
20
|
+
from pdfa_parser.dependencies._ghostscript import _gs_binary_name
|
|
21
|
+
from pdfa_parser.dependencies._verapdf import _verapdf_binary_name
|
|
22
|
+
|
|
23
|
+
_manager = DependencyManager()
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def ensure_binaries(*, ghostscript: bool = True, verapdf: bool = True) -> None:
    """Legacy API.  Prefer :class:`DependencyManager`.

    Args:
        ghostscript: When true, call ``ensure_ghostscript()`` on the shared
            module-level manager.
        verapdf: When true, call ``ensure_verapdf()`` on the shared
            module-level manager.
    """
    requested = (
        (ghostscript, "ensure_ghostscript"),
        (verapdf, "ensure_verapdf"),
    )
    for wanted, method_name in requested:
        if not wanted:
            continue
        getattr(_manager, method_name)()
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
__all__ = [
|
|
35
|
+
"BIN_DIR",
|
|
36
|
+
"DependencyManager",
|
|
37
|
+
"GS_DIR",
|
|
38
|
+
"JRE_DIR",
|
|
39
|
+
"OS",
|
|
40
|
+
"VERAPDF_DIR",
|
|
41
|
+
"_gs_binary_name",
|
|
42
|
+
"_verapdf_binary_name",
|
|
43
|
+
"ensure_binaries",
|
|
44
|
+
]
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Test runner entry-point so ``uv run test`` works.
|
|
2
|
+
|
|
3
|
+
This is registered as ``test`` in ``[project.scripts]``.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import sys
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def main() -> None:
    """Run pytest verbosely on ``tests/`` and exit with its return code."""
    # pytest is imported here rather than at module level, so merely
    # importing this module does not require pytest.
    import pytest

    exit_code = pytest.main(["-v", "tests/"])
    sys.exit(exit_code)
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
%!
% This is a sample prefix file for creating a PDF/A document.
% Users should modify entries marked with "Customize".
% This assumes an ICC profile resides in the file (srgb.icc),
% in the current directory unless the user modifies the corresponding line below.

% Define entries in the document Info dictionary :
[ /Title (Title)       % Customize
  /DOCINFO pdfmark

% Define an ICC profile :
/ICCProfile (srgb.icc) % Customize
def

% Create the stream object that will later receive the ICC profile data.
[/_objdef {icc_PDFA} /type /stream /OBJ pdfmark

%% This code attempts to set the /N (number of components) key for the ICC colour space.
%% To do this it checks the ColorConversionStrategy or the device ProcessColorModel if
%% ColorConversionStrategy is not set.
%% This is not 100% reliable. A better solution is for the user to edit this and replace
%% the code between the ---8<--- lines with a simple declaration like:
%%   /N 3
%% where the value of N is the number of components from the profile defined in /ICCProfile above.
%%
[{icc_PDFA}
<<
%% ----------8<--------------8<-------------8<--------------8<----------
  % Every branch below leaves a boolean on the stack: false when /N has
  % already been pushed, true when the ProcessColorModel fallback must run.
  systemdict /ColorConversionStrategy known {
    systemdict /ColorConversionStrategy get cvn dup /Gray eq {
      pop /N 1 false
    }{
      dup /RGB eq {
        pop /N 3 false
      }{
        % No dup here: the comparison consumes the strategy name, so
        % neither branch needs a pop.
        /CMYK eq {
          /N 4 false
        }{
          (\tColorConversionStrategy not a device space, falling back to ProcessColorModel, output may not be valid PDF/A.\n)=
          true
        } ifelse
      } ifelse
    } ifelse
  } {
    (\tColorConversionStrategy not set, falling back to ProcessColorModel, output may not be valid PDF/A.\n)=
    true
  } ifelse

  % Fallback: derive /N from the page device's ProcessColorModel.
  % Executed only when the boolean left above is true.
  {
    currentpagedevice /ProcessColorModel get
    dup /DeviceGray eq {
      pop /N 1
    }{
      dup /DeviceRGB eq {
        pop /N 3
      }{
        dup /DeviceCMYK eq {
          pop /N 4
        } {
          (\tProcessColorModel not a device space.)=
          /ProcessColorModel cvx /rangecheck signalerror
        } ifelse
      } ifelse
    } ifelse
  } if
%% ----------8<--------------8<-------------8<--------------8<----------

>> /PUT pdfmark
[
{icc_PDFA}
% Try to open the profile for reading; `stopped` leaves true when the
% enclosed procedure raised an error, which selects the first branch below.
{ICCProfile (r) file} stopped
{
  (\n\tFailed to open the supplied ICCProfile for reading. This may be due to\n) print
  (\t an incorrect filename or a failure to add --permit-file-read=<profile>\n) print
  (\t to the command line. This PostScript program needs to open the file\n) print
  (\t and you must explicitly grant it permission to do so.\n\n) print
  (\tPDF/A processing aborted, output may not be a PDF/A file.\n\n) print
  % Discard everything pushed since the opening mark above.
  cleartomark
}
{
  % The file opened: store it into the {icc_PDFA} stream object.
  /PUT pdfmark
  % Define the output intent dictionary :

  [/_objdef {OutputIntent_PDFA} /type /dict /OBJ pdfmark
  [{OutputIntent_PDFA} <<
    /Type /OutputIntent               % Must be so (the standard requires).
    /S /GTS_PDFA1                     % Must be so (the standard requires).
    /DestOutputProfile {icc_PDFA}     % Must be so (see above).
    /OutputConditionIdentifier (sRGB) % Customize
  >> /PUT pdfmark
  % Register the output intent in the document catalog.
  [{Catalog} <</OutputIntents [ {OutputIntent_PDFA} ]>> /PUT pdfmark
} ifelse
|
|
Binary file
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Dependency management for external tools (GhostScript, JRE, VeraPDF).
|
|
2
|
+
|
|
3
|
+
This package replaces the monolithic ``_bootstrap`` module with a structured
|
|
4
|
+
OOP design built around the :class:`Dependency` / :class:`ResolutionStrategy`
|
|
5
|
+
abstractions and a high-level :class:`DependencyManager` orchestrator.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from pdfa_parser.dependencies._base import BIN_DIR, Dependency, ResolutionStrategy
|
|
9
|
+
from pdfa_parser.dependencies._ghostscript import GS_DIR, GhostScriptDependency
|
|
10
|
+
from pdfa_parser.dependencies._jre import JRE_DIR, JREDependency
|
|
11
|
+
from pdfa_parser.dependencies._manager import DependencyManager
|
|
12
|
+
from pdfa_parser.dependencies._verapdf import VERAPDF_DIR, VeraPDFDependency
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"BIN_DIR",
|
|
16
|
+
"Dependency",
|
|
17
|
+
"DependencyManager",
|
|
18
|
+
"GhostScriptDependency",
|
|
19
|
+
"GS_DIR",
|
|
20
|
+
"JRE_DIR",
|
|
21
|
+
"JREDependency",
|
|
22
|
+
"ResolutionStrategy",
|
|
23
|
+
"VeraPDFDependency",
|
|
24
|
+
"VERAPDF_DIR",
|
|
25
|
+
]
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
"""Base abstractions and shared utilities for dependency resolution.
|
|
2
|
+
|
|
3
|
+
Provides :class:`Dependency` (Template Method) and :class:`ResolutionStrategy`
|
|
4
|
+
(Strategy pattern) as the two core building blocks. Concrete dependencies
|
|
5
|
+
implement ``_find_binary`` and ``_strategies``; the base class handles caching,
|
|
6
|
+
logging, and the resolve-or-fail loop.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
import os
|
|
13
|
+
import platform
|
|
14
|
+
import shutil
|
|
15
|
+
import stat
|
|
16
|
+
import tarfile
|
|
17
|
+
import urllib.request
|
|
18
|
+
import zipfile
|
|
19
|
+
from abc import ABC, abstractmethod
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
log = logging.getLogger("pdfa_parser.dependencies")
|
|
23
|
+
|
|
24
|
+
# ── Platform detection ───────────────────────────────────────────────
|
|
25
|
+
|
|
26
|
+
# Lower-cased operating-system name as reported by ``platform.system()``.
OS = platform.system().lower()
# Lower-cased raw machine identifier as reported by ``platform.machine()``.
ARCH = platform.machine().lower()
# Folds the common 64-bit spellings into two canonical names.
_ARCH_MAP = {
    "x86_64": "x64",
    "amd64": "x64",
    "aarch64": "aarch64",
    "arm64": "aarch64",
}
# Canonical architecture name; unknown identifiers pass through unchanged.
NORM_ARCH = _ARCH_MAP[ARCH] if ARCH in _ARCH_MAP else ARCH
|
|
30
|
+
|
|
31
|
+
# ── Layout ───────────────────────────────────────────────────────────
|
|
32
|
+
|
|
33
|
+
# Directory two levels above this module file: pdfa_parser/
_PKG_DIR = Path(__file__).resolve().parent.parent  # pdfa_parser/
# Directory containing the package: src/ in a source checkout
_SRC_DIR = _PKG_DIR.parent  # src/
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _resolve_bin_dir() -> Path:
    """Pick the directory in which downloaded binaries are stored.

    Resolution order:

    1. ``_SRC_DIR / "bin"`` when a ``pyproject.toml`` file sits in the parent
       of ``_SRC_DIR`` (development / editable install).
    2. ``$XDG_DATA_HOME/pdfa-parser/bin`` when ``XDG_DATA_HOME`` is set to a
       non-empty value.
    3. On Windows, ``%LOCALAPPDATA%/pdfa-parser/bin``, with
       ``~/AppData/Local`` standing in when the variable is unset.
    4. Otherwise ``~/.local/share/pdfa-parser/bin``.
    """
    if (_SRC_DIR.parent / "pyproject.toml").is_file():
        return _SRC_DIR / "bin"

    xdg_home = os.environ.get("XDG_DATA_HOME", "")
    if xdg_home:
        return Path(xdg_home) / "pdfa-parser" / "bin"

    if OS == "windows":
        fallback = Path.home() / "AppData" / "Local"
        app_data = Path(os.environ.get("LOCALAPPDATA", fallback))
        return app_data / "pdfa-parser" / "bin"

    return Path.home() / ".local" / "share" / "pdfa-parser" / "bin"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
BIN_DIR = _resolve_bin_dir()
|
|
60
|
+
|
|
61
|
+
# ── Shared utilities ─────────────────────────────────────────────────
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def init_logging() -> None:
    """Give the dependency logger a stream handler if it has none yet.

    When a handler is installed here the logger level is also set to
    ``INFO``; a logger that already has handlers is left untouched.
    """
    if log.handlers:
        return

    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter("[pdfa-parser] %(message)s"))
    log.addHandler(handler)
    log.setLevel(logging.INFO)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def download(url: str, dest: Path, *, timeout: float = 60.0) -> None:
    """Download *url* to *dest*.

    The payload is streamed into a sibling ``<dest name>.part`` file and only
    renamed onto *dest* once the transfer has finished, so an interrupted
    download never leaves a truncated file at *dest* (callers treat an
    existing *dest* as a valid cached download).

    Args:
        url: Location to fetch.
        dest: Final path of the downloaded file.
        timeout: Seconds to wait on a single blocking network operation
            before giving up.  This bounds stalls, not the total transfer
            time.

    Raises:
        Exception: Whatever ``urlopen`` or the file operations raise is
            propagated unchanged after the partial file has been removed.
    """
    log.info("Downloading %s ...", url)
    req = urllib.request.Request(url, headers={"User-Agent": "pdfa-parser/1"})
    partial = dest.with_name(f"{dest.name}.part")
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp, open(partial, "wb") as f:
            shutil.copyfileobj(resp, f)
        # Same-directory rename: *dest* appears only when it is complete.
        partial.replace(dest)
    finally:
        # After a successful rename the partial file no longer exists; on
        # failure this removes the truncated leftover.
        partial.unlink(missing_ok=True)
    log.info("Saved %s (%.1f MB)", dest.name, dest.stat().st_size / 1_048_576)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def ensure_dir(d: Path) -> None:
    """Make sure directory *d* exists, creating missing parents as needed."""
    d.mkdir(exist_ok=True, parents=True)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def chmod_executable(p: Path) -> None:
    """Add the execute bit for user, group and others; no-op on Windows."""
    if OS == "windows":
        return

    exec_bits = stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH
    current_mode = p.stat().st_mode
    p.chmod(current_mode | exec_bits)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def extract_archive(archive: Path, dest: Path) -> None:
    """Unpack *archive* (``.zip`` or ``.tar.gz``) into the directory *dest*.

    Raises:
        ValueError: If the file name carries neither supported extension.
    """
    is_zip = archive.suffix == ".zip"
    is_tar_gz = archive.name.endswith(".tar.gz")

    if not (is_zip or is_tar_gz):
        raise ValueError(f"Unsupported archive format: {archive.name}")

    if is_zip:
        with zipfile.ZipFile(archive) as bundle:
            bundle.extractall(dest)
        return

    # Tarballs are unpacked with tarfile's "data" extraction filter.
    with tarfile.open(archive, "r:gz") as bundle:
        bundle.extractall(dest, filter="data")
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
# ── Abstract classes ─────────────────────────────────────────────────
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class ResolutionStrategy(ABC):
    """One concrete way of obtaining a dependency binary.

    A strategy reports success by returning the binary's :class:`Path` and
    reports failure by returning ``None``, which lets the caller move on to
    the next strategy in its list.
    """

    @abstractmethod
    def try_resolve(self) -> Path | None:
        """Try to make the dependency available.

        Returns:
            The path of the usable binary, or ``None`` if this strategy is
            unable to provide one.
        """
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
class Dependency(ABC):
    """An external tool the library needs at runtime.

    Implements the **Template Method** pattern: :meth:`ensure` owns the
    resolution loop, and subclasses plug in :meth:`_find_binary` (look for a
    binary that is already present) and :meth:`_strategies` (the ordered
    :class:`ResolutionStrategy` instances to try).

    The first successfully resolved path is cached on the instance, so later
    :meth:`ensure` calls return immediately.
    """

    def __init__(self) -> None:
        # Set by ensure() the first time resolution succeeds.
        self._resolved_path: Path | None = None

    @property
    @abstractmethod
    def name(self) -> str:
        """Human-readable dependency name (e.g. ``'GhostScript'``)."""

    @abstractmethod
    def _find_binary(self) -> Path | None:
        """Locate a binary that is already installed.

        Returns the binary's path, or ``None`` when nothing is found.
        Implementations are meant to inspect only the **local** install
        directory; system-wide lookups belong in a
        :class:`ResolutionStrategy`.
        """

    @abstractmethod
    def _strategies(self) -> list[ResolutionStrategy]:
        """Return the resolution strategies in the order they should be tried."""

    # ── public API ───────────────────────────────────────────────────

    def is_available(self) -> bool:
        """Report whether :meth:`_find_binary` currently locates a binary."""
        found = self._find_binary()
        return found is not None

    @property
    def path(self) -> Path:
        """Resolved binary path.  Only valid once :meth:`ensure` has succeeded.

        Raises:
            RuntimeError: If :meth:`ensure` has not resolved a path yet.
        """
        resolved = self._resolved_path
        if resolved is not None:
            return resolved
        raise RuntimeError(
            f"{self.name} has not been resolved yet. Call ensure() first."
        )

    def ensure(self) -> Path:
        """Make sure the dependency is available.  Idempotent.

        Returns:
            The path of the binary: the cached one, the one found by
            :meth:`_find_binary`, or the one produced by the first
            strategy that succeeds.

        Raises:
            RuntimeError: If no strategy can provide the binary.
        """
        cached = self._resolved_path
        if cached is not None:
            return cached

        preinstalled = self._find_binary()
        if preinstalled is not None:
            self._resolved_path = preinstalled
            return preinstalled

        # Logging is only set up once something has to be fetched/installed.
        init_logging()
        log.info("=== Setting up %s ===", self.name)

        for candidate in self._strategies():
            located = candidate.try_resolve()
            if located is None:
                continue
            self._resolved_path = located
            log.info("%s ready: %s", self.name, located)
            return located

        raise RuntimeError(
            f"Could not resolve '{self.name}': no resolution strategy succeeded. "
            f"Install it manually or check your internet connection."
        )
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""GhostScript dependency resolution."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import shutil
|
|
6
|
+
import subprocess
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from pdfa_parser.dependencies._base import (
|
|
10
|
+
BIN_DIR,
|
|
11
|
+
OS,
|
|
12
|
+
Dependency,
|
|
13
|
+
ResolutionStrategy,
|
|
14
|
+
chmod_executable,
|
|
15
|
+
download,
|
|
16
|
+
ensure_dir,
|
|
17
|
+
extract_archive,
|
|
18
|
+
log,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
# Local directory the GhostScript archive is extracted into.
GS_DIR = BIN_DIR / "ghostscript"

# Base URL for raw files on the ``main`` branch of the project repository.
GITHUB_RAW = "https://raw.githubusercontent.com/Ilusinusmate/pdfa-parser/main"

# Archive file name per ``OS`` value; a platform without an entry has no
# downloadable GhostScript archive.
GS_ARCHIVES: dict[str, str] = {
    "windows": "gs_win64.zip",
    "linux": "gs_linux_x64.tar.gz",
}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _gs_binary_name() -> str:
    """Return the GhostScript executable file name for the current OS."""
    if OS == "windows":
        return "gswin64c.exe"
    return "gs"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# ── Strategies ───────────────────────────────────────────────────────
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class SystemGhostScriptStrategy(ResolutionStrategy):
    """Resolve GhostScript from an executable that is already on ``PATH``."""

    def try_resolve(self) -> Path | None:
        """Return the ``PATH`` location of GhostScript, or ``None`` if absent."""
        executable = "gswin64c" if OS == "windows" else "gs"
        found = shutil.which(executable)
        if found is None:
            return None

        log.info("Using system GhostScript: %s", found)
        return Path(found)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class PackageManagerStrategy(ResolutionStrategy):
    """Install GhostScript via the system package manager (apt-get).

    Only works on Debian / Ubuntu-based systems (common in Docker images).
    Requires root or sudo access.
    """

    def try_resolve(self) -> Path | None:
        """Run ``apt-get update`` and ``apt-get install ghostscript``.

        Returns:
            The path of ``gs`` on ``PATH`` after a successful install, or
            ``None`` when the platform is not Linux, ``apt-get`` is missing,
            a command fails or times out, or ``gs`` is still not on ``PATH``.
        """
        if OS != "linux":
            return None
        if shutil.which("apt-get") is None:
            return None

        log.info("Installing GhostScript via apt-get ...")
        commands = (
            ["apt-get", "update", "-qq"],
            ["apt-get", "install", "-y", "-qq", "ghostscript"],
        )
        try:
            for command in commands:
                subprocess.run(
                    command,
                    check=True,
                    capture_output=True,
                    timeout=120,
                )
        except (
            subprocess.CalledProcessError,
            # ``timeout=120`` raises this; it must be handled so that a slow
            # apt-get falls through to the next strategy instead of crashing.
            subprocess.TimeoutExpired,
            FileNotFoundError,
            PermissionError,
        ) as exc:
            log.warning("apt-get install ghostscript failed: %s", exc)
            return None

        gs = shutil.which("gs")
        if gs is None:
            return None
        log.info("Installed GhostScript via apt-get: %s", gs)
        return Path(gs)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class GitHubArchiveStrategy(ResolutionStrategy):
    """Download the compressed GhostScript archive from the GitHub repo."""

    def try_resolve(self) -> Path | None:
        """Fetch (if needed) and unpack the GhostScript archive for this OS.

        Returns:
            The extracted binary inside ``GS_DIR``, or ``None`` when no
            archive exists for this OS, the download or extraction fails, or
            the expected binary is missing after extraction.
        """
        archive_name = GS_ARCHIVES.get(OS)
        if archive_name is None:
            return None

        local_archive = BIN_DIR / archive_name
        # Remember whether this call fetched the archive: only such a file is
        # deleted on failure, so a pre-existing archive is never removed.
        freshly_downloaded = False
        if not local_archive.is_file():
            ensure_dir(BIN_DIR)
            try:
                download(f"{GITHUB_RAW}/src/bin/{archive_name}", local_archive)
            except Exception as exc:  # noqa: BLE001
                log.warning("GitHub archive download failed: %s", exc)
                # Drop any truncated file so it is not reused as a valid
                # cached archive on the next attempt.
                local_archive.unlink(missing_ok=True)
                return None
            freshly_downloaded = True

        try:
            ensure_dir(GS_DIR)
            extract_archive(local_archive, GS_DIR)
        except Exception as exc:  # noqa: BLE001
            # A strategy signals failure by returning None, not by raising.
            log.warning("GhostScript archive extraction failed: %s", exc)
            if freshly_downloaded:
                local_archive.unlink(missing_ok=True)
            return None

        binary = GS_DIR / _gs_binary_name()
        if not binary.is_file():
            return None
        chmod_executable(binary)
        return binary
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
# ── Dependency ───────────────────────────────────────────────────────
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class GhostScriptDependency(Dependency):
    """Resolves GhostScript: system PATH → apt-get → GitHub archive."""

    @property
    def name(self) -> str:
        """Human-readable name of this dependency."""
        return "GhostScript"

    def _find_binary(self) -> Path | None:
        """Look in ``GS_DIR`` first, then on the system ``PATH``."""
        executable = _gs_binary_name()

        bundled = GS_DIR / executable
        if bundled.is_file():
            return bundled

        on_path = shutil.which(executable)
        if on_path:
            return Path(on_path)
        return None

    def _strategies(self) -> list[ResolutionStrategy]:
        """Return the strategies in the order named in the class docstring."""
        ordered: list[ResolutionStrategy] = [
            SystemGhostScriptStrategy(),
            PackageManagerStrategy(),
            GitHubArchiveStrategy(),
        ]
        return ordered
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""JRE (Adoptium Temurin) dependency resolution."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import shutil
|
|
6
|
+
import tarfile
|
|
7
|
+
import tempfile
|
|
8
|
+
import zipfile
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from pdfa_parser.dependencies._base import (
|
|
12
|
+
BIN_DIR,
|
|
13
|
+
NORM_ARCH,
|
|
14
|
+
OS,
|
|
15
|
+
Dependency,
|
|
16
|
+
ResolutionStrategy,
|
|
17
|
+
chmod_executable,
|
|
18
|
+
download,
|
|
19
|
+
ensure_dir,
|
|
20
|
+
log,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
# Local directory the downloaded JRE is extracted into and searched in.
JRE_DIR = BIN_DIR / "jre"

# Feature-version segment inserted into the Adoptium download URL.
ADOPTIUM_JRE_VERSION = "21"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _java_binary_name() -> str:
    """Return the Java launcher file name for the current OS."""
    if OS == "windows":
        return "java.exe"
    return "java"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _find_java_in(root: Path) -> Path | None:
    """Return the first ``java`` binary file found anywhere below *root*.

    Returns ``None`` when the recursive search yields no regular file with
    the expected name.
    """
    executable = _java_binary_name()
    matches = (hit for hit in root.rglob(executable) if hit.is_file())
    return next(matches, None)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _adoptium_url() -> str:
    """Build the Adoptium API URL for the JRE binary of this OS and CPU.

    ``darwin`` is sent as ``mac``; every other ``OS`` value is used as-is.
    """
    platform_name = OS
    if platform_name == "darwin":
        platform_name = "mac"

    segments = (
        "https://api.adoptium.net/v3/binary/latest",
        ADOPTIUM_JRE_VERSION,
        "ga",
        platform_name,
        NORM_ARCH,
        "jre",
        "hotspot",
        "normal",
        "eclipse",
    )
    return "/".join(segments)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# ── Strategies ───────────────────────────────────────────────────────
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class SystemJavaStrategy(ResolutionStrategy):
    """Resolve Java from a ``java`` executable that is already on ``PATH``."""

    def try_resolve(self) -> Path | None:
        """Return the ``PATH`` location of ``java``, or ``None`` if absent."""
        found = shutil.which("java")
        if found is not None:
            log.info("Using system Java: %s", found)
            return Path(found)
        return None
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class AdoptiumDownloadStrategy(ResolutionStrategy):
    """Download the Adoptium Temurin JRE."""

    def try_resolve(self) -> Path | None:
        """Download the JRE archive and unpack it into ``JRE_DIR``.

        The archive is fetched into a temporary directory (``.zip`` on
        Windows, ``.tar.gz`` elsewhere) and extracted into ``JRE_DIR``.

        Returns:
            The ``java`` binary found below ``JRE_DIR``, or ``None`` when the
            download or extraction fails or no ``java`` binary is present
            afterwards.  Failures are logged as warnings rather than raised,
            as the :class:`ResolutionStrategy` contract requires.
        """
        url = _adoptium_url()

        try:
            ensure_dir(JRE_DIR)
            with tempfile.TemporaryDirectory() as tmp:
                tmp_path = Path(tmp)
                if OS == "windows":
                    archive = tmp_path / "jre.zip"
                    download(url, archive)
                    with zipfile.ZipFile(archive) as zf:
                        zf.extractall(JRE_DIR)
                else:
                    archive = tmp_path / "jre.tar.gz"
                    download(url, archive)
                    with tarfile.open(archive, "r:gz") as tar:
                        tar.extractall(JRE_DIR, filter="data")
        except Exception as exc:  # noqa: BLE001
            # Network, archive and filesystem errors all mean "this strategy
            # cannot help"; report and let the caller decide what to do.
            log.warning("Adoptium JRE download failed: %s", exc)
            return None

        java = _find_java_in(JRE_DIR)
        if java is None:
            log.warning("java not found after JRE extraction")
            return None
        chmod_executable(java)
        return java
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
# ── Dependency ───────────────────────────────────────────────────────
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class JREDependency(Dependency):
    """Resolves Java: system PATH → Adoptium download."""

    @property
    def name(self) -> str:
        """Human-readable name of this dependency."""
        return "JRE"

    def _find_binary(self) -> Path | None:
        """Search ``JRE_DIR`` for an already extracted ``java`` binary."""
        located = _find_java_in(JRE_DIR)
        return located

    def _strategies(self) -> list[ResolutionStrategy]:
        """Return the strategies in the order named in the class docstring."""
        ordered: list[ResolutionStrategy] = [
            SystemJavaStrategy(),
            AdoptiumDownloadStrategy(),
        ]
        return ordered
|