cvfile 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cvfile-0.1.0/.gitignore +9 -0
- cvfile-0.1.0/PKG-INFO +74 -0
- cvfile-0.1.0/README.md +45 -0
- cvfile-0.1.0/examples/build_python_sample.py +70 -0
- cvfile-0.1.0/pyproject.toml +57 -0
- cvfile-0.1.0/src/cvfile/__init__.py +49 -0
- cvfile-0.1.0/src/cvfile/_constants.py +26 -0
- cvfile-0.1.0/src/cvfile/_pdf.py +255 -0
- cvfile-0.1.0/src/cvfile/_security.py +183 -0
- cvfile-0.1.0/src/cvfile/_types.py +89 -0
- cvfile-0.1.0/src/cvfile/_xmp.py +240 -0
- cvfile-0.1.0/src/cvfile/detect.py +25 -0
- cvfile-0.1.0/src/cvfile/embed/__init__.py +39 -0
- cvfile-0.1.0/src/cvfile/embed/_chunk.py +130 -0
- cvfile-0.1.0/src/cvfile/embed/_embed.py +66 -0
- cvfile-0.1.0/src/cvfile/embed/_embeddings.py +176 -0
- cvfile-0.1.0/src/cvfile/embed/_huggingface.py +88 -0
- cvfile-0.1.0/src/cvfile/embed/_search.py +78 -0
- cvfile-0.1.0/src/cvfile/extract.py +65 -0
- cvfile-0.1.0/src/cvfile/inspect.py +25 -0
- cvfile-0.1.0/src/cvfile/integrations/__init__.py +5 -0
- cvfile-0.1.0/src/cvfile/integrations/langchain.py +144 -0
- cvfile-0.1.0/src/cvfile/integrations/llamaindex.py +113 -0
- cvfile-0.1.0/src/cvfile/pack.py +173 -0
- cvfile-0.1.0/src/cvfile/server/__init__.py +31 -0
- cvfile-0.1.0/src/cvfile/server/_conneg.py +129 -0
- cvfile-0.1.0/src/cvfile/server/_handler.py +176 -0
- cvfile-0.1.0/src/cvfile/server/asgi.py +146 -0
- cvfile-0.1.0/src/cvfile/server/wsgi.py +125 -0
- cvfile-0.1.0/src/cvfile/validate.py +140 -0
- cvfile-0.1.0/tests/test_embed.py +134 -0
- cvfile-0.1.0/tests/test_interop.py +66 -0
- cvfile-0.1.0/tests/test_round_trip.py +105 -0
- cvfile-0.1.0/tests/test_security.py +60 -0
- cvfile-0.1.0/tests/test_server_conneg.py +74 -0
- cvfile-0.1.0/tests/test_server_handlers.py +195 -0
cvfile-0.1.0/.gitignore
ADDED
cvfile-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cvfile
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Reference SDK for the .cv open file format — pack, extract, inspect, validate.
|
|
5
|
+
Project-URL: Homepage, https://cvfile.org
|
|
6
|
+
Project-URL: Repository, https://github.com/cvfile/cv
|
|
7
|
+
Author: cvfile.org
|
|
8
|
+
License: Apache-2.0
|
|
9
|
+
Keywords: ats,cv,embeddings,markdown,pdf,pdfa,rag,resume
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Requires-Python: >=3.10
|
|
18
|
+
Requires-Dist: pypdf<6,>=5.0
|
|
19
|
+
Provides-Extra: dev
|
|
20
|
+
Requires-Dist: cbor2>=5.6; extra == 'dev'
|
|
21
|
+
Requires-Dist: httpx>=0.27; extra == 'dev'
|
|
22
|
+
Requires-Dist: mypy>=1.13; extra == 'dev'
|
|
23
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
24
|
+
Requires-Dist: ruff>=0.7; extra == 'dev'
|
|
25
|
+
Provides-Extra: embed
|
|
26
|
+
Requires-Dist: cbor2>=5.6; extra == 'embed'
|
|
27
|
+
Requires-Dist: httpx>=0.27; extra == 'embed'
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
# `cvfile`
|
|
31
|
+
|
|
32
|
+
Reference Python SDK for the [`.cv`](https://cvfile.org) open file format.
|
|
33
|
+
|
|
34
|
+
## Install
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install cvfile
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Pack
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from cvfile import pack
|
|
44
|
+
|
|
45
|
+
with open("resume.pdf", "rb") as f:
|
|
46
|
+
pdf_bytes = f.read()
|
|
47
|
+
with open("resume.md") as f:
|
|
48
|
+
md = f.read()
|
|
49
|
+
|
|
50
|
+
cv_bytes = pack(
|
|
51
|
+
pdf=pdf_bytes,
|
|
52
|
+
markdown=md,
|
|
53
|
+
metadata={"primary_language": "en"},
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
with open("resume.cv", "wb") as f:
|
|
57
|
+
f.write(cv_bytes)
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Extract
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
from cvfile import extract, extract_markdown
|
|
64
|
+
|
|
65
|
+
file = extract(open("resume.cv", "rb").read())
|
|
66
|
+
print(file.metadata.version) # "0.1"
|
|
67
|
+
print([p.name for p in file.payloads]) # ['resume.md', 'resume.html']
|
|
68
|
+
|
|
69
|
+
md = extract_markdown(open("resume.cv", "rb").read())
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## License
|
|
73
|
+
|
|
74
|
+
Apache-2.0.
|
cvfile-0.1.0/README.md
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# `cvfile`
|
|
2
|
+
|
|
3
|
+
Reference Python SDK for the [`.cv`](https://cvfile.org) open file format.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install cvfile
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Pack
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
from cvfile import pack
|
|
15
|
+
|
|
16
|
+
with open("resume.pdf", "rb") as f:
|
|
17
|
+
pdf_bytes = f.read()
|
|
18
|
+
with open("resume.md") as f:
|
|
19
|
+
md = f.read()
|
|
20
|
+
|
|
21
|
+
cv_bytes = pack(
|
|
22
|
+
pdf=pdf_bytes,
|
|
23
|
+
markdown=md,
|
|
24
|
+
metadata={"primary_language": "en"},
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
with open("resume.cv", "wb") as f:
|
|
28
|
+
f.write(cv_bytes)
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Extract
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
from cvfile import extract, extract_markdown
|
|
35
|
+
|
|
36
|
+
file = extract(open("resume.cv", "rb").read())
|
|
37
|
+
print(file.metadata.version) # "0.1"
|
|
38
|
+
print([p.name for p in file.payloads]) # ['resume.md', 'resume.html']
|
|
39
|
+
|
|
40
|
+
md = extract_markdown(open("resume.cv", "rb").read())
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## License
|
|
44
|
+
|
|
45
|
+
Apache-2.0.
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Build a Python-produced .cv fixture for cross-SDK interop testing.
|
|
2
|
+
|
|
3
|
+
Outputs to packages/sdk-js/tests/fixtures/python-produced.cv so the JS test
|
|
4
|
+
suite can verify it extracts identically.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import io
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
import pypdf
|
|
13
|
+
|
|
14
|
+
from cvfile import extract, inspect, pack, validate
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
SAMPLE_MD = """# Marie Curie
|
|
18
|
+
|
|
19
|
+
Physicist and chemist · Paris, France
|
|
20
|
+
|
|
21
|
+
## Notable
|
|
22
|
+
|
|
23
|
+
* Discovered polonium and radium
|
|
24
|
+
* Two Nobel Prizes (Physics 1903, Chemistry 1911)
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
SAMPLE_HTML = """<!doctype html>
|
|
28
|
+
<html lang="en"><head><meta charset="utf-8"><title>Marie Curie</title></head>
|
|
29
|
+
<body><h1>Marie Curie</h1><p>Physicist and chemist.</p></body></html>"""
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def make_blank_pdf() -> bytes:
    """Return the serialized bytes of a PDF holding one blank 300x400 page."""
    blank_doc = pypdf.PdfWriter()
    blank_doc.add_blank_page(width=300, height=400)
    with io.BytesIO() as sink:
        blank_doc.write(sink)
        return sink.getvalue()
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def main() -> None:
    """Pack the Marie Curie sample into a .cv file, write it, and sanity-check it.

    The fixture is written to ``packages/sdk-js/tests/fixtures/python-produced.cv``
    under the repository root. After writing, the bytes are run back through
    ``extract``, ``inspect`` and ``validate`` and a short summary is printed.

    Raises:
        AssertionError: if the primary payload reported by ``inspect`` is not
            ``resume.md``.
    """
    # NOTE(review): parents[3] assumes this script sits three directories
    # below the repository root -- confirm against the repo layout.
    repo_root = Path(__file__).resolve().parents[3]
    out_dir = repo_root / "packages" / "sdk-js" / "tests" / "fixtures"
    out_dir.mkdir(parents=True, exist_ok=True)

    cv = pack(
        pdf=make_blank_pdf(),
        markdown=SAMPLE_MD,
        html=SAMPLE_HTML,
        json_resume={"basics": {"name": "Marie Curie"}},
        metadata={"primary_language": "en", "generator": "cvfile-py-examples/marie-curie"},
    )

    out_path = out_dir / "python-produced.cv"
    out_path.write_bytes(cv)
    print(f"Wrote {out_path} ({len(cv)} bytes)")

    # Self-verify the fixture is sane.
    file = extract(cv)
    print(f"  payloads: {[p.name for p in file.payloads]}")
    print(f"  metadata: version={file.metadata.version} lang={file.metadata.primary_language}")

    meta = inspect(cv)
    # Raise explicitly rather than using `assert`, so the check still runs
    # when Python is started with -O (which strips assert statements).
    if meta.primary_payload != "resume.md":
        raise AssertionError(f"unexpected primary payload: {meta.primary_payload!r}")

    report = validate(cv)
    print(f"  validate: ok={report.ok} issues={len(report.issues)}")
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
if __name__ == "__main__":
|
|
70
|
+
main()
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "cvfile"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Reference SDK for the .cv open file format — pack, extract, inspect, validate."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { text = "Apache-2.0" }
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [{ name = "cvfile.org" }]
|
|
13
|
+
keywords = ["cv", "resume", "pdf", "pdfa", "markdown", "embeddings", "rag", "ats"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 3 - Alpha",
|
|
16
|
+
"License :: OSI Approved :: Apache Software License",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Programming Language :: Python :: 3.10",
|
|
19
|
+
"Programming Language :: Python :: 3.11",
|
|
20
|
+
"Programming Language :: Python :: 3.12",
|
|
21
|
+
"Programming Language :: Python :: 3.13",
|
|
22
|
+
]
|
|
23
|
+
dependencies = [
|
|
24
|
+
"pypdf>=5.0,<6",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[project.urls]
|
|
28
|
+
Homepage = "https://cvfile.org"
|
|
29
|
+
Repository = "https://github.com/cvfile/cv"
|
|
30
|
+
|
|
31
|
+
[project.optional-dependencies]
|
|
32
|
+
embed = [
|
|
33
|
+
"cbor2>=5.6",
|
|
34
|
+
"httpx>=0.27",
|
|
35
|
+
]
|
|
36
|
+
dev = [
|
|
37
|
+
"pytest>=8.0",
|
|
38
|
+
"ruff>=0.7",
|
|
39
|
+
"mypy>=1.13",
|
|
40
|
+
"cbor2>=5.6",
|
|
41
|
+
"httpx>=0.27",
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
[tool.hatch.build.targets.wheel]
|
|
45
|
+
packages = ["src/cvfile"]
|
|
46
|
+
|
|
47
|
+
[tool.ruff]
|
|
48
|
+
line-length = 120
|
|
49
|
+
target-version = "py310"
|
|
50
|
+
|
|
51
|
+
[tool.ruff.lint]
|
|
52
|
+
select = ["E", "F", "I", "B", "UP", "N", "SIM", "RUF"]
|
|
53
|
+
|
|
54
|
+
[tool.mypy]
|
|
55
|
+
python_version = "3.12"
|
|
56
|
+
strict = true
|
|
57
|
+
warn_unreachable = true
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Reference SDK for the .cv open file format."""
|
|
2
|
+
|
|
3
|
+
from cvfile._constants import (
|
|
4
|
+
CV_NAMESPACE_PREFIX,
|
|
5
|
+
CV_NAMESPACE_URI,
|
|
6
|
+
CV_SPEC_VERSION,
|
|
7
|
+
DEFAULT_PAYLOAD_NAMES,
|
|
8
|
+
PAYLOAD_MIME_TYPES,
|
|
9
|
+
)
|
|
10
|
+
from cvfile._types import (
|
|
11
|
+
AlternateMeta,
|
|
12
|
+
CvFile,
|
|
13
|
+
CvMetadata,
|
|
14
|
+
EmbeddingSpaceSummary,
|
|
15
|
+
ExtractedPayload,
|
|
16
|
+
IntegrityEntry,
|
|
17
|
+
Payload,
|
|
18
|
+
ValidationIssue,
|
|
19
|
+
ValidationReport,
|
|
20
|
+
)
|
|
21
|
+
from cvfile.detect import is_cv_file
|
|
22
|
+
from cvfile.extract import extract, extract_html, extract_markdown
|
|
23
|
+
from cvfile.inspect import inspect
|
|
24
|
+
from cvfile.pack import pack
|
|
25
|
+
from cvfile.validate import validate
|
|
26
|
+
|
|
27
|
+
__all__ = [
|
|
28
|
+
"CV_NAMESPACE_PREFIX",
|
|
29
|
+
"CV_NAMESPACE_URI",
|
|
30
|
+
"CV_SPEC_VERSION",
|
|
31
|
+
"DEFAULT_PAYLOAD_NAMES",
|
|
32
|
+
"PAYLOAD_MIME_TYPES",
|
|
33
|
+
"AlternateMeta",
|
|
34
|
+
"CvFile",
|
|
35
|
+
"CvMetadata",
|
|
36
|
+
"EmbeddingSpaceSummary",
|
|
37
|
+
"ExtractedPayload",
|
|
38
|
+
"IntegrityEntry",
|
|
39
|
+
"Payload",
|
|
40
|
+
"ValidationIssue",
|
|
41
|
+
"ValidationReport",
|
|
42
|
+
"extract",
|
|
43
|
+
"extract_html",
|
|
44
|
+
"extract_markdown",
|
|
45
|
+
"inspect",
|
|
46
|
+
"is_cv_file",
|
|
47
|
+
"pack",
|
|
48
|
+
"validate",
|
|
49
|
+
]
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Constants shared across the cvfile SDK."""
|
|
2
|
+
|
|
3
|
+
# Version string of the .cv specification this SDK targets.
CV_SPEC_VERSION = "0.1"

# Namespace URI and prefix for cv-specific metadata.
CV_NAMESPACE_URI = "http://ns.cvfile.org/cv/1.0/"
CV_NAMESPACE_PREFIX = "cv"

# Default generator identifier, derived from the spec version.
DEFAULT_GENERATOR = "cvfile-py/" + CV_SPEC_VERSION

# Default attachment file name for each payload kind.
DEFAULT_PAYLOAD_NAMES = dict(
    markdown="resume.md",
    html="resume.html",
    json="resume.json",
    embeddings="embeddings.cbor",
)

# MIME type for each payload kind, plus the PDF and .cv container types.
PAYLOAD_MIME_TYPES = dict(
    markdown="text/markdown",
    html="text/html",
    json="application/json",
    embeddings="application/vnd.cv.embeddings+cbor",
    pdf="application/pdf",
    cv="application/vnd.cv+pdf",
)

# 16 MiB.
MAX_PAYLOAD_BYTES_DEFAULT = 16 << 20
|
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
"""Thin pypdf wrapper for /AF Associated Files and /Metadata streams.
|
|
2
|
+
|
|
3
|
+
This module isolates pypdf so we can swap to pikepdf later without changing the
|
|
4
|
+
public API. PDF/A-3 conformance work that requires deeper PDF rewriting (font
|
|
5
|
+
embedding, ICC profile injection on arbitrary input PDFs) will land here.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import io
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from datetime import datetime, timezone
|
|
13
|
+
from typing import Literal
|
|
14
|
+
|
|
15
|
+
from pypdf import PdfReader, PdfWriter
|
|
16
|
+
from pypdf.generic import (
|
|
17
|
+
ArrayObject,
|
|
18
|
+
ByteStringObject,
|
|
19
|
+
DecodedStreamObject,
|
|
20
|
+
DictionaryObject,
|
|
21
|
+
IndirectObject,
|
|
22
|
+
NameObject,
|
|
23
|
+
NumberObject,
|
|
24
|
+
StreamObject,
|
|
25
|
+
TextStringObject,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
AFRelationshipKind = Literal["Alternative", "Data", "Supplement"]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass(frozen=True, slots=True)
|
|
32
|
+
class RawPayload:
|
|
33
|
+
name: str
|
|
34
|
+
mime_type: str
|
|
35
|
+
relationship: AFRelationshipKind
|
|
36
|
+
bytes_: bytes
|
|
37
|
+
description: str | None = None
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def load_writer(pdf_bytes: bytes) -> PdfWriter:
    """Parse ``pdf_bytes`` and return a ``PdfWriter`` cloned from that document."""
    source = io.BytesIO(pdf_bytes)
    return PdfWriter(clone_from=PdfReader(source))
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def add_associated_file(
    writer: PdfWriter,
    *,
    name: str,
    data: bytes,
    mime_type: str,
    description: str,
    relationship: AFRelationshipKind,
    creation_date: datetime,
    modification_date: datetime,
) -> None:
    """Embed ``data`` in the document and list it in the catalog's /AF array.

    Two objects are added to ``writer``: an /EmbeddedFile stream carrying the
    bytes, and a /Filespec dictionary pointing at it. The file spec reference
    is appended to the catalog's /AF array, which is created if the catalog
    does not already hold one.

    Args:
        writer: Writer to mutate in place.
        name: File name stored under both /F and /UF of the file spec.
        data: Content of the embedded file.
        mime_type: MIME type stored as the stream's /Subtype name.
        description: Text stored under /Desc.
        relationship: Value stored under /AFRelationship.
        creation_date: Stored as /Params /CreationDate.
        modification_date: Stored as /Params /ModDate.
    """
    file_params = DictionaryObject(
        {
            NameObject("/CreationDate"): TextStringObject(_pdf_date(creation_date)),
            NameObject("/ModDate"): TextStringObject(_pdf_date(modification_date)),
            NameObject("/Size"): NumberObject(len(data)),
        }
    )

    payload_stream = DecodedStreamObject()
    payload_stream.set_data(data)
    payload_stream.update(
        {
            NameObject("/Type"): NameObject("/EmbeddedFile"),
            NameObject("/Subtype"): NameObject(_mime_to_name(mime_type)),
            NameObject("/Length"): NumberObject(len(data)),
            NameObject("/Params"): file_params,
        }
    )
    # Relies on underscore-prefixed writer members (_add_object, _root_object).
    stream_ref = writer._add_object(payload_stream)

    # /F and /UF of the /EF dictionary point at the same stream.
    ef_entries = DictionaryObject({NameObject("/F"): stream_ref, NameObject("/UF"): stream_ref})
    spec = DictionaryObject(
        {
            NameObject("/Type"): NameObject("/Filespec"),
            NameObject("/F"): TextStringObject(name),
            NameObject("/UF"): TextStringObject(name),
            NameObject("/Desc"): TextStringObject(description),
            NameObject("/AFRelationship"): NameObject(f"/{relationship}"),
            NameObject("/EF"): ef_entries,
        }
    )
    spec_ref = writer._add_object(spec)

    root = writer._root_object
    existing = root.get(NameObject("/AF"))
    if isinstance(existing, IndirectObject):
        existing = existing.get_object()
    if isinstance(existing, ArrayObject):
        associated = existing
    else:
        # No usable /AF array yet: install a fresh one on the catalog.
        associated = ArrayObject()
        root[NameObject("/AF")] = associated
    associated.append(spec_ref)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def set_metadata_xml(writer: PdfWriter, xml: str) -> None:
    """Store ``xml`` (UTF-8 encoded) as the catalog's /Metadata stream.

    A new /Metadata stream with /Subtype /XML is added to ``writer`` and the
    catalog's /Metadata entry is pointed at it, replacing any earlier entry.
    """
    payload = xml.encode("utf-8")

    xmp_stream = DecodedStreamObject()
    xmp_stream.set_data(payload)
    header = {
        NameObject("/Type"): NameObject("/Metadata"),
        NameObject("/Subtype"): NameObject("/XML"),
        NameObject("/Length"): NumberObject(len(payload)),
    }
    xmp_stream.update(header)

    writer._root_object[NameObject("/Metadata")] = writer._add_object(xmp_stream)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def write_to_bytes(writer: PdfWriter) -> bytes:
    """Serialize ``writer`` to bytes, making sure a trailer /ID is set first."""
    _ensure_trailer_id(writer)
    with io.BytesIO() as sink:
        writer.write(sink)
        return sink.getvalue()
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _ensure_trailer_id(writer: PdfWriter) -> None:
|
|
124
|
+
"""Set the trailer /ID array (PDF/A-3u rule 6.1.3)."""
|
|
125
|
+
import secrets
|
|
126
|
+
|
|
127
|
+
if getattr(writer, "_ID", None):
|
|
128
|
+
return
|
|
129
|
+
id_hex = secrets.token_hex(16).upper().encode("ascii")
|
|
130
|
+
writer._ID = ArrayObject([ByteStringObject(id_hex), ByteStringObject(id_hex)])
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def read_associated_files(reader: PdfReader) -> list[RawPayload]:
    """Return the payloads listed in the document catalog's /AF array.

    An empty list is returned when the trailer has no /Root, the catalog is
    not a dictionary, or /AF is missing or not an array. Array entries that
    cannot be parsed as a file specification are dropped.
    """
    root = reader.trailer.get("/Root")
    if root is None:
        return []
    if isinstance(root, IndirectObject):
        root = root.get_object()
    if not isinstance(root, DictionaryObject):
        return []

    entries = root.get("/AF")
    if entries is None:
        return []
    if isinstance(entries, IndirectObject):
        entries = entries.get_object()
    if not isinstance(entries, ArrayObject):
        return []

    parsed = (
        _parse_filespec(item.get_object() if isinstance(item, IndirectObject) else item)
        for item in entries
    )
    return [payload for payload in parsed if payload]
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def read_metadata_xml(reader: PdfReader) -> str | None:
    """Return the text of the catalog's /Metadata stream, or None if unavailable.

    None is returned when the trailer has no /Root, the catalog is not a
    dictionary, or /Metadata is missing or not a stream. Byte content is
    decoded as UTF-8 with undecodable bytes replaced.
    """
    catalog = reader.trailer.get("/Root")
    if catalog is None:
        return None
    if isinstance(catalog, IndirectObject):
        catalog = catalog.get_object()
    if not isinstance(catalog, DictionaryObject):
        return None

    xmp = catalog.get("/Metadata")
    if xmp is None:
        return None
    if isinstance(xmp, IndirectObject):
        xmp = xmp.get_object()
    if not isinstance(xmp, StreamObject):
        return None

    content = xmp.get_data()
    return content if isinstance(content, str) else content.decode("utf-8", errors="replace")
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _parse_filespec(filespec: DictionaryObject) -> RawPayload | None:
    """Turn one file specification dictionary into a ``RawPayload``.

    Returns None when ``filespec`` is not a dictionary, has no usable /EF
    dictionary, has no embedded stream under /UF or /F, or carries no file
    name. Missing optional entries fall back to defaults: MIME type
    ``application/octet-stream``, relationship ``Supplement``, description None.
    """
    if not isinstance(filespec, DictionaryObject):
        return None

    embedded = filespec.get("/EF")
    if embedded is None:
        return None
    if isinstance(embedded, IndirectObject):
        embedded = embedded.get_object()
    if not isinstance(embedded, DictionaryObject):
        return None

    # Prefer the /UF entry, falling back to /F.
    stream = embedded.get("/UF") or embedded.get("/F")
    if stream is None:
        return None
    if isinstance(stream, IndirectObject):
        stream = stream.get_object()
    if not isinstance(stream, StreamObject):
        return None

    content = stream.get_data()
    if isinstance(content, str):
        content = content.encode("latin-1", errors="replace")

    name_obj = filespec.get("/UF") or filespec.get("/F")
    if name_obj is None:
        return None

    subtype = stream.get("/Subtype") or filespec.get("/Subtype")
    if subtype:
        mime_type = _name_to_mime(str(subtype))
    else:
        mime_type = "application/octet-stream"

    desc_obj = filespec.get("/Desc")

    # Anything missing or unrecognised is treated as "Supplement".
    relationship: AFRelationshipKind = "Supplement"
    rel_obj = filespec.get("/AFRelationship")
    if rel_obj:
        candidate = str(rel_obj).lstrip("/")
        if candidate in ("Alternative", "Data", "Supplement"):
            relationship = candidate  # type: ignore[assignment]

    return RawPayload(
        name=str(name_obj),
        mime_type=mime_type,
        relationship=relationship,
        bytes_=bytes(content),
        description=str(desc_obj) if desc_obj else None,
    )
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def _mime_to_name(mime: str) -> str:
|
|
213
|
+
"""Wrap a MIME type for use as a PDF Name. pypdf's NameObject handles
|
|
214
|
+
the per-character #XX escaping itself when serializing; we MUST NOT
|
|
215
|
+
pre-escape, otherwise the '#' of our own escape gets re-escaped to '#23'.
|
|
216
|
+
"""
|
|
217
|
+
return "/" + mime
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _name_to_mime(name: str) -> str:
|
|
221
|
+
s = name.lstrip("/")
|
|
222
|
+
out = []
|
|
223
|
+
i = 0
|
|
224
|
+
while i < len(s):
|
|
225
|
+
c = s[i]
|
|
226
|
+
if c == "#" and i + 2 < len(s):
|
|
227
|
+
try:
|
|
228
|
+
out.append(chr(int(s[i + 1 : i + 3], 16)))
|
|
229
|
+
i += 3
|
|
230
|
+
continue
|
|
231
|
+
except ValueError:
|
|
232
|
+
pass
|
|
233
|
+
out.append(c)
|
|
234
|
+
i += 1
|
|
235
|
+
return "".join(out)
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def _pdf_date(dt: datetime) -> str:
|
|
239
|
+
"""Format a datetime as a PDF date string (D:YYYYMMDDHHmmSS+HH'mm')."""
|
|
240
|
+
if dt.tzinfo is None:
|
|
241
|
+
dt = dt.replace(tzinfo=timezone.utc)
|
|
242
|
+
return dt.astimezone(timezone.utc).strftime("D:%Y%m%d%H%M%SZ")
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
__all__ = [
|
|
246
|
+
"AFRelationshipKind",
|
|
247
|
+
"ByteStringObject",
|
|
248
|
+
"RawPayload",
|
|
249
|
+
"add_associated_file",
|
|
250
|
+
"load_writer",
|
|
251
|
+
"read_associated_files",
|
|
252
|
+
"read_metadata_xml",
|
|
253
|
+
"set_metadata_xml",
|
|
254
|
+
"write_to_bytes",
|
|
255
|
+
]
|