cvfile 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cvfile/__init__.py ADDED
@@ -0,0 +1,49 @@
1
+ """Reference SDK for the .cv open file format."""
2
+
3
+ from cvfile._constants import (
4
+ CV_NAMESPACE_PREFIX,
5
+ CV_NAMESPACE_URI,
6
+ CV_SPEC_VERSION,
7
+ DEFAULT_PAYLOAD_NAMES,
8
+ PAYLOAD_MIME_TYPES,
9
+ )
10
+ from cvfile._types import (
11
+ AlternateMeta,
12
+ CvFile,
13
+ CvMetadata,
14
+ EmbeddingSpaceSummary,
15
+ ExtractedPayload,
16
+ IntegrityEntry,
17
+ Payload,
18
+ ValidationIssue,
19
+ ValidationReport,
20
+ )
21
+ from cvfile.detect import is_cv_file
22
+ from cvfile.extract import extract, extract_html, extract_markdown
23
+ from cvfile.inspect import inspect
24
+ from cvfile.pack import pack
25
+ from cvfile.validate import validate
26
+
27
# Names re-exported as the package's public API, grouped by the module
# they are imported from above.
__all__ = [
    # cvfile._constants
    "CV_NAMESPACE_PREFIX",
    "CV_NAMESPACE_URI",
    "CV_SPEC_VERSION",
    "DEFAULT_PAYLOAD_NAMES",
    "PAYLOAD_MIME_TYPES",
    # cvfile._types
    "AlternateMeta",
    "CvFile",
    "CvMetadata",
    "EmbeddingSpaceSummary",
    "ExtractedPayload",
    "IntegrityEntry",
    "Payload",
    "ValidationIssue",
    "ValidationReport",
    # entry points (extract / inspect / detect / pack / validate modules)
    "extract",
    "extract_html",
    "extract_markdown",
    "inspect",
    "is_cv_file",
    "pack",
    "validate",
]
cvfile/_constants.py ADDED
@@ -0,0 +1,26 @@
1
+ """Constants shared across the cvfile SDK."""
2
+
3
# Version string of the .cv specification this SDK targets.
CV_SPEC_VERSION = "0.1"

# Namespace URI and its conventional prefix for cv-specific metadata.
CV_NAMESPACE_URI = "http://ns.cvfile.org/cv/1.0/"
CV_NAMESPACE_PREFIX = "cv"

# Generator tag; derived from the spec version above.
DEFAULT_GENERATOR = "cvfile-py/" + CV_SPEC_VERSION

# Default attachment file name for each payload kind.
DEFAULT_PAYLOAD_NAMES = dict(
    markdown="resume.md",
    html="resume.html",
    json="resume.json",
    embeddings="embeddings.cbor",
)

# MIME type for each payload kind (plus the PDF container types).
PAYLOAD_MIME_TYPES = dict(
    markdown="text/markdown",
    html="text/html",
    json="application/json",
    embeddings="application/vnd.cv.embeddings+cbor",
    pdf="application/pdf",
    cv="application/vnd.cv+pdf",
)

# 16 MiB.
MAX_PAYLOAD_BYTES_DEFAULT = 16 * 1024**2
cvfile/_pdf.py ADDED
@@ -0,0 +1,255 @@
1
+ """Thin pypdf wrapper for /AF Associated Files and /Metadata streams.
2
+
3
+ This module isolates pypdf so we can swap to pikepdf later without changing the
4
+ public API. PDF/A-3 conformance work that requires deeper PDF rewriting (font
5
+ embedding, ICC profile injection on arbitrary input PDFs) will land here.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import io
11
+ from dataclasses import dataclass
12
+ from datetime import datetime, timezone
13
+ from typing import Literal
14
+
15
+ from pypdf import PdfReader, PdfWriter
16
+ from pypdf.generic import (
17
+ ArrayObject,
18
+ ByteStringObject,
19
+ DecodedStreamObject,
20
+ DictionaryObject,
21
+ IndirectObject,
22
+ NameObject,
23
+ NumberObject,
24
+ StreamObject,
25
+ TextStringObject,
26
+ )
27
+
28
+ AFRelationshipKind = Literal["Alternative", "Data", "Supplement"]
29
+
30
+
31
+ @dataclass(frozen=True, slots=True)
32
+ class RawPayload:
33
+ name: str
34
+ mime_type: str
35
+ relationship: AFRelationshipKind
36
+ bytes_: bytes
37
+ description: str | None = None
38
+
39
+
40
def load_writer(pdf_bytes: bytes) -> PdfWriter:
    """Parse `pdf_bytes` and return a PdfWriter cloned from that document."""
    source = io.BytesIO(pdf_bytes)
    return PdfWriter(clone_from=PdfReader(source))
44
+
45
+
46
def add_associated_file(
    writer: PdfWriter,
    *,
    name: str,
    data: bytes,
    mime_type: str,
    description: str,
    relationship: AFRelationshipKind,
    creation_date: datetime,
    modification_date: datetime,
) -> None:
    """Attach `data` as an Associated File on the document catalog (/AF).

    Adds two new objects to `writer`: an /EmbeddedFile stream holding `data`
    and a /Filespec dictionary pointing at it. The filespec reference is then
    appended to the catalog's /AF array, which is created if it is missing or
    is not an array.

    Args:
        writer: Target writer; modified in place.
        name: File name stored in both /F and /UF.
        data: Raw bytes of the attachment.
        mime_type: MIME type, stored as the stream's /Subtype name.
        description: Text stored in the filespec's /Desc.
        relationship: Value written as /AFRelationship.
        creation_date: Stored as /Params /CreationDate.
        modification_date: Stored as /Params /ModDate.
    """
    # 1. The embedded file stream and its /Params dictionary.
    params = DictionaryObject(
        {
            NameObject("/CreationDate"): TextStringObject(_pdf_date(creation_date)),
            NameObject("/ModDate"): TextStringObject(_pdf_date(modification_date)),
            NameObject("/Size"): NumberObject(len(data)),
        }
    )
    file_stream = DecodedStreamObject()
    file_stream.set_data(data)
    file_stream.update(
        {
            NameObject("/Type"): NameObject("/EmbeddedFile"),
            NameObject("/Subtype"): NameObject(_mime_to_name(mime_type)),
            NameObject("/Length"): NumberObject(len(data)),
            NameObject("/Params"): params,
        }
    )
    stream_ref = writer._add_object(file_stream)

    # 2. The file specification; /F and /UF both point at the same stream.
    embedded_files = DictionaryObject(
        {NameObject("/F"): stream_ref, NameObject("/UF"): stream_ref}
    )
    spec = DictionaryObject(
        {
            NameObject("/Type"): NameObject("/Filespec"),
            NameObject("/F"): TextStringObject(name),
            NameObject("/UF"): TextStringObject(name),
            NameObject("/Desc"): TextStringObject(description),
            NameObject("/AFRelationship"): NameObject(f"/{relationship}"),
            NameObject("/EF"): embedded_files,
        }
    )
    filespec_ref = writer._add_object(spec)

    # 3. Register the filespec in the catalog's /AF array.
    catalog = writer._root_object
    existing = catalog.get(NameObject("/AF"))
    if isinstance(existing, IndirectObject):
        existing = existing.get_object()
    if isinstance(existing, ArrayObject):
        associated = existing
    else:
        # Missing or malformed /AF: start a fresh array on the catalog.
        associated = ArrayObject()
        catalog[NameObject("/AF")] = associated
    associated.append(filespec_ref)
99
+
100
+
101
def set_metadata_xml(writer: PdfWriter, xml: str) -> None:
    """Point the catalog's /Metadata at a new stream holding `xml` as UTF-8."""
    encoded = xml.encode("utf-8")
    header = {
        NameObject("/Type"): NameObject("/Metadata"),
        NameObject("/Subtype"): NameObject("/XML"),
        NameObject("/Length"): NumberObject(len(encoded)),
    }
    metadata_stream = DecodedStreamObject()
    metadata_stream.set_data(encoded)
    metadata_stream.update(header)
    metadata_ref = writer._add_object(metadata_stream)
    writer._root_object[NameObject("/Metadata")] = metadata_ref
114
+
115
+
116
def write_to_bytes(writer: PdfWriter) -> bytes:
    """Serialize `writer` to bytes, ensuring a trailer /ID is set first."""
    _ensure_trailer_id(writer)
    with io.BytesIO() as sink:
        writer.write(sink)
        return sink.getvalue()
121
+
122
+
123
+ def _ensure_trailer_id(writer: PdfWriter) -> None:
124
+ """Set the trailer /ID array (PDF/A-3u rule 6.1.3)."""
125
+ import secrets
126
+
127
+ if getattr(writer, "_ID", None):
128
+ return
129
+ id_hex = secrets.token_hex(16).upper().encode("ascii")
130
+ writer._ID = ArrayObject([ByteStringObject(id_hex), ByteStringObject(id_hex)])
131
+
132
+
133
def read_associated_files(reader: PdfReader) -> list[RawPayload]:
    """Return every parseable entry of the catalog's /AF array.

    Yields an empty list when the trailer has no /Root, the catalog is not a
    dictionary, or /AF is missing or not an array. Entries that
    `_parse_filespec` rejects are skipped.
    """
    root = reader.trailer.get("/Root")
    if root is None:
        return []
    if isinstance(root, IndirectObject):
        root = root.get_object()
    if not isinstance(root, DictionaryObject):
        return []

    associated = root.get("/AF")
    if associated is None:
        return []
    if isinstance(associated, IndirectObject):
        associated = associated.get_object()
    if not isinstance(associated, ArrayObject):
        return []

    filespecs = (
        item.get_object() if isinstance(item, IndirectObject) else item
        for item in associated
    )
    parsed = (_parse_filespec(spec) for spec in filespecs)
    return [payload for payload in parsed if payload]
153
+
154
+
155
def read_metadata_xml(reader: PdfReader) -> str | None:
    """Return the catalog's /Metadata stream as text, or None if unavailable.

    None is returned when /Root or /Metadata is missing, or when either is
    not of the expected object type. Byte data is decoded as UTF-8 with
    undecodable bytes replaced.
    """
    catalog = reader.trailer.get("/Root")
    if catalog is None:
        return None
    if isinstance(catalog, IndirectObject):
        catalog = catalog.get_object()
    if not isinstance(catalog, DictionaryObject):
        return None

    metadata = catalog.get("/Metadata")
    if metadata is None:
        return None
    if isinstance(metadata, IndirectObject):
        metadata = metadata.get_object()
    if not isinstance(metadata, StreamObject):
        return None

    raw = metadata.get_data()
    return raw if isinstance(raw, str) else raw.decode("utf-8", errors="replace")
170
+
171
+
172
def _parse_filespec(filespec: DictionaryObject) -> RawPayload | None:
    """Convert one file specification dictionary into a RawPayload.

    Returns None when the filespec is not a dictionary, has no usable /EF
    stream (looked up under /UF, then /F), or has no /UF or /F name.
    A missing /Subtype maps to "application/octet-stream"; a missing or
    unrecognised /AFRelationship maps to "Supplement".
    """
    if not isinstance(filespec, DictionaryObject):
        return None

    embedded = filespec.get("/EF")
    if embedded is None:
        return None
    if isinstance(embedded, IndirectObject):
        embedded = embedded.get_object()
    if not isinstance(embedded, DictionaryObject):
        return None

    stream = embedded.get("/UF") or embedded.get("/F")
    if stream is None:
        return None
    if isinstance(stream, IndirectObject):
        stream = stream.get_object()
    if not isinstance(stream, StreamObject):
        return None

    # The stream is decoded before the name is checked, as in the original flow.
    content = stream.get_data()
    if isinstance(content, str):
        content = content.encode("latin-1", errors="replace")

    raw_name = filespec.get("/UF") or filespec.get("/F")
    if raw_name is None:
        return None

    subtype = stream.get("/Subtype") or filespec.get("/Subtype")
    if subtype:
        mime_type = _name_to_mime(str(subtype))
    else:
        mime_type = "application/octet-stream"

    raw_desc = filespec.get("/Desc")
    description = str(raw_desc) if raw_desc else None

    relationship: AFRelationshipKind = "Supplement"
    raw_rel = filespec.get("/AFRelationship")
    if raw_rel:
        candidate = str(raw_rel).lstrip("/")
        if candidate in ("Alternative", "Data", "Supplement"):
            relationship = candidate  # type: ignore[assignment]

    return RawPayload(
        name=str(raw_name),
        mime_type=mime_type,
        relationship=relationship,
        bytes_=bytes(content),
        description=description,
    )
210
+
211
+
212
+ def _mime_to_name(mime: str) -> str:
213
+ """Wrap a MIME type for use as a PDF Name. pypdf's NameObject handles
214
+ the per-character #XX escaping itself when serializing; we MUST NOT
215
+ pre-escape, otherwise the '#' of our own escape gets re-escaped to '#23'.
216
+ """
217
+ return "/" + mime
218
+
219
+
220
+ def _name_to_mime(name: str) -> str:
221
+ s = name.lstrip("/")
222
+ out = []
223
+ i = 0
224
+ while i < len(s):
225
+ c = s[i]
226
+ if c == "#" and i + 2 < len(s):
227
+ try:
228
+ out.append(chr(int(s[i + 1 : i + 3], 16)))
229
+ i += 3
230
+ continue
231
+ except ValueError:
232
+ pass
233
+ out.append(c)
234
+ i += 1
235
+ return "".join(out)
236
+
237
+
238
+ def _pdf_date(dt: datetime) -> str:
239
+ """Format a datetime as a PDF date string (D:YYYYMMDDHHmmSS+HH'mm')."""
240
+ if dt.tzinfo is None:
241
+ dt = dt.replace(tzinfo=timezone.utc)
242
+ return dt.astimezone(timezone.utc).strftime("D:%Y%m%d%H%M%SZ")
243
+
244
+
245
# Names exported by this module. "ByteStringObject" is the pypdf class
# imported above, re-exported as-is.
__all__ = [
    "AFRelationshipKind",
    "ByteStringObject",
    "RawPayload",
    "add_associated_file",
    "load_writer",
    "read_associated_files",
    "read_metadata_xml",
    "set_metadata_xml",
    "write_to_bytes",
]
cvfile/_security.py ADDED
@@ -0,0 +1,183 @@
1
+ """Detects PDF constructs forbidden by .cv spec §3.4.
2
+
3
+ Walks the catalog object graph from the trailer. Each rule maps to a stable
4
+ error code matching the JS SDK so cross-language tests share expectations.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pypdf import PdfReader
10
+ from pypdf.generic import (
11
+ ArrayObject,
12
+ DictionaryObject,
13
+ IndirectObject,
14
+ NameObject,
15
+ PdfObject,
16
+ )
17
+
18
+ from cvfile._types import ValidationIssue
19
+
20
+
21
def scan_forbidden_constructs(reader: PdfReader) -> list[ValidationIssue]:
    """Walk the object graph from the trailer's /Root and report forbidden constructs.

    Returns an empty list when the trailer has no /Root. Otherwise the
    issues collected by `_walk` are returned with duplicates removed.
    """
    root = reader.trailer.get("/Root")
    if root is None:
        return []

    found: list[ValidationIssue] = []
    visited: set[int] = set()
    _walk(_resolve(root), visited, found)
    return _dedupe(found)
31
+
32
+
33
+ def _walk(obj: PdfObject | None, seen: set[int], issues: list[ValidationIssue]) -> None:
34
+ if obj is None:
35
+ return
36
+ obj_id = id(obj)
37
+ if obj_id in seen:
38
+ return
39
+ seen.add(obj_id)
40
+
41
+ if isinstance(obj, DictionaryObject):
42
+ _inspect_dict(obj, issues)
43
+ for value in obj.values():
44
+ _walk(_resolve(value), seen, issues)
45
+ elif isinstance(obj, ArrayObject):
46
+ for item in obj:
47
+ _walk(_resolve(item), seen, issues)
48
+
49
+
50
def _inspect_dict(d: DictionaryObject, issues: list[ValidationIssue]) -> None:
    """Run the per-dictionary checks on `d`, appending findings to `issues`.

    Any dictionary with /Type /Action or a name-valued /S entry goes through
    the action checks; /Type /Filespec goes through the filespec check; any
    dictionary carrying a /JavaScript key is reported.
    """
    kind = _name_of(d.get("/Type"))
    action_subtype = _name_of(d.get("/S"))

    if action_subtype or kind == "Action":
        _inspect_action(d, action_subtype, issues)

    if kind == "Filespec":
        _inspect_filespec(d, issues)

    if d.get("/JavaScript") is None:
        return
    issues.append(
        ValidationIssue(
            code="javascript-names-tree",
            level="error",
            message="Document declares /JavaScript names entries; JavaScript actions are forbidden (spec §3.4)",
        )
    )
68
+
69
+
70
def _inspect_action(d: DictionaryObject, subtype: str | None, issues: list[ValidationIssue]) -> None:
    """Report `d` if it is a forbidden action; at most one issue is appended.

    Checks run in order: JavaScript (by subtype or a /JS entry), Launch,
    ImportData, then SubmitForm whose /F target is missing or not a
    ``mailto:`` URI.
    """

    def report(code: str, message: str) -> None:
        issues.append(ValidationIssue(code=code, level="error", message=message))

    if subtype == "JavaScript" or d.get("/JS") is not None:
        report(
            "javascript-action",
            "Found /Action with subtype /JavaScript or /JS entry (spec §3.4)",
        )
        return

    if subtype == "Launch":
        report(
            "launch-action",
            "Found /Launch action; running external programs is forbidden (spec §3.4)",
        )
        return

    if subtype == "ImportData":
        report(
            "import-data-action",
            "Found /ImportData action; data import is forbidden (spec §3.4)",
        )
        return

    if subtype != "SubmitForm":
        return

    target = _filespec_target(_resolve(d.get("/F")))
    if target and target.lower().startswith("mailto:"):
        # mailto: submissions are the one permitted form target.
        return
    if target:
        message = f'/SubmitForm action targets non-mailto URI "{target}" (spec §3.4)'
    else:
        message = "Found /SubmitForm action with no inspectable target (spec §3.4)"
    report("submit-form-external", message)
115
+
116
+
117
def _inspect_filespec(d: DictionaryObject, issues: list[ValidationIssue]) -> None:
    """Report a file specification that has no /EF entry (an external reference)."""
    if d.get("/EF") is not None:
        return

    target = _filespec_target(d)
    if target:
        message = f'External /Filespec "{target}" (spec §3.4)'
    else:
        message = "External /Filespec with no /EF (spec §3.4)"
    issues.append(
        ValidationIssue(
            code="external-filespec",
            level="error",
            message=message,
            payload=target,
        )
    )
133
+
134
+
135
+ def _filespec_target(value: PdfObject | None) -> str | None:
136
+ if value is None:
137
+ return None
138
+ if isinstance(value, str):
139
+ return value
140
+ if isinstance(value, DictionaryObject):
141
+ for key in ("/UF", "/F"):
142
+ entry = _resolve(value.get(key))
143
+ if isinstance(entry, str):
144
+ return entry
145
+ if isinstance(value, ArrayObject):
146
+ parts = [str(_resolve(item)) for item in value if _resolve(item) is not None]
147
+ return "/".join(parts) if parts else None
148
+ return None
149
+
150
+
151
def _name_of(value: PdfObject | None) -> str | None:
    """Return a PDF name without its leading "/", or None for non-names.

    A NameObject is always accepted; any other string counts only when it
    starts with "/".
    """
    if isinstance(value, NameObject):
        return str(value).removeprefix("/")
    if isinstance(value, str) and value.startswith("/"):
        return value[1:]
    return None
158
+
159
+
160
+ def _resolve(value: PdfObject | None) -> PdfObject | None:
161
+ if value is None:
162
+ return None
163
+ if isinstance(value, IndirectObject):
164
+ try:
165
+ return value.get_object()
166
+ except Exception:
167
+ return None
168
+ return value
169
+
170
+
171
+ def _dedupe(issues: list[ValidationIssue]) -> list[ValidationIssue]:
172
+ seen: set[tuple[str, str | None, str]] = set()
173
+ out: list[ValidationIssue] = []
174
+ for issue in issues:
175
+ key = (issue.code, issue.payload, issue.message)
176
+ if key in seen:
177
+ continue
178
+ seen.add(key)
179
+ out.append(issue)
180
+ return out
181
+
182
+
183
# The scanner entry point is this module's only public name.
__all__ = ["scan_forbidden_constructs"]
cvfile/_types.py ADDED
@@ -0,0 +1,89 @@
1
+ """Public dataclasses returned and accepted by the cvfile SDK."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from datetime import datetime
7
+ from typing import Literal
8
+
9
+ AFRelationshipKind = Literal["Alternative", "Data", "Supplement"]
10
+
11
+
12
+ @dataclass(frozen=True, slots=True)
13
+ class Payload:
14
+ data: bytes | str
15
+ name: str
16
+ mime_type: str
17
+ language: str | None = None
18
+ relationship: AFRelationshipKind = "Alternative"
19
+ description: str | None = None
20
+
21
+
22
+ @dataclass(frozen=True, slots=True)
23
+ class AlternateMeta:
24
+ payload: str
25
+ language: str
26
+ mime_type: str
27
+
28
+
29
+ @dataclass(frozen=True, slots=True)
30
+ class IntegrityEntry:
31
+ payload: str
32
+ algorithm: str
33
+ digest: str
34
+
35
+
36
+ @dataclass(frozen=True, slots=True)
37
+ class EmbeddingSpaceSummary:
38
+ model: str
39
+ dimension: int
40
+ metric: Literal["cosine", "dot", "euclidean"]
41
+ chunks: int
42
+
43
+
44
+ @dataclass(frozen=True, slots=True)
45
+ class CvMetadata:
46
+ version: str
47
+ primary_language: str
48
+ primary_payload: str
49
+ created: datetime | None = None
50
+ modified: datetime | None = None
51
+ generator: str | None = None
52
+ alternates: tuple[AlternateMeta, ...] = field(default_factory=tuple)
53
+ integrity: tuple[IntegrityEntry, ...] = field(default_factory=tuple)
54
+ embeddings: tuple[EmbeddingSpaceSummary, ...] = field(default_factory=tuple)
55
+
56
+
57
+ @dataclass(frozen=True, slots=True)
58
+ class ExtractedPayload:
59
+ name: str
60
+ mime_type: str
61
+ relationship: AFRelationshipKind
62
+ bytes_: bytes
63
+ language: str | None = None
64
+ description: str | None = None
65
+
66
+ def text(self) -> str:
67
+ return self.bytes_.decode("utf-8")
68
+
69
+
70
+ @dataclass(frozen=True, slots=True)
71
+ class CvFile:
72
+ bytes_: bytes
73
+ metadata: CvMetadata
74
+ payloads: tuple[ExtractedPayload, ...]
75
+
76
+
77
+ @dataclass(frozen=True, slots=True)
78
+ class ValidationIssue:
79
+ code: str
80
+ level: Literal["error", "warning"]
81
+ message: str
82
+ payload: str | None = None
83
+
84
+
85
+ @dataclass(frozen=True, slots=True)
86
+ class ValidationReport:
87
+ ok: bool
88
+ level: Literal["cv-strict", "cv-lenient"]
89
+ issues: tuple[ValidationIssue, ...]