PyPI - cvfile - Versions diffs - 0.1.0__py3-none-any.whl - Mend

cvfile 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

cvfile/__init__.py +49 -0
cvfile/_constants.py +26 -0
cvfile/_pdf.py +255 -0
cvfile/_security.py +183 -0
cvfile/_types.py +89 -0
cvfile/_xmp.py +240 -0
cvfile/detect.py +25 -0
cvfile/embed/__init__.py +39 -0
cvfile/embed/_chunk.py +130 -0
cvfile/embed/_embed.py +66 -0
cvfile/embed/_embeddings.py +176 -0
cvfile/embed/_huggingface.py +88 -0
cvfile/embed/_search.py +78 -0
cvfile/extract.py +65 -0
cvfile/inspect.py +25 -0
cvfile/integrations/__init__.py +5 -0
cvfile/integrations/langchain.py +144 -0
cvfile/integrations/llamaindex.py +113 -0
cvfile/pack.py +173 -0
cvfile/server/__init__.py +31 -0
cvfile/server/_conneg.py +129 -0
cvfile/server/_handler.py +176 -0
cvfile/server/asgi.py +146 -0
cvfile/server/wsgi.py +125 -0
cvfile/validate.py +140 -0
cvfile-0.1.0.dist-info/METADATA +74 -0
cvfile-0.1.0.dist-info/RECORD +28 -0
cvfile-0.1.0.dist-info/WHEEL +4 -0

cvfile/__init__.py ADDED Viewed

@@ -0,0 +1,49 @@
+"""Reference SDK for the .cv open file format."""
+from cvfile._constants import (
+    CV_NAMESPACE_PREFIX,
+    CV_NAMESPACE_URI,
+    CV_SPEC_VERSION,
+    DEFAULT_PAYLOAD_NAMES,
+    PAYLOAD_MIME_TYPES,
+)
+from cvfile._types import (
+    AlternateMeta,
+    CvFile,
+    CvMetadata,
+    EmbeddingSpaceSummary,
+    ExtractedPayload,
+    IntegrityEntry,
+    Payload,
+    ValidationIssue,
+    ValidationReport,
+)
+from cvfile.detect import is_cv_file
+from cvfile.extract import extract, extract_html, extract_markdown
+from cvfile.inspect import inspect
+from cvfile.pack import pack
+from cvfile.validate import validate
+# Names exported by `from cvfile import *`: the five constants, the nine
+# dataclasses, and the seven functions imported above.
+__all__ = [
+    "CV_NAMESPACE_PREFIX",
+    "CV_NAMESPACE_URI",
+    "CV_SPEC_VERSION",
+    "DEFAULT_PAYLOAD_NAMES",
+    "PAYLOAD_MIME_TYPES",
+    "AlternateMeta",
+    "CvFile",
+    "CvMetadata",
+    "EmbeddingSpaceSummary",
+    "ExtractedPayload",
+    "IntegrityEntry",
+    "Payload",
+    "ValidationIssue",
+    "ValidationReport",
+    "extract",
+    "extract_html",
+    "extract_markdown",
+    "inspect",
+    "is_cv_file",
+    "pack",
+    "validate",
+]

cvfile/_constants.py ADDED Viewed

@@ -0,0 +1,26 @@
+"""Constants shared across the cvfile SDK."""
+# Version of the .cv specification; also interpolated into DEFAULT_GENERATOR below.
+CV_SPEC_VERSION = "0.1"
+# XML namespace URI and prefix (presumably for the cv:* XMP properties -- TODO confirm in _xmp.py).
+CV_NAMESPACE_URI = "http://ns.cvfile.org/cv/1.0/"
+CV_NAMESPACE_PREFIX = "cv"
+# NOTE(review): this embeds the spec version ("0.1"), not the package version -- confirm that is intended.
+DEFAULT_GENERATOR = f"cvfile-py/{CV_SPEC_VERSION}"
+# File name associated with each payload kind (presumably the default attachment name -- verify in pack.py).
+DEFAULT_PAYLOAD_NAMES = {
+    "markdown": "resume.md",
+    "html": "resume.html",
+    "json": "resume.json",
+    "embeddings": "embeddings.cbor",
+}
+# MIME type for each payload kind, plus entries for plain "pdf" and for the "cv" container itself.
+PAYLOAD_MIME_TYPES = {
+    "markdown": "text/markdown",
+    "html": "text/html",
+    "json": "application/json",
+    "embeddings": "application/vnd.cv.embeddings+cbor",
+    "pdf": "application/pdf",
+    "cv": "application/vnd.cv+pdf",
+}
+# 16 MiB.
+MAX_PAYLOAD_BYTES_DEFAULT = 16 * 1024 * 1024

cvfile/_pdf.py ADDED Viewed

@@ -0,0 +1,255 @@
+"""Thin pypdf wrapper for /AF Associated Files and /Metadata streams.
+This module isolates pypdf so we can swap to pikepdf later without changing the
+public API. PDF/A-3 conformance work that requires deeper PDF rewriting (font
+embedding, ICC profile injection on arbitrary input PDFs) will land here.
+"""
+from __future__ import annotations
+import io
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from typing import Literal
+from pypdf import PdfReader, PdfWriter
+from pypdf.generic import (
+    ArrayObject,
+    ByteStringObject,
+    DecodedStreamObject,
+    DictionaryObject,
+    IndirectObject,
+    NameObject,
+    NumberObject,
+    StreamObject,
+    TextStringObject,
+)
+# The /AFRelationship values this module accepts; `_parse_filespec` coerces anything else to "Supplement".
+AFRelationshipKind = Literal["Alternative", "Data", "Supplement"]
+@dataclass(frozen=True, slots=True)
+class RawPayload:
+    """One embedded file as read back from a PDF /Filespec entry.
+    Instances are immutable (frozen dataclass) and are built by `_parse_filespec`.
+    """
+    name: str  # str() of the filespec's /UF entry, falling back to /F
+    mime_type: str  # decoded /Subtype name, or "application/octet-stream" when no /Subtype is present
+    relationship: AFRelationshipKind  # /AFRelationship without the leading "/"
+    bytes_: bytes  # contents of the embedded stream
+    description: str | None = None  # /Desc text; None when absent or empty
+def load_writer(pdf_bytes: bytes) -> PdfWriter:
+    """Return a `PdfWriter` cloned from the PDF document held in `pdf_bytes`."""
+    source = io.BytesIO(pdf_bytes)
+    return PdfWriter(clone_from=PdfReader(source))
+def add_associated_file(
+    writer: PdfWriter,
+    *,
+    name: str,
+    data: bytes,
+    mime_type: str,
+    description: str,
+    relationship: AFRelationshipKind,
+    creation_date: datetime,
+    modification_date: datetime,
+) -> None:
+    """Embed `data` in the document and list it in the catalog's /AF array.
+    Two objects are added to `writer`: an /EmbeddedFile stream holding the
+    bytes, then a /Filespec dictionary that names it. The filespec reference
+    is appended to the catalog's /AF array; a new array is installed when the
+    catalog has none or the existing entry is not an array.
+    """
+    # Build the pieces first; the stream is registered before the filespec so
+    # the objects are added to the writer in the same order as before.
+    params = DictionaryObject(
+        {
+            NameObject("/CreationDate"): TextStringObject(_pdf_date(creation_date)),
+            NameObject("/ModDate"): TextStringObject(_pdf_date(modification_date)),
+            NameObject("/Size"): NumberObject(len(data)),
+        }
+    )
+    stream_header = {
+        NameObject("/Type"): NameObject("/EmbeddedFile"),
+        NameObject("/Subtype"): NameObject(_mime_to_name(mime_type)),
+        NameObject("/Length"): NumberObject(len(data)),
+        NameObject("/Params"): params,
+    }
+    file_stream = DecodedStreamObject()
+    file_stream.set_data(data)
+    file_stream.update(stream_header)
+    stream_ref = writer._add_object(file_stream)
+    # Both /F and /UF of the /EF dictionary point at the same stream.
+    ef_entry = DictionaryObject(
+        {NameObject("/F"): stream_ref, NameObject("/UF"): stream_ref}
+    )
+    spec = DictionaryObject(
+        {
+            NameObject("/Type"): NameObject("/Filespec"),
+            NameObject("/F"): TextStringObject(name),
+            NameObject("/UF"): TextStringObject(name),
+            NameObject("/Desc"): TextStringObject(description),
+            NameObject("/AFRelationship"): NameObject(f"/{relationship}"),
+            NameObject("/EF"): ef_entry,
+        }
+    )
+    spec_ref = writer._add_object(spec)
+    root = writer._root_object
+    existing = root.get(NameObject("/AF"))
+    if isinstance(existing, IndirectObject):
+        existing = existing.get_object()
+    if isinstance(existing, ArrayObject):
+        targets = existing
+    else:
+        targets = ArrayObject()
+        root[NameObject("/AF")] = targets
+    targets.append(spec_ref)
+def set_metadata_xml(writer: PdfWriter, xml: str) -> None:
+    """Store `xml`, UTF-8 encoded, as a new stream and point the catalog's /Metadata at it."""
+    payload = xml.encode("utf-8")
+    header = {
+        NameObject("/Type"): NameObject("/Metadata"),
+        NameObject("/Subtype"): NameObject("/XML"),
+        NameObject("/Length"): NumberObject(len(payload)),
+    }
+    meta_stream = DecodedStreamObject()
+    meta_stream.set_data(payload)
+    meta_stream.update(header)
+    writer._root_object[NameObject("/Metadata")] = writer._add_object(meta_stream)
+def write_to_bytes(writer: PdfWriter) -> bytes:
+    """Serialize `writer` to bytes after making sure a trailer /ID is set."""
+    _ensure_trailer_id(writer)
+    with io.BytesIO() as sink:
+        writer.write(sink)
+        return sink.getvalue()
+def _ensure_trailer_id(writer: PdfWriter) -> None:
+    """Give `writer` a trailer /ID array unless it already has one (PDF/A-3u rule 6.1.3).
+    Both array entries carry the same freshly generated value: 16 random
+    bytes rendered as 32 upper-case hexadecimal ASCII characters.
+    """
+    import secrets
+    if not getattr(writer, "_ID", None):
+        token = secrets.token_hex(16).upper().encode("ascii")
+        writer._ID = ArrayObject([ByteStringObject(token), ByteStringObject(token)])
+def read_associated_files(reader: PdfReader) -> list[RawPayload]:
+    """Return every parseable embedded file listed in the catalog's /AF array.
+    An empty list is returned when the trailer has no /Root, the catalog is
+    not a dictionary or has no /AF entry, or /AF is not an array. Entries for
+    which `_parse_filespec` yields nothing are skipped.
+    """
+    root = reader.trailer.get("/Root")
+    if root is None:
+        return []
+    if isinstance(root, IndirectObject):
+        root = root.get_object()
+    if not isinstance(root, DictionaryObject):
+        return []
+    listing = root.get("/AF")
+    if isinstance(listing, IndirectObject):
+        listing = listing.get_object()
+    if not isinstance(listing, ArrayObject):
+        return []
+    specs = (item.get_object() if isinstance(item, IndirectObject) else item for item in listing)
+    parsed = (_parse_filespec(spec) for spec in specs)
+    return [payload for payload in parsed if payload]
+def read_metadata_xml(reader: PdfReader) -> str | None:
+    """Return the catalog's /Metadata stream as text, or None if there is none.
+    None is returned when the trailer has no /Root, the catalog is not a
+    dictionary, or /Metadata is missing or not a stream. Byte content is
+    decoded as UTF-8 with undecodable bytes replaced.
+    """
+    catalog = reader.trailer.get("/Root")
+    if catalog is None:
+        return None
+    if isinstance(catalog, IndirectObject):
+        catalog = catalog.get_object()
+    if not isinstance(catalog, DictionaryObject):
+        return None
+    stream = catalog.get("/Metadata")
+    if isinstance(stream, IndirectObject):
+        stream = stream.get_object()
+    if not isinstance(stream, StreamObject):
+        return None
+    raw = stream.get_data()
+    return raw if isinstance(raw, str) else raw.decode("utf-8", errors="replace")
+def _parse_filespec(filespec: DictionaryObject) -> RawPayload | None:
+    """Convert one /Filespec dictionary into a `RawPayload`.
+    Returns None when `filespec` is not a dictionary, its /EF entry is missing
+    or not a dictionary, /EF does not lead to a stream, or the filespec has
+    neither a /UF nor an /F name.
+    """
+    if not isinstance(filespec, DictionaryObject):
+        return None
+    embedded = filespec.get("/EF")
+    if isinstance(embedded, IndirectObject):
+        embedded = embedded.get_object()
+    if not isinstance(embedded, DictionaryObject):
+        return None
+    # Prefer the /UF stream and fall back to /F.
+    body = embedded.get("/UF") or embedded.get("/F")
+    if isinstance(body, IndirectObject):
+        body = body.get_object()
+    if not isinstance(body, StreamObject):
+        return None
+    content = body.get_data()
+    if isinstance(content, str):
+        content = content.encode("latin-1", errors="replace")
+    file_name = filespec.get("/UF") or filespec.get("/F")
+    if file_name is None:
+        return None
+    declared = body.get("/Subtype") or filespec.get("/Subtype")
+    desc = filespec.get("/Desc")
+    raw_rel = filespec.get("/AFRelationship")
+    kind = str(raw_rel).lstrip("/") if raw_rel else "Supplement"
+    # Unknown relationship names are treated as "Supplement".
+    if kind not in ("Alternative", "Data", "Supplement"):
+        kind = "Supplement"
+    return RawPayload(
+        name=str(file_name),
+        mime_type=_name_to_mime(str(declared)) if declared else "application/octet-stream",
+        relationship=kind,  # type: ignore[arg-type]
+        bytes_=bytes(content),
+        description=str(desc) if desc else None,
+    )
+def _mime_to_name(mime: str) -> str:
+    """Return `mime` prefixed with "/" so it can be passed to `NameObject`.
+    No #XX escaping is applied here on purpose: pypdf's NameObject escapes
+    each character itself when serializing, so a pre-escaped '#' would be
+    escaped a second time and come out as '#23'.
+    """
+    return f"/{mime}"
+def _name_to_mime(name: str) -> str:
+    """Turn a PDF Name such as "/text#2Fplain" back into "text/plain".
+    Leading slashes are stripped, and every `#XX` escape made of two
+    hexadecimal digits becomes the character with that code. A '#' that is
+    not followed by two hex digits is kept literally.
+    """
+    hex_digits = "0123456789abcdefABCDEF"
+    s = name.lstrip("/")
+    out: list[str] = []
+    i = 0
+    while i < len(s):
+        pair = s[i + 1 : i + 3]
+        # Validate the digits explicitly: int(pair, 16) on its own also accepts
+        # a sign or whitespace ("+1", " 1"), which are not valid name escapes.
+        if s[i] == "#" and len(pair) == 2 and pair[0] in hex_digits and pair[1] in hex_digits:
+            out.append(chr(int(pair, 16)))
+            i += 3
+        else:
+            out.append(s[i])
+            i += 1
+    return "".join(out)
+def _pdf_date(dt: datetime) -> str:
+    """Format `dt` as a PDF date string in UTC, e.g. "D:20240102030405Z".
+    A naive datetime is taken to be in UTC already; an aware one is converted
+    to UTC. The offset is always written as "Z".
+    """
+    aware = dt if dt.tzinfo is not None else dt.replace(tzinfo=timezone.utc)
+    return aware.astimezone(timezone.utc).strftime("D:%Y%m%d%H%M%SZ")
+# Public surface of this wrapper. The underscore-prefixed helpers are left out;
+# pypdf's ByteStringObject is re-exported alongside the module's own names.
+__all__ = [
+    "AFRelationshipKind",
+    "ByteStringObject",
+    "RawPayload",
+    "add_associated_file",
+    "load_writer",
+    "read_associated_files",
+    "read_metadata_xml",
+    "set_metadata_xml",
+    "write_to_bytes",
+]

cvfile/_security.py ADDED Viewed

@@ -0,0 +1,183 @@
+"""Detects PDF constructs forbidden by .cv spec §3.4.
+Walks the catalog object graph from the trailer. Each rule maps to a stable
+error code matching the JS SDK so cross-language tests share expectations.
+"""
+from __future__ import annotations
+from pypdf import PdfReader
+from pypdf.generic import (
+    ArrayObject,
+    DictionaryObject,
+    IndirectObject,
+    NameObject,
+    PdfObject,
+)
+from cvfile._types import ValidationIssue
+def scan_forbidden_constructs(reader: PdfReader) -> list[ValidationIssue]:
+    """Collect spec §3.4 violations reachable from the document catalog.
+    Returns an empty list when the trailer has no /Root; otherwise returns
+    the de-duplicated findings of a walk that starts at the resolved catalog.
+    """
+    catalog = reader.trailer.get("/Root")
+    if catalog is None:
+        return []
+    found: list[ValidationIssue] = []
+    _walk(_resolve(catalog), set(), found)
+    return _dedupe(found)
+def _walk(obj: PdfObject | None, seen: set[int], issues: list[ValidationIssue]) -> None:
+    """Visit every dictionary and array reachable from `obj`, inspecting each dictionary once.
+    The traversal is depth-first and visits objects in the same order as a
+    recursive walk would, but it keeps its own stack. The input is an
+    untrusted PDF, and a deeply nested or long-chained object graph would
+    otherwise raise RecursionError and abort the scan.
+    Args:
+        obj: Starting object, already resolved by the caller; None is ignored.
+        seen: `id()` values of objects already visited. Updated in place so
+            shared and cyclic references are processed only once.
+        issues: Findings list; `_inspect_dict` appends to it.
+    """
+    stack: list[PdfObject | None] = [obj]
+    while stack:
+        node = stack.pop()
+        if node is None:
+            continue
+        node_id = id(node)
+        if node_id in seen:
+            continue
+        seen.add(node_id)
+        if isinstance(node, DictionaryObject):
+            _inspect_dict(node, issues)
+            children = list(node.values())
+        elif isinstance(node, ArrayObject):
+            children = list(node)
+        else:
+            continue
+        # Push in reverse so children are popped, and therefore visited, in their original order.
+        stack.extend(_resolve(child) for child in reversed(children))
+def _inspect_dict(d: DictionaryObject, issues: list[ValidationIssue]) -> None:
+    """Check one dictionary against the spec §3.4 rules, appending findings to `issues`.
+    A dictionary is handed to `_inspect_action` when its /Type is /Action or
+    it has a name-valued /S entry, and to `_inspect_filespec` when its /Type
+    is /Filespec. Any dictionary with a /JavaScript key is reported as
+    "javascript-names-tree".
+    """
+    # /Type and /S may be stored as indirect references. Resolve them first:
+    # `_name_of` returns None for an IndirectObject, which would silently skip
+    # the action and filespec checks for this dictionary.
+    type_name = _name_of(_resolve(d.get("/Type")))
+    subtype = _name_of(_resolve(d.get("/S")))
+    if type_name == "Action" or subtype:
+        _inspect_action(d, subtype, issues)
+    if type_name == "Filespec":
+        _inspect_filespec(d, issues)
+    if d.get("/JavaScript") is not None:
+        issues.append(
+            ValidationIssue(
+                code="javascript-names-tree",
+                level="error",
+                message="Document declares /JavaScript names entries; JavaScript actions are forbidden (spec §3.4)",
+            )
+        )
+def _inspect_action(d: DictionaryObject, subtype: str | None, issues: list[ValidationIssue]) -> None:
+    """Report `d` if it is a forbidden action; at most one issue is appended.
+    Checked in order: JavaScript (subtype or a /JS entry), Launch, ImportData,
+    and SubmitForm whose /F target is missing or does not start with "mailto:".
+    """
+    def report(code: str, message: str) -> None:
+        issues.append(ValidationIssue(code=code, level="error", message=message))
+    if subtype == "JavaScript" or d.get("/JS") is not None:
+        report(
+            "javascript-action",
+            "Found /Action with subtype /JavaScript or /JS entry (spec §3.4)",
+        )
+    elif subtype == "Launch":
+        report(
+            "launch-action",
+            "Found /Launch action; running external programs is forbidden (spec §3.4)",
+        )
+    elif subtype == "ImportData":
+        report(
+            "import-data-action",
+            "Found /ImportData action; data import is forbidden (spec §3.4)",
+        )
+    elif subtype == "SubmitForm":
+        target = _filespec_target(_resolve(d.get("/F")))
+        # Only mailto: submissions are tolerated.
+        if target and target.lower().startswith("mailto:"):
+            return
+        if target:
+            message = f'/SubmitForm action targets non-mailto URI "{target}" (spec §3.4)'
+        else:
+            message = "Found /SubmitForm action with no inspectable target (spec §3.4)"
+        report("submit-form-external", message)
+def _inspect_filespec(d: DictionaryObject, issues: list[ValidationIssue]) -> None:
+    """Report a /Filespec that has no /EF entry as an external file reference.
+    The target returned by `_filespec_target`, when there is one, is quoted
+    in the message and stored as the issue's `payload`.
+    """
+    if d.get("/EF") is None:
+        target = _filespec_target(d)
+        if target:
+            message = f'External /Filespec "{target}" (spec §3.4)'
+        else:
+            message = "External /Filespec with no /EF (spec §3.4)"
+        issues.append(
+            ValidationIssue(
+                code="external-filespec",
+                level="error",
+                message=message,
+                payload=target,
+            )
+        )
+def _filespec_target(value: PdfObject | None) -> str | None:
+    """Extract a printable target from a file specification value.
+    A string is returned unchanged. For a dictionary, the first of /UF and /F
+    that resolves to a string is returned. For an array, the str() of each
+    resolvable item is joined with "/". Anything else gives None.
+    """
+    if value is None:
+        return None
+    if isinstance(value, str):
+        return value
+    if isinstance(value, DictionaryObject):
+        candidates = (_resolve(value.get(key)) for key in ("/UF", "/F"))
+        return next((entry for entry in candidates if isinstance(entry, str)), None)
+    if isinstance(value, ArrayObject):
+        resolved = (_resolve(item) for item in value)
+        pieces = [str(item) for item in resolved if item is not None]
+        return "/".join(pieces) if pieces else None
+    return None
+def _name_of(value: PdfObject | None) -> str | None:
+    """Return a PDF name without its leading "/", or None when `value` is not a name.
+    A `NameObject` is always accepted. Any other string is accepted only if
+    it starts with "/".
+    """
+    if isinstance(value, NameObject):
+        return str(value).removeprefix("/")
+    if not isinstance(value, str) or not value.startswith("/"):
+        return None
+    return value[1:]
+def _resolve(value: PdfObject | None) -> PdfObject | None:
+    """Follow `value` if it is an indirect reference; return anything else unchanged.
+    Returns None when resolving the reference raises any exception.
+    """
+    if value is None or not isinstance(value, IndirectObject):
+        return value
+    try:
+        return value.get_object()
+    except Exception:
+        return None
+def _dedupe(issues: list[ValidationIssue]) -> list[ValidationIssue]:
+    """Drop repeated issues, keeping the first of each (code, payload, message) in order."""
+    unique: dict[tuple[str, str | None, str], ValidationIssue] = {}
+    for issue in issues:
+        # setdefault keeps the earliest issue stored under each key.
+        unique.setdefault((issue.code, issue.payload, issue.message), issue)
+    return list(unique.values())
+# Only the scanner entry point is exported; the underscore-prefixed helpers are not.
+__all__ = ["scan_forbidden_constructs"]

cvfile/_types.py ADDED Viewed

@@ -0,0 +1,89 @@
+"""Public dataclasses returned and accepted by the cvfile SDK."""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Literal
+# Same three literals as `AFRelationshipKind` in cvfile/_pdf.py.
+AFRelationshipKind = Literal["Alternative", "Data", "Supplement"]
+@dataclass(frozen=True, slots=True)
+class Payload:
+    """Immutable description of one payload: its content plus name and type metadata.
+    NOTE(review): presumably the input record consumed by `pack` -- confirm against cvfile/pack.py.
+    """
+    data: bytes | str  # content, given either as raw bytes or as text
+    name: str
+    mime_type: str
+    language: str | None = None
+    relationship: AFRelationshipKind = "Alternative"  # "Alternative" unless stated otherwise
+    description: str | None = None
+@dataclass(frozen=True, slots=True)
+class AlternateMeta:
+    """Immutable record tying a payload to a language and a MIME type.
+    Used as the element type of `CvMetadata.alternates`.
+    """
+    payload: str  # presumably the payload's file name -- TODO confirm against _xmp.py
+    language: str
+    mime_type: str
+@dataclass(frozen=True, slots=True)
+class IntegrityEntry:
+    """Immutable record of a digest for one payload.
+    Used as the element type of `CvMetadata.integrity`.
+    """
+    payload: str  # presumably the payload's file name -- TODO confirm against _xmp.py
+    algorithm: str  # NOTE(review): accepted algorithm names are not visible here
+    digest: str  # NOTE(review): encoding (hex, base64, ...) is not visible here
+@dataclass(frozen=True, slots=True)
+class EmbeddingSpaceSummary:
+    """Immutable summary of one embedding space.
+    Used as the element type of `CvMetadata.embeddings`.
+    """
+    model: str
+    dimension: int
+    metric: Literal["cosine", "dot", "euclidean"]  # one of these three names
+    chunks: int  # presumably the number of embedded chunks -- TODO confirm
+@dataclass(frozen=True, slots=True)
+class CvMetadata:
+    """Immutable document-level metadata of a .cv file.
+    `version`, `primary_language` and `primary_payload` are required; every
+    other field is optional and defaults to None or an empty tuple.
+    """
+    version: str
+    primary_language: str
+    primary_payload: str
+    created: datetime | None = None
+    modified: datetime | None = None
+    generator: str | None = None
+    # The collections are tuples, so they cannot be mutated once the instance exists.
+    alternates: tuple[AlternateMeta, ...] = field(default_factory=tuple)
+    integrity: tuple[IntegrityEntry, ...] = field(default_factory=tuple)
+    embeddings: tuple[EmbeddingSpaceSummary, ...] = field(default_factory=tuple)
+@dataclass(frozen=True, slots=True)
+class ExtractedPayload:
+    """Immutable record of one payload together with its raw bytes.
+    Carries the same fields as `RawPayload` in cvfile/_pdf.py plus `language`.
+    """
+    name: str
+    mime_type: str
+    relationship: AFRelationshipKind
+    bytes_: bytes  # raw payload content
+    language: str | None = None
+    description: str | None = None
+    def text(self) -> str:
+        """Return `bytes_` decoded as UTF-8.
+        Decoding is strict, so content that is not valid UTF-8 raises
+        UnicodeDecodeError.
+        """
+        return self.bytes_.decode("utf-8")
+@dataclass(frozen=True, slots=True)
+class CvFile:
+    """Immutable bundle of a file's bytes, its metadata, and its payloads."""
+    bytes_: bytes  # presumably the complete .cv/PDF file content -- TODO confirm against extract.py
+    metadata: CvMetadata
+    payloads: tuple[ExtractedPayload, ...]
+@dataclass(frozen=True, slots=True)
+class ValidationIssue:
+    """One immutable validation finding.
+    The checks in cvfile/_security.py create these with level "error".
+    """
+    code: str  # short identifier for the rule, e.g. "javascript-action" or "external-filespec"
+    level: Literal["error", "warning"]
+    message: str  # human-readable explanation
+    payload: str | None = None  # optional target the issue refers to; None when not applicable
+@dataclass(frozen=True, slots=True)
+class ValidationReport:
+    ok: bool
+    level: Literal["cv-strict", "cv-lenient"]
+    issues: tuple[ValidationIssue, ...]