PyPI - pyjelly - Versions diffs - 0.1.0__py3-none-any.whl - Mend

pyjelly 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pyjelly might be problematic. Click here for more details.

Files changed (28) hide show

pyjelly/__init__.py +0 -0
pyjelly/_proto/grpc.proto +33 -0
pyjelly/_proto/patch.proto +165 -0
pyjelly/_proto/rdf.proto +384 -0
pyjelly/errors.py +10 -0
pyjelly/integrations/__init__.py +0 -0
pyjelly/integrations/rdflib/__init__.py +24 -0
pyjelly/integrations/rdflib/parse.py +233 -0
pyjelly/integrations/rdflib/serialize.py +119 -0
pyjelly/jelly/__init__.py +5 -0
pyjelly/jelly/rdf_pb2.py +70 -0
pyjelly/jelly/rdf_pb2.pyi +230 -0
pyjelly/options.py +126 -0
pyjelly/parse/__init__.py +0 -0
pyjelly/parse/decode.py +233 -0
pyjelly/parse/ioutils.py +86 -0
pyjelly/parse/lookup.py +70 -0
pyjelly/serialize/__init__.py +0 -0
pyjelly/serialize/encode.py +197 -0
pyjelly/serialize/flows.py +94 -0
pyjelly/serialize/ioutils.py +13 -0
pyjelly/serialize/lookup.py +131 -0
pyjelly/serialize/streams.py +133 -0
pyjelly-0.1.0.dist-info/METADATA +10 -0
pyjelly-0.1.0.dist-info/RECORD +28 -0
pyjelly-0.1.0.dist-info/WHEEL +4 -0
pyjelly-0.1.0.dist-info/entry_points.txt +7 -0
pyjelly-0.1.0.dist-info/licenses/LICENSE +201 -0

pyjelly/options.py ADDED Viewed

@@ -0,0 +1,126 @@
+from __future__ import annotations
+import mimetypes
+from contextlib import suppress
+from dataclasses import dataclass, field
+from typing import Final
+from typing_extensions import Self
+from pyjelly import jelly
+from pyjelly.errors import (
+    JellyAssertionError,
+    JellyConformanceError,
+    JellyNotImplementedError,
+)
+# Smallest value LookupPreset accepts for max_names.
+MIN_NAME_LOOKUP_SIZE: Final[int] = 8
+# Largest size allowed for a single lookup table.
+MAX_LOOKUP_SIZE: Final[int] = 4096
+# Default of StreamOptions.version; presumably the newest protocol version
+# this package supports -- TODO confirm against the Jelly specification.
+MAX_VERSION: Final[int] = 2
+# Default table sizes used by LookupPreset when no explicit size is given.
+DEFAULT_NAME_LOOKUP_SIZE: Final[int] = 4000
+DEFAULT_PREFIX_LOOKUP_SIZE: Final[int] = 150
+DEFAULT_DATATYPE_LOOKUP_SIZE: Final[int] = 32
+# IRI of the XML Schema string datatype.
+STRING_DATATYPE_IRI = "http://www.w3.org/2001/XMLSchema#string"
+INTEGRATION_SIDE_EFFECTS: bool = True
+"""
+Whether to allow integration module imports to trigger side effects.
+These side effects are cheap and may include populating some registries
+for guessing the defaults for external integrations that work with Jelly.
+"""
+# MIME types that register_mimetypes associates with the Jelly file extension.
+MIMETYPES = ("application/x-jelly-rdf",)
+def register_mimetypes(extension: str = ".jelly") -> None:
+    """
+    Register every Jelly MIME type for the given file extension.
+    Each entry of ``MIMETYPES`` is handed to ``mimetypes.add_type`` so that
+    ``mimetypes.guess_type`` recognises files ending in ``extension``.
+    >>> register_mimetypes()
+    >>> mimetypes.guess_type("out.jelly")
+    ('application/x-jelly-rdf', None)
+    """
+    add_type = mimetypes.add_type
+    for jelly_mimetype in MIMETYPES:
+        add_type(jelly_mimetype, extension)
+@dataclass(frozen=True)
+class LookupPreset:
+    """
+    Immutable set of maximum sizes for the name, prefix and datatype lookups.
+    Raises
+    ------
+    JellyConformanceError
+        If ``max_names`` is smaller than ``MIN_NAME_LOOKUP_SIZE``.
+    """
+    max_names: int = DEFAULT_NAME_LOOKUP_SIZE
+    max_prefixes: int = DEFAULT_PREFIX_LOOKUP_SIZE
+    max_datatypes: int = DEFAULT_DATATYPE_LOOKUP_SIZE
+    def __post_init__(self) -> None:
+        # Only the name lookup has a lower bound; the other sizes are unchecked.
+        if self.max_names >= MIN_NAME_LOOKUP_SIZE:
+            return
+        raise JellyConformanceError("name lookup size must be at least 8")
+    @classmethod
+    def small(cls) -> Self:
+        """Return a preset with 128 names, 32 prefixes and 32 datatypes."""
+        return cls(128, 32, 32)
+@dataclass(frozen=True)
+class StreamTypes:
+    """
+    Immutable pair of a physical and a logical stream type.
+    The pair is validated on construction: the physical type must be
+    specified and must be compatible with the logical type.
+    """
+    physical_type: jelly.PhysicalStreamType
+    logical_type: jelly.LogicalStreamType = jelly.LOGICAL_STREAM_TYPE_UNSPECIFIED
+    def __post_init__(self) -> None:
+        if self.physical_type == jelly.PHYSICAL_STREAM_TYPE_UNSPECIFIED:
+            raise JellyNotImplementedError("physical type must be specified")
+        validate_type_compatibility(
+            physical_type=self.physical_type,
+            logical_type=self.logical_type,
+        )
+    @property
+    def flat(self) -> bool:
+        """Whether the logical type is flat triples or flat quads."""
+        logical = self.logical_type
+        return (
+            logical == jelly.LOGICAL_STREAM_TYPE_FLAT_TRIPLES
+            or logical == jelly.LOGICAL_STREAM_TYPE_FLAT_QUADS
+        )
+    def __repr__(self) -> str:
+        # Prefer the symbolic enum names; fall back to the raw values when
+        # either lookup raises ValueError.
+        try:
+            physical_name = jelly.PhysicalStreamType.Name(self.physical_type)
+            logical_name = jelly.LogicalStreamType.Name(self.logical_type)
+        except ValueError:
+            return f"StreamTypes({self.physical_type}, {self.logical_type})"
+        return f"StreamTypes({physical_name}, {logical_name})"
+@dataclass(frozen=True)
+class StreamOptions:
+    # Immutable bundle of the settings that describe one Jelly stream.
+    # Physical/logical stream type pair (validated by StreamTypes itself).
+    stream_types: StreamTypes
+    # Maximum lookup table sizes; a default LookupPreset is built per instance.
+    lookup_preset: LookupPreset = field(default_factory=LookupPreset)
+    # NOTE(review): the flags below presumably mirror the equally named
+    # fields of the Jelly stream options message -- confirm against the spec.
+    generalized_statements: bool = False
+    rdf_star: bool = False
+    # Defaults to MAX_VERSION.
+    version: int = MAX_VERSION
+    # Presumably whether frames are length-prefixed in the byte stream -- confirm.
+    delimited: bool = True
+    namespace_declarations: bool = False
+    stream_name: str = ""
+# Logical stream types that validate_type_compatibility accepts only together
+# with PHYSICAL_STREAM_TYPE_TRIPLES; every other specified logical type is
+# accepted only with a non-triples physical type.
+TRIPLES_ONLY_LOGICAL_TYPES = {
+    jelly.LOGICAL_STREAM_TYPE_GRAPHS,
+    jelly.LOGICAL_STREAM_TYPE_SUBJECT_GRAPHS,
+    jelly.LOGICAL_STREAM_TYPE_FLAT_TRIPLES,
+}
+def validate_type_compatibility(
+    physical_type: jelly.PhysicalStreamType,
+    logical_type: jelly.LogicalStreamType,
+) -> None:
+    """
+    Check that a physical and a logical stream type may be combined.
+    An unspecified logical type is always accepted. Otherwise the logical
+    type must be in ``TRIPLES_ONLY_LOGICAL_TYPES`` exactly when the physical
+    type is ``PHYSICAL_STREAM_TYPE_TRIPLES``.
+    Raises
+    ------
+    JellyAssertionError
+        If the two types do not agree.
+    """
+    if logical_type == jelly.LOGICAL_STREAM_TYPE_UNSPECIFIED:
+        return
+    physical_is_triples = physical_type == jelly.PHYSICAL_STREAM_TYPE_TRIPLES
+    logical_is_triples = logical_type in TRIPLES_ONLY_LOGICAL_TYPES
+    if physical_is_triples == logical_is_triples:
+        return
+    physical_name = jelly.PhysicalStreamType.Name(physical_type)
+    logical_name = jelly.LogicalStreamType.Name(logical_type)
+    raise JellyAssertionError(
+        f"{physical_name} is not compatible with {logical_name}"
+    )

pyjelly/parse/__init__.py ADDED Viewed

File without changes

pyjelly/parse/decode.py ADDED Viewed

@@ -0,0 +1,233 @@
+from __future__ import annotations
+from abc import ABCMeta, abstractmethod
+from collections.abc import Iterable, Sequence
+from typing import Any, ClassVar
+from typing_extensions import Never
+from pyjelly import jelly
+from pyjelly.options import LookupPreset, StreamOptions, StreamTypes
+from pyjelly.parse.lookup import LookupDecoder
+def options_from_frame(
+    frame: jelly.RdfStreamFrame,
+    *,
+    delimited: bool,
+) -> StreamOptions:
+    """
+    Build StreamOptions from the options carried by the first row of a frame.
+    Only ``frame.rows[0].options`` is read; the function does not check that
+    the first row actually is an options row, and an empty frame raises
+    IndexError. ``delimited`` is copied into the result unchanged.
+    """
+    declared = frame.rows[0].options
+    # StreamTypes is built before LookupPreset so that their validation
+    # errors surface in the same order as the fields are declared.
+    stream_types = StreamTypes(
+        physical_type=declared.physical_type,
+        logical_type=declared.logical_type,
+    )
+    lookup_preset = LookupPreset(
+        max_names=declared.max_name_table_size,
+        max_prefixes=declared.max_prefix_table_size,
+        max_datatypes=declared.max_datatype_table_size,
+    )
+    return StreamOptions(
+        stream_types=stream_types,
+        lookup_preset=lookup_preset,
+        stream_name=declared.stream_name,
+        version=declared.version,
+        delimited=delimited,
+    )
+def _adapter_missing(feature: str, *, options: StreamOptions) -> Never:
+    """
+    Raise NotImplementedError for an adapter hook that was not overridden.
+    The message names the physical and logical stream types taken from
+    ``options`` together with the missing ``feature``.
+    """
+    stream_types = options.stream_types
+    physical_type_name = jelly.PhysicalStreamType.Name(stream_types.physical_type)
+    logical_type_name = jelly.LogicalStreamType.Name(stream_types.logical_type)
+    raise NotImplementedError(
+        f"adapter with {physical_type_name} and {logical_type_name} "
+        f"does not implement {feature}"
+    )
+class Adapter(metaclass=ABCMeta):
+    """
+    Interface through which a Decoder hands decoded values to a target library.
+    The four abstract term factories must be overridden. The remaining hooks
+    raise NotImplementedError through ``_adapter_missing`` unless a subclass
+    overrides them, and ``frame`` returns None by default.
+    """
+    def __init__(self, options: StreamOptions) -> None:
+        # Kept on the instance; Decoder.options reads it back from here.
+        self.options = options
+    # Obligatory abstract methods--all adapters must implement these
+    @abstractmethod
+    def iri(self, iri: str) -> Any:
+        """Return the target representation of an IRI given as a full string."""
+        raise NotImplementedError
+    @abstractmethod
+    def default_graph(self) -> Any:
+        """Return the target representation of the default graph."""
+        raise NotImplementedError
+    @abstractmethod
+    def bnode(self, bnode: str) -> Any:
+        """Return the target representation of a blank node identifier."""
+        raise NotImplementedError
+    @abstractmethod
+    def literal(
+        self,
+        lex: str,
+        language: str | None = None,
+        datatype: str | None = None,
+    ) -> Any:
+        """
+        Return the target representation of a literal.
+        Decoder.decode_literal passes at most one of ``language`` and
+        ``datatype`` as a non-None value.
+        """
+        raise NotImplementedError
+    # Optional abstract methods--not required to be implemented by all adapters
+    def triple(self, terms: Iterable[Any]) -> Any:  # noqa: ARG002
+        """Handle a decoded triple; raises NotImplementedError unless overridden."""
+        _adapter_missing("decoding triples", options=self.options)
+    def quad(self, terms: Iterable[Any]) -> Any:  # noqa: ARG002
+        """Handle a decoded quad; raises NotImplementedError unless overridden."""
+        _adapter_missing("decoding quads", options=self.options)
+    def graph_start(self, graph_id: Any) -> Any:  # noqa: ARG002
+        """Handle a graph start marker; raises NotImplementedError unless overridden."""
+        _adapter_missing("decoding graph start markers", options=self.options)
+    def graph_end(self) -> Any:
+        """Handle a graph end marker; raises NotImplementedError unless overridden."""
+        _adapter_missing("decoding graph end markers", options=self.options)
+    def namespace_declaration(self, name: str, iri: str) -> Any:  # noqa: ARG002
+        """Handle a namespace declaration; raises NotImplementedError unless overridden."""
+        _adapter_missing("decoding namespace declarations", options=self.options)
+    def frame(self) -> Any:
+        """Return the result for a whole frame; Decoder.decode_frame returns this value."""
+        return None
+class Decoder:
+    """
+    Decode Jelly rows and terms and forward the results to an Adapter.
+    Rows and terms are dispatched on their exact type through the
+    ``row_handlers`` and ``term_handlers`` tables defined at the end of the
+    class. Lookup entries are kept in three LookupDecoder tables, and the
+    last term decoded for each statement position is remembered so that a
+    statement omitting a position can reuse it.
+    """
+    def __init__(self, adapter: Adapter) -> None:
+        """Create lookup tables sized by the adapter's stream options."""
+        self.adapter = adapter
+        self.names = LookupDecoder(lookup_size=self.options.lookup_preset.max_names)
+        self.prefixes = LookupDecoder(
+            lookup_size=self.options.lookup_preset.max_prefixes
+        )
+        self.datatypes = LookupDecoder(
+            lookup_size=self.options.lookup_preset.max_datatypes
+        )
+        # Maps a statement position ("subject", "predicate", ...) to the value
+        # most recently returned by decode_term for that position.
+        self.repeated_terms: dict[str, Any] = {}
+    @property
+    def options(self) -> StreamOptions:
+        """Stream options held by the adapter."""
+        return self.adapter.options
+    def decode_frame(self, frame: jelly.RdfStreamFrame) -> Any:
+        """Decode every row of ``frame`` in order and return ``adapter.frame()``."""
+        for row_owner in frame.rows:
+            # Each row wraps exactly one payload in its "row" oneof.
+            row = getattr(row_owner, row_owner.WhichOneof("row"))
+            self.decode_row(row)
+        return self.adapter.frame()
+    def decode_row(self, row: Any) -> Any | None:
+        """
+        Dispatch ``row`` to the handler registered for its exact type.
+        Raises
+        ------
+        TypeError
+            If no handler is registered for ``type(row)``.
+        """
+        try:
+            decode_row = self.row_handlers[type(row)]
+        except KeyError:
+            msg = f"decoder not implemented for {type(row)}"
+            raise TypeError(msg) from None
+        return decode_row(self, row)
+    def validate_stream_options(self, options: jelly.RdfStreamOptions) -> None:
+        """
+        Check an options row against the options this decoder was built with.
+        NOTE(review): the checks are plain ``assert`` statements, so they are
+        skipped entirely when Python runs with ``-O``.
+        """
+        assert self.options.stream_name == options.stream_name
+        assert self.options.version >= options.version
+        assert self.options.lookup_preset.max_prefixes == options.max_prefix_table_size
+        assert (
+            self.options.lookup_preset.max_datatypes == options.max_datatype_table_size
+        )
+        assert self.options.lookup_preset.max_names == options.max_name_table_size
+    def ingest_prefix_entry(self, entry: jelly.RdfPrefixEntry) -> None:
+        """Store a prefix lookup entry."""
+        self.prefixes.assign_entry(index=entry.id, value=entry.value)
+    def ingest_name_entry(self, entry: jelly.RdfNameEntry) -> None:
+        """Store a name lookup entry."""
+        self.names.assign_entry(index=entry.id, value=entry.value)
+    def ingest_datatype_entry(self, entry: jelly.RdfDatatypeEntry) -> None:
+        """Store a datatype lookup entry."""
+        self.datatypes.assign_entry(index=entry.id, value=entry.value)
+    def decode_term(self, term: Any) -> Any:
+        """
+        Dispatch ``term`` to the handler registered for its exact type.
+        Raises
+        ------
+        TypeError
+            If no handler is registered for ``type(term)``.
+        """
+        try:
+            decode_term = self.term_handlers[type(term)]
+        except KeyError:
+            msg = f"decoder not implemented for {type(term)}"
+            raise TypeError(msg) from None
+        return decode_term(self, term)
+    def decode_iri(self, iri: jelly.RdfIri) -> Any:
+        """Resolve name and prefix through the lookups and pass prefix + name on."""
+        # The name is resolved before the prefix; each lookup updates its own
+        # last-reused index as a side effect.
+        name = self.names.decode_name_term_index(iri.name_id)
+        prefix = self.prefixes.decode_prefix_term_index(iri.prefix_id)
+        return self.adapter.iri(iri=prefix + name)
+    def decode_default_graph(self, _: jelly.RdfDefaultGraph) -> Any:
+        """Return the adapter's default graph; the message itself is unused."""
+        return self.adapter.default_graph()
+    def decode_bnode(self, bnode: str) -> Any:
+        """Pass a blank node identifier to the adapter."""
+        return self.adapter.bnode(bnode)
+    def decode_literal(self, literal: jelly.RdfLiteral) -> Any:
+        """
+        Decode a literal into lexical form plus optional language or datatype.
+        A non-empty language tag takes precedence. The datatype is resolved
+        only when the datatype lookup has a non-zero size and the literal has
+        its ``datatype`` field set.
+        """
+        language = datatype = None
+        if literal.langtag:
+            language = literal.langtag
+        elif self.datatypes.lookup_size and literal.HasField("datatype"):
+            datatype = self.datatypes.decode_datatype_term_index(literal.datatype)
+        return self.adapter.literal(
+            lex=literal.lex,
+            language=language,
+            datatype=datatype,
+        )
+    def decode_namespace_declaration(
+        self,
+        declaration: jelly.RdfNamespaceDeclaration,
+    ) -> Any:
+        """Decode the declared IRI and pass it with the name to the adapter."""
+        iri = self.decode_iri(declaration.value)
+        return self.adapter.namespace_declaration(declaration.name, iri)
+    def decode_graph_start(self, graph_start: jelly.RdfGraphStart) -> Any:
+        """Decode the graph term of a start marker and pass it to the adapter."""
+        term = getattr(graph_start, graph_start.WhichOneof("graph"))
+        return self.adapter.graph_start(self.decode_term(term))
+    def decode_graph_end(self, _: jelly.RdfGraphEnd) -> Any:
+        """Notify the adapter of a graph end marker; the message is unused."""
+        return self.adapter.graph_end()
+    def decode_statement(
+        self,
+        statement: jelly.RdfTriple | jelly.RdfQuad,
+        oneofs: Sequence[str],
+    ) -> Any:
+        """
+        Decode the terms of a statement, reusing terms for omitted positions.
+        Parameters
+        ----------
+        statement
+            Triple or quad message whose positions are protobuf oneofs.
+        oneofs
+            Names of the oneof groups to decode, in output order.
+        Returns
+        -------
+        list
+            One decoded term per entry of ``oneofs``.
+        Raises
+        ------
+        ValueError
+            If a position is unset and no usable term was decoded for it before.
+        """
+        terms = []
+        for oneof in oneofs:
+            field = statement.WhichOneof(oneof)
+            if field:
+                jelly_term = getattr(statement, field)
+                decoded_term = self.decode_term(jelly_term)
+                self.repeated_terms[oneof] = decoded_term
+            else:
+                # .get() rather than [] so that a position never seen before
+                # reaches the explicit error below instead of a bare KeyError.
+                decoded_term = self.repeated_terms.get(oneof)
+                if decoded_term is None:
+                    msg = f"missing repeated term {oneof}"
+                    raise ValueError(msg)
+            terms.append(decoded_term)
+        return terms
+    def decode_triple(self, triple: jelly.RdfTriple) -> Any:
+        """Decode subject, predicate and object and pass them to the adapter."""
+        terms = self.decode_statement(triple, ("subject", "predicate", "object"))
+        return self.adapter.triple(terms)
+    def decode_quad(self, quad: jelly.RdfQuad) -> Any:
+        """Decode subject, predicate, object and graph and pass them to the adapter."""
+        terms = self.decode_statement(quad, ("subject", "predicate", "object", "graph"))
+        return self.adapter.quad(terms)
+    # dispatch by invariant type (no C3 resolution)
+    # The tables hold plain functions, so handlers are called as
+    # handler(self, value) and overriding a method in a subclass does not
+    # change the entry stored here.
+    row_handlers: ClassVar = {
+        jelly.RdfStreamOptions: validate_stream_options,
+        jelly.RdfPrefixEntry: ingest_prefix_entry,
+        jelly.RdfNameEntry: ingest_name_entry,
+        jelly.RdfDatatypeEntry: ingest_datatype_entry,
+        jelly.RdfTriple: decode_triple,
+        jelly.RdfQuad: decode_quad,
+        jelly.RdfGraphStart: decode_graph_start,
+        jelly.RdfGraphEnd: decode_graph_end,
+        jelly.RdfNamespaceDeclaration: decode_namespace_declaration,
+    }
+    term_handlers: ClassVar = {
+        jelly.RdfIri: decode_iri,
+        str: decode_bnode,
+        jelly.RdfLiteral: decode_literal,
+        jelly.RdfDefaultGraph: decode_default_graph,
+    }

pyjelly/parse/ioutils.py ADDED Viewed

@@ -0,0 +1,86 @@
+import os
+from collections.abc import Generator, Iterator
+from itertools import chain
+from typing import IO
+from google.protobuf.proto import parse, parse_length_prefixed
+from pyjelly import jelly
+from pyjelly.errors import JellyConformanceError
+from pyjelly.options import StreamOptions
+from pyjelly.parse.decode import options_from_frame
+def delimited_jelly_hint(header: bytes) -> bool:
+    """
+    Guess from the first 3 bytes of a Jelly file whether it is delimited.
+    Anything other than exactly 3 bytes yields False.
+    Truth table (notation: `0A` = `0x0A`, `NN` = `not 0x0A`, `??` = _don't care_):
+    | Byte 1 | Byte 2 | Byte 3 | Result                                   |
+    |--------|--------|--------|------------------------------------------|
+    | `NN`   |  `??`  |  `??`  | Delimited                                |
+    | `0A`   |  `NN`  |  `??`  | Non-delimited                            |
+    | `0A`   |  `0A`  |  `NN`  | Delimited (size = 10)                    |
+    | `0A`   |  `0A`  |  `0A`  | Non-delimited (stream options size = 10) |
+    >>> delimited_jelly_hint(bytes([0x00, 0x00, 0x00]))
+    True
+    >>> delimited_jelly_hint(bytes([0x00, 0x00, 0x0A]))
+    True
+    >>> delimited_jelly_hint(bytes([0x00, 0x0A, 0x00]))
+    True
+    >>> delimited_jelly_hint(bytes([0x00, 0x0A, 0x0A]))
+    True
+    >>> delimited_jelly_hint(bytes([0x0A, 0x00, 0x00]))
+    False
+    >>> delimited_jelly_hint(bytes([0x0A, 0x00, 0x0A]))
+    False
+    >>> delimited_jelly_hint(bytes([0x0A, 0x0A, 0x00]))
+    True
+    >>> delimited_jelly_hint(bytes([0x0A, 0x0A, 0x0A]))
+    False
+    """
+    if len(header) != 3:  # noqa: PLR2004
+        return False
+    marker = 0x0A
+    first, second, third = header
+    if first != marker:
+        return True
+    return second == marker and third != marker
+def frame_iterator(inp: IO[bytes]) -> Generator[jelly.RdfStreamFrame]:
+    """
+    Yield the non-empty frames read one after another from ``inp``.
+    Iteration stops as soon as ``parse_length_prefixed`` returns a falsy
+    value (presumably None at end of input -- confirm); frames without rows
+    are skipped.
+    """
+    while True:
+        frame = parse_length_prefixed(jelly.RdfStreamFrame, inp)
+        if not frame:
+            return
+        if not frame.rows:
+            continue
+        yield frame
+def get_options_and_frames(
+    inp: IO[bytes],
+) -> tuple[StreamOptions, Iterator[jelly.RdfStreamFrame]]:
+    """
+    Read the stream options from ``inp`` and return them with a frame iterator.
+    The first 3 bytes are peeked to decide whether the input is delimited and
+    the position is then moved back, so ``inp`` must support relative seeks.
+    The returned iterator includes the frame the options were read from.
+    Raises
+    ------
+    JellyConformanceError
+        If a delimited input has no non-empty frame, or a non-delimited input
+        parses to a frame without rows.
+    """
+    header = inp.read(3)
+    is_delimited = delimited_jelly_hint(header)
+    # Undo the peek; fewer than 3 bytes may have been read.
+    inp.seek(-len(header), os.SEEK_CUR)
+    if not is_delimited:
+        # The whole remaining input is a single frame.
+        frame = parse(jelly.RdfStreamFrame, inp.read())
+        if frame.rows:
+            options = options_from_frame(frame, delimited=False)
+            return options, iter((frame,))
+        raise JellyConformanceError(
+            "The stream is corrupted (only contains an empty frame)"
+        )
+    frames = frame_iterator(inp)
+    first_frame = next(frames, None)
+    if first_frame is None:
+        raise JellyConformanceError("No non-empty frames found in the stream")
+    options = options_from_frame(first_frame, delimited=True)
+    # Put the consumed frame back in front of the remaining ones.
+    return options, chain((first_frame,), frames)

pyjelly/parse/lookup.py ADDED Viewed

@@ -0,0 +1,70 @@
+from __future__ import annotations
+from collections import deque
+from dataclasses import dataclass
+from pyjelly.errors import JellyAssertionError, JellyConformanceError
+from pyjelly.options import MAX_LOOKUP_SIZE
+@dataclass
+class LookupDecoder:
+    """
+    Fixed-size lookup table used when decoding Jelly lookup entries.
+    Tracks the last assigned and last reused index so that an index of 0
+    can be resolved relative to them.
+    Parameters
+    ----------
+    lookup_size
+        Maximum lookup size; must not exceed ``MAX_LOOKUP_SIZE``.
+    """
+    last_assigned_index: int
+    last_reused_index: int
+    def __init__(self, *, lookup_size: int) -> None:
+        if lookup_size > MAX_LOOKUP_SIZE:
+            # The bound is inclusive: a size equal to MAX_LOOKUP_SIZE is valid.
+            msg = f"lookup size must be at most {MAX_LOOKUP_SIZE}"
+            raise JellyAssertionError(msg)
+        self.lookup_size = lookup_size
+        # Every slot starts as None, which marks "no entry assigned yet".
+        placeholders = (None,) * lookup_size
+        self.data: deque[str | None] = deque(placeholders, maxlen=lookup_size)
+        self.last_assigned_index = 0
+        self.last_reused_index = 0
+    def assign_entry(self, index: int, value: str) -> None:
+        """
+        Store ``value`` at the 1-based ``index``.
+        An ``index`` of 0 means "the slot after the last assigned one".
+        An index beyond the table size raises IndexError from the
+        underlying deque.
+        """
+        previous_index = self.last_assigned_index
+        if index == 0:
+            index = previous_index + 1
+        assert index > 0
+        self.data[index - 1] = value
+        self.last_assigned_index = index
+    def at(self, index: int) -> str:
+        """
+        Return the entry at the 1-based ``index`` and remember it as last reused.
+        Raises
+        ------
+        IndexError
+            If the slot has never been assigned or lies outside the table.
+        """
+        # Recorded before the lookup, so it is updated even when the lookup
+        # below fails.
+        self.last_reused_index = index
+        value = self.data[index - 1]
+        if value is None:
+            msg = f"invalid resolved index {index}"
+            raise IndexError(msg)
+        return value
+    def decode_prefix_term_index(self, index: int) -> str:
+        """
+        Resolve a prefix index; 0 repeats the last reused prefix.
+        Returns an empty string when ``index`` is 0 and no prefix has been
+        reused yet.
+        """
+        actual_index = index or self.last_reused_index
+        if actual_index == 0:
+            return ""
+        return self.at(actual_index)
+    def decode_name_term_index(self, index: int) -> str:
+        """
+        Resolve a name index; 0 means the slot after the last reused name.
+        Raises
+        ------
+        JellyConformanceError
+            If the resolved index is 0.
+        """
+        actual_index = index or self.last_reused_index + 1
+        # Defensive: with a non-negative last_reused_index the resolved index
+        # is always at least 1, so this branch is not expected to trigger.
+        if actual_index == 0:
+            msg = "0 is not a valid name term index"
+            raise JellyConformanceError(msg)
+        return self.at(actual_index)
+    def decode_datatype_term_index(self, index: int) -> str | None:
+        """
+        Resolve a datatype index; unlike names and prefixes, 0 is rejected.
+        Raises
+        ------
+        JellyConformanceError
+            If ``index`` is 0.
+        """
+        if index == 0:
+            msg = "0 is not a valid datatype term index"
+            raise JellyConformanceError(msg)
+        return self.at(index)

pyjelly/serialize/__init__.py ADDED Viewed

File without changes