PyPI - pyjelly - Versions diffs - 0.7.1__cp311-cp311-macosx_11_0_x86_64.whl - Mend

pyjelly 0.7.1__cp311-cp311-macosx_11_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

cb523b6bada1c6eba8b4__mypyc.cpython-311-darwin.so +0 -0
pyjelly/__init__.py +0 -0
pyjelly/_proto/grpc.proto +33 -0
pyjelly/_proto/patch.proto +165 -0
pyjelly/_proto/rdf.proto +384 -0
pyjelly/errors.py +10 -0
pyjelly/integrations/__init__.py +0 -0
pyjelly/integrations/generic/__init__.py +0 -0
pyjelly/integrations/generic/generic_sink.py +202 -0
pyjelly/integrations/generic/parse.py +412 -0
pyjelly/integrations/generic/serialize.cpython-311-darwin.so +0 -0
pyjelly/integrations/generic/serialize.py +402 -0
pyjelly/integrations/rdflib/__init__.py +24 -0
pyjelly/integrations/rdflib/parse.py +560 -0
pyjelly/integrations/rdflib/serialize.py +408 -0
pyjelly/jelly/__init__.py +5 -0
pyjelly/jelly/rdf_pb2.py +70 -0
pyjelly/jelly/rdf_pb2.pyi +231 -0
pyjelly/options.py +141 -0
pyjelly/parse/__init__.py +0 -0
pyjelly/parse/decode.cpython-311-darwin.so +0 -0
pyjelly/parse/decode.py +447 -0
pyjelly/parse/ioutils.cpython-311-darwin.so +0 -0
pyjelly/parse/ioutils.py +115 -0
pyjelly/parse/lookup.cpython-311-darwin.so +0 -0
pyjelly/parse/lookup.py +70 -0
pyjelly/serialize/__init__.py +0 -0
pyjelly/serialize/encode.cpython-311-darwin.so +0 -0
pyjelly/serialize/encode.py +397 -0
pyjelly/serialize/flows.py +196 -0
pyjelly/serialize/ioutils.cpython-311-darwin.so +0 -0
pyjelly/serialize/ioutils.py +13 -0
pyjelly/serialize/lookup.cpython-311-darwin.so +0 -0
pyjelly/serialize/lookup.py +137 -0
pyjelly/serialize/streams.cpython-311-darwin.so +0 -0
pyjelly/serialize/streams.py +281 -0
pyjelly-0.7.1.dist-info/METADATA +114 -0
pyjelly-0.7.1.dist-info/RECORD +41 -0
pyjelly-0.7.1.dist-info/WHEEL +6 -0
pyjelly-0.7.1.dist-info/entry_points.txt +7 -0
pyjelly-0.7.1.dist-info/licenses/LICENSE +201 -0

pyjelly/serialize/encode.py ADDED Viewed

@@ -0,0 +1,397 @@
+from __future__ import annotations
+from collections.abc import Iterable, Iterator, Sequence
+from enum import IntEnum
+from typing import TypeAlias, TypeVar
+from mypy_extensions import mypyc_attr
+from pyjelly import jelly, options
+from pyjelly.errors import JellyConformanceError
+from pyjelly.serialize.lookup import LookupEncoder
+def split_iri(iri_string: str) -> tuple[str, str]:
+    """
+    Split iri into prefix and name.
+    Args:
+        iri_string (str): full iri string.
+    Returns:
+        tuple[str, str]: iri's prefix and name.
+    """
+    name = iri_string
+    prefix = ""
+    for sep in "#", "/":
+        prefix, char, name = iri_string.rpartition(sep)
+        if char:
+            return prefix + char, name
+    return prefix, name
+# Generic type variable (not referenced by the definitions shown in this file).
+T = TypeVar("T")
+# Sequence of stream rows, as returned by the encoder methods.
+Rows: TypeAlias = Sequence[jelly.RdfStreamRow]
+# Protobuf messages whose subject/predicate/object fields get filled.
+Statement: TypeAlias = jelly.RdfQuad | jelly.RdfTriple
+# Protobuf messages accepted by `encode_graph`.
+HasGraph: TypeAlias = jelly.RdfQuad | jelly.RdfGraphStart
+# Protobuf-level term representations.
+# NOTE(review): `str` presumably stands for blank-node identifiers — confirm.
+Terms: TypeAlias = (
+    jelly.RdfIri | jelly.RdfLiteral | str | jelly.RdfDefaultGraph | jelly.RdfTriple
+)
+@mypyc_attr(allow_interpreted_subclasses=True)
+class TermEncoder:
+    def __init__(
+        self,
+        lookup_preset: options.LookupPreset | None = None,
+    ) -> None:
+        if lookup_preset is None:
+            lookup_preset = options.LookupPreset()
+        self.lookup_preset = lookup_preset
+        self.names = LookupEncoder(lookup_size=lookup_preset.max_names)
+        self.prefixes = LookupEncoder(lookup_size=lookup_preset.max_prefixes)
+        self.datatypes = LookupEncoder(lookup_size=lookup_preset.max_datatypes)
+    def encode_iri_indices(self, iri_string: str) -> tuple[Rows, int, int]:
+        """
+        Encode lookup indices for IRI.
+        Args:
+            iri_string (str): full iri in string format.
+        Returns:
+            tuple[Rows, int, int]: additional rows (if any) and
+                indices in prefix and name tables.
+        """
+        prefix, name = split_iri(iri_string)
+        if self.prefixes.lookup.max_size:
+            prefix_entry_index = self.prefixes.encode_entry_index(prefix)
+        else:
+            name = iri_string
+            prefix_entry_index = None
+        name_entry_index = self.names.encode_entry_index(name)
+        term_rows = []
+        if prefix_entry_index is not None:
+            prefix_entry = jelly.RdfPrefixEntry(id=prefix_entry_index, value=prefix)
+            term_rows.append(jelly.RdfStreamRow(prefix=prefix_entry))
+        if name_entry_index is not None:
+            name_entry = jelly.RdfNameEntry(id=name_entry_index, value=name)
+            term_rows.append(jelly.RdfStreamRow(name=name_entry))
+        prefix_index = self.prefixes.encode_prefix_term_index(prefix)
+        name_index = self.names.encode_name_term_index(name)
+        return term_rows, prefix_index, name_index
+    def encode_iri(self, iri_string: str, iri: jelly.RdfIri) -> Rows:
+        """
+        Encode iri.
+        Args:
+            iri_string (str): full iri in string format.
+            iri (jelly.RdfIri): iri to fill
+        Returns:
+            Rows: extra rows for prefix and name tables, if any.
+        """
+        term_rows, prefix_index, name_index = self.encode_iri_indices(iri_string)
+        iri.prefix_id = prefix_index
+        iri.name_id = name_index
+        return term_rows
+    def encode_default_graph(self, g_default_graph: jelly.RdfDefaultGraph) -> Rows:
+        """
+        Encode default graph.
+        Returns:
+            Rows: empty extra rows (for API consistency)
+        """
+        g_default_graph.CopyFrom(jelly.RdfDefaultGraph())
+        return ()
+    def encode_literal(
+        self,
+        *,
+        lex: str,
+        language: str | None = None,
+        datatype: str | None = None,
+        literal: jelly.RdfLiteral,
+    ) -> Rows:
+        """
+        Encode literal.
+        Args:
+            lex (str): lexical form/literal value
+            language (str | None, optional): langtag. Defaults to None.
+            datatype (str | None, optional): data type if
+            it is a typed literal. Defaults to None.
+            literal (jelly.RdfLiteral): literal to fill.
+        Raises:
+            JellyConformanceError: if datatype specified while
+                datatable is not used.
+        Returns:
+            Rows: extra rows (i.e., datatype entries).
+        """
+        datatype_id = None
+        term_rows: tuple[()] | tuple[jelly.RdfStreamRow] = ()
+        if datatype and datatype != options.STRING_DATATYPE_IRI:
+            if self.datatypes.lookup.max_size == 0:
+                msg = (
+                    f"can't encode literal with type {datatype}: "
+                    "datatype lookup cannot be used if disabled "
+                    "(its size was set to 0)"
+                )
+                raise JellyConformanceError(msg)
+            datatype_entry_id = self.datatypes.encode_entry_index(datatype)
+            if datatype_entry_id is not None:
+                entry = jelly.RdfDatatypeEntry(id=datatype_entry_id, value=datatype)
+                term_rows = (jelly.RdfStreamRow(datatype=entry),)
+            datatype_id = self.datatypes.encode_datatype_term_index(datatype)
+        literal.lex = lex
+        if language:
+            literal.langtag = language
+        if datatype_id:
+            literal.datatype = datatype_id
+        return term_rows
+    def encode_quoted_triple(
+        self, terms: Iterable[object], quoted_statement: jelly.RdfTriple
+    ) -> Rows:
+        """
+        Encode a quoted triple.
+        Notes:
+            Although a triple, it is treated as a part of a statement.
+            Repeated terms are not used when encoding quoted triples.
+        Args:
+            terms (Iterable[object]): triple terms to encode.
+            quoted_statement (jelly.RdfTriple): quoted triple to fill.
+        Returns:
+            Rows: additional stream rows with preceeding
+                information (prefixes, names, datatypes rows, if any).
+        """
+        rows: list[jelly.RdfStreamRow] = []
+        terms = iter(terms)
+        extra_rows = self.encode_spo(next(terms), Slot.subject, quoted_statement)
+        rows.extend(extra_rows)
+        extra_rows = self.encode_spo(next(terms), Slot.predicate, quoted_statement)
+        rows.extend(extra_rows)
+        extra_rows = self.encode_spo(next(terms), Slot.object, quoted_statement)
+        rows.extend(extra_rows)
+        return rows
+    def encode_spo(self, term: object, slot: Slot, statement: Statement) -> Rows:
+        msg = f"unsupported term type: {type(term)}"
+        raise NotImplementedError(msg)
+    def encode_graph(self, term: object, statement: HasGraph) -> Rows:
+        msg = f"unsupported term type: {type(term)}"
+        raise NotImplementedError(msg)
+    def get_iri_field(self, statement: Statement, slot: Slot) -> jelly.RdfIri:
+        """Get IRI field directly based on slot."""
+        if slot == Slot.subject:
+            return statement.s_iri
+        if slot == Slot.predicate:
+            return statement.p_iri
+        return statement.o_iri
+    def get_literal_field(self, statement: Statement, slot: Slot) -> jelly.RdfLiteral:
+        """Get literal field directly based on slot."""
+        if slot == Slot.subject:
+            return statement.s_literal
+        if slot == Slot.predicate:
+            return statement.p_literal
+        return statement.o_literal
+    def set_bnode_field(
+        self, statement: Statement, slot: Slot, identifier: str
+    ) -> None:
+        """Set bnode field directly based on slot."""
+        if slot == Slot.subject:
+            statement.s_bnode = identifier
+        elif slot == Slot.predicate:
+            statement.p_bnode = identifier
+        else:
+            statement.o_bnode = identifier
+    def get_triple_field(self, statement: Statement, slot: Slot) -> jelly.RdfTriple:
+        """Get triple term field directly based on slot."""
+        if slot == Slot.subject:
+            return statement.s_triple_term
+        if slot == Slot.predicate:
+            return statement.p_triple_term
+        return statement.o_triple_term
+class Slot(IntEnum):
+    """Position of a term in a statement; also used to index `repeated_terms` lists."""
+    subject = 0
+    predicate = 1
+    object = 2
+    graph = 3
+def encode_spo(
+    terms: Iterator[object],
+    term_encoder: TermEncoder,
+    repeated_terms: list[object | None],
+    statement: Statement,
+) -> list[jelly.RdfStreamRow]:
+    """
+    Encode the s/p/o of a statement.
+    Args:
+        terms (Iterator[object]): iterator for original terms to encode
+        term_encoder (TermEncoder): encoder with lookup tables
+        repeated_terms (list[object | None): list of repeated terms.
+        statement (Statement): Triple/Quad to fill.
+    Returns:
+        list[jelly.RdfStreamRow] extra rows to append.
+    """
+    rows: list[jelly.RdfStreamRow] = []
+    s = next(terms)
+    if repeated_terms[Slot.subject] != s:
+        extra_rows = term_encoder.encode_spo(s, Slot.subject, statement)
+        rows.extend(extra_rows)
+        repeated_terms[Slot.subject] = s
+    p = next(terms)
+    if repeated_terms[Slot.predicate] != p:
+        extra_rows = term_encoder.encode_spo(p, Slot.predicate, statement)
+        rows.extend(extra_rows)
+        repeated_terms[Slot.predicate] = p
+    o = next(terms)
+    if repeated_terms[Slot.object] != o:
+        extra_rows = term_encoder.encode_spo(o, Slot.object, statement)
+        rows.extend(extra_rows)
+        repeated_terms[Slot.object] = o
+    return rows
+def encode_triple(
+    terms: Iterable[object],
+    term_encoder: TermEncoder,
+    repeated_terms: list[object | None],
+) -> list[jelly.RdfStreamRow]:
+    """
+    Encode one triple.
+    Args:
+        terms (Iterable[object]): original terms to encode
+        term_encoder (TermEncoder): current encoder with lookup tables
+        repeated_terms (list[object | None]): list of repeated terms.
+    Returns:
+        list[jelly.RdfStreamRow]: list of rows to add to the current flow.
+    """
+    triple = jelly.RdfTriple()
+    terms = iter(terms)
+    rows = encode_spo(terms, term_encoder, repeated_terms, triple)
+    row = jelly.RdfStreamRow(triple=triple)
+    rows.append(row)
+    return rows
+def encode_quad(
+    terms: Iterable[object],
+    term_encoder: TermEncoder,
+    repeated_terms: list[object | None],
+) -> list[jelly.RdfStreamRow]:
+    """
+    Encode one quad.
+    Args:
+        terms (Iterable[object]): original terms to encode
+        term_encoder (TermEncoder): current encoder with lookup tables
+        repeated_terms (list[object | None]): list of repeated terms.
+    Returns:
+        list[jelly.RdfStreamRow]: list of messages to append to current flow.
+    """
+    terms = iter(terms)
+    quad = jelly.RdfQuad()
+    rows = encode_spo(terms, term_encoder, repeated_terms, quad)
+    g = next(terms)
+    if repeated_terms[Slot.graph] != g:
+        extra_rows = term_encoder.encode_graph(g, quad)
+        rows.extend(extra_rows)
+        repeated_terms[Slot.graph] = g
+    row = jelly.RdfStreamRow(quad=quad)
+    rows.append(row)
+    return rows
+def encode_namespace_declaration(
+    name: str,
+    value: str,
+    term_encoder: TermEncoder,
+) -> list[jelly.RdfStreamRow]:
+    """
+    Encode namespace declaration.
+    Args:
+        name (str): namespace prefix label
+        value (str): namespace iri
+        term_encoder (TermEncoder): current encoder
+    Returns:
+        list[jelly.RdfStreamRow]: list of messages to append to current flow.
+    """
+    iri = jelly.RdfIri()
+    [*rows] = term_encoder.encode_iri(value, iri=iri)
+    declaration = jelly.RdfNamespaceDeclaration(name=name, value=iri)
+    row = jelly.RdfStreamRow(namespace=declaration)
+    rows.append(row)
+    return rows
+def encode_options(
+    lookup_preset: options.LookupPreset,
+    stream_types: options.StreamTypes,
+    params: options.StreamParameters,
+) -> jelly.RdfStreamRow:
+    """
+    Encode stream options to ProtoBuf message.
+    Args:
+        lookup_preset (options.LookupPreset): lookup tables options
+        stream_types (options.StreamTypes): physical and logical types
+        params (options.StreamParameters): other params.
+    Returns:
+        jelly.RdfStreamRow: encoded stream options row
+    """
+    return jelly.RdfStreamRow(
+        options=jelly.RdfStreamOptions(
+            stream_name=params.stream_name,
+            physical_type=stream_types.physical_type,
+            generalized_statements=params.generalized_statements,
+            rdf_star=params.rdf_star,
+            max_name_table_size=lookup_preset.max_names,
+            max_prefix_table_size=lookup_preset.max_prefixes,
+            max_datatype_table_size=lookup_preset.max_datatypes,
+            logical_type=stream_types.logical_type,
+            version=params.version,
+        )
+    )

pyjelly/serialize/flows.py ADDED Viewed

@@ -0,0 +1,196 @@
+from __future__ import annotations
+from collections import UserList
+from collections.abc import Iterable
+from dataclasses import dataclass
+from typing import Any, ClassVar
+from typing_extensions import override
+from pyjelly import jelly
+# Row count at which BoundedFrameFlow emits a frame when no frame_size is given.
+DEFAULT_FRAME_SIZE = 250
+class FrameFlow(UserList[jelly.RdfStreamRow]):
+    """
+    Abstract base class for producing Jelly frames from RDF stream rows.
+    Collects stream rows and assembles them into RdfStreamFrame objects when ready.
+    Allows for passing LogicalStreamType, required for
+        logical subtypes and non-delimited streams.
+    """
+    logical_type: jelly.LogicalStreamType
+    registry: ClassVar[dict[jelly.LogicalStreamType, type[FrameFlow]]] = {}
+    def __init__(
+        self,
+        initlist: Iterable[jelly.RdfStreamRow] | None = None,
+        *,
+        logical_type: jelly.LogicalStreamType | None = None,
+        **__kwargs: Any,
+    ) -> None:
+        super().__init__(initlist)
+        self.logical_type = logical_type or self.__class__.logical_type
+    def frame_from_graph(self) -> jelly.RdfStreamFrame | None:
+        """
+        Treat the current rows as a graph and produce a frame.
+        Default implementation returns None.
+        """
+        return None
+    def frame_from_dataset(self) -> jelly.RdfStreamFrame | None:
+        """
+        Treat the current rows as a dataset and produce a frame.
+        Default implementation returns None.
+        """
+        return None
+    def frame_from_bounds(self) -> jelly.RdfStreamFrame | None:
+        return None
+    def to_stream_frame(self) -> jelly.RdfStreamFrame | None:
+        """
+        Create stream frame from flow content.
+        Notes:
+            Clears flow content after creating the frame.
+        Returns:
+            jelly.RdfStreamFrame | None: stream frame
+        """
+        if not self:
+            return None
+        frame = jelly.RdfStreamFrame(rows=self)
+        self.clear()
+        return frame
+class ManualFrameFlow(FrameFlow):
+    """
+    Produce frames only when explicitly requested via `to_stream_frame()`.
+    None of the `frame_from_*` hooks are overridden, so no frame is ever
+    emitted automatically.
+    !!! warning
+        All stream rows are kept in memory until `to_stream_frame()` is called.
+        This may lead to high memory usage for large streams.
+    Used for non-delimited serialization.
+    """
+    logical_type = jelly.LOGICAL_STREAM_TYPE_UNSPECIFIED
+@dataclass
+class BoundedFrameFlow(FrameFlow):
+    """
+    Produce frames automatically when a fixed number of rows is reached.
+    Used for delimited encoding (default mode).
+    """
+    logical_type = jelly.LOGICAL_STREAM_TYPE_UNSPECIFIED
+    frame_size: int
+    @override
+    def __init__(
+        self,
+        initlist: Iterable[jelly.RdfStreamRow] | None = None,
+        logical_type: jelly.LogicalStreamType | None = None,
+        *,
+        frame_size: int | None = None,
+    ) -> None:
+        super().__init__(initlist, logical_type=logical_type)
+        self.frame_size = frame_size or DEFAULT_FRAME_SIZE
+    @override
+    def frame_from_bounds(self) -> jelly.RdfStreamFrame | None:
+        """
+        Emit frame from flow if full.
+        Returns:
+            jelly.RdfStreamFrame | None: stream frame
+        """
+        if len(self) >= self.frame_size:
+            return self.to_stream_frame()
+        return None
+class FlatTriplesFrameFlow(BoundedFrameFlow):
+    """Bounded flow tagged with the FLAT_TRIPLES logical stream type."""
+    logical_type = jelly.LOGICAL_STREAM_TYPE_FLAT_TRIPLES
+class FlatQuadsFrameFlow(BoundedFrameFlow):
+    """Bounded flow tagged with the FLAT_QUADS logical stream type."""
+    logical_type = jelly.LOGICAL_STREAM_TYPE_FLAT_QUADS
+class GraphsFrameFlow(FrameFlow):
+    """Flow for the GRAPHS logical stream type; emits a frame on `frame_from_graph()`."""
+    logical_type = jelly.LOGICAL_STREAM_TYPE_GRAPHS
+    def frame_from_graph(self) -> jelly.RdfStreamFrame | None:
+        """
+        Emit current flow content (one graph) as jelly frame.
+        Delegates to `to_stream_frame()`, so the flow is cleared afterwards.
+        Returns:
+            jelly.RdfStreamFrame | None: jelly frame or none if
+                flow is empty.
+        """
+        return self.to_stream_frame()
+class DatasetsFrameFlow(FrameFlow):
+    """Flow for the DATASETS logical stream type; emits a frame on `frame_from_dataset()`."""
+    logical_type = jelly.LOGICAL_STREAM_TYPE_DATASETS
+    def frame_from_dataset(self) -> jelly.RdfStreamFrame | None:
+        """
+        Emit current flow content (dataset) as jelly frame.
+        Delegates to `to_stream_frame()`, so the flow is cleared afterwards.
+        Returns:
+            jelly.RdfStreamFrame | None: jelly frame or none if
+                flow is empty.
+        """
+        return self.to_stream_frame()
+# Maps each supported base logical stream type to its flow class;
+# consulted by `flow_for_type`.
+FLOW_DISPATCH: dict[jelly.LogicalStreamType, type[FrameFlow]] = {
+    jelly.LOGICAL_STREAM_TYPE_FLAT_TRIPLES: FlatTriplesFrameFlow,
+    jelly.LOGICAL_STREAM_TYPE_FLAT_QUADS: FlatQuadsFrameFlow,
+    jelly.LOGICAL_STREAM_TYPE_GRAPHS: GraphsFrameFlow,
+    jelly.LOGICAL_STREAM_TYPE_DATASETS: DatasetsFrameFlow,
+}
+def flow_for_type(logical_type: jelly.LogicalStreamType) -> type[FrameFlow]:
+    """
+    Return flow based on logical type requested.
+    Note: uses base logical type for subtypes (i.e., SUBJECT_GRAPHS uses
+        the same flow as its base type GRAPHS).
+    Args:
+        logical_type (jelly.LogicalStreamType): logical type requested.
+    Raises:
+        NotImplementedError: if (base) logical stream type is not supported.
+    Returns:
+        type[FrameFlow]: FrameFlow for respective logical type.
+    """
+    try:
+        base_logical_type_value = logical_type % 10
+        base_name = jelly.LogicalStreamType.Name(base_logical_type_value)
+        return FLOW_DISPATCH[getattr(jelly.LogicalStreamType, base_name)]
+    except KeyError:
+        msg = (
+            "unsupported logical stream type: "
+            f"{jelly.LogicalStreamType.Name(logical_type)}"
+        )
+        raise NotImplementedError(msg) from None

pyjelly/serialize/ioutils.cpython-311-darwin.so ADDED Viewed

Binary file

pyjelly/serialize/ioutils.py ADDED Viewed

@@ -0,0 +1,13 @@
+from typing import IO
+from google.protobuf.proto import serialize_length_prefixed
+from pyjelly import jelly
+def write_delimited(frame: jelly.RdfStreamFrame, output_stream: IO[bytes]) -> None:
+    """Write `frame` to `output_stream` via protobuf's `serialize_length_prefixed`."""
+    serialize_length_prefixed(frame, output_stream)
+def write_single(frame: jelly.RdfStreamFrame, output_stream: IO[bytes]) -> None:
+    output_stream.write(frame.SerializeToString(deterministic=True))

pyjelly/serialize/lookup.cpython-311-darwin.so ADDED Viewed

Binary file