pyjelly 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pyjelly might be problematic. Click here for more details.

pyjelly/options.py ADDED
@@ -0,0 +1,126 @@
1
+ from __future__ import annotations
2
+
3
+ import mimetypes
4
+ from contextlib import suppress
5
+ from dataclasses import dataclass, field
6
+ from typing import Final
7
+ from typing_extensions import Self
8
+
9
+ from pyjelly import jelly
10
+ from pyjelly.errors import (
11
+ JellyAssertionError,
12
+ JellyConformanceError,
13
+ JellyNotImplementedError,
14
+ )
15
+
16
# Smallest ``max_names`` value that ``LookupPreset`` accepts.
MIN_NAME_LOOKUP_SIZE: Final[int] = 8

# Upper bound for the size of any single lookup.
MAX_LOOKUP_SIZE: Final[int] = 4096
# Default value of ``StreamOptions.version``.
MAX_VERSION: Final[int] = 2

# Defaults used by ``LookupPreset`` when no explicit size is given.
DEFAULT_NAME_LOOKUP_SIZE: Final[int] = 4000
DEFAULT_PREFIX_LOOKUP_SIZE: Final[int] = 150
DEFAULT_DATATYPE_LOOKUP_SIZE: Final[int] = 32

STRING_DATATYPE_IRI: Final[str] = "http://www.w3.org/2001/XMLSchema#string"

# Deliberately not ``Final``: this is a switch, see the docstring below.
INTEGRATION_SIDE_EFFECTS: bool = True
"""
Whether to allow integration module imports to trigger side effects.

These side effects are cheap and may include populating some registries
for guessing the defaults for external integrations that work with Jelly.
"""

# MIME types that ``register_mimetypes`` associates with the Jelly extension.
MIMETYPES: Final[tuple[str, ...]] = ("application/x-jelly-rdf",)
36
+
37
+
38
def register_mimetypes(extension: str = ".jelly") -> None:
    """
    Register every Jelly MIME type for the given file extension.

    Each entry of ``MIMETYPES`` is added to the standard library
    ``mimetypes`` registry under ``extension``.

    >>> register_mimetypes()
    >>> mimetypes.guess_type("out.jelly")
    ('application/x-jelly-rdf', None)
    """
    add_type = mimetypes.add_type
    for jelly_mimetype in MIMETYPES:
        add_type(jelly_mimetype, extension)
48
+
49
+
50
@dataclass(frozen=True)
class LookupPreset:
    """
    Maximum sizes of the name, prefix and datatype lookups.

    Raises
    ------
    JellyConformanceError
        If ``max_names`` is smaller than ``MIN_NAME_LOOKUP_SIZE``.

    """

    max_names: int = DEFAULT_NAME_LOOKUP_SIZE
    max_prefixes: int = DEFAULT_PREFIX_LOOKUP_SIZE
    max_datatypes: int = DEFAULT_DATATYPE_LOOKUP_SIZE

    def __post_init__(self) -> None:
        """Reject name lookups smaller than the allowed minimum."""
        if self.max_names < MIN_NAME_LOOKUP_SIZE:
            # Build the message from the constant so the reported limit can
            # never disagree with the limit that is actually enforced.
            msg = f"name lookup size must be at least {MIN_NAME_LOOKUP_SIZE}"
            raise JellyConformanceError(msg)

    @classmethod
    def small(cls) -> Self:
        """Return a preset with 128 names, 32 prefixes and 32 datatypes."""
        return cls(max_names=128, max_prefixes=32, max_datatypes=32)
64
+
65
+
66
@dataclass(frozen=True)
class StreamTypes:
    """
    Pair of physical and logical stream types.

    Raises
    ------
    JellyNotImplementedError
        If ``physical_type`` is ``PHYSICAL_STREAM_TYPE_UNSPECIFIED``.
    JellyAssertionError
        If ``validate_type_compatibility`` rejects the pair.

    """

    physical_type: jelly.PhysicalStreamType
    logical_type: jelly.LogicalStreamType = jelly.LOGICAL_STREAM_TYPE_UNSPECIFIED

    @property
    def flat(self) -> bool:
        """Whether the logical type is flat triples or flat quads."""
        flat_logical_types = (
            jelly.LOGICAL_STREAM_TYPE_FLAT_TRIPLES,
            jelly.LOGICAL_STREAM_TYPE_FLAT_QUADS,
        )
        return self.logical_type in flat_logical_types

    def __repr__(self) -> str:
        """Show enum names when both resolve, the raw values otherwise."""
        try:
            physical = jelly.PhysicalStreamType.Name(self.physical_type)
            logical = jelly.LogicalStreamType.Name(self.logical_type)
        except ValueError:
            return f"StreamTypes({self.physical_type}, {self.logical_type})"
        return f"StreamTypes({physical}, {logical})"

    def __post_init__(self) -> None:
        """Require a physical type and a compatible logical type."""
        if self.physical_type == jelly.PHYSICAL_STREAM_TYPE_UNSPECIFIED:
            raise JellyNotImplementedError("physical type must be specified")
        validate_type_compatibility(
            physical_type=self.physical_type,
            logical_type=self.logical_type,
        )
93
+
94
+
95
@dataclass(frozen=True)
class StreamOptions:
    """Immutable bundle of the options that describe one Jelly stream."""

    # Physical/logical type pair; the only field without a default.
    stream_types: StreamTypes
    # Lookup size limits; a default ``LookupPreset`` is built per instance.
    lookup_preset: LookupPreset = field(default_factory=LookupPreset)
    generalized_statements: bool = False
    rdf_star: bool = False
    # Defaults to ``MAX_VERSION``.
    version: int = MAX_VERSION
    delimited: bool = True
    namespace_declarations: bool = False
    stream_name: str = ""
105
+
106
+
107
# Logical stream types that ``validate_type_compatibility`` accepts only in
# combination with ``PHYSICAL_STREAM_TYPE_TRIPLES``. Frozen because this is a
# shared module-level constant that is only ever used for membership tests.
TRIPLES_ONLY_LOGICAL_TYPES = frozenset(
    {
        jelly.LOGICAL_STREAM_TYPE_GRAPHS,
        jelly.LOGICAL_STREAM_TYPE_SUBJECT_GRAPHS,
        jelly.LOGICAL_STREAM_TYPE_FLAT_TRIPLES,
    }
)
112
+
113
+
114
def validate_type_compatibility(
    physical_type: jelly.PhysicalStreamType,
    logical_type: jelly.LogicalStreamType,
) -> None:
    """
    Check that a physical and a logical stream type can be combined.

    An unspecified logical type is always accepted. Otherwise the physical
    type must be ``PHYSICAL_STREAM_TYPE_TRIPLES`` exactly when the logical
    type is listed in ``TRIPLES_ONLY_LOGICAL_TYPES``.

    Raises
    ------
    JellyAssertionError
        If the two types do not agree.

    """
    if logical_type == jelly.LOGICAL_STREAM_TYPE_UNSPECIFIED:
        return

    physical_is_triples = physical_type == jelly.PHYSICAL_STREAM_TYPE_TRIPLES
    logical_is_triples = logical_type in TRIPLES_ONLY_LOGICAL_TYPES
    if physical_is_triples == logical_is_triples:
        return

    physical_name = jelly.PhysicalStreamType.Name(physical_type)
    logical_name = jelly.LogicalStreamType.Name(logical_type)
    raise JellyAssertionError(f"{physical_name} is not compatible with {logical_name}")
File without changes
@@ -0,0 +1,233 @@
1
+ from __future__ import annotations
2
+
3
+ from abc import ABCMeta, abstractmethod
4
+ from collections.abc import Iterable, Sequence
5
+ from typing import Any, ClassVar
6
+ from typing_extensions import Never
7
+
8
+ from pyjelly import jelly
9
+ from pyjelly.options import LookupPreset, StreamOptions, StreamTypes
10
+ from pyjelly.parse.lookup import LookupDecoder
11
+
12
+
13
def options_from_frame(
    frame: jelly.RdfStreamFrame,
    *,
    delimited: bool,
) -> StreamOptions:
    """
    Build ``StreamOptions`` from the options row that opens a frame.

    Parameters
    ----------
    frame
        Frame whose first row is read through its ``options`` field.
    delimited
        Stored unchanged in the returned options.

    Returns
    -------
    StreamOptions
        Options mirroring the stream types, lookup sizes, feature flags,
        name and version found in the first row.

    Raises
    ------
    IndexError
        If ``frame`` has no rows.

    """
    row = frame.rows[0]
    options = row.options
    return StreamOptions(
        stream_types=StreamTypes(
            physical_type=options.physical_type,
            logical_type=options.logical_type,
        ),
        lookup_preset=LookupPreset(
            max_names=options.max_name_table_size,
            max_prefixes=options.max_prefix_table_size,
            max_datatypes=options.max_datatype_table_size,
        ),
        # Carry over the feature flags declared by the stream; without this
        # the parsed options would always report them as disabled.
        generalized_statements=options.generalized_statements,
        rdf_star=options.rdf_star,
        stream_name=options.stream_name,
        version=options.version,
        delimited=delimited,
    )
34
+
35
+
36
def _adapter_missing(feature: str, *, options: StreamOptions) -> Never:
    """
    Raise ``NotImplementedError`` for a feature an adapter does not provide.

    The message names the physical and logical stream types taken from
    ``options`` together with ``feature``.
    """
    stream_types = options.stream_types
    physical = jelly.PhysicalStreamType.Name(stream_types.physical_type)
    logical = jelly.LogicalStreamType.Name(stream_types.logical_type)
    raise NotImplementedError(
        f"adapter with {physical} and {logical} does not implement {feature}"
    )
46
+
47
+
48
class Adapter(metaclass=ABCMeta):
    """
    Base class for objects that turn decoded Jelly data into caller values.

    Subclasses must implement the four term constructors. The statement,
    graph and namespace hooks are optional and raise ``NotImplementedError``
    through ``_adapter_missing`` unless overridden.
    """

    def __init__(self, options: StreamOptions) -> None:
        """Store the stream options this adapter works with."""
        self.options = options

    # Obligatory abstract methods--all adapters must implement these
    @abstractmethod
    def iri(self, iri: str) -> Any:
        """Build the representation of an IRI."""
        raise NotImplementedError

    @abstractmethod
    def default_graph(self) -> Any:
        """Build the representation of the default graph."""
        raise NotImplementedError

    @abstractmethod
    def bnode(self, bnode: str) -> Any:
        """Build the representation of a blank node."""
        raise NotImplementedError

    @abstractmethod
    def literal(
        self,
        lex: str,
        language: str | None = None,
        datatype: str | None = None,
    ) -> Any:
        """Build the representation of a literal."""
        raise NotImplementedError

    # Optional abstract methods--not required to be implemented by all adapters
    def triple(self, terms: Iterable[Any]) -> Any:  # noqa: ARG002
        """Handle a decoded triple; not implemented by default."""
        _adapter_missing("decoding triples", options=self.options)

    def quad(self, terms: Iterable[Any]) -> Any:  # noqa: ARG002
        """Handle a decoded quad; not implemented by default."""
        _adapter_missing("decoding quads", options=self.options)

    def graph_start(self, graph_id: Any) -> Any:  # noqa: ARG002
        """Handle a graph start marker; not implemented by default."""
        _adapter_missing("decoding graph start markers", options=self.options)

    def graph_end(self) -> Any:
        """Handle a graph end marker; not implemented by default."""
        _adapter_missing("decoding graph end markers", options=self.options)

    def namespace_declaration(self, name: str, iri: str) -> Any:  # noqa: ARG002
        """Handle a namespace declaration; not implemented by default."""
        _adapter_missing("decoding namespace declarations", options=self.options)

    def frame(self) -> Any:
        """Return the result for a finished frame; ``None`` by default."""
        return None
92
+
93
+
94
class Decoder:
    """
    Decode Jelly rows and terms and pass the results to an ``Adapter``.

    Rows and terms are dispatched on their exact type through the
    ``row_handlers`` and ``term_handlers`` tables at the end of the class.
    """

    def __init__(self, adapter: Adapter) -> None:
        """Create empty lookups sized from the adapter's stream options."""
        self.adapter = adapter
        self.names = LookupDecoder(lookup_size=self.options.lookup_preset.max_names)
        self.prefixes = LookupDecoder(
            lookup_size=self.options.lookup_preset.max_prefixes
        )
        self.datatypes = LookupDecoder(
            lookup_size=self.options.lookup_preset.max_datatypes
        )
        # Last decoded term per statement slot ("subject", "predicate", ...),
        # reused by decode_statement when a slot is left unset.
        self.repeated_terms: dict[str, Any] = {}

    @property
    def options(self) -> StreamOptions:
        """Stream options of the underlying adapter."""
        return self.adapter.options

    def decode_frame(self, frame: jelly.RdfStreamFrame) -> Any:
        """Decode every row of ``frame`` and return ``adapter.frame()``."""
        for row_owner in frame.rows:
            row = getattr(row_owner, row_owner.WhichOneof("row"))
            self.decode_row(row)
        return self.adapter.frame()

    def decode_row(self, row: Any) -> Any | None:
        """
        Dispatch ``row`` to its handler from ``row_handlers``.

        Raises
        ------
        TypeError
            If no handler is registered for ``type(row)``.

        """
        try:
            decode_row = self.row_handlers[type(row)]
        except KeyError:
            msg = f"decoder not implemented for {type(row)}"
            raise TypeError(msg) from None
        return decode_row(self, row)

    def validate_stream_options(self, options: jelly.RdfStreamOptions) -> None:
        """
        Check an options row against the options the decoder was built with.

        Raises
        ------
        AssertionError
            If the stream name or a lookup size differs, or if the row
            declares a version greater than the decoder's.

        """
        expected = self.options
        preset = expected.lookup_preset
        checks = (
            ("stream name", expected.stream_name == options.stream_name),
            ("version", expected.version >= options.version),
            ("prefix lookup size", preset.max_prefixes == options.max_prefix_table_size),
            (
                "datatype lookup size",
                preset.max_datatypes == options.max_datatype_table_size,
            ),
            ("name lookup size", preset.max_names == options.max_name_table_size),
        )
        mismatches = [label for label, consistent in checks if not consistent]
        if mismatches:
            # Raised explicitly rather than with ``assert`` so the validation
            # still runs when Python is started with -O.
            msg = "stream options do not match decoder options: " + ", ".join(
                mismatches
            )
            raise AssertionError(msg)

    def ingest_prefix_entry(self, entry: jelly.RdfPrefixEntry) -> None:
        """Store a prefix lookup entry."""
        self.prefixes.assign_entry(index=entry.id, value=entry.value)

    def ingest_name_entry(self, entry: jelly.RdfNameEntry) -> None:
        """Store a name lookup entry."""
        self.names.assign_entry(index=entry.id, value=entry.value)

    def ingest_datatype_entry(self, entry: jelly.RdfDatatypeEntry) -> None:
        """Store a datatype lookup entry."""
        self.datatypes.assign_entry(index=entry.id, value=entry.value)

    def decode_term(self, term: Any) -> Any:
        """
        Dispatch ``term`` to its handler from ``term_handlers``.

        Raises
        ------
        TypeError
            If no handler is registered for ``type(term)``.

        """
        try:
            decode_term = self.term_handlers[type(term)]
        except KeyError:
            msg = f"decoder not implemented for {type(term)}"
            raise TypeError(msg) from None
        return decode_term(self, term)

    def decode_iri(self, iri: jelly.RdfIri) -> Any:
        """Resolve the prefix and name of ``iri`` and build it via the adapter."""
        # The name is resolved before the prefix; both lookups keep state.
        name = self.names.decode_name_term_index(iri.name_id)
        prefix = self.prefixes.decode_prefix_term_index(iri.prefix_id)
        return self.adapter.iri(iri=prefix + name)

    def decode_default_graph(self, _: jelly.RdfDefaultGraph) -> Any:
        """Return the adapter's default graph."""
        return self.adapter.default_graph()

    def decode_bnode(self, bnode: str) -> Any:
        """Build a blank node via the adapter."""
        return self.adapter.bnode(bnode)

    def decode_literal(self, literal: jelly.RdfLiteral) -> Any:
        """
        Build a literal via the adapter.

        A language tag takes precedence; the datatype is resolved only when
        no language tag is set, the datatype lookup is non-empty and the
        ``datatype`` field is present.
        """
        language = datatype = None
        if literal.langtag:
            language = literal.langtag
        elif self.datatypes.lookup_size and literal.HasField("datatype"):
            datatype = self.datatypes.decode_datatype_term_index(literal.datatype)
        return self.adapter.literal(
            lex=literal.lex,
            language=language,
            datatype=datatype,
        )

    def decode_namespace_declaration(
        self,
        declaration: jelly.RdfNamespaceDeclaration,
    ) -> Any:
        """Decode the declared IRI and pass it with its name to the adapter."""
        iri = self.decode_iri(declaration.value)
        return self.adapter.namespace_declaration(declaration.name, iri)

    def decode_graph_start(self, graph_start: jelly.RdfGraphStart) -> Any:
        """Decode the graph term and pass it to ``adapter.graph_start``."""
        term = getattr(graph_start, graph_start.WhichOneof("graph"))
        return self.adapter.graph_start(self.decode_term(term))

    def decode_graph_end(self, _: jelly.RdfGraphEnd) -> Any:
        """Forward a graph end marker to the adapter."""
        return self.adapter.graph_end()

    def decode_statement(
        self,
        statement: jelly.RdfTriple | jelly.RdfQuad,
        oneofs: Sequence[str],
    ) -> Any:
        """
        Decode the terms of a statement, reusing repeated terms.

        For each name in ``oneofs`` the set field is decoded and remembered;
        when no field is set, the term remembered for that slot is reused.

        Raises
        ------
        ValueError
            If a slot is unset and no usable term was remembered for it.

        """
        terms = []
        for oneof in oneofs:
            field = statement.WhichOneof(oneof)
            if field:
                decoded_term = self.decode_term(getattr(statement, field))
                self.repeated_terms[oneof] = decoded_term
            else:
                # .get() so that a slot never seen before reaches the explicit
                # error below instead of surfacing as a bare KeyError.
                decoded_term = self.repeated_terms.get(oneof)
                if decoded_term is None:
                    msg = f"missing repeated term {oneof}"
                    raise ValueError(msg)
            terms.append(decoded_term)
        return terms

    def decode_triple(self, triple: jelly.RdfTriple) -> Any:
        """Decode a triple and pass its terms to ``adapter.triple``."""
        terms = self.decode_statement(triple, ("subject", "predicate", "object"))
        return self.adapter.triple(terms)

    def decode_quad(self, quad: jelly.RdfQuad) -> Any:
        """Decode a quad and pass its terms to ``adapter.quad``."""
        terms = self.decode_statement(quad, ("subject", "predicate", "object", "graph"))
        return self.adapter.quad(terms)

    # dispatch by invariant type (no C3 resolution)
    row_handlers: ClassVar = {
        jelly.RdfStreamOptions: validate_stream_options,
        jelly.RdfPrefixEntry: ingest_prefix_entry,
        jelly.RdfNameEntry: ingest_name_entry,
        jelly.RdfDatatypeEntry: ingest_datatype_entry,
        jelly.RdfTriple: decode_triple,
        jelly.RdfQuad: decode_quad,
        jelly.RdfGraphStart: decode_graph_start,
        jelly.RdfGraphEnd: decode_graph_end,
        jelly.RdfNamespaceDeclaration: decode_namespace_declaration,
    }

    term_handlers: ClassVar = {
        jelly.RdfIri: decode_iri,
        str: decode_bnode,
        jelly.RdfLiteral: decode_literal,
        jelly.RdfDefaultGraph: decode_default_graph,
    }
@@ -0,0 +1,86 @@
1
+ import os
2
+ from collections.abc import Generator, Iterator
3
+ from itertools import chain
4
+ from typing import IO
5
+
6
+ from google.protobuf.proto import parse, parse_length_prefixed
7
+
8
+ from pyjelly import jelly
9
+ from pyjelly.errors import JellyConformanceError
10
+ from pyjelly.options import StreamOptions
11
+ from pyjelly.parse.decode import options_from_frame
12
+
13
+
14
def delimited_jelly_hint(header: bytes) -> bool:
    """
    Guess from the first 3 bytes whether a Jelly file is delimited.

    Anything other than exactly three bytes yields ``False``.

    Truth table (notation: `0A` = `0x0A`, `NN` = `not 0x0A`, `??` = _don't care_):

    | Byte 1 | Byte 2 | Byte 3 | Result                                   |
    |--------|--------|--------|------------------------------------------|
    | `NN`   |  `??`  |  `??`  | Delimited                                |
    | `0A`   |  `NN`  |  `??`  | Non-delimited                            |
    | `0A`   |  `0A`  |  `NN`  | Delimited (size = 10)                    |
    | `0A`   |  `0A`  |  `0A`  | Non-delimited (stream options size = 10) |

    >>> delimited_jelly_hint(bytes([0x00, 0x00, 0x00]))
    True

    >>> delimited_jelly_hint(bytes([0x00, 0x00, 0x0A]))
    True

    >>> delimited_jelly_hint(bytes([0x00, 0x0A, 0x00]))
    True

    >>> delimited_jelly_hint(bytes([0x00, 0x0A, 0x0A]))
    True

    >>> delimited_jelly_hint(bytes([0x0A, 0x00, 0x00]))
    False

    >>> delimited_jelly_hint(bytes([0x0A, 0x00, 0x0A]))
    False

    >>> delimited_jelly_hint(bytes([0x0A, 0x0A, 0x00]))
    True

    >>> delimited_jelly_hint(bytes([0x0A, 0x0A, 0x0A]))
    False
    """
    marker = 0x0A
    header_length = 3
    if len(header) != header_length:
        return False
    first, second, third = header
    if first != marker:
        return True
    return second == marker and third != marker
55
+
56
+
57
def frame_iterator(inp: IO[bytes]) -> Generator[jelly.RdfStreamFrame]:
    """
    Yield the non-empty frames of a length-prefixed stream.

    Frames are read with ``parse_length_prefixed`` until it returns a falsy
    value; frames without rows are skipped.
    """
    while True:
        frame = parse_length_prefixed(jelly.RdfStreamFrame, inp)
        if not frame:
            break
        if frame.rows:
            yield frame
61
+
62
+
63
def get_options_and_frames(
    inp: IO[bytes],
) -> tuple[StreamOptions, Iterator[jelly.RdfStreamFrame]]:
    """
    Read the stream options and return them with an iterator of all frames.

    The first three bytes are inspected with ``delimited_jelly_hint`` and the
    stream is then rewound by the number of bytes read, so ``inp`` must
    support ``seek``. A delimited stream is read frame by frame; otherwise
    the remaining input is parsed as a single frame.

    Raises
    ------
    JellyConformanceError
        If a delimited stream has no non-empty frame, or a non-delimited
        stream parses to a frame without rows.

    """
    header = inp.read(3)
    delimited = delimited_jelly_hint(header)
    inp.seek(-len(header), os.SEEK_CUR)

    if not delimited:
        single_frame = parse(jelly.RdfStreamFrame, inp.read())
        if not single_frame.rows:
            raise JellyConformanceError(
                "The stream is corrupted (only contains an empty frame)"
            )
        return options_from_frame(single_frame, delimited=False), iter((single_frame,))

    remaining = frame_iterator(inp)
    head = next(remaining, None)
    if head is None:
        raise JellyConformanceError("No non-empty frames found in the stream")
    # Put the consumed first frame back in front of the remaining ones.
    return options_from_frame(head, delimited=True), chain((head,), remaining)
@@ -0,0 +1,70 @@
1
+ from __future__ import annotations
2
+
3
+ from collections import deque
4
+ from dataclasses import dataclass
5
+
6
+ from pyjelly.errors import JellyAssertionError, JellyConformanceError
7
+ from pyjelly.options import MAX_LOOKUP_SIZE
8
+
9
+
10
@dataclass
class LookupDecoder:
    """
    Fixed-size lookup used when decoding Jelly-compressed RDF.

    Tracks the last assigned and last reused index. Indices are 1-based;
    index ``i`` is stored at position ``i - 1``.

    Parameters
    ----------
    lookup_size
        Maximum lookup size.

    """

    last_assigned_index: int
    last_reused_index: int

    def __init__(self, *, lookup_size: int) -> None:
        """
        Create a lookup with ``lookup_size`` empty slots.

        Raises
        ------
        JellyAssertionError
            If ``lookup_size`` exceeds ``MAX_LOOKUP_SIZE``.

        """
        if lookup_size > MAX_LOOKUP_SIZE:
            # A size equal to the maximum is accepted, so say "at most".
            msg = f"lookup size must be at most {MAX_LOOKUP_SIZE}"
            raise JellyAssertionError(msg)
        self.lookup_size = lookup_size
        placeholders = (None,) * lookup_size
        self.data: deque[str | None] = deque(placeholders, maxlen=lookup_size)
        self.last_assigned_index = 0
        self.last_reused_index = 0

    def assign_entry(self, index: int, value: str) -> None:
        """
        Store ``value`` at ``index``.

        An ``index`` of 0 means "one past the last assigned index".
        """
        previous_index = self.last_assigned_index
        if index == 0:
            index = previous_index + 1
        assert index > 0
        self.data[index - 1] = value
        self.last_assigned_index = index

    def at(self, index: int) -> str:
        """
        Return the value at ``index`` and remember it as last reused.

        Raises
        ------
        IndexError
            If the slot is still empty or ``index`` is outside the lookup.

        """
        # Recorded before the slot is checked, so it is updated even when
        # the lookup below fails.
        self.last_reused_index = index
        value = self.data[index - 1]
        if value is None:
            msg = f"invalid resolved index {index}"
            raise IndexError(msg)
        return value

    def decode_prefix_term_index(self, index: int) -> str:
        """
        Resolve a prefix index.

        An ``index`` of 0 repeats the last reused index; if nothing has been
        reused yet, the empty string is returned.
        """
        actual_index = index or self.last_reused_index
        if actual_index == 0:
            return ""
        return self.at(actual_index)

    def decode_name_term_index(self, index: int) -> str:
        """
        Resolve a name index.

        An ``index`` of 0 means "one past the last reused index".

        Raises
        ------
        JellyConformanceError
            If the resolved index is 0.

        """
        # Parsed as ``index or (self.last_reused_index + 1)``.
        actual_index = index or self.last_reused_index + 1
        if actual_index == 0:
            msg = "0 is not a valid name term index"
            raise JellyConformanceError(msg)
        return self.at(actual_index)

    def decode_datatype_term_index(self, index: int) -> str | None:
        """
        Resolve a datatype index.

        Raises
        ------
        JellyConformanceError
            If ``index`` is 0.

        """
        if index == 0:
            msg = "0 is not a valid datatype term index"
            raise JellyConformanceError(msg)
        return self.at(index)
File without changes