pyjelly 0.7.1__cp311-cp311-macosx_11_0_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cb523b6bada1c6eba8b4__mypyc.cpython-311-darwin.so +0 -0
- pyjelly/__init__.py +0 -0
- pyjelly/_proto/grpc.proto +33 -0
- pyjelly/_proto/patch.proto +165 -0
- pyjelly/_proto/rdf.proto +384 -0
- pyjelly/errors.py +10 -0
- pyjelly/integrations/__init__.py +0 -0
- pyjelly/integrations/generic/__init__.py +0 -0
- pyjelly/integrations/generic/generic_sink.py +202 -0
- pyjelly/integrations/generic/parse.py +412 -0
- pyjelly/integrations/generic/serialize.cpython-311-darwin.so +0 -0
- pyjelly/integrations/generic/serialize.py +402 -0
- pyjelly/integrations/rdflib/__init__.py +24 -0
- pyjelly/integrations/rdflib/parse.py +560 -0
- pyjelly/integrations/rdflib/serialize.py +408 -0
- pyjelly/jelly/__init__.py +5 -0
- pyjelly/jelly/rdf_pb2.py +70 -0
- pyjelly/jelly/rdf_pb2.pyi +231 -0
- pyjelly/options.py +141 -0
- pyjelly/parse/__init__.py +0 -0
- pyjelly/parse/decode.cpython-311-darwin.so +0 -0
- pyjelly/parse/decode.py +447 -0
- pyjelly/parse/ioutils.cpython-311-darwin.so +0 -0
- pyjelly/parse/ioutils.py +115 -0
- pyjelly/parse/lookup.cpython-311-darwin.so +0 -0
- pyjelly/parse/lookup.py +70 -0
- pyjelly/serialize/__init__.py +0 -0
- pyjelly/serialize/encode.cpython-311-darwin.so +0 -0
- pyjelly/serialize/encode.py +397 -0
- pyjelly/serialize/flows.py +196 -0
- pyjelly/serialize/ioutils.cpython-311-darwin.so +0 -0
- pyjelly/serialize/ioutils.py +13 -0
- pyjelly/serialize/lookup.cpython-311-darwin.so +0 -0
- pyjelly/serialize/lookup.py +137 -0
- pyjelly/serialize/streams.cpython-311-darwin.so +0 -0
- pyjelly/serialize/streams.py +281 -0
- pyjelly-0.7.1.dist-info/METADATA +114 -0
- pyjelly-0.7.1.dist-info/RECORD +41 -0
- pyjelly-0.7.1.dist-info/WHEEL +6 -0
- pyjelly-0.7.1.dist-info/entry_points.txt +7 -0
- pyjelly-0.7.1.dist-info/licenses/LICENSE +201 -0
pyjelly/parse/decode.py
ADDED
|
@@ -0,0 +1,447 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from abc import ABCMeta, abstractmethod
|
|
4
|
+
from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
|
|
5
|
+
from enum import Enum, auto
|
|
6
|
+
from typing import Any, ClassVar, NamedTuple
|
|
7
|
+
from typing_extensions import Never
|
|
8
|
+
|
|
9
|
+
from mypy_extensions import mypyc_attr
|
|
10
|
+
|
|
11
|
+
from pyjelly import jelly
|
|
12
|
+
from pyjelly.options import MAX_VERSION, LookupPreset, StreamParameters, StreamTypes
|
|
13
|
+
from pyjelly.parse.lookup import LookupDecoder
|
|
14
|
+
|
|
15
|
+
# Callable shape of the Decoder's per-row handlers: takes one protobuf row
# message and returns the decoded value, or None when nothing is produced.
RowHandler = Callable[[Any], Any | None]
# Callable shape of the Decoder's per-term handlers: takes one term value
# and returns whatever the adapter builds for it.
TermHandler = Callable[[Any], Any | None]
# Module-level alias for the protobuf stream options message class.
RdfStreamOptions = jelly.RdfStreamOptions
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ParsingMode(Enum):
    """
    Select how the frames of a Jelly stream are presented to the caller.

    Members:
        FLAT
            All frames are yielded together as a single Graph or Dataset.
        GROUPED
            Every frame is yielded as its own Graph/Dataset (grouped parsing).
    """

    # Values are assigned automatically in declaration order (1, 2).
    FLAT = auto()
    GROUPED = auto()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@mypyc_attr(allow_interpreted_subclasses=True)
class ParserOptions(NamedTuple):
    """
    Bundle of the options a parser needs for one Jelly stream.

    Being a NamedTuple, an instance can be unpacked positionally as
    ``stream_types, lookup_preset, params``.
    """

    # Physical and logical stream type of the stream.
    stream_types: StreamTypes
    # Maximum sizes of the name, prefix and datatype lookup tables.
    lookup_preset: LookupPreset
    # Remaining stream parameters (name, version, delimited flag, ...).
    params: StreamParameters
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def options_from_frame(
    frame: jelly.RdfStreamFrame,
    *,
    delimited: bool,
) -> ParserOptions:
    """
    Build parser options from the options row at the start of a frame.

    The options message is read from the first row of ``frame``; the frame must
    therefore contain at least one row.

    Notes:
        Namespace declarations are reported as enabled when the options message
        has a truthy ``namespace_declarations`` attribute, or when the stream
        version is at least ``MAX_VERSION``.

    Args:
        frame (jelly.RdfStreamFrame): first non-empty frame from the stream
        delimited (bool): whether the stream was detected as delimited

    Returns:
        ParserOptions: stream types, lookup table sizes and stream parameters

    """
    stream_options = frame.rows[0].options
    declares_namespaces = getattr(
        stream_options, "namespace_declarations", False
    ) or (stream_options.version >= MAX_VERSION)

    stream_types = StreamTypes(
        physical_type=stream_options.physical_type,
        logical_type=stream_options.logical_type,
    )
    lookup_preset = LookupPreset(
        max_names=stream_options.max_name_table_size,
        max_prefixes=stream_options.max_prefix_table_size,
        max_datatypes=stream_options.max_datatype_table_size,
    )
    params = StreamParameters(
        stream_name=stream_options.stream_name,
        generalized_statements=stream_options.generalized_statements,
        rdf_star=stream_options.rdf_star,
        version=stream_options.version,
        delimited=delimited,
        namespace_declarations=declares_namespaces,
    )
    return ParserOptions(
        stream_types=stream_types,
        lookup_preset=lookup_preset,
        params=params,
    )
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _adapter_missing(feature: str, *, stream_types: StreamTypes) -> Never:
    """
    Raise the error used when an adapter lacks an optional capability.

    Args:
        feature (str): description of the missing capability, used in the message
        stream_types (StreamTypes): stream types whose enum names are reported

    Raises:
        NotImplementedError: always

    """
    physical = jelly.PhysicalStreamType.Name(stream_types.physical_type)
    logical = jelly.LogicalStreamType.Name(stream_types.logical_type)
    msg = f"adapter with {physical} and {logical} does not implement {feature}"
    raise NotImplementedError(msg)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
@mypyc_attr(allow_interpreted_subclasses=True)
class Adapter(metaclass=ABCMeta):
    """
    Base class for objects that build integration-specific RDF values.

    Term constructors (``iri``, ``default_graph``, ``bnode``, ``literal``) are
    abstract and must be provided by every subclass. The remaining hooks are
    optional: unless overridden they raise ``NotImplementedError`` naming the
    missing feature and the stream types of this adapter.
    """

    def __init__(
        self, options: ParserOptions, parsing_mode: ParsingMode = ParsingMode.FLAT
    ) -> None:
        self.options = options
        self.parsing_mode = parsing_mode

    def _unsupported(self, feature: str) -> Never:
        """Raise NotImplementedError for an optional hook left unimplemented."""
        _adapter_missing(feature, stream_types=self.options.stream_types)

    # --- Required term constructors: every adapter must implement these ---

    @abstractmethod
    def iri(self, iri: str) -> Any:
        """Build an IRI term from its full string form."""
        raise NotImplementedError

    @abstractmethod
    def default_graph(self) -> Any:
        """Build the value that represents the default graph."""
        raise NotImplementedError

    @abstractmethod
    def bnode(self, bnode: str) -> Any:
        """Build a blank node from its identifier."""
        raise NotImplementedError

    @abstractmethod
    def literal(
        self,
        lex: str,
        language: str | None = None,
        datatype: str | None = None,
    ) -> Any:
        """Build a literal from its lexical form and optional language/datatype."""
        raise NotImplementedError

    # --- Optional hooks: override only what the integration supports ---

    def triple(self, terms: Iterable[Any]) -> Any:  # noqa: ARG002
        """Build a triple from decoded terms; unsupported unless overridden."""
        self._unsupported("decoding triples")

    def quad(self, terms: Iterable[Any]) -> Any:  # noqa: ARG002
        """Build a quad from decoded terms; unsupported unless overridden."""
        self._unsupported("decoding quads")

    def graph_start(self, graph_id: Any) -> Any:  # noqa: ARG002
        """Handle a graph start marker; unsupported unless overridden."""
        self._unsupported("decoding graph start markers")

    def graph_end(self) -> Any:
        """Handle a graph end marker; unsupported unless overridden."""
        self._unsupported("decoding graph end markers")

    def namespace_declaration(self, name: str, iri: str) -> Any:  # noqa: ARG002
        """Handle a namespace declaration; unsupported unless overridden."""
        self._unsupported("decoding namespace declarations")

    def quoted_triple(self, terms: Iterable[Any]) -> Any:  # noqa: ARG002
        """Build a quoted triple term; unsupported unless overridden."""
        self._unsupported("decoding quoted triple")

    def frame(self) -> Any:
        """
        Return the per-frame result; the base implementation returns None.

        NOTE(review): presumably invoked by callers at frame boundaries; the
        call site is not visible in this module.
        """
        return None
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
@mypyc_attr(allow_interpreted_subclasses=True)
class Decoder:
    """
    Decode Jelly protobuf rows and terms through an integration adapter.

    The decoder owns the prefix/name/datatype lookup tables and remembers the
    last decoded term for every statement position, while the construction of
    the resulting objects is delegated to the adapter.
    """

    # Row message type -> name of the Decoder method that handles it.
    _ROW_HANDLER_NAMES: ClassVar[Mapping[type[Any], str]] = {
        jelly.RdfStreamOptions: "validate_stream_options",
        jelly.RdfPrefixEntry: "ingest_prefix_entry",
        jelly.RdfNameEntry: "ingest_name_entry",
        jelly.RdfDatatypeEntry: "ingest_datatype_entry",
        jelly.RdfTriple: "decode_triple",
        jelly.RdfQuad: "decode_quad",
        jelly.RdfGraphStart: "decode_graph_start",
        jelly.RdfGraphEnd: "decode_graph_end",
        jelly.RdfNamespaceDeclaration: "decode_namespace_declaration",
    }

    # Term value type -> name of the Decoder method that decodes it.
    # Blank nodes arrive as plain strings; an RdfTriple in term position is
    # treated as a quoted triple.
    _TERM_HANDLER_NAMES: ClassVar[Mapping[type[Any], str]] = {
        jelly.RdfIri: "decode_iri",
        str: "decode_bnode",
        jelly.RdfLiteral: "decode_literal",
        jelly.RdfDefaultGraph: "decode_default_graph",
        jelly.RdfTriple: "decode_quoted_triple",
    }

    def __init__(self, adapter: Adapter) -> None:
        """
        Initialize decoder.

        Creates the lookup tables with the sizes preset in the adapter options,
        an empty repeated-terms dictionary, and the bound row/term handlers.

        Args:
            adapter (Adapter): integration-dependent adapter that specifies terms
                conversion to specific objects, framing,
                namespace declarations, and graphs/datasets forming.

        """
        self.adapter = adapter
        self.names = LookupDecoder(lookup_size=self.options.lookup_preset.max_names)
        self.prefixes = LookupDecoder(
            lookup_size=self.options.lookup_preset.max_prefixes
        )
        self.datatypes = LookupDecoder(
            lookup_size=self.options.lookup_preset.max_datatypes
        )
        # Last *decoded* term per statement position ("subject", "predicate",
        # "object", "graph"); values are whatever the adapter returned.
        self.repeated_terms: dict[str, Any] = {}

        # Resolve handler names to bound methods once, so that subclass
        # overrides are picked up and dispatch is a plain dict lookup.
        self.row_handlers: dict[type[Any], RowHandler] = {
            t: getattr(self, name) for t, name in self._ROW_HANDLER_NAMES.items()
        }
        self.term_handlers: dict[type[Any], TermHandler] = {
            t: getattr(self, name) for t, name in self._TERM_HANDLER_NAMES.items()
        }

    @property
    def options(self) -> ParserOptions:
        """Parser options, taken from the adapter."""
        return self.adapter.options

    def iter_rows(self, frame: jelly.RdfStreamFrame) -> Iterator[Any]:
        """
        Iterate through rows in the frame.

        Every row is decoded (so lookup entries, options and graph markers take
        effect), but only the results for triples, quads and namespace
        declarations are yielded.

        Args:
            frame (jelly.RdfStreamFrame): jelly frame
        Yields:
            Iterator[Any]: decoded rows

        """
        for row_owner in frame.rows:
            row = getattr(row_owner, row_owner.WhichOneof("row"))
            decoded_row = self.decode_row(row)
            if isinstance(
                row, (jelly.RdfTriple, jelly.RdfQuad, jelly.RdfNamespaceDeclaration)
            ):
                yield decoded_row

    def decode_row(self, row: Any) -> Any | None:
        """
        Decode a row based on its type.

        Notes: uses custom adapters to decode triples/quads, namespace declarations,
               graph start/end.

        Args:
            row (Any): protobuf row message

        Raises:
            TypeError: raises error if this type of protobuf message does not have
                a respective handler

        Returns:
            Any | None: decoded row -
                result from calling the handler registered for the row type

        """
        handler = self.row_handlers.get(type(row))
        if handler is None:
            msg = f"decoder not implemented for {type(row)}"
            raise TypeError(msg) from None
        return handler(row)

    def validate_stream_options(self, options: jelly.RdfStreamOptions) -> None:
        """
        Check an options row against the options the decoder was created with.

        Stream types, stream name and lookup table sizes must be equal; the
        row's version must not exceed the decoder's version.

        Notes: the checks are ``assert`` statements, so they are skipped when
               Python runs with assertions disabled (``-O``).

        Args:
            options (jelly.RdfStreamOptions): options message from the stream

        Raises:
            AssertionError: if any of the checks fails

        """
        stream_types, lookup_preset, params = self.options
        assert stream_types.physical_type == options.physical_type
        assert stream_types.logical_type == options.logical_type
        assert params.stream_name == options.stream_name
        assert params.version >= options.version
        assert lookup_preset.max_prefixes == options.max_prefix_table_size
        assert lookup_preset.max_datatypes == options.max_datatype_table_size
        assert lookup_preset.max_names == options.max_name_table_size

    def ingest_prefix_entry(self, entry: jelly.RdfPrefixEntry) -> None:
        """
        Update prefix lookup table based on the table entry.

        Args:
            entry (jelly.RdfPrefixEntry): prefix message, containing id and value

        """
        self.prefixes.assign_entry(index=entry.id, value=entry.value)

    def ingest_name_entry(self, entry: jelly.RdfNameEntry) -> None:
        """
        Update name lookup table based on the table entry.

        Args:
            entry (jelly.RdfNameEntry): name message, containing id and value

        """
        self.names.assign_entry(index=entry.id, value=entry.value)

    def ingest_datatype_entry(self, entry: jelly.RdfDatatypeEntry) -> None:
        """
        Update datatype lookup table based on the table entry.

        Args:
            entry (jelly.RdfDatatypeEntry): datatype message, containing id and
                value

        """
        self.datatypes.assign_entry(index=entry.id, value=entry.value)

    def decode_term(self, term: Any) -> Any:
        """
        Decode a term based on its type: IRI/literal/BN/default graph/quoted triple.

        Notes: requires a custom adapter with implemented methods for terms decoding.

        Args:
            term (Any): IRI/literal/BN(string)/default graph/triple message

        Raises:
            TypeError: raises error if no handler for the term is found

        Returns:
            Any: decoded term, as built by the adapter

        """
        decode_term = self.term_handlers.get(type(term))
        if decode_term is None:
            msg = f"decoder not implemented for {type(term)}"
            raise TypeError(msg) from None
        return decode_term(term)

    def decode_iri(self, iri: jelly.RdfIri) -> Any:
        """
        Decode RdfIri message to IRI using a custom adapter.

        The name is resolved before the prefix, and the two are concatenated as
        ``prefix + name``.

        Args:
            iri (jelly.RdfIri): RdfIri message

        Returns:
            Any: IRI, based on adapter implementation

        """
        name = self.names.decode_name_term_index(iri.name_id)
        prefix = self.prefixes.decode_prefix_term_index(iri.prefix_id)
        return self.adapter.iri(iri=prefix + name)

    def decode_default_graph(self, _: jelly.RdfDefaultGraph) -> Any:
        """Return the adapter's default graph value; the message is unused."""
        return self.adapter.default_graph()

    def decode_bnode(self, bnode: str) -> Any:
        """
        Decode string message to blank node (BN) using a custom adapter.

        Args:
            bnode (str): blank node id

        Returns:
            Any: blank node object from the custom adapter

        """
        return self.adapter.bnode(bnode)

    def decode_literal(self, literal: jelly.RdfLiteral) -> Any:
        """
        Decode RdfLiteral to literal based on custom adapter implementation.

        Notes: a non-empty langtag takes precedence; the datatype is resolved
               only when there is no langtag, the datatype table has a non-zero
               size and the datatype field is present.

        Args:
            literal (jelly.RdfLiteral): RdfLiteral message

        Returns:
            Any: literal returned by the custom adapter

        """
        language = datatype = None
        if literal.langtag:
            language = literal.langtag
        elif self.datatypes.lookup_size and literal.HasField("datatype"):
            datatype = self.datatypes.decode_datatype_term_index(literal.datatype)
        return self.adapter.literal(
            lex=literal.lex,
            language=language,
            datatype=datatype,
        )

    def decode_namespace_declaration(
        self,
        declaration: jelly.RdfNamespaceDeclaration,
    ) -> Any:
        """
        Decode a namespace declaration and hand it to the adapter.

        Args:
            declaration (jelly.RdfNamespaceDeclaration): declaration message with
                a name and an IRI value

        Returns:
            Any: result of the adapter's namespace_declaration hook

        """
        iri = self.decode_iri(declaration.value)
        return self.adapter.namespace_declaration(declaration.name, iri)

    def decode_graph_start(self, graph_start: jelly.RdfGraphStart) -> Any:
        """Decode the graph term of a graph start marker and pass it to the adapter."""
        term = getattr(graph_start, graph_start.WhichOneof("graph"))
        return self.adapter.graph_start(self.decode_term(term))

    def decode_graph_end(self, _: jelly.RdfGraphEnd) -> Any:
        """Notify the adapter of a graph end marker; the message is unused."""
        return self.adapter.graph_end()

    def decode_statement(
        self,
        statement: jelly.RdfTriple | jelly.RdfQuad,
        oneofs: Sequence[str],
    ) -> Any:
        """
        Decode a triple/quad message.

        Notes: also updates repeated terms dictionary

        Args:
            statement (jelly.RdfTriple | jelly.RdfQuad): triple/quad message
            oneofs (Sequence[str]): terms s/p/o/g(if quads)

        Raises:
            ValueError: if a missing repeated term is encountered

        Returns:
            Any: a list of decoded terms

        """
        terms = []
        for oneof in oneofs:
            field = statement.WhichOneof(oneof)
            if field:
                jelly_term = getattr(statement, field)
                decoded_term = self.decode_term(jelly_term)
                self.repeated_terms[oneof] = decoded_term
            else:
                # An unset position repeats the previous statement's term.
                # Use .get() so that a position never seen before reaches the
                # documented ValueError instead of leaking a KeyError.
                decoded_term = self.repeated_terms.get(oneof)
                if decoded_term is None:
                    msg = f"missing repeated term {oneof}"
                    raise ValueError(msg)
            terms.append(decoded_term)
        return terms

    def decode_triple(self, triple: jelly.RdfTriple) -> Any:
        """Decode a triple row and build the result through the adapter."""
        terms = self.decode_statement(triple, ("subject", "predicate", "object"))
        return self.adapter.triple(terms)

    def decode_quoted_triple(self, triple: jelly.RdfTriple) -> Any:
        """
        Decode a triple used as a term (quoted triple).

        Unlike decode_statement, every position must be set explicitly and the
        repeated-terms dictionary is neither read nor updated.

        Args:
            triple (jelly.RdfTriple): triple message in term position

        Raises:
            ValueError: if any of subject/predicate/object is not set

        Returns:
            Any: result of the adapter's quoted_triple hook

        """
        oneofs: Sequence[str] = ("subject", "predicate", "object")
        terms = []
        for oneof in oneofs:
            field = triple.WhichOneof(oneof)
            if field:
                jelly_term = getattr(triple, field)
                decoded_term = self.decode_term(jelly_term)
            else:
                msg = "repeated terms are not allowed in quoted triples"
                raise ValueError(msg)
            terms.append(decoded_term)
        return self.adapter.quoted_triple(terms)

    def decode_quad(self, quad: jelly.RdfQuad) -> Any:
        """Decode a quad row and build the result through the adapter."""
        terms = self.decode_statement(quad, ("subject", "predicate", "object", "graph"))
        return self.adapter.quad(terms)
|
|
Binary file
|
pyjelly/parse/ioutils.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
import io
|
|
2
|
+
import os
|
|
3
|
+
from collections.abc import Generator, Iterator
|
|
4
|
+
from itertools import chain
|
|
5
|
+
from typing import IO
|
|
6
|
+
|
|
7
|
+
from google.protobuf.proto import parse, parse_length_prefixed
|
|
8
|
+
|
|
9
|
+
from pyjelly import jelly
|
|
10
|
+
from pyjelly.errors import JellyConformanceError
|
|
11
|
+
from pyjelly.parse.decode import ParserOptions, options_from_frame
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def delimited_jelly_hint(header: bytes) -> bool:
    """
    Guess from the first 3 bytes whether a Jelly file is delimited.

    A header shorter than 3 bytes is always reported as non-delimited.

    Truth table (notation: `0A` = `0x0A`, `NN` = `not 0x0A`, `??` = _don't care_):

    | Byte 1 | Byte 2 | Byte 3 | Result                                   |
    |--------|--------|--------|------------------------------------------|
    | `NN`   |  `??`  |  `??`  | Delimited                                |
    | `0A`   |  `NN`  |  `??`  | Non-delimited                            |
    | `0A`   |  `0A`  |  `NN`  | Delimited (size = 10)                    |
    | `0A`   |  `0A`  |  `0A`  | Non-delimited (stream options size = 10) |

    >>> delimited_jelly_hint(bytes([0x00, 0x00, 0x00]))
    True

    >>> delimited_jelly_hint(bytes([0x00, 0x00, 0x0A]))
    True

    >>> delimited_jelly_hint(bytes([0x00, 0x0A, 0x00]))
    True

    >>> delimited_jelly_hint(bytes([0x00, 0x0A, 0x0A]))
    True

    >>> delimited_jelly_hint(bytes([0x0A, 0x00, 0x00]))
    False

    >>> delimited_jelly_hint(bytes([0x0A, 0x00, 0x0A]))
    False

    >>> delimited_jelly_hint(bytes([0x0A, 0x0A, 0x00]))
    True

    >>> delimited_jelly_hint(bytes([0x0A, 0x0A, 0x0A]))
    False
    """
    marker = 0x0A
    if len(header) < 3:  # noqa: PLR2004
        # Not enough bytes to decide.
        return False
    if header[0] != marker:
        # First row of the table.
        return True
    # The first byte is the marker: only `0A 0A NN` is delimited.
    return header[1] == marker and header[2] != marker
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def frame_iterator(inp: IO[bytes]) -> Generator[jelly.RdfStreamFrame]:
    """
    Yield length-prefixed frames read from a binary stream.

    Frames are parsed one at a time with ``parse_length_prefixed``; iteration
    stops as soon as that call returns a falsy value.

    Args:
        inp (IO[bytes]): binary stream positioned at the start of a frame

    Yields:
        jelly.RdfStreamFrame: the next frame parsed from the stream

    """
    while True:
        frame = parse_length_prefixed(jelly.RdfStreamFrame, inp)
        if not frame:
            return
        yield frame
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def get_options_and_frames(
    inp: IO[bytes],
) -> tuple[ParserOptions, Iterator[jelly.RdfStreamFrame]]:
    """
    Read the stream options and return them together with all frames.

    The first three bytes are inspected without being consumed to decide whether
    the stream is delimited. A delimited stream is read lazily, frame by frame;
    a non-delimited stream is read in full and parsed as a single frame.

    Args:
        inp (IO[bytes]): jelly binary stream

    Raises:
        JellyConformanceError: if a delimited stream has no non-empty frame
        JellyConformanceError: if a non-delimited stream parses to a frame
            without rows

    Returns:
        tuple[ParserOptions, Iterator[jelly.RdfStreamFrame]]: options taken from
            the first non-empty frame, and an iterator over every frame in
            stream order (leading empty frames included)

    """
    if inp.seekable():
        # Read the header, then rewind by exactly the number of bytes read.
        header = inp.read(3)
        inp.seek(-len(header), os.SEEK_CUR)
        is_delimited = delimited_jelly_hint(header)
    else:
        # Input may not be seekable (e.g. a network stream) -- then we need to buffer
        # it to determine if it's delimited.
        # See also: https://github.com/Jelly-RDF/pyjelly/issues/298
        inp = io.BufferedReader(inp)  # type: ignore[arg-type, type-var, unused-ignore]
        is_delimited = delimited_jelly_hint(inp.peek(3))

    if not is_delimited:
        single_frame = parse(jelly.RdfStreamFrame, inp.read())
        if not single_frame.rows:
            msg = "The stream is corrupted (only contains an empty frame)"
            raise JellyConformanceError(msg)
        options = options_from_frame(single_frame, delimited=False)
        return options, iter((single_frame,))

    remaining = frame_iterator(inp)
    leading_empty = []
    # Consume frames until the first one that has rows; empty ones are kept
    # so that they can be replayed to the caller in the original order.
    for candidate in remaining:
        if candidate.rows:
            first_frame = candidate
            break
        leading_empty.append(candidate)
    else:
        msg = "No non-empty frames found in the stream"
        raise JellyConformanceError(msg)

    options = options_from_frame(first_frame, delimited=True)
    return options, chain(leading_empty, (first_frame,), remaining)
|
|
Binary file
|
pyjelly/parse/lookup.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections import deque
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
|
|
6
|
+
from pyjelly.errors import JellyAssertionError, JellyConformanceError
|
|
7
|
+
from pyjelly.options import MAX_LOOKUP_SIZE
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
|
|
11
|
+
class LookupDecoder:
|
|
12
|
+
"""
|
|
13
|
+
Shared base for RDF lookup encoders using Jelly compression.
|
|
14
|
+
|
|
15
|
+
Tracks the last assigned and last reused index.
|
|
16
|
+
|
|
17
|
+
Parameters
|
|
18
|
+
----------
|
|
19
|
+
lookup_size
|
|
20
|
+
Maximum lookup size.
|
|
21
|
+
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
last_assigned_index: int
|
|
25
|
+
last_reused_index: int
|
|
26
|
+
|
|
27
|
+
def __init__(self, *, lookup_size: int) -> None:
|
|
28
|
+
if lookup_size > MAX_LOOKUP_SIZE:
|
|
29
|
+
msg = f"lookup size cannot be larger than {MAX_LOOKUP_SIZE}"
|
|
30
|
+
raise JellyAssertionError(msg)
|
|
31
|
+
self.lookup_size = lookup_size
|
|
32
|
+
placeholders = (None,) * lookup_size
|
|
33
|
+
self.data: deque[str | None] = deque(placeholders, maxlen=lookup_size)
|
|
34
|
+
self.last_assigned_index = 0
|
|
35
|
+
self.last_reused_index = 0
|
|
36
|
+
|
|
37
|
+
def assign_entry(self, index: int, value: str) -> None:
|
|
38
|
+
previous_index = self.last_assigned_index
|
|
39
|
+
if index == 0:
|
|
40
|
+
index = previous_index + 1
|
|
41
|
+
assert index > 0
|
|
42
|
+
self.data[index - 1] = value
|
|
43
|
+
self.last_assigned_index = index
|
|
44
|
+
|
|
45
|
+
def at(self, index: int) -> str:
|
|
46
|
+
self.last_reused_index = index
|
|
47
|
+
value = self.data[index - 1]
|
|
48
|
+
if value is None:
|
|
49
|
+
msg = f"invalid resolved index {index}"
|
|
50
|
+
raise IndexError(msg)
|
|
51
|
+
return value
|
|
52
|
+
|
|
53
|
+
def decode_prefix_term_index(self, index: int) -> str:
|
|
54
|
+
actual_index = index or self.last_reused_index
|
|
55
|
+
if actual_index == 0:
|
|
56
|
+
return ""
|
|
57
|
+
return self.at(actual_index)
|
|
58
|
+
|
|
59
|
+
def decode_name_term_index(self, index: int) -> str:
|
|
60
|
+
actual_index = index or self.last_reused_index + 1
|
|
61
|
+
if actual_index == 0:
|
|
62
|
+
msg = "0 is not a valid name term index"
|
|
63
|
+
raise JellyConformanceError(msg)
|
|
64
|
+
return self.at(actual_index)
|
|
65
|
+
|
|
66
|
+
def decode_datatype_term_index(self, index: int) -> str | None:
|
|
67
|
+
if index == 0:
|
|
68
|
+
msg = "0 is not a valid datatype term index"
|
|
69
|
+
raise JellyConformanceError(msg)
|
|
70
|
+
return self.at(index)
|
|
File without changes
|
|
Binary file
|