pyjelly 0.2.3__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pyjelly might be problematic. Click here for more details.
- pyjelly/integrations/rdflib/parse.py +364 -166
- pyjelly/integrations/rdflib/serialize.py +168 -43
- pyjelly/jelly/rdf_pb2.py +3 -3
- pyjelly/jelly/rdf_pb2.pyi +2 -1
- pyjelly/parse/decode.py +166 -5
- pyjelly/parse/ioutils.py +26 -4
- pyjelly/serialize/encode.py +117 -0
- pyjelly/serialize/flows.py +70 -13
- pyjelly/serialize/streams.py +87 -3
- {pyjelly-0.2.3.dist-info → pyjelly-0.4.0.dist-info}/METADATA +8 -8
- {pyjelly-0.2.3.dist-info → pyjelly-0.4.0.dist-info}/RECORD +14 -14
- {pyjelly-0.2.3.dist-info → pyjelly-0.4.0.dist-info}/WHEEL +0 -0
- {pyjelly-0.2.3.dist-info → pyjelly-0.4.0.dist-info}/entry_points.txt +0 -0
- {pyjelly-0.2.3.dist-info → pyjelly-0.4.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,12 +1,14 @@
|
|
|
1
|
+
# ruff: noqa: I001
|
|
1
2
|
from __future__ import annotations
|
|
2
3
|
|
|
3
4
|
from collections.abc import Generator
|
|
4
5
|
from functools import singledispatch
|
|
5
|
-
from typing import
|
|
6
|
+
from typing import Any, IO
|
|
6
7
|
from typing_extensions import override
|
|
7
8
|
|
|
8
9
|
import rdflib
|
|
9
|
-
from rdflib
|
|
10
|
+
from rdflib import Graph
|
|
11
|
+
from rdflib.graph import DATASET_DEFAULT_GRAPH_ID, Dataset, QuotedGraph
|
|
10
12
|
from rdflib.serializer import Serializer as RDFLibSerializer
|
|
11
13
|
|
|
12
14
|
from pyjelly import jelly
|
|
@@ -18,11 +20,22 @@ from pyjelly.serialize.streams import (
|
|
|
18
20
|
SerializerOptions,
|
|
19
21
|
Stream,
|
|
20
22
|
TripleStream,
|
|
21
|
-
)
|
|
23
|
+
) # ruff: enable
|
|
22
24
|
|
|
23
25
|
|
|
24
26
|
class RDFLibTermEncoder(TermEncoder):
|
|
25
27
|
def encode_any(self, term: object, slot: Slot) -> RowsAndTerm:
|
|
28
|
+
"""
|
|
29
|
+
Encode term based on its RDFLib object.
|
|
30
|
+
|
|
31
|
+
Args:
|
|
32
|
+
term (object): term to encode
|
|
33
|
+
slot (Slot): its place in statement.
|
|
34
|
+
|
|
35
|
+
Returns:
|
|
36
|
+
RowsAndTerm: encoded extra rows and a jelly term to encode
|
|
37
|
+
|
|
38
|
+
"""
|
|
26
39
|
if slot is Slot.graph and term == DATASET_DEFAULT_GRAPH_ID:
|
|
27
40
|
return self.encode_default_graph()
|
|
28
41
|
|
|
@@ -50,7 +63,10 @@ def namespace_declarations(store: Graph, stream: Stream) -> None:
|
|
|
50
63
|
|
|
51
64
|
|
|
52
65
|
@singledispatch
|
|
53
|
-
def stream_frames(
|
|
66
|
+
def stream_frames(
|
|
67
|
+
stream: Stream,
|
|
68
|
+
data: Graph, # noqa: ARG001
|
|
69
|
+
) -> Generator[jelly.RdfStreamFrame]:
|
|
54
70
|
msg = f"invalid stream implementation {stream}"
|
|
55
71
|
raise TypeError(msg)
|
|
56
72
|
|
|
@@ -60,6 +76,21 @@ def triples_stream_frames(
|
|
|
60
76
|
stream: TripleStream,
|
|
61
77
|
data: Graph | Dataset,
|
|
62
78
|
) -> Generator[jelly.RdfStreamFrame]:
|
|
79
|
+
"""
|
|
80
|
+
Serialize a Graph/Dataset into jelly frames.
|
|
81
|
+
|
|
82
|
+
Args:
|
|
83
|
+
stream (TripleStream): stream that specifies triples processing
|
|
84
|
+
data (Graph | Dataset): Graph/Dataset to serialize.
|
|
85
|
+
|
|
86
|
+
Notes:
|
|
87
|
+
if Dataset is given, its graphs are unpacked and iterated over
|
|
88
|
+
if flow is GraphsFrameFlow, emits a frame per graph.
|
|
89
|
+
|
|
90
|
+
Yields:
|
|
91
|
+
Generator[jelly.RdfStreamFrame]: jelly frames.
|
|
92
|
+
|
|
93
|
+
"""
|
|
63
94
|
stream.enroll()
|
|
64
95
|
if stream.options.params.namespace_declarations:
|
|
65
96
|
namespace_declarations(data, stream)
|
|
@@ -68,6 +99,7 @@ def triples_stream_frames(
|
|
|
68
99
|
for terms in graph:
|
|
69
100
|
if frame := stream.triple(terms):
|
|
70
101
|
yield frame
|
|
102
|
+
# this part turns each graph into a frame for the graphs logical type
|
|
71
103
|
if frame := stream.flow.frame_from_graph():
|
|
72
104
|
yield frame
|
|
73
105
|
if stream.stream_types.flat and (frame := stream.flow.to_stream_frame()):
|
|
@@ -79,6 +111,20 @@ def quads_stream_frames(
|
|
|
79
111
|
stream: QuadStream,
|
|
80
112
|
data: Dataset,
|
|
81
113
|
) -> Generator[jelly.RdfStreamFrame]:
|
|
114
|
+
"""
|
|
115
|
+
Serialize a Dataset into jelly frames.
|
|
116
|
+
|
|
117
|
+
Notes:
|
|
118
|
+
Emits one frame per dataset if flow is of DatasetsFrameFlow.
|
|
119
|
+
|
|
120
|
+
Args:
|
|
121
|
+
stream (QuadStream): stream that specifies quads processing
|
|
122
|
+
data (Dataset): Dataset to serialize.
|
|
123
|
+
|
|
124
|
+
Yields:
|
|
125
|
+
Generator[jelly.RdfStreamFrame]: jelly frames
|
|
126
|
+
|
|
127
|
+
"""
|
|
82
128
|
assert isinstance(data, Dataset)
|
|
83
129
|
stream.enroll()
|
|
84
130
|
if stream.options.params.namespace_declarations:
|
|
@@ -97,6 +143,21 @@ def graphs_stream_frames(
|
|
|
97
143
|
stream: GraphStream,
|
|
98
144
|
data: Dataset,
|
|
99
145
|
) -> Generator[jelly.RdfStreamFrame]:
|
|
146
|
+
"""
|
|
147
|
+
Serialize a Dataset into jelly frames as a stream of graphs.
|
|
148
|
+
|
|
149
|
+
Notes:
|
|
150
|
+
If the flow is of DatasetsFrameFlow type, the whole dataset
|
|
151
|
+
will be encoded into one frame.
|
|
152
|
+
|
|
153
|
+
Args:
|
|
154
|
+
stream (GraphStream): stream that specifies graphs processing
|
|
155
|
+
data (Dataset): Dataset to serialize.
|
|
156
|
+
|
|
157
|
+
Yields:
|
|
158
|
+
Generator[jelly.RdfStreamFrame]: jelly frames
|
|
159
|
+
|
|
160
|
+
"""
|
|
100
161
|
assert isinstance(data, Dataset)
|
|
101
162
|
stream.enroll()
|
|
102
163
|
if stream.options.params.namespace_declarations:
|
|
@@ -109,6 +170,48 @@ def graphs_stream_frames(
|
|
|
109
170
|
yield frame
|
|
110
171
|
|
|
111
172
|
|
|
173
|
+
def guess_options(sink: Graph | Dataset) -> SerializerOptions:
|
|
174
|
+
"""
|
|
175
|
+
Guess the serializer options based on the store type.
|
|
176
|
+
|
|
177
|
+
>>> guess_options(Graph()).logical_type
|
|
178
|
+
1
|
|
179
|
+
>>> guess_options(Dataset()).logical_type
|
|
180
|
+
2
|
|
181
|
+
"""
|
|
182
|
+
logical_type = (
|
|
183
|
+
jelly.LOGICAL_STREAM_TYPE_FLAT_QUADS
|
|
184
|
+
if isinstance(sink, Dataset)
|
|
185
|
+
else jelly.LOGICAL_STREAM_TYPE_FLAT_TRIPLES
|
|
186
|
+
)
|
|
187
|
+
return SerializerOptions(logical_type=logical_type)
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def guess_stream(options: SerializerOptions, sink: Graph | Dataset) -> Stream:
|
|
191
|
+
"""
|
|
192
|
+
Return an appropriate stream implementation for the given options.
|
|
193
|
+
|
|
194
|
+
Notes: if base(!) logical type is GRAPHS and Dataset is given,
|
|
195
|
+
initializes TripleStream
|
|
196
|
+
|
|
197
|
+
>>> graph_ser = RDFLibJellySerializer(Graph())
|
|
198
|
+
>>> ds_ser = RDFLibJellySerializer(Dataset())
|
|
199
|
+
|
|
200
|
+
>>> type(guess_stream(guess_options(graph_ser.store), graph_ser.store))
|
|
201
|
+
<class 'pyjelly.serialize.streams.TripleStream'>
|
|
202
|
+
>>> type(guess_stream(guess_options(ds_ser.store), ds_ser.store))
|
|
203
|
+
<class 'pyjelly.serialize.streams.QuadStream'>
|
|
204
|
+
"""
|
|
205
|
+
stream_cls: type[Stream]
|
|
206
|
+
if (options.logical_type % 10) != jelly.LOGICAL_STREAM_TYPE_GRAPHS and isinstance(
|
|
207
|
+
sink, Dataset
|
|
208
|
+
):
|
|
209
|
+
stream_cls = QuadStream
|
|
210
|
+
else:
|
|
211
|
+
stream_cls = TripleStream
|
|
212
|
+
return stream_cls.for_rdflib(options=options)
|
|
213
|
+
|
|
214
|
+
|
|
112
215
|
class RDFLibJellySerializer(RDFLibSerializer):
|
|
113
216
|
"""
|
|
114
217
|
RDFLib serializer for writing graphs in Jelly RDF stream format.
|
|
@@ -124,43 +227,6 @@ class RDFLibJellySerializer(RDFLibSerializer):
|
|
|
124
227
|
raise NotImplementedError(msg)
|
|
125
228
|
super().__init__(store)
|
|
126
229
|
|
|
127
|
-
def guess_options(self) -> SerializerOptions:
|
|
128
|
-
"""
|
|
129
|
-
Guess the serializer options based on the store type.
|
|
130
|
-
|
|
131
|
-
>>> RDFLibJellySerializer(Graph()).guess_options().logical_type
|
|
132
|
-
1
|
|
133
|
-
>>> RDFLibJellySerializer(Dataset()).guess_options().logical_type
|
|
134
|
-
2
|
|
135
|
-
"""
|
|
136
|
-
logical_type = (
|
|
137
|
-
jelly.LOGICAL_STREAM_TYPE_FLAT_QUADS
|
|
138
|
-
if isinstance(self.store, Dataset)
|
|
139
|
-
else jelly.LOGICAL_STREAM_TYPE_FLAT_TRIPLES
|
|
140
|
-
)
|
|
141
|
-
return SerializerOptions(logical_type=logical_type)
|
|
142
|
-
|
|
143
|
-
def guess_stream(self, options: SerializerOptions) -> Stream:
|
|
144
|
-
"""
|
|
145
|
-
Return an appropriate stream implementation for the given options.
|
|
146
|
-
|
|
147
|
-
>>> graph_ser = RDFLibJellySerializer(Graph())
|
|
148
|
-
>>> ds_ser = RDFLibJellySerializer(Dataset())
|
|
149
|
-
|
|
150
|
-
>>> type(graph_ser.guess_stream(graph_ser.guess_options()))
|
|
151
|
-
<class 'pyjelly.serialize.streams.TripleStream'>
|
|
152
|
-
>>> type(ds_ser.guess_stream(ds_ser.guess_options()))
|
|
153
|
-
<class 'pyjelly.serialize.streams.QuadStream'>
|
|
154
|
-
"""
|
|
155
|
-
stream_cls: type[Stream]
|
|
156
|
-
if options.logical_type != jelly.LOGICAL_STREAM_TYPE_GRAPHS and isinstance(
|
|
157
|
-
self.store, Dataset
|
|
158
|
-
):
|
|
159
|
-
stream_cls = QuadStream
|
|
160
|
-
else:
|
|
161
|
-
stream_cls = TripleStream
|
|
162
|
-
return stream_cls.for_rdflib(options=options)
|
|
163
|
-
|
|
164
230
|
@override
|
|
165
231
|
def serialize( # type: ignore[override]
|
|
166
232
|
self,
|
|
@@ -171,10 +237,69 @@ class RDFLibJellySerializer(RDFLibSerializer):
|
|
|
171
237
|
options: SerializerOptions | None = None,
|
|
172
238
|
**unused: Any,
|
|
173
239
|
) -> None:
|
|
240
|
+
"""
|
|
241
|
+
Serialize self.store content to Jelly format.
|
|
242
|
+
|
|
243
|
+
Args:
|
|
244
|
+
out (IO[bytes]): output buffered writer
|
|
245
|
+
stream (Stream | None, optional): Jelly stream object. Defaults to None.
|
|
246
|
+
options (SerializerOptions | None, optional): Serializer options
|
|
247
|
+
if defined beforehand, e.g., read from a separate file.
|
|
248
|
+
Defaults to None.
|
|
249
|
+
**unused (Any): unused args for RDFLib serialize
|
|
250
|
+
|
|
251
|
+
"""
|
|
174
252
|
if options is None:
|
|
175
|
-
options = self.
|
|
253
|
+
options = guess_options(self.store)
|
|
176
254
|
if stream is None:
|
|
177
|
-
stream =
|
|
255
|
+
stream = guess_stream(options, self.store)
|
|
178
256
|
write = write_delimited if stream.options.params.delimited else write_single
|
|
179
257
|
for stream_frame in stream_frames(stream, self.store):
|
|
180
258
|
write(stream_frame, out)
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def grouped_stream_to_frames(
|
|
262
|
+
sink_generator: Generator[Graph] | Generator[Dataset],
|
|
263
|
+
options: SerializerOptions | None = None,
|
|
264
|
+
) -> Generator[jelly.RdfStreamFrame]:
|
|
265
|
+
"""
|
|
266
|
+
Transform Graphs/Datasets into Jelly frames, one frame per Graph/Dataset.
|
|
267
|
+
|
|
268
|
+
Note: options are guessed if not provided.
|
|
269
|
+
|
|
270
|
+
Args:
|
|
271
|
+
sink_generator (Generator[Graph] | Generator[Dataset]): Generator of
|
|
272
|
+
Graphs/Dataset to transform.
|
|
273
|
+
options (SerializerOptions | None, optional): stream options to use.
|
|
274
|
+
Options are guessed based on the sink store type. Defaults to None.
|
|
275
|
+
|
|
276
|
+
Yields:
|
|
277
|
+
Generator[jelly.RdfStreamFrame]: produced Jelly frames
|
|
278
|
+
|
|
279
|
+
"""
|
|
280
|
+
stream = None
|
|
281
|
+
for sink in sink_generator:
|
|
282
|
+
if not stream:
|
|
283
|
+
if options is None:
|
|
284
|
+
options = guess_options(sink)
|
|
285
|
+
stream = guess_stream(options, sink)
|
|
286
|
+
yield from stream_frames(stream, sink)
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def grouped_stream_to_file(
|
|
290
|
+
stream: Generator[Graph] | Generator[Dataset],
|
|
291
|
+
output_file: IO[bytes],
|
|
292
|
+
**kwargs: Any,
|
|
293
|
+
) -> None:
|
|
294
|
+
"""
|
|
295
|
+
Write stream of Graphs/Datasets to a binary file.
|
|
296
|
+
|
|
297
|
+
Args:
|
|
298
|
+
stream (Generator[Graph] | Generator[Dataset]): Generator of
|
|
299
|
+
Graphs/Dataset to transform.
|
|
300
|
+
output_file (IO[bytes]): opened output file.
|
|
301
|
+
**kwargs (Any): options to pass to stream.
|
|
302
|
+
|
|
303
|
+
"""
|
|
304
|
+
for frame in grouped_stream_to_frames(stream, **kwargs):
|
|
305
|
+
write_delimited(frame, output_file)
|
pyjelly/jelly/rdf_pb2.py
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
|
3
3
|
# NO CHECKED-IN PROTOBUF GENCODE
|
|
4
4
|
# source: rdf.proto
|
|
5
|
-
# Protobuf Python Version:
|
|
5
|
+
# Protobuf Python Version: 6.31.0
|
|
6
6
|
"""Generated protocol buffer code."""
|
|
7
7
|
from google.protobuf import descriptor as _descriptor
|
|
8
8
|
from google.protobuf import descriptor_pool as _descriptor_pool
|
|
@@ -11,8 +11,8 @@ from google.protobuf import symbol_database as _symbol_database
|
|
|
11
11
|
from google.protobuf.internal import builder as _builder
|
|
12
12
|
_runtime_version.ValidateProtobufRuntimeVersion(
|
|
13
13
|
_runtime_version.Domain.PUBLIC,
|
|
14
|
-
|
|
15
|
-
|
|
14
|
+
6,
|
|
15
|
+
31,
|
|
16
16
|
0,
|
|
17
17
|
'',
|
|
18
18
|
'rdf.proto'
|
pyjelly/jelly/rdf_pb2.pyi
CHANGED
|
@@ -2,7 +2,8 @@ from google.protobuf.internal import containers as _containers
|
|
|
2
2
|
from google.protobuf.internal import enum_type_wrapper as _enum_type_wrapper
|
|
3
3
|
from google.protobuf import descriptor as _descriptor
|
|
4
4
|
from google.protobuf import message as _message
|
|
5
|
-
from
|
|
5
|
+
from collections.abc import Iterable as _Iterable, Mapping as _Mapping
|
|
6
|
+
from typing import ClassVar as _ClassVar, Optional as _Optional, Union as _Union
|
|
6
7
|
|
|
7
8
|
DESCRIPTOR: _descriptor.FileDescriptor
|
|
8
9
|
|
pyjelly/parse/decode.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from abc import ABCMeta, abstractmethod
|
|
4
|
-
from collections.abc import Iterable, Sequence
|
|
4
|
+
from collections.abc import Iterable, Iterator, Sequence
|
|
5
|
+
from enum import Enum, auto
|
|
5
6
|
from typing import Any, ClassVar, NamedTuple
|
|
6
7
|
from typing_extensions import Never
|
|
7
8
|
|
|
@@ -10,6 +11,21 @@ from pyjelly.options import LookupPreset, StreamParameters, StreamTypes
|
|
|
10
11
|
from pyjelly.parse.lookup import LookupDecoder
|
|
11
12
|
|
|
12
13
|
|
|
14
|
+
class ParsingMode(Enum):
|
|
15
|
+
"""
|
|
16
|
+
Specifies how jelly frames should be treated.
|
|
17
|
+
|
|
18
|
+
Modes:
|
|
19
|
+
FLAT
|
|
20
|
+
Yield all frames as one Graph or Dataset.
|
|
21
|
+
GROUPED
|
|
22
|
+
Yield one Graph/Dataset per frame (grouped parsing).
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
FLAT = auto()
|
|
26
|
+
GROUPED = auto()
|
|
27
|
+
|
|
28
|
+
|
|
13
29
|
class ParserOptions(NamedTuple):
|
|
14
30
|
stream_types: StreamTypes
|
|
15
31
|
lookup_preset: LookupPreset
|
|
@@ -21,6 +37,21 @@ def options_from_frame(
|
|
|
21
37
|
*,
|
|
22
38
|
delimited: bool,
|
|
23
39
|
) -> ParserOptions:
|
|
40
|
+
"""
|
|
41
|
+
Fill stream options based on the options row.
|
|
42
|
+
|
|
43
|
+
Notes:
|
|
44
|
+
generalized_statements, rdf_star, and namespace declarations
|
|
45
|
+
are set to false by default
|
|
46
|
+
|
|
47
|
+
Args:
|
|
48
|
+
frame (jelly.RdfStreamFrame): first non-empty frame from the stream
|
|
49
|
+
delimited (bool): derived delimited flag
|
|
50
|
+
|
|
51
|
+
Returns:
|
|
52
|
+
ParserOptions: filled options with types/lookups/stream parameters information
|
|
53
|
+
|
|
54
|
+
"""
|
|
24
55
|
row = frame.rows[0]
|
|
25
56
|
options = row.options
|
|
26
57
|
return ParserOptions(
|
|
@@ -52,8 +83,11 @@ def _adapter_missing(feature: str, *, stream_types: StreamTypes) -> Never:
|
|
|
52
83
|
|
|
53
84
|
|
|
54
85
|
class Adapter(metaclass=ABCMeta):
|
|
55
|
-
def __init__(
|
|
86
|
+
def __init__(
|
|
87
|
+
self, options: ParserOptions, parsing_mode: ParsingMode = ParsingMode.FLAT
|
|
88
|
+
) -> None:
|
|
56
89
|
self.options = options
|
|
90
|
+
self.parsing_mode = parsing_mode
|
|
57
91
|
|
|
58
92
|
# Obligatory abstract methods--all adapters must implement these
|
|
59
93
|
@abstractmethod
|
|
@@ -106,6 +140,18 @@ class Adapter(metaclass=ABCMeta):
|
|
|
106
140
|
|
|
107
141
|
class Decoder:
|
|
108
142
|
def __init__(self, adapter: Adapter) -> None:
|
|
143
|
+
"""
|
|
144
|
+
Initialize decoder.
|
|
145
|
+
|
|
146
|
+
Initializes decoder with lookup tables of preset sizes,
|
|
147
|
+
integration-dependent adapter and empty repeated terms dictionary.
|
|
148
|
+
|
|
149
|
+
Args:
|
|
150
|
+
adapter (Adapter): integration-dependent adapter that specifies terms
|
|
151
|
+
conversion to specific objects, framing,
|
|
152
|
+
namespace declarations, and graphs/datasets forming.
|
|
153
|
+
|
|
154
|
+
"""
|
|
109
155
|
self.adapter = adapter
|
|
110
156
|
self.names = LookupDecoder(lookup_size=self.options.lookup_preset.max_names)
|
|
111
157
|
self.prefixes = LookupDecoder(
|
|
@@ -120,13 +166,43 @@ class Decoder:
|
|
|
120
166
|
def options(self) -> ParserOptions:
|
|
121
167
|
return self.adapter.options
|
|
122
168
|
|
|
123
|
-
def
|
|
169
|
+
def iter_rows(self, frame: jelly.RdfStreamFrame) -> Iterator[Any]:
|
|
170
|
+
"""
|
|
171
|
+
Iterate through rows in the frame.
|
|
172
|
+
|
|
173
|
+
Args:
|
|
174
|
+
frame (jelly.RdfStreamFrame): jelly frame
|
|
175
|
+
Yields:
|
|
176
|
+
Iterator[Any]: decoded rows
|
|
177
|
+
|
|
178
|
+
"""
|
|
124
179
|
for row_owner in frame.rows:
|
|
125
180
|
row = getattr(row_owner, row_owner.WhichOneof("row"))
|
|
126
|
-
self.decode_row(row)
|
|
127
|
-
|
|
181
|
+
decoded_row = self.decode_row(row)
|
|
182
|
+
if isinstance(
|
|
183
|
+
row, (jelly.RdfTriple, jelly.RdfQuad, jelly.RdfNamespaceDeclaration)
|
|
184
|
+
):
|
|
185
|
+
yield decoded_row
|
|
128
186
|
|
|
129
187
|
def decode_row(self, row: Any) -> Any | None:
|
|
188
|
+
"""
|
|
189
|
+
Decode a row based on its type.
|
|
190
|
+
|
|
191
|
+
Notes: uses custom adapters to decode triples/quads, namespace declarations,
|
|
192
|
+
graph start/end.
|
|
193
|
+
|
|
194
|
+
Args:
|
|
195
|
+
row (Any): protobuf row message
|
|
196
|
+
|
|
197
|
+
Raises:
|
|
198
|
+
TypeError: raises error if this type of protobuf message does not have
|
|
199
|
+
a respective handler
|
|
200
|
+
|
|
201
|
+
Returns:
|
|
202
|
+
Any | None: decoded row -
|
|
203
|
+
result from calling decode_row (row type appropriate handler)
|
|
204
|
+
|
|
205
|
+
"""
|
|
130
206
|
try:
|
|
131
207
|
decode_row = self.row_handlers[type(row)]
|
|
132
208
|
except KeyError:
|
|
@@ -145,15 +221,51 @@ class Decoder:
|
|
|
145
221
|
assert lookup_preset.max_names == options.max_name_table_size
|
|
146
222
|
|
|
147
223
|
def ingest_prefix_entry(self, entry: jelly.RdfPrefixEntry) -> None:
|
|
224
|
+
"""
|
|
225
|
+
Update prefix lookup table based on the table entry.
|
|
226
|
+
|
|
227
|
+
Args:
|
|
228
|
+
entry (jelly.RdfPrefixEntry): prefix message, containing id and value
|
|
229
|
+
|
|
230
|
+
"""
|
|
148
231
|
self.prefixes.assign_entry(index=entry.id, value=entry.value)
|
|
149
232
|
|
|
150
233
|
def ingest_name_entry(self, entry: jelly.RdfNameEntry) -> None:
|
|
234
|
+
"""
|
|
235
|
+
Update name lookup table based on the table entry.
|
|
236
|
+
|
|
237
|
+
Args:
|
|
238
|
+
entry (jelly.RdfNameEntry): name message, containing id and value
|
|
239
|
+
|
|
240
|
+
"""
|
|
151
241
|
self.names.assign_entry(index=entry.id, value=entry.value)
|
|
152
242
|
|
|
153
243
|
def ingest_datatype_entry(self, entry: jelly.RdfDatatypeEntry) -> None:
|
|
244
|
+
"""
|
|
245
|
+
Update datatype lookup table based on the table entry.
|
|
246
|
+
|
|
247
|
+
Args:
|
|
248
|
+
entry (jelly.RdfDatatypeEntry): datatype message, containing id and value
|
|
249
|
+
|
|
250
|
+
"""
|
|
154
251
|
self.datatypes.assign_entry(index=entry.id, value=entry.value)
|
|
155
252
|
|
|
156
253
|
def decode_term(self, term: Any) -> Any:
|
|
254
|
+
"""
|
|
255
|
+
Decode a term based on its type: IRI/literal/BN/default graph.
|
|
256
|
+
|
|
257
|
+
Notes: requires a custom adapter with implemented methods for terms decoding.
|
|
258
|
+
|
|
259
|
+
Args:
|
|
260
|
+
term (Any): IRI/literal/BN(string)/Default graph message
|
|
261
|
+
|
|
262
|
+
Raises:
|
|
263
|
+
TypeError: raises error if no handler for the term is found
|
|
264
|
+
|
|
265
|
+
Returns:
|
|
266
|
+
Any: decoded term (currently, rdflib objects, e.g., rdflib.term.URIRef)
|
|
267
|
+
|
|
268
|
+
"""
|
|
157
269
|
try:
|
|
158
270
|
decode_term = self.term_handlers[type(term)]
|
|
159
271
|
except KeyError:
|
|
@@ -162,6 +274,16 @@ class Decoder:
|
|
|
162
274
|
return decode_term(self, term)
|
|
163
275
|
|
|
164
276
|
def decode_iri(self, iri: jelly.RdfIri) -> Any:
|
|
277
|
+
"""
|
|
278
|
+
Decode RdfIri message to IRI using a custom adapter.
|
|
279
|
+
|
|
280
|
+
Args:
|
|
281
|
+
iri (jelly.RdfIri): RdfIri message
|
|
282
|
+
|
|
283
|
+
Returns:
|
|
284
|
+
Any: IRI, based on adapter implementation, e.g., rdflib.term.URIRef
|
|
285
|
+
|
|
286
|
+
"""
|
|
165
287
|
name = self.names.decode_name_term_index(iri.name_id)
|
|
166
288
|
prefix = self.prefixes.decode_prefix_term_index(iri.prefix_id)
|
|
167
289
|
return self.adapter.iri(iri=prefix + name)
|
|
@@ -170,9 +292,32 @@ class Decoder:
|
|
|
170
292
|
return self.adapter.default_graph()
|
|
171
293
|
|
|
172
294
|
def decode_bnode(self, bnode: str) -> Any:
|
|
295
|
+
"""
|
|
296
|
+
Decode string message to blank node (BN) using a custom adapter.
|
|
297
|
+
|
|
298
|
+
Args:
|
|
299
|
+
bnode (str): blank node id
|
|
300
|
+
|
|
301
|
+
Returns:
|
|
302
|
+
Any: blank node object from the custom adapter
|
|
303
|
+
|
|
304
|
+
"""
|
|
173
305
|
return self.adapter.bnode(bnode)
|
|
174
306
|
|
|
175
307
|
def decode_literal(self, literal: jelly.RdfLiteral) -> Any:
|
|
308
|
+
"""
|
|
309
|
+
Decode RdfLiteral to literal based on custom adapter implementation.
|
|
310
|
+
|
|
311
|
+
Notes: checks for langtag existence;
|
|
312
|
+
for datatype checks for non-zero table size and datatype field presence
|
|
313
|
+
|
|
314
|
+
Args:
|
|
315
|
+
literal (jelly.RdfLiteral): RdfLiteral message
|
|
316
|
+
|
|
317
|
+
Returns:
|
|
318
|
+
Any: literal returned by the custom adapter
|
|
319
|
+
|
|
320
|
+
"""
|
|
176
321
|
language = datatype = None
|
|
177
322
|
if literal.langtag:
|
|
178
323
|
language = literal.langtag
|
|
@@ -203,6 +348,22 @@ class Decoder:
|
|
|
203
348
|
statement: jelly.RdfTriple | jelly.RdfQuad,
|
|
204
349
|
oneofs: Sequence[str],
|
|
205
350
|
) -> Any:
|
|
351
|
+
"""
|
|
352
|
+
Decode a triple/quad message.
|
|
353
|
+
|
|
354
|
+
Notes: also updates repeated terms dictionary
|
|
355
|
+
|
|
356
|
+
Args:
|
|
357
|
+
statement (jelly.RdfTriple | jelly.RdfQuad): triple/quad message
|
|
358
|
+
oneofs (Sequence[str]): terms s/p/o/g(if quads)
|
|
359
|
+
|
|
360
|
+
Raises:
|
|
361
|
+
ValueError: if a missing repeated term is encountered
|
|
362
|
+
|
|
363
|
+
Returns:
|
|
364
|
+
Any: a list of decoded terms
|
|
365
|
+
|
|
366
|
+
"""
|
|
206
367
|
terms = []
|
|
207
368
|
for oneof in oneofs:
|
|
208
369
|
field = statement.WhichOneof(oneof)
|
pyjelly/parse/ioutils.py
CHANGED
|
@@ -55,25 +55,47 @@ def delimited_jelly_hint(header: bytes) -> bool:
|
|
|
55
55
|
|
|
56
56
|
def frame_iterator(inp: IO[bytes]) -> Generator[jelly.RdfStreamFrame]:
|
|
57
57
|
while frame := parse_length_prefixed(jelly.RdfStreamFrame, inp):
|
|
58
|
-
|
|
59
|
-
yield frame
|
|
58
|
+
yield frame
|
|
60
59
|
|
|
61
60
|
|
|
62
61
|
def get_options_and_frames(
|
|
63
62
|
inp: IO[bytes],
|
|
64
63
|
) -> tuple[ParserOptions, Iterator[jelly.RdfStreamFrame]]:
|
|
64
|
+
"""
|
|
65
|
+
Return stream options and frames from the buffered binary stream.
|
|
66
|
+
|
|
67
|
+
Args:
|
|
68
|
+
inp (IO[bytes]): jelly buffered binary stream
|
|
69
|
+
|
|
70
|
+
Raises:
|
|
71
|
+
JellyConformanceError: if no non-empty frames detected in the delimited stream
|
|
72
|
+
JellyConformanceError: if non-delimited,
|
|
73
|
+
error is raised if no rows are detected (empty frame)
|
|
74
|
+
|
|
75
|
+
Returns:
|
|
76
|
+
tuple[ParserOptions, Iterator[jelly.RdfStreamFrame]]: ParserOptions holds:
|
|
77
|
+
stream types, lookup presets and other stream options
|
|
78
|
+
|
|
79
|
+
"""
|
|
65
80
|
is_delimited = delimited_jelly_hint(bytes_read := inp.read(3))
|
|
66
81
|
inp.seek(-len(bytes_read), os.SEEK_CUR)
|
|
67
82
|
|
|
68
83
|
if is_delimited:
|
|
84
|
+
first_frame = None
|
|
85
|
+
skipped_frames = []
|
|
69
86
|
frames = frame_iterator(inp)
|
|
70
|
-
|
|
87
|
+
for frame in frames:
|
|
88
|
+
if not frame.rows:
|
|
89
|
+
skipped_frames.append(frame)
|
|
90
|
+
else:
|
|
91
|
+
first_frame = frame
|
|
92
|
+
break
|
|
71
93
|
if first_frame is None:
|
|
72
94
|
msg = "No non-empty frames found in the stream"
|
|
73
95
|
raise JellyConformanceError(msg)
|
|
74
96
|
|
|
75
97
|
options = options_from_frame(first_frame, delimited=True)
|
|
76
|
-
return options, chain((first_frame,), frames)
|
|
98
|
+
return options, chain(skipped_frames, (first_frame,), frames)
|
|
77
99
|
|
|
78
100
|
frame = parse(jelly.RdfStreamFrame, inp.read())
|
|
79
101
|
|