pyjelly 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pyjelly might be problematic. Click here for more details.
- pyjelly/integrations/generic/__init__.py +0 -0
- pyjelly/integrations/generic/generic_sink.py +163 -0
- pyjelly/integrations/generic/parse.py +339 -0
- pyjelly/integrations/generic/serialize.py +361 -0
- pyjelly/integrations/rdflib/parse.py +235 -156
- pyjelly/integrations/rdflib/serialize.py +189 -60
- pyjelly/jelly/rdf_pb2.py +3 -3
- pyjelly/jelly/rdf_pb2.pyi +2 -1
- pyjelly/options.py +9 -0
- pyjelly/parse/decode.py +32 -10
- pyjelly/parse/ioutils.py +10 -4
- pyjelly/serialize/encode.py +30 -3
- pyjelly/serialize/flows.py +24 -14
- pyjelly/serialize/streams.py +5 -2
- {pyjelly-0.3.0.dist-info → pyjelly-0.5.0.dist-info}/METADATA +10 -9
- pyjelly-0.5.0.dist-info/RECORD +32 -0
- pyjelly-0.3.0.dist-info/RECORD +0 -28
- {pyjelly-0.3.0.dist-info → pyjelly-0.5.0.dist-info}/WHEEL +0 -0
- {pyjelly-0.3.0.dist-info → pyjelly-0.5.0.dist-info}/entry_points.txt +0 -0
- {pyjelly-0.3.0.dist-info → pyjelly-0.5.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,12 +1,16 @@
|
|
|
1
|
+
# ruff: noqa: I001
|
|
1
2
|
from __future__ import annotations
|
|
2
|
-
|
|
3
|
+
from typing import cast
|
|
3
4
|
from collections.abc import Generator
|
|
4
5
|
from functools import singledispatch
|
|
5
|
-
from typing import
|
|
6
|
+
from typing import Any, IO
|
|
6
7
|
from typing_extensions import override
|
|
8
|
+
from itertools import chain
|
|
9
|
+
from pyjelly.integrations.rdflib.parse import Quad, Triple
|
|
7
10
|
|
|
8
11
|
import rdflib
|
|
9
|
-
from rdflib
|
|
12
|
+
from rdflib import Graph
|
|
13
|
+
from rdflib.graph import DATASET_DEFAULT_GRAPH_ID, Dataset, QuotedGraph
|
|
10
14
|
from rdflib.serializer import Serializer as RDFLibSerializer
|
|
11
15
|
|
|
12
16
|
from pyjelly import jelly
|
|
@@ -18,7 +22,9 @@ from pyjelly.serialize.streams import (
|
|
|
18
22
|
SerializerOptions,
|
|
19
23
|
Stream,
|
|
20
24
|
TripleStream,
|
|
21
|
-
)
|
|
25
|
+
) # ruff: enable
|
|
26
|
+
|
|
27
|
+
QUAD_ARITY = 4
|
|
22
28
|
|
|
23
29
|
|
|
24
30
|
class RDFLibTermEncoder(TermEncoder):
|
|
@@ -61,7 +67,10 @@ def namespace_declarations(store: Graph, stream: Stream) -> None:
|
|
|
61
67
|
|
|
62
68
|
|
|
63
69
|
@singledispatch
|
|
64
|
-
def stream_frames(
|
|
70
|
+
def stream_frames(
|
|
71
|
+
stream: Stream,
|
|
72
|
+
data: Graph | Generator[Quad | Triple], # noqa: ARG001
|
|
73
|
+
) -> Generator[jelly.RdfStreamFrame]:
|
|
65
74
|
msg = f"invalid stream implementation {stream}"
|
|
66
75
|
raise TypeError(msg)
|
|
67
76
|
|
|
@@ -69,14 +78,15 @@ def stream_frames(stream: Stream, data: Graph) -> Generator[jelly.RdfStreamFrame
|
|
|
69
78
|
@stream_frames.register(TripleStream)
|
|
70
79
|
def triples_stream_frames(
|
|
71
80
|
stream: TripleStream,
|
|
72
|
-
data: Graph | Dataset,
|
|
81
|
+
data: Graph | Dataset | Generator[Triple],
|
|
73
82
|
) -> Generator[jelly.RdfStreamFrame]:
|
|
74
83
|
"""
|
|
75
84
|
Serialize a Graph/Dataset into jelly frames.
|
|
76
85
|
|
|
77
86
|
Args:
|
|
78
87
|
stream (TripleStream): stream that specifies triples processing
|
|
79
|
-
data (Graph | Dataset):
|
|
88
|
+
data (Graph | Dataset | Generator[Triple]):
|
|
89
|
+
Graph/Dataset/Statements to serialize.
|
|
80
90
|
|
|
81
91
|
Notes:
|
|
82
92
|
if Dataset is given, its graphs are unpacked and iterated over
|
|
@@ -87,24 +97,24 @@ def triples_stream_frames(
|
|
|
87
97
|
|
|
88
98
|
"""
|
|
89
99
|
stream.enroll()
|
|
90
|
-
if stream.options.params.namespace_declarations:
|
|
100
|
+
if isinstance(data, Graph) and stream.options.params.namespace_declarations:
|
|
91
101
|
namespace_declarations(data, stream)
|
|
102
|
+
|
|
92
103
|
graphs = (data,) if not isinstance(data, Dataset) else data.graphs()
|
|
93
104
|
for graph in graphs:
|
|
94
105
|
for terms in graph:
|
|
95
106
|
if frame := stream.triple(terms):
|
|
96
107
|
yield frame
|
|
97
|
-
# this part turns each graph to a frame for graphs logical type
|
|
98
108
|
if frame := stream.flow.frame_from_graph():
|
|
99
109
|
yield frame
|
|
100
110
|
if stream.stream_types.flat and (frame := stream.flow.to_stream_frame()):
|
|
101
111
|
yield frame
|
|
102
112
|
|
|
103
113
|
|
|
104
|
-
@stream_frames.register
|
|
114
|
+
@stream_frames.register(QuadStream)
|
|
105
115
|
def quads_stream_frames(
|
|
106
116
|
stream: QuadStream,
|
|
107
|
-
data: Dataset,
|
|
117
|
+
data: Dataset | Generator[Quad],
|
|
108
118
|
) -> Generator[jelly.RdfStreamFrame]:
|
|
109
119
|
"""
|
|
110
120
|
Serialize a Dataset into jelly frames.
|
|
@@ -114,17 +124,23 @@ def quads_stream_frames(
|
|
|
114
124
|
|
|
115
125
|
Args:
|
|
116
126
|
stream (QuadStream): stream that specifies quads processing
|
|
117
|
-
data (Dataset): Dataset to serialize.
|
|
127
|
+
data (Dataset | Generator[Quad]): Dataset to serialize.
|
|
118
128
|
|
|
119
129
|
Yields:
|
|
120
130
|
Generator[jelly.RdfStreamFrame]: jelly frames
|
|
121
131
|
|
|
122
132
|
"""
|
|
123
|
-
assert isinstance(data, Dataset)
|
|
124
133
|
stream.enroll()
|
|
125
134
|
if stream.options.params.namespace_declarations:
|
|
126
|
-
namespace_declarations(data, stream)
|
|
127
|
-
|
|
135
|
+
namespace_declarations(data, stream) # type: ignore[arg-type]
|
|
136
|
+
|
|
137
|
+
iterator: Generator[Quad, None, None]
|
|
138
|
+
if isinstance(data, Dataset):
|
|
139
|
+
iterator = cast(Generator[Quad, None, None], data.quads())
|
|
140
|
+
else:
|
|
141
|
+
iterator = data
|
|
142
|
+
|
|
143
|
+
for terms in iterator:
|
|
128
144
|
if frame := stream.quad(terms):
|
|
129
145
|
yield frame
|
|
130
146
|
if frame := stream.flow.frame_from_dataset():
|
|
@@ -133,10 +149,10 @@ def quads_stream_frames(
|
|
|
133
149
|
yield frame
|
|
134
150
|
|
|
135
151
|
|
|
136
|
-
@stream_frames.register
|
|
152
|
+
@stream_frames.register(GraphStream)
|
|
137
153
|
def graphs_stream_frames(
|
|
138
154
|
stream: GraphStream,
|
|
139
|
-
data: Dataset,
|
|
155
|
+
data: Dataset | Generator[Quad],
|
|
140
156
|
) -> Generator[jelly.RdfStreamFrame]:
|
|
141
157
|
"""
|
|
142
158
|
Serialize a Dataset into jelly frames as a stream of graphs.
|
|
@@ -147,24 +163,76 @@ def graphs_stream_frames(
|
|
|
147
163
|
|
|
148
164
|
Args:
|
|
149
165
|
stream (GraphStream): stream that specifies graphs processing
|
|
150
|
-
data (Dataset): Dataset to serialize.
|
|
166
|
+
data (Dataset | Generator[Quad]): Dataset to serialize.
|
|
151
167
|
|
|
152
168
|
Yields:
|
|
153
169
|
Generator[jelly.RdfStreamFrame]: jelly frames
|
|
154
170
|
|
|
155
171
|
"""
|
|
156
|
-
assert isinstance(data, Dataset)
|
|
157
172
|
stream.enroll()
|
|
158
173
|
if stream.options.params.namespace_declarations:
|
|
159
|
-
namespace_declarations(data, stream)
|
|
160
|
-
|
|
174
|
+
namespace_declarations(data, stream) # type: ignore[arg-type]
|
|
175
|
+
|
|
176
|
+
if isinstance(data, Dataset):
|
|
177
|
+
graphs = data.graphs()
|
|
178
|
+
else:
|
|
179
|
+
ds = Dataset()
|
|
180
|
+
for quad in data:
|
|
181
|
+
ctx = ds.get_context(quad.g)
|
|
182
|
+
ctx.add((quad.s, quad.p, quad.o))
|
|
183
|
+
graphs = ds.graphs()
|
|
184
|
+
|
|
185
|
+
for graph in graphs:
|
|
161
186
|
yield from stream.graph(graph_id=graph.identifier, graph=graph)
|
|
187
|
+
|
|
162
188
|
if frame := stream.flow.frame_from_dataset():
|
|
163
189
|
yield frame
|
|
164
190
|
if stream.stream_types.flat and (frame := stream.flow.to_stream_frame()):
|
|
165
191
|
yield frame
|
|
166
192
|
|
|
167
193
|
|
|
194
|
+
def guess_options(sink: Graph | Dataset) -> SerializerOptions:
|
|
195
|
+
"""
|
|
196
|
+
Guess the serializer options based on the store type.
|
|
197
|
+
|
|
198
|
+
>>> guess_options(Graph()).logical_type
|
|
199
|
+
1
|
|
200
|
+
>>> guess_options(Dataset()).logical_type
|
|
201
|
+
2
|
|
202
|
+
"""
|
|
203
|
+
logical_type = (
|
|
204
|
+
jelly.LOGICAL_STREAM_TYPE_FLAT_QUADS
|
|
205
|
+
if isinstance(sink, Dataset)
|
|
206
|
+
else jelly.LOGICAL_STREAM_TYPE_FLAT_TRIPLES
|
|
207
|
+
)
|
|
208
|
+
return SerializerOptions(logical_type=logical_type)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def guess_stream(options: SerializerOptions, sink: Graph | Dataset) -> Stream:
|
|
212
|
+
"""
|
|
213
|
+
Return an appropriate stream implementation for the given options.
|
|
214
|
+
|
|
215
|
+
Notes: if base(!) logical type is GRAPHS and Dataset is given,
|
|
216
|
+
initializes TripleStream
|
|
217
|
+
|
|
218
|
+
>>> graph_ser = RDFLibJellySerializer(Graph())
|
|
219
|
+
>>> ds_ser = RDFLibJellySerializer(Dataset())
|
|
220
|
+
|
|
221
|
+
>>> type(guess_stream(guess_options(graph_ser.store), graph_ser.store))
|
|
222
|
+
<class 'pyjelly.serialize.streams.TripleStream'>
|
|
223
|
+
>>> type(guess_stream(guess_options(ds_ser.store), ds_ser.store))
|
|
224
|
+
<class 'pyjelly.serialize.streams.QuadStream'>
|
|
225
|
+
"""
|
|
226
|
+
stream_cls: type[Stream]
|
|
227
|
+
if (options.logical_type % 10) != jelly.LOGICAL_STREAM_TYPE_GRAPHS and isinstance(
|
|
228
|
+
sink, Dataset
|
|
229
|
+
):
|
|
230
|
+
stream_cls = QuadStream
|
|
231
|
+
else:
|
|
232
|
+
stream_cls = TripleStream
|
|
233
|
+
return stream_cls.for_rdflib(options=options)
|
|
234
|
+
|
|
235
|
+
|
|
168
236
|
class RDFLibJellySerializer(RDFLibSerializer):
|
|
169
237
|
"""
|
|
170
238
|
RDFLib serializer for writing graphs in Jelly RDF stream format.
|
|
@@ -180,43 +248,6 @@ class RDFLibJellySerializer(RDFLibSerializer):
|
|
|
180
248
|
raise NotImplementedError(msg)
|
|
181
249
|
super().__init__(store)
|
|
182
250
|
|
|
183
|
-
def guess_options(self) -> SerializerOptions:
|
|
184
|
-
"""
|
|
185
|
-
Guess the serializer options based on the store type.
|
|
186
|
-
|
|
187
|
-
>>> RDFLibJellySerializer(Graph()).guess_options().logical_type
|
|
188
|
-
1
|
|
189
|
-
>>> RDFLibJellySerializer(Dataset()).guess_options().logical_type
|
|
190
|
-
2
|
|
191
|
-
"""
|
|
192
|
-
logical_type = (
|
|
193
|
-
jelly.LOGICAL_STREAM_TYPE_FLAT_QUADS
|
|
194
|
-
if isinstance(self.store, Dataset)
|
|
195
|
-
else jelly.LOGICAL_STREAM_TYPE_FLAT_TRIPLES
|
|
196
|
-
)
|
|
197
|
-
return SerializerOptions(logical_type=logical_type)
|
|
198
|
-
|
|
199
|
-
def guess_stream(self, options: SerializerOptions) -> Stream:
|
|
200
|
-
"""
|
|
201
|
-
Return an appropriate stream implementation for the given options.
|
|
202
|
-
|
|
203
|
-
>>> graph_ser = RDFLibJellySerializer(Graph())
|
|
204
|
-
>>> ds_ser = RDFLibJellySerializer(Dataset())
|
|
205
|
-
|
|
206
|
-
>>> type(graph_ser.guess_stream(graph_ser.guess_options()))
|
|
207
|
-
<class 'pyjelly.serialize.streams.TripleStream'>
|
|
208
|
-
>>> type(ds_ser.guess_stream(ds_ser.guess_options()))
|
|
209
|
-
<class 'pyjelly.serialize.streams.QuadStream'>
|
|
210
|
-
"""
|
|
211
|
-
stream_cls: type[Stream]
|
|
212
|
-
if options.logical_type != jelly.LOGICAL_STREAM_TYPE_GRAPHS and isinstance(
|
|
213
|
-
self.store, Dataset
|
|
214
|
-
):
|
|
215
|
-
stream_cls = QuadStream
|
|
216
|
-
else:
|
|
217
|
-
stream_cls = TripleStream
|
|
218
|
-
return stream_cls.for_rdflib(options=options)
|
|
219
|
-
|
|
220
251
|
@override
|
|
221
252
|
def serialize( # type: ignore[override]
|
|
222
253
|
self,
|
|
@@ -240,9 +271,107 @@ class RDFLibJellySerializer(RDFLibSerializer):
|
|
|
240
271
|
|
|
241
272
|
"""
|
|
242
273
|
if options is None:
|
|
243
|
-
options = self.
|
|
274
|
+
options = guess_options(self.store)
|
|
244
275
|
if stream is None:
|
|
245
|
-
stream =
|
|
276
|
+
stream = guess_stream(options, self.store)
|
|
246
277
|
write = write_delimited if stream.options.params.delimited else write_single
|
|
247
278
|
for stream_frame in stream_frames(stream, self.store):
|
|
248
279
|
write(stream_frame, out)
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def grouped_stream_to_frames(
|
|
283
|
+
sink_generator: Generator[Graph] | Generator[Dataset],
|
|
284
|
+
options: SerializerOptions | None = None,
|
|
285
|
+
) -> Generator[jelly.RdfStreamFrame]:
|
|
286
|
+
"""
|
|
287
|
+
Transform Graphs/Datasets into Jelly frames, one frame per Graph/Dataset.
|
|
288
|
+
|
|
289
|
+
Note: options are guessed if not provided.
|
|
290
|
+
|
|
291
|
+
Args:
|
|
292
|
+
sink_generator (Generator[Graph] | Generator[Dataset]): Generator of
|
|
293
|
+
Graphs/Dataset to transform.
|
|
294
|
+
options (SerializerOptions | None, optional): stream options to use.
|
|
295
|
+
Options are guessed based on the sink store type. Defaults to None.
|
|
296
|
+
|
|
297
|
+
Yields:
|
|
298
|
+
Generator[jelly.RdfStreamFrame]: produced Jelly frames
|
|
299
|
+
|
|
300
|
+
"""
|
|
301
|
+
stream = None
|
|
302
|
+
for sink in sink_generator:
|
|
303
|
+
if not stream:
|
|
304
|
+
if options is None:
|
|
305
|
+
options = guess_options(sink)
|
|
306
|
+
stream = guess_stream(options, sink)
|
|
307
|
+
yield from stream_frames(stream, sink)
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def grouped_stream_to_file(
|
|
311
|
+
stream: Generator[Graph] | Generator[Dataset],
|
|
312
|
+
output_file: IO[bytes],
|
|
313
|
+
**kwargs: Any,
|
|
314
|
+
) -> None:
|
|
315
|
+
"""
|
|
316
|
+
Write stream of Graphs/Datasets to a binary file.
|
|
317
|
+
|
|
318
|
+
Args:
|
|
319
|
+
stream (Generator[Graph] | Generator[Dataset]): Generator of
|
|
320
|
+
Graphs/Dataset to transform.
|
|
321
|
+
output_file (IO[bytes]): output buffered writer.
|
|
322
|
+
**kwargs (Any): options to pass to stream.
|
|
323
|
+
|
|
324
|
+
"""
|
|
325
|
+
for frame in grouped_stream_to_frames(stream, **kwargs):
|
|
326
|
+
write_delimited(frame, output_file)
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def flat_stream_to_frames(
|
|
330
|
+
statements: Generator[Triple | Quad],
|
|
331
|
+
options: SerializerOptions | None = None,
|
|
332
|
+
) -> Generator[jelly.RdfStreamFrame]:
|
|
333
|
+
"""
|
|
334
|
+
Serialize a stream of raw triples or quads into Jelly frames.
|
|
335
|
+
|
|
336
|
+
Args:
|
|
337
|
+
statements (Generator[Triple | Quad]):
|
|
338
|
+
s/p/o triples or s/p/o/g quads to serialize.
|
|
339
|
+
options (SerializerOptions | None, optional):
|
|
340
|
+
if omitted, guessed based on the first tuple.
|
|
341
|
+
|
|
342
|
+
Yields:
|
|
343
|
+
Generator[jelly.RdfStreamFrame]: generated frames.
|
|
344
|
+
|
|
345
|
+
"""
|
|
346
|
+
first = next(statements, None)
|
|
347
|
+
if first is None:
|
|
348
|
+
return
|
|
349
|
+
|
|
350
|
+
sink = Dataset() if len(first) == QUAD_ARITY else Graph()
|
|
351
|
+
if options is None:
|
|
352
|
+
options = guess_options(sink)
|
|
353
|
+
stream = guess_stream(options, sink)
|
|
354
|
+
|
|
355
|
+
combined: Generator[Triple | Quad] | Graph = (
|
|
356
|
+
item for item in chain([first], statements)
|
|
357
|
+
)
|
|
358
|
+
|
|
359
|
+
yield from stream_frames(stream, combined)
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
def flat_stream_to_file(
|
|
363
|
+
statements: Generator[Triple | Quad],
|
|
364
|
+
output_file: IO[bytes],
|
|
365
|
+
options: SerializerOptions | None = None,
|
|
366
|
+
) -> None:
|
|
367
|
+
"""
|
|
368
|
+
Write Triple or Quad events to a binary file in Jelly flat format.
|
|
369
|
+
|
|
370
|
+
Args:
|
|
371
|
+
statements (Generator[Triple | Quad]): statements to serialize.
|
|
372
|
+
output_file (IO[bytes]): output buffered writer.
|
|
373
|
+
options (SerializerOptions | None, optional): stream options.
|
|
374
|
+
|
|
375
|
+
"""
|
|
376
|
+
for frame in flat_stream_to_frames(statements, options):
|
|
377
|
+
write_delimited(frame, output_file)
|
pyjelly/jelly/rdf_pb2.py
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
|
3
3
|
# NO CHECKED-IN PROTOBUF GENCODE
|
|
4
4
|
# source: rdf.proto
|
|
5
|
-
# Protobuf Python Version:
|
|
5
|
+
# Protobuf Python Version: 6.31.0
|
|
6
6
|
"""Generated protocol buffer code."""
|
|
7
7
|
from google.protobuf import descriptor as _descriptor
|
|
8
8
|
from google.protobuf import descriptor_pool as _descriptor_pool
|
|
@@ -11,8 +11,8 @@ from google.protobuf import symbol_database as _symbol_database
|
|
|
11
11
|
from google.protobuf.internal import builder as _builder
|
|
12
12
|
_runtime_version.ValidateProtobufRuntimeVersion(
|
|
13
13
|
_runtime_version.Domain.PUBLIC,
|
|
14
|
-
|
|
15
|
-
|
|
14
|
+
6,
|
|
15
|
+
31,
|
|
16
16
|
0,
|
|
17
17
|
'',
|
|
18
18
|
'rdf.proto'
|
pyjelly/jelly/rdf_pb2.pyi
CHANGED
|
@@ -2,7 +2,8 @@ from google.protobuf.internal import containers as _containers
|
|
|
2
2
|
from google.protobuf.internal import enum_type_wrapper as _enum_type_wrapper
|
|
3
3
|
from google.protobuf import descriptor as _descriptor
|
|
4
4
|
from google.protobuf import message as _message
|
|
5
|
-
from
|
|
5
|
+
from collections.abc import Iterable as _Iterable, Mapping as _Mapping
|
|
6
|
+
from typing import ClassVar as _ClassVar, Optional as _Optional, Union as _Union
|
|
6
7
|
|
|
7
8
|
DESCRIPTOR: _descriptor.FileDescriptor
|
|
8
9
|
|
pyjelly/options.py
CHANGED
|
@@ -15,6 +15,7 @@ from pyjelly.errors import (
|
|
|
15
15
|
MIN_NAME_LOOKUP_SIZE: Final[int] = 8
|
|
16
16
|
|
|
17
17
|
MAX_LOOKUP_SIZE: Final[int] = 4096
|
|
18
|
+
MIN_VERSION: Final[int] = 1
|
|
18
19
|
MAX_VERSION: Final[int] = 2
|
|
19
20
|
|
|
20
21
|
DEFAULT_NAME_LOOKUP_SIZE: Final[int] = 4000
|
|
@@ -97,6 +98,14 @@ class StreamParameters:
|
|
|
97
98
|
namespace_declarations: bool = False
|
|
98
99
|
stream_name: str = ""
|
|
99
100
|
|
|
101
|
+
def __post_init__(self) -> None:
|
|
102
|
+
selected = MAX_VERSION if self.namespace_declarations else MIN_VERSION
|
|
103
|
+
if not (MIN_VERSION <= selected <= MAX_VERSION):
|
|
104
|
+
msg = f"""Error occured while settin up the Stream options.
|
|
105
|
+
Version must be between {MIN_VERSION} and {MAX_VERSION}."""
|
|
106
|
+
raise JellyConformanceError(msg)
|
|
107
|
+
object.__setattr__(self, "version", selected)
|
|
108
|
+
|
|
100
109
|
|
|
101
110
|
TRIPLES_ONLY_LOGICAL_TYPES = {
|
|
102
111
|
jelly.LOGICAL_STREAM_TYPE_GRAPHS,
|
pyjelly/parse/decode.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from abc import ABCMeta, abstractmethod
|
|
4
|
-
from collections.abc import Iterable, Sequence
|
|
4
|
+
from collections.abc import Iterable, Iterator, Sequence
|
|
5
5
|
from enum import Enum, auto
|
|
6
6
|
from typing import Any, ClassVar, NamedTuple
|
|
7
7
|
from typing_extensions import Never
|
|
@@ -66,6 +66,8 @@ def options_from_frame(
|
|
|
66
66
|
),
|
|
67
67
|
params=StreamParameters(
|
|
68
68
|
stream_name=options.stream_name,
|
|
69
|
+
generalized_statements=options.generalized_statements,
|
|
70
|
+
rdf_star=options.rdf_star,
|
|
69
71
|
version=options.version,
|
|
70
72
|
delimited=delimited,
|
|
71
73
|
),
|
|
@@ -134,6 +136,11 @@ class Adapter(metaclass=ABCMeta):
|
|
|
134
136
|
stream_types=self.options.stream_types,
|
|
135
137
|
)
|
|
136
138
|
|
|
139
|
+
def quoted_triple(self, terms: Iterable[Any]) -> Any: # noqa: ARG002
|
|
140
|
+
_adapter_missing(
|
|
141
|
+
"decoding quoted triple", stream_types=self.options.stream_types
|
|
142
|
+
)
|
|
143
|
+
|
|
137
144
|
def frame(self) -> Any:
|
|
138
145
|
return None
|
|
139
146
|
|
|
@@ -166,23 +173,23 @@ class Decoder:
|
|
|
166
173
|
def options(self) -> ParserOptions:
|
|
167
174
|
return self.adapter.options
|
|
168
175
|
|
|
169
|
-
def
|
|
176
|
+
def iter_rows(self, frame: jelly.RdfStreamFrame) -> Iterator[Any]:
|
|
170
177
|
"""
|
|
171
|
-
|
|
178
|
+
Iterate through rows in the frame.
|
|
172
179
|
|
|
173
180
|
Args:
|
|
174
181
|
frame (jelly.RdfStreamFrame): jelly frame
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
Any: custom obj based on adapter logic
|
|
182
|
+
Yields:
|
|
183
|
+
Iterator[Any]: decoded rows
|
|
178
184
|
|
|
179
185
|
"""
|
|
180
186
|
for row_owner in frame.rows:
|
|
181
187
|
row = getattr(row_owner, row_owner.WhichOneof("row"))
|
|
182
|
-
self.decode_row(row)
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
188
|
+
decoded_row = self.decode_row(row)
|
|
189
|
+
if isinstance(
|
|
190
|
+
row, (jelly.RdfTriple, jelly.RdfQuad, jelly.RdfNamespaceDeclaration)
|
|
191
|
+
):
|
|
192
|
+
yield decoded_row
|
|
186
193
|
|
|
187
194
|
def decode_row(self, row: Any) -> Any | None:
|
|
188
195
|
"""
|
|
@@ -383,6 +390,20 @@ class Decoder:
|
|
|
383
390
|
terms = self.decode_statement(triple, ("subject", "predicate", "object"))
|
|
384
391
|
return self.adapter.triple(terms)
|
|
385
392
|
|
|
393
|
+
def decode_quoted_triple(self, triple: jelly.RdfTriple) -> Any:
|
|
394
|
+
oneofs: Sequence[str] = ("subject", "predicate", "object")
|
|
395
|
+
terms = []
|
|
396
|
+
for oneof in oneofs:
|
|
397
|
+
field = triple.WhichOneof(oneof)
|
|
398
|
+
if field:
|
|
399
|
+
jelly_term = getattr(triple, field)
|
|
400
|
+
decoded_term = self.decode_term(jelly_term)
|
|
401
|
+
else:
|
|
402
|
+
msg = "repeated terms are not allowed in quoted triples"
|
|
403
|
+
raise ValueError(msg)
|
|
404
|
+
terms.append(decoded_term)
|
|
405
|
+
return self.adapter.quoted_triple(terms)
|
|
406
|
+
|
|
386
407
|
def decode_quad(self, quad: jelly.RdfQuad) -> Any:
|
|
387
408
|
terms = self.decode_statement(quad, ("subject", "predicate", "object", "graph"))
|
|
388
409
|
return self.adapter.quad(terms)
|
|
@@ -405,4 +426,5 @@ class Decoder:
|
|
|
405
426
|
str: decode_bnode,
|
|
406
427
|
jelly.RdfLiteral: decode_literal,
|
|
407
428
|
jelly.RdfDefaultGraph: decode_default_graph,
|
|
429
|
+
jelly.RdfTriple: decode_quoted_triple,
|
|
408
430
|
}
|
pyjelly/parse/ioutils.py
CHANGED
|
@@ -55,8 +55,7 @@ def delimited_jelly_hint(header: bytes) -> bool:
|
|
|
55
55
|
|
|
56
56
|
def frame_iterator(inp: IO[bytes]) -> Generator[jelly.RdfStreamFrame]:
|
|
57
57
|
while frame := parse_length_prefixed(jelly.RdfStreamFrame, inp):
|
|
58
|
-
|
|
59
|
-
yield frame
|
|
58
|
+
yield frame
|
|
60
59
|
|
|
61
60
|
|
|
62
61
|
def get_options_and_frames(
|
|
@@ -82,14 +81,21 @@ def get_options_and_frames(
|
|
|
82
81
|
inp.seek(-len(bytes_read), os.SEEK_CUR)
|
|
83
82
|
|
|
84
83
|
if is_delimited:
|
|
84
|
+
first_frame = None
|
|
85
|
+
skipped_frames = []
|
|
85
86
|
frames = frame_iterator(inp)
|
|
86
|
-
|
|
87
|
+
for frame in frames:
|
|
88
|
+
if not frame.rows:
|
|
89
|
+
skipped_frames.append(frame)
|
|
90
|
+
else:
|
|
91
|
+
first_frame = frame
|
|
92
|
+
break
|
|
87
93
|
if first_frame is None:
|
|
88
94
|
msg = "No non-empty frames found in the stream"
|
|
89
95
|
raise JellyConformanceError(msg)
|
|
90
96
|
|
|
91
97
|
options = options_from_frame(first_frame, delimited=True)
|
|
92
|
-
return options, chain((first_frame,), frames)
|
|
98
|
+
return options, chain(skipped_frames, (first_frame,), frames)
|
|
93
99
|
|
|
94
100
|
frame = parse(jelly.RdfStreamFrame, inp.read())
|
|
95
101
|
|
pyjelly/serialize/encode.py
CHANGED
|
@@ -32,9 +32,8 @@ def split_iri(iri_string: str) -> tuple[str, str]:
|
|
|
32
32
|
|
|
33
33
|
T = TypeVar("T")
|
|
34
34
|
RowsAnd: TypeAlias = tuple[Sequence[jelly.RdfStreamRow], T]
|
|
35
|
-
RowsAndTerm: TypeAlias =
|
|
36
|
-
|
|
37
|
-
)
|
|
35
|
+
RowsAndTerm: TypeAlias = "RowsAnd[jelly.RdfIri | jelly.RdfLiteral | str | \
|
|
36
|
+
jelly.RdfDefaultGraph | jelly.RdfTriple]"
|
|
38
37
|
|
|
39
38
|
|
|
40
39
|
class TermEncoder:
|
|
@@ -43,6 +42,7 @@ class TermEncoder:
|
|
|
43
42
|
jelly.RdfLiteral: "literal",
|
|
44
43
|
str: "bnode",
|
|
45
44
|
jelly.RdfDefaultGraph: "default_graph",
|
|
45
|
+
jelly.RdfTriple: "triple_term",
|
|
46
46
|
}
|
|
47
47
|
|
|
48
48
|
def __init__(
|
|
@@ -163,6 +163,33 @@ class TermEncoder:
|
|
|
163
163
|
datatype=datatype_id,
|
|
164
164
|
)
|
|
165
165
|
|
|
166
|
+
def encode_quoted_triple(self, terms: Iterable[object]) -> RowsAndTerm:
|
|
167
|
+
"""
|
|
168
|
+
Encode a quoted triple.
|
|
169
|
+
|
|
170
|
+
Notes:
|
|
171
|
+
Although a triple, it is treated as a part of a statement.
|
|
172
|
+
Repeated terms are not used when encoding quoted triples.
|
|
173
|
+
|
|
174
|
+
Args:
|
|
175
|
+
terms (Iterable[object]): triple terms to encode.
|
|
176
|
+
|
|
177
|
+
Returns:
|
|
178
|
+
RowsAndTerm: additional stream rows with preceeding
|
|
179
|
+
information (prefixes, names, datatypes rows, if any)
|
|
180
|
+
and the encoded triple row.
|
|
181
|
+
|
|
182
|
+
"""
|
|
183
|
+
statement: dict[str, Any] = {}
|
|
184
|
+
rows: list[jelly.RdfStreamRow] = []
|
|
185
|
+
for slot, term in zip(Slot, terms):
|
|
186
|
+
extra_rows, value = self.encode_any(term, slot)
|
|
187
|
+
oneof = self.TERM_ONEOF_NAMES[type(value)]
|
|
188
|
+
rows.extend(extra_rows)
|
|
189
|
+
field = f"{slot}_{oneof}"
|
|
190
|
+
statement[field] = value
|
|
191
|
+
return rows, jelly.RdfTriple(**statement)
|
|
192
|
+
|
|
166
193
|
def encode_any(self, term: object, slot: Slot) -> RowsAndTerm:
|
|
167
194
|
msg = f"unsupported term type: {type(term)}"
|
|
168
195
|
raise NotImplementedError(msg)
|
pyjelly/serialize/flows.py
CHANGED
|
@@ -3,7 +3,7 @@ from __future__ import annotations
|
|
|
3
3
|
from collections import UserList
|
|
4
4
|
from collections.abc import Iterable
|
|
5
5
|
from dataclasses import dataclass
|
|
6
|
-
from typing import ClassVar
|
|
6
|
+
from typing import Any, ClassVar
|
|
7
7
|
from typing_extensions import override
|
|
8
8
|
|
|
9
9
|
from pyjelly import jelly
|
|
@@ -16,11 +16,24 @@ class FrameFlow(UserList[jelly.RdfStreamRow]):
|
|
|
16
16
|
Abstract base class for producing Jelly frames from RDF stream rows.
|
|
17
17
|
|
|
18
18
|
Collects stream rows and assembles them into RdfStreamFrame objects when ready.
|
|
19
|
+
|
|
20
|
+
Allows for passing LogicalStreamType, required for
|
|
21
|
+
logical subtypes and non-delimited streams.
|
|
19
22
|
"""
|
|
20
23
|
|
|
21
24
|
logical_type: jelly.LogicalStreamType
|
|
22
25
|
registry: ClassVar[dict[jelly.LogicalStreamType, type[FrameFlow]]] = {}
|
|
23
26
|
|
|
27
|
+
def __init__(
|
|
28
|
+
self,
|
|
29
|
+
initlist: Iterable[jelly.RdfStreamRow] | None = None,
|
|
30
|
+
*,
|
|
31
|
+
logical_type: jelly.LogicalStreamType | None = None,
|
|
32
|
+
**__kwargs: Any,
|
|
33
|
+
) -> None:
|
|
34
|
+
super().__init__(initlist)
|
|
35
|
+
self.logical_type = logical_type or self.__class__.logical_type
|
|
36
|
+
|
|
24
37
|
def frame_from_graph(self) -> jelly.RdfStreamFrame | None:
|
|
25
38
|
"""
|
|
26
39
|
Treat the current rows as a graph and produce a frame.
|
|
@@ -71,15 +84,6 @@ class ManualFrameFlow(FrameFlow):
|
|
|
71
84
|
|
|
72
85
|
logical_type = jelly.LOGICAL_STREAM_TYPE_UNSPECIFIED
|
|
73
86
|
|
|
74
|
-
def __init__(
|
|
75
|
-
self,
|
|
76
|
-
initlist: Iterable[jelly.RdfStreamRow] | None = None,
|
|
77
|
-
*,
|
|
78
|
-
logical_type: jelly.LogicalStreamType = jelly.LOGICAL_STREAM_TYPE_UNSPECIFIED,
|
|
79
|
-
) -> None:
|
|
80
|
-
super().__init__(initlist)
|
|
81
|
-
self.logical_type = logical_type
|
|
82
|
-
|
|
83
87
|
|
|
84
88
|
@dataclass
|
|
85
89
|
class BoundedFrameFlow(FrameFlow):
|
|
@@ -92,13 +96,15 @@ class BoundedFrameFlow(FrameFlow):
|
|
|
92
96
|
logical_type = jelly.LOGICAL_STREAM_TYPE_UNSPECIFIED
|
|
93
97
|
frame_size: int
|
|
94
98
|
|
|
99
|
+
@override
|
|
95
100
|
def __init__(
|
|
96
101
|
self,
|
|
97
102
|
initlist: Iterable[jelly.RdfStreamRow] | None = None,
|
|
103
|
+
logical_type: jelly.LogicalStreamType | None = None,
|
|
98
104
|
*,
|
|
99
105
|
frame_size: int | None = None,
|
|
100
106
|
) -> None:
|
|
101
|
-
super().__init__(initlist)
|
|
107
|
+
super().__init__(initlist, logical_type=logical_type)
|
|
102
108
|
self.frame_size = frame_size or DEFAULT_FRAME_SIZE
|
|
103
109
|
|
|
104
110
|
@override
|
|
@@ -153,7 +159,6 @@ class DatasetsFrameFlow(FrameFlow):
|
|
|
153
159
|
return self.to_stream_frame()
|
|
154
160
|
|
|
155
161
|
|
|
156
|
-
# TODO(Nastya): issue #184
|
|
157
162
|
FLOW_DISPATCH: dict[jelly.LogicalStreamType, type[FrameFlow]] = {
|
|
158
163
|
jelly.LOGICAL_STREAM_TYPE_FLAT_TRIPLES: FlatTriplesFrameFlow,
|
|
159
164
|
jelly.LOGICAL_STREAM_TYPE_FLAT_QUADS: FlatQuadsFrameFlow,
|
|
@@ -166,18 +171,23 @@ def flow_for_type(logical_type: jelly.LogicalStreamType) -> type[FrameFlow]:
|
|
|
166
171
|
"""
|
|
167
172
|
Return flow based on logical type requested.
|
|
168
173
|
|
|
174
|
+
Note: uses base logical type for subtypes (i.e., SUBJECT_GRAPHS uses
|
|
175
|
+
the same flow as its base type GRAPHS).
|
|
176
|
+
|
|
169
177
|
Args:
|
|
170
178
|
logical_type (jelly.LogicalStreamType): logical type requested.
|
|
171
179
|
|
|
172
180
|
Raises:
|
|
173
|
-
NotImplementedError: if logical type not supported.
|
|
181
|
+
NotImplementedError: if (base) logical stream type is not supported.
|
|
174
182
|
|
|
175
183
|
Returns:
|
|
176
184
|
type[FrameFlow]: FrameFlow for respective logical type.
|
|
177
185
|
|
|
178
186
|
"""
|
|
179
187
|
try:
|
|
180
|
-
|
|
188
|
+
base_logical_type_value = logical_type % 10
|
|
189
|
+
base_name = jelly.LogicalStreamType.Name(base_logical_type_value)
|
|
190
|
+
return FLOW_DISPATCH[getattr(jelly.LogicalStreamType, base_name)]
|
|
181
191
|
except KeyError:
|
|
182
192
|
msg = (
|
|
183
193
|
"unsupported logical stream type: "
|