pyjelly 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pyjelly might be problematic. Click here for more details.
- pyjelly/integrations/generic/__init__.py +0 -0
- pyjelly/integrations/generic/generic_sink.py +163 -0
- pyjelly/integrations/generic/parse.py +339 -0
- pyjelly/integrations/generic/serialize.py +361 -0
- pyjelly/integrations/rdflib/parse.py +235 -156
- pyjelly/integrations/rdflib/serialize.py +189 -60
- pyjelly/jelly/rdf_pb2.py +3 -3
- pyjelly/jelly/rdf_pb2.pyi +2 -1
- pyjelly/options.py +9 -0
- pyjelly/parse/decode.py +32 -10
- pyjelly/parse/ioutils.py +10 -4
- pyjelly/serialize/encode.py +30 -3
- pyjelly/serialize/flows.py +24 -14
- pyjelly/serialize/streams.py +5 -2
- {pyjelly-0.3.0.dist-info → pyjelly-0.5.0.dist-info}/METADATA +10 -9
- pyjelly-0.5.0.dist-info/RECORD +32 -0
- pyjelly-0.3.0.dist-info/RECORD +0 -28
- {pyjelly-0.3.0.dist-info → pyjelly-0.5.0.dist-info}/WHEEL +0 -0
- {pyjelly-0.3.0.dist-info → pyjelly-0.5.0.dist-info}/entry_points.txt +0 -0
- {pyjelly-0.3.0.dist-info → pyjelly-0.5.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,361 @@
|
|
|
1
|
+
# ruff: noqa: I001
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
from typing import cast
|
|
4
|
+
from collections.abc import Generator
|
|
5
|
+
from functools import singledispatch
|
|
6
|
+
from typing import Any, IO
|
|
7
|
+
from itertools import chain
|
|
8
|
+
from pyjelly.integrations.generic.generic_sink import (
|
|
9
|
+
GenericStatementSink,
|
|
10
|
+
Quad,
|
|
11
|
+
Triple,
|
|
12
|
+
DEFAULT_GRAPH_IDENTIFIER,
|
|
13
|
+
IRI,
|
|
14
|
+
BlankNode,
|
|
15
|
+
Literal,
|
|
16
|
+
Node,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
from pyjelly import jelly
|
|
20
|
+
from pyjelly.serialize.encode import RowsAndTerm, Slot, TermEncoder
|
|
21
|
+
from pyjelly.serialize.ioutils import write_delimited
|
|
22
|
+
from pyjelly.serialize.streams import (
|
|
23
|
+
GraphStream,
|
|
24
|
+
QuadStream,
|
|
25
|
+
SerializerOptions,
|
|
26
|
+
Stream,
|
|
27
|
+
TripleStream,
|
|
28
|
+
) # ruff: enable
|
|
29
|
+
|
|
30
|
+
# Number of terms in a quad statement (s, p, o and g).
QUAD_ARITY = 4
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class GenericSinkTermEncoder(TermEncoder):
    """Term encoder that understands the node types of the generic sink."""

    def encode_any(self, term: object, slot: Slot) -> RowsAndTerm:
        """
        Encode a generic-sink term by dispatching on its type.

        The default graph identifier is only recognised in the graph slot.
        IRIs, literals, blank nodes and triples used as terms are handed to
        the matching encoder method; everything else goes to the base class.

        Args:
            term (object): term to encode
            slot (Slot): position of the term within the statement.

        Returns:
            RowsAndTerm: encoded extra rows and the jelly term to encode

        """
        in_default_graph = slot is Slot.graph and term == DEFAULT_GRAPH_IDENTIFIER
        if in_default_graph:
            encoded = self.encode_default_graph()
        elif isinstance(term, IRI):
            encoded = self.encode_iri(term._iri)
        elif isinstance(term, Literal):
            encoded = self.encode_literal(
                lex=term._lex,
                language=term._langtag,
                datatype=term._datatype,
            )
        elif isinstance(term, BlankNode):
            encoded = self.encode_bnode(term._identifier)
        elif isinstance(term, Triple):
            encoded = self.encode_quoted_triple(term)
        else:
            # Fallback: the base implementation is expected to report an
            # error for terms that none of the branches above handled.
            encoded = super().encode_any(term, slot)
        return encoded
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def namespace_declarations(store: GenericStatementSink, stream: Stream) -> None:
    """
    Emit one namespace declaration on the stream per namespace of the sink.

    Args:
        store (GenericStatementSink): sink whose ``namespaces`` are iterated
            as (prefix, namespace) pairs.
        stream (Stream): stream that receives the declarations; the
            namespace is passed to it converted with ``str``.

    """
    for binding in store.namespaces:
        name, target = binding
        stream.namespace_declaration(name=name, iri=str(target))
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@singledispatch
def stream_frames(
    stream: Stream,
    data: GenericStatementSink | Generator[Quad | Triple],  # noqa: ARG001
) -> Generator[jelly.RdfStreamFrame]:
    """
    Serialize data into jelly frames, dispatching on the type of ``stream``.

    This is the fallback that runs when no implementation is registered for
    the type of the given stream.

    Args:
        stream (Stream): stream whose type selects the implementation.
        data (GenericStatementSink | Generator[Quad | Triple]): statements
            to serialize; unused by this fallback.

    Raises:
        TypeError: always, because the stream type is not supported.

    """
    unsupported = f"invalid stream implementation {stream}"
    raise TypeError(unsupported)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@stream_frames.register(TripleStream)
def triples_stream_frames(
    stream: TripleStream,
    data: GenericStatementSink | Generator[Triple],
) -> Generator[jelly.RdfStreamFrame]:
    """
    Produce jelly frames from triples using the physical type triples stream.

    Namespace declarations are emitted first, but only when ``data`` is a
    GenericStatementSink and the stream options enable them.

    Args:
        stream (TripleStream): stream that specifies triples processing
        data (GenericStatementSink | Generator[Triple]):
            GenericStatementSink/Statements to serialize.

    Yields:
        Generator[jelly.RdfStreamFrame]: jelly frames.

    """
    stream.enroll()
    if (
        isinstance(data, GenericStatementSink)
        and stream.options.params.namespace_declarations
    ):
        namespace_declarations(data, stream)

    # The whole input is handled as one graph.
    for statement in data:
        statement_frame = stream.triple(statement)
        if statement_frame:
            yield statement_frame

    graph_frame = stream.flow.frame_from_graph()
    if graph_frame:
        yield graph_frame

    if stream.stream_types.flat:
        closing_frame = stream.flow.to_stream_frame()
        if closing_frame:
            yield closing_frame
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
@stream_frames.register(QuadStream)
def quads_stream_frames(
    stream: QuadStream,
    data: GenericStatementSink | Generator[Quad],
) -> Generator[jelly.RdfStreamFrame]:
    """
    Serialize a GenericStatementSink into jelly frames using physical type quads stream.

    Namespace declarations are emitted first, but only when ``data`` is a
    GenericStatementSink and the stream options enable them.

    Args:
        stream (QuadStream): stream that specifies quads processing
        data (GenericStatementSink | Generator[Quad]): Dataset to serialize.

    Yields:
        Generator[jelly.RdfStreamFrame]: jelly frames

    """
    stream.enroll()
    # Only a sink exposes ``namespaces``; a bare generator of quads does not,
    # so declarations are skipped for it instead of raising AttributeError.
    if (
        isinstance(data, GenericStatementSink)
        and stream.options.params.namespace_declarations
    ):
        namespace_declarations(data, stream)

    iterator: Generator[Quad]
    if isinstance(data, GenericStatementSink):
        iterator = cast(Generator[Quad], data.store)
    else:
        iterator = data

    for terms in iterator:
        if frame := stream.quad(terms):
            yield frame
    if frame := stream.flow.frame_from_dataset():
        yield frame
    if stream.stream_types.flat and (frame := stream.flow.to_stream_frame()):
        yield frame
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
@stream_frames.register(GraphStream)
def graphs_stream_frames(
    stream: GraphStream,
    data: GenericStatementSink | Generator[Quad],
) -> Generator[jelly.RdfStreamFrame]:
    """
    Serialize a GenericStatementSink into jelly frames as a stream of graphs.

    Notes:
        If the flow is of DatasetsFrameFlow type, the whole dataset
        will be encoded into one frame.
        Graphs are generated from the input by iterating over statements
        and yielding one new GenericStatementSink per sequence of quads
        with the same g term.
        Namespace declarations are emitted only when ``data`` is a
        GenericStatementSink and the stream options enable them.

    Args:
        stream (GraphStream): stream that specifies graphs processing
        data (GenericStatementSink | Generator[Quad]): Dataset to serialize.

    Yields:
        Generator[jelly.RdfStreamFrame]: jelly frames

    """
    stream.enroll()
    # Only a sink exposes ``namespaces``; a bare generator of quads does not,
    # so declarations are skipped for it instead of raising AttributeError.
    if (
        isinstance(data, GenericStatementSink)
        and stream.options.params.namespace_declarations
    ):
        namespace_declarations(data, stream)

    statements: Generator[Quad]
    if isinstance(data, GenericStatementSink):
        statements = cast(Generator[Quad], data.store)
    else:
        # Anything that is not a sink is consumed as an iterable of quads;
        # a non-iterable still fails with TypeError once iteration starts.
        statements = data

    for graph in split_to_graphs(statements):
        yield from stream.graph(graph_id=graph.identifier, graph=graph)

    if frame := stream.flow.frame_from_dataset():
        yield frame
    if stream.stream_types.flat and (frame := stream.flow.to_stream_frame()):
        yield frame
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def split_to_graphs(data: Generator[Quad]) -> Generator[GenericStatementSink]:
    """
    Split a generator of quads to graphs.

    Notes:
        A new graph is started whenever the g term of the current quad
        differs from the g term of the previous one, so each yielded
        GenericStatementSink holds one run of consecutive quads sharing
        the same g term. Nothing is yielded for empty input.

    Args:
        data (Generator[Quad]): generator of quads

    Yields:
        Generator[GenericStatementSink]: generator of GenericStatementSinks,
            each having triples in store and identifier set.

    """
    current_g: Node | None = None
    current_sink: GenericStatementSink | None = None
    for statement in data:
        # ``current_sink is None`` marks the very first quad explicitly, so
        # the first sink is always created without relying on the graph term
        # differing from the ``None`` placeholder (or on an ``assert``).
        if current_sink is None or current_g != statement.g:
            if current_sink is not None:
                yield current_sink

            current_g = statement.g
            current_sink = GenericStatementSink(identifier=current_g)

        current_sink.add(Triple(statement.s, statement.p, statement.o))

    # Flush the last run of quads, if there was any input at all.
    if current_sink is not None:
        yield current_sink
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def guess_options(sink: GenericStatementSink) -> SerializerOptions:
    """
    Derive serializer options from the kind of statements the sink holds.

    A triples sink selects the flat triples logical type; any other sink
    selects the flat quads logical type.
    """
    if sink.is_triples_sink:
        guessed_type = jelly.LOGICAL_STREAM_TYPE_FLAT_TRIPLES
    else:
        guessed_type = jelly.LOGICAL_STREAM_TYPE_FLAT_QUADS
    return SerializerOptions(logical_type=guessed_type)
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def guess_stream(options: SerializerOptions, sink: GenericStatementSink) -> Stream:
    """
    Return an appropriate stream implementation for the given options.

    Notes:
        A QuadStream is created only when the base logical type
        (``options.logical_type % 10``) is not GRAPHS and the sink is not a
        triples sink. In every other case a TripleStream is created; in
        particular, a GRAPHS base type with a non-triples sink still
        initializes a TripleStream.

    Args:
        options (SerializerOptions): stream options; also the source of the
            lookup preset handed to the term encoder.
        sink (GenericStatementSink): sink whose ``is_triples_sink`` flag is
            consulted.

    Returns:
        Stream: a QuadStream or TripleStream using a GenericSinkTermEncoder.

    """
    base_logical_type = options.logical_type % 10
    stream_cls: type[Stream]
    if (
        base_logical_type != jelly.LOGICAL_STREAM_TYPE_GRAPHS
        and not sink.is_triples_sink
    ):
        stream_cls = QuadStream
    else:
        stream_cls = TripleStream
    # ``options`` was already dereferenced above, so it cannot be None here
    # and the preset is read unconditionally.
    return stream_cls(
        encoder=GenericSinkTermEncoder(lookup_preset=options.lookup_preset),
        options=options,
    )
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def grouped_stream_to_frames(
    sink_generator: Generator[GenericStatementSink],
    options: SerializerOptions | None = None,
) -> Generator[jelly.RdfStreamFrame]:
    """
    Transform multiple GenericStatementSinks into Jelly frames.

    Notes:
        One frame per GenericStatementSink.

        The stream is created once, from the first sink, and reused for all
        following sinks. Options are guessed from that first sink if not
        provided.

    Args:
        sink_generator (Generator[GenericStatementSink]): Generator of
            GenericStatementSink to transform.
        options (SerializerOptions | None, optional): stream options to use.
            Options are guessed based on the sink store type. Defaults to None.

    Yields:
        Generator[jelly.RdfStreamFrame]: produced Jelly frames

    """
    stream: Stream | None = None
    for sink in sink_generator:
        # Identity check: the stream must be built exactly once, regardless
        # of how a stream object evaluates in a boolean context.
        if stream is None:
            if options is None:
                options = guess_options(sink)
            stream = guess_stream(options, sink)
        yield from stream_frames(stream, sink)
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def grouped_stream_to_file(
    stream: Generator[GenericStatementSink],
    output_file: IO[bytes],
    **kwargs: Any,
) -> None:
    """
    Serialize a stream of GenericStatementSink into a binary file.

    Frames are produced by ``grouped_stream_to_frames`` and written one by
    one with ``write_delimited``.

    Args:
        stream (Generator[GenericStatementSink]): Generator of
            GenericStatementSink to serialize.
        output_file (IO[bytes]): output buffered writer.
        **kwargs (Any): forwarded to ``grouped_stream_to_frames``.

    """
    frames = grouped_stream_to_frames(stream, **kwargs)
    for encoded_frame in frames:
        write_delimited(encoded_frame, output_file)
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def flat_stream_to_frames(
    statements: Generator[Triple | Quad],
    options: SerializerOptions | None = None,
) -> Generator[jelly.RdfStreamFrame]:
    """
    Serialize a stream of raw GenericStatementSink's triples or quads into Jelly frames.

    Notes:
        The first statement is consumed up front to pick the stream type
        and is then put back in front of the remaining statements. Nothing
        is yielded for empty input.

    Args:
        statements (Generator[Triple | Quad]):
            s/p/o triples or s/p/o/g quads to serialize. Any iterable is
            accepted, not only generators.
        options (SerializerOptions | None, optional):
            if omitted, guessed based on the first statement.

    Yields:
        Generator[jelly.RdfStreamFrame]: generated frames.

    """
    # ``iter`` returns a generator unchanged and lets lists, tuples and other
    # iterables be passed as well.
    remaining = iter(statements)
    first = next(remaining, None)
    if first is None:
        return

    # A throwaway sink holding only the first statement drives the guesses.
    sink = GenericStatementSink()
    sink.add(first)
    if options is None:
        options = guess_options(sink)
    stream = guess_stream(options, sink)

    combined: Generator[Triple | Quad] | GenericStatementSink = (
        item for item in chain([first], remaining)
    )

    yield from stream_frames(stream, combined)
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def flat_stream_to_file(
    statements: Generator[Triple | Quad],
    output_file: IO[bytes],
    options: SerializerOptions | None = None,
) -> None:
    """
    Serialize Triple or Quad events into a binary file.

    Frames are produced by ``flat_stream_to_frames`` and written one by one
    with ``write_delimited``.

    Args:
        statements (Generator[Triple | Quad]): statements to serialize.
        output_file (IO[bytes]): output buffered writer.
        options (SerializerOptions | None, optional): stream options.

    """
    frames = flat_stream_to_frames(statements, options)
    for encoded_frame in frames:
        write_delimited(encoded_frame, output_file)
|