pyjelly 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pyjelly might be problematic. Click here for more details.

@@ -0,0 +1,361 @@
1
+ # ruff: noqa: I001
2
+ from __future__ import annotations
3
+ from typing import cast
4
+ from collections.abc import Generator
5
+ from functools import singledispatch
6
+ from typing import Any, IO
7
+ from itertools import chain
8
+ from pyjelly.integrations.generic.generic_sink import (
9
+ GenericStatementSink,
10
+ Quad,
11
+ Triple,
12
+ DEFAULT_GRAPH_IDENTIFIER,
13
+ IRI,
14
+ BlankNode,
15
+ Literal,
16
+ Node,
17
+ )
18
+
19
+ from pyjelly import jelly
20
+ from pyjelly.serialize.encode import RowsAndTerm, Slot, TermEncoder
21
+ from pyjelly.serialize.ioutils import write_delimited
22
+ from pyjelly.serialize.streams import (
23
+ GraphStream,
24
+ QuadStream,
25
+ SerializerOptions,
26
+ Stream,
27
+ TripleStream,
28
+ ) # ruff: enable
29
+
30
# Arity of a quad statement; presumably the four terms are s, p, o and g — TODO confirm.
# NOTE(review): not referenced anywhere in the visible part of this module.
QUAD_ARITY = 4
31
+
32
+
33
class GenericSinkTermEncoder(TermEncoder):
    """Term encoder that understands the node classes of the generic sink."""

    def encode_any(self, term: object, slot: Slot) -> RowsAndTerm:
        """
        Encode a generic-sink term by dispatching on its Python type.

        The default-graph check runs first; afterwards the term is matched
        against IRI, Literal, BlankNode and Triple, in that order.

        Args:
            term (object): term to encode
            slot (Slot): its place in statement.

        Returns:
            RowsAndTerm: encoded extra rows and a jelly term to encode

        """
        in_graph_slot = slot is Slot.graph
        if in_graph_slot and term == DEFAULT_GRAPH_IDENTIFIER:
            return self.encode_default_graph()

        if isinstance(term, IRI):
            iri = term._iri
            return self.encode_iri(iri)

        if isinstance(term, Literal):
            lex, language, datatype = term._lex, term._langtag, term._datatype
            return self.encode_literal(lex=lex, language=language, datatype=datatype)

        if isinstance(term, BlankNode):
            identifier = term._identifier
            return self.encode_bnode(identifier)

        if isinstance(term, Triple):
            return self.encode_quoted_triple(term)

        # Nothing matched: defer to the base encoder
        # (the original comment notes that it errors for unhandled terms).
        return super().encode_any(term, slot)
66
+
67
+
68
def namespace_declarations(store: GenericStatementSink, stream: Stream) -> None:
    """
    Declare every namespace of the sink on the stream.

    Args:
        store (GenericStatementSink): sink whose ``namespaces`` pairs
            (prefix, namespace) are read.
        stream (Stream): stream receiving one ``namespace_declaration``
            call per pair; the namespace is passed as a string.

    """
    for pair in store.namespaces:
        prefix, namespace_iri = pair
        stream.namespace_declaration(name=prefix, iri=str(namespace_iri))
71
+
72
+
73
@singledispatch
def stream_frames(
    stream: Stream,
    data: GenericStatementSink | Generator[Quad | Triple],  # noqa: ARG001
) -> Generator[jelly.RdfStreamFrame]:
    """
    Fallback of the single-dispatch frame serializer.

    Reached when no implementation is registered for the type of ``stream``.

    Args:
        stream (Stream): stream used to select the implementation.
        data (GenericStatementSink | Generator[Quad | Triple]): unused here.

    Raises:
        TypeError: always, naming the unsupported stream.

    """
    raise TypeError(f"invalid stream implementation {stream}")
80
+
81
+
82
@stream_frames.register(TripleStream)
def triples_stream_frames(
    stream: TripleStream,
    data: GenericStatementSink | Generator[Triple],
) -> Generator[jelly.RdfStreamFrame]:
    """
    Turn triples into jelly frames through a physical triples stream.

    Namespace declarations are emitted only when ``data`` is a
    GenericStatementSink and the stream parameters enable them.

    Args:
        stream (TripleStream): stream that specifies triples processing
        data (GenericStatementSink | Generator[Triple]):
            GenericStatementSink/Statements to serialize.

    Yields:
        Generator[jelly.RdfStreamFrame]: jelly frames.

    """
    stream.enroll()
    if isinstance(data, GenericStatementSink):
        if stream.options.params.namespace_declarations:
            namespace_declarations(data, stream)

    # The sink or the generator is iterated directly, one triple at a time.
    for triple in data:
        ready = stream.triple(triple)
        if ready:
            yield ready

    graph_frame = stream.flow.frame_from_graph()
    if graph_frame:
        yield graph_frame

    # The closing frame is only requested for flat stream types.
    if stream.stream_types.flat:
        closing_frame = stream.flow.to_stream_frame()
        if closing_frame:
            yield closing_frame
115
+
116
+
117
@stream_frames.register(QuadStream)
def quads_stream_frames(
    stream: QuadStream,
    data: GenericStatementSink | Generator[Quad],
) -> Generator[jelly.RdfStreamFrame]:
    """
    Serialize a GenericStatementSink into jelly frames using physical type quads stream.

    Notes:
        Namespace declarations are emitted only when ``data`` is a
        GenericStatementSink and the stream parameters enable them;
        a plain generator of quads has no ``namespaces`` to read.

    Args:
        stream (QuadStream): stream that specifies quads processing
        data (GenericStatementSink | Generator[Quad]): Dataset to serialize.

    Yields:
        Generator[jelly.RdfStreamFrame]: jelly frames

    """
    stream.enroll()

    iterator: Generator[Quad]
    if isinstance(data, GenericStatementSink):
        # Same guard as in the triples implementation: only a sink carries
        # namespaces, so a generator input must not reach this call.
        if stream.options.params.namespace_declarations:
            namespace_declarations(data, stream)
        iterator = cast(Generator[Quad], data.store)
    else:
        iterator = data

    for terms in iterator:
        if frame := stream.quad(terms):
            yield frame
    if frame := stream.flow.frame_from_dataset():
        yield frame
    # The closing frame is only requested for flat stream types.
    if stream.stream_types.flat and (frame := stream.flow.to_stream_frame()):
        yield frame
150
+
151
+
152
@stream_frames.register(GraphStream)
def graphs_stream_frames(
    stream: GraphStream,
    data: GenericStatementSink | Generator[Quad],
) -> Generator[jelly.RdfStreamFrame]:
    """
    Serialize a GenericStatementSink into jelly frames as a stream of graphs.

    Notes:
        If flow of DatasetsFrameFlow type, the whole dataset
        will be encoded into one frame.
        Graphs are generated from the GenericStatementSink by
        iterating over statements and yielding one new GenericStatementSink
        per a sequence of quads with the same g term.
        Namespace declarations are emitted only when ``data`` is a
        GenericStatementSink and the stream parameters enable them;
        a plain generator of quads has no ``namespaces`` to read.

    Args:
        stream (GraphStream): stream that specifies graphs processing
        data (GenericStatementSink | Generator[Quad]): Dataset to serialize.

    Yields:
        Generator[jelly.RdfStreamFrame]: jelly frames

    """
    stream.enroll()

    statements: Generator[Quad]
    if isinstance(data, GenericStatementSink):
        # Same guard as in the triples implementation: only a sink carries
        # namespaces, so a generator input must not reach this call.
        if stream.options.params.namespace_declarations:
            namespace_declarations(data, stream)
        statements = cast(Generator[Quad], data.store)
    else:
        # Any other input is treated as an iterable of quads; a truthiness
        # test on ``iter(data)`` could leave the graphs unbound.
        statements = data

    for graph in split_to_graphs(statements):
        yield from stream.graph(graph_id=graph.identifier, graph=graph)

    if frame := stream.flow.frame_from_dataset():
        yield frame
    # The closing frame is only requested for flat stream types.
    if stream.stream_types.flat and (frame := stream.flow.to_stream_frame()):
        yield frame
194
+
195
+
196
def split_to_graphs(data: Generator[Quad]) -> Generator[GenericStatementSink]:
    """
    Split a generator of quads to graphs.

    Notes:
        A new GenericStatementSink is started whenever the g term of a quad
        differs from the g term of the previous quad, so only consecutive
        quads with the same g term end up in the same sink.

    Args:
        data (Generator[Quad]): generator of quads

    Yields:
        Generator[GenericStatementSink]: generator of GenericStatementSinks,
            each having triples in store and identifier set.

    """
    current_g: Node | None = None
    current_sink: GenericStatementSink | None = None
    for statement in data:
        # Test the sink itself for the very first quad instead of relying on
        # ``None != statement.g``, which never opens a sink when the first
        # g term compares equal to None.
        if current_sink is None or current_g != statement.g:
            if current_sink is not None:
                yield current_sink

            current_g = statement.g
            current_sink = GenericStatementSink(identifier=current_g)

        current_sink.add(Triple(statement.s, statement.p, statement.o))

    # Flush the last open sink; nothing is yielded for empty input.
    if current_sink is not None:
        yield current_sink
228
+
229
+
230
def guess_options(sink: GenericStatementSink) -> SerializerOptions:
    """
    Pick serializer options from the kind of statements the sink holds.

    Args:
        sink (GenericStatementSink): sink whose ``is_triples_sink`` flag
            is inspected.

    Returns:
        SerializerOptions: options with the flat-triples logical type for
            a triples sink and the flat-quads logical type otherwise.

    """
    if sink.is_triples_sink:
        logical_type = jelly.LOGICAL_STREAM_TYPE_FLAT_TRIPLES
    else:
        logical_type = jelly.LOGICAL_STREAM_TYPE_FLAT_QUADS
    return SerializerOptions(logical_type=logical_type)
238
+
239
+
240
def guess_stream(options: SerializerOptions, sink: GenericStatementSink) -> Stream:
    """
    Build the stream implementation matching the options and the sink.

    Notes:
        QuadStream is chosen only when the base logical type
        (``options.logical_type % 10``) is not GRAPHS and the sink is not
        a triples sink; every other combination, including GRAPHS with a
        non-triples sink, gets a TripleStream.

    Args:
        options (SerializerOptions): options handed to the stream; their
            ``lookup_preset`` configures the term encoder.
        sink (GenericStatementSink): sink whose ``is_triples_sink`` flag
            is inspected.

    Returns:
        Stream: QuadStream or TripleStream with a GenericSinkTermEncoder.

    """
    base_logical_type = options.logical_type % 10
    wants_quads = (
        base_logical_type != jelly.LOGICAL_STREAM_TYPE_GRAPHS
        and not sink.is_triples_sink
    )
    stream_cls: type[Stream] = QuadStream if wants_quads else TripleStream
    encoder = GenericSinkTermEncoder(lookup_preset=options.lookup_preset)
    return stream_cls(encoder=encoder, options=options)
260
+
261
+
262
def grouped_stream_to_frames(
    sink_generator: Generator[GenericStatementSink],
    options: SerializerOptions | None = None,
) -> Generator[jelly.RdfStreamFrame]:
    """
    Transform multiple GenericStatementSinks into Jelly frames.

    Notes:
        One frame per GenericStatementSink.
        The stream is created once, from the first sink, and reused for
        every following sink. Nothing is yielded for an empty generator.

    Note: options are guessed if not provided.

    Args:
        sink_generator (Generator[GenericStatementSink]): Generator of
            GenericStatementSink to transform.
        options (SerializerOptions | None, optional): stream options to use.
            Options are guessed based on the sink store type. Defaults to None.

    Yields:
        Generator[jelly.RdfStreamFrame]: produced Jelly frames

    """
    stream: Stream | None = None
    for sink in sink_generator:
        # Compare against the None sentinel explicitly so that a stream
        # object which happens to be falsy is not rebuilt for every sink.
        if stream is None:
            if options is None:
                options = guess_options(sink)
            stream = guess_stream(options, sink)
        yield from stream_frames(stream, sink)
291
+
292
+
293
def grouped_stream_to_file(
    stream: Generator[GenericStatementSink],
    output_file: IO[bytes],
    **kwargs: Any,
) -> None:
    """
    Serialize a stream of GenericStatementSink into a binary file.

    Args:
        stream (Generator[GenericStatementSink]): Generator of
            GenericStatementSink to serialize.
        output_file (IO[bytes]): output buffered writer.
        **kwargs (Any): forwarded unchanged to grouped_stream_to_frames.

    """
    frames = grouped_stream_to_frames(stream, **kwargs)
    # Frames are written one by one as they are produced.
    for jelly_frame in frames:
        write_delimited(jelly_frame, output_file)
310
+
311
+
312
def flat_stream_to_frames(
    statements: Generator[Triple | Quad],
    options: SerializerOptions | None = None,
) -> Generator[jelly.RdfStreamFrame]:
    """
    Serialize a stream of raw GenericStatementSink's triples or quads into Jelly frames.

    Notes:
        Any iterable of statements is accepted (generator, iterator, list,
        tuple). Nothing is yielded when it is empty.

    Args:
        statements (Generator[Triple | Quad]):
            s/p/o triples or s/p/o/g quads to serialize.
        options (SerializerOptions | None, optional):
            if omitted, guessed based on the first tuple.

    Yields:
        Generator[jelly.RdfStreamFrame]: generated frames.

    """
    # next() needs an iterator; iter() is a no-op for generators and makes
    # plain sequences work as well.
    remaining = iter(statements)
    first = next(remaining, None)
    if first is None:
        return

    # A throwaway sink holding only the first statement drives the guessing
    # of options and stream class.
    sink = GenericStatementSink()
    sink.add(first)
    if options is None:
        options = guess_options(sink)
    stream = guess_stream(options, sink)

    # Put the consumed first statement back in front of the rest.
    combined: Generator[Triple | Quad] | GenericStatementSink = (
        item for item in chain([first], remaining)
    )

    yield from stream_frames(stream, combined)
344
+
345
+
346
def flat_stream_to_file(
    statements: Generator[Triple | Quad],
    output_file: IO[bytes],
    options: SerializerOptions | None = None,
) -> None:
    """
    Serialize Triple or Quad statements into a binary file.

    Args:
        statements (Generator[Triple | Quad]): statements to serialize.
        output_file (IO[bytes]): output buffered writer.
        options (SerializerOptions | None, optional): stream options,
            forwarded to flat_stream_to_frames.

    """
    frames = flat_stream_to_frames(statements, options)
    # Frames are written one by one as they are produced.
    for jelly_frame in frames:
        write_delimited(jelly_frame, output_file)