pyjelly 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pyjelly might be problematic. Click here for more details.

File without changes
@@ -0,0 +1,163 @@
1
+ from __future__ import annotations
2
+
3
+ from collections import deque
4
+ from collections.abc import Generator
5
+ from typing import IO, NamedTuple, Union
6
+
7
+ DEFAULT_GRAPH_IDENTIFIER = ""
8
+
9
+
10
class BlankNode:
    """
    Class for blank nodes, storing BN's identifier as a string.

    Notes:
        Blank nodes compare and hash by identifier, so two instances built
        from the same identifier are equal and collapse to one entry in
        sets and dictionary keys.

    """

    def __init__(self, identifier: str) -> None:
        self._identifier: str = identifier

    def __str__(self) -> str:
        # Rendered as "_:" followed by the stored identifier.
        return f"_:{self._identifier}"

    def __repr__(self) -> str:
        return f"BlankNode(identifier={self._identifier})"

    def __eq__(self, other: object) -> bool:
        # Defer to the other operand for foreign types instead of guessing.
        if not isinstance(other, BlankNode):
            return NotImplemented
        return self._identifier == other._identifier

    def __hash__(self) -> int:
        # Defined together with __eq__ so equal nodes hash alike; the class
        # is mixed in to keep the hash distinct from the bare string's.
        return hash((BlankNode, self._identifier))
21
+
22
+
23
class IRI:
    """
    Class for IRIs, storing IRI as a string.

    Notes:
        IRIs compare and hash by their string value, so two instances
        wrapping the same IRI are equal and collapse to one entry in sets
        and dictionary keys.

    """

    def __init__(self, iri: str) -> None:
        self._iri: str = iri

    def __str__(self) -> str:
        # Rendered as the stored IRI wrapped in angle brackets.
        return f"<{self._iri}>"

    def __repr__(self) -> str:
        return f"IRI({self._iri})"

    def __eq__(self, other: object) -> bool:
        # Defer to the other operand for foreign types instead of guessing.
        if not isinstance(other, IRI):
            return NotImplemented
        return self._iri == other._iri

    def __hash__(self) -> int:
        # Defined together with __eq__ so equal IRIs hash alike; the class
        # is mixed in to keep the hash distinct from the bare string's.
        return hash((IRI, self._iri))
34
+
35
+
36
class Literal:
    """
    Class for literals.

    Notes:
        Consists of: lexical form, and optional language tag and datatype.
        All parts of literal are stored as strings.
        Literals compare and hash by the (lexical form, language tag,
        datatype) triple, so equal literals collapse to one entry in sets
        and dictionary keys.

    """

    def __init__(
        self, lex: str, langtag: str | None = None, datatype: str | None = None
    ) -> None:
        self._lex: str = lex
        self._langtag: str | None = langtag
        self._datatype: str | None = datatype

    def __str__(self) -> str:
        # A non-empty language tag takes precedence over the datatype;
        # the lexical form is emitted verbatim (no escaping is applied).
        suffix = ""
        if self._langtag:
            suffix = f"@{self._langtag}"
        elif self._datatype:
            suffix = f"^^<{self._datatype}>"
        return f'"{self._lex}"{suffix}'

    def __repr__(self) -> str:
        return (
            f"Literal({self._lex!r}, langtag={self._langtag!r}, "
            f"datatype={self._datatype!r})"
        )

    def __eq__(self, other: object) -> bool:
        # Defer to the other operand for foreign types instead of guessing.
        if not isinstance(other, Literal):
            return NotImplemented
        return (self._lex, self._langtag, self._datatype) == (
            other._lex,
            other._langtag,
            other._datatype,
        )

    def __hash__(self) -> int:
        # Defined together with __eq__ so equal literals hash alike.
        return hash((Literal, self._lex, self._langtag, self._datatype))
66
+
67
+
68
+ Node = Union[BlankNode, IRI, Literal, "Triple", str]
69
+
70
+
71
+ TRIPLE_ARITY = 3
72
+
73
+
74
class Triple(NamedTuple):
    """RDF triple: an immutable (s, p, o) tuple of nodes."""

    # Subject, predicate and object, in positional order.
    s: Node
    p: Node
    o: Node
80
+
81
+
82
class Quad(NamedTuple):
    """RDF quad: an immutable (s, p, o, g) tuple of nodes."""

    # Subject, predicate, object and graph, in positional order.
    s: Node
    p: Node
    o: Node
    g: Node
89
+
90
+
91
class Prefix(NamedTuple):
    """Generic namespace declaration: a prefix paired with its IRI."""

    # Prefix label first, then the namespace IRI it stands for.
    prefix: str
    iri: IRI
96
+
97
+
98
class GenericStatementSink:
    """
    In-memory container for RDF statements and namespace declarations.

    Notes:
        Statements are kept in insertion order in a deque; namespaces are
        kept in a prefix -> IRI dictionary.

    """

    _store: deque[Triple | Quad]

    def __init__(self, identifier: Node = DEFAULT_GRAPH_IDENTIFIER) -> None:
        """
        Initialize statements storage, namespaces dictionary, and identifier.

        Notes:
            _store preserves the order of statements.

        Args:
            identifier (Node, optional): Identifier for a sink.
                Defaults to DEFAULT_GRAPH_IDENTIFIER.

        """
        self._store: deque[Triple | Quad] = deque()
        self._namespaces: dict[str, IRI] = {}
        self._identifier = identifier

    def add(self, statement: Triple | Quad) -> None:
        """Append a statement to the end of the store."""
        self._store.append(statement)

    def bind(self, prefix: str, namespace: IRI) -> None:
        """Register a namespace, replacing any earlier binding of prefix."""
        self._namespaces.update({prefix: namespace})

    def __iter__(self) -> Generator[Triple | Quad]:
        yield from self._store

    def __len__(self) -> int:
        return len(self._store)

    @property
    def namespaces(self) -> Generator[tuple[str, IRI]]:
        """Yield (prefix, IRI) pairs of the bound namespaces."""
        yield from self._namespaces.items()

    @property
    def identifier(self) -> Node:
        """Return the identifier of this sink."""
        return self._identifier

    @property
    def store(self) -> Generator[Triple | Quad]:
        """Yield the stored statements in insertion order."""
        yield from self._store

    @property
    def is_triples_sink(self) -> bool:
        """
        Check if the sink contains triples or quads.

        Notes:
            Only the first stored statement is inspected. An empty sink
            holds no quads and is therefore reported as a triples sink
            instead of raising IndexError.

        Returns:
            bool: true, if the sink is empty or the length of its first
                statement is 3.

        """
        if not self._store:
            return True
        return len(self._store[0]) == TRIPLE_ARITY

    def parse(self, input_file: IO[bytes]) -> None:
        """
        Replace the content of this sink with data parsed from a Jelly stream.

        Notes:
            Statements, namespaces and the identifier are all taken over
            from the parsed sink; previous content is not merged.

        Args:
            input_file (IO[bytes]): binary stream passed to
                parse_jelly_to_graph.

        """
        # Imported here rather than at module level; the parse module
        # itself imports from this module.
        from pyjelly.integrations.generic.parse import parse_jelly_to_graph

        parsed_result = parse_jelly_to_graph(input_file)
        self._store = parsed_result._store
        self._namespaces = parsed_result._namespaces
        self._identifier = parsed_result._identifier

    def serialize(self, output_file: IO[bytes]) -> None:
        """
        Write this sink to output_file via grouped_stream_to_file.

        Args:
            output_file (IO[bytes]): binary stream handed to the serializer.

        """
        # Function-scope import, mirroring parse(); presumably for the
        # same import-cycle reason (serialize module is not visible here).
        from pyjelly.integrations.generic.serialize import grouped_stream_to_file

        # The serializer receives a generator of sinks; this one yields only self.
        grouped_stream_to_file((sink for sink in [self]), output_file)
@@ -0,0 +1,339 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Generator, Iterable
4
+ from itertools import chain
5
+ from typing import IO, Any, Callable, Union
6
+ from typing_extensions import override
7
+
8
+ from pyjelly import jelly
9
+ from pyjelly.errors import JellyConformanceError
10
+ from pyjelly.integrations.generic.generic_sink import (
11
+ IRI,
12
+ BlankNode,
13
+ GenericStatementSink,
14
+ Literal,
15
+ Prefix,
16
+ Quad,
17
+ Triple,
18
+ )
19
+ from pyjelly.parse.decode import Adapter, Decoder, ParserOptions
20
+ from pyjelly.parse.ioutils import get_options_and_frames
21
+
22
+ Statement = Union[Triple, Quad]
23
+
24
+
25
class GenericStatementSinkAdapter(Adapter):
    """
    Adapter producing the term objects used by GenericStatementSink.

    Notes:
        Builds IRI, BlankNode and Literal instances, Prefix records for
        namespace declarations, and Triple tuples for quoted triples.
        The default graph is represented by an empty string.

    Args:
        Adapter (_type_): base Adapter class

    """

    @override
    def iri(self, iri: str) -> IRI:
        return IRI(iri)

    @override
    def bnode(self, bnode: str) -> BlankNode:
        return BlankNode(bnode)

    @override
    def default_graph(self) -> str:
        return ""

    @override
    def literal(
        self,
        lex: str,
        language: str | None = None,
        datatype: str | None = None,
    ) -> Literal:
        return Literal(lex, langtag=language, datatype=datatype)

    @override
    def namespace_declaration(self, name: str, iri: str) -> Prefix:
        # The raw IRI string is wrapped through self.iri().
        namespace = self.iri(iri)
        return Prefix(prefix=name, iri=namespace)

    @override
    def quoted_triple(self, terms: Iterable[Any]) -> Triple:
        return Triple(*terms)
66
+
67
+
68
class GenericTriplesAdapter(GenericStatementSinkAdapter):
    """
    Adapter that turns decoded triple terms into Triple tuples.

    Args:
        GenericStatementSinkAdapter (_type_): base adapter that builds
            terms and namespace declarations.

    """

    def __init__(self, options: ParserOptions) -> None:
        super().__init__(options=options)

    @override
    def triple(self, terms: Iterable[Any]) -> Triple:
        # The terms are unpacked positionally into (s, p, o).
        return Triple(*terms)
87
+
88
+
89
class GenericQuadsBaseAdapter(GenericStatementSinkAdapter):
    """Common base of the quad-producing adapters (QUADS and GRAPHS)."""

    def __init__(self, options: ParserOptions) -> None:
        # Only forwards the parser options to the base adapter.
        super().__init__(options=options)
92
+
93
+
94
class GenericQuadsAdapter(GenericQuadsBaseAdapter):
    """
    Quads adapter for the QUADS physical type.

    Args:
        GenericQuadsBaseAdapter (_type_): base adapter shared by the
            quad-producing adapters.

    """

    @override
    def quad(self, terms: Iterable[Any]) -> Quad:
        # The terms are unpacked positionally into (s, p, o, g).
        return Quad(*terms)
107
+
108
+
109
class GenericGraphsAdapter(GenericQuadsBaseAdapter):
    """
    Extends GenericQuadsBaseAdapter for GRAPHS physical type.

    Notes:
        Tracks the graph opened by graph_start and appends it as the
        fourth term of every triple received until graph_end.

    Args:
        GenericQuadsBaseAdapter (_type_): quads adapter that handles
            base quads processing.

    Raises:
        JellyConformanceError: raised if a triple is received while no
            graph is open (graph start message was not received).

    """

    # Identifier of the currently open graph; None while no graph is open.
    _graph_id: str | None

    def __init__(
        self,
        options: ParserOptions,
    ) -> None:
        super().__init__(options=options)
        self._graph_id = None

    def _require_graph(self) -> str:
        # Return the open graph's identifier or fail if none was started.
        if self._graph_id is None:
            msg = "new graph was not started"
            raise JellyConformanceError(msg)
        return self._graph_id

    @property
    def graph(self) -> None:
        # Kept for callers that only want the check: raises when no graph
        # is open and otherwise evaluates to None.
        self._require_graph()

    @override
    def graph_start(self, graph_id: str) -> None:
        self._graph_id = graph_id

    @override
    def triple(self, terms: Iterable[Any]) -> Quad:
        # Validate first so a triple outside any graph is rejected instead
        # of silently becoming a quad whose graph term is None.
        graph_id = self._require_graph()
        return Quad(*chain(terms, [graph_id]))

    @override
    def graph_end(self) -> None:
        self._graph_id = None
151
+
152
+
153
def parse_triples_stream(
    frames: Iterable[jelly.RdfStreamFrame],
    options: ParserOptions,
) -> Generator[Iterable[Triple | Prefix]]:
    """
    Decode a triples stream frame by frame.

    Args:
        frames (Iterable[jelly.RdfStreamFrame]): stream frames to decode
        options (ParserOptions): stream options

    Yields:
        Generator[Iterable[Triple | Prefix]]:
            for every frame, the iterable of rows returned by the decoder
            (Triple or Prefix objects).

    """
    # A single decoder is shared by all frames of the stream.
    decoder = Decoder(adapter=GenericTriplesAdapter(options))
    yield from (decoder.iter_rows(frame) for frame in frames)
175
+
176
+
177
def parse_quads_stream(
    frames: Iterable[jelly.RdfStreamFrame],
    options: ParserOptions,
) -> Generator[Iterable[Quad | Prefix]]:
    """
    Decode a quads or graphs stream frame by frame.

    Args:
        frames (Iterable[jelly.RdfStreamFrame]): stream frames to decode
        options (ParserOptions): stream options

    Yields:
        Generator[Iterable[Quad | Prefix]]:
            for every frame, the iterable of rows returned by the decoder
            (Quad or Prefix objects).

    """
    # QUADS streams get the quads adapter; every other physical type
    # falls through to the graphs adapter.
    is_quads = options.stream_types.physical_type == jelly.PHYSICAL_STREAM_TYPE_QUADS
    adapter_class: type[GenericQuadsBaseAdapter] = (
        GenericQuadsAdapter if is_quads else GenericGraphsAdapter
    )
    decoder = Decoder(adapter=adapter_class(options=options))
    yield from (decoder.iter_rows(frame) for frame in frames)
204
+
205
+
206
def parse_jelly_grouped(
    inp: IO[bytes],
    sink_factory: Callable[[], GenericStatementSink] = lambda: GenericStatementSink(),
) -> Generator[GenericStatementSink]:
    """
    Read a jelly stream and yield one statement sink per frame.

    Each frame gets a fresh sink from sink_factory; namespace declarations
    are bound on the sink and all other rows are added as statements.

    Args:
        inp (IO[bytes]): input jelly buffered binary stream
        sink_factory (Callable): zero-argument callable building a sink.
            By default creates an empty in-memory GenericStatementSink.

    Raises:
        NotImplementedError: is raised if a physical type is not implemented

    Yields:
        Generator[GenericStatementSink]:
            a filled GenericStatementSink per frame, regardless of stream type.

    """
    options, frames = get_options_and_frames(inp)
    physical_type = options.stream_types.physical_type

    # Pick the per-frame row source first; the sink-filling loop below is
    # the same for triples and quads.
    if physical_type == jelly.PHYSICAL_STREAM_TYPE_TRIPLES:
        rows_per_frame = parse_triples_stream(frames=frames, options=options)
    elif physical_type in (
        jelly.PHYSICAL_STREAM_TYPE_QUADS,
        jelly.PHYSICAL_STREAM_TYPE_GRAPHS,
    ):
        rows_per_frame = parse_quads_stream(frames=frames, options=options)
    else:
        physical_type_name = jelly.PhysicalStreamType.Name(physical_type)
        msg = f"the stream type {physical_type_name} is not supported "
        raise NotImplementedError(msg)

    for rows in rows_per_frame:
        sink = sink_factory()
        for row in rows:
            if isinstance(row, Prefix):
                sink.bind(row.prefix, row.iri)
                continue
            sink.add(row)
        yield sink
264
+
265
+
266
def parse_jelly_to_graph(
    inp: IO[bytes],
    sink_factory: Callable[[], GenericStatementSink] = lambda: GenericStatementSink(),
) -> GenericStatementSink:
    """
    Collect every event of a jelly stream into a single statement sink.

    Args:
        inp (IO[bytes]): input jelly stream.
        sink_factory (Callable[[], GenericStatementSink]): zero-argument
            callable building the sink to fill.
            By default creates an empty in-memory GenericStatementSink.
            Triples and quads end up in the same sink; there is no
            separate structure for datasets/graphs.

    Returns:
        GenericStatementSink: the sink holding all statements and namespaces.

    """
    options, frames = get_options_and_frames(inp)
    sink = sink_factory()

    # Options and frames are handed on so the flat parser does not read
    # them from inp a second time.
    events = parse_jelly_flat(inp=inp, frames=frames, options=options)
    for event in events:
        if isinstance(event, Prefix):
            sink.bind(event.prefix, event.iri)
            continue
        sink.add(event)
    return sink
294
+
295
+
296
def parse_jelly_flat(
    inp: IO[bytes],
    frames: Iterable[jelly.RdfStreamFrame] | None = None,
    options: ParserOptions | None = None,
) -> Generator[Statement | Prefix]:
    """
    Parse jelly file with FLAT logical type into a Generator of stream events.

    Notes:
        inp is only read when frames or options is missing (None); an
        explicitly supplied but empty frames iterable is honoured and
        simply yields nothing.

    Args:
        inp (IO[bytes]): input jelly buffered binary stream.
        frames (Iterable[jelly.RdfStreamFrame] | None):
            jelly frames if read before.
        options (ParserOptions | None): stream options
            if read before.

    Raises:
        NotImplementedError: if physical type is not supported

    Yields:
        Generator[Statement | Prefix]: Generator of stream events

    """
    # Compare against None rather than relying on truthiness: an empty
    # container of frames is falsy but is still a caller-provided value.
    if frames is None or options is None:
        options, frames = get_options_and_frames(inp)

    physical_type = options.stream_types.physical_type

    if physical_type == jelly.PHYSICAL_STREAM_TYPE_TRIPLES:
        for triples in parse_triples_stream(frames=frames, options=options):
            yield from triples
        return
    if physical_type in (
        jelly.PHYSICAL_STREAM_TYPE_QUADS,
        jelly.PHYSICAL_STREAM_TYPE_GRAPHS,
    ):
        for quads in parse_quads_stream(frames=frames, options=options):
            yield from quads
        return

    physical_type_name = jelly.PhysicalStreamType.Name(physical_type)
    msg = f"the stream type {physical_type_name} is not supported "
    raise NotImplementedError(msg)