pyjelly 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pyjelly might be problematic. Click here for more details.
- pyjelly/integrations/generic/__init__.py +0 -0
- pyjelly/integrations/generic/generic_sink.py +163 -0
- pyjelly/integrations/generic/parse.py +339 -0
- pyjelly/integrations/generic/serialize.py +361 -0
- pyjelly/integrations/rdflib/parse.py +3 -3
- pyjelly/integrations/rdflib/serialize.py +91 -19
- pyjelly/options.py +9 -0
- pyjelly/parse/decode.py +22 -0
- pyjelly/serialize/encode.py +30 -3
- {pyjelly-0.4.0.dist-info → pyjelly-0.5.0.dist-info}/METADATA +3 -2
- {pyjelly-0.4.0.dist-info → pyjelly-0.5.0.dist-info}/RECORD +14 -10
- {pyjelly-0.4.0.dist-info → pyjelly-0.5.0.dist-info}/WHEEL +0 -0
- {pyjelly-0.4.0.dist-info → pyjelly-0.5.0.dist-info}/entry_points.txt +0 -0
- {pyjelly-0.4.0.dist-info → pyjelly-0.5.0.dist-info}/licenses/LICENSE +0 -0
|
File without changes
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections import deque
|
|
4
|
+
from collections.abc import Generator
|
|
5
|
+
from typing import IO, NamedTuple, Union
|
|
6
|
+
|
|
7
|
+
DEFAULT_GRAPH_IDENTIFIER = ""
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BlankNode:
    """
    Blank node term that stores its identifier as a string.

    Notes:
        Blank nodes compare equal (and hash alike) when their identifiers
        match. Without value equality, two nodes built separately from the
        same identifier would be treated as different terms, which breaks
        any grouping or deduplication of statements by term.

    """

    def __init__(self, identifier: str) -> None:
        self._identifier: str = identifier

    def __str__(self) -> str:
        # Rendered with the "_:" prefix in front of the identifier.
        return f"_:{self._identifier}"

    def __repr__(self) -> str:
        return f"BlankNode(identifier={self._identifier})"

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, BlankNode):
            # Let Python try the reflected comparison / fall back to "not equal".
            return NotImplemented
        return self._identifier == other._identifier

    def __hash__(self) -> int:
        # Defined together with __eq__ so instances stay usable in sets/dicts.
        return hash((BlankNode, self._identifier))
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class IRI:
    """
    IRI term that stores the IRI as a plain string.

    Notes:
        IRIs compare equal (and hash alike) when their strings match, so
        terms created independently for the same IRI are interchangeable
        when statements are compared, grouped or used as dictionary keys.

    """

    def __init__(self, iri: str) -> None:
        self._iri: str = iri

    def __str__(self) -> str:
        # Rendered wrapped in angle brackets; the stored string has none.
        return f"<{self._iri}>"

    def __repr__(self) -> str:
        return f"IRI({self._iri})"

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, IRI):
            # Let Python try the reflected comparison / fall back to "not equal".
            return NotImplemented
        return self._iri == other._iri

    def __hash__(self) -> int:
        # Defined together with __eq__ so instances stay usable in sets/dicts.
        return hash((IRI, self._iri))
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class Literal:
    """
    Class for literals.

    Notes:
        Consists of: lexical form, and optional language tag and datatype.
        All parts of literal are stored as strings.
        Literals compare equal (and hash alike) when lexical form, language
        tag and datatype all match.

    Args:
        lex (str): lexical form of the literal.
        langtag (str | None, optional): language tag. Defaults to None.
        datatype (str | None, optional): datatype IRI as a string.
            Defaults to None.

    """

    def __init__(
        self, lex: str, langtag: str | None = None, datatype: str | None = None
    ) -> None:
        self._lex: str = lex
        self._langtag: str | None = langtag
        self._datatype: str | None = datatype

    def __str__(self) -> str:
        # The language tag wins over the datatype when both are set;
        # a literal with neither is rendered as the quoted lexical form only.
        suffix = ""
        if self._langtag:
            suffix = f"@{self._langtag}"
        elif self._datatype:
            suffix = f"^^<{self._datatype}>"
        return f'"{self._lex}"{suffix}'

    def __repr__(self) -> str:
        return (
            f"Literal({self._lex!r}, langtag={self._langtag!r}, "
            f"datatype={self._datatype!r})"
        )

    def _key(self) -> tuple[str, str | None, str | None]:
        """Return the tuple of parts that defines this literal's identity."""
        return (self._lex, self._langtag, self._datatype)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Literal):
            # Let Python try the reflected comparison / fall back to "not equal".
            return NotImplemented
        return self._key() == other._key()

    def __hash__(self) -> int:
        # Defined together with __eq__ so instances stay usable in sets/dicts.
        return hash((Literal, *self._key()))
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
Node = Union[BlankNode, IRI, Literal, "Triple", str]
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
TRIPLE_ARITY = 3
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class Triple(NamedTuple):
    """
    Class for RDF triples.

    Notes:
        A three-field named tuple, so it unpacks positionally as (s, p, o)
        and has length 3.

    """

    # Field names follow the s/p/o convention used across this package;
    # presumably subject, predicate and object.
    s: Node
    p: Node
    o: Node
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class Quad(NamedTuple):
    """
    Class for RDF quads.

    Notes:
        A four-field named tuple, so it unpacks positionally as (s, p, o, g)
        and has length 4.

    """

    # Field names follow the s/p/o/g convention used across this package;
    # presumably subject, predicate, object and graph.
    s: Node
    p: Node
    o: Node
    g: Node
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class Prefix(NamedTuple):
    """
    Class for generic namespace declaration.

    Notes:
        Pairs a prefix label with the namespace IRI it stands for.

    """

    # Prefix label of the namespace.
    prefix: str
    # Namespace the prefix is bound to.
    iri: IRI
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class GenericStatementSink:
    """
    In-memory, order-preserving container of triples or quads.

    Notes:
        Holds statements in a deque, namespace bindings in a dictionary
        (prefix -> IRI) and an identifier for the sink itself.

    """

    _store: deque[Triple | Quad]

    def __init__(self, identifier: Node = DEFAULT_GRAPH_IDENTIFIER) -> None:
        """
        Initialize statements storage, namespaces dictionary, and identifier.

        Notes:
            _store preserves the order of statements.

        Args:
            identifier (Node, optional): Identifier for a sink.
                Defaults to DEFAULT_GRAPH_IDENTIFIER.

        """
        self._store: deque[Triple | Quad] = deque()
        self._namespaces: dict[str, IRI] = {}
        self._identifier = identifier

    def add(self, statement: Triple | Quad) -> None:
        """Append a statement to the end of the store."""
        self._store.append(statement)

    def bind(self, prefix: str, namespace: IRI) -> None:
        """Bind a prefix to a namespace, replacing any earlier binding."""
        self._namespaces[prefix] = namespace

    def __iter__(self) -> Generator[Triple | Quad]:
        yield from self._store

    def __len__(self) -> int:
        return len(self._store)

    @property
    def namespaces(self) -> Generator[tuple[str, IRI]]:
        """Yield (prefix, namespace) pairs of the bound namespaces."""
        yield from self._namespaces.items()

    @property
    def identifier(self) -> Node:
        """Identifier this sink was created with (or took over in parse)."""
        return self._identifier

    @property
    def store(self) -> Generator[Triple | Quad]:
        """Yield the stored statements in insertion order."""
        yield from self._store

    @property
    def is_triples_sink(self) -> bool:
        """
        Check if the sink contains triples or quads.

        Notes:
            Only the first statement is inspected. An empty sink has no
            statement to inspect and is reported as a triples sink instead
            of raising IndexError.

        Returns:
            bool: true, if length of statement is 3 (or the sink is empty).

        """
        if not self._store:
            return True
        return len(self._store[0]) == TRIPLE_ARITY

    def parse(self, input_file: IO[bytes]) -> None:
        """
        Replace the content of this sink with data parsed from a Jelly stream.

        Args:
            input_file (IO[bytes]): input jelly binary stream.

        """
        # Imported here rather than at module level; the parse module imports
        # this module, so a top-level import would be circular.
        from pyjelly.integrations.generic.parse import parse_jelly_to_graph

        parsed_result = parse_jelly_to_graph(input_file)
        # Take over the parsed sink's state wholesale.
        self._store = parsed_result._store
        self._namespaces = parsed_result._namespaces
        self._identifier = parsed_result._identifier

    def serialize(self, output_file: IO[bytes]) -> None:
        """
        Write this sink to a binary file as a Jelly stream.

        Args:
            output_file (IO[bytes]): output buffered writer.

        """
        # Imported here rather than at module level; the serialize module
        # imports this module, so a top-level import would be circular.
        from pyjelly.integrations.generic.serialize import grouped_stream_to_file

        # The serializer expects a generator of sinks; wrap this single sink.
        grouped_stream_to_file((sink for sink in [self]), output_file)
|
|
@@ -0,0 +1,339 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Generator, Iterable
|
|
4
|
+
from itertools import chain
|
|
5
|
+
from typing import IO, Any, Callable, Union
|
|
6
|
+
from typing_extensions import override
|
|
7
|
+
|
|
8
|
+
from pyjelly import jelly
|
|
9
|
+
from pyjelly.errors import JellyConformanceError
|
|
10
|
+
from pyjelly.integrations.generic.generic_sink import (
|
|
11
|
+
IRI,
|
|
12
|
+
BlankNode,
|
|
13
|
+
GenericStatementSink,
|
|
14
|
+
Literal,
|
|
15
|
+
Prefix,
|
|
16
|
+
Quad,
|
|
17
|
+
Triple,
|
|
18
|
+
)
|
|
19
|
+
from pyjelly.parse.decode import Adapter, Decoder, ParserOptions
|
|
20
|
+
from pyjelly.parse.ioutils import get_options_and_frames
|
|
21
|
+
|
|
22
|
+
Statement = Union[Triple, Quad]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class GenericStatementSinkAdapter(Adapter):
    """
    Adapter producing the term classes used by GenericStatementSink.

    Notes:
        Builds IRI, BlankNode and Literal terms, Prefix objects for namespace
        declarations, and Triple objects for quoted triples.
        The default graph is represented by an empty string.

    """

    @override
    def iri(self, iri: str) -> IRI:
        """Wrap an IRI string into an IRI term."""
        return IRI(iri)

    @override
    def bnode(self, bnode: str) -> BlankNode:
        """Wrap a blank node identifier into a BlankNode term."""
        return BlankNode(bnode)

    @override
    def default_graph(self) -> str:
        """Return the marker used for the default graph (empty string)."""
        return ""

    @override
    def literal(
        self,
        lex: str,
        language: str | None = None,
        datatype: str | None = None,
    ) -> Literal:
        """Build a Literal from lexical form, language tag and datatype."""
        return Literal(lex, langtag=language, datatype=datatype)

    @override
    def namespace_declaration(self, name: str, iri: str) -> Prefix:
        """Build a Prefix pairing the name with the namespace as an IRI term."""
        namespace = self.iri(iri)
        return Prefix(name, namespace)

    @override
    def quoted_triple(self, terms: Iterable[Any]) -> Triple:
        """Build a Triple from the terms of a quoted triple."""
        return Triple(*terms)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class GenericTriplesAdapter(GenericStatementSinkAdapter):
    """
    Adapter for triple streams targeting GenericStatementSink.

    Notes:
        Term and namespace handling comes from GenericStatementSinkAdapter;
        this class only adds construction of Triple statements.

    Args:
        options (ParserOptions): stream options passed to the base adapter.

    """

    def __init__(self, options: ParserOptions) -> None:
        super().__init__(options=options)

    @override
    def triple(self, terms: Iterable[Any]) -> Triple:
        """Build a Triple from the decoded terms."""
        return Triple(*terms)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class GenericQuadsBaseAdapter(GenericStatementSinkAdapter):
    """
    Common base of the quad-producing adapters.

    Args:
        options (ParserOptions): stream options passed to the base adapter.

    """

    def __init__(self, options: ParserOptions) -> None:
        super().__init__(options=options)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class GenericQuadsAdapter(GenericQuadsBaseAdapter):
    """
    Adapter for the QUADS physical type.

    Notes:
        Inherits term handling from GenericQuadsBaseAdapter and adds
        construction of Quad statements.

    """

    @override
    def quad(self, terms: Iterable[Any]) -> Quad:
        """Build a Quad from the decoded terms."""
        return Quad(*terms)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
class GenericGraphsAdapter(GenericQuadsBaseAdapter):
    """
    Extends GenericQuadsBaseAdapter for GRAPHS physical type.

    Notes:
        Tracks the graph opened by the latest graph start and attaches its
        identifier as the fourth term of every triple, producing quads.
        A triple received while no graph is open is rejected instead of
        silently producing a quad whose graph term is None.

    Args:
        options (ParserOptions): stream options passed to the base adapter.

    Raises:
        JellyConformanceError: raised if graph start message was not received.

    """

    _graph_id: str | None

    def __init__(
        self,
        options: ParserOptions,
    ) -> None:
        super().__init__(options=options)
        # None means "no graph is currently open".
        self._graph_id = None

    def _require_graph_started(self) -> None:
        """Raise JellyConformanceError unless a graph is currently open."""
        if self._graph_id is None:
            msg = "new graph was not started"
            raise JellyConformanceError(msg)

    @property
    def graph(self) -> None:
        """
        Check that a graph has been started.

        Raises:
            JellyConformanceError: if no graph is currently open.

        """
        self._require_graph_started()

    @override
    def graph_start(self, graph_id: str) -> None:
        """Remember the identifier of the graph that was just opened."""
        self._graph_id = graph_id

    @override
    def triple(self, terms: Iterable[Any]) -> Quad:
        """
        Build a Quad from the triple terms and the current graph identifier.

        Raises:
            JellyConformanceError: if no graph is currently open.

        """
        self._require_graph_started()
        return Quad(*chain(terms, [self._graph_id]))

    @override
    def graph_end(self) -> None:
        """Forget the current graph; later triples need a new graph start."""
        self._graph_id = None
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def parse_triples_stream(
    frames: Iterable[jelly.RdfStreamFrame],
    options: ParserOptions,
) -> Generator[Iterable[Triple | Prefix]]:
    """
    Decode a flat triple stream frame by frame.

    Notes:
        A single decoder (backed by GenericTriplesAdapter) is shared by
        all frames of the stream.

    Args:
        frames (Iterable[jelly.RdfStreamFrame]): iterator over stream frames
        options (ParserOptions): stream options

    Yields:
        Generator[Iterable[Triple | Prefix]]:
            for every frame, the iterable returned by the decoder's
            iter_rows for that frame.

    """
    decoder = Decoder(adapter=GenericTriplesAdapter(options))
    yield from map(decoder.iter_rows, frames)
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def parse_quads_stream(
    frames: Iterable[jelly.RdfStreamFrame],
    options: ParserOptions,
) -> Generator[Iterable[Quad | Prefix]]:
    """
    Decode a flat quads stream frame by frame.

    Notes:
        GenericQuadsAdapter is used when the physical type is QUADS;
        any other physical type is decoded with GenericGraphsAdapter.
        A single decoder is shared by all frames of the stream.

    Args:
        frames (Iterable[jelly.RdfStreamFrame]): iterator over stream frames
        options (ParserOptions): stream options

    Yields:
        Generator[Iterable[Quad | Prefix]]:
            for every frame, the iterable returned by the decoder's
            iter_rows for that frame.

    """
    is_quads = options.stream_types.physical_type == jelly.PHYSICAL_STREAM_TYPE_QUADS
    adapter_class: type[GenericQuadsBaseAdapter] = (
        GenericQuadsAdapter if is_quads else GenericGraphsAdapter
    )
    decoder = Decoder(adapter=adapter_class(options=options))
    yield from map(decoder.iter_rows, frames)
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def parse_jelly_grouped(
    inp: IO[bytes],
    sink_factory: Callable[[], GenericStatementSink] = lambda: GenericStatementSink(),
) -> Generator[GenericStatementSink]:
    """
    Parse a jelly file into a sequence of statement sinks, one per frame.

    Notes:
        A fresh sink is requested from sink_factory for every frame.
        Prefix rows are bound as namespaces on the sink, every other row
        is added as a statement.

    Args:
        inp (IO[bytes]): input jelly buffered binary stream
        sink_factory (Callable): lambda to construct a statement sink.
            By default creates an empty in-memory GenericStatementSink.

    Raises:
        NotImplementedError: is raised if a physical type is not implemented

    Yields:
        Generator[GenericStatementSink]:
            one filled GenericStatementSink per frame, regardless of stream type.

    """
    options, frames = get_options_and_frames(inp)
    physical_type = options.stream_types.physical_type

    # Pick the frame-level parser matching the physical type of the stream.
    if physical_type == jelly.PHYSICAL_STREAM_TYPE_TRIPLES:
        parse_frames: Callable[..., Generator[Iterable[Any]]] = parse_triples_stream
    elif physical_type in (
        jelly.PHYSICAL_STREAM_TYPE_QUADS,
        jelly.PHYSICAL_STREAM_TYPE_GRAPHS,
    ):
        parse_frames = parse_quads_stream
    else:
        physical_type_name = jelly.PhysicalStreamType.Name(physical_type)
        msg = f"the stream type {physical_type_name} is not supported "
        raise NotImplementedError(msg)

    for rows in parse_frames(frames=frames, options=options):
        sink = sink_factory()
        for row in rows:
            if isinstance(row, Prefix):
                sink.bind(row.prefix, row.iri)
            else:
                sink.add(row)
        yield sink
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def parse_jelly_to_graph(
    inp: IO[bytes],
    sink_factory: Callable[[], GenericStatementSink] = lambda: GenericStatementSink(),
) -> GenericStatementSink:
    """
    Parse a whole jelly stream into a single GenericStatementSink.

    Notes:
        All frames end up in the same sink: Prefix events are bound as
        namespaces, every other event is added as a statement.
        Has no division for datasets/graphs,
        utilizes the same underlying data structures.

    Args:
        inp (IO[bytes]): input jelly stream.
        sink_factory (Callable[[], GenericStatementSink]): factory to create
            statement sink.
            By default creates an empty in-memory GenericStatementSink.

    Returns:
        GenericStatementSink: GenericStatementSink with statements.

    """
    options, frames = get_options_and_frames(inp)
    result = sink_factory()

    events = parse_jelly_flat(inp=inp, frames=frames, options=options)
    for event in events:
        if not isinstance(event, Prefix):
            result.add(event)
            continue
        result.bind(event.prefix, event.iri)
    return result
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def parse_jelly_flat(
    inp: IO[bytes],
    frames: Iterable[jelly.RdfStreamFrame] | None = None,
    options: ParserOptions | None = None,
) -> Generator[Statement | Prefix]:
    """
    Parse jelly file with FLAT logical type into a Generator of stream events.

    Notes:
        Options and frames are read from inp only when at least one of them
        was not passed in. The check is an explicit comparison with None, so
        an already-read but empty frame sequence is not mistaken for a
        missing one (which would trigger a second read of inp).

    Args:
        inp (IO[bytes]): input jelly buffered binary stream.
        frames (Iterable[jelly.RdfStreamFrame] | None):
            jelly frames if read before.
        options (ParserOptions | None): stream options
            if read before.

    Raises:
        NotImplementedError: if physical type is not supported

    Yields:
        Generator[Statement | Prefix]: Generator of stream events

    """
    if frames is None or options is None:
        options, frames = get_options_and_frames(inp)

    physical_type = options.stream_types.physical_type
    if physical_type == jelly.PHYSICAL_STREAM_TYPE_TRIPLES:
        for triples in parse_triples_stream(frames=frames, options=options):
            yield from triples
        return
    if physical_type in (
        jelly.PHYSICAL_STREAM_TYPE_QUADS,
        jelly.PHYSICAL_STREAM_TYPE_GRAPHS,
    ):
        for quads in parse_quads_stream(
            frames=frames,
            options=options,
        ):
            yield from quads
        return
    physical_type_name = jelly.PhysicalStreamType.Name(physical_type)
    msg = f"the stream type {physical_type_name} is not supported "
    raise NotImplementedError(msg)
|
|
@@ -0,0 +1,361 @@
|
|
|
1
|
+
# ruff: noqa: I001
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
from typing import cast
|
|
4
|
+
from collections.abc import Generator
|
|
5
|
+
from functools import singledispatch
|
|
6
|
+
from typing import Any, IO
|
|
7
|
+
from itertools import chain
|
|
8
|
+
from pyjelly.integrations.generic.generic_sink import (
|
|
9
|
+
GenericStatementSink,
|
|
10
|
+
Quad,
|
|
11
|
+
Triple,
|
|
12
|
+
DEFAULT_GRAPH_IDENTIFIER,
|
|
13
|
+
IRI,
|
|
14
|
+
BlankNode,
|
|
15
|
+
Literal,
|
|
16
|
+
Node,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
from pyjelly import jelly
|
|
20
|
+
from pyjelly.serialize.encode import RowsAndTerm, Slot, TermEncoder
|
|
21
|
+
from pyjelly.serialize.ioutils import write_delimited
|
|
22
|
+
from pyjelly.serialize.streams import (
|
|
23
|
+
GraphStream,
|
|
24
|
+
QuadStream,
|
|
25
|
+
SerializerOptions,
|
|
26
|
+
Stream,
|
|
27
|
+
TripleStream,
|
|
28
|
+
) # ruff: enable
|
|
29
|
+
|
|
30
|
+
QUAD_ARITY = 4
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class GenericSinkTermEncoder(TermEncoder):
    """Term encoder for the term classes used by GenericStatementSink."""

    def encode_any(self, term: object, slot: Slot) -> RowsAndTerm:
        """
        Encode a term according to its GenericStatementSink term class.

        Notes:
            The default graph marker is only recognised in the graph slot.
            Terms of unknown type are passed to the base encoder.

        Args:
            term (object): term to encode
            slot (Slot): its place in statement.

        Returns:
            RowsAndTerm: encoded extra rows and a jelly term to encode

        """
        is_default_graph = slot is Slot.graph and term == DEFAULT_GRAPH_IDENTIFIER
        if is_default_graph:
            return self.encode_default_graph()

        if isinstance(term, IRI):
            return self.encode_iri(term._iri)

        if isinstance(term, Literal):
            lex, language, datatype = term._lex, term._langtag, term._datatype
            return self.encode_literal(lex=lex, language=language, datatype=datatype)

        if isinstance(term, BlankNode):
            return self.encode_bnode(term._identifier)

        if isinstance(term, Triple):
            # A Triple in term position is encoded as a quoted triple.
            return self.encode_quoted_triple(term)

        # Anything else is delegated to the base class (error if not handled).
        return super().encode_any(term, slot)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def namespace_declarations(store: GenericStatementSink, stream: Stream) -> None:
    """
    Declare every namespace bound in the store on the stream.

    Notes:
        The raw IRI string is passed to the stream. str() of an IRI term
        renders it wrapped in angle brackets, which must not leak into the
        declared namespace, so the stored string is used when available.

    Args:
        store (GenericStatementSink): sink whose namespaces are declared.
        stream (Stream): stream receiving the namespace declarations.

    """
    for prefix, namespace in store.namespaces:
        # IRI terms keep the bare string in `_iri` (also what the term
        # encoder reads); anything else falls back to its string form.
        raw_iri = getattr(namespace, "_iri", namespace)
        stream.namespace_declaration(name=prefix, iri=str(raw_iri))
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@singledispatch
def stream_frames(
    stream: Stream,
    data: GenericStatementSink | Generator[Quad | Triple],  # noqa: ARG001
) -> Generator[jelly.RdfStreamFrame]:
    """
    Serialize data into jelly frames, dispatching on the type of stream.

    Notes:
        This is the fallback used when no implementation is registered
        for the type of stream; it always fails.

    Args:
        stream (Stream): stream implementation to dispatch on.
        data (GenericStatementSink | Generator[Quad | Triple]): unused here.

    Raises:
        TypeError: always, naming the unsupported stream.

    """
    raise TypeError(f"invalid stream implementation {stream}")
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@stream_frames.register(TripleStream)
def triples_stream_frames(
    stream: TripleStream,
    data: GenericStatementSink | Generator[Triple],
) -> Generator[jelly.RdfStreamFrame]:
    """
    Serialize triples into jelly frames using the triples physical type.

    Notes:
        Namespaces are declared only when data is a GenericStatementSink
        and namespace declarations are enabled in the stream options.

    Args:
        stream (TripleStream): stream that specifies triples processing
        data (GenericStatementSink | Generator[Triple]):
            GenericStatementSink/Statements to serialize.

    Yields:
        Generator[jelly.RdfStreamFrame]: jelly frames.

    """
    stream.enroll()
    if (
        isinstance(data, GenericStatementSink)
        and stream.options.params.namespace_declarations
    ):
        namespace_declarations(data, stream)

    for statement in data:
        produced = stream.triple(statement)
        if produced:
            yield produced

    # All statements consumed: let the flow emit a frame for the graph, if any.
    graph_frame = stream.flow.frame_from_graph()
    if graph_frame:
        yield graph_frame

    if stream.stream_types.flat:
        remainder = stream.flow.to_stream_frame()
        if remainder:
            yield remainder
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
@stream_frames.register(QuadStream)
def quads_stream_frames(
    stream: QuadStream,
    data: GenericStatementSink | Generator[Quad],
) -> Generator[jelly.RdfStreamFrame]:
    """
    Serialize quads into jelly frames using physical type quads stream.

    Notes:
        Namespaces are declared only when data is a GenericStatementSink
        and namespace declarations are enabled in the stream options;
        a plain generator of quads carries no namespace bindings.

    Args:
        stream (QuadStream): stream that specifies quads processing
        data (GenericStatementSink | Generator[Quad]): Dataset to serialize.

    Yields:
        Generator[jelly.RdfStreamFrame]: jelly frames

    """
    stream.enroll()
    if (
        isinstance(data, GenericStatementSink)
        and stream.options.params.namespace_declarations
    ):
        namespace_declarations(data, stream)

    iterator: Generator[Quad]
    if isinstance(data, GenericStatementSink):
        iterator = cast(Generator[Quad], data.store)
    else:
        iterator = data

    for terms in iterator:
        if frame := stream.quad(terms):
            yield frame
    if frame := stream.flow.frame_from_dataset():
        yield frame
    if stream.stream_types.flat and (frame := stream.flow.to_stream_frame()):
        yield frame
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
@stream_frames.register(GraphStream)
def graphs_stream_frames(
    stream: GraphStream,
    data: GenericStatementSink | Generator[Quad],
) -> Generator[jelly.RdfStreamFrame]:
    """
    Serialize quads into jelly frames as a stream of graphs.

    Notes:
        Per the original author's note: if flow is of DatasetsFrameFlow
        type, the whole dataset will be encoded into one frame.
        Graphs are generated by iterating over the quads and building one
        new GenericStatementSink per a sequence of quads with the same
        g term (see split_to_graphs).
        Namespaces are declared only when data is a GenericStatementSink
        and namespace declarations are enabled in the stream options;
        a plain generator of quads carries no namespace bindings.

    Args:
        stream (GraphStream): stream that specifies graphs processing
        data (GenericStatementSink | Generator[Quad]): Dataset to serialize.

    Yields:
        Generator[jelly.RdfStreamFrame]: jelly frames

    """
    stream.enroll()
    if (
        isinstance(data, GenericStatementSink)
        and stream.options.params.namespace_declarations
    ):
        namespace_declarations(data, stream)

    statements: Generator[Quad]
    if isinstance(data, GenericStatementSink):
        statements = cast(Generator[Quad], data.store)
    else:
        statements = data

    for graph in split_to_graphs(statements):
        yield from stream.graph(graph_id=graph.identifier, graph=graph)

    if frame := stream.flow.frame_from_dataset():
        yield frame
    if stream.stream_types.flat and (frame := stream.flow.to_stream_frame()):
        yield frame
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def split_to_graphs(data: Generator[Quad]) -> Generator[GenericStatementSink]:
    """
    Split a generator of quads to graphs.

    Notes:
        New graph is generated by
        iterating over statements and yielding one new GenericStatementSink
        per a sequence of quads with the same g term.
        Only consecutive quads are grouped: if a g term shows up again
        after a different one, a separate sink is produced for it.
        The first quad always opens a sink, whatever its g term is.

    Args:
        data (Generator[Quad]): generator of quads

    Yields:
        Generator[GenericStatementSink]: generator of GenericStatementSinks,
            each having triples in store and identifier set.

    """
    current_sink: GenericStatementSink | None = None
    for statement in data:
        # Open a new sink for the first quad and whenever the g term changes.
        if current_sink is None or current_sink.identifier != statement.g:
            if current_sink is not None:
                yield current_sink
            current_sink = GenericStatementSink(identifier=statement.g)

        # The g term lives on the sink; the statement is stored as a triple.
        current_sink.add(Triple(statement.s, statement.p, statement.o))

    if current_sink is not None:
        yield current_sink
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def guess_options(sink: GenericStatementSink) -> SerializerOptions:
    """
    Guess the serializer options based on the store type.

    Args:
        sink (GenericStatementSink): sink to be serialized.

    Returns:
        SerializerOptions: options with the FLAT_TRIPLES logical type for
            a triples sink and FLAT_QUADS otherwise.

    """
    if sink.is_triples_sink:
        guessed_type = jelly.LOGICAL_STREAM_TYPE_FLAT_TRIPLES
    else:
        guessed_type = jelly.LOGICAL_STREAM_TYPE_FLAT_QUADS
    return SerializerOptions(logical_type=guessed_type)
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def guess_stream(options: SerializerOptions, sink: GenericStatementSink) -> Stream:
    """
    Return an appropriate stream implementation for the given options.

    Notes:
        QuadStream is chosen when the base logical type is not GRAPHS and
        the sink holds quads; in every other case TripleStream is used
        (so if base(!) logical type is GRAPHS and sink.is_triples_sink is
        false, a TripleStream is initialized).
        The base logical type is taken as logical_type modulo 10.

    Args:
        options (SerializerOptions): stream options; must not be None.
        sink (GenericStatementSink): sink to be serialized.

    Returns:
        Stream: stream using GenericSinkTermEncoder with the lookup preset
            from options.

    """
    base_logical_type = options.logical_type % 10
    stream_cls: type[Stream]
    if base_logical_type != jelly.LOGICAL_STREAM_TYPE_GRAPHS and not sink.is_triples_sink:
        stream_cls = QuadStream
    else:
        stream_cls = TripleStream
    # options was already dereferenced above, so it cannot be None here;
    # read the preset unconditionally instead of leaving it possibly unbound.
    return stream_cls(
        encoder=GenericSinkTermEncoder(lookup_preset=options.lookup_preset),
        options=options,
    )
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def grouped_stream_to_frames(
    sink_generator: Generator[GenericStatementSink],
    options: SerializerOptions | None = None,
) -> Generator[jelly.RdfStreamFrame]:
    """
    Transform multiple GenericStatementSinks into Jelly frames.

    Notes:
        One frame per GenericStatementSink.
        The stream is created once, from the first sink, and reused for
        all following sinks.

    Note: options are guessed if not provided.

    Args:
        sink_generator (Generator[GenericStatementSink]): Generator of
            GenericStatementSink to transform.
        options (SerializerOptions | None, optional): stream options to use.
            Options are guessed based on the sink store type. Defaults to None.

    Yields:
        Generator[jelly.RdfStreamFrame]: produced Jelly frames

    """
    stream: Stream | None = None
    for sink in sink_generator:
        # Compare with None explicitly: "not created yet" must not depend on
        # whatever truth value a stream object happens to have.
        if stream is None:
            if options is None:
                options = guess_options(sink)
            stream = guess_stream(options, sink)
        yield from stream_frames(stream, sink)
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def grouped_stream_to_file(
    stream: Generator[GenericStatementSink],
    output_file: IO[bytes],
    **kwargs: Any,
) -> None:
    """
    Serialize a stream of GenericStatementSink into a binary file.

    Notes:
        Frames come from grouped_stream_to_frames and are written one by
        one in delimited form.

    Args:
        stream (Generator[GenericStatementSink]): Generator of
            GenericStatementSink to serialize.
        output_file (IO[bytes]): output buffered writer.
        **kwargs (Any): options to pass to stream.

    """
    frames = grouped_stream_to_frames(stream, **kwargs)
    for jelly_frame in frames:
        write_delimited(jelly_frame, output_file)
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def flat_stream_to_frames(
    statements: Generator[Triple | Quad],
    options: SerializerOptions | None = None,
) -> Generator[jelly.RdfStreamFrame]:
    """
    Serialize a stream of raw GenericStatementSink's triples or quads into Jelly frames.

    Notes:
        The first statement is peeked to guess options and the stream
        implementation, then serialized together with the rest.
        Any iterable of statements is accepted, not only generators.
        Nothing is yielded for an empty input.

    Args:
        statements (Generator[Triple | Quad]):
            s/p/o triples or s/p/o/g quads to serialize.
        options (SerializerOptions | None, optional):
            if omitted, guessed based on the first tuple.

    Yields:
        Generator[jelly.RdfStreamFrame]: generated frames.

    """
    # iter() returns iterators unchanged and makes lists/tuples peekable too.
    remaining = iter(statements)
    first = next(remaining, None)
    if first is None:
        return

    # A throwaway sink holding only the first statement drives the guessing.
    probe = GenericStatementSink()
    probe.add(first)
    if options is None:
        options = guess_options(probe)
    stream = guess_stream(options, probe)

    # Put the peeked statement back in front of the remaining ones.
    yield from stream_frames(stream, chain([first], remaining))
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def flat_stream_to_file(
    statements: Generator[Triple | Quad],
    output_file: IO[bytes],
    options: SerializerOptions | None = None,
) -> None:
    """
    Serialize Triple or Quad events into a binary file.

    Notes:
        Frames come from flat_stream_to_frames and are written one by one
        in delimited form.

    Args:
        statements (Generator[Triple | Quad]): statements to serialize.
        output_file (IO[bytes]): output buffered writer.
        options (SerializerOptions | None, optional): stream options.

    """
    frames = flat_stream_to_frames(statements, options)
    for jelly_frame in frames:
        write_delimited(jelly_frame, output_file)
|
|
@@ -315,7 +315,7 @@ def parse_jelly_grouped(
|
|
|
315
315
|
dataset_factory: Callable[[], Dataset] = lambda: Dataset(),
|
|
316
316
|
) -> Generator[Graph] | Generator[Dataset]:
|
|
317
317
|
"""
|
|
318
|
-
Take jelly file and return generators based on the detected
|
|
318
|
+
Take jelly file and return generators based on the detected physical type.
|
|
319
319
|
|
|
320
320
|
Yields one graph/dataset per frame.
|
|
321
321
|
|
|
@@ -329,7 +329,7 @@ def parse_jelly_grouped(
|
|
|
329
329
|
but you can pass something else here.
|
|
330
330
|
|
|
331
331
|
Raises:
|
|
332
|
-
NotImplementedError: is raised if a
|
|
332
|
+
NotImplementedError: is raised if a physical type is not implemented
|
|
333
333
|
|
|
334
334
|
Yields:
|
|
335
335
|
Generator[Graph] | Generator[Dataset]:
|
|
@@ -426,7 +426,7 @@ def parse_jelly_flat(
|
|
|
426
426
|
options: ParserOptions | None = None,
|
|
427
427
|
) -> Generator[Statement | Prefix]:
|
|
428
428
|
"""
|
|
429
|
-
Parse jelly file with FLAT
|
|
429
|
+
Parse jelly file with FLAT logical type into a Generator of stream events.
|
|
430
430
|
|
|
431
431
|
Args:
|
|
432
432
|
inp (IO[bytes]): input jelly buffered binary stream.
|
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
# ruff: noqa: I001
|
|
2
2
|
from __future__ import annotations
|
|
3
|
-
|
|
3
|
+
from typing import cast
|
|
4
4
|
from collections.abc import Generator
|
|
5
5
|
from functools import singledispatch
|
|
6
6
|
from typing import Any, IO
|
|
7
7
|
from typing_extensions import override
|
|
8
|
+
from itertools import chain
|
|
9
|
+
from pyjelly.integrations.rdflib.parse import Quad, Triple
|
|
8
10
|
|
|
9
11
|
import rdflib
|
|
10
12
|
from rdflib import Graph
|
|
@@ -22,6 +24,8 @@ from pyjelly.serialize.streams import (
|
|
|
22
24
|
TripleStream,
|
|
23
25
|
) # ruff: enable
|
|
24
26
|
|
|
27
|
+
QUAD_ARITY = 4
|
|
28
|
+
|
|
25
29
|
|
|
26
30
|
class RDFLibTermEncoder(TermEncoder):
|
|
27
31
|
def encode_any(self, term: object, slot: Slot) -> RowsAndTerm:
|
|
@@ -65,7 +69,7 @@ def namespace_declarations(store: Graph, stream: Stream) -> None:
|
|
|
65
69
|
@singledispatch
|
|
66
70
|
def stream_frames(
|
|
67
71
|
stream: Stream,
|
|
68
|
-
data: Graph, # noqa: ARG001
|
|
72
|
+
data: Graph | Generator[Quad | Triple], # noqa: ARG001
|
|
69
73
|
) -> Generator[jelly.RdfStreamFrame]:
|
|
70
74
|
msg = f"invalid stream implementation {stream}"
|
|
71
75
|
raise TypeError(msg)
|
|
@@ -74,14 +78,15 @@ def stream_frames(
|
|
|
74
78
|
@stream_frames.register(TripleStream)
|
|
75
79
|
def triples_stream_frames(
|
|
76
80
|
stream: TripleStream,
|
|
77
|
-
data: Graph | Dataset,
|
|
81
|
+
data: Graph | Dataset | Generator[Triple],
|
|
78
82
|
) -> Generator[jelly.RdfStreamFrame]:
|
|
79
83
|
"""
|
|
80
84
|
Serialize a Graph/Dataset into jelly frames.
|
|
81
85
|
|
|
82
86
|
Args:
|
|
83
87
|
stream (TripleStream): stream that specifies triples processing
|
|
84
|
-
data (Graph | Dataset):
|
|
88
|
+
data (Graph | Dataset | Generator[Triple]):
|
|
89
|
+
Graph/Dataset/Statements to serialize.
|
|
85
90
|
|
|
86
91
|
Notes:
|
|
87
92
|
if Dataset is given, its graphs are unpacked and iterated over
|
|
@@ -92,24 +97,24 @@ def triples_stream_frames(
|
|
|
92
97
|
|
|
93
98
|
"""
|
|
94
99
|
stream.enroll()
|
|
95
|
-
if stream.options.params.namespace_declarations:
|
|
100
|
+
if isinstance(data, Graph) and stream.options.params.namespace_declarations:
|
|
96
101
|
namespace_declarations(data, stream)
|
|
102
|
+
|
|
97
103
|
graphs = (data,) if not isinstance(data, Dataset) else data.graphs()
|
|
98
104
|
for graph in graphs:
|
|
99
105
|
for terms in graph:
|
|
100
106
|
if frame := stream.triple(terms):
|
|
101
107
|
yield frame
|
|
102
|
-
# this part turns each graph to a frame for graphs logical type
|
|
103
108
|
if frame := stream.flow.frame_from_graph():
|
|
104
109
|
yield frame
|
|
105
110
|
if stream.stream_types.flat and (frame := stream.flow.to_stream_frame()):
|
|
106
111
|
yield frame
|
|
107
112
|
|
|
108
113
|
|
|
109
|
-
@stream_frames.register
|
|
114
|
+
@stream_frames.register(QuadStream)
|
|
110
115
|
def quads_stream_frames(
|
|
111
116
|
stream: QuadStream,
|
|
112
|
-
data: Dataset,
|
|
117
|
+
data: Dataset | Generator[Quad],
|
|
113
118
|
) -> Generator[jelly.RdfStreamFrame]:
|
|
114
119
|
"""
|
|
115
120
|
Serialize a Dataset into jelly frames.
|
|
@@ -119,17 +124,23 @@ def quads_stream_frames(
|
|
|
119
124
|
|
|
120
125
|
Args:
|
|
121
126
|
stream (QuadStream): stream that specifies quads processing
|
|
122
|
-
data (Dataset): Dataset to serialize.
|
|
127
|
+
data (Dataset | Generator[Quad]): Dataset to serialize.
|
|
123
128
|
|
|
124
129
|
Yields:
|
|
125
130
|
Generator[jelly.RdfStreamFrame]: jelly frames
|
|
126
131
|
|
|
127
132
|
"""
|
|
128
|
-
assert isinstance(data, Dataset)
|
|
129
133
|
stream.enroll()
|
|
130
134
|
if stream.options.params.namespace_declarations:
|
|
131
|
-
namespace_declarations(data, stream)
|
|
132
|
-
|
|
135
|
+
namespace_declarations(data, stream) # type: ignore[arg-type]
|
|
136
|
+
|
|
137
|
+
iterator: Generator[Quad, None, None]
|
|
138
|
+
if isinstance(data, Dataset):
|
|
139
|
+
iterator = cast(Generator[Quad, None, None], data.quads())
|
|
140
|
+
else:
|
|
141
|
+
iterator = data
|
|
142
|
+
|
|
143
|
+
for terms in iterator:
|
|
133
144
|
if frame := stream.quad(terms):
|
|
134
145
|
yield frame
|
|
135
146
|
if frame := stream.flow.frame_from_dataset():
|
|
@@ -138,10 +149,10 @@ def quads_stream_frames(
|
|
|
138
149
|
yield frame
|
|
139
150
|
|
|
140
151
|
|
|
141
|
-
@stream_frames.register
|
|
152
|
+
@stream_frames.register(GraphStream)
|
|
142
153
|
def graphs_stream_frames(
|
|
143
154
|
stream: GraphStream,
|
|
144
|
-
data: Dataset,
|
|
155
|
+
data: Dataset | Generator[Quad],
|
|
145
156
|
) -> Generator[jelly.RdfStreamFrame]:
|
|
146
157
|
"""
|
|
147
158
|
Serialize a Dataset into jelly frames as a stream of graphs.
|
|
@@ -152,18 +163,28 @@ def graphs_stream_frames(
|
|
|
152
163
|
|
|
153
164
|
Args:
|
|
154
165
|
stream (GraphStream): stream that specifies graphs processing
|
|
155
|
-
data (Dataset): Dataset to serialize.
|
|
166
|
+
data (Dataset | Generator[Quad]): Dataset to serialize.
|
|
156
167
|
|
|
157
168
|
Yields:
|
|
158
169
|
Generator[jelly.RdfStreamFrame]: jelly frames
|
|
159
170
|
|
|
160
171
|
"""
|
|
161
|
-
assert isinstance(data, Dataset)
|
|
162
172
|
stream.enroll()
|
|
163
173
|
if stream.options.params.namespace_declarations:
|
|
164
|
-
namespace_declarations(data, stream)
|
|
165
|
-
|
|
174
|
+
namespace_declarations(data, stream) # type: ignore[arg-type]
|
|
175
|
+
|
|
176
|
+
if isinstance(data, Dataset):
|
|
177
|
+
graphs = data.graphs()
|
|
178
|
+
else:
|
|
179
|
+
ds = Dataset()
|
|
180
|
+
for quad in data:
|
|
181
|
+
ctx = ds.get_context(quad.g)
|
|
182
|
+
ctx.add((quad.s, quad.p, quad.o))
|
|
183
|
+
graphs = ds.graphs()
|
|
184
|
+
|
|
185
|
+
for graph in graphs:
|
|
166
186
|
yield from stream.graph(graph_id=graph.identifier, graph=graph)
|
|
187
|
+
|
|
167
188
|
if frame := stream.flow.frame_from_dataset():
|
|
168
189
|
yield frame
|
|
169
190
|
if stream.stream_types.flat and (frame := stream.flow.to_stream_frame()):
|
|
@@ -297,9 +318,60 @@ def grouped_stream_to_file(
|
|
|
297
318
|
Args:
|
|
298
319
|
stream (Generator[Graph] | Generator[Dataset]): Generator of
|
|
299
320
|
Graphs/Dataset to transform.
|
|
300
|
-
output_file (IO[bytes]):
|
|
321
|
+
output_file (IO[bytes]): output buffered writer.
|
|
301
322
|
**kwargs (Any): options to pass to stream.
|
|
302
323
|
|
|
303
324
|
"""
|
|
304
325
|
for frame in grouped_stream_to_frames(stream, **kwargs):
|
|
305
326
|
write_delimited(frame, output_file)
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def flat_stream_to_frames(
    statements: Generator[Triple | Quad],
    options: SerializerOptions | None = None,
) -> Generator[jelly.RdfStreamFrame]:
    """
    Serialize a stream of raw triples or quads into Jelly frames.

    The first statement is peeked at to decide between a quad stream
    (tuple of QUAD_ARITY elements) and a triple stream. Nothing is yielded
    for an empty input.

    Args:
        statements (Generator[Triple | Quad]):
            s/p/o triples or s/p/o/g quads to serialize. Any iterable
            (list, tuple, generator, ...) is accepted.
        options (SerializerOptions | None, optional):
            if omitted, guessed based on the first tuple.

    Yields:
        Generator[jelly.RdfStreamFrame]: generated frames.

    """
    # next() needs an iterator; iter() returns a generator unchanged but
    # also lets plain lists/tuples be passed without raising TypeError.
    remaining = iter(statements)
    first = next(remaining, None)
    if first is None:
        return

    # An empty Dataset/Graph is used only to guess options and stream type.
    sink = Dataset() if len(first) == QUAD_ARITY else Graph()
    if options is None:
        options = guess_options(sink)
    stream = guess_stream(options, sink)

    # Put the peeked statement back in front of the rest. The generator
    # expression keeps the value a generator, as stream_frames is annotated.
    combined: Generator[Triple | Quad] | Graph = (
        item for item in chain([first], remaining)
    )

    yield from stream_frames(stream, combined)
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
def flat_stream_to_file(
    statements: Generator[Triple | Quad],
    output_file: IO[bytes],
    options: SerializerOptions | None = None,
) -> None:
    """
    Serialize Triple or Quad events into a binary file in Jelly flat format.

    Every frame produced by ``flat_stream_to_frames`` is passed, in order,
    to ``write_delimited`` together with ``output_file``.

    Args:
        statements (Generator[Triple | Quad]): statements to serialize.
        output_file (IO[bytes]): output buffered writer.
        options (SerializerOptions | None, optional): stream options,
            forwarded unchanged to ``flat_stream_to_frames``.

    """
    frames = flat_stream_to_frames(statements, options)
    for frame in frames:
        write_delimited(frame, output_file)
|
pyjelly/options.py
CHANGED
|
@@ -15,6 +15,7 @@ from pyjelly.errors import (
|
|
|
15
15
|
MIN_NAME_LOOKUP_SIZE: Final[int] = 8
|
|
16
16
|
|
|
17
17
|
MAX_LOOKUP_SIZE: Final[int] = 4096
|
|
18
|
+
MIN_VERSION: Final[int] = 1
|
|
18
19
|
MAX_VERSION: Final[int] = 2
|
|
19
20
|
|
|
20
21
|
DEFAULT_NAME_LOOKUP_SIZE: Final[int] = 4000
|
|
@@ -97,6 +98,14 @@ class StreamParameters:
|
|
|
97
98
|
namespace_declarations: bool = False
|
|
98
99
|
stream_name: str = ""
|
|
99
100
|
|
|
101
|
+
def __post_init__(self) -> None:
    """
    Derive the stream version from the namespace declarations flag.

    The version becomes MAX_VERSION when ``namespace_declarations`` is
    enabled and MIN_VERSION otherwise.

    Raises:
        JellyConformanceError: if the derived version falls outside the
            MIN_VERSION..MAX_VERSION range.

    """
    selected = MAX_VERSION if self.namespace_declarations else MIN_VERSION
    # `selected` is always one of the two bounds, so this guard can only
    # fire if the constants are inconsistent (MIN_VERSION > MAX_VERSION).
    if not (MIN_VERSION <= selected <= MAX_VERSION):
        # Single-line message: no embedded newline or source indentation.
        msg = (
            "Error occurred while setting up the Stream options. "
            f"Version must be between {MIN_VERSION} and {MAX_VERSION}."
        )
        raise JellyConformanceError(msg)
    # NOTE(review): this replaces whatever `version` was passed to the
    # constructor -- confirm that is intended.
    # object.__setattr__ bypasses the instance's own __setattr__
    # (presumably because the dataclass is frozen -- TODO confirm).
    object.__setattr__(self, "version", selected)
|
|
108
|
+
|
|
100
109
|
|
|
101
110
|
TRIPLES_ONLY_LOGICAL_TYPES = {
|
|
102
111
|
jelly.LOGICAL_STREAM_TYPE_GRAPHS,
|
pyjelly/parse/decode.py
CHANGED
|
@@ -66,6 +66,8 @@ def options_from_frame(
|
|
|
66
66
|
),
|
|
67
67
|
params=StreamParameters(
|
|
68
68
|
stream_name=options.stream_name,
|
|
69
|
+
generalized_statements=options.generalized_statements,
|
|
70
|
+
rdf_star=options.rdf_star,
|
|
69
71
|
version=options.version,
|
|
70
72
|
delimited=delimited,
|
|
71
73
|
),
|
|
@@ -134,6 +136,11 @@ class Adapter(metaclass=ABCMeta):
|
|
|
134
136
|
stream_types=self.options.stream_types,
|
|
135
137
|
)
|
|
136
138
|
|
|
139
|
+
def quoted_triple(self, terms: Iterable[Any]) -> Any:  # noqa: ARG002
    """
    Base hook for turning decoded terms into a quoted triple.

    This implementation ignores ``terms`` and only delegates to
    ``_adapter_missing`` with the adapter's stream types.
    """
    # presumably _adapter_missing raises for unsupported features -- verify
    stream_types = self.options.stream_types
    _adapter_missing("decoding quoted triple", stream_types=stream_types)
|
|
143
|
+
|
|
137
144
|
def frame(self) -> Any:
|
|
138
145
|
return None
|
|
139
146
|
|
|
@@ -383,6 +390,20 @@ class Decoder:
|
|
|
383
390
|
terms = self.decode_statement(triple, ("subject", "predicate", "object"))
|
|
384
391
|
return self.adapter.triple(terms)
|
|
385
392
|
|
|
393
|
+
def decode_quoted_triple(self, triple: jelly.RdfTriple) -> Any:
    """
    Decode a quoted triple and hand its terms to the adapter.

    Args:
        triple (jelly.RdfTriple): triple whose subject, predicate and
            object oneofs must all be set.

    Raises:
        ValueError: if any of the three oneofs is unset.

    Returns:
        Any: whatever ``self.adapter.quoted_triple`` returns for the
            decoded terms.

    """
    decoded_terms = []
    for slot_name in ("subject", "predicate", "object"):
        set_field = triple.WhichOneof(slot_name)
        # An unset oneof would mean "repeat the previous term", which is
        # rejected here for quoted triples.
        if not set_field:
            msg = "repeated terms are not allowed in quoted triples"
            raise ValueError(msg)
        decoded_terms.append(self.decode_term(getattr(triple, set_field)))
    return self.adapter.quoted_triple(decoded_terms)
|
|
406
|
+
|
|
386
407
|
def decode_quad(self, quad: jelly.RdfQuad) -> Any:
|
|
387
408
|
terms = self.decode_statement(quad, ("subject", "predicate", "object", "graph"))
|
|
388
409
|
return self.adapter.quad(terms)
|
|
@@ -405,4 +426,5 @@ class Decoder:
|
|
|
405
426
|
str: decode_bnode,
|
|
406
427
|
jelly.RdfLiteral: decode_literal,
|
|
407
428
|
jelly.RdfDefaultGraph: decode_default_graph,
|
|
429
|
+
jelly.RdfTriple: decode_quoted_triple,
|
|
408
430
|
}
|
pyjelly/serialize/encode.py
CHANGED
|
@@ -32,9 +32,8 @@ def split_iri(iri_string: str) -> tuple[str, str]:
|
|
|
32
32
|
|
|
33
33
|
T = TypeVar("T")
|
|
34
34
|
RowsAnd: TypeAlias = tuple[Sequence[jelly.RdfStreamRow], T]
|
|
35
|
-
RowsAndTerm: TypeAlias =
|
|
36
|
-
|
|
37
|
-
)
|
|
35
|
+
RowsAndTerm: TypeAlias = "RowsAnd[jelly.RdfIri | jelly.RdfLiteral | str | \
|
|
36
|
+
jelly.RdfDefaultGraph | jelly.RdfTriple]"
|
|
38
37
|
|
|
39
38
|
|
|
40
39
|
class TermEncoder:
|
|
@@ -43,6 +42,7 @@ class TermEncoder:
|
|
|
43
42
|
jelly.RdfLiteral: "literal",
|
|
44
43
|
str: "bnode",
|
|
45
44
|
jelly.RdfDefaultGraph: "default_graph",
|
|
45
|
+
jelly.RdfTriple: "triple_term",
|
|
46
46
|
}
|
|
47
47
|
|
|
48
48
|
def __init__(
|
|
@@ -163,6 +163,33 @@ class TermEncoder:
|
|
|
163
163
|
datatype=datatype_id,
|
|
164
164
|
)
|
|
165
165
|
|
|
166
|
+
def encode_quoted_triple(self, terms: Iterable[object]) -> RowsAndTerm:
    """
    Encode a quoted triple.

    Notes:
        Although a triple, it is treated as a part of a statement.
        Repeated terms are not used when encoding quoted triples.

    Args:
        terms (Iterable[object]): triple terms to encode.

    Returns:
        RowsAndTerm: additional stream rows with preceding
            information (prefixes, names, datatypes rows, if any)
            and the encoded triple row.

    """
    collected_rows: list[jelly.RdfStreamRow] = []
    triple_fields: dict[str, Any] = {}
    # zip stops at the shorter input, so only as many slots as there are
    # terms get paired.
    for slot, term in zip(Slot, terms):
        preceding_rows, encoded = self.encode_any(term, slot)
        oneof_name = self.TERM_ONEOF_NAMES[type(encoded)]
        collected_rows.extend(preceding_rows)
        triple_fields[f"{slot}_{oneof_name}"] = encoded
    return collected_rows, jelly.RdfTriple(**triple_fields)
|
|
192
|
+
|
|
166
193
|
def encode_any(self, term: object, slot: Slot) -> RowsAndTerm:
    """
    Reject the term: this base implementation supports no term types.

    Raises:
        NotImplementedError: always, naming the type of ``term``.

    """
    message = f"unsupported term type: {type(term)}"
    raise NotImplementedError(message)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pyjelly
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: Jelly-RDF implementation for Python
|
|
5
5
|
Project-URL: Homepage, https://w3id.org/jelly/pyjelly
|
|
6
6
|
Project-URL: Documentation, https://w3id.org/jelly/pyjelly
|
|
@@ -29,12 +29,13 @@ Classifier: Topic :: Software Development :: Libraries
|
|
|
29
29
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
30
30
|
Requires-Python: >=3.9
|
|
31
31
|
Requires-Dist: protobuf>=6.30.0
|
|
32
|
+
Requires-Dist: pytest-cov>=6.2.1
|
|
32
33
|
Requires-Dist: typing-extensions>=4.12.2
|
|
33
34
|
Provides-Extra: rdflib
|
|
34
35
|
Requires-Dist: rdflib>=7.1.4; extra == 'rdflib'
|
|
35
36
|
Description-Content-Type: text/markdown
|
|
36
37
|
|
|
37
|
-
[](https://w3id.org/jelly/pyjelly) [](https://pypi.org/project/pyjelly/) [](https://pypi.org/project/pyjelly/) [](https://opensource.org/licenses/Apache-2.0) [](https://github.com/Jelly-RDF/pyjelly/actions/workflows/ci.yml) [](https://discord.gg/A8sN5XwVa5)
|
|
38
|
+
[](https://w3id.org/jelly/pyjelly) [](https://pypi.org/project/pyjelly/) [](https://pypi.org/project/pyjelly/) [](https://opensource.org/licenses/Apache-2.0) [](https://github.com/Jelly-RDF/pyjelly/actions/workflows/ci.yml) [](https://codecov.io/gh/Jelly-RDF/pyjelly) [](https://discord.gg/A8sN5XwVa5)
|
|
38
39
|
|
|
39
40
|
# pyjelly
|
|
40
41
|
|
|
@@ -1,28 +1,32 @@
|
|
|
1
1
|
pyjelly/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
2
|
pyjelly/errors.py,sha256=R-xRB4a9S19J9dzAL4a5MCaBwb9ev_kvphGFkQJX6ZU,332
|
|
3
|
-
pyjelly/options.py,sha256=
|
|
3
|
+
pyjelly/options.py,sha256=vjUjwifD1SFj_b3wvz8D50Tv2wbgGrVF0urG9Zpx3VQ,4307
|
|
4
4
|
pyjelly/_proto/grpc.proto,sha256=3PfcZWqKhUSzP_T-xT-80raUYERr_dXWd8rITzXIqek,1188
|
|
5
5
|
pyjelly/_proto/patch.proto,sha256=gASUm0xDG9J1advNoq_cCsJYxudTbQaiZQBq4oW3kw4,5291
|
|
6
6
|
pyjelly/_proto/rdf.proto,sha256=EKxyG421B4m0Wx5-6jjojdga_hA3jpZfF6-T3lMc0hI,12763
|
|
7
7
|
pyjelly/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
+
pyjelly/integrations/generic/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
|
+
pyjelly/integrations/generic/generic_sink.py,sha256=zJoz6hI46Z13u4aa4zSXlbXFkHO7wLVW-czlytK-IOI,4024
|
|
10
|
+
pyjelly/integrations/generic/parse.py,sha256=BLavZP-CgstDfQtrMzhsn1OnGQQBe-DNmmxqCWvdAmc,9564
|
|
11
|
+
pyjelly/integrations/generic/serialize.py,sha256=F3giZpPrMAuIL9LHm58e1u_9Y-wwMy9rPUmpxb3OVTY,10893
|
|
8
12
|
pyjelly/integrations/rdflib/__init__.py,sha256=lpIz6iildMf5bDvj3aBqZJ7kgKFrTx_tsqSb6PkLis0,552
|
|
9
|
-
pyjelly/integrations/rdflib/parse.py,sha256=
|
|
10
|
-
pyjelly/integrations/rdflib/serialize.py,sha256=
|
|
13
|
+
pyjelly/integrations/rdflib/parse.py,sha256=73i4BAI72ZfZycNRlAPboRcpgPAS7XFG5Yesnfe7yME,13718
|
|
14
|
+
pyjelly/integrations/rdflib/serialize.py,sha256=LwWeBZDoyQL4mC6wta2TnW_1ys3_kqxmAFkAoJMJnOw,11675
|
|
11
15
|
pyjelly/jelly/__init__.py,sha256=9kacwn8Ew_1fcgj1abz6miEz-AtUdPT2ltFWaRIE5VE,126
|
|
12
16
|
pyjelly/jelly/rdf_pb2.py,sha256=qjgS3kQnCJqoOmgzvgk1BeYxGbeDX2zygJPc2vDjRts,8952
|
|
13
17
|
pyjelly/jelly/rdf_pb2.pyi,sha256=-gxZO-r2wyN68l83XomySz60c82SZmoPKh1HxamBjZs,11816
|
|
14
18
|
pyjelly/parse/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
|
-
pyjelly/parse/decode.py,sha256=
|
|
19
|
+
pyjelly/parse/decode.py,sha256=tVn2e6UmGqjFplIFFlOvZYMb50jCLvkRuaZky45CVNg,14220
|
|
16
20
|
pyjelly/parse/ioutils.py,sha256=O3wRtL5tf1WyIZ1LTfHjHwjKEGrhIWqFisOWjYmspNg,3434
|
|
17
21
|
pyjelly/parse/lookup.py,sha256=1AbdZEycLC4tRfh3fgF5hv5PrhwhdWvCUC53iHt-E4c,2193
|
|
18
22
|
pyjelly/serialize/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
19
|
-
pyjelly/serialize/encode.py,sha256=
|
|
23
|
+
pyjelly/serialize/encode.py,sha256=Yr8StCm4u5oJ1ZDFFHnRVnJFtPlCyX7fCFBgmB5Drgw,10521
|
|
20
24
|
pyjelly/serialize/flows.py,sha256=0C2soigJKyHr3xoR-7v0kc1RL8COwnuCRd4iVZpukFU,5524
|
|
21
25
|
pyjelly/serialize/ioutils.py,sha256=2_NaadLfHO3jKR1ZV7aK6jQ09sPKBar9iLFHYwourz8,400
|
|
22
26
|
pyjelly/serialize/lookup.py,sha256=h0lYFjdB6CIuN2DzAW6EE4ILJFUuto3paAK6DG1DZYg,4091
|
|
23
27
|
pyjelly/serialize/streams.py,sha256=F_T3k9yLSPtUW2ZaL99hmjlPKmgG4nYNeNXUiee3jEY,8421
|
|
24
|
-
pyjelly-0.
|
|
25
|
-
pyjelly-0.
|
|
26
|
-
pyjelly-0.
|
|
27
|
-
pyjelly-0.
|
|
28
|
-
pyjelly-0.
|
|
28
|
+
pyjelly-0.5.0.dist-info/METADATA,sha256=LZ9VubOV_XRCC-Bqk7RWmfbcYJETPfKL7-YnhrVQr5Y,4786
|
|
29
|
+
pyjelly-0.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
30
|
+
pyjelly-0.5.0.dist-info/entry_points.txt,sha256=kUG0p9zso7HpitdMaQaXEj_KSqgOGsL0Ky9ARbecN1g,339
|
|
31
|
+
pyjelly-0.5.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
32
|
+
pyjelly-0.5.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|