pyjelly 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pyjelly might be problematic. Click here for more details.
- pyjelly/__init__.py +0 -0
- pyjelly/_proto/grpc.proto +33 -0
- pyjelly/_proto/patch.proto +165 -0
- pyjelly/_proto/rdf.proto +384 -0
- pyjelly/errors.py +10 -0
- pyjelly/integrations/__init__.py +0 -0
- pyjelly/integrations/rdflib/__init__.py +24 -0
- pyjelly/integrations/rdflib/parse.py +233 -0
- pyjelly/integrations/rdflib/serialize.py +119 -0
- pyjelly/jelly/__init__.py +5 -0
- pyjelly/jelly/rdf_pb2.py +70 -0
- pyjelly/jelly/rdf_pb2.pyi +230 -0
- pyjelly/options.py +126 -0
- pyjelly/parse/__init__.py +0 -0
- pyjelly/parse/decode.py +233 -0
- pyjelly/parse/ioutils.py +86 -0
- pyjelly/parse/lookup.py +70 -0
- pyjelly/serialize/__init__.py +0 -0
- pyjelly/serialize/encode.py +197 -0
- pyjelly/serialize/flows.py +94 -0
- pyjelly/serialize/ioutils.py +13 -0
- pyjelly/serialize/lookup.py +131 -0
- pyjelly/serialize/streams.py +133 -0
- pyjelly-0.1.0.dist-info/METADATA +10 -0
- pyjelly-0.1.0.dist-info/RECORD +28 -0
- pyjelly-0.1.0.dist-info/WHEEL +4 -0
- pyjelly-0.1.0.dist-info/entry_points.txt +7 -0
- pyjelly-0.1.0.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterable, Sequence
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from typing import Any, ClassVar, TypeVar
|
|
6
|
+
from typing_extensions import TypeAlias
|
|
7
|
+
|
|
8
|
+
from pyjelly import jelly, options
|
|
9
|
+
from pyjelly.errors import JellyConformanceError
|
|
10
|
+
from pyjelly.serialize.lookup import LookupEncoder
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def split_iri(iri_string: str) -> tuple[str, str]:
    """
    Split an IRI into a ``(prefix, name)`` pair.

    The IRI is cut right after its last ``#``. If it contains no ``#``, it is
    cut right after its last ``/`` instead. The separator stays at the end of
    the prefix. When neither separator occurs, the prefix is the empty string
    and the name is the whole input.
    """
    # "#" is tried first, so it wins over "/" whenever both are present.
    for delimiter in ("#", "/"):
        cut = iri_string.rfind(delimiter)
        if cut >= 0:
            return iri_string[: cut + 1], iri_string[cut + 1 :]
    return "", iri_string
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# Type variable standing for the "term" half of an encoder result.
T = TypeVar("T")
# Pair returned by the encode_* helpers in this module: a sequence of stream
# rows together with a value of type T.
RowsAnd: TypeAlias = tuple[Sequence[jelly.RdfStreamRow], T]
# RowsAnd specialised to the union of the four term representations.
# It is spelled as a string, i.e. a forward reference rather than an
# expression evaluated when the module is imported.
RowsAndTerm: TypeAlias = (
    "RowsAnd[jelly.RdfIri | jelly.RdfLiteral | str | jelly.RdfDefaultGraph]"
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class TermEncoder:
    """
    Encode RDF terms into Jelly protobuf messages.

    Every ``encode_*`` method returns a pair: the lookup-entry rows that have
    to be emitted before the term, and the encoded term itself. Three separate
    ``LookupEncoder`` instances keep track of IRI names, IRI prefixes and
    literal datatypes.

    Parameters
    ----------
    max_names
        Size of the name lookup.
    max_prefixes
        Size of the prefix lookup. Zero disables prefix splitting.
    max_datatypes
        Size of the datatype lookup.

    """

    # Maps the Python type of an encoded term to the name fragment that
    # encode_statement appends to a slot to form a statement field name.
    TERM_ONEOF_NAMES: ClassVar = {
        jelly.RdfIri: "iri",
        jelly.RdfLiteral: "literal",
        str: "bnode",
        jelly.RdfDefaultGraph: "default_graph",
    }

    def __init__(
        self,
        max_names: int = options.DEFAULT_NAME_LOOKUP_SIZE,
        max_prefixes: int = options.DEFAULT_PREFIX_LOOKUP_SIZE,
        max_datatypes: int = options.DEFAULT_DATATYPE_LOOKUP_SIZE,
    ) -> None:
        self.names = LookupEncoder(lookup_size=max_names)
        self.prefixes = LookupEncoder(lookup_size=max_prefixes)
        self.datatypes = LookupEncoder(lookup_size=max_datatypes)

    def encode_iri(self, iri_string: str) -> RowsAnd[jelly.RdfIri]:
        """
        Encode an IRI, emitting prefix/name lookup entries when they are new.

        The IRI is split with ``split_iri``. If the split yields no prefix, or
        the prefix lookup has size zero, the whole IRI is stored as the name.
        """
        prefix, local_name = split_iri(iri_string)

        prefix_entry_id = None
        if prefix and self.prefixes.lookup.max_size:
            prefix_entry_id = self.prefixes.encode_entry_index(prefix)
        else:
            # No usable prefix: the complete IRI goes into the name lookup.
            local_name = iri_string

        name_entry_id = self.names.encode_entry_index(local_name)

        # A None entry id means the value is already in the lookup and needs
        # no entry row; any integer (including 0) means a row must be emitted.
        rows = []
        if prefix_entry_id is not None:
            rows.append(
                jelly.RdfStreamRow(
                    prefix=jelly.RdfPrefixEntry(id=prefix_entry_id, value=prefix)
                )
            )
        if name_entry_id is not None:
            rows.append(
                jelly.RdfStreamRow(
                    name=jelly.RdfNameEntry(id=name_entry_id, value=local_name)
                )
            )

        # Keyword arguments are evaluated left to right, so the prefix term
        # index is resolved before the name term index.
        iri = jelly.RdfIri(
            prefix_id=self.prefixes.encode_prefix_term_index(prefix),
            name_id=self.names.encode_name_term_index(local_name),
        )
        return rows, iri

    def encode_default_graph(self) -> RowsAnd[jelly.RdfDefaultGraph]:
        """Return an empty row tuple and a fresh ``RdfDefaultGraph`` message."""
        return (), jelly.RdfDefaultGraph()

    def encode_bnode(self, bnode: str) -> RowsAnd[str]:
        """Return an empty row tuple and the blank node string unchanged."""
        return (), bnode

    def encode_literal(
        self,
        *,
        lex: str,
        language: str | None = None,
        datatype: str | None = None,
    ) -> RowsAnd[jelly.RdfLiteral]:
        """
        Encode a literal, emitting a datatype lookup entry when it is new.

        A missing datatype, or one equal to ``options.STRING_DATATYPE_IRI``,
        is not put into the datatype lookup.

        Raises
        ------
        JellyConformanceError
            If any other datatype is given while the datatype lookup has
            size zero.

        """
        if not datatype or datatype == options.STRING_DATATYPE_IRI:
            return (), jelly.RdfLiteral(lex=lex, langtag=language, datatype=None)

        if self.datatypes.lookup.max_size == 0:
            msg = (
                f"can't encode literal with type {datatype}: "
                "datatype lookup cannot be used if disabled "
                "(its size was set to 0)"
            )
            raise JellyConformanceError(msg)

        rows: tuple[()] | tuple[jelly.RdfStreamRow] = ()
        entry_id = self.datatypes.encode_entry_index(datatype)
        if entry_id is not None:
            rows = (
                jelly.RdfStreamRow(
                    datatype=jelly.RdfDatatypeEntry(id=entry_id, value=datatype)
                ),
            )

        literal = jelly.RdfLiteral(
            lex=lex,
            langtag=language,
            datatype=self.datatypes.encode_datatype_term_index(datatype),
        )
        return rows, literal

    def encode_any(self, term: object, slot: Slot) -> RowsAndTerm:
        """
        Encode an arbitrary term for the given statement slot.

        This base implementation always raises ``NotImplementedError``.
        """
        msg = f"unsupported term type: {type(term)}"
        raise NotImplementedError(msg)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class Slot(str, Enum):
    """
    Positions a term can occupy in an RDF statement.

    Each member's value is a one-letter code. ``str()`` of a member returns
    that code, which is how statement field names are assembled elsewhere in
    this module. The definition order (subject, predicate, object, graph) is
    relied on when members are zipped with the terms of a statement.
    """

    subject = "s"
    predicate = "p"
    object = "o"
    graph = "g"

    def __str__(self) -> str:
        # Return the bare letter rather than the default "Slot.name" form.
        return self.value
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def new_repeated_terms() -> dict[Slot, object]:
    """Return a fresh mapping with one ``None`` entry per ``Slot`` member."""
    return {slot: None for slot in Slot}
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def encode_statement(
    terms: Iterable[object],
    term_encoder: TermEncoder,
    repeated_terms: dict[Slot, object],
) -> tuple[list[jelly.RdfStreamRow], dict[str, Any]]:
    """
    Encode the terms of one statement, skipping terms that repeat.

    Terms are paired with ``Slot`` members in definition order. A term is
    encoded only when it differs from the value stored for its slot in
    ``repeated_terms``; that mapping is updated in place.

    Returns
    -------
    tuple
        The lookup-entry rows produced while encoding, and a mapping from
        ``"<slot>_<kind>"`` field names to the encoded terms.

    """
    fields: dict[str, object] = {}
    pending_rows: list[jelly.RdfStreamRow] = []
    for slot, term in zip(Slot, terms):
        changed = repeated_terms[slot] != term
        if not changed:
            # Same term as in the previous statement: leave the field unset.
            continue
        entry_rows, encoded = term_encoder.encode_any(term, slot)
        kind = term_encoder.TERM_ONEOF_NAMES[type(encoded)]
        pending_rows.extend(entry_rows)
        fields[f"{slot}_{kind}"] = encoded
        repeated_terms[slot] = term
    return pending_rows, fields
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def encode_triple(
    terms: Iterable[object],
    term_encoder: TermEncoder,
    repeated_terms: dict[Slot, object],
) -> list[jelly.RdfStreamRow]:
    """
    Encode one triple into stream rows.

    The returned list holds the lookup-entry rows from ``encode_statement``
    followed by a single row wrapping the ``RdfTriple`` message.
    """
    result, fields = encode_statement(terms, term_encoder, repeated_terms)
    triple_message = jelly.RdfTriple(**fields)
    result.append(jelly.RdfStreamRow(triple=triple_message))
    return result
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def encode_quad(
    terms: Iterable[object],
    term_encoder: TermEncoder,
    repeated_terms: dict[Slot, object],
) -> list[jelly.RdfStreamRow]:
    """
    Encode one quad into stream rows.

    The returned list holds the lookup-entry rows from ``encode_statement``
    followed by a single row wrapping the ``RdfQuad`` message.
    """
    result, fields = encode_statement(terms, term_encoder, repeated_terms)
    quad_message = jelly.RdfQuad(**fields)
    result.append(jelly.RdfStreamRow(quad=quad_message))
    return result
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def encode_namespace_declaration(
    name: str,
    value: str,
    term_encoder: TermEncoder,
) -> list[jelly.RdfStreamRow]:
    """
    Encode a namespace declaration into stream rows.

    ``value`` is encoded as an IRI through ``term_encoder``. Any lookup-entry
    rows that produces come first, followed by one row carrying the
    ``RdfNamespaceDeclaration`` for ``name``.
    """
    entry_rows, iri = term_encoder.encode_iri(value)
    # Copy into a new list: the encoder's row sequence is not mutated here.
    result = list(entry_rows)
    result.append(
        jelly.RdfStreamRow(
            namespace=jelly.RdfNamespaceDeclaration(name=name, value=iri)
        )
    )
    return result
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def encode_options(options: options.StreamOptions) -> jelly.RdfStreamRow:
    """
    Build the stream row that carries the stream options.

    Copies the stream name, physical and logical types, the
    generalized-statements and RDF-star flags, the three lookup sizes and the
    version from ``options`` into an ``RdfStreamOptions`` message.
    """
    # NOTE: inside this function the parameter shadows the imported
    # ``options`` module.
    types = options.stream_types
    lookups = options.lookup_preset
    stream_options = jelly.RdfStreamOptions(
        stream_name=options.stream_name,
        physical_type=types.physical_type,
        generalized_statements=options.generalized_statements,
        rdf_star=options.rdf_star,
        max_name_table_size=lookups.max_names,
        max_prefix_table_size=lookups.max_prefixes,
        max_datatype_table_size=lookups.max_datatypes,
        logical_type=types.logical_type,
        version=options.version,
    )
    return jelly.RdfStreamRow(options=stream_options)
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections import UserList
|
|
4
|
+
from collections.abc import Iterable
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import ClassVar
|
|
7
|
+
from typing_extensions import override
|
|
8
|
+
|
|
9
|
+
from pyjelly import jelly
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class FrameFlow(UserList[jelly.RdfStreamRow]):
    """
    Base class for turning buffered stream rows into Jelly frames.

    The flow is itself a list of rows. Rows are appended by callers and
    drained into an ``RdfStreamFrame`` by ``to_stream_frame``. Subclasses
    that declare a ``logical_type`` other than the unspecified one are
    recorded in ``registry`` under that type.
    """

    logical_type: ClassVar[jelly.LogicalStreamType]
    registry: ClassVar[dict[jelly.LogicalStreamType, type[FrameFlow]]] = {}

    def frame_from_bounds(self) -> jelly.RdfStreamFrame | None:
        """Return a frame if the flow decides one is due; the base never does."""
        return None

    def to_stream_frame(self) -> jelly.RdfStreamFrame | None:
        """
        Drain all buffered rows into one frame.

        Returns ``None`` when the buffer is empty. Otherwise returns the new
        frame and leaves the buffer cleared.
        """
        frame = None
        if self:
            frame = jelly.RdfStreamFrame(rows=self)
            self.clear()
        return frame

    def __init_subclass__(cls) -> None:
        """
        Record the subclass in ``registry`` under its logical stream type.

        Subclasses whose ``logical_type`` is the unspecified type are skipped.
        """
        if cls.logical_type == jelly.LOGICAL_STREAM_TYPE_UNSPECIFIED:
            return
        cls.registry[cls.logical_type] = cls
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class ManualFrameFlow(FrameFlow):
    """
    Produces frames only when manually requested (never automatically).

    ``frame_from_bounds()`` is inherited from ``FrameFlow`` and always returns
    ``None``, so rows leave the buffer only through ``to_stream_frame()``.

    !!! warning
        All stream rows are kept in memory until `to_stream_frame()` is called.
        This may lead to high memory usage for large streams.

    Used for non-delimited serialization.
    """

    # Declares the unspecified type, so FrameFlow.__init_subclass__ does not
    # add this class to the registry.
    logical_type = jelly.LOGICAL_STREAM_TYPE_UNSPECIFIED
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# NOTE(review): __init__ is written by hand, so @dataclass leaves it alone,
# but the decorator presumably still generates __repr__/__eq__ from
# `frame_size` only — confirm that replacing the list-based ones is intended.
@dataclass
class BoundedFrameFlow(FrameFlow):
    """
    Frame flow that emits a frame once enough rows have been buffered.

    Used for delimited encoding (default mode).

    Parameters
    ----------
    initlist
        Optional rows to start the buffer with.
    frame_size
        Number of buffered rows at which a frame is produced. ``None`` (or
        any other falsy value, such as 0) selects ``default_frame_size``.

    """

    logical_type = jelly.LOGICAL_STREAM_TYPE_UNSPECIFIED

    frame_size: int
    default_frame_size: ClassVar[int] = 250

    def __init__(
        self,
        initlist: Iterable[jelly.RdfStreamRow] | None = None,
        *,
        frame_size: int | None = None,
    ) -> None:
        super().__init__(initlist)
        self.frame_size = frame_size if frame_size else self.default_frame_size

    @override
    def frame_from_bounds(self) -> jelly.RdfStreamFrame | None:
        """Drain the buffer into a frame once it holds ``frame_size`` rows."""
        if len(self) < self.frame_size:
            return None
        return self.to_stream_frame()
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# Fallback for unspecified logical types. BoundedFrameFlow declares the
# unspecified type itself, so FrameFlow.__init_subclass__ skips it and it has
# to be registered here by hand.
FrameFlow.registry[jelly.LOGICAL_STREAM_TYPE_UNSPECIFIED] = BoundedFrameFlow
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class FlatTriplesFrameFlow(BoundedFrameFlow):
    """Bounded frame flow registered for the flat-triples logical stream type."""

    logical_type = jelly.LOGICAL_STREAM_TYPE_FLAT_TRIPLES
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class FlatQuadsFrameFlow(BoundedFrameFlow):
    """Bounded frame flow registered for the flat-quads logical stream type."""

    logical_type = jelly.LOGICAL_STREAM_TYPE_FLAT_QUADS
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from typing import IO
|
|
2
|
+
|
|
3
|
+
from google.protobuf.proto import serialize_length_prefixed
|
|
4
|
+
|
|
5
|
+
from pyjelly import jelly
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def write_delimited(frame: jelly.RdfStreamFrame, output_stream: IO[bytes]) -> None:
    """
    Write ``frame`` to ``output_stream`` in length-prefixed form.

    Delegates to protobuf's ``serialize_length_prefixed``.
    """
    serialize_length_prefixed(frame, output_stream)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def write_single(frame: jelly.RdfStreamFrame, output_stream: IO[bytes]) -> None:
    """
    Write ``frame`` to ``output_stream`` as one serialized message.

    The frame is serialized with ``deterministic=True`` and the resulting
    bytes are written in a single call.
    """
    payload = frame.SerializeToString(deterministic=True)
    output_stream.write(payload)
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections import OrderedDict
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from typing import final
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@final
class Lookup:
    """
    Fixed-size 1-based string-to-index mapping with LRU eviction.

    - Indices are handed out sequentially, starting from 1.
    - Once ``max_size`` entries exist, every further insertion evicts the
      least-recently-used entry and reuses its index.
    - Index 0 is never assigned; it is reserved for delta encoding in Jelly
      streams.

    To test whether a key is present, call ``make_last_to_evict(key)`` and
    catch ``KeyError``. When ``KeyError`` is raised, the key can be added with
    ``insert(key)``.

    Parameters
    ----------
    max_size
        Maximum number of entries. Zero disables lookup.

    """

    def __init__(self, max_size: int) -> None:
        self.data = OrderedDict[str, int]()
        self.max_size = max_size
        # Becomes True once the last free index has been handed out.
        self._evicting = False

    def make_last_to_evict(self, key: str) -> None:
        """Mark ``key`` as most recently used; raises ``KeyError`` if absent."""
        self.data.move_to_end(key)

    def insert(self, key: str) -> int:
        """
        Add a new key and return the index assigned to it.

        Raises
        ------
        IndexError
            If the lookup has size zero.

        """
        if not self.max_size:
            raise IndexError("lookup is zero, cannot insert")
        assert key not in self.data, f"key {key!r} already present"

        reuse_slot = self._evicting
        if reuse_slot:
            # Full: drop the least-recently-used entry and take over its index.
            _, slot = self.data.popitem(last=False)
        else:
            slot = len(self.data) + 1
        self.data[key] = slot
        if not reuse_slot:
            self._evicting = slot == self.max_size
        return slot

    def __repr__(self) -> str:
        return f"Lookup(max_size={self.max_size!r}, data={self.data!r})"
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
class LookupEncoder:
    """
    Shared base for RDF lookup encoders using Jelly compression.

    Wraps a ``Lookup`` and remembers the index most recently assigned to a new
    entry and the index most recently referenced by a term. Both start at 0.

    Parameters
    ----------
    lookup_size
        Maximum lookup size.

    """

    last_assigned_index: int
    last_reused_index: int

    def __init__(self, *, lookup_size: int) -> None:
        self.lookup = Lookup(max_size=lookup_size)
        self.last_assigned_index = 0
        self.last_reused_index = 0

    def encode_entry_index(self, key: str) -> int | None:
        """
        Get or assign the index to use in a lookup entry.

        Returns
        -------
        int or None
            - ``None`` when the key is already in the lookup; no entry has to
              be emitted.
            - ``0`` when the key is new and its index is exactly
              ``last_assigned_index + 1``.
            - the assigned or reused index otherwise.

        Any integer result, including 0, means the entry is new and should be
        emitted.

        """
        try:
            self.lookup.make_last_to_evict(key)
            return None  # noqa: TRY300
        except KeyError:
            expected = self.last_assigned_index + 1
            assigned = self.lookup.insert(key)
            self.last_assigned_index = assigned
            return 0 if assigned == expected else assigned

    def encode_term_index(self, value: str) -> int:
        """
        Return the index of an existing entry and remember it as last used.

        Also marks the entry as most recently used. Raises ``KeyError`` when
        ``value`` is not in the lookup.
        """
        lookup = self.lookup
        lookup.make_last_to_evict(value)
        index = lookup.data[value]
        self.last_reused_index = index
        return index

    def encode_prefix_term_index(self, value: str) -> int:
        """
        Return the prefix index to put into a term.

        Returns 0 for an empty prefix or a disabled lookup, and 0 when the
        index equals the previously referenced (non-zero) one. Otherwise
        returns the actual index.
        """
        if not value or self.lookup.max_size == 0:
            return 0
        previous = self.last_reused_index
        index = self.encode_term_index(value)
        # A previous index of 0 means nothing was referenced yet, so there is
        # nothing to repeat and the real index is returned.
        if previous != 0 and index == previous:
            return 0
        return index

    def encode_name_term_index(self, value: str) -> int:
        """
        Return the name index to put into a term.

        Returns 0 when the index directly follows the previously referenced
        one, and the actual index otherwise.
        """
        expected = self.last_reused_index + 1
        index = self.encode_term_index(value)
        return 0 if index == expected else index

    def encode_datatype_term_index(self, value: str) -> int:
        """Return the datatype index for a term, or 0 if the lookup is disabled."""
        return self.encode_term_index(value) if self.lookup.max_size != 0 else 0
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Generator, Iterable
|
|
4
|
+
from typing import Any, ClassVar
|
|
5
|
+
|
|
6
|
+
from pyjelly import jelly
|
|
7
|
+
from pyjelly.options import StreamOptions
|
|
8
|
+
from pyjelly.serialize.encode import (
|
|
9
|
+
Slot,
|
|
10
|
+
TermEncoder,
|
|
11
|
+
encode_namespace_declaration,
|
|
12
|
+
encode_options,
|
|
13
|
+
encode_quad,
|
|
14
|
+
encode_triple,
|
|
15
|
+
new_repeated_terms,
|
|
16
|
+
)
|
|
17
|
+
from pyjelly.serialize.flows import FrameFlow, ManualFrameFlow
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class Stream:
    """
    Base class for Jelly serialization streams.

    A stream owns a term encoder, a frame flow that buffers rows, and the
    per-slot record of terms from the previous statement. Every subclass is
    recorded in ``registry`` under its ``physical_type``.

    Parameters
    ----------
    options
        Stream options. Its physical type must equal the class's
        ``physical_type``.
    encoder_class
        Term encoder class, instantiated with the lookup sizes from
        ``options.lookup_preset``.
    **flow_args
        Keyword arguments forwarded to the frame flow constructor.

    """

    physical_type: ClassVar[jelly.PhysicalStreamType]
    registry: ClassVar[dict[jelly.PhysicalStreamType, type[Stream]]] = {}
    flow: FrameFlow

    def __init__(
        self,
        *,
        options: StreamOptions,
        encoder_class: type[TermEncoder],
        **flow_args: Any,
    ) -> None:
        assert options.stream_types.physical_type == self.physical_type
        self.options = options
        lookups = options.lookup_preset
        self.encoder = encoder_class(
            max_prefixes=lookups.max_prefixes,
            max_names=lookups.max_names,
            max_datatypes=lookups.max_datatypes,
        )
        # NOTE(review): the registry lookup runs even when its result is
        # replaced for non-delimited output, so an unregistered logical type
        # raises KeyError in both modes — confirm this is intended.
        registered_flow = FrameFlow.registry[options.stream_types.logical_type]
        flow_class = registered_flow if options.delimited else ManualFrameFlow
        self.flow = flow_class(**flow_args)
        self.repeated_terms = new_repeated_terms()
        self.enrolled = False

    @staticmethod
    def from_options(
        options: StreamOptions,
        encoder_class: type[TermEncoder] | None = None,
        **flow_args: Any,
    ) -> Any:
        """
        Create the stream registered for the physical type in ``options``.

        When ``encoder_class`` is omitted, ``RDFLibTermEncoder`` is imported
        and used.
        """
        if encoder_class is None:
            # Imported here rather than at module level.
            from pyjelly.integrations.rdflib.serialize import RDFLibTermEncoder

            encoder_class = RDFLibTermEncoder
        concrete = Stream.registry[options.stream_types.physical_type]
        return concrete(options=options, encoder_class=encoder_class, **flow_args)

    def enroll(self) -> None:
        """Buffer the stream options row the first time this is called."""
        if self.enrolled:
            return
        self.stream_options()
        self.enrolled = True

    def stream_options(self) -> None:
        """Append the encoded stream options row to the flow."""
        options_row = encode_options(self.options)
        self.flow.append(options_row)

    def namespace_declaration(self, name: str, iri: str) -> None:
        """Append the rows of a namespace declaration to the flow."""
        self.flow.extend(
            encode_namespace_declaration(
                name=name,
                value=iri,
                term_encoder=self.encoder,
            )
        )

    def __init_subclass__(cls) -> None:
        """Record the subclass in ``registry`` under its physical type."""
        cls.registry[cls.physical_type] = cls
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class TripleStream(Stream):
    """Stream registered for the triples physical stream type."""

    physical_type = jelly.PHYSICAL_STREAM_TYPE_TRIPLES

    def triple(self, terms: Iterable[object]) -> jelly.RdfStreamFrame | None:
        """
        Encode one triple and buffer its rows.

        Returns the frame produced by the flow's ``frame_from_bounds()`` when
        it yields one, otherwise ``None``.
        """
        rows = encode_triple(
            terms,
            term_encoder=self.encoder,
            repeated_terms=self.repeated_terms,
        )
        self.flow.extend(rows)
        frame = self.flow.frame_from_bounds()
        return frame if frame else None
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class QuadStream(Stream):
    """Stream registered for the quads physical stream type."""

    physical_type = jelly.PHYSICAL_STREAM_TYPE_QUADS

    def quad(self, terms: Iterable[object]) -> jelly.RdfStreamFrame | None:
        """
        Encode one quad and buffer its rows.

        Returns the frame produced by the flow's ``frame_from_bounds()`` when
        it yields one, otherwise ``None``.
        """
        rows = encode_quad(
            terms,
            term_encoder=self.encoder,
            repeated_terms=self.repeated_terms,
        )
        self.flow.extend(rows)
        frame = self.flow.frame_from_bounds()
        return frame if frame else None
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class GraphStream(TripleStream):
    """Stream registered for the graphs physical stream type."""

    physical_type = jelly.PHYSICAL_STREAM_TYPE_GRAPHS

    def graph(
        self,
        graph_id: object,
        graph: Iterable[Iterable[object]],
    ) -> Generator[jelly.RdfStreamFrame]:
        """
        Encode one named graph: a graph-start row, its triples, a graph-end row.

        Parameters
        ----------
        graph_id
            Graph identifier, encoded through the term encoder's
            ``encode_any`` for the graph slot.
        graph
            Triples of the graph, each an iterable of terms.

        Yields
        ------
        jelly.RdfStreamFrame
            Every frame the flow produces while the graph is being encoded.
            Rows still buffered afterwards stay in the flow.

        """
        [*graph_rows], graph_node = self.encoder.encode_any(graph_id, Slot.graph)
        kw_name = f"{Slot.graph}_{self.encoder.TERM_ONEOF_NAMES[type(graph_node)]}"
        kws: dict[Any, Any] = {kw_name: graph_node}
        start_row = jelly.RdfStreamRow(graph_start=jelly.RdfGraphStart(**kws))
        graph_rows.append(start_row)
        self.flow.extend(graph_rows)
        for triple in graph:
            if frame := self.triple(triple):
                yield frame
        end_row = jelly.RdfStreamRow(graph_end=jelly.RdfGraphEnd())
        self.flow.append(end_row)
        # frame_from_bounds() already drains the buffer when it returns a
        # frame, so that frame is the one to yield. Calling to_stream_frame()
        # again would find an empty buffer, return None, and lose the frame.
        if frame := self.flow.frame_from_bounds():
            yield frame
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pyjelly
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Jelly RDF implementation for Python
|
|
5
|
+
Author-email: Bartosz Sławecki <bartosz@neverblink.eu>, Anastasiya Danilenka <anastasiya@neverblink.eu>
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Requires-Dist: protobuf>=5.29.3
|
|
9
|
+
Requires-Dist: rdflib>=7.1.4
|
|
10
|
+
Requires-Dist: typing-extensions>=4.12.2
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
pyjelly/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
pyjelly/errors.py,sha256=R-xRB4a9S19J9dzAL4a5MCaBwb9ev_kvphGFkQJX6ZU,332
|
|
3
|
+
pyjelly/options.py,sha256=QyGLpHOyNvBSVYYtbLD2rW43gHXRY580NAA17G9dhHs,4045
|
|
4
|
+
pyjelly/_proto/grpc.proto,sha256=3PfcZWqKhUSzP_T-xT-80raUYERr_dXWd8rITzXIqek,1188
|
|
5
|
+
pyjelly/_proto/patch.proto,sha256=gASUm0xDG9J1advNoq_cCsJYxudTbQaiZQBq4oW3kw4,5291
|
|
6
|
+
pyjelly/_proto/rdf.proto,sha256=EKxyG421B4m0Wx5-6jjojdga_hA3jpZfF6-T3lMc0hI,12763
|
|
7
|
+
pyjelly/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
+
pyjelly/integrations/rdflib/__init__.py,sha256=lpIz6iildMf5bDvj3aBqZJ7kgKFrTx_tsqSb6PkLis0,552
|
|
9
|
+
pyjelly/integrations/rdflib/parse.py,sha256=k7cNSFgFXK0_4792eZ-lDRzzSqLI7DFMZmbsPD9SLyE,7474
|
|
10
|
+
pyjelly/integrations/rdflib/serialize.py,sha256=YNwKBD_a4oKNktUQa092UXvmdcu9JYAJDkYRfki2p-w,3940
|
|
11
|
+
pyjelly/jelly/__init__.py,sha256=9kacwn8Ew_1fcgj1abz6miEz-AtUdPT2ltFWaRIE5VE,126
|
|
12
|
+
pyjelly/jelly/rdf_pb2.py,sha256=L_fPtDaURFCpLIMqVdl4RwiWyVgEFOwtB4-If3MpoSg,8952
|
|
13
|
+
pyjelly/jelly/rdf_pb2.pyi,sha256=-Vv2HlpUWhaKPEb0YXOTx21cIKoqoBmTY8U6HPMUcLw,11789
|
|
14
|
+
pyjelly/parse/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
|
+
pyjelly/parse/decode.py,sha256=7yfpZ4w8HeXTqWyDROsqfXSaZcCsdnYNhYoRtxRaWEs,8455
|
|
16
|
+
pyjelly/parse/ioutils.py,sha256=FnQNPiDAWLk0IXxUkmVxjsVEjC1y-dBTKKk6lf224SM,2747
|
|
17
|
+
pyjelly/parse/lookup.py,sha256=1AbdZEycLC4tRfh3fgF5hv5PrhwhdWvCUC53iHt-E4c,2193
|
|
18
|
+
pyjelly/serialize/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
19
|
+
pyjelly/serialize/encode.py,sha256=ev8Z7B-ptvch1Xu173lDO3JW3egW-QyzOngDfLDzKsI,6548
|
|
20
|
+
pyjelly/serialize/flows.py,sha256=vezvYeYYumEH0IceebogW4QwM2d1GeOv1yASrJWTTHc,2665
|
|
21
|
+
pyjelly/serialize/ioutils.py,sha256=2_NaadLfHO3jKR1ZV7aK6jQ09sPKBar9iLFHYwourz8,400
|
|
22
|
+
pyjelly/serialize/lookup.py,sha256=vH21uzs7gvjk-Yc0hoSC3_LPsVff86YHuUhikP9djYo,4047
|
|
23
|
+
pyjelly/serialize/streams.py,sha256=0csixgSnGprXhoHOoVimtoPFgt4mqU4Lgv_l6d0EW6g,4247
|
|
24
|
+
pyjelly-0.1.0.dist-info/METADATA,sha256=Kh6HOcK8aLfkSyszqaKOejBbqaI1JrR1gzJ85xfWlr8,348
|
|
25
|
+
pyjelly-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
26
|
+
pyjelly-0.1.0.dist-info/entry_points.txt,sha256=kUG0p9zso7HpitdMaQaXEj_KSqgOGsL0Ky9ARbecN1g,339
|
|
27
|
+
pyjelly-0.1.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
28
|
+
pyjelly-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
[rdf.plugins.parser]
|
|
2
|
+
application/x-jelly-rdf = pyjelly.integrations.rdflib.parse:RDFLibJellyParser
|
|
3
|
+
jelly = pyjelly.integrations.rdflib.parse:RDFLibJellyParser
|
|
4
|
+
|
|
5
|
+
[rdf.plugins.serializer]
|
|
6
|
+
application/x-jelly-rdf = pyjelly.integrations.rdflib.serialize:RDFLibJellySerializer
|
|
7
|
+
jelly = pyjelly.integrations.rdflib.serialize:RDFLibJellySerializer
|