rdf-starbase 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rdf_starbase/__init__.py +57 -0
- rdf_starbase/ai_grounding.py +728 -0
- rdf_starbase/compat/__init__.py +26 -0
- rdf_starbase/compat/rdflib.py +1104 -0
- rdf_starbase/formats/__init__.py +29 -0
- rdf_starbase/formats/jsonld.py +488 -0
- rdf_starbase/formats/ntriples.py +419 -0
- rdf_starbase/formats/rdfxml.py +434 -0
- rdf_starbase/formats/turtle.py +882 -0
- rdf_starbase/models.py +92 -0
- rdf_starbase/registry.py +540 -0
- rdf_starbase/repositories.py +407 -0
- rdf_starbase/repository_api.py +739 -0
- rdf_starbase/sparql/__init__.py +35 -0
- rdf_starbase/sparql/ast.py +910 -0
- rdf_starbase/sparql/executor.py +1925 -0
- rdf_starbase/sparql/parser.py +1716 -0
- rdf_starbase/storage/__init__.py +44 -0
- rdf_starbase/storage/executor.py +1914 -0
- rdf_starbase/storage/facts.py +850 -0
- rdf_starbase/storage/lsm.py +531 -0
- rdf_starbase/storage/persistence.py +338 -0
- rdf_starbase/storage/quoted_triples.py +292 -0
- rdf_starbase/storage/reasoner.py +1035 -0
- rdf_starbase/storage/terms.py +628 -0
- rdf_starbase/store.py +1049 -0
- rdf_starbase/store_legacy.py +748 -0
- rdf_starbase/web.py +568 -0
- rdf_starbase-0.1.0.dist-info/METADATA +706 -0
- rdf_starbase-0.1.0.dist-info/RECORD +31 -0
- rdf_starbase-0.1.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,419 @@
|
|
|
1
|
+
"""
|
|
2
|
+
N-Triples and N-Quads Parser and Serializer.
|
|
3
|
+
|
|
4
|
+
N-Triples is the simplest RDF format: one triple per line.
|
|
5
|
+
N-Quads extends N-Triples with an optional graph name.
|
|
6
|
+
|
|
7
|
+
Grammar:
|
|
8
|
+
ntriplesDoc ::= triple? (EOL triple)* EOL?
|
|
9
|
+
triple ::= subject predicate object '.'
|
|
10
|
+
subject ::= IRIREF | BLANK_NODE_LABEL | quotedTriple
|
|
11
|
+
predicate ::= IRIREF
|
|
12
|
+
object ::= IRIREF | BLANK_NODE_LABEL | literal | quotedTriple
|
|
13
|
+
quotedTriple ::= '<<' subject predicate object '>>'
|
|
14
|
+
|
|
15
|
+
Reference: https://www.w3.org/TR/n-triples/
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from typing import Iterator, Optional, Tuple, List, Union
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
import re
|
|
22
|
+
from io import StringIO
|
|
23
|
+
|
|
24
|
+
from rdf_starbase.formats.turtle import Triple, ParsedDocument
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class NTriplesParser:
    """
    Parser for N-Triples and N-Triples-Star format.

    N-Triples is line-oriented: each line is one triple.
    This makes it efficient for streaming large files.

    Format:
        <subject> <predicate> <object> .
        <subject> <predicate> "literal" .
        <subject> <predicate> "literal"@lang .
        <subject> <predicate> "literal"^^<datatype> .

    N-Triples-Star adds quoted triples:
        << <s> <p> <o> >> <p2> <o2> .

    Terms are returned as plain strings: IRIs without their angle
    brackets, blank nodes as ``_:label`` and literals as ``"value"``
    optionally followed by ``@lang`` or ``^^<datatype>``.  Escape
    sequences inside a literal are decoded, so the stored value is the
    unescaped text.
    """

    # Regex patterns for N-Triples tokens
    IRI_PATTERN = re.compile(r'<([^>]*)>')
    # Labels may start with a digit (BLANK_NODE_LABEL in the N-Triples grammar).
    BLANK_NODE_PATTERN = re.compile(r'_:([A-Za-z0-9_][A-Za-z0-9_.-]*)')
    LITERAL_PATTERN = re.compile(
        r'"((?:[^"\\]|\\.)*)"|'  # Double-quoted string
        r"'((?:[^'\\]|\\.)*)'"   # Single-quoted string
    )
    LANG_TAG_PATTERN = re.compile(r'@([a-zA-Z]+(?:-[a-zA-Z0-9]+)*)')
    DATATYPE_PATTERN = re.compile(r'\^\^<([^>]*)>')
    QUOTED_TRIPLE_START = re.compile(r'<<')
    QUOTED_TRIPLE_END = re.compile(r'>>')

    # Single-character escapes (ECHAR).  Unknown escapes fall back to the
    # escaped character itself, which keeps the parser lenient.
    _SIMPLE_ESCAPES = {
        'n': '\n',
        't': '\t',
        'r': '\r',
        'b': '\b',
        'f': '\f',
        '\\': '\\',
        '"': '"',
        "'": "'",
    }
    _HEX_DIGITS = frozenset('0123456789abcdefABCDEF')
    _MAX_CODE_POINT = 0x10FFFF

    def __init__(self):
        # 1-based number of the line currently being parsed (0 before any).
        self.line_number = 0

    def parse(self, source: Union[str, Path, StringIO]) -> ParsedDocument:
        """
        Parse N-Triples content.

        Args:
            source: N-Triples content as a string, a ``Path`` to read, or
                a readable text object such as ``StringIO`` (anything
                exposing ``read()`` is accepted)

        Returns:
            ParsedDocument with triples (no prefixes in N-Triples)

        Raises:
            ValueError: If a line cannot be parsed.
        """
        if isinstance(source, Path):
            text = source.read_text(encoding="utf-8")
        elif hasattr(source, "read"):
            text = source.read()
        else:
            text = source

        triples = list(self.parse_lines(text.splitlines()))
        return ParsedDocument(triples=triples)

    def parse_file(self, path: Union[str, Path]) -> ParsedDocument:
        """Parse an N-Triples file."""
        return self.parse(Path(path))

    def parse_lines(self, lines: List[str]) -> Iterator[Triple]:
        """
        Parse lines of N-Triples.

        Args:
            lines: List of N-Triples lines

        Yields:
            Triple objects

        Raises:
            ValueError: If a line cannot be parsed; the message carries
                the 1-based line number and the offending line.
        """
        yield from self._parse_numbered(lines)

    def parse_stream(self, stream) -> Iterator[Triple]:
        """
        Parse N-Triples from a stream (file-like object).

        Useful for processing large files without loading into memory.

        Args:
            stream: Any iterable of text lines, e.g. an open text file

        Yields:
            Triple objects

        Raises:
            ValueError: If a line cannot be parsed; the message carries
                the 1-based line number and the offending line.
        """
        self.line_number = 0
        yield from self._parse_numbered(stream)

    def _parse_numbered(self, raw_lines) -> Iterator[Triple]:
        """Parse an iterable of lines, tracking line numbers for errors."""
        for number, raw in enumerate(raw_lines, start=1):
            self.line_number = number

            # Strip whitespace and skip empty lines/comments
            line = raw.strip()
            if not line or line.startswith('#'):
                continue

            try:
                triple = self._parse_line(line)
            except Exception as exc:
                # Re-raise with location context, keeping the cause chained.
                raise ValueError(
                    f"Error parsing line {self.line_number}: {exc}\nLine: {line}"
                ) from exc
            if triple:
                yield triple

    def _parse_line(self, line: str) -> Optional[Triple]:
        """
        Parse a single N-Triples line.

        Anything after the object is ignored, including the terminating
        '.'.  That leniency is what allows N-Quads lines to be read (the
        graph label is dropped), so it is deliberately not tightened.
        """
        subject, pos = self._parse_subject(line, 0)
        pos = self._skip_ws(line, pos)

        predicate, pos = self._parse_iri(line, pos)
        pos = self._skip_ws(line, pos)

        obj, pos = self._parse_object(line, pos)

        return self._build_triple(subject, predicate, obj)

    def _build_triple(
        self,
        subject: Union[str, Triple],
        predicate: str,
        obj: Union[str, Triple],
    ) -> Triple:
        """Build a Triple, routing quoted triples to the *_triple fields."""
        if isinstance(subject, Triple):
            return Triple(
                subject="",
                predicate=predicate,
                object=obj if isinstance(obj, str) else "",
                subject_triple=subject,
                object_triple=obj if isinstance(obj, Triple) else None,
            )
        if isinstance(obj, Triple):
            return Triple(
                subject=subject,
                predicate=predicate,
                object="",
                object_triple=obj,
            )
        return Triple(subject=subject, predicate=predicate, object=obj)

    def _skip_ws(self, line: str, pos: int) -> int:
        """Return the index of the first non-space/tab character at or after pos."""
        while pos < len(line) and line[pos] in ' \t':
            pos += 1
        return pos

    def _parse_subject(self, line: str, pos: int) -> Tuple[Union[str, Triple], int]:
        """Parse a subject (IRI, blank node, or quoted triple)."""
        pos = self._skip_ws(line, pos)

        if line.startswith('<<', pos):
            return self._parse_quoted_triple(line, pos)
        if line.startswith('_:', pos):
            return self._parse_blank_node(line, pos)
        return self._parse_iri(line, pos)

    def _parse_object(self, line: str, pos: int) -> Tuple[Union[str, Triple], int]:
        """Parse an object (IRI, blank node, literal, or quoted triple)."""
        pos = self._skip_ws(line, pos)

        if line.startswith('<<', pos):
            return self._parse_quoted_triple(line, pos)
        # startswith() is safe at end of line, where line[pos] would raise
        # IndexError for a truncated triple.
        if line.startswith('"', pos):
            return self._parse_literal(line, pos)
        if line.startswith('_:', pos):
            return self._parse_blank_node(line, pos)
        return self._parse_iri(line, pos)

    def _parse_iri(self, line: str, pos: int) -> Tuple[str, int]:
        """
        Parse an IRI <...>.

        Returns:
            The IRI without angle brackets and the index just past '>'.

        Raises:
            ValueError: If there is no '<' at pos or no closing '>'.
        """
        if pos >= len(line) or line[pos] != '<':
            raise ValueError(f"Expected '<' at position {pos}")

        end = line.find('>', pos + 1)
        if end == -1:
            raise ValueError(f"Unclosed IRI at position {pos}")

        return line[pos + 1:end], end + 1

    def _parse_blank_node(self, line: str, pos: int) -> Tuple[str, int]:
        """
        Parse a blank node _:label.

        Returns:
            The blank node as ``_:label`` and the index just past it.

        Raises:
            ValueError: If no blank node label starts at pos.
        """
        match = self.BLANK_NODE_PATTERN.match(line, pos)
        if not match:
            raise ValueError(f"Invalid blank node at position {pos}")

        # A label may contain '.' but not end with one; hand trailing dots
        # back so "_:b1." still leaves the statement terminator unread.
        raw_label = match.group(1)
        label = raw_label.rstrip('.')
        end = match.end() - (len(raw_label) - len(label))
        return f"_:{label}", end

    def _decode_unicode_escape(self, line: str, pos: int, width: int) -> str:
        """
        Decode the hex digits of a \\uXXXX / \\UXXXXXXXX escape.

        Args:
            line: Line being parsed
            pos: Index of the 'u' / 'U' character
            width: Number of hex digits expected (4 or 8)

        Raises:
            ValueError: If the digits are missing, not hexadecimal, or
                encode a value beyond the Unicode range.
        """
        digits = line[pos + 1:pos + 1 + width]
        # int(..., 16) alone would accept "0x1f", signs and padding.
        if len(digits) != width or not all(d in self._HEX_DIGITS for d in digits):
            raise ValueError(f"Invalid unicode escape at position {pos - 1}")
        code_point = int(digits, 16)
        if code_point > self._MAX_CODE_POINT:
            raise ValueError(f"Unicode escape out of range at position {pos - 1}")
        return chr(code_point)

    def _parse_literal(self, line: str, pos: int) -> Tuple[str, int]:
        """
        Parse a literal "..." with optional @lang or ^^<datatype>.

        Returns:
            The literal as ``"value"``, ``"value"@lang`` or
            ``"value"^^<datatype>`` (value unescaped) and the index just
            past it.

        Raises:
            ValueError: If the literal does not start with '"', is never
                closed, contains a malformed unicode escape, or is
                followed by a malformed language tag or datatype IRI.
        """
        if line[pos:pos + 1] != '"':
            raise ValueError(f"Expected '\"' at position {pos}")

        start = pos
        pos += 1
        value = []
        closed = False

        while pos < len(line):
            c = line[pos]
            if c == '"':
                pos += 1
                closed = True
                break
            if c == '\\':
                pos += 1
                if pos >= len(line):
                    break  # dangling backslash: reported as unterminated
                escaped = line[pos]
                if escaped == 'u':
                    value.append(self._decode_unicode_escape(line, pos, 4))
                    pos += 4
                elif escaped == 'U':
                    value.append(self._decode_unicode_escape(line, pos, 8))
                    pos += 8
                else:
                    value.append(self._SIMPLE_ESCAPES.get(escaped, escaped))
            else:
                value.append(c)
            pos += 1

        if not closed:
            raise ValueError(f"Unterminated literal starting at position {start}")

        string_value = ''.join(value)

        # Language tag
        if line.startswith('@', pos):
            lang_match = self.LANG_TAG_PATTERN.match(line, pos)
            if not lang_match:
                raise ValueError(f"Invalid language tag at position {pos}")
            return f'"{string_value}"@{lang_match.group(1)}', lang_match.end()

        # Datatype
        if line.startswith('^^', pos):
            datatype, pos = self._parse_iri(line, pos + 2)
            return f'"{string_value}"^^<{datatype}>', pos

        return f'"{string_value}"', pos

    def _parse_quoted_triple(self, line: str, pos: int) -> Tuple[Triple, int]:
        """
        Parse an RDF-Star quoted triple << s p o >>.

        Returns:
            The quoted Triple and the index just past '>>'.

        Raises:
            ValueError: If '<<' or '>>' is missing or a term is malformed.
        """
        if not line.startswith('<<', pos):
            raise ValueError(f"Expected '<<' at position {pos}")
        pos = self._skip_ws(line, pos + 2)

        subject, pos = self._parse_subject(line, pos)
        pos = self._skip_ws(line, pos)

        predicate, pos = self._parse_iri(line, pos)
        pos = self._skip_ws(line, pos)

        obj, pos = self._parse_object(line, pos)
        pos = self._skip_ws(line, pos)

        if not line.startswith('>>', pos):
            raise ValueError(f"Expected '>>' at position {pos}")
        pos += 2

        return self._build_triple(subject, predicate, obj), pos
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
class NTriplesSerializer:
    """
    Serializer for N-Triples format.

    Output is one triple per line, fully expanded (no prefixes).

    Literal terms are expected in the form ``"value"``, ``"value"@lang``
    or ``"value"^^<datatype>`` with the value stored unescaped; the
    value is escaped on output so every triple stays on a single line.
    """

    # What may follow a literal's closing quote: nothing, @lang, or ^^<iri>.
    _LITERAL_SUFFIX = re.compile(r'(?:@[a-zA-Z]+(?:-[a-zA-Z0-9]+)*|\^\^<[^>]*>)?')

    # Characters that must be escaped inside a quoted literal.
    _ESCAPES = str.maketrans({
        '\\': '\\\\',
        '"': '\\"',
        '\n': '\\n',
        '\r': '\\r',
        '\t': '\\t',
        '\b': '\\b',
        '\f': '\\f',
    })

    def serialize(self, triples: List[Triple]) -> str:
        """
        Serialize triples to N-Triples format.

        Args:
            triples: List of Triple objects

        Returns:
            N-Triples formatted string; one line per triple, each
            newline-terminated, or '' when there are no triples
        """
        lines = [self._format_triple(triple) for triple in triples]
        return '\n'.join(lines) + '\n' if lines else ''

    def serialize_to_file(self, triples: List[Triple], path: Union[str, Path]):
        """Serialize triples to an N-Triples file (UTF-8)."""
        content = self.serialize(triples)
        Path(path).write_text(content, encoding="utf-8")

    def _format_triple(self, triple: Triple) -> str:
        """Format a single triple as N-Triples line."""
        s = self._format_subject(triple)
        p = self._format_iri(triple.predicate)
        o = self._format_object(triple)
        return f"{s} {p} {o} ."

    def _format_subject(self, triple: Triple) -> str:
        """Format the subject; a quoted subject triple takes precedence."""
        if triple.subject_triple:
            return self._format_quoted_triple(triple.subject_triple)
        return self._format_term(triple.subject)

    def _format_object(self, triple: Triple) -> str:
        """Format the object; a quoted object triple takes precedence."""
        if triple.object_triple:
            return self._format_quoted_triple(triple.object_triple)
        return self._format_term(triple.object)

    def _format_quoted_triple(self, triple: Triple) -> str:
        """Format a quoted triple as << s p o >> (recursing for nesting)."""
        s = self._format_subject(triple)
        p = self._format_iri(triple.predicate)
        o = self._format_object(triple)
        return f"<< {s} {p} {o} >>"

    def _format_term(self, term: str) -> str:
        """Format a term (IRI, blank node, or literal)."""
        if term.startswith('_:'):
            return term
        if term.startswith('"'):
            return self._format_literal(term)
        return self._format_iri(term)

    def _format_iri(self, iri: str) -> str:
        """Format an IRI, adding angle brackets unless already present."""
        if iri.startswith('<') and iri.endswith('>'):
            return iri
        return f"<{iri}>"

    def _format_literal(self, literal: str) -> str:
        """
        Format a literal, escaping special characters in its value.

        A quoted literal is split at its last '"' into the lexical value
        and an optional ``@lang`` / ``^^<datatype>`` suffix; only the
        value is escaped.  Previously every quoted literal was returned
        untouched, so values containing quotes, backslashes or newlines
        produced invalid output.

        NOTE(review): assumes the stored value is unescaped; a producer
        that stores already-escaped text would be escaped twice - confirm
        against other Triple producers.
        """
        if not literal.startswith('"'):
            # Bare value: escape and quote it.
            return f'"{literal.translate(self._ESCAPES)}"'

        close = literal.rfind('"')
        suffix = literal[close + 1:]
        if close == 0 or not self._LITERAL_SUFFIX.fullmatch(suffix):
            # Not a recognisable "value"[suffix] shape: pass through as-is.
            return literal

        lexical = literal[1:close]
        return f'"{lexical.translate(self._ESCAPES)}"{suffix}'
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
# Convenience functions
|
|
408
|
+
def parse_ntriples(source: Union[str, Path]) -> ParsedDocument:
    """
    Parse N-Triples content or file.

    Args:
        source: A ``Path`` (always read as a file) or a string.  A string
            shorter than 500 characters that names an existing regular
            file is read as a file; any other string is parsed as
            N-Triples content.

    Returns:
        ParsedDocument with the parsed triples
    """
    parser = NTriplesParser()
    if isinstance(source, Path):
        return parser.parse_file(source)

    if isinstance(source, str) and len(source) < 500:
        try:
            # is_file() rather than exists(): "" resolves to the current
            # directory, which exists but cannot be read as a file.
            names_file = Path(source).is_file()
        except (OSError, ValueError):
            # Content that is not a usable path (over-long component,
            # embedded NUL, ...) is simply not a file name.
            names_file = False
        if names_file:
            return parser.parse_file(source)

    return parser.parse(source)
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
def serialize_ntriples(triples: List[Triple]) -> str:
    """Return the given triples rendered as an N-Triples document string."""
    return NTriplesSerializer().serialize(triples)
|