rdf-starbase 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rdf_starbase/__init__.py +57 -0
- rdf_starbase/ai_grounding.py +728 -0
- rdf_starbase/compat/__init__.py +26 -0
- rdf_starbase/compat/rdflib.py +1104 -0
- rdf_starbase/formats/__init__.py +29 -0
- rdf_starbase/formats/jsonld.py +488 -0
- rdf_starbase/formats/ntriples.py +419 -0
- rdf_starbase/formats/rdfxml.py +434 -0
- rdf_starbase/formats/turtle.py +882 -0
- rdf_starbase/models.py +92 -0
- rdf_starbase/registry.py +540 -0
- rdf_starbase/repositories.py +407 -0
- rdf_starbase/repository_api.py +739 -0
- rdf_starbase/sparql/__init__.py +35 -0
- rdf_starbase/sparql/ast.py +910 -0
- rdf_starbase/sparql/executor.py +1925 -0
- rdf_starbase/sparql/parser.py +1716 -0
- rdf_starbase/storage/__init__.py +44 -0
- rdf_starbase/storage/executor.py +1914 -0
- rdf_starbase/storage/facts.py +850 -0
- rdf_starbase/storage/lsm.py +531 -0
- rdf_starbase/storage/persistence.py +338 -0
- rdf_starbase/storage/quoted_triples.py +292 -0
- rdf_starbase/storage/reasoner.py +1035 -0
- rdf_starbase/storage/terms.py +628 -0
- rdf_starbase/store.py +1049 -0
- rdf_starbase/store_legacy.py +748 -0
- rdf_starbase/web.py +568 -0
- rdf_starbase-0.1.0.dist-info/METADATA +706 -0
- rdf_starbase-0.1.0.dist-info/RECORD +31 -0
- rdf_starbase-0.1.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,882 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Turtle and Turtle-Star Parser.
|
|
3
|
+
|
|
4
|
+
Implements parsing of Turtle (Terse RDF Triple Language) format
|
|
5
|
+
with RDF-Star extensions for quoted triples.
|
|
6
|
+
|
|
7
|
+
Grammar based on W3C Turtle specification:
|
|
8
|
+
https://www.w3.org/TR/turtle/
|
|
9
|
+
|
|
10
|
+
With Turtle-Star extensions:
|
|
11
|
+
https://w3c.github.io/rdf-star/cg-spec/editors_draft.html
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from typing import Iterator, Optional, Tuple, List, Dict, Any, Union
|
|
15
|
+
from dataclasses import dataclass, field
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
import re
|
|
18
|
+
from io import StringIO
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class Triple:
    """
    One parsed RDF statement.

    ``subject``, ``predicate`` and ``object`` hold the terms as strings.
    For RDF-Star, a quoted triple used in subject or object position is
    stored in ``subject_triple`` / ``object_triple`` instead of the
    corresponding string field.
    """
    subject: str
    predicate: str
    object: str
    # RDF-Star: set when the subject / object is itself a quoted triple.
    subject_triple: Optional["Triple"] = None
    object_triple: Optional["Triple"] = None

    def __str__(self) -> str:
        """Render as ``s p o``, wrapping quoted triples in ``<<...>>``."""
        if self.subject_triple:
            subject_text = f"<<{self.subject_triple}>>"
        else:
            subject_text = self.subject

        if self.object_triple:
            object_text = f"<<{self.object_triple}>>"
        else:
            object_text = self.object

        return f"{subject_text} {self.predicate} {object_text}"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class ParsedDocument:
    """
    Outcome of parsing one Turtle document.

    Bundles the prefix declarations, the base IRI (if any) and the
    triples collected by the parser.
    """
    # Prefix label -> namespace IRI.
    prefixes: Dict[str, str] = field(default_factory=dict)
    # Base IRI of the document, or None when none was set.
    base: Optional[str] = None
    # Triples in the order they were collected.
    triples: List["Triple"] = field(default_factory=list)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class TurtleParser:
    """
    Recursive-descent parser for Turtle and Turtle-Star documents.

    Supports:
    - @prefix and @base directives
    - PREFIX and BASE (SPARQL-style)
    - Prefixed names (foaf:name)
    - Full IRIs (<http://...>)
    - Literals with language tags ("hello"@en)
    - Literals with datatypes ("42"^^xsd:integer)
    - Blank nodes (_:b1, [ ])
    - Collections (a b c)
    - RDF-Star quoted triples (<< s p o >>)
    - Predicate-object lists (; separation)
    - Object lists (, separation)

    The parser keeps a cursor (``pos``) into the document text (``text``)
    and appends every recognised triple to ``triples``. Terms are returned
    as strings: IRIs are bare, blank nodes are ``_:label`` and literals are
    ``"value"``, ``"value"@lang`` or ``"value"^^<datatype>`` with the value
    already unescaped. Syntax errors are reported as ``ValueError``.
    """

    # Standard prefixes that are commonly used
    STANDARD_PREFIXES = {
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
        "xsd": "http://www.w3.org/2001/XMLSchema#",
        "owl": "http://www.w3.org/2002/07/owl#",
        "foaf": "http://xmlns.com/foaf/0.1/",
        "dc": "http://purl.org/dc/elements/1.1/",
        "dcterms": "http://purl.org/dc/terms/",
        "skos": "http://www.w3.org/2004/02/skos/core#",
        "prov": "http://www.w3.org/ns/prov#",
    }

    # Token patterns
    IRI_PATTERN = re.compile(r'<([^>]*)>')
    PREFIXED_NAME_PATTERN = re.compile(r'([a-zA-Z_][\w-]*)?:([a-zA-Z_][\w.-]*)?')
    BLANK_NODE_PATTERN = re.compile(r'_:([a-zA-Z_][\w.-]*)')
    STRING_PATTERN = re.compile(r'"""([^"]*(?:""?(?!"))?)*"""|\'\'\'([^\']*(?:\'\'?(?!\'))?)*\'\'\'|"([^"\\]*(?:\\.[^"\\]*)*)"|\'([^\'\\]*(?:\\.[^\'\\]*)*)\'')
    INTEGER_PATTERN = re.compile(r'[+-]?\d+')
    DECIMAL_PATTERN = re.compile(r'[+-]?\d*\.\d+')
    DOUBLE_PATTERN = re.compile(r'[+-]?(?:\d+\.\d*|\.\d+|\d+)[eE][+-]?\d+')
    BOOLEAN_PATTERN = re.compile(r'true|false', re.IGNORECASE)
    COMMENT_PATTERN = re.compile(r'#[^\n]*')

    # Private helpers for the fixes below.
    _SCHEME_PATTERN = re.compile(r'[A-Za-z][A-Za-z0-9+.\-]*:')
    _LANGTAG_PATTERN = re.compile(r'[a-zA-Z]+(?:-[a-zA-Z0-9]+)*')
    _STRING_ESCAPES = {'n': '\n', 't': '\t', 'r': '\r', 'b': '\b', 'f': '\f'}
    _HEX_DIGITS = '0123456789abcdefABCDEF'
    _NAME_STOP_CHARS = ' \t\n\r.;,[]()<>"\''
    _RDF_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
    _XSD = "http://www.w3.org/2001/XMLSchema#"

    def __init__(self):
        self.prefixes: Dict[str, str] = {}
        self.base: Optional[str] = None
        self.blank_node_counter = 0
        self.text = ""
        self.pos = 0
        self.triples: List["Triple"] = []

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def parse(self, source: Union[str, Path, StringIO]) -> "ParsedDocument":
        """
        Parse a Turtle document.

        Args:
            source: Turtle content as string, a ``Path`` to read, or a
                file-like object exposing ``read()`` (e.g. ``StringIO``).

        Returns:
            ParsedDocument with prefixes, base, and triples

        Raises:
            ValueError: if the document is not well-formed.
        """
        if isinstance(source, Path):
            self.text = source.read_text(encoding="utf-8")
        elif hasattr(source, "read"):
            data = source.read()
            # Binary streams are decoded like files read from disk.
            self.text = data.decode("utf-8") if isinstance(data, bytes) else data
        else:
            self.text = source

        # Reset all per-document state so a parser instance can be reused.
        self.pos = 0
        self.prefixes = {}
        self.base = None
        self.triples = []
        self.blank_node_counter = 0

        self._parse_document()

        return ParsedDocument(
            prefixes=self.prefixes.copy(),
            base=self.base,
            triples=self.triples.copy()
        )

    def parse_file(self, path: Union[str, Path]) -> "ParsedDocument":
        """Parse a Turtle file."""
        return self.parse(Path(path))

    # ------------------------------------------------------------------
    # Low-level cursor helpers
    # ------------------------------------------------------------------

    def _parse_document(self):
        """Parse the entire document: directives and triple statements."""
        while self.pos < len(self.text):
            self._skip_ws_and_comments()
            if self.pos >= len(self.text):
                break

            # Directives are matched as whole keywords so that prefixed
            # names such as ``base:thing`` are still parsed as statements.
            if self._peek_keyword("@prefix"):
                self._parse_prefix_directive()
            elif self._peek_keyword("@base"):
                self._parse_base_directive()
            elif self._peek_keyword("PREFIX", case_insensitive=True):
                self._parse_sparql_prefix()
            elif self._peek_keyword("BASE", case_insensitive=True):
                self._parse_sparql_base()
            else:
                self._parse_statement()

    def _skip_ws_and_comments(self):
        """Skip whitespace and ``#`` comments."""
        text = self.text
        size = len(text)
        while self.pos < size:
            c = text[self.pos]
            if c in ' \t\n\r':
                self.pos += 1
            elif c == '#':
                # Skip to end of line
                while self.pos < size and text[self.pos] != '\n':
                    self.pos += 1
            else:
                break

    def _peek_text(self, text: str, case_insensitive: bool = False) -> bool:
        """Check if text appears at current position."""
        end = self.pos + len(text)
        if end > len(self.text):
            return False
        actual = self.text[self.pos:end]
        if case_insensitive:
            return actual.lower() == text.lower()
        return actual == text

    def _peek_keyword(self, word: str, case_insensitive: bool = False) -> bool:
        """Like ``_peek_text`` but require a token boundary after ``word``.

        A following name character or ``:`` means the text is the start of
        a longer name (``falsehood:x``, ``base:x``), not the keyword.
        """
        if not self._peek_text(word, case_insensitive):
            return False
        end = self.pos + len(word)
        if end >= len(self.text):
            return True
        following = self.text[end]
        return not (following.isalnum() or following in '_-:')

    def _consume(self, text: str, case_insensitive: bool = False):
        """Consume expected text or raise ValueError."""
        if not self._peek_text(text, case_insensitive):
            context = self.text[max(0, self.pos-20):self.pos+20]
            raise ValueError(f"Expected '{text}' at position {self.pos}, context: ...{context}...")
        self.pos += len(text)

    def _scan_name(self) -> str:
        """Read a local name / blank node label starting at the cursor.

        A ``.`` is part of the name only when a letter or digit follows,
        so ``ex:v1.0`` is one name while the ``.`` in ``ex:a .`` or
        ``ex:a.`` remains the statement terminator.
        """
        text = self.text
        size = len(text)
        start = self.pos
        while self.pos < size:
            ch = text[self.pos]
            if ch == '.':
                nxt = text[self.pos + 1] if self.pos + 1 < size else ''
                if not nxt.isalnum():
                    break
            elif ch in self._NAME_STOP_CHARS:
                break
            self.pos += 1
        return text[start:self.pos]

    # ------------------------------------------------------------------
    # Directives
    # ------------------------------------------------------------------

    def _parse_prefix_directive(self):
        """Parse ``@prefix name: <iri> .``"""
        self._consume("@prefix")
        self._skip_ws_and_comments()

        prefix = self._parse_prefix_name()
        self._skip_ws_and_comments()

        iri = self._parse_iri_ref()
        self._skip_ws_and_comments()

        self._consume(".")

        self.prefixes[prefix] = iri

    def _parse_sparql_prefix(self):
        """Parse ``PREFIX name: <iri>`` (SPARQL-style, no trailing period)."""
        self._consume("PREFIX", case_insensitive=True)
        self._skip_ws_and_comments()

        prefix = self._parse_prefix_name()
        self._skip_ws_and_comments()

        iri = self._parse_iri_ref()

        self.prefixes[prefix] = iri

    def _parse_base_directive(self):
        """Parse ``@base <iri> .``"""
        self._consume("@base")
        self._skip_ws_and_comments()

        self.base = self._parse_iri_ref()
        self._skip_ws_and_comments()

        self._consume(".")

    def _parse_sparql_base(self):
        """Parse ``BASE <iri>`` (SPARQL-style, no trailing period)."""
        self._consume("BASE", case_insensitive=True)
        self._skip_ws_and_comments()

        self.base = self._parse_iri_ref()

    def _parse_prefix_name(self) -> str:
        """Parse a prefix label including its colon (e.g., 'foaf:')."""
        start = self.pos
        while self.pos < len(self.text) and self.text[self.pos] not in ': \t\n\r':
            self.pos += 1
        prefix = self.text[start:self.pos]
        self._consume(":")
        return prefix

    def _parse_iri_ref(self) -> str:
        """Parse an IRI reference (<...>) and resolve it against the base.

        Resolution is plain concatenation with ``self.base``; an IRI that
        starts with any URI scheme (``scheme:``) is treated as absolute.
        """
        self._consume("<")
        end = self.text.find('>', self.pos)
        if end == -1:
            # Unterminated: move to EOF so _consume reports the usual error.
            self.pos = len(self.text)
            self._consume(">")
        iri = self.text[self.pos:end]
        self.pos = end + 1

        if self.base and not self._SCHEME_PATTERN.match(iri):
            iri = self.base + iri

        return iri

    # ------------------------------------------------------------------
    # Statements
    # ------------------------------------------------------------------

    def _parse_statement(self):
        """Parse one triple statement (subject plus predicate-object list)."""
        subject = self._parse_subject()
        if subject is None:
            return

        self._skip_ws_and_comments()

        self._parse_predicate_object_list(subject)

        self._skip_ws_and_comments()
        # The terminating period is consumed when present (lenient).
        if self.pos < len(self.text) and self.text[self.pos] == '.':
            self.pos += 1

    def _parse_subject(self) -> Optional[Union[str, "Triple"]]:
        """Parse a subject (IRI, blank node, collection, or quoted triple)."""
        self._skip_ws_and_comments()
        if self.pos >= len(self.text):
            return None

        # Quoted triple (RDF-Star) must be tested before a plain '<' IRI.
        if self._peek_text("<<"):
            return self._parse_quoted_triple()

        if self._peek_text("["):
            return self._parse_blank_node_property_list()

        if self._peek_text("_:"):
            return self._parse_blank_node_label()

        if self._peek_text("("):
            return self._parse_collection()

        return self._parse_iri_or_prefixed()

    def _parse_predicate_object_list(self, subject: Union[str, "Triple"]):
        """Parse predicate-object list (supports ; and ,)."""
        while True:
            self._skip_ws_and_comments()
            if self.pos >= len(self.text):
                break

            predicate = self._parse_predicate()
            if predicate is None:
                break

            self._skip_ws_and_comments()

            self._parse_object_list(subject, predicate)

            self._skip_ws_and_comments()

            if not (self.pos < len(self.text) and self.text[self.pos] == ';'):
                break

            # Consume one or more ';'. The next loop iteration stops when
            # _parse_predicate sees '.' or ']' (trailing semicolon), and
            # otherwise parses the next predicate - including names with
            # the empty prefix such as ':name'.
            while self.pos < len(self.text) and self.text[self.pos] == ';':
                self.pos += 1
                self._skip_ws_and_comments()

    def _parse_predicate(self) -> Optional[str]:
        """Parse a predicate; return None at a terminator or end of input."""
        self._skip_ws_and_comments()
        if self.pos >= len(self.text):
            return None

        # 'a' (shorthand for rdf:type) followed by a token boundary.
        if (self.text[self.pos] == 'a' and self.pos + 1 < len(self.text)
                and self.text[self.pos + 1] in ' \t\n\r<[("\''):
            self.pos += 1
            return self._RDF_TYPE

        # Statement terminator or separator - no predicate here
        if self.text[self.pos] in '.;,]':
            return None

        return self._parse_iri_or_prefixed()

    @staticmethod
    def _make_triple(subject, predicate, obj) -> "Triple":
        """Build a Triple, routing quoted triples to the *_triple fields."""
        subject_is_quoted = isinstance(subject, Triple)
        object_is_quoted = isinstance(obj, Triple)
        return Triple(
            subject="" if subject_is_quoted else subject,
            predicate=predicate,
            object="" if object_is_quoted else obj,
            subject_triple=subject if subject_is_quoted else None,
            object_triple=obj if object_is_quoted else None,
        )

    def _parse_object_list(self, subject: Union[str, "Triple"], predicate: str):
        """Parse object list (comma-separated) and record one triple each."""
        while True:
            self._skip_ws_and_comments()

            obj = self._parse_object()
            if obj is None:
                break

            self.triples.append(self._make_triple(subject, predicate, obj))

            self._skip_ws_and_comments()

            # More objects follow only after a comma.
            if self.pos < len(self.text) and self.text[self.pos] == ',':
                self.pos += 1
                continue
            break

    def _parse_object(self) -> Optional[Union[str, "Triple"]]:
        """Parse an object (IRI, blank node, literal, collection, or quoted triple).

        Returns None, without advancing, at end of input or when the next
        character is one of ``. ; , ]``.
        """
        self._skip_ws_and_comments()
        if self.pos >= len(self.text):
            return None

        c = self.text[self.pos]

        if self._peek_text("<<"):
            return self._parse_quoted_triple()

        if c == '[':
            return self._parse_blank_node_property_list()

        if self._peek_text("_:"):
            return self._parse_blank_node_label()

        if c == '(':
            return self._parse_collection()

        if c in '"\'':
            return self._parse_literal()

        # Numeric literals, including the '.5' decimal form.
        starts_fraction = (c == '.' and self.pos + 1 < len(self.text)
                           and self.text[self.pos + 1].isdigit())
        if c in '+-' or c.isdigit() or starts_fraction:
            return self._parse_numeric()

        # Booleans are whole keywords; 'falsehood:x' is a prefixed name.
        if self._peek_keyword("true") or self._peek_keyword("false"):
            return self._parse_boolean()

        if c not in '.;,]':
            return self._parse_iri_or_prefixed()

        return None

    # ------------------------------------------------------------------
    # Terms
    # ------------------------------------------------------------------

    def _parse_iri_or_prefixed(self) -> str:
        """Parse an IRI (<...>) or prefixed name (prefix:local)."""
        if self.pos >= len(self.text):
            raise ValueError(f"Expected IRI or prefixed name at position {self.pos}")
        if self.text[self.pos] == '<':
            return self._parse_iri_ref()
        return self._parse_prefixed_name()

    def _parse_prefixed_name(self) -> str:
        """Parse a prefixed name (e.g., foaf:name) and expand its prefix.

        Lookup order: document prefixes, STANDARD_PREFIXES, then the empty
        prefix (resolved against the base when set). Unknown prefixes are
        returned unexpanded as ``prefix:local``.
        """
        start = self.pos

        while self.pos < len(self.text) and self.text[self.pos] not in ': \t\n\r.;,[]()':
            self.pos += 1
        prefix = self.text[start:self.pos]

        if self.pos >= len(self.text) or self.text[self.pos] != ':':
            # Handle 'a' as special case for rdf:type
            if prefix == 'a':
                return self._RDF_TYPE
            raise ValueError(f"Expected ':' in prefixed name at position {self.pos}")

        self.pos += 1  # Skip ':'

        local = self._scan_name()

        if prefix in self.prefixes:
            return self.prefixes[prefix] + local
        if prefix in self.STANDARD_PREFIXES:
            return self.STANDARD_PREFIXES[prefix] + local
        if prefix == "":
            # Empty prefix, use base or default
            if self.base:
                return self.base + local
            return local
        # Unknown prefix - keep as-is for now
        return f"{prefix}:{local}"

    def _parse_blank_node_label(self) -> str:
        """Parse a blank node label (_:name)."""
        self._consume("_:")
        label = self._scan_name()
        return f"_:{label}"

    def _parse_blank_node_property_list(self) -> str:
        """Parse ``[ ... ]`` and return the generated blank node id."""
        self._consume("[")

        # Generate unique blank node ID
        self.blank_node_counter += 1
        bnode = f"_:b{self.blank_node_counter}"

        self._skip_ws_and_comments()

        # Empty blank node: "[]"
        if self.pos < len(self.text) and self.text[self.pos] == ']':
            self.pos += 1
            return bnode

        self._parse_predicate_object_list(bnode)

        self._skip_ws_and_comments()
        self._consume("]")

        return bnode

    def _parse_collection(self) -> str:
        """Parse ``( ... )`` into an rdf:first / rdf:rest linked list.

        Returns the head node of the list, or rdf:nil for ``()``.

        Raises:
            ValueError: on a stray separator inside the collection or a
                missing closing parenthesis.
        """
        self._consume("(")
        self._skip_ws_and_comments()

        items = []
        while self.pos < len(self.text) and self.text[self.pos] != ')':
            item = self._parse_object()
            if item is None:
                # _parse_object did not advance; failing here avoids
                # spinning forever on input such as "( , )".
                raise ValueError(
                    f"Unexpected '{self.text[self.pos:self.pos + 1]}' in collection "
                    f"at position {self.pos}"
                )
            items.append(item)
            self._skip_ws_and_comments()

        self._consume(")")

        RDF_FIRST = "http://www.w3.org/1999/02/22-rdf-syntax-ns#first"
        RDF_REST = "http://www.w3.org/1999/02/22-rdf-syntax-ns#rest"
        RDF_NIL = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nil"

        if not items:
            return RDF_NIL

        head = None
        prev = None

        for item in items:
            self.blank_node_counter += 1
            node = f"_:list{self.blank_node_counter}"

            if head is None:
                head = node

            if prev is not None:
                self.triples.append(Triple(prev, RDF_REST, node))

            self.triples.append(self._make_triple(node, RDF_FIRST, item))

            prev = node

        self.triples.append(Triple(prev, RDF_REST, RDF_NIL))

        return head

    # ------------------------------------------------------------------
    # Literals
    # ------------------------------------------------------------------

    def _parse_literal(self) -> str:
        """Parse a quoted string literal with optional language tag or datatype."""
        # Long strings (triple quotes) must be tested first.
        if self._peek_text('"""'):
            return self._parse_long_string('"""')
        if self._peek_text("'''"):
            return self._parse_long_string("'''")

        quote = self.text[self.pos]
        self.pos += 1
        string_value = self._read_string_body(quote)
        return self._finish_literal(string_value)

    def _parse_long_string(self, delimiter: str) -> str:
        """Parse a long string (triple-quoted), processing escapes."""
        self._consume(delimiter)
        string_value = self._read_string_body(delimiter)
        return self._finish_literal(string_value)

    def _read_string_body(self, delimiter: str) -> str:
        """Read up to and including ``delimiter``; return the unescaped value."""
        text = self.text
        size = len(text)
        start = self.pos
        chunks = []
        while self.pos < size:
            if text.startswith(delimiter, self.pos):
                self.pos += len(delimiter)
                return ''.join(chunks)
            ch = text[self.pos]
            if ch == '\\':
                chunks.append(self._read_escape())
            else:
                chunks.append(ch)
                self.pos += 1
        raise ValueError(f"Unterminated string literal starting at position {start}")

    def _read_escape(self) -> str:
        """Decode one backslash escape at the cursor and return its character.

        Handles ``\\n \\t \\r \\b \\f``, ``\\uXXXX`` and ``\\UXXXXXXXX``;
        any other escaped character stands for itself.
        """
        self.pos += 1  # skip the backslash
        if self.pos >= len(self.text):
            raise ValueError(f"Incomplete escape sequence at position {self.pos}")
        code = self.text[self.pos]
        self.pos += 1
        if code in 'uU':
            width = 4 if code == 'u' else 8
            digits = self.text[self.pos:self.pos + width]
            if len(digits) != width or any(d not in self._HEX_DIGITS for d in digits):
                raise ValueError(f"Invalid unicode escape at position {self.pos}")
            self.pos += width
            return chr(int(digits, 16))
        return self._STRING_ESCAPES.get(code, code)

    def _finish_literal(self, string_value: str) -> str:
        """Attach an ``@lang`` tag or ``^^datatype`` found right after a string."""
        if self.pos < len(self.text) and self.text[self.pos] == '@':
            # Matching the tag grammar keeps ')' or '>>' out of the tag.
            match = self._LANGTAG_PATTERN.match(self.text, self.pos + 1)
            if match is None:
                raise ValueError(f"Expected language tag at position {self.pos + 1}")
            self.pos = match.end()
            return f'"{string_value}"@{match.group(0)}'
        if self._peek_text("^^"):
            self.pos += 2
            datatype = self._parse_iri_or_prefixed()
            return f'"{string_value}"^^<{datatype}>'

        return f'"{string_value}"'

    def _parse_numeric(self) -> str:
        """Parse a numeric literal as xsd:double, xsd:decimal or xsd:integer.

        A trailing ``.`` without digits after it is left in place, because
        it is the statement terminator (``:s :p 42.``).
        """
        candidates = (
            (self.DOUBLE_PATTERN, "double"),
            (self.DECIMAL_PATTERN, "decimal"),
            (self.INTEGER_PATTERN, "integer"),
        )
        for pattern, datatype in candidates:
            match = pattern.match(self.text, self.pos)
            if match is not None:
                self.pos = match.end()
                return f'"{match.group(0)}"^^<{self._XSD}{datatype}>'
        raise ValueError(f"Invalid numeric literal at position {self.pos}")

    def _parse_boolean(self) -> str:
        """Parse a boolean literal (``true`` or ``false``)."""
        if self._peek_text("true"):
            self._consume("true")
            return '"true"^^<http://www.w3.org/2001/XMLSchema#boolean>'
        self._consume("false")
        return '"false"^^<http://www.w3.org/2001/XMLSchema#boolean>'

    def _parse_quoted_triple(self) -> "Triple":
        """Parse an RDF-Star quoted triple << s p o >>.

        Raises:
            ValueError: if subject, predicate or object is missing, or the
                closing ``>>`` is absent.
        """
        start = self.pos
        self._consume("<<")
        self._skip_ws_and_comments()

        subject = self._parse_subject()
        self._skip_ws_and_comments()

        predicate = self._parse_predicate()
        self._skip_ws_and_comments()

        obj = self._parse_object()
        self._skip_ws_and_comments()

        if subject is None or predicate is None or obj is None:
            raise ValueError(f"Incomplete quoted triple starting at position {start}")

        self._consume(">>")

        return self._make_triple(subject, predicate, obj)
|
|
702
|
+
|
|
703
|
+
|
|
704
|
+
class TurtleSerializer:
    """
    Serializer for Turtle format.

    Converts triples to Turtle format with:
    - Prefix declarations
    - Predicate-object grouping
    - Proper escaping
    - RDF-Star quoted triples

    Terms are expected in the string forms used by ``Triple``: bare IRIs,
    ``_:label`` blank nodes and ``"value"...`` literals.
    """

    def __init__(self, prefixes: Optional[Dict[str, str]] = None):
        """
        Initialize serializer with optional prefixes.

        Args:
            prefixes: Dict mapping prefix to namespace IRI. The mapping is
                copied, so ``add_prefix`` never mutates the caller's dict.
        """
        self.prefixes: Dict[str, str] = dict(prefixes) if prefixes else {}
        self._reverse_prefixes: Dict[str, str] = {}
        self._update_reverse_prefixes()

    def _update_reverse_prefixes(self):
        """Rebuild the namespace -> prefix lookup from ``self.prefixes``."""
        self._reverse_prefixes = {v: k for k, v in self.prefixes.items()}

    def add_prefix(self, prefix: str, namespace: str):
        """Add (or re-bind) a prefix mapping."""
        self.prefixes[prefix] = namespace
        # Rebuild so a namespace previously bound to this prefix does not
        # linger in the reverse lookup, then let the new binding win when
        # several prefixes share one namespace.
        self._update_reverse_prefixes()
        self._reverse_prefixes[namespace] = prefix

    def serialize(self, triples: List["Triple"], base: Optional[str] = None) -> str:
        """
        Serialize triples to Turtle format.

        Triples are grouped by subject and then by predicate, keeping
        first-seen order; rdf:type is written as ``a``.

        Args:
            triples: List of Triple objects
            base: Optional base IRI

        Returns:
            Turtle formatted string
        """
        lines = []

        if base:
            lines.append(f"@base <{base}> .")
            lines.append("")

        for prefix, namespace in sorted(self.prefixes.items()):
            lines.append(f"@prefix {prefix}: <{namespace}> .")

        if self.prefixes:
            lines.append("")

        # Group triples by subject
        by_subject: Dict[str, List["Triple"]] = {}
        for triple in triples:
            by_subject.setdefault(self._subject_key(triple), []).append(triple)

        for subject_triples in by_subject.values():
            lines.append(f"{self._format_subject(subject_triples[0])}")

            # Group by predicate
            by_predicate: Dict[str, List["Triple"]] = {}
            for triple in subject_triples:
                by_predicate.setdefault(triple.predicate, []).append(triple)

            last_index = len(by_predicate) - 1
            for i, (pred, pred_triples) in enumerate(by_predicate.items()):
                if pred == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type":
                    pred_str = "a"
                else:
                    pred_str = self._compress_iri(pred)

                objects_str = " , ".join(self._format_object(t) for t in pred_triples)

                # ';' continues the subject, '.' ends the statement.
                terminator = ";" if i < last_index else "."
                lines.append(f" {pred_str} {objects_str} {terminator}")

            lines.append("")

        return "\n".join(lines)

    def serialize_to_file(self, triples: List["Triple"], path: Union[str, Path],
                          base: Optional[str] = None):
        """Serialize triples to a Turtle file (UTF-8)."""
        content = self.serialize(triples, base)
        Path(path).write_text(content, encoding="utf-8")

    def _subject_key(self, triple: "Triple") -> str:
        """Get a key for grouping by subject."""
        if triple.subject_triple:
            return f"<<{triple.subject_triple}>>"
        return triple.subject

    def _format_subject(self, triple: "Triple") -> str:
        """Format the subject of a triple."""
        if triple.subject_triple:
            return self._format_quoted_triple(triple.subject_triple)
        return self._compress_iri(triple.subject)

    def _format_object(self, triple: "Triple") -> str:
        """Format the object of a triple."""
        if triple.object_triple:
            return self._format_quoted_triple(triple.object_triple)
        return self._format_term(triple.object)

    def _format_quoted_triple(self, triple: "Triple") -> str:
        """Format a quoted triple as ``<< s p o >>`` (nested ones included)."""
        s = self._format_subject(triple)
        p = self._compress_iri(triple.predicate)
        o = self._format_object(triple)
        return f"<< {s} {p} {o} >>"

    def _format_term(self, term: str) -> str:
        """Format a term (IRI, blank node, or literal)."""
        if term.startswith('_:'):
            return term
        if term.startswith('"'):
            return self._escape_literal(term)
        return self._compress_iri(term)

    @staticmethod
    def _escape_literal(term: str) -> str:
        """Make a ``"value"[@lang|^^<dt>]`` literal safe to write.

        Only characters that can never appear raw in a short Turtle string
        are escaped: newlines, carriage returns, interior quotes not
        already preceded by a backslash, and a dangling final backslash.
        Existing backslash sequences are left untouched.
        NOTE(review): a lone backslash in the value cannot be told apart
        from an existing escape, so it is left as-is - confirm producers.
        """
        close = term.rfind('"')
        if close <= 0:
            return term  # no closing quote to anchor on; leave unchanged
        body, suffix = term[1:close], term[close:]

        out = []
        after_backslash = False
        for ch in body:
            if after_backslash:
                out.append(ch)
                after_backslash = False
            elif ch == '\\':
                out.append(ch)
                after_backslash = True
            elif ch == '"':
                out.append('\\"')
            elif ch == '\n':
                out.append('\\n')
            elif ch == '\r':
                out.append('\\r')
            else:
                out.append(ch)
        if after_backslash:
            # A trailing lone backslash would escape the closing quote.
            out.append('\\')
        return '"' + ''.join(out) + suffix

    def _compress_iri(self, iri: str) -> str:
        """Compress IRI using prefixes if possible, else wrap it in <>.

        The longest matching namespace whose remainder is a valid local
        name wins, so the result does not depend on dict order.
        """
        if iri.startswith('_:'):
            return iri

        candidates = sorted(
            self._reverse_prefixes.items(),
            key=lambda item: len(item[0]),
            reverse=True,
        )
        for namespace, prefix in candidates:
            if iri.startswith(namespace):
                local = iri[len(namespace):]
                if self._is_valid_local(local):
                    return f"{prefix}:{local}"

        return f"<{iri}>"

    def _is_valid_local(self, local: str) -> bool:
        """Check if a local name is valid for a prefixed name.

        Accepts the empty string, and names that do not start with a digit
        and consist only of alphanumerics, ``_`` and ``-``.
        """
        if not local:
            return True
        if local[0].isdigit():
            return False
        return all(c.isalnum() or c in '_-' for c in local)
|
|
868
|
+
|
|
869
|
+
|
|
870
|
+
# Convenience functions
|
|
871
|
+
def parse_turtle(source: Union[str, Path]) -> "ParsedDocument":
    """
    Parse Turtle content or a Turtle file.

    Args:
        source: A ``Path``, or a string. A non-empty string shorter than
            500 characters that names an existing regular file is read
            from disk; any other string is parsed as Turtle text.

    Returns:
        ParsedDocument with prefixes, base, and triples

    Raises:
        ValueError: if the Turtle content is not well-formed.
    """
    parser = TurtleParser()
    if isinstance(source, Path):
        return parser.parse_file(source)

    if isinstance(source, str) and source and len(source) < 500:
        # is_file() rather than exists(): "" resolves to the current
        # directory, and directories cannot be read as documents.
        try:
            is_file = Path(source).is_file()
        except (OSError, ValueError):
            # Text that is not a representable path is Turtle content.
            is_file = False
        if is_file:
            return parser.parse_file(source)

    return parser.parse(source)
|
|
877
|
+
|
|
878
|
+
|
|
879
|
+
def serialize_turtle(triples: List["Triple"], prefixes: Optional[Dict[str, str]] = None) -> str:
    """
    Render ``triples`` as a Turtle document.

    Convenience wrapper: builds a ``TurtleSerializer`` with the optional
    ``prefixes`` mapping and returns its ``serialize`` output (no base IRI).
    """
    return TurtleSerializer(prefixes).serialize(triples)
|