rdf-starbase 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,882 @@
1
+ """
2
+ Turtle and Turtle-Star Parser.
3
+
4
+ Implements parsing of Turtle (Terse RDF Triple Language) format
5
+ with RDF-Star extensions for quoted triples.
6
+
7
+ Grammar based on W3C Turtle specification:
8
+ https://www.w3.org/TR/turtle/
9
+
10
+ With Turtle-Star extensions:
11
+ https://w3c.github.io/rdf-star/cg-spec/editors_draft.html
12
+ """
13
+
14
+ from typing import Iterator, Optional, Tuple, List, Dict, Any, Union
15
+ from dataclasses import dataclass, field
16
+ from pathlib import Path
17
+ import re
18
+ from io import StringIO
19
+
20
+
21
@dataclass
class Triple:
    """
    One RDF statement: subject, predicate and object as plain strings.

    For RDF-Star, a quoted triple in subject or object position is stored in
    ``subject_triple`` / ``object_triple``; ``__str__`` then renders that
    nested triple instead of the corresponding string field.
    """

    subject: str
    predicate: str
    object: str
    # RDF-Star: set when the subject / object is itself a quoted triple.
    subject_triple: Optional["Triple"] = None
    object_triple: Optional["Triple"] = None

    def __str__(self) -> str:
        """Render as ``s p o``, wrapping quoted triples in ``<<...>>``."""
        if self.subject_triple:
            subject_text = f"<<{self.subject_triple}>>"
        else:
            subject_text = self.subject

        if self.object_triple:
            object_text = f"<<{self.object_triple}>>"
        else:
            object_text = self.object

        return f"{subject_text} {self.predicate} {object_text}"
35
+
36
+
37
@dataclass
class ParsedDocument:
    """Container for what one parser run produced from a Turtle document."""

    # Prefix label -> namespace IRI, as declared in the document.
    prefixes: Dict[str, str] = field(default_factory=dict)
    # Base IRI in effect at the end of the document, if one was declared.
    base: Optional[str] = None
    # Triples in the order the parser emitted them.
    triples: List[Triple] = field(default_factory=list)
43
+
44
+
45
class TurtleParser:
    """
    Parser for Turtle and Turtle-Star format.

    Supports:
    - @prefix and @base directives
    - PREFIX and BASE (SPARQL-style)
    - Prefixed names (foaf:name)
    - Full IRIs (<http://...>)
    - Literals with language tags ("hello"@en)
    - Literals with datatypes ("42"^^xsd:integer)
    - Blank nodes (_:b1, [ ])
    - Collections (a b c)
    - RDF-Star quoted triples (<< s p o >>)
    - Predicate-object lists (; separation)
    - Object lists (, separation)

    The parser is a hand-written recursive-descent scanner: the document is
    held in ``self.text`` and ``self.pos`` is the cursor.  Each ``_parse_*``
    method consumes the construct it is named after and leaves the cursor
    right behind it.  Syntax errors are raised as ``ValueError``.

    Terms are returned as plain strings: IRIs without angle brackets, blank
    nodes as ``_:label`` and literals as ``"value"``, ``"value"@lang`` or
    ``"value"^^<datatype>`` with escape sequences in ``value`` already
    decoded.

    Known simplification: relative IRIs are resolved by concatenating them
    to the base IRI, not by full RFC 3986 reference resolution.
    """

    # Standard prefixes that are commonly used; consulted as a fallback when
    # a document uses one of them without declaring it.
    STANDARD_PREFIXES = {
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
        "xsd": "http://www.w3.org/2001/XMLSchema#",
        "owl": "http://www.w3.org/2002/07/owl#",
        "foaf": "http://xmlns.com/foaf/0.1/",
        "dc": "http://purl.org/dc/elements/1.1/",
        "dcterms": "http://purl.org/dc/terms/",
        "skos": "http://www.w3.org/2004/02/skos/core#",
        "prov": "http://www.w3.org/ns/prov#",
    }

    # Token patterns (public class attributes; the scanner in this class
    # works character by character and does not use them).
    IRI_PATTERN = re.compile(r'<([^>]*)>')
    PREFIXED_NAME_PATTERN = re.compile(r'([a-zA-Z_][\w-]*)?:([a-zA-Z_][\w.-]*)?')
    BLANK_NODE_PATTERN = re.compile(r'_:([a-zA-Z_][\w.-]*)')
    STRING_PATTERN = re.compile(r'"""([^"]*(?:""?(?!"))?)*"""|\'\'\'([^\']*(?:\'\'?(?!\'))?)*\'\'\'|"([^"\\]*(?:\\.[^"\\]*)*)"|\'([^\'\\]*(?:\\.[^\'\\]*)*)\'')
    INTEGER_PATTERN = re.compile(r'[+-]?\d+')
    DECIMAL_PATTERN = re.compile(r'[+-]?\d*\.\d+')
    DOUBLE_PATTERN = re.compile(r'[+-]?(?:\d+\.\d*|\.\d+|\d+)[eE][+-]?\d+')
    BOOLEAN_PATTERN = re.compile(r'true|false', re.IGNORECASE)
    COMMENT_PATTERN = re.compile(r'#[^\n]*')

    # Internal helpers for the scanner.
    _RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
    _XSD_NS = "http://www.w3.org/2001/XMLSchema#"
    _DIGITS = '0123456789'
    # "scheme:" at the start of an IRI marks it as absolute (RFC 3986).
    _SCHEME_PATTERN = re.compile(r'[A-Za-z][A-Za-z0-9+.-]*:')
    _LANGTAG_PATTERN = re.compile(r'[A-Za-z][A-Za-z0-9-]*')
    _HEX_PATTERN = re.compile(r'[0-9A-Fa-f]+')
    # Single-character string escapes (Turtle ECHAR).
    _SIMPLE_ESCAPES = {
        't': '\t',
        'b': '\b',
        'n': '\n',
        'r': '\r',
        'f': '\f',
        '"': '"',
        "'": "'",
        '\\': '\\',
    }

    def __init__(self):
        self.prefixes: Dict[str, str] = {}
        self.base: Optional[str] = None
        self.blank_node_counter = 0
        self.text = ""
        self.pos = 0
        self.triples: List["Triple"] = []

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def parse(self, source: Union[str, Path, StringIO]) -> "ParsedDocument":
        """
        Parse a Turtle document.

        Args:
            source: Turtle content as string, a ``Path`` to read (UTF-8),
                or a text stream such as ``StringIO`` (anything with a
                ``read()`` method returning ``str``)

        Returns:
            ParsedDocument with prefixes, base, and triples

        Raises:
            TypeError: if the source does not yield text.
            ValueError: on a syntax error in the document.
        """
        if isinstance(source, Path):
            text = source.read_text(encoding="utf-8")
        elif hasattr(source, "read"):
            text = source.read()
        else:
            text = source
        if not isinstance(text, str):
            raise TypeError(
                f"Turtle source must be text, got {type(text).__name__}"
            )

        # Reset all per-document state so one parser can be reused.
        self.text = text
        self.pos = 0
        self.prefixes = {}
        self.base = None
        self.triples = []
        self.blank_node_counter = 0

        self._parse_document()

        return ParsedDocument(
            prefixes=self.prefixes.copy(),
            base=self.base,
            triples=self.triples.copy()
        )

    def parse_file(self, path: Union[str, Path]) -> "ParsedDocument":
        """Parse a Turtle file."""
        return self.parse(Path(path))

    # ------------------------------------------------------------------
    # Low-level scanning helpers
    # ------------------------------------------------------------------

    def _parse_document(self):
        """Parse the entire document: directives and triple statements."""
        while self.pos < len(self.text):
            self._skip_ws_and_comments()
            if self.pos >= len(self.text):
                break

            # Check for directives
            if self._peek_text("@prefix"):
                self._parse_prefix_directive()
            elif self._peek_text("@base"):
                self._parse_base_directive()
            # The SPARQL keywords must end at a token boundary, otherwise a
            # prefixed name such as "base:thing" would be taken for BASE.
            elif self._peek_keyword("PREFIX", case_insensitive=True):
                self._parse_sparql_prefix()
            elif self._peek_keyword("BASE", case_insensitive=True):
                self._parse_sparql_base()
            else:
                # Parse statement (triples)
                self._parse_statement()

    def _skip_ws_and_comments(self):
        """Skip whitespace and ``#`` comments."""
        while self.pos < len(self.text):
            c = self.text[self.pos]
            if c in ' \t\n\r':
                self.pos += 1
            elif c == '#':
                # Skip to end of line
                while self.pos < len(self.text) and self.text[self.pos] != '\n':
                    self.pos += 1
            else:
                break

    def _peek_text(self, text: str, case_insensitive: bool = False) -> bool:
        """Check if text appears at current position (nothing is consumed)."""
        end = self.pos + len(text)
        if end > len(self.text):
            return False
        actual = self.text[self.pos:end]
        if case_insensitive:
            return actual.lower() == text.lower()
        return actual == text

    def _peek_keyword(self, word: str, case_insensitive: bool = False) -> bool:
        """Like ``_peek_text``, but the match must not continue into a name."""
        if not self._peek_text(word, case_insensitive):
            return False
        end = self.pos + len(word)
        if end >= len(self.text):
            return True
        following = self.text[end]
        return not (following.isalnum() or following in '_-:')

    def _consume(self, text: str, case_insensitive: bool = False):
        """Consume expected text or raise ``ValueError``."""
        if not self._peek_text(text, case_insensitive):
            context = self.text[max(0, self.pos-20):self.pos+20]
            raise ValueError(f"Expected '{text}' at position {self.pos}, context: ...{context}...")
        self.pos += len(text)

    # ------------------------------------------------------------------
    # Directives
    # ------------------------------------------------------------------

    def _parse_prefix_directive(self):
        """Parse ``@prefix name: <iri> .`` and record the mapping."""
        self._consume("@prefix")
        self._skip_ws_and_comments()

        prefix = self._parse_prefix_name()
        self._skip_ws_and_comments()

        iri = self._parse_iri_ref()
        self._skip_ws_and_comments()

        self._consume(".")

        self.prefixes[prefix] = iri

    def _parse_sparql_prefix(self):
        """Parse ``PREFIX name: <iri>`` (SPARQL-style, no trailing period)."""
        self._consume("PREFIX", case_insensitive=True)
        self._skip_ws_and_comments()

        prefix = self._parse_prefix_name()
        self._skip_ws_and_comments()

        iri = self._parse_iri_ref()

        self.prefixes[prefix] = iri

    def _parse_base_directive(self):
        """Parse ``@base <iri> .`` and update the base IRI."""
        self._consume("@base")
        self._skip_ws_and_comments()

        self.base = self._parse_iri_ref()
        self._skip_ws_and_comments()

        self._consume(".")

    def _parse_sparql_base(self):
        """Parse ``BASE <iri>`` (SPARQL-style, no trailing period)."""
        self._consume("BASE", case_insensitive=True)
        self._skip_ws_and_comments()

        self.base = self._parse_iri_ref()

    def _parse_prefix_name(self) -> str:
        """Parse a prefix name (e.g., 'foaf:') and return it without the colon."""
        start = self.pos
        while self.pos < len(self.text) and self.text[self.pos] not in ': \t\n\r':
            self.pos += 1
        prefix = self.text[start:self.pos]
        self._consume(":")
        return prefix

    def _parse_iri_ref(self) -> str:
        """Parse an IRI reference (<...>) and return it without the brackets."""
        self._consume("<")
        start = self.pos
        while self.pos < len(self.text) and self.text[self.pos] != '>':
            self.pos += 1
        iri = self.text[start:self.pos]
        self._consume(">")

        # Resolve relative IRI against base.  Any "scheme:" prefix makes the
        # IRI absolute, not only http/https/urn/file.
        if self.base and not self._SCHEME_PATTERN.match(iri):
            iri = self.base + iri

        return iri

    # ------------------------------------------------------------------
    # Statements
    # ------------------------------------------------------------------

    def _make_triple(self, subject, predicate: str, obj) -> "Triple":
        """Build a Triple, routing quoted triples into the *_triple fields."""
        subject_is_triple = isinstance(subject, Triple)
        object_is_triple = isinstance(obj, Triple)
        return Triple(
            subject="" if subject_is_triple else subject,
            predicate=predicate,
            object="" if object_is_triple else obj,
            subject_triple=subject if subject_is_triple else None,
            object_triple=obj if object_is_triple else None,
        )

    def _parse_statement(self):
        """Parse one triple statement; the terminating '.' is optional."""
        subject = self._parse_subject()
        if subject is None:
            return

        self._skip_ws_and_comments()

        self._parse_predicate_object_list(subject)

        self._skip_ws_and_comments()
        if self.pos < len(self.text) and self.text[self.pos] == '.':
            self.pos += 1

    def _parse_subject(self) -> Optional[Union[str, "Triple"]]:
        """Parse a subject (IRI, blank node, collection or quoted triple).

        Returns None only when the input is exhausted.
        """
        self._skip_ws_and_comments()
        if self.pos >= len(self.text):
            return None

        # Check for quoted triple (RDF-Star); must precede the IRI check
        # because both start with '<'.
        if self._peek_text("<<"):
            return self._parse_quoted_triple()

        if self._peek_text("["):
            return self._parse_blank_node_property_list()

        if self._peek_text("_:"):
            return self._parse_blank_node_label()

        if self._peek_text("("):
            return self._parse_collection()

        return self._parse_iri_or_prefixed()

    def _parse_predicate_object_list(self, subject: Union[str, "Triple"]):
        """Parse predicate-object list (supports ; and ,)."""
        while True:
            self._skip_ws_and_comments()
            if self.pos >= len(self.text):
                break

            predicate = self._parse_predicate()
            if predicate is None:
                break

            self._skip_ws_and_comments()

            # Parse object list (comma-separated)
            self._parse_object_list(subject, predicate)

            self._skip_ws_and_comments()

            if self.pos >= len(self.text) or self.text[self.pos] != ';':
                break

            # Turtle allows repeated semicolons, so consume the whole run.
            while self.pos < len(self.text) and self.text[self.pos] == ';':
                self.pos += 1
                self._skip_ws_and_comments()

            # A trailing semicolon before '.' or ']' ends the list.  ':' must
            # NOT end it: it starts a predicate with the empty prefix.
            if self.pos < len(self.text) and self.text[self.pos] in '.]':
                break

    def _parse_predicate(self) -> Optional[str]:
        """Parse a predicate; returns None at a terminator/separator or EOF."""
        self._skip_ws_and_comments()
        if self.pos >= len(self.text):
            return None

        # Check for 'a' (shorthand for rdf:type)
        if self.text[self.pos] == 'a' and self.pos + 1 < len(self.text) and self.text[self.pos + 1] in ' \t\n\r':
            self.pos += 1
            return self._RDF_NS + "type"

        # Statement terminator or separator - no predicate here
        if self.text[self.pos] in '.;,]':
            return None

        return self._parse_iri_or_prefixed()

    def _parse_object_list(self, subject: Union[str, "Triple"], predicate: str):
        """Parse a comma-separated object list, emitting one triple per object."""
        while True:
            self._skip_ws_and_comments()

            obj = self._parse_object()
            if obj is None:
                break

            self.triples.append(self._make_triple(subject, predicate, obj))

            self._skip_ws_and_comments()

            # Check for more objects (,)
            if self.pos < len(self.text) and self.text[self.pos] == ',':
                self.pos += 1
                continue
            break

    def _parse_object(self) -> Optional[Union[str, "Triple"]]:
        """Parse an object (IRI, blank node, literal, collection or quoted triple).

        Returns None, without consuming anything, at end of input or when
        the next character is one of ``. ; , ]``.
        """
        self._skip_ws_and_comments()
        if self.pos >= len(self.text):
            return None

        c = self.text[self.pos]

        # Quoted triple (RDF-Star)
        if self._peek_text("<<"):
            return self._parse_quoted_triple()

        # Blank node property list
        if c == '[':
            return self._parse_blank_node_property_list()

        # Blank node label
        if self._peek_text("_:"):
            return self._parse_blank_node_label()

        # Collection
        if c == '(':
            return self._parse_collection()

        # Literal
        if c in '"\'':
            return self._parse_literal()

        # Numeric literals, including the ".5" form
        if c in '+-' or c in self._DIGITS:
            return self._parse_numeric()
        if c == '.' and self.pos + 1 < len(self.text) and self.text[self.pos + 1] in self._DIGITS:
            return self._parse_numeric()

        # Boolean; the keyword must end at a token boundary so that names
        # like "trueish:x" are still read as prefixed names.
        if self._peek_keyword("true") or self._peek_keyword("false"):
            return self._parse_boolean()

        # Otherwise IRI or prefixed name
        if c not in '.;,]':
            return self._parse_iri_or_prefixed()

        return None

    def _parse_iri_or_prefixed(self) -> str:
        """Parse an IRI (<...>) or prefixed name (prefix:local)."""
        if self.pos >= len(self.text):
            raise ValueError(f"Expected IRI or prefixed name at position {self.pos}")
        if self.text[self.pos] == '<':
            return self._parse_iri_ref()
        return self._parse_prefixed_name()

    def _parse_prefixed_name(self) -> str:
        """Parse a prefixed name (e.g., foaf:name) and expand it to an IRI."""
        start = self.pos

        # Parse prefix part
        while self.pos < len(self.text) and self.text[self.pos] not in ': \t\n\r.;,[]()':
            self.pos += 1
        prefix = self.text[start:self.pos]

        if self.pos >= len(self.text) or self.text[self.pos] != ':':
            # Handle 'a' as special case for rdf:type
            if prefix == 'a':
                return self._RDF_NS + "type"
            raise ValueError(f"Expected ':' in prefixed name at position {self.pos}")

        self.pos += 1  # Skip ':'

        # Parse local part
        local_start = self.pos
        while self.pos < len(self.text) and self.text[self.pos] not in ' \t\n\r.;,[]()<>"\'':
            self.pos += 1
        local = self.text[local_start:self.pos]

        # Expand prefix: document declarations win over the built-in table.
        if prefix in self.prefixes:
            return self.prefixes[prefix] + local
        if prefix in self.STANDARD_PREFIXES:
            return self.STANDARD_PREFIXES[prefix] + local
        if prefix == "":
            # Undeclared empty prefix: fall back to the base IRI if any.
            if self.base:
                return self.base + local
            return local
        # Unknown prefix - keep as-is for now
        return f"{prefix}:{local}"

    # ------------------------------------------------------------------
    # Blank nodes and collections
    # ------------------------------------------------------------------

    def _parse_blank_node_label(self) -> str:
        """Parse a blank node label (_:name)."""
        self._consume("_:")
        start = self.pos
        while self.pos < len(self.text) and self.text[self.pos] not in ' \t\n\r.;,[]()':
            self.pos += 1
        label = self.text[start:self.pos]
        return f"_:{label}"

    def _parse_blank_node_property_list(self) -> str:
        """Parse ``[ ... ]`` and return the generated blank node id."""
        self._consume("[")

        # Generate unique blank node ID
        self.blank_node_counter += 1
        bnode = f"_:b{self.blank_node_counter}"

        self._skip_ws_and_comments()

        # Check for empty blank node
        if self.pos < len(self.text) and self.text[self.pos] == ']':
            self.pos += 1
            return bnode

        # The properties inside the brackets describe the new blank node.
        self._parse_predicate_object_list(bnode)

        self._skip_ws_and_comments()
        self._consume("]")

        return bnode

    def _parse_collection(self) -> str:
        """Parse a collection ``( ... )`` into an rdf:first/rdf:rest list.

        Returns the head node of the list, or rdf:nil for ``()``.

        Raises:
            ValueError: if something that is not an object appears inside
                the parentheses, or the closing ``)`` is missing.
        """
        self._consume("(")
        self._skip_ws_and_comments()

        items = []
        while self.pos < len(self.text) and self.text[self.pos] != ')':
            item = self._parse_object()
            if item is None:
                # _parse_object consumed nothing; looping again would never
                # terminate, so report the stray character instead.
                found = self.text[self.pos] if self.pos < len(self.text) else "end of input"
                raise ValueError(
                    f"Unexpected '{found}' in collection at position {self.pos}"
                )
            items.append(item)
            self._skip_ws_and_comments()

        self._consume(")")

        rdf_first = self._RDF_NS + "first"
        rdf_rest = self._RDF_NS + "rest"
        rdf_nil = self._RDF_NS + "nil"

        if not items:
            return rdf_nil

        # Build collection as linked list: one fresh node per item, each
        # linked to its successor, the last one linked to rdf:nil.
        head = None
        prev = None
        for item in items:
            self.blank_node_counter += 1
            node = f"_:list{self.blank_node_counter}"

            if head is None:
                head = node
            if prev is not None:
                self.triples.append(Triple(prev, rdf_rest, node))

            self.triples.append(self._make_triple(node, rdf_first, item))
            prev = node

        self.triples.append(Triple(prev, rdf_rest, rdf_nil))
        return head

    # ------------------------------------------------------------------
    # Literals
    # ------------------------------------------------------------------

    def _read_escape(self) -> str:
        """Decode the escape sequence whose backslash is at the cursor.

        Handles the single-character escapes, ``\\uXXXX`` and
        ``\\UXXXXXXXX``.  An unknown escape yields the escaped character
        itself.
        """
        escape_start = self.pos
        self.pos += 1  # Skip the backslash
        if self.pos >= len(self.text):
            raise ValueError(f"Unterminated escape sequence at position {escape_start}")

        escaped = self.text[self.pos]
        self.pos += 1

        if escaped in 'uU':
            width = 4 if escaped == 'u' else 8
            hex_chars = self.text[self.pos:self.pos + width]
            if len(hex_chars) != width or not self._HEX_PATTERN.fullmatch(hex_chars):
                raise ValueError(f"Invalid unicode escape at position {escape_start}")
            self.pos += width
            # chr() raises ValueError itself for code points above U+10FFFF.
            return chr(int(hex_chars, 16))

        return self._SIMPLE_ESCAPES.get(escaped, escaped)

    def _parse_literal_suffix(self, string_value: str) -> str:
        """Attach an optional ``@lang`` or ``^^datatype`` to a string value."""
        if self._peek_text("@"):
            self.pos += 1
            match = self._LANGTAG_PATTERN.match(self.text, self.pos)
            if match is None:
                raise ValueError(f"Expected language tag at position {self.pos}")
            self.pos = match.end()
            return f'"{string_value}"@{match.group()}'

        if self._peek_text("^^"):
            self.pos += 2
            datatype = self._parse_iri_or_prefixed()
            return f'"{string_value}"^^<{datatype}>'

        return f'"{string_value}"'

    def _parse_literal(self) -> str:
        """Parse a quoted string literal with optional language tag/datatype.

        Raises:
            ValueError: if the string is not closed or contains a malformed
                escape sequence.
        """
        # Check for long string (triple quotes)
        if self._peek_text('"""'):
            return self._parse_long_string('"""')
        if self._peek_text("'''"):
            return self._parse_long_string("'''")

        # Regular string
        literal_start = self.pos
        quote = self.text[self.pos]
        self.pos += 1

        value = []
        while self.pos < len(self.text):
            c = self.text[self.pos]
            if c == quote:
                self.pos += 1
                return self._parse_literal_suffix(''.join(value))
            if c == '\\':
                value.append(self._read_escape())
            else:
                value.append(c)
                self.pos += 1

        raise ValueError(f"Unterminated string literal starting at position {literal_start}")

    def _parse_long_string(self, delimiter: str) -> str:
        """Parse a long string (triple-quoted), decoding escapes.

        Raises:
            ValueError: if the closing delimiter is missing.
        """
        literal_start = self.pos
        self._consume(delimiter)

        value = []
        while self.pos < len(self.text):
            if self._peek_text(delimiter):
                self.pos += len(delimiter)
                return self._parse_literal_suffix(''.join(value))
            if self.text[self.pos] == '\\':
                value.append(self._read_escape())
            else:
                value.append(self.text[self.pos])
                self.pos += 1

        raise ValueError(f"Unterminated long string starting at position {literal_start}")

    def _parse_numeric(self) -> str:
        """Parse a numeric literal into an xsd:integer/decimal/double term.

        A '.' is part of the number only when digits (or, after integer
        digits, an exponent) follow it; otherwise it is left in place as the
        statement terminator, so ``42.`` yields the integer ``42``.

        Raises:
            ValueError: if no digits are found.
        """
        text = self.text
        length = len(text)
        digits = self._DIGITS

        def skip_digits(index: int) -> int:
            while index < length and text[index] in digits:
                index += 1
            return index

        def skip_exponent(index: int) -> int:
            """Return the end of a complete exponent at index, else index."""
            if index < length and text[index] in 'eE':
                after_sign = index + 1
                if after_sign < length and text[after_sign] in '+-':
                    after_sign += 1
                end = skip_digits(after_sign)
                if end > after_sign:
                    return end
            return index

        start = self.pos
        first_digit = start + 1 if text[start] in '+-' else start
        int_end = skip_digits(first_digit)
        end = int_end
        datatype = "integer"

        if int_end < length and text[int_end] == '.':
            frac_start = int_end + 1
            frac_end = skip_digits(frac_start)
            exp_end = skip_exponent(frac_end)
            has_mantissa = frac_end > frac_start or int_end > first_digit
            if exp_end > frac_end and has_mantissa:
                end, datatype = exp_end, "double"
            elif frac_end > frac_start:
                end, datatype = frac_end, "decimal"
            # else: the '.' belongs to the statement, not to the number
        else:
            exp_end = skip_exponent(int_end)
            if exp_end > int_end and int_end > first_digit:
                end, datatype = exp_end, "double"

        if end == first_digit:
            raise ValueError(f"Expected a number at position {start}")

        self.pos = end
        value = text[start:end]
        return f'"{value}"^^<{self._XSD_NS}{datatype}>'

    def _parse_boolean(self) -> str:
        """Parse a boolean literal (``true`` or ``false``)."""
        if self._peek_text("true"):
            self._consume("true")
            return '"true"^^<http://www.w3.org/2001/XMLSchema#boolean>'
        self._consume("false")
        return '"false"^^<http://www.w3.org/2001/XMLSchema#boolean>'

    # ------------------------------------------------------------------
    # RDF-Star
    # ------------------------------------------------------------------

    def _parse_quoted_triple(self) -> "Triple":
        """Parse an RDF-Star quoted triple << s p o >>.

        The quoted triple is returned, not added to ``self.triples``.

        Raises:
            ValueError: if subject, predicate or object is missing.
        """
        quote_start = self.pos
        self._consume("<<")
        self._skip_ws_and_comments()

        subject = self._parse_subject()
        self._skip_ws_and_comments()

        predicate = self._parse_predicate()
        self._skip_ws_and_comments()

        obj = self._parse_object()
        self._skip_ws_and_comments()

        if subject is None or predicate is None or obj is None:
            raise ValueError(
                f"Incomplete quoted triple starting at position {quote_start}"
            )

        self._consume(">>")

        return self._make_triple(subject, predicate, obj)
702
+
703
+
704
class TurtleSerializer:
    """
    Serializer for Turtle format.

    Converts triples to Turtle format with:
    - Prefix declarations
    - Predicate-object grouping
    - Proper escaping
    - RDF-Star quoted triples

    Terms are expected in the form the parser in this module produces: IRIs
    without angle brackets, blank nodes as ``_:label`` and literals as
    ``"value"`` / ``"value"@lang`` / ``"value"^^<datatype>`` where ``value``
    is the raw, unescaped text.  The serializer escapes the value on output.
    """

    _RDF_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"

    # Characters that must be escaped inside a double-quoted Turtle string.
    _LITERAL_ESCAPES = str.maketrans({
        '\\': '\\\\',
        '"': '\\"',
        '\n': '\\n',
        '\r': '\\r',
        '\t': '\\t',
        '\b': '\\b',
        '\f': '\\f',
    })

    def __init__(self, prefixes: Optional[Dict[str, str]] = None):
        """
        Initialize serializer with optional prefixes.

        Args:
            prefixes: Dict mapping prefix to namespace IRI.  The dict is
                copied, so later ``add_prefix`` calls do not modify the
                caller's object.
        """
        self.prefixes: Dict[str, str] = dict(prefixes) if prefixes else {}
        self._reverse_prefixes: Dict[str, str] = {}
        self._update_reverse_prefixes()

    def _update_reverse_prefixes(self):
        """Build reverse (namespace -> prefix) lookup."""
        self._reverse_prefixes = {v: k for k, v in self.prefixes.items()}

    def add_prefix(self, prefix: str, namespace: str):
        """Add a prefix mapping."""
        self.prefixes[prefix] = namespace
        self._reverse_prefixes[namespace] = prefix

    def serialize(self, triples: List["Triple"], base: Optional[str] = None) -> str:
        """
        Serialize triples to Turtle format.

        Triples are grouped by subject and, within a subject, by predicate;
        groups keep the order in which they first appear in ``triples``.

        Args:
            triples: List of Triple objects
            base: Optional base IRI

        Returns:
            Turtle formatted string
        """
        lines: List[str] = []

        # Write base if provided
        if base:
            lines.append(f"@base <{base}> .")
            lines.append("")

        # Write prefixes
        lines.extend(
            f"@prefix {prefix}: <{namespace}> ."
            for prefix, namespace in sorted(self.prefixes.items())
        )
        if self.prefixes:
            lines.append("")

        # Group triples by subject (dicts preserve first-seen order)
        by_subject: Dict[str, list] = {}
        for triple in triples:
            by_subject.setdefault(self._subject_key(triple), []).append(triple)

        for subject_triples in by_subject.values():
            # All triples of a group share the subject; format it once.
            lines.append(f"{self._format_subject(subject_triples[0])}")

            # Group by predicate
            by_predicate: Dict[str, list] = {}
            for triple in subject_triples:
                by_predicate.setdefault(triple.predicate, []).append(triple)

            last_index = len(by_predicate) - 1
            for i, (pred, pred_triples) in enumerate(by_predicate.items()):
                if pred == self._RDF_TYPE:
                    pred_str = "a"
                else:
                    pred_str = self._compress_iri(pred)

                objects_str = " , ".join(self._format_object(t) for t in pred_triples)

                # ';' continues the subject, '.' closes it.
                if i < last_index:
                    lines.append(f" {pred_str} {objects_str} ;")
                else:
                    lines.append(f" {pred_str} {objects_str} .")

            lines.append("")

        return "\n".join(lines)

    def serialize_to_file(self, triples: List["Triple"], path: Union[str, Path],
                          base: Optional[str] = None):
        """Serialize triples to a Turtle file (written as UTF-8)."""
        content = self.serialize(triples, base)
        Path(path).write_text(content, encoding="utf-8")

    def _subject_key(self, triple: "Triple") -> str:
        """Get a key for grouping by subject."""
        if triple.subject_triple:
            return f"<<{triple.subject_triple}>>"
        return triple.subject

    def _format_subject(self, triple: "Triple") -> str:
        """Format the subject of a triple."""
        if triple.subject_triple:
            return self._format_quoted_triple(triple.subject_triple)
        return self._compress_iri(triple.subject)

    def _format_object(self, triple: "Triple") -> str:
        """Format the object of a triple."""
        if triple.object_triple:
            return self._format_quoted_triple(triple.object_triple)
        return self._format_term(triple.object)

    def _format_quoted_triple(self, triple: "Triple") -> str:
        """Format a quoted triple as ``<< s p o >>`` (recursing as needed)."""
        s = self._format_subject(triple)
        p = self._compress_iri(triple.predicate)
        o = self._format_object(triple)
        return f"<< {s} {p} {o} >>"

    def _format_term(self, term: str) -> str:
        """Format a term (IRI, blank node, or literal)."""
        if term.startswith('_:'):
            return term
        if term.startswith('"'):
            return self._format_literal(term)
        return self._compress_iri(term)

    def _format_literal(self, term: str) -> str:
        """Escape the value part of a ``"value"[@lang|^^<dt>]`` term."""
        # Neither a language tag nor an IRI can contain '"', so the last
        # quote in the term is the one that closes the value.
        closing = term.rfind('"')
        if closing <= 0:
            return term  # no closing quote; emit unchanged
        value = term[1:closing]
        suffix = term[closing + 1:]
        return f'"{value.translate(self._LITERAL_ESCAPES)}"{suffix}'

    def _compress_iri(self, iri: str) -> str:
        """Compress IRI using prefixes if possible, else wrap it in <...>."""
        if iri.startswith('_:'):
            return iri

        for namespace, prefix in self._reverse_prefixes.items():
            if iri.startswith(namespace):
                local = iri[len(namespace):]
                # Only abbreviate when the remainder is a safe local name.
                if self._is_valid_local(local):
                    return f"{prefix}:{local}"

        return f"<{iri}>"

    def _is_valid_local(self, local: str) -> bool:
        """Check if a local name is valid for a prefixed name.

        Deliberately conservative: only letters, digits, '_' and '-' are
        accepted, and the name must not start with a digit or '-'.
        """
        if not local:
            return True
        if local[0].isdigit() or local[0] == '-':
            return False
        return all(c.isalnum() or c in '_-' for c in local)
868
+
869
+
870
+ # Convenience functions
871
def parse_turtle(source: Union[str, Path]) -> ParsedDocument:
    """Parse Turtle content or file.

    A ``Path`` is always read as a file.  A string is read as a file only
    when it is shorter than 500 characters and names an existing regular
    file; otherwise it is parsed as Turtle content.

    Args:
        source: Turtle text, or the path of a Turtle file.

    Returns:
        ParsedDocument with prefixes, base, and triples
    """
    parser = TurtleParser()
    if isinstance(source, Path):
        return parser.parse_file(source)

    if isinstance(source, str) and len(source) < 500:
        # is_file() rather than exists(): "" and "." resolve to the current
        # directory, which exists but cannot be read as a document.
        try:
            names_a_file = Path(source).is_file()
        except (OSError, ValueError):
            # Content that is not a legal path (e.g. embedded NUL).
            names_a_file = False
        if names_a_file:
            return parser.parse_file(source)

    return parser.parse(source)
877
+
878
+
879
def serialize_turtle(triples: List[Triple], prefixes: Optional[Dict[str, str]] = None) -> str:
    """Render ``triples`` as a Turtle string using the optional ``prefixes``.

    Shortcut for building a ``TurtleSerializer`` and calling ``serialize``
    without a base IRI.
    """
    return TurtleSerializer(prefixes).serialize(triples)