rdf-starbase 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1104 @@
1
+ """
2
+ rdflib Compatibility Layer.
3
+
4
+ Provides a drop-in replacement for rdflib's Graph class, backed by RDF-StarBase's
5
+ high-performance Polars engine.
6
+
7
+ Usage:
8
+ # Instead of:
9
+ # from rdflib import Graph, URIRef, Literal, Namespace
10
+
11
+ # Use:
12
+ from rdf_starbase.compat.rdflib import Graph, URIRef, Literal, Namespace, RDF
13
+
14
+ g = Graph()
15
+ g.parse("data.ttl", format="turtle")
16
+
17
+ for s, p, o in g.triples((None, RDF.type, None)):
18
+ print(s)
19
+
20
+ g.serialize(destination="out.ttl", format="turtle")
21
+
22
+ This module aims to be API-compatible with rdflib while providing:
23
+ - 10-50x faster parsing
24
+ - 10-100x faster queries
25
+ - Native RDF-Star support
26
+ - Built-in provenance tracking
27
+ """
28
+
29
+ from typing import Optional, Iterator, Tuple, Any, Union, IO
30
+ from pathlib import Path
31
+ from io import StringIO, BytesIO
32
+ import re
33
+
34
+
35
+ # =============================================================================
36
+ # RDF Term Classes (rdflib-compatible)
37
+ # =============================================================================
38
+
39
class Identifier:
    """Common ancestor of every RDF term class in this module."""

    # No per-instance attributes are declared here, so subclasses stay free
    # to define their own slots (or to combine with ``str``).
    __slots__ = tuple()
42
+
43
+
44
class URIRef(Identifier, str):
    """
    An RDF URI Reference.

    Compatible with rdflib.URIRef. Instances are plain ``str`` objects, so
    they hash and compare exactly like the IRI text they hold.
    """
    __slots__ = ()

    def __new__(cls, value: str, base: Optional[str] = None):
        """
        Create a URIRef, optionally resolving ``value`` against ``base``.

        Args:
            value: The IRI text (absolute, or relative when ``base`` is given)
            base: Optional base IRI that relative values are appended to

        Resolution is deliberately simple: a value that already starts with
        a URI scheme (``http:``, ``mailto:``, ``ftp:``, ...) is kept as is;
        anything else is appended to ``base`` with a single ``/`` between.
        """
        if base is not None:
            # Any RFC 3986 scheme marks an absolute IRI, not only the
            # handful of schemes that used to be hard-coded here.
            if not re.match(r'[A-Za-z][A-Za-z0-9+.\-]*:', value):
                if base.endswith('/'):
                    value = base + value
                else:
                    value = base + '/' + value
        return str.__new__(cls, value)

    def __repr__(self):
        return f"URIRef({super().__repr__()})"

    def __hash__(self):
        return str.__hash__(self)

    def __eq__(self, other):
        # Plain string comparison for every operand type; the former
        # isinstance() branch returned exactly the same expression.
        return str.__eq__(self, other)

    def n3(self, namespace_manager=None) -> str:
        """Return N3/Turtle representation (``namespace_manager`` is unused)."""
        # TODO: Use namespace_manager for prefix compression
        return f"<{self}>"

    def toPython(self) -> str:
        """Return Python string representation."""
        return str(self)
82
+
83
+
84
class Literal(Identifier):
    """
    An RDF Literal.

    Compatible with rdflib.Literal. Holds the original Python value plus an
    optional language tag or datatype IRI (never both).
    """
    __slots__ = ('_value', '_datatype', '_language')

    def __init__(
        self,
        value: Any,
        lang: Optional[str] = None,
        datatype: Optional[URIRef] = None
    ):
        """
        Args:
            value: The Python value or lexical form of the literal
            lang: Optional language tag (stored lower-cased)
            datatype: Optional datatype IRI; inferred from the Python type
                of ``value`` when neither ``lang`` nor ``datatype`` is given

        Raises:
            TypeError: If both ``lang`` and ``datatype`` are supplied
        """
        if lang is not None and datatype is not None:
            raise TypeError("Literal cannot have both lang and datatype")

        self._value = value
        self._language = lang.lower() if lang else None

        if datatype is not None:
            self._datatype = URIRef(datatype) if not isinstance(datatype, URIRef) else datatype
        elif lang is not None:
            self._datatype = None  # Language-tagged literals have no datatype
        elif isinstance(value, bool):
            # bool must be tested before int: bool is a subclass of int
            self._datatype = XSD.boolean
        elif isinstance(value, int):
            self._datatype = XSD.integer
        elif isinstance(value, float):
            self._datatype = XSD.double
        else:
            self._datatype = XSD.string

    @property
    def value(self) -> Any:
        return self._value

    @property
    def language(self) -> Optional[str]:
        return self._language

    @property
    def datatype(self) -> Optional[URIRef]:
        return self._datatype

    def __str__(self):
        return str(self._value)

    def __repr__(self):
        if self._language:
            return f"Literal({self._value!r}, lang={self._language!r})"
        elif self._datatype and self._datatype != XSD.string:
            return f"Literal({self._value!r}, datatype={self._datatype!r})"
        return f"Literal({self._value!r})"

    def __hash__(self):
        return hash((str(self._value), self._language, self._datatype))

    def __eq__(self, other):
        if isinstance(other, Literal):
            return (
                str(self._value) == str(other._value) and
                self._language == other._language and
                self._datatype == other._datatype
            )
        if isinstance(other, Identifier):
            # A literal is never the same term as a URIRef or BNode, even
            # when their text happens to match.
            return False
        # Plain (non-term) values are still compared by their text.
        return str(self._value) == str(other)

    def n3(self, namespace_manager=None) -> str:
        """Return N3/Turtle representation (``namespace_manager`` is unused)."""
        if isinstance(self._value, bool) and self._datatype == XSD.boolean:
            # xsd:boolean lexical forms are lower-case, unlike str(True)
            value_str = 'true' if self._value else 'false'
        else:
            value_str = str(self._value)

        # Escape special characters; the backslash must be handled first so
        # the escapes added afterwards are not escaped again.
        for raw, escaped in (
            ('\\', '\\\\'),
            ('"', '\\"'),
            ('\n', '\\n'),
            ('\r', '\\r'),
            ('\t', '\\t'),
        ):
            value_str = value_str.replace(raw, escaped)

        if self._language:
            return f'"{value_str}"@{self._language}'
        elif self._datatype and self._datatype != XSD.string:
            return f'"{value_str}"^^<{self._datatype}>'
        return f'"{value_str}"'

    def toPython(self) -> Any:
        """Convert to Python native type, based on the datatype."""
        if self._datatype == XSD.integer:
            return int(self._value)
        elif self._datatype == XSD.double or self._datatype == XSD.decimal:
            return float(self._value)
        elif self._datatype == XSD.boolean:
            return str(self._value).lower() in ('true', '1')
        return str(self._value)
172
+
173
+
174
class BNode(Identifier):
    """
    An RDF Blank Node.

    Compatible with rdflib.BNode. Two blank nodes are equal exactly when
    their identifiers are equal.
    """
    __slots__ = ('_id',)
    # Class-wide counter used to label blank nodes created without an id.
    _next_id = 0

    def __init__(self, value: Optional[str] = None):
        if value is not None:
            self._id = value
            return
        # No identifier supplied: take the next label in the N1, N2, ... series.
        BNode._next_id = BNode._next_id + 1
        self._id = "N" + str(BNode._next_id)

    def __str__(self):
        return self._id

    def __repr__(self):
        return "BNode({!r})".format(self._id)

    def __hash__(self):
        return hash(self._id)

    def __eq__(self, other):
        return self._id == other._id if isinstance(other, BNode) else False

    def n3(self, namespace_manager=None) -> str:
        """Return N3/Turtle representation (``_:`` followed by the id)."""
        return "_:" + f"{self._id}"

    def toPython(self) -> str:
        return self._id
210
+
211
+
212
+ # =============================================================================
213
+ # Namespace Support
214
+ # =============================================================================
215
+
216
class Namespace(URIRef):
    """
    An RDF Namespace.

    Allows attribute access for creating URIRefs:
        RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
        RDF.type  # Returns URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")

    Because a Namespace is also a ``str``, a term whose name matches a
    string method (``format``, ``count``, ``index``, ...) resolves to that
    method under attribute access; use ``ns["format"]`` or
    ``ns.term("format")`` for those. ``title`` is special-cased below
    because ``dc:title`` / ``dcterms:title`` are so common.
    """
    __slots__ = ()

    def __new__(cls, value: str):
        return URIRef.__new__(cls, value)

    @property
    def title(self) -> URIRef:
        # Shadows str.title so that e.g. DC.title is a URIRef, not a method.
        return URIRef(f"{self}title")

    def __getattr__(self, name: str) -> URIRef:
        # Only reached when normal attribute lookup fails. Underscore names
        # are refused so protocol probes (copy, pickle, ...) do not
        # accidentally receive a URIRef.
        if name.startswith('_'):
            raise AttributeError(f"'{type(self).__name__}' has no attribute '{name}'")
        return URIRef(f"{self}{name}")

    def __getitem__(self, key: str) -> URIRef:
        return URIRef(f"{self}{key}")

    def term(self, name: str) -> URIRef:
        """Return the URIRef for ``name`` inside this namespace."""
        return URIRef(f"{self}{name}")
239
+
240
+
241
class ClosedNamespace(Namespace):
    """
    A namespace with a fixed set of terms.

    Attribute access, item access and ``term()`` all refuse names outside
    the declared set, so a typo cannot silently mint a new IRI.
    """

    def __new__(cls, uri: str, terms: list):
        inst = Namespace.__new__(cls, uri)
        inst._terms = frozenset(terms)
        return inst

    def term(self, name: str) -> URIRef:
        """
        Return the URIRef for ``name``.

        Raises:
            KeyError: If ``name`` is not one of the declared terms
        """
        if name not in self._terms:
            raise KeyError(f"term '{name}' not in namespace")
        return URIRef(f"{self}{name}")

    def __getitem__(self, key: str) -> URIRef:
        # Same validation as attribute access; previously this inherited
        # the open-namespace behaviour and accepted any key.
        return self.term(key)

    def __getattr__(self, name: str) -> URIRef:
        if name.startswith('_'):
            raise AttributeError(f"'{type(self).__name__}' has no attribute '{name}'")
        if name not in self._terms:
            raise AttributeError(f"term '{name}' not in namespace")
        return URIRef(f"{self}{name}")
255
+
256
+
257
# Well-known namespaces
# Each constant is a Namespace instance, so attribute or item access on it
# builds a URIRef under the base IRI shown (for example RDFS.label).
RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
OWL = Namespace("http://www.w3.org/2002/07/owl#")
# XSD is referenced by Literal to pick and compare datatypes.
XSD = Namespace("http://www.w3.org/2001/XMLSchema#")
FOAF = Namespace("http://xmlns.com/foaf/0.1/")
DC = Namespace("http://purl.org/dc/elements/1.1/")
DCTERMS = Namespace("http://purl.org/dc/terms/")
SKOS = Namespace("http://www.w3.org/2004/02/skos/core#")
PROV = Namespace("http://www.w3.org/ns/prov#")
267
+
268
+
269
+ # =============================================================================
270
+ # Graph Class (Main API)
271
+ # =============================================================================
272
+
273
# Type alias for triples: (subject, predicate, object). The predicate is
# always a URIRef; subject and object may be any Identifier.
Triple = Tuple[Identifier, URIRef, Identifier]
275
+
276
+
277
+ class Graph:
278
+ """
279
+ An RDF Graph backed by RDF-StarBase.
280
+
281
+ API-compatible with rdflib.Graph while providing superior performance.
282
+
283
+ Example:
284
+ g = Graph()
285
+ g.parse("data.ttl", format="turtle")
286
+
287
+ for s, p, o in g.triples((None, RDF.type, None)):
288
+ print(f"{s} is a {o}")
289
+
290
+ g.add((URIRef("http://example.org/s"), RDF.type, FOAF.Person))
291
+ g.serialize(destination="out.ttl", format="turtle")
292
+ """
293
+
294
def __init__(self, store=None, identifier=None):
    """
    Create a new Graph.

    Args:
        store: Accepted for rdflib signature compatibility but ignored;
            a fresh TripleStore is always created
        identifier: Optional graph identifier; a new BNode is generated
            when it is omitted or falsy
    """
    # Import here to avoid circular imports
    from rdf_starbase import TripleStore

    self._store = TripleStore()
    self._identifier = identifier or BNode()
    # The manager registers its default prefixes in its own constructor.
    self._namespace_manager = NamespaceManager(self)
308
+
309
@property
def store(self):
    """The backing store (the TripleStore created in ``__init__``)."""
    return self._store
313
+
314
@property
def identifier(self):
    """The graph identifier given at construction, or the generated BNode."""
    return self._identifier
318
+
319
@property
def namespace_manager(self):
    """The NamespaceManager holding this graph's prefix bindings."""
    return self._namespace_manager
323
+
324
def __len__(self) -> int:
    """Return the number of triples in the graph."""
    # Delegates to len() of the backing store.
    # NOTE(review): whether deprecated assertions are counted depends on
    # the store's __len__ - confirm.
    return len(self._store)
327
+
328
def __iter__(self) -> Iterator[Triple]:
    """Iterate over all triples."""
    # An all-wildcard pattern matches every triple.
    return self.triples((None, None, None))
331
+
332
def __contains__(self, triple: Triple) -> bool:
    """Report whether at least one triple matches ``triple`` (None is a wildcard)."""
    subj, pred, obj = triple
    # any() stops at the first match, so a single hit is enough.
    return any(True for _ in self.triples((subj, pred, obj)))
338
+
339
def add(self, triple: Triple) -> "Graph":
    """
    Add a triple to the graph.

    Args:
        triple: A (subject, predicate, object) tuple

    Returns:
        self for chaining
    """
    from rdf_starbase.models import ProvenanceContext

    s, p, o = triple
    prov = ProvenanceContext(source="rdflib_compat", confidence=1.0)
    self._store.add_triple(
        # Blank-node subjects keep their "_:" marker so they read back as
        # BNodes (see _subject_to_value).
        subject=self._subject_to_value(s),
        predicate=str(p),
        obj=self._term_to_value(o),
        provenance=prov
    )
    return self

def remove(self, triple: Triple) -> "Graph":
    """
    Remove a triple from the graph.

    Matching assertions are deprecated in the store, not physically
    deleted.

    Args:
        triple: A (subject, predicate, object) tuple with optional None wildcards

    Returns:
        self for chaining
    """
    from uuid import UUID

    s, p, o = triple
    # Get matching triples and deprecate them
    matches = self._store.get_triples(
        subject=self._subject_to_value(s) if s is not None else None,
        predicate=str(p) if p is not None else None,
        obj=self._term_to_value(o) if o is not None else None,
    )

    for row in matches.iter_rows(named=True):
        raw_id = row.get('assertion_id')
        if raw_id is None:
            # Nothing to deprecate without an assertion id
            continue
        try:
            assertion_id = raw_id if isinstance(raw_id, UUID) else UUID(str(raw_id))
            self._store.deprecate(assertion_id)
        except (ValueError, KeyError):
            # Best effort, as before: skip ids the store cannot resolve
            continue
    return self

def triples(
    self,
    pattern: Tuple[Optional[Identifier], Optional[URIRef], Optional[Identifier]]
) -> Iterator[Triple]:
    """
    Iterate over triples matching a pattern.

    Args:
        pattern: (subject, predicate, object) with None as wildcard

    Yields:
        Matching (subject, predicate, object) tuples
    """
    s, p, o = pattern

    results = self._store.get_triples(
        subject=self._subject_to_value(s) if s is not None else None,
        predicate=str(p) if p is not None else None,
        obj=self._term_to_value(o) if o is not None else None,
    )

    for row in results.iter_rows(named=True):
        yield (
            self._value_to_term(row['subject'], is_uri=True),
            URIRef(row['predicate']),
            self._value_to_term(row['object']),
        )

def _subject_to_value(self, term) -> str:
    """Convert a subject term to its storage string."""
    # str(BNode) is the bare id; without the "_:" prefix the subject would
    # be read back as a URIRef and would never match blank nodes stored by
    # the parsers.
    if isinstance(term, BNode):
        return f"_:{term._id}"
    return str(term)
416
+
417
def subjects(
    self,
    predicate: Optional[URIRef] = None,
    object: Optional[Identifier] = None,
    unique: bool = True
) -> Iterator[Identifier]:
    """Yield the subject of every triple matching (None, predicate, object).

    With ``unique`` set, each distinct subject is yielded once, in order of
    first appearance.
    """
    found = (subj for subj, _, _ in self.triples((None, predicate, object)))
    if not unique:
        yield from found
        return
    emitted = set()
    for subj in found:
        if subj not in emitted:
            emitted.add(subj)
            yield subj
431
+
432
def predicates(
    self,
    subject: Optional[Identifier] = None,
    object: Optional[Identifier] = None,
    unique: bool = True
) -> Iterator[URIRef]:
    """Yield the predicate of every triple matching (subject, None, object).

    With ``unique`` set, each distinct predicate is yielded once, in order
    of first appearance.
    """
    found = (pred for _, pred, _ in self.triples((subject, None, object)))
    if not unique:
        yield from found
        return
    emitted = set()
    for pred in found:
        if pred not in emitted:
            emitted.add(pred)
            yield pred
446
+
447
def objects(
    self,
    subject: Optional[Identifier] = None,
    predicate: Optional[URIRef] = None,
    unique: bool = True
) -> Iterator[Identifier]:
    """Yield the object of every triple matching (subject, predicate, None).

    With ``unique`` set, each distinct object is yielded once, in order of
    first appearance.
    """
    found = (obj for _, _, obj in self.triples((subject, predicate, None)))
    if not unique:
        yield from found
        return
    emitted = set()
    for obj in found:
        if obj not in emitted:
            emitted.add(obj)
            yield obj
461
+
462
def subject_objects(
    self,
    predicate: Optional[URIRef] = None,
    unique: bool = True
) -> Iterator[Tuple[Identifier, Identifier]]:
    """Yield (subject, object) for every triple with the given predicate.

    With ``unique`` set, each distinct pair is yielded once, in order of
    first appearance.
    """
    found = ((subj, obj) for subj, _, obj in self.triples((None, predicate, None)))
    if not unique:
        yield from found
        return
    emitted = set()
    for pair in found:
        if pair not in emitted:
            emitted.add(pair)
            yield pair
476
+
477
def subject_predicates(
    self,
    object: Optional[Identifier] = None,
    unique: bool = True
) -> Iterator[Tuple[Identifier, URIRef]]:
    """Yield (subject, predicate) for every triple with the given object.

    With ``unique`` set, each distinct pair is yielded once, in order of
    first appearance.
    """
    found = ((subj, pred) for subj, pred, _ in self.triples((None, None, object)))
    if not unique:
        yield from found
        return
    emitted = set()
    for pair in found:
        if pair not in emitted:
            emitted.add(pair)
            yield pair
491
+
492
def predicate_objects(
    self,
    subject: Optional[Identifier] = None,
    unique: bool = True
) -> Iterator[Tuple[URIRef, Identifier]]:
    """Yield (predicate, object) for every triple with the given subject.

    With ``unique`` set, each distinct pair is yielded once, in order of
    first appearance.
    """
    found = ((pred, obj) for _, pred, obj in self.triples((subject, None, None)))
    if not unique:
        yield from found
        return
    emitted = set()
    for pair in found:
        if pair not in emitted:
            emitted.add(pair)
            yield pair
506
+
507
def value(
    self,
    subject: Optional[Identifier] = None,
    predicate: Optional[URIRef] = None,
    object: Optional[Identifier] = None,
    default: Any = None,
    any: bool = True
) -> Optional[Identifier]:
    """Return one component of the first matching triple, or ``default``.

    The component returned is the subject when ``subject`` is None,
    otherwise the predicate when ``predicate`` is None, otherwise the
    object. ``any`` is accepted but not consulted.
    """
    # Decide up front which slot of the matching triple the caller wants.
    if subject is None:
        slot = 0
    elif predicate is None:
        slot = 1
    else:
        slot = 2

    for match in self.triples((subject, predicate, object)):
        return match[slot]
    return default
524
+
525
def parse(
    self,
    source: Optional[Union[str, Path, IO]] = None,
    publicID: Optional[str] = None,
    format: Optional[str] = None,
    location: Optional[str] = None,
    file: Optional[IO] = None,
    data: Optional[Union[str, bytes]] = None,
    **kwargs
) -> "Graph":
    """
    Parse RDF data into this graph.

    Input is taken from the first of ``data``, ``file``, ``source`` that is
    not None. Bytes are decoded as UTF-8.

    Args:
        source: File path or file-like object
        publicID: The logical URI of the graph (accepted, not used)
        format: Format name or media type (turtle, xml, n3, nt, json-ld,
            text/turtle, application/n-triples, ...); guessed from the file
            extension for paths, otherwise defaults to turtle
        location: Alternative to source (URL to fetch); not implemented
        file: File-like object
        data: Raw string/bytes data

    Returns:
        self for chaining

    Raises:
        FileNotFoundError: If ``source`` is a path that does not exist
        TypeError: If ``source`` is neither a path nor a readable object
        NotImplementedError: If only ``location`` is given
        ValueError: If no input is given or the format is unknown
    """
    # Determine the content to parse
    if data is not None:
        content = data
    elif file is not None:
        content = file.read()
    elif source is not None:
        if isinstance(source, (str, Path)):
            path = Path(source)
            if not path.exists():
                # Might be a URL - fetching is not supported yet
                raise FileNotFoundError(f"File not found: {source}")
            content = path.read_text(encoding='utf-8')
            if format is None:
                format = self._guess_format(path)
        elif hasattr(source, 'read'):
            content = source.read()
        else:
            # Previously fell through and handed None to the parser
            raise TypeError(
                f"Unsupported source type: {type(source).__name__}"
            )
    elif location is not None:
        raise NotImplementedError("URL fetching not implemented yet")
    else:
        raise ValueError("No input source provided")

    if isinstance(content, (bytes, bytearray)):
        content = content.decode('utf-8')

    # Determine format
    if format is None:
        format = 'turtle'  # Default

    # Alias -> name of the parsing method; looked up by name so only the
    # selected parser is touched.
    handlers = {
        **dict.fromkeys(
            ('ttl', 'turtle', 'n3', 'text/turtle', 'text/n3'),
            '_parse_turtle'),
        **dict.fromkeys(
            ('nt', 'ntriples', 'n-triples', 'application/n-triples'),
            '_parse_ntriples'),
        **dict.fromkeys(
            ('xml', 'rdf/xml', 'rdfxml', 'application/rdf+xml'),
            '_parse_rdfxml'),
        **dict.fromkeys(
            ('json-ld', 'jsonld', 'application/ld+json'),
            '_parse_jsonld'),
    }

    format = format.lower()
    handler = handlers.get(format)
    if handler is None:
        raise ValueError(f"Unknown format: {format}")
    getattr(self, handler)(content)

    return self
595
+
596
def serialize(
    self,
    destination: Optional[Union[str, Path, IO]] = None,
    format: str = "turtle",
    base: Optional[str] = None,
    encoding: Optional[str] = None,
    **kwargs
) -> Optional[str]:
    """
    Serialize the graph to RDF.

    Args:
        destination: File path or file-like object (None for string return)
        format: Output format (turtle, xml, nt, json-ld)
        base: Base URI (accepted, not used)
        encoding: Character encoding used when writing to a path or a
            binary stream; defaults to utf-8

    Returns:
        Serialized string if destination is None, otherwise None

    Raises:
        ValueError: If the format is unknown
    """
    import io

    format = format.lower()

    if format in ('ttl', 'turtle', 'n3'):
        content = self._serialize_turtle()
    elif format in ('nt', 'ntriples', 'n-triples'):
        content = self._serialize_ntriples()
    elif format in ('xml', 'rdf/xml', 'rdfxml', 'pretty-xml'):
        content = self._serialize_rdfxml()
    elif format in ('json-ld', 'jsonld'):
        content = self._serialize_jsonld()
    else:
        raise ValueError(f"Unknown format: {format}")

    if destination is None:
        return content

    charset = encoding or 'utf-8'
    if isinstance(destination, (str, Path)):
        Path(destination).write_text(content, encoding=charset)
        return None

    # Decide whether the stream wants bytes. Streams such as BytesIO have
    # no ``mode`` attribute, so the io base classes are checked as well; a
    # non-string ``mode`` is ignored.
    if isinstance(destination, io.TextIOBase):
        wants_bytes = False
    else:
        mode = getattr(destination, 'mode', None)
        wants_bytes = (
            isinstance(destination, (io.RawIOBase, io.BufferedIOBase))
            or (isinstance(mode, str) and 'b' in mode)
        )
    destination.write(content.encode(charset) if wants_bytes else content)

    return None
638
+
639
def bind(self, prefix: str, namespace: Union[str, URIRef, Namespace], override: bool = True, replace: bool = False):
    """
    Bind a namespace prefix.

    All arguments are forwarded unchanged to the graph's NamespaceManager.

    Returns:
        self for chaining
    """
    self._namespace_manager.bind(prefix, namespace, override, replace)
    return self
643
+
644
def namespaces(self) -> Iterator[Tuple[str, URIRef]]:
    """Iterate over bound (prefix, namespace) pairs from the NamespaceManager."""
    return iter(self._namespace_manager.namespaces())
647
+
648
def query(self, query: str, initBindings=None, initNs=None, **kwargs):
    """
    Run a SPARQL query against the backing store.

    Args:
        query: SPARQL query string
        initBindings: Passed through to the QueryResult wrapper
        initNs: Optional mapping of prefix to namespace; each entry is
            prepended to the query as a PREFIX declaration

    Returns:
        A QueryResult wrapping whatever the engine returned
    """
    from rdf_starbase import execute_sparql

    if initNs:
        # One PREFIX line per supplied namespace, placed ahead of the query text.
        declarations = "".join(
            f"PREFIX {prefix}: <{ns}>\n" for prefix, ns in initNs.items()
        )
        query = declarations + query

    return QueryResult(execute_sparql(self._store, query), initBindings)
671
+
672
def update(self, update_query: str, initBindings=None, initNs=None, **kwargs):
    """Run a SPARQL Update request and return the engine's raw result.

    Entries of ``initNs`` (prefix -> namespace) are prepended as PREFIX
    declarations; ``initBindings`` is accepted but not used.
    """
    from rdf_starbase import execute_sparql

    if initNs:
        declarations = "".join(
            f"PREFIX {prefix}: <{ns}>\n" for prefix, ns in initNs.items()
        )
        update_query = declarations + update_query

    return execute_sparql(self._store, update_query)
683
+
684
+ # =========================================================================
685
+ # Internal parsing methods
686
+ # =========================================================================
687
+
688
def _parse_turtle(self, content: str):
    """Parse Turtle text and bulk-load the resulting triples into the store."""
    from rdf_starbase.formats.turtle import parse_turtle

    doc = parse_turtle(content)

    # One column per triple position; the columnar insert is much faster
    # than adding triples one by one.
    columns = {
        "subjects": [triple.subject for triple in doc.triples],
        "predicates": [triple.predicate for triple in doc.triples],
        "objects": [triple.object for triple in doc.triples],
    }
    self._store.add_triples_columnar(
        source="turtle_parse",
        confidence=1.0,
        **columns,
    )
707
+
708
def _parse_ntriples(self, content: str):
    """Parse N-Triples text and bulk-load the resulting triples into the store."""
    from rdf_starbase.formats.ntriples import parse_ntriples

    doc = parse_ntriples(content)

    # One column per triple position, handed to the columnar insert.
    columns = {
        "subjects": [triple.subject for triple in doc.triples],
        "predicates": [triple.predicate for triple in doc.triples],
        "objects": [triple.object for triple in doc.triples],
    }
    self._store.add_triples_columnar(
        source="ntriples_parse",
        confidence=1.0,
        **columns,
    )
725
+
726
def _parse_rdfxml(self, content: str):
    """Parse RDF/XML text and bulk-load the resulting triples into the store."""
    from rdf_starbase.formats.rdfxml import parse_rdfxml

    doc = parse_rdfxml(content)

    # One column per triple position, handed to the columnar insert.
    columns = {
        "subjects": [triple.subject for triple in doc.triples],
        "predicates": [triple.predicate for triple in doc.triples],
        "objects": [triple.object for triple in doc.triples],
    }
    self._store.add_triples_columnar(
        source="rdfxml_parse",
        confidence=1.0,
        **columns,
    )
743
+
744
def _parse_jsonld(self, content: str):
    """Parse JSON-LD text and bulk-load the resulting triples into the store."""
    from rdf_starbase.formats.jsonld import parse_jsonld

    doc = parse_jsonld(content)

    # One column per triple position, handed to the columnar insert.
    columns = {
        "subjects": [triple.subject for triple in doc.triples],
        "predicates": [triple.predicate for triple in doc.triples],
        "objects": [triple.object for triple in doc.triples],
    }
    self._store.add_triples_columnar(
        source="jsonld_parse",
        confidence=1.0,
        **columns,
    )
761
+
762
+ # =========================================================================
763
+ # Internal serialization methods
764
+ # =========================================================================
765
+
766
def _serialize_turtle(self) -> str:
    """
    Serialize to Turtle.

    Prefix declarations come first (sorted by prefix), then one block per
    subject with its predicate/object pairs. Rows flagged ``deprecated``
    are skipped. Stored strings starting with a known URI scheme are
    written as IRIs, strings starting with ``_:`` as blank nodes, and
    everything else as literals; ints and bools use Turtle's bare
    numeric/boolean syntax and floats become xsd:double literals.
    """
    uri_schemes = ('http://', 'https://', 'urn:', 'file://')
    xsd_double = "http://www.w3.org/2001/XMLSchema#double"

    # Convert namespaces to dict and write prefix declarations
    prefixes = {prefix: str(ns) for prefix, ns in self._namespace_manager.namespaces()}
    lines = [
        f"@prefix {prefix}: <{namespace}> ."
        for prefix, namespace in sorted(prefixes.items())
    ]
    if prefixes:
        lines.append("")

    # Reverse lookup for compression, longest namespace first so the most
    # specific prefix wins. Sorted once instead of on every lookup.
    reverse_prefixes = {v: k for k, v in prefixes.items()}
    by_length = sorted(reverse_prefixes.items(), key=lambda item: -len(item[0]))

    def compress_uri(uri: str) -> str:
        """Try to compress URI with prefix, else wrap it in angle brackets."""
        for ns, prefix in by_length:
            if uri.startswith(ns):
                local = uri[len(ns):]
                # Only use prefix if local part is valid
                if local and local[0].isalpha() and all(c.isalnum() or c == '_' for c in local):
                    return f"{prefix}:{local}"
        return f"<{uri}>"

    def quote(text: str) -> str:
        """Quote a string literal; the backslash is escaped first."""
        for raw, escaped in (
            ('\\', '\\\\'),
            ('"', '\\"'),
            ('\n', '\\n'),
            ('\r', '\\r'),
            ('\t', '\\t'),
        ):
            text = text.replace(raw, escaped)
        return f'"{text}"'

    def format_value(v) -> str:
        """Format an object value as Turtle."""
        if isinstance(v, bool):  # before int: bool is a subclass of int
            return 'true' if v else 'false'
        if isinstance(v, int):
            return str(v)
        if isinstance(v, float):
            return f'"{v!r}"^^<{xsd_double}>'
        if isinstance(v, str):
            if v.startswith('_:'):
                return v
            if v.startswith(uri_schemes):
                return compress_uri(v)
            return quote(v)
        return quote(str(v))

    # Group by subject for prettier output
    by_subject = {}
    for row in self._store._df.iter_rows(named=True):
        if row.get('deprecated', False):
            continue
        by_subject.setdefault(row['subject'], []).append(
            (row['predicate'], row['object'])
        )

    # Write grouped triples
    for subject, po_list in by_subject.items():
        # Blank nodes are written verbatim; every other subject is an IRI
        # and must be prefixed or bracketed (including urn: and file:).
        lines.append(subject if subject.startswith('_:') else compress_uri(subject))

        last = len(po_list) - 1
        for i, (pred, obj) in enumerate(po_list):
            sep = " ;" if i < last else " ."
            lines.append(f"    {compress_uri(pred)} {format_value(obj)}{sep}")

        lines.append("")

    return '\n'.join(lines)
829
+
830
def _serialize_ntriples(self) -> str:
    """
    Serialize to N-Triples.

    One line per non-deprecated row. String literals are escaped, and
    bool/int/float values are written as typed literals (xsd:boolean,
    xsd:integer, xsd:double) so their datatype is not lost.
    """
    xsd = "http://www.w3.org/2001/XMLSchema#"
    uri_schemes = ('http://', 'https://', 'urn:', 'file://')

    def quote(text: str) -> str:
        """Quote a string literal; the backslash is escaped first."""
        for raw, escaped in (
            ('\\', '\\\\'),
            ('"', '\\"'),
            ('\n', '\\n'),
            ('\r', '\\r'),
            ('\t', '\\t'),
        ):
            text = text.replace(raw, escaped)
        return f'"{text}"'

    def format_object(o) -> str:
        if isinstance(o, bool):  # before int: bool is a subclass of int
            lexical = 'true' if o else 'false'
            return f'"{lexical}"^^<{xsd}boolean>'
        if isinstance(o, int):
            return f'"{o}"^^<{xsd}integer>'
        if isinstance(o, float):
            return f'"{o!r}"^^<{xsd}double>'
        if isinstance(o, str):
            if o.startswith('_:'):
                return o
            if o.startswith(uri_schemes):
                return f"<{o}>"
            return quote(o)
        return quote(str(o))

    lines = []
    for row in self._store._df.iter_rows(named=True):
        if row.get('deprecated', False):
            continue
        s = row['subject']

        # Blank-node subjects are written verbatim, IRIs in angle brackets
        s_str = s if s.startswith('_:') else f"<{s}>"
        p_str = f"<{row['predicate']}>"

        lines.append(f"{s_str} {p_str} {format_object(row['object'])} .")

    return '\n'.join(lines)
855
+
856
def _serialize_rdfxml(self) -> str:
    """
    Serialize to RDF/XML.

    Basic implementation: one ``rdf:Description`` element per
    non-deprecated row. Each predicate IRI is split into a namespace and a
    local name so it can be written as a qualified element name; the
    namespace is declared on the property element itself. Attribute values
    and text content are XML-escaped.

    Raises:
        ValueError: If a predicate cannot be split into a namespace plus a
            local name that is valid as an XML element name
    """
    import re
    from xml.sax.saxutils import escape, quoteattr

    rdf_ns = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
    xsd_ns = "http://www.w3.org/2001/XMLSchema#"
    uri_schemes = ('http://', 'https://', 'urn:', 'file://')
    # ASCII subset of the XML NCName production
    ncname = re.compile(r'[A-Za-z_][A-Za-z0-9_.\-]*\Z')

    def property_tag(predicate: str):
        """Return (qualified tag name, xmlns declaration) for a predicate."""
        cut = max(predicate.rfind('#'), predicate.rfind('/'), predicate.rfind(':'))
        namespace, local = predicate[:cut + 1], predicate[cut + 1:]
        if cut < 0 or not ncname.match(local):
            raise ValueError(
                f"Cannot serialize predicate as RDF/XML element: {predicate}"
            )
        if namespace == rdf_ns:
            # The rdf prefix is already declared on the root element
            return f"rdf:{local}", ""
        return f"ns0:{local}", f" xmlns:ns0={quoteattr(namespace)}"

    lines = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">',
    ]

    for row in self._store._df.iter_rows(named=True):
        if row.get('deprecated', False):
            continue
        s = row['subject']
        o = row['object']
        tag, xmlns = property_tag(row['predicate'])

        if s.startswith('_:'):
            lines.append(f'  <rdf:Description rdf:nodeID={quoteattr(s[2:])}>')
        else:
            lines.append(f'  <rdf:Description rdf:about={quoteattr(s)}>')

        if isinstance(o, str) and o.startswith('_:'):
            lines.append(f'    <{tag}{xmlns} rdf:nodeID={quoteattr(o[2:])}/>')
        elif isinstance(o, str) and o.startswith(uri_schemes):
            lines.append(f'    <{tag}{xmlns} rdf:resource={quoteattr(o)}/>')
        else:
            # Literal: keep the datatype of native Python values
            if isinstance(o, bool):  # before int: bool is a subclass of int
                lexical, datatype = ('true' if o else 'false'), xsd_ns + 'boolean'
            elif isinstance(o, int):
                lexical, datatype = str(o), xsd_ns + 'integer'
            elif isinstance(o, float):
                lexical, datatype = repr(o), xsd_ns + 'double'
            else:
                lexical, datatype = str(o), None
            dt_attr = f' rdf:datatype={quoteattr(datatype)}' if datatype else ''
            lines.append(f'    <{tag}{xmlns}{dt_attr}>{escape(lexical)}</{tag}>')

        lines.append('  </rdf:Description>')

    lines.append('</rdf:RDF>')
    return '\n'.join(lines)
883
+
884
def _serialize_jsonld(self) -> str:
    """
    Serialize to JSON-LD.

    Produces a JSON array with one node object per subject. ``rdf:type``
    values are collected under ``@type``; every other predicate IRI maps
    to a list of values. IRI and blank-node objects are written as
    ``{"@id": ...}`` references so they are not read back as plain string
    literals; literal values are written as they are stored. Rows flagged
    ``deprecated`` are skipped.
    """
    import json

    rdf_type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
    node_prefixes = ('_:', 'http://', 'https://', 'urn:', 'file://')

    # Group by subject
    nodes = {}
    for row in self._store._df.iter_rows(named=True):
        if row.get('deprecated', False):
            continue
        s = row['subject']
        p = row['predicate']
        o = row['object']

        node = nodes.setdefault(s, {"@id": s})

        if p == rdf_type:
            # @type takes the IRI string directly
            node.setdefault("@type", []).append(o)
        elif isinstance(o, str) and o.startswith(node_prefixes):
            node.setdefault(p, []).append({"@id": o})
        else:
            node.setdefault(p, []).append(o)

    return json.dumps(list(nodes.values()), indent=2)
910
+
911
+ # =========================================================================
912
+ # Helper methods
913
+ # =========================================================================
914
+
915
def _guess_format(self, path: Path) -> str:
    """Map a file extension (case-insensitive) to a format name.

    Unrecognised extensions fall back to 'turtle'.
    """
    known_extensions = {
        '.ttl': 'turtle',
        '.turtle': 'turtle',
        '.n3': 'n3',
        '.nt': 'nt',
        '.ntriples': 'nt',
        '.rdf': 'xml',
        '.xml': 'xml',
        '.owl': 'xml',
        '.jsonld': 'json-ld',
        '.json': 'json-ld',
    }
    extension = path.suffix.lower()
    if extension in known_extensions:
        return known_extensions[extension]
    return 'turtle'
930
+
931
def _term_to_value(self, term: Identifier) -> Any:
    """Translate an RDF term into the value handed to the store.

    URIRefs become their IRI text, Literals their native Python value,
    BNodes their id prefixed with ``_:``; anything else is stringified.
    """
    if isinstance(term, URIRef):
        return str(term)
    if isinstance(term, Literal):
        return term.toPython()
    if isinstance(term, BNode):
        return "_:" + f"{term._id}"
    return str(term)
941
+
942
def _value_to_term(self, value: Any, is_uri: bool = False) -> Identifier:
    """Translate a stored value back into an RDF term.

    Strings starting with ``_:`` become BNodes; strings starting with a
    known URI scheme (or any string when ``is_uri`` is set) become URIRefs;
    all remaining values become Literals.
    """
    if not isinstance(value, str):
        # Native numbers and booleans keep their type; others are stringified.
        is_native = isinstance(value, (int, float, bool))
        return Literal(value if is_native else str(value))

    if value.startswith('_:'):
        return BNode(value[2:])
    if is_uri or value.startswith(('http://', 'https://', 'urn:', 'file://')):
        return URIRef(value)
    return Literal(value)
955
+
956
+
957
+ # =============================================================================
958
+ # Namespace Manager
959
+ # =============================================================================
960
+
961
class NamespaceManager:
    """
    Manages namespace prefix bindings for a graph.

    Two maps are kept in step: prefix -> namespace (``_bindings``) and
    namespace text -> prefix (``_reverse``).
    """

    def __init__(self, graph: Optional[Graph] = None):
        self._graph = graph
        self._bindings: dict[str, URIRef] = {}
        self._reverse: dict[str, str] = {}

        # Default bindings
        self.bind("rdf", RDF)
        self.bind("rdfs", RDFS)
        self.bind("owl", OWL)
        self.bind("xsd", XSD)

    def bind(
        self,
        prefix: str,
        namespace: Union[str, URIRef, Namespace],
        override: bool = True,
        replace: bool = False
    ):
        """
        Bind a prefix to a namespace.

        Args:
            prefix: The prefix to bind
            namespace: The namespace IRI
            override: When False, an already-bound prefix is left untouched
            replace: When True, the prefix previously bound to this
                namespace (if any) is dropped first
        """
        ns = URIRef(namespace) if not isinstance(namespace, URIRef) else namespace

        if not override and prefix in self._bindings:
            return

        if replace:
            # Remove old binding for this namespace. pop() tolerates an
            # entry that is already gone, and the empty prefix counts.
            old_prefix = self._reverse.get(str(ns))
            if old_prefix is not None:
                self._bindings.pop(old_prefix, None)

        # When the prefix moves to a different namespace, the old
        # namespace must stop resolving to it in the reverse map.
        previous = self._bindings.get(prefix)
        if previous is not None and self._reverse.get(str(previous)) == prefix:
            del self._reverse[str(previous)]
            # Re-point the old namespace at another prefix still bound to it
            for other_prefix, other_ns in self._bindings.items():
                if other_prefix != prefix and str(other_ns) == str(previous):
                    self._reverse[str(previous)] = other_prefix
                    break

        self._bindings[prefix] = ns
        self._reverse[str(ns)] = prefix

    def namespaces(self) -> Iterator[Tuple[str, URIRef]]:
        """Iterate over (prefix, namespace) pairs."""
        return iter(self._bindings.items())

    def compute_qname(self, uri: str, generate: bool = True) -> Tuple[str, str, str]:
        """
        Compute a qname (prefix, namespace, local) for a URI.

        Bound namespaces are tried longest first. If none matches and
        ``generate`` is True, the URI is split after its last ``#``, ``/``
        or ``:`` (tried in that order) and a new ``nsN`` prefix is bound.

        Raises:
            ValueError: If no qname can be computed
        """
        for prefix, ns in sorted(self._bindings.items(), key=lambda x: -len(x[1])):
            ns_str = str(ns)
            if uri.startswith(ns_str):
                local = uri[len(ns_str):]
                return (prefix, ns_str, local)

        # Try to generate a prefix
        if generate:
            # Split URI into namespace and local
            for sep in ('#', '/', ':'):
                if sep in uri:
                    idx = uri.rfind(sep)
                    ns = uri[:idx + 1]
                    local = uri[idx + 1:]
                    if ns in self._reverse:
                        return (self._reverse[ns], ns, local)
                    # Generate new prefix, skipping names already in use so
                    # an existing binding is never overwritten
                    counter = len(self._bindings)
                    prefix = f"ns{counter}"
                    while prefix in self._bindings:
                        counter += 1
                        prefix = f"ns{counter}"
                    self.bind(prefix, ns)
                    return (prefix, ns, local)

        raise ValueError(f"Cannot compute qname for {uri}")
1025
+
1026
+
1027
+ # =============================================================================
1028
+ # Query Results
1029
+ # =============================================================================
1030
+
1031
class QueryResult:
    """Thin wrapper around whatever the SPARQL engine returned."""

    def __init__(self, result, bindings=None):
        self._result = result
        self._bindings = bindings if bindings else {}

    def __iter__(self):
        """Yield one QueryRow per DataFrame row, or the raw result itself once."""
        import polars as pl
        if not isinstance(self._result, pl.DataFrame):
            # Booleans and any other result type are yielded unchanged, once.
            yield self._result
            return
        for record in self._result.iter_rows(named=True):
            yield QueryRow(record)

    def __bool__(self):
        """Truth value: the boolean itself for ASK results, else non-emptiness."""
        outcome = self._result
        return outcome if isinstance(outcome, bool) else len(outcome) > 0
1054
+
1055
+
1056
class QueryRow:
    """
    A single result row from a SPARQL query.

    Values can be read by variable name or by position. Item access and
    iteration both return RDF terms (URIRef / BNode / Literal), with None
    for unbound values; ``asdict()`` returns the raw stored values.
    """

    def __init__(self, row: dict):
        self._row = row
        self._keys = list(row.keys())

    @staticmethod
    def _to_term(value):
        """Convert a raw stored value to an RDF term (None stays None)."""
        if value is None:
            return None
        if isinstance(value, str):
            if value.startswith(('http://', 'https://', 'urn:', 'file://')):
                return URIRef(value)
            elif value.startswith('_:'):
                return BNode(value[2:])
        return Literal(value)

    def __getitem__(self, key):
        # Support both string keys and integer indices (negative indices
        # count from the end, as for a list)
        if isinstance(key, int):
            try:
                key = self._keys[key]
            except IndexError:
                raise IndexError(f"Row index out of range: {key}") from None

        # Unknown variable names read as unbound (None)
        return self._to_term(self._row.get(key))

    def __iter__(self):
        # Yield terms, consistent with __getitem__, so unpacking a row
        # gives the same objects as indexing it
        return (self._to_term(value) for value in self._row.values())

    def __len__(self):
        return len(self._keys)

    def asdict(self):
        """Return a copy of the raw row as a plain dict."""
        return dict(self._row)
1087
+
1088
+
1089
+ # =============================================================================
1090
+ # Exports
1091
+ # =============================================================================
1092
+
1093
# Public API of this module: the names exported by a star-import.
__all__ = [
    # Term classes
    'URIRef', 'Literal', 'BNode', 'Identifier',
    # Namespace
    'Namespace', 'ClosedNamespace', 'NamespaceManager',
    # Well-known namespaces
    'RDF', 'RDFS', 'OWL', 'XSD', 'FOAF', 'DC', 'DCTERMS', 'SKOS', 'PROV',
    # Graph
    'Graph',
    # Query
    'QueryResult', 'QueryRow',
]