rdf-starbase 0.1.0__py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their public registries.
@@ -0,0 +1,338 @@
+ """
+ Persistence layer for RDF-StarBase storage.
+
+ Provides save/load functionality for the dictionary-encoded storage layer:
+ - TermDict: Term catalog (term_id, kind, lex)
+ - FactStore: Facts table (g, s, p, o, provenance)
+ - QtDict: Quoted triples table (qt_id, s_id, p_id, o_id)
+
+ Uses the Parquet format for efficient columnar storage with good compression.
+ """
+
+ from __future__ import annotations
+
+ from pathlib import Path
+
+ import polars as pl
+
+ from rdf_starbase.storage.terms import TermDict, Term, TermKind
+ from rdf_starbase.storage.quoted_triples import QtDict, QuotedTriple
+ from rdf_starbase.storage.facts import FactStore
+
+
+ class StoragePersistence:
+     """
+     Handles save/load operations for the storage layer.
+
+     File layout:
+         base_path/
+             terms.parquet    - TermDict catalog
+             facts.parquet    - FactStore facts
+             quoted.parquet   - QtDict quoted triples
+             metadata.parquet - Counters and metadata
+     """
+
+     TERMS_FILE = "terms.parquet"
+     FACTS_FILE = "facts.parquet"
+     QUOTED_FILE = "quoted.parquet"
+     METADATA_FILE = "metadata.parquet"
+
+     def __init__(self, base_path: str | Path):
+         """
+         Initialize persistence with a base directory path.
+
+         Args:
+             base_path: Directory where storage files will be saved/loaded
+         """
+         self.base_path = Path(base_path)
+
+     def save(
+         self,
+         term_dict: TermDict,
+         fact_store: FactStore,
+         qt_dict: QtDict,
+     ) -> None:
+         """
+         Save all storage components to disk.
+
+         Args:
+             term_dict: The term dictionary to save
+             fact_store: The fact store to save
+             qt_dict: The quoted triple dictionary to save
+         """
+         # Ensure directory exists
+         self.base_path.mkdir(parents=True, exist_ok=True)
+
+         # Save term dictionary
+         self._save_terms(term_dict)
+
+         # Save facts
+         self._save_facts(fact_store)
+
+         # Save quoted triples
+         self._save_quoted(qt_dict)
+
+         # Save metadata (counters, etc.)
+         self._save_metadata(term_dict, fact_store, qt_dict)
+
+     def load(self) -> tuple[TermDict, FactStore, QtDict]:
+         """
+         Load all storage components from disk.
+
+         Returns:
+             Tuple of (TermDict, FactStore, QtDict)
+
+         Raises:
+             FileNotFoundError: If the storage directory doesn't exist
+         """
+         if not self.base_path.exists():
+             raise FileNotFoundError(f"Storage directory not found: {self.base_path}")
+
+         # Load term dictionary first (needed by the others)
+         term_dict = self._load_terms()
+
+         # Load quoted triples (needed by the fact store)
+         qt_dict = self._load_quoted(term_dict)
+
+         # Load facts
+         fact_store = self._load_facts(term_dict, qt_dict)
+
+         # Restore metadata
+         self._load_metadata(term_dict, fact_store, qt_dict)
+
+         return term_dict, fact_store, qt_dict
+
+     def exists(self) -> bool:
+         """Check whether a saved storage exists at the base path."""
+         return (
+             self.base_path.exists()
+             and (self.base_path / self.TERMS_FILE).exists()
+         )
+
+     def _save_terms(self, term_dict: TermDict) -> None:
+         """Save the term dictionary to Parquet."""
+         # Build a DataFrame from the TermDict's internal state
+         term_ids = []
+         kinds = []
+         lexes = []
+
+         for term_id, term in term_dict._id_to_term.items():
+             term_ids.append(term_id)
+             kinds.append(term.kind.value)
+             lexes.append(term.lex)
+
+         df = pl.DataFrame({
+             "term_id": pl.Series(term_ids, dtype=pl.UInt64),
+             "kind": pl.Series(kinds, dtype=pl.UInt8),
+             "lex": pl.Series(lexes, dtype=pl.Utf8),
+         })
+
+         df.write_parquet(self.base_path / self.TERMS_FILE)
+
+     def _load_terms(self) -> TermDict:
+         """Load the term dictionary from Parquet."""
+         df = pl.read_parquet(self.base_path / self.TERMS_FILE)
+
+         term_dict = TermDict.__new__(TermDict)
+         term_dict._next_payload = {
+             TermKind.IRI: 0,
+             TermKind.LITERAL: 0,
+             TermKind.BNODE: 0,
+             TermKind.QUOTED_TRIPLE: 0,
+         }
+         term_dict._hash_to_id = {}
+         term_dict._id_to_term = {}
+         term_dict._collision_count = 0
+
+         # Initialize fast-path caches (added for performance)
+         term_dict._iri_cache = {}
+         term_dict._plain_literal_cache = {}
+         term_dict._bnode_cache = {}
+
+         # Restore terms
+         for row in df.iter_rows(named=True):
+             term_id = row["term_id"]
+             kind = TermKind(row["kind"])
+             lex = row["lex"]
+
+             term = Term(kind=kind, lex=lex)
+             term_dict._id_to_term[term_id] = term
+             term_dict._hash_to_id[term.compute_hash()] = term_id
+
+             # Populate fast-path caches
+             if kind == TermKind.IRI:
+                 term_dict._iri_cache[lex] = term_id
+             elif kind == TermKind.BNODE:
+                 term_dict._bnode_cache[lex] = term_id
+             elif kind == TermKind.LITERAL:
+                 term_dict._plain_literal_cache[lex] = term_id
+
+         return term_dict
+
+     def _save_facts(self, fact_store: FactStore) -> None:
+         """Save the fact store to Parquet."""
+         fact_store._df.write_parquet(self.base_path / self.FACTS_FILE)
+
+     def _load_facts(self, term_dict: TermDict, qt_dict: QtDict) -> FactStore:
+         """Load the fact store from Parquet."""
+         fact_store = FactStore.__new__(FactStore)
+         fact_store._term_dict = term_dict
+         fact_store._qt_dict = qt_dict
+         fact_store._next_txn = 0
+         fact_store._default_graph_id = 0
+
+         facts_path = self.base_path / self.FACTS_FILE
+         if facts_path.exists():
+             fact_store._df = pl.read_parquet(facts_path)
+         else:
+             fact_store._df = fact_store._create_empty_dataframe()
+
+         return fact_store
+
+     def _save_quoted(self, qt_dict: QtDict) -> None:
+         """Save the quoted triple dictionary to Parquet."""
+         qt_ids = []
+         s_ids = []
+         p_ids = []
+         o_ids = []
+
+         for qt_id, qt in qt_dict._id_to_qt.items():
+             qt_ids.append(qt_id)
+             s_ids.append(qt.s)
+             p_ids.append(qt.p)
+             o_ids.append(qt.o)
+
+         df = pl.DataFrame({
+             "qt_id": pl.Series(qt_ids, dtype=pl.UInt64),
+             "s": pl.Series(s_ids, dtype=pl.UInt64),
+             "p": pl.Series(p_ids, dtype=pl.UInt64),
+             "o": pl.Series(o_ids, dtype=pl.UInt64),
+         })
+
+         df.write_parquet(self.base_path / self.QUOTED_FILE)
+
+     def _load_quoted(self, term_dict: TermDict) -> QtDict:
+         """Load the quoted triple dictionary from Parquet."""
+         qt_dict = QtDict.__new__(QtDict)
+         qt_dict._term_dict = term_dict
+         qt_dict._hash_to_id = {}
+         qt_dict._id_to_qt = {}
+         qt_dict._collision_count = 0
+
+         quoted_path = self.base_path / self.QUOTED_FILE
+         if quoted_path.exists():
+             df = pl.read_parquet(quoted_path)
+
+             for row in df.iter_rows(named=True):
+                 qt_id = row["qt_id"]
+                 qt = QuotedTriple(row["s"], row["p"], row["o"])
+                 qt_dict._id_to_qt[qt_id] = qt
+                 # Key the interning map on the same 128-bit hash that
+                 # QtDict.get_or_create uses; Python's built-in hash() would
+                 # not match, and every reloaded triple would be re-interned.
+                 qt_dict._hash_to_id[qt.compute_hash()] = qt_id
+
+         return qt_dict
+
+     def _save_metadata(
+         self,
+         term_dict: TermDict,
+         fact_store: FactStore,
+         qt_dict: QtDict,
+     ) -> None:
+         """Save counters and metadata to Parquet."""
+         # Store the next-ID counter for each term kind, plus the
+         # transaction counter
+         df = pl.DataFrame({
+             "key": [
+                 "next_iri", "next_literal", "next_bnode", "next_qt", "next_txn",
+             ],
+             "value": [
+                 term_dict._next_payload[TermKind.IRI],
+                 term_dict._next_payload[TermKind.LITERAL],
+                 term_dict._next_payload[TermKind.BNODE],
+                 term_dict._next_payload[TermKind.QUOTED_TRIPLE],
+                 fact_store._next_txn,
+             ],
+         })
+
+         df.write_parquet(self.base_path / self.METADATA_FILE)
+
+     def _load_metadata(
+         self,
+         term_dict: TermDict,
+         fact_store: FactStore,
+         qt_dict: QtDict,
+     ) -> None:
+         """Restore counters and metadata from Parquet."""
+         metadata_path = self.base_path / self.METADATA_FILE
+         if not metadata_path.exists():
+             # No metadata file: infer counters from the loaded data
+             self._infer_counters(term_dict, fact_store)
+             return
+
+         df = pl.read_parquet(metadata_path)
+
+         # Build a lookup dict
+         meta = dict(zip(df["key"].to_list(), df["value"].to_list()))
+
+         term_dict._next_payload[TermKind.IRI] = meta.get("next_iri", 0)
+         term_dict._next_payload[TermKind.LITERAL] = meta.get("next_literal", 0)
+         term_dict._next_payload[TermKind.BNODE] = meta.get("next_bnode", 0)
+         term_dict._next_payload[TermKind.QUOTED_TRIPLE] = meta.get("next_qt", 0)
+         fact_store._next_txn = meta.get("next_txn", 0)
+
+         # Re-initialize well-known IDs
+         term_dict._init_well_known()
+
+     def _infer_counters(
+         self,
+         term_dict: TermDict,
+         fact_store: FactStore,
+     ) -> None:
+         """Infer counter values from loaded data."""
+         # Find the max payload for each kind (the low 56 bits of a term_id
+         # carry the payload; the remaining high bits encode the TermKind)
+         for term_id, term in term_dict._id_to_term.items():
+             kind = term.kind
+             payload = term_id & 0x00FFFFFFFFFFFFFF
+             if payload >= term_dict._next_payload[kind]:
+                 term_dict._next_payload[kind] = payload + 1
+
+         # Infer next_txn from the facts
+         if len(fact_store._df) > 0 and "txn" in fact_store._df.columns:
+             max_txn = fact_store._df["txn"].max()
+             if max_txn is not None:
+                 fact_store._next_txn = max_txn + 1
+
+         # Re-initialize well-known IDs
+         term_dict._init_well_known()
+
+
+ def save_storage(
+     base_path: str | Path,
+     term_dict: TermDict,
+     fact_store: FactStore,
+     qt_dict: QtDict,
+ ) -> None:
+     """
+     Convenience function to save storage to disk.
+
+     Args:
+         base_path: Directory path for storage files
+         term_dict: Term dictionary to save
+         fact_store: Fact store to save
+         qt_dict: Quoted triple dictionary to save
+     """
+     persistence = StoragePersistence(base_path)
+     persistence.save(term_dict, fact_store, qt_dict)
+
+
+ def load_storage(base_path: str | Path) -> tuple[TermDict, FactStore, QtDict]:
+     """
+     Convenience function to load storage from disk.
+
+     Args:
+         base_path: Directory path containing storage files
+
+     Returns:
+         Tuple of (TermDict, FactStore, QtDict)
+     """
+     persistence = StoragePersistence(base_path)
+     return persistence.load()
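
For reference, a minimal round-trip sketch of the module above. The constructor signatures TermDict() and FactStore(term_dict, qt_dict), and the module path rdf_starbase.storage.persistence, are assumptions for illustration; neither appears in this diff.

    from rdf_starbase.storage.terms import TermDict
    from rdf_starbase.storage.quoted_triples import QtDict
    from rdf_starbase.storage.facts import FactStore
    from rdf_starbase.storage.persistence import save_storage, load_storage

    # Build an in-memory store (constructor signatures assumed).
    term_dict = TermDict()
    qt_dict = QtDict(term_dict)
    fact_store = FactStore(term_dict, qt_dict)

    # Persist under ./store, then load everything back.
    save_storage("./store", term_dict, fact_store, qt_dict)
    term_dict2, fact_store2, qt_dict2 = load_storage("./store")
    assert len(qt_dict2) == len(qt_dict)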
@@ -0,0 +1,292 @@
+ """
+ Quoted Triple Dictionary.
+
+ Implements the qt_dict catalog for RDF★ quoted triples.
+ Quoted triples are first-class terms that can appear as subjects or objects.
+
+ Key design decisions (from storage-spec.md):
+ - qt_id is a TermId with QUOTED_TRIPLE kind
+ - Graph-agnostic quoting: the key is (s, p, o) only (simpler, lower cardinality)
+ - Hash-based interning for fast bulk dedupe
+ - Stores qt_hash for fast rebuild at startup
+ """
+
+ from dataclasses import dataclass
+ from typing import Optional, Tuple
+ from pathlib import Path
+ import struct
+ import hashlib
+
+ import polars as pl
+
+ from rdf_starbase.storage.terms import (
+     TermId,
+     TermKind,
+     TermDict,
+     make_term_id,
+     get_term_payload,
+ )
+
+
+ # Type alias for quoted triple identifiers
+ QtId = TermId  # A QtId is a TermId with kind=QUOTED_TRIPLE
+
+
+ @dataclass(frozen=True, slots=True)
+ class QuotedTriple:
+     """
+     Internal representation of a quoted triple.
+
+     All components are TermIds (already dictionary-encoded).
+     """
+     s: TermId
+     p: TermId
+     o: TermId
+
+     def to_tuple(self) -> Tuple[TermId, TermId, TermId]:
+         """Return as a tuple for hashing."""
+         return (self.s, self.p, self.o)
+
+     def compute_hash(self) -> int:
+         """Compute a 128-bit hash for bulk deduplication."""
+         data = struct.pack('>QQQ', self.s, self.p, self.o)
+         h = hashlib.md5(data).digest()
+         return int.from_bytes(h, 'big')
+
+
+ class QtDict:
+     """
+     Quoted Triple Dictionary.
+
+     Catalogs quoted triples and assigns them stable QtIds (which are TermIds).
+     Supports O(1) lookup by (s, p, o) tuple and by qt_id.
+
+     Relationship with TermDict:
+     - QtDict allocates from the QUOTED_TRIPLE ID space
+     - A qt_id can be used as a subject or object in facts
+     - TermDict handles IRIs, literals, and bnodes; QtDict handles quoted triples
+     """
+
+     def __init__(self, term_dict: TermDict):
+         """
+         Initialize the quoted triple dictionary.
+
+         Args:
+             term_dict: The TermDict to coordinate ID allocation with
+         """
+         self._term_dict = term_dict
+
+         # Hash -> QtId (for interning)
+         self._hash_to_id: dict[int, QtId] = {}
+
+         # QtId -> QuotedTriple (for expansion)
+         self._id_to_qt: dict[QtId, QuotedTriple] = {}
+
+         # Statistics
+         self._collision_count = 0
+
+     def _allocate_id(self) -> QtId:
+         """Allocate the next QtId."""
+         # Use TermDict's counter to keep the ID spaces coordinated
+         payload = self._term_dict._next_payload[TermKind.QUOTED_TRIPLE]
+         self._term_dict._next_payload[TermKind.QUOTED_TRIPLE] = payload + 1
+         return make_term_id(TermKind.QUOTED_TRIPLE, payload)
+
+     def get_or_create(self, s: TermId, p: TermId, o: TermId) -> QtId:
+         """
+         Intern a quoted triple, returning its QtId.
+
+         If the quoted triple already exists, returns the existing ID.
+         Otherwise, allocates a new ID and stores the triple.
+
+         Args:
+             s: Subject TermId
+             p: Predicate TermId
+             o: Object TermId
+
+         Returns:
+             QtId for the quoted triple
+         """
+         qt = QuotedTriple(s, p, o)
+         qt_hash = qt.compute_hash()
+
+         if qt_hash in self._hash_to_id:
+             existing_id = self._hash_to_id[qt_hash]
+             # Verify it's actually the same triple (hash collision check)
+             if self._id_to_qt[existing_id] == qt:
+                 return existing_id
+             # Hash collision: a different triple produced the same 128-bit
+             # hash; count it and fall through (the new triple's mapping
+             # replaces the old one in _hash_to_id)
+             self._collision_count += 1
+
+         # Allocate a new ID
+         qt_id = self._allocate_id()
+         self._hash_to_id[qt_hash] = qt_id
+         self._id_to_qt[qt_id] = qt
+
+         return qt_id
+
+     def get_or_create_batch(
+         self,
+         triples: list[Tuple[TermId, TermId, TermId]],
+     ) -> list[QtId]:
+         """
+         Bulk intern a batch of quoted triples.
+
+         Optimized for ingestion performance.
+         """
+         return [self.get_or_create(s, p, o) for s, p, o in triples]
+
+     def lookup(self, qt_id: QtId) -> Optional[QuotedTriple]:
+         """
+         Expand a QtId to its (s, p, o) components.
+
+         This is the critical operation for RDF★ expansion joins.
+         """
+         return self._id_to_qt.get(qt_id)
+
+     def lookup_batch(self, qt_ids: list[QtId]) -> list[Optional[QuotedTriple]]:
+         """
+         Bulk expand QtIds to their components.
+
+         Returns QuotedTriple objects (or None for unknown IDs).
+         """
+         return [self._id_to_qt.get(qt_id) for qt_id in qt_ids]
+
+     def expand_to_dataframe(self, qt_ids: list[QtId]) -> pl.DataFrame:
+         """
+         Expand a list of QtIds to a DataFrame with columns: qt_id, s, p, o.
+
+         This is the storage primitive for RDF★ expansion joins
+         (storage-spec.md §8: lookup_qt).
+         """
+         rows = []
+         for qt_id in qt_ids:
+             qt = self._id_to_qt.get(qt_id)
+             if qt is not None:
+                 rows.append({
+                     "qt_id": qt_id,
+                     "s": qt.s,
+                     "p": qt.p,
+                     "o": qt.o,
+                 })
+
+         if not rows:
+             return pl.DataFrame({
+                 "qt_id": pl.Series([], dtype=pl.UInt64),
+                 "s": pl.Series([], dtype=pl.UInt64),
+                 "p": pl.Series([], dtype=pl.UInt64),
+                 "o": pl.Series([], dtype=pl.UInt64),
+             })
+
+         return pl.DataFrame(rows).cast({
+             "qt_id": pl.UInt64,
+             "s": pl.UInt64,
+             "p": pl.UInt64,
+             "o": pl.UInt64,
+         })
+
+     def get_id(self, s: TermId, p: TermId, o: TermId) -> Optional[QtId]:
+         """Get the QtId for a triple if it exists, without creating it."""
+         qt = QuotedTriple(s, p, o)
+         qt_hash = qt.compute_hash()
+         if qt_hash not in self._hash_to_id:
+             return None
+         existing_id = self._hash_to_id[qt_hash]
+         if self._id_to_qt[existing_id] == qt:
+             return existing_id
+         return None
+
+     def contains(self, s: TermId, p: TermId, o: TermId) -> bool:
+         """Check whether a quoted triple is already interned."""
+         return self.get_id(s, p, o) is not None
+
+     def __len__(self) -> int:
+         """Return the total number of quoted triples."""
+         return len(self._id_to_qt)
+
+     @property
+     def collision_count(self) -> int:
+         """Return the number of hash collisions encountered."""
+         return self._collision_count
+
+     # =========================================================================
+     # Persistence (Parquet)
+     # =========================================================================
+
+     def to_dataframe(self) -> pl.DataFrame:
+         """
+         Export the quoted triple dictionary to a Polars DataFrame.
+
+         Schema matches storage-spec.md §3.3:
+         - qt_id: u64
+         - s: u64
+         - p: u64
+         - o: u64
+         - qt_hash: stored as two u64 columns (hash_high, hash_low)
+         """
+         if not self._id_to_qt:
+             return pl.DataFrame({
+                 "qt_id": pl.Series([], dtype=pl.UInt64),
+                 "s": pl.Series([], dtype=pl.UInt64),
+                 "p": pl.Series([], dtype=pl.UInt64),
+                 "o": pl.Series([], dtype=pl.UInt64),
+                 "hash_high": pl.Series([], dtype=pl.UInt64),
+                 "hash_low": pl.Series([], dtype=pl.UInt64),
+             })
+
+         rows = []
+         for qt_id, qt in self._id_to_qt.items():
+             qt_hash = qt.compute_hash()
+             rows.append({
+                 "qt_id": qt_id,
+                 "s": qt.s,
+                 "p": qt.p,
+                 "o": qt.o,
+                 # Split the 128-bit hash into two u64 halves for Parquet
+                 "hash_high": qt_hash >> 64,
+                 "hash_low": qt_hash & ((1 << 64) - 1),
+             })
+
+         return pl.DataFrame(rows).cast({
+             "qt_id": pl.UInt64,
+             "s": pl.UInt64,
+             "p": pl.UInt64,
+             "o": pl.UInt64,
+             "hash_high": pl.UInt64,
+             "hash_low": pl.UInt64,
+         })
+
+     def save(self, path: Path) -> None:
+         """Save the quoted triple dictionary to qt_dict.parquet under `path`."""
+         path = Path(path)
+         path.mkdir(parents=True, exist_ok=True)
+         self.to_dataframe().write_parquet(path / "qt_dict.parquet")
+
+     @classmethod
+     def load(cls, path: Path, term_dict: TermDict) -> "QtDict":
+         """Load a quoted triple dictionary from qt_dict.parquet under `path`."""
+         path = Path(path)
+
+         instance = cls(term_dict)
+
+         df = pl.read_parquet(path / "qt_dict.parquet")
+         for row in df.iter_rows(named=True):
+             qt_id = row["qt_id"]
+             qt = QuotedTriple(row["s"], row["p"], row["o"])
+             # Reassemble the 128-bit hash from its two u64 halves
+             qt_hash = (row["hash_high"] << 64) | row["hash_low"]
+
+             instance._id_to_qt[qt_id] = qt
+             instance._hash_to_id[qt_hash] = qt_id
+
+             # Advance the QUOTED_TRIPLE sequence counter in TermDict
+             payload = get_term_payload(qt_id)
+             if payload >= term_dict._next_payload[TermKind.QUOTED_TRIPLE]:
+                 term_dict._next_payload[TermKind.QUOTED_TRIPLE] = payload + 1
+
+         return instance
+
+     def stats(self) -> dict:
+         """Return statistics about the quoted triple dictionary."""
+         return {
+             "total_quoted_triples": len(self),
+             "hash_collisions": self._collision_count,
+         }
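
A short interning sketch against the QtDict API above. The no-argument TermDict() constructor is an assumption for illustration, and the term ids are fabricated directly with make_term_id (bypassing TermDict's own counter), since TermDict's interning methods are not shown in this diff.

    from rdf_starbase.storage.terms import TermDict, TermKind, make_term_id
    from rdf_starbase.storage.quoted_triples import QtDict

    term_dict = TermDict()  # assumed no-arg constructor
    qt_dict = QtDict(term_dict)

    # Fabricate three IRI term ids from the (kind, payload) packing.
    s = make_term_id(TermKind.IRI, 1)
    p = make_term_id(TermKind.IRI, 2)
    o = make_term_id(TermKind.IRI, 3)

    # Interning is idempotent: the same (s, p, o) yields the same QtId.
    qt_id = qt_dict.get_or_create(s, p, o)
    assert qt_dict.get_or_create(s, p, o) == qt_id
    assert qt_dict.lookup(qt_id).to_tuple() == (s, p, o)

    # Expansion-join primitive: one row per known QtId.
    print(qt_dict.expand_to_dataframe([qt_id]))  # columns: qt_id, s, p, o (u64)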