rdf-starbase 0.1.0__py3-none-any.whl

"""
Predicate-Partitioned Parquet Storage.

Implements LSM-style base+delta storage with predicate partitioning
for high-performance RDF★ queries.

Key design decisions (from storage-spec.md):
- Primary partition key: predicate (p)
- Optional secondary partition key: graph (g)
- Base dataset: compacted partitions
- Delta dataset: append-only write batches
- Compaction with deduplication on (g, s, p, o)
"""

from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, List, Dict, Any
import json
import shutil

import polars as pl

from rdf_starbase.storage.terms import TermId, TermDict
from rdf_starbase.storage.quoted_triples import QtDict
from rdf_starbase.storage.facts import FactStore, FactFlags, DEFAULT_GRAPH_ID


# =============================================================================
# Partition Metadata
# =============================================================================

@dataclass
class PartitionStats:
    """Statistics for a partition (predicate or predicate+graph)."""
    predicate_id: TermId
    graph_id: Optional[TermId]
    row_count: int
    min_subject: Optional[TermId] = None
    max_subject: Optional[TermId] = None
    distinct_subjects: int = 0
    file_size_bytes: int = 0
    last_compacted: Optional[datetime] = None

    def to_dict(self) -> dict:
        return {
            "predicate_id": self.predicate_id,
            "graph_id": self.graph_id,
            "row_count": self.row_count,
            "min_subject": self.min_subject,
            "max_subject": self.max_subject,
            "distinct_subjects": self.distinct_subjects,
            "file_size_bytes": self.file_size_bytes,
            "last_compacted": self.last_compacted.isoformat() if self.last_compacted else None,
        }

    @classmethod
    def from_dict(cls, d: dict) -> "PartitionStats":
        return cls(
            predicate_id=d["predicate_id"],
            graph_id=d.get("graph_id"),
            row_count=d["row_count"],
            min_subject=d.get("min_subject"),
            max_subject=d.get("max_subject"),
            distinct_subjects=d.get("distinct_subjects", 0),
            file_size_bytes=d.get("file_size_bytes", 0),
            last_compacted=datetime.fromisoformat(d["last_compacted"]) if d.get("last_compacted") else None,
        )
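
# Illustrative round trip for PartitionStats (values are made up):
#
#   stats = PartitionStats(predicate_id=42, graph_id=None, row_count=1000)
#   blob = json.dumps(stats.to_dict())
#   restored = PartitionStats.from_dict(json.loads(blob))
#   assert restored.row_count == 1000 and restored.last_compacted is None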

# =============================================================================
# LSM Storage Manager
# =============================================================================

class LSMStorage:
    """
    LSM-style storage manager with predicate partitioning.

    Directory structure:
    - data/
      - term_dict/
        - term_dict.parquet
        - term_hash.parquet
      - qt_dict/
        - qt_dict.parquet
      - facts/
        - base/
          - p=<pid>/
            - part-0.parquet
            - part-1.parquet
        - delta/
          - p=<pid>/
            - batch-<txn>.parquet
      - meta/
        - schema_version.json
        - partitions.json
    """

    SCHEMA_VERSION = "1.0.0"

    def __init__(
        self,
        path: Path,
        term_dict: Optional[TermDict] = None,
        qt_dict: Optional[QtDict] = None,
        fact_store: Optional[FactStore] = None,
    ):
        """
        Initialize the LSM storage manager.

        Args:
            path: Root directory for storage
            term_dict: Existing TermDict (or create new)
            qt_dict: Existing QtDict (or create new)
            fact_store: Existing FactStore (or create new)
        """
        self.path = Path(path)

        # Initialize or use existing components
        if term_dict is None:
            term_dict = TermDict()
        self.term_dict = term_dict

        if qt_dict is None:
            qt_dict = QtDict(term_dict)
        self.qt_dict = qt_dict

        if fact_store is None:
            fact_store = FactStore(term_dict, qt_dict)
        self.fact_store = fact_store

        # Partition statistics
        self._partition_stats: Dict[TermId, PartitionStats] = {}

        # Delta buffer (facts not yet written to Parquet)
        self._delta_buffer: List[pl.DataFrame] = []
        self._delta_txn_count = 0

        # Configuration
        self.delta_flush_threshold = 10000  # Flush delta when this many rows accumulated
        self.compaction_size_threshold = 100000  # Compact when partition exceeds this

    # =========================================================================
    # Directory Structure
    # =========================================================================

    @property
    def _term_dict_path(self) -> Path:
        return self.path / "data" / "term_dict"

    @property
    def _qt_dict_path(self) -> Path:
        return self.path / "data" / "qt_dict"

    @property
    def _facts_base_path(self) -> Path:
        return self.path / "data" / "facts" / "base"

    @property
    def _facts_delta_path(self) -> Path:
        return self.path / "data" / "facts" / "delta"

    @property
    def _meta_path(self) -> Path:
        return self.path / "data" / "meta"

    def _partition_base_path(self, p: TermId) -> Path:
        return self._facts_base_path / f"p={p}"

    def _partition_delta_path(self, p: TermId) -> Path:
        return self._facts_delta_path / f"p={p}"

    # =========================================================================
    # Initialization and Persistence
    # =========================================================================

    def initialize(self):
        """Create directory structure for a new storage."""
        self._term_dict_path.mkdir(parents=True, exist_ok=True)
        self._qt_dict_path.mkdir(parents=True, exist_ok=True)
        self._facts_base_path.mkdir(parents=True, exist_ok=True)
        self._facts_delta_path.mkdir(parents=True, exist_ok=True)
        self._meta_path.mkdir(parents=True, exist_ok=True)

        # Write schema version
        self._save_schema_version()

    def _save_schema_version(self):
        """Save schema version metadata."""
        meta = {
            "schema_version": self.SCHEMA_VERSION,
            "encoding": "tagged_ids",
            "quoting_scope": "graph_agnostic",
            "created_at": datetime.now(timezone.utc).isoformat(),
        }
        with open(self._meta_path / "schema_version.json", "w") as f:
            json.dump(meta, f, indent=2)

    def _save_partition_stats(self):
        """Save partition statistics."""
        stats_list = [s.to_dict() for s in self._partition_stats.values()]
        with open(self._meta_path / "partitions.json", "w") as f:
            json.dump(stats_list, f, indent=2)

    def _load_partition_stats(self):
        """Load partition statistics."""
        stats_file = self._meta_path / "partitions.json"
        if stats_file.exists():
            with open(stats_file) as f:
                stats_list = json.load(f)
            self._partition_stats = {
                s["predicate_id"]: PartitionStats.from_dict(s)
                for s in stats_list
            }

    def save(self):
        """
        Persist all data to disk.

        Saves:
        - Term dictionary
        - Quoted triple dictionary
        - Flushes delta buffer to Parquet
        - Partition statistics
        """
        self.initialize()

        # Save dictionaries
        self.term_dict.save(self._term_dict_path)
        self.qt_dict.save(self._qt_dict_path)

        # Flush any buffered deltas
        self._flush_delta_buffer()

        # Save partition stats
        self._save_partition_stats()

    @classmethod
    def load(cls, path: Path) -> "LSMStorage":
        """
        Load storage from disk.

        Reconstructs:
        - Term dictionary
        - Quoted triple dictionary
        - Partition statistics
        - In-memory fact store from base + delta
        """
        path = Path(path)

        # Load term dictionary
        term_dict = TermDict.load(path / "data" / "term_dict")

        # Load qt dictionary
        qt_dict = QtDict.load(path / "data" / "qt_dict", term_dict)

        # Create storage instance
        instance = cls(path, term_dict, qt_dict)

        # Load partition stats
        instance._load_partition_stats()

        # Reconstruct fact store from base + delta
        instance._reconstruct_fact_store()

        return instance
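
    # Illustrative persistence round trip (a sketch, not executed):
    #
    #   storage.save()                            # flush deltas, write dictionaries + stats
    #   reopened = LSMStorage.load(storage.path)  # rebuilds the in-memory fact store
    #   print(reopened.stats()["fact_store"])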

    def _reconstruct_fact_store(self):
        """Reconstruct in-memory fact store from Parquet files."""
        dfs = []

        # Load base partitions
        if self._facts_base_path.exists():
            for p_dir in self._facts_base_path.iterdir():
                if p_dir.is_dir() and p_dir.name.startswith("p="):
                    for parquet_file in p_dir.glob("*.parquet"):
                        dfs.append(pl.read_parquet(parquet_file))

        # Load delta partitions
        if self._facts_delta_path.exists():
            for p_dir in self._facts_delta_path.iterdir():
                if p_dir.is_dir() and p_dir.name.startswith("p="):
                    for parquet_file in p_dir.glob("*.parquet"):
                        dfs.append(pl.read_parquet(parquet_file))

        if dfs:
            self.fact_store._df = pl.concat(dfs, how="vertical")
            # Update txn counter
            if len(self.fact_store._df) > 0:
                max_txn = self.fact_store._df["txn"].max()
                if max_txn is not None:
                    self.fact_store._next_txn = int(max_txn) + 1

    # =========================================================================
    # Write Path (Batch Ingestion)
    # =========================================================================

    def add_facts_batch(
        self,
        facts: List[tuple],  # (g, s, p, o)
        source: Optional[TermId] = None,
        confidence: float = 1.0,
        process: Optional[TermId] = None,
    ) -> int:
        """
        Add a batch of facts with shared provenance.

        Facts are added to the in-memory store and buffered for delta writes.
        """
        txn = self.fact_store.add_facts_batch(
            facts,
            source=source,
            confidence=confidence,
            process=process,
        )

        # Buffer for delta write
        self._delta_txn_count += 1

        # Check if we should flush
        if len(self.fact_store) > self.delta_flush_threshold:
            self._flush_delta_buffer()

        return txn
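
    # Illustrative write-path sketch. The *_id names are placeholder TermIds;
    # a real caller would intern IRIs/literals through the TermDict first.
    #
    #   facts = [
    #       (DEFAULT_GRAPH_ID, alice_id, knows_id, bob_id),
    #       (DEFAULT_GRAPH_ID, bob_id, knows_id, carol_id),
    #   ]
    #   txn = storage.add_facts_batch(facts, source=source_id, confidence=0.8)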

    def _flush_delta_buffer(self):
        """Write buffered facts to delta Parquet files."""
        if len(self.fact_store) == 0:
            return

        df = self.fact_store.to_dataframe()

        # Group by predicate
        predicates = df.select("p").unique()["p"].to_list()

        for p in predicates:
            p_df = df.filter(pl.col("p") == p)

            # Create partition directory
            delta_dir = self._partition_delta_path(p)
            delta_dir.mkdir(parents=True, exist_ok=True)

            # Write with txn-based filename
            txn = p_df["txn"].max()
            delta_file = delta_dir / f"batch-{txn}.parquet"
            p_df.write_parquet(delta_file)
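
    # Note: the per-predicate split above could equivalently be expressed with
    # Polars' partition_by (sketch):
    #
    #   for p_df in df.partition_by("p"):
    #       p = p_df["p"][0]
    #       ...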

    # =========================================================================
    # Compaction
    # =========================================================================

    def compact_partition(self, predicate_id: TermId):
        """
        Compact a single partition.

        Algorithm (from storage-spec.md §7.4):
        1. Read base files for partition
        2. Read delta files for partition
        3. Concatenate → stable sort by (s, o, txn)
        4. Groupby (g, s, p, o) selecting latest row
        5. Write new base part file(s)
        6. Delete compacted delta files
        7. Update partition stats
        """
        base_dir = self._partition_base_path(predicate_id)
        delta_dir = self._partition_delta_path(predicate_id)

        dfs = []
        delta_files = []

        # Read base files
        if base_dir.exists():
            for f in base_dir.glob("*.parquet"):
                dfs.append(pl.read_parquet(f))

        # Read delta files
        if delta_dir.exists():
            for f in delta_dir.glob("*.parquet"):
                dfs.append(pl.read_parquet(f))
                delta_files.append(f)

        if not dfs:
            return

        # Concatenate
        combined = pl.concat(dfs, how="vertical")

        # Sort by (s, o, txn)
        combined = combined.sort(["s", "o", "txn"])

        # Deduplicate: keep latest (max txn) for each (g, s, p, o)
        # If deleted, apply last-write-wins
        compacted = combined.group_by(["g", "s", "p", "o"]).agg([
            pl.col("flags").last(),
            pl.col("txn").last(),
            pl.col("t_added").last(),
            pl.col("source").last(),
            pl.col("confidence").last(),
            pl.col("process").last(),
        ])

        # Remove tombstones (deleted facts) during compaction
        compacted = compacted.filter(
            (pl.col("flags") & int(FactFlags.DELETED)) == 0
        )

        # Write new base file
        base_dir.mkdir(parents=True, exist_ok=True)

        # Remove old base files
        for f in base_dir.glob("*.parquet"):
            f.unlink()

        # Write compacted data
        if len(compacted) > 0:
            compacted.write_parquet(base_dir / "part-0.parquet")

        # Delete compacted delta files
        for f in delta_files:
            f.unlink()

        # Update stats
        self._partition_stats[predicate_id] = PartitionStats(
            predicate_id=predicate_id,
            graph_id=None,
            row_count=len(compacted),
            min_subject=compacted["s"].min() if len(compacted) > 0 else None,
            max_subject=compacted["s"].max() if len(compacted) > 0 else None,
            distinct_subjects=compacted.select("s").n_unique() if len(compacted) > 0 else 0,
            file_size_bytes=(base_dir / "part-0.parquet").stat().st_size if len(compacted) > 0 else 0,
            last_compacted=datetime.now(timezone.utc),
        )
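
    # Toy sketch of the dedup step above (standalone Polars, illustrative only):
    #
    #   df = pl.DataFrame({
    #       "g": [0, 0], "s": [1, 1], "p": [2, 2], "o": [3, 3],
    #       "flags": [0, int(FactFlags.DELETED)],   # second write is a tombstone
    #       "txn": [10, 11],
    #   })
    #   latest = (
    #       df.sort(["s", "o", "txn"])
    #         .group_by(["g", "s", "p", "o"])
    #         .agg(pl.col("flags").last(), pl.col("txn").last())
    #   )
    #   # -> a single row with txn == 11; the tombstone filter then drops it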

    def compact_all(self):
        """Compact all partitions."""
        predicates = set()

        # Collect predicates from base
        if self._facts_base_path.exists():
            for p_dir in self._facts_base_path.iterdir():
                if p_dir.is_dir() and p_dir.name.startswith("p="):
                    p_id = int(p_dir.name.split("=")[1])
                    predicates.add(p_id)

        # Collect predicates from delta
        if self._facts_delta_path.exists():
            for p_dir in self._facts_delta_path.iterdir():
                if p_dir.is_dir() and p_dir.name.startswith("p="):
                    p_id = int(p_dir.name.split("=")[1])
                    predicates.add(p_id)

        for p_id in predicates:
            self.compact_partition(p_id)

        self._save_partition_stats()

    # =========================================================================
    # Query Primitives
    # =========================================================================

    def scan_partition(
        self,
        predicate_id: TermId,
        include_deleted: bool = False,
    ) -> pl.DataFrame:
        """
        Scan a specific predicate partition.

        Reads from both base and delta, returning combined results.
        This enables partition pruning for predicate-selective queries.
        """
        dfs = []

        # Read base
        base_dir = self._partition_base_path(predicate_id)
        if base_dir.exists():
            for f in base_dir.glob("*.parquet"):
                dfs.append(pl.read_parquet(f))

        # Read delta
        delta_dir = self._partition_delta_path(predicate_id)
        if delta_dir.exists():
            for f in delta_dir.glob("*.parquet"):
                dfs.append(pl.read_parquet(f))

        if not dfs:
            return self.fact_store._create_empty_dataframe()

        result = pl.concat(dfs, how="vertical")

        if not include_deleted:
            result = result.filter(
                (pl.col("flags") & int(FactFlags.DELETED)) == 0
            )

        return result
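
    # Illustrative query sketch: scan one predicate partition and narrow it to a
    # single subject (knows_id and alice_id are placeholder TermIds):
    #
    #   df = storage.scan_partition(knows_id)
    #   alice_rows = df.filter(pl.col("s") == alice_id)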

    def get_partition_stats(self, predicate_id: TermId) -> Optional[PartitionStats]:
        """Get statistics for a partition."""
        return self._partition_stats.get(predicate_id)

    def list_partitions(self) -> List[TermId]:
        """List all partition predicate IDs."""
        return list(self._partition_stats.keys())

    # =========================================================================
    # Statistics
    # =========================================================================

    def stats(self) -> dict:
        """Return storage statistics."""
        total_base_size = 0
        total_delta_size = 0

        if self._facts_base_path.exists():
            for f in self._facts_base_path.rglob("*.parquet"):
                total_base_size += f.stat().st_size

        if self._facts_delta_path.exists():
            for f in self._facts_delta_path.rglob("*.parquet"):
                total_delta_size += f.stat().st_size

        return {
            "term_dict": self.term_dict.stats(),
            "qt_dict": self.qt_dict.stats(),
            "fact_store": self.fact_store.stats(),
            "partitions": len(self._partition_stats),
            "base_size_bytes": total_base_size,
            "delta_size_bytes": total_delta_size,
            "total_size_bytes": total_base_size + total_delta_size,
        }