rdf-starbase 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,850 @@
+ """
+ Integer-Based Facts Store.
+
+ Implements the `facts` table with dictionary-encoded integer columns.
+ No string terms inside facts - everything is ID-based for maximum performance.
+
+ Key design decisions (from storage-spec.md):
+ - All columns are integer IDs (g, s, p, o are TermIds)
+ - RDF★ metadata triples stored by setting s/o to QtId
+ - Batch-first ingestion with monotonic txn IDs
+ - Flags bitset for asserted/inferred/deleted states
+ - Predicate-partitioned storage layout
+ """
+
+ from dataclasses import dataclass, field
+ from datetime import datetime, timezone
+ from enum import IntFlag
+ from typing import Optional, List, Tuple, Any
+ from pathlib import Path
+ from uuid import UUID, uuid4
+ import struct
+
+ import polars as pl
+
+ from rdf_starbase.storage.terms import (
+     TermId,
+     TermKind,
+     TermDict,
+     Term,
+     get_term_kind,
+     is_quoted_triple,
+ )
+ from rdf_starbase.storage.quoted_triples import QtDict, QtId
+
+
+ # =============================================================================
+ # Fact Flags
+ # =============================================================================
+
+ class FactFlags(IntFlag):
+     """
+     Bitset flags for fact state.
+
+     Stored in the `flags` column (u16).
+     """
+     NONE = 0
+     ASSERTED = 1 << 0   # Explicitly asserted
+     INFERRED = 1 << 1   # Derived by inference
+     DELETED = 1 << 2    # Tombstone (soft delete)
+     METADATA = 1 << 3   # This is a metadata triple (s or o is QtId)
+
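The `flags` column stores this bitset as a plain integer; a minimal sketch of how the bits combine and are tested (standard IntFlag behaviour; the module path `rdf_starbase.storage.facts` is an assumption based on the package imports above):

from rdf_starbase.storage.facts import FactFlags

flags = FactFlags.ASSERTED | FactFlags.METADATA
assert int(flags) == 9                      # 0b1001, the value stored in the u16 column
assert flags & FactFlags.METADATA           # METADATA bit set
assert not (flags & FactFlags.DELETED)      # DELETED bit clear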
+
+ # =============================================================================
+ # Default Graph
+ # =============================================================================
+
+ # The default graph is represented as TermId 0
+ DEFAULT_GRAPH_ID: TermId = 0
+
+
+ # =============================================================================
+ # Fact Store
+ # =============================================================================
+
+ class FactStore:
+     """
+     Integer-based facts store.
+
+     Stores quads as (g, s, p, o) where all components are TermIds.
+     Supports RDF★ by allowing QtIds as subjects or objects.
+
+     Schema matches storage-spec.md §3.4:
+     - g: u64 (GraphId, DEFAULT_GRAPH_ID for default)
+     - s: u64 (TermId, may be QtId)
+     - p: u64 (TermId)
+     - o: u64 (TermId, may be QtId)
+     - flags: u16 (FactFlags bitset)
+     - txn: u64 (transaction/commit ID)
+     - t_added: u64 (timestamp, microseconds since epoch)
+
+     Also stores provenance columns for backward compatibility:
+     - source: u64 (TermId for source IRI/literal)
+     - confidence: f64
+     - process: u64 (TermId for process IRI)
+     """
+
+     def __init__(self, term_dict: TermDict, qt_dict: QtDict):
+         """
+         Initialize the fact store.
+
+         Args:
+             term_dict: The TermDict for term interning
+             qt_dict: The QtDict for quoted triple interning
+         """
+         self._term_dict = term_dict
+         self._qt_dict = qt_dict
+
+         # Transaction counter
+         self._next_txn: int = 0
+
+         # Facts DataFrame with integer columns
+         self._df = self._create_empty_dataframe()
+
+         # Pre-intern the default graph marker
+         self._default_graph_id = DEFAULT_GRAPH_ID
+
+     def _create_empty_dataframe(self) -> pl.DataFrame:
+         """Create an empty facts DataFrame with the correct schema."""
+         return pl.DataFrame({
+             "g": pl.Series([], dtype=pl.UInt64),
+             "s": pl.Series([], dtype=pl.UInt64),
+             "p": pl.Series([], dtype=pl.UInt64),
+             "o": pl.Series([], dtype=pl.UInt64),
+             "flags": pl.Series([], dtype=pl.UInt16),
+             "txn": pl.Series([], dtype=pl.UInt64),
+             "t_added": pl.Series([], dtype=pl.UInt64),
+             # Provenance columns (for backward compatibility)
+             "source": pl.Series([], dtype=pl.UInt64),
+             "confidence": pl.Series([], dtype=pl.Float64),
+             "process": pl.Series([], dtype=pl.UInt64),
+         })
+
+     def _allocate_txn(self) -> int:
+         """Allocate the next transaction ID."""
+         txn = self._next_txn
+         self._next_txn += 1
+         return txn
+
+     def add_fact(
+         self,
+         s: TermId,
+         p: TermId,
+         o: TermId,
+         g: TermId = DEFAULT_GRAPH_ID,
+         flags: FactFlags = FactFlags.ASSERTED,
+         source: Optional[TermId] = None,
+         confidence: float = 1.0,
+         process: Optional[TermId] = None,
+         t_added: Optional[int] = None,
+     ) -> int:
+         """
+         Add a single fact to the store.
+
+         Args:
+             s: Subject TermId (may be QtId for metadata triples)
+             p: Predicate TermId
+             o: Object TermId (may be QtId for metadata triples)
+             g: Graph TermId (DEFAULT_GRAPH_ID for default graph)
+             flags: Fact flags (ASSERTED by default)
+             source: Source TermId for provenance
+             confidence: Confidence score (0.0 to 1.0)
+             process: Process TermId for provenance
+             t_added: Timestamp in microseconds since epoch (default: now)
+
+         Returns:
+             Transaction ID
+         """
+         txn = self._allocate_txn()
+         if t_added is None:
+             t_added = int(datetime.now(timezone.utc).timestamp() * 1_000_000)
+
+         # Auto-detect metadata triples
+         if is_quoted_triple(s) or is_quoted_triple(o):
+             flags = flags | FactFlags.METADATA
+
+         new_row = pl.DataFrame({
+             "g": [g],
+             "s": [s],
+             "p": [p],
+             "o": [o],
+             "flags": [int(flags)],
+             "txn": [txn],
+             "t_added": [t_added],
+             "source": [source if source is not None else 0],
+             "confidence": [confidence],
+             "process": [process if process is not None else 0],
+         }).cast({
+             "g": pl.UInt64,
+             "s": pl.UInt64,
+             "p": pl.UInt64,
+             "o": pl.UInt64,
+             "flags": pl.UInt16,
+             "txn": pl.UInt64,
+             "t_added": pl.UInt64,
+             "source": pl.UInt64,
+             "confidence": pl.Float64,
+             "process": pl.UInt64,
+         })
+
+         self._df = pl.concat([self._df, new_row], how="vertical")
+         return txn
+
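An illustrative single-fact insert, not taken from the package documentation; it assumes TermDict() and QtDict() can be constructed empty, and it uses small hypothetical integers as stand-ins for interned TermIds (values assumed to fall outside the QtId range):

from rdf_starbase.storage.facts import FactStore
from rdf_starbase.storage.terms import TermDict
from rdf_starbase.storage.quoted_triples import QtDict

store = FactStore(TermDict(), QtDict())
ALICE, KNOWS, BOB = 101, 102, 103           # hypothetical pre-interned TermIds
txn = store.add_fact(s=ALICE, p=KNOWS, o=BOB, confidence=0.9)
print(txn, len(store))                      # first txn is 0; one fact stored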
+     def add_facts_batch(
+         self,
+         facts: List[Tuple[TermId, TermId, TermId, TermId]],  # (g, s, p, o)
+         flags: FactFlags = FactFlags.ASSERTED,
+         source: Optional[TermId] = None,
+         confidence: float = 1.0,
+         process: Optional[TermId] = None,
+     ) -> int:
+         """
+         Add a batch of facts with shared provenance.
+
+         This is the recommended ingestion path for performance.
+
+         Args:
+             facts: List of (g, s, p, o) tuples
+             flags: Shared flags for all facts
+             source: Shared source TermId
+             confidence: Shared confidence score
+             process: Shared process TermId
+
+         Returns:
+             Transaction ID for the batch
+         """
+         if not facts:
+             return self._allocate_txn()
+
+         txn = self._allocate_txn()
+         t_added = int(datetime.now(timezone.utc).timestamp() * 1_000_000)
+
+         rows = []
+         for g, s, p, o in facts:
+             fact_flags = flags
+             if is_quoted_triple(s) or is_quoted_triple(o):
+                 fact_flags = fact_flags | FactFlags.METADATA
+
+             rows.append({
+                 "g": g,
+                 "s": s,
+                 "p": p,
+                 "o": o,
+                 "flags": int(fact_flags),
+                 "txn": txn,
+                 "t_added": t_added,
+                 "source": source if source is not None else 0,
+                 "confidence": confidence,
+                 "process": process if process is not None else 0,
+             })
+
+         new_df = pl.DataFrame(rows).cast({
+             "g": pl.UInt64,
+             "s": pl.UInt64,
+             "p": pl.UInt64,
+             "o": pl.UInt64,
+             "flags": pl.UInt16,
+             "txn": pl.UInt64,
+             "t_added": pl.UInt64,
+             "source": pl.UInt64,
+             "confidence": pl.Float64,
+             "process": pl.UInt64,
+         })
+
+         self._df = pl.concat([self._df, new_df], how="vertical")
+         return txn
+
+     def add_facts_columnar(
+         self,
+         g_col: List[TermId],
+         s_col: List[TermId],
+         p_col: List[TermId],
+         o_col: List[TermId],
+         flags: FactFlags = FactFlags.ASSERTED,
+         source: Optional[TermId] = None,
+         confidence: float = 1.0,
+         process: Optional[TermId] = None,
+     ) -> int:
+         """
+         Add facts from pre-built column lists (TRUE vectorized path).
+
+         This is the FASTEST ingestion method. Build your column data
+         separately, then pass it here for a single DataFrame creation.
+
+         Args:
+             g_col: List of graph TermIds
+             s_col: List of subject TermIds
+             p_col: List of predicate TermIds
+             o_col: List of object TermIds
+             flags: Shared flags for all facts
+             source: Shared source TermId
+             confidence: Shared confidence score
+             process: Shared process TermId
+
+         Returns:
+             Transaction ID
+         """
+         n = len(s_col)
+         if n == 0:
+             return self._allocate_txn()
+
+         txn = self._allocate_txn()
+         t_added = int(datetime.now(timezone.utc).timestamp() * 1_000_000)
+
+         # Build DataFrame directly from columns (no Python loop)
+         new_df = pl.DataFrame({
+             "g": pl.Series(g_col, dtype=pl.UInt64),
+             "s": pl.Series(s_col, dtype=pl.UInt64),
+             "p": pl.Series(p_col, dtype=pl.UInt64),
+             "o": pl.Series(o_col, dtype=pl.UInt64),
+             "flags": pl.Series([int(flags)] * n, dtype=pl.UInt16),
+             "txn": pl.Series([txn] * n, dtype=pl.UInt64),
+             "t_added": pl.Series([t_added] * n, dtype=pl.UInt64),
+             "source": pl.Series([source if source else 0] * n, dtype=pl.UInt64),
+             "confidence": pl.Series([confidence] * n, dtype=pl.Float64),
+             "process": pl.Series([process if process else 0] * n, dtype=pl.UInt64),
+         })
+
+         self._df = pl.concat([self._df, new_df], how="vertical")
+         return txn
+
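A sketch of the columnar path, continuing the hypothetical store and IDs from the sketch above; the point is that the four pre-built ID lists become a single DataFrame append rather than a per-row loop:

g_col = [0, 0, 0]                           # DEFAULT_GRAPH_ID for every fact
s_col = [111, 112, 113]                     # hypothetical subject TermIds
p_col = [120, 120, 120]                     # one shared predicate TermId
o_col = [131, 132, 133]                     # hypothetical object TermIds
txn = store.add_facts_columnar(g_col, s_col, p_col, o_col, confidence=0.8)
by_pred = store.scan_facts(p=120)           # predicate-filtered scan over the new rows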
+     def add_facts_with_provenance(
+         self,
+         facts: List[Tuple[TermId, TermId, TermId, TermId, Optional[TermId], float, Optional[TermId]]],
+         flags: FactFlags = FactFlags.ASSERTED,
+     ) -> int:
+         """
+         Add facts with per-fact provenance (confidence, source, process).
+
+         This is the recommended path for ingesting data with provenance metadata
+         stored in native columns rather than as separate RDF triples.
+
+         Args:
+             facts: List of (g, s, p, o, source, confidence, process) tuples
+                 - source: TermId for data source (or None)
+                 - confidence: Float confidence score (0.0 to 1.0)
+                 - process: TermId for generating process (or None)
+             flags: Base flags for all facts
+
+         Returns:
+             Transaction ID for the batch
+         """
+         if not facts:
+             return self._allocate_txn()
+
+         txn = self._allocate_txn()
+         t_added = int(datetime.now(timezone.utc).timestamp() * 1_000_000)
+
+         rows = []
+         for g, s, p, o, source, confidence, process in facts:
+             fact_flags = flags
+             if is_quoted_triple(s) or is_quoted_triple(o):
+                 fact_flags = fact_flags | FactFlags.METADATA
+
+             rows.append({
+                 "g": g,
+                 "s": s,
+                 "p": p,
+                 "o": o,
+                 "flags": int(fact_flags),
+                 "txn": txn,
+                 "t_added": t_added,
+                 "source": source if source is not None else 0,
+                 "confidence": confidence,
+                 "process": process if process is not None else 0,
+             })
+
+         new_df = pl.DataFrame(rows).cast({
+             "g": pl.UInt64,
+             "s": pl.UInt64,
+             "p": pl.UInt64,
+             "o": pl.UInt64,
+             "flags": pl.UInt16,
+             "txn": pl.UInt64,
+             "t_added": pl.UInt64,
+             "source": pl.UInt64,
+             "confidence": pl.Float64,
+             "process": pl.UInt64,
+         })
+
+         self._df = pl.concat([self._df, new_df], how="vertical")
+         return txn
+
+     def scan_by_confidence(
+         self,
+         min_confidence: float,
+         max_confidence: Optional[float] = None,
+         include_metadata: bool = True,
+     ) -> pl.DataFrame:
+         """
+         Scan facts by confidence threshold using native column.
+
+         This is O(n) scan but uses vectorized Polars filtering -
+         no string parsing or joins required.
+
+         Args:
+             min_confidence: Minimum confidence (exclusive)
+             max_confidence: Maximum confidence (inclusive, optional)
+             include_metadata: Whether to include metadata facts
+
+         Returns:
+             DataFrame with all columns for matching facts
+         """
+         df = self._df.lazy()
+
+         # Filter by confidence
+         df = df.filter(pl.col("confidence") > min_confidence)
+         if max_confidence is not None:
+             df = df.filter(pl.col("confidence") <= max_confidence)
+
+         # Exclude deleted
+         df = df.filter((pl.col("flags") & int(FactFlags.DELETED)) == 0)
+
+         # Optionally filter out metadata facts
+         if not include_metadata:
+             df = df.filter((pl.col("flags") & int(FactFlags.METADATA)) == 0)
+
+         return df.collect()
+
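A sketch combining per-fact provenance ingestion with the native-column scans, again using hypothetical IDs on the store from the sketches above:

facts = [
    # (g, s, p, o, source, confidence, process)
    (0, 111, 120, 131, 900, 0.95, 901),
    (0, 112, 120, 132, 900, 0.40, 901),
]
store.add_facts_with_provenance(facts)
high = store.scan_by_confidence(min_confidence=0.5)   # rows with confidence above 0.5
from_src = store.scan_by_source(source=900)           # both provenance-tagged facts share source 900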
+     def scan_by_source(
+         self,
+         source: TermId,
+         include_metadata: bool = True,
+     ) -> pl.DataFrame:
+         """
+         Scan facts by source using native column.
+
+         Args:
+             source: Source TermId to filter by
+             include_metadata: Whether to include metadata facts
+
+         Returns:
+             DataFrame with all columns for matching facts
+         """
+         df = self._df.lazy()
+         df = df.filter(pl.col("source") == source)
+         df = df.filter((pl.col("flags") & int(FactFlags.DELETED)) == 0)
+
+         if not include_metadata:
+             df = df.filter((pl.col("flags") & int(FactFlags.METADATA)) == 0)
+
+         return df.collect()
+
+     # =========================================================================
+     # Query Primitives (storage-spec.md §8)
+     # =========================================================================
+
+     def scan_facts(
+         self,
+         p: Optional[TermId] = None,
+         g: Optional[TermId] = None,
+         include_deleted: bool = False,
+     ) -> pl.DataFrame:
+         """
+         Scan facts with optional predicate and graph filters.
+
+         This is the primary scan primitive for query execution.
+         When predicate is specified, this enables partition pruning.
+
+         Args:
+             p: Optional predicate filter
+             g: Optional graph filter
+             include_deleted: Whether to include deleted facts
+
+         Returns:
+             DataFrame with columns: g, s, p, o, flags, txn, t_added, source, confidence, process
+         """
+         df = self._df.lazy()
+
+         if p is not None:
+             df = df.filter(pl.col("p") == p)
+
+         if g is not None:
+             df = df.filter(pl.col("g") == g)
+
+         if not include_deleted:
+             df = df.filter((pl.col("flags") & int(FactFlags.DELETED)) == 0)
+
+         return df.collect()
+
+     def scan_facts_by_s(
+         self,
+         s: TermId,
+         p: Optional[TermId] = None,
+         g: Optional[TermId] = None,
+     ) -> pl.DataFrame:
+         """
+         Scan facts by subject with optional predicate and graph filters.
+
+         Useful for "show me all facts about entity X" queries.
+         """
+         df = self._df.lazy().filter(pl.col("s") == s)
+
+         if p is not None:
+             df = df.filter(pl.col("p") == p)
+
+         if g is not None:
+             df = df.filter(pl.col("g") == g)
+
+         df = df.filter((pl.col("flags") & int(FactFlags.DELETED)) == 0)
+
+         return df.collect()
+
+     def scan_facts_by_o(
+         self,
+         o: TermId,
+         p: Optional[TermId] = None,
+         g: Optional[TermId] = None,
+     ) -> pl.DataFrame:
+         """
+         Scan facts by object with optional predicate and graph filters.
+
+         Useful for reverse lookups (inbound edges).
+         """
+         df = self._df.lazy().filter(pl.col("o") == o)
+
+         if p is not None:
+             df = df.filter(pl.col("p") == p)
+
+         if g is not None:
+             df = df.filter(pl.col("g") == g)
+
+         df = df.filter((pl.col("flags") & int(FactFlags.DELETED)) == 0)
+
+         return df.collect()
+
+     def scan_metadata_facts(
+         self,
+         qt_id: Optional[QtId] = None,
+         p: Optional[TermId] = None,
+     ) -> pl.DataFrame:
+         """
+         Scan facts where subject is a quoted triple (metadata facts).
+
+         This is the key primitive for RDF★ metadata queries.
+
+         Args:
+             qt_id: Optional specific quoted triple to filter by
+             p: Optional predicate filter (e.g., prov:wasDerivedFrom)
+         """
+         df = self._df.lazy().filter(
+             (pl.col("flags") & int(FactFlags.METADATA)) != 0
+         )
+
+         if qt_id is not None:
+             df = df.filter(pl.col("s") == qt_id)
+
+         if p is not None:
+             df = df.filter(pl.col("p") == p)
+
+         df = df.filter((pl.col("flags") & int(FactFlags.DELETED)) == 0)
+
+         return df.collect()
+
+     def scan_facts_at_time(
+         self,
+         as_of_time: datetime,
+         p: Optional[TermId] = None,
+         g: Optional[TermId] = None,
+         s: Optional[TermId] = None,
+     ) -> pl.DataFrame:
+         """
+         Time-travel query: return facts as they existed at a specific point in time.
+
+         This is a key capability for compliance and auditing:
+         - "What did we believe to be true on 2024-01-15?"
+         - "What assertions existed before the data refresh?"
+
+         Args:
+             as_of_time: The point in time to query
+             p: Optional predicate filter
+             g: Optional graph filter
+             s: Optional subject filter
+
+         Returns:
+             DataFrame with facts that existed at the specified time
+         """
+         # Convert datetime to microseconds timestamp
+         as_of_ts = int(as_of_time.timestamp() * 1_000_000)
+
+         df = self._df.lazy()
+
+         # Only include facts added before the specified time
+         df = df.filter(pl.col("t_added") <= as_of_ts)
+
+         # Apply optional filters
+         if p is not None:
+             df = df.filter(pl.col("p") == p)
+         if g is not None:
+             df = df.filter(pl.col("g") == g)
+         if s is not None:
+             df = df.filter(pl.col("s") == s)
+
+         # For time-travel, we need to show the state at that time
+         # If a fact was deleted after as_of_time, it should still show
+         # This implementation shows all facts added by that time
+         # (For full versioning, we'd need to track delete timestamps too)
+         df = df.filter((pl.col("flags") & int(FactFlags.DELETED)) == 0)
+
+         return df.collect()
+
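A time-travel sketch on the same hypothetical store; since t_added is microseconds since the epoch, any timezone-aware datetime converts to the same scale:

from datetime import datetime, timezone, timedelta

cutoff = datetime.now(timezone.utc) - timedelta(days=1)
snapshot = store.scan_facts_at_time(cutoff)            # facts whose t_added <= cutoff
print(snapshot.select(["s", "p", "o", "txn"]))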
+     def scan_facts_by_txn_range(
+         self,
+         start_txn: int,
+         end_txn: Optional[int] = None,
+         include_deleted: bool = False,
+     ) -> pl.DataFrame:
+         """
+         Scan facts by transaction ID range.
+
+         Useful for:
+         - Incremental sync: "give me all changes since txn 1000"
+         - Change data capture
+         - Debugging specific ingestion batches
+
+         Args:
+             start_txn: Start transaction ID (inclusive)
+             end_txn: End transaction ID (inclusive, optional)
+             include_deleted: Whether to include deleted facts
+
+         Returns:
+             DataFrame with facts in the specified transaction range
+         """
+         df = self._df.lazy().filter(pl.col("txn") >= start_txn)
+
+         if end_txn is not None:
+             df = df.filter(pl.col("txn") <= end_txn)
+
+         if not include_deleted:
+             df = df.filter((pl.col("flags") & int(FactFlags.DELETED)) == 0)
+
+         return df.collect()
+
+     # =========================================================================
+     # RDF★ Expansion Joins
+     # =========================================================================
+
+     def expand_qt_metadata(
+         self,
+         metadata_predicate: TermId,
+     ) -> pl.DataFrame:
+         """
+         Critical RDF★ expansion join.
+
+         Given a metadata predicate (e.g., prov:wasDerivedFrom), finds all
+         quoted triples with that metadata and expands them to (s, p, o).
+
+         Returns DataFrame with: qt_id, base_s, base_p, base_o, metadata_o
+
+         This implements the join pattern from storage-spec.md §8:
+         1. scan_facts(p=metadata_predicate) → qt_id, metadata_o
+         2. lookup_qt(qt_ids) → qt_id, s, p, o
+         3. join → base triple + metadata value
+         """
+         # Step 1: Get all facts with the metadata predicate where subject is a qt
+         df1 = self._df.lazy().filter(
+             (pl.col("p") == metadata_predicate) &
+             ((pl.col("flags") & int(FactFlags.METADATA)) != 0) &
+             ((pl.col("flags") & int(FactFlags.DELETED)) == 0)
+         ).select([
+             pl.col("s").alias("qt_id"),
+             pl.col("o").alias("metadata_o"),
+             pl.col("confidence"),
+             pl.col("source"),
+         ]).collect()
+
+         if df1.is_empty():
+             return pl.DataFrame({
+                 "qt_id": pl.Series([], dtype=pl.UInt64),
+                 "base_s": pl.Series([], dtype=pl.UInt64),
+                 "base_p": pl.Series([], dtype=pl.UInt64),
+                 "base_o": pl.Series([], dtype=pl.UInt64),
+                 "metadata_o": pl.Series([], dtype=pl.UInt64),
+                 "confidence": pl.Series([], dtype=pl.Float64),
+                 "source": pl.Series([], dtype=pl.UInt64),
+             })
+
+         # Step 2: Expand qt_ids to base triples
+         qt_ids = df1["qt_id"].to_list()
+         df2 = self._qt_dict.expand_to_dataframe(qt_ids)
+
+         if df2.is_empty():
+             return pl.DataFrame({
+                 "qt_id": pl.Series([], dtype=pl.UInt64),
+                 "base_s": pl.Series([], dtype=pl.UInt64),
+                 "base_p": pl.Series([], dtype=pl.UInt64),
+                 "base_o": pl.Series([], dtype=pl.UInt64),
+                 "metadata_o": pl.Series([], dtype=pl.UInt64),
+                 "confidence": pl.Series([], dtype=pl.Float64),
+                 "source": pl.Series([], dtype=pl.UInt64),
+             })
+
+         # Rename columns for join
+         df2 = df2.rename({
+             "s": "base_s",
+             "p": "base_p",
+             "o": "base_o",
+         })
+
+         # Step 3: Join
+         return df1.join(df2, on="qt_id", how="inner")
+
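An illustrative call of the expansion join with a hypothetical predicate ID; on a store with no RDF★ metadata facts it returns the empty, correctly typed frame, otherwise one row per (quoted triple, metadata value) pair:

import polars as pl

PROV_DERIVED_FROM = 140                      # hypothetical interned predicate TermId
expanded = store.expand_qt_metadata(PROV_DERIVED_FROM)
# Columns include: qt_id, metadata_o, confidence, source, base_s, base_p, base_o
trusted = expanded.filter(pl.col("confidence") > 0.8)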
+     def expand_metadata_df(self, metadata_df: pl.DataFrame) -> pl.DataFrame:
+         """
+         Expand a DataFrame of metadata facts.
+
+         Takes a DataFrame that has at minimum an 's' column containing qt_ids,
+         and expands each qt_id to its (base_s, base_p, base_o) components.
+
+         This is useful when you've already filtered metadata facts and
+         want to expand them.
+
+         Args:
+             metadata_df: DataFrame with 's' column containing qt_ids
+
+         Returns:
+             DataFrame with original columns plus base_s, base_p, base_o
+         """
+         if metadata_df.is_empty():
+             return metadata_df.with_columns([
+                 pl.lit(0).cast(pl.UInt64).alias("base_s"),
+                 pl.lit(0).cast(pl.UInt64).alias("base_p"),
+                 pl.lit(0).cast(pl.UInt64).alias("base_o"),
+             ]).filter(pl.lit(False))  # Empty with correct schema
+
+         # Get qt_ids from subject column
+         qt_ids = metadata_df["s"].to_list()
+
+         # Expand using qt_dict
+         qt_df = self._qt_dict.expand_to_dataframe(qt_ids)
+
+         if qt_df.is_empty():
+             return metadata_df.with_columns([
+                 pl.lit(0).cast(pl.UInt64).alias("base_s"),
+                 pl.lit(0).cast(pl.UInt64).alias("base_p"),
+                 pl.lit(0).cast(pl.UInt64).alias("base_o"),
+             ]).filter(pl.lit(False))
+
+         # Rename for clarity
+         qt_df = qt_df.rename({
+             "qt_id": "s",  # Match the join key
+             "s": "base_s",
+             "p": "base_p",
+             "o": "base_o",
+         })
+
+         # Join on s (the qt_id)
+         return metadata_df.join(qt_df, on="s", how="inner")
+
+     # =========================================================================
+     # Soft Delete and Deprecation
+     # =========================================================================
+
+     def mark_deleted(
+         self,
+         s: Optional[TermId] = None,
+         p: Optional[TermId] = None,
+         o: Optional[TermId] = None,
+         g: Optional[TermId] = None,
+     ) -> int:
+         """
+         Soft-delete facts matching the given pattern.
+
+         Returns the number of facts marked as deleted.
+         """
+         mask = pl.lit(True)
+
+         if s is not None:
+             mask = mask & (pl.col("s") == s)
+         if p is not None:
+             mask = mask & (pl.col("p") == p)
+         if o is not None:
+             mask = mask & (pl.col("o") == o)
+         if g is not None:
+             mask = mask & (pl.col("g") == g)
+
+         before_count = self._df.filter(
+             mask & ((pl.col("flags") & int(FactFlags.DELETED)) == 0)
+         ).height
+
+         self._df = self._df.with_columns([
+             pl.when(mask)
+             .then(pl.col("flags") | int(FactFlags.DELETED))
+             .otherwise(pl.col("flags"))
+             .alias("flags")
+         ])
+
+         return before_count
+
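Soft deletion is pattern-based and only flips the DELETED bit, so totals and active counts diverge; note that calling mark_deleted() with no pattern tombstones every fact. A small sketch on the same hypothetical store:

tombstoned = store.mark_deleted(p=120)       # tombstone every fact with predicate 120
print(tombstoned, store.count_active(), len(store))
# scan_facts() now skips them; scan_facts(include_deleted=True) still returns them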
+     # =========================================================================
+     # Persistence
+     # =========================================================================
+
+     def to_dataframe(self) -> pl.DataFrame:
+         """Return the facts as a DataFrame."""
+         return self._df
+
+     def save(self, path: Path):
+         """Save facts to a Parquet file."""
+         path = Path(path)
+         path.mkdir(parents=True, exist_ok=True)
+         self._df.write_parquet(path / "facts.parquet")
+
+         # Save transaction counter
+         with open(path / "facts_meta.txt", "w") as f:
+             f.write(f"next_txn={self._next_txn}\n")
+
+     @classmethod
+     def load(cls, path: Path, term_dict: TermDict, qt_dict: QtDict) -> "FactStore":
+         """Load facts from a Parquet file."""
+         path = Path(path)
+
+         instance = cls(term_dict, qt_dict)
+         instance._df = pl.read_parquet(path / "facts.parquet")
+
+         # Load transaction counter
+         meta_file = path / "facts_meta.txt"
+         if meta_file.exists():
+             with open(meta_file) as f:
+                 for line in f:
+                     if line.startswith("next_txn="):
+                         instance._next_txn = int(line.split("=")[1].strip())
+
+         return instance
+
+     # =========================================================================
+     # Statistics
+     # =========================================================================
+
+     def __len__(self) -> int:
+         """Return the total number of facts (including deleted)."""
+         return len(self._df)
+
+     def count_active(self) -> int:
+         """Return the number of non-deleted facts."""
+         return self._df.filter(
+             (pl.col("flags") & int(FactFlags.DELETED)) == 0
+         ).height
+
+     def count_metadata(self) -> int:
+         """Return the number of metadata facts."""
+         return self._df.filter(
+             ((pl.col("flags") & int(FactFlags.METADATA)) != 0) &
+             ((pl.col("flags") & int(FactFlags.DELETED)) == 0)
+         ).height
+
+     def stats(self) -> dict:
+         """Return statistics about the fact store."""
+         active = self.count_active()
+         metadata = self.count_metadata()
+
+         return {
+             "total_facts": len(self),
+             "active_facts": active,
+             "deleted_facts": len(self) - active,
+             "metadata_facts": metadata,
+             "base_facts": active - metadata,
+             "next_txn": self._next_txn,
+             "unique_predicates": self._df.select("p").n_unique(),
+             "unique_subjects": self._df.select("s").n_unique(),
+         }
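Finally, a persistence round-trip sketch continuing the examples above; it passes freshly constructed (empty) dictionaries to load() purely to exercise the facts round-trip, whereas real use would supply the persisted TermDict and QtDict:

from pathlib import Path
from tempfile import mkdtemp

out = Path(mkdtemp())
store.save(out)                              # writes facts.parquet and facts_meta.txt
restored = FactStore.load(out, TermDict(), QtDict())
assert len(restored) == len(store)
assert restored.stats()["next_txn"] == store.stats()["next_txn"]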