rdf-starbase 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rdf_starbase/__init__.py +57 -0
- rdf_starbase/ai_grounding.py +728 -0
- rdf_starbase/compat/__init__.py +26 -0
- rdf_starbase/compat/rdflib.py +1104 -0
- rdf_starbase/formats/__init__.py +29 -0
- rdf_starbase/formats/jsonld.py +488 -0
- rdf_starbase/formats/ntriples.py +419 -0
- rdf_starbase/formats/rdfxml.py +434 -0
- rdf_starbase/formats/turtle.py +882 -0
- rdf_starbase/models.py +92 -0
- rdf_starbase/registry.py +540 -0
- rdf_starbase/repositories.py +407 -0
- rdf_starbase/repository_api.py +739 -0
- rdf_starbase/sparql/__init__.py +35 -0
- rdf_starbase/sparql/ast.py +910 -0
- rdf_starbase/sparql/executor.py +1925 -0
- rdf_starbase/sparql/parser.py +1716 -0
- rdf_starbase/storage/__init__.py +44 -0
- rdf_starbase/storage/executor.py +1914 -0
- rdf_starbase/storage/facts.py +850 -0
- rdf_starbase/storage/lsm.py +531 -0
- rdf_starbase/storage/persistence.py +338 -0
- rdf_starbase/storage/quoted_triples.py +292 -0
- rdf_starbase/storage/reasoner.py +1035 -0
- rdf_starbase/storage/terms.py +628 -0
- rdf_starbase/store.py +1049 -0
- rdf_starbase/store_legacy.py +748 -0
- rdf_starbase/web.py +568 -0
- rdf_starbase-0.1.0.dist-info/METADATA +706 -0
- rdf_starbase-0.1.0.dist-info/RECORD +31 -0
- rdf_starbase-0.1.0.dist-info/WHEEL +4 -0
rdf_starbase/storage/facts.py
@@ -0,0 +1,850 @@
"""
Integer-Based Facts Store.

Implements the `facts` table with dictionary-encoded integer columns.
No string terms inside facts - everything is ID-based for maximum performance.

Key design decisions (from storage-spec.md):
- All columns are integer IDs (g, s, p, o are TermIds)
- RDF★ metadata triples stored by setting s/o to QtId
- Batch-first ingestion with monotonic txn IDs
- Flags bitset for asserted/inferred/deleted states
- Predicate-partitioned storage layout
"""

from dataclasses import dataclass, field
from datetime import datetime, timezone
from enum import IntFlag
from typing import Optional, List, Tuple, Any
from pathlib import Path
from uuid import UUID, uuid4
import struct

import polars as pl

from rdf_starbase.storage.terms import (
    TermId,
    TermKind,
    TermDict,
    Term,
    get_term_kind,
    is_quoted_triple,
)
from rdf_starbase.storage.quoted_triples import QtDict, QtId


# =============================================================================
# Fact Flags
# =============================================================================

class FactFlags(IntFlag):
    """
    Bitset flags for fact state.

    Stored in the `flags` column (u16).
    """
    NONE = 0
    ASSERTED = 1 << 0   # Explicitly asserted
    INFERRED = 1 << 1   # Derived by inference
    DELETED = 1 << 2    # Tombstone (soft delete)
    METADATA = 1 << 3   # This is a metadata triple (s or o is QtId)

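# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch only, not part of the packaged module.
# FactFlags values behave as plain ints, so they combine with `|` and are
# tested with `&` - the same bit tests the Polars filters below perform on
# the u16 `flags` column.
# ---------------------------------------------------------------------------
def _example_flag_usage() -> None:
    combined = FactFlags.ASSERTED | FactFlags.METADATA
    assert (combined & FactFlags.DELETED) == 0   # not tombstoned
    assert (combined & FactFlags.METADATA) != 0  # carries RDF★ metadata
    assert int(combined) == 0b1001               # bits 0 and 3 set
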
# =============================================================================
# Default Graph
# =============================================================================

# The default graph is represented as TermId 0
DEFAULT_GRAPH_ID: TermId = 0


# =============================================================================
# Fact Store
# =============================================================================

class FactStore:
    """
    Integer-based facts store.

    Stores quads as (g, s, p, o) where all components are TermIds.
    Supports RDF★ by allowing QtIds as subjects or objects.

    Schema matches storage-spec.md §3.4:
    - g: u64 (GraphId, DEFAULT_GRAPH_ID for default)
    - s: u64 (TermId, may be QtId)
    - p: u64 (TermId)
    - o: u64 (TermId, may be QtId)
    - flags: u16 (FactFlags bitset)
    - txn: u64 (transaction/commit ID)
    - t_added: u64 (timestamp, microseconds since epoch)

    Also stores provenance columns for backward compatibility:
    - source: u64 (TermId for source IRI/literal)
    - confidence: f64
    - process: u64 (TermId for process IRI)
    """

    def __init__(self, term_dict: TermDict, qt_dict: QtDict):
        """
        Initialize the fact store.

        Args:
            term_dict: The TermDict for term interning
            qt_dict: The QtDict for quoted triple interning
        """
        self._term_dict = term_dict
        self._qt_dict = qt_dict

        # Transaction counter
        self._next_txn: int = 0

        # Facts DataFrame with integer columns
        self._df = self._create_empty_dataframe()

        # Pre-intern the default graph marker
        self._default_graph_id = DEFAULT_GRAPH_ID

    def _create_empty_dataframe(self) -> pl.DataFrame:
        """Create an empty facts DataFrame with the correct schema."""
        return pl.DataFrame({
            "g": pl.Series([], dtype=pl.UInt64),
            "s": pl.Series([], dtype=pl.UInt64),
            "p": pl.Series([], dtype=pl.UInt64),
            "o": pl.Series([], dtype=pl.UInt64),
            "flags": pl.Series([], dtype=pl.UInt16),
            "txn": pl.Series([], dtype=pl.UInt64),
            "t_added": pl.Series([], dtype=pl.UInt64),
            # Provenance columns (for backward compatibility)
            "source": pl.Series([], dtype=pl.UInt64),
            "confidence": pl.Series([], dtype=pl.Float64),
            "process": pl.Series([], dtype=pl.UInt64),
        })

    def _allocate_txn(self) -> int:
        """Allocate the next transaction ID."""
        txn = self._next_txn
        self._next_txn += 1
        return txn

    def add_fact(
        self,
        s: TermId,
        p: TermId,
        o: TermId,
        g: TermId = DEFAULT_GRAPH_ID,
        flags: FactFlags = FactFlags.ASSERTED,
        source: Optional[TermId] = None,
        confidence: float = 1.0,
        process: Optional[TermId] = None,
        t_added: Optional[int] = None,
    ) -> int:
        """
        Add a single fact to the store.

        Args:
            s: Subject TermId (may be QtId for metadata triples)
            p: Predicate TermId
            o: Object TermId (may be QtId for metadata triples)
            g: Graph TermId (DEFAULT_GRAPH_ID for default graph)
            flags: Fact flags (ASSERTED by default)
            source: Source TermId for provenance
            confidence: Confidence score (0.0 to 1.0)
            process: Process TermId for provenance
            t_added: Timestamp in microseconds since epoch (default: now)

        Returns:
            Transaction ID
        """
        txn = self._allocate_txn()
        if t_added is None:
            t_added = int(datetime.now(timezone.utc).timestamp() * 1_000_000)

        # Auto-detect metadata triples
        if is_quoted_triple(s) or is_quoted_triple(o):
            flags = flags | FactFlags.METADATA

        new_row = pl.DataFrame({
            "g": [g],
            "s": [s],
            "p": [p],
            "o": [o],
            "flags": [int(flags)],
            "txn": [txn],
            "t_added": [t_added],
            "source": [source if source is not None else 0],
            "confidence": [confidence],
            "process": [process if process is not None else 0],
        }).cast({
            "g": pl.UInt64,
            "s": pl.UInt64,
            "p": pl.UInt64,
            "o": pl.UInt64,
            "flags": pl.UInt16,
            "txn": pl.UInt64,
            "t_added": pl.UInt64,
            "source": pl.UInt64,
            "confidence": pl.Float64,
            "process": pl.UInt64,
        })

        self._df = pl.concat([self._df, new_row], how="vertical")
        return txn

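    # -----------------------------------------------------------------------
    # Editor's note: illustrative usage sketch only, not part of the packaged
    # module. It assumes TermDict() and QtDict() take no constructor
    # arguments, and uses literal ints where real TermIds would come from
    # interning terms in the TermDict.
    #
    #     store = FactStore(TermDict(), QtDict())
    #     txn = store.add_fact(s=1, p=2, o=3, confidence=0.9)
    #     assert store.count_active() == 1
    # -----------------------------------------------------------------------
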
    def add_facts_batch(
        self,
        facts: List[Tuple[TermId, TermId, TermId, TermId]],  # (g, s, p, o)
        flags: FactFlags = FactFlags.ASSERTED,
        source: Optional[TermId] = None,
        confidence: float = 1.0,
        process: Optional[TermId] = None,
    ) -> int:
        """
        Add a batch of facts with shared provenance.

        This is the recommended ingestion path for performance.

        Args:
            facts: List of (g, s, p, o) tuples
            flags: Shared flags for all facts
            source: Shared source TermId
            confidence: Shared confidence score
            process: Shared process TermId

        Returns:
            Transaction ID for the batch
        """
        if not facts:
            return self._allocate_txn()

        txn = self._allocate_txn()
        t_added = int(datetime.now(timezone.utc).timestamp() * 1_000_000)

        rows = []
        for g, s, p, o in facts:
            fact_flags = flags
            if is_quoted_triple(s) or is_quoted_triple(o):
                fact_flags = fact_flags | FactFlags.METADATA

            rows.append({
                "g": g,
                "s": s,
                "p": p,
                "o": o,
                "flags": int(fact_flags),
                "txn": txn,
                "t_added": t_added,
                "source": source if source is not None else 0,
                "confidence": confidence,
                "process": process if process is not None else 0,
            })

        new_df = pl.DataFrame(rows).cast({
            "g": pl.UInt64,
            "s": pl.UInt64,
            "p": pl.UInt64,
            "o": pl.UInt64,
            "flags": pl.UInt16,
            "txn": pl.UInt64,
            "t_added": pl.UInt64,
            "source": pl.UInt64,
            "confidence": pl.Float64,
            "process": pl.UInt64,
        })

        self._df = pl.concat([self._df, new_df], how="vertical")
        return txn

    def add_facts_columnar(
        self,
        g_col: List[TermId],
        s_col: List[TermId],
        p_col: List[TermId],
        o_col: List[TermId],
        flags: FactFlags = FactFlags.ASSERTED,
        source: Optional[TermId] = None,
        confidence: float = 1.0,
        process: Optional[TermId] = None,
    ) -> int:
        """
        Add facts from pre-built column lists (TRUE vectorized path).

        This is the FASTEST ingestion method. Build your column data
        separately, then pass it here for a single DataFrame creation.

        Args:
            g_col: List of graph TermIds
            s_col: List of subject TermIds
            p_col: List of predicate TermIds
            o_col: List of object TermIds
            flags: Shared flags for all facts
            source: Shared source TermId
            confidence: Shared confidence score
            process: Shared process TermId

        Returns:
            Transaction ID
        """
        n = len(s_col)
        if n == 0:
            return self._allocate_txn()

        txn = self._allocate_txn()
        t_added = int(datetime.now(timezone.utc).timestamp() * 1_000_000)

        # Build DataFrame directly from columns (no Python loop)
        new_df = pl.DataFrame({
            "g": pl.Series(g_col, dtype=pl.UInt64),
            "s": pl.Series(s_col, dtype=pl.UInt64),
            "p": pl.Series(p_col, dtype=pl.UInt64),
            "o": pl.Series(o_col, dtype=pl.UInt64),
            "flags": pl.Series([int(flags)] * n, dtype=pl.UInt16),
            "txn": pl.Series([txn] * n, dtype=pl.UInt64),
            "t_added": pl.Series([t_added] * n, dtype=pl.UInt64),
            "source": pl.Series([source if source else 0] * n, dtype=pl.UInt64),
            "confidence": pl.Series([confidence] * n, dtype=pl.Float64),
            "process": pl.Series([process if process else 0] * n, dtype=pl.UInt64),
        })

        self._df = pl.concat([self._df, new_df], how="vertical")
        return txn

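    # -----------------------------------------------------------------------
    # Editor's note: illustrative usage sketch only, not part of the packaged
    # module. It assumes TermDict() and QtDict() take no constructor
    # arguments; the literal ints stand in for interned TermIds.
    #
    #     store = FactStore(TermDict(), QtDict())
    #     txn = store.add_facts_columnar(
    #         g_col=[DEFAULT_GRAPH_ID, DEFAULT_GRAPH_ID],
    #         s_col=[10, 11],
    #         p_col=[20, 20],
    #         o_col=[30, 31],
    #         confidence=0.8,
    #     )
    #     assert store.count_active() == 2
    # -----------------------------------------------------------------------
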
    def add_facts_with_provenance(
        self,
        facts: List[Tuple[TermId, TermId, TermId, TermId, Optional[TermId], float, Optional[TermId]]],
        flags: FactFlags = FactFlags.ASSERTED,
    ) -> int:
        """
        Add facts with per-fact provenance (confidence, source, process).

        This is the recommended path for ingesting data with provenance metadata
        stored in native columns rather than as separate RDF triples.

        Args:
            facts: List of (g, s, p, o, source, confidence, process) tuples
                - source: TermId for data source (or None)
                - confidence: Float confidence score (0.0 to 1.0)
                - process: TermId for generating process (or None)
            flags: Base flags for all facts

        Returns:
            Transaction ID for the batch
        """
        if not facts:
            return self._allocate_txn()

        txn = self._allocate_txn()
        t_added = int(datetime.now(timezone.utc).timestamp() * 1_000_000)

        rows = []
        for g, s, p, o, source, confidence, process in facts:
            fact_flags = flags
            if is_quoted_triple(s) or is_quoted_triple(o):
                fact_flags = fact_flags | FactFlags.METADATA

            rows.append({
                "g": g,
                "s": s,
                "p": p,
                "o": o,
                "flags": int(fact_flags),
                "txn": txn,
                "t_added": t_added,
                "source": source if source is not None else 0,
                "confidence": confidence,
                "process": process if process is not None else 0,
            })

        new_df = pl.DataFrame(rows).cast({
            "g": pl.UInt64,
            "s": pl.UInt64,
            "p": pl.UInt64,
            "o": pl.UInt64,
            "flags": pl.UInt16,
            "txn": pl.UInt64,
            "t_added": pl.UInt64,
            "source": pl.UInt64,
            "confidence": pl.Float64,
            "process": pl.UInt64,
        })

        self._df = pl.concat([self._df, new_df], how="vertical")
        return txn

    def scan_by_confidence(
        self,
        min_confidence: float,
        max_confidence: Optional[float] = None,
        include_metadata: bool = True,
    ) -> pl.DataFrame:
        """
        Scan facts by confidence threshold using the native column.

        This is an O(n) scan but uses vectorized Polars filtering -
        no string parsing or joins required.

        Args:
            min_confidence: Minimum confidence (exclusive)
            max_confidence: Maximum confidence (inclusive, optional)
            include_metadata: Whether to include metadata facts

        Returns:
            DataFrame with all columns for matching facts
        """
        df = self._df.lazy()

        # Filter by confidence
        df = df.filter(pl.col("confidence") > min_confidence)
        if max_confidence is not None:
            df = df.filter(pl.col("confidence") <= max_confidence)

        # Exclude deleted
        df = df.filter((pl.col("flags") & int(FactFlags.DELETED)) == 0)

        # Optionally filter out metadata facts
        if not include_metadata:
            df = df.filter((pl.col("flags") & int(FactFlags.METADATA)) == 0)

        return df.collect()

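    # -----------------------------------------------------------------------
    # Editor's note: illustrative sketch only, not part of the packaged
    # module. Because confidence is a native Float64 column, thresholding is
    # a single vectorized filter rather than a join over metadata triples:
    #
    #     high_conf = store.scan_by_confidence(0.8, include_metadata=False)
    #     assert (high_conf["confidence"] > 0.8).all()
    # -----------------------------------------------------------------------
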
    def scan_by_source(
        self,
        source: TermId,
        include_metadata: bool = True,
    ) -> pl.DataFrame:
        """
        Scan facts by source using the native column.

        Args:
            source: Source TermId to filter by
            include_metadata: Whether to include metadata facts

        Returns:
            DataFrame with all columns for matching facts
        """
        df = self._df.lazy()
        df = df.filter(pl.col("source") == source)
        df = df.filter((pl.col("flags") & int(FactFlags.DELETED)) == 0)

        if not include_metadata:
            df = df.filter((pl.col("flags") & int(FactFlags.METADATA)) == 0)

        return df.collect()

    # =========================================================================
    # Query Primitives (storage-spec.md §8)
    # =========================================================================

    def scan_facts(
        self,
        p: Optional[TermId] = None,
        g: Optional[TermId] = None,
        include_deleted: bool = False,
    ) -> pl.DataFrame:
        """
        Scan facts with optional predicate and graph filters.

        This is the primary scan primitive for query execution.
        When a predicate is specified, this enables partition pruning.

        Args:
            p: Optional predicate filter
            g: Optional graph filter
            include_deleted: Whether to include deleted facts

        Returns:
            DataFrame with columns: g, s, p, o, flags, txn, t_added, source, confidence, process
        """
        df = self._df.lazy()

        if p is not None:
            df = df.filter(pl.col("p") == p)

        if g is not None:
            df = df.filter(pl.col("g") == g)

        if not include_deleted:
            df = df.filter((pl.col("flags") & int(FactFlags.DELETED)) == 0)

        return df.collect()

    def scan_facts_by_s(
        self,
        s: TermId,
        p: Optional[TermId] = None,
        g: Optional[TermId] = None,
    ) -> pl.DataFrame:
        """
        Scan facts by subject with optional predicate and graph filters.

        Useful for "show me all facts about entity X" queries.
        """
        df = self._df.lazy().filter(pl.col("s") == s)

        if p is not None:
            df = df.filter(pl.col("p") == p)

        if g is not None:
            df = df.filter(pl.col("g") == g)

        df = df.filter((pl.col("flags") & int(FactFlags.DELETED)) == 0)

        return df.collect()

    def scan_facts_by_o(
        self,
        o: TermId,
        p: Optional[TermId] = None,
        g: Optional[TermId] = None,
    ) -> pl.DataFrame:
        """
        Scan facts by object with optional predicate and graph filters.

        Useful for reverse lookups (inbound edges).
        """
        df = self._df.lazy().filter(pl.col("o") == o)

        if p is not None:
            df = df.filter(pl.col("p") == p)

        if g is not None:
            df = df.filter(pl.col("g") == g)

        df = df.filter((pl.col("flags") & int(FactFlags.DELETED)) == 0)

        return df.collect()

    def scan_metadata_facts(
        self,
        qt_id: Optional[QtId] = None,
        p: Optional[TermId] = None,
    ) -> pl.DataFrame:
        """
        Scan facts where the subject is a quoted triple (metadata facts).

        This is the key primitive for RDF★ metadata queries.

        Args:
            qt_id: Optional specific quoted triple to filter by
            p: Optional predicate filter (e.g., prov:wasDerivedFrom)
        """
        df = self._df.lazy().filter(
            (pl.col("flags") & int(FactFlags.METADATA)) != 0
        )

        if qt_id is not None:
            df = df.filter(pl.col("s") == qt_id)

        if p is not None:
            df = df.filter(pl.col("p") == p)

        df = df.filter((pl.col("flags") & int(FactFlags.DELETED)) == 0)

        return df.collect()

    def scan_facts_at_time(
        self,
        as_of_time: datetime,
        p: Optional[TermId] = None,
        g: Optional[TermId] = None,
        s: Optional[TermId] = None,
    ) -> pl.DataFrame:
        """
        Time-travel query: return facts as they existed at a specific point in time.

        This is a key capability for compliance and auditing:
        - "What did we believe to be true on 2024-01-15?"
        - "What assertions existed before the data refresh?"

        Args:
            as_of_time: The point in time to query
            p: Optional predicate filter
            g: Optional graph filter
            s: Optional subject filter

        Returns:
            DataFrame with facts that existed at the specified time
        """
        # Convert datetime to microseconds timestamp
        as_of_ts = int(as_of_time.timestamp() * 1_000_000)

        df = self._df.lazy()

        # Only include facts added before the specified time
        df = df.filter(pl.col("t_added") <= as_of_ts)

        # Apply optional filters
        if p is not None:
            df = df.filter(pl.col("p") == p)
        if g is not None:
            df = df.filter(pl.col("g") == g)
        if s is not None:
            df = df.filter(pl.col("s") == s)

        # For time-travel, we need to show the state at that time
        # If a fact was deleted after as_of_time, it should still show
        # This implementation shows all facts added by that time
        # (For full versioning, we'd need to track delete timestamps too)
        df = df.filter((pl.col("flags") & int(FactFlags.DELETED)) == 0)

        return df.collect()

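    # -----------------------------------------------------------------------
    # Editor's note: illustrative sketch only, not part of the packaged
    # module. A time-travel query is just a filter on the t_added column:
    #
    #     as_of = datetime(2024, 1, 15, tzinfo=timezone.utc)
    #     snapshot = store.scan_facts_at_time(as_of)
    # -----------------------------------------------------------------------
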
    def scan_facts_by_txn_range(
        self,
        start_txn: int,
        end_txn: Optional[int] = None,
        include_deleted: bool = False,
    ) -> pl.DataFrame:
        """
        Scan facts by transaction ID range.

        Useful for:
        - Incremental sync: "give me all changes since txn 1000"
        - Change data capture
        - Debugging specific ingestion batches

        Args:
            start_txn: Start transaction ID (inclusive)
            end_txn: End transaction ID (inclusive, optional)
            include_deleted: Whether to include deleted facts

        Returns:
            DataFrame with facts in the specified transaction range
        """
        df = self._df.lazy().filter(pl.col("txn") >= start_txn)

        if end_txn is not None:
            df = df.filter(pl.col("txn") <= end_txn)

        if not include_deleted:
            df = df.filter((pl.col("flags") & int(FactFlags.DELETED)) == 0)

        return df.collect()

    # =========================================================================
    # RDF★ Expansion Joins
    # =========================================================================

    def expand_qt_metadata(
        self,
        metadata_predicate: TermId,
    ) -> pl.DataFrame:
        """
        Critical RDF★ expansion join.

        Given a metadata predicate (e.g., prov:wasDerivedFrom), finds all
        quoted triples with that metadata and expands them to (s, p, o).

        Returns DataFrame with: qt_id, base_s, base_p, base_o, metadata_o

        This implements the join pattern from storage-spec.md §8:
        1. scan_facts(p=metadata_predicate) → qt_id, metadata_o
        2. lookup_qt(qt_ids) → qt_id, s, p, o
        3. join → base triple + metadata value
        """
        # Step 1: Get all facts with the metadata predicate where subject is a qt
        df1 = self._df.lazy().filter(
            (pl.col("p") == metadata_predicate) &
            ((pl.col("flags") & int(FactFlags.METADATA)) != 0) &
            ((pl.col("flags") & int(FactFlags.DELETED)) == 0)
        ).select([
            pl.col("s").alias("qt_id"),
            pl.col("o").alias("metadata_o"),
            pl.col("confidence"),
            pl.col("source"),
        ]).collect()

        if df1.is_empty():
            return pl.DataFrame({
                "qt_id": pl.Series([], dtype=pl.UInt64),
                "base_s": pl.Series([], dtype=pl.UInt64),
                "base_p": pl.Series([], dtype=pl.UInt64),
                "base_o": pl.Series([], dtype=pl.UInt64),
                "metadata_o": pl.Series([], dtype=pl.UInt64),
                "confidence": pl.Series([], dtype=pl.Float64),
                "source": pl.Series([], dtype=pl.UInt64),
            })

        # Step 2: Expand qt_ids to base triples
        qt_ids = df1["qt_id"].to_list()
        df2 = self._qt_dict.expand_to_dataframe(qt_ids)

        if df2.is_empty():
            return pl.DataFrame({
                "qt_id": pl.Series([], dtype=pl.UInt64),
                "base_s": pl.Series([], dtype=pl.UInt64),
                "base_p": pl.Series([], dtype=pl.UInt64),
                "base_o": pl.Series([], dtype=pl.UInt64),
                "metadata_o": pl.Series([], dtype=pl.UInt64),
                "confidence": pl.Series([], dtype=pl.Float64),
                "source": pl.Series([], dtype=pl.UInt64),
            })

        # Rename columns for join
        df2 = df2.rename({
            "s": "base_s",
            "p": "base_p",
            "o": "base_o",
        })

        # Step 3: Join
        return df1.join(df2, on="qt_id", how="inner")

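    # -----------------------------------------------------------------------
    # Editor's note: illustrative sketch only, not part of the packaged
    # module. Given the interned TermId of a metadata predicate such as
    # prov:wasDerivedFrom, the expansion join yields one row per annotated
    # triple, carrying both the base triple ids and the metadata object id:
    #
    #     expanded = store.expand_qt_metadata(was_derived_from_id)
    #     # columns: qt_id, base_s, base_p, base_o, metadata_o, confidence, source
    # -----------------------------------------------------------------------
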
    def expand_metadata_df(self, metadata_df: pl.DataFrame) -> pl.DataFrame:
        """
        Expand a DataFrame of metadata facts.

        Takes a DataFrame that has at minimum an 's' column containing qt_ids,
        and expands each qt_id to its (base_s, base_p, base_o) components.

        This is useful when you've already filtered metadata facts and
        want to expand them.

        Args:
            metadata_df: DataFrame with 's' column containing qt_ids

        Returns:
            DataFrame with original columns plus base_s, base_p, base_o
        """
        if metadata_df.is_empty():
            return metadata_df.with_columns([
                pl.lit(0).cast(pl.UInt64).alias("base_s"),
                pl.lit(0).cast(pl.UInt64).alias("base_p"),
                pl.lit(0).cast(pl.UInt64).alias("base_o"),
            ]).filter(pl.lit(False))  # Empty with correct schema

        # Get qt_ids from subject column
        qt_ids = metadata_df["s"].to_list()

        # Expand using qt_dict
        qt_df = self._qt_dict.expand_to_dataframe(qt_ids)

        if qt_df.is_empty():
            return metadata_df.with_columns([
                pl.lit(0).cast(pl.UInt64).alias("base_s"),
                pl.lit(0).cast(pl.UInt64).alias("base_p"),
                pl.lit(0).cast(pl.UInt64).alias("base_o"),
            ]).filter(pl.lit(False))

        # Rename for clarity
        qt_df = qt_df.rename({
            "qt_id": "s",  # Match the join key
            "s": "base_s",
            "p": "base_p",
            "o": "base_o",
        })

        # Join on s (the qt_id)
        return metadata_df.join(qt_df, on="s", how="inner")

    # =========================================================================
    # Soft Delete and Deprecation
    # =========================================================================

    def mark_deleted(
        self,
        s: Optional[TermId] = None,
        p: Optional[TermId] = None,
        o: Optional[TermId] = None,
        g: Optional[TermId] = None,
    ) -> int:
        """
        Soft-delete facts matching the given pattern.

        Returns the number of facts marked as deleted.
        """
        mask = pl.lit(True)

        if s is not None:
            mask = mask & (pl.col("s") == s)
        if p is not None:
            mask = mask & (pl.col("p") == p)
        if o is not None:
            mask = mask & (pl.col("o") == o)
        if g is not None:
            mask = mask & (pl.col("g") == g)

        before_count = self._df.filter(
            mask & ((pl.col("flags") & int(FactFlags.DELETED)) == 0)
        ).height

        self._df = self._df.with_columns([
            pl.when(mask)
            .then(pl.col("flags") | int(FactFlags.DELETED))
            .otherwise(pl.col("flags"))
            .alias("flags")
        ])

        return before_count

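    # -----------------------------------------------------------------------
    # Editor's note: illustrative sketch only, not part of the packaged
    # module. Soft deletion flips the DELETED bit in place; rows stay in the
    # frame, so the active count drops by exactly the number marked:
    #
    #     before = store.count_active()
    #     removed = store.mark_deleted(s=subject_id)
    #     assert store.count_active() == before - removed
    # -----------------------------------------------------------------------
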
    # =========================================================================
    # Persistence
    # =========================================================================

    def to_dataframe(self) -> pl.DataFrame:
        """Return the facts as a DataFrame."""
        return self._df

    def save(self, path: Path):
        """Save facts to a Parquet file."""
        path = Path(path)
        path.mkdir(parents=True, exist_ok=True)
        self._df.write_parquet(path / "facts.parquet")

        # Save transaction counter
        with open(path / "facts_meta.txt", "w") as f:
            f.write(f"next_txn={self._next_txn}\n")

    @classmethod
    def load(cls, path: Path, term_dict: TermDict, qt_dict: QtDict) -> "FactStore":
        """Load facts from a Parquet file."""
        path = Path(path)

        instance = cls(term_dict, qt_dict)
        instance._df = pl.read_parquet(path / "facts.parquet")

        # Load transaction counter
        meta_file = path / "facts_meta.txt"
        if meta_file.exists():
            with open(meta_file) as f:
                for line in f:
                    if line.startswith("next_txn="):
                        instance._next_txn = int(line.split("=")[1].strip())

        return instance

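    # -----------------------------------------------------------------------
    # Editor's note: illustrative sketch only, not part of the packaged
    # module. save()/load() round-trip the facts through facts.parquet plus a
    # small sidecar holding the transaction counter; the term and quoted-
    # triple dictionaries are not written here and must be supplied on load:
    #
    #     store.save(Path("./data/facts"))
    #     restored = FactStore.load(Path("./data/facts"), term_dict, qt_dict)
    # -----------------------------------------------------------------------
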
    # =========================================================================
    # Statistics
    # =========================================================================

    def __len__(self) -> int:
        """Return the total number of facts (including deleted)."""
        return len(self._df)

    def count_active(self) -> int:
        """Return the number of non-deleted facts."""
        return self._df.filter(
            (pl.col("flags") & int(FactFlags.DELETED)) == 0
        ).height

    def count_metadata(self) -> int:
        """Return the number of metadata facts."""
        return self._df.filter(
            ((pl.col("flags") & int(FactFlags.METADATA)) != 0) &
            ((pl.col("flags") & int(FactFlags.DELETED)) == 0)
        ).height

    def stats(self) -> dict:
        """Return statistics about the fact store."""
        active = self.count_active()
        metadata = self.count_metadata()

        return {
            "total_facts": len(self),
            "active_facts": active,
            "deleted_facts": len(self) - active,
            "metadata_facts": metadata,
            "base_facts": active - metadata,
            "next_txn": self._next_txn,
            "unique_predicates": self._df.select("p").n_unique(),
            "unique_subjects": self._df.select("s").n_unique(),
        }