rdf-starbase 0.1.0__py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their public registries.
@@ -0,0 +1,338 @@
+ """
+ Persistence layer for RDF-StarBase storage.
+
+ Provides save/load functionality for the dictionary-encoded storage layer:
+ - TermDict: Term catalog (term_id, kind, lex)
+ - FactStore: Facts table (g, s, p, o, provenance)
+ - QtDict: Quoted triples table (qt_id, s_id, p_id, o_id)
+
+ Uses the Parquet format for efficient columnar storage with good compression.
+ """
+
+ from __future__ import annotations
+
+ from pathlib import Path
+
+ import polars as pl
+
+ from rdf_starbase.storage.terms import TermDict, Term, TermKind
+ from rdf_starbase.storage.quoted_triples import QtDict, QuotedTriple
+ from rdf_starbase.storage.facts import FactStore
+
+
+ class StoragePersistence:
+     """
+     Handles save/load operations for the storage layer.
+
+     File layout:
+         base_path/
+             terms.parquet    - TermDict catalog
+             facts.parquet    - FactStore facts
+             quoted.parquet   - QtDict quoted triples
+             metadata.parquet - Counters and metadata
+     """
+
+     TERMS_FILE = "terms.parquet"
+     FACTS_FILE = "facts.parquet"
+     QUOTED_FILE = "quoted.parquet"
+     METADATA_FILE = "metadata.parquet"
+
+     def __init__(self, base_path: str | Path):
+         """
+         Initialize persistence with a base directory path.
+
+         Args:
+             base_path: Directory where storage files will be saved/loaded
+         """
+         self.base_path = Path(base_path)
+
+     def save(
+         self,
+         term_dict: TermDict,
+         fact_store: FactStore,
+         qt_dict: QtDict,
+     ) -> None:
+         """
+         Save all storage components to disk.
+
+         Args:
+             term_dict: The term dictionary to save
+             fact_store: The fact store to save
+             qt_dict: The quoted triple dictionary to save
+         """
+         # Ensure directory exists
+         self.base_path.mkdir(parents=True, exist_ok=True)
+
+         # Save term dictionary
+         self._save_terms(term_dict)
+
+         # Save facts
+         self._save_facts(fact_store)
+
+         # Save quoted triples
+         self._save_quoted(qt_dict)
+
+         # Save metadata (counters, etc.)
+         self._save_metadata(term_dict, fact_store, qt_dict)
+
+     def load(self) -> tuple[TermDict, FactStore, QtDict]:
+         """
+         Load all storage components from disk.
+
+         Returns:
+             Tuple of (TermDict, FactStore, QtDict)
+
+         Raises:
+             FileNotFoundError: If the storage directory doesn't exist
+         """
+         if not self.base_path.exists():
+             raise FileNotFoundError(f"Storage directory not found: {self.base_path}")
+
+         # Load term dictionary first (needed by the others)
+         term_dict = self._load_terms()
+
+         # Load quoted triples (needed by the fact store)
+         qt_dict = self._load_quoted(term_dict)
+
+         # Load facts
+         fact_store = self._load_facts(term_dict, qt_dict)
+
+         # Restore metadata
+         self._load_metadata(term_dict, fact_store, qt_dict)
+
+         return term_dict, fact_store, qt_dict
+
+     def exists(self) -> bool:
+         """Check whether a saved storage exists at the base path."""
+         return (
+             self.base_path.exists()
+             and (self.base_path / self.TERMS_FILE).exists()
+         )
+
+     def _save_terms(self, term_dict: TermDict) -> None:
+         """Save the term dictionary to Parquet."""
+         # Build a DataFrame from the TermDict's internal state
+         term_ids = []
+         kinds = []
+         lexes = []
+
+         for term_id, term in term_dict._id_to_term.items():
+             term_ids.append(term_id)
+             kinds.append(term.kind.value)
+             lexes.append(term.lex)
+
+         df = pl.DataFrame({
+             "term_id": pl.Series(term_ids, dtype=pl.UInt64),
+             "kind": pl.Series(kinds, dtype=pl.UInt8),
+             "lex": pl.Series(lexes, dtype=pl.Utf8),
+         })
+
+         df.write_parquet(self.base_path / self.TERMS_FILE)
+
+     def _load_terms(self) -> TermDict:
+         """Load the term dictionary from Parquet."""
+         df = pl.read_parquet(self.base_path / self.TERMS_FILE)
+
+         term_dict = TermDict.__new__(TermDict)
+         term_dict._next_payload = {
+             TermKind.IRI: 0,
+             TermKind.LITERAL: 0,
+             TermKind.BNODE: 0,
+             TermKind.QUOTED_TRIPLE: 0,
+         }
+         term_dict._hash_to_id = {}
+         term_dict._id_to_term = {}
+         term_dict._collision_count = 0
+
+         # Initialize fast-path caches (added for performance)
+         term_dict._iri_cache = {}
+         term_dict._plain_literal_cache = {}
+         term_dict._bnode_cache = {}
+
+         # Restore terms
+         for row in df.iter_rows(named=True):
+             term_id = row["term_id"]
+             kind = TermKind(row["kind"])
+             lex = row["lex"]
+
+             term = Term(kind=kind, lex=lex)
+             term_dict._id_to_term[term_id] = term
+             term_dict._hash_to_id[term.compute_hash()] = term_id
+
+             # Populate fast-path caches
+             if kind == TermKind.IRI:
+                 term_dict._iri_cache[lex] = term_id
+             elif kind == TermKind.BNODE:
+                 term_dict._bnode_cache[lex] = term_id
+             elif kind == TermKind.LITERAL:
+                 term_dict._plain_literal_cache[lex] = term_id
+
+         return term_dict
+
+     def _save_facts(self, fact_store: FactStore) -> None:
+         """Save the fact store to Parquet."""
+         fact_store._df.write_parquet(self.base_path / self.FACTS_FILE)
+
+     def _load_facts(self, term_dict: TermDict, qt_dict: QtDict) -> FactStore:
+         """Load the fact store from Parquet."""
+         fact_store = FactStore.__new__(FactStore)
+         fact_store._term_dict = term_dict
+         fact_store._qt_dict = qt_dict
+         fact_store._next_txn = 0
+         fact_store._default_graph_id = 0
+
+         facts_path = self.base_path / self.FACTS_FILE
+         if facts_path.exists():
+             fact_store._df = pl.read_parquet(facts_path)
+         else:
+             fact_store._df = fact_store._create_empty_dataframe()
+
+         return fact_store
+
+     def _save_quoted(self, qt_dict: QtDict) -> None:
+         """Save the quoted triple dictionary to Parquet."""
+         qt_ids = []
+         s_ids = []
+         p_ids = []
+         o_ids = []
+
+         for qt_id, qt in qt_dict._id_to_qt.items():
+             qt_ids.append(qt_id)
+             s_ids.append(qt.s)
+             p_ids.append(qt.p)
+             o_ids.append(qt.o)
+
+         df = pl.DataFrame({
+             "qt_id": pl.Series(qt_ids, dtype=pl.UInt64),
+             "s": pl.Series(s_ids, dtype=pl.UInt64),
+             "p": pl.Series(p_ids, dtype=pl.UInt64),
+             "o": pl.Series(o_ids, dtype=pl.UInt64),
+         })
+
+         df.write_parquet(self.base_path / self.QUOTED_FILE)
+
+     def _load_quoted(self, term_dict: TermDict) -> QtDict:
+         """Load the quoted triple dictionary from Parquet."""
+         qt_dict = QtDict.__new__(QtDict)
+         qt_dict._term_dict = term_dict
+         qt_dict._hash_to_id = {}
+         qt_dict._id_to_qt = {}
+         qt_dict._collision_count = 0
+
+         quoted_path = self.base_path / self.QUOTED_FILE
+         if quoted_path.exists():
+             df = pl.read_parquet(quoted_path)
+
+             for row in df.iter_rows(named=True):
+                 qt_id = row["qt_id"]
+                 qt = QuotedTriple(row["s"], row["p"], row["o"])
+                 qt_dict._id_to_qt[qt_id] = qt
+                 # Key the interning map on the same 128-bit hash that
+                 # QtDict.get_or_create uses; Python's built-in hash() would
+                 # not match, and every reloaded triple would be re-interned.
+                 qt_dict._hash_to_id[qt.compute_hash()] = qt_id
+
+         return qt_dict
+
+     def _save_metadata(
+         self,
+         term_dict: TermDict,
+         fact_store: FactStore,
+         qt_dict: QtDict,
+     ) -> None:
+         """Save counters and metadata to Parquet."""
+         # Store the next-ID counter for each term kind, plus the
+         # transaction counter
+         df = pl.DataFrame({
+             "key": [
+                 "next_iri", "next_literal", "next_bnode", "next_qt", "next_txn",
+             ],
+             "value": [
+                 term_dict._next_payload[TermKind.IRI],
+                 term_dict._next_payload[TermKind.LITERAL],
+                 term_dict._next_payload[TermKind.BNODE],
+                 term_dict._next_payload[TermKind.QUOTED_TRIPLE],
+                 fact_store._next_txn,
+             ],
+         })
+
+         df.write_parquet(self.base_path / self.METADATA_FILE)
+
+     def _load_metadata(
+         self,
+         term_dict: TermDict,
+         fact_store: FactStore,
+         qt_dict: QtDict,
+     ) -> None:
+         """Restore counters and metadata from Parquet."""
+         metadata_path = self.base_path / self.METADATA_FILE
+         if not metadata_path.exists():
+             # No metadata file: infer counters from the loaded data
+             self._infer_counters(term_dict, fact_store)
+             return
+
+         df = pl.read_parquet(metadata_path)
+
+         # Build a lookup dict
+         meta = dict(zip(df["key"].to_list(), df["value"].to_list()))
+
+         term_dict._next_payload[TermKind.IRI] = meta.get("next_iri", 0)
+         term_dict._next_payload[TermKind.LITERAL] = meta.get("next_literal", 0)
+         term_dict._next_payload[TermKind.BNODE] = meta.get("next_bnode", 0)
+         term_dict._next_payload[TermKind.QUOTED_TRIPLE] = meta.get("next_qt", 0)
+         fact_store._next_txn = meta.get("next_txn", 0)
+
+         # Re-initialize well-known IDs
+         term_dict._init_well_known()
+
+     def _infer_counters(
+         self,
+         term_dict: TermDict,
+         fact_store: FactStore,
+     ) -> None:
+         """Infer counter values from loaded data."""
+         # Find the max payload for each kind (the low 56 bits of a term_id
+         # carry the payload; the remaining high bits encode the TermKind)
+         for term_id, term in term_dict._id_to_term.items():
+             kind = term.kind
+             payload = term_id & 0x00FFFFFFFFFFFFFF
+             if payload >= term_dict._next_payload[kind]:
+                 term_dict._next_payload[kind] = payload + 1
+
+         # Infer next_txn from the facts
+         if len(fact_store._df) > 0 and "txn" in fact_store._df.columns:
+             max_txn = fact_store._df["txn"].max()
+             if max_txn is not None:
+                 fact_store._next_txn = max_txn + 1
+
+         # Re-initialize well-known IDs
+         term_dict._init_well_known()
+
+
+ def save_storage(
+     base_path: str | Path,
+     term_dict: TermDict,
+     fact_store: FactStore,
+     qt_dict: QtDict,
+ ) -> None:
+     """
+     Convenience function to save storage to disk.
+
+     Args:
+         base_path: Directory path for storage files
+         term_dict: Term dictionary to save
+         fact_store: Fact store to save
+         qt_dict: Quoted triple dictionary to save
+     """
+     persistence = StoragePersistence(base_path)
+     persistence.save(term_dict, fact_store, qt_dict)
+
+
+ def load_storage(base_path: str | Path) -> tuple[TermDict, FactStore, QtDict]:
+     """
+     Convenience function to load storage from disk.
+
+     Args:
+         base_path: Directory path containing storage files
+
+     Returns:
+         Tuple of (TermDict, FactStore, QtDict)
+     """
+     persistence = StoragePersistence(base_path)
+     return persistence.load()
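
For reference, a minimal round-trip sketch of the module above. The constructor signatures TermDict() and FactStore(term_dict, qt_dict), and the module path rdf_starbase.storage.persistence, are assumptions for illustration; neither appears in this diff.

    from rdf_starbase.storage.terms import TermDict
    from rdf_starbase.storage.quoted_triples import QtDict
    from rdf_starbase.storage.facts import FactStore
    from rdf_starbase.storage.persistence import save_storage, load_storage

    # Build an in-memory store (constructor signatures assumed).
    term_dict = TermDict()
    qt_dict = QtDict(term_dict)
    fact_store = FactStore(term_dict, qt_dict)

    # Persist under ./store, then load everything back.
    save_storage("./store", term_dict, fact_store, qt_dict)
    term_dict2, fact_store2, qt_dict2 = load_storage("./store")
    assert len(qt_dict2) == len(qt_dict)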
@@ -0,0 +1,292 @@
+ """
+ Quoted Triple Dictionary.
+
+ Implements the qt_dict catalog for RDF★ quoted triples.
+ Quoted triples are first-class terms that can appear as subjects or objects.
+
+ Key design decisions (from storage-spec.md):
+ - qt_id is a TermId with QUOTED_TRIPLE kind
+ - Graph-agnostic quoting: the key is (s, p, o) only (simpler, lower cardinality)
+ - Hash-based interning for fast bulk dedupe
+ - Stores qt_hash for fast rebuild at startup
+ """
+
+ from dataclasses import dataclass
+ from typing import Optional, Tuple
+ from pathlib import Path
+ import struct
+ import hashlib
+
+ import polars as pl
+
+ from rdf_starbase.storage.terms import (
+     TermId,
+     TermKind,
+     TermDict,
+     make_term_id,
+     get_term_payload,
+ )
+
+
+ # Type alias for quoted triple identifiers
+ QtId = TermId  # A QtId is a TermId with kind=QUOTED_TRIPLE
+
+
+ @dataclass(frozen=True, slots=True)
+ class QuotedTriple:
+     """
+     Internal representation of a quoted triple.
+
+     All components are TermIds (already dictionary-encoded).
+     """
+     s: TermId
+     p: TermId
+     o: TermId
+
+     def to_tuple(self) -> Tuple[TermId, TermId, TermId]:
+         """Return as a tuple for hashing."""
+         return (self.s, self.p, self.o)
+
+     def compute_hash(self) -> int:
+         """Compute a 128-bit hash for bulk deduplication."""
+         data = struct.pack('>QQQ', self.s, self.p, self.o)
+         h = hashlib.md5(data).digest()
+         return int.from_bytes(h, 'big')
+
+
+ class QtDict:
+     """
+     Quoted Triple Dictionary.
+
+     Catalogs quoted triples and assigns them stable QtIds (which are TermIds).
+     Supports O(1) lookup by (s, p, o) tuple and by qt_id.
+
+     Relationship with TermDict:
+     - QtDict allocates from the QUOTED_TRIPLE ID space
+     - A qt_id can be used as a subject or object in facts
+     - TermDict handles IRIs, literals, and bnodes; QtDict handles quoted triples
+     """
+
+     def __init__(self, term_dict: TermDict):
+         """
+         Initialize the quoted triple dictionary.
+
+         Args:
+             term_dict: The TermDict to coordinate ID allocation with
+         """
+         self._term_dict = term_dict
+
+         # Hash -> QtId (for interning)
+         self._hash_to_id: dict[int, QtId] = {}
+
+         # QtId -> QuotedTriple (for expansion)
+         self._id_to_qt: dict[QtId, QuotedTriple] = {}
+
+         # Statistics
+         self._collision_count = 0
+
+     def _allocate_id(self) -> QtId:
+         """Allocate the next QtId."""
+         # Use TermDict's counter to keep the ID spaces coordinated
+         payload = self._term_dict._next_payload[TermKind.QUOTED_TRIPLE]
+         self._term_dict._next_payload[TermKind.QUOTED_TRIPLE] = payload + 1
+         return make_term_id(TermKind.QUOTED_TRIPLE, payload)
+
+     def get_or_create(self, s: TermId, p: TermId, o: TermId) -> QtId:
+         """
+         Intern a quoted triple, returning its QtId.
+
+         If the quoted triple already exists, returns the existing ID.
+         Otherwise, allocates a new ID and stores the triple.
+
+         Args:
+             s: Subject TermId
+             p: Predicate TermId
+             o: Object TermId
+
+         Returns:
+             QtId for the quoted triple
+         """
+         qt = QuotedTriple(s, p, o)
+         qt_hash = qt.compute_hash()
+
+         if qt_hash in self._hash_to_id:
+             existing_id = self._hash_to_id[qt_hash]
+             # Verify it's actually the same triple (hash collision check)
+             if self._id_to_qt[existing_id] == qt:
+                 return existing_id
+             # Hash collision: a different triple produced the same 128-bit
+             # hash; count it and fall through (the new triple's mapping
+             # replaces the old one in _hash_to_id)
+             self._collision_count += 1
+
+         # Allocate a new ID
+         qt_id = self._allocate_id()
+         self._hash_to_id[qt_hash] = qt_id
+         self._id_to_qt[qt_id] = qt
+
+         return qt_id
+
+     def get_or_create_batch(
+         self,
+         triples: list[Tuple[TermId, TermId, TermId]],
+     ) -> list[QtId]:
+         """
+         Bulk intern a batch of quoted triples.
+
+         Optimized for ingestion performance.
+         """
+         return [self.get_or_create(s, p, o) for s, p, o in triples]
+
+     def lookup(self, qt_id: QtId) -> Optional[QuotedTriple]:
+         """
+         Expand a QtId to its (s, p, o) components.
+
+         This is the critical operation for RDF★ expansion joins.
+         """
+         return self._id_to_qt.get(qt_id)
+
+     def lookup_batch(self, qt_ids: list[QtId]) -> list[Optional[QuotedTriple]]:
+         """
+         Bulk expand QtIds to their components.
+
+         Returns QuotedTriple objects (or None for unknown IDs).
+         """
+         return [self._id_to_qt.get(qt_id) for qt_id in qt_ids]
+
+     def expand_to_dataframe(self, qt_ids: list[QtId]) -> pl.DataFrame:
+         """
+         Expand a list of QtIds to a DataFrame with columns: qt_id, s, p, o.
+
+         This is the storage primitive for RDF★ expansion joins
+         (storage-spec.md §8: lookup_qt).
+         """
+         rows = []
+         for qt_id in qt_ids:
+             qt = self._id_to_qt.get(qt_id)
+             if qt is not None:
+                 rows.append({
+                     "qt_id": qt_id,
+                     "s": qt.s,
+                     "p": qt.p,
+                     "o": qt.o,
+                 })
+
+         if not rows:
+             return pl.DataFrame({
+                 "qt_id": pl.Series([], dtype=pl.UInt64),
+                 "s": pl.Series([], dtype=pl.UInt64),
+                 "p": pl.Series([], dtype=pl.UInt64),
+                 "o": pl.Series([], dtype=pl.UInt64),
+             })
+
+         return pl.DataFrame(rows).cast({
+             "qt_id": pl.UInt64,
+             "s": pl.UInt64,
+             "p": pl.UInt64,
+             "o": pl.UInt64,
+         })
+
+     def get_id(self, s: TermId, p: TermId, o: TermId) -> Optional[QtId]:
+         """Get the QtId for a triple if it exists, without creating it."""
+         qt = QuotedTriple(s, p, o)
+         qt_hash = qt.compute_hash()
+         if qt_hash not in self._hash_to_id:
+             return None
+         existing_id = self._hash_to_id[qt_hash]
+         if self._id_to_qt[existing_id] == qt:
+             return existing_id
+         return None
+
+     def contains(self, s: TermId, p: TermId, o: TermId) -> bool:
+         """Check whether a quoted triple is already interned."""
+         return self.get_id(s, p, o) is not None
+
+     def __len__(self) -> int:
+         """Return the total number of quoted triples."""
+         return len(self._id_to_qt)
+
+     @property
+     def collision_count(self) -> int:
+         """Return the number of hash collisions encountered."""
+         return self._collision_count
+
+     # =========================================================================
+     # Persistence (Parquet)
+     # =========================================================================
+
+     def to_dataframe(self) -> pl.DataFrame:
+         """
+         Export the quoted triple dictionary to a Polars DataFrame.
+
+         Schema matches storage-spec.md §3.3:
+         - qt_id: u64
+         - s: u64
+         - p: u64
+         - o: u64
+         - qt_hash: stored as two u64 columns (hash_high, hash_low)
+         """
+         if not self._id_to_qt:
+             return pl.DataFrame({
+                 "qt_id": pl.Series([], dtype=pl.UInt64),
+                 "s": pl.Series([], dtype=pl.UInt64),
+                 "p": pl.Series([], dtype=pl.UInt64),
+                 "o": pl.Series([], dtype=pl.UInt64),
+                 "hash_high": pl.Series([], dtype=pl.UInt64),
+                 "hash_low": pl.Series([], dtype=pl.UInt64),
+             })
+
+         rows = []
+         for qt_id, qt in self._id_to_qt.items():
+             qt_hash = qt.compute_hash()
+             rows.append({
+                 "qt_id": qt_id,
+                 "s": qt.s,
+                 "p": qt.p,
+                 "o": qt.o,
+                 # Split the 128-bit hash into two u64 halves for Parquet
+                 "hash_high": qt_hash >> 64,
+                 "hash_low": qt_hash & ((1 << 64) - 1),
+             })
+
+         return pl.DataFrame(rows).cast({
+             "qt_id": pl.UInt64,
+             "s": pl.UInt64,
+             "p": pl.UInt64,
+             "o": pl.UInt64,
+             "hash_high": pl.UInt64,
+             "hash_low": pl.UInt64,
+         })
+
+     def save(self, path: Path) -> None:
+         """Save the quoted triple dictionary to qt_dict.parquet under `path`."""
+         path = Path(path)
+         path.mkdir(parents=True, exist_ok=True)
+         self.to_dataframe().write_parquet(path / "qt_dict.parquet")
+
+     @classmethod
+     def load(cls, path: Path, term_dict: TermDict) -> "QtDict":
+         """Load a quoted triple dictionary from qt_dict.parquet under `path`."""
+         path = Path(path)
+
+         instance = cls(term_dict)
+
+         df = pl.read_parquet(path / "qt_dict.parquet")
+         for row in df.iter_rows(named=True):
+             qt_id = row["qt_id"]
+             qt = QuotedTriple(row["s"], row["p"], row["o"])
+             # Reassemble the 128-bit hash from its two u64 halves
+             qt_hash = (row["hash_high"] << 64) | row["hash_low"]
+
+             instance._id_to_qt[qt_id] = qt
+             instance._hash_to_id[qt_hash] = qt_id
+
+             # Advance the QUOTED_TRIPLE sequence counter in TermDict
+             payload = get_term_payload(qt_id)
+             if payload >= term_dict._next_payload[TermKind.QUOTED_TRIPLE]:
+                 term_dict._next_payload[TermKind.QUOTED_TRIPLE] = payload + 1
+
+         return instance
+
+     def stats(self) -> dict:
+         """Return statistics about the quoted triple dictionary."""
+         return {
+             "total_quoted_triples": len(self),
+             "hash_collisions": self._collision_count,
+         }
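
A short interning sketch against the QtDict API above. The no-argument TermDict() constructor is an assumption for illustration, and the term ids are fabricated directly with make_term_id (bypassing TermDict's own counter), since TermDict's interning methods are not shown in this diff.

    from rdf_starbase.storage.terms import TermDict, TermKind, make_term_id
    from rdf_starbase.storage.quoted_triples import QtDict

    term_dict = TermDict()  # assumed no-arg constructor
    qt_dict = QtDict(term_dict)

    # Fabricate three IRI term ids from the (kind, payload) packing.
    s = make_term_id(TermKind.IRI, 1)
    p = make_term_id(TermKind.IRI, 2)
    o = make_term_id(TermKind.IRI, 3)

    # Interning is idempotent: the same (s, p, o) yields the same QtId.
    qt_id = qt_dict.get_or_create(s, p, o)
    assert qt_dict.get_or_create(s, p, o) == qt_id
    assert qt_dict.lookup(qt_id).to_tuple() == (s, p, o)

    # Expansion-join primitive: one row per known QtId.
    print(qt_dict.expand_to_dataframe([qt_id]))  # columns: qt_id, s, p, o (u64)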