PyPI - rdf-starbase - Versions diffs - 0.1.0__py3-none-any.whl - Mend

rdf-starbase 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

rdf_starbase/__init__.py +57 -0
rdf_starbase/ai_grounding.py +728 -0
rdf_starbase/compat/__init__.py +26 -0
rdf_starbase/compat/rdflib.py +1104 -0
rdf_starbase/formats/__init__.py +29 -0
rdf_starbase/formats/jsonld.py +488 -0
rdf_starbase/formats/ntriples.py +419 -0
rdf_starbase/formats/rdfxml.py +434 -0
rdf_starbase/formats/turtle.py +882 -0
rdf_starbase/models.py +92 -0
rdf_starbase/registry.py +540 -0
rdf_starbase/repositories.py +407 -0
rdf_starbase/repository_api.py +739 -0
rdf_starbase/sparql/__init__.py +35 -0
rdf_starbase/sparql/ast.py +910 -0
rdf_starbase/sparql/executor.py +1925 -0
rdf_starbase/sparql/parser.py +1716 -0
rdf_starbase/storage/__init__.py +44 -0
rdf_starbase/storage/executor.py +1914 -0
rdf_starbase/storage/facts.py +850 -0
rdf_starbase/storage/lsm.py +531 -0
rdf_starbase/storage/persistence.py +338 -0
rdf_starbase/storage/quoted_triples.py +292 -0
rdf_starbase/storage/reasoner.py +1035 -0
rdf_starbase/storage/terms.py +628 -0
rdf_starbase/store.py +1049 -0
rdf_starbase/store_legacy.py +748 -0
rdf_starbase/web.py +568 -0
rdf_starbase-0.1.0.dist-info/METADATA +706 -0
rdf_starbase-0.1.0.dist-info/RECORD +31 -0
rdf_starbase-0.1.0.dist-info/WHEEL +4 -0

rdf_starbase/storage/terms.py ADDED Viewed

@@ -0,0 +1,628 @@
+"""
+Term Dictionary with Integer ID Encoding.
+Implements dictionary-encoded RDF terms for high-performance columnar storage.
+All terms (IRIs, literals, blank nodes, quoted triples) are mapped to u64 TermIds.
+Key design decisions (from storage-spec.md):
+- Tagged ID space: high bits encode term kind for O(1) kind detection
+- Hash-based interning: 128-bit hashes for fast bulk dedupe
+- Batch-first: bulk get_or_create operations for ingestion performance
+- Persistence: Parquet-backed for restart-safe term catalogs
+"""
+from dataclasses import dataclass, field
+from enum import IntEnum
+from typing import Optional, Any, Union
+from pathlib import Path
+import hashlib
+import struct
+import polars as pl
+# =============================================================================
+# Term Identity and Encoding
+# =============================================================================
+class TermKind(IntEnum):
+    """
+    RDF term kind enumeration.
+    Encoded in the high 2 bits of TermId for O(1) kind detection.
+    """
+    IRI = 0
+    LITERAL = 1
+    BNODE = 2
+    QUOTED_TRIPLE = 3
+# Type alias for term identifiers (u64)
+TermId = int
+# Constants for ID encoding
+KIND_SHIFT = 62
+KIND_MASK = 0x3  # 2 bits
+PAYLOAD_MASK = (1 << KIND_SHIFT) - 1
+def make_term_id(kind: TermKind, payload: int) -> TermId:
+    """Create a TermId from kind and payload."""
+    return (kind << KIND_SHIFT) | (payload & PAYLOAD_MASK)
+def get_term_kind(term_id: TermId) -> TermKind:
+    """Extract the term kind from a TermId (O(1) operation)."""
+    return TermKind((term_id >> KIND_SHIFT) & KIND_MASK)
+def get_term_payload(term_id: TermId) -> int:
+    """Extract the payload (sequence number) from a TermId."""
+    return term_id & PAYLOAD_MASK
+def is_quoted_triple(term_id: TermId) -> bool:
+    """Check if a TermId refers to a quoted triple."""
+    return get_term_kind(term_id) == TermKind.QUOTED_TRIPLE
+# =============================================================================
+# Term Representation
+# =============================================================================
+@dataclass(frozen=True, slots=True)
+class Term:
+    """
+    Internal representation of an RDF term.
+    Attributes:
+        kind: The type of term (IRI, LITERAL, BNODE, QUOTED_TRIPLE)
+        lex: Lexical form (IRI string, literal value, bnode label)
+        datatype_id: TermId of datatype IRI (for typed literals)
+        lang: Language tag (for language-tagged literals)
+    """
+    kind: TermKind
+    lex: str
+    datatype_id: Optional[TermId] = None
+    lang: Optional[str] = None
+    def __hash__(self) -> int:
+        return hash((self.kind, self.lex, self.datatype_id, self.lang))
+    def canonical_bytes(self) -> bytes:
+        """
+        Generate canonical byte representation for hashing.
+        Includes: kind tag, lexical form, datatype IRI, language tag.
+        """
+        parts = [
+            struct.pack('B', self.kind),
+            self.lex.encode('utf-8'),
+        ]
+        if self.datatype_id is not None:
+            parts.append(struct.pack('>Q', self.datatype_id))
+        if self.lang is not None:
+            parts.append(b'@')
+            parts.append(self.lang.encode('utf-8'))
+        return b'\x00'.join(parts)
+    def compute_hash(self) -> int:
+        """Compute 128-bit hash for bulk deduplication."""
+        h = hashlib.md5(self.canonical_bytes()).digest()
+        return int.from_bytes(h, 'big')
+    @classmethod
+    def iri(cls, value: str) -> "Term":
+        """Create an IRI term."""
+        return cls(kind=TermKind.IRI, lex=value)
+    @classmethod
+    def literal(
+        cls,
+        value: str,
+        datatype_id: Optional[TermId] = None,
+        lang: Optional[str] = None
+    ) -> "Term":
+        """Create a literal term."""
+        return cls(
+            kind=TermKind.LITERAL,
+            lex=value,
+            datatype_id=datatype_id,
+            lang=lang
+        )
+    @classmethod
+    def bnode(cls, label: str) -> "Term":
+        """Create a blank node term."""
+        return cls(kind=TermKind.BNODE, lex=label)
+# =============================================================================
+# Term Dictionary
+# =============================================================================
+class TermDict:
+    """
+    Dictionary-encoded term catalog.
+    Maps RDF terms to integer TermIds with:
+    - O(1) kind detection via tagged ID space
+    - Hash-based bulk interning for fast ingestion
+    - Parquet persistence for restart-safe catalogs
+    Thread-safety: NOT thread-safe. Use external synchronization for concurrent access.
+    """
+    # Well-known datatype IRIs (pre-interned)
+    XSD_STRING = "http://www.w3.org/2001/XMLSchema#string"
+    XSD_INTEGER = "http://www.w3.org/2001/XMLSchema#integer"
+    XSD_DECIMAL = "http://www.w3.org/2001/XMLSchema#decimal"
+    XSD_DOUBLE = "http://www.w3.org/2001/XMLSchema#double"
+    XSD_BOOLEAN = "http://www.w3.org/2001/XMLSchema#boolean"
+    XSD_DATETIME = "http://www.w3.org/2001/XMLSchema#dateTime"
+    RDF_LANGSTRING = "http://www.w3.org/1999/02/22-rdf-syntax-ns#langString"
+    def __init__(self):
+        # Per-kind sequence counters
+        self._next_payload: dict[TermKind, int] = {
+            TermKind.IRI: 0,
+            TermKind.LITERAL: 0,
+            TermKind.BNODE: 0,
+            TermKind.QUOTED_TRIPLE: 0,
+        }
+        # Forward map: hash -> TermId (for interning)
+        self._hash_to_id: dict[int, TermId] = {}
+        # Reverse map: TermId -> Term (for lookup)
+        self._id_to_term: dict[TermId, Term] = {}
+        # =================================================================
+        # FAST PATH CACHES: Direct string->TermId lookup (no hashing)
+        # These bypass the expensive MD5 computation for common cases
+        # =================================================================
+        self._iri_cache: dict[str, TermId] = {}           # IRI string -> TermId
+        self._plain_literal_cache: dict[str, TermId] = {} # Plain string literal -> TermId
+        self._bnode_cache: dict[str, TermId] = {}         # Blank node label -> TermId
+        # Statistics
+        self._collision_count = 0
+        # Pre-intern well-known datatypes
+        self._init_well_known()
+    def _init_well_known(self):
+        """Pre-intern well-known datatype IRIs and populate caches."""
+        self.xsd_string_id = self.get_or_create(Term.iri(self.XSD_STRING))
+        self.xsd_integer_id = self.get_or_create(Term.iri(self.XSD_INTEGER))
+        self.xsd_decimal_id = self.get_or_create(Term.iri(self.XSD_DECIMAL))
+        self.xsd_double_id = self.get_or_create(Term.iri(self.XSD_DOUBLE))
+        self.xsd_boolean_id = self.get_or_create(Term.iri(self.XSD_BOOLEAN))
+        self.xsd_datetime_id = self.get_or_create(Term.iri(self.XSD_DATETIME))
+        self.rdf_langstring_id = self.get_or_create(Term.iri(self.RDF_LANGSTRING))
+        # Populate fast-path IRI cache for well-known IRIs
+        self._iri_cache[self.XSD_STRING] = self.xsd_string_id
+        self._iri_cache[self.XSD_INTEGER] = self.xsd_integer_id
+        self._iri_cache[self.XSD_DECIMAL] = self.xsd_decimal_id
+        self._iri_cache[self.XSD_DOUBLE] = self.xsd_double_id
+        self._iri_cache[self.XSD_BOOLEAN] = self.xsd_boolean_id
+        self._iri_cache[self.XSD_DATETIME] = self.xsd_datetime_id
+        self._iri_cache[self.RDF_LANGSTRING] = self.rdf_langstring_id
+    def _allocate_id(self, kind: TermKind) -> TermId:
+        """Allocate the next TermId for a given kind."""
+        payload = self._next_payload[kind]
+        self._next_payload[kind] = payload + 1
+        return make_term_id(kind, payload)
+    def get_or_create(self, term: Term) -> TermId:
+        """
+        Intern a term, returning its TermId.
+        If the term already exists, returns the existing ID.
+        Otherwise, allocates a new ID and stores the term.
+        """
+        term_hash = term.compute_hash()
+        if term_hash in self._hash_to_id:
+            existing_id = self._hash_to_id[term_hash]
+            # Verify it's actually the same term (hash collision check)
+            if self._id_to_term[existing_id] == term:
+                return existing_id
+            # Hash collision - need to handle
+            self._collision_count += 1
+            # Fall through to create new entry with different ID
+        # Allocate new ID
+        term_id = self._allocate_id(term.kind)
+        self._hash_to_id[term_hash] = term_id
+        self._id_to_term[term_id] = term
+        return term_id
+    def get_or_create_batch(self, terms: list[Term]) -> list[TermId]:
+        """
+        Bulk intern a batch of terms.
+        Optimized for ingestion performance. Returns TermIds in the same order.
+        """
+        return [self.get_or_create(term) for term in terms]
+    def lookup(self, term_id: TermId) -> Optional[Term]:
+        """Look up a term by its ID."""
+        return self._id_to_term.get(term_id)
+    def lookup_batch(self, term_ids: list[TermId]) -> list[Optional[Term]]:
+        """Bulk lookup terms by their IDs."""
+        return [self._id_to_term.get(tid) for tid in term_ids]
+    def contains(self, term: Term) -> bool:
+        """Check if a term is already interned."""
+        term_hash = term.compute_hash()
+        if term_hash not in self._hash_to_id:
+            return False
+        existing_id = self._hash_to_id[term_hash]
+        return self._id_to_term[existing_id] == term
+    def get_id(self, term: Term) -> Optional[TermId]:
+        """Get the TermId for a term if it exists, without creating it."""
+        term_hash = term.compute_hash()
+        if term_hash not in self._hash_to_id:
+            return None
+        existing_id = self._hash_to_id[term_hash]
+        if self._id_to_term[existing_id] == term:
+            return existing_id
+        return None
+    def __len__(self) -> int:
+        """Return the total number of interned terms."""
+        return len(self._id_to_term)
+    def count_by_kind(self) -> dict[TermKind, int]:
+        """Return counts of terms by kind."""
+        return {kind: self._next_payload[kind] for kind in TermKind}
+    @property
+    def collision_count(self) -> int:
+        """Return the number of hash collisions encountered."""
+        return self._collision_count
+    # =========================================================================
+    # Persistence (Parquet)
+    # =========================================================================
+    def to_dataframe(self) -> pl.DataFrame:
+        """
+        Export the term dictionary to a Polars DataFrame.
+        Schema matches storage-spec.md §3.1:
+        - term_id: u64
+        - kind: u8
+        - lex: string
+        - datatype_id: u64 (nullable)
+        - lang: string (nullable)
+        """
+        if not self._id_to_term:
+            return pl.DataFrame({
+                "term_id": pl.Series([], dtype=pl.UInt64),
+                "kind": pl.Series([], dtype=pl.UInt8),
+                "lex": pl.Series([], dtype=pl.Utf8),
+                "datatype_id": pl.Series([], dtype=pl.UInt64),
+                "lang": pl.Series([], dtype=pl.Utf8),
+            })
+        rows = []
+        for term_id, term in self._id_to_term.items():
+            rows.append({
+                "term_id": term_id,
+                "kind": int(term.kind),
+                "lex": term.lex,
+                "datatype_id": term.datatype_id,
+                "lang": term.lang,
+            })
+        return pl.DataFrame(rows).cast({
+            "term_id": pl.UInt64,
+            "kind": pl.UInt8,
+        })
+    def to_hash_dataframe(self) -> pl.DataFrame:
+        """
+        Export the term hash table to a Polars DataFrame.
+        Schema matches storage-spec.md §3.2:
+        - term_hash: stored as two u64 columns (hash_high, hash_low)
+        - term_id: u64
+        """
+        if not self._hash_to_id:
+            return pl.DataFrame({
+                "hash_high": pl.Series([], dtype=pl.UInt64),
+                "hash_low": pl.Series([], dtype=pl.UInt64),
+                "term_id": pl.Series([], dtype=pl.UInt64),
+            })
+        rows = []
+        for term_hash, term_id in self._hash_to_id.items():
+            hash_high = term_hash >> 64
+            hash_low = term_hash & ((1 << 64) - 1)
+            rows.append({
+                "hash_high": hash_high,
+                "hash_low": hash_low,
+                "term_id": term_id,
+            })
+        return pl.DataFrame(rows).cast({
+            "hash_high": pl.UInt64,
+            "hash_low": pl.UInt64,
+            "term_id": pl.UInt64,
+        })
+    def save(self, path: Path):
+        """
+        Save the term dictionary to Parquet files.
+        Creates:
+        - {path}/term_dict.parquet
+        - {path}/term_hash.parquet
+        """
+        path = Path(path)
+        path.mkdir(parents=True, exist_ok=True)
+        self.to_dataframe().write_parquet(path / "term_dict.parquet")
+        self.to_hash_dataframe().write_parquet(path / "term_hash.parquet")
+    @classmethod
+    def load(cls, path: Path) -> "TermDict":
+        """
+        Load a term dictionary from Parquet files.
+        Expects:
+        - {path}/term_dict.parquet
+        - {path}/term_hash.parquet
+        """
+        path = Path(path)
+        instance = cls.__new__(cls)
+        instance._next_payload = {kind: 0 for kind in TermKind}
+        instance._hash_to_id = {}
+        instance._id_to_term = {}
+        instance._collision_count = 0
+        # Initialize fast-path caches
+        instance._iri_cache = {}
+        instance._plain_literal_cache = {}
+        instance._bnode_cache = {}
+        # Load term dictionary
+        term_df = pl.read_parquet(path / "term_dict.parquet")
+        for row in term_df.iter_rows(named=True):
+            term_id = row["term_id"]
+            term = Term(
+                kind=TermKind(row["kind"]),
+                lex=row["lex"],
+                datatype_id=row["datatype_id"],
+                lang=row["lang"],
+            )
+            instance._id_to_term[term_id] = term
+            # Populate fast-path caches
+            if term.kind == TermKind.IRI:
+                instance._iri_cache[term.lex] = term_id
+            elif term.kind == TermKind.BNODE:
+                instance._bnode_cache[term.lex] = term_id
+            elif term.kind == TermKind.LITERAL and term.lang is None:
+                # Only cache plain literals (no lang tag) - check if it's xsd:string
+                instance._plain_literal_cache[term.lex] = term_id
+            # Update sequence counters
+            kind = get_term_kind(term_id)
+            payload = get_term_payload(term_id)
+            if payload >= instance._next_payload[kind]:
+                instance._next_payload[kind] = payload + 1
+        # Load hash table
+        hash_df = pl.read_parquet(path / "term_hash.parquet")
+        for row in hash_df.iter_rows(named=True):
+            term_hash = (row["hash_high"] << 64) | row["hash_low"]
+            instance._hash_to_id[term_hash] = row["term_id"]
+        # Restore well-known IDs
+        instance._restore_well_known()
+        return instance
+    def _restore_well_known(self):
+        """Restore well-known datatype ID references after loading."""
+        for term_id, term in self._id_to_term.items():
+            if term.kind == TermKind.IRI:
+                if term.lex == self.XSD_STRING:
+                    self.xsd_string_id = term_id
+                elif term.lex == self.XSD_INTEGER:
+                    self.xsd_integer_id = term_id
+                elif term.lex == self.XSD_DECIMAL:
+                    self.xsd_decimal_id = term_id
+                elif term.lex == self.XSD_DOUBLE:
+                    self.xsd_double_id = term_id
+                elif term.lex == self.XSD_BOOLEAN:
+                    self.xsd_boolean_id = term_id
+                elif term.lex == self.XSD_DATETIME:
+                    self.xsd_datetime_id = term_id
+                elif term.lex == self.RDF_LANGSTRING:
+                    self.rdf_langstring_id = term_id
+    # =========================================================================
+    # Convenience methods for common term types (OPTIMIZED)
+    # =========================================================================
+    def intern_iri(self, value: str) -> TermId:
+        """Intern an IRI and return its TermId. Uses fast-path cache."""
+        # Fast path: direct string lookup (no Term object, no MD5)
+        cached = self._iri_cache.get(value)
+        if cached is not None:
+            return cached
+        # Slow path: create Term and intern via hash
+        term = Term.iri(value)
+        term_id = self._allocate_id(TermKind.IRI)
+        term_hash = term.compute_hash()
+        self._hash_to_id[term_hash] = term_id
+        self._id_to_term[term_id] = term
+        # Cache for future fast lookups
+        self._iri_cache[value] = term_id
+        return term_id
+    def intern_literal(
+        self,
+        value: Any,
+        datatype: Optional[str] = None,
+        lang: Optional[str] = None
+    ) -> TermId:
+        """
+        Intern a literal and return its TermId.
+        Automatically determines datatype from Python type if not specified.
+        Uses fast-path cache for plain string literals (the common case).
+        """
+        lex = str(value)
+        # FAST PATH: plain string literal with no lang tag (most common case)
+        # Use direct string lookup - no Term object, no MD5 hash
+        if lang is None and datatype is None and isinstance(value, str):
+            cached = self._plain_literal_cache.get(lex)
+            if cached is not None:
+                return cached
+            # Create and cache
+            term = Term.literal(lex, self.xsd_string_id, None)
+            term_id = self._allocate_id(TermKind.LITERAL)
+            term_hash = term.compute_hash()
+            self._hash_to_id[term_hash] = term_id
+            self._id_to_term[term_id] = term
+            self._plain_literal_cache[lex] = term_id
+            return term_id
+        # SLOW PATH: typed literals or lang-tagged strings
+        # Determine datatype ID
+        datatype_id = None
+        if lang is not None:
+            datatype_id = self.rdf_langstring_id
+        elif datatype is not None:
+            datatype_id = self.intern_iri(datatype)
+        elif isinstance(value, bool):
+            datatype_id = self.xsd_boolean_id
+        elif isinstance(value, int):
+            datatype_id = self.xsd_integer_id
+        elif isinstance(value, float):
+            datatype_id = self.xsd_decimal_id
+        else:
+            datatype_id = self.xsd_string_id
+        return self.get_or_create(Term.literal(lex, datatype_id, lang))
+    def intern_bnode(self, label: Optional[str] = None) -> TermId:
+        """
+        Intern a blank node and return its TermId. Uses fast-path cache.
+        If no label is provided, generates a unique one.
+        """
+        if label is None:
+            label = f"b{self._next_payload[TermKind.BNODE]}"
+        # Fast path: direct string lookup
+        cached = self._bnode_cache.get(label)
+        if cached is not None:
+            return cached
+        # Slow path: create and cache
+        term = Term.bnode(label)
+        term_id = self._allocate_id(TermKind.BNODE)
+        term_hash = term.compute_hash()
+        self._hash_to_id[term_hash] = term_id
+        self._id_to_term[term_id] = term
+        self._bnode_cache[label] = term_id
+        return term_id
+    def get_lex(self, term_id: TermId) -> Optional[str]:
+        """Get the lexical form of a term by its ID."""
+        term = self.lookup(term_id)
+        return term.lex if term else None
+    # =========================================================================
+    # Lookup methods (read-only)
+    # =========================================================================
+    def lookup_iri(self, value: str) -> Optional[TermId]:
+        """Look up an IRI's TermId without creating it."""
+        return self.get_id(Term.iri(value))
+    def lookup_literal(
+        self,
+        value: str,
+        datatype: Optional[str] = None,
+        lang: Optional[str] = None
+    ) -> Optional[TermId]:
+        """Look up a literal's TermId without creating it."""
+        # Determine datatype ID (must already exist)
+        datatype_id = None
+        if lang is not None:
+            datatype_id = self.rdf_langstring_id
+        elif datatype is not None:
+            dt_term = Term.iri(datatype)
+            datatype_id = self.get_id(dt_term)
+            if datatype_id is None:
+                return None  # Datatype not in dict means literal can't exist
+        else:
+            datatype_id = self.xsd_string_id
+        return self.get_id(Term.literal(value, datatype_id, lang))
+    def lookup_bnode(self, label: str) -> Optional[TermId]:
+        """Look up a blank node's TermId without creating it."""
+        return self.get_id(Term.bnode(label))
+    def build_literal_to_float_map(self) -> dict[TermId, float]:
+        """
+        Build a mapping from literal term IDs to float values.
+        Returns a dict for all literals that can be parsed as floats.
+        Used for vectorized confidence filtering.
+        """
+        result = {}
+        for term_id, term in self._id_to_term.items():
+            if term.kind == TermKind.LITERAL:
+                try:
+                    result[term_id] = float(term.lex)
+                except (ValueError, TypeError):
+                    continue
+        return result
+    def get_lex_series(self, term_ids: pl.Series) -> pl.Series:
+        """
+        Vectorized lookup of lexical forms for a series of term IDs.
+        Returns a Utf8 Series with lexical forms (null for missing IDs).
+        """
+        # Build a mapping dict for the unique IDs in the series
+        unique_ids = term_ids.unique().to_list()
+        id_to_lex = {}
+        for tid in unique_ids:
+            if tid is not None:
+                term = self._id_to_term.get(tid)
+                if term is not None:
+                    id_to_lex[tid] = term.lex
+        # Map using Polars map_elements for compatibility
+        return term_ids.map_elements(
+            lambda x: id_to_lex.get(x) if x is not None else None,
+            return_dtype=pl.Utf8
+        )
+    def stats(self) -> dict:
+        """Return statistics about the term dictionary."""
+        return {
+            "total_terms": len(self),
+            "by_kind": {kind.name: count for kind, count in self.count_by_kind().items()},
+            "hash_collisions": self._collision_count,
+        }