PyPI - lobster-vector - Versions diffs - 1.1.418__py3-none-any.whl - Mend

lobster-vector 1.1.418__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

lobster/vector/__init__.py +76 -0
lobster/vector/artifact.py +46 -0
lobster/vector/backends/__init__.py +9 -0
lobster/vector/backends/base.py +153 -0
lobster/vector/backends/chromadb_backend.py +509 -0
lobster/vector/backends/faiss_backend.py +311 -0
lobster/vector/backends/pgvector_backend.py +68 -0
lobster/vector/config.py +206 -0
lobster/vector/embeddings/__init__.py +9 -0
lobster/vector/embeddings/base.py +92 -0
lobster/vector/embeddings/minilm.py +128 -0
lobster/vector/embeddings/openai_embedder.py +132 -0
lobster/vector/embeddings/sapbert.py +130 -0
lobster/vector/ontology_graph.py +189 -0
lobster/vector/rerankers/__init__.py +7 -0
lobster/vector/rerankers/base.py +101 -0
lobster/vector/rerankers/cohere_reranker.py +151 -0
lobster/vector/rerankers/cross_encoder_reranker.py +120 -0
lobster/vector/service.py +360 -0
lobster_vector-1.1.418.dist-info/METADATA +34 -0
lobster_vector-1.1.418.dist-info/RECORD +23 -0
lobster_vector-1.1.418.dist-info/WHEEL +5 -0
lobster_vector-1.1.418.dist-info/top_level.txt +1 -0

lobster/vector/__init__.py ADDED Viewed

@@ -0,0 +1,76 @@
+"""
+Vector search infrastructure for Lobster AI.
+Provides pluggable vector database backends and embedding providers
+for semantic search across biomedical ontologies, literature, and datasets.
+Public API is exposed via __all__ but imports are lazy — importing this
+module does NOT load chromadb, torch, sentence-transformers, or any other
+heavy dependency. Classes are resolved on first access via __getattr__.
+Usage::
+    from lobster.vector import VectorSearchService, VectorSearchConfig
+    from lobster.vector import ONTOLOGY_COLLECTIONS
+    from lobster.vector.backends.base import BaseVectorBackend
+    from lobster.vector.embeddings.base import BaseEmbedder
+"""
+from __future__ import annotations
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from lobster.vector.artifact import ArtifactMetadata, CollectionUnavailable
+    from lobster.vector.backends.base import BaseVectorBackend
+    from lobster.vector.config import VectorSearchConfig
+    from lobster.vector.embeddings.base import BaseEmbedder
+    from lobster.vector.rerankers.base import BaseReranker
+    from lobster.vector.service import VectorSearchService
+# Public names advertised by this package. At runtime none of them is bound
+# when the package is imported: each entry is handled by a matching branch of
+# the module-level __getattr__ defined below, which imports the owning
+# submodule only when the name is actually accessed.
+__all__ = [
+    "ArtifactMetadata",
+    "BaseReranker",
+    "BaseVectorBackend",
+    "BaseEmbedder",
+    "CollectionUnavailable",
+    "ONTOLOGY_COLLECTIONS",
+    "VectorSearchService",
+    "VectorSearchConfig",
+]
+def __getattr__(name: str):
+    """Resolve a public name of this package on demand (module-level hook).
+    The submodule that owns *name* is imported inside the matching branch,
+    so importing the package itself does not pull in any of the submodules
+    referenced here.
+    Args:
+        name: Attribute being looked up on the module.
+    Returns:
+        The object exported under *name* by its owning submodule.
+    Raises:
+        AttributeError: If *name* is not one of the lazily exported names.
+    """
+    match name:
+        case "VectorSearchService":
+            from lobster.vector.service import VectorSearchService as resolved
+        case "VectorSearchConfig":
+            from lobster.vector.config import VectorSearchConfig as resolved
+        case "BaseVectorBackend":
+            from lobster.vector.backends.base import BaseVectorBackend as resolved
+        case "BaseEmbedder":
+            from lobster.vector.embeddings.base import BaseEmbedder as resolved
+        case "BaseReranker":
+            from lobster.vector.rerankers.base import BaseReranker as resolved
+        case "ONTOLOGY_COLLECTIONS":
+            from lobster.vector.service import ONTOLOGY_COLLECTIONS as resolved
+        case "ArtifactMetadata":
+            from lobster.vector.artifact import ArtifactMetadata as resolved
+        case "CollectionUnavailable":
+            from lobster.vector.artifact import CollectionUnavailable as resolved
+        case _:
+            # Unknown name: mirror the interpreter's standard message.
+            raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+    return resolved

lobster/vector/artifact.py ADDED Viewed

@@ -0,0 +1,46 @@
+"""Artifact metadata contract for vector collection compatibility."""
+from __future__ import annotations
+from pydantic import BaseModel, Field
+class ArtifactMetadata(BaseModel):
+    """Describes a pre-built vector collection artifact.
+    Used to verify runtime embedder compatibility before querying.
+    If the runtime embedder doesn't match the artifact's embedding config,
+    queries against that collection should fail closed.
+    """
+    # Every Field(...) below passes only a description; no default value is
+    # declared for any field.
+    # --- Embedder identity: what produced the stored vectors ---
+    embedding_provider: str = Field(
+        description="Provider that built embeddings (sapbert, minilm, openai)"
+    )
+    model_id: str = Field(description="Specific model ID used for embedding")
+    dimensions: int = Field(description="Embedding vector dimensionality")
+    # --- Collection identity ---
+    collection: str = Field(
+        description="Collection name (e.g. mondo_v2024_01)"
+    )
+    collection_version: str = Field(
+        description="Version tag of the source ontology"
+    )
+    # --- Build provenance ---
+    # NOTE(review): both values are typed as plain `str`; nothing in this
+    # class checks that build_hash is a SHA256 digest or that build_date
+    # parses as ISO 8601 — confirm whether callers validate them.
+    build_hash: str = Field(description="SHA256 of source OWL/OBO file")
+    build_date: str = Field(description="ISO 8601 build timestamp")
+class CollectionUnavailable:
+    """Result object signalling that a collection cannot be queried safely.
+    Holds the collection name and the reason, plus optional context: the
+    artifact metadata that was expected and the embedding provider that
+    was actually in use.
+    """
+    def __init__(
+        self,
+        collection: str,
+        reason: str,
+        expected: ArtifactMetadata | None = None,
+        actual_provider: str | None = None,
+    ) -> None:
+        # Plain attribute storage; the arguments are kept exactly as given.
+        self.collection, self.reason = collection, reason
+        self.expected, self.actual_provider = expected, actual_provider
+    def __repr__(self) -> str:
+        # Only the two mandatory fields appear in the representation; the
+        # optional context (expected / actual_provider) is not included.
+        name, why = self.collection, self.reason
+        return f"CollectionUnavailable(collection={name!r}, reason={why!r})"

lobster/vector/backends/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+"""
+Vector database backend implementations.
+Provides BaseVectorBackend ABC and backend-specific implementations.
+Implementations are loaded lazily — importing this package does NOT
+trigger chromadb, faiss, or psycopg2 imports.
+"""
+__all__ = ["BaseVectorBackend"]
+def __getattr__(name: str):
+    """Resolve ``BaseVectorBackend`` lazily so the name in ``__all__`` exists.
+    ``__all__`` advertises ``BaseVectorBackend``, but nothing in this module
+    binds it, so ``from lobster.vector.backends import *`` (or a direct
+    ``from lobster.vector.backends import BaseVectorBackend``) would fail.
+    This hook imports the base module only when the name is requested,
+    keeping the package import itself free of submodule imports.
+    Args:
+        name: Attribute being looked up on the module.
+    Returns:
+        The ``BaseVectorBackend`` class from ``lobster.vector.backends.base``.
+    Raises:
+        AttributeError: If *name* is anything other than ``BaseVectorBackend``.
+    """
+    if name == "BaseVectorBackend":
+        from lobster.vector.backends.base import BaseVectorBackend
+        return BaseVectorBackend
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

lobster/vector/backends/base.py ADDED Viewed

@@ -0,0 +1,153 @@
+"""
+Abstract base class for vector database backends.
+Defines the contract that all vector storage implementations must follow,
+enabling pluggable backends (ChromaDB, FAISS, pgvector) with a consistent API.
+Backend implementations are discovered via entry points and loaded lazily
+to avoid importing heavy dependencies at startup.
+Part of Phase 1 (Foundation) — implementations added in Phase 2+.
+"""
+from abc import ABC, abstractmethod
+from typing import Any
+class BaseVectorBackend(ABC):
+    """
+    Contract shared by every vector database backend.
+    Concrete backends subclass this and provide the four core operations
+    (add, search, delete, count). Only plain Python types (lists, dicts)
+    cross the interface, so callers are not tied to any particular
+    backend's data model.
+    Connection management and resource cleanup are the responsibility of
+    each implementation.
+    """
+    @abstractmethod
+    def add_documents(
+        self,
+        collection_name: str,
+        ids: list[str],
+        embeddings: list[list[float]],
+        documents: list[str] | None = None,
+        metadatas: list[dict[str, Any]] | None = None,
+    ) -> None:
+        """
+        Store documents and their embeddings in a collection.
+        The collection is created when missing. An existing document with
+        the same ID is overwritten (upsert semantics).
+        Args:
+            collection_name: Target collection.
+            ids: One unique identifier per document; same length as
+                embeddings.
+            embeddings: Pre-computed vectors, all of the same
+                dimensionality.
+            documents: Optional raw text for each embedding, stored next
+                to the vectors for retrieval.
+            metadatas: Optional per-document metadata dicts, used for
+                filtering and returned with search results.
+        Raises:
+            ValueError: If ids, embeddings, documents, or metadatas differ
+                in length.
+            ConnectionError: If the backend is unreachable.
+        """
+    @abstractmethod
+    def search(
+        self,
+        collection_name: str,
+        query_embedding: list[float],
+        n_results: int = 5,
+    ) -> dict[str, Any]:
+        """
+        Run a vector-similarity query against a collection.
+        The raw backend results come back in a column-oriented layout
+        compatible with ChromaDB's response structure; callers are expected
+        to normalize them into SearchResult/OntologyMatch models.
+        Args:
+            collection_name: Collection to query.
+            query_embedding: Query vector; its dimensionality must match
+                the stored embeddings.
+            n_results: Upper bound on the number of results.
+        Returns:
+            dict[str, Any]: Raw results with keys:
+                - "ids": list[list[str]] — matched document IDs
+                - "distances": list[list[float]] — distance scores
+                - "documents": list[list[str | None]] — document texts
+                - "metadatas": list[list[dict | None]] — metadata dicts
+        Raises:
+            ValueError: If the collection does not exist.
+            ValueError: If query_embedding's dimensionality differs from
+                that of the collection's embeddings.
+        """
+    @abstractmethod
+    def delete(
+        self,
+        collection_name: str,
+        ids: list[str],
+    ) -> None:
+        """
+        Remove documents from a collection by ID.
+        IDs that are not present in the collection are silently ignored.
+        Args:
+            collection_name: Collection to delete from.
+            ids: Document IDs to remove.
+        Raises:
+            ValueError: If the collection does not exist.
+            ConnectionError: If the backend is unreachable.
+        """
+    @abstractmethod
+    def count(
+        self,
+        collection_name: str,
+    ) -> int:
+        """
+        Report how many documents a collection holds.
+        Args:
+            collection_name: Collection to inspect.
+        Returns:
+            int: Number of documents in the collection.
+        Raises:
+            ValueError: If the collection does not exist.
+        """
+    def collection_exists(self, collection_name: str) -> bool:
+        """
+        Tell whether a collection is present in the backend.
+        This default probes the collection with count() and treats any
+        exception as "not present". Backends with a cheaper native check
+        may override it.
+        Args:
+            collection_name: Collection to look for.
+        Returns:
+            bool: True if the collection exists, False otherwise.
+        """
+        try:
+            self.count(collection_name)
+        except Exception:
+            # Best-effort probe: every failure of count() maps to False.
+            return False
+        else:
+            return True