lobster-vector 1.1.418__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,76 @@
1
+ """
2
+ Vector search infrastructure for Lobster AI.
3
+
4
+ Provides pluggable vector database backends and embedding providers
5
+ for semantic search across biomedical ontologies, literature, and datasets.
6
+
7
+ Public API is exposed via __all__ but imports are lazy — importing this
8
+ module does NOT load chromadb, torch, sentence-transformers, or any other
9
+ heavy dependency. Classes are resolved on first access via __getattr__.
10
+
11
+ Usage::
12
+
13
+ from lobster.vector import VectorSearchService, VectorSearchConfig
14
+ from lobster.vector import ONTOLOGY_COLLECTIONS
15
+ from lobster.vector.backends.base import BaseVectorBackend
16
+ from lobster.vector.embeddings.base import BaseEmbedder
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ from typing import TYPE_CHECKING
22
+
23
+ if TYPE_CHECKING:
24
+ from lobster.vector.artifact import ArtifactMetadata, CollectionUnavailable
25
+ from lobster.vector.backends.base import BaseVectorBackend
26
+ from lobster.vector.config import VectorSearchConfig
27
+ from lobster.vector.embeddings.base import BaseEmbedder
28
+ from lobster.vector.rerankers.base import BaseReranker
29
+ from lobster.vector.service import VectorSearchService
30
+
31
# Public names of this package. None of them is bound when the package is
# imported: each one is resolved on first attribute access by the
# module-level ``__getattr__`` defined further down, so any name added here
# must also be resolvable there.
__all__ = [
    "ArtifactMetadata",
    "BaseReranker",
    "BaseVectorBackend",
    "BaseEmbedder",
    "CollectionUnavailable",
    "ONTOLOGY_COLLECTIONS",
    "VectorSearchService",
    "VectorSearchConfig",
]
41
+
42
+
43
# Maps every lazily exported name to the submodule that defines it.
# Adding a public name only requires a new entry here (and in ``__all__``).
_LAZY_ATTRS = {
    "VectorSearchService": "lobster.vector.service",
    "VectorSearchConfig": "lobster.vector.config",
    "BaseVectorBackend": "lobster.vector.backends.base",
    "BaseEmbedder": "lobster.vector.embeddings.base",
    "BaseReranker": "lobster.vector.rerankers.base",
    "ONTOLOGY_COLLECTIONS": "lobster.vector.service",
    "ArtifactMetadata": "lobster.vector.artifact",
    "CollectionUnavailable": "lobster.vector.artifact",
}


def __getattr__(name: str):
    """Resolve a public name of this package on first access (PEP 562).

    The defining submodule is imported only when one of the names listed in
    ``_LAZY_ATTRS`` is requested, so importing the package itself does not
    import any of those submodules.

    Args:
        name: Attribute requested on the package module.

    Returns:
        The object called ``name`` taken from its defining submodule.

    Raises:
        AttributeError: If ``name`` is not one of the lazily exported names.
        ImportError: If the defining submodule cannot be imported or does
            not provide ``name``.
    """
    module_path = _LAZY_ATTRS.get(name)
    if module_path is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # Imported here rather than at module level so this block is
    # self-contained; importlib is part of the standard library.
    from importlib import import_module

    module = import_module(module_path)
    try:
        return getattr(module, name)
    except AttributeError as exc:
        # ``from module import name`` reports a missing name as ImportError;
        # keep that exception type for callers that catch it.
        raise ImportError(
            f"cannot import name {name!r} from {module_path!r}",
            name=module_path,
        ) from exc
@@ -0,0 +1,46 @@
1
+ """Artifact metadata contract for vector collection compatibility."""
2
+ from __future__ import annotations
3
+
4
+ from pydantic import BaseModel, Field
5
+
6
+
7
class ArtifactMetadata(BaseModel):
    """Describes a pre-built vector collection artifact.

    Used to verify runtime embedder compatibility before querying.
    If the runtime embedder doesn't match the artifact's embedding config,
    queries against that collection should fail closed.
    """

    # No Field(...) call below supplies a default, so every field must be
    # provided when the model is constructed.

    # --- Embedding configuration the artifact was built with ---
    embedding_provider: str = Field(
        description="Provider that built embeddings (sapbert, minilm, openai)"
    )
    model_id: str = Field(description="Specific model ID used for embedding")
    dimensions: int = Field(description="Embedding vector dimensionality")

    # --- Identity of the collection and of its source data ---
    collection: str = Field(
        description="Collection name (e.g. mondo_v2024_01)"
    )
    collection_version: str = Field(
        description="Version tag of the source ontology"
    )

    # --- Build provenance ---
    # Both values are typed as plain ``str``; this class declares no
    # validator, so the hash and timestamp formats are not checked here.
    build_hash: str = Field(description="SHA256 of source OWL/OBO file")
    build_date: str = Field(description="ISO 8601 build timestamp")
28
+
29
+
30
class CollectionUnavailable:
    """Result object signalling that a collection cannot be queried safely.

    Attributes:
        collection: Name of the collection concerned.
        reason: Explanation of why the collection is unavailable.
        expected: Optional ``ArtifactMetadata`` attached to the result
            (presumably the artifact's own metadata; confirm with callers).
        actual_provider: Optional provider name attached to the result
            (presumably the runtime embedder's provider; confirm with
            callers).
    """

    def __init__(
        self,
        collection: str,
        reason: str,
        expected: ArtifactMetadata | None = None,
        actual_provider: str | None = None,
    ):
        # Store the arguments unchanged; no validation is performed.
        self.collection, self.reason = collection, reason
        self.expected, self.actual_provider = expected, actual_provider

    def __repr__(self) -> str:
        # Only the two mandatory fields are part of the representation.
        shown = f"collection={self.collection!r}, reason={self.reason!r}"
        return f"CollectionUnavailable({shown})"
@@ -0,0 +1,9 @@
1
+ """
2
+ Vector database backend implementations.
3
+
4
+ Provides BaseVectorBackend ABC and backend-specific implementations.
5
+ Implementations are loaded lazily — importing this package does NOT
6
+ trigger chromadb, faiss, or psycopg2 imports.
7
+ """
8
+
9
__all__ = ["BaseVectorBackend"]


def __getattr__(name: str):
    """Resolve the names advertised in ``__all__`` on first access (PEP 562).

    Without this hook the package lists ``BaseVectorBackend`` in ``__all__``
    but never binds it, so ``from lobster.vector.backends import
    BaseVectorBackend`` and ``from lobster.vector.backends import *`` fail.

    Args:
        name: Attribute requested on the package module.

    Returns:
        The ``BaseVectorBackend`` class when that name is requested.

    Raises:
        AttributeError: For any other name.
    """
    if name == "BaseVectorBackend":
        # Deferred so that importing the package alone stays import-free.
        from lobster.vector.backends.base import BaseVectorBackend

        return BaseVectorBackend
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
@@ -0,0 +1,153 @@
1
+ """
2
+ Abstract base class for vector database backends.
3
+
4
+ Defines the contract that all vector storage implementations must follow,
5
+ enabling pluggable backends (ChromaDB, FAISS, pgvector) with a consistent API.
6
+ Backend implementations are discovered via entry points and loaded lazily
7
+ to avoid importing heavy dependencies at startup.
8
+
9
+ Part of Phase 1 (Foundation) — implementations added in Phase 2+.
10
+ """
11
+
12
+ from abc import ABC, abstractmethod
13
+ from typing import Any
14
+
15
+
16
class BaseVectorBackend(ABC):
    """
    Abstract interface for vector database backends.

    All vector storage implementations must subclass this and implement
    the four core operations: add_documents, search, delete, count. The
    interface uses simple Python types (lists, dicts) to avoid coupling to
    any specific backend's data model.

    Implementations should handle their own connection management and
    resource cleanup.
    """

    @abstractmethod
    def add_documents(
        self,
        collection_name: str,
        ids: list[str],
        embeddings: list[list[float]],
        documents: list[str] | None = None,
        metadatas: list[dict[str, Any]] | None = None,
    ) -> None:
        """
        Add documents with embeddings to a collection.

        Creates the collection if it does not exist. If a document with
        a given ID already exists, it is overwritten (upsert semantics).

        Args:
            collection_name: Name of the target collection.
            ids: Unique identifiers for each document. Must be same length
                as embeddings.
            embeddings: Pre-computed embedding vectors. Each inner list
                must have the same dimensionality.
            documents: Optional raw text documents corresponding to each
                embedding. Stored alongside vectors for retrieval.
            metadatas: Optional metadata dicts for each document. Used for
                filtering and returned with search results.

        Raises:
            ValueError: If ids, embeddings, documents, or metadatas have
                mismatched lengths.
            ConnectionError: If the backend is unreachable.
        """

    @abstractmethod
    def search(
        self,
        collection_name: str,
        query_embedding: list[float],
        n_results: int = 5,
    ) -> dict[str, Any]:
        """
        Search a collection by vector similarity.

        Returns raw backend results in a column-oriented format compatible
        with ChromaDB's response structure. Callers should normalize these
        results into SearchResult/OntologyMatch models.

        Args:
            collection_name: Name of the collection to search.
            query_embedding: Query vector. Must match the dimensionality
                of stored embeddings.
            n_results: Maximum number of results to return.

        Returns:
            dict[str, Any]: Raw results with keys:
                - "ids": list[list[str]] — matched document IDs
                - "distances": list[list[float]] — distance scores
                - "documents": list[list[str | None]] — document texts
                - "metadatas": list[list[dict | None]] — metadata dicts

        Raises:
            ValueError: If the collection does not exist, or if
                query_embedding dimensionality does not match the
                collection's embeddings.
        """

    @abstractmethod
    def delete(
        self,
        collection_name: str,
        ids: list[str],
    ) -> None:
        """
        Delete documents from a collection by ID.

        Silently ignores IDs that do not exist in the collection.

        Args:
            collection_name: Name of the collection.
            ids: List of document IDs to delete.

        Raises:
            ValueError: If the collection does not exist.
            ConnectionError: If the backend is unreachable.
        """

    @abstractmethod
    def count(
        self,
        collection_name: str,
    ) -> int:
        """
        Count the number of documents in a collection.

        Args:
            collection_name: Name of the collection.

        Returns:
            int: Number of documents in the collection.

        Raises:
            ValueError: If the collection does not exist.
        """

    def collection_exists(self, collection_name: str) -> bool:
        """
        Check whether a collection exists in the backend.

        Default implementation calls count() and treats any exception as
        "does not exist". A ValueError is the documented signal from
        count() for a missing collection; any other exception is also
        reported as False, but is logged at debug level so that a backend
        failure is not indistinguishable from a missing collection.
        Backends may override this with a more efficient native check.

        Args:
            collection_name: Name of the collection to check.

        Returns:
            bool: True if count() succeeded, False otherwise.
        """
        try:
            self.count(collection_name)
        except ValueError:
            # Documented outcome of count() for a missing collection.
            return False
        except Exception:
            # Stay best-effort (never raise), but leave a trace of the
            # unexpected failure instead of discarding it silently.
            import logging

            logging.getLogger(__name__).debug(
                "count() failed while checking for collection %r; "
                "reporting it as missing",
                collection_name,
                exc_info=True,
            )
            return False
        return True