omendb 0.0.16__cp312-cp312-macosx_10_12_x86_64.whl

omendb/langchain.py ADDED
@@ -0,0 +1,386 @@
+ """LangChain VectorStore integration for OmenDB.
+
+ This module provides a LangChain-compatible VectorStore implementation
+ that wraps OmenDB for seamless integration with LangChain RAG pipelines.
+
+ Example:
+     >>> from langchain_openai import OpenAIEmbeddings
+     >>> from omendb.langchain import OmenDBVectorStore
+     >>>
+     >>> # Create from texts
+     >>> vectorstore = OmenDBVectorStore.from_texts(
+     ...     texts=["Hello world", "How are you?"],
+     ...     embedding=OpenAIEmbeddings(),
+     ...     path="./my_vectors",
+     ... )
+     >>>
+     >>> # Search
+     >>> docs = vectorstore.similarity_search("greeting", k=2)
+     >>> print(docs[0].page_content)
+     Hello world
+ """
+
+ from __future__ import annotations
+
+ import uuid
+ from collections.abc import Iterable, Sequence
+ from typing import Any
+
+ from langchain_core.documents import Document
+ from langchain_core.embeddings import Embeddings
+ from langchain_core.vectorstores import VectorStore
+
+
+ class OmenDBVectorStore(VectorStore):
+     """LangChain VectorStore implementation using OmenDB.
+
+     OmenDB is a fast embedded vector database with HNSW + ACORN-1
+     that provides ~19,000 QPS @ 10K vectors with 100% recall.
+
+     Features:
+         - HNSW index with adaptive parameters
+         - Extended RaBitQ quantization (8x compression)
+         - ACORN-1 filtered search (37.79x speedup)
+         - MongoDB-style metadata filtering
+         - Automatic persistence to on-disk storage
+
+     Args:
+         embedding: LangChain Embeddings model for text-to-vector conversion.
+         path: Path to the database directory (persistent on-disk storage).
+         dimensions: Vector dimensionality. Auto-detected when loading an
+             existing database.
+         **kwargs: Additional arguments passed to omendb.open().
+
+     Example:
+         >>> from langchain_openai import OpenAIEmbeddings
+         >>> from omendb.langchain import OmenDBVectorStore
+         >>>
+         >>> vectorstore = OmenDBVectorStore(
+         ...     embedding=OpenAIEmbeddings(),
+         ...     path="./my_vectors",
+         ... )
+         >>> vectorstore.add_texts(["Hello world", "How are you?"])
+         >>> docs = vectorstore.similarity_search("greeting", k=2)
+     """
+
+     def __init__(
+         self,
+         embedding: Embeddings,
+         path: str = "./omendb-vectors",
+         dimensions: int | None = None,
+         **kwargs: Any,
+     ) -> None:
+         """Initialize OmenDBVectorStore.
+
+         Args:
+             embedding: LangChain Embeddings model.
+             path: Path to database directory.
+             dimensions: Vector dimensionality (auto-detected from embedding if None).
+             **kwargs: Additional arguments for omendb.open().
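+
+         Example (a minimal sketch; passing ``dimensions`` explicitly skips
+         the probe call that embeds a test string, and 1536 is just an
+         assumed embedding width, not a value this module mandates):
+             >>> from langchain_openai import OpenAIEmbeddings
+             >>> store = OmenDBVectorStore(
+             ...     embedding=OpenAIEmbeddings(),
+             ...     path="./my_vectors",
+             ...     dimensions=1536,
+             ... )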
+         """
+         import omendb
+
+         self._embedding = embedding
+         self._path = path
+
+         # Auto-detect dimensions from embedding model if not specified
+         if dimensions is None:
+             # Embed a test string to get dimensions
+             test_embedding = embedding.embed_query("test")
+             dimensions = len(test_embedding)
+
+         self._dimensions = dimensions
+         self._db = omendb.open(path, dimensions=dimensions, **kwargs)
+
+     @property
+     def embeddings(self) -> Embeddings:
+         """Return the embeddings model."""
+         return self._embedding
+
+     def add_texts(
+         self,
+         texts: Iterable[str],
+         metadatas: list[dict] | None = None,
+         ids: list[str] | None = None,
+         **kwargs: Any,
+     ) -> list[str]:
+         """Add texts to the vector store.
+
+         Args:
+             texts: Texts to add.
+             metadatas: Optional metadata for each text.
+             ids: Optional IDs for each text. Auto-generated if not provided.
+             **kwargs: Additional arguments (unused).
+
+         Returns:
+             List of IDs for added texts.
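+
+         Example (a minimal sketch; ``page_content`` is stored alongside the
+         user metadata so documents can be rebuilt at search time):
+             >>> ids = vectorstore.add_texts(
+             ...     ["Hello world", "How are you?"],
+             ...     metadatas=[{"source": "greetings"}, {"source": "smalltalk"}],
+             ... )
+             >>> len(ids)
+             2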
+         """
+         texts_list = list(texts)
+
+         # Generate embeddings
+         embeddings = self._embedding.embed_documents(texts_list)
+
+         # Generate IDs if not provided
+         if ids is None:
+             ids = [str(uuid.uuid4()) for _ in texts_list]
+
+         # Prepare metadata with page_content stored
+         if metadatas is None:
+             metadatas = [{} for _ in texts_list]
+
+         # Build batch for set; strict=True surfaces mismatched input
+         # lengths instead of silently dropping texts
+         items = []
+         for text, embedding, id_, metadata in zip(
+             texts_list, embeddings, ids, metadatas, strict=True
+         ):
+             # Store page_content in metadata for retrieval
+             item_metadata = {**metadata, "page_content": text}
+             items.append(
+                 {
+                     "id": id_,
+                     "vector": embedding,
+                     "metadata": item_metadata,
+                 }
+             )
+
+         self._db.set(items)
+         return ids
+
+     def add_documents(
+         self,
+         documents: list[Document],
+         ids: list[str] | None = None,
+         **kwargs: Any,
+     ) -> list[str]:
+         """Add documents to the vector store.
+
+         Args:
+             documents: LangChain Documents to add.
+             ids: Optional IDs for each document.
+             **kwargs: Additional arguments.
+
+         Returns:
+             List of IDs for added documents.
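+
+         Example (a minimal sketch):
+             >>> from langchain_core.documents import Document
+             >>> docs = [Document(page_content="Hello", metadata={"source": "test"})]
+             >>> ids = vectorstore.add_documents(docs)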
+         """
+         texts = [doc.page_content for doc in documents]
+         metadatas = [doc.metadata for doc in documents]
+         return self.add_texts(texts, metadatas=metadatas, ids=ids, **kwargs)
+
+     def delete(
+         self,
+         ids: list[str] | None = None,
+         **kwargs: Any,
+     ) -> bool | None:
+         """Delete documents by ID.
+
+         Args:
+             ids: List of IDs to delete.
+             **kwargs: Additional arguments (unused).
+
+         Returns:
+             True if at least one document was deleted; False if ``ids`` is
+             None or nothing matched.
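+
+         Example (a minimal sketch):
+             >>> ids = vectorstore.add_texts(["temporary"])
+             >>> vectorstore.delete(ids)
+             True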
+         """
+         if ids is None:
+             return False
+
+         deleted = self._db.delete(ids)
+         return deleted > 0
+
+     def similarity_search(
+         self,
+         query: str,
+         k: int = 4,
+         filter: dict | None = None,
+         **kwargs: Any,
+     ) -> list[Document]:
+         """Search for similar documents by text query.
+
+         Args:
+             query: Query text.
+             k: Number of results to return.
+             filter: Optional MongoDB-style metadata filter.
+             **kwargs: Additional arguments (unused).
+
+         Returns:
+             List of similar Documents.
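+
+         Example (a minimal sketch; the ``$eq`` operator is assumed from the
+         "MongoDB-style" description above and is not confirmed by this module):
+             >>> docs = vectorstore.similarity_search(
+             ...     "greeting",
+             ...     k=2,
+             ...     filter={"source": {"$eq": "greetings"}},
+             ... )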
+         """
+         embedding = self._embedding.embed_query(query)
+         return self.similarity_search_by_vector(embedding, k=k, filter=filter, **kwargs)
+
+     def similarity_search_by_vector(
+         self,
+         embedding: list[float],
+         k: int = 4,
+         filter: dict | None = None,
+         **kwargs: Any,
+     ) -> list[Document]:
+         """Search for similar documents by vector.
+
+         Args:
+             embedding: Query vector.
+             k: Number of results to return.
+             filter: Optional MongoDB-style metadata filter.
+             **kwargs: Additional arguments (unused).
+
+         Returns:
+             List of similar Documents.
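+
+         Example (a minimal sketch, reusing the store's own embedding model):
+             >>> vec = vectorstore.embeddings.embed_query("greeting")
+             >>> docs = vectorstore.similarity_search_by_vector(vec, k=2)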
+         """
+         results = self._db.search(query=embedding, k=k, filter=filter)
+
+         documents = []
+         for result in results:
+             metadata = result.get("metadata", {})
+             # Extract page_content from metadata
+             page_content = metadata.pop("page_content", "")
+             documents.append(
+                 Document(
+                     page_content=page_content,
+                     metadata=metadata,
+                     id=result.get("id"),
+                 )
+             )
+
+         return documents
+
+     def similarity_search_with_score(
+         self,
+         query: str,
+         k: int = 4,
+         filter: dict | None = None,
+         **kwargs: Any,
+     ) -> list[tuple[Document, float]]:
+         """Search for similar documents with relevance scores.
+
+         Args:
+             query: Query text.
+             k: Number of results to return.
+             filter: Optional MongoDB-style metadata filter.
+             **kwargs: Additional arguments (unused).
+
+         Returns:
+             List of (Document, score) tuples. Lower scores = more similar.
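+
+         Example (a minimal sketch; scores are raw L2 distances):
+             >>> for doc, score in vectorstore.similarity_search_with_score(
+             ...     "greeting", k=2
+             ... ):
+             ...     print(f"{score:.3f} {doc.page_content}")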
+         """
+         embedding = self._embedding.embed_query(query)
+         results = self._db.search(query=embedding, k=k, filter=filter)
+
+         documents_with_scores = []
+         for result in results:
+             metadata = result.get("metadata", {})
+             page_content = metadata.pop("page_content", "")
+             doc = Document(
+                 page_content=page_content,
+                 metadata=metadata,
+                 id=result.get("id"),
+             )
+             # OmenDB returns L2 distance (lower = more similar)
+             score = result.get("distance", 0.0)
+             documents_with_scores.append((doc, score))
+
+         return documents_with_scores
+
+     def get_by_ids(self, ids: Sequence[str]) -> list[Document]:
+         """Get documents by their IDs.
+
+         Args:
+             ids: Sequence of document IDs.
+
+         Returns:
+             List of Documents. Missing IDs are skipped.
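+
+         Example (a minimal sketch):
+             >>> ids = vectorstore.add_texts(["Hello world"])
+             >>> vectorstore.get_by_ids(ids)[0].page_content
+             'Hello world'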
+         """
+         documents = []
+         for id_ in ids:
+             result = self._db.get(id_)
+             if result is not None:
+                 metadata = result.get("metadata", {})
+                 page_content = metadata.pop("page_content", "")
+                 documents.append(
+                     Document(
+                         page_content=page_content,
+                         metadata=metadata,
+                         id=result.get("id"),
+                     )
+                 )
+
+         return documents
+
+     @classmethod
+     def from_texts(
+         cls,
+         texts: list[str],
+         embedding: Embeddings,
+         metadatas: list[dict] | None = None,
+         ids: list[str] | None = None,
+         path: str = "./omendb-vectors",
+         **kwargs: Any,
+     ) -> OmenDBVectorStore:
+         """Create a vector store from texts.
+
+         Args:
+             texts: Texts to add.
+             embedding: LangChain Embeddings model.
+             metadatas: Optional metadata for each text.
+             ids: Optional IDs for each text.
+             path: Path to database directory.
+             **kwargs: Additional arguments for OmenDBVectorStore.
+
+         Returns:
+             Initialized OmenDBVectorStore with texts added.
+
+         Example:
+             >>> from langchain_openai import OpenAIEmbeddings
+             >>> vectorstore = OmenDBVectorStore.from_texts(
+             ...     texts=["Hello", "World"],
+             ...     embedding=OpenAIEmbeddings(),
+             ...     path="./my_vectors",
+             ... )
+         """
+         store = cls(embedding=embedding, path=path, **kwargs)
+         store.add_texts(texts, metadatas=metadatas, ids=ids)
+         return store
+
+     @classmethod
+     def from_documents(
+         cls,
+         documents: list[Document],
+         embedding: Embeddings,
+         ids: list[str] | None = None,
+         path: str = "./omendb-vectors",
+         **kwargs: Any,
+     ) -> OmenDBVectorStore:
+         """Create a vector store from documents.
+
+         Args:
+             documents: LangChain Documents to add.
+             embedding: LangChain Embeddings model.
+             ids: Optional IDs for each document.
+             path: Path to database directory.
+             **kwargs: Additional arguments for OmenDBVectorStore.
+
+         Returns:
+             Initialized OmenDBVectorStore with documents added.
+
+         Example:
+             >>> from langchain_core.documents import Document
+             >>> from langchain_openai import OpenAIEmbeddings
+             >>> docs = [Document(page_content="Hello", metadata={"source": "test"})]
+             >>> vectorstore = OmenDBVectorStore.from_documents(
+             ...     documents=docs,
+             ...     embedding=OpenAIEmbeddings(),
+             ... )
+         """
+         texts = [doc.page_content for doc in documents]
+         metadatas = [doc.metadata for doc in documents]
+         return cls.from_texts(
+             texts,
+             embedding,
+             metadatas=metadatas,
+             ids=ids,
+             path=path,
+             **kwargs,
+         )
+
+     def __len__(self) -> int:
+         """Return the number of vectors in the store."""
+         return len(self._db)
+
+     def flush(self) -> None:
+         """Flush data to disk for persistence."""
+         self._db.flush()