omendb-0.0.16-cp312-cp312-macosx_10_12_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- omendb/__init__.py +24 -0
- omendb/__init__.pyi +586 -0
- omendb/langchain.py +386 -0
- omendb/llamaindex.py +336 -0
- omendb/omendb.cpython-312-darwin.so +0 -0
- omendb-0.0.16.dist-info/METADATA +265 -0
- omendb-0.0.16.dist-info/RECORD +8 -0
- omendb-0.0.16.dist-info/WHEEL +4 -0
omendb/langchain.py
ADDED
@@ -0,0 +1,386 @@
"""LangChain VectorStore integration for OmenDB.

This module provides a LangChain-compatible VectorStore implementation
that wraps OmenDB for seamless integration with LangChain RAG pipelines.

Example:
    >>> from langchain_openai import OpenAIEmbeddings
    >>> from omendb.langchain import OmenDBVectorStore
    >>>
    >>> # Create from texts
    >>> vectorstore = OmenDBVectorStore.from_texts(
    ...     texts=["Hello world", "How are you?"],
    ...     embedding=OpenAIEmbeddings(),
    ...     path="./my_vectors",
    ... )
    >>>
    >>> # Search
    >>> docs = vectorstore.similarity_search("greeting", k=2)
    >>> print(docs[0].page_content)
    Hello world
"""

from __future__ import annotations

import uuid
from collections.abc import Iterable, Sequence
from typing import Any

from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore


class OmenDBVectorStore(VectorStore):
    """LangChain VectorStore implementation using OmenDB.

    OmenDB is a fast embedded vector database with HNSW + ACORN-1
    that provides ~19,000 QPS @ 10K vectors with 100% recall.

    Features:
    - HNSW index with adaptive parameters
    - Extended RaBitQ quantization (8x compression)
    - ACORN-1 filtered search (37.79x speedup)
    - MongoDB-style metadata filtering
    - Automatic persistence to on-disk storage

    Args:
        embedding: LangChain Embeddings model for text-to-vector conversion.
        path: Path to the database directory, backed by persistent storage.
        dimensions: Vector dimensionality. Auto-detected when loading an
            existing database.
        **kwargs: Additional arguments passed to omendb.open().

    Example:
        >>> from langchain_openai import OpenAIEmbeddings
        >>> from omendb.langchain import OmenDBVectorStore
        >>>
        >>> vectorstore = OmenDBVectorStore(
        ...     embedding=OpenAIEmbeddings(),
        ...     path="./my_vectors",
        ... )
        >>> vectorstore.add_texts(["Hello world", "How are you?"])
        >>> docs = vectorstore.similarity_search("greeting", k=2)
    """

    def __init__(
        self,
        embedding: Embeddings,
        path: str = "./omendb-vectors",
        dimensions: int | None = None,
        **kwargs: Any,
    ) -> None:
        """Initialize OmenDBVectorStore.

        Args:
            embedding: LangChain Embeddings model.
            path: Path to database directory.
            dimensions: Vector dimensionality (auto-detected from embedding if None).
            **kwargs: Additional arguments for omendb.open().
        """
        import omendb

        self._embedding = embedding
        self._path = path

        # Auto-detect dimensions from embedding model if not specified
        if dimensions is None:
            # Embed a test string to get dimensions
            test_embedding = embedding.embed_query("test")
            dimensions = len(test_embedding)

        self._dimensions = dimensions
        self._db = omendb.open(path, dimensions=dimensions, **kwargs)

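    # Construction sketch (hedged): passing `dimensions` explicitly skips the
    # probe embed_query("test") call above, which otherwise costs one embedding
    # round-trip at startup. The 1536 below is an assumption for OpenAI's
    # text-embedding models, not something this module checks:
    #
    #     store = OmenDBVectorStore(
    #         embedding=OpenAIEmbeddings(),
    #         path="./my_vectors",
    #         dimensions=1536,
    #     )
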
    @property
    def embeddings(self) -> Embeddings:
        """Return the embeddings model."""
        return self._embedding

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: list[dict] | None = None,
        ids: list[str] | None = None,
        **kwargs: Any,
    ) -> list[str]:
        """Add texts to the vector store.

        Args:
            texts: Texts to add.
            metadatas: Optional metadata for each text.
            ids: Optional IDs for each text. Auto-generated if not provided.
            **kwargs: Additional arguments (unused).

        Returns:
            List of IDs for added texts.
        """
        texts_list = list(texts)

        # Generate embeddings
        embeddings = self._embedding.embed_documents(texts_list)

        # Generate IDs if not provided
        if ids is None:
            ids = [str(uuid.uuid4()) for _ in texts_list]

        # Prepare metadata with page_content stored
        if metadatas is None:
            metadatas = [{} for _ in texts_list]

        # Build batch for set
        items = []
        for text, vector, id_, metadata in zip(texts_list, embeddings, ids, metadatas):
            # Store page_content in metadata for retrieval
            item_metadata = {**metadata, "page_content": text}
            items.append(
                {
                    "id": id_,
                    "vector": vector,
                    "metadata": item_metadata,
                }
            )

        self._db.set(items)
        return ids

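    # Usage sketch: texts, metadatas, and ids pair up positionally, so the
    # returned ids match the input order. (Hypothetical values below.)
    #
    #     ids = store.add_texts(
    #         ["Hello world", "How are you?"],
    #         metadatas=[{"source": "greeting"}, {"source": "question"}],
    #     )
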
    def add_documents(
        self,
        documents: list[Document],
        ids: list[str] | None = None,
        **kwargs: Any,
    ) -> list[str]:
        """Add documents to the vector store.

        Args:
            documents: LangChain Documents to add.
            ids: Optional IDs for each document.
            **kwargs: Additional arguments.

        Returns:
            List of IDs for added documents.
        """
        texts = [doc.page_content for doc in documents]
        metadatas = [doc.metadata for doc in documents]
        return self.add_texts(texts, metadatas=metadatas, ids=ids, **kwargs)

    def delete(
        self,
        ids: list[str] | None = None,
        **kwargs: Any,
    ) -> bool | None:
        """Delete documents by ID.

        Args:
            ids: List of IDs to delete.
            **kwargs: Additional arguments (unused).

        Returns:
            True if deletion was successful.
        """
        if ids is None:
            return False

        deleted = self._db.delete(ids)
        return deleted > 0

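    # Deletion sketch: pass the ids returned by add_texts()/add_documents().
    # The `deleted > 0` check above implies the underlying delete() returns
    # the number of records removed; that behavior is assumed here:
    #
    #     ids = store.add_texts(["temp note"])
    #     store.delete(ids)  # True if at least one record was removed
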
    def similarity_search(
        self,
        query: str,
        k: int = 4,
        filter: dict | None = None,
        **kwargs: Any,
    ) -> list[Document]:
        """Search for similar documents by text query.

        Args:
            query: Query text.
            k: Number of results to return.
            filter: Optional MongoDB-style metadata filter.
            **kwargs: Additional arguments (unused).

        Returns:
            List of similar Documents.
        """
        embedding = self._embedding.embed_query(query)
        return self.similarity_search_by_vector(embedding, k=k, filter=filter, **kwargs)

    def similarity_search_by_vector(
        self,
        embedding: list[float],
        k: int = 4,
        filter: dict | None = None,
        **kwargs: Any,
    ) -> list[Document]:
        """Search for similar documents by vector.

        Args:
            embedding: Query vector.
            k: Number of results to return.
            filter: Optional MongoDB-style metadata filter.
            **kwargs: Additional arguments (unused).

        Returns:
            List of similar Documents.
        """
        results = self._db.search(query=embedding, k=k, filter=filter)

        documents = []
        for result in results:
            metadata = result.get("metadata", {})
            # Extract page_content from metadata
            page_content = metadata.pop("page_content", "")
            documents.append(
                Document(
                    page_content=page_content,
                    metadata=metadata,
                    id=result.get("id"),
                )
            )

        return documents

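    # Filter sketch (hedged): the docstrings call the filter "MongoDB-style",
    # so operators like $eq / $in / $gte are assumed here; this module passes
    # the dict through to OmenDB unchanged and does not validate it:
    #
    #     docs = store.similarity_search(
    #         "greeting",
    #         k=4,
    #         filter={"source": {"$eq": "chat"}, "year": {"$gte": 2023}},
    #     )
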
    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        filter: dict | None = None,
        **kwargs: Any,
    ) -> list[tuple[Document, float]]:
        """Search for similar documents with relevance scores.

        Args:
            query: Query text.
            k: Number of results to return.
            filter: Optional MongoDB-style metadata filter.
            **kwargs: Additional arguments (unused).

        Returns:
            List of (Document, score) tuples. Lower scores = more similar.
        """
        embedding = self._embedding.embed_query(query)
        results = self._db.search(query=embedding, k=k, filter=filter)

        documents_with_scores = []
        for result in results:
            metadata = result.get("metadata", {})
            page_content = metadata.pop("page_content", "")
            doc = Document(
                page_content=page_content,
                metadata=metadata,
                id=result.get("id"),
            )
            # OmenDB returns L2 distance (lower = more similar)
            score = result.get("distance", 0.0)
            documents_with_scores.append((doc, score))

        return documents_with_scores

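    # Score note (hedged): the scores above are raw L2 distances, not the
    # 0-1 relevance scores some LangChain components expect. One common
    # mapping, assuming roughly unit-norm embeddings, is 1 / (1 + distance):
    #
    #     pairs = store.similarity_search_with_score("greeting", k=2)
    #     ranked = [(doc, 1.0 / (1.0 + dist)) for doc, dist in pairs]
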
    def get_by_ids(self, ids: Sequence[str]) -> list[Document]:
        """Get documents by their IDs.

        Args:
            ids: Sequence of document IDs.

        Returns:
            List of Documents. Missing IDs are skipped.
        """
        documents = []
        for id_ in ids:
            result = self._db.get(id_)
            if result is not None:
                metadata = result.get("metadata", {})
                page_content = metadata.pop("page_content", "")
                documents.append(
                    Document(
                        page_content=page_content,
                        metadata=metadata,
                        id=result.get("id"),
                    )
                )

        return documents

    @classmethod
    def from_texts(
        cls,
        texts: list[str],
        embedding: Embeddings,
        metadatas: list[dict] | None = None,
        ids: list[str] | None = None,
        path: str = "./omendb-vectors",
        **kwargs: Any,
    ) -> OmenDBVectorStore:
        """Create a vector store from texts.

        Args:
            texts: Texts to add.
            embedding: LangChain Embeddings model.
            metadatas: Optional metadata for each text.
            ids: Optional IDs for each text.
            path: Path to database directory.
            **kwargs: Additional arguments for OmenDBVectorStore.

        Returns:
            Initialized OmenDBVectorStore with texts added.

        Example:
            >>> from langchain_openai import OpenAIEmbeddings
            >>> vectorstore = OmenDBVectorStore.from_texts(
            ...     texts=["Hello", "World"],
            ...     embedding=OpenAIEmbeddings(),
            ...     path="./my_vectors",
            ... )
        """
        store = cls(embedding=embedding, path=path, **kwargs)
        store.add_texts(texts, metadatas=metadatas, ids=ids)
        return store

    @classmethod
    def from_documents(
        cls,
        documents: list[Document],
        embedding: Embeddings,
        ids: list[str] | None = None,
        path: str = "./omendb-vectors",
        **kwargs: Any,
    ) -> OmenDBVectorStore:
        """Create a vector store from documents.

        Args:
            documents: LangChain Documents to add.
            embedding: LangChain Embeddings model.
            ids: Optional IDs for each document.
            path: Path to database directory.
            **kwargs: Additional arguments for OmenDBVectorStore.

        Returns:
            Initialized OmenDBVectorStore with documents added.

        Example:
            >>> from langchain_core.documents import Document
            >>> from langchain_openai import OpenAIEmbeddings
            >>> docs = [Document(page_content="Hello", metadata={"source": "test"})]
            >>> vectorstore = OmenDBVectorStore.from_documents(
            ...     documents=docs,
            ...     embedding=OpenAIEmbeddings(),
            ... )
        """
        texts = [doc.page_content for doc in documents]
        metadatas = [doc.metadata for doc in documents]
        return cls.from_texts(
            texts,
            embedding,
            metadatas=metadatas,
            ids=ids,
            path=path,
            **kwargs,
        )

    def __len__(self) -> int:
        """Return the number of vectors in the store."""
        return len(self._db)

    def flush(self) -> None:
        """Flush data to disk for persistence."""
        self._db.flush()
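A minimal end-to-end sketch of the API above. The HashEmbeddings class is a
hypothetical stand-in (defined inline) so the example runs without an API key;
everything else uses only names defined in this file.

    import math

    from langchain_core.embeddings import Embeddings
    from omendb.langchain import OmenDBVectorStore


    class HashEmbeddings(Embeddings):
        """Toy deterministic embeddings; not a real model."""

        def embed_query(self, text: str) -> list[float]:
            # Fold characters into a fixed 8-dim vector, then L2-normalize.
            vec = [0.0] * 8
            for i, ch in enumerate(text):
                vec[i % 8] += ord(ch) / 1000.0
            norm = math.sqrt(sum(x * x for x in vec)) or 1.0
            return [x / norm for x in vec]

        def embed_documents(self, texts: list[str]) -> list[list[float]]:
            return [self.embed_query(t) for t in texts]


    store = OmenDBVectorStore.from_texts(
        texts=["Hello world", "How are you?"],
        embedding=HashEmbeddings(),
        path="./demo_vectors",
        metadatas=[{"kind": "greeting"}, {"kind": "question"}],
    )
    docs = store.similarity_search("hello", k=1)
    print(docs[0].page_content)
    store.flush()  # persist to disk before exiting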