omendb 0.0.16__cp312-cp312-macosx_10_12_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
omendb/llamaindex.py ADDED
@@ -0,0 +1,336 @@
1
+ """LlamaIndex VectorStore integration for OmenDB.
2
+
3
+ This module provides a LlamaIndex-compatible VectorStore implementation
4
+ that wraps OmenDB for seamless integration with LlamaIndex RAG pipelines.
5
+
6
+ Example:
7
+ >>> from llama_index.core import VectorStoreIndex, StorageContext
8
+ >>> from llama_index.embeddings.openai import OpenAIEmbedding
9
+ >>> from omendb.llamaindex import OmenDBVectorStore
10
+ >>>
11
+ >>> # Create vector store
12
+ >>> vector_store = OmenDBVectorStore(path="./my_vectors")
13
+ >>> storage_context = StorageContext.from_defaults(vector_store=vector_store)
14
+ >>>
15
+ >>> # Build index from documents
16
+ >>> index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
17
+ >>>
18
+ >>> # Query
19
+ >>> query_engine = index.as_query_engine()
20
+ >>> response = query_engine.query("What is the main topic?")
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ from typing import Any
26
+
27
+ from llama_index.core.schema import BaseNode, TextNode
28
+ from llama_index.core.vector_stores.types import (
29
+ BasePydanticVectorStore,
30
+ VectorStoreQuery,
31
+ VectorStoreQueryResult,
32
+ )
33
+
34
+
35
class OmenDBVectorStore(BasePydanticVectorStore):
    """LlamaIndex VectorStore implementation backed by OmenDB.

    OmenDB is an embedded vector database (HNSW index with ACORN-1 filtered
    search). This class adapts it to the LlamaIndex vector-store interface so
    it can be plugged into a ``StorageContext``.

    Features exposed through this wrapper:
        - HNSW index (tunable through ``**kwargs`` forwarded to ``omendb.open``)
        - Optional quantization (SQ8 ~4x, RaBitQ ~8x smaller indexes)
        - MongoDB-style metadata filtering
        - On-disk persistence at ``path``

    The database is opened lazily on first use. The node text, node class name
    and source-document id are stored in reserved metadata keys (``_text``,
    ``_node_type``, ``_ref_doc_id``) and stripped again when results are
    returned.

    Args:
        path: Path to the database. Data is persisted at this location.
        dimensions: Vector dimensionality. If None, it is taken from the first
            embedding that is inserted or queried.
        **kwargs: Additional arguments passed to ``omendb.open()``.

    Example:
        >>> from llama_index.core import VectorStoreIndex, StorageContext
        >>> from omendb.llamaindex import OmenDBVectorStore
        >>>
        >>> vector_store = OmenDBVectorStore(path="./my_vectors")
        >>> storage_context = StorageContext.from_defaults(vector_store=vector_store)
        >>> index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
    """

    stores_text: bool = True
    flat_metadata: bool = True

    # Pydantic fields
    path: str = "./omendb-vectors"
    dimensions: int | None = None

    # Private attributes (not serialized)
    _db: Any = None
    _initialized: bool = False

    def __init__(
        self,
        path: str = "./omendb-vectors",
        dimensions: int | None = None,
        **kwargs: Any,
    ) -> None:
        """Initialize OmenDBVectorStore without opening the database yet.

        Args:
            path: Path to the database.
            dimensions: Vector dimensionality (auto-detected on first use if None).
            **kwargs: Additional arguments for ``omendb.open()``; they are kept
                and applied when the database is opened lazily.
        """
        super().__init__(path=path, dimensions=dimensions)
        self._kwargs = kwargs
        self._db = None
        self._initialized = False

    def _ensure_db(self, dimensions: int | None = None) -> None:
        """Open the underlying database if it has not been opened yet.

        Args:
            dimensions: Dimensionality observed by the caller (for example the
                length of an embedding). Falls back to ``self.dimensions`` and
                finally to 1536 when neither is available.
        """
        if self._db is not None:
            return

        import omendb

        # Observed dimensions win over the configured value; 1536 matches the
        # OpenAI embedding size and is only a last-resort default.
        dims = dimensions or self.dimensions or 1536
        self._db = omendb.open(self.path, dimensions=dims, **getattr(self, "_kwargs", {}))
        self._initialized = True

    @classmethod
    def class_name(cls) -> str:
        """Return class name for serialization."""
        return "OmenDBVectorStore"

    @property
    def client(self) -> Any:
        """Return the underlying OmenDB database, opening it if necessary."""
        self._ensure_db()
        return self._db

    def add(
        self,
        nodes: list[BaseNode],
        **kwargs: Any,
    ) -> list[str]:
        """Add nodes to the vector store.

        Nodes without an embedding are skipped. The database is only opened
        when there is at least one node to store, using the dimensionality of
        the first stored embedding.

        Args:
            nodes: List of nodes to add.
            **kwargs: Additional arguments (unused).

        Returns:
            List of node IDs that were added.
        """
        if not nodes:
            return []

        ids: list[str] = []
        items: list[dict[str, Any]] = []

        for node in nodes:
            # Read the attribute directly: BaseNode.get_embedding() raises
            # ValueError when no embedding is set, which would abort the whole
            # batch instead of skipping the node.
            embedding = getattr(node, "embedding", None)
            if embedding is None:
                continue

            # Copy so the reserved keys below never leak into the caller's node.
            metadata = dict(node.metadata) if node.metadata else {}

            # Store text content in metadata for retrieval
            text = node.get_content()
            if text:
                metadata["_text"] = text

            # Store node type info
            metadata["_node_type"] = node.class_name()

            # Remember the source document so delete(ref_doc_id) can find
            # every node derived from it.
            ref_doc_id = getattr(node, "ref_doc_id", None)
            if ref_doc_id is not None:
                metadata["_ref_doc_id"] = ref_doc_id

            items.append(
                {
                    "id": node.node_id,
                    "vector": embedding,
                    "metadata": metadata,
                }
            )
            ids.append(node.node_id)

        if items:
            self._ensure_db(dimensions=len(items[0]["vector"]))
            self._db.set(items)

        return ids

    def delete(self, ref_doc_id: str, **kwargs: Any) -> None:
        """Delete all nodes that belong to a reference document.

        Nodes written by :meth:`add` carry their source document id in the
        ``_ref_doc_id`` metadata key and are removed through a metadata
        filter. For backward compatibility, an entry whose own ID equals
        ``ref_doc_id`` is removed as well.

        Args:
            ref_doc_id: The document ID to delete.
            **kwargs: Additional arguments (unused).
        """
        self._ensure_db()
        self._db.delete_where({"_ref_doc_id": ref_doc_id})
        # Legacy behavior: data written before the source id was recorded can
        # only be addressed by treating ref_doc_id as a node ID.
        if self._db.exists(ref_doc_id):
            self._db.delete([ref_doc_id])

    def delete_nodes(
        self,
        node_ids: list[str] | None = None,
        **kwargs: Any,
    ) -> None:
        """Delete specific nodes by their IDs.

        Args:
            node_ids: List of node IDs to delete. Nothing happens when empty.
            **kwargs: Additional arguments (unused).
        """
        if not node_ids:
            return

        self._ensure_db()
        self._db.delete(node_ids)

    def query(
        self,
        query: VectorStoreQuery,
        **kwargs: Any,
    ) -> VectorStoreQueryResult:
        """Query the vector store.

        Args:
            query: VectorStoreQuery containing query embedding and parameters.
            **kwargs: Additional arguments (unused).

        Returns:
            VectorStoreQueryResult with matching nodes, similarities, and IDs.
            An empty result is returned when the query has no embedding.
        """
        query_embedding = query.query_embedding
        if query_embedding is None:
            return VectorStoreQueryResult(nodes=[], similarities=[], ids=[])

        # Open with the dimensionality of the query so a store created without
        # explicit dimensions does not fall back to the 1536 default.
        self._ensure_db(dimensions=len(query_embedding))

        results = self._db.search(
            query=query_embedding,
            k=query.similarity_top_k or 10,
            filter=self._convert_filters(query.filters),
        )

        # Convert results to LlamaIndex format
        nodes = []
        similarities = []
        ids = []

        for result in results:
            node_id = result.get("id", "")
            distance = result.get("distance", 0.0)
            # Work on a copy so the reserved keys can be stripped without
            # mutating the result object; tolerate a missing/None metadata.
            metadata = dict(result.get("metadata") or {})

            text = metadata.pop("_text", "")
            metadata.pop("_node_type", None)
            metadata.pop("_ref_doc_id", None)

            nodes.append(
                TextNode(
                    id_=node_id,
                    text=text,
                    metadata=metadata,
                    embedding=result.get("vector"),
                )
            )
            # Convert distance to similarity (assuming L2 distance):
            # similarity = 1 / (1 + distance). Negative distances map to 0.
            # NOTE(review): for metric="dot"/"ip" distances may be negative,
            # which collapses every score to 0.0 - confirm intended behavior.
            similarities.append(1.0 / (1.0 + distance) if distance >= 0 else 0.0)
            ids.append(node_id)

        return VectorStoreQueryResult(
            nodes=nodes,
            similarities=similarities,
            ids=ids,
        )

    def _convert_filters(self, filters: Any) -> dict[str, Any] | None:
        """Convert LlamaIndex MetadataFilters to OmenDB filter format.

        Nested ``MetadataFilters`` groups are converted recursively. Operators
        without an OmenDB equivalent fall back to equality and emit a warning.

        Args:
            filters: LlamaIndex MetadataFilters object (or None).

        Returns:
            OmenDB-compatible filter dictionary, or None when there is nothing
            to filter on or ``filters`` is not a ``MetadataFilters`` instance.
        """
        if filters is None:
            return None

        import warnings

        from llama_index.core.vector_stores.types import (
            FilterCondition,
            FilterOperator,
            MetadataFilter,
            MetadataFilters,
        )

        if not isinstance(filters, MetadataFilters):
            return None

        # Pairs are matched with == rather than a dict lookup so that plain
        # string operators compare equal to the str-based enum members.
        operator_map = (
            (FilterOperator.NE, "$ne"),
            (FilterOperator.GT, "$gt"),
            (FilterOperator.GTE, "$gte"),
            (FilterOperator.LT, "$lt"),
            (FilterOperator.LTE, "$lte"),
            (FilterOperator.IN, "$in"),
            (FilterOperator.CONTAINS, "$contains"),
        )

        filter_list: list[dict[str, Any]] = []

        for f in filters.filters:
            if isinstance(f, MetadataFilters):
                # Nested group: convert it as a unit instead of dropping it.
                nested = self._convert_filters(f)
                if nested is not None:
                    filter_list.append(nested)
                continue

            if not isinstance(f, MetadataFilter):
                continue

            key = f.key
            value = f.value
            op = f.operator

            if op == FilterOperator.EQ:
                filter_list.append({key: value})
                continue

            omen_op = next((name for known, name in operator_map if op == known), None)
            if omen_op is None:
                # Keep the historical best-effort fallback, but make it visible.
                warnings.warn(
                    f"Unsupported filter operator {op!r} for key {key!r}; "
                    "falling back to equality",
                    stacklevel=2,
                )
                filter_list.append({key: value})
            else:
                filter_list.append({key: {omen_op: value}})

        if not filter_list:
            return None

        # A single clause needs no logical wrapper.
        if len(filter_list) == 1:
            return filter_list[0]

        condition = getattr(filters, "condition", FilterCondition.AND)
        if condition == FilterCondition.OR:
            return {"$or": filter_list}
        return {"$and": filter_list}

    def flush(self) -> None:
        """Flush data to disk for persistence.

        Does nothing when the database has not been opened yet, since there is
        nothing to write in that case.
        """
        if self._db is None:
            return
        self._db.flush()
Binary file
@@ -0,0 +1,265 @@
1
+ Metadata-Version: 2.4
2
+ Name: omendb
3
+ Version: 0.0.16
4
+ Classifier: Development Status :: 3 - Alpha
5
+ Classifier: Intended Audience :: Developers
6
+ Classifier: License :: OSI Approved :: GNU Affero General Public License v3
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Programming Language :: Python :: 3.9
9
+ Classifier: Programming Language :: Python :: 3.10
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Classifier: Programming Language :: Python :: 3.13
13
+ Classifier: Programming Language :: Rust
14
+ Requires-Dist: numpy>=1.24.4
15
+ Requires-Dist: langchain-core>=0.2.0 ; extra == 'langchain'
16
+ Requires-Dist: llama-index-core>=0.10.0 ; extra == 'llamaindex'
17
+ Provides-Extra: langchain
18
+ Provides-Extra: llamaindex
19
+ Summary: Fast embedded vector database with HNSW + ACORN-1 filtered search
20
+ Author: OmenDB Team
21
+ License: AGPL-3.0
22
+ Requires-Python: >=3.9
23
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
24
+ Project-URL: Homepage, https://github.com/omendb/omendb
25
+ Project-URL: Repository, https://github.com/omendb/omendb
26
+
27
+ # OmenDB
28
+
29
+ [![PyPI](https://img.shields.io/pypi/v/omendb)](https://pypi.org/project/omendb/)
30
+ [![License](https://img.shields.io/badge/License-AGPL_3.0-blue.svg)](https://github.com/omendb/omendb/blob/main/LICENSE)
31
+
32
+ Embedded vector database for Python and Node.js. No server, no setup, just install.
33
+
34
+ ```bash
35
+ pip install omendb
36
+ ```
37
+
38
+ ## Quick Start
39
+
40
+ ```python
41
+ import omendb
42
+
43
+ # Create database (persistent) - creates ./mydb.omen and ./mydb.wal
44
+ db = omendb.open("./mydb", dimensions=128)
45
+
46
+ # Add vectors with metadata
47
+ db.set([
48
+ {"id": "doc1", "vector": [0.1] * 128, "metadata": {"category": "science"}},
49
+ {"id": "doc2", "vector": [0.2] * 128, "metadata": {"category": "history"}},
50
+ ])
51
+
52
+ # Search
53
+ results = db.search([0.1] * 128, k=5)
54
+
55
+ # Filtered search
56
+ results = db.search([0.1] * 128, k=5, filter={"category": "science"})
57
+ ```
58
+
59
+ ## Features
60
+
61
+ - **Embedded** - Runs in-process, no server needed
62
+ - **Persistent** - Data survives restarts automatically
63
+ - **Filtered search** - Query by metadata with JSON-style filters
64
+ - **Hybrid search** - Combine vector similarity with BM25 text search
65
+ - **Quantization** - 4-8x smaller indexes with minimal recall loss
66
+
67
+ ## Platforms
68
+
69
+ | Platform | Status |
70
+ | ---------------------------- | ------------ |
71
+ | Linux (x86_64, ARM64) | Supported |
72
+ | macOS (Intel, Apple Silicon) | Supported |
73
+ | Windows (x86_64) | Experimental |
74
+
75
+ ## API
76
+
77
+ ```python
78
+ # Database
79
+ db = omendb.open(path, dimensions) # Open or create
80
+ db = omendb.open(":memory:", dimensions) # In-memory (ephemeral)
81
+
82
+ # CRUD
83
+ db.set(items) # Insert/update vectors
84
+ db.get(id) # Get by ID
85
+ db.get_many(ids) # Batch get by IDs
86
+ db.delete(ids) # Delete by IDs
87
+ db.delete_where(filter) # Delete by metadata filter
88
+ db.update(id, metadata) # Update metadata only
89
+
90
+ # Iteration
91
+ len(db) # Number of vectors
92
+ db.count() # Same as len(db)
93
+ db.count(filter={...}) # Count matching filter
94
+ db.ids() # Iterate all IDs (lazy)
95
+ db.items() # Get all items as list
96
+ db.exists(id) # Check if ID exists
97
+ "id" in db # Same as exists()
98
+ for item in db: ... # Iterate all items (lazy)
99
+
100
+ # Search
101
+ db.search(query, k) # Vector search
102
+ db.search(query, k, filter={...}) # Filtered search
103
+ db.search_batch(queries, k) # Batch search (parallel)
104
+
105
+ # Hybrid search (requires text field in vectors)
106
+ db.search_hybrid(query_vector, query_text, k)
107
+ db.search_hybrid(query_vector, query_text, k, alpha=0.7) # 70% vector, 30% text
108
+ db.search_hybrid(query_vector, query_text, k, subscores=True) # Return separate scores
109
+ db.search_text(query_text, k) # Text-only BM25
110
+
111
+ # Persistence
112
+ db.flush() # Flush to disk
113
+ ```
114
+
115
+ ## Filters
116
+
117
+ ```python
118
+ # Equality
119
+ {"field": "value"} # Shorthand
120
+ {"field": {"$eq": "value"}} # Explicit
121
+
122
+ # Comparison
123
+ {"field": {"$ne": "value"}} # Not equal
124
+ {"field": {"$gt": 10}} # Greater than
125
+ {"field": {"$gte": 10}} # Greater or equal
126
+ {"field": {"$lt": 10}} # Less than
127
+ {"field": {"$lte": 10}} # Less or equal
128
+
129
+ # Membership
130
+ {"field": {"$in": ["a", "b"]}} # In list
131
+ {"field": {"$contains": "sub"}} # String contains
132
+
133
+ # Logical
134
+ {"$and": [{...}, {...}]} # AND
135
+ {"$or": [{...}, {...}]} # OR
136
+ ```
137
+
138
+ ## Configuration
139
+
140
+ ```python
141
+ db = omendb.open(
142
+ "./mydb", # Creates ./mydb.omen + ./mydb.wal
143
+ dimensions=384,
144
+ m=16, # HNSW connections per node (default: 16)
145
+ ef_construction=200, # Index build quality (default: 100)
146
+ ef_search=100, # Search quality (default: 100)
147
+ quantization=True, # SQ8 quantization (default: None)
148
+ metric="cosine", # Distance metric (default: "l2")
149
+ )
150
+
151
+ # Quantization options:
152
+ # - True or "sq8": SQ8 ~4x smaller, ~99% recall (recommended)
153
+ # - "rabitq": RaBitQ ~8x smaller, ~98% recall
154
+ # - None/False: Full precision (default)
155
+
156
+ # Distance metric options:
157
+ # - "l2" or "euclidean": Euclidean distance (default)
158
+ # - "cosine": Cosine distance (1 - cosine similarity)
159
+ # - "dot" or "ip": Inner product (for MIPS)
160
+
161
+ # Context manager (auto-flush on exit)
162
+ with omendb.open("./db", dimensions=768) as db:
163
+ db.set([...])
164
+
165
+ # Hybrid search with alpha (0=text, 1=vector, default=0.5)
166
+ db.search_hybrid(query_vec, "query text", k=10, alpha=0.7)
167
+
168
+ # Get separate keyword and semantic scores for debugging/tuning
169
+ results = db.search_hybrid(query_vec, "query text", k=10, subscores=True)
170
+ # Returns: {"id": "...", "score": 0.85, "keyword_score": 0.92, "semantic_score": 0.78}
171
+ ```
172
+
173
+ ## Performance
174
+
175
+ **10K vectors, Apple M3 Max** (m=16, ef=100, k=10):
176
+
177
+ | Dimension | Single QPS | Batch QPS | Speedup |
178
+ | --------- | ---------- | --------- | ------- |
179
+ | 128D | 12,000+ | 87,000+ | 7.2x |
180
+ | 768D | 3,800+ | 20,500+ | 5.4x |
181
+ | 1536D | 1,600+ | 6,200+ | 3.8x |
182
+
183
+ **SIFT-1M** (1M vectors, 128D, m=16, ef=100, k=10):
184
+
185
+ | Machine | QPS | Recall |
186
+ | ------------ | ----- | ------ |
187
+ | i9-13900KF | 4,591 | 98.6% |
188
+ | Apple M3 Max | 3,216 | 98.4% |
189
+
190
+ **Quantization** reduces memory with minimal recall loss:
191
+
192
+ | Mode | Compression | Use Case |
193
+ | ------ | ----------- | ------------------------------ |
194
+ | f32 | 1x | Default, highest recall |
195
+ | sq8 | 4x | Recommended for most users |
196
+ | rabitq | 8x | Large datasets, cost-sensitive |
197
+
198
+ ```python
199
+ db = omendb.open("./db", dimensions=768, quantization=True) # Enable SQ8
200
+ ```
201
+
202
+ <details>
203
+ <summary>Benchmark methodology</summary>
204
+
205
+ - **Parameters**: m=16, ef_construction=100, ef_search=100
206
+ - **Batch**: Uses Rayon for parallel search across all cores
207
+ - **Recall**: Validated against brute-force ground truth on SIFT/GloVe
208
+ - **Reproduce**:
209
+ - Quick (10K): `uv run python benchmarks/run.py`
210
+ - SIFT-1M: `uv run python benchmarks/ann_dataset_test.py --dataset sift-128-euclidean`
211
+
212
+ </details>
213
+
214
+ ## Examples
215
+
216
+ See [`python/examples/`](python/examples/) for complete working examples:
217
+
218
+ - `quickstart.py` - Minimal working example
219
+ - `basic.py` - CRUD operations and persistence
220
+ - `filters.py` - All filter operators
221
+ - `rag.py` - RAG workflow with mock embeddings
222
+
223
+ ## Integrations
224
+
225
+ ### LangChain
226
+
227
+ ```bash
228
+ pip install omendb[langchain]
229
+ ```
230
+
231
+ ```python
232
+ from langchain_openai import OpenAIEmbeddings
233
+ from omendb.langchain import OmenDBVectorStore
234
+
235
+ store = OmenDBVectorStore.from_texts(
236
+ texts=["Paris is the capital of France"],
237
+ embedding=OpenAIEmbeddings(),
238
+ path="./langchain_vectors",
239
+ )
240
+ docs = store.similarity_search("capital of France", k=1)
241
+ ```
242
+
243
+ ### LlamaIndex
244
+
245
+ ```bash
246
+ pip install omendb[llamaindex]
247
+ ```
248
+
249
+ ```python
250
+ from llama_index.core import VectorStoreIndex, Document, StorageContext
251
+ from omendb.llamaindex import OmenDBVectorStore
252
+
253
+ vector_store = OmenDBVectorStore(path="./llama_vectors")
254
+ storage_context = StorageContext.from_defaults(vector_store=vector_store)
255
+ index = VectorStoreIndex.from_documents(
256
+ [Document(text="OmenDB is fast")],
257
+ storage_context=storage_context,
258
+ )
259
+ response = index.as_query_engine().query("What is OmenDB?")
260
+ ```
261
+
262
+ ## License
263
+
264
+ [AGPL-3.0](LICENSE)
265
+
@@ -0,0 +1,8 @@
1
+ omendb-0.0.16.dist-info/METADATA,sha256=4m4xKpNkHn3dXlnvMvpHyEMZWWBM65TKA1JmEHTyw5A,8551
2
+ omendb-0.0.16.dist-info/WHEEL,sha256=MaIE2g33Ws18PHdThiZqBSHdbWSY1CDh26xDJXEp6io,107
3
+ omendb/__init__.py,sha256=JEdxBoRYcrdOpaAixBcaZCiJYGHc7UhMslUGRuAhVuQ,834
4
+ omendb/__init__.pyi,sha256=_hu-1NqsoQjuvcl24s5WKvxWf5YyGMvs9fGiaVUzjKE,16906
5
+ omendb/langchain.py,sha256=w0W5W17ZlSzxoor1LYIzrZYrRvoKK0WW1oQBNcZ_7mk,12184
6
+ omendb/llamaindex.py,sha256=3aKSUufpn33uSK6h61BdKEZvmXNPbz-FexRmlALOtLs,10328
7
+ omendb/omendb.cpython-312-darwin.so,sha256=WfGunuoPESi1ZXNwjtaj2MUAiekemedwPno-gjflSI8,8259900
8
+ omendb-0.0.16.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: maturin (1.10.2)
3
+ Root-Is-Purelib: false
4
+ Tag: cp312-cp312-macosx_10_12_x86_64