omendb 0.0.23__cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
omendb/__init__.py ADDED
@@ -0,0 +1,24 @@
1
+ """OmenDB - Fast embedded vector database with HNSW + ACORN-1 filtered search.
2
+
3
+ Example (standalone):
4
+ >>> import omendb
5
+ >>> db = omendb.open("./my_vectors", dimensions=128)
6
+ >>> db.set([{"id": "doc1", "vector": [0.1] * 128, "metadata": {"title": "Hello"}}])
7
+ >>> results = db.search(query=[0.1] * 128, k=5)
8
+
9
+ Example (LangChain):
10
+ >>> from omendb.langchain import OmenDBVectorStore
11
+ >>> from langchain_openai import OpenAIEmbeddings
12
+ >>> store = OmenDBVectorStore.from_texts(
13
+ ... texts=["Hello world"],
14
+ ... embedding=OpenAIEmbeddings(),
15
+ ... path="./my_vectors",
16
+ ... )
17
+ >>> docs = store.similarity_search("greeting", k=1)
18
+ """
19
+
20
# The implementation lives in the compiled Rust extension module; this package
# only surfaces its public names.
from omendb.omendb import VectorDatabase, open

__version__ = "0.0.23"
__all__ = [
    "open",
    "VectorDatabase",
]
omendb/__init__.pyi ADDED
@@ -0,0 +1,589 @@
1
+ """Type stubs for omendb - Fast embedded vector database."""
2
+
3
from collections.abc import Iterator, Sequence
from typing import Any, Literal, TypedDict, overload

import numpy as np
import numpy.typing as npt
from typing_extensions import Required, Self
9
+
10
+ # Type aliases for vectors
11
+ Vector = Sequence[float] | npt.NDArray[np.floating[Any]]
12
+ VectorBatch = Sequence[Sequence[float]] | npt.NDArray[np.floating[Any]]
13
+
14
class SearchResult(TypedDict):
    """One hit from a vector search (element type of ``VectorDatabase.search``)."""

    # ID of the matched vector.
    id: str
    # Distance reported for this hit.
    distance: float
    # Metadata dict stored with the vector.
    metadata: dict[str, Any]
20
+
21
class TextSearchResult(TypedDict):
    """One hit from a text-only search (element type of ``VectorDatabase.search_text``)."""

    # ID of the matched record.
    id: str
    # Score reported for this hit.
    score: float
    # Metadata dict stored with the record.
    metadata: dict[str, Any]
27
+
28
class HybridSearchResult(TypedDict):
    """One hit from a hybrid search (``VectorDatabase.search_hybrid`` without subscores)."""

    # ID of the matched record.
    id: str
    # Combined score reported for this hit.
    score: float
    # Metadata dict stored with the record.
    metadata: dict[str, Any]
34
+
35
+ class HybridSearchResultWithSubscores(TypedDict):
36
+ """Hybrid search result with separate keyword and semantic scores."""
37
+
38
+ id: str
39
+ score: float
40
+ metadata: dict[str, Any]
41
+ keyword_score: float | None # BM25 score (None if only matched vector search)
42
+ semantic_score: float | None # Vector distance (None if only matched text search)
43
+
44
+ class VectorRecord(TypedDict, total=False):
45
+ """Input record for set()."""
46
+
47
+ id: str # Required
48
+ vector: list[float] # Required
49
+ metadata: dict[str, Any]
50
+ text: str # For hybrid search - indexed AND auto-stored in metadata["text"]
51
+
52
class GetResult(TypedDict):
    """Record returned by get() (also the element type of get_batch(), items() and iteration)."""

    # ID of the vector.
    id: str
    # The stored vector as a list of floats.
    vector: list[float]
    # Metadata dict stored with the vector.
    metadata: dict[str, Any]
58
+
59
class StatsResult(TypedDict):
    """Database statistics, as returned by ``VectorDatabase.stats()``."""

    # Vector dimensionality.
    dimensions: int
    # Vector count.
    count: int
    # Database path.
    path: str
65
+
66
+ # Filter types for MongoDB-style queries
67
+ FilterValue = str | int | float | bool | None | list[Any] | dict[str, Any]
68
+ FilterOperator = TypedDict(
69
+ "FilterOperator",
70
+ {
71
+ "$eq": FilterValue,
72
+ "$ne": FilterValue,
73
+ "$gt": float,
74
+ "$gte": float,
75
+ "$lt": float,
76
+ "$lte": float,
77
+ "$in": list[FilterValue],
78
+ "$contains": str,
79
+ },
80
+ total=False,
81
+ )
82
+ MetadataFilter = dict[str, FilterValue | FilterOperator]
83
+
84
class VectorDatabase:
    """High-performance embedded vector database.

    Provides fast similarity search using HNSW indexing with:
    - ~19,000 QPS @ 10K vectors with 100% recall
    - 20,000-28,000 vec/s insert throughput
    - Extended RaBitQ 8x compression
    - ACORN-1 filtered search (37.79x speedup)

    Supports context manager protocol for automatic cleanup.
    """

    @property
    def dimensions(self) -> int:
        """Vector dimensionality of this database."""
        ...

    # set() supports three call shapes, one per overload:
    #   - Single:       set("id", [0.1, 0.2], {"key": "value"})
    #   - Batch list:   set([{"id": "a", "vector": [...], "metadata": {...}}])
    #   - Batch kwargs: set(ids=["a"], vectors=[[...]], metadatas=[{...}])
    #
    # A stub file must contain only the @overload signatures; a trailing
    # non-overloaded "implementation" signature is rejected by type checkers,
    # so none is declared here.
    # NOTE(review): the combined runtime signature presumably names its first
    # parameter `id_or_items`, so pass the ID / item list positionally -
    # confirm against the native module.
    @overload
    def set(
        self,
        id: str,
        vector: Vector,
        metadata: dict[str, Any] | None = None,
    ) -> int:
        """Set (insert or replace) a single vector.

        Args:
            id: Vector ID.
            vector: Vector data.
            metadata: Optional metadata dict.

        Returns:
            Number of vectors inserted/updated.

        Raises:
            ValueError: If required fields missing or dimensions mismatch.
        """
        ...

    @overload
    def set(self, items: list[VectorRecord]) -> int:
        """Set (insert or replace) a batch of vectors from a list of records.

        Args:
            items: List of VectorRecord dicts.

        Returns:
            Number of vectors inserted/updated.

        Raises:
            ValueError: If required fields missing or dimensions mismatch.
        """
        ...

    @overload
    def set(
        self,
        *,
        ids: list[str],
        vectors: list[list[float]] | VectorBatch,
        metadatas: list[dict[str, Any]] | None = None,
    ) -> int:
        """Set (insert or replace) a batch of vectors using keyword arguments.

        Args:
            ids: List of IDs (batch kwargs format).
            vectors: List of vectors (batch kwargs format).
            metadatas: List of metadata dicts (batch kwargs format).

        Returns:
            Number of vectors inserted/updated.

        Raises:
            ValueError: If required fields missing or dimensions mismatch.
        """
        ...

    def search(
        self,
        query: Vector,
        k: int,
        ef: int | None = None,
        filter: MetadataFilter | None = None,
        max_distance: float | None = None,
    ) -> list[SearchResult]:
        """Search for k nearest neighbors.

        Args:
            query: Query vector (list or 1D numpy array).
            k: Number of nearest neighbors to return.
            ef: Search width override (default: auto-tuned).
            filter: MongoDB-style metadata filter.
            max_distance: Maximum distance threshold (filters out distant results).

        Returns:
            List of results with id, distance, metadata.

        Examples:
            >>> results = db.search([0.1, 0.2, 0.3], k=5)
            >>> results = db.search([...], k=10, filter={"category": "A"})
            >>> results = db.search([...], k=10, max_distance=0.5)
        """
        ...

    def search_batch(
        self,
        queries: VectorBatch,
        k: int,
        ef: int | None = None,
    ) -> list[list[SearchResult]]:
        """Batch search multiple queries with parallel execution.

        Args:
            queries: 2D numpy array or list of query vectors.
            k: Number of nearest neighbors per query.
            ef: Search width override.

        Returns:
            List of results for each query.
        """
        ...

    def delete(self, ids: list[str]) -> int:
        """Delete vectors by ID.

        Args:
            ids: List of vector IDs to delete.

        Returns:
            Number of vectors deleted.
        """
        ...

    def delete_by_filter(self, filter: MetadataFilter) -> int:
        """Delete vectors matching a metadata filter.

        Evaluates the filter against all vectors and deletes those that match.
        Uses the same MongoDB-style filter syntax as search().

        Args:
            filter: MongoDB-style metadata filter.

        Returns:
            Number of vectors deleted.

        Examples:
            >>> db.delete_by_filter({"status": "archived"})
            5
            >>> db.delete_by_filter({"score": {"$lt": 0.5}})
            3
            >>> db.delete_by_filter({"$and": [{"type": "draft"}, {"age": {"$gt": 30}}]})
            2
        """
        ...

    def count(self, filter: MetadataFilter | None = None) -> int:
        """Count vectors, optionally filtered by metadata.

        Without a filter, returns total count (same as len(db)).
        With a filter, returns count of vectors matching the filter.

        Args:
            filter: Optional MongoDB-style metadata filter.

        Returns:
            Number of vectors (matching filter if provided).

        Examples:
            >>> db.count()
            1000
            >>> db.count(filter={"status": "active"})
            750
            >>> db.count(filter={"score": {"$gte": 0.8}})
            250
        """
        ...

    def update(
        self,
        id: str,
        vector: Vector | None = None,
        metadata: dict[str, Any] | None = None,
        text: str | None = None,
    ) -> None:
        """Update vector, metadata, and/or text for existing ID.

        At least one of vector, metadata, or text must be provided.

        Args:
            id: Vector ID to update.
            vector: New vector data (optional).
            metadata: New metadata (replaces existing, optional).
            text: New text for hybrid search (re-indexed for BM25, optional).

        Raises:
            ValueError: If no update parameters provided.
            RuntimeError: If vector with given ID doesn't exist.
        """
        ...

    def get(self, id: str) -> GetResult | None:
        """Get vector by ID.

        Args:
            id: Vector ID to retrieve.

        Returns:
            Dict with id, vector, metadata or None if not found.
        """
        ...

    def get_ef_search(self) -> int:
        """Get current ef_search value."""
        ...

    def set_ef_search(self, ef_search: int) -> None:
        """Set ef_search value for search quality/speed tradeoff."""
        ...

    def optimize(self) -> int:
        """Optimize index for cache-efficient search.

        Returns:
            Number of nodes reordered.
        """
        ...

    def __len__(self) -> int:
        """Number of vectors in database."""
        ...

    def is_empty(self) -> bool:
        """Check if database is empty."""
        ...

    def ids(self) -> Iterator[str]:
        """Iterate over all vector IDs (without loading vector data).

        Returns a lazy iterator. Use `list(db.ids())` if you need all IDs at once.
        Memory efficient for large datasets.

        Returns:
            Iterator over all vector IDs in the database.

        Examples:
            >>> for id in db.ids():
            ...     print(id)
            >>> all_ids = list(db.ids())  # Get as list if needed
        """
        ...

    def items(self) -> list[GetResult]:
        """Get all items as list of dicts.

        WARNING: Loads all vectors into memory. For 1M vectors at 768D,
        this uses ~3GB RAM. For large datasets, use `for item in db:` which
        is lazy, or use `ids()` + `get_batch()` with batching.

        Returns:
            List of {"id": str, "vector": list[float], "metadata": dict}
        """
        ...

    def exists(self, id: str) -> bool:
        """Check if an ID exists in the database.

        Args:
            id: Vector ID to check.

        Returns:
            True if ID exists and is not deleted.
        """
        ...

    def __contains__(self, id: str) -> bool:
        """Support `in` operator: `"id" in db`"""
        ...

    def __iter__(self) -> Iterator[GetResult]:
        """Iterate over all items lazily.

        Memory efficient: stores only IDs (~20MB for 1M items), fetches
        vectors one at a time. Handles deletions during iteration gracefully.

        For export/migration of small datasets, `items()` is more convenient.

        Examples:
            >>> for item in db:
            ...     print(item["id"])
            >>> # Early termination is efficient
            >>> for i, item in enumerate(db):
            ...     if i >= 10: break
        """
        ...

    def get_batch(self, ids: list[str]) -> list[GetResult | None]:
        """Get multiple vectors by ID.

        Batch version of get(). More efficient than calling get() in a loop.

        Args:
            ids: List of vector IDs to retrieve.

        Returns:
            List of results in same order as input. None for missing IDs.
        """
        ...

    def stats(self) -> StatsResult:
        """Get database statistics."""
        ...

    def flush(self) -> None:
        """Flush pending changes to disk."""
        ...

    def merge_from(self, other: VectorDatabase) -> int:
        """Merge vectors from another database.

        Args:
            other: Source database to merge from.

        Returns:
            Number of vectors merged.
        """
        ...

    # Collections
    def collection(self, name: str) -> VectorDatabase:
        """Create or get a named collection.

        Args:
            name: Collection name (alphanumeric and underscores).

        Returns:
            VectorDatabase instance for this collection.

        Raises:
            ValueError: If name is invalid or db is in-memory.
        """
        ...

    def collections(self) -> list[str]:
        """List all collection names."""
        ...

    def delete_collection(self, name: str) -> None:
        """Delete a collection.

        Args:
            name: Collection name to delete.

        Raises:
            ValueError: If collection doesn't exist.
        """
        ...

    # Hybrid search
    def enable_text_search(self, buffer_mb: int | None = None) -> None:
        """Enable text search for hybrid search.

        Note: Called automatically when using set() with items that have a `text` field.
        Only call manually if you need custom buffer_mb config.

        Args:
            buffer_mb: Writer buffer size in MB (default: 50).
        """
        ...

    def has_text_search(self) -> bool:
        """Check if text search is enabled."""
        ...

    def search_text(self, query: str, k: int) -> list[TextSearchResult]:
        """Search using text only (BM25 scoring).

        Args:
            query: Text query.
            k: Number of results.

        Returns:
            List of results with id, score, and metadata.
        """
        ...

    # search_hybrid() is declared through overloads only (no implementation
    # signature in a stub): the return element type depends on `subscores`.
    @overload
    def search_hybrid(
        self,
        query_vector: Vector,
        query_text: str,
        k: int,
        filter: MetadataFilter | None = None,
        alpha: float | None = None,
        rrf_k: int | None = None,
        subscores: Literal[False] | None = None,
    ) -> list[HybridSearchResult]:
        """Hybrid search combining vector and text.

        Uses Reciprocal Rank Fusion (RRF) to combine results.

        Args:
            query_vector: Query embedding.
            query_text: Text query for BM25.
            k: Number of results.
            filter: Optional metadata filter.
            alpha: Vector vs text weight (0.0=text, 1.0=vector, default=0.5).
            rrf_k: RRF constant (default: 60).
            subscores: Return separate keyword_score and semantic_score (default: False).

        Returns:
            List of results with id, score, metadata.

        Examples:
            >>> results = db.search_hybrid(vec, "query", k=10)
        """
        ...

    @overload
    def search_hybrid(
        self,
        query_vector: Vector,
        query_text: str,
        k: int,
        filter: MetadataFilter | None = None,
        alpha: float | None = None,
        rrf_k: int | None = None,
        subscores: Literal[True] = ...,
    ) -> list[HybridSearchResultWithSubscores]:
        """Hybrid search with separate keyword and semantic scores.

        Uses Reciprocal Rank Fusion (RRF) to combine results.

        Args:
            query_vector: Query embedding.
            query_text: Text query for BM25.
            k: Number of results.
            filter: Optional metadata filter.
            alpha: Vector vs text weight (0.0=text, 1.0=vector, default=0.5).
            rrf_k: RRF constant (default: 60).
            subscores: Pass True to select this form of the result.

        Returns:
            List of results with id, score, metadata, plus keyword_score
            and semantic_score.

        Examples:
            >>> results = db.search_hybrid(vec, "query", k=10, subscores=True)
            >>> for r in results:
            ...     print(f"{r['id']}: keyword={r['keyword_score']}, semantic={r['semantic_score']}")
        """
        ...

    # Context manager
    def __enter__(self) -> Self:
        """Enter context manager."""
        ...

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: Any,
    ) -> bool:
        """Exit context manager, flush changes."""
        ...
545
+
546
def open(
    path: str,
    dimensions: int,
    m: int | None = None,
    ef_construction: int | None = None,
    ef_search: int | None = None,
    quantization: bool | Literal["sq8", "rabitq"] | None = None,
    rescore: bool | None = None,
    oversample: float | None = None,
    metric: Literal["l2", "euclidean", "cosine", "dot", "ip"] | None = None,
    config: dict[str, Any] | None = None,
) -> VectorDatabase:
    """Open a vector database, creating it if needed.

    Args:
        path: Where the database lives; pass ":memory:" for an in-memory database.
        dimensions: Dimensionality of the stored vectors.
        m: Number of HNSW neighbors per node (default: 16, range: 4-64).
        ef_construction: Index build quality (default: 100).
        ef_search: Search quality (default: 100).
        quantization: Quantization mode:
            - True or "sq8": 4x smaller, ~99% recall (recommended)
            - "rabitq": 8x smaller, ~98% recall
            - None/False: full precision
        rescore: Whether to rerank with full precision (default: True when quantized).
        oversample: Candidate multiplier used for rescoring (default: 3.0).
        metric: Distance metric used for similarity search (default: "l2"):
            - "l2" or "euclidean": Euclidean distance (default)
            - "cosine": cosine distance (1 - cosine similarity)
            - "dot" or "ip": inner product (for MIPS)
        config: Advanced config dict (deprecated).

    Returns:
        A VectorDatabase instance.

    Examples:
        >>> db = omendb.open("./vectors", dimensions=768)
        >>> db = omendb.open("./vectors", dimensions=768, quantization=True)
        >>> db = omendb.open(":memory:", dimensions=128)
    """
    ...
587
+
588
# Module-level attributes declared for type checkers.
__all__: list[str]  # names exported by the package
__version__: str  # package version string