microvec 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- microvec-0.1.0.dist-info/METADATA +290 -0
- microvec-0.1.0.dist-info/RECORD +9 -0
- microvec-0.1.0.dist-info/WHEEL +4 -0
- microvector/__init__.py +36 -0
- microvector/core.py +436 -0
- microvector/exceptions.py +29 -0
- microvector/models.py +36 -0
- microvector/quantization.py +36 -0
- microvector/similarity.py +71 -0
|
@@ -0,0 +1,290 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: microvec
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A lightweight, production-grade in-memory vector database
|
|
5
|
+
Project-URL: Homepage, https://github.com/huolter/microVector
|
|
6
|
+
Project-URL: Repository, https://github.com/huolter/microVector
|
|
7
|
+
Project-URL: Bug Tracker, https://github.com/huolter/microVector/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/huolter/microVector/blob/main/CHANGELOG.md
|
|
9
|
+
Author: huolter
|
|
10
|
+
License: MIT
|
|
11
|
+
Keywords: database,embeddings,rag,search,similarity,vector
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Database
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Classifier: Typing :: Typed
|
|
23
|
+
Requires-Python: >=3.9
|
|
24
|
+
Requires-Dist: numpy>=1.21.0
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: black>=23.0; extra == 'dev'
|
|
27
|
+
Requires-Dist: mypy>=1.5; extra == 'dev'
|
|
28
|
+
Requires-Dist: pytest-cov>=4.1; extra == 'dev'
|
|
29
|
+
Requires-Dist: pytest>=7.4; extra == 'dev'
|
|
30
|
+
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
|
|
33
|
+
# MicroVector
|
|
34
|
+
|
|
35
|
+
A lightweight, production-grade in-memory vector database for Python.
|
|
36
|
+
|
|
37
|
+
```
|
|
38
|
+
pip install microvector
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
No external services. No complex setup. Just numpy and your embeddings.
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## Features
|
|
46
|
+
|
|
47
|
+
- **Fast similarity search** — cosine, euclidean, and dot product metrics
|
|
48
|
+
- **Rich results** — every search result includes the document text and metadata, not just an index
|
|
49
|
+
- **Batch operations** — insert thousands of vectors in one call
|
|
50
|
+
- **Metadata filtering** — filter candidates before scoring with any Python predicate
|
|
51
|
+
- **Safe persistence** — JSON + numpy format (no pickle, no security risk)
|
|
52
|
+
- **Full type hints** — works great with mypy and IDEs
|
|
53
|
+
- **Zero required dependencies** beyond numpy
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## Quick Start
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
import numpy as np
|
|
61
|
+
from microvector import MicroVectorDB
|
|
62
|
+
|
|
63
|
+
# Create a database for 768-dimensional embeddings (e.g. OpenAI ada-002)
|
|
64
|
+
db = MicroVectorDB(dimension=768)
|
|
65
|
+
|
|
66
|
+
# Add documents with their embeddings
|
|
67
|
+
db.add_node(embed("Paris is the capital of France"), "Paris is the capital of France")
|
|
68
|
+
db.add_node(embed("The Eiffel Tower is in Paris"), "The Eiffel Tower is in Paris")
|
|
69
|
+
db.add_node(embed("Python is a programming language"), "Python is a programming language")
|
|
70
|
+
|
|
71
|
+
# Search — results include document text, not just indexes
|
|
72
|
+
results = db.search_top_k(embed("What city is the Eiffel Tower in?"), k=2)
|
|
73
|
+
for r in results:
|
|
74
|
+
print(f"{r.score:.3f} {r.document}")
|
|
75
|
+
# 0.921 The Eiffel Tower is in Paris
|
|
76
|
+
# 0.887 Paris is the capital of France
|
|
77
|
+
|
|
78
|
+
# Save and reload
|
|
79
|
+
db.save("my_knowledge_base")
|
|
80
|
+
db = MicroVectorDB.load("my_knowledge_base")
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
## Installation
|
|
86
|
+
|
|
87
|
+
**Requires Python 3.9+ and numpy >= 1.21.**
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
pip install microvector
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
Or from source:
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
git clone https://github.com/huolter/microVector
|
|
97
|
+
cd microVector
|
|
98
|
+
pip install -e ".[dev]"
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
## API Reference
|
|
104
|
+
|
|
105
|
+
### `MicroVectorDB(dimension)`
|
|
106
|
+
|
|
107
|
+
Create a new database. All vectors must have this exact dimension.
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
db = MicroVectorDB(dimension=512)
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
|
|
115
|
+
### `add_node(vector, document, metadata=None, num_bits=None) → int`
|
|
116
|
+
|
|
117
|
+
Add a single vector. Returns the assigned index.
|
|
118
|
+
|
|
119
|
+
```python
|
|
120
|
+
idx = db.add_node(
|
|
121
|
+
vector=np.array([...]),
|
|
122
|
+
document="The text this vector represents",
|
|
123
|
+
metadata={"source": "wikipedia", "date": "2024-01"},
|
|
124
|
+
num_bits=8, # optional: apply 8-bit scalar quantization
|
|
125
|
+
)
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
| Parameter | Type | Description |
|
|
129
|
+
|-----------|------|-------------|
|
|
130
|
+
| `vector` | `np.ndarray` | 1-D array matching the database dimension |
|
|
131
|
+
| `document` | `str` | Text or identifier associated with this vector |
|
|
132
|
+
| `metadata` | `dict` | Optional key-value pairs stored alongside the vector |
|
|
133
|
+
| `num_bits` | `int` | Optional quantization (1–16 bits). Reduces memory at the cost of precision. |
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
### `add_nodes(vectors, documents, metadata=None) → list[int]`
|
|
138
|
+
|
|
139
|
+
Batch insert. All inputs are validated before any insertion occurs.
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
import numpy as np
|
|
143
|
+
|
|
144
|
+
vectors = np.random.rand(1000, 512)
|
|
145
|
+
docs = [f"Document {i}" for i in range(1000)]
|
|
146
|
+
indices = db.add_nodes(vectors, docs)
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
---
|
|
150
|
+
|
|
151
|
+
### `search_top_k(query_vector, k, metric='cosine', filter_fn=None) → list[SearchResult]`
|
|
152
|
+
|
|
153
|
+
Find the top-k most similar vectors. Returns results sorted by score descending.
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
results = db.search_top_k(query, k=5)
|
|
157
|
+
results = db.search_top_k(query, k=5, metric="euclidean")
|
|
158
|
+
|
|
159
|
+
# Filter before scoring — only consider documents tagged "news"
|
|
160
|
+
results = db.search_top_k(
|
|
161
|
+
query, k=5,
|
|
162
|
+
filter_fn=lambda node: node.metadata.get("type") == "news"
|
|
163
|
+
)
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
**Metrics:**
|
|
167
|
+
|
|
168
|
+
| Metric | Range | Notes |
|
|
169
|
+
|--------|-------|-------|
|
|
170
|
+
| `cosine` | [-1, 1] | Best for text embeddings. Direction-based, scale-invariant. |
|
|
171
|
+
| `euclidean` | (0, 1] | `1 / (1 + dist)`. Identical vectors = 1.0. |
|
|
172
|
+
| `dot` | (-∞, +∞) | Fastest. Best for pre-normalized unit vectors. |
|
|
173
|
+
|
|
174
|
+
**`SearchResult` fields:**
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
result.index # int — node's assigned index
|
|
178
|
+
result.document # str — the document text
|
|
179
|
+
result.score # float — similarity score (higher = more similar)
|
|
180
|
+
result.metadata # dict — metadata stored with this node
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
---
|
|
184
|
+
|
|
185
|
+
### `search_by_threshold(query_vector, min_score, metric='cosine') → list[SearchResult]`
|
|
186
|
+
|
|
187
|
+
Return all nodes with score >= `min_score`, sorted descending.
|
|
188
|
+
|
|
189
|
+
```python
|
|
190
|
+
results = db.search_by_threshold(query, min_score=0.8)
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
---
|
|
194
|
+
|
|
195
|
+
### `get_node(index) → Node`
|
|
196
|
+
|
|
197
|
+
Retrieve a node by index.
|
|
198
|
+
|
|
199
|
+
```python
|
|
200
|
+
node = db.get_node(42)
|
|
201
|
+
print(node.document, node.metadata)
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
---
|
|
205
|
+
|
|
206
|
+
### `update_node(index, vector=None, document=None, metadata=None)`
|
|
207
|
+
|
|
208
|
+
Update fields of an existing node. Omit fields to leave them unchanged.
|
|
209
|
+
|
|
210
|
+
```python
|
|
211
|
+
db.update_node(42, document="Updated text")
|
|
212
|
+
db.update_node(42, metadata={"status": "reviewed"})
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
---
|
|
216
|
+
|
|
217
|
+
### `remove_node(index)`
|
|
218
|
+
|
|
219
|
+
Remove a node. Its index is permanently retired (never reused).
|
|
220
|
+
|
|
221
|
+
```python
|
|
222
|
+
db.remove_node(42)
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
---
|
|
226
|
+
|
|
227
|
+
### `save(path)` / `MicroVectorDB.load(path)`
|
|
228
|
+
|
|
229
|
+
Persist to and restore from a `.mvdb/` directory. Safe format: JSON + numpy binary.
|
|
230
|
+
|
|
231
|
+
```python
|
|
232
|
+
db.save("my_db") # creates my_db.mvdb/
|
|
233
|
+
db = MicroVectorDB.load("my_db")
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
The `.mvdb/` directory contains:
|
|
237
|
+
- `index.json` — node metadata and documents (human-readable)
|
|
238
|
+
- `vectors.npy` — all vectors as a 2-D numpy array
|
|
239
|
+
|
|
240
|
+
---
|
|
241
|
+
|
|
242
|
+
### Utility methods
|
|
243
|
+
|
|
244
|
+
```python
|
|
245
|
+
len(db) # number of nodes
|
|
246
|
+
42 in db # check if index exists
|
|
247
|
+
repr(db) # MicroVectorDB(dimension=512, nodes=1000, next_index=1000)
|
|
248
|
+
db.stats() # {'count': 1000, 'dimension': 512, 'next_index': 1000, ...}
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
---
|
|
252
|
+
|
|
253
|
+
## Exceptions
|
|
254
|
+
|
|
255
|
+
All exceptions inherit from `MicroVectorError`.
|
|
256
|
+
|
|
257
|
+
```python
|
|
258
|
+
from microvector import (
|
|
259
|
+
MicroVectorError,
|
|
260
|
+
DimensionMismatchError, # wrong vector dimension
|
|
261
|
+
EmptyDatabaseError, # search on empty database
|
|
262
|
+
NodeNotFoundError, # get/update/remove non-existent index
|
|
263
|
+
)
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
---
|
|
267
|
+
|
|
268
|
+
## Design Notes
|
|
269
|
+
|
|
270
|
+
**Why in-memory?** MicroVector is designed for small-to-medium datasets (up to ~100k vectors) where the simplicity of pure-Python outweighs the need for a dedicated service. It starts instantly, needs no configuration, and is trivially embeddable in any Python application.
|
|
271
|
+
|
|
272
|
+
**Why not pickle?** Pickle can execute arbitrary code when loading untrusted files. MicroVector uses JSON + numpy binary format which is safe, portable, and inspectable with any text editor.
|
|
273
|
+
|
|
274
|
+
**Index monotonicity.** Deleted indexes are never reused. This prevents stale external references from silently pointing to new data.
|
|
275
|
+
|
|
276
|
+
---
|
|
277
|
+
|
|
278
|
+
## Roadmap
|
|
279
|
+
|
|
280
|
+
- [ ] Approximate nearest neighbor (HNSW)
|
|
281
|
+
- [ ] Voronoi cell indexing
|
|
282
|
+
- [ ] FAISS optional backend for large datasets
|
|
283
|
+
- [ ] Async search support
|
|
284
|
+
- [ ] Benchmarks
|
|
285
|
+
|
|
286
|
+
---
|
|
287
|
+
|
|
288
|
+
## License
|
|
289
|
+
|
|
290
|
+
MIT
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
microvector/__init__.py,sha256=aOlY0tDisQbs5JMtW0QLoLSUHnvyWQOrnYpyDAz5_P4,804
|
|
2
|
+
microvector/core.py,sha256=g1W3Ou2Tim6W8TmcHLiBpE_8djaewnIIONUZKNuloA4,14946
|
|
3
|
+
microvector/exceptions.py,sha256=NWHhUDeLuJGHRfEMHlbTdtWUwa1R6pBR5opcO_jwb-Y,916
|
|
4
|
+
microvector/models.py,sha256=-YfPkQ_0YP4pgXBDNnEpIle4n73u-chsk_20n4mQhus,894
|
|
5
|
+
microvector/quantization.py,sha256=DRemEHwhhkcoCUobpU-XAbbkzWI5Tct2WyJdJV5EAZE,1149
|
|
6
|
+
microvector/similarity.py,sha256=9SABrDuMYBin5gSBql_hjnukBO5FHkbUlKtNwJ8tv6I,2136
|
|
7
|
+
microvec-0.1.0.dist-info/METADATA,sha256=h4l8q3qMCdKliPbDQXWz33PywG_pNAWBq2-UADtP9Ns,7972
|
|
8
|
+
microvec-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
9
|
+
microvec-0.1.0.dist-info/RECORD,,
|
microvector/__init__.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""
|
|
2
|
+
microvector — A lightweight, production-grade in-memory vector database.
|
|
3
|
+
|
|
4
|
+
Quick start::
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
from microvector import MicroVectorDB
|
|
8
|
+
|
|
9
|
+
db = MicroVectorDB(dimension=3)
|
|
10
|
+
db.add_node(np.array([1.0, 0.0, 0.0]), "hello world")
|
|
11
|
+
|
|
12
|
+
results = db.search_top_k(np.array([1.0, 0.0, 0.0]), k=1)
|
|
13
|
+
print(results[0].document) # "hello world"
|
|
14
|
+
print(results[0].score) # ~1.0
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
__version__ = "0.1.0"
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
"MicroVectorDB",
|
|
21
|
+
"SearchResult",
|
|
22
|
+
"Node",
|
|
23
|
+
"MicroVectorError",
|
|
24
|
+
"DimensionMismatchError",
|
|
25
|
+
"EmptyDatabaseError",
|
|
26
|
+
"NodeNotFoundError",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
from .core import MicroVectorDB
|
|
30
|
+
from .exceptions import (
|
|
31
|
+
DimensionMismatchError,
|
|
32
|
+
EmptyDatabaseError,
|
|
33
|
+
MicroVectorError,
|
|
34
|
+
NodeNotFoundError,
|
|
35
|
+
)
|
|
36
|
+
from .models import Node, SearchResult
|
microvector/core.py
ADDED
|
@@ -0,0 +1,436 @@
|
|
|
1
|
+
"""Core MicroVectorDB class."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Callable
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
|
|
11
|
+
from .exceptions import DimensionMismatchError, EmptyDatabaseError, NodeNotFoundError
|
|
12
|
+
from .models import Node, SearchResult
|
|
13
|
+
from .quantization import binary_quantization
|
|
14
|
+
from .similarity import get_metric
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class MicroVectorDB:
    """
    A lightweight, in-memory vector database.

    Indexes are assigned monotonically and are never reused after deletion.
    All similarity scores follow the convention: higher = more similar.

    Example:
        >>> import numpy as np
        >>> from microvector import MicroVectorDB
        >>> db = MicroVectorDB(dimension=3)
        >>> db.add_node(np.array([1.0, 0.0, 0.0]), "hello")
        0
        >>> results = db.search_top_k(np.array([1.0, 0.0, 0.0]), k=1)
        >>> results[0].document
        'hello'
    """

    def __init__(self, dimension: int) -> None:
        """
        Initialize a new MicroVectorDB.

        Args:
            dimension: The fixed dimension of all vectors in this database.

        Raises:
            ValueError: If dimension < 1.
        """
        if dimension < 1:
            raise ValueError(f"dimension must be >= 1, got {dimension}")
        self._dimension = dimension
        self._nodes: dict[int, Node] = {}
        self._next_index: int = 0

    # ------------------------------------------------------------------ #
    # Internal helpers                                                   #
    # ------------------------------------------------------------------ #

    def _validate_vector(self, vector: np.ndarray) -> None:
        """Validate that a vector is a 1-D numpy array of the correct dimension."""
        if not isinstance(vector, np.ndarray):
            raise TypeError(f"vector must be np.ndarray, got {type(vector).__name__}")
        if vector.ndim != 1:
            raise ValueError(f"vector must be 1-D, got shape {vector.shape}")
        if vector.shape[0] != self._dimension:
            raise DimensionMismatchError(self._dimension, vector.shape[0])

    @staticmethod
    def _make_result(node: Node, score: float) -> SearchResult:
        """Package a node plus its similarity score into a SearchResult."""
        return SearchResult(
            index=node.index,
            document=node.document,
            score=score,
            metadata=node.metadata,
        )

    @staticmethod
    def _resolve_path(path: str | Path) -> Path:
        """
        Normalize a user-supplied path so it ends with the ``.mvdb`` extension.

        The extension is *appended* rather than substituted: ``"data.v2"``
        resolves to ``data.v2.mvdb``. (The previous ``Path.with_suffix``
        approach replaced an existing extension, so ``"data.v1"`` and
        ``"data.v2"`` silently collided on the same ``data.mvdb`` directory.)
        """
        path = Path(path)
        if not path.name.endswith(".mvdb"):
            path = path.with_name(path.name + ".mvdb")
        return path

    # ------------------------------------------------------------------ #
    # Mutation                                                           #
    # ------------------------------------------------------------------ #

    def add_node(
        self,
        vector: np.ndarray,
        document: str,
        metadata: dict[str, Any] | None = None,
        num_bits: int | None = None,
    ) -> int:
        """
        Add a single node to the database.

        Args:
            vector: The embedding vector. Must match the database dimension.
            document: The text or identifier associated with this vector.
            metadata: Optional dict of arbitrary key-value pairs.
            num_bits: If set, apply n-bit scalar quantization before storing.

        Returns:
            The index assigned to the new node.

        Raises:
            TypeError: If vector is not a numpy array.
            DimensionMismatchError: If vector dimension doesn't match the database.
        """
        self._validate_vector(vector)
        # Copy (or quantize) so later caller-side mutation of the input
        # array cannot corrupt the stored vector.
        v = (
            binary_quantization(vector, num_bits)
            if num_bits is not None
            else vector.copy()
        )
        node = Node(
            index=self._next_index,
            vector=v,
            document=document,
            metadata=metadata or {},
        )
        self._nodes[self._next_index] = node
        self._next_index += 1
        return node.index

    def add_nodes(
        self,
        vectors: np.ndarray | list[np.ndarray],
        documents: list[str],
        metadata: list[dict[str, Any] | None] | None = None,
    ) -> list[int]:
        """
        Add multiple nodes in a single call.

        All inputs are validated before any insertion occurs, so either all
        nodes are added or none are (fail-fast, not partial).

        Args:
            vectors: A 2-D numpy array (n, dim) or list of 1-D arrays.
            documents: List of document strings, one per vector.
            metadata: Optional list of metadata dicts, one per vector.

        Returns:
            List of assigned indexes, in the same order as the inputs.

        Raises:
            ValueError: If vectors and documents have different lengths.
            DimensionMismatchError: If any vector has wrong dimension.
        """
        if isinstance(vectors, np.ndarray):
            if vectors.ndim != 2:
                raise ValueError(
                    "Batch vectors array must be 2-D (n_vectors, dimension)"
                )
            # Iterating a 2-D array yields its rows as 1-D views.
            vector_list: list[np.ndarray] = list(vectors)
        else:
            vector_list = list(vectors)

        if len(vector_list) != len(documents):
            raise ValueError(
                f"vectors and documents must have the same length: "
                f"{len(vector_list)} != {len(documents)}"
            )
        if metadata is not None and len(metadata) != len(vector_list):
            raise ValueError(
                f"metadata list must match vectors length: "
                f"{len(metadata)} != {len(vector_list)}"
            )

        # Validate all before inserting any (atomic batch semantics).
        for v in vector_list:
            self._validate_vector(v)

        metas: list[dict[str, Any] | None] = (
            metadata if metadata is not None else [None] * len(vector_list)
        )
        return [
            self.add_node(v, doc, meta)
            for v, doc, meta in zip(vector_list, documents, metas)
        ]

    def get_node(self, index: int) -> Node:
        """
        Retrieve a node by its index.

        Raises:
            NodeNotFoundError: If no node with that index exists.
        """
        if index not in self._nodes:
            raise NodeNotFoundError(index)
        return self._nodes[index]

    def update_node(
        self,
        index: int,
        vector: np.ndarray | None = None,
        document: str | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> None:
        """
        Update one or more fields of an existing node.

        Only provided fields are changed; omitted fields are left as-is.

        Raises:
            NodeNotFoundError: If no node with that index exists.
            DimensionMismatchError: If the new vector has wrong dimension.
        """
        node = self.get_node(index)
        if vector is not None:
            self._validate_vector(vector)
            node.vector = vector.copy()
        if document is not None:
            node.document = document
        if metadata is not None:
            node.metadata = metadata

    def remove_node(self, index: int) -> None:
        """
        Remove a node by its index.

        The index is permanently retired — it will not be reused for future nodes.

        Raises:
            NodeNotFoundError: If no node with that index exists.
        """
        if index not in self._nodes:
            raise NodeNotFoundError(index)
        del self._nodes[index]

    # ------------------------------------------------------------------ #
    # Search                                                             #
    # ------------------------------------------------------------------ #

    def search_top_k(
        self,
        query_vector: np.ndarray,
        k: int,
        metric: str = "cosine",
        filter_fn: Callable[[Node], bool] | None = None,
    ) -> list[SearchResult]:
        """
        Find the top-k most similar nodes to a query vector.

        Args:
            query_vector: The query embedding. Must match the database dimension.
            k: Number of results to return. If k > number of nodes, all nodes
               are returned (no error).
            metric: Distance metric — 'cosine', 'euclidean', or 'dot'.
            filter_fn: Optional predicate to pre-filter nodes before scoring.
                Only nodes where filter_fn(node) is True are considered.

        Returns:
            List of SearchResult objects sorted by score descending.
            Each result includes: index, document, score, metadata.

        Raises:
            EmptyDatabaseError: If the database has no nodes.
            DimensionMismatchError: If query_vector dimension doesn't match.
            ValueError: If k < 1 or metric is unknown.
        """
        if not self._nodes:
            raise EmptyDatabaseError()
        self._validate_vector(query_vector)
        if k < 1:
            raise ValueError(f"k must be >= 1, got {k}")

        sim_fn = get_metric(metric)
        candidates = (
            (n for n in self._nodes.values() if filter_fn(n))
            if filter_fn is not None
            else self._nodes.values()
        )

        results = [
            self._make_result(node, sim_fn(query_vector, node.vector))
            for node in candidates
        ]
        # SearchResult orders by score (__lt__), so this sorts descending.
        results.sort(reverse=True)
        return results[:k]

    def search_by_threshold(
        self,
        query_vector: np.ndarray,
        min_score: float,
        metric: str = "cosine",
    ) -> list[SearchResult]:
        """
        Return all nodes with similarity score >= min_score, sorted descending.

        Args:
            query_vector: The query embedding.
            min_score: Minimum score threshold (inclusive).
            metric: Distance metric — 'cosine', 'euclidean', or 'dot'.

        Returns:
            List of SearchResult objects with score >= min_score, sorted descending.

        Raises:
            EmptyDatabaseError: If the database has no nodes.
            DimensionMismatchError: If query_vector dimension doesn't match.
        """
        if not self._nodes:
            raise EmptyDatabaseError()
        self._validate_vector(query_vector)

        sim_fn = get_metric(metric)
        results: list[SearchResult] = []
        for node in self._nodes.values():
            score = sim_fn(query_vector, node.vector)
            if score >= min_score:
                results.append(self._make_result(node, score))
        results.sort(reverse=True)
        return results

    # ------------------------------------------------------------------ #
    # Persistence                                                        #
    # ------------------------------------------------------------------ #

    def save(self, path: str | Path) -> None:
        """
        Save the database to a ``.mvdb/`` directory.

        The directory contains:
        - ``index.json``: Node metadata, documents, and manifest.
        - ``vectors.npy``: All vectors as a 2-D numpy array.

        This format is safe (no arbitrary code execution unlike pickle),
        portable, and human-inspectable.

        Args:
            path: Destination path. The ``.mvdb`` extension is added automatically
                if not already present.
        """
        path = self._resolve_path(path)
        path.mkdir(parents=True, exist_ok=True)

        indices = sorted(self._nodes.keys())
        manifest = {
            "version": "1.0",
            "dimension": self._dimension,
            "next_index": self._next_index,
            "count": len(indices),
            "nodes": [
                {
                    "index": i,
                    "document": self._nodes[i].document,
                    "metadata": self._nodes[i].metadata,
                    # Row in vectors.npy that holds this node's vector.
                    "vector_row": row,
                }
                for row, i in enumerate(indices)
            ],
        }

        if indices:
            vector_matrix = np.stack([self._nodes[i].vector for i in indices])
        else:
            # Keep a well-formed (0, dim) array so load() round-trips.
            vector_matrix = np.empty((0, self._dimension), dtype=np.float32)

        with open(path / "index.json", "w", encoding="utf-8") as f:
            json.dump(manifest, f, indent=2)
        np.save(str(path / "vectors.npy"), vector_matrix)

    @classmethod
    def load(cls, path: str | Path) -> MicroVectorDB:
        """
        Load a database from a ``.mvdb/`` directory.

        Args:
            path: Path to the ``.mvdb`` directory (extension added if omitted).

        Returns:
            A fully restored MicroVectorDB instance.

        Raises:
            FileNotFoundError: If the path does not exist.
        """
        path = cls._resolve_path(path)
        if not path.exists():
            raise FileNotFoundError(f"No database found at {path}")

        with open(path / "index.json", encoding="utf-8") as f:
            manifest = json.load(f)

        vectors = np.load(str(path / "vectors.npy"))
        db = cls(dimension=manifest["dimension"])
        db._next_index = manifest["next_index"]

        for node_meta in manifest["nodes"]:
            row = node_meta["vector_row"]
            node = Node(
                index=node_meta["index"],
                vector=vectors[row],
                document=node_meta["document"],
                metadata=node_meta["metadata"],
            )
            db._nodes[node.index] = node

        return db

    # ------------------------------------------------------------------ #
    # Dunder methods                                                     #
    # ------------------------------------------------------------------ #

    def __len__(self) -> int:
        """Return the number of nodes currently in the database."""
        return len(self._nodes)

    def __contains__(self, index: object) -> bool:
        """Check whether an index exists in the database."""
        return index in self._nodes

    def __repr__(self) -> str:
        return (
            f"MicroVectorDB(dimension={self._dimension}, "
            f"nodes={len(self._nodes)}, "
            f"next_index={self._next_index})"
        )

    def stats(self) -> dict[str, Any]:
        """
        Return a summary of the database state.

        Returns:
            Dict with keys: count, dimension, next_index, index_gaps, indices.
        """
        return {
            "count": len(self._nodes),
            "dimension": self._dimension,
            "next_index": self._next_index,
            # Retired (deleted) indexes: assigned-so-far minus still-present.
            "index_gaps": self._next_index - len(self._nodes),
            "indices": sorted(self._nodes.keys()),
        }
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Custom exceptions for microvector."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class MicroVectorError(Exception):
    """Root of the microvector exception hierarchy.

    Catching this single type handles every error the package can raise.
    """
8
|
+
class DimensionMismatchError(MicroVectorError):
    """Raised when a vector's dimension differs from the database dimension.

    Attributes:
        expected: Dimension the database was created with.
        got: Dimension of the offending vector.
    """

    def __init__(self, expected: int, got: int) -> None:
        message = f"Vector dimension mismatch: expected {expected}, got {got}"
        super().__init__(message)
        self.expected = expected
        self.got = got
|
17
|
+
class EmptyDatabaseError(MicroVectorError):
    """Raised when a search is attempted against a database with no nodes."""

    def __init__(self) -> None:
        # Fixed message: searches require at least one stored vector.
        super().__init__("Cannot search an empty database.")
24
|
+
class NodeNotFoundError(MicroVectorError):
    """Raised when the requested node index is absent from the database.

    Attributes:
        index: The index that was looked up and not found.
    """

    def __init__(self, index: int) -> None:
        super().__init__(f"No node found with index {index}.")
        self.index = index
microvector/models.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""Data models for microvector."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass(eq=False)
class Node:
    """A stored vector with its associated document and optional metadata.

    ``eq=False`` is passed explicitly because equality is hand-written below:
    the dataclass-generated ``__eq__`` would compare the numpy ``vector``
    fields with ``==`` inside a tuple comparison, which raises for arrays of
    length > 1. (The original code left ``eq=True``, generating an ``__eq__``
    that was then silently shadowed by the manual one — same behavior, but
    the intent was hidden.) Two nodes compare equal when their index and
    vector contents match; ``document`` and ``metadata`` are ignored.
    Like any class defining ``__eq__`` without ``__hash__``, Node is unhashable.
    """

    index: int  # monotonically assigned by the owning database, never reused
    vector: np.ndarray  # 1-D embedding; dimension fixed by the owning database
    document: str  # text or identifier this vector represents
    metadata: dict[str, Any] = field(default_factory=dict)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Node):
            return NotImplemented
        return self.index == other.index and np.array_equal(self.vector, other.vector)
|
|
26
|
+
@dataclass(frozen=True)
class SearchResult:
    """An immutable search result returned by query methods.

    Attributes:
        index: Index of the matched node in the database.
        document: Document text of the matched node.
        score: Similarity score (higher means more similar).
        metadata: Metadata associated with the matched node.
    """

    index: int
    document: str
    score: float
    metadata: dict[str, Any] = field(default_factory=dict)

    def __lt__(self, other: object) -> bool:
        # Fix: the original unconditionally read other.score, so comparing a
        # SearchResult to a foreign type (e.g. `result < 5`) crashed with
        # AttributeError. Returning NotImplemented lets Python raise the
        # conventional TypeError instead; SearchResult-vs-SearchResult
        # ordering (by score) is unchanged, so sorting still works.
        if not isinstance(other, SearchResult):
            return NotImplemented
        return self.score < other.score
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""Vector quantization utilities."""
|
|
2
|
+
|
|
3
|
+
from typing import cast
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def binary_quantization(vector: np.ndarray, num_bits: int) -> np.ndarray:
    """
    Scalar quantization: maps float values into 2^num_bits integer buckets.

    Args:
        vector: 1-D float array; must be non-empty.
        num_bits: Number of bits (1–16). Common values: 4, 8.

    Returns:
        Quantized vector as float32, with bucket values in [0, 2**num_bits - 1].

    Raises:
        ValueError: If num_bits is out of range, vector is not 1-D,
            or vector is empty.

    Note:
        When min == max (constant vector), all values map to bucket 0.
    """
    if num_bits < 1 or num_bits > 16:
        raise ValueError(f"num_bits must be between 1 and 16, got {num_bits}")
    if vector.ndim != 1:
        raise ValueError(f"vector must be 1-D, got shape {vector.shape}")
    if vector.size == 0:
        # Fix: previously fell through to np.min, which failed with the opaque
        # "zero-size array to reduction operation" error on empty input.
        raise ValueError("vector must not be empty")

    num_buckets = 2**num_bits
    v_min, v_max = float(np.min(vector)), float(np.max(vector))

    # A constant vector has no spread to quantize: everything lands in bucket 0.
    if v_min == v_max:
        return cast(np.ndarray, np.zeros(vector.shape, dtype=np.float32))

    # num_buckets + 1 edges delimit num_buckets equal-width intervals.
    ranges = np.linspace(v_min, v_max, num=num_buckets + 1, endpoint=True)
    # right=True puts values lying on an upper edge into that bucket; the "- 1"
    # shifts digitize's 1-based interval index to 0-based bucket numbers.
    quantized = np.digitize(vector, ranges, right=True) - 1
    # v_min itself digitizes to interval 0 (-1 after the shift); clip it back.
    quantized = np.clip(quantized, 0, num_buckets - 1)
    return cast(np.ndarray, quantized.astype(np.float32))
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""Similarity and distance functions for vector search."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Callable
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _validate_shapes(a: np.ndarray, b: np.ndarray) -> None:
|
|
11
|
+
if a.shape != b.shape:
|
|
12
|
+
raise ValueError(f"Shape mismatch: {a.shape} vs {b.shape}")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """
    Cosine similarity between two vectors.

    Returns a value in [-1.0, 1.0], where 1.0 means identical direction.
    A zero-norm operand yields 0.0 (there is no direction to compare).
    """
    # Shape validation inlined (same check and message as _validate_shapes).
    if a.shape != b.shape:
        raise ValueError(f"Shape mismatch: {a.shape} vs {b.shape}")
    norm_a = float(np.linalg.norm(a))
    norm_b = float(np.linalg.norm(b))
    if 0.0 in (norm_a, norm_b):
        return 0.0
    raw = np.dot(a, b) / (norm_a * norm_b)
    # Clamp away floating-point drift just outside [-1, 1].
    return float(np.clip(raw, -1.0, 1.0))
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def euclidean_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """
    Euclidean similarity between two vectors: 1 / (1 + distance).

    Maps distance [0, ∞) → similarity (0, 1]; identical vectors score 1.0,
    and larger distances monotonically approach 0.
    """
    # Shape validation inlined (same check and message as _validate_shapes).
    if a.shape != b.shape:
        raise ValueError(f"Shape mismatch: {a.shape} vs {b.shape}")
    return 1.0 / (1.0 + float(np.linalg.norm(a - b)))
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def dot_product_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """
    Raw dot product similarity.

    Not normalized to [0, 1]. Best used with pre-normalized embeddings
    (e.g., unit-norm vectors from OpenAI or HuggingFace models).
    """
    # Shape validation inlined (same check and message as _validate_shapes).
    if a.shape != b.shape:
        raise ValueError(f"Shape mismatch: {a.shape} vs {b.shape}")
    product = np.dot(a, b)
    return float(product)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# Registry mapping public metric names to their implementations; insertion
# order matches the original literal (cosine, euclidean, dot).
METRICS: dict[str, Callable[[np.ndarray, np.ndarray], float]] = dict(
    cosine=cosine_similarity,
    euclidean=euclidean_similarity,
    dot=dot_product_similarity,
)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def get_metric(name: str) -> Callable[[np.ndarray, np.ndarray], float]:
    """Return the similarity function registered under *name*.

    Raises:
        ValueError: If *name* is not one of the known metric names.
    """
    # EAFP: attempt the lookup and translate a miss into the documented error.
    try:
        return METRICS[name]
    except KeyError:
        raise ValueError(
            f"Unknown metric '{name}'. Choose from: {sorted(METRICS.keys())}"
        ) from None
|