microvec 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,290 @@
1
+ Metadata-Version: 2.4
2
+ Name: microvec
3
+ Version: 0.1.0
4
+ Summary: A lightweight, production-grade in-memory vector database
5
+ Project-URL: Homepage, https://github.com/huolter/microVector
6
+ Project-URL: Repository, https://github.com/huolter/microVector
7
+ Project-URL: Bug Tracker, https://github.com/huolter/microVector/issues
8
+ Project-URL: Changelog, https://github.com/huolter/microVector/blob/main/CHANGELOG.md
9
+ Author: huolter
10
+ License: MIT
11
+ Keywords: database,embeddings,rag,search,similarity,vector
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Database
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Classifier: Typing :: Typed
23
+ Requires-Python: >=3.9
24
+ Requires-Dist: numpy>=1.21.0
25
+ Provides-Extra: dev
26
+ Requires-Dist: black>=23.0; extra == 'dev'
27
+ Requires-Dist: mypy>=1.5; extra == 'dev'
28
+ Requires-Dist: pytest-cov>=4.1; extra == 'dev'
29
+ Requires-Dist: pytest>=7.4; extra == 'dev'
30
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
31
+ Description-Content-Type: text/markdown
32
+
33
+ # MicroVector
34
+
35
+ A lightweight, production-grade in-memory vector database for Python.
36
+
37
+ ```
38
+ pip install microvec
39
+ ```
40
+
41
+ No external services. No complex setup. Just numpy and your embeddings.
42
+
43
+ ---
44
+
45
+ ## Features
46
+
47
+ - **Fast similarity search** — cosine, euclidean, and dot product metrics
48
+ - **Rich results** — every search result includes the document text and metadata, not just an index
49
+ - **Batch operations** — insert thousands of vectors in one call
50
+ - **Metadata filtering** — filter candidates before scoring with any Python predicate
51
+ - **Safe persistence** — JSON + numpy format (no pickle, no security risk)
52
+ - **Full type hints** — works great with mypy and IDEs
53
+ - **Zero required dependencies** beyond numpy
54
+
55
+ ---
56
+
57
+ ## Quick Start
58
+
59
+ ```python
60
+ import numpy as np
61
+ from microvector import MicroVectorDB
62
+
63
+ # Create a database for 768-dimensional embeddings (e.g. OpenAI ada-002)
64
+ db = MicroVectorDB(dimension=768)
65
+
66
+ # Add documents with their embeddings
67
+ db.add_node(embed("Paris is the capital of France"), "Paris is the capital of France")
68
+ db.add_node(embed("The Eiffel Tower is in Paris"), "The Eiffel Tower is in Paris")
69
+ db.add_node(embed("Python is a programming language"), "Python is a programming language")
70
+
71
+ # Search — results include document text, not just indexes
72
+ results = db.search_top_k(embed("What city is the Eiffel Tower in?"), k=2)
73
+ for r in results:
74
+ print(f"{r.score:.3f} {r.document}")
75
+ # 0.921 The Eiffel Tower is in Paris
76
+ # 0.887 Paris is the capital of France
77
+
78
+ # Save and reload
79
+ db.save("my_knowledge_base")
80
+ db = MicroVectorDB.load("my_knowledge_base")
81
+ ```
82
+
83
+ ---
84
+
85
+ ## Installation
86
+
87
+ **Requires Python 3.9+ and numpy >= 1.21.**
88
+
89
+ ```bash
90
+ pip install microvec
91
+ ```
92
+
93
+ Or from source:
94
+
95
+ ```bash
96
+ git clone https://github.com/huolter/microVector
97
+ cd microVector
98
+ pip install -e ".[dev]"
99
+ ```
100
+
101
+ ---
102
+
103
+ ## API Reference
104
+
105
+ ### `MicroVectorDB(dimension)`
106
+
107
+ Create a new database. All vectors must have this exact dimension.
108
+
109
+ ```python
110
+ db = MicroVectorDB(dimension=512)
111
+ ```
112
+
113
+ ---
114
+
115
+ ### `add_node(vector, document, metadata=None, num_bits=None) → int`
116
+
117
+ Add a single vector. Returns the assigned index.
118
+
119
+ ```python
120
+ idx = db.add_node(
121
+ vector=np.array([...]),
122
+ document="The text this vector represents",
123
+ metadata={"source": "wikipedia", "date": "2024-01"},
124
+ num_bits=8, # optional: apply 8-bit scalar quantization
125
+ )
126
+ ```
127
+
128
+ | Parameter | Type | Description |
129
+ |-----------|------|-------------|
130
+ | `vector` | `np.ndarray` | 1-D array matching the database dimension |
131
+ | `document` | `str` | Text or identifier associated with this vector |
132
+ | `metadata` | `dict` | Optional key-value pairs stored alongside the vector |
133
+ | `num_bits` | `int` | Optional quantization (1–16 bits). Reduces memory at the cost of precision. |
134
+
135
+ ---
136
+
137
+ ### `add_nodes(vectors, documents, metadata=None) → list[int]`
138
+
139
+ Batch insert. All inputs are validated before any insertion occurs.
140
+
141
+ ```python
142
+ import numpy as np
143
+
144
+ vectors = np.random.rand(1000, 512)
145
+ docs = [f"Document {i}" for i in range(1000)]
146
+ indices = db.add_nodes(vectors, docs)
147
+ ```
148
+
149
+ ---
150
+
151
+ ### `search_top_k(query_vector, k, metric='cosine', filter_fn=None) → list[SearchResult]`
152
+
153
+ Find the top-k most similar vectors. Returns results sorted by score descending.
154
+
155
+ ```python
156
+ results = db.search_top_k(query, k=5)
157
+ results = db.search_top_k(query, k=5, metric="euclidean")
158
+
159
+ # Filter before scoring — only consider documents tagged "news"
160
+ results = db.search_top_k(
161
+ query, k=5,
162
+ filter_fn=lambda node: node.metadata.get("type") == "news"
163
+ )
164
+ ```
165
+
166
+ **Metrics:**
167
+
168
+ | Metric | Range | Notes |
169
+ |--------|-------|-------|
170
+ | `cosine` | [-1, 1] | Best for text embeddings. Direction-based, scale-invariant. |
171
+ | `euclidean` | (0, 1] | `1 / (1 + dist)`. Identical vectors = 1.0. |
172
+ | `dot` | (-∞, +∞) | Fastest. Best for pre-normalized unit vectors. |
173
+
174
+ **`SearchResult` fields:**
175
+
176
+ ```python
177
+ result.index # int — node's assigned index
178
+ result.document # str — the document text
179
+ result.score # float — similarity score (higher = more similar)
180
+ result.metadata # dict — metadata stored with this node
181
+ ```
182
+
183
+ ---
184
+
185
+ ### `search_by_threshold(query_vector, min_score, metric='cosine') → list[SearchResult]`
186
+
187
+ Return all nodes with score >= `min_score`, sorted descending.
188
+
189
+ ```python
190
+ results = db.search_by_threshold(query, min_score=0.8)
191
+ ```
192
+
193
+ ---
194
+
195
+ ### `get_node(index) → Node`
196
+
197
+ Retrieve a node by index.
198
+
199
+ ```python
200
+ node = db.get_node(42)
201
+ print(node.document, node.metadata)
202
+ ```
203
+
204
+ ---
205
+
206
+ ### `update_node(index, vector=None, document=None, metadata=None)`
207
+
208
+ Update fields of an existing node. Omit fields to leave them unchanged.
209
+
210
+ ```python
211
+ db.update_node(42, document="Updated text")
212
+ db.update_node(42, metadata={"status": "reviewed"})
213
+ ```
214
+
215
+ ---
216
+
217
+ ### `remove_node(index)`
218
+
219
+ Remove a node. Its index is permanently retired (never reused).
220
+
221
+ ```python
222
+ db.remove_node(42)
223
+ ```
224
+
225
+ ---
226
+
227
+ ### `save(path)` / `MicroVectorDB.load(path)`
228
+
229
+ Persist to and restore from a `.mvdb/` directory. Safe format: JSON + numpy binary.
230
+
231
+ ```python
232
+ db.save("my_db") # creates my_db.mvdb/
233
+ db = MicroVectorDB.load("my_db")
234
+ ```
235
+
236
+ The `.mvdb/` directory contains:
237
+ - `index.json` — node metadata and documents (human-readable)
238
+ - `vectors.npy` — all vectors as a 2-D numpy array
239
+
240
+ ---
241
+
242
+ ### Utility methods
243
+
244
+ ```python
245
+ len(db) # number of nodes
246
+ 42 in db # check if index exists
247
+ repr(db) # MicroVectorDB(dimension=512, nodes=1000, next_index=1000)
248
+ db.stats() # {'count': 1000, 'dimension': 512, 'next_index': 1000, ...}
249
+ ```
250
+
251
+ ---
252
+
253
+ ## Exceptions
254
+
255
+ All exceptions inherit from `MicroVectorError`.
256
+
257
+ ```python
258
+ from microvector import (
259
+ MicroVectorError,
260
+ DimensionMismatchError, # wrong vector dimension
261
+ EmptyDatabaseError, # search on empty database
262
+ NodeNotFoundError, # get/update/remove non-existent index
263
+ )
264
+ ```
265
+
266
+ ---
267
+
268
+ ## Design Notes
269
+
270
+ **Why in-memory?** MicroVector is designed for small-to-medium datasets (up to ~100k vectors) where the simplicity of pure-Python outweighs the need for a dedicated service. It starts instantly, needs no configuration, and is trivially embeddable in any Python application.
271
+
272
+ **Why not pickle?** Pickle can execute arbitrary code when loading untrusted files. MicroVector uses JSON + numpy binary format which is safe, portable, and inspectable with any text editor.
273
+
274
+ **Index monotonicity.** Deleted indexes are never reused. This prevents stale external references from silently pointing to new data.
275
+
276
+ ---
277
+
278
+ ## Roadmap
279
+
280
+ - [ ] Approximate nearest neighbor (HNSW)
281
+ - [ ] Voronoi cell indexing
282
+ - [ ] FAISS optional backend for large datasets
283
+ - [ ] Async search support
284
+ - [ ] Benchmarks
285
+
286
+ ---
287
+
288
+ ## License
289
+
290
+ MIT
@@ -0,0 +1,9 @@
1
+ microvector/__init__.py,sha256=aOlY0tDisQbs5JMtW0QLoLSUHnvyWQOrnYpyDAz5_P4,804
2
+ microvector/core.py,sha256=g1W3Ou2Tim6W8TmcHLiBpE_8djaewnIIONUZKNuloA4,14946
3
+ microvector/exceptions.py,sha256=NWHhUDeLuJGHRfEMHlbTdtWUwa1R6pBR5opcO_jwb-Y,916
4
+ microvector/models.py,sha256=-YfPkQ_0YP4pgXBDNnEpIle4n73u-chsk_20n4mQhus,894
5
+ microvector/quantization.py,sha256=DRemEHwhhkcoCUobpU-XAbbkzWI5Tct2WyJdJV5EAZE,1149
6
+ microvector/similarity.py,sha256=9SABrDuMYBin5gSBql_hjnukBO5FHkbUlKtNwJ8tv6I,2136
7
+ microvec-0.1.0.dist-info/METADATA,sha256=h4l8q3qMCdKliPbDQXWz33PywG_pNAWBq2-UADtP9Ns,7972
8
+ microvec-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
9
+ microvec-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,36 @@
1
+ """
2
+ microvector — A lightweight, production-grade in-memory vector database.
3
+
4
+ Quick start::
5
+
6
+ import numpy as np
7
+ from microvector import MicroVectorDB
8
+
9
+ db = MicroVectorDB(dimension=3)
10
+ db.add_node(np.array([1.0, 0.0, 0.0]), "hello world")
11
+
12
+ results = db.search_top_k(np.array([1.0, 0.0, 0.0]), k=1)
13
+ print(results[0].document) # "hello world"
14
+ print(results[0].score) # ~1.0
15
+ """
16
+
17
+ __version__ = "0.1.0"
18
+
19
+ __all__ = [
20
+ "MicroVectorDB",
21
+ "SearchResult",
22
+ "Node",
23
+ "MicroVectorError",
24
+ "DimensionMismatchError",
25
+ "EmptyDatabaseError",
26
+ "NodeNotFoundError",
27
+ ]
28
+
29
+ from .core import MicroVectorDB
30
+ from .exceptions import (
31
+ DimensionMismatchError,
32
+ EmptyDatabaseError,
33
+ MicroVectorError,
34
+ NodeNotFoundError,
35
+ )
36
+ from .models import Node, SearchResult
microvector/core.py ADDED
@@ -0,0 +1,436 @@
1
+ """Core MicroVectorDB class."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+ from typing import Any, Callable
8
+
9
+ import numpy as np
10
+
11
+ from .exceptions import DimensionMismatchError, EmptyDatabaseError, NodeNotFoundError
12
+ from .models import Node, SearchResult
13
+ from .quantization import binary_quantization
14
+ from .similarity import get_metric
15
+
16
+
17
class MicroVectorDB:
    """
    A lightweight, in-memory vector database.

    Indexes are assigned monotonically and are never reused after deletion,
    so stale external references can never silently point at new data.
    All similarity scores follow the convention: higher = more similar.

    Example:
        >>> import numpy as np
        >>> from microvector import MicroVectorDB
        >>> db = MicroVectorDB(dimension=3)
        >>> db.add_node(np.array([1.0, 0.0, 0.0]), "hello")
        0
        >>> results = db.search_top_k(np.array([1.0, 0.0, 0.0]), k=1)
        >>> results[0].document
        'hello'
    """

    def __init__(self, dimension: int) -> None:
        """
        Initialize a new MicroVectorDB.

        Args:
            dimension: The fixed dimension of all vectors in this database.

        Raises:
            ValueError: If dimension < 1.
        """
        if dimension < 1:
            raise ValueError(f"dimension must be >= 1, got {dimension}")
        self._dimension = dimension
        self._nodes: dict[int, Node] = {}
        self._next_index: int = 0

    # ------------------------------------------------------------------ #
    # Internal helpers                                                   #
    # ------------------------------------------------------------------ #

    def _validate_vector(self, vector: np.ndarray) -> None:
        """Validate that a vector is a 1-D numpy array of the correct dimension."""
        if not isinstance(vector, np.ndarray):
            raise TypeError(f"vector must be np.ndarray, got {type(vector).__name__}")
        if vector.ndim != 1:
            raise ValueError(f"vector must be 1-D, got shape {vector.shape}")
        if vector.shape[0] != self._dimension:
            raise DimensionMismatchError(self._dimension, vector.shape[0])

    @staticmethod
    def _resolve_path(path: str | Path) -> Path:
        """
        Normalize a save/load target to a ``.mvdb`` directory path.

        Appends ``.mvdb`` when missing. ``Path.with_suffix`` is deliberately
        NOT used here: it would *replace* an existing extension (e.g.
        ``backup.v2`` -> ``backup.mvdb``), breaking the documented
        "extension added automatically" contract.
        """
        path = Path(path)
        if not path.name.endswith(".mvdb"):
            path = path.with_name(path.name + ".mvdb")
        return path

    # ------------------------------------------------------------------ #
    # Mutation                                                           #
    # ------------------------------------------------------------------ #

    def add_node(
        self,
        vector: np.ndarray,
        document: str,
        metadata: dict[str, Any] | None = None,
        num_bits: int | None = None,
    ) -> int:
        """
        Add a single node to the database.

        Args:
            vector: The embedding vector. Must match the database dimension.
            document: The text or identifier associated with this vector.
            metadata: Optional dict of arbitrary key-value pairs.
            num_bits: If set, apply n-bit scalar quantization before storing.

        Returns:
            The index assigned to the new node.

        Raises:
            TypeError: If vector is not a numpy array.
            DimensionMismatchError: If vector dimension doesn't match the database.
        """
        self._validate_vector(vector)
        # Quantization already produces a fresh array; otherwise copy so the
        # stored vector is decoupled from the caller's array.
        v = (
            binary_quantization(vector, num_bits)
            if num_bits is not None
            else vector.copy()
        )
        node = Node(
            index=self._next_index,
            vector=v,
            document=document,
            metadata=metadata or {},
        )
        self._nodes[self._next_index] = node
        self._next_index += 1
        return node.index

    def add_nodes(
        self,
        vectors: np.ndarray | list[np.ndarray],
        documents: list[str],
        metadata: list[dict[str, Any] | None] | None = None,
    ) -> list[int]:
        """
        Add multiple nodes in a single call.

        All inputs are validated before any insertion occurs, so either all
        nodes are added or none are (fail-fast, not partial).

        Args:
            vectors: A 2-D numpy array (n, dim) or list of 1-D arrays.
            documents: List of document strings, one per vector.
            metadata: Optional list of metadata dicts, one per vector.

        Returns:
            List of assigned indexes, in the same order as the inputs.

        Raises:
            ValueError: If vectors and documents have different lengths.
            DimensionMismatchError: If any vector has wrong dimension.
        """
        if isinstance(vectors, np.ndarray):
            if vectors.ndim != 2:
                raise ValueError(
                    "Batch vectors array must be 2-D (n_vectors, dimension)"
                )
            vector_list: list[np.ndarray] = [
                vectors[i] for i in range(vectors.shape[0])
            ]
        else:
            vector_list = list(vectors)

        if len(vector_list) != len(documents):
            raise ValueError(
                f"vectors and documents must have the same length: "
                f"{len(vector_list)} != {len(documents)}"
            )
        if metadata is not None and len(metadata) != len(vector_list):
            raise ValueError(
                f"metadata list must match vectors length: "
                f"{len(metadata)} != {len(vector_list)}"
            )

        # Validate all before inserting any (add_node re-validates; harmless).
        for v in vector_list:
            self._validate_vector(v)

        metas: list[dict[str, Any] | None] = (
            metadata if metadata is not None else [None] * len(vector_list)
        )
        return [
            self.add_node(v, doc, meta)
            for v, doc, meta in zip(vector_list, documents, metas)
        ]

    def get_node(self, index: int) -> Node:
        """
        Retrieve a node by its index.

        Raises:
            NodeNotFoundError: If no node with that index exists.
        """
        if index not in self._nodes:
            raise NodeNotFoundError(index)
        return self._nodes[index]

    def update_node(
        self,
        index: int,
        vector: np.ndarray | None = None,
        document: str | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> None:
        """
        Update one or more fields of an existing node.

        Only provided fields are changed; omitted fields are left as-is.

        Raises:
            NodeNotFoundError: If no node with that index exists.
            DimensionMismatchError: If the new vector has wrong dimension.
        """
        node = self.get_node(index)
        if vector is not None:
            self._validate_vector(vector)
            node.vector = vector.copy()
        if document is not None:
            node.document = document
        if metadata is not None:
            node.metadata = metadata

    def remove_node(self, index: int) -> None:
        """
        Remove a node by its index.

        The index is permanently retired — it will not be reused for future nodes.

        Raises:
            NodeNotFoundError: If no node with that index exists.
        """
        if index not in self._nodes:
            raise NodeNotFoundError(index)
        del self._nodes[index]

    # ------------------------------------------------------------------ #
    # Search                                                             #
    # ------------------------------------------------------------------ #

    def search_top_k(
        self,
        query_vector: np.ndarray,
        k: int,
        metric: str = "cosine",
        filter_fn: Callable[[Node], bool] | None = None,
    ) -> list[SearchResult]:
        """
        Find the top-k most similar nodes to a query vector.

        Args:
            query_vector: The query embedding. Must match the database dimension.
            k: Number of results to return. If k > number of nodes, all nodes
                are returned (no error).
            metric: Distance metric — 'cosine', 'euclidean', or 'dot'.
            filter_fn: Optional predicate to pre-filter nodes before scoring.
                Only nodes where filter_fn(node) is True are considered.

        Returns:
            List of SearchResult objects sorted by score descending.
            Each result includes: index, document, score, metadata.

        Raises:
            EmptyDatabaseError: If the database has no nodes.
            DimensionMismatchError: If query_vector dimension doesn't match.
            ValueError: If k < 1 or metric is unknown.
        """
        if not self._nodes:
            raise EmptyDatabaseError()
        self._validate_vector(query_vector)
        if k < 1:
            raise ValueError(f"k must be >= 1, got {k}")

        sim_fn = get_metric(metric)
        candidates = (
            (n for n in self._nodes.values() if filter_fn(n))
            if filter_fn is not None
            else self._nodes.values()
        )

        results: list[SearchResult] = []
        for node in candidates:
            score = sim_fn(query_vector, node.vector)
            results.append(
                SearchResult(
                    index=node.index,
                    document=node.document,
                    score=score,
                    metadata=node.metadata,
                )
            )

        # SearchResult orders by score (__lt__), so reverse sort is descending.
        results.sort(reverse=True)
        return results[:k]

    def search_by_threshold(
        self,
        query_vector: np.ndarray,
        min_score: float,
        metric: str = "cosine",
    ) -> list[SearchResult]:
        """
        Return all nodes with similarity score >= min_score, sorted descending.

        Args:
            query_vector: The query embedding.
            min_score: Minimum score threshold (inclusive).
            metric: Distance metric — 'cosine', 'euclidean', or 'dot'.

        Returns:
            List of SearchResult objects with score >= min_score, sorted descending.

        Raises:
            EmptyDatabaseError: If the database has no nodes.
            DimensionMismatchError: If query_vector dimension doesn't match.
        """
        if not self._nodes:
            raise EmptyDatabaseError()
        self._validate_vector(query_vector)

        sim_fn = get_metric(metric)
        results: list[SearchResult] = []
        for node in self._nodes.values():
            score = sim_fn(query_vector, node.vector)
            if score >= min_score:
                results.append(
                    SearchResult(
                        index=node.index,
                        document=node.document,
                        score=score,
                        metadata=node.metadata,
                    )
                )
        results.sort(reverse=True)
        return results

    # ------------------------------------------------------------------ #
    # Persistence                                                        #
    # ------------------------------------------------------------------ #

    def save(self, path: str | Path) -> None:
        """
        Save the database to a ``.mvdb/`` directory.

        The directory contains:
        - ``index.json``: Node metadata, documents, and manifest.
        - ``vectors.npy``: All vectors as a 2-D numpy array.

        This format is safe (no arbitrary code execution unlike pickle),
        portable, and human-inspectable.

        Args:
            path: Destination path. The ``.mvdb`` extension is appended
                automatically if not already present (an existing extension
                is kept, not replaced).
        """
        path = self._resolve_path(path)
        path.mkdir(parents=True, exist_ok=True)

        indices = sorted(self._nodes.keys())
        manifest = {
            "version": "1.0",
            "dimension": self._dimension,
            "next_index": self._next_index,
            "count": len(indices),
            "nodes": [
                {
                    "index": i,
                    "document": self._nodes[i].document,
                    "metadata": self._nodes[i].metadata,
                    # Row of this node's vector inside vectors.npy.
                    "vector_row": row,
                }
                for row, i in enumerate(indices)
            ],
        }

        if indices:
            vector_matrix = np.stack([self._nodes[i].vector for i in indices])
        else:
            vector_matrix = np.empty((0, self._dimension), dtype=np.float32)

        with open(path / "index.json", "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps non-ASCII documents human-readable.
            json.dump(manifest, f, indent=2, ensure_ascii=False)
        np.save(str(path / "vectors.npy"), vector_matrix)

    @classmethod
    def load(cls, path: str | Path) -> MicroVectorDB:
        """
        Load a database from a ``.mvdb/`` directory.

        Args:
            path: Path to the ``.mvdb`` directory (extension appended if omitted).

        Returns:
            A fully restored MicroVectorDB instance.

        Raises:
            FileNotFoundError: If the path does not exist.
            ValueError: If the stored vector matrix is not 2-D (corrupt store).
        """
        path = cls._resolve_path(path)

        if not path.exists():
            raise FileNotFoundError(f"No database found at {path}")

        with open(path / "index.json", encoding="utf-8") as f:
            manifest = json.load(f)

        vectors = np.load(str(path / "vectors.npy"))
        if vectors.ndim != 2:
            raise ValueError(
                f"Corrupt database at {path}: vectors.npy must be 2-D, "
                f"got shape {vectors.shape}"
            )

        db = cls(dimension=manifest["dimension"])
        db._next_index = manifest["next_index"]

        for node_meta in manifest["nodes"]:
            node = Node(
                index=node_meta["index"],
                # Copy the row so each node owns its vector instead of
                # aliasing one shared matrix (mirrors add_node's copy).
                vector=vectors[node_meta["vector_row"]].copy(),
                document=node_meta["document"],
                metadata=node_meta["metadata"],
            )
            db._nodes[node.index] = node

        return db

    # ------------------------------------------------------------------ #
    # Dunder methods                                                     #
    # ------------------------------------------------------------------ #

    def __len__(self) -> int:
        """Return the number of nodes currently in the database."""
        return len(self._nodes)

    def __contains__(self, index: object) -> bool:
        """Check whether an index exists in the database."""
        return index in self._nodes

    def __repr__(self) -> str:
        return (
            f"MicroVectorDB(dimension={self._dimension}, "
            f"nodes={len(self._nodes)}, "
            f"next_index={self._next_index})"
        )

    def stats(self) -> dict[str, Any]:
        """
        Return a summary of the database state.

        Returns:
            Dict with keys: count, dimension, next_index, index_gaps, indices.
        """
        return {
            "count": len(self._nodes),
            "dimension": self._dimension,
            "next_index": self._next_index,
            # Retired (deleted) indexes: assigned so far minus still present.
            "index_gaps": self._next_index - len(self._nodes),
            "indices": sorted(self._nodes.keys()),
        }
@@ -0,0 +1,29 @@
1
+ """Custom exceptions for microvector."""
2
+
3
+
4
class MicroVectorError(Exception):
    """Root of the microvector exception hierarchy; catch this for any library error."""


class DimensionMismatchError(MicroVectorError):
    """Signals that a vector's length differs from the database's fixed dimension."""

    def __init__(self, expected: int, got: int) -> None:
        # Keep both sizes on the instance so callers can inspect them.
        self.expected = expected
        self.got = got
        super().__init__(f"Vector dimension mismatch: expected {expected}, got {got}")


class EmptyDatabaseError(MicroVectorError):
    """Signals that a search was attempted while the database holds no nodes."""

    def __init__(self) -> None:
        super().__init__("Cannot search an empty database.")


class NodeNotFoundError(MicroVectorError):
    """Signals a lookup, update, or removal against a non-existent node index."""

    def __init__(self, index: int) -> None:
        # Keep the offending index on the instance for programmatic handling.
        self.index = index
        super().__init__(f"No node found with index {index}.")
microvector/models.py ADDED
@@ -0,0 +1,36 @@
1
+ """Data models for microvector."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any
7
+
8
+ import numpy as np
9
+
10
+
11
@dataclass
class Node:
    """One database entry: an embedding plus its document text and metadata."""

    index: int
    vector: np.ndarray
    document: str
    metadata: dict[str, Any] = field(default_factory=dict)

    def __eq__(self, other: object) -> bool:
        """Nodes are equal when index and vector match; document/metadata are ignored."""
        if not isinstance(other, Node):
            return NotImplemented
        same_index = self.index == other.index
        return same_index and np.array_equal(self.vector, other.vector)
24
+
25
+
26
@dataclass(frozen=True)
class SearchResult:
    """Read-only record describing one match returned by a search method."""

    index: int
    document: str
    score: float
    metadata: dict[str, Any] = field(default_factory=dict)

    def __lt__(self, other: "SearchResult") -> bool:
        # Ordered by score only, so lists of results sort naturally.
        return self.score < other.score
@@ -0,0 +1,36 @@
1
+ """Vector quantization utilities."""
2
+
3
+ from typing import cast
4
+
5
+ import numpy as np
6
+
7
+
8
def binary_quantization(vector: np.ndarray, num_bits: int) -> np.ndarray:
    """
    Scalar-quantize a float vector into ``2**num_bits`` integer buckets.

    Args:
        vector: 1-D float array.
        num_bits: Number of bits (1-16). Common values: 4, 8.

    Returns:
        The bucket index of every element, as a float32 array.

    Note:
        A constant vector (min == max) maps entirely to bucket 0.
    """
    if not 1 <= num_bits <= 16:
        raise ValueError(f"num_bits must be between 1 and 16, got {num_bits}")
    if vector.ndim != 1:
        raise ValueError(f"vector must be 1-D, got shape {vector.shape}")

    bucket_count = 2**num_bits
    lo = float(np.min(vector))
    hi = float(np.max(vector))

    # Degenerate range: no spread to divide, so everything lands in bucket 0.
    if lo == hi:
        return cast(np.ndarray, np.zeros_like(vector, dtype=np.float32))

    # bucket_count buckets need bucket_count + 1 edges spanning [lo, hi].
    edges = np.linspace(lo, hi, num=bucket_count + 1, endpoint=True)
    bucket_ids = np.digitize(vector, edges, right=True) - 1
    bucket_ids = np.clip(bucket_ids, 0, bucket_count - 1)
    return cast(np.ndarray, bucket_ids.astype(np.float32))
@@ -0,0 +1,71 @@
1
+ """Similarity and distance functions for vector search."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Callable
6
+
7
+ import numpy as np
8
+
9
+
10
def _validate_shapes(a: np.ndarray, b: np.ndarray) -> None:
    """Raise ValueError unless both arrays share the same shape."""
    if a.shape == b.shape:
        return
    raise ValueError(f"Shape mismatch: {a.shape} vs {b.shape}")


def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """
    Cosine of the angle between two vectors, clipped to [-1.0, 1.0].

    A score of 1.0 means identical direction. If either vector has zero
    norm there is no direction to compare, so 0.0 is returned rather than
    raising on the zero division.
    """
    _validate_shapes(a, b)
    na = float(np.linalg.norm(a))
    nb = float(np.linalg.norm(b))
    if 0.0 in (na, nb):
        return 0.0
    cosine = np.dot(a, b) / (na * nb)
    # Clip guards against float round-off pushing slightly past +/-1.
    return float(np.clip(cosine, -1.0, 1.0))


def euclidean_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """
    Similarity derived from Euclidean distance: ``1 / (1 + distance)``.

    Maps distance [0, inf) onto (0, 1]; identical vectors score exactly 1.0,
    and larger distances monotonically approach 0.
    """
    _validate_shapes(a, b)
    distance = float(np.linalg.norm(a - b))
    return 1.0 / (1.0 + distance)


def dot_product_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """
    Plain dot product of the two vectors (unbounded range).

    Fastest metric; most meaningful when the embeddings are already
    unit-normalized.
    """
    _validate_shapes(a, b)
    return float(np.dot(a, b))


# Registry of metric name -> scoring function; keys are the public names
# accepted by the search APIs.
METRICS: dict[str, Callable[[np.ndarray, np.ndarray], float]] = {
    "cosine": cosine_similarity,
    "euclidean": euclidean_similarity,
    "dot": dot_product_similarity,
}


def get_metric(name: str) -> Callable[[np.ndarray, np.ndarray], float]:
    """Look up a similarity function by name; raise ValueError if unknown."""
    try:
        return METRICS[name]
    except KeyError:
        raise ValueError(
            f"Unknown metric '{name}'. Choose from: {sorted(METRICS.keys())}"
        ) from None