microvec 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- microvec-0.1.0/.claude/settings.local.json +22 -0
- microvec-0.1.0/.github/workflows/ci.yml +61 -0
- microvec-0.1.0/.gitignore +16 -0
- microvec-0.1.0/CHANGELOG.md +47 -0
- microvec-0.1.0/PKG-INFO +290 -0
- microvec-0.1.0/README.md +258 -0
- microvec-0.1.0/Screenshot 2024-02-24 at 10.41.04.png +0 -0
- microvec-0.1.0/microvector/__init__.py +36 -0
- microvec-0.1.0/microvector/core.py +436 -0
- microvec-0.1.0/microvector/exceptions.py +29 -0
- microvec-0.1.0/microvector/models.py +36 -0
- microvec-0.1.0/microvector/quantization.py +36 -0
- microvec-0.1.0/microvector/similarity.py +71 -0
- microvec-0.1.0/pyproject.toml +70 -0
- microvec-0.1.0/tests/__init__.py +0 -0
- microvec-0.1.0/tests/conftest.py +33 -0
- microvec-0.1.0/tests/test_core.py +305 -0
- microvec-0.1.0/tests/test_persistence.py +116 -0
- microvec-0.1.0/tests/test_quantization.py +57 -0
- microvec-0.1.0/tests/test_similarity.py +130 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
{
|
|
2
|
+
"permissions": {
|
|
3
|
+
"allow": [
|
|
4
|
+
"Bash(git init:*)",
|
|
5
|
+
"Bash(git remote:*)",
|
|
6
|
+
"Bash(git fetch:*)",
|
|
7
|
+
"Bash(git checkout:*)",
|
|
8
|
+
"Bash(python3 -m py_compile /Users/waltergonzalez/Documents/projects/microvec/microvector.py)",
|
|
9
|
+
"Bash(pip install:*)",
|
|
10
|
+
"Bash(pip3 install:*)",
|
|
11
|
+
"Bash(python3:*)",
|
|
12
|
+
"Bash(git add:*)",
|
|
13
|
+
"Bash(git rm:*)",
|
|
14
|
+
"Bash(git commit:*)",
|
|
15
|
+
"Bash(git push:*)",
|
|
16
|
+
"Bash(curl -s https://pypi.org/pypi/microvector/json)",
|
|
17
|
+
"Bash(curl -s -o /dev/null -w \"microvectordb: %{http_code}\\\\n\" https://pypi.org/pypi/microvectordb/json)",
|
|
18
|
+
"Bash(curl -s -o /dev/null -w \"microvec: %{http_code}\\\\n\" https://pypi.org/pypi/microvec/json)",
|
|
19
|
+
"Bash(curl -s -o /dev/null -w \"micro-vector-db: %{http_code}\\\\n\" https://pypi.org/pypi/micro-vector-db/json)"
|
|
20
|
+
]
|
|
21
|
+
}
|
|
22
|
+
}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main, develop]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
lint:
|
|
11
|
+
name: Lint
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
|
|
16
|
+
- name: Set up Python 3.12
|
|
17
|
+
uses: actions/setup-python@v5
|
|
18
|
+
with:
|
|
19
|
+
python-version: "3.12"
|
|
20
|
+
|
|
21
|
+
- name: Install dev dependencies
|
|
22
|
+
run: pip install -e ".[dev]"
|
|
23
|
+
|
|
24
|
+
- name: Check formatting (black)
|
|
25
|
+
run: black --check microvector/ tests/
|
|
26
|
+
|
|
27
|
+
- name: Lint (ruff)
|
|
28
|
+
run: ruff check microvector/ tests/
|
|
29
|
+
|
|
30
|
+
- name: Type check (mypy)
|
|
31
|
+
run: mypy microvector/
|
|
32
|
+
|
|
33
|
+
test:
|
|
34
|
+
name: Test Python ${{ matrix.python-version }}
|
|
35
|
+
runs-on: ubuntu-latest
|
|
36
|
+
needs: lint
|
|
37
|
+
strategy:
|
|
38
|
+
fail-fast: false
|
|
39
|
+
matrix:
|
|
40
|
+
python-version: ["3.9", "3.10", "3.11", "3.12"]
|
|
41
|
+
|
|
42
|
+
steps:
|
|
43
|
+
- uses: actions/checkout@v4
|
|
44
|
+
|
|
45
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
46
|
+
uses: actions/setup-python@v5
|
|
47
|
+
with:
|
|
48
|
+
python-version: ${{ matrix.python-version }}
|
|
49
|
+
|
|
50
|
+
- name: Install dependencies
|
|
51
|
+
run: pip install -e ".[dev]"
|
|
52
|
+
|
|
53
|
+
- name: Run tests with coverage
|
|
54
|
+
run: pytest --cov=microvector --cov-report=xml --cov-fail-under=90
|
|
55
|
+
|
|
56
|
+
- name: Upload coverage to Codecov
|
|
57
|
+
uses: codecov/codecov-action@v4
|
|
58
|
+
if: matrix.python-version == '3.12'
|
|
59
|
+
with:
|
|
60
|
+
files: ./coverage.xml
|
|
61
|
+
fail_ci_if_error: false
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format follows [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
|
6
|
+
This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
## [0.1.0] — 2026-03-24
|
|
11
|
+
|
|
12
|
+
Complete rewrite from prototype to production-grade library.
|
|
13
|
+
|
|
14
|
+
### Fixed
|
|
15
|
+
|
|
16
|
+
- **Critical: `IndentationError` in `add_node`** — the original module could not be imported at all due to a syntax error in the method definition.
|
|
17
|
+
- **Euclidean distance returned negative values** — the original returned `-np.linalg.norm(...)`, making far vectors rank as *more* similar than near ones. Fixed to `1 / (1 + distance)`, mapping to `(0, 1]` where higher always means more similar.
|
|
18
|
+
- **Cosine similarity crashed on zero-norm vectors** — division by zero when either vector was all zeros. Now returns `0.0` (no direction, no similarity).
|
|
19
|
+
- **Vector dimension never validated** — `self._dimension` was stored in `__init__` but never checked. Any vector of any size was silently accepted. Now raises `DimensionMismatchError` with clear expected/got values.
|
|
20
|
+
- **Search results dropped the document text** — `search_top_k` returned `(index, score)` tuples, forcing callers to do a separate lookup. Now returns `SearchResult` objects with `index`, `document`, `score`, and `metadata`.
|
|
21
|
+
- **Index counter could diverge after removal** — internal index tracking was done via a list with no gap management. Replaced with `dict[int, Node]` keyed by index; `_next_index` increments monotonically and retired indexes are never reused.
|
|
22
|
+
- **Quantization was untested and had edge cases** — constant vectors (min == max) would cause `linspace` issues. Added input validation, edge-case handling, and a full test suite to verify correctness.
|
|
23
|
+
|
|
24
|
+
### Added
|
|
25
|
+
|
|
26
|
+
- **Package structure** — reorganized from a single `microvector.py` file into a proper `microvector/` package with `core.py`, `similarity.py`, `quantization.py`, `models.py`, `exceptions.py`, and `__init__.py`.
|
|
27
|
+
- **`SearchResult` dataclass** — immutable result type with `index`, `document`, `score`, `metadata` fields. Supports sorting.
|
|
28
|
+
- **`Node` dataclass** — replaces internal dict representation.
|
|
29
|
+
- **Custom exceptions** — `MicroVectorError` (base), `DimensionMismatchError`, `EmptyDatabaseError`, `NodeNotFoundError`. All carry structured attributes for programmatic inspection.
|
|
30
|
+
- **`add_nodes()`** — batch insert with fail-fast validation (all inputs validated before any insertion).
|
|
31
|
+
- **`get_node()`** — retrieve a node by index.
|
|
32
|
+
- **`update_node()`** — update vector, document, and/or metadata in place.
|
|
33
|
+
- **`search_by_threshold()`** — return all nodes with score >= a minimum threshold.
|
|
34
|
+
- **`filter_fn` parameter on `search_top_k`** — pre-filter nodes before scoring with any Python predicate.
|
|
35
|
+
- **`dot` distance metric** — raw dot product, best for pre-normalized embeddings.
|
|
36
|
+
- **`__len__`, `__contains__`, `__repr__`** — standard Python dunder methods.
|
|
37
|
+
- **`stats()`** — returns count, dimension, next_index, index_gaps, indices.
|
|
38
|
+
- **`metadata` support** — arbitrary key-value dicts stored per node, returned in search results.
|
|
39
|
+
- **Safe persistence** — replaced `pickle` with JSON manifest + numpy `.npy` file in a `.mvdb/` directory. Safe against arbitrary code execution, portable, human-inspectable.
|
|
40
|
+
- **`pyproject.toml`** — proper packaging with hatchling, PyPI classifiers, optional dev dependencies.
|
|
41
|
+
- **Full test suite** — 107 tests, 98.5% coverage across all modules.
|
|
42
|
+
- **CI/CD** — GitHub Actions workflow: lint (black + ruff + mypy) then test across Python 3.9, 3.10, 3.11, 3.12.
|
|
43
|
+
|
|
44
|
+
### Changed
|
|
45
|
+
|
|
46
|
+
- `save_to_disk` / `read_from_disk` → `save` / `MicroVectorDB.load` (classmethod). New format is a `.mvdb/` directory instead of a single pickle file.
|
|
47
|
+
- `search_top_k` now accepts `metric=` keyword (was `distance_metric=`) and returns `list[SearchResult]` instead of `list[tuple]`.
|
microvec-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,290 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: microvec
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A lightweight, production-grade in-memory vector database
|
|
5
|
+
Project-URL: Homepage, https://github.com/huolter/microVector
|
|
6
|
+
Project-URL: Repository, https://github.com/huolter/microVector
|
|
7
|
+
Project-URL: Bug Tracker, https://github.com/huolter/microVector/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/huolter/microVector/blob/main/CHANGELOG.md
|
|
9
|
+
Author: huolter
|
|
10
|
+
License: MIT
|
|
11
|
+
Keywords: database,embeddings,rag,search,similarity,vector
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Database
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Classifier: Typing :: Typed
|
|
23
|
+
Requires-Python: >=3.9
|
|
24
|
+
Requires-Dist: numpy>=1.21.0
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: black>=23.0; extra == 'dev'
|
|
27
|
+
Requires-Dist: mypy>=1.5; extra == 'dev'
|
|
28
|
+
Requires-Dist: pytest-cov>=4.1; extra == 'dev'
|
|
29
|
+
Requires-Dist: pytest>=7.4; extra == 'dev'
|
|
30
|
+
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
|
|
33
|
+
# MicroVector
|
|
34
|
+
|
|
35
|
+
A lightweight, production-grade in-memory vector database for Python.
|
|
36
|
+
|
|
37
|
+
```
|
|
38
|
+
pip install microvec
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
No external services. No complex setup. Just numpy and your embeddings.
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## Features
|
|
46
|
+
|
|
47
|
+
- **Fast similarity search** — cosine, euclidean, and dot product metrics
|
|
48
|
+
- **Rich results** — every search result includes the document text and metadata, not just an index
|
|
49
|
+
- **Batch operations** — insert thousands of vectors in one call
|
|
50
|
+
- **Metadata filtering** — filter candidates before scoring with any Python predicate
|
|
51
|
+
- **Safe persistence** — JSON + numpy format (no pickle, no security risk)
|
|
52
|
+
- **Full type hints** — works great with mypy and IDEs
|
|
53
|
+
- **Zero required dependencies** beyond numpy
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## Quick Start
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
import numpy as np
|
|
61
|
+
from microvector import MicroVectorDB
|
|
62
|
+
|
|
63
|
+
# Create a database for 768-dimensional embeddings (e.g. sentence-transformers all-mpnet-base-v2)
|
|
64
|
+
db = MicroVectorDB(dimension=768)
|
|
65
|
+
|
|
66
|
+
# Add documents with their embeddings (embed() is a placeholder for your own embedding function)
|
|
67
|
+
db.add_node(embed("Paris is the capital of France"), "Paris is the capital of France")
|
|
68
|
+
db.add_node(embed("The Eiffel Tower is in Paris"), "The Eiffel Tower is in Paris")
|
|
69
|
+
db.add_node(embed("Python is a programming language"), "Python is a programming language")
|
|
70
|
+
|
|
71
|
+
# Search — results include document text, not just indexes
|
|
72
|
+
results = db.search_top_k(embed("What city is the Eiffel Tower in?"), k=2)
|
|
73
|
+
for r in results:
|
|
74
|
+
print(f"{r.score:.3f} {r.document}")
|
|
75
|
+
# 0.921 The Eiffel Tower is in Paris
|
|
76
|
+
# 0.887 Paris is the capital of France
|
|
77
|
+
|
|
78
|
+
# Save and reload
|
|
79
|
+
db.save("my_knowledge_base")
|
|
80
|
+
db = MicroVectorDB.load("my_knowledge_base")
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
## Installation
|
|
86
|
+
|
|
87
|
+
**Requires Python 3.9+ and numpy >= 1.21.**
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
pip install microvec  # PyPI name is "microvec"; the import name is "microvector"
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
Or from source:
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
git clone https://github.com/huolter/microVector
|
|
97
|
+
cd microVector
|
|
98
|
+
pip install -e ".[dev]"
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
## API Reference
|
|
104
|
+
|
|
105
|
+
### `MicroVectorDB(dimension)`
|
|
106
|
+
|
|
107
|
+
Create a new database. All vectors must have this exact dimension.
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
db = MicroVectorDB(dimension=512)
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
|
|
115
|
+
### `add_node(vector, document, metadata=None, num_bits=None) → int`
|
|
116
|
+
|
|
117
|
+
Add a single vector. Returns the assigned index.
|
|
118
|
+
|
|
119
|
+
```python
|
|
120
|
+
idx = db.add_node(
|
|
121
|
+
vector=np.array([...]),
|
|
122
|
+
document="The text this vector represents",
|
|
123
|
+
metadata={"source": "wikipedia", "date": "2024-01"},
|
|
124
|
+
num_bits=8, # optional: apply 8-bit scalar quantization
|
|
125
|
+
)
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
| Parameter | Type | Description |
|
|
129
|
+
|-----------|------|-------------|
|
|
130
|
+
| `vector` | `np.ndarray` | 1-D array matching the database dimension |
|
|
131
|
+
| `document` | `str` | Text or identifier associated with this vector |
|
|
132
|
+
| `metadata` | `dict` | Optional key-value pairs stored alongside the vector |
|
|
133
|
+
| `num_bits` | `int` | Optional quantization (1–16 bits). Reduces memory at the cost of precision. |
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
### `add_nodes(vectors, documents, metadata=None) → list[int]`
|
|
138
|
+
|
|
139
|
+
Batch insert. All inputs are validated before any insertion occurs.
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
import numpy as np
|
|
143
|
+
|
|
144
|
+
vectors = np.random.rand(1000, 512)
|
|
145
|
+
docs = [f"Document {i}" for i in range(1000)]
|
|
146
|
+
indices = db.add_nodes(vectors, docs)
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
---
|
|
150
|
+
|
|
151
|
+
### `search_top_k(query_vector, k, metric='cosine', filter_fn=None) → list[SearchResult]`
|
|
152
|
+
|
|
153
|
+
Find the top-k most similar vectors. Returns results sorted by score descending.
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
results = db.search_top_k(query, k=5)
|
|
157
|
+
results = db.search_top_k(query, k=5, metric="euclidean")
|
|
158
|
+
|
|
159
|
+
# Filter before scoring — only consider documents tagged "news"
|
|
160
|
+
results = db.search_top_k(
|
|
161
|
+
query, k=5,
|
|
162
|
+
filter_fn=lambda node: node.metadata.get("type") == "news"
|
|
163
|
+
)
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
**Metrics:**
|
|
167
|
+
|
|
168
|
+
| Metric | Range | Notes |
|
|
169
|
+
|--------|-------|-------|
|
|
170
|
+
| `cosine` | [-1, 1] | Best for text embeddings. Direction-based, scale-invariant. |
|
|
171
|
+
| `euclidean` | (0, 1] | `1 / (1 + dist)`. Identical vectors = 1.0. |
|
|
172
|
+
| `dot` | (-∞, +∞) | Fastest. Best for pre-normalized unit vectors. |
|
|
173
|
+
|
|
174
|
+
**`SearchResult` fields:**
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
result.index # int — node's assigned index
|
|
178
|
+
result.document # str — the document text
|
|
179
|
+
result.score # float — similarity score (higher = more similar)
|
|
180
|
+
result.metadata # dict — metadata stored with this node
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
---
|
|
184
|
+
|
|
185
|
+
### `search_by_threshold(query_vector, min_score, metric='cosine') → list[SearchResult]`
|
|
186
|
+
|
|
187
|
+
Return all nodes with score >= `min_score`, sorted descending.
|
|
188
|
+
|
|
189
|
+
```python
|
|
190
|
+
results = db.search_by_threshold(query, min_score=0.8)
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
---
|
|
194
|
+
|
|
195
|
+
### `get_node(index) → Node`
|
|
196
|
+
|
|
197
|
+
Retrieve a node by index.
|
|
198
|
+
|
|
199
|
+
```python
|
|
200
|
+
node = db.get_node(42)
|
|
201
|
+
print(node.document, node.metadata)
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
---
|
|
205
|
+
|
|
206
|
+
### `update_node(index, vector=None, document=None, metadata=None)`
|
|
207
|
+
|
|
208
|
+
Update fields of an existing node. Omit fields to leave them unchanged.
|
|
209
|
+
|
|
210
|
+
```python
|
|
211
|
+
db.update_node(42, document="Updated text")
|
|
212
|
+
db.update_node(42, metadata={"status": "reviewed"})
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
---
|
|
216
|
+
|
|
217
|
+
### `remove_node(index)`
|
|
218
|
+
|
|
219
|
+
Remove a node. Its index is permanently retired (never reused).
|
|
220
|
+
|
|
221
|
+
```python
|
|
222
|
+
db.remove_node(42)
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
---
|
|
226
|
+
|
|
227
|
+
### `save(path)` / `MicroVectorDB.load(path)`
|
|
228
|
+
|
|
229
|
+
Persist to and restore from a `.mvdb/` directory. Safe format: JSON + numpy binary.
|
|
230
|
+
|
|
231
|
+
```python
|
|
232
|
+
db.save("my_db") # creates my_db.mvdb/
|
|
233
|
+
db = MicroVectorDB.load("my_db")
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
The `.mvdb/` directory contains:
|
|
237
|
+
- `index.json` — node metadata and documents (human-readable)
|
|
238
|
+
- `vectors.npy` — all vectors as a 2-D numpy array
|
|
239
|
+
|
|
240
|
+
---
|
|
241
|
+
|
|
242
|
+
### Utility methods
|
|
243
|
+
|
|
244
|
+
```python
|
|
245
|
+
len(db) # number of nodes
|
|
246
|
+
42 in db # check if index exists
|
|
247
|
+
repr(db) # MicroVectorDB(dimension=512, nodes=1000, next_index=1000)
|
|
248
|
+
db.stats() # {'count': 1000, 'dimension': 512, 'next_index': 1000, ...}
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
---
|
|
252
|
+
|
|
253
|
+
## Exceptions
|
|
254
|
+
|
|
255
|
+
All exceptions inherit from `MicroVectorError`.
|
|
256
|
+
|
|
257
|
+
```python
|
|
258
|
+
from microvector import (
|
|
259
|
+
MicroVectorError,
|
|
260
|
+
DimensionMismatchError, # wrong vector dimension
|
|
261
|
+
EmptyDatabaseError, # search on empty database
|
|
262
|
+
NodeNotFoundError, # get/update/remove non-existent index
|
|
263
|
+
)
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
---
|
|
267
|
+
|
|
268
|
+
## Design Notes
|
|
269
|
+
|
|
270
|
+
**Why in-memory?** MicroVector is designed for small-to-medium datasets (up to ~100k vectors) where the simplicity of an in-process Python library outweighs the need for a dedicated service. It starts instantly, needs no configuration, and is trivially embeddable in any Python application.
|
|
271
|
+
|
|
272
|
+
**Why not pickle?** Pickle can execute arbitrary code when loading untrusted files. MicroVector uses a JSON manifest plus a numpy binary file, which is safe and portable; the JSON manifest can be inspected with any text editor.
|
|
273
|
+
|
|
274
|
+
**Index monotonicity.** Deleted indexes are never reused. This prevents stale external references from silently pointing to new data.
|
|
275
|
+
|
|
276
|
+
---
|
|
277
|
+
|
|
278
|
+
## Roadmap
|
|
279
|
+
|
|
280
|
+
- [ ] Approximate nearest neighbor (HNSW)
|
|
281
|
+
- [ ] Voronoi cell indexing
|
|
282
|
+
- [ ] FAISS optional backend for large datasets
|
|
283
|
+
- [ ] Async search support
|
|
284
|
+
- [ ] Benchmarks
|
|
285
|
+
|
|
286
|
+
---
|
|
287
|
+
|
|
288
|
+
## License
|
|
289
|
+
|
|
290
|
+
MIT
|
microvec-0.1.0/README.md
ADDED
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
# MicroVector
|
|
2
|
+
|
|
3
|
+
A lightweight, production-grade in-memory vector database for Python.
|
|
4
|
+
|
|
5
|
+
```
|
|
6
|
+
pip install microvec
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
No external services. No complex setup. Just numpy and your embeddings.
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## Features
|
|
14
|
+
|
|
15
|
+
- **Fast similarity search** — cosine, euclidean, and dot product metrics
|
|
16
|
+
- **Rich results** — every search result includes the document text and metadata, not just an index
|
|
17
|
+
- **Batch operations** — insert thousands of vectors in one call
|
|
18
|
+
- **Metadata filtering** — filter candidates before scoring with any Python predicate
|
|
19
|
+
- **Safe persistence** — JSON + numpy format (no pickle, no security risk)
|
|
20
|
+
- **Full type hints** — works great with mypy and IDEs
|
|
21
|
+
- **Zero required dependencies** beyond numpy
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Quick Start
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
import numpy as np
|
|
29
|
+
from microvector import MicroVectorDB
|
|
30
|
+
|
|
31
|
+
# Create a database for 768-dimensional embeddings (e.g. sentence-transformers all-mpnet-base-v2)
|
|
32
|
+
db = MicroVectorDB(dimension=768)
|
|
33
|
+
|
|
34
|
+
# Add documents with their embeddings (embed() is a placeholder for your own embedding function)
|
|
35
|
+
db.add_node(embed("Paris is the capital of France"), "Paris is the capital of France")
|
|
36
|
+
db.add_node(embed("The Eiffel Tower is in Paris"), "The Eiffel Tower is in Paris")
|
|
37
|
+
db.add_node(embed("Python is a programming language"), "Python is a programming language")
|
|
38
|
+
|
|
39
|
+
# Search — results include document text, not just indexes
|
|
40
|
+
results = db.search_top_k(embed("What city is the Eiffel Tower in?"), k=2)
|
|
41
|
+
for r in results:
|
|
42
|
+
print(f"{r.score:.3f} {r.document}")
|
|
43
|
+
# 0.921 The Eiffel Tower is in Paris
|
|
44
|
+
# 0.887 Paris is the capital of France
|
|
45
|
+
|
|
46
|
+
# Save and reload
|
|
47
|
+
db.save("my_knowledge_base")
|
|
48
|
+
db = MicroVectorDB.load("my_knowledge_base")
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## Installation
|
|
54
|
+
|
|
55
|
+
**Requires Python 3.9+ and numpy >= 1.21.**
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
pip install microvec  # PyPI name is "microvec"; the import name is "microvector"
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
Or from source:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
git clone https://github.com/huolter/microVector
|
|
65
|
+
cd microVector
|
|
66
|
+
pip install -e ".[dev]"
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
## API Reference
|
|
72
|
+
|
|
73
|
+
### `MicroVectorDB(dimension)`
|
|
74
|
+
|
|
75
|
+
Create a new database. All vectors must have this exact dimension.
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
db = MicroVectorDB(dimension=512)
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
|
|
83
|
+
### `add_node(vector, document, metadata=None, num_bits=None) → int`
|
|
84
|
+
|
|
85
|
+
Add a single vector. Returns the assigned index.
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
idx = db.add_node(
|
|
89
|
+
vector=np.array([...]),
|
|
90
|
+
document="The text this vector represents",
|
|
91
|
+
metadata={"source": "wikipedia", "date": "2024-01"},
|
|
92
|
+
num_bits=8, # optional: apply 8-bit scalar quantization
|
|
93
|
+
)
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
| Parameter | Type | Description |
|
|
97
|
+
|-----------|------|-------------|
|
|
98
|
+
| `vector` | `np.ndarray` | 1-D array matching the database dimension |
|
|
99
|
+
| `document` | `str` | Text or identifier associated with this vector |
|
|
100
|
+
| `metadata` | `dict` | Optional key-value pairs stored alongside the vector |
|
|
101
|
+
| `num_bits` | `int` | Optional quantization (1–16 bits). Reduces memory at the cost of precision. |
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
### `add_nodes(vectors, documents, metadata=None) → list[int]`
|
|
106
|
+
|
|
107
|
+
Batch insert. All inputs are validated before any insertion occurs.
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
import numpy as np
|
|
111
|
+
|
|
112
|
+
vectors = np.random.rand(1000, 512)
|
|
113
|
+
docs = [f"Document {i}" for i in range(1000)]
|
|
114
|
+
indices = db.add_nodes(vectors, docs)
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
---
|
|
118
|
+
|
|
119
|
+
### `search_top_k(query_vector, k, metric='cosine', filter_fn=None) → list[SearchResult]`
|
|
120
|
+
|
|
121
|
+
Find the top-k most similar vectors. Returns results sorted by score descending.
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
results = db.search_top_k(query, k=5)
|
|
125
|
+
results = db.search_top_k(query, k=5, metric="euclidean")
|
|
126
|
+
|
|
127
|
+
# Filter before scoring — only consider documents tagged "news"
|
|
128
|
+
results = db.search_top_k(
|
|
129
|
+
query, k=5,
|
|
130
|
+
filter_fn=lambda node: node.metadata.get("type") == "news"
|
|
131
|
+
)
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
**Metrics:**
|
|
135
|
+
|
|
136
|
+
| Metric | Range | Notes |
|
|
137
|
+
|--------|-------|-------|
|
|
138
|
+
| `cosine` | [-1, 1] | Best for text embeddings. Direction-based, scale-invariant. |
|
|
139
|
+
| `euclidean` | (0, 1] | `1 / (1 + dist)`. Identical vectors = 1.0. |
|
|
140
|
+
| `dot` | (-∞, +∞) | Fastest. Best for pre-normalized unit vectors. |
|
|
141
|
+
|
|
142
|
+
**`SearchResult` fields:**
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
result.index # int — node's assigned index
|
|
146
|
+
result.document # str — the document text
|
|
147
|
+
result.score # float — similarity score (higher = more similar)
|
|
148
|
+
result.metadata # dict — metadata stored with this node
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
---
|
|
152
|
+
|
|
153
|
+
### `search_by_threshold(query_vector, min_score, metric='cosine') → list[SearchResult]`
|
|
154
|
+
|
|
155
|
+
Return all nodes with score >= `min_score`, sorted descending.
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
results = db.search_by_threshold(query, min_score=0.8)
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
---
|
|
162
|
+
|
|
163
|
+
### `get_node(index) → Node`
|
|
164
|
+
|
|
165
|
+
Retrieve a node by index.
|
|
166
|
+
|
|
167
|
+
```python
|
|
168
|
+
node = db.get_node(42)
|
|
169
|
+
print(node.document, node.metadata)
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
---
|
|
173
|
+
|
|
174
|
+
### `update_node(index, vector=None, document=None, metadata=None)`
|
|
175
|
+
|
|
176
|
+
Update fields of an existing node. Omit fields to leave them unchanged.
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
db.update_node(42, document="Updated text")
|
|
180
|
+
db.update_node(42, metadata={"status": "reviewed"})
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
---
|
|
184
|
+
|
|
185
|
+
### `remove_node(index)`
|
|
186
|
+
|
|
187
|
+
Remove a node. Its index is permanently retired (never reused).
|
|
188
|
+
|
|
189
|
+
```python
|
|
190
|
+
db.remove_node(42)
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
---
|
|
194
|
+
|
|
195
|
+
### `save(path)` / `MicroVectorDB.load(path)`
|
|
196
|
+
|
|
197
|
+
Persist to and restore from a `.mvdb/` directory. Safe format: JSON + numpy binary.
|
|
198
|
+
|
|
199
|
+
```python
|
|
200
|
+
db.save("my_db") # creates my_db.mvdb/
|
|
201
|
+
db = MicroVectorDB.load("my_db")
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
The `.mvdb/` directory contains:
|
|
205
|
+
- `index.json` — node metadata and documents (human-readable)
|
|
206
|
+
- `vectors.npy` — all vectors as a 2-D numpy array
|
|
207
|
+
|
|
208
|
+
---
|
|
209
|
+
|
|
210
|
+
### Utility methods
|
|
211
|
+
|
|
212
|
+
```python
|
|
213
|
+
len(db) # number of nodes
|
|
214
|
+
42 in db # check if index exists
|
|
215
|
+
repr(db) # MicroVectorDB(dimension=512, nodes=1000, next_index=1000)
|
|
216
|
+
db.stats() # {'count': 1000, 'dimension': 512, 'next_index': 1000, ...}
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
---
|
|
220
|
+
|
|
221
|
+
## Exceptions
|
|
222
|
+
|
|
223
|
+
All exceptions inherit from `MicroVectorError`.
|
|
224
|
+
|
|
225
|
+
```python
|
|
226
|
+
from microvector import (
|
|
227
|
+
MicroVectorError,
|
|
228
|
+
DimensionMismatchError, # wrong vector dimension
|
|
229
|
+
EmptyDatabaseError, # search on empty database
|
|
230
|
+
NodeNotFoundError, # get/update/remove non-existent index
|
|
231
|
+
)
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
---
|
|
235
|
+
|
|
236
|
+
## Design Notes
|
|
237
|
+
|
|
238
|
+
**Why in-memory?** MicroVector is designed for small-to-medium datasets (up to ~100k vectors) where the simplicity of an in-process Python library outweighs the need for a dedicated service. It starts instantly, needs no configuration, and is trivially embeddable in any Python application.
|
|
239
|
+
|
|
240
|
+
**Why not pickle?** Pickle can execute arbitrary code when loading untrusted files. MicroVector uses a JSON manifest plus a numpy binary file, which is safe and portable; the JSON manifest can be inspected with any text editor.
|
|
241
|
+
|
|
242
|
+
**Index monotonicity.** Deleted indexes are never reused. This prevents stale external references from silently pointing to new data.
|
|
243
|
+
|
|
244
|
+
---
|
|
245
|
+
|
|
246
|
+
## Roadmap
|
|
247
|
+
|
|
248
|
+
- [ ] Approximate nearest neighbor (HNSW)
|
|
249
|
+
- [ ] Voronoi cell indexing
|
|
250
|
+
- [ ] FAISS optional backend for large datasets
|
|
251
|
+
- [ ] Async search support
|
|
252
|
+
- [ ] Benchmarks
|
|
253
|
+
|
|
254
|
+
---
|
|
255
|
+
|
|
256
|
+
## License
|
|
257
|
+
|
|
258
|
+
MIT
|
|
Binary file
|