vegamdb 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vegamdb-0.1.0/.gitignore +16 -0
- vegamdb-0.1.0/CMakeLists.txt +57 -0
- vegamdb-0.1.0/PKG-INFO +244 -0
- vegamdb-0.1.0/README.md +234 -0
- vegamdb-0.1.0/benchmarks/annoy_benchmark.py +121 -0
- vegamdb-0.1.0/benchmarks/ivf_benchmarks.py +167 -0
- vegamdb-0.1.0/benchmarks/persistence_test.py +161 -0
- vegamdb-0.1.0/benchmarks/stress_test.py +78 -0
- vegamdb-0.1.0/benchmarks/test_kmeans.py +56 -0
- vegamdb-0.1.0/include/VegamDB.hpp +33 -0
- vegamdb-0.1.0/include/indexes/AnnoyIndex.hpp +71 -0
- vegamdb-0.1.0/include/indexes/FlatIndex.hpp +30 -0
- vegamdb-0.1.0/include/indexes/IVFIndex.hpp +43 -0
- vegamdb-0.1.0/include/indexes/IndexBase.hpp +31 -0
- vegamdb-0.1.0/include/indexes/KMeans.hpp +80 -0
- vegamdb-0.1.0/include/storage/VectorStore.hpp +26 -0
- vegamdb-0.1.0/include/utils/Math.hpp +47 -0
- vegamdb-0.1.0/pyproject.toml +34 -0
- vegamdb-0.1.0/requirements.txt +1 -0
- vegamdb-0.1.0/src/VegamDB.cpp +92 -0
- vegamdb-0.1.0/src/bindings.cpp +198 -0
- vegamdb-0.1.0/src/indexes/AnnoyIndex.cpp +291 -0
- vegamdb-0.1.0/src/indexes/FlatIndex.cpp +53 -0
- vegamdb-0.1.0/src/indexes/IVFIndex.cpp +130 -0
- vegamdb-0.1.0/src/indexes/KMeans.cpp +152 -0
- vegamdb-0.1.0/src/storage/VectorStore.cpp +66 -0
- vegamdb-0.1.0/src/utils/Math.cpp +51 -0
- vegamdb-0.1.0/test.py +109 -0
- vegamdb-0.1.0/vegamdb/__init__.py +13 -0
- vegamdb-0.1.0/vegamdb/_vegamdb.pyi +206 -0
vegamdb-0.1.0/CMakeLists.txt
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# Build configuration for the VegamDB Python extension module (_vegamdb).
cmake_minimum_required(VERSION 3.15)

# Only a C++ toolchain is used; do not require a C compiler.
project(VectorDBProject LANGUAGES CXX)

# C++17 is required for every target configured below.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Emit compile_commands.json for editors and static-analysis tooling.
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# Tuning switches. Both default to ON, which keeps the flags this project has
# always been built with. Turn VEGAMDB_NATIVE_ARCH OFF when the resulting
# binary must run on a different CPU than the build machine (e.g. wheels).
option(VEGAMDB_NATIVE_ARCH
  "Tune for the build machine's CPU (-march=native, or /arch:AVX2 with MSVC)"
  ON)
option(VEGAMDB_FAST_MATH
  "Compile with -ffast-math (GCC/Clang only; relaxes IEEE-754 semantics)"
  ON)

# pybind11 is fetched at configure time and provides pybind11_add_module().
include(FetchContent)
FetchContent_Declare(
  pybind11
  GIT_REPOSITORY https://github.com/pybind/pybind11
  GIT_TAG v2.11.1
  GIT_SHALLOW TRUE
)
FetchContent_MakeAvailable(pybind11)

# The Python extension module. Sources are listed explicitly so that adding or
# removing a file shows up in review.
pybind11_add_module(_vegamdb
  src/VegamDB.cpp
  src/bindings.cpp
  src/indexes/FlatIndex.cpp
  src/indexes/IVFIndex.cpp
  src/indexes/AnnoyIndex.cpp
  src/indexes/KMeans.cpp
  src/storage/VectorStore.cpp
  src/utils/Math.cpp
)

# Headers live in ./include; scope the search path to this target only.
target_include_directories(_vegamdb
  PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include"
)

# Assemble the optimization flags for the active compiler.
set(vegamdb_tuning_flags "")
if(MSVC)
  set(vegamdb_opt_level /O2)
  if(VEGAMDB_NATIVE_ARCH)
    list(APPEND vegamdb_tuning_flags /arch:AVX2)
  endif()
else()
  set(vegamdb_opt_level -O3)
  if(VEGAMDB_NATIVE_ARCH)
    # Not every compiler/architecture combination accepts -march=native,
    # so probe for it instead of failing the build.
    include(CheckCXXCompilerFlag)
    check_cxx_compiler_flag(-march=native VEGAMDB_HAS_MARCH_NATIVE)
    if(VEGAMDB_HAS_MARCH_NATIVE)
      list(APPEND vegamdb_tuning_flags -march=native)
    else()
      message(STATUS "VegamDB: compiler rejects -march=native; skipping it")
    endif()
  endif()
  if(VEGAMDB_FAST_MATH)
    list(APPEND vegamdb_tuning_flags -ffast-math)
  endif()
endif()

# The optimization level is withheld from Debug builds: with MSVC, /O2 is
# incompatible with the /RTC1 flag in CMake's default Debug flags, and with
# GCC/Clang it would make Debug binaries hard to step through. Every other
# configuration (including an unset build type) gets it, as before.
target_compile_options(_vegamdb PRIVATE
  "$<$<NOT:$<CONFIG:Debug>>:${vegamdb_opt_level}>"
  ${vegamdb_tuning_flags}
)

# Install the compiled module into the "vegamdb" Python package directory.
install(TARGETS _vegamdb DESTINATION vegamdb)
|
vegamdb-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: vegamdb
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A high-performance vector database written in C++ with Python bindings
|
|
5
|
+
Author: Naredla Ajay Kumar Reddy
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
|
+
Requires-Dist: numpy
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
|
|
11
|
+
# VegamDB
|
|
12
|
+
|
|
13
|
+
A high-performance vector database written in C++ with Python bindings. VegamDB provides fast nearest neighbor search with pluggable index types, zero-copy NumPy integration, and built-in persistence.
|
|
14
|
+
|
|
15
|
+
## Features
|
|
16
|
+
|
|
17
|
+
- **Multiple Index Types** -- Flat (exact brute-force), IVF (inverted file with K-Means), and Annoy (random projection trees)
|
|
18
|
+
- **C++ Core** -- All indexing and search logic runs in optimized C++17 with `-O3` and `-march=native`
|
|
19
|
+
- **Zero-Copy NumPy** -- Vectors pass directly from NumPy arrays to C++ via pointer, with no intermediate copies
|
|
20
|
+
- **Persistence** -- Save and load the entire database (vectors + index) to a single binary file
|
|
21
|
+
- **Pluggable Architecture** -- Switch index types at runtime without changing application code
|
|
22
|
+
- **Type-Safe Python API** -- Full type stubs (`.pyi`) for IDE autocomplete and static analysis
|
|
23
|
+
|
|
24
|
+
## Installation
|
|
25
|
+
|
|
26
|
+
### From Source
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
git clone https://github.com/LuciAkirami/vegamdb.git
|
|
30
|
+
cd vegamdb
|
|
31
|
+
pip install .
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
### Requirements
|
|
35
|
+
|
|
36
|
+
- Python >= 3.8
|
|
37
|
+
- CMake >= 3.15
|
|
38
|
+
- A C++17 compatible compiler (GCC 7+, Clang 5+, MSVC 2017+)
|
|
39
|
+
- NumPy
|
|
40
|
+
|
|
41
|
+
### Development Install
|
|
42
|
+
|
|
43
|
+
For development, use an editable install so changes to Python files take effect immediately:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
pip install scikit-build-core pybind11 numpy
|
|
47
|
+
pip install -e . --no-build-isolation
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Quick Start
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
import numpy as np
|
|
54
|
+
from vegamdb import VegamDB
|
|
55
|
+
|
|
56
|
+
# Create a database
|
|
57
|
+
db = VegamDB()
|
|
58
|
+
|
|
59
|
+
# Add vectors
|
|
60
|
+
data = np.random.random((10000, 128)).astype(np.float32)
|
|
61
|
+
for vec in data:
|
|
62
|
+
db.add_vector_numpy(vec)
|
|
63
|
+
|
|
64
|
+
# Search (defaults to exact flat search)
|
|
65
|
+
query = np.random.random(128).astype(np.float32)
|
|
66
|
+
results = db.search(query, k=5)
|
|
67
|
+
|
|
68
|
+
print(results.ids) # [4823, 1092, 7744, 331, 5619]
|
|
69
|
+
print(results.distances) # [4.12, 4.15, 4.18, 4.21, 4.23]
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## Index Types
|
|
73
|
+
|
|
74
|
+
VegamDB supports three index types, each offering a different trade-off between speed and accuracy.
|
|
75
|
+
|
|
76
|
+
### Flat Index (Default)
|
|
77
|
+
|
|
78
|
+
Exact brute-force search. Computes the Euclidean distance between the query and every stored vector. Always returns the true nearest neighbors.
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
db.use_flat_index()
|
|
82
|
+
results = db.search(query, k=10)
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
| Metric | Value |
|
|
86
|
+
| ------------ | ------------ |
|
|
87
|
+
| Accuracy | 100% |
|
|
88
|
+
| Build Time | None |
|
|
89
|
+
| Best For | Small datasets (< 50K vectors), ground truth validation |
|
|
90
|
+
|
|
91
|
+
### IVF Index (Inverted File)
|
|
92
|
+
|
|
93
|
+
Partitions vectors into clusters using K-Means. At query time, only the closest clusters are searched, trading some accuracy for a large speedup.
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
db.use_ivf_index(n_clusters=100, max_iters=20, n_probe=1)
|
|
97
|
+
db.build_index()
|
|
98
|
+
|
|
99
|
+
# Search with custom probe count
|
|
100
|
+
from vegamdb import IVFSearchParams
|
|
101
|
+
params = IVFSearchParams()
|
|
102
|
+
params.n_probe = 10 # Search 10 of 100 clusters
|
|
103
|
+
results = db.search(query, k=10, params=params)
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
| Parameter | Description | Default |
|
|
107
|
+
| ------------- | ------------------------------------------------ | ------- |
|
|
108
|
+
| `n_clusters` | Number of Voronoi cells (partitions) | -- |
|
|
109
|
+
| `max_iters` | Maximum K-Means training iterations | 50 |
|
|
110
|
+
| `n_probe` | Clusters to search at query time | 1 |
|
|
111
|
+
|
|
112
|
+
### Annoy Index (Approximate Nearest Neighbors)
|
|
113
|
+
|
|
114
|
+
Builds a forest of random projection trees. Each tree recursively splits the vector space with random hyperplanes. At query time, multiple trees are traversed to collect candidate neighbors.
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
db.use_annoy_index(num_trees=10, k_leaf=50)
|
|
118
|
+
db.build_index()
|
|
119
|
+
|
|
120
|
+
results = db.search(query, k=10)
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
| Parameter | Description | Default |
|
|
124
|
+
| ---------------- | ---------------------------------------------- | ------- |
|
|
125
|
+
| `num_trees` | Number of random projection trees | -- |
|
|
126
|
+
| `k_leaf` | Maximum points per leaf node | -- |
|
|
127
|
+
|
|
128
|
+
### Choosing an Index
|
|
129
|
+
|
|
130
|
+
| Use Case | Recommended Index | Why |
|
|
131
|
+
| ---------------------------- | ----------------- | ------------------------------------- |
|
|
132
|
+
| Small dataset (< 50K) | Flat | Exact results, no training overhead |
|
|
133
|
+
| Medium dataset (50K - 1M) | IVF | Good speed/accuracy trade-off, tunable via `n_probe` |
|
|
134
|
+
| Large dataset (1M+) | Annoy | Fast tree traversal, low memory |
|
|
135
|
+
| Ground truth / benchmarking | Flat | Guaranteed correct results |
|
|
136
|
+
|
|
137
|
+
## Persistence
|
|
138
|
+
|
|
139
|
+
Save and load the entire database state, including vectors and the trained index:
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
# Save
|
|
143
|
+
db.save("my_database.bin")
|
|
144
|
+
|
|
145
|
+
# Load into a fresh instance
|
|
146
|
+
db2 = VegamDB()
|
|
147
|
+
db2.load("my_database.bin")
|
|
148
|
+
|
|
149
|
+
assert db2.size() == db.size()
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
The index type and its trained state are serialized automatically. After loading, the index is ready to search without rebuilding.
|
|
153
|
+
|
|
154
|
+
## API Reference
|
|
155
|
+
|
|
156
|
+
### VegamDB
|
|
157
|
+
|
|
158
|
+
| Method | Description |
|
|
159
|
+
| ---------------------- | ----------------------------------------------------------------- |
|
|
160
|
+
| `VegamDB()` | Create a new empty database instance |
|
|
161
|
+
| `add_vector(vec)` | Add a vector from a Python list of floats |
|
|
162
|
+
| `add_vector_numpy(arr)`| Add a vector from a 1D NumPy float32 array (zero-copy) |
|
|
163
|
+
| `size()` | Return the number of stored vectors |
|
|
164
|
+
| `dimension()` | Return the dimensionality of stored vectors (0 if empty) |
|
|
165
|
+
| `use_flat_index()` | Set index to brute-force flat search |
|
|
166
|
+
| `use_ivf_index(...)` | Set index to IVF with specified cluster configuration |
|
|
167
|
+
| `use_annoy_index(...)` | Set index to Annoy with specified tree configuration |
|
|
168
|
+
| `build_index()` | Explicitly build/train the current index |
|
|
169
|
+
| `search(query, k, params=None)` | Search for k nearest neighbors, returns `SearchResults` |
|
|
170
|
+
| `save(filename)` | Save database and index to a binary file |
|
|
171
|
+
| `load(filename)` | Load database and index from a binary file |
|
|
172
|
+
|
|
173
|
+
### SearchResults
|
|
174
|
+
|
|
175
|
+
| Attribute | Type | Description |
|
|
176
|
+
| ------------ | ------------- | ------------------------------------------------ |
|
|
177
|
+
| `ids` | `list[int]` | Indices of nearest neighbors (insertion order) |
|
|
178
|
+
| `distances` | `list[float]` | Euclidean distances to the query vector |
|
|
179
|
+
|
|
180
|
+
### Search Parameters
|
|
181
|
+
|
|
182
|
+
**IVFSearchParams** -- Override the default probe count for IVF search:
|
|
183
|
+
- `n_probe` (int): Number of clusters to search. Higher values improve recall at the cost of latency.
|
|
184
|
+
|
|
185
|
+
## Architecture
|
|
186
|
+
|
|
187
|
+
```
|
|
188
|
+
VegamDB (Orchestrator)
|
|
189
|
+
/ \
|
|
190
|
+
VectorStore IndexBase
|
|
191
|
+
(raw float vectors) (search strategy)
|
|
192
|
+
/ | \
|
|
193
|
+
Flat IVF Annoy
|
|
194
|
+
(exact) (K-Means) (trees)
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
- **VegamDB** -- Main entry point. Manages the vector store and delegates search to the active index.
|
|
198
|
+
- **VectorStore** -- Stores raw vectors in a `vector<vector<float>>`. Handles serialization.
|
|
199
|
+
- **IndexBase** -- Abstract interface that all index types implement (`build`, `search`, `save`, `load`).
|
|
200
|
+
- **FlatIndex** -- Iterates over all vectors, computing the Euclidean distance to each. Performs O(n) distance computations per query (n = number of stored vectors).
|
|
201
|
+
- **IVFIndex** -- Trains K-Means centroids, assigns vectors to clusters, searches only nearby clusters.
|
|
202
|
+
- **AnnoyIndex** -- Builds a forest of binary trees using random hyperplane splits for fast traversal.
|
|
203
|
+
|
|
204
|
+
## Project Structure
|
|
205
|
+
|
|
206
|
+
```
|
|
207
|
+
vegamdb/
|
|
208
|
+
├── include/ # C++ headers
|
|
209
|
+
│ ├── VegamDB.hpp
|
|
210
|
+
│ ├── indexes/ # IndexBase, FlatIndex, IVFIndex, AnnoyIndex, KMeans
|
|
211
|
+
│ ├── storage/ # VectorStore
|
|
212
|
+
│ └── utils/ # Math utilities (Euclidean distance, dot product)
|
|
213
|
+
├── src/ # C++ implementation
|
|
214
|
+
│ ├── VegamDB.cpp
|
|
215
|
+
│ ├── bindings.cpp # pybind11 Python bindings
|
|
216
|
+
│ ├── indexes/
|
|
217
|
+
│ ├── storage/
|
|
218
|
+
│ └── utils/
|
|
219
|
+
├── vegamdb/ # Python package
|
|
220
|
+
│ ├── __init__.py # Public API re-exports
|
|
221
|
+
│ └── _vegamdb.pyi # Type stubs for IDE support
|
|
222
|
+
├── benchmarks/ # Performance benchmarks
|
|
223
|
+
├── CMakeLists.txt # C++ build configuration
|
|
224
|
+
└── pyproject.toml # Python packaging (scikit-build-core)
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
## Benchmarks
|
|
228
|
+
|
|
229
|
+
Run the included benchmarks to evaluate performance on your hardware:
|
|
230
|
+
|
|
231
|
+
```bash
|
|
232
|
+
# Stress test (Flat index, varying dataset sizes)
|
|
233
|
+
python benchmarks/stress_test.py
|
|
234
|
+
|
|
235
|
+
# IVF benchmark (accuracy vs speed trade-off across probe counts)
|
|
236
|
+
python benchmarks/ivf_benchmarks.py
|
|
237
|
+
|
|
238
|
+
# Annoy benchmark (accuracy vs speed trade-off across tree counts)
|
|
239
|
+
python benchmarks/annoy_benchmark.py
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
## License
|
|
243
|
+
|
|
244
|
+
MIT
|
vegamdb-0.1.0/README.md
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
# VegamDB
|
|
2
|
+
|
|
3
|
+
A high-performance vector database written in C++ with Python bindings. VegamDB provides fast nearest neighbor search with pluggable index types, zero-copy NumPy integration, and built-in persistence.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Multiple Index Types** -- Flat (exact brute-force), IVF (inverted file with K-Means), and Annoy (random projection trees)
|
|
8
|
+
- **C++ Core** -- All indexing and search logic runs in optimized C++17 with `-O3` and `-march=native`
|
|
9
|
+
- **Zero-Copy NumPy** -- Vectors pass directly from NumPy arrays to C++ via pointer, with no intermediate copies
|
|
10
|
+
- **Persistence** -- Save and load the entire database (vectors + index) to a single binary file
|
|
11
|
+
- **Pluggable Architecture** -- Switch index types at runtime without changing application code
|
|
12
|
+
- **Type-Safe Python API** -- Full type stubs (`.pyi`) for IDE autocomplete and static analysis
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
### From Source
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
git clone https://github.com/LuciAkirami/vegamdb.git
|
|
20
|
+
cd vegamdb
|
|
21
|
+
pip install .
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
### Requirements
|
|
25
|
+
|
|
26
|
+
- Python >= 3.8
|
|
27
|
+
- CMake >= 3.15
|
|
28
|
+
- A C++17 compatible compiler (GCC 7+, Clang 5+, MSVC 2017+)
|
|
29
|
+
- NumPy
|
|
30
|
+
|
|
31
|
+
### Development Install
|
|
32
|
+
|
|
33
|
+
For development, use an editable install so changes to Python files take effect immediately:
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install scikit-build-core pybind11 numpy
|
|
37
|
+
pip install -e . --no-build-isolation
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Quick Start
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
import numpy as np
|
|
44
|
+
from vegamdb import VegamDB
|
|
45
|
+
|
|
46
|
+
# Create a database
|
|
47
|
+
db = VegamDB()
|
|
48
|
+
|
|
49
|
+
# Add vectors
|
|
50
|
+
data = np.random.random((10000, 128)).astype(np.float32)
|
|
51
|
+
for vec in data:
|
|
52
|
+
db.add_vector_numpy(vec)
|
|
53
|
+
|
|
54
|
+
# Search (defaults to exact flat search)
|
|
55
|
+
query = np.random.random(128).astype(np.float32)
|
|
56
|
+
results = db.search(query, k=5)
|
|
57
|
+
|
|
58
|
+
print(results.ids) # [4823, 1092, 7744, 331, 5619]
|
|
59
|
+
print(results.distances) # [4.12, 4.15, 4.18, 4.21, 4.23]
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Index Types
|
|
63
|
+
|
|
64
|
+
VegamDB supports three index types, each offering a different trade-off between speed and accuracy.
|
|
65
|
+
|
|
66
|
+
### Flat Index (Default)
|
|
67
|
+
|
|
68
|
+
Exact brute-force search. Computes the Euclidean distance between the query and every stored vector. Always returns the true nearest neighbors.
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
db.use_flat_index()
|
|
72
|
+
results = db.search(query, k=10)
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
| Metric | Value |
|
|
76
|
+
| ------------ | ------------ |
|
|
77
|
+
| Accuracy | 100% |
|
|
78
|
+
| Build Time | None |
|
|
79
|
+
| Best For | Small datasets (< 50K vectors), ground truth validation |
|
|
80
|
+
|
|
81
|
+
### IVF Index (Inverted File)
|
|
82
|
+
|
|
83
|
+
Partitions vectors into clusters using K-Means. At query time, only the closest clusters are searched, trading some accuracy for a large speedup.
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
db.use_ivf_index(n_clusters=100, max_iters=20, n_probe=1)
|
|
87
|
+
db.build_index()
|
|
88
|
+
|
|
89
|
+
# Search with custom probe count
|
|
90
|
+
from vegamdb import IVFSearchParams
|
|
91
|
+
params = IVFSearchParams()
|
|
92
|
+
params.n_probe = 10 # Search 10 of 100 clusters
|
|
93
|
+
results = db.search(query, k=10, params=params)
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
| Parameter | Description | Default |
|
|
97
|
+
| ------------- | ------------------------------------------------ | ------- |
|
|
98
|
+
| `n_clusters` | Number of Voronoi cells (partitions) | -- |
|
|
99
|
+
| `max_iters` | Maximum K-Means training iterations | 50 |
|
|
100
|
+
| `n_probe` | Clusters to search at query time | 1 |
|
|
101
|
+
|
|
102
|
+
### Annoy Index (Approximate Nearest Neighbors)
|
|
103
|
+
|
|
104
|
+
Builds a forest of random projection trees. Each tree recursively splits the vector space with random hyperplanes. At query time, multiple trees are traversed to collect candidate neighbors.
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
db.use_annoy_index(num_trees=10, k_leaf=50)
|
|
108
|
+
db.build_index()
|
|
109
|
+
|
|
110
|
+
results = db.search(query, k=10)
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
| Parameter | Description | Default |
|
|
114
|
+
| ---------------- | ---------------------------------------------- | ------- |
|
|
115
|
+
| `num_trees` | Number of random projection trees | -- |
|
|
116
|
+
| `k_leaf` | Maximum points per leaf node | -- |
|
|
117
|
+
|
|
118
|
+
### Choosing an Index
|
|
119
|
+
|
|
120
|
+
| Use Case | Recommended Index | Why |
|
|
121
|
+
| ---------------------------- | ----------------- | ------------------------------------- |
|
|
122
|
+
| Small dataset (< 50K) | Flat | Exact results, no training overhead |
|
|
123
|
+
| Medium dataset (50K - 1M) | IVF | Good speed/accuracy trade-off, tunable via `n_probe` |
|
|
124
|
+
| Large dataset (1M+) | Annoy | Fast tree traversal, low memory |
|
|
125
|
+
| Ground truth / benchmarking | Flat | Guaranteed correct results |
|
|
126
|
+
|
|
127
|
+
## Persistence
|
|
128
|
+
|
|
129
|
+
Save and load the entire database state, including vectors and the trained index:
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
# Save
|
|
133
|
+
db.save("my_database.bin")
|
|
134
|
+
|
|
135
|
+
# Load into a fresh instance
|
|
136
|
+
db2 = VegamDB()
|
|
137
|
+
db2.load("my_database.bin")
|
|
138
|
+
|
|
139
|
+
assert db2.size() == db.size()
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
The index type and its trained state are serialized automatically. After loading, the index is ready to search without rebuilding.
|
|
143
|
+
|
|
144
|
+
## API Reference
|
|
145
|
+
|
|
146
|
+
### VegamDB
|
|
147
|
+
|
|
148
|
+
| Method | Description |
|
|
149
|
+
| ---------------------- | ----------------------------------------------------------------- |
|
|
150
|
+
| `VegamDB()` | Create a new empty database instance |
|
|
151
|
+
| `add_vector(vec)` | Add a vector from a Python list of floats |
|
|
152
|
+
| `add_vector_numpy(arr)`| Add a vector from a 1D NumPy float32 array (zero-copy) |
|
|
153
|
+
| `size()` | Return the number of stored vectors |
|
|
154
|
+
| `dimension()` | Return the dimensionality of stored vectors (0 if empty) |
|
|
155
|
+
| `use_flat_index()` | Set index to brute-force flat search |
|
|
156
|
+
| `use_ivf_index(...)` | Set index to IVF with specified cluster configuration |
|
|
157
|
+
| `use_annoy_index(...)` | Set index to Annoy with specified tree configuration |
|
|
158
|
+
| `build_index()` | Explicitly build/train the current index |
|
|
159
|
+
| `search(query, k, params=None)` | Search for k nearest neighbors, returns `SearchResults` |
|
|
160
|
+
| `save(filename)` | Save database and index to a binary file |
|
|
161
|
+
| `load(filename)` | Load database and index from a binary file |
|
|
162
|
+
|
|
163
|
+
### SearchResults
|
|
164
|
+
|
|
165
|
+
| Attribute | Type | Description |
|
|
166
|
+
| ------------ | ------------- | ------------------------------------------------ |
|
|
167
|
+
| `ids` | `list[int]` | Indices of nearest neighbors (insertion order) |
|
|
168
|
+
| `distances` | `list[float]` | Euclidean distances to the query vector |
|
|
169
|
+
|
|
170
|
+
### Search Parameters
|
|
171
|
+
|
|
172
|
+
**IVFSearchParams** -- Override the default probe count for IVF search:
|
|
173
|
+
- `n_probe` (int): Number of clusters to search. Higher values improve recall at the cost of latency.
|
|
174
|
+
|
|
175
|
+
## Architecture
|
|
176
|
+
|
|
177
|
+
```
|
|
178
|
+
VegamDB (Orchestrator)
|
|
179
|
+
/ \
|
|
180
|
+
VectorStore IndexBase
|
|
181
|
+
(raw float vectors) (search strategy)
|
|
182
|
+
/ | \
|
|
183
|
+
Flat IVF Annoy
|
|
184
|
+
(exact) (K-Means) (trees)
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
- **VegamDB** -- Main entry point. Manages the vector store and delegates search to the active index.
|
|
188
|
+
- **VectorStore** -- Stores raw vectors in a `vector<vector<float>>`. Handles serialization.
|
|
189
|
+
- **IndexBase** -- Abstract interface that all index types implement (`build`, `search`, `save`, `load`).
|
|
190
|
+
- **FlatIndex** -- Iterates over all vectors, computing the Euclidean distance to each. Performs O(n) distance computations per query (n = number of stored vectors).
|
|
191
|
+
- **IVFIndex** -- Trains K-Means centroids, assigns vectors to clusters, searches only nearby clusters.
|
|
192
|
+
- **AnnoyIndex** -- Builds a forest of binary trees using random hyperplane splits for fast traversal.
|
|
193
|
+
|
|
194
|
+
## Project Structure
|
|
195
|
+
|
|
196
|
+
```
|
|
197
|
+
vegamdb/
|
|
198
|
+
├── include/ # C++ headers
|
|
199
|
+
│ ├── VegamDB.hpp
|
|
200
|
+
│ ├── indexes/ # IndexBase, FlatIndex, IVFIndex, AnnoyIndex, KMeans
|
|
201
|
+
│ ├── storage/ # VectorStore
|
|
202
|
+
│ └── utils/ # Math utilities (Euclidean distance, dot product)
|
|
203
|
+
├── src/ # C++ implementation
|
|
204
|
+
│ ├── VegamDB.cpp
|
|
205
|
+
│ ├── bindings.cpp # pybind11 Python bindings
|
|
206
|
+
│ ├── indexes/
|
|
207
|
+
│ ├── storage/
|
|
208
|
+
│ └── utils/
|
|
209
|
+
├── vegamdb/ # Python package
|
|
210
|
+
│ ├── __init__.py # Public API re-exports
|
|
211
|
+
│ └── _vegamdb.pyi # Type stubs for IDE support
|
|
212
|
+
├── benchmarks/ # Performance benchmarks
|
|
213
|
+
├── CMakeLists.txt # C++ build configuration
|
|
214
|
+
└── pyproject.toml # Python packaging (scikit-build-core)
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
## Benchmarks
|
|
218
|
+
|
|
219
|
+
Run the included benchmarks to evaluate performance on your hardware:
|
|
220
|
+
|
|
221
|
+
```bash
|
|
222
|
+
# Stress test (Flat index, varying dataset sizes)
|
|
223
|
+
python benchmarks/stress_test.py
|
|
224
|
+
|
|
225
|
+
# IVF benchmark (accuracy vs speed trade-off across probe counts)
|
|
226
|
+
python benchmarks/ivf_benchmarks.py
|
|
227
|
+
|
|
228
|
+
# Annoy benchmark (accuracy vs speed trade-off across tree counts)
|
|
229
|
+
python benchmarks/annoy_benchmark.py
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
## License
|
|
233
|
+
|
|
234
|
+
MIT
|
|
vegamdb-0.1.0/benchmarks/annoy_benchmark.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import vegamdb
|
|
3
|
+
import time
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def calculate_recall(ground_truth, predicted, k):
    """
    Mean recall@k, as a percentage, of predicted results against ground truth.

    For each query, recall is the number of ids that appear in both the
    ground-truth list and the predicted list, divided by ``k``. The per-query
    values are averaged over ``len(ground_truth)`` and scaled to 0-100.

    Args:
        ground_truth: Sequence of id lists, one per query (the reference).
        predicted: Sequence of id lists, one per query, paired with
            ``ground_truth`` by position.
        k: Number of neighbors requested per query (the recall denominator).

    Returns:
        The average recall in percent, or 0.0 when ``ground_truth`` is empty.
    """
    # Nothing to average over: report zero instead of dividing by zero.
    if len(ground_truth) == 0:
        return 0.0

    total_recall = 0
    for gt, pred in zip(ground_truth, predicted):
        # Duplicates are collapsed by the sets, so each id counts once.
        total_recall += len(set(gt) & set(pred)) / k

    return (total_recall / len(ground_truth)) * 100
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def generate_clustered_data(n_vectors, dim, n_clusters_real=100):
    """
    Generate ``n_vectors`` float32 vectors grouped around random centers.

    ``n_clusters_real`` centers are drawn uniformly from [0, 1)^dim; every
    point is its center plus Gaussian noise (standard deviation 0.1). The rows
    are shuffled before being returned.

    Args:
        n_vectors: Total number of vectors to produce.
        dim: Dimensionality of each vector.
        n_clusters_real: Number of cluster centers to scatter points around.

    Returns:
        A ``(n_vectors, dim)`` NumPy array of dtype float32.
    """
    print(f"Generating {n_vectors} CLUSTERED vectors...")
    topic_centers = np.random.random((n_clusters_real, dim)).astype(np.float32)

    # Spread the points as evenly as possible. When n_vectors is not a
    # multiple of n_clusters_real, the first `extra` clusters take one more
    # point each so that exactly n_vectors rows are produced.
    per_cluster, extra = divmod(n_vectors, n_clusters_real)

    data = []
    for i in range(n_clusters_real):
        count = per_cluster + (1 if i < extra else 0)
        noise = np.random.normal(scale=0.1, size=(count, dim))
        data.append(topic_centers[i] + noise)

    # The noise is float64, so cast the stacked result back to float32.
    data = np.vstack(data).astype(np.float32)
    np.random.shuffle(data)
    return data
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def benchmark():
    """
    Compare Annoy search against flat search for several forest sizes.

    Generates clustered data and queries, loads the data into a VegamDB
    instance, records the results of the default search as ground truth, then
    rebuilds an Annoy index for each entry in ``TREE_COUNTS`` and prints build
    time, average query latency, speedup over the ground-truth search and
    recall for each.
    """
    # ---------------------------------------------------------
    # CONFIGURATION
    # ---------------------------------------------------------
    N_VECTORS = 100000
    DIM = 128
    K = 10
    N_QUERIES = 100

    # Annoy parameter: maximum items in a leaf before it is split.
    K_LEAF = 50

    # Forest sizes to test. More trees means a better chance of finding the
    # true neighbors, at the cost of slower searches.
    TREE_COUNTS = [1, 2, 5, 10, 50, 100, 200, 500]

    # ---------------------------------------------------------
    # 1. DATA GENERATION
    # ---------------------------------------------------------
    print(f"Generating {N_VECTORS} vectors (Dim: {DIM})...")
    data = generate_clustered_data(N_VECTORS, DIM, n_clusters_real=500)
    queries = generate_clustered_data(N_QUERIES, DIM, n_clusters_real=10)

    # VegamDB holds the data and also produces the ground truth.
    db = vegamdb.VegamDB()
    print("Ingesting data into VegamDB...")
    for i in range(N_VECTORS):
        db.add_vector_numpy(data[i])

    # ---------------------------------------------------------
    # 2. ESTABLISH GROUND TRUTH (Brute Force)
    # ---------------------------------------------------------
    print(f"Calculating Ground Truth for {N_QUERIES} queries...")
    # perf_counter() is monotonic and high-resolution, so short intervals are
    # not rounded down to zero the way wall-clock time can be.
    start_flat = time.perf_counter()
    ground_truth_results = []
    for i in range(N_QUERIES):
        res = db.search(queries[i], K)
        ground_truth_results.append(res.ids)

    time_flat = time.perf_counter() - start_flat
    avg_flat_ms = (time_flat / N_QUERIES) * 1000
    print(f"Avg Flat Search Latency: {avg_flat_ms:.4f} ms")

    # ---------------------------------------------------------
    # 3. ANNOY BENCHMARK LOOP
    # ---------------------------------------------------------
    print("\n--- ANNOY PERFORMANCE (Varying Tree Count) ---")
    header = f"{'TREES':<6} | {'BUILD TIME':<10} | {'AVG LATENCY (ms)':<18} | {'SPEEDUP':<10} | {'ACCURACY (%)':<12}"
    print("-" * len(header))
    print(header)
    print("-" * len(header))

    # The index must be rebuilt for every tree count under test.
    for n_trees in TREE_COUNTS:

        # A. Build the index through VegamDB's factory method.
        t0 = time.perf_counter()
        db.use_annoy_index(num_trees=n_trees, k_leaf=K_LEAF)
        db.build_index()
        build_time = time.perf_counter() - t0

        # B. Run the searches.
        start_probe = time.perf_counter()
        annoy_results = []
        for i in range(N_QUERIES):
            res = db.search(queries[i], K)
            annoy_results.append(res.ids)

        # C. Metrics.
        total_time = time.perf_counter() - start_probe
        avg_time_ms = (total_time / N_QUERIES) * 1000
        speedup = avg_flat_ms / avg_time_ms
        accuracy = calculate_recall(ground_truth_results, annoy_results, K)

        print(
            f"{n_trees:<6} | {build_time:<10.2f} | {avg_time_ms:<18.4f} | {speedup:<10.1f} | {accuracy:<12.1f}"
        )

    print("-" * len(header))


# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    benchmark()
|