vegamdb 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,16 @@
1
+ /garbage/
2
+ /.vscode/
3
+ build/
4
+ __pycache__/
5
+ .cache/
6
+ *.so
7
+ .venv
8
+ .clangd
9
+ artifacts/
10
+ docs_internal/
11
+
12
+ # Ignore binary data files
13
+ *.bin
14
+
15
+ # Ignore dist
16
+ dist/
@@ -0,0 +1,57 @@
1
+ # 1. Check the CMake version
2
+ cmake_minimum_required(VERSION 3.15)
3
+
4
+ # 2. Name the project
5
+ project(VectorDBProject)
6
+
7
+ # 3. Set the C++ Standard
8
+ # We are forcing C++17. Why?
9
+ # Because older C++ versions are painful. C++17 gives us modern features
10
+ # (like better filesystem handling) that make life easier.
11
+ set(CMAKE_CXX_STANDARD 17)
12
+ set(CMAKE_CXX_STANDARD_REQUIRED ON)
13
+ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
14
+
15
+ include(FetchContent)
16
+
17
+ FetchContent_Declare(
18
+ pybind11
19
+ GIT_REPOSITORY https://github.com/pybind/pybind11
20
+ GIT_TAG v2.11.1
21
+ )
22
+
23
+ FetchContent_MakeAvailable(pybind11)
24
+
25
+ # 4. Tell CMake where to look for "Menu" files (Headers)
26
+ # This is equivalent to the "-I./include" flag we typed manually.
27
+ include_directories(include)
28
+
29
+ # 5. Create the Executable
30
+ # "add_executable" tells CMake:
31
+ # "I want to build a program named 'my_db_app'.
32
+ # Here are the ingredients (source files) you need to cook it."
33
+ # add_executable(my_db_app
34
+ # src/main.cpp
35
+ # src/VectorDB.cpp
36
+ # )
37
+
38
+ # 1. Turn on Maximum Optimization (-O3)
39
+ # 2. Tune for the architecture of YOUR specific computer (-march=native)
40
+ if(MSVC)
41
+ add_compile_options(/O2 /arch:AVX2)
42
+ else()
43
+ add_compile_options(-O3 -march=native -ffast-math)
44
+ endif()
45
+
46
+ pybind11_add_module(_vegamdb
47
+ src/VegamDB.cpp
48
+ src/bindings.cpp
49
+ src/indexes/FlatIndex.cpp
50
+ src/indexes/IVFIndex.cpp
51
+ src/indexes/AnnoyIndex.cpp
52
+ src/indexes/KMeans.cpp
53
+ src/storage/VectorStore.cpp
54
+ src/utils/Math.cpp
55
+ )
56
+
57
+ install(TARGETS _vegamdb DESTINATION vegamdb)
vegamdb-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,244 @@
1
+ Metadata-Version: 2.2
2
+ Name: vegamdb
3
+ Version: 0.1.0
4
+ Summary: A high-performance vector database written in C++ with Python bindings
5
+ Author: Naredla Ajay Kumar Reddy
6
+ License: MIT
7
+ Requires-Python: >=3.8
8
+ Requires-Dist: numpy
9
+ Description-Content-Type: text/markdown
10
+
11
+ # VegamDB
12
+
13
+ A high-performance vector database written in C++ with Python bindings. VegamDB provides fast nearest neighbor search with pluggable index types, zero-copy NumPy integration, and built-in persistence.
14
+
15
+ ## Features
16
+
17
+ - **Multiple Index Types** -- Flat (exact brute-force), IVF (inverted file with K-Means), and Annoy (random projection trees)
18
+ - **C++ Core** -- All indexing and search logic runs in optimized C++17 with `-O3` and `-march=native`
19
+ - **Zero-Copy NumPy** -- Vectors pass directly from NumPy arrays to C++ via pointer, with no intermediate copies
20
+ - **Persistence** -- Save and load the entire database (vectors + index) to a single binary file
21
+ - **Pluggable Architecture** -- Switch index types at runtime without changing application code
22
+ - **Type-Safe Python API** -- Full type stubs (`.pyi`) for IDE autocomplete and static analysis
23
+
24
+ ## Installation
25
+
26
+ ### From Source
27
+
28
+ ```bash
29
+ git clone https://github.com/LuciAkirami/vegamdb.git
30
+ cd vegamdb
31
+ pip install .
32
+ ```
33
+
34
+ ### Requirements
35
+
36
+ - Python >= 3.8
37
+ - CMake >= 3.15
38
+ - A C++17 compatible compiler (GCC 7+, Clang 5+, MSVC 2017+)
39
+ - NumPy
40
+
41
+ ### Development Install
42
+
43
+ For development, use an editable install so changes to Python files take effect immediately:
44
+
45
+ ```bash
46
+ pip install scikit-build-core pybind11 numpy
47
+ pip install -e . --no-build-isolation
48
+ ```
49
+
50
+ ## Quick Start
51
+
52
+ ```python
53
+ import numpy as np
54
+ from vegamdb import VegamDB
55
+
56
+ # Create a database
57
+ db = VegamDB()
58
+
59
+ # Add vectors
60
+ data = np.random.random((10000, 128)).astype(np.float32)
61
+ for vec in data:
62
+ db.add_vector_numpy(vec)
63
+
64
+ # Search (defaults to exact flat search)
65
+ query = np.random.random(128).astype(np.float32)
66
+ results = db.search(query, k=5)
67
+
68
+ print(results.ids) # [4823, 1092, 7744, 331, 5619]
69
+ print(results.distances) # [4.12, 4.15, 4.18, 4.21, 4.23]
70
+ ```
71
+
72
+ ## Index Types
73
+
74
+ VegamDB supports three index types, each offering a different trade-off between speed and accuracy.
75
+
76
+ ### Flat Index (Default)
77
+
78
+ Exact brute-force search. Computes the Euclidean distance between the query and every stored vector. Always returns the true nearest neighbors.
79
+
80
+ ```python
81
+ db.use_flat_index()
82
+ results = db.search(query, k=10)
83
+ ```
84
+
85
+ | Metric | Value |
86
+ | ------------ | ------------ |
87
+ | Accuracy | 100% |
88
+ | Build Time | None |
89
+ | Best For | Small datasets (< 50K vectors), ground truth validation |
90
+
91
+ ### IVF Index (Inverted File)
92
+
93
+ Partitions vectors into clusters using K-Means. At query time, only the closest clusters are searched, trading some accuracy for a large speedup.
94
+
95
+ ```python
96
+ db.use_ivf_index(n_clusters=100, max_iters=20, n_probe=1)
97
+ db.build_index()
98
+
99
+ # Search with custom probe count
100
+ from vegamdb import IVFSearchParams
101
+ params = IVFSearchParams()
102
+ params.n_probe = 10 # Search 10 of 100 clusters
103
+ results = db.search(query, k=10, params=params)
104
+ ```
105
+
106
+ | Parameter | Description | Default |
107
+ | ------------- | ------------------------------------------------ | ------- |
108
+ | `n_clusters` | Number of Voronoi cells (partitions) | -- |
109
+ | `max_iters` | Maximum K-Means training iterations | 50 |
110
+ | `n_probe` | Clusters to search at query time | 1 |
111
+
112
+ ### Annoy Index (Approximate Nearest Neighbors)
113
+
114
+ Builds a forest of random projection trees. Each tree recursively splits the vector space with random hyperplanes. At query time, multiple trees are traversed to collect candidate neighbors.
115
+
116
+ ```python
117
+ db.use_annoy_index(num_trees=10, k_leaf=50)
118
+ db.build_index()
119
+
120
+ results = db.search(query, k=10)
121
+ ```
122
+
123
+ | Parameter | Description | Default |
124
+ | ---------------- | ---------------------------------------------- | ------- |
125
+ | `num_trees` | Number of random projection trees | -- |
126
+ | `k_leaf` | Maximum points per leaf node | -- |
127
+
128
+ ### Choosing an Index
129
+
130
+ | Use Case | Recommended Index | Why |
131
+ | ---------------------------- | ----------------- | ------------------------------------- |
132
+ | Small dataset (< 50K) | Flat | Exact results, no training overhead |
133
+ | Medium dataset (50K - 1M)   | IVF               | Good speed/accuracy with tunable probe |
134
+ | Large dataset (1M+) | Annoy | Fast tree traversal, low memory |
135
+ | Ground truth / benchmarking | Flat | Guaranteed correct results |
136
+
137
+ ## Persistence
138
+
139
+ Save and load the entire database state, including vectors and the trained index:
140
+
141
+ ```python
142
+ # Save
143
+ db.save("my_database.bin")
144
+
145
+ # Load into a fresh instance
146
+ db2 = VegamDB()
147
+ db2.load("my_database.bin")
148
+
149
+ assert db2.size() == db.size()
150
+ ```
151
+
152
+ The index type and its trained state are serialized automatically. After loading, the index is ready to search without rebuilding.
153
+
154
+ ## API Reference
155
+
156
+ ### VegamDB
157
+
158
+ | Method | Description |
159
+ | ---------------------- | ----------------------------------------------------------------- |
160
+ | `VegamDB()` | Create a new empty database instance |
161
+ | `add_vector(vec)` | Add a vector from a Python list of floats |
162
+ | `add_vector_numpy(arr)`| Add a vector from a 1D NumPy float32 array (zero-copy) |
163
+ | `size()` | Return the number of stored vectors |
164
+ | `dimension()` | Return the dimensionality of stored vectors (0 if empty) |
165
+ | `use_flat_index()` | Set index to brute-force flat search |
166
+ | `use_ivf_index(...)` | Set index to IVF with specified cluster configuration |
167
+ | `use_annoy_index(...)` | Set index to Annoy with specified tree configuration |
168
+ | `build_index()` | Explicitly build/train the current index |
169
+ | `search(query, k, params=None)` | Search for k nearest neighbors, returns `SearchResults` |
170
+ | `save(filename)` | Save database and index to a binary file |
171
+ | `load(filename)` | Load database and index from a binary file |
172
+
173
+ ### SearchResults
174
+
175
+ | Attribute | Type | Description |
176
+ | ------------ | ------------- | ------------------------------------------------ |
177
+ | `ids` | `list[int]` | Indices of nearest neighbors (insertion order) |
178
+ | `distances` | `list[float]` | Euclidean distances to the query vector |
179
+
180
+ ### Search Parameters
181
+
182
+ **IVFSearchParams** -- Override the default probe count for IVF search:
183
+ - `n_probe` (int): Number of clusters to search. Higher values improve recall at the cost of latency.
184
+
185
+ ## Architecture
186
+
187
+ ```
188
+ VegamDB (Orchestrator)
189
+ / \
190
+ VectorStore IndexBase
191
+ (raw float vectors) (search strategy)
192
+ / | \
193
+ Flat IVF Annoy
194
+ (exact) (K-Means) (trees)
195
+ ```
196
+
197
+ - **VegamDB** -- Main entry point. Manages the vector store and delegates search to the active index.
198
+ - **VectorStore** -- Stores raw vectors in a `vector<vector<float>>`. Handles serialization.
199
+ - **IndexBase** -- Abstract interface that all index types implement (`build`, `search`, `save`, `load`).
200
+ - **FlatIndex** -- Iterates over all vectors, computing Euclidean distance. O(n) per query.
201
+ - **IVFIndex** -- Trains K-Means centroids, assigns vectors to clusters, searches only nearby clusters.
202
+ - **AnnoyIndex** -- Builds a forest of binary trees using random hyperplane splits for fast traversal.
203
+
204
+ ## Project Structure
205
+
206
+ ```
207
+ vegamdb/
208
+ ├── include/ # C++ headers
209
+ │ ├── VegamDB.hpp
210
+ │ ├── indexes/ # IndexBase, FlatIndex, IVFIndex, AnnoyIndex, KMeans
211
+ │ ├── storage/ # VectorStore
212
+ │ └── utils/ # Math utilities (Euclidean distance, dot product)
213
+ ├── src/ # C++ implementation
214
+ │ ├── VegamDB.cpp
215
+ │ ├── bindings.cpp # pybind11 Python bindings
216
+ │ ├── indexes/
217
+ │ ├── storage/
218
+ │ └── utils/
219
+ ├── vegamdb/ # Python package
220
+ │ ├── __init__.py # Public API re-exports
221
+ │ └── _vegamdb.pyi # Type stubs for IDE support
222
+ ├── benchmarks/ # Performance benchmarks
223
+ ├── CMakeLists.txt # C++ build configuration
224
+ └── pyproject.toml # Python packaging (scikit-build-core)
225
+ ```
226
+
227
+ ## Benchmarks
228
+
229
+ Run the included benchmarks to evaluate performance on your hardware:
230
+
231
+ ```bash
232
+ # Stress test (Flat index, varying dataset sizes)
233
+ python benchmarks/stress_test.py
234
+
235
+ # IVF benchmark (accuracy vs speed trade-off across probe counts)
236
+ python benchmarks/ivf_benchmarks.py
237
+
238
+ # Annoy benchmark (accuracy vs speed trade-off across tree counts)
239
+ python benchmarks/annoy_benchmark.py
240
+ ```
241
+
242
+ ## License
243
+
244
+ MIT
@@ -0,0 +1,234 @@
1
+ # VegamDB
2
+
3
+ A high-performance vector database written in C++ with Python bindings. VegamDB provides fast nearest neighbor search with pluggable index types, zero-copy NumPy integration, and built-in persistence.
4
+
5
+ ## Features
6
+
7
+ - **Multiple Index Types** -- Flat (exact brute-force), IVF (inverted file with K-Means), and Annoy (random projection trees)
8
+ - **C++ Core** -- All indexing and search logic runs in optimized C++17 with `-O3` and `-march=native`
9
+ - **Zero-Copy NumPy** -- Vectors pass directly from NumPy arrays to C++ via pointer, with no intermediate copies
10
+ - **Persistence** -- Save and load the entire database (vectors + index) to a single binary file
11
+ - **Pluggable Architecture** -- Switch index types at runtime without changing application code
12
+ - **Type-Safe Python API** -- Full type stubs (`.pyi`) for IDE autocomplete and static analysis
13
+
14
+ ## Installation
15
+
16
+ ### From Source
17
+
18
+ ```bash
19
+ git clone https://github.com/LuciAkirami/vegamdb.git
20
+ cd vegamdb
21
+ pip install .
22
+ ```
23
+
24
+ ### Requirements
25
+
26
+ - Python >= 3.8
27
+ - CMake >= 3.15
28
+ - A C++17 compatible compiler (GCC 7+, Clang 5+, MSVC 2017+)
29
+ - NumPy
30
+
31
+ ### Development Install
32
+
33
+ For development, use an editable install so changes to Python files take effect immediately:
34
+
35
+ ```bash
36
+ pip install scikit-build-core pybind11 numpy
37
+ pip install -e . --no-build-isolation
38
+ ```
39
+
40
+ ## Quick Start
41
+
42
+ ```python
43
+ import numpy as np
44
+ from vegamdb import VegamDB
45
+
46
+ # Create a database
47
+ db = VegamDB()
48
+
49
+ # Add vectors
50
+ data = np.random.random((10000, 128)).astype(np.float32)
51
+ for vec in data:
52
+ db.add_vector_numpy(vec)
53
+
54
+ # Search (defaults to exact flat search)
55
+ query = np.random.random(128).astype(np.float32)
56
+ results = db.search(query, k=5)
57
+
58
+ print(results.ids) # [4823, 1092, 7744, 331, 5619]
59
+ print(results.distances) # [4.12, 4.15, 4.18, 4.21, 4.23]
60
+ ```
61
+
62
+ ## Index Types
63
+
64
+ VegamDB supports three index types, each offering a different trade-off between speed and accuracy.
65
+
66
+ ### Flat Index (Default)
67
+
68
+ Exact brute-force search. Computes the Euclidean distance between the query and every stored vector. Always returns the true nearest neighbors.
69
+
70
+ ```python
71
+ db.use_flat_index()
72
+ results = db.search(query, k=10)
73
+ ```
74
+
75
+ | Metric | Value |
76
+ | ------------ | ------------ |
77
+ | Accuracy | 100% |
78
+ | Build Time | None |
79
+ | Best For | Small datasets (< 50K vectors), ground truth validation |
80
+
81
+ ### IVF Index (Inverted File)
82
+
83
+ Partitions vectors into clusters using K-Means. At query time, only the closest clusters are searched, trading some accuracy for a large speedup.
84
+
85
+ ```python
86
+ db.use_ivf_index(n_clusters=100, max_iters=20, n_probe=1)
87
+ db.build_index()
88
+
89
+ # Search with custom probe count
90
+ from vegamdb import IVFSearchParams
91
+ params = IVFSearchParams()
92
+ params.n_probe = 10 # Search 10 of 100 clusters
93
+ results = db.search(query, k=10, params=params)
94
+ ```
95
+
96
+ | Parameter | Description | Default |
97
+ | ------------- | ------------------------------------------------ | ------- |
98
+ | `n_clusters` | Number of Voronoi cells (partitions) | -- |
99
+ | `max_iters` | Maximum K-Means training iterations | 50 |
100
+ | `n_probe` | Clusters to search at query time | 1 |
101
+
102
+ ### Annoy Index (Approximate Nearest Neighbors)
103
+
104
+ Builds a forest of random projection trees. Each tree recursively splits the vector space with random hyperplanes. At query time, multiple trees are traversed to collect candidate neighbors.
105
+
106
+ ```python
107
+ db.use_annoy_index(num_trees=10, k_leaf=50)
108
+ db.build_index()
109
+
110
+ results = db.search(query, k=10)
111
+ ```
112
+
113
+ | Parameter | Description | Default |
114
+ | ---------------- | ---------------------------------------------- | ------- |
115
+ | `num_trees` | Number of random projection trees | -- |
116
+ | `k_leaf` | Maximum points per leaf node | -- |
117
+
118
+ ### Choosing an Index
119
+
120
+ | Use Case | Recommended Index | Why |
121
+ | ---------------------------- | ----------------- | ------------------------------------- |
122
+ | Small dataset (< 50K) | Flat | Exact results, no training overhead |
123
+ | Medium dataset (50K - 1M) | IVF | Good speed/accuracy with tunable probe|
124
+ | Large dataset (1M+) | Annoy | Fast tree traversal, low memory |
125
+ | Ground truth / benchmarking | Flat | Guaranteed correct results |
126
+
127
+ ## Persistence
128
+
129
+ Save and load the entire database state, including vectors and the trained index:
130
+
131
+ ```python
132
+ # Save
133
+ db.save("my_database.bin")
134
+
135
+ # Load into a fresh instance
136
+ db2 = VegamDB()
137
+ db2.load("my_database.bin")
138
+
139
+ assert db2.size() == db.size()
140
+ ```
141
+
142
+ The index type and its trained state are serialized automatically. After loading, the index is ready to search without rebuilding.
143
+
144
+ ## API Reference
145
+
146
+ ### VegamDB
147
+
148
+ | Method | Description |
149
+ | ---------------------- | ----------------------------------------------------------------- |
150
+ | `VegamDB()` | Create a new empty database instance |
151
+ | `add_vector(vec)` | Add a vector from a Python list of floats |
152
+ | `add_vector_numpy(arr)`| Add a vector from a 1D NumPy float32 array (zero-copy) |
153
+ | `size()` | Return the number of stored vectors |
154
+ | `dimension()` | Return the dimensionality of stored vectors (0 if empty) |
155
+ | `use_flat_index()` | Set index to brute-force flat search |
156
+ | `use_ivf_index(...)` | Set index to IVF with specified cluster configuration |
157
+ | `use_annoy_index(...)` | Set index to Annoy with specified tree configuration |
158
+ | `build_index()` | Explicitly build/train the current index |
159
+ | `search(query, k, params=None)` | Search for k nearest neighbors, returns `SearchResults` |
160
+ | `save(filename)` | Save database and index to a binary file |
161
+ | `load(filename)` | Load database and index from a binary file |
162
+
163
+ ### SearchResults
164
+
165
+ | Attribute | Type | Description |
166
+ | ------------ | ------------- | ------------------------------------------------ |
167
+ | `ids` | `list[int]` | Indices of nearest neighbors (insertion order) |
168
+ | `distances` | `list[float]` | Euclidean distances to the query vector |
169
+
170
+ ### Search Parameters
171
+
172
+ **IVFSearchParams** -- Override the default probe count for IVF search:
173
+ - `n_probe` (int): Number of clusters to search. Higher values improve recall at the cost of latency.
174
+
175
+ ## Architecture
176
+
177
+ ```
178
+ VegamDB (Orchestrator)
179
+ / \
180
+ VectorStore IndexBase
181
+ (raw float vectors) (search strategy)
182
+ / | \
183
+ Flat IVF Annoy
184
+ (exact) (K-Means) (trees)
185
+ ```
186
+
187
+ - **VegamDB** -- Main entry point. Manages the vector store and delegates search to the active index.
188
+ - **VectorStore** -- Stores raw vectors in a `vector<vector<float>>`. Handles serialization.
189
+ - **IndexBase** -- Abstract interface that all index types implement (`build`, `search`, `save`, `load`).
190
+ - **FlatIndex** -- Iterates over all vectors, computing Euclidean distance. O(n) per query.
191
+ - **IVFIndex** -- Trains K-Means centroids, assigns vectors to clusters, searches only nearby clusters.
192
+ - **AnnoyIndex** -- Builds a forest of binary trees using random hyperplane splits for fast traversal.
193
+
194
+ ## Project Structure
195
+
196
+ ```
197
+ vegamdb/
198
+ ├── include/ # C++ headers
199
+ │ ├── VegamDB.hpp
200
+ │ ├── indexes/ # IndexBase, FlatIndex, IVFIndex, AnnoyIndex, KMeans
201
+ │ ├── storage/ # VectorStore
202
+ │ └── utils/ # Math utilities (Euclidean distance, dot product)
203
+ ├── src/ # C++ implementation
204
+ │ ├── VegamDB.cpp
205
+ │ ├── bindings.cpp # pybind11 Python bindings
206
+ │ ├── indexes/
207
+ │ ├── storage/
208
+ │ └── utils/
209
+ ├── vegamdb/ # Python package
210
+ │ ├── __init__.py # Public API re-exports
211
+ │ └── _vegamdb.pyi # Type stubs for IDE support
212
+ ├── benchmarks/ # Performance benchmarks
213
+ ├── CMakeLists.txt # C++ build configuration
214
+ └── pyproject.toml # Python packaging (scikit-build-core)
215
+ ```
216
+
217
+ ## Benchmarks
218
+
219
+ Run the included benchmarks to evaluate performance on your hardware:
220
+
221
+ ```bash
222
+ # Stress test (Flat index, varying dataset sizes)
223
+ python benchmarks/stress_test.py
224
+
225
+ # IVF benchmark (accuracy vs speed trade-off across probe counts)
226
+ python benchmarks/ivf_benchmarks.py
227
+
228
+ # Annoy benchmark (accuracy vs speed trade-off across tree counts)
229
+ python benchmarks/annoy_benchmark.py
230
+ ```
231
+
232
+ ## License
233
+
234
+ MIT
@@ -0,0 +1,121 @@
1
+ import numpy as np
2
+ import vegamdb
3
+ import time
4
+
5
+
6
+ def calculate_recall(ground_truth, predicted, k):
7
+ """
8
+ Calculates the intersection between Ground Truth and Predicted results.
9
+ Recall = (Number of Overlapping Indices) / K
10
+ """
11
+ total_recall = 0
12
+
13
+ for gt, pred in zip(ground_truth, predicted):
14
+ gt_set = set(gt)
15
+ pred_set = set(pred)
16
+ intersection_count = len(gt_set.intersection(pred_set))
17
+ total_recall += intersection_count / k
18
+
19
+ return (total_recall / len(ground_truth)) * 100
20
+
21
+
22
+ def generate_clustered_data(n_vectors, dim, n_clusters_real=100):
23
+ print(f"Generating {n_vectors} CLUSTERED vectors...")
24
+ topic_centers = np.random.random((n_clusters_real, dim)).astype(np.float32)
25
+ data = []
26
+ vectors_per_cluster = n_vectors // n_clusters_real
27
+
28
+ for i in range(n_clusters_real):
29
+ center = topic_centers[i]
30
+ noise = np.random.normal(scale=0.1, size=(vectors_per_cluster, dim))
31
+ cluster_points = center + noise
32
+ data.append(cluster_points)
33
+
34
+ data = np.vstack(data).astype(np.float32)
35
+ np.random.shuffle(data)
36
+ return data
37
+
38
+
39
+ def benchmark():
40
+ # ---------------------------------------------------------
41
+ # CONFIGURATION
42
+ # ---------------------------------------------------------
43
+ N_VECTORS = 100000
44
+ DIM = 128
45
+ K = 10
46
+ N_QUERIES = 100
47
+
48
+ # Annoy Param: Max items in a leaf before splitting
49
+ K_LEAF = 50
50
+
51
+ # We will test different Forest Sizes (Number of Trees)
52
+ # More Trees = Better Chance of finding neighbors = Slower Search
53
+ TREE_COUNTS = [1, 2, 5, 10, 50, 100, 200, 500]
54
+
55
+ # ---------------------------------------------------------
56
+ # 1. DATA GENERATION
57
+ # ---------------------------------------------------------
58
+ print(f"Generating {N_VECTORS} vectors (Dim: {DIM})...")
59
+ data = generate_clustered_data(N_VECTORS, DIM, n_clusters_real=500)
60
+ queries = generate_clustered_data(N_QUERIES, DIM, n_clusters_real=10)
61
+
62
+ # Use VegamDB to hold data and calculate ground truth
63
+ db = vegamdb.VegamDB()
64
+ print("Ingesting data into VegamDB...")
65
+ for i in range(N_VECTORS):
66
+ db.add_vector_numpy(data[i])
67
+
68
+ # ---------------------------------------------------------
69
+ # 2. ESTABLISH GROUND TRUTH (Brute Force)
70
+ # ---------------------------------------------------------
71
+ print(f"Calculating Ground Truth for {N_QUERIES} queries...")
72
+ start_flat = time.time()
73
+ ground_truth_results = []
74
+ for i in range(N_QUERIES):
75
+ res = db.search(queries[i], K)
76
+ ground_truth_results.append(res.ids)
77
+
78
+ time_flat = time.time() - start_flat
79
+ avg_flat_ms = (time_flat / N_QUERIES) * 1000
80
+ print(f"Avg Flat Search Latency: {avg_flat_ms:.4f} ms")
81
+
82
+ # ---------------------------------------------------------
83
+ # 3. ANNOY BENCHMARK LOOP
84
+ # ---------------------------------------------------------
85
+ print("\n--- ANNOY PERFORMANCE (Varying Tree Count) ---")
86
+ header = f"{'TREES':<6} | {'BUILD TIME':<10} | {'AVG LATENCY (ms)':<18} | {'SPEEDUP':<10} | {'ACCURACY (%)':<12}"
87
+ print("-" * len(header))
88
+ print(header)
89
+ print("-" * len(header))
90
+
91
+ # To test different tree counts, we must rebuild the index each time.
92
+ for n_trees in TREE_COUNTS:
93
+
94
+ # A. Build Index — use VegamDB's factory method
95
+ t0 = time.time()
96
+ db.use_annoy_index(num_trees=n_trees, k_leaf=K_LEAF)
97
+ db.build_index()
98
+ build_time = time.time() - t0
99
+
100
+ # B. Run Search
101
+ start_probe = time.time()
102
+ annoy_results = []
103
+ for i in range(N_QUERIES):
104
+ res = db.search(queries[i], K)
105
+ annoy_results.append(res.ids)
106
+
107
+ # C. Metrics
108
+ total_time = time.time() - start_probe
109
+ avg_time_ms = (total_time / N_QUERIES) * 1000
110
+ speedup = avg_flat_ms / avg_time_ms
111
+ accuracy = calculate_recall(ground_truth_results, annoy_results, K)
112
+
113
+ print(
114
+ f"{n_trees:<6} | {build_time:<10.2f} | {avg_time_ms:<18.4f} | {speedup:<10.1f} | {accuracy:<12.1f}"
115
+ )
116
+
117
+ print("-" * len(header))
118
+
119
+
120
+ if __name__ == "__main__":
121
+ benchmark()