vector-vault-db 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vector_vault_db-0.1.1/.github/workflows/ci.yml +57 -0
- vector_vault_db-0.1.1/.github/workflows/wheels.yml +64 -0
- vector_vault_db-0.1.1/.gitignore +40 -0
- vector_vault_db-0.1.1/CMakeLists.txt +79 -0
- vector_vault_db-0.1.1/PKG-INFO +167 -0
- vector_vault_db-0.1.1/README.md +154 -0
- vector_vault_db-0.1.1/docs/concepts.md +115 -0
- vector_vault_db-0.1.1/docs/cpp-api.md +152 -0
- vector_vault_db-0.1.1/docs/index.md +32 -0
- vector_vault_db-0.1.1/docs/installation.md +61 -0
- vector_vault_db-0.1.1/docs/python-api.md +249 -0
- vector_vault_db-0.1.1/docs/quickstart.md +109 -0
- vector_vault_db-0.1.1/include/vectorvault/collection.hpp +225 -0
- vector_vault_db-0.1.1/include/vectorvault/crc64.hpp +20 -0
- vector_vault_db-0.1.1/include/vectorvault/distance.hpp +71 -0
- vector_vault_db-0.1.1/include/vectorvault/engine.hpp +123 -0
- vector_vault_db-0.1.1/include/vectorvault/error.hpp +142 -0
- vector_vault_db-0.1.1/include/vectorvault/hnsw_index.hpp +154 -0
- vector_vault_db-0.1.1/include/vectorvault/index.hpp +107 -0
- vector_vault_db-0.1.1/include/vectorvault/ivf_index.hpp +96 -0
- vector_vault_db-0.1.1/include/vectorvault/memory_allocator.hpp +139 -0
- vector_vault_db-0.1.1/include/vectorvault/mmap_region.hpp +69 -0
- vector_vault_db-0.1.1/include/vectorvault/persistence.hpp +60 -0
- vector_vault_db-0.1.1/include/vectorvault/snapshot_format.hpp +316 -0
- vector_vault_db-0.1.1/include/vectorvault/span.hpp +92 -0
- vector_vault_db-0.1.1/include/vectorvault/types.hpp +99 -0
- vector_vault_db-0.1.1/include/vectorvault/version.hpp +17 -0
- vector_vault_db-0.1.1/pyproject.toml +37 -0
- vector_vault_db-0.1.1/src/core/collection.cpp +454 -0
- vector_vault_db-0.1.1/src/core/crc64.cpp +51 -0
- vector_vault_db-0.1.1/src/core/distance.cpp +255 -0
- vector_vault_db-0.1.1/src/core/engine.cpp +195 -0
- vector_vault_db-0.1.1/src/core/error.cpp +30 -0
- vector_vault_db-0.1.1/src/core/hnsw_index.cpp +522 -0
- vector_vault_db-0.1.1/src/core/ivf_index.cpp +284 -0
- vector_vault_db-0.1.1/src/core/memory_allocator.cpp +204 -0
- vector_vault_db-0.1.1/src/core/mmap_region.cpp +150 -0
- vector_vault_db-0.1.1/src/core/persistence.cpp +628 -0
- vector_vault_db-0.1.1/src/core/version.cpp +13 -0
- vector_vault_db-0.1.1/src/python/module.cpp +570 -0
- vector_vault_db-0.1.1/src/python/vectorvault/__init__.py +71 -0
- vector_vault_db-0.1.1/tests/cpp/CMakeLists.txt +46 -0
- vector_vault_db-0.1.1/tests/cpp/distance_test.cpp +247 -0
- vector_vault_db-0.1.1/tests/cpp/end_to_end_test.cpp +159 -0
- vector_vault_db-0.1.1/tests/cpp/engine_lifecycle_test.cpp +282 -0
- vector_vault_db-0.1.1/tests/cpp/index_query_test.cpp +511 -0
- vector_vault_db-0.1.1/tests/cpp/memory_allocator_test.cpp +181 -0
- vector_vault_db-0.1.1/tests/cpp/persistence_test.cpp +551 -0
- vector_vault_db-0.1.1/tests/cpp/recall_dispatch_test.cpp +372 -0
- vector_vault_db-0.1.1/tests/cpp/record_store_test.cpp +485 -0
- vector_vault_db-0.1.1/tests/cpp/smoke_test.cpp +30 -0
- vector_vault_db-0.1.1/tests/python/test_batch_insert.py +150 -0
- vector_vault_db-0.1.1/tests/python/test_binding_properties.py +322 -0
- vector_vault_db-0.1.1/tests/python/test_binding_units.py +114 -0
- vector_vault_db-0.1.1/tests/python/test_end_to_end.py +163 -0
- vector_vault_db-0.1.1/tests/python/test_smoke.py +19 -0
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
workflow_dispatch:
|
|
8
|
+
|
|
9
|
+
concurrency:
|
|
10
|
+
group: ci-${{ github.ref }}
|
|
11
|
+
cancel-in-progress: true
|
|
12
|
+
|
|
13
|
+
jobs:
|
|
14
|
+
cpp:
|
|
15
|
+
name: C++ core + tests
|
|
16
|
+
runs-on: ubuntu-latest
|
|
17
|
+
steps:
|
|
18
|
+
- uses: actions/checkout@v4
|
|
19
|
+
|
|
20
|
+
- name: Install toolchain
|
|
21
|
+
run: sudo apt-get update && sudo apt-get install -y g++ cmake ninja-build
|
|
22
|
+
|
|
23
|
+
- name: Configure
|
|
24
|
+
run: >
|
|
25
|
+
cmake -S . -B build -G Ninja
|
|
26
|
+
-DCMAKE_BUILD_TYPE=Release
|
|
27
|
+
-DVECTORVAULT_BUILD_TESTS=ON
|
|
28
|
+
-DVECTORVAULT_BUILD_PYTHON=OFF
|
|
29
|
+
|
|
30
|
+
- name: Build
|
|
31
|
+
run: cmake --build build
|
|
32
|
+
|
|
33
|
+
- name: Test
|
|
34
|
+
run: ctest --test-dir build --output-on-failure
|
|
35
|
+
|
|
36
|
+
python:
|
|
37
|
+
name: Python binding + tests
|
|
38
|
+
runs-on: ubuntu-latest
|
|
39
|
+
strategy:
|
|
40
|
+
fail-fast: false
|
|
41
|
+
matrix:
|
|
42
|
+
python-version: ["3.9", "3.10", "3.11", "3.12"]
|
|
43
|
+
steps:
|
|
44
|
+
- uses: actions/checkout@v4
|
|
45
|
+
|
|
46
|
+
- name: Install toolchain
|
|
47
|
+
run: sudo apt-get update && sudo apt-get install -y g++ cmake ninja-build
|
|
48
|
+
|
|
49
|
+
- uses: actions/setup-python@v5
|
|
50
|
+
with:
|
|
51
|
+
python-version: ${{ matrix.python-version }}
|
|
52
|
+
|
|
53
|
+
- name: Install package and test dependencies
|
|
54
|
+
run: pip install -e ".[test]"
|
|
55
|
+
|
|
56
|
+
- name: Test
|
|
57
|
+
run: pytest -q
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
name: Wheels
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
workflow_dispatch:
|
|
5
|
+
push:
|
|
6
|
+
tags: ["v*"]
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
wheels:
|
|
10
|
+
name: Wheels on ${{ matrix.os }}
|
|
11
|
+
runs-on: ${{ matrix.os }}
|
|
12
|
+
strategy:
|
|
13
|
+
fail-fast: false
|
|
14
|
+
matrix:
|
|
15
|
+
os: [ubuntu-latest, windows-latest, macos-latest]
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Build wheels
|
|
20
|
+
uses: pypa/cibuildwheel@v2.21
|
|
21
|
+
|
|
22
|
+
- uses: actions/upload-artifact@v4
|
|
23
|
+
with:
|
|
24
|
+
name: wheels-${{ matrix.os }}
|
|
25
|
+
path: wheelhouse/*.whl
|
|
26
|
+
|
|
27
|
+
sdist:
|
|
28
|
+
name: Source distribution
|
|
29
|
+
runs-on: ubuntu-latest
|
|
30
|
+
steps:
|
|
31
|
+
- uses: actions/checkout@v4
|
|
32
|
+
|
|
33
|
+
- name: Build sdist
|
|
34
|
+
run: pipx run build --sdist
|
|
35
|
+
|
|
36
|
+
- uses: actions/upload-artifact@v4
|
|
37
|
+
with:
|
|
38
|
+
name: sdist
|
|
39
|
+
path: dist/*.tar.gz
|
|
40
|
+
|
|
41
|
+
publish:
|
|
42
|
+
name: Publish to PyPI
|
|
43
|
+
needs: [wheels, sdist]
|
|
44
|
+
runs-on: ubuntu-latest
|
|
45
|
+
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
|
|
46
|
+
environment: pypi
|
|
47
|
+
permissions:
|
|
48
|
+
id-token: write
|
|
49
|
+
steps:
|
|
50
|
+
- name: Download wheels
|
|
51
|
+
uses: actions/download-artifact@v4
|
|
52
|
+
with:
|
|
53
|
+
path: dist
|
|
54
|
+
pattern: wheels-*
|
|
55
|
+
merge-multiple: true
|
|
56
|
+
|
|
57
|
+
- name: Download sdist
|
|
58
|
+
uses: actions/download-artifact@v4
|
|
59
|
+
with:
|
|
60
|
+
name: sdist
|
|
61
|
+
path: dist
|
|
62
|
+
|
|
63
|
+
- name: Publish
|
|
64
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# Build output
|
|
2
|
+
/build/
|
|
3
|
+
/_skbuild/
|
|
4
|
+
/dist/
|
|
5
|
+
*.egg-info/
|
|
6
|
+
/wheelhouse/
|
|
7
|
+
|
|
8
|
+
# CMake / FetchContent caches
|
|
9
|
+
CMakeCache.txt
|
|
10
|
+
CMakeFiles/
|
|
11
|
+
cmake_install.cmake
|
|
12
|
+
CTestTestfile.cmake
|
|
13
|
+
Testing/
|
|
14
|
+
_deps/
|
|
15
|
+
|
|
16
|
+
# Compiled artifacts
|
|
17
|
+
*.o
|
|
18
|
+
*.obj
|
|
19
|
+
*.a
|
|
20
|
+
*.lib
|
|
21
|
+
*.so
|
|
22
|
+
*.dylib
|
|
23
|
+
*.pyd
|
|
24
|
+
*.dll
|
|
25
|
+
|
|
26
|
+
# Python
|
|
27
|
+
__pycache__/
|
|
28
|
+
*.py[cod]
|
|
29
|
+
.pytest_cache/
|
|
30
|
+
.hypothesis/
|
|
31
|
+
.venv/
|
|
32
|
+
venv/
|
|
33
|
+
|
|
34
|
+
# Editor/OS
|
|
35
|
+
.vscode/
|
|
36
|
+
.idea/
|
|
37
|
+
.DS_Store
|
|
38
|
+
|
|
39
|
+
# Tooling
|
|
40
|
+
.kiro/
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
cmake_minimum_required(VERSION 3.20)
|
|
2
|
+
|
|
3
|
+
project(VectorVaultDB
|
|
4
|
+
VERSION 0.1.1
|
|
5
|
+
DESCRIPTION "High-performance vector database with a C++ core and Python bindings"
|
|
6
|
+
LANGUAGES CXX)
|
|
7
|
+
|
|
8
|
+
# ---------------------------------------------------------------------------
|
|
9
|
+
# Global settings
|
|
10
|
+
# ---------------------------------------------------------------------------
|
|
11
|
+
set(CMAKE_CXX_STANDARD 17)
|
|
12
|
+
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
|
13
|
+
set(CMAKE_CXX_EXTENSIONS OFF)
|
|
14
|
+
|
|
15
|
+
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
|
|
16
|
+
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
|
|
17
|
+
endif()
|
|
18
|
+
|
|
19
|
+
# Build options. The Python module is built by default (this is how
|
|
20
|
+
# scikit-build-core drives the build). C++ tests are opt-in so that wheel
|
|
21
|
+
# builds do not need to fetch the test dependencies.
|
|
22
|
+
option(VECTORVAULT_BUILD_PYTHON "Build the pybind11 Python extension" ON)
|
|
23
|
+
option(VECTORVAULT_BUILD_TESTS "Build the C++ test harness (Catch2 + RapidCheck)" OFF)
|
|
24
|
+
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
# Core engine library (C++17)
|
|
27
|
+
# ---------------------------------------------------------------------------
|
|
28
|
+
add_library(vectorvault_core STATIC
|
|
29
|
+
src/core/version.cpp
|
|
30
|
+
src/core/error.cpp
|
|
31
|
+
src/core/memory_allocator.cpp
|
|
32
|
+
src/core/distance.cpp
|
|
33
|
+
src/core/collection.cpp
|
|
34
|
+
src/core/hnsw_index.cpp
|
|
35
|
+
src/core/ivf_index.cpp
|
|
36
|
+
src/core/crc64.cpp
|
|
37
|
+
src/core/persistence.cpp
|
|
38
|
+
src/core/mmap_region.cpp
|
|
39
|
+
src/core/engine.cpp)
|
|
40
|
+
|
|
41
|
+
target_include_directories(vectorvault_core
|
|
42
|
+
PUBLIC
|
|
43
|
+
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
|
|
44
|
+
|
|
45
|
+
set_target_properties(vectorvault_core PROPERTIES
|
|
46
|
+
POSITION_INDEPENDENT_CODE ON)
|
|
47
|
+
|
|
48
|
+
# ---------------------------------------------------------------------------
|
|
49
|
+
# Python extension (pybind11)
|
|
50
|
+
# ---------------------------------------------------------------------------
|
|
51
|
+
if(VECTORVAULT_BUILD_PYTHON)
|
|
52
|
+
find_package(pybind11 CONFIG REQUIRED)
|
|
53
|
+
|
|
54
|
+
pybind11_add_module(_vectorvault src/python/module.cpp)
|
|
55
|
+
target_link_libraries(_vectorvault PRIVATE vectorvault_core)
|
|
56
|
+
|
|
57
|
+
# On MinGW/MSYS2 the extension would otherwise depend on libstdc++,
|
|
58
|
+
# libgcc, and libwinpthread DLLs from the toolchain's bin directory.
|
|
59
|
+
# Link them statically so the .pyd is self-contained and importable by a
|
|
60
|
+
# stock CPython interpreter without the toolchain on PATH.
|
|
61
|
+
if(MINGW)
|
|
62
|
+
target_link_options(_vectorvault PRIVATE
|
|
63
|
+
-static-libgcc -static-libstdc++
|
|
64
|
+
-Wl,-Bstatic,--whole-archive -lwinpthread
|
|
65
|
+
-Wl,--no-whole-archive)
|
|
66
|
+
endif()
|
|
67
|
+
|
|
68
|
+
# Install the extension into the vectorvault package (scikit-build-core
|
|
69
|
+
# collects this together with the pure-Python package files).
|
|
70
|
+
install(TARGETS _vectorvault DESTINATION vectorvault)
|
|
71
|
+
endif()
|
|
72
|
+
|
|
73
|
+
# ---------------------------------------------------------------------------
|
|
74
|
+
# C++ test harness
|
|
75
|
+
# ---------------------------------------------------------------------------
|
|
76
|
+
if(VECTORVAULT_BUILD_TESTS)
|
|
77
|
+
enable_testing()
|
|
78
|
+
add_subdirectory(tests/cpp)
|
|
79
|
+
endif()
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: vector-vault-db
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: High-performance vector database with a C++ core and Python bindings
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.9
|
|
7
|
+
Requires-Dist: numpy>=1.21
|
|
8
|
+
Provides-Extra: test
|
|
9
|
+
Requires-Dist: pytest>=7.0; extra == "test"
|
|
10
|
+
Requires-Dist: hypothesis>=6.0; extra == "test"
|
|
11
|
+
Requires-Dist: numpy>=1.21; extra == "test"
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# Vector-Vault-DB
|
|
15
|
+
|
|
16
|
+
A high-performance vector database written from scratch. The engine is C++17;
|
|
17
|
+
the public interface is a Python extension built with pybind11. There is no
|
|
18
|
+
dependency on an existing vector store — the index, distance kernels, allocator,
|
|
19
|
+
and on-disk format are all implemented directly.
|
|
20
|
+
|
|
21
|
+
## Features
|
|
22
|
+
|
|
23
|
+
- **Approximate nearest-neighbor search** over float32 vectors using either an
|
|
24
|
+
HNSW graph or an IVF (inverted-file) index.
|
|
25
|
+
- **SIMD-accelerated distance kernels** (AVX-512) for Euclidean, cosine, and dot
|
|
26
|
+
product, with a scalar fallback selected once at startup via CPU detection.
|
|
27
|
+
- **Custom arena allocator** that hands out 64-byte aligned blocks for vector
|
|
28
|
+
storage and tracks per-collection usage.
|
|
29
|
+
- **Memory-mapped persistence**: a versioned, CRC-checked binary snapshot format.
|
|
30
|
+
Saves are atomic (temp file + fsync + rename); loads map the vector region
|
|
31
|
+
instead of copying it onto the heap.
|
|
32
|
+
- **Python bindings** with NumPy support and an exception hierarchy that mirrors
|
|
33
|
+
the engine's error categories.
|
|
34
|
+
|
|
35
|
+
## Documentation
|
|
36
|
+
|
|
37
|
+
Full documentation lives in [`docs/`](docs/index.md):
|
|
38
|
+
|
|
39
|
+
- [Installation](docs/installation.md)
|
|
40
|
+
- [Quickstart](docs/quickstart.md)
|
|
41
|
+
- [Concepts](docs/concepts.md) — metrics, indexes, tuning, persistence
|
|
42
|
+
- [Python API reference](docs/python-api.md)
|
|
43
|
+
- [C++ API guide](docs/cpp-api.md)
|
|
44
|
+
|
|
45
|
+
## Architecture
|
|
46
|
+
|
|
47
|
+
```
|
|
48
|
+
Python (vectorvault) pybind11 extension, NumPy interop, exceptions
|
|
49
|
+
│
|
|
50
|
+
▼
|
|
51
|
+
Engine collection registry, shared allocator + persistence
|
|
52
|
+
│
|
|
53
|
+
▼
|
|
54
|
+
Collection record store, validation, query orchestration
|
|
55
|
+
├── Index HNSW / IVF — graph + inverted-file ANN
|
|
56
|
+
├── DistanceCalculator AVX-512 kernels with scalar fallback
|
|
57
|
+
├── MemoryAllocator 64-byte aligned arena, per-collection accounting
|
|
58
|
+
└── PersistenceManager atomic save, mmap-backed load, CRC validation
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
A collection is guarded by a single readers-writer lock: reads (get, query) take
|
|
62
|
+
a shared lock, mutations (insert, delete, build, save) take an exclusive lock, so
|
|
63
|
+
index membership stays consistent with the record store.
|
|
64
|
+
|
|
65
|
+
## Requirements
|
|
66
|
+
|
|
67
|
+
- A C++17 compiler (GCC, Clang, or MSVC)
|
|
68
|
+
- CMake >= 3.20
|
|
69
|
+
- Python >= 3.9 with NumPy (for the bindings)
|
|
70
|
+
- pybind11 (resolved by the build backend)
|
|
71
|
+
|
|
72
|
+
Catch2 and RapidCheck are fetched automatically for the C++ test build; Hypothesis is installed by the `test` extra.
|
|
73
|
+
|
|
74
|
+
## Install
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
pip install -e ".[test]"
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
This builds the native extension through scikit-build-core and installs the
|
|
81
|
+
`vectorvault` package in editable mode.
|
|
82
|
+
|
|
83
|
+
## Quickstart
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
import vectorvault as vv
|
|
87
|
+
|
|
88
|
+
engine = vv.Engine()
|
|
89
|
+
coll = engine.create_collection("documents", dim=128, metric="cosine")
|
|
90
|
+
|
|
91
|
+
coll.insert("doc-1", embedding_a, metadata={"title": "intro"})
|
|
92
|
+
coll.insert("doc-2", embedding_b)
|
|
93
|
+
|
|
94
|
+
coll.build_index("hnsw", m=16, ef_construction=200)
|
|
95
|
+
|
|
96
|
+
results = coll.query(query_vector, k=10, ef_search=64)
|
|
97
|
+
for record_id, distance in results:
|
|
98
|
+
print(record_id, distance)
|
|
99
|
+
|
|
100
|
+
engine.save("documents", "documents.vv")
|
|
101
|
+
restored = vv.Engine().load("documents.vv")
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Metrics: `"euclidean"` (`"l2"`), `"cosine"`, `"dot"` (`"dot_product"`).
|
|
105
|
+
Index types: `"hnsw"`, `"ivf"`.
|
|
106
|
+
|
|
107
|
+
Errors surface as typed exceptions under `vectorvault.VectorVaultError`; several
|
|
108
|
+
also derive from a builtin (for example `NotFoundError` is a `KeyError` and
|
|
109
|
+
`SnapshotNotFoundError` is a `FileNotFoundError`).
|
|
110
|
+
|
|
111
|
+
## On-disk format
|
|
112
|
+
|
|
113
|
+
A snapshot is a single file: a fixed header (magic, version, dimensionality,
|
|
114
|
+
metric, index type, region offsets, CRC-64 of the content), the collection name,
|
|
115
|
+
a record directory sorted by id with optional metadata, a 64-byte aligned vector
|
|
116
|
+
region, and the serialized index. Records, metadata keys, and index nodes are
|
|
117
|
+
written in a canonical order, so saving a collection, loading it, and saving
|
|
118
|
+
again produces a byte-identical file. Loads validate existence, magic, version,
|
|
119
|
+
and checksum before mapping the vector region.
|
|
120
|
+
|
|
121
|
+
## Testing
|
|
122
|
+
|
|
123
|
+
C++ (Catch2 + RapidCheck):
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
cmake -S . -B build -DVECTORVAULT_BUILD_TESTS=ON -DVECTORVAULT_BUILD_PYTHON=OFF
|
|
127
|
+
cmake --build build
|
|
128
|
+
ctest --test-dir build --output-on-failure
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
Python (pytest + Hypothesis):
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
pytest
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
Correctness-critical behavior — distance accuracy against a reference, allocator
|
|
138
|
+
accounting, query ordering, index membership, and the snapshot round-trip — is
|
|
139
|
+
covered by property-based tests. A recall benchmark builds an index over 10,000
|
|
140
|
+
vectors and asserts mean recall@10 against an exact baseline.
|
|
141
|
+
|
|
142
|
+
## Layout
|
|
143
|
+
|
|
144
|
+
```
|
|
145
|
+
include/vectorvault/ Public C++ headers
|
|
146
|
+
src/core/ Core engine implementation
|
|
147
|
+
src/python/ pybind11 module + vectorvault package
|
|
148
|
+
tests/cpp/ C++ tests (Catch2 + RapidCheck)
|
|
149
|
+
tests/python/ Python tests (pytest + Hypothesis)
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
## Notes and limitations
|
|
153
|
+
|
|
154
|
+
- The HNSW index selects neighbours with the diversity heuristic from Malkov &
|
|
155
|
+
Yashunin (SELECT-NEIGHBORS-HEURISTIC), applied both when linking a new node and
|
|
156
|
+
when pruning an existing node whose adjacency list overflows. On uniform random
|
|
157
|
+
data this measures mean recall@10 of ~0.96 at dim 32 and ~0.85 at dim 64
|
|
158
|
+
(m=16, ef_construction=200, ef_search=50); recall still tapers on harder,
|
|
159
|
+
higher-dimensional distributions.
|
|
160
|
+
- The snapshot reader assumes a little-endian host (it reads float components
|
|
161
|
+
directly from the mapping).
|
|
162
|
+
- AVX-512 is detected at runtime; on hosts without it the scalar kernels are
|
|
163
|
+
used, which are also the accuracy reference.
|
|
164
|
+
|
|
165
|
+
## License
|
|
166
|
+
|
|
167
|
+
MIT
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
# Vector-Vault-DB
|
|
2
|
+
|
|
3
|
+
A high-performance vector database written from scratch. The engine is C++17;
|
|
4
|
+
the public interface is a Python extension built with pybind11. There is no
|
|
5
|
+
dependency on an existing vector store — the index, distance kernels, allocator,
|
|
6
|
+
and on-disk format are all implemented directly.
|
|
7
|
+
|
|
8
|
+
## Features
|
|
9
|
+
|
|
10
|
+
- **Approximate nearest-neighbor search** over float32 vectors using either an
|
|
11
|
+
HNSW graph or an IVF (inverted-file) index.
|
|
12
|
+
- **SIMD-accelerated distance kernels** (AVX-512) for Euclidean, cosine, and dot
|
|
13
|
+
product, with a scalar fallback selected once at startup via CPU detection.
|
|
14
|
+
- **Custom arena allocator** that hands out 64-byte aligned blocks for vector
|
|
15
|
+
storage and tracks per-collection usage.
|
|
16
|
+
- **Memory-mapped persistence**: a versioned, CRC-checked binary snapshot format.
|
|
17
|
+
Saves are atomic (temp file + fsync + rename); loads map the vector region
|
|
18
|
+
instead of copying it onto the heap.
|
|
19
|
+
- **Python bindings** with NumPy support and an exception hierarchy that mirrors
|
|
20
|
+
the engine's error categories.
|
|
21
|
+
|
|
22
|
+
## Documentation
|
|
23
|
+
|
|
24
|
+
Full documentation lives in [`docs/`](docs/index.md):
|
|
25
|
+
|
|
26
|
+
- [Installation](docs/installation.md)
|
|
27
|
+
- [Quickstart](docs/quickstart.md)
|
|
28
|
+
- [Concepts](docs/concepts.md) — metrics, indexes, tuning, persistence
|
|
29
|
+
- [Python API reference](docs/python-api.md)
|
|
30
|
+
- [C++ API guide](docs/cpp-api.md)
|
|
31
|
+
|
|
32
|
+
## Architecture
|
|
33
|
+
|
|
34
|
+
```
|
|
35
|
+
Python (vectorvault) pybind11 extension, NumPy interop, exceptions
|
|
36
|
+
│
|
|
37
|
+
▼
|
|
38
|
+
Engine collection registry, shared allocator + persistence
|
|
39
|
+
│
|
|
40
|
+
▼
|
|
41
|
+
Collection record store, validation, query orchestration
|
|
42
|
+
├── Index HNSW / IVF — graph + inverted-file ANN
|
|
43
|
+
├── DistanceCalculator AVX-512 kernels with scalar fallback
|
|
44
|
+
├── MemoryAllocator 64-byte aligned arena, per-collection accounting
|
|
45
|
+
└── PersistenceManager atomic save, mmap-backed load, CRC validation
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
A collection is guarded by a single readers-writer lock: reads (get, query) take
|
|
49
|
+
a shared lock, mutations (insert, delete, build, save) take an exclusive lock, so
|
|
50
|
+
index membership stays consistent with the record store.
|
|
51
|
+
|
|
52
|
+
## Requirements
|
|
53
|
+
|
|
54
|
+
- A C++17 compiler (GCC, Clang, or MSVC)
|
|
55
|
+
- CMake >= 3.20
|
|
56
|
+
- Python >= 3.9 with NumPy (for the bindings)
|
|
57
|
+
- pybind11 (resolved by the build backend)
|
|
58
|
+
|
|
59
|
+
Catch2 and RapidCheck are fetched automatically for the C++ test build; Hypothesis is installed by the `test` extra.
|
|
60
|
+
|
|
61
|
+
## Install
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
pip install -e ".[test]"
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
This builds the native extension through scikit-build-core and installs the
|
|
68
|
+
`vectorvault` package in editable mode.
|
|
69
|
+
|
|
70
|
+
## Quickstart
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
import vectorvault as vv
|
|
74
|
+
|
|
75
|
+
engine = vv.Engine()
|
|
76
|
+
coll = engine.create_collection("documents", dim=128, metric="cosine")
|
|
77
|
+
|
|
78
|
+
coll.insert("doc-1", embedding_a, metadata={"title": "intro"})
|
|
79
|
+
coll.insert("doc-2", embedding_b)
|
|
80
|
+
|
|
81
|
+
coll.build_index("hnsw", m=16, ef_construction=200)
|
|
82
|
+
|
|
83
|
+
results = coll.query(query_vector, k=10, ef_search=64)
|
|
84
|
+
for record_id, distance in results:
|
|
85
|
+
print(record_id, distance)
|
|
86
|
+
|
|
87
|
+
engine.save("documents", "documents.vv")
|
|
88
|
+
restored = vv.Engine().load("documents.vv")
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
Metrics: `"euclidean"` (`"l2"`), `"cosine"`, `"dot"` (`"dot_product"`).
|
|
92
|
+
Index types: `"hnsw"`, `"ivf"`.
|
|
93
|
+
|
|
94
|
+
Errors surface as typed exceptions under `vectorvault.VectorVaultError`; several
|
|
95
|
+
also derive from a builtin (for example `NotFoundError` is a `KeyError` and
|
|
96
|
+
`SnapshotNotFoundError` is a `FileNotFoundError`).
|
|
97
|
+
|
|
98
|
+
## On-disk format
|
|
99
|
+
|
|
100
|
+
A snapshot is a single file: a fixed header (magic, version, dimensionality,
|
|
101
|
+
metric, index type, region offsets, CRC-64 of the content), the collection name,
|
|
102
|
+
a record directory sorted by id with optional metadata, a 64-byte aligned vector
|
|
103
|
+
region, and the serialized index. Records, metadata keys, and index nodes are
|
|
104
|
+
written in a canonical order, so saving a collection, loading it, and saving
|
|
105
|
+
again produces a byte-identical file. Loads validate existence, magic, version,
|
|
106
|
+
and checksum before mapping the vector region.
|
|
107
|
+
|
|
108
|
+
## Testing
|
|
109
|
+
|
|
110
|
+
C++ (Catch2 + RapidCheck):
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
cmake -S . -B build -DVECTORVAULT_BUILD_TESTS=ON -DVECTORVAULT_BUILD_PYTHON=OFF
|
|
114
|
+
cmake --build build
|
|
115
|
+
ctest --test-dir build --output-on-failure
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
Python (pytest + Hypothesis):
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
pytest
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
Correctness-critical behavior — distance accuracy against a reference, allocator
|
|
125
|
+
accounting, query ordering, index membership, and the snapshot round-trip — is
|
|
126
|
+
covered by property-based tests. A recall benchmark builds an index over 10,000
|
|
127
|
+
vectors and asserts mean recall@10 against an exact baseline.
|
|
128
|
+
|
|
129
|
+
## Layout
|
|
130
|
+
|
|
131
|
+
```
|
|
132
|
+
include/vectorvault/ Public C++ headers
|
|
133
|
+
src/core/ Core engine implementation
|
|
134
|
+
src/python/ pybind11 module + vectorvault package
|
|
135
|
+
tests/cpp/ C++ tests (Catch2 + RapidCheck)
|
|
136
|
+
tests/python/ Python tests (pytest + Hypothesis)
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
## Notes and limitations
|
|
140
|
+
|
|
141
|
+
- The HNSW index selects neighbours with the diversity heuristic from Malkov &
|
|
142
|
+
Yashunin (SELECT-NEIGHBORS-HEURISTIC), applied both when linking a new node and
|
|
143
|
+
when pruning an existing node whose adjacency list overflows. On uniform random
|
|
144
|
+
data this measures mean recall@10 of ~0.96 at dim 32 and ~0.85 at dim 64
|
|
145
|
+
(m=16, ef_construction=200, ef_search=50); recall still tapers on harder,
|
|
146
|
+
higher-dimensional distributions.
|
|
147
|
+
- The snapshot reader assumes a little-endian host (it reads float components
|
|
148
|
+
directly from the mapping).
|
|
149
|
+
- AVX-512 is detected at runtime; on hosts without it the scalar kernels are
|
|
150
|
+
used, which are also the accuracy reference.
|
|
151
|
+
|
|
152
|
+
## License
|
|
153
|
+
|
|
154
|
+
MIT
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# Concepts
|
|
2
|
+
|
|
3
|
+
## Collections
|
|
4
|
+
|
|
5
|
+
A collection is a named container for vectors that share a fixed
|
|
6
|
+
**dimensionality** and a single **distance metric**. Both are set at creation
|
|
7
|
+
and immutable thereafter. Each record is a unique string id, a float32 vector,
|
|
8
|
+
and optional metadata.
|
|
9
|
+
|
|
10
|
+
Constraints enforced at the boundary:
|
|
11
|
+
|
|
12
|
+
- Collection name: 1–255 characters, unique within an engine.
|
|
13
|
+
- Dimensionality: an integer in `[1, 65536]`.
|
|
14
|
+
- Record id: non-empty string.
|
|
15
|
+
- Vector: length equals the collection dimensionality; all components finite
|
|
16
|
+
(no NaN or infinity).
|
|
17
|
+
- Metadata: a map of string keys to str / int / float / bool values.
|
|
18
|
+
|
|
19
|
+
## Distance metrics
|
|
20
|
+
|
|
21
|
+
| Metric | String | Definition | Ordering |
|
|
22
|
+
|---|---|---|---|
|
|
23
|
+
| Euclidean (L2) | `"euclidean"`, `"l2"` | √Σ(aᵢ−bᵢ)² | nearest = smallest |
|
|
24
|
+
| Cosine | `"cosine"` | 1 − (a·b)/(‖a‖‖b‖), in [0, 2] | nearest = smallest |
|
|
25
|
+
| Dot product | `"dot"`, `"dot_product"` | Σ aᵢbᵢ | see note below |
|
|
26
|
+
|
|
27
|
+
Query results are always ordered by **ascending metric value**, with ties
|
|
28
|
+
broken by ascending id for determinism.
|
|
29
|
+
|
|
30
|
+
- For **Euclidean** and **Cosine**, a smaller value means more similar, so
|
|
31
|
+
ascending order returns the nearest neighbors first — the expected behavior.
|
|
32
|
+
- For **Dot product**, the raw dot product is treated as the distance and sorted
|
|
33
|
+
ascending, so the largest dot products come *last*. If you want
|
|
34
|
+
maximum-inner-product semantics (largest dot first), use cosine on normalized
|
|
35
|
+
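vectors, or reverse the result list yourself — note that a top-`k` result holds the `k` *smallest* dot products, so request every record before reversing.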
|
|
36
|
+
|
|
37
|
+
Cosine distance is undefined for a zero-magnitude vector; computing it raises
|
|
38
|
+
`UndefinedDistanceError`.
|
|
39
|
+
|
|
40
|
+
## Indexes
|
|
41
|
+
|
|
42
|
+
An index makes nearest-neighbor search sublinear. With no index, queries fall
|
|
43
|
+
back to an exact brute-force scan, which is correct but O(n) per query.
|
|
44
|
+
|
|
45
|
+
### HNSW (Hierarchical Navigable Small World)
|
|
46
|
+
|
|
47
|
+
A layered proximity graph. Search descends through sparse upper layers to a good
|
|
48
|
+
entry point, then explores the base layer. Neighbor selection uses a diversity
|
|
49
|
+
heuristic so the graph stays navigable.
|
|
50
|
+
|
|
51
|
+
Build parameters:
|
|
52
|
+
|
|
53
|
+
- `m` (default 16): max neighbors per node on upper layers (base layer keeps
|
|
54
|
+
`2*m`). Higher `m` improves recall at the cost of more memory. Must be ≥ 2.
|
|
55
|
+
- `ef_construction` (default 200): build-time search breadth. Higher values build
|
|
56
|
+
a better graph more slowly. Must be ≥ 1.
|
|
57
|
+
|
|
58
|
+
Query parameter:
|
|
59
|
+
|
|
60
|
+
- `ef_search` (default 50): search breadth. Higher values trade speed for recall.
|
|
61
|
+
Must be ≥ 1.
|
|
62
|
+
|
|
63
|
+
### IVF (Inverted File)
|
|
64
|
+
|
|
65
|
+
Partitions vectors into `nlist` cells via k-means. A query scans only the
|
|
66
|
+
`nprobe` cells nearest to it.
|
|
67
|
+
|
|
68
|
+
Build parameter:
|
|
69
|
+
|
|
70
|
+
- `nlist` (default 0 = auto, ≈ √n): number of partitions. Must be in
|
|
71
|
+
`[1, record_count]` when set explicitly.
|
|
72
|
+
|
|
73
|
+
Query parameter:
|
|
74
|
+
|
|
75
|
+
- `nprobe` (default 1): partitions scanned per query. Higher values trade speed
|
|
76
|
+
for recall. Must be in `[1, nlist]`.
|
|
77
|
+
|
|
78
|
+
### Choosing an index
|
|
79
|
+
|
|
80
|
+
- **HNSW** generally gives higher recall at a given speed and is a good default.
|
|
81
|
+
- **IVF** has lower memory overhead and a smaller build cost; recall is tuned
|
|
82
|
+
primarily through `nprobe`.
|
|
83
|
+
|
|
84
|
+
Deletions are handled by tombstoning: a removed record is excluded from results
|
|
85
|
+
immediately and the surrounding structure is preserved.
|
|
86
|
+
|
|
87
|
+
## Persistence
|
|
88
|
+
|
|
89
|
+
`engine.save(name, path)` writes a single binary snapshot containing the
|
|
90
|
+
collection metadata, all records (with metadata), and the index. The format is:
|
|
91
|
+
|
|
92
|
+
- A fixed header: magic, format version, dimensionality, metric, index type,
|
|
93
|
+
region offsets, and a CRC-64 of the content.
|
|
94
|
+
- The collection name.
|
|
95
|
+
- A record directory sorted by id, each with optional metadata.
|
|
96
|
+
- A 64-byte aligned vector region.
|
|
97
|
+
- The serialized index (if one was built).
|
|
98
|
+
|
|
99
|
+
Records, metadata keys, and index nodes are written in a canonical order, so
|
|
100
|
+
saving a collection, loading it, and saving again produces a **byte-identical**
|
|
101
|
+
file.
|
|
102
|
+
|
|
103
|
+
`engine.load(path)` validates the file in order — existence and size, magic and
|
|
104
|
+
header, format version, then content checksum — before mapping the vector region
|
|
105
|
+
into memory. Vector data is read through the mapping rather than copied onto the
|
|
106
|
+
heap. Failures raise `SnapshotNotFoundError`, `UnsupportedVersionError`, or
|
|
107
|
+
`CorruptionError`.
|
|
108
|
+
|
|
109
|
+
## Performance notes
|
|
110
|
+
|
|
111
|
+
- Distance kernels use AVX-512 when the CPU supports it and fall back to scalar
|
|
112
|
+
code otherwise; both accumulate in double precision for accuracy.
|
|
113
|
+
- Vector storage is 64-byte aligned to suit SIMD access.
|
|
114
|
+
- A collection is guarded by a readers-writer lock: concurrent reads are allowed;
|
|
115
|
+
mutations are serialized.
|