vector-vault-db 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. vector_vault_db-0.1.1/.github/workflows/ci.yml +57 -0
  2. vector_vault_db-0.1.1/.github/workflows/wheels.yml +64 -0
  3. vector_vault_db-0.1.1/.gitignore +40 -0
  4. vector_vault_db-0.1.1/CMakeLists.txt +79 -0
  5. vector_vault_db-0.1.1/PKG-INFO +167 -0
  6. vector_vault_db-0.1.1/README.md +154 -0
  7. vector_vault_db-0.1.1/docs/concepts.md +115 -0
  8. vector_vault_db-0.1.1/docs/cpp-api.md +152 -0
  9. vector_vault_db-0.1.1/docs/index.md +32 -0
  10. vector_vault_db-0.1.1/docs/installation.md +61 -0
  11. vector_vault_db-0.1.1/docs/python-api.md +249 -0
  12. vector_vault_db-0.1.1/docs/quickstart.md +109 -0
  13. vector_vault_db-0.1.1/include/vectorvault/collection.hpp +225 -0
  14. vector_vault_db-0.1.1/include/vectorvault/crc64.hpp +20 -0
  15. vector_vault_db-0.1.1/include/vectorvault/distance.hpp +71 -0
  16. vector_vault_db-0.1.1/include/vectorvault/engine.hpp +123 -0
  17. vector_vault_db-0.1.1/include/vectorvault/error.hpp +142 -0
  18. vector_vault_db-0.1.1/include/vectorvault/hnsw_index.hpp +154 -0
  19. vector_vault_db-0.1.1/include/vectorvault/index.hpp +107 -0
  20. vector_vault_db-0.1.1/include/vectorvault/ivf_index.hpp +96 -0
  21. vector_vault_db-0.1.1/include/vectorvault/memory_allocator.hpp +139 -0
  22. vector_vault_db-0.1.1/include/vectorvault/mmap_region.hpp +69 -0
  23. vector_vault_db-0.1.1/include/vectorvault/persistence.hpp +60 -0
  24. vector_vault_db-0.1.1/include/vectorvault/snapshot_format.hpp +316 -0
  25. vector_vault_db-0.1.1/include/vectorvault/span.hpp +92 -0
  26. vector_vault_db-0.1.1/include/vectorvault/types.hpp +99 -0
  27. vector_vault_db-0.1.1/include/vectorvault/version.hpp +17 -0
  28. vector_vault_db-0.1.1/pyproject.toml +37 -0
  29. vector_vault_db-0.1.1/src/core/collection.cpp +454 -0
  30. vector_vault_db-0.1.1/src/core/crc64.cpp +51 -0
  31. vector_vault_db-0.1.1/src/core/distance.cpp +255 -0
  32. vector_vault_db-0.1.1/src/core/engine.cpp +195 -0
  33. vector_vault_db-0.1.1/src/core/error.cpp +30 -0
  34. vector_vault_db-0.1.1/src/core/hnsw_index.cpp +522 -0
  35. vector_vault_db-0.1.1/src/core/ivf_index.cpp +284 -0
  36. vector_vault_db-0.1.1/src/core/memory_allocator.cpp +204 -0
  37. vector_vault_db-0.1.1/src/core/mmap_region.cpp +150 -0
  38. vector_vault_db-0.1.1/src/core/persistence.cpp +628 -0
  39. vector_vault_db-0.1.1/src/core/version.cpp +13 -0
  40. vector_vault_db-0.1.1/src/python/module.cpp +570 -0
  41. vector_vault_db-0.1.1/src/python/vectorvault/__init__.py +71 -0
  42. vector_vault_db-0.1.1/tests/cpp/CMakeLists.txt +46 -0
  43. vector_vault_db-0.1.1/tests/cpp/distance_test.cpp +247 -0
  44. vector_vault_db-0.1.1/tests/cpp/end_to_end_test.cpp +159 -0
  45. vector_vault_db-0.1.1/tests/cpp/engine_lifecycle_test.cpp +282 -0
  46. vector_vault_db-0.1.1/tests/cpp/index_query_test.cpp +511 -0
  47. vector_vault_db-0.1.1/tests/cpp/memory_allocator_test.cpp +181 -0
  48. vector_vault_db-0.1.1/tests/cpp/persistence_test.cpp +551 -0
  49. vector_vault_db-0.1.1/tests/cpp/recall_dispatch_test.cpp +372 -0
  50. vector_vault_db-0.1.1/tests/cpp/record_store_test.cpp +485 -0
  51. vector_vault_db-0.1.1/tests/cpp/smoke_test.cpp +30 -0
  52. vector_vault_db-0.1.1/tests/python/test_batch_insert.py +150 -0
  53. vector_vault_db-0.1.1/tests/python/test_binding_properties.py +322 -0
  54. vector_vault_db-0.1.1/tests/python/test_binding_units.py +114 -0
  55. vector_vault_db-0.1.1/tests/python/test_end_to_end.py +163 -0
  56. vector_vault_db-0.1.1/tests/python/test_smoke.py +19 -0
@@ -0,0 +1,57 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ workflow_dispatch:
8
+
9
+ concurrency:
10
+ group: ci-${{ github.ref }}
11
+ cancel-in-progress: true
12
+
13
+ jobs:
14
+ cpp:
15
+ name: C++ core + tests
16
+ runs-on: ubuntu-latest
17
+ steps:
18
+ - uses: actions/checkout@v4
19
+
20
+ - name: Install toolchain
21
+ run: sudo apt-get update && sudo apt-get install -y g++ cmake ninja-build
22
+
23
+ - name: Configure
24
+ run: >
25
+ cmake -S . -B build -G Ninja
26
+ -DCMAKE_BUILD_TYPE=Release
27
+ -DVECTORVAULT_BUILD_TESTS=ON
28
+ -DVECTORVAULT_BUILD_PYTHON=OFF
29
+
30
+ - name: Build
31
+ run: cmake --build build
32
+
33
+ - name: Test
34
+ run: ctest --test-dir build --output-on-failure
35
+
36
+ python:
37
+ name: Python binding + tests
38
+ runs-on: ubuntu-latest
39
+ strategy:
40
+ fail-fast: false
41
+ matrix:
42
+ python-version: ["3.9", "3.10", "3.11", "3.12"]
43
+ steps:
44
+ - uses: actions/checkout@v4
45
+
46
+ - name: Install toolchain
47
+ run: sudo apt-get update && sudo apt-get install -y g++ cmake ninja-build
48
+
49
+ - uses: actions/setup-python@v5
50
+ with:
51
+ python-version: ${{ matrix.python-version }}
52
+
53
+ - name: Install package and test dependencies
54
+ run: pip install -e ".[test]"
55
+
56
+ - name: Test
57
+ run: pytest -q
@@ -0,0 +1,64 @@
1
+ name: Wheels
2
+
3
+ on:
4
+ workflow_dispatch:
5
+ push:
6
+ tags: ["v*"]
7
+
8
+ jobs:
9
+ wheels:
10
+ name: Wheels on ${{ matrix.os }}
11
+ runs-on: ${{ matrix.os }}
12
+ strategy:
13
+ fail-fast: false
14
+ matrix:
15
+ os: [ubuntu-latest, windows-latest, macos-latest]
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Build wheels
20
+ uses: pypa/cibuildwheel@v2.21
21
+
22
+ - uses: actions/upload-artifact@v4
23
+ with:
24
+ name: wheels-${{ matrix.os }}
25
+ path: wheelhouse/*.whl
26
+
27
+ sdist:
28
+ name: Source distribution
29
+ runs-on: ubuntu-latest
30
+ steps:
31
+ - uses: actions/checkout@v4
32
+
33
+ - name: Build sdist
34
+ run: pipx run build --sdist
35
+
36
+ - uses: actions/upload-artifact@v4
37
+ with:
38
+ name: sdist
39
+ path: dist/*.tar.gz
40
+
41
+ publish:
42
+ name: Publish to PyPI
43
+ needs: [wheels, sdist]
44
+ runs-on: ubuntu-latest
45
+ if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
46
+ environment: pypi
47
+ permissions:
48
+ id-token: write
49
+ steps:
50
+ - name: Download wheels
51
+ uses: actions/download-artifact@v4
52
+ with:
53
+ path: dist
54
+ pattern: wheels-*
55
+ merge-multiple: true
56
+
57
+ - name: Download sdist
58
+ uses: actions/download-artifact@v4
59
+ with:
60
+ name: sdist
61
+ path: dist
62
+
63
+ - name: Publish
64
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,40 @@
1
+ # Build output
2
+ /build/
3
+ /_skbuild/
4
+ /dist/
5
+ *.egg-info/
6
+ /wheelhouse/
7
+
8
+ # CMake / FetchContent caches
9
+ CMakeCache.txt
10
+ CMakeFiles/
11
+ cmake_install.cmake
12
+ CTestTestfile.cmake
13
+ Testing/
14
+ _deps/
15
+
16
+ # Compiled artifacts
17
+ *.o
18
+ *.obj
19
+ *.a
20
+ *.lib
21
+ *.so
22
+ *.dylib
23
+ *.pyd
24
+ *.dll
25
+
26
+ # Python
27
+ __pycache__/
28
+ *.py[cod]
29
+ .pytest_cache/
30
+ .hypothesis/
31
+ .venv/
32
+ venv/
33
+
34
+ # Editor/OS
35
+ .vscode/
36
+ .idea/
37
+ .DS_Store
38
+
39
+ # Tooling
40
+ .kiro/
@@ -0,0 +1,79 @@
1
+ cmake_minimum_required(VERSION 3.20)
2
+
3
+ project(VectorVaultDB
4
+ VERSION 0.1.1
5
+ DESCRIPTION "High-performance vector database with a C++ core and Python bindings"
6
+ LANGUAGES CXX)
7
+
8
+ # ---------------------------------------------------------------------------
9
+ # Global settings
10
+ # ---------------------------------------------------------------------------
11
+ set(CMAKE_CXX_STANDARD 17)
12
+ set(CMAKE_CXX_STANDARD_REQUIRED ON)
13
+ set(CMAKE_CXX_EXTENSIONS OFF)
14
+
15
+ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
16
+ set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
17
+ endif()
18
+
19
+ # Build options. The Python module is built by default (this is how
20
+ # scikit-build-core drives the build). C++ tests are opt-in so that wheel
21
+ # builds do not need to fetch the test dependencies.
22
+ option(VECTORVAULT_BUILD_PYTHON "Build the pybind11 Python extension" ON)
23
+ option(VECTORVAULT_BUILD_TESTS "Build the C++ test harness (Catch2 + RapidCheck)" OFF)
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # Core engine library (C++17)
27
+ # ---------------------------------------------------------------------------
28
+ add_library(vectorvault_core STATIC
29
+ src/core/version.cpp
30
+ src/core/error.cpp
31
+ src/core/memory_allocator.cpp
32
+ src/core/distance.cpp
33
+ src/core/collection.cpp
34
+ src/core/hnsw_index.cpp
35
+ src/core/ivf_index.cpp
36
+ src/core/crc64.cpp
37
+ src/core/persistence.cpp
38
+ src/core/mmap_region.cpp
39
+ src/core/engine.cpp)
40
+
41
+ target_include_directories(vectorvault_core
42
+ PUBLIC
43
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
44
+
45
+ set_target_properties(vectorvault_core PROPERTIES
46
+ POSITION_INDEPENDENT_CODE ON)
47
+
48
+ # ---------------------------------------------------------------------------
49
+ # Python extension (pybind11)
50
+ # ---------------------------------------------------------------------------
51
+ if(VECTORVAULT_BUILD_PYTHON)
52
+ find_package(pybind11 CONFIG REQUIRED)
53
+
54
+ pybind11_add_module(_vectorvault src/python/module.cpp)
55
+ target_link_libraries(_vectorvault PRIVATE vectorvault_core)
56
+
57
+ # On MinGW/MSYS2 the extension would otherwise depend on libstdc++,
58
+ # libgcc, and libwinpthread DLLs from the toolchain's bin directory.
59
+ # Link them statically so the .pyd is self-contained and importable by a
60
+ # stock CPython interpreter without the toolchain on PATH.
61
+ if(MINGW)
62
+ target_link_options(_vectorvault PRIVATE
63
+ -static-libgcc -static-libstdc++
64
+ -Wl,-Bstatic,--whole-archive -lwinpthread
65
+ -Wl,--no-whole-archive)
66
+ endif()
67
+
68
+ # Install the extension into the vectorvault package (scikit-build-core
69
+ # collects this together with the pure-Python package files).
70
+ install(TARGETS _vectorvault DESTINATION vectorvault)
71
+ endif()
72
+
73
+ # ---------------------------------------------------------------------------
74
+ # C++ test harness
75
+ # ---------------------------------------------------------------------------
76
+ if(VECTORVAULT_BUILD_TESTS)
77
+ enable_testing()
78
+ add_subdirectory(tests/cpp)
79
+ endif()
@@ -0,0 +1,167 @@
1
+ Metadata-Version: 2.2
2
+ Name: vector-vault-db
3
+ Version: 0.1.1
4
+ Summary: High-performance vector database with a C++ core and Python bindings
5
+ License: MIT
6
+ Requires-Python: >=3.9
7
+ Requires-Dist: numpy>=1.21
8
+ Provides-Extra: test
9
+ Requires-Dist: pytest>=7.0; extra == "test"
10
+ Requires-Dist: hypothesis>=6.0; extra == "test"
11
+ Requires-Dist: numpy>=1.21; extra == "test"
12
+ Description-Content-Type: text/markdown
13
+
14
+ # Vector-Vault-DB
15
+
16
+ A high-performance vector database written from scratch. The engine is C++17;
17
+ the public interface is a Python extension built with pybind11. There is no
18
+ dependency on an existing vector store — the index, distance kernels, allocator,
19
+ and on-disk format are all implemented directly.
20
+
21
+ ## Features
22
+
23
+ - **Approximate nearest-neighbor search** over float32 vectors using either an
24
+ HNSW graph or an IVF (inverted-file) index.
25
+ - **SIMD-accelerated distance kernels** (AVX-512) for Euclidean, cosine, and dot
26
+ product, with a scalar fallback selected once at startup via CPU detection.
27
+ - **Custom arena allocator** that hands out 64-byte aligned blocks for vector
28
+ storage and tracks per-collection usage.
29
+ - **Memory-mapped persistence**: a versioned, CRC-checked binary snapshot format.
30
+ Saves are atomic (temp file + fsync + rename); loads map the vector region
31
+ instead of copying it onto the heap.
32
+ - **Python bindings** with NumPy support and an exception hierarchy that mirrors
33
+ the engine's error categories.
34
+
35
+ ## Documentation
36
+
37
+ Full documentation lives in [`docs/`](docs/index.md):
38
+
39
+ - [Installation](docs/installation.md)
40
+ - [Quickstart](docs/quickstart.md)
41
+ - [Concepts](docs/concepts.md) — metrics, indexes, tuning, persistence
42
+ - [Python API reference](docs/python-api.md)
43
+ - [C++ API guide](docs/cpp-api.md)
44
+
45
+ ## Architecture
46
+
47
+ ```
48
+ Python (vectorvault) pybind11 extension, NumPy interop, exceptions
49
+
50
+
51
+ Engine collection registry, shared allocator + persistence
52
+
53
+
54
+ Collection record store, validation, query orchestration
55
+ ├── Index HNSW / IVF — graph + inverted-file ANN
56
+ ├── DistanceCalculator AVX-512 kernels with scalar fallback
57
+ ├── MemoryAllocator 64-byte aligned arena, per-collection accounting
58
+ └── PersistenceManager atomic save, mmap-backed load, CRC validation
59
+ ```
60
+
61
+ A collection is guarded by a single readers-writer lock: reads (get, query) take
62
+ a shared lock, mutations (insert, delete, build, save) take an exclusive lock, so
63
+ index membership stays consistent with the record store.
64
+
65
+ ## Requirements
66
+
67
+ - A C++17 compiler (GCC, Clang, or MSVC)
68
+ - CMake >= 3.20
69
+ - Python >= 3.9 with NumPy (for the bindings)
70
+ - pybind11 (resolved by the build backend)
71
+
72
+ Catch2, RapidCheck, and Hypothesis are fetched automatically for the test builds.
73
+
74
+ ## Install
75
+
76
+ ```bash
77
+ pip install -e ".[test]"
78
+ ```
79
+
80
+ This builds the native extension through scikit-build-core and installs the
81
+ `vectorvault` package in editable mode.
82
+
83
+ ## Quickstart
84
+
85
+ ```python
86
+ import vectorvault as vv
87
+
88
+ engine = vv.Engine()
89
+ coll = engine.create_collection("documents", dim=128, metric="cosine")
90
+
91
+ coll.insert("doc-1", embedding_a, metadata={"title": "intro"})
92
+ coll.insert("doc-2", embedding_b)
93
+
94
+ coll.build_index("hnsw", m=16, ef_construction=200)
95
+
96
+ results = coll.query(query_vector, k=10, ef_search=64)
97
+ for record_id, distance in results:
98
+ print(record_id, distance)
99
+
100
+ engine.save("documents", "documents.vv")
101
+ restored = vv.Engine().load("documents.vv")
102
+ ```
103
+
104
+ Metrics: `"euclidean"` (`"l2"`), `"cosine"`, `"dot"` (`"dot_product"`).
105
+ Index types: `"hnsw"`, `"ivf"`.
106
+
107
+ Errors surface as typed exceptions under `vectorvault.VectorVaultError`; several
108
+ also derive from a builtin (for example `NotFoundError` is a `KeyError` and
109
+ `SnapshotNotFoundError` is a `FileNotFoundError`).
110
+
111
+ ## On-disk format
112
+
113
+ A snapshot is a single file: a fixed header (magic, version, dimensionality,
114
+ metric, index type, region offsets, CRC-64 of the content), the collection name,
115
+ a record directory sorted by id with optional metadata, a 64-byte aligned vector
116
+ region, and the serialized index. Records, metadata keys, and index nodes are
117
+ written in a canonical order, so saving a collection, loading it, and saving
118
+ again produces a byte-identical file. Loads validate existence, magic, version,
119
+ and checksum before mapping the vector region.
120
+
121
+ ## Testing
122
+
123
+ C++ (Catch2 + RapidCheck):
124
+
125
+ ```bash
126
+ cmake -S . -B build -DVECTORVAULT_BUILD_TESTS=ON -DVECTORVAULT_BUILD_PYTHON=OFF
127
+ cmake --build build
128
+ ctest --test-dir build --output-on-failure
129
+ ```
130
+
131
+ Python (pytest + Hypothesis):
132
+
133
+ ```bash
134
+ pytest
135
+ ```
136
+
137
+ Correctness-critical behavior — distance accuracy against a reference, allocator
138
+ accounting, query ordering, index membership, and the snapshot round-trip — is
139
+ covered by property-based tests. A recall benchmark builds an index over 10,000
140
+ vectors and asserts mean recall@10 against an exact baseline.
141
+
142
+ ## Layout
143
+
144
+ ```
145
+ include/vectorvault/ Public C++ headers
146
+ src/core/ Core engine implementation
147
+ src/python/ pybind11 module + vectorvault package
148
+ tests/cpp/ C++ tests (Catch2 + RapidCheck)
149
+ tests/python/ Python tests (pytest + Hypothesis)
150
+ ```
151
+
152
+ ## Notes and limitations
153
+
154
+ - The HNSW index selects neighbours with the diversity heuristic from Malkov &
155
+ Yashunin (SELECT-NEIGHBORS-HEURISTIC), applied both when linking a new node and
156
+ when pruning an existing node whose adjacency list overflows. On uniform random
157
+ data this measures mean recall@10 of ~0.96 at dim 32 and ~0.85 at dim 64
158
+ (m=16, ef_construction=200, ef_search=50); recall still tapers on harder,
159
+ higher-dimensional distributions.
160
+ - The snapshot reader assumes a little-endian host (it reads float components
161
+ directly from the mapping).
162
+ - AVX-512 is detected at runtime; on hosts without it the scalar kernels are
163
+ used, which are also the accuracy reference.
164
+
165
+ ## License
166
+
167
+ MIT
@@ -0,0 +1,154 @@
1
+ # Vector-Vault-DB
2
+
3
+ A high-performance vector database written from scratch. The engine is C++17;
4
+ the public interface is a Python extension built with pybind11. There is no
5
+ dependency on an existing vector store — the index, distance kernels, allocator,
6
+ and on-disk format are all implemented directly.
7
+
8
+ ## Features
9
+
10
+ - **Approximate nearest-neighbor search** over float32 vectors using either an
11
+ HNSW graph or an IVF (inverted-file) index.
12
+ - **SIMD-accelerated distance kernels** (AVX-512) for Euclidean, cosine, and dot
13
+ product, with a scalar fallback selected once at startup via CPU detection.
14
+ - **Custom arena allocator** that hands out 64-byte aligned blocks for vector
15
+ storage and tracks per-collection usage.
16
+ - **Memory-mapped persistence**: a versioned, CRC-checked binary snapshot format.
17
+ Saves are atomic (temp file + fsync + rename); loads map the vector region
18
+ instead of copying it onto the heap.
19
+ - **Python bindings** with NumPy support and an exception hierarchy that mirrors
20
+ the engine's error categories.
21
+
22
+ ## Documentation
23
+
24
+ Full documentation lives in [`docs/`](docs/index.md):
25
+
26
+ - [Installation](docs/installation.md)
27
+ - [Quickstart](docs/quickstart.md)
28
+ - [Concepts](docs/concepts.md) — metrics, indexes, tuning, persistence
29
+ - [Python API reference](docs/python-api.md)
30
+ - [C++ API guide](docs/cpp-api.md)
31
+
32
+ ## Architecture
33
+
34
+ ```
35
+ Python (vectorvault) pybind11 extension, NumPy interop, exceptions
36
+
37
+
38
+ Engine collection registry, shared allocator + persistence
39
+
40
+
41
+ Collection record store, validation, query orchestration
42
+ ├── Index HNSW / IVF — graph + inverted-file ANN
43
+ ├── DistanceCalculator AVX-512 kernels with scalar fallback
44
+ ├── MemoryAllocator 64-byte aligned arena, per-collection accounting
45
+ └── PersistenceManager atomic save, mmap-backed load, CRC validation
46
+ ```
47
+
48
+ A collection is guarded by a single readers-writer lock: reads (get, query) take
49
+ a shared lock, mutations (insert, delete, build, save) take an exclusive lock, so
50
+ index membership stays consistent with the record store.
51
+
52
+ ## Requirements
53
+
54
+ - A C++17 compiler (GCC, Clang, or MSVC)
55
+ - CMake >= 3.20
56
+ - Python >= 3.9 with NumPy (for the bindings)
57
+ - pybind11 (resolved by the build backend)
58
+
59
+ Catch2, RapidCheck, and Hypothesis are fetched automatically for the test builds.
60
+
61
+ ## Install
62
+
63
+ ```bash
64
+ pip install -e ".[test]"
65
+ ```
66
+
67
+ This builds the native extension through scikit-build-core and installs the
68
+ `vectorvault` package in editable mode.
69
+
70
+ ## Quickstart
71
+
72
+ ```python
73
+ import vectorvault as vv
74
+
75
+ engine = vv.Engine()
76
+ coll = engine.create_collection("documents", dim=128, metric="cosine")
77
+
78
+ coll.insert("doc-1", embedding_a, metadata={"title": "intro"})
79
+ coll.insert("doc-2", embedding_b)
80
+
81
+ coll.build_index("hnsw", m=16, ef_construction=200)
82
+
83
+ results = coll.query(query_vector, k=10, ef_search=64)
84
+ for record_id, distance in results:
85
+ print(record_id, distance)
86
+
87
+ engine.save("documents", "documents.vv")
88
+ restored = vv.Engine().load("documents.vv")
89
+ ```
90
+
91
+ Metrics: `"euclidean"` (`"l2"`), `"cosine"`, `"dot"` (`"dot_product"`).
92
+ Index types: `"hnsw"`, `"ivf"`.
93
+
94
+ Errors surface as typed exceptions under `vectorvault.VectorVaultError`; several
95
+ also derive from a builtin (for example `NotFoundError` is a `KeyError` and
96
+ `SnapshotNotFoundError` is a `FileNotFoundError`).
97
+
98
+ ## On-disk format
99
+
100
+ A snapshot is a single file: a fixed header (magic, version, dimensionality,
101
+ metric, index type, region offsets, CRC-64 of the content), the collection name,
102
+ a record directory sorted by id with optional metadata, a 64-byte aligned vector
103
+ region, and the serialized index. Records, metadata keys, and index nodes are
104
+ written in a canonical order, so saving a collection, loading it, and saving
105
+ again produces a byte-identical file. Loads validate existence, magic, version,
106
+ and checksum before mapping the vector region.
107
+
108
+ ## Testing
109
+
110
+ C++ (Catch2 + RapidCheck):
111
+
112
+ ```bash
113
+ cmake -S . -B build -DVECTORVAULT_BUILD_TESTS=ON -DVECTORVAULT_BUILD_PYTHON=OFF
114
+ cmake --build build
115
+ ctest --test-dir build --output-on-failure
116
+ ```
117
+
118
+ Python (pytest + Hypothesis):
119
+
120
+ ```bash
121
+ pytest
122
+ ```
123
+
124
+ Correctness-critical behavior — distance accuracy against a reference, allocator
125
+ accounting, query ordering, index membership, and the snapshot round-trip — is
126
+ covered by property-based tests. A recall benchmark builds an index over 10,000
127
+ vectors and asserts mean recall@10 against an exact baseline.
128
+
129
+ ## Layout
130
+
131
+ ```
132
+ include/vectorvault/ Public C++ headers
133
+ src/core/ Core engine implementation
134
+ src/python/ pybind11 module + vectorvault package
135
+ tests/cpp/ C++ tests (Catch2 + RapidCheck)
136
+ tests/python/ Python tests (pytest + Hypothesis)
137
+ ```
138
+
139
+ ## Notes and limitations
140
+
141
+ - The HNSW index selects neighbours with the diversity heuristic from Malkov &
142
+ Yashunin (SELECT-NEIGHBORS-HEURISTIC), applied both when linking a new node and
143
+ when pruning an existing node whose adjacency list overflows. On uniform random
144
+ data this measures mean recall@10 of ~0.96 at dim 32 and ~0.85 at dim 64
145
+ (m=16, ef_construction=200, ef_search=50); recall still tapers on harder,
146
+ higher-dimensional distributions.
147
+ - The snapshot reader assumes a little-endian host (it reads float components
148
+ directly from the mapping).
149
+ - AVX-512 is detected at runtime; on hosts without it the scalar kernels are
150
+ used, which are also the accuracy reference.
151
+
152
+ ## License
153
+
154
+ MIT
@@ -0,0 +1,115 @@
1
+ # Concepts
2
+
3
+ ## Collections
4
+
5
+ A collection is a named container for vectors that share a fixed
6
+ **dimensionality** and a single **distance metric**. Both are set at creation
7
+ and immutable thereafter. Each record is a unique string id, a float32 vector,
8
+ and optional metadata.
9
+
10
+ Constraints enforced at the boundary:
11
+
12
+ - Collection name: 1–255 characters, unique within an engine.
13
+ - Dimensionality: an integer in `[1, 65536]`.
14
+ - Record id: non-empty string.
15
+ - Vector: length equals the collection dimensionality; all components finite
16
+ (no NaN or infinity).
17
+ - Metadata: a map of string keys to str / int / float / bool values.
18
+
19
+ ## Distance metrics
20
+
21
+ | Metric | String | Definition | Ordering |
22
+ |---|---|---|---|
23
+ | Euclidean (L2) | `"euclidean"`, `"l2"` | √(Σ(aᵢ−bᵢ)²) | nearest = smallest |
24
+ | Cosine | `"cosine"` | 1 − (a·b)/(‖a‖‖b‖), in [0, 2] | nearest = smallest |
25
+ | Dot product | `"dot"`, `"dot_product"` | Σ aᵢbᵢ | see note below |
26
+
27
+ Query results are always ordered by **ascending metric value**, with ties
28
+ broken by ascending id for determinism.
29
+
30
+ - For **Euclidean** and **Cosine**, a smaller value means more similar, so
31
+ ascending order returns the nearest neighbors first — the expected behavior.
32
+ - For **Dot product**, the raw dot product is treated as the distance and sorted
33
+ ascending, so the largest dot products come *last*. If you want
34
+ maximum-inner-product semantics (largest dot first), use cosine on normalized
35
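+ vectors, or negate the query vector (the reported values are then negated dot products). Reversing the result list yourself only works when `k` covers every record, because a top-`k` query returns the `k` smallest dot products.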
36
+
37
+ Cosine distance is undefined for a zero-magnitude vector; computing it raises
38
+ `UndefinedDistanceError`.
39
+
40
+ ## Indexes
41
+
42
+ An index makes nearest-neighbor search sublinear. With no index, queries fall
43
+ back to an exact brute-force scan, which is correct but O(n) per query.
44
+
45
+ ### HNSW (Hierarchical Navigable Small World)
46
+
47
+ A layered proximity graph. Search descends through sparse upper layers to a good
48
+ entry point, then explores the base layer. Neighbor selection uses a diversity
49
+ heuristic so the graph stays navigable.
50
+
51
+ Build parameters:
52
+
53
+ - `m` (default 16): max neighbors per node on upper layers (base layer keeps
54
+ `2*m`). Higher `m` improves recall at the cost of more memory. Must be ≥ 2.
55
+ - `ef_construction` (default 200): build-time search breadth. Higher values build
56
+ a better graph more slowly. Must be ≥ 1.
57
+
58
+ Query parameter:
59
+
60
+ - `ef_search` (default 50): search breadth. Higher values trade speed for recall.
61
+ Must be ≥ 1.
62
+
63
+ ### IVF (Inverted File)
64
+
65
+ Partitions vectors into `nlist` cells via k-means. A query scans only the
66
+ `nprobe` cells nearest to it.
67
+
68
+ Build parameter:
69
+
70
+ - `nlist` (default 0 = auto, ≈ √n): number of partitions. When set explicitly, must be in
71
+ `[1, record_count]`.
72
+
73
+ Query parameter:
74
+
75
+ - `nprobe` (default 1): partitions scanned per query. Higher values trade speed
76
+ for recall. Must be in `[1, nlist]`.
77
+
78
+ ### Choosing an index
79
+
80
+ - **HNSW** generally gives higher recall at a given speed and is a good default.
81
+ - **IVF** has lower memory overhead and a smaller build cost; recall is tuned
82
+ primarily through `nprobe`.
83
+
84
+ Deletions are handled by tombstoning: a removed record is excluded from results
85
+ immediately and the surrounding structure is preserved.
86
+
87
+ ## Persistence
88
+
89
+ `engine.save(name, path)` writes a single binary snapshot containing the
90
+ collection metadata, all records (with metadata), and the index. The format is:
91
+
92
+ - A fixed header: magic, format version, dimensionality, metric, index type,
93
+ region offsets, and a CRC-64 of the content.
94
+ - The collection name.
95
+ - A record directory sorted by id, each with optional metadata.
96
+ - A 64-byte aligned vector region.
97
+ - The serialized index (if one was built).
98
+
99
+ Records, metadata keys, and index nodes are written in a canonical order, so
100
+ saving a collection, loading it, and saving again produces a **byte-identical**
101
+ file.
102
+
103
+ `engine.load(path)` validates the file in order — existence and size, magic and
104
+ header, format version, then content checksum — before mapping the vector region
105
+ into memory. Vector data is read through the mapping rather than copied onto the
106
+ heap. Failures raise `SnapshotNotFoundError`, `UnsupportedVersionError`, or
107
+ `CorruptionError`.
108
+
109
+ ## Performance notes
110
+
111
+ - Distance kernels use AVX-512 when the CPU supports it and fall back to scalar
112
+ code otherwise; both accumulate in double precision for accuracy.
113
+ - Vector storage is 64-byte aligned to suit SIMD access.
114
+ - A collection is guarded by a readers-writer lock: concurrent reads are allowed;
115
+ mutations are serialized.