vectlite 0.9.3 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +42 -3
- package/index.d.ts +75 -0
- package/index.js +90 -2
- package/native/Cargo.toml +1 -1
- package/native/src/lib.rs +209 -11
- package/native/vectlite-core/Cargo.toml +1 -1
- package/native/vectlite-core/src/lib.rs +1179 -43
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/vectlite.node +0 -0
- package/prebuilds/darwin-x64/vectlite.node +0 -0
- package/prebuilds/linux-x64-gnu/vectlite.node +0 -0
- package/prebuilds/win32-x64-msvc/vectlite.node +0 -0
|
@@ -35,6 +35,10 @@ const ANN_OVERSAMPLE: usize = 8;
|
|
|
35
35
|
const ANN_MIN_CANDIDATES: usize = 64;
|
|
36
36
|
const ANN_M: usize = 16;
|
|
37
37
|
const ANN_EF_CONSTRUCTION: usize = 200;
|
|
38
|
+
/// Dataset-size threshold above which HNSW construction switches to the
/// Rayon-based parallel batch insert.
///
/// Smaller builds stay on the sequential insert path because spinning up
/// worker threads costs more than the parallelism saves.
const ANN_PARALLEL_INSERT_THRESHOLD: usize = 256;
|
|
38
42
|
const BM25_K1: f32 = 1.2;
|
|
39
43
|
const BM25_B: f32 = 0.75;
|
|
40
44
|
|
|
@@ -754,6 +758,151 @@ pub struct SearchOptions {
|
|
|
754
758
|
pub truncate_dim: Option<usize>,
|
|
755
759
|
}
|
|
756
760
|
|
|
761
|
+
/// Tunable HNSW parameters that let callers balance recall, query latency,
/// memory use and build time.
///
/// The [`Default`] values reproduce VectLite's historical built-in settings
/// (`m = 16`, `ef_construction = 200`). Leaving `ef_search` as `None` lets
/// VectLite derive the query-time width from `top_k * ANN_OVERSAMPLE`.
///
/// Reference: Malkov & Yashunin, *Efficient and robust approximate nearest
/// neighbor search using Hierarchical Navigable Small World graphs*.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct IndexConfig {
    /// Maximum number of bidirectional links per node. Larger values improve
    /// recall but cost memory and build time. Typical range: 8..64.
    pub m: usize,
    /// Search width used while the graph is being constructed. Larger values
    /// improve recall but slow the build. Typical range: 64..800.
    pub ef_construction: usize,
    /// Search width used at query time; `None` means "auto" (derived from
    /// `top_k`). Larger values improve recall but slow each search.
    pub ef_search: Option<usize>,
    /// Dataset size from which HNSW insertion goes through the parallel
    /// (Rayon-backed) path. Defaults to `ANN_PARALLEL_INSERT_THRESHOLD`; use
    /// a very large value to effectively disable parallel insert.
    pub parallel_insert_threshold: usize,
    /// Percentage (0..=100) of tombstoned nodes at which the HNSW graph is
    /// meant to be rebuilt during `compact()`.
    ///
    /// Deleting a record does not remove its node from the HNSW graph (the
    /// underlying library has no such operation); the node is only marked
    /// dead and filtered out of search results. As dead nodes accumulate,
    /// recall and latency degrade, hence the rebuild. The default is `30`;
    /// `100` is documented as "never rebuild automatically".
    ///
    /// NOTE: `compact_inner` currently rebuilds whenever *any* tombstone
    /// exists, so this value is validated and stored but not yet consulted.
    pub tombstone_rebuild_pct: u8,
}
|
|
794
|
+
|
|
795
|
+
impl Default for IndexConfig {
|
|
796
|
+
fn default() -> Self {
|
|
797
|
+
Self {
|
|
798
|
+
m: ANN_M,
|
|
799
|
+
ef_construction: ANN_EF_CONSTRUCTION,
|
|
800
|
+
ef_search: None,
|
|
801
|
+
parallel_insert_threshold: ANN_PARALLEL_INSERT_THRESHOLD,
|
|
802
|
+
tombstone_rebuild_pct: 30,
|
|
803
|
+
}
|
|
804
|
+
}
|
|
805
|
+
}
|
|
806
|
+
|
|
807
|
+
impl IndexConfig {
|
|
808
|
+
/// A preset tuned for higher recall at the cost of build/search time.
|
|
809
|
+
/// Useful for benchmark comparisons where recall@10 must approach 1.0.
|
|
810
|
+
pub fn high_recall() -> Self {
|
|
811
|
+
Self {
|
|
812
|
+
m: 32,
|
|
813
|
+
ef_construction: 400,
|
|
814
|
+
ef_search: Some(200),
|
|
815
|
+
parallel_insert_threshold: ANN_PARALLEL_INSERT_THRESHOLD,
|
|
816
|
+
tombstone_rebuild_pct: 30,
|
|
817
|
+
}
|
|
818
|
+
}
|
|
819
|
+
|
|
820
|
+
/// A preset tuned for fast build & low latency, lower recall.
|
|
821
|
+
pub fn fast() -> Self {
|
|
822
|
+
Self {
|
|
823
|
+
m: 8,
|
|
824
|
+
ef_construction: 100,
|
|
825
|
+
ef_search: Some(40),
|
|
826
|
+
parallel_insert_threshold: ANN_PARALLEL_INSERT_THRESHOLD,
|
|
827
|
+
tombstone_rebuild_pct: 30,
|
|
828
|
+
}
|
|
829
|
+
}
|
|
830
|
+
|
|
831
|
+
fn validate(&self) -> Result<()> {
|
|
832
|
+
if self.m == 0 {
|
|
833
|
+
return Err(VectLiteError::InvalidFormat(
|
|
834
|
+
"IndexConfig.m must be >= 1".to_owned(),
|
|
835
|
+
));
|
|
836
|
+
}
|
|
837
|
+
if self.ef_construction == 0 {
|
|
838
|
+
return Err(VectLiteError::InvalidFormat(
|
|
839
|
+
"IndexConfig.ef_construction must be >= 1".to_owned(),
|
|
840
|
+
));
|
|
841
|
+
}
|
|
842
|
+
if let Some(ef) = self.ef_search {
|
|
843
|
+
if ef == 0 {
|
|
844
|
+
return Err(VectLiteError::InvalidFormat(
|
|
845
|
+
"IndexConfig.ef_search must be >= 1 when set".to_owned(),
|
|
846
|
+
));
|
|
847
|
+
}
|
|
848
|
+
}
|
|
849
|
+
if self.tombstone_rebuild_pct > 100 {
|
|
850
|
+
return Err(VectLiteError::InvalidFormat(
|
|
851
|
+
"IndexConfig.tombstone_rebuild_pct must be in 0..=100".to_owned(),
|
|
852
|
+
));
|
|
853
|
+
}
|
|
854
|
+
Ok(())
|
|
855
|
+
}
|
|
856
|
+
}
|
|
857
|
+
|
|
858
|
+
/// Policy deciding when the WAL file is `fsync`'d to disk.
///
/// The default, [`WalSyncMode::PerOp`], makes every record durable
/// individually. On macOS APFS — and to a lesser extent on Linux ext4 —
/// that `fsync` dominates the cost of a single `insert`, so relaxing the
/// policy can raise ingestion throughput by 5–10×. The price is that some
/// recently-acknowledged records may be lost on an unclean shutdown.
///
/// Regardless of the mode, the WAL is *always* fully synced on `flush()`,
/// `compact()` and `close()`, so anything that survives a clean shutdown is
/// durable. The exposure window is therefore:
/// - `EveryN(n)`: at most the last `n - 1` inserts since the last fsync.
/// - `OnFlush`: every insert since the last `flush()` / `compact()`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum WalSyncMode {
    /// `fsync` after every WAL append: strongest durability, slowest
    /// ingestion. This is the default and matches pre-0.11 behaviour.
    PerOp,
    /// `fsync` once every `n` operations. A crash can lose up to the last
    /// `n - 1` operations since the previous sync. A sensible middle ground
    /// for streams of many small records: choose `n` so the worst-case loss
    /// is acceptable (e.g. `64` ≈ a fraction of a second of data).
    EveryN(usize),
    /// No `fsync` on the per-operation path; syncing happens only at
    /// `flush()` / `compact()` / `close()`. Highest throughput, weakest
    /// durability — suited to bulk ingestion of data that can be
    /// regenerated.
    OnFlush,
}
|
|
886
|
+
|
|
887
|
+
impl Default for WalSyncMode {
|
|
888
|
+
fn default() -> Self {
|
|
889
|
+
WalSyncMode::PerOp
|
|
890
|
+
}
|
|
891
|
+
}
|
|
892
|
+
|
|
893
|
+
impl WalSyncMode {
|
|
894
|
+
fn validate(self) -> Result<()> {
|
|
895
|
+
if let WalSyncMode::EveryN(n) = self {
|
|
896
|
+
if n == 0 {
|
|
897
|
+
return Err(VectLiteError::InvalidFormat(
|
|
898
|
+
"WalSyncMode::EveryN must be >= 1".to_owned(),
|
|
899
|
+
));
|
|
900
|
+
}
|
|
901
|
+
}
|
|
902
|
+
Ok(())
|
|
903
|
+
}
|
|
904
|
+
}
|
|
905
|
+
|
|
757
906
|
impl Default for SearchOptions {
|
|
758
907
|
fn default() -> Self {
|
|
759
908
|
Self {
|
|
@@ -1214,6 +1363,29 @@ pub struct Database {
|
|
|
1214
1363
|
/// Holds the lock file open for the lifetime of the database.
|
|
1215
1364
|
/// Dropping this releases the advisory lock.
|
|
1216
1365
|
_lock_file: Option<File>,
|
|
1366
|
+
/// Cached WAL writer: avoids paying the open() syscall on every insert.
|
|
1367
|
+
/// Reset whenever the WAL is rotated (compact, clear_wal).
|
|
1368
|
+
wal_writer: Option<BufWriter<File>>,
|
|
1369
|
+
/// Controls when `fsync` is issued against the WAL — see [`WalSyncMode`].
|
|
1370
|
+
wal_sync_mode: WalSyncMode,
|
|
1371
|
+
/// Number of ops appended to the WAL since the last fsync. Used by the
|
|
1372
|
+
/// `EveryN` sync mode to decide when to flush+sync.
|
|
1373
|
+
wal_ops_since_sync: usize,
|
|
1374
|
+
/// True if the in-memory ANN graph(s) have unsaved changes (incremental
|
|
1375
|
+
/// inserts, fresh build, or a full rebuild) that have not been written
|
|
1376
|
+
/// out via `persist_ann_to_disk`. Set on every mutation in
|
|
1377
|
+
/// `apply_wal_batch` / `bulk_ingest` and cleared by `compact_inner` or
|
|
1378
|
+
/// an explicit `persist_ann_to_disk`.
|
|
1379
|
+
ann_dirty: bool,
|
|
1380
|
+
/// True if the quantized PQ index needs to be rebuilt at the next flush
|
|
1381
|
+
/// (because records have been inserted/deleted since the last rebuild).
|
|
1382
|
+
/// While dirty, the in-memory `quantized` field is set to `None` so
|
|
1383
|
+
/// searches transparently fall back to the HNSW path instead of
|
|
1384
|
+
/// returning candidates from a stale codebook.
|
|
1385
|
+
quantized_dirty: bool,
|
|
1386
|
+
/// Same as `quantized_dirty`, but for multi-vector (ColBERT-style)
|
|
1387
|
+
/// quantization spaces. Lazy rebuild happens at flush time.
|
|
1388
|
+
multi_vector_quantized_dirty: bool,
|
|
1217
1389
|
/// Optional quantized index for accelerated search.
|
|
1218
1390
|
quantized: Option<QuantizedIndex>,
|
|
1219
1391
|
/// Configuration used to build the quantized index (persisted).
|
|
@@ -1230,6 +1402,91 @@ pub struct Database {
|
|
|
1230
1402
|
payload_index_defs: BTreeMap<String, PayloadIndexType>,
|
|
1231
1403
|
/// Live payload indexes, populated from records.
|
|
1232
1404
|
payload_indexes: BTreeMap<String, PayloadIndexData>,
|
|
1405
|
+
/// HNSW tuning parameters. Not persisted to disk: this is a per-session
|
|
1406
|
+
/// knob so callers can change recall/latency tradeoffs without migrating
|
|
1407
|
+
/// data files. A subsequent `set_index_config` triggers a rebuild.
|
|
1408
|
+
index_config: IndexConfig,
|
|
1409
|
+
/// Contiguous f32 mirror of the default dense vector for every record.
|
|
1410
|
+
/// Used by brute-force / rescoring scans for cache-friendly SIMD.
|
|
1411
|
+
/// `None` when the arena hasn't been materialised yet for this session.
|
|
1412
|
+
vector_arena: Option<VectorArena>,
|
|
1413
|
+
/// When true, `vector_arena` is stale (e.g. a delete happened) and must
|
|
1414
|
+
/// be rebuilt before use.
|
|
1415
|
+
vector_arena_dirty: bool,
|
|
1416
|
+
}
|
|
1417
|
+
|
|
1418
|
+
/// Contiguous-storage mirror of the default dense vector per record.
|
|
1419
|
+
///
|
|
1420
|
+
/// In the original layout each `Record.vector` is a separately-allocated
|
|
1421
|
+
/// `Vec<f32>` and the records themselves live in `BTreeMap` nodes, so a
|
|
1422
|
+
/// brute-force or rescoring scan pays two pointer hops per record AND
|
|
1423
|
+
/// touches one cache line per vector — terrible for SIMD throughput.
|
|
1424
|
+
///
|
|
1425
|
+
/// This arena stores every vector in a single flat `buf: Vec<f32>` so a scan
|
|
1426
|
+
/// is a straight contiguous walk (one cache miss per ~16 vectors, vs ~2 per
|
|
1427
|
+
/// vector). Lance / Arrow use the same trick — see the v0.11 CHANGELOG note.
|
|
1428
|
+
///
|
|
1429
|
+
/// The arena is maintained incrementally on insert; deletes are too
|
|
1430
|
+
/// expensive to compact in place (would shift O(N) f32s) so they just mark
|
|
1431
|
+
/// the arena dirty and force a lazy full rebuild on next use.
|
|
1432
|
+
struct VectorArena {
|
|
1433
|
+
buf: Vec<f32>,
|
|
1434
|
+
keys: Vec<RecordKey>,
|
|
1435
|
+
key_to_index: HashMap<RecordKey, usize>,
|
|
1436
|
+
dim: usize,
|
|
1437
|
+
}
|
|
1438
|
+
|
|
1439
|
+
impl VectorArena {
|
|
1440
|
+
fn new(dim: usize) -> Self {
|
|
1441
|
+
Self {
|
|
1442
|
+
buf: Vec::new(),
|
|
1443
|
+
keys: Vec::new(),
|
|
1444
|
+
key_to_index: HashMap::new(),
|
|
1445
|
+
dim,
|
|
1446
|
+
}
|
|
1447
|
+
}
|
|
1448
|
+
|
|
1449
|
+
fn append(&mut self, key: RecordKey, vector: &[f32]) {
|
|
1450
|
+
// Defensive: ignore mismatched dims rather than panicking — this is
|
|
1451
|
+
// a perf cache, not the source of truth.
|
|
1452
|
+
if vector.len() != self.dim {
|
|
1453
|
+
return;
|
|
1454
|
+
}
|
|
1455
|
+
let idx = self.keys.len();
|
|
1456
|
+
self.buf.extend_from_slice(vector);
|
|
1457
|
+
self.key_to_index.insert(key.clone(), idx);
|
|
1458
|
+
self.keys.push(key);
|
|
1459
|
+
}
|
|
1460
|
+
|
|
1461
|
+
/// Rebuild from records in BTreeMap order. Called lazily when the arena
|
|
1462
|
+
/// is dirty (i.e. after a delete or a full ANN rebuild).
|
|
1463
|
+
fn rebuild_from(records: &BTreeMap<RecordKey, Record>, dim: usize) -> Self {
|
|
1464
|
+
let mut arena = Self::new(dim);
|
|
1465
|
+
arena.buf.reserve(records.len() * dim);
|
|
1466
|
+
arena.keys.reserve(records.len());
|
|
1467
|
+
arena.key_to_index.reserve(records.len());
|
|
1468
|
+
for (key, record) in records {
|
|
1469
|
+
if record.vector.len() == dim {
|
|
1470
|
+
arena.append(key.clone(), &record.vector);
|
|
1471
|
+
}
|
|
1472
|
+
}
|
|
1473
|
+
arena
|
|
1474
|
+
}
|
|
1475
|
+
|
|
1476
|
+
/// Iterator yielding `(key, vector_slice)` pairs. The slice references
|
|
1477
|
+
/// the contiguous `buf`, so consumers get cache-friendly SIMD scans.
|
|
1478
|
+
#[allow(dead_code)]
|
|
1479
|
+
fn iter(&self) -> impl Iterator<Item = (&RecordKey, &[f32])> {
|
|
1480
|
+
let dim = self.dim;
|
|
1481
|
+
self.keys.iter().enumerate().map(move |(i, k)| {
|
|
1482
|
+
let start = i * dim;
|
|
1483
|
+
(k, &self.buf[start..start + dim])
|
|
1484
|
+
})
|
|
1485
|
+
}
|
|
1486
|
+
|
|
1487
|
+
fn len(&self) -> usize {
|
|
1488
|
+
self.keys.len()
|
|
1489
|
+
}
|
|
1233
1490
|
}
|
|
1234
1491
|
|
|
1235
1492
|
#[derive(Default)]
|
|
@@ -1255,6 +1512,42 @@ impl AnnHnsw {
|
|
|
1255
1512
|
}
|
|
1256
1513
|
}
|
|
1257
1514
|
|
|
1515
|
+
/// Incrementally insert a single vector into an existing HNSW graph.
|
|
1516
|
+
/// `origin_id` must be unique within the graph and is used to map back
|
|
1517
|
+
/// to the caller's record key array.
|
|
1518
|
+
fn insert_one(&mut self, vector: &[f32], origin_id: usize) {
|
|
1519
|
+
match self {
|
|
1520
|
+
AnnHnsw::Cosine(h) => h.insert((vector, origin_id)),
|
|
1521
|
+
AnnHnsw::Euclidean(h) => h.insert((vector, origin_id)),
|
|
1522
|
+
AnnHnsw::DotProduct(h) => h.insert((vector, origin_id)),
|
|
1523
|
+
AnnHnsw::Manhattan(h) => h.insert((vector, origin_id)),
|
|
1524
|
+
}
|
|
1525
|
+
}
|
|
1526
|
+
|
|
1527
|
+
/// Bulk-insert a batch of vectors in parallel (Rayon-multithreaded).
|
|
1528
|
+
/// Significantly faster than repeated `insert_one` when the batch is
|
|
1529
|
+
/// large enough to amortise thread setup.
|
|
1530
|
+
fn parallel_insert_batch(&mut self, batch: &[(&Vec<f32>, usize)]) {
|
|
1531
|
+
match self {
|
|
1532
|
+
AnnHnsw::Cosine(h) => h.parallel_insert(batch),
|
|
1533
|
+
AnnHnsw::Euclidean(h) => h.parallel_insert(batch),
|
|
1534
|
+
AnnHnsw::DotProduct(h) => h.parallel_insert(batch),
|
|
1535
|
+
AnnHnsw::Manhattan(h) => h.parallel_insert(batch),
|
|
1536
|
+
}
|
|
1537
|
+
}
|
|
1538
|
+
|
|
1539
|
+
/// Toggle the `searching_mode` hint on the underlying HNSW. When `true`
|
|
1540
|
+
/// the graph is treated as read-only and lookups skip some bookkeeping;
|
|
1541
|
+
/// when `false` further inserts are allowed.
|
|
1542
|
+
fn set_searching_mode(&mut self, value: bool) {
|
|
1543
|
+
match self {
|
|
1544
|
+
AnnHnsw::Cosine(h) => h.set_searching_mode(value),
|
|
1545
|
+
AnnHnsw::Euclidean(h) => h.set_searching_mode(value),
|
|
1546
|
+
AnnHnsw::DotProduct(h) => h.set_searching_mode(value),
|
|
1547
|
+
AnnHnsw::Manhattan(h) => h.set_searching_mode(value),
|
|
1548
|
+
}
|
|
1549
|
+
}
|
|
1550
|
+
|
|
1258
1551
|
fn file_dump(&self, directory: &Path, basename: &str) -> Result<()> {
|
|
1259
1552
|
let result = match self {
|
|
1260
1553
|
AnnHnsw::Cosine(h) => h.file_dump(directory, basename),
|
|
@@ -1270,7 +1563,38 @@ impl AnnHnsw {
|
|
|
1270
1563
|
|
|
1271
1564
|
struct AnnIndex {
|
|
1272
1565
|
hnsw: AnnHnsw,
|
|
1566
|
+
/// `keys[i]` is the record key for HNSW origin_id `i`. Always grows; we
|
|
1567
|
+
/// never shrink it (HNSW doesn't support compacted deletion). Tombstoned
|
|
1568
|
+
/// slots stay in the vec to keep origin_id ↔ key mapping stable.
|
|
1273
1569
|
keys: Vec<RecordKey>,
|
|
1570
|
+
/// Reverse index: `key → origin_id`. Lets `delete` find a record's HNSW
|
|
1571
|
+
/// node in O(1). Built alongside `keys` on every (re)build.
|
|
1572
|
+
key_to_origin: HashMap<RecordKey, usize>,
|
|
1573
|
+
/// Origin_ids that have been logically deleted but are still part of the
|
|
1574
|
+
/// HNSW graph. Search filters these out by lookup; a `compact()` rebuilds
|
|
1575
|
+
/// the graph once the ratio exceeds `IndexConfig.tombstone_rebuild_pct`.
|
|
1576
|
+
tombstones: HashSet<usize>,
|
|
1577
|
+
}
|
|
1578
|
+
|
|
1579
|
+
impl AnnIndex {
|
|
1580
|
+
/// Number of live (non-tombstoned) records in the graph.
|
|
1581
|
+
fn live_count(&self) -> usize {
|
|
1582
|
+
self.keys.len().saturating_sub(self.tombstones.len())
|
|
1583
|
+
}
|
|
1584
|
+
|
|
1585
|
+
/// True when the fraction of dead nodes is at or above the configured
|
|
1586
|
+
/// rebuild threshold (`IndexConfig.tombstone_rebuild_pct`). Currently
|
|
1587
|
+
/// `compact_inner` rebuilds on *any* tombstones because the persisted
|
|
1588
|
+
/// manifest format only tracks live record counts — when we add a
|
|
1589
|
+
/// tombstone-aware manifest (planned), this becomes the trigger.
|
|
1590
|
+
#[allow(dead_code)]
|
|
1591
|
+
fn should_rebuild(&self, threshold_pct: u8) -> bool {
|
|
1592
|
+
if self.keys.is_empty() {
|
|
1593
|
+
return false;
|
|
1594
|
+
}
|
|
1595
|
+
let pct = (self.tombstones.len() * 100) / self.keys.len();
|
|
1596
|
+
pct >= threshold_pct as usize
|
|
1597
|
+
}
|
|
1274
1598
|
}
|
|
1275
1599
|
|
|
1276
1600
|
struct AnnManifestEntry {
|
|
@@ -1319,6 +1643,12 @@ impl Database {
|
|
|
1319
1643
|
ann_loaded_from_disk: false,
|
|
1320
1644
|
read_only: false,
|
|
1321
1645
|
_lock_file: Some(lock),
|
|
1646
|
+
wal_writer: None,
|
|
1647
|
+
wal_sync_mode: WalSyncMode::default(),
|
|
1648
|
+
wal_ops_since_sync: 0,
|
|
1649
|
+
ann_dirty: false,
|
|
1650
|
+
quantized_dirty: false,
|
|
1651
|
+
multi_vector_quantized_dirty: false,
|
|
1322
1652
|
quantized: None,
|
|
1323
1653
|
quantization_config: None,
|
|
1324
1654
|
quantized_keys: Vec::new(),
|
|
@@ -1327,6 +1657,9 @@ impl Database {
|
|
|
1327
1657
|
multi_vector_quantized_keys: BTreeMap::new(),
|
|
1328
1658
|
payload_index_defs: BTreeMap::new(),
|
|
1329
1659
|
payload_indexes: BTreeMap::new(),
|
|
1660
|
+
index_config: IndexConfig::default(),
|
|
1661
|
+
vector_arena: None,
|
|
1662
|
+
vector_arena_dirty: false,
|
|
1330
1663
|
};
|
|
1331
1664
|
|
|
1332
1665
|
database.flush()?;
|
|
@@ -1432,6 +1765,8 @@ impl Database {
|
|
|
1432
1765
|
if !self.read_only {
|
|
1433
1766
|
self.compact_inner()?;
|
|
1434
1767
|
}
|
|
1768
|
+
// Drop the cached WAL writer (also closes the underlying file handle).
|
|
1769
|
+
self.wal_writer = None;
|
|
1435
1770
|
// Release the lock by dropping the file handle
|
|
1436
1771
|
self._lock_file = None;
|
|
1437
1772
|
// Clear in-memory state
|
|
@@ -1441,6 +1776,8 @@ impl Database {
|
|
|
1441
1776
|
self.quantized = None;
|
|
1442
1777
|
self.quantization_config = None;
|
|
1443
1778
|
self.quantized_keys.clear();
|
|
1779
|
+
self.vector_arena = None;
|
|
1780
|
+
self.vector_arena_dirty = false;
|
|
1444
1781
|
self.dimension = 0;
|
|
1445
1782
|
Ok(())
|
|
1446
1783
|
}
|
|
@@ -2038,8 +2375,12 @@ impl Database {
|
|
|
2038
2375
|
self.rebuild_ann();
|
|
2039
2376
|
self.ann_loaded_from_disk = false;
|
|
2040
2377
|
self.persist_ann_to_disk()?;
|
|
2378
|
+
self.ann_dirty = false;
|
|
2379
|
+
self.vector_arena_dirty = true;
|
|
2041
2380
|
self.rebuild_quantized_index();
|
|
2381
|
+
self.quantized_dirty = false;
|
|
2042
2382
|
self.rebuild_all_multi_vector_quantized_indexes();
|
|
2383
|
+
self.multi_vector_quantized_dirty = false;
|
|
2043
2384
|
Ok(count)
|
|
2044
2385
|
}
|
|
2045
2386
|
|
|
@@ -2066,8 +2407,12 @@ impl Database {
|
|
|
2066
2407
|
self.rebuild_ann();
|
|
2067
2408
|
self.ann_loaded_from_disk = false;
|
|
2068
2409
|
self.persist_ann_to_disk()?;
|
|
2410
|
+
self.ann_dirty = false;
|
|
2411
|
+
self.vector_arena_dirty = true;
|
|
2069
2412
|
self.rebuild_quantized_index();
|
|
2413
|
+
self.quantized_dirty = false;
|
|
2070
2414
|
self.rebuild_all_multi_vector_quantized_indexes();
|
|
2415
|
+
self.multi_vector_quantized_dirty = false;
|
|
2071
2416
|
Ok(count)
|
|
2072
2417
|
}
|
|
2073
2418
|
|
|
@@ -2271,8 +2616,12 @@ impl Database {
|
|
|
2271
2616
|
self.rebuild_ann();
|
|
2272
2617
|
self.ann_loaded_from_disk = false;
|
|
2273
2618
|
self.persist_ann_to_disk()?;
|
|
2619
|
+
self.ann_dirty = false;
|
|
2620
|
+
self.vector_arena_dirty = true;
|
|
2274
2621
|
self.rebuild_quantized_index();
|
|
2622
|
+
self.quantized_dirty = false;
|
|
2275
2623
|
self.rebuild_all_multi_vector_quantized_indexes();
|
|
2624
|
+
self.multi_vector_quantized_dirty = false;
|
|
2276
2625
|
Ok(())
|
|
2277
2626
|
}
|
|
2278
2627
|
|
|
@@ -2483,15 +2832,119 @@ impl Database {
|
|
|
2483
2832
|
self.compact_inner()
|
|
2484
2833
|
}
|
|
2485
2834
|
|
|
2835
|
+
/// Configure WAL durability. See [`WalSyncMode`] for the safety / speed
|
|
2836
|
+
/// tradeoffs.
|
|
2837
|
+
///
|
|
2838
|
+
/// Switching to a more relaxed mode while there are unsync'd bytes in
|
|
2839
|
+
/// the WAL is safe — the bytes simply stay in the BufWriter / OS cache
|
|
2840
|
+
/// until the next sync point (`flush()`, `compact()`, `close()`, or the
|
|
2841
|
+
/// counter reaching `EveryN(n)`). Switching to a *stricter* mode forces
|
|
2842
|
+
/// an immediate sync so there is no surprise loss window.
|
|
2843
|
+
pub fn set_wal_sync_mode(&mut self, mode: WalSyncMode) -> Result<()> {
|
|
2844
|
+
self.check_writable()?;
|
|
2845
|
+
mode.validate()?;
|
|
2846
|
+
let previous = self.wal_sync_mode;
|
|
2847
|
+
self.wal_sync_mode = mode;
|
|
2848
|
+
// If we just tightened durability (e.g. moved from OnFlush back to
|
|
2849
|
+
// PerOp) and there are pending ops, sync immediately so the user's
|
|
2850
|
+
// mental model — "after this call any acknowledged write is durable"
|
|
2851
|
+
// — holds.
|
|
2852
|
+
let became_stricter = matches!(
|
|
2853
|
+
(previous, mode),
|
|
2854
|
+
(
|
|
2855
|
+
WalSyncMode::OnFlush,
|
|
2856
|
+
WalSyncMode::PerOp | WalSyncMode::EveryN(_)
|
|
2857
|
+
) | (WalSyncMode::EveryN(_), WalSyncMode::PerOp)
|
|
2858
|
+
);
|
|
2859
|
+
if became_stricter && self.wal_ops_since_sync > 0 {
|
|
2860
|
+
self.sync_wal()?;
|
|
2861
|
+
self.wal_ops_since_sync = 0;
|
|
2862
|
+
}
|
|
2863
|
+
Ok(())
|
|
2864
|
+
}
|
|
2865
|
+
|
|
2866
|
+
/// Return the current WAL sync mode.
|
|
2867
|
+
pub fn wal_sync_mode(&self) -> WalSyncMode {
|
|
2868
|
+
self.wal_sync_mode
|
|
2869
|
+
}
|
|
2870
|
+
|
|
2871
|
+
/// Materialise the contiguous-vector arena up front.
|
|
2872
|
+
///
|
|
2873
|
+
/// The arena mirrors the default dense vector of every record in a
|
|
2874
|
+
/// single flat `Vec<f32>` — much more cache- and SIMD-friendly than the
|
|
2875
|
+
/// default `BTreeMap<Record>` layout. It's normally built lazily on
|
|
2876
|
+
/// first use, but if you know a heavy brute-force or rescoring scan is
|
|
2877
|
+
/// coming you can pay the build cost up front by calling this. Cheap
|
|
2878
|
+
/// when already fresh.
|
|
2879
|
+
pub fn prepare_for_scan(&mut self) {
|
|
2880
|
+
let _ = self.ensure_vector_arena();
|
|
2881
|
+
}
|
|
2882
|
+
|
|
2883
|
+
/// Number of vectors in the contiguous arena, or `None` if the arena
|
|
2884
|
+
/// hasn't been materialised yet for this session. Useful for tests and
|
|
2885
|
+
/// observability.
|
|
2886
|
+
pub fn vector_arena_len(&self) -> Option<usize> {
|
|
2887
|
+
self.vector_arena.as_ref().map(VectorArena::len)
|
|
2888
|
+
}
|
|
2889
|
+
|
|
2890
|
+
/// Return (live_count, tombstoned_count) summed across every HNSW graph
|
|
2891
|
+
/// (global + per-namespace). Useful for monitoring when a `compact()`
|
|
2892
|
+
/// would benefit from rebuilding the graph(s).
|
|
2893
|
+
pub fn tombstone_stats(&self) -> (usize, usize) {
|
|
2894
|
+
let mut live = 0usize;
|
|
2895
|
+
let mut dead = 0usize;
|
|
2896
|
+
for idx in self.ann.global.values() {
|
|
2897
|
+
live += idx.live_count();
|
|
2898
|
+
dead += idx.tombstones.len();
|
|
2899
|
+
}
|
|
2900
|
+
for indexes in self.ann.namespaces.values() {
|
|
2901
|
+
for idx in indexes.values() {
|
|
2902
|
+
live += idx.live_count();
|
|
2903
|
+
dead += idx.tombstones.len();
|
|
2904
|
+
}
|
|
2905
|
+
}
|
|
2906
|
+
(live, dead)
|
|
2907
|
+
}
|
|
2908
|
+
|
|
2486
2909
|
/// Bulk-ingest many records efficiently. WAL writes happen in batches of
|
|
2487
2910
|
/// `batch_size`, but the ANN index and sparse index are only rebuilt once
|
|
2488
2911
|
/// at the very end, making this much faster than `upsert_many` for large
|
|
2489
2912
|
/// imports.
|
|
2913
|
+
///
|
|
2914
|
+
/// Performance notes:
|
|
2915
|
+
/// - The WAL is written without a per-batch `fsync` (each batch goes
|
|
2916
|
+
/// through `BufWriter` and is appended to the open file). A single
|
|
2917
|
+
/// `sync_all` is issued at the end. This avoids the per-batch fsync
|
|
2918
|
+
/// tax that dominates ingestion latency on macOS and modern SSDs.
|
|
2919
|
+
/// - The final ANN rebuild uses parallel HNSW insertion (Rayon) when
|
|
2920
|
+
/// the dataset is large enough (see
|
|
2921
|
+
/// `IndexConfig.parallel_insert_threshold`).
|
|
2490
2922
|
pub fn bulk_ingest<I>(&mut self, records: I, batch_size: usize) -> Result<usize>
|
|
2923
|
+
where
|
|
2924
|
+
I: IntoIterator<Item = Record>,
|
|
2925
|
+
{
|
|
2926
|
+
self.bulk_ingest_with_config(records, batch_size, None)
|
|
2927
|
+
}
|
|
2928
|
+
|
|
2929
|
+
/// Bulk-ingest with an override for the HNSW index configuration. The
|
|
2930
|
+
/// override is applied for the rebuild step at the end, so the resulting
|
|
2931
|
+
/// graph uses the requested `m` / `ef_construction`. The new config is
|
|
2932
|
+
/// also stored on the database (so subsequent searches use the
|
|
2933
|
+
/// corresponding `ef_search`).
|
|
2934
|
+
pub fn bulk_ingest_with_config<I>(
|
|
2935
|
+
&mut self,
|
|
2936
|
+
records: I,
|
|
2937
|
+
batch_size: usize,
|
|
2938
|
+
config: Option<IndexConfig>,
|
|
2939
|
+
) -> Result<usize>
|
|
2491
2940
|
where
|
|
2492
2941
|
I: IntoIterator<Item = Record>,
|
|
2493
2942
|
{
|
|
2494
2943
|
self.check_writable()?;
|
|
2944
|
+
if let Some(cfg) = config {
|
|
2945
|
+
cfg.validate()?;
|
|
2946
|
+
self.index_config = cfg;
|
|
2947
|
+
}
|
|
2495
2948
|
let batch_size = batch_size.max(1);
|
|
2496
2949
|
let mut total = 0_usize;
|
|
2497
2950
|
let mut batch = Vec::with_capacity(batch_size);
|
|
@@ -2502,7 +2955,8 @@ impl Database {
|
|
|
2502
2955
|
|
|
2503
2956
|
if batch.len() >= batch_size {
|
|
2504
2957
|
total += batch.len();
|
|
2505
|
-
|
|
2958
|
+
// Coalesced WAL writes: append without per-batch fsync.
|
|
2959
|
+
self.append_wal_batch_unsynced(&batch)?;
|
|
2506
2960
|
self.apply_ops_in_memory(batch);
|
|
2507
2961
|
batch = Vec::with_capacity(batch_size);
|
|
2508
2962
|
}
|
|
@@ -2510,22 +2964,71 @@ impl Database {
|
|
|
2510
2964
|
|
|
2511
2965
|
if !batch.is_empty() {
|
|
2512
2966
|
total += batch.len();
|
|
2513
|
-
self.
|
|
2967
|
+
self.append_wal_batch_unsynced(&batch)?;
|
|
2514
2968
|
self.apply_ops_in_memory(batch);
|
|
2515
2969
|
}
|
|
2516
2970
|
|
|
2517
2971
|
if total > 0 {
|
|
2972
|
+
// Single fsync at the very end to make all batches durable in
|
|
2973
|
+
// one shot. This is the major ingestion optimisation: instead
|
|
2974
|
+
// of paying fsync per batch (every `batch_size` records) we pay
|
|
2975
|
+
// it once for the whole bulk_ingest call.
|
|
2976
|
+
self.sync_wal()?;
|
|
2518
2977
|
self.rebuild_sparse_index();
|
|
2519
2978
|
self.rebuild_ann();
|
|
2520
2979
|
self.ann_loaded_from_disk = false;
|
|
2980
|
+
// Persist the freshly-built ANN so a subsequent reopen can skip
|
|
2981
|
+
// the rebuild — bulk_ingest is a "batch" operation and callers
|
|
2982
|
+
// expect index state to be on disk afterwards.
|
|
2521
2983
|
self.persist_ann_to_disk()?;
|
|
2984
|
+
self.ann_dirty = false;
|
|
2985
|
+
self.vector_arena_dirty = true;
|
|
2522
2986
|
self.rebuild_quantized_index();
|
|
2987
|
+
self.quantized_dirty = false;
|
|
2523
2988
|
self.rebuild_all_multi_vector_quantized_indexes();
|
|
2989
|
+
self.multi_vector_quantized_dirty = false;
|
|
2524
2990
|
}
|
|
2525
2991
|
|
|
2526
2992
|
Ok(total)
|
|
2527
2993
|
}
|
|
2528
2994
|
|
|
2995
|
+
/// Replace the HNSW tuning parameters and rebuild the ANN index.
|
|
2996
|
+
/// Use this to trade off recall vs latency without re-ingesting data.
|
|
2997
|
+
pub fn set_index_config(&mut self, config: IndexConfig) -> Result<()> {
|
|
2998
|
+
self.check_writable()?;
|
|
2999
|
+
config.validate()?;
|
|
3000
|
+
let changed_build_params = self.index_config.m != config.m
|
|
3001
|
+
|| self.index_config.ef_construction != config.ef_construction;
|
|
3002
|
+
self.index_config = config;
|
|
3003
|
+
if changed_build_params {
|
|
3004
|
+
// m / ef_construction affect graph structure → full rebuild.
|
|
3005
|
+
self.rebuild_ann();
|
|
3006
|
+
self.ann_loaded_from_disk = false;
|
|
3007
|
+
self.persist_ann_to_disk()?;
|
|
3008
|
+
self.ann_dirty = false;
|
|
3009
|
+
}
|
|
3010
|
+
Ok(())
|
|
3011
|
+
}
|
|
3012
|
+
|
|
3013
|
+
/// Return the current HNSW tuning parameters.
|
|
3014
|
+
pub fn index_config(&self) -> IndexConfig {
|
|
3015
|
+
self.index_config
|
|
3016
|
+
}
|
|
3017
|
+
|
|
3018
|
+
/// Convenience: update only the query-time `ef_search` without rebuilding
|
|
3019
|
+
/// the index. Higher = better recall, slower search.
|
|
3020
|
+
pub fn set_ef_search(&mut self, ef_search: Option<usize>) -> Result<()> {
|
|
3021
|
+
if let Some(ef) = ef_search {
|
|
3022
|
+
if ef == 0 {
|
|
3023
|
+
return Err(VectLiteError::InvalidFormat(
|
|
3024
|
+
"ef_search must be >= 1".to_owned(),
|
|
3025
|
+
));
|
|
3026
|
+
}
|
|
3027
|
+
}
|
|
3028
|
+
self.index_config.ef_search = ef_search;
|
|
3029
|
+
Ok(())
|
|
3030
|
+
}
|
|
3031
|
+
|
|
2529
3032
|
pub fn compact(&mut self) -> Result<()> {
|
|
2530
3033
|
self.check_writable()?;
|
|
2531
3034
|
self.compact_inner()
|
|
@@ -2548,6 +3051,7 @@ impl Database {
|
|
|
2548
3051
|
validate_quantization_config(&config, self.dimension)?;
|
|
2549
3052
|
self.quantization_config = Some(config);
|
|
2550
3053
|
self.rebuild_quantized_index();
|
|
3054
|
+
self.quantized_dirty = false;
|
|
2551
3055
|
self.persist_quantization_params()?;
|
|
2552
3056
|
Ok(())
|
|
2553
3057
|
}
|
|
@@ -2558,6 +3062,7 @@ impl Database {
|
|
|
2558
3062
|
self.quantized = None;
|
|
2559
3063
|
self.quantization_config = None;
|
|
2560
3064
|
self.quantized_keys.clear();
|
|
3065
|
+
self.quantized_dirty = false;
|
|
2561
3066
|
// Remove the sidecar file
|
|
2562
3067
|
let params_path = quantization_params_path(&self.path);
|
|
2563
3068
|
if params_path.exists() {
|
|
@@ -3239,6 +3744,54 @@ impl Database {
|
|
|
3239
3744
|
self.records.remove(key);
|
|
3240
3745
|
}
|
|
3241
3746
|
|
|
3747
|
+
// If any HNSW graph has tombstones, rebuild it before persisting.
|
|
3748
|
+
//
|
|
3749
|
+
// Two reasons:
|
|
3750
|
+
// 1. Crossing `tombstone_rebuild_pct` means search recall has
|
|
3751
|
+
// degraded enough that the user wants a clean graph.
|
|
3752
|
+
// 2. Even below the threshold, the persisted manifest's
|
|
3753
|
+
// `record_count` is derived from `self.records` (live only),
|
|
3754
|
+
// but the in-memory `keys` array includes dead slots — so a
|
|
3755
|
+
// persisted-with-tombstones graph would always fail the
|
|
3756
|
+
// record_count check on reopen and rebuild anyway. Rebuilding
|
|
3757
|
+
// *now* dumps a clean graph that survives reload.
|
|
3758
|
+
let threshold = self.index_config.tombstone_rebuild_pct;
|
|
3759
|
+
let any_tombstones = self
|
|
3760
|
+
.ann
|
|
3761
|
+
.global
|
|
3762
|
+
.values()
|
|
3763
|
+
.any(|idx| !idx.tombstones.is_empty())
|
|
3764
|
+
|| self
|
|
3765
|
+
.ann
|
|
3766
|
+
.namespaces
|
|
3767
|
+
.values()
|
|
3768
|
+
.flat_map(|m| m.values())
|
|
3769
|
+
.any(|idx| !idx.tombstones.is_empty());
|
|
3770
|
+
// (We track `threshold` even though we currently rebuild on any
|
|
3771
|
+
// tombstones, so `should_rebuild` could later replace this when we
|
|
3772
|
+
// add tombstone persistence in the manifest.)
|
|
3773
|
+
let _ = threshold;
|
|
3774
|
+
if any_tombstones {
|
|
3775
|
+
self.rebuild_ann();
|
|
3776
|
+
}
|
|
3777
|
+
|
|
3778
|
+
// Rebuild any lazy indexes that were marked dirty during the session
|
|
3779
|
+
// before we persist. This is the point where we pay back the work
|
|
3780
|
+
// we deferred from the per-insert hot path:
|
|
3781
|
+
// - the HNSW graph is already up-to-date (incremental inserts),
|
|
3782
|
+
// we just need to dump it.
|
|
3783
|
+
// - the quantized PQ index was dropped on first insert and is
|
|
3784
|
+
// rebuilt now so search can use it again next session.
|
|
3785
|
+
// - same for multi-vector PQ.
|
|
3786
|
+
if self.quantized_dirty {
|
|
3787
|
+
self.rebuild_quantized_index();
|
|
3788
|
+
self.quantized_dirty = false;
|
|
3789
|
+
}
|
|
3790
|
+
if self.multi_vector_quantized_dirty {
|
|
3791
|
+
self.rebuild_all_multi_vector_quantized_indexes();
|
|
3792
|
+
self.multi_vector_quantized_dirty = false;
|
|
3793
|
+
}
|
|
3794
|
+
|
|
3242
3795
|
if let Some(parent) = self.path.parent() {
|
|
3243
3796
|
if !parent.as_os_str().is_empty() {
|
|
3244
3797
|
fs::create_dir_all(parent)?;
|
|
@@ -3261,6 +3814,7 @@ impl Database {
|
|
|
3261
3814
|
self.clear_wal()?;
|
|
3262
3815
|
self.wal_entries_replayed = 0;
|
|
3263
3816
|
self.persist_ann_to_disk()?;
|
|
3817
|
+
self.ann_dirty = false;
|
|
3264
3818
|
|
|
3265
3819
|
Ok(())
|
|
3266
3820
|
}
|
|
@@ -3401,6 +3955,65 @@ impl Database {
|
|
|
3401
3955
|
.iter()
|
|
3402
3956
|
.all(|op| matches!(op, WalOp::UpdateMetadata { .. } | WalOp::SetTtl { .. }));
|
|
3403
3957
|
|
|
3958
|
+
// Categorise each op so we can route to the fastest correct path:
|
|
3959
|
+
// incremental insert (Upsert with new key) → ann_apply_incremental
|
|
3960
|
+
// tombstone delete (Delete of present key) → ann_apply_tombstones
|
|
3961
|
+
// anything else (upsert of existing key, etc) → full rebuild
|
|
3962
|
+
let mut incremental_eligible = !metadata_only;
|
|
3963
|
+
let mut tombstone_only = !metadata_only;
|
|
3964
|
+
for op in &ops {
|
|
3965
|
+
match op {
|
|
3966
|
+
WalOp::Upsert(record) => {
|
|
3967
|
+
let exists = self
|
|
3968
|
+
.records
|
|
3969
|
+
.contains_key(&(record.namespace.clone(), record.id.clone()));
|
|
3970
|
+
if exists {
|
|
3971
|
+
incremental_eligible = false;
|
|
3972
|
+
tombstone_only = false;
|
|
3973
|
+
} else {
|
|
3974
|
+
// New upsert — fine for incremental, but not tombstone-only.
|
|
3975
|
+
tombstone_only = false;
|
|
3976
|
+
}
|
|
3977
|
+
}
|
|
3978
|
+
WalOp::Delete { namespace, id } => {
|
|
3979
|
+
let exists = self.records.contains_key(&(namespace.clone(), id.clone()));
|
|
3980
|
+
if exists {
|
|
3981
|
+
// OK for tombstone path, but not for incremental.
|
|
3982
|
+
incremental_eligible = false;
|
|
3983
|
+
}
|
|
3984
|
+
// (A delete of a non-existent key is a no-op for both
|
|
3985
|
+
// paths, but we still let it through.)
|
|
3986
|
+
}
|
|
3987
|
+
WalOp::UpdateMetadata { .. } | WalOp::SetTtl { .. } => {
|
|
3988
|
+
incremental_eligible = false;
|
|
3989
|
+
tombstone_only = false;
|
|
3990
|
+
}
|
|
3991
|
+
}
|
|
3992
|
+
}
|
|
3993
|
+
|
|
3994
|
+
// Collect the keys we'll need to feed to the relevant updater
|
|
3995
|
+
// before we move `ops` into `apply_ops_in_memory`.
|
|
3996
|
+
let new_keys: Vec<RecordKey> = if incremental_eligible {
|
|
3997
|
+
ops.iter()
|
|
3998
|
+
.filter_map(|op| match op {
|
|
3999
|
+
WalOp::Upsert(record) => Some((record.namespace.clone(), record.id.clone())),
|
|
4000
|
+
_ => None,
|
|
4001
|
+
})
|
|
4002
|
+
.collect()
|
|
4003
|
+
} else {
|
|
4004
|
+
Vec::new()
|
|
4005
|
+
};
|
|
4006
|
+
let deleted_keys: Vec<RecordKey> = if tombstone_only {
|
|
4007
|
+
ops.iter()
|
|
4008
|
+
.filter_map(|op| match op {
|
|
4009
|
+
WalOp::Delete { namespace, id } => Some((namespace.clone(), id.clone())),
|
|
4010
|
+
_ => None,
|
|
4011
|
+
})
|
|
4012
|
+
.collect()
|
|
4013
|
+
} else {
|
|
4014
|
+
Vec::new()
|
|
4015
|
+
};
|
|
4016
|
+
|
|
3404
4017
|
self.append_wal_batch(&ops)?;
|
|
3405
4018
|
self.apply_ops_in_memory(ops);
|
|
3406
4019
|
|
|
@@ -3409,11 +4022,55 @@ impl Database {
|
|
|
3409
4022
|
if has_sparse {
|
|
3410
4023
|
self.rebuild_sparse_index();
|
|
3411
4024
|
}
|
|
3412
|
-
|
|
4025
|
+
if incremental_eligible {
|
|
4026
|
+
// Fast path: just append the new vectors into the existing
|
|
4027
|
+
// HNSW graph(s) instead of rebuilding from scratch. Converts
|
|
4028
|
+
// single-record ingestion from O(N log N) per insert to
|
|
4029
|
+
// amortised O(log N).
|
|
4030
|
+
self.ann_apply_incremental(&new_keys);
|
|
4031
|
+
// Keep the contiguous arena in sync. If it hasn't been
|
|
4032
|
+
// materialised yet, leave it alone — it'll be lazily built
|
|
4033
|
+
// on first read.
|
|
4034
|
+
if self.vector_arena.is_some() && !self.vector_arena_dirty {
|
|
4035
|
+
self.arena_apply_incremental(&new_keys);
|
|
4036
|
+
}
|
|
4037
|
+
} else if tombstone_only {
|
|
4038
|
+
// Delete-only fast path: tombstone the corresponding
|
|
4039
|
+
// `origin_id`s in each affected HNSW graph. No rebuild;
|
|
4040
|
+
// search filters out tombstoned candidates. The graph is
|
|
4041
|
+
// rebuilt automatically at the next `compact()` once the
|
|
4042
|
+
// tombstone ratio crosses `tombstone_rebuild_pct`.
|
|
4043
|
+
self.ann_apply_tombstones(&deleted_keys);
|
|
4044
|
+
// The arena can't compact in place without shifting O(N)
|
|
4045
|
+
// floats; mark dirty so it's lazily rebuilt on next scan.
|
|
4046
|
+
self.vector_arena_dirty = true;
|
|
4047
|
+
} else {
|
|
4048
|
+
// Slow path: a mixed-mode batch or an update-of-existing.
|
|
4049
|
+
// Rebuild the whole catalog.
|
|
4050
|
+
self.rebuild_ann();
|
|
4051
|
+
self.vector_arena_dirty = true;
|
|
4052
|
+
}
|
|
4053
|
+
// Defer persistence of the HNSW graph to disk: writing the graph
|
|
4054
|
+
// files is expensive (full re-dump + fsync) and is only required
|
|
4055
|
+
// for crash recovery on reopen. The WAL gives us that durability
|
|
4056
|
+
// already — on reopen, if the persisted graph is stale, it's
|
|
4057
|
+
// detected via the manifest signature check and rebuilt from
|
|
4058
|
+
// records in memory. Persistence happens at `flush` / `compact`.
|
|
3413
4059
|
self.ann_loaded_from_disk = false;
|
|
3414
|
-
self.
|
|
3415
|
-
|
|
3416
|
-
|
|
4060
|
+
self.ann_dirty = true;
|
|
4061
|
+
// Lazy-rebuild quantized indexes too. Drop the in-memory
|
|
4062
|
+
// structures so callers get correct (HNSW-fallback) results
|
|
4063
|
+
// until the next flush, where we rebuild from the new corpus.
|
|
4064
|
+
if self.quantization_config.is_some() {
|
|
4065
|
+
self.quantized = None;
|
|
4066
|
+
self.quantized_keys.clear();
|
|
4067
|
+
self.quantized_dirty = true;
|
|
4068
|
+
}
|
|
4069
|
+
if !self.multi_vector_quantization_config.is_empty() {
|
|
4070
|
+
self.multi_vector_quantized.clear();
|
|
4071
|
+
self.multi_vector_quantized_keys.clear();
|
|
4072
|
+
self.multi_vector_quantized_dirty = true;
|
|
4073
|
+
}
|
|
3417
4074
|
}
|
|
3418
4075
|
Ok(())
|
|
3419
4076
|
}
|
|
@@ -3497,32 +4154,109 @@ impl Database {
|
|
|
3497
4154
|
}
|
|
3498
4155
|
}
|
|
3499
4156
|
|
|
3500
|
-
fn append_wal_batch(&self, ops: &[WalOp]) -> Result<()> {
|
|
4157
|
+
fn append_wal_batch(&mut self, ops: &[WalOp]) -> Result<()> {
|
|
4158
|
+
// Decide whether this batch should trigger an fsync. We use the
|
|
4159
|
+
// ops count in the batch (not 1) so `EveryN` semantics scale across
|
|
4160
|
+
// both single inserts and `insert_many` calls.
|
|
4161
|
+
let n_ops = ops.len();
|
|
4162
|
+
let should_sync = match self.wal_sync_mode {
|
|
4163
|
+
WalSyncMode::PerOp => true,
|
|
4164
|
+
WalSyncMode::EveryN(n) => {
|
|
4165
|
+
self.wal_ops_since_sync = self.wal_ops_since_sync.saturating_add(n_ops);
|
|
4166
|
+
if self.wal_ops_since_sync >= n {
|
|
4167
|
+
self.wal_ops_since_sync = 0;
|
|
4168
|
+
true
|
|
4169
|
+
} else {
|
|
4170
|
+
false
|
|
4171
|
+
}
|
|
4172
|
+
}
|
|
4173
|
+
WalSyncMode::OnFlush => {
|
|
4174
|
+
self.wal_ops_since_sync = self.wal_ops_since_sync.saturating_add(n_ops);
|
|
4175
|
+
false
|
|
4176
|
+
}
|
|
4177
|
+
};
|
|
4178
|
+
self.append_wal_batch_inner(ops, should_sync)
|
|
4179
|
+
}
|
|
4180
|
+
|
|
4181
|
+
/// Append a WAL batch without issuing an fsync. The caller is responsible
|
|
4182
|
+
/// for issuing `sync_wal` later (typically once at the end of a bulk
|
|
4183
|
+
/// ingest). This is the hot path for `bulk_ingest`.
|
|
4184
|
+
fn append_wal_batch_unsynced(&mut self, ops: &[WalOp]) -> Result<()> {
|
|
4185
|
+
// Track pending ops so future `sync_wal` / `compact_inner` calls
|
|
4186
|
+
// know to flush them.
|
|
4187
|
+
self.wal_ops_since_sync = self.wal_ops_since_sync.saturating_add(ops.len());
|
|
4188
|
+
self.append_wal_batch_inner(ops, false)
|
|
4189
|
+
}
|
|
4190
|
+
|
|
4191
|
+
/// Append a WAL batch. Reuses a cached `BufWriter<File>` across calls so
|
|
4192
|
+
/// the WAL file is only opened once per database session — saving the
|
|
4193
|
+
/// `open()` syscall on every single `insert` call, which matters when
|
|
4194
|
+
/// per-record overhead is the bottleneck.
|
|
4195
|
+
fn append_wal_batch_inner(&mut self, ops: &[WalOp], sync: bool) -> Result<()> {
|
|
3501
4196
|
if let Some(parent) = self.wal_path.parent() {
|
|
3502
4197
|
if !parent.as_os_str().is_empty() {
|
|
3503
4198
|
fs::create_dir_all(parent)?;
|
|
3504
4199
|
}
|
|
3505
4200
|
}
|
|
3506
4201
|
|
|
3507
|
-
|
|
3508
|
-
|
|
3509
|
-
|
|
3510
|
-
.
|
|
3511
|
-
|
|
3512
|
-
|
|
3513
|
-
|
|
3514
|
-
|
|
4202
|
+
// Lazily create the cached BufWriter, writing the WAL_MAGIC header
|
|
4203
|
+
// on first use of a brand-new file.
|
|
4204
|
+
if self.wal_writer.is_none() {
|
|
4205
|
+
let new_file = !self.wal_path.exists();
|
|
4206
|
+
let file = OpenOptions::new()
|
|
4207
|
+
.create(true)
|
|
4208
|
+
.append(true)
|
|
4209
|
+
.open(&self.wal_path)?;
|
|
4210
|
+
let mut writer = BufWriter::with_capacity(64 * 1024, file);
|
|
4211
|
+
if new_file {
|
|
4212
|
+
writer.write_all(WAL_MAGIC)?;
|
|
4213
|
+
}
|
|
4214
|
+
self.wal_writer = Some(writer);
|
|
3515
4215
|
}
|
|
3516
4216
|
|
|
4217
|
+
// Serialise the batch into a temporary buffer first, so that the
|
|
4218
|
+
// single `write_all` we issue to the cached writer is one contiguous
|
|
4219
|
+
// user-space copy (BufWriter then bunches everything up further).
|
|
3517
4220
|
let mut buffer = Vec::new();
|
|
3518
4221
|
write_u32(&mut buffer, u32_from_usize(ops.len())?)?;
|
|
3519
4222
|
for op in ops {
|
|
3520
4223
|
write_wal_op(&mut buffer, op)?;
|
|
3521
4224
|
}
|
|
3522
4225
|
|
|
3523
|
-
|
|
3524
|
-
|
|
4226
|
+
let writer = self.wal_writer.as_mut().unwrap();
|
|
4227
|
+
write_u32(writer, u32_from_usize(buffer.len())?)?;
|
|
4228
|
+
writer.write_all(&buffer)?;
|
|
4229
|
+
|
|
4230
|
+
if sync {
|
|
4231
|
+
// Flush BufWriter into the OS, then ask the kernel to make the
|
|
4232
|
+
// bytes durable. We must `flush()` before `sync_all()` — sync_all
|
|
4233
|
+
// only operates on what's already in the kernel's page cache.
|
|
4234
|
+
writer.flush()?;
|
|
4235
|
+
writer.get_ref().sync_all()?;
|
|
4236
|
+
}
|
|
4237
|
+
Ok(())
|
|
4238
|
+
}
|
|
4239
|
+
|
|
4240
|
+
/// Force a durability fence on the WAL file. Flushes any buffered bytes
|
|
4241
|
+
/// from the cached writer and asks the kernel to make them durable in a
|
|
4242
|
+
/// single `sync_all`. Used by `bulk_ingest`, `flush`, `close`, and as a
|
|
4243
|
+
/// manual fence when running in `EveryN` or `OnFlush` mode.
|
|
4244
|
+
fn sync_wal(&mut self) -> Result<()> {
|
|
4245
|
+
if let Some(writer) = self.wal_writer.as_mut() {
|
|
4246
|
+
writer.flush()?;
|
|
4247
|
+
writer.get_ref().sync_all()?;
|
|
4248
|
+
self.wal_ops_since_sync = 0;
|
|
4249
|
+
return Ok(());
|
|
4250
|
+
}
|
|
4251
|
+
// Fallback: no cached writer (e.g. WAL was opened externally). Open
|
|
4252
|
+
// the file briefly just to issue the sync.
|
|
4253
|
+
if !self.wal_path.exists() {
|
|
4254
|
+
self.wal_ops_since_sync = 0;
|
|
4255
|
+
return Ok(());
|
|
4256
|
+
}
|
|
4257
|
+
let file = OpenOptions::new().append(true).open(&self.wal_path)?;
|
|
3525
4258
|
file.sync_all()?;
|
|
4259
|
+
self.wal_ops_since_sync = 0;
|
|
3526
4260
|
Ok(())
|
|
3527
4261
|
}
|
|
3528
4262
|
|
|
@@ -3576,7 +4310,12 @@ impl Database {
|
|
|
3576
4310
|
Ok(())
|
|
3577
4311
|
}
|
|
3578
4312
|
|
|
3579
|
-
fn clear_wal(&self) -> Result<()> {
|
|
4313
|
+
fn clear_wal(&mut self) -> Result<()> {
|
|
4314
|
+
// Drop the cached writer first: on POSIX the file would survive the
|
|
4315
|
+
// unlink because we still hold an open handle, but we'd then keep
|
|
4316
|
+
// appending into the now-detached inode and never see those bytes on
|
|
4317
|
+
// disk after reopen.
|
|
4318
|
+
self.wal_writer = None;
|
|
3580
4319
|
if self.wal_path.exists() {
|
|
3581
4320
|
fs::remove_file(&self.wal_path)?;
|
|
3582
4321
|
}
|
|
@@ -3688,6 +4427,12 @@ impl Database {
|
|
|
3688
4427
|
ann_loaded_from_disk: false,
|
|
3689
4428
|
read_only: false,
|
|
3690
4429
|
_lock_file: None,
|
|
4430
|
+
wal_writer: None,
|
|
4431
|
+
wal_sync_mode: WalSyncMode::default(),
|
|
4432
|
+
wal_ops_since_sync: 0,
|
|
4433
|
+
ann_dirty: false,
|
|
4434
|
+
quantized_dirty: false,
|
|
4435
|
+
multi_vector_quantized_dirty: false,
|
|
3691
4436
|
quantized: None,
|
|
3692
4437
|
quantization_config: None,
|
|
3693
4438
|
quantized_keys: Vec::new(),
|
|
@@ -3696,6 +4441,9 @@ impl Database {
|
|
|
3696
4441
|
multi_vector_quantized_keys: BTreeMap::new(),
|
|
3697
4442
|
payload_index_defs: BTreeMap::new(),
|
|
3698
4443
|
payload_indexes: BTreeMap::new(),
|
|
4444
|
+
index_config: IndexConfig::default(),
|
|
4445
|
+
vector_arena: None,
|
|
4446
|
+
vector_arena_dirty: false,
|
|
3699
4447
|
})
|
|
3700
4448
|
}
|
|
3701
4449
|
|
|
@@ -3833,6 +4581,237 @@ impl Database {
|
|
|
3833
4581
|
Ok(())
|
|
3834
4582
|
}
|
|
3835
4583
|
|
|
4584
|
+
/// Incremental ANN update. Appends the given new records into the
|
|
4585
|
+
/// existing HNSW graph(s) without rebuilding them from scratch.
|
|
4586
|
+
///
|
|
4587
|
+
/// Preconditions:
|
|
4588
|
+
/// - `new_keys` are keys that already live in `self.records` (caller
|
|
4589
|
+
/// must have applied the WAL ops to memory first).
|
|
4590
|
+
/// - Each key referenced by `new_keys` did NOT previously exist in
|
|
4591
|
+
/// `self.records` (i.e. it's a true insert, not an update).
|
|
4592
|
+
///
|
|
4593
|
+
/// Behaviour per (namespace, vector_name) "slot":
|
|
4594
|
+
/// - If a graph already exists, the new vectors are appended to it
|
|
4595
|
+
/// via single-element `hnsw.insert` calls (or `parallel_insert` if
|
|
4596
|
+
/// the batch is large enough to amortise thread overhead).
|
|
4597
|
+
/// - If no graph exists but the total record count for that slot has
|
|
4598
|
+
/// now crossed `ANN_MIN_POINTS`, a fresh graph is built from all
|
|
4599
|
+
/// matching records.
|
|
4600
|
+
/// - Below `ANN_MIN_POINTS`, we skip — searches will brute-force
|
|
4601
|
+
/// without harm.
|
|
4602
|
+
fn ann_apply_incremental(&mut self, new_keys: &[RecordKey]) {
|
|
4603
|
+
if new_keys.is_empty() {
|
|
4604
|
+
return;
|
|
4605
|
+
}
|
|
4606
|
+
let cfg = self.index_config;
|
|
4607
|
+
|
|
4608
|
+
// Group the new records by (Option<namespace>, vector_name). Each
|
|
4609
|
+
// upserted record contributes to exactly one global slot and one
|
|
4610
|
+
// namespace-scoped slot per dense vector it owns.
|
|
4611
|
+
let mut groups: BTreeMap<(Option<String>, String), Vec<(RecordKey, Vec<f32>)>> =
|
|
4612
|
+
BTreeMap::new();
|
|
4613
|
+
for key in new_keys {
|
|
4614
|
+
let Some(record) = self.records.get(key) else {
|
|
4615
|
+
continue;
|
|
4616
|
+
};
|
|
4617
|
+
for (vector_name, vector) in record.dense_vectors() {
|
|
4618
|
+
let item = (key.clone(), vector.clone());
|
|
4619
|
+
groups
|
|
4620
|
+
.entry((None, vector_name.to_owned()))
|
|
4621
|
+
.or_default()
|
|
4622
|
+
.push(item.clone());
|
|
4623
|
+
groups
|
|
4624
|
+
.entry((Some(record.namespace.clone()), vector_name.to_owned()))
|
|
4625
|
+
.or_default()
|
|
4626
|
+
.push(item);
|
|
4627
|
+
}
|
|
4628
|
+
}
|
|
4629
|
+
|
|
4630
|
+
// Two-phase processing to keep the borrow checker happy:
|
|
4631
|
+
// phase 1: classify each slot (needs fresh build vs incremental
|
|
4632
|
+
// append), reading `self.records` only.
|
|
4633
|
+
// phase 2: mutate `self.ann` based on the classifications.
|
|
4634
|
+
let mut fresh_builds: Vec<((Option<String>, String), Vec<(RecordKey, Vec<f32>)>)> =
|
|
4635
|
+
Vec::new();
|
|
4636
|
+
let mut incremental: Vec<((Option<String>, String), Vec<(RecordKey, Vec<f32>)>)> =
|
|
4637
|
+
Vec::new();
|
|
4638
|
+
|
|
4639
|
+
for ((opt_ns, vector_name), new_items) in groups {
|
|
4640
|
+
let has_existing = match &opt_ns {
|
|
4641
|
+
None => self.ann.global.contains_key(&vector_name),
|
|
4642
|
+
Some(ns) => self
|
|
4643
|
+
.ann
|
|
4644
|
+
.namespaces
|
|
4645
|
+
.get(ns)
|
|
4646
|
+
.map_or(false, |m| m.contains_key(&vector_name)),
|
|
4647
|
+
};
|
|
4648
|
+
|
|
4649
|
+
if has_existing {
|
|
4650
|
+
incremental.push(((opt_ns, vector_name), new_items));
|
|
4651
|
+
continue;
|
|
4652
|
+
}
|
|
4653
|
+
|
|
4654
|
+
// Count matching records (post-insert state) to decide whether
|
|
4655
|
+
// we've crossed the build threshold.
|
|
4656
|
+
let total = self
|
|
4657
|
+
.records
|
|
4658
|
+
.iter()
|
|
4659
|
+
.filter(|(_, r)| match &opt_ns {
|
|
4660
|
+
Some(ns) => r.namespace == *ns,
|
|
4661
|
+
None => true,
|
|
4662
|
+
})
|
|
4663
|
+
.filter(|(_, r)| {
|
|
4664
|
+
r.dense_vectors()
|
|
4665
|
+
.any(|(name, _)| name == vector_name.as_str())
|
|
4666
|
+
})
|
|
4667
|
+
.count();
|
|
4668
|
+
|
|
4669
|
+
if total < ANN_MIN_POINTS {
|
|
4670
|
+
continue;
|
|
4671
|
+
}
|
|
4672
|
+
|
|
4673
|
+
// Need to build a fresh graph for this slot. Collect ALL matching
|
|
4674
|
+
// records (not just the new ones) — owned clones so the build
|
|
4675
|
+
// step doesn't borrow `self.records`.
|
|
4676
|
+
let mut all_items: Vec<(RecordKey, Vec<f32>)> = Vec::with_capacity(total);
|
|
4677
|
+
for (k, r) in &self.records {
|
|
4678
|
+
if let Some(ns) = &opt_ns {
|
|
4679
|
+
if r.namespace != *ns {
|
|
4680
|
+
continue;
|
|
4681
|
+
}
|
|
4682
|
+
}
|
|
4683
|
+
for (name, vec) in r.dense_vectors() {
|
|
4684
|
+
if name == vector_name.as_str() {
|
|
4685
|
+
all_items.push((k.clone(), vec.clone()));
|
|
4686
|
+
break;
|
|
4687
|
+
}
|
|
4688
|
+
}
|
|
4689
|
+
}
|
|
4690
|
+
let _ = new_items; // already folded into `all_items`
|
|
4691
|
+
fresh_builds.push(((opt_ns, vector_name), all_items));
|
|
4692
|
+
}
|
|
4693
|
+
|
|
4694
|
+
// Phase 2a: build-from-scratch for slots that just crossed the
|
|
4695
|
+
// threshold.
|
|
4696
|
+
for ((opt_ns, vector_name), all_items) in fresh_builds {
|
|
4697
|
+
let records_for_build: Vec<(RecordKey, &Vec<f32>)> =
|
|
4698
|
+
all_items.iter().map(|(k, v)| (k.clone(), v)).collect();
|
|
4699
|
+
let new_index = build_ann_index(records_for_build, self.metric, &cfg);
|
|
4700
|
+
match opt_ns {
|
|
4701
|
+
None => {
|
|
4702
|
+
self.ann.global.insert(vector_name, new_index);
|
|
4703
|
+
}
|
|
4704
|
+
Some(ns) => {
|
|
4705
|
+
self.ann
|
|
4706
|
+
.namespaces
|
|
4707
|
+
.entry(ns)
|
|
4708
|
+
.or_default()
|
|
4709
|
+
.insert(vector_name, new_index);
|
|
4710
|
+
}
|
|
4711
|
+
}
|
|
4712
|
+
}
|
|
4713
|
+
|
|
4714
|
+
// Phase 2b: incremental appends into existing graphs.
|
|
4715
|
+
for ((opt_ns, vector_name), new_items) in incremental {
|
|
4716
|
+
let idx_opt = match &opt_ns {
|
|
4717
|
+
None => self.ann.global.get_mut(&vector_name),
|
|
4718
|
+
Some(ns) => self
|
|
4719
|
+
.ann
|
|
4720
|
+
.namespaces
|
|
4721
|
+
.get_mut(ns)
|
|
4722
|
+
.and_then(|m| m.get_mut(&vector_name)),
|
|
4723
|
+
};
|
|
4724
|
+
let Some(idx) = idx_opt else {
|
|
4725
|
+
continue;
|
|
4726
|
+
};
|
|
4727
|
+
|
|
4728
|
+
// hnsw_rs marks indexes that have been searched as "searching
|
|
4729
|
+
// mode" (a hint that skips some bookkeeping in the data layer).
|
|
4730
|
+
// Re-enable mutation mode before we insert — cheap toggle.
|
|
4731
|
+
idx.hnsw.set_searching_mode(false);
|
|
4732
|
+
|
|
4733
|
+
if new_items.len() >= cfg.parallel_insert_threshold {
|
|
4734
|
+
let start_id = idx.keys.len();
|
|
4735
|
+
let batch: Vec<(&Vec<f32>, usize)> = new_items
|
|
4736
|
+
.iter()
|
|
4737
|
+
.enumerate()
|
|
4738
|
+
.map(|(offset, (_, v))| (v, start_id + offset))
|
|
4739
|
+
.collect();
|
|
4740
|
+
idx.hnsw.parallel_insert_batch(&batch);
|
|
4741
|
+
for (offset, (k, _)) in new_items.into_iter().enumerate() {
|
|
4742
|
+
let origin_id = start_id + offset;
|
|
4743
|
+
idx.key_to_origin.insert(k.clone(), origin_id);
|
|
4744
|
+
idx.keys.push(k);
|
|
4745
|
+
}
|
|
4746
|
+
} else {
|
|
4747
|
+
for (key, vector) in new_items {
|
|
4748
|
+
let origin_id = idx.keys.len();
|
|
4749
|
+
idx.key_to_origin.insert(key.clone(), origin_id);
|
|
4750
|
+
idx.keys.push(key);
|
|
4751
|
+
idx.hnsw.insert_one(vector.as_slice(), origin_id);
|
|
4752
|
+
}
|
|
4753
|
+
}
|
|
4754
|
+
}
|
|
4755
|
+
}
|
|
4756
|
+
|
|
4757
|
+
/// Append newly-inserted vectors to the contiguous arena. Caller must
|
|
4758
|
+
/// have already inserted the records into `self.records` and confirmed
|
|
4759
|
+
/// the arena exists and isn't dirty.
|
|
4760
|
+
fn arena_apply_incremental(&mut self, new_keys: &[RecordKey]) {
|
|
4761
|
+
let Some(arena) = self.vector_arena.as_mut() else {
|
|
4762
|
+
return;
|
|
4763
|
+
};
|
|
4764
|
+
for key in new_keys {
|
|
4765
|
+
if let Some(record) = self.records.get(key) {
|
|
4766
|
+
arena.append(key.clone(), &record.vector);
|
|
4767
|
+
}
|
|
4768
|
+
}
|
|
4769
|
+
}
|
|
4770
|
+
|
|
4771
|
+
/// Ensure the contiguous arena is materialised and fresh. Cheap when
|
|
4772
|
+
/// already clean; rebuilds from `self.records` (in BTreeMap order) on
|
|
4773
|
+
/// first call or after a delete. Allocates `dim * N` f32s.
|
|
4774
|
+
fn ensure_vector_arena(&mut self) -> &VectorArena {
|
|
4775
|
+
let needs_build = self
|
|
4776
|
+
.vector_arena
|
|
4777
|
+
.as_ref()
|
|
4778
|
+
.map_or(true, |a| self.vector_arena_dirty || a.dim != self.dimension);
|
|
4779
|
+
if needs_build {
|
|
4780
|
+
self.vector_arena = Some(VectorArena::rebuild_from(&self.records, self.dimension));
|
|
4781
|
+
self.vector_arena_dirty = false;
|
|
4782
|
+
}
|
|
4783
|
+
self.vector_arena.as_ref().unwrap()
|
|
4784
|
+
}
|
|
4785
|
+
|
|
4786
|
+
/// Mark the given record keys as deleted in every HNSW graph they live
|
|
4787
|
+
/// in. The graph itself is not modified — search filters tombstoned
|
|
4788
|
+
/// `origin_id`s. A subsequent `compact()` will rebuild any graph whose
|
|
4789
|
+
/// dead ratio exceeds `IndexConfig.tombstone_rebuild_pct`.
|
|
4790
|
+
fn ann_apply_tombstones(&mut self, deleted_keys: &[RecordKey]) {
|
|
4791
|
+
if deleted_keys.is_empty() {
|
|
4792
|
+
return;
|
|
4793
|
+
}
|
|
4794
|
+
for key in deleted_keys {
|
|
4795
|
+
// Global graphs (per vector_name): every graph that contains
|
|
4796
|
+
// this key gets the corresponding origin_id tombstoned.
|
|
4797
|
+
for (_, idx) in self.ann.global.iter_mut() {
|
|
4798
|
+
if let Some(&origin_id) = idx.key_to_origin.get(key) {
|
|
4799
|
+
idx.tombstones.insert(origin_id);
|
|
4800
|
+
}
|
|
4801
|
+
}
|
|
4802
|
+
// Per-namespace graphs: only the namespace this key belongs to
|
|
4803
|
+
// has a chance of containing it, but checking all of them is
|
|
4804
|
+
// fine — `key_to_origin.get` is O(1) and misses immediately.
|
|
4805
|
+
for (_, indexes) in self.ann.namespaces.iter_mut() {
|
|
4806
|
+
for (_, idx) in indexes.iter_mut() {
|
|
4807
|
+
if let Some(&origin_id) = idx.key_to_origin.get(key) {
|
|
4808
|
+
idx.tombstones.insert(origin_id);
|
|
4809
|
+
}
|
|
4810
|
+
}
|
|
4811
|
+
}
|
|
4812
|
+
}
|
|
4813
|
+
}
|
|
4814
|
+
|
|
3836
4815
|
fn rebuild_ann(&mut self) {
|
|
3837
4816
|
self.ann = AnnCatalog::default();
|
|
3838
4817
|
let mut global_by_vector: BTreeMap<String, Vec<(RecordKey, &Vec<f32>)>> = BTreeMap::new();
|
|
@@ -3854,13 +4833,14 @@ impl Database {
|
|
|
3854
4833
|
}
|
|
3855
4834
|
}
|
|
3856
4835
|
|
|
4836
|
+
let cfg = self.index_config;
|
|
3857
4837
|
self.ann.global = global_by_vector
|
|
3858
4838
|
.into_iter()
|
|
3859
4839
|
.filter_map(|(vector_name, records)| {
|
|
3860
4840
|
if records.len() < ANN_MIN_POINTS {
|
|
3861
4841
|
None
|
|
3862
4842
|
} else {
|
|
3863
|
-
Some((vector_name, build_ann_index(records, self.metric)))
|
|
4843
|
+
Some((vector_name, build_ann_index(records, self.metric, &cfg)))
|
|
3864
4844
|
}
|
|
3865
4845
|
})
|
|
3866
4846
|
.collect();
|
|
@@ -3874,7 +4854,7 @@ impl Database {
|
|
|
3874
4854
|
if records.len() < ANN_MIN_POINTS {
|
|
3875
4855
|
None
|
|
3876
4856
|
} else {
|
|
3877
|
-
Some((vector_name, build_ann_index(records, self.metric)))
|
|
4857
|
+
Some((vector_name, build_ann_index(records, self.metric, &cfg)))
|
|
3878
4858
|
}
|
|
3879
4859
|
})
|
|
3880
4860
|
.collect::<BTreeMap<_, _>>();
|
|
@@ -3919,6 +4899,24 @@ impl Database {
|
|
|
3919
4899
|
return false;
|
|
3920
4900
|
}
|
|
3921
4901
|
|
|
4902
|
+
// For ANN2 manifests, use the persisted keys verbatim — they
|
|
4903
|
+
// match the `origin_id`s baked into the HNSW graph file. For
|
|
4904
|
+
// ANN1 (no persisted keys), fall back to the recomputed
|
|
4905
|
+
// BTreeMap-ordered list, which matches the way ANN1 graphs were
|
|
4906
|
+
// always built.
|
|
4907
|
+
let keys = if manifest_entry.keys.is_empty() {
|
|
4908
|
+
expected_entry.keys.clone()
|
|
4909
|
+
} else {
|
|
4910
|
+
// Defensive: persisted keys length must agree with the
|
|
4911
|
+
// declared record_count and the live record set, else the
|
|
4912
|
+
// manifest is inconsistent and we'd rather rebuild than
|
|
4913
|
+
// serve wrong neighbours.
|
|
4914
|
+
if manifest_entry.keys.len() != manifest_entry.record_count {
|
|
4915
|
+
return false;
|
|
4916
|
+
}
|
|
4917
|
+
manifest_entry.keys.clone()
|
|
4918
|
+
};
|
|
4919
|
+
|
|
3922
4920
|
let Some(index) = load_ann_index(
|
|
3923
4921
|
parent,
|
|
3924
4922
|
&ann_basename(
|
|
@@ -3926,7 +4924,7 @@ impl Database {
|
|
|
3926
4924
|
expected_entry.namespace.as_deref(),
|
|
3927
4925
|
&expected_entry.vector_name,
|
|
3928
4926
|
),
|
|
3929
|
-
|
|
4927
|
+
keys,
|
|
3930
4928
|
self.metric,
|
|
3931
4929
|
) else {
|
|
3932
4930
|
return false;
|
|
@@ -3957,7 +4955,11 @@ impl Database {
|
|
|
3957
4955
|
return Ok(());
|
|
3958
4956
|
}
|
|
3959
4957
|
|
|
3960
|
-
|
|
4958
|
+
// Use `actual_ann_entries` (NOT `expected_ann_entries`) so the
|
|
4959
|
+
// persisted keys array matches the order the HNSW graph stored its
|
|
4960
|
+
// `origin_id`s in. After incremental inserts the in-memory keys vec
|
|
4961
|
+
// is in insertion order, which usually differs from BTreeMap order.
|
|
4962
|
+
let entries = self.actual_ann_entries();
|
|
3961
4963
|
for entry in &entries {
|
|
3962
4964
|
let basename = ann_basename(&self.path, entry.namespace.as_deref(), &entry.vector_name);
|
|
3963
4965
|
let graph_path = parent.join(format!("{basename}.hnsw.graph"));
|
|
@@ -3985,6 +4987,41 @@ impl Database {
|
|
|
3985
4987
|
write_ann_manifest(&ann_manifest_path(&self.path), &entries)
|
|
3986
4988
|
}
|
|
3987
4989
|
|
|
4990
|
+
/// Like `expected_ann_entries`, but populates each entry's `keys` field
|
|
4991
|
+
/// from the actual in-memory `AnnIndex.keys` array (insertion order).
|
|
4992
|
+
/// This is what gets serialised into the ANN2 manifest, and matches the
|
|
4993
|
+
/// `origin_id`s baked into the dumped HNSW graph files.
|
|
4994
|
+
fn actual_ann_entries(&self) -> Vec<AnnManifestEntry> {
|
|
4995
|
+
let mut entries = Vec::new();
|
|
4996
|
+
for (vector_name, index) in &self.ann.global {
|
|
4997
|
+
if index.keys.len() < ANN_MIN_POINTS {
|
|
4998
|
+
continue;
|
|
4999
|
+
}
|
|
5000
|
+
entries.push(AnnManifestEntry {
|
|
5001
|
+
namespace: None,
|
|
5002
|
+
vector_name: vector_name.clone(),
|
|
5003
|
+
record_count: index.keys.len(),
|
|
5004
|
+
key_signature: record_key_signature(&index.keys),
|
|
5005
|
+
keys: index.keys.clone(),
|
|
5006
|
+
});
|
|
5007
|
+
}
|
|
5008
|
+
for (namespace, indexes) in &self.ann.namespaces {
|
|
5009
|
+
for (vector_name, index) in indexes {
|
|
5010
|
+
if index.keys.len() < ANN_MIN_POINTS {
|
|
5011
|
+
continue;
|
|
5012
|
+
}
|
|
5013
|
+
entries.push(AnnManifestEntry {
|
|
5014
|
+
namespace: Some(namespace.clone()),
|
|
5015
|
+
vector_name: vector_name.clone(),
|
|
5016
|
+
record_count: index.keys.len(),
|
|
5017
|
+
key_signature: record_key_signature(&index.keys),
|
|
5018
|
+
keys: index.keys.clone(),
|
|
5019
|
+
});
|
|
5020
|
+
}
|
|
5021
|
+
}
|
|
5022
|
+
entries
|
|
5023
|
+
}
|
|
5024
|
+
|
|
3988
5025
|
fn expected_ann_entries(&self) -> Vec<AnnManifestEntry> {
|
|
3989
5026
|
let mut global: BTreeMap<String, Vec<RecordKey>> = BTreeMap::new();
|
|
3990
5027
|
let mut by_namespace: BTreeMap<String, BTreeMap<String, Vec<RecordKey>>> = BTreeMap::new();
|
|
@@ -4196,21 +5233,45 @@ impl Database {
|
|
|
4196
5233
|
.global
|
|
4197
5234
|
.get(vector_name.unwrap_or(DEFAULT_VECTOR_NAME)),
|
|
4198
5235
|
}?;
|
|
4199
|
-
|
|
5236
|
+
// Gate on live (non-tombstoned) record count: if half the graph is
|
|
5237
|
+
// dead, treat the live half as if it were the whole corpus.
|
|
5238
|
+
let live = index.live_count();
|
|
5239
|
+
if live < ANN_SEARCH_MIN_POINTS {
|
|
4200
5240
|
return None;
|
|
4201
5241
|
}
|
|
4202
5242
|
|
|
4203
|
-
let candidate_count = candidate_count(top_k,
|
|
5243
|
+
let candidate_count = candidate_count(top_k, live);
|
|
4204
5244
|
if candidate_count == 0 {
|
|
4205
5245
|
return None;
|
|
4206
5246
|
}
|
|
4207
5247
|
|
|
4208
|
-
|
|
4209
|
-
|
|
5248
|
+
// ef_search controls recall vs latency at query time. When the user
|
|
5249
|
+
// explicitly sets `IndexConfig.ef_search`, honour it directly.
|
|
5250
|
+
// Otherwise default to max(candidate_count, ef_construction) which is
|
|
5251
|
+
// a conservative high-recall heuristic.
|
|
5252
|
+
let mut ef_search = match self.index_config.ef_search {
|
|
5253
|
+
Some(ef) => ef.max(candidate_count),
|
|
5254
|
+
None => candidate_count.max(self.index_config.ef_construction),
|
|
5255
|
+
};
|
|
5256
|
+
// Over-fetch to compensate for tombstoned candidates we'll drop. Cap
|
|
5257
|
+
// at the live count so we don't waste work; we'd never get more
|
|
5258
|
+
// distinct results than that anyway.
|
|
5259
|
+
if !index.tombstones.is_empty() {
|
|
5260
|
+
let dead = index.tombstones.len();
|
|
5261
|
+
ef_search = ef_search
|
|
5262
|
+
.saturating_add(dead.min(ef_search))
|
|
5263
|
+
.min(index.keys.len());
|
|
5264
|
+
}
|
|
5265
|
+
let fetch_count = candidate_count
|
|
5266
|
+
.saturating_add(index.tombstones.len().min(candidate_count))
|
|
5267
|
+
.min(index.keys.len());
|
|
5268
|
+
let neighbours = index.hnsw.search(query, fetch_count, ef_search);
|
|
4210
5269
|
Some(
|
|
4211
5270
|
neighbours
|
|
4212
5271
|
.into_iter()
|
|
5272
|
+
.filter(|n| !index.tombstones.contains(&n.d_id))
|
|
4213
5273
|
.filter_map(|neighbour| index.keys.get(neighbour.d_id).cloned())
|
|
5274
|
+
.take(candidate_count)
|
|
4214
5275
|
.collect(),
|
|
4215
5276
|
)
|
|
4216
5277
|
}
|
|
@@ -4475,28 +5536,51 @@ fn score_dense_prefix(
|
|
|
4475
5536
|
metric.score(&left[..dimension], &right[..dimension])
|
|
4476
5537
|
}
|
|
4477
5538
|
|
|
4478
|
-
fn build_ann_index(
|
|
5539
|
+
fn build_ann_index(
|
|
5540
|
+
records: Vec<(RecordKey, &Vec<f32>)>,
|
|
5541
|
+
metric: DistanceMetric,
|
|
5542
|
+
config: &IndexConfig,
|
|
5543
|
+
) -> AnnIndex {
|
|
4479
5544
|
let max_layer = compute_hnsw_layers(records.len());
|
|
4480
5545
|
let count = records.len();
|
|
5546
|
+
let use_parallel = count >= config.parallel_insert_threshold;
|
|
4481
5547
|
|
|
4482
5548
|
macro_rules! build_hnsw {
|
|
4483
5549
|
($dist_type:ty, $dist_val:expr, $variant:ident) => {{
|
|
4484
5550
|
let mut hnsw = Hnsw::<f32, $dist_type>::new(
|
|
4485
|
-
|
|
4486
|
-
count,
|
|
5551
|
+
config.m,
|
|
5552
|
+
count.max(1),
|
|
4487
5553
|
max_layer,
|
|
4488
|
-
|
|
5554
|
+
config.ef_construction,
|
|
4489
5555
|
$dist_val,
|
|
4490
5556
|
);
|
|
4491
5557
|
let mut keys = Vec::with_capacity(count);
|
|
4492
|
-
|
|
4493
|
-
|
|
4494
|
-
|
|
5558
|
+
let mut key_to_origin = HashMap::with_capacity(count);
|
|
5559
|
+
if use_parallel {
|
|
5560
|
+
// hnsw_rs's `parallel_insert` takes `&[(&Vec<T>, usize)]`
|
|
5561
|
+
// (the API is built around owned-Vec borrows) and uses Rayon
|
|
5562
|
+
// internally so the dominant cost (distance calculations
|
|
5563
|
+
// during graph neighbour selection) is multi-threaded.
|
|
5564
|
+
let mut batch: Vec<(&Vec<f32>, usize)> = Vec::with_capacity(count);
|
|
5565
|
+
for (origin_id, (key, vector)) in records.into_iter().enumerate() {
|
|
5566
|
+
batch.push((vector, origin_id));
|
|
5567
|
+
key_to_origin.insert(key.clone(), origin_id);
|
|
5568
|
+
keys.push(key);
|
|
5569
|
+
}
|
|
5570
|
+
hnsw.parallel_insert(&batch);
|
|
5571
|
+
} else {
|
|
5572
|
+
for (origin_id, (key, vector)) in records.into_iter().enumerate() {
|
|
5573
|
+
hnsw.insert((vector.as_slice(), origin_id));
|
|
5574
|
+
key_to_origin.insert(key.clone(), origin_id);
|
|
5575
|
+
keys.push(key);
|
|
5576
|
+
}
|
|
4495
5577
|
}
|
|
4496
5578
|
hnsw.set_searching_mode(true);
|
|
4497
5579
|
AnnIndex {
|
|
4498
5580
|
hnsw: AnnHnsw::$variant(hnsw),
|
|
4499
5581
|
keys,
|
|
5582
|
+
key_to_origin,
|
|
5583
|
+
tombstones: HashSet::new(),
|
|
4500
5584
|
}
|
|
4501
5585
|
}};
|
|
4502
5586
|
}
|
|
@@ -4691,9 +5775,21 @@ fn hex_encode(bytes: &[u8]) -> String {
|
|
|
4691
5775
|
out
|
|
4692
5776
|
}
|
|
4693
5777
|
|
|
5778
|
+
/// Order-independent FNV-1a hash over a set of record keys. We sort first so
|
|
5779
|
+
/// the signature only depends on the SET of keys, not the order they were
|
|
5780
|
+
/// inserted. Callers can use this to check whether a persisted ANN graph
|
|
5781
|
+
/// matches the live record set regardless of whether the live `keys` vec is
|
|
5782
|
+
/// BTreeMap-ordered (full rebuild) or insertion-ordered (incremental
|
|
5783
|
+
/// updates).
|
|
5784
|
+
///
|
|
5785
|
+
/// Historical note: previously the input was always BTreeMap-iterated and
|
|
5786
|
+
/// therefore already sorted, so the sort step is a no-op for old ANN1
|
|
5787
|
+
/// manifests — backwards compatible.
|
|
4694
5788
|
fn record_key_signature(keys: &[RecordKey]) -> u64 {
|
|
5789
|
+
let mut sorted: Vec<&RecordKey> = keys.iter().collect();
|
|
5790
|
+
sorted.sort();
|
|
4695
5791
|
let mut state = 0xcbf29ce484222325_u64;
|
|
4696
|
-
for (namespace, id) in
|
|
5792
|
+
for (namespace, id) in sorted {
|
|
4697
5793
|
for byte in namespace
|
|
4698
5794
|
.as_bytes()
|
|
4699
5795
|
.iter()
|
|
@@ -4720,9 +5816,16 @@ fn load_ann_index(
|
|
|
4720
5816
|
($dist_val:expr, $variant:ident) => {{
|
|
4721
5817
|
let mut hnsw = reloader.load_hnsw_with_dist($dist_val).ok()?;
|
|
4722
5818
|
hnsw.set_searching_mode(true);
|
|
5819
|
+
let key_to_origin = keys
|
|
5820
|
+
.iter()
|
|
5821
|
+
.enumerate()
|
|
5822
|
+
.map(|(i, k)| (k.clone(), i))
|
|
5823
|
+
.collect();
|
|
4723
5824
|
Some(AnnIndex {
|
|
4724
5825
|
hnsw: AnnHnsw::$variant(hnsw),
|
|
4725
5826
|
keys,
|
|
5827
|
+
key_to_origin,
|
|
5828
|
+
tombstones: HashSet::new(),
|
|
4726
5829
|
})
|
|
4727
5830
|
}};
|
|
4728
5831
|
}
|
|
@@ -4735,9 +5838,16 @@ fn load_ann_index(
|
|
|
4735
5838
|
}
|
|
4736
5839
|
}
|
|
4737
5840
|
|
|
5841
|
+
/// Write the ANN sidecar manifest. We use format `ANN2`, which (compared to
|
|
5842
|
+
/// the original `ANN1`) also serialises the actual key array per index in
|
|
5843
|
+
/// the order the HNSW knows its `origin_id`s. This is required for
|
|
5844
|
+
/// incremental insertion: without it, a reload would associate the wrong
|
|
5845
|
+
/// (BTreeMap-ordered) record key with each HNSW origin_id whenever the
|
|
5846
|
+
/// in-memory key array isn't sorted (which happens any time we incrementally
|
|
5847
|
+
/// append).
|
|
4738
5848
|
fn write_ann_manifest(path: &Path, entries: &[AnnManifestEntry]) -> Result<()> {
|
|
4739
|
-
let mut file = File::create(path)
|
|
4740
|
-
file.write_all(b"
|
|
5849
|
+
let mut file = BufWriter::new(File::create(path)?);
|
|
5850
|
+
file.write_all(b"ANN2")?;
|
|
4741
5851
|
write_u32(&mut file, u32_from_usize(entries.len())?)?;
|
|
4742
5852
|
for entry in entries {
|
|
4743
5853
|
write_u8(&mut file, u8::from(entry.namespace.is_some()))?;
|
|
@@ -4747,8 +5857,15 @@ fn write_ann_manifest(path: &Path, entries: &[AnnManifestEntry]) -> Result<()> {
|
|
|
4747
5857
|
write_string(&mut file, &entry.vector_name)?;
|
|
4748
5858
|
write_u64(&mut file, u64_from_usize(entry.record_count)?)?;
|
|
4749
5859
|
write_u64(&mut file, entry.key_signature)?;
|
|
5860
|
+
// ANN2 addition: the full keys array in insertion order.
|
|
5861
|
+
write_u64(&mut file, u64_from_usize(entry.keys.len())?)?;
|
|
5862
|
+
for (ns, id) in &entry.keys {
|
|
5863
|
+
write_string(&mut file, ns)?;
|
|
5864
|
+
write_string(&mut file, id)?;
|
|
5865
|
+
}
|
|
4750
5866
|
}
|
|
4751
|
-
file.
|
|
5867
|
+
file.flush()?;
|
|
5868
|
+
file.get_ref().sync_all()?;
|
|
4752
5869
|
Ok(())
|
|
4753
5870
|
}
|
|
4754
5871
|
|
|
@@ -4756,11 +5873,15 @@ fn read_ann_manifest(path: &Path) -> Result<Vec<AnnManifestEntry>> {
|
|
|
4756
5873
|
let mut file = BufReader::new(File::open(path)?);
|
|
4757
5874
|
let mut magic = [0_u8; 4];
|
|
4758
5875
|
file.read_exact(&mut magic)?;
|
|
4759
|
-
|
|
4760
|
-
|
|
4761
|
-
|
|
4762
|
-
|
|
4763
|
-
|
|
5876
|
+
let version = match &magic {
|
|
5877
|
+
b"ANN1" => 1u8,
|
|
5878
|
+
b"ANN2" => 2u8,
|
|
5879
|
+
_ => {
|
|
5880
|
+
return Err(VectLiteError::InvalidFormat(
|
|
5881
|
+
"invalid ANN manifest".to_owned(),
|
|
5882
|
+
));
|
|
5883
|
+
}
|
|
5884
|
+
};
|
|
4764
5885
|
|
|
4765
5886
|
let count = usize_from_u32(read_u32(&mut file)?)?;
|
|
4766
5887
|
let mut entries = Vec::with_capacity(count);
|
|
@@ -4774,12 +5895,27 @@ fn read_ann_manifest(path: &Path) -> Result<Vec<AnnManifestEntry>> {
|
|
|
4774
5895
|
let vector_name = read_string(&mut file)?;
|
|
4775
5896
|
let record_count = usize_from_u64(read_u64(&mut file)?)?;
|
|
4776
5897
|
let key_signature = read_u64(&mut file)?;
|
|
5898
|
+
let keys = if version >= 2 {
|
|
5899
|
+
let n = usize_from_u64(read_u64(&mut file)?)?;
|
|
5900
|
+
let mut keys = Vec::with_capacity(n);
|
|
5901
|
+
for _ in 0..n {
|
|
5902
|
+
let ns = read_string(&mut file)?;
|
|
5903
|
+
let id = read_string(&mut file)?;
|
|
5904
|
+
keys.push((ns, id));
|
|
5905
|
+
}
|
|
5906
|
+
keys
|
|
5907
|
+
} else {
|
|
5908
|
+
// ANN1 had no persisted keys; caller falls back to recomputing
|
|
5909
|
+
// them from `self.records` (which yields BTreeMap-sorted keys,
|
|
5910
|
+
// matching the order ANN1 indexes were always built in).
|
|
5911
|
+
Vec::new()
|
|
5912
|
+
};
|
|
4777
5913
|
entries.push(AnnManifestEntry {
|
|
4778
5914
|
namespace,
|
|
4779
5915
|
vector_name,
|
|
4780
5916
|
record_count,
|
|
4781
5917
|
key_signature,
|
|
4782
|
-
keys
|
|
5918
|
+
keys,
|
|
4783
5919
|
});
|
|
4784
5920
|
}
|
|
4785
5921
|
Ok(entries)
|