vectlite 0.10.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -3
- package/index.d.ts +47 -0
- package/index.js +53 -0
- package/native/Cargo.toml +1 -1
- package/native/src/lib.rs +106 -7
- package/native/vectlite-core/Cargo.toml +1 -1
- package/native/vectlite-core/src/lib.rs +962 -41
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/vectlite.node +0 -0
- package/prebuilds/darwin-x64/vectlite.node +0 -0
- package/prebuilds/linux-x64-gnu/vectlite.node +0 -0
- package/prebuilds/win32-x64-msvc/vectlite.node +0 -0
|
@@ -782,6 +782,14 @@ pub struct IndexConfig {
|
|
|
782
782
|
/// least this many vectors. Defaults to `ANN_PARALLEL_INSERT_THRESHOLD`.
|
|
783
783
|
/// Set very high to disable parallel insert.
|
|
784
784
|
pub parallel_insert_threshold: usize,
|
|
785
|
+
/// Percentage (0..=100) of tombstoned nodes at which the HNSW graph is
|
|
786
|
+
/// rebuilt during `compact()`. A `delete` doesn't physically remove a
|
|
787
|
+
/// node from HNSW (that operation is not supported by the library); the
|
|
788
|
+
/// node is just marked dead and filtered out at search time. Once enough
|
|
789
|
+
/// nodes are dead, search recall and latency degrade, so we rebuild.
|
|
790
|
+
/// Default `30` (rebuild when ≥30% of the graph is dead). Set to `100`
|
|
791
|
+
/// to disable automatic rebuild.
|
|
792
|
+
pub tombstone_rebuild_pct: u8,
|
|
785
793
|
}
|
|
786
794
|
|
|
787
795
|
impl Default for IndexConfig {
|
|
@@ -791,6 +799,7 @@ impl Default for IndexConfig {
|
|
|
791
799
|
ef_construction: ANN_EF_CONSTRUCTION,
|
|
792
800
|
ef_search: None,
|
|
793
801
|
parallel_insert_threshold: ANN_PARALLEL_INSERT_THRESHOLD,
|
|
802
|
+
tombstone_rebuild_pct: 30,
|
|
794
803
|
}
|
|
795
804
|
}
|
|
796
805
|
}
|
|
@@ -804,6 +813,7 @@ impl IndexConfig {
|
|
|
804
813
|
ef_construction: 400,
|
|
805
814
|
ef_search: Some(200),
|
|
806
815
|
parallel_insert_threshold: ANN_PARALLEL_INSERT_THRESHOLD,
|
|
816
|
+
tombstone_rebuild_pct: 30,
|
|
807
817
|
}
|
|
808
818
|
}
|
|
809
819
|
|
|
@@ -814,6 +824,7 @@ impl IndexConfig {
|
|
|
814
824
|
ef_construction: 100,
|
|
815
825
|
ef_search: Some(40),
|
|
816
826
|
parallel_insert_threshold: ANN_PARALLEL_INSERT_THRESHOLD,
|
|
827
|
+
tombstone_rebuild_pct: 30,
|
|
817
828
|
}
|
|
818
829
|
}
|
|
819
830
|
|
|
@@ -835,6 +846,59 @@ impl IndexConfig {
|
|
|
835
846
|
));
|
|
836
847
|
}
|
|
837
848
|
}
|
|
849
|
+
if self.tombstone_rebuild_pct > 100 {
|
|
850
|
+
return Err(VectLiteError::InvalidFormat(
|
|
851
|
+
"IndexConfig.tombstone_rebuild_pct must be in 0..=100".to_owned(),
|
|
852
|
+
));
|
|
853
|
+
}
|
|
854
|
+
Ok(())
|
|
855
|
+
}
|
|
856
|
+
}
|
|
857
|
+
|
|
858
|
+
/// Policy deciding when WAL appends are `fsync`'d to disk.
///
/// `PerOp` (the default) syncs after every record, but `fsync` then
/// dominates the cost of single `insert` calls — most visibly on macOS APFS
/// and, to a lesser extent, on Linux ext4. Relaxing the policy can raise
/// ingestion throughput by roughly 5–10×, at the price of possibly losing
/// some recently-acknowledged records after an unclean shutdown.
///
/// Whatever the policy, `flush()`, `compact()` and `close()` *always* sync
/// the WAL in full, so any data that survives a clean shutdown is durable.
/// The exposure per mode is therefore limited to:
/// - `EveryN(n)`: at most the last `n - 1` inserts since the last fsync.
/// - `OnFlush`: every insert since the last `flush()` / `compact()`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum WalSyncMode {
    /// Sync after each WAL append. Slowest, but the strongest guarantee;
    /// this is the default and matches pre-0.11 behaviour.
    PerOp,
    /// Sync once per `n` appended ops. A crash may lose up to the last
    /// `n - 1` ops written since the previous sync. Useful when streaming
    /// thousands of small records: choose `n` so that the worst-case loss is
    /// acceptable (e.g. `64` ≈ a fraction of a second of data).
    EveryN(usize),
    /// Do not sync on the per-op path at all; only `flush()` / `compact()` /
    /// `close()` sync. Highest throughput and weakest durability — suited to
    /// bulk ingestion of data that can be regenerated.
    OnFlush,
}

impl Default for WalSyncMode {
    /// The default policy is the strictest one, `PerOp`.
    fn default() -> Self {
        Self::PerOp
    }
}
|
|
892
|
+
|
|
893
|
+
impl WalSyncMode {
|
|
894
|
+
fn validate(self) -> Result<()> {
|
|
895
|
+
if let WalSyncMode::EveryN(n) = self {
|
|
896
|
+
if n == 0 {
|
|
897
|
+
return Err(VectLiteError::InvalidFormat(
|
|
898
|
+
"WalSyncMode::EveryN must be >= 1".to_owned(),
|
|
899
|
+
));
|
|
900
|
+
}
|
|
901
|
+
}
|
|
838
902
|
Ok(())
|
|
839
903
|
}
|
|
840
904
|
}
|
|
@@ -1299,6 +1363,29 @@ pub struct Database {
|
|
|
1299
1363
|
/// Holds the lock file open for the lifetime of the database.
|
|
1300
1364
|
/// Dropping this releases the advisory lock.
|
|
1301
1365
|
_lock_file: Option<File>,
|
|
1366
|
+
/// Cached WAL writer: avoids paying the open() syscall on every insert.
|
|
1367
|
+
/// Reset whenever the WAL is rotated (compact, clear_wal).
|
|
1368
|
+
wal_writer: Option<BufWriter<File>>,
|
|
1369
|
+
/// Controls when `fsync` is issued against the WAL — see [`WalSyncMode`].
|
|
1370
|
+
wal_sync_mode: WalSyncMode,
|
|
1371
|
+
/// Number of ops appended to the WAL since the last fsync. Used by the
|
|
1372
|
+
/// `EveryN` sync mode to decide when to flush+sync.
|
|
1373
|
+
wal_ops_since_sync: usize,
|
|
1374
|
+
/// True if the in-memory ANN graph(s) have unsaved changes (incremental
|
|
1375
|
+
/// inserts, fresh build, or a full rebuild) that have not been written
|
|
1376
|
+
/// out via `persist_ann_to_disk`. Set on every mutation in
|
|
1377
|
+
/// `apply_wal_batch` / `bulk_ingest` and cleared by `compact_inner` or
|
|
1378
|
+
/// an explicit `persist_ann_to_disk`.
|
|
1379
|
+
ann_dirty: bool,
|
|
1380
|
+
/// True if the quantized PQ index needs to be rebuilt at the next flush
|
|
1381
|
+
/// (because records have been inserted/deleted since the last rebuild).
|
|
1382
|
+
/// While dirty, the in-memory `quantized` field is set to `None` so
|
|
1383
|
+
/// searches transparently fall back to the HNSW path instead of
|
|
1384
|
+
/// returning candidates from a stale codebook.
|
|
1385
|
+
quantized_dirty: bool,
|
|
1386
|
+
/// Same as `quantized_dirty`, but for multi-vector (ColBERT-style)
|
|
1387
|
+
/// quantization spaces. Lazy rebuild happens at flush time.
|
|
1388
|
+
multi_vector_quantized_dirty: bool,
|
|
1302
1389
|
/// Optional quantized index for accelerated search.
|
|
1303
1390
|
quantized: Option<QuantizedIndex>,
|
|
1304
1391
|
/// Configuration used to build the quantized index (persisted).
|
|
@@ -1319,6 +1406,87 @@ pub struct Database {
|
|
|
1319
1406
|
/// knob so callers can change recall/latency tradeoffs without migrating
|
|
1320
1407
|
/// data files. A subsequent `set_index_config` triggers a rebuild.
|
|
1321
1408
|
index_config: IndexConfig,
|
|
1409
|
+
/// Contiguous f32 mirror of the default dense vector for every record.
|
|
1410
|
+
/// Used by brute-force / rescoring scans for cache-friendly SIMD.
|
|
1411
|
+
/// `None` when the arena hasn't been materialised yet for this session.
|
|
1412
|
+
vector_arena: Option<VectorArena>,
|
|
1413
|
+
/// When true, `vector_arena` is stale (e.g. a delete happened) and must
|
|
1414
|
+
/// be rebuilt before use.
|
|
1415
|
+
vector_arena_dirty: bool,
|
|
1416
|
+
}
|
|
1417
|
+
|
|
1418
|
+
/// Contiguous-storage mirror of the default dense vector per record.
|
|
1419
|
+
///
|
|
1420
|
+
/// In the original layout each `Record.vector` is a separately-allocated
|
|
1421
|
+
/// `Vec<f32>` and the records themselves live in `BTreeMap` nodes, so a
|
|
1422
|
+
/// brute-force or rescoring scan pays two pointer hops per record AND
|
|
1423
|
+
/// touches one cache line per vector — terrible for SIMD throughput.
|
|
1424
|
+
///
|
|
1425
|
+
/// This arena stores every vector in a single flat `buf: Vec<f32>` so a scan
|
|
1426
|
+
/// is a straight contiguous walk (one cache miss per ~16 vectors, vs ~2 per
|
|
1427
|
+
/// vector). Lance / Arrow use the same trick — see the v0.11 CHANGELOG note.
|
|
1428
|
+
///
|
|
1429
|
+
/// The arena is maintained incrementally on insert; deletes are too
|
|
1430
|
+
/// expensive to compact in place (would shift O(N) f32s) so they just mark
|
|
1431
|
+
/// the arena dirty and force a lazy full rebuild on next use.
|
|
1432
|
+
struct VectorArena {
|
|
1433
|
+
buf: Vec<f32>,
|
|
1434
|
+
keys: Vec<RecordKey>,
|
|
1435
|
+
key_to_index: HashMap<RecordKey, usize>,
|
|
1436
|
+
dim: usize,
|
|
1437
|
+
}
|
|
1438
|
+
|
|
1439
|
+
impl VectorArena {
|
|
1440
|
+
fn new(dim: usize) -> Self {
|
|
1441
|
+
Self {
|
|
1442
|
+
buf: Vec::new(),
|
|
1443
|
+
keys: Vec::new(),
|
|
1444
|
+
key_to_index: HashMap::new(),
|
|
1445
|
+
dim,
|
|
1446
|
+
}
|
|
1447
|
+
}
|
|
1448
|
+
|
|
1449
|
+
fn append(&mut self, key: RecordKey, vector: &[f32]) {
|
|
1450
|
+
// Defensive: ignore mismatched dims rather than panicking — this is
|
|
1451
|
+
// a perf cache, not the source of truth.
|
|
1452
|
+
if vector.len() != self.dim {
|
|
1453
|
+
return;
|
|
1454
|
+
}
|
|
1455
|
+
let idx = self.keys.len();
|
|
1456
|
+
self.buf.extend_from_slice(vector);
|
|
1457
|
+
self.key_to_index.insert(key.clone(), idx);
|
|
1458
|
+
self.keys.push(key);
|
|
1459
|
+
}
|
|
1460
|
+
|
|
1461
|
+
/// Rebuild from records in BTreeMap order. Called lazily when the arena
|
|
1462
|
+
/// is dirty (i.e. after a delete or a full ANN rebuild).
|
|
1463
|
+
fn rebuild_from(records: &BTreeMap<RecordKey, Record>, dim: usize) -> Self {
|
|
1464
|
+
let mut arena = Self::new(dim);
|
|
1465
|
+
arena.buf.reserve(records.len() * dim);
|
|
1466
|
+
arena.keys.reserve(records.len());
|
|
1467
|
+
arena.key_to_index.reserve(records.len());
|
|
1468
|
+
for (key, record) in records {
|
|
1469
|
+
if record.vector.len() == dim {
|
|
1470
|
+
arena.append(key.clone(), &record.vector);
|
|
1471
|
+
}
|
|
1472
|
+
}
|
|
1473
|
+
arena
|
|
1474
|
+
}
|
|
1475
|
+
|
|
1476
|
+
/// Iterator yielding `(key, vector_slice)` pairs. The slice references
|
|
1477
|
+
/// the contiguous `buf`, so consumers get cache-friendly SIMD scans.
|
|
1478
|
+
#[allow(dead_code)]
|
|
1479
|
+
fn iter(&self) -> impl Iterator<Item = (&RecordKey, &[f32])> {
|
|
1480
|
+
let dim = self.dim;
|
|
1481
|
+
self.keys.iter().enumerate().map(move |(i, k)| {
|
|
1482
|
+
let start = i * dim;
|
|
1483
|
+
(k, &self.buf[start..start + dim])
|
|
1484
|
+
})
|
|
1485
|
+
}
|
|
1486
|
+
|
|
1487
|
+
fn len(&self) -> usize {
|
|
1488
|
+
self.keys.len()
|
|
1489
|
+
}
|
|
1322
1490
|
}
|
|
1323
1491
|
|
|
1324
1492
|
#[derive(Default)]
|
|
@@ -1344,6 +1512,42 @@ impl AnnHnsw {
|
|
|
1344
1512
|
}
|
|
1345
1513
|
}
|
|
1346
1514
|
|
|
1515
|
+
/// Incrementally insert a single vector into an existing HNSW graph.
|
|
1516
|
+
/// `origin_id` must be unique within the graph and is used to map back
|
|
1517
|
+
/// to the caller's record key array.
|
|
1518
|
+
fn insert_one(&mut self, vector: &[f32], origin_id: usize) {
|
|
1519
|
+
match self {
|
|
1520
|
+
AnnHnsw::Cosine(h) => h.insert((vector, origin_id)),
|
|
1521
|
+
AnnHnsw::Euclidean(h) => h.insert((vector, origin_id)),
|
|
1522
|
+
AnnHnsw::DotProduct(h) => h.insert((vector, origin_id)),
|
|
1523
|
+
AnnHnsw::Manhattan(h) => h.insert((vector, origin_id)),
|
|
1524
|
+
}
|
|
1525
|
+
}
|
|
1526
|
+
|
|
1527
|
+
/// Bulk-insert a batch of vectors in parallel (Rayon-multithreaded).
|
|
1528
|
+
/// Significantly faster than repeated `insert_one` when the batch is
|
|
1529
|
+
/// large enough to amortise thread setup.
|
|
1530
|
+
fn parallel_insert_batch(&mut self, batch: &[(&Vec<f32>, usize)]) {
|
|
1531
|
+
match self {
|
|
1532
|
+
AnnHnsw::Cosine(h) => h.parallel_insert(batch),
|
|
1533
|
+
AnnHnsw::Euclidean(h) => h.parallel_insert(batch),
|
|
1534
|
+
AnnHnsw::DotProduct(h) => h.parallel_insert(batch),
|
|
1535
|
+
AnnHnsw::Manhattan(h) => h.parallel_insert(batch),
|
|
1536
|
+
}
|
|
1537
|
+
}
|
|
1538
|
+
|
|
1539
|
+
/// Toggle the `searching_mode` hint on the underlying HNSW. When `true`
|
|
1540
|
+
/// the graph is treated as read-only and lookups skip some bookkeeping;
|
|
1541
|
+
/// when `false` further inserts are allowed.
|
|
1542
|
+
fn set_searching_mode(&mut self, value: bool) {
|
|
1543
|
+
match self {
|
|
1544
|
+
AnnHnsw::Cosine(h) => h.set_searching_mode(value),
|
|
1545
|
+
AnnHnsw::Euclidean(h) => h.set_searching_mode(value),
|
|
1546
|
+
AnnHnsw::DotProduct(h) => h.set_searching_mode(value),
|
|
1547
|
+
AnnHnsw::Manhattan(h) => h.set_searching_mode(value),
|
|
1548
|
+
}
|
|
1549
|
+
}
|
|
1550
|
+
|
|
1347
1551
|
fn file_dump(&self, directory: &Path, basename: &str) -> Result<()> {
|
|
1348
1552
|
let result = match self {
|
|
1349
1553
|
AnnHnsw::Cosine(h) => h.file_dump(directory, basename),
|
|
@@ -1359,7 +1563,38 @@ impl AnnHnsw {
|
|
|
1359
1563
|
|
|
1360
1564
|
struct AnnIndex {
|
|
1361
1565
|
hnsw: AnnHnsw,
|
|
1566
|
+
/// `keys[i]` is the record key for HNSW origin_id `i`. Always grows; we
|
|
1567
|
+
/// never shrink it (HNSW doesn't support compacted deletion). Tombstoned
|
|
1568
|
+
/// slots stay in the vec to keep origin_id ↔ key mapping stable.
|
|
1362
1569
|
keys: Vec<RecordKey>,
|
|
1570
|
+
/// Reverse index: `key → origin_id`. Lets `delete` find a record's HNSW
|
|
1571
|
+
/// node in O(1). Built alongside `keys` on every (re)build.
|
|
1572
|
+
key_to_origin: HashMap<RecordKey, usize>,
|
|
1573
|
+
/// Origin_ids that have been logically deleted but are still part of the
|
|
1574
|
+
/// HNSW graph. Search filters these out by lookup; a `compact()` rebuilds
|
|
1575
|
+
/// the graph once the ratio exceeds `IndexConfig.tombstone_rebuild_pct`.
|
|
1576
|
+
tombstones: HashSet<usize>,
|
|
1577
|
+
}
|
|
1578
|
+
|
|
1579
|
+
impl AnnIndex {
|
|
1580
|
+
/// Number of live (non-tombstoned) records in the graph.
|
|
1581
|
+
fn live_count(&self) -> usize {
|
|
1582
|
+
self.keys.len().saturating_sub(self.tombstones.len())
|
|
1583
|
+
}
|
|
1584
|
+
|
|
1585
|
+
/// True when the fraction of dead nodes is at or above the configured
|
|
1586
|
+
/// rebuild threshold (`IndexConfig.tombstone_rebuild_pct`). Currently
|
|
1587
|
+
/// `compact_inner` rebuilds on *any* tombstones because the persisted
|
|
1588
|
+
/// manifest format only tracks live record counts — when we add a
|
|
1589
|
+
/// tombstone-aware manifest (planned), this becomes the trigger.
|
|
1590
|
+
#[allow(dead_code)]
|
|
1591
|
+
fn should_rebuild(&self, threshold_pct: u8) -> bool {
|
|
1592
|
+
if self.keys.is_empty() {
|
|
1593
|
+
return false;
|
|
1594
|
+
}
|
|
1595
|
+
let pct = (self.tombstones.len() * 100) / self.keys.len();
|
|
1596
|
+
pct >= threshold_pct as usize
|
|
1597
|
+
}
|
|
1363
1598
|
}
|
|
1364
1599
|
|
|
1365
1600
|
struct AnnManifestEntry {
|
|
@@ -1408,6 +1643,12 @@ impl Database {
|
|
|
1408
1643
|
ann_loaded_from_disk: false,
|
|
1409
1644
|
read_only: false,
|
|
1410
1645
|
_lock_file: Some(lock),
|
|
1646
|
+
wal_writer: None,
|
|
1647
|
+
wal_sync_mode: WalSyncMode::default(),
|
|
1648
|
+
wal_ops_since_sync: 0,
|
|
1649
|
+
ann_dirty: false,
|
|
1650
|
+
quantized_dirty: false,
|
|
1651
|
+
multi_vector_quantized_dirty: false,
|
|
1411
1652
|
quantized: None,
|
|
1412
1653
|
quantization_config: None,
|
|
1413
1654
|
quantized_keys: Vec::new(),
|
|
@@ -1417,6 +1658,8 @@ impl Database {
|
|
|
1417
1658
|
payload_index_defs: BTreeMap::new(),
|
|
1418
1659
|
payload_indexes: BTreeMap::new(),
|
|
1419
1660
|
index_config: IndexConfig::default(),
|
|
1661
|
+
vector_arena: None,
|
|
1662
|
+
vector_arena_dirty: false,
|
|
1420
1663
|
};
|
|
1421
1664
|
|
|
1422
1665
|
database.flush()?;
|
|
@@ -1522,6 +1765,8 @@ impl Database {
|
|
|
1522
1765
|
if !self.read_only {
|
|
1523
1766
|
self.compact_inner()?;
|
|
1524
1767
|
}
|
|
1768
|
+
// Drop the cached WAL writer (also closes the underlying file handle).
|
|
1769
|
+
self.wal_writer = None;
|
|
1525
1770
|
// Release the lock by dropping the file handle
|
|
1526
1771
|
self._lock_file = None;
|
|
1527
1772
|
// Clear in-memory state
|
|
@@ -1531,6 +1776,8 @@ impl Database {
|
|
|
1531
1776
|
self.quantized = None;
|
|
1532
1777
|
self.quantization_config = None;
|
|
1533
1778
|
self.quantized_keys.clear();
|
|
1779
|
+
self.vector_arena = None;
|
|
1780
|
+
self.vector_arena_dirty = false;
|
|
1534
1781
|
self.dimension = 0;
|
|
1535
1782
|
Ok(())
|
|
1536
1783
|
}
|
|
@@ -2128,8 +2375,12 @@ impl Database {
|
|
|
2128
2375
|
self.rebuild_ann();
|
|
2129
2376
|
self.ann_loaded_from_disk = false;
|
|
2130
2377
|
self.persist_ann_to_disk()?;
|
|
2378
|
+
self.ann_dirty = false;
|
|
2379
|
+
self.vector_arena_dirty = true;
|
|
2131
2380
|
self.rebuild_quantized_index();
|
|
2381
|
+
self.quantized_dirty = false;
|
|
2132
2382
|
self.rebuild_all_multi_vector_quantized_indexes();
|
|
2383
|
+
self.multi_vector_quantized_dirty = false;
|
|
2133
2384
|
Ok(count)
|
|
2134
2385
|
}
|
|
2135
2386
|
|
|
@@ -2156,8 +2407,12 @@ impl Database {
|
|
|
2156
2407
|
self.rebuild_ann();
|
|
2157
2408
|
self.ann_loaded_from_disk = false;
|
|
2158
2409
|
self.persist_ann_to_disk()?;
|
|
2410
|
+
self.ann_dirty = false;
|
|
2411
|
+
self.vector_arena_dirty = true;
|
|
2159
2412
|
self.rebuild_quantized_index();
|
|
2413
|
+
self.quantized_dirty = false;
|
|
2160
2414
|
self.rebuild_all_multi_vector_quantized_indexes();
|
|
2415
|
+
self.multi_vector_quantized_dirty = false;
|
|
2161
2416
|
Ok(count)
|
|
2162
2417
|
}
|
|
2163
2418
|
|
|
@@ -2361,8 +2616,12 @@ impl Database {
|
|
|
2361
2616
|
self.rebuild_ann();
|
|
2362
2617
|
self.ann_loaded_from_disk = false;
|
|
2363
2618
|
self.persist_ann_to_disk()?;
|
|
2619
|
+
self.ann_dirty = false;
|
|
2620
|
+
self.vector_arena_dirty = true;
|
|
2364
2621
|
self.rebuild_quantized_index();
|
|
2622
|
+
self.quantized_dirty = false;
|
|
2365
2623
|
self.rebuild_all_multi_vector_quantized_indexes();
|
|
2624
|
+
self.multi_vector_quantized_dirty = false;
|
|
2366
2625
|
Ok(())
|
|
2367
2626
|
}
|
|
2368
2627
|
|
|
@@ -2573,6 +2832,80 @@ impl Database {
|
|
|
2573
2832
|
self.compact_inner()
|
|
2574
2833
|
}
|
|
2575
2834
|
|
|
2835
|
+
/// Configure WAL durability. See [`WalSyncMode`] for the safety / speed
|
|
2836
|
+
/// tradeoffs.
|
|
2837
|
+
///
|
|
2838
|
+
/// Switching to a more relaxed mode while there are unsync'd bytes in
|
|
2839
|
+
/// the WAL is safe — the bytes simply stay in the BufWriter / OS cache
|
|
2840
|
+
/// until the next sync point (`flush()`, `compact()`, `close()`, or the
|
|
2841
|
+
/// counter reaching `EveryN(n)`). Switching to a *stricter* mode forces
|
|
2842
|
+
/// an immediate sync so there is no surprise loss window.
|
|
2843
|
+
pub fn set_wal_sync_mode(&mut self, mode: WalSyncMode) -> Result<()> {
|
|
2844
|
+
self.check_writable()?;
|
|
2845
|
+
mode.validate()?;
|
|
2846
|
+
let previous = self.wal_sync_mode;
|
|
2847
|
+
self.wal_sync_mode = mode;
|
|
2848
|
+
// If we just tightened durability (e.g. moved from OnFlush back to
|
|
2849
|
+
// PerOp) and there are pending ops, sync immediately so the user's
|
|
2850
|
+
// mental model — "after this call any acknowledged write is durable"
|
|
2851
|
+
// — holds.
|
|
2852
|
+
let became_stricter = matches!(
|
|
2853
|
+
(previous, mode),
|
|
2854
|
+
(
|
|
2855
|
+
WalSyncMode::OnFlush,
|
|
2856
|
+
WalSyncMode::PerOp | WalSyncMode::EveryN(_)
|
|
2857
|
+
) | (WalSyncMode::EveryN(_), WalSyncMode::PerOp)
|
|
2858
|
+
);
|
|
2859
|
+
if became_stricter && self.wal_ops_since_sync > 0 {
|
|
2860
|
+
self.sync_wal()?;
|
|
2861
|
+
self.wal_ops_since_sync = 0;
|
|
2862
|
+
}
|
|
2863
|
+
Ok(())
|
|
2864
|
+
}
|
|
2865
|
+
|
|
2866
|
+
/// Return the WAL sync mode currently in effect, by value (`WalSyncMode` is `Copy`).
|
|
2867
|
+
pub fn wal_sync_mode(&self) -> WalSyncMode {
|
|
2868
|
+
self.wal_sync_mode
|
|
2869
|
+
}
|
|
2870
|
+
|
|
2871
|
+
/// Materialise the contiguous-vector arena up front.
|
|
2872
|
+
///
|
|
2873
|
+
/// The arena mirrors the default dense vector of every record in a
|
|
2874
|
+
/// single flat `Vec<f32>` — much more cache- and SIMD-friendly than the
|
|
2875
|
+
/// default `BTreeMap<Record>` layout. It's normally built lazily on
|
|
2876
|
+
/// first use, but if you know a heavy brute-force or rescoring scan is
|
|
2877
|
+
/// coming you can pay the build cost up front by calling this. Cheap
|
|
2878
|
+
/// when already fresh.
|
|
2879
|
+
pub fn prepare_for_scan(&mut self) {
|
|
2880
|
+
// The result of `ensure_vector_arena` is deliberately discarded; presumably the call is made only to force the arena to be built — confirm against that helper.
let _ = self.ensure_vector_arena();
|
|
2881
|
+
}
|
|
2882
|
+
|
|
2883
|
+
/// Number of vectors in the contiguous arena, or `None` if the arena
|
|
2884
|
+
/// hasn't been materialised yet for this session. Useful for tests and
|
|
2885
|
+
/// observability.
|
|
2886
|
+
pub fn vector_arena_len(&self) -> Option<usize> {
|
|
2887
|
+
self.vector_arena.as_ref().map(VectorArena::len)
|
|
2888
|
+
}
|
|
2889
|
+
|
|
2890
|
+
/// Return (live_count, tombstoned_count) summed across every HNSW graph
|
|
2891
|
+
/// (global + per-namespace). Useful for monitoring when a `compact()`
|
|
2892
|
+
/// would benefit from rebuilding the graph(s).
|
|
2893
|
+
pub fn tombstone_stats(&self) -> (usize, usize) {
|
|
2894
|
+
let mut live = 0usize;
|
|
2895
|
+
let mut dead = 0usize;
|
|
2896
|
+
for idx in self.ann.global.values() {
|
|
2897
|
+
live += idx.live_count();
|
|
2898
|
+
dead += idx.tombstones.len();
|
|
2899
|
+
}
|
|
2900
|
+
for indexes in self.ann.namespaces.values() {
|
|
2901
|
+
for idx in indexes.values() {
|
|
2902
|
+
live += idx.live_count();
|
|
2903
|
+
dead += idx.tombstones.len();
|
|
2904
|
+
}
|
|
2905
|
+
}
|
|
2906
|
+
(live, dead)
|
|
2907
|
+
}
|
|
2908
|
+
|
|
2576
2909
|
/// Bulk-ingest many records efficiently. WAL writes happen in batches of
|
|
2577
2910
|
/// `batch_size`, but the ANN index and sparse index are only rebuilt once
|
|
2578
2911
|
/// at the very end, making this much faster than `upsert_many` for large
|
|
@@ -2644,9 +2977,16 @@ impl Database {
|
|
|
2644
2977
|
self.rebuild_sparse_index();
|
|
2645
2978
|
self.rebuild_ann();
|
|
2646
2979
|
self.ann_loaded_from_disk = false;
|
|
2980
|
+
// Persist the freshly-built ANN so a subsequent reopen can skip
|
|
2981
|
+
// the rebuild — bulk_ingest is a "batch" operation and callers
|
|
2982
|
+
// expect index state to be on disk afterwards.
|
|
2647
2983
|
self.persist_ann_to_disk()?;
|
|
2984
|
+
self.ann_dirty = false;
|
|
2985
|
+
self.vector_arena_dirty = true;
|
|
2648
2986
|
self.rebuild_quantized_index();
|
|
2987
|
+
self.quantized_dirty = false;
|
|
2649
2988
|
self.rebuild_all_multi_vector_quantized_indexes();
|
|
2989
|
+
self.multi_vector_quantized_dirty = false;
|
|
2650
2990
|
}
|
|
2651
2991
|
|
|
2652
2992
|
Ok(total)
|
|
@@ -2665,6 +3005,7 @@ impl Database {
|
|
|
2665
3005
|
self.rebuild_ann();
|
|
2666
3006
|
self.ann_loaded_from_disk = false;
|
|
2667
3007
|
self.persist_ann_to_disk()?;
|
|
3008
|
+
self.ann_dirty = false;
|
|
2668
3009
|
}
|
|
2669
3010
|
Ok(())
|
|
2670
3011
|
}
|
|
@@ -2710,6 +3051,7 @@ impl Database {
|
|
|
2710
3051
|
validate_quantization_config(&config, self.dimension)?;
|
|
2711
3052
|
self.quantization_config = Some(config);
|
|
2712
3053
|
self.rebuild_quantized_index();
|
|
3054
|
+
self.quantized_dirty = false;
|
|
2713
3055
|
self.persist_quantization_params()?;
|
|
2714
3056
|
Ok(())
|
|
2715
3057
|
}
|
|
@@ -2720,6 +3062,7 @@ impl Database {
|
|
|
2720
3062
|
self.quantized = None;
|
|
2721
3063
|
self.quantization_config = None;
|
|
2722
3064
|
self.quantized_keys.clear();
|
|
3065
|
+
self.quantized_dirty = false;
|
|
2723
3066
|
// Remove the sidecar file
|
|
2724
3067
|
let params_path = quantization_params_path(&self.path);
|
|
2725
3068
|
if params_path.exists() {
|
|
@@ -3401,6 +3744,54 @@ impl Database {
|
|
|
3401
3744
|
self.records.remove(key);
|
|
3402
3745
|
}
|
|
3403
3746
|
|
|
3747
|
+
// If any HNSW graph has tombstones, rebuild it before persisting.
|
|
3748
|
+
//
|
|
3749
|
+
// Two reasons:
|
|
3750
|
+
// 1. Crossing `tombstone_rebuild_pct` means search recall has
|
|
3751
|
+
// degraded enough that the user wants a clean graph.
|
|
3752
|
+
// 2. Even below the threshold, the persisted manifest's
|
|
3753
|
+
// `record_count` is derived from `self.records` (live only),
|
|
3754
|
+
// but the in-memory `keys` array includes dead slots — so a
|
|
3755
|
+
// persisted-with-tombstones graph would always fail the
|
|
3756
|
+
// record_count check on reopen and rebuild anyway. Rebuilding
|
|
3757
|
+
// *now* dumps a clean graph that survives reload.
|
|
3758
|
+
let threshold = self.index_config.tombstone_rebuild_pct;
|
|
3759
|
+
let any_tombstones = self
|
|
3760
|
+
.ann
|
|
3761
|
+
.global
|
|
3762
|
+
.values()
|
|
3763
|
+
.any(|idx| !idx.tombstones.is_empty())
|
|
3764
|
+
|| self
|
|
3765
|
+
.ann
|
|
3766
|
+
.namespaces
|
|
3767
|
+
.values()
|
|
3768
|
+
.flat_map(|m| m.values())
|
|
3769
|
+
.any(|idx| !idx.tombstones.is_empty());
|
|
3770
|
+
// (We track `threshold` even though we currently rebuild on any
|
|
3771
|
+
// tombstones, so `should_rebuild` could later replace this when we
|
|
3772
|
+
// add tombstone persistence in the manifest.)
|
|
3773
|
+
let _ = threshold;
|
|
3774
|
+
if any_tombstones {
|
|
3775
|
+
self.rebuild_ann();
|
|
3776
|
+
}
|
|
3777
|
+
|
|
3778
|
+
// Rebuild any lazy indexes that were marked dirty during the session
|
|
3779
|
+
// before we persist. This is the point where we pay back the work
|
|
3780
|
+
// we deferred from the per-insert hot path:
|
|
3781
|
+
// - the HNSW graph is already up-to-date (incremental inserts),
|
|
3782
|
+
// we just need to dump it.
|
|
3783
|
+
// - the quantized PQ index was dropped on first insert and is
|
|
3784
|
+
// rebuilt now so search can use it again next session.
|
|
3785
|
+
// - same for multi-vector PQ.
|
|
3786
|
+
if self.quantized_dirty {
|
|
3787
|
+
self.rebuild_quantized_index();
|
|
3788
|
+
self.quantized_dirty = false;
|
|
3789
|
+
}
|
|
3790
|
+
if self.multi_vector_quantized_dirty {
|
|
3791
|
+
self.rebuild_all_multi_vector_quantized_indexes();
|
|
3792
|
+
self.multi_vector_quantized_dirty = false;
|
|
3793
|
+
}
|
|
3794
|
+
|
|
3404
3795
|
if let Some(parent) = self.path.parent() {
|
|
3405
3796
|
if !parent.as_os_str().is_empty() {
|
|
3406
3797
|
fs::create_dir_all(parent)?;
|
|
@@ -3423,6 +3814,7 @@ impl Database {
|
|
|
3423
3814
|
self.clear_wal()?;
|
|
3424
3815
|
self.wal_entries_replayed = 0;
|
|
3425
3816
|
self.persist_ann_to_disk()?;
|
|
3817
|
+
self.ann_dirty = false;
|
|
3426
3818
|
|
|
3427
3819
|
Ok(())
|
|
3428
3820
|
}
|
|
@@ -3563,6 +3955,65 @@ impl Database {
|
|
|
3563
3955
|
.iter()
|
|
3564
3956
|
.all(|op| matches!(op, WalOp::UpdateMetadata { .. } | WalOp::SetTtl { .. }));
|
|
3565
3957
|
|
|
3958
|
+
// Categorise each op so we can route to the fastest correct path:
|
|
3959
|
+
// incremental insert (Upsert with new key) → ann_apply_incremental
|
|
3960
|
+
// tombstone delete (Delete of present key) → ann_apply_tombstones
|
|
3961
|
+
// anything else (upsert of existing key, etc) → full rebuild
|
|
3962
|
+
let mut incremental_eligible = !metadata_only;
|
|
3963
|
+
let mut tombstone_only = !metadata_only;
|
|
3964
|
+
for op in &ops {
|
|
3965
|
+
match op {
|
|
3966
|
+
WalOp::Upsert(record) => {
|
|
3967
|
+
let exists = self
|
|
3968
|
+
.records
|
|
3969
|
+
.contains_key(&(record.namespace.clone(), record.id.clone()));
|
|
3970
|
+
if exists {
|
|
3971
|
+
incremental_eligible = false;
|
|
3972
|
+
tombstone_only = false;
|
|
3973
|
+
} else {
|
|
3974
|
+
// New upsert — fine for incremental, but not tombstone-only.
|
|
3975
|
+
tombstone_only = false;
|
|
3976
|
+
}
|
|
3977
|
+
}
|
|
3978
|
+
WalOp::Delete { namespace, id } => {
|
|
3979
|
+
let exists = self.records.contains_key(&(namespace.clone(), id.clone()));
|
|
3980
|
+
if exists {
|
|
3981
|
+
// OK for tombstone path, but not for incremental.
|
|
3982
|
+
incremental_eligible = false;
|
|
3983
|
+
}
|
|
3984
|
+
// (A delete of a non-existent key is a no-op for both
|
|
3985
|
+
// paths, but we still let it through.)
|
|
3986
|
+
}
|
|
3987
|
+
WalOp::UpdateMetadata { .. } | WalOp::SetTtl { .. } => {
|
|
3988
|
+
incremental_eligible = false;
|
|
3989
|
+
tombstone_only = false;
|
|
3990
|
+
}
|
|
3991
|
+
}
|
|
3992
|
+
}
|
|
3993
|
+
|
|
3994
|
+
// Collect the keys we'll need to feed to the relevant updater
|
|
3995
|
+
// before we move `ops` into `apply_ops_in_memory`.
|
|
3996
|
+
let new_keys: Vec<RecordKey> = if incremental_eligible {
|
|
3997
|
+
ops.iter()
|
|
3998
|
+
.filter_map(|op| match op {
|
|
3999
|
+
WalOp::Upsert(record) => Some((record.namespace.clone(), record.id.clone())),
|
|
4000
|
+
_ => None,
|
|
4001
|
+
})
|
|
4002
|
+
.collect()
|
|
4003
|
+
} else {
|
|
4004
|
+
Vec::new()
|
|
4005
|
+
};
|
|
4006
|
+
let deleted_keys: Vec<RecordKey> = if tombstone_only {
|
|
4007
|
+
ops.iter()
|
|
4008
|
+
.filter_map(|op| match op {
|
|
4009
|
+
WalOp::Delete { namespace, id } => Some((namespace.clone(), id.clone())),
|
|
4010
|
+
_ => None,
|
|
4011
|
+
})
|
|
4012
|
+
.collect()
|
|
4013
|
+
} else {
|
|
4014
|
+
Vec::new()
|
|
4015
|
+
};
|
|
4016
|
+
|
|
3566
4017
|
self.append_wal_batch(&ops)?;
|
|
3567
4018
|
self.apply_ops_in_memory(ops);
|
|
3568
4019
|
|
|
@@ -3571,11 +4022,55 @@ impl Database {
|
|
|
3571
4022
|
if has_sparse {
|
|
3572
4023
|
self.rebuild_sparse_index();
|
|
3573
4024
|
}
|
|
3574
|
-
|
|
4025
|
+
if incremental_eligible {
|
|
4026
|
+
// Fast path: just append the new vectors into the existing
|
|
4027
|
+
// HNSW graph(s) instead of rebuilding from scratch. Converts
|
|
4028
|
+
// single-record ingestion from O(N log N) per insert to
|
|
4029
|
+
// amortised O(log N).
|
|
4030
|
+
self.ann_apply_incremental(&new_keys);
|
|
4031
|
+
// Keep the contiguous arena in sync. If it hasn't been
|
|
4032
|
+
// materialised yet, leave it alone — it'll be lazily built
|
|
4033
|
+
// on first read.
|
|
4034
|
+
if self.vector_arena.is_some() && !self.vector_arena_dirty {
|
|
4035
|
+
self.arena_apply_incremental(&new_keys);
|
|
4036
|
+
}
|
|
4037
|
+
} else if tombstone_only {
|
|
4038
|
+
// Delete-only fast path: tombstone the corresponding
|
|
4039
|
+
// `origin_id`s in each affected HNSW graph. No rebuild;
|
|
4040
|
+
// search filters out tombstoned candidates. The graph is
|
|
4041
|
+
// rebuilt automatically at the next `compact()` once the
|
|
4042
|
+
// tombstone ratio crosses `tombstone_rebuild_pct`.
|
|
4043
|
+
self.ann_apply_tombstones(&deleted_keys);
|
|
4044
|
+
// The arena can't compact in place without shifting O(N)
|
|
4045
|
+
// floats; mark dirty so it's lazily rebuilt on next scan.
|
|
4046
|
+
self.vector_arena_dirty = true;
|
|
4047
|
+
} else {
|
|
4048
|
+
// Slow path: a mixed-mode batch or an update-of-existing.
|
|
4049
|
+
// Rebuild the whole catalog.
|
|
4050
|
+
self.rebuild_ann();
|
|
4051
|
+
self.vector_arena_dirty = true;
|
|
4052
|
+
}
|
|
4053
|
+
// Defer persistence of the HNSW graph to disk: writing the graph
|
|
4054
|
+
// files is expensive (full re-dump + fsync) and is only required
|
|
4055
|
+
// for crash recovery on reopen. The WAL gives us that durability
|
|
4056
|
+
// already — on reopen, if the persisted graph is stale, it's
|
|
4057
|
+
// detected via the manifest signature check and rebuilt from
|
|
4058
|
+
// records in memory. Persistence happens at `flush` / `compact`.
|
|
3575
4059
|
self.ann_loaded_from_disk = false;
|
|
3576
|
-
self.
|
|
3577
|
-
|
|
3578
|
-
|
|
4060
|
+
self.ann_dirty = true;
|
|
4061
|
+
// Lazy-rebuild quantized indexes too. Drop the in-memory
|
|
4062
|
+
// structures so callers get correct (HNSW-fallback) results
|
|
4063
|
+
// until the next flush, where we rebuild from the new corpus.
|
|
4064
|
+
if self.quantization_config.is_some() {
|
|
4065
|
+
self.quantized = None;
|
|
4066
|
+
self.quantized_keys.clear();
|
|
4067
|
+
self.quantized_dirty = true;
|
|
4068
|
+
}
|
|
4069
|
+
if !self.multi_vector_quantization_config.is_empty() {
|
|
4070
|
+
self.multi_vector_quantized.clear();
|
|
4071
|
+
self.multi_vector_quantized_keys.clear();
|
|
4072
|
+
self.multi_vector_quantized_dirty = true;
|
|
4073
|
+
}
|
|
3579
4074
|
}
|
|
3580
4075
|
Ok(())
|
|
3581
4076
|
}
|
|
@@ -3659,58 +4154,109 @@ impl Database {
|
|
|
3659
4154
|
}
|
|
3660
4155
|
}
|
|
3661
4156
|
|
|
3662
|
-
fn append_wal_batch(&self, ops: &[WalOp]) -> Result<()> {
|
|
3663
|
-
|
|
4157
|
+
fn append_wal_batch(&mut self, ops: &[WalOp]) -> Result<()> {
|
|
4158
|
+
// Decide whether this batch should trigger an fsync. We use the
|
|
4159
|
+
// ops count in the batch (not 1) so `EveryN` semantics scale across
|
|
4160
|
+
// both single inserts and `insert_many` calls.
|
|
4161
|
+
let n_ops = ops.len();
|
|
4162
|
+
let should_sync = match self.wal_sync_mode {
|
|
4163
|
+
WalSyncMode::PerOp => true,
|
|
4164
|
+
WalSyncMode::EveryN(n) => {
|
|
4165
|
+
self.wal_ops_since_sync = self.wal_ops_since_sync.saturating_add(n_ops);
|
|
4166
|
+
if self.wal_ops_since_sync >= n {
|
|
4167
|
+
self.wal_ops_since_sync = 0;
|
|
4168
|
+
true
|
|
4169
|
+
} else {
|
|
4170
|
+
false
|
|
4171
|
+
}
|
|
4172
|
+
}
|
|
4173
|
+
WalSyncMode::OnFlush => {
|
|
4174
|
+
self.wal_ops_since_sync = self.wal_ops_since_sync.saturating_add(n_ops);
|
|
4175
|
+
false
|
|
4176
|
+
}
|
|
4177
|
+
};
|
|
4178
|
+
self.append_wal_batch_inner(ops, should_sync)
|
|
3664
4179
|
}
|
|
3665
4180
|
|
|
3666
4181
|
/// Append a WAL batch without issuing an fsync. The caller is responsible
|
|
3667
4182
|
/// for issuing `sync_wal` later (typically once at the end of a bulk
|
|
3668
4183
|
/// ingest). This is the hot path for `bulk_ingest`.
|
|
3669
|
-
fn append_wal_batch_unsynced(&self, ops: &[WalOp]) -> Result<()> {
|
|
4184
|
+
fn append_wal_batch_unsynced(&mut self, ops: &[WalOp]) -> Result<()> {
    // Record the ops as pending (saturating, so the counter can never wrap)
    // before handing them to the writer with `sync = false`.
    let pending = self.wal_ops_since_sync.saturating_add(ops.len());
    self.wal_ops_since_sync = pending;
    self.append_wal_batch_inner(ops, false)
}
|
|
3672
4190
|
|
|
3673
|
-
|
|
4191
|
+
/// Append a WAL batch. Reuses a cached `BufWriter<File>` across calls so
|
|
4192
|
+
/// the WAL file is only opened once per database session — saving the
|
|
4193
|
+
/// `open()` syscall on every single `insert` call, which matters when
|
|
4194
|
+
/// per-record overhead is the bottleneck.
|
|
4195
|
+
fn append_wal_batch_inner(&mut self, ops: &[WalOp], sync: bool) -> Result<()> {
|
|
3674
4196
|
if let Some(parent) = self.wal_path.parent() {
|
|
3675
4197
|
if !parent.as_os_str().is_empty() {
|
|
3676
4198
|
fs::create_dir_all(parent)?;
|
|
3677
4199
|
}
|
|
3678
4200
|
}
|
|
3679
4201
|
|
|
3680
|
-
|
|
3681
|
-
|
|
3682
|
-
|
|
3683
|
-
.
|
|
3684
|
-
|
|
3685
|
-
|
|
3686
|
-
|
|
3687
|
-
|
|
4202
|
+
// Lazily create the cached BufWriter, writing the WAL_MAGIC header
|
|
4203
|
+
// on first use of a brand-new file.
|
|
4204
|
+
if self.wal_writer.is_none() {
|
|
4205
|
+
let new_file = !self.wal_path.exists();
|
|
4206
|
+
let file = OpenOptions::new()
|
|
4207
|
+
.create(true)
|
|
4208
|
+
.append(true)
|
|
4209
|
+
.open(&self.wal_path)?;
|
|
4210
|
+
let mut writer = BufWriter::with_capacity(64 * 1024, file);
|
|
4211
|
+
if new_file {
|
|
4212
|
+
writer.write_all(WAL_MAGIC)?;
|
|
4213
|
+
}
|
|
4214
|
+
self.wal_writer = Some(writer);
|
|
3688
4215
|
}
|
|
3689
4216
|
|
|
4217
|
+
// Serialise the batch into a temporary buffer first, so that the
|
|
4218
|
+
// single `write_all` we issue to the cached writer is one contiguous
|
|
4219
|
+
// user-space copy (BufWriter then bunches everything up further).
|
|
3690
4220
|
let mut buffer = Vec::new();
|
|
3691
4221
|
write_u32(&mut buffer, u32_from_usize(ops.len())?)?;
|
|
3692
4222
|
for op in ops {
|
|
3693
4223
|
write_wal_op(&mut buffer, op)?;
|
|
3694
4224
|
}
|
|
3695
4225
|
|
|
3696
|
-
|
|
3697
|
-
|
|
4226
|
+
let writer = self.wal_writer.as_mut().unwrap();
|
|
4227
|
+
write_u32(writer, u32_from_usize(buffer.len())?)?;
|
|
4228
|
+
writer.write_all(&buffer)?;
|
|
4229
|
+
|
|
3698
4230
|
if sync {
|
|
3699
|
-
|
|
4231
|
+
// Flush BufWriter into the OS, then ask the kernel to make the
|
|
4232
|
+
// bytes durable. We must `flush()` before `sync_all()` — sync_all
|
|
4233
|
+
// only operates on what's already in the kernel's page cache.
|
|
4234
|
+
writer.flush()?;
|
|
4235
|
+
writer.get_ref().sync_all()?;
|
|
3700
4236
|
}
|
|
3701
4237
|
Ok(())
|
|
3702
4238
|
}
|
|
3703
4239
|
|
|
3704
|
-
/// Force a durability fence on the WAL file.
|
|
3705
|
-
///
|
|
3706
|
-
///
|
|
3707
|
-
///
|
|
3708
|
-
fn sync_wal(&self) -> Result<()> {
|
|
4240
|
+
/// Force a durability fence on the WAL file. Flushes any buffered bytes
|
|
4241
|
+
/// from the cached writer and asks the kernel to make them durable in a
|
|
4242
|
+
/// single `sync_all`. Used by `bulk_ingest`, `flush`, `close`, and as a
|
|
4243
|
+
/// manual fence when running in `EveryN` or `OnFlush` mode.
|
|
4244
|
+
fn sync_wal(&mut self) -> Result<()> {
|
|
4245
|
+
if let Some(writer) = self.wal_writer.as_mut() {
|
|
4246
|
+
writer.flush()?;
|
|
4247
|
+
writer.get_ref().sync_all()?;
|
|
4248
|
+
self.wal_ops_since_sync = 0;
|
|
4249
|
+
return Ok(());
|
|
4250
|
+
}
|
|
4251
|
+
// Fallback: no cached writer (e.g. WAL was opened externally). Open
|
|
4252
|
+
// the file briefly just to issue the sync.
|
|
3709
4253
|
if !self.wal_path.exists() {
|
|
4254
|
+
self.wal_ops_since_sync = 0;
|
|
3710
4255
|
return Ok(());
|
|
3711
4256
|
}
|
|
3712
4257
|
let file = OpenOptions::new().append(true).open(&self.wal_path)?;
|
|
3713
4258
|
file.sync_all()?;
|
|
4259
|
+
self.wal_ops_since_sync = 0;
|
|
3714
4260
|
Ok(())
|
|
3715
4261
|
}
|
|
3716
4262
|
|
|
@@ -3764,7 +4310,12 @@ impl Database {
|
|
|
3764
4310
|
Ok(())
|
|
3765
4311
|
}
|
|
3766
4312
|
|
|
3767
|
-
fn clear_wal(&self) -> Result<()> {
|
|
4313
|
+
fn clear_wal(&mut self) -> Result<()> {
|
|
4314
|
+
// Drop the cached writer first: on POSIX the file would survive the
|
|
4315
|
+
// unlink because we still hold an open handle, but we'd then keep
|
|
4316
|
+
// appending into the now-detached inode and never see those bytes on
|
|
4317
|
+
// disk after reopen.
|
|
4318
|
+
self.wal_writer = None;
|
|
3768
4319
|
if self.wal_path.exists() {
|
|
3769
4320
|
fs::remove_file(&self.wal_path)?;
|
|
3770
4321
|
}
|
|
@@ -3876,6 +4427,12 @@ impl Database {
|
|
|
3876
4427
|
ann_loaded_from_disk: false,
|
|
3877
4428
|
read_only: false,
|
|
3878
4429
|
_lock_file: None,
|
|
4430
|
+
wal_writer: None,
|
|
4431
|
+
wal_sync_mode: WalSyncMode::default(),
|
|
4432
|
+
wal_ops_since_sync: 0,
|
|
4433
|
+
ann_dirty: false,
|
|
4434
|
+
quantized_dirty: false,
|
|
4435
|
+
multi_vector_quantized_dirty: false,
|
|
3879
4436
|
quantized: None,
|
|
3880
4437
|
quantization_config: None,
|
|
3881
4438
|
quantized_keys: Vec::new(),
|
|
@@ -3885,6 +4442,8 @@ impl Database {
|
|
|
3885
4442
|
payload_index_defs: BTreeMap::new(),
|
|
3886
4443
|
payload_indexes: BTreeMap::new(),
|
|
3887
4444
|
index_config: IndexConfig::default(),
|
|
4445
|
+
vector_arena: None,
|
|
4446
|
+
vector_arena_dirty: false,
|
|
3888
4447
|
})
|
|
3889
4448
|
}
|
|
3890
4449
|
|
|
@@ -4022,6 +4581,237 @@ impl Database {
|
|
|
4022
4581
|
Ok(())
|
|
4023
4582
|
}
|
|
4024
4583
|
|
|
4584
|
+
/// Incremental ANN update. Appends the given new records into the
|
|
4585
|
+
/// existing HNSW graph(s) without rebuilding them from scratch.
|
|
4586
|
+
///
|
|
4587
|
+
/// Preconditions:
|
|
4588
|
+
/// - `new_keys` are keys that already live in `self.records` (caller
|
|
4589
|
+
/// must have applied the WAL ops to memory first).
|
|
4590
|
+
/// - Each key referenced by `new_keys` did NOT previously exist in
|
|
4591
|
+
/// `self.records` (i.e. it's a true insert, not an update).
|
|
4592
|
+
///
|
|
4593
|
+
/// Behaviour per (namespace, vector_name) "slot":
|
|
4594
|
+
/// - If a graph already exists, the new vectors are appended to it
|
|
4595
|
+
/// via single-element `hnsw.insert` calls (or `parallel_insert` if
|
|
4596
|
+
/// the batch is large enough to amortise thread overhead).
|
|
4597
|
+
/// - If no graph exists but the total record count for that slot has
|
|
4598
|
+
/// now crossed `ANN_MIN_POINTS`, a fresh graph is built from all
|
|
4599
|
+
/// matching records.
|
|
4600
|
+
/// - Below `ANN_MIN_POINTS`, we skip — searches will brute-force
|
|
4601
|
+
/// without harm.
|
|
4602
|
+
fn ann_apply_incremental(&mut self, new_keys: &[RecordKey]) {
|
|
4603
|
+
if new_keys.is_empty() {
|
|
4604
|
+
return;
|
|
4605
|
+
}
|
|
4606
|
+
let cfg = self.index_config;
|
|
4607
|
+
|
|
4608
|
+
// Group the new records by (Option<namespace>, vector_name). Each
|
|
4609
|
+
// upserted record contributes to exactly one global slot and one
|
|
4610
|
+
// namespace-scoped slot per dense vector it owns.
|
|
4611
|
+
let mut groups: BTreeMap<(Option<String>, String), Vec<(RecordKey, Vec<f32>)>> =
|
|
4612
|
+
BTreeMap::new();
|
|
4613
|
+
for key in new_keys {
|
|
4614
|
+
let Some(record) = self.records.get(key) else {
|
|
4615
|
+
continue;
|
|
4616
|
+
};
|
|
4617
|
+
for (vector_name, vector) in record.dense_vectors() {
|
|
4618
|
+
let item = (key.clone(), vector.clone());
|
|
4619
|
+
groups
|
|
4620
|
+
.entry((None, vector_name.to_owned()))
|
|
4621
|
+
.or_default()
|
|
4622
|
+
.push(item.clone());
|
|
4623
|
+
groups
|
|
4624
|
+
.entry((Some(record.namespace.clone()), vector_name.to_owned()))
|
|
4625
|
+
.or_default()
|
|
4626
|
+
.push(item);
|
|
4627
|
+
}
|
|
4628
|
+
}
|
|
4629
|
+
|
|
4630
|
+
// Two-phase processing to keep the borrow checker happy:
|
|
4631
|
+
// phase 1: classify each slot (needs fresh build vs incremental
|
|
4632
|
+
// append), reading `self.records` only.
|
|
4633
|
+
// phase 2: mutate `self.ann` based on the classifications.
|
|
4634
|
+
let mut fresh_builds: Vec<((Option<String>, String), Vec<(RecordKey, Vec<f32>)>)> =
|
|
4635
|
+
Vec::new();
|
|
4636
|
+
let mut incremental: Vec<((Option<String>, String), Vec<(RecordKey, Vec<f32>)>)> =
|
|
4637
|
+
Vec::new();
|
|
4638
|
+
|
|
4639
|
+
for ((opt_ns, vector_name), new_items) in groups {
|
|
4640
|
+
let has_existing = match &opt_ns {
|
|
4641
|
+
None => self.ann.global.contains_key(&vector_name),
|
|
4642
|
+
Some(ns) => self
|
|
4643
|
+
.ann
|
|
4644
|
+
.namespaces
|
|
4645
|
+
.get(ns)
|
|
4646
|
+
.map_or(false, |m| m.contains_key(&vector_name)),
|
|
4647
|
+
};
|
|
4648
|
+
|
|
4649
|
+
if has_existing {
|
|
4650
|
+
incremental.push(((opt_ns, vector_name), new_items));
|
|
4651
|
+
continue;
|
|
4652
|
+
}
|
|
4653
|
+
|
|
4654
|
+
// Count matching records (post-insert state) to decide whether
|
|
4655
|
+
// we've crossed the build threshold.
|
|
4656
|
+
let total = self
|
|
4657
|
+
.records
|
|
4658
|
+
.iter()
|
|
4659
|
+
.filter(|(_, r)| match &opt_ns {
|
|
4660
|
+
Some(ns) => r.namespace == *ns,
|
|
4661
|
+
None => true,
|
|
4662
|
+
})
|
|
4663
|
+
.filter(|(_, r)| {
|
|
4664
|
+
r.dense_vectors()
|
|
4665
|
+
.any(|(name, _)| name == vector_name.as_str())
|
|
4666
|
+
})
|
|
4667
|
+
.count();
|
|
4668
|
+
|
|
4669
|
+
if total < ANN_MIN_POINTS {
|
|
4670
|
+
continue;
|
|
4671
|
+
}
|
|
4672
|
+
|
|
4673
|
+
// Need to build a fresh graph for this slot. Collect ALL matching
|
|
4674
|
+
// records (not just the new ones) — owned clones so the build
|
|
4675
|
+
// step doesn't borrow `self.records`.
|
|
4676
|
+
let mut all_items: Vec<(RecordKey, Vec<f32>)> = Vec::with_capacity(total);
|
|
4677
|
+
for (k, r) in &self.records {
|
|
4678
|
+
if let Some(ns) = &opt_ns {
|
|
4679
|
+
if r.namespace != *ns {
|
|
4680
|
+
continue;
|
|
4681
|
+
}
|
|
4682
|
+
}
|
|
4683
|
+
for (name, vec) in r.dense_vectors() {
|
|
4684
|
+
if name == vector_name.as_str() {
|
|
4685
|
+
all_items.push((k.clone(), vec.clone()));
|
|
4686
|
+
break;
|
|
4687
|
+
}
|
|
4688
|
+
}
|
|
4689
|
+
}
|
|
4690
|
+
let _ = new_items; // already folded into `all_items`
|
|
4691
|
+
fresh_builds.push(((opt_ns, vector_name), all_items));
|
|
4692
|
+
}
|
|
4693
|
+
|
|
4694
|
+
// Phase 2a: build-from-scratch for slots that just crossed the
|
|
4695
|
+
// threshold.
|
|
4696
|
+
for ((opt_ns, vector_name), all_items) in fresh_builds {
|
|
4697
|
+
let records_for_build: Vec<(RecordKey, &Vec<f32>)> =
|
|
4698
|
+
all_items.iter().map(|(k, v)| (k.clone(), v)).collect();
|
|
4699
|
+
let new_index = build_ann_index(records_for_build, self.metric, &cfg);
|
|
4700
|
+
match opt_ns {
|
|
4701
|
+
None => {
|
|
4702
|
+
self.ann.global.insert(vector_name, new_index);
|
|
4703
|
+
}
|
|
4704
|
+
Some(ns) => {
|
|
4705
|
+
self.ann
|
|
4706
|
+
.namespaces
|
|
4707
|
+
.entry(ns)
|
|
4708
|
+
.or_default()
|
|
4709
|
+
.insert(vector_name, new_index);
|
|
4710
|
+
}
|
|
4711
|
+
}
|
|
4712
|
+
}
|
|
4713
|
+
|
|
4714
|
+
// Phase 2b: incremental appends into existing graphs.
|
|
4715
|
+
for ((opt_ns, vector_name), new_items) in incremental {
|
|
4716
|
+
let idx_opt = match &opt_ns {
|
|
4717
|
+
None => self.ann.global.get_mut(&vector_name),
|
|
4718
|
+
Some(ns) => self
|
|
4719
|
+
.ann
|
|
4720
|
+
.namespaces
|
|
4721
|
+
.get_mut(ns)
|
|
4722
|
+
.and_then(|m| m.get_mut(&vector_name)),
|
|
4723
|
+
};
|
|
4724
|
+
let Some(idx) = idx_opt else {
|
|
4725
|
+
continue;
|
|
4726
|
+
};
|
|
4727
|
+
|
|
4728
|
+
// hnsw_rs marks indexes that have been searched as "searching
|
|
4729
|
+
// mode" (a hint that skips some bookkeeping in the data layer).
|
|
4730
|
+
// Re-enable mutation mode before we insert — cheap toggle.
|
|
4731
|
+
idx.hnsw.set_searching_mode(false);
|
|
4732
|
+
|
|
4733
|
+
if new_items.len() >= cfg.parallel_insert_threshold {
|
|
4734
|
+
let start_id = idx.keys.len();
|
|
4735
|
+
let batch: Vec<(&Vec<f32>, usize)> = new_items
|
|
4736
|
+
.iter()
|
|
4737
|
+
.enumerate()
|
|
4738
|
+
.map(|(offset, (_, v))| (v, start_id + offset))
|
|
4739
|
+
.collect();
|
|
4740
|
+
idx.hnsw.parallel_insert_batch(&batch);
|
|
4741
|
+
for (offset, (k, _)) in new_items.into_iter().enumerate() {
|
|
4742
|
+
let origin_id = start_id + offset;
|
|
4743
|
+
idx.key_to_origin.insert(k.clone(), origin_id);
|
|
4744
|
+
idx.keys.push(k);
|
|
4745
|
+
}
|
|
4746
|
+
} else {
|
|
4747
|
+
for (key, vector) in new_items {
|
|
4748
|
+
let origin_id = idx.keys.len();
|
|
4749
|
+
idx.key_to_origin.insert(key.clone(), origin_id);
|
|
4750
|
+
idx.keys.push(key);
|
|
4751
|
+
idx.hnsw.insert_one(vector.as_slice(), origin_id);
|
|
4752
|
+
}
|
|
4753
|
+
}
|
|
4754
|
+
}
|
|
4755
|
+
}
|
|
4756
|
+
|
|
4757
|
+
/// Append newly-inserted vectors to the contiguous arena. Caller must
|
|
4758
|
+
/// have already inserted the records into `self.records` and confirmed
|
|
4759
|
+
/// the arena exists and isn't dirty.
|
|
4760
|
+
fn arena_apply_incremental(&mut self, new_keys: &[RecordKey]) {
|
|
4761
|
+
let Some(arena) = self.vector_arena.as_mut() else {
|
|
4762
|
+
return;
|
|
4763
|
+
};
|
|
4764
|
+
for key in new_keys {
|
|
4765
|
+
if let Some(record) = self.records.get(key) {
|
|
4766
|
+
arena.append(key.clone(), &record.vector);
|
|
4767
|
+
}
|
|
4768
|
+
}
|
|
4769
|
+
}
|
|
4770
|
+
|
|
4771
|
+
/// Ensure the contiguous arena is materialised and fresh. Cheap when
|
|
4772
|
+
/// already clean; rebuilds from `self.records` (in BTreeMap order) on
|
|
4773
|
+
/// first call or after a delete. Allocates `dim * N` f32s.
|
|
4774
|
+
fn ensure_vector_arena(&mut self) -> &VectorArena {
|
|
4775
|
+
let needs_build = self
|
|
4776
|
+
.vector_arena
|
|
4777
|
+
.as_ref()
|
|
4778
|
+
.map_or(true, |a| self.vector_arena_dirty || a.dim != self.dimension);
|
|
4779
|
+
if needs_build {
|
|
4780
|
+
self.vector_arena = Some(VectorArena::rebuild_from(&self.records, self.dimension));
|
|
4781
|
+
self.vector_arena_dirty = false;
|
|
4782
|
+
}
|
|
4783
|
+
self.vector_arena.as_ref().unwrap()
|
|
4784
|
+
}
|
|
4785
|
+
|
|
4786
|
+
/// Mark the given record keys as deleted in every HNSW graph they live
|
|
4787
|
+
/// in. The graph itself is not modified — search filters tombstoned
|
|
4788
|
+
/// `origin_id`s. A subsequent `compact()` will rebuild any graph whose
|
|
4789
|
+
/// dead ratio exceeds `IndexConfig.tombstone_rebuild_pct`.
|
|
4790
|
+
fn ann_apply_tombstones(&mut self, deleted_keys: &[RecordKey]) {
|
|
4791
|
+
if deleted_keys.is_empty() {
|
|
4792
|
+
return;
|
|
4793
|
+
}
|
|
4794
|
+
for key in deleted_keys {
|
|
4795
|
+
// Global graphs (per vector_name): every graph that contains
|
|
4796
|
+
// this key gets the corresponding origin_id tombstoned.
|
|
4797
|
+
for (_, idx) in self.ann.global.iter_mut() {
|
|
4798
|
+
if let Some(&origin_id) = idx.key_to_origin.get(key) {
|
|
4799
|
+
idx.tombstones.insert(origin_id);
|
|
4800
|
+
}
|
|
4801
|
+
}
|
|
4802
|
+
// Per-namespace graphs: only the namespace this key belongs to
|
|
4803
|
+
// has a chance of containing it, but checking all of them is
|
|
4804
|
+
// fine — `key_to_origin.get` is O(1) and misses immediately.
|
|
4805
|
+
for (_, indexes) in self.ann.namespaces.iter_mut() {
|
|
4806
|
+
for (_, idx) in indexes.iter_mut() {
|
|
4807
|
+
if let Some(&origin_id) = idx.key_to_origin.get(key) {
|
|
4808
|
+
idx.tombstones.insert(origin_id);
|
|
4809
|
+
}
|
|
4810
|
+
}
|
|
4811
|
+
}
|
|
4812
|
+
}
|
|
4813
|
+
}
|
|
4814
|
+
|
|
4025
4815
|
fn rebuild_ann(&mut self) {
|
|
4026
4816
|
self.ann = AnnCatalog::default();
|
|
4027
4817
|
let mut global_by_vector: BTreeMap<String, Vec<(RecordKey, &Vec<f32>)>> = BTreeMap::new();
|
|
@@ -4109,6 +4899,24 @@ impl Database {
|
|
|
4109
4899
|
return false;
|
|
4110
4900
|
}
|
|
4111
4901
|
|
|
4902
|
+
// For ANN2 manifests, use the persisted keys verbatim — they
|
|
4903
|
+
// match the `origin_id`s baked into the HNSW graph file. For
|
|
4904
|
+
// ANN1 (no persisted keys), fall back to the recomputed
|
|
4905
|
+
// BTreeMap-ordered list, which matches the way ANN1 graphs were
|
|
4906
|
+
// always built.
|
|
4907
|
+
let keys = if manifest_entry.keys.is_empty() {
|
|
4908
|
+
expected_entry.keys.clone()
|
|
4909
|
+
} else {
|
|
4910
|
+
// Defensive: persisted keys length must agree with the
|
|
4911
|
+
// declared record_count and the live record set, else the
|
|
4912
|
+
// manifest is inconsistent and we'd rather rebuild than
|
|
4913
|
+
// serve wrong neighbours.
|
|
4914
|
+
if manifest_entry.keys.len() != manifest_entry.record_count {
|
|
4915
|
+
return false;
|
|
4916
|
+
}
|
|
4917
|
+
manifest_entry.keys.clone()
|
|
4918
|
+
};
|
|
4919
|
+
|
|
4112
4920
|
let Some(index) = load_ann_index(
|
|
4113
4921
|
parent,
|
|
4114
4922
|
&ann_basename(
|
|
@@ -4116,7 +4924,7 @@ impl Database {
|
|
|
4116
4924
|
expected_entry.namespace.as_deref(),
|
|
4117
4925
|
&expected_entry.vector_name,
|
|
4118
4926
|
),
|
|
4119
|
-
|
|
4927
|
+
keys,
|
|
4120
4928
|
self.metric,
|
|
4121
4929
|
) else {
|
|
4122
4930
|
return false;
|
|
@@ -4147,7 +4955,11 @@ impl Database {
|
|
|
4147
4955
|
return Ok(());
|
|
4148
4956
|
}
|
|
4149
4957
|
|
|
4150
|
-
|
|
4958
|
+
// Use `actual_ann_entries` (NOT `expected_ann_entries`) so the
|
|
4959
|
+
// persisted keys array matches the order the HNSW graph stored its
|
|
4960
|
+
// `origin_id`s in. After incremental inserts the in-memory keys vec
|
|
4961
|
+
// is in insertion order, which usually differs from BTreeMap order.
|
|
4962
|
+
let entries = self.actual_ann_entries();
|
|
4151
4963
|
for entry in &entries {
|
|
4152
4964
|
let basename = ann_basename(&self.path, entry.namespace.as_deref(), &entry.vector_name);
|
|
4153
4965
|
let graph_path = parent.join(format!("{basename}.hnsw.graph"));
|
|
@@ -4175,6 +4987,41 @@ impl Database {
|
|
|
4175
4987
|
write_ann_manifest(&ann_manifest_path(&self.path), &entries)
|
|
4176
4988
|
}
|
|
4177
4989
|
|
|
4990
|
+
/// Like `expected_ann_entries`, but populates each entry's `keys` field
|
|
4991
|
+
/// from the actual in-memory `AnnIndex.keys` array (insertion order).
|
|
4992
|
+
/// This is what gets serialised into the ANN2 manifest, and matches the
|
|
4993
|
+
/// `origin_id`s baked into the dumped HNSW graph files.
|
|
4994
|
+
fn actual_ann_entries(&self) -> Vec<AnnManifestEntry> {
|
|
4995
|
+
let mut entries = Vec::new();
|
|
4996
|
+
for (vector_name, index) in &self.ann.global {
|
|
4997
|
+
if index.keys.len() < ANN_MIN_POINTS {
|
|
4998
|
+
continue;
|
|
4999
|
+
}
|
|
5000
|
+
entries.push(AnnManifestEntry {
|
|
5001
|
+
namespace: None,
|
|
5002
|
+
vector_name: vector_name.clone(),
|
|
5003
|
+
record_count: index.keys.len(),
|
|
5004
|
+
key_signature: record_key_signature(&index.keys),
|
|
5005
|
+
keys: index.keys.clone(),
|
|
5006
|
+
});
|
|
5007
|
+
}
|
|
5008
|
+
for (namespace, indexes) in &self.ann.namespaces {
|
|
5009
|
+
for (vector_name, index) in indexes {
|
|
5010
|
+
if index.keys.len() < ANN_MIN_POINTS {
|
|
5011
|
+
continue;
|
|
5012
|
+
}
|
|
5013
|
+
entries.push(AnnManifestEntry {
|
|
5014
|
+
namespace: Some(namespace.clone()),
|
|
5015
|
+
vector_name: vector_name.clone(),
|
|
5016
|
+
record_count: index.keys.len(),
|
|
5017
|
+
key_signature: record_key_signature(&index.keys),
|
|
5018
|
+
keys: index.keys.clone(),
|
|
5019
|
+
});
|
|
5020
|
+
}
|
|
5021
|
+
}
|
|
5022
|
+
entries
|
|
5023
|
+
}
|
|
5024
|
+
|
|
4178
5025
|
fn expected_ann_entries(&self) -> Vec<AnnManifestEntry> {
|
|
4179
5026
|
let mut global: BTreeMap<String, Vec<RecordKey>> = BTreeMap::new();
|
|
4180
5027
|
let mut by_namespace: BTreeMap<String, BTreeMap<String, Vec<RecordKey>>> = BTreeMap::new();
|
|
@@ -4386,11 +5233,14 @@ impl Database {
|
|
|
4386
5233
|
.global
|
|
4387
5234
|
.get(vector_name.unwrap_or(DEFAULT_VECTOR_NAME)),
|
|
4388
5235
|
}?;
|
|
4389
|
-
|
|
5236
|
+
// Gate on live (non-tombstoned) record count: if half the graph is
|
|
5237
|
+
// dead, treat the live half as if it were the whole corpus.
|
|
5238
|
+
let live = index.live_count();
|
|
5239
|
+
if live < ANN_SEARCH_MIN_POINTS {
|
|
4390
5240
|
return None;
|
|
4391
5241
|
}
|
|
4392
5242
|
|
|
4393
|
-
let candidate_count = candidate_count(top_k,
|
|
5243
|
+
let candidate_count = candidate_count(top_k, live);
|
|
4394
5244
|
if candidate_count == 0 {
|
|
4395
5245
|
return None;
|
|
4396
5246
|
}
|
|
@@ -4399,15 +5249,29 @@ impl Database {
|
|
|
4399
5249
|
// explicitly sets `IndexConfig.ef_search`, honour it directly.
|
|
4400
5250
|
// Otherwise default to max(candidate_count, ef_construction) which is
|
|
4401
5251
|
// a conservative high-recall heuristic.
|
|
4402
|
-
let ef_search = match self.index_config.ef_search {
|
|
5252
|
+
let mut ef_search = match self.index_config.ef_search {
|
|
4403
5253
|
Some(ef) => ef.max(candidate_count),
|
|
4404
5254
|
None => candidate_count.max(self.index_config.ef_construction),
|
|
4405
5255
|
};
|
|
4406
|
-
|
|
5256
|
+
// Over-fetch to compensate for tombstoned candidates we'll drop. Cap
|
|
5257
|
+
// at the graph size (live + tombstoned nodes) so we don't waste work; we'd never get more
|
|
5258
|
+
// distinct results than that anyway.
|
|
5259
|
+
if !index.tombstones.is_empty() {
|
|
5260
|
+
let dead = index.tombstones.len();
|
|
5261
|
+
ef_search = ef_search
|
|
5262
|
+
.saturating_add(dead.min(ef_search))
|
|
5263
|
+
.min(index.keys.len());
|
|
5264
|
+
}
|
|
5265
|
+
let fetch_count = candidate_count
|
|
5266
|
+
.saturating_add(index.tombstones.len().min(candidate_count))
|
|
5267
|
+
.min(index.keys.len());
|
|
5268
|
+
let neighbours = index.hnsw.search(query, fetch_count, ef_search);
|
|
4407
5269
|
Some(
|
|
4408
5270
|
neighbours
|
|
4409
5271
|
.into_iter()
|
|
5272
|
+
.filter(|n| !index.tombstones.contains(&n.d_id))
|
|
4410
5273
|
.filter_map(|neighbour| index.keys.get(neighbour.d_id).cloned())
|
|
5274
|
+
.take(candidate_count)
|
|
4411
5275
|
.collect(),
|
|
4412
5276
|
)
|
|
4413
5277
|
}
|
|
@@ -4691,6 +5555,7 @@ fn build_ann_index(
|
|
|
4691
5555
|
$dist_val,
|
|
4692
5556
|
);
|
|
4693
5557
|
let mut keys = Vec::with_capacity(count);
|
|
5558
|
+
let mut key_to_origin = HashMap::with_capacity(count);
|
|
4694
5559
|
if use_parallel {
|
|
4695
5560
|
// hnsw_rs's `parallel_insert` takes `&[(&Vec<T>, usize)]`
|
|
4696
5561
|
// (the API is built around owned-Vec borrows) and uses Rayon
|
|
@@ -4699,12 +5564,14 @@ fn build_ann_index(
|
|
|
4699
5564
|
let mut batch: Vec<(&Vec<f32>, usize)> = Vec::with_capacity(count);
|
|
4700
5565
|
for (origin_id, (key, vector)) in records.into_iter().enumerate() {
|
|
4701
5566
|
batch.push((vector, origin_id));
|
|
5567
|
+
key_to_origin.insert(key.clone(), origin_id);
|
|
4702
5568
|
keys.push(key);
|
|
4703
5569
|
}
|
|
4704
5570
|
hnsw.parallel_insert(&batch);
|
|
4705
5571
|
} else {
|
|
4706
5572
|
for (origin_id, (key, vector)) in records.into_iter().enumerate() {
|
|
4707
5573
|
hnsw.insert((vector.as_slice(), origin_id));
|
|
5574
|
+
key_to_origin.insert(key.clone(), origin_id);
|
|
4708
5575
|
keys.push(key);
|
|
4709
5576
|
}
|
|
4710
5577
|
}
|
|
@@ -4712,6 +5579,8 @@ fn build_ann_index(
|
|
|
4712
5579
|
AnnIndex {
|
|
4713
5580
|
hnsw: AnnHnsw::$variant(hnsw),
|
|
4714
5581
|
keys,
|
|
5582
|
+
key_to_origin,
|
|
5583
|
+
tombstones: HashSet::new(),
|
|
4715
5584
|
}
|
|
4716
5585
|
}};
|
|
4717
5586
|
}
|
|
@@ -4906,9 +5775,21 @@ fn hex_encode(bytes: &[u8]) -> String {
|
|
|
4906
5775
|
out
|
|
4907
5776
|
}
|
|
4908
5777
|
|
|
5778
|
+
/// Order-independent FNV-1a hash over a set of record keys. We sort first so
|
|
5779
|
+
/// the signature only depends on the SET of keys, not the order they were
|
|
5780
|
+
/// inserted. Callers can use this to check whether a persisted ANN graph
|
|
5781
|
+
/// matches the live record set regardless of whether the live `keys` vec is
|
|
5782
|
+
/// BTreeMap-ordered (full rebuild) or insertion-ordered (incremental
|
|
5783
|
+
/// updates).
|
|
5784
|
+
///
|
|
5785
|
+
/// Historical note: previously the input was always BTreeMap-iterated and
|
|
5786
|
+
/// therefore already sorted, so the sort step is a no-op for old ANN1
|
|
5787
|
+
/// manifests — backwards compatible.
|
|
4909
5788
|
fn record_key_signature(keys: &[RecordKey]) -> u64 {
|
|
5789
|
+
let mut sorted: Vec<&RecordKey> = keys.iter().collect();
|
|
5790
|
+
sorted.sort();
|
|
4910
5791
|
let mut state = 0xcbf29ce484222325_u64;
|
|
4911
|
-
for (namespace, id) in
|
|
5792
|
+
for (namespace, id) in sorted {
|
|
4912
5793
|
for byte in namespace
|
|
4913
5794
|
.as_bytes()
|
|
4914
5795
|
.iter()
|
|
@@ -4935,9 +5816,16 @@ fn load_ann_index(
|
|
|
4935
5816
|
($dist_val:expr, $variant:ident) => {{
|
|
4936
5817
|
let mut hnsw = reloader.load_hnsw_with_dist($dist_val).ok()?;
|
|
4937
5818
|
hnsw.set_searching_mode(true);
|
|
5819
|
+
let key_to_origin = keys
|
|
5820
|
+
.iter()
|
|
5821
|
+
.enumerate()
|
|
5822
|
+
.map(|(i, k)| (k.clone(), i))
|
|
5823
|
+
.collect();
|
|
4938
5824
|
Some(AnnIndex {
|
|
4939
5825
|
hnsw: AnnHnsw::$variant(hnsw),
|
|
4940
5826
|
keys,
|
|
5827
|
+
key_to_origin,
|
|
5828
|
+
tombstones: HashSet::new(),
|
|
4941
5829
|
})
|
|
4942
5830
|
}};
|
|
4943
5831
|
}
|
|
@@ -4950,9 +5838,16 @@ fn load_ann_index(
|
|
|
4950
5838
|
}
|
|
4951
5839
|
}
|
|
4952
5840
|
|
|
5841
|
+
/// Write the ANN sidecar manifest. We use format `ANN2`, which (compared to
|
|
5842
|
+
/// the original `ANN1`) also serialises the actual key array per index in
|
|
5843
|
+
/// the order the HNSW knows its `origin_id`s. This is required for
|
|
5844
|
+
/// incremental insertion: without it, a reload would associate the wrong
|
|
5845
|
+
/// (BTreeMap-ordered) record key with each HNSW origin_id whenever the in
|
|
5846
|
+
/// memory key array isn't sorted (which happens any time we incrementally
|
|
5847
|
+
/// append).
|
|
4953
5848
|
fn write_ann_manifest(path: &Path, entries: &[AnnManifestEntry]) -> Result<()> {
|
|
4954
|
-
let mut file = File::create(path)
|
|
4955
|
-
file.write_all(b"
|
|
5849
|
+
let mut file = BufWriter::new(File::create(path)?);
|
|
5850
|
+
file.write_all(b"ANN2")?;
|
|
4956
5851
|
write_u32(&mut file, u32_from_usize(entries.len())?)?;
|
|
4957
5852
|
for entry in entries {
|
|
4958
5853
|
write_u8(&mut file, u8::from(entry.namespace.is_some()))?;
|
|
@@ -4962,8 +5857,15 @@ fn write_ann_manifest(path: &Path, entries: &[AnnManifestEntry]) -> Result<()> {
|
|
|
4962
5857
|
write_string(&mut file, &entry.vector_name)?;
|
|
4963
5858
|
write_u64(&mut file, u64_from_usize(entry.record_count)?)?;
|
|
4964
5859
|
write_u64(&mut file, entry.key_signature)?;
|
|
5860
|
+
// ANN2 addition: the full keys array in insertion order.
|
|
5861
|
+
write_u64(&mut file, u64_from_usize(entry.keys.len())?)?;
|
|
5862
|
+
for (ns, id) in &entry.keys {
|
|
5863
|
+
write_string(&mut file, ns)?;
|
|
5864
|
+
write_string(&mut file, id)?;
|
|
5865
|
+
}
|
|
4965
5866
|
}
|
|
4966
|
-
file.
|
|
5867
|
+
file.flush()?;
|
|
5868
|
+
file.get_ref().sync_all()?;
|
|
4967
5869
|
Ok(())
|
|
4968
5870
|
}
|
|
4969
5871
|
|
|
@@ -4971,11 +5873,15 @@ fn read_ann_manifest(path: &Path) -> Result<Vec<AnnManifestEntry>> {
|
|
|
4971
5873
|
let mut file = BufReader::new(File::open(path)?);
|
|
4972
5874
|
let mut magic = [0_u8; 4];
|
|
4973
5875
|
file.read_exact(&mut magic)?;
|
|
4974
|
-
|
|
4975
|
-
|
|
4976
|
-
|
|
4977
|
-
|
|
4978
|
-
|
|
5876
|
+
let version = match &magic {
|
|
5877
|
+
b"ANN1" => 1u8,
|
|
5878
|
+
b"ANN2" => 2u8,
|
|
5879
|
+
_ => {
|
|
5880
|
+
return Err(VectLiteError::InvalidFormat(
|
|
5881
|
+
"invalid ANN manifest".to_owned(),
|
|
5882
|
+
));
|
|
5883
|
+
}
|
|
5884
|
+
};
|
|
4979
5885
|
|
|
4980
5886
|
let count = usize_from_u32(read_u32(&mut file)?)?;
|
|
4981
5887
|
let mut entries = Vec::with_capacity(count);
|
|
@@ -4989,12 +5895,27 @@ fn read_ann_manifest(path: &Path) -> Result<Vec<AnnManifestEntry>> {
|
|
|
4989
5895
|
let vector_name = read_string(&mut file)?;
|
|
4990
5896
|
let record_count = usize_from_u64(read_u64(&mut file)?)?;
|
|
4991
5897
|
let key_signature = read_u64(&mut file)?;
|
|
5898
|
+
let keys = if version >= 2 {
|
|
5899
|
+
let n = usize_from_u64(read_u64(&mut file)?)?;
|
|
5900
|
+
let mut keys = Vec::with_capacity(n);
|
|
5901
|
+
for _ in 0..n {
|
|
5902
|
+
let ns = read_string(&mut file)?;
|
|
5903
|
+
let id = read_string(&mut file)?;
|
|
5904
|
+
keys.push((ns, id));
|
|
5905
|
+
}
|
|
5906
|
+
keys
|
|
5907
|
+
} else {
|
|
5908
|
+
// ANN1 had no persisted keys; caller falls back to recomputing
|
|
5909
|
+
// them from `self.records` (which yields BTreeMap-sorted keys,
|
|
5910
|
+
// matching the order ANN1 indexes were always built in).
|
|
5911
|
+
Vec::new()
|
|
5912
|
+
};
|
|
4992
5913
|
entries.push(AnnManifestEntry {
|
|
4993
5914
|
namespace,
|
|
4994
5915
|
vector_name,
|
|
4995
5916
|
record_count,
|
|
4996
5917
|
key_signature,
|
|
4997
|
-
keys
|
|
5918
|
+
keys,
|
|
4998
5919
|
});
|
|
4999
5920
|
}
|
|
5000
5921
|
Ok(entries)
|