vectlite 0.10.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -782,6 +782,14 @@ pub struct IndexConfig {
782
782
  /// least this many vectors. Defaults to `ANN_PARALLEL_INSERT_THRESHOLD`.
783
783
  /// Set very high to disable parallel insert.
784
784
  pub parallel_insert_threshold: usize,
785
+ /// Percentage (0..=100) of tombstoned nodes at which the HNSW graph is
786
+ /// rebuilt during `compact()`. A `delete` doesn't physically remove a
787
+ /// node from HNSW (that operation is not supported by the library); the
788
+ /// node is just marked dead and filtered out at search time. Once enough
789
+ /// nodes are dead, search recall and latency degrade, so we rebuild.
790
+ /// Default `30` (intended: rebuild when ≥30% of the graph is dead; `100`
791
+ /// to disable). NOTE: `compact()` currently rebuilds on *any* tombstone.
792
+ pub tombstone_rebuild_pct: u8,
785
793
  }
786
794
 
787
795
  impl Default for IndexConfig {
@@ -791,6 +799,7 @@ impl Default for IndexConfig {
791
799
  ef_construction: ANN_EF_CONSTRUCTION,
792
800
  ef_search: None,
793
801
  parallel_insert_threshold: ANN_PARALLEL_INSERT_THRESHOLD,
802
+ tombstone_rebuild_pct: 30,
794
803
  }
795
804
  }
796
805
  }
@@ -804,6 +813,7 @@ impl IndexConfig {
804
813
  ef_construction: 400,
805
814
  ef_search: Some(200),
806
815
  parallel_insert_threshold: ANN_PARALLEL_INSERT_THRESHOLD,
816
+ tombstone_rebuild_pct: 30,
807
817
  }
808
818
  }
809
819
 
@@ -814,6 +824,7 @@ impl IndexConfig {
814
824
  ef_construction: 100,
815
825
  ef_search: Some(40),
816
826
  parallel_insert_threshold: ANN_PARALLEL_INSERT_THRESHOLD,
827
+ tombstone_rebuild_pct: 30,
817
828
  }
818
829
  }
819
830
 
@@ -835,6 +846,59 @@ impl IndexConfig {
835
846
  ));
836
847
  }
837
848
  }
849
+ if self.tombstone_rebuild_pct > 100 {
850
+ return Err(VectLiteError::InvalidFormat(
851
+ "IndexConfig.tombstone_rebuild_pct must be in 0..=100".to_owned(),
852
+ ));
853
+ }
854
+ Ok(())
855
+ }
856
+ }
857
+
858
+ /// Controls when the WAL file is `fsync`'d to disk.
859
+ ///
860
+ /// Per-record durability is the default (`PerOp`) but on macOS APFS — and to
861
+ /// a lesser extent on Linux ext4 — `fsync` is the dominant cost of single
862
+ /// `insert` calls. Relaxing this knob can multiply ingestion throughput by
863
+ /// 5–10× at the cost of losing some recently-acknowledged records on an
864
+ /// unclean shutdown.
865
+ ///
866
+ /// The WAL is *always* fully synced on `flush()`, `compact()`, and `close()`.
867
+ /// So even with `OnFlush`, every write acknowledged before a clean
868
+ /// shutdown is durable. The window of vulnerability is limited to:
869
+ /// - `EveryN(n)`: at most the last `n - 1` inserts since the last fsync.
870
+ /// - `OnFlush`: every insert since the last `flush()` / `compact()`.
871
+ #[derive(Clone, Copy, Debug, PartialEq, Eq)]
872
+ pub enum WalSyncMode {
873
+ /// `fsync` after every WAL append. Strongest durability, slowest. This is
874
+ /// the default and matches pre-0.11 behaviour.
875
+ PerOp,
876
+ /// `fsync` once every `n` ops. On a crash, up to the last `n - 1` ops
877
+ /// since the last sync may be lost. A good middle ground when streaming
878
+ /// thousands of small records: pick `n` so the worst-case loss is
879
+ /// tolerable (e.g. `64` ≈ a fraction of a second of data).
880
+ EveryN(usize),
881
+ /// Never `fsync` from the per-op path. Sync only at `flush()` / `compact()`
882
+ /// / `close()`. Maximum throughput, weakest durability — appropriate for
883
+ /// bulk ingestion of data that can be regenerated.
884
+ OnFlush,
885
+ }
886
+
887
+ impl Default for WalSyncMode {
888
+ fn default() -> Self {
889
+ WalSyncMode::PerOp
890
+ }
891
+ }
892
+
893
+ impl WalSyncMode {
894
+ fn validate(self) -> Result<()> {
895
+ if let WalSyncMode::EveryN(n) = self {
896
+ if n == 0 {
897
+ return Err(VectLiteError::InvalidFormat(
898
+ "WalSyncMode::EveryN must be >= 1".to_owned(),
899
+ ));
900
+ }
901
+ }
838
902
  Ok(())
839
903
  }
840
904
  }
@@ -1299,6 +1363,29 @@ pub struct Database {
1299
1363
  /// Holds the lock file open for the lifetime of the database.
1300
1364
  /// Dropping this releases the advisory lock.
1301
1365
  _lock_file: Option<File>,
1366
+ /// Cached WAL writer: avoids paying the open() syscall on every insert.
1367
+ /// Reset whenever the WAL is rotated (compact, clear_wal).
1368
+ wal_writer: Option<BufWriter<File>>,
1369
+ /// Controls when `fsync` is issued against the WAL — see [`WalSyncMode`].
1370
+ wal_sync_mode: WalSyncMode,
1371
+ /// Number of ops appended to the WAL since the last fsync. Used by the
1372
+ /// `EveryN` sync mode to decide when to flush+sync.
1373
+ wal_ops_since_sync: usize,
1374
+ /// True if the in-memory ANN graph(s) have unsaved changes (incremental
1375
+ /// inserts, fresh build, or a full rebuild) that have not been written
1376
+ /// out via `persist_ann_to_disk`. Set on every mutation in
1377
+ /// `apply_wal_batch` / `bulk_ingest` and cleared by `compact_inner` or
1378
+ /// an explicit `persist_ann_to_disk`.
1379
+ ann_dirty: bool,
1380
+ /// True if the quantized PQ index needs to be rebuilt at the next flush
1381
+ /// (because records have been inserted/deleted since the last rebuild).
1382
+ /// While dirty, the in-memory `quantized` field is set to `None` so
1383
+ /// searches transparently fall back to the HNSW path instead of
1384
+ /// returning candidates from a stale codebook.
1385
+ quantized_dirty: bool,
1386
+ /// Same as `quantized_dirty`, but for multi-vector (ColBERT-style)
1387
+ /// quantization spaces. Lazy rebuild happens at flush time.
1388
+ multi_vector_quantized_dirty: bool,
1302
1389
  /// Optional quantized index for accelerated search.
1303
1390
  quantized: Option<QuantizedIndex>,
1304
1391
  /// Configuration used to build the quantized index (persisted).
@@ -1319,6 +1406,87 @@ pub struct Database {
1319
1406
  /// knob so callers can change recall/latency tradeoffs without migrating
1320
1407
  /// data files. A subsequent `set_index_config` triggers a rebuild.
1321
1408
  index_config: IndexConfig,
1409
+ /// Contiguous f32 mirror of the default dense vector for every record.
1410
+ /// Used by brute-force / rescoring scans for cache-friendly SIMD.
1411
+ /// `None` when the arena hasn't been materialised yet for this session.
1412
+ vector_arena: Option<VectorArena>,
1413
+ /// When true, `vector_arena` is stale (e.g. a delete happened) and must
1414
+ /// be rebuilt before use.
1415
+ vector_arena_dirty: bool,
1416
+ }
1417
+
1418
+ /// Contiguous-storage mirror of the default dense vector per record.
1419
+ ///
1420
+ /// In the original layout each `Record.vector` is a separately-allocated
1421
+ /// `Vec<f32>` and the records themselves live in `BTreeMap` nodes, so a
1422
+ /// brute-force or rescoring scan pays two pointer hops per record AND
1423
+ /// touches one cache line per vector — terrible for SIMD throughput.
1424
+ ///
1425
+ /// This arena stores every vector in a single flat `buf: Vec<f32>` so a scan
1426
+ /// is a straight contiguous walk (one cache miss per ~16 vectors, vs ~2 per
1427
+ /// vector). Lance / Arrow use the same trick — see the v0.11 CHANGELOG note.
1428
+ ///
1429
+ /// The arena is maintained incrementally on insert; deletes are too
1430
+ /// expensive to compact in place (would shift O(N) f32s) so they just mark
1431
+ /// the arena dirty and force a lazy full rebuild on next use.
1432
+ struct VectorArena {
1433
+ buf: Vec<f32>,
1434
+ keys: Vec<RecordKey>,
1435
+ key_to_index: HashMap<RecordKey, usize>,
1436
+ dim: usize,
1437
+ }
1438
+
1439
+ impl VectorArena {
1440
+ fn new(dim: usize) -> Self {
1441
+ Self {
1442
+ buf: Vec::new(),
1443
+ keys: Vec::new(),
1444
+ key_to_index: HashMap::new(),
1445
+ dim,
1446
+ }
1447
+ }
1448
+
1449
+ fn append(&mut self, key: RecordKey, vector: &[f32]) {
1450
+ // Defensive: ignore mismatched dims rather than panicking — this is
1451
+ // a perf cache, not the source of truth.
1452
+ if vector.len() != self.dim {
1453
+ return;
1454
+ }
1455
+ let idx = self.keys.len();
1456
+ self.buf.extend_from_slice(vector);
1457
+ self.key_to_index.insert(key.clone(), idx);
1458
+ self.keys.push(key);
1459
+ }
1460
+
1461
+ /// Rebuild from records in BTreeMap order. Called lazily when the arena
1462
+ /// is dirty (i.e. after a delete or a full ANN rebuild).
1463
+ fn rebuild_from(records: &BTreeMap<RecordKey, Record>, dim: usize) -> Self {
1464
+ let mut arena = Self::new(dim);
1465
+ arena.buf.reserve(records.len() * dim);
1466
+ arena.keys.reserve(records.len());
1467
+ arena.key_to_index.reserve(records.len());
1468
+ for (key, record) in records {
1469
+ if record.vector.len() == dim {
1470
+ arena.append(key.clone(), &record.vector);
1471
+ }
1472
+ }
1473
+ arena
1474
+ }
1475
+
1476
+ /// Iterator yielding `(key, vector_slice)` pairs. The slice references
1477
+ /// the contiguous `buf`, so consumers get cache-friendly SIMD scans.
1478
+ #[allow(dead_code)]
1479
+ fn iter(&self) -> impl Iterator<Item = (&RecordKey, &[f32])> {
1480
+ let dim = self.dim;
1481
+ self.keys.iter().enumerate().map(move |(i, k)| {
1482
+ let start = i * dim;
1483
+ (k, &self.buf[start..start + dim])
1484
+ })
1485
+ }
1486
+
1487
+ fn len(&self) -> usize {
1488
+ self.keys.len()
1489
+ }
1322
1490
  }
1323
1491
 
1324
1492
  #[derive(Default)]
@@ -1344,6 +1512,42 @@ impl AnnHnsw {
1344
1512
  }
1345
1513
  }
1346
1514
 
1515
+ /// Incrementally insert a single vector into an existing HNSW graph.
1516
+ /// `origin_id` must be unique within the graph and is used to map back
1517
+ /// to the caller's record key array.
1518
+ fn insert_one(&mut self, vector: &[f32], origin_id: usize) {
1519
+ match self {
1520
+ AnnHnsw::Cosine(h) => h.insert((vector, origin_id)),
1521
+ AnnHnsw::Euclidean(h) => h.insert((vector, origin_id)),
1522
+ AnnHnsw::DotProduct(h) => h.insert((vector, origin_id)),
1523
+ AnnHnsw::Manhattan(h) => h.insert((vector, origin_id)),
1524
+ }
1525
+ }
1526
+
1527
+ /// Bulk-insert a batch of vectors in parallel (Rayon-multithreaded).
1528
+ /// Significantly faster than repeated `insert_one` when the batch is
1529
+ /// large enough to amortise thread setup.
1530
+ fn parallel_insert_batch(&mut self, batch: &[(&Vec<f32>, usize)]) {
1531
+ match self {
1532
+ AnnHnsw::Cosine(h) => h.parallel_insert(batch),
1533
+ AnnHnsw::Euclidean(h) => h.parallel_insert(batch),
1534
+ AnnHnsw::DotProduct(h) => h.parallel_insert(batch),
1535
+ AnnHnsw::Manhattan(h) => h.parallel_insert(batch),
1536
+ }
1537
+ }
1538
+
1539
+ /// Toggle the `searching_mode` hint on the underlying HNSW. When `true`
1540
+ /// the graph is treated as read-only and lookups skip some bookkeeping;
1541
+ /// when `false` further inserts are allowed.
1542
+ fn set_searching_mode(&mut self, value: bool) {
1543
+ match self {
1544
+ AnnHnsw::Cosine(h) => h.set_searching_mode(value),
1545
+ AnnHnsw::Euclidean(h) => h.set_searching_mode(value),
1546
+ AnnHnsw::DotProduct(h) => h.set_searching_mode(value),
1547
+ AnnHnsw::Manhattan(h) => h.set_searching_mode(value),
1548
+ }
1549
+ }
1550
+
1347
1551
  fn file_dump(&self, directory: &Path, basename: &str) -> Result<()> {
1348
1552
  let result = match self {
1349
1553
  AnnHnsw::Cosine(h) => h.file_dump(directory, basename),
@@ -1359,7 +1563,38 @@ impl AnnHnsw {
1359
1563
 
1360
1564
  struct AnnIndex {
1361
1565
  hnsw: AnnHnsw,
1566
+ /// `keys[i]` is the record key for HNSW origin_id `i`. Always grows; we
1567
+ /// never shrink it (HNSW doesn't support compacted deletion). Tombstoned
1568
+ /// slots stay in the vec to keep origin_id ↔ key mapping stable.
1362
1569
  keys: Vec<RecordKey>,
1570
+ /// Reverse index: `key → origin_id`. Lets `delete` find a record's HNSW
1571
+ /// node in O(1). Built alongside `keys` on every (re)build.
1572
+ key_to_origin: HashMap<RecordKey, usize>,
1573
+ /// Origin_ids that have been logically deleted but are still part of the
1574
+ /// HNSW graph. Search filters these out by lookup; `compact()` currently
1575
+ /// rebuilds whenever this set is non-empty (`tombstone_rebuild_pct` is not applied yet).
1576
+ tombstones: HashSet<usize>,
1577
+ }
1578
+
1579
+ impl AnnIndex {
1580
+ /// Number of live (non-tombstoned) records in the graph.
1581
+ fn live_count(&self) -> usize {
1582
+ self.keys.len().saturating_sub(self.tombstones.len())
1583
+ }
1584
+
1585
+ /// True when the fraction of dead nodes is at or above the configured
1586
+ /// rebuild threshold (`IndexConfig.tombstone_rebuild_pct`). Currently
1587
+ /// `compact_inner` rebuilds on *any* tombstones because the persisted
1588
+ /// manifest format only tracks live record counts — when we add a
1589
+ /// tombstone-aware manifest (planned), this becomes the trigger.
1590
+ #[allow(dead_code)]
1591
+ fn should_rebuild(&self, threshold_pct: u8) -> bool {
1592
+ if self.keys.is_empty() {
1593
+ return false;
1594
+ }
1595
+ let pct = (self.tombstones.len() * 100) / self.keys.len();
1596
+ pct >= threshold_pct as usize
1597
+ }
1363
1598
  }
1364
1599
 
1365
1600
  struct AnnManifestEntry {
@@ -1408,6 +1643,12 @@ impl Database {
1408
1643
  ann_loaded_from_disk: false,
1409
1644
  read_only: false,
1410
1645
  _lock_file: Some(lock),
1646
+ wal_writer: None,
1647
+ wal_sync_mode: WalSyncMode::default(),
1648
+ wal_ops_since_sync: 0,
1649
+ ann_dirty: false,
1650
+ quantized_dirty: false,
1651
+ multi_vector_quantized_dirty: false,
1411
1652
  quantized: None,
1412
1653
  quantization_config: None,
1413
1654
  quantized_keys: Vec::new(),
@@ -1417,6 +1658,8 @@ impl Database {
1417
1658
  payload_index_defs: BTreeMap::new(),
1418
1659
  payload_indexes: BTreeMap::new(),
1419
1660
  index_config: IndexConfig::default(),
1661
+ vector_arena: None,
1662
+ vector_arena_dirty: false,
1420
1663
  };
1421
1664
 
1422
1665
  database.flush()?;
@@ -1522,6 +1765,8 @@ impl Database {
1522
1765
  if !self.read_only {
1523
1766
  self.compact_inner()?;
1524
1767
  }
1768
+ // Drop the cached WAL writer (also closes the underlying file handle).
1769
+ self.wal_writer = None;
1525
1770
  // Release the lock by dropping the file handle
1526
1771
  self._lock_file = None;
1527
1772
  // Clear in-memory state
@@ -1531,6 +1776,8 @@ impl Database {
1531
1776
  self.quantized = None;
1532
1777
  self.quantization_config = None;
1533
1778
  self.quantized_keys.clear();
1779
+ self.vector_arena = None;
1780
+ self.vector_arena_dirty = false;
1534
1781
  self.dimension = 0;
1535
1782
  Ok(())
1536
1783
  }
@@ -2128,8 +2375,12 @@ impl Database {
2128
2375
  self.rebuild_ann();
2129
2376
  self.ann_loaded_from_disk = false;
2130
2377
  self.persist_ann_to_disk()?;
2378
+ self.ann_dirty = false;
2379
+ self.vector_arena_dirty = true;
2131
2380
  self.rebuild_quantized_index();
2381
+ self.quantized_dirty = false;
2132
2382
  self.rebuild_all_multi_vector_quantized_indexes();
2383
+ self.multi_vector_quantized_dirty = false;
2133
2384
  Ok(count)
2134
2385
  }
2135
2386
 
@@ -2156,8 +2407,12 @@ impl Database {
2156
2407
  self.rebuild_ann();
2157
2408
  self.ann_loaded_from_disk = false;
2158
2409
  self.persist_ann_to_disk()?;
2410
+ self.ann_dirty = false;
2411
+ self.vector_arena_dirty = true;
2159
2412
  self.rebuild_quantized_index();
2413
+ self.quantized_dirty = false;
2160
2414
  self.rebuild_all_multi_vector_quantized_indexes();
2415
+ self.multi_vector_quantized_dirty = false;
2161
2416
  Ok(count)
2162
2417
  }
2163
2418
 
@@ -2361,8 +2616,12 @@ impl Database {
2361
2616
  self.rebuild_ann();
2362
2617
  self.ann_loaded_from_disk = false;
2363
2618
  self.persist_ann_to_disk()?;
2619
+ self.ann_dirty = false;
2620
+ self.vector_arena_dirty = true;
2364
2621
  self.rebuild_quantized_index();
2622
+ self.quantized_dirty = false;
2365
2623
  self.rebuild_all_multi_vector_quantized_indexes();
2624
+ self.multi_vector_quantized_dirty = false;
2366
2625
  Ok(())
2367
2626
  }
2368
2627
 
@@ -2573,6 +2832,80 @@ impl Database {
2573
2832
  self.compact_inner()
2574
2833
  }
2575
2834
 
2835
+ /// Configure WAL durability. See [`WalSyncMode`] for the safety / speed
2836
+ /// tradeoffs.
2837
+ ///
2838
+ /// Switching to a more relaxed mode while there are unsync'd bytes in
2839
+ /// the WAL is safe — the bytes simply stay in the BufWriter / OS cache
2840
+ /// until the next sync point (`flush()`, `compact()`, `close()`, or the
2841
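+ /// counter reaching `EveryN(n)`). Switching to a *stricter* mode (`OnFlush` → `PerOp`/`EveryN`, or `EveryN` → `PerOp`) forces
2842
+ /// an immediate sync of any pending ops so there is no surprise loss window; lowering `n` within `EveryN` does not.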
2843
+ pub fn set_wal_sync_mode(&mut self, mode: WalSyncMode) -> Result<()> {
2844
+ self.check_writable()?;
2845
+ mode.validate()?;
2846
+ let previous = self.wal_sync_mode;
2847
+ self.wal_sync_mode = mode;
2848
+ // If we just tightened durability (e.g. moved from OnFlush back to
2849
+ // PerOp) and there are pending ops, sync immediately so the user's
2850
+ // mental model — "after this call any acknowledged write is durable"
2851
+ // — holds.
2852
+ let became_stricter = matches!(
2853
+ (previous, mode),
2854
+ (
2855
+ WalSyncMode::OnFlush,
2856
+ WalSyncMode::PerOp | WalSyncMode::EveryN(_)
2857
+ ) | (WalSyncMode::EveryN(_), WalSyncMode::PerOp)
2858
+ );
2859
+ if became_stricter && self.wal_ops_since_sync > 0 {
2860
+ self.sync_wal()?;
2861
+ self.wal_ops_since_sync = 0;
2862
+ }
2863
+ Ok(())
2864
+ }
2865
+
2866
+ /// Return the current WAL sync mode.
2867
+ pub fn wal_sync_mode(&self) -> WalSyncMode {
2868
+ self.wal_sync_mode
2869
+ }
2870
+
2871
+ /// Materialise the contiguous-vector arena up front.
2872
+ ///
2873
+ /// The arena mirrors the default dense vector of every record in a
2874
+ /// single flat `Vec<f32>` — much more cache- and SIMD-friendly than the
2875
+ /// default `BTreeMap<Record>` layout. It's normally built lazily on
2876
+ /// first use, but if you know a heavy brute-force or rescoring scan is
2877
+ /// coming you can pay the build cost up front by calling this. Cheap
2878
+ /// when already fresh.
2879
+ pub fn prepare_for_scan(&mut self) {
2880
+ let _ = self.ensure_vector_arena();
2881
+ }
2882
+
2883
+ /// Number of vectors in the contiguous arena, or `None` if the arena
2884
+ /// hasn't been materialised yet for this session. Useful for tests and
2885
+ /// observability.
2886
+ pub fn vector_arena_len(&self) -> Option<usize> {
2887
+ self.vector_arena.as_ref().map(VectorArena::len)
2888
+ }
2889
+
2890
+ /// Return (live_count, tombstoned_count) summed across every HNSW graph
2891
+ /// (global + per-namespace). Useful for monitoring when a `compact()`
2892
+ /// would benefit from rebuilding the graph(s).
2893
+ pub fn tombstone_stats(&self) -> (usize, usize) {
2894
+ let mut live = 0usize;
2895
+ let mut dead = 0usize;
2896
+ for idx in self.ann.global.values() {
2897
+ live += idx.live_count();
2898
+ dead += idx.tombstones.len();
2899
+ }
2900
+ for indexes in self.ann.namespaces.values() {
2901
+ for idx in indexes.values() {
2902
+ live += idx.live_count();
2903
+ dead += idx.tombstones.len();
2904
+ }
2905
+ }
2906
+ (live, dead)
2907
+ }
2908
+
2576
2909
  /// Bulk-ingest many records efficiently. WAL writes happen in batches of
2577
2910
  /// `batch_size`, but the ANN index and sparse index are only rebuilt once
2578
2911
  /// at the very end, making this much faster than `upsert_many` for large
@@ -2644,9 +2977,16 @@ impl Database {
2644
2977
  self.rebuild_sparse_index();
2645
2978
  self.rebuild_ann();
2646
2979
  self.ann_loaded_from_disk = false;
2980
+ // Persist the freshly-built ANN so a subsequent reopen can skip
2981
+ // the rebuild — bulk_ingest is a "batch" operation and callers
2982
+ // expect index state to be on disk afterwards.
2647
2983
  self.persist_ann_to_disk()?;
2984
+ self.ann_dirty = false;
2985
+ self.vector_arena_dirty = true;
2648
2986
  self.rebuild_quantized_index();
2987
+ self.quantized_dirty = false;
2649
2988
  self.rebuild_all_multi_vector_quantized_indexes();
2989
+ self.multi_vector_quantized_dirty = false;
2650
2990
  }
2651
2991
 
2652
2992
  Ok(total)
@@ -2665,6 +3005,7 @@ impl Database {
2665
3005
  self.rebuild_ann();
2666
3006
  self.ann_loaded_from_disk = false;
2667
3007
  self.persist_ann_to_disk()?;
3008
+ self.ann_dirty = false;
2668
3009
  }
2669
3010
  Ok(())
2670
3011
  }
@@ -2710,6 +3051,7 @@ impl Database {
2710
3051
  validate_quantization_config(&config, self.dimension)?;
2711
3052
  self.quantization_config = Some(config);
2712
3053
  self.rebuild_quantized_index();
3054
+ self.quantized_dirty = false;
2713
3055
  self.persist_quantization_params()?;
2714
3056
  Ok(())
2715
3057
  }
@@ -2720,6 +3062,7 @@ impl Database {
2720
3062
  self.quantized = None;
2721
3063
  self.quantization_config = None;
2722
3064
  self.quantized_keys.clear();
3065
+ self.quantized_dirty = false;
2723
3066
  // Remove the sidecar file
2724
3067
  let params_path = quantization_params_path(&self.path);
2725
3068
  if params_path.exists() {
@@ -3401,6 +3744,54 @@ impl Database {
3401
3744
  self.records.remove(key);
3402
3745
  }
3403
3746
 
3747
+ // If any HNSW graph has tombstones, rebuild it before persisting.
3748
+ //
3749
+ // Two reasons:
3750
+ // 1. Crossing `tombstone_rebuild_pct` means search recall has
3751
+ // degraded enough that the user wants a clean graph.
3752
+ // 2. Even below the threshold, the persisted manifest's
3753
+ // `record_count` is derived from `self.records` (live only),
3754
+ // but the in-memory `keys` array includes dead slots — so a
3755
+ // persisted-with-tombstones graph would always fail the
3756
+ // record_count check on reopen and rebuild anyway. Rebuilding
3757
+ // *now* dumps a clean graph that survives reload.
3758
+ let threshold = self.index_config.tombstone_rebuild_pct;
3759
+ let any_tombstones = self
3760
+ .ann
3761
+ .global
3762
+ .values()
3763
+ .any(|idx| !idx.tombstones.is_empty())
3764
+ || self
3765
+ .ann
3766
+ .namespaces
3767
+ .values()
3768
+ .flat_map(|m| m.values())
3769
+ .any(|idx| !idx.tombstones.is_empty());
3770
+ // (We track `threshold` even though we currently rebuild on any
3771
+ // tombstones, so `should_rebuild` could later replace this when we
3772
+ // add tombstone persistence in the manifest.)
3773
+ let _ = threshold;
3774
+ if any_tombstones {
3775
+ self.rebuild_ann();
3776
+ }
3777
+
3778
+ // Rebuild any lazy indexes that were marked dirty during the session
3779
+ // before we persist. This is the point where we pay back the work
3780
+ // we deferred from the per-insert hot path:
3781
+ // - the HNSW graph is already up-to-date (incremental inserts),
3782
+ // we just need to dump it.
3783
+ // - the quantized PQ index was dropped on first insert and is
3784
+ // rebuilt now so search can use it again next session.
3785
+ // - same for multi-vector PQ.
3786
+ if self.quantized_dirty {
3787
+ self.rebuild_quantized_index();
3788
+ self.quantized_dirty = false;
3789
+ }
3790
+ if self.multi_vector_quantized_dirty {
3791
+ self.rebuild_all_multi_vector_quantized_indexes();
3792
+ self.multi_vector_quantized_dirty = false;
3793
+ }
3794
+
3404
3795
  if let Some(parent) = self.path.parent() {
3405
3796
  if !parent.as_os_str().is_empty() {
3406
3797
  fs::create_dir_all(parent)?;
@@ -3423,6 +3814,7 @@ impl Database {
3423
3814
  self.clear_wal()?;
3424
3815
  self.wal_entries_replayed = 0;
3425
3816
  self.persist_ann_to_disk()?;
3817
+ self.ann_dirty = false;
3426
3818
 
3427
3819
  Ok(())
3428
3820
  }
@@ -3563,6 +3955,65 @@ impl Database {
3563
3955
  .iter()
3564
3956
  .all(|op| matches!(op, WalOp::UpdateMetadata { .. } | WalOp::SetTtl { .. }));
3565
3957
 
3958
+ // Categorise each op so we can route to the fastest correct path:
3959
+ // incremental insert (Upsert with new key) → ann_apply_incremental
3960
+ // tombstone delete (Delete of present key) → ann_apply_tombstones
3961
+ // anything else (upsert of existing key, etc) → full rebuild
3962
+ let mut incremental_eligible = !metadata_only;
3963
+ let mut tombstone_only = !metadata_only;
3964
+ for op in &ops {
3965
+ match op {
3966
+ WalOp::Upsert(record) => {
3967
+ let exists = self
3968
+ .records
3969
+ .contains_key(&(record.namespace.clone(), record.id.clone()));
3970
+ if exists {
3971
+ incremental_eligible = false;
3972
+ tombstone_only = false;
3973
+ } else {
3974
+ // New upsert — fine for incremental, but not tombstone-only.
3975
+ tombstone_only = false;
3976
+ }
3977
+ }
3978
+ WalOp::Delete { namespace, id } => {
3979
+ let exists = self.records.contains_key(&(namespace.clone(), id.clone()));
3980
+ if exists {
3981
+ // OK for tombstone path, but not for incremental.
3982
+ incremental_eligible = false;
3983
+ }
3984
+ // (A delete of a non-existent key is a no-op for both
3985
+ // paths, but we still let it through.)
3986
+ }
3987
+ WalOp::UpdateMetadata { .. } | WalOp::SetTtl { .. } => {
3988
+ incremental_eligible = false;
3989
+ tombstone_only = false;
3990
+ }
3991
+ }
3992
+ }
3993
+
3994
+ // Collect the keys we'll need to feed to the relevant updater
3995
+ // before we move `ops` into `apply_ops_in_memory`.
3996
+ let new_keys: Vec<RecordKey> = if incremental_eligible {
3997
+ ops.iter()
3998
+ .filter_map(|op| match op {
3999
+ WalOp::Upsert(record) => Some((record.namespace.clone(), record.id.clone())),
4000
+ _ => None,
4001
+ })
4002
+ .collect()
4003
+ } else {
4004
+ Vec::new()
4005
+ };
4006
+ let deleted_keys: Vec<RecordKey> = if tombstone_only {
4007
+ ops.iter()
4008
+ .filter_map(|op| match op {
4009
+ WalOp::Delete { namespace, id } => Some((namespace.clone(), id.clone())),
4010
+ _ => None,
4011
+ })
4012
+ .collect()
4013
+ } else {
4014
+ Vec::new()
4015
+ };
4016
+
3566
4017
  self.append_wal_batch(&ops)?;
3567
4018
  self.apply_ops_in_memory(ops);
3568
4019
 
@@ -3571,11 +4022,55 @@ impl Database {
3571
4022
  if has_sparse {
3572
4023
  self.rebuild_sparse_index();
3573
4024
  }
3574
- self.rebuild_ann();
4025
+ if incremental_eligible {
4026
+ // Fast path: just append the new vectors into the existing
4027
+ // HNSW graph(s) instead of rebuilding from scratch. Converts
4028
+ // single-record ingestion from O(N log N) per insert to
4029
+ // amortised O(log N).
4030
+ self.ann_apply_incremental(&new_keys);
4031
+ // Keep the contiguous arena in sync. If it hasn't been
4032
+ // materialised yet, leave it alone — it'll be lazily built
4033
+ // on first read.
4034
+ if self.vector_arena.is_some() && !self.vector_arena_dirty {
4035
+ self.arena_apply_incremental(&new_keys);
4036
+ }
4037
+ } else if tombstone_only {
4038
+ // Delete-only fast path: tombstone the corresponding
4039
+ // `origin_id`s in each affected HNSW graph. No rebuild;
4040
+ // search filters out tombstoned candidates. The graph is
4041
+ // rebuilt automatically at the next `compact()` once the
4042
+ // tombstone ratio crosses `tombstone_rebuild_pct`.
4043
+ self.ann_apply_tombstones(&deleted_keys);
4044
+ // The arena can't compact in place without shifting O(N)
4045
+ // floats; mark dirty so it's lazily rebuilt on next scan.
4046
+ self.vector_arena_dirty = true;
4047
+ } else {
4048
+ // Slow path: a mixed-mode batch or an update-of-existing.
4049
+ // Rebuild the whole catalog.
4050
+ self.rebuild_ann();
4051
+ self.vector_arena_dirty = true;
4052
+ }
4053
+ // Defer persistence of the HNSW graph to disk: writing the graph
4054
+ // files is expensive (full re-dump + fsync) and is only required
4055
+ // for crash recovery on reopen. The WAL gives us that durability
4056
+ // already — on reopen, if the persisted graph is stale, it's
4057
+ // detected via the manifest signature check and rebuilt from
4058
+ // records in memory. Persistence happens at `flush` / `compact`.
3575
4059
  self.ann_loaded_from_disk = false;
3576
- self.persist_ann_to_disk()?;
3577
- self.rebuild_quantized_index();
3578
- self.rebuild_all_multi_vector_quantized_indexes();
4060
+ self.ann_dirty = true;
4061
+ // Lazy-rebuild quantized indexes too. Drop the in-memory
4062
+ // structures so callers get correct (HNSW-fallback) results
4063
+ // until the next flush, where we rebuild from the new corpus.
4064
+ if self.quantization_config.is_some() {
4065
+ self.quantized = None;
4066
+ self.quantized_keys.clear();
4067
+ self.quantized_dirty = true;
4068
+ }
4069
+ if !self.multi_vector_quantization_config.is_empty() {
4070
+ self.multi_vector_quantized.clear();
4071
+ self.multi_vector_quantized_keys.clear();
4072
+ self.multi_vector_quantized_dirty = true;
4073
+ }
3579
4074
  }
3580
4075
  Ok(())
3581
4076
  }
@@ -3659,58 +4154,109 @@ impl Database {
3659
4154
  }
3660
4155
  }
3661
4156
 
3662
- fn append_wal_batch(&self, ops: &[WalOp]) -> Result<()> {
3663
- self.append_wal_batch_inner(ops, true)
4157
+ fn append_wal_batch(&mut self, ops: &[WalOp]) -> Result<()> {
4158
+ // Decide whether this batch should trigger an fsync. We use the
4159
+ // ops count in the batch (not 1) so `EveryN` semantics scale across
4160
+ // both single inserts and `insert_many` calls.
4161
+ let n_ops = ops.len();
4162
+ let should_sync = match self.wal_sync_mode {
4163
+ WalSyncMode::PerOp => true,
4164
+ WalSyncMode::EveryN(n) => {
4165
+ self.wal_ops_since_sync = self.wal_ops_since_sync.saturating_add(n_ops);
4166
+ if self.wal_ops_since_sync >= n {
4167
+ self.wal_ops_since_sync = 0;
4168
+ true
4169
+ } else {
4170
+ false
4171
+ }
4172
+ }
4173
+ WalSyncMode::OnFlush => {
4174
+ self.wal_ops_since_sync = self.wal_ops_since_sync.saturating_add(n_ops);
4175
+ false
4176
+ }
4177
+ };
4178
+ self.append_wal_batch_inner(ops, should_sync)
3664
4179
  }
3665
4180
 
3666
4181
  /// Append a WAL batch without issuing an fsync. The caller is responsible
3667
4182
  /// for issuing `sync_wal` later (typically once at the end of a bulk
3668
4183
  /// ingest). This is the hot path for `bulk_ingest`.
3669
- fn append_wal_batch_unsynced(&self, ops: &[WalOp]) -> Result<()> {
4184
+ fn append_wal_batch_unsynced(&mut self, ops: &[WalOp]) -> Result<()> {
4185
+ // Track pending ops so future `sync_wal` / `compact_inner` calls
4186
+ // know to flush them.
4187
+ self.wal_ops_since_sync = self.wal_ops_since_sync.saturating_add(ops.len());
3670
4188
  self.append_wal_batch_inner(ops, false)
3671
4189
  }
3672
4190
 
3673
- fn append_wal_batch_inner(&self, ops: &[WalOp], sync: bool) -> Result<()> {
4191
+ /// Append a WAL batch. Reuses a cached `BufWriter<File>` across calls so
4192
+ /// the WAL file is opened once per WAL generation (reopened lazily after `clear_wal`) — saving the
4193
+ /// `open()` syscall on every single `insert` call, which matters when
4194
+ /// per-record overhead is the bottleneck.
4195
+ fn append_wal_batch_inner(&mut self, ops: &[WalOp], sync: bool) -> Result<()> {
3674
4196
  if let Some(parent) = self.wal_path.parent() {
3675
4197
  if !parent.as_os_str().is_empty() {
3676
4198
  fs::create_dir_all(parent)?;
3677
4199
  }
3678
4200
  }
3679
4201
 
3680
- let new_file = !self.wal_path.exists();
3681
- let mut file = OpenOptions::new()
3682
- .create(true)
3683
- .append(true)
3684
- .open(&self.wal_path)?;
3685
-
3686
- if new_file {
3687
- file.write_all(WAL_MAGIC)?;
4202
+ // Lazily create the cached BufWriter, writing the WAL_MAGIC header
4203
+ // on first use of a brand-new file.
4204
+ if self.wal_writer.is_none() {
4205
+ let new_file = !self.wal_path.exists();
4206
+ let file = OpenOptions::new()
4207
+ .create(true)
4208
+ .append(true)
4209
+ .open(&self.wal_path)?;
4210
+ let mut writer = BufWriter::with_capacity(64 * 1024, file);
4211
+ if new_file {
4212
+ writer.write_all(WAL_MAGIC)?;
4213
+ }
4214
+ self.wal_writer = Some(writer);
3688
4215
  }
3689
4216
 
4217
+ // Serialise the batch into a temporary buffer first, so that the
4218
+ // single `write_all` we issue to the cached writer is one contiguous
4219
+ // user-space copy (BufWriter then bunches everything up further).
3690
4220
  let mut buffer = Vec::new();
3691
4221
  write_u32(&mut buffer, u32_from_usize(ops.len())?)?;
3692
4222
  for op in ops {
3693
4223
  write_wal_op(&mut buffer, op)?;
3694
4224
  }
3695
4225
 
3696
- write_u32(&mut file, u32_from_usize(buffer.len())?)?;
3697
- file.write_all(&buffer)?;
4226
+ let writer = self.wal_writer.as_mut().unwrap();
4227
+ write_u32(writer, u32_from_usize(buffer.len())?)?;
4228
+ writer.write_all(&buffer)?;
4229
+
3698
4230
  if sync {
3699
- file.sync_all()?;
4231
+ // Flush BufWriter into the OS, then ask the kernel to make the
4232
+ // bytes durable. We must `flush()` before `sync_all()` — sync_all
4233
+ // only operates on what's already in the kernel's page cache.
4234
+ writer.flush()?;
4235
+ writer.get_ref().sync_all()?;
3700
4236
  }
3701
4237
  Ok(())
3702
4238
  }
3703
4239
 
3704
- /// Force a durability fence on the WAL file. Opens the file in append
3705
- /// mode and calls `sync_all`, which makes all previous unsynced writes
3706
- /// durable in one shot. This is used by `bulk_ingest` to amortise fsync
3707
- /// cost across many batches.
3708
- fn sync_wal(&self) -> Result<()> {
4240
+ /// Force a durability fence on the WAL file. Flushes any buffered bytes
4241
+ /// from the cached writer and asks the kernel to make them durable in a
4242
+ /// single `sync_all`. Used by `bulk_ingest`, `flush`, `close`, and as a
4243
+ /// manual fence when running in `EveryN` or `OnFlush` mode.
4244
+ fn sync_wal(&mut self) -> Result<()> {
4245
+ if let Some(writer) = self.wal_writer.as_mut() {
4246
+ writer.flush()?;
4247
+ writer.get_ref().sync_all()?;
4248
+ self.wal_ops_since_sync = 0;
4249
+ return Ok(());
4250
+ }
4251
+ // Fallback: no cached writer (e.g. WAL was opened externally). Open
4252
+ // the file briefly just to issue the sync.
3709
4253
  if !self.wal_path.exists() {
4254
+ self.wal_ops_since_sync = 0;
3710
4255
  return Ok(());
3711
4256
  }
3712
4257
  let file = OpenOptions::new().append(true).open(&self.wal_path)?;
3713
4258
  file.sync_all()?;
4259
+ self.wal_ops_since_sync = 0;
3714
4260
  Ok(())
3715
4261
  }
3716
4262
 
@@ -3764,7 +4310,12 @@ impl Database {
3764
4310
  Ok(())
3765
4311
  }
3766
4312
 
3767
- fn clear_wal(&self) -> Result<()> {
4313
+ fn clear_wal(&mut self) -> Result<()> {
4314
+ // Drop the cached writer first: on POSIX the file would survive the
4315
+ // unlink because we still hold an open handle, but we'd then keep
4316
+ // appending into the now-detached inode and never see those bytes on
4317
+ // disk after reopen.
4318
+ self.wal_writer = None;
3768
4319
  if self.wal_path.exists() {
3769
4320
  fs::remove_file(&self.wal_path)?;
3770
4321
  }
@@ -3876,6 +4427,12 @@ impl Database {
3876
4427
  ann_loaded_from_disk: false,
3877
4428
  read_only: false,
3878
4429
  _lock_file: None,
4430
+ wal_writer: None,
4431
+ wal_sync_mode: WalSyncMode::default(),
4432
+ wal_ops_since_sync: 0,
4433
+ ann_dirty: false,
4434
+ quantized_dirty: false,
4435
+ multi_vector_quantized_dirty: false,
3879
4436
  quantized: None,
3880
4437
  quantization_config: None,
3881
4438
  quantized_keys: Vec::new(),
@@ -3885,6 +4442,8 @@ impl Database {
3885
4442
  payload_index_defs: BTreeMap::new(),
3886
4443
  payload_indexes: BTreeMap::new(),
3887
4444
  index_config: IndexConfig::default(),
4445
+ vector_arena: None,
4446
+ vector_arena_dirty: false,
3888
4447
  })
3889
4448
  }
3890
4449
 
@@ -4022,6 +4581,237 @@ impl Database {
4022
4581
  Ok(())
4023
4582
  }
4024
4583
 
4584
+ /// Incremental ANN update. Appends the given new records into the
4585
+ /// existing HNSW graph(s) without rebuilding them from scratch.
4586
+ ///
4587
+ /// Preconditions:
4588
+ /// - `new_keys` are keys that already live in `self.records` (caller
4589
+ /// must have applied the WAL ops to memory first).
4590
+ /// - Each key referenced by `new_keys` did NOT previously exist in
4591
+ /// `self.records` (i.e. it's a true insert, not an update).
4592
+ ///
4593
+ /// Behaviour per (namespace, vector_name) "slot":
4594
+ /// - If a graph already exists, the new vectors are appended to it
4595
+ /// via single-element `insert_one` calls (or one `parallel_insert_batch`
4596
+ /// call if the batch reaches `parallel_insert_threshold`, i.e. is large enough to amortise thread overhead).
4597
+ /// - If no graph exists but the total record count for that slot has
4598
+ /// now crossed `ANN_MIN_POINTS`, a fresh graph is built from all
4599
+ /// matching records.
4600
+ /// - Below `ANN_MIN_POINTS`, we skip — searches will brute-force
4601
+ /// without harm.
4602
+ fn ann_apply_incremental(&mut self, new_keys: &[RecordKey]) {
4603
+ if new_keys.is_empty() {
4604
+ return;
4605
+ }
4606
+ let cfg = self.index_config;
4607
+
4608
+ // Group the new records by (Option<namespace>, vector_name). Each
4609
+ // upserted record contributes to exactly one global slot and one
4610
+ // namespace-scoped slot per dense vector it owns.
4611
+ let mut groups: BTreeMap<(Option<String>, String), Vec<(RecordKey, Vec<f32>)>> =
4612
+ BTreeMap::new();
4613
+ for key in new_keys {
4614
+ let Some(record) = self.records.get(key) else {
4615
+ continue;
4616
+ };
4617
+ for (vector_name, vector) in record.dense_vectors() {
4618
+ let item = (key.clone(), vector.clone());
4619
+ groups
4620
+ .entry((None, vector_name.to_owned()))
4621
+ .or_default()
4622
+ .push(item.clone());
4623
+ groups
4624
+ .entry((Some(record.namespace.clone()), vector_name.to_owned()))
4625
+ .or_default()
4626
+ .push(item);
4627
+ }
4628
+ }
4629
+
4630
+ // Two-phase processing to keep the borrow checker happy:
4631
+ // phase 1: classify each slot (needs fresh build vs incremental
4632
+ // append), reading `self.records` only.
4633
+ // phase 2: mutate `self.ann` based on the classifications.
4634
+ let mut fresh_builds: Vec<((Option<String>, String), Vec<(RecordKey, Vec<f32>)>)> =
4635
+ Vec::new();
4636
+ let mut incremental: Vec<((Option<String>, String), Vec<(RecordKey, Vec<f32>)>)> =
4637
+ Vec::new();
4638
+
4639
+ for ((opt_ns, vector_name), new_items) in groups {
4640
+ let has_existing = match &opt_ns {
4641
+ None => self.ann.global.contains_key(&vector_name),
4642
+ Some(ns) => self
4643
+ .ann
4644
+ .namespaces
4645
+ .get(ns)
4646
+ .map_or(false, |m| m.contains_key(&vector_name)),
4647
+ };
4648
+
4649
+ if has_existing {
4650
+ incremental.push(((opt_ns, vector_name), new_items));
4651
+ continue;
4652
+ }
4653
+
4654
+ // Count matching records (post-insert state) to decide whether
4655
+ // we've crossed the build threshold.
4656
+ let total = self
4657
+ .records
4658
+ .iter()
4659
+ .filter(|(_, r)| match &opt_ns {
4660
+ Some(ns) => r.namespace == *ns,
4661
+ None => true,
4662
+ })
4663
+ .filter(|(_, r)| {
4664
+ r.dense_vectors()
4665
+ .any(|(name, _)| name == vector_name.as_str())
4666
+ })
4667
+ .count();
4668
+
4669
+ if total < ANN_MIN_POINTS {
4670
+ continue;
4671
+ }
4672
+
4673
+ // Need to build a fresh graph for this slot. Collect ALL matching
4674
+ // records (not just the new ones) — owned clones so the build
4675
+ // step doesn't borrow `self.records`.
4676
+ let mut all_items: Vec<(RecordKey, Vec<f32>)> = Vec::with_capacity(total);
4677
+ for (k, r) in &self.records {
4678
+ if let Some(ns) = &opt_ns {
4679
+ if r.namespace != *ns {
4680
+ continue;
4681
+ }
4682
+ }
4683
+ for (name, vec) in r.dense_vectors() {
4684
+ if name == vector_name.as_str() {
4685
+ all_items.push((k.clone(), vec.clone()));
4686
+ break;
4687
+ }
4688
+ }
4689
+ }
4690
+ let _ = new_items; // already folded into `all_items`
4691
+ fresh_builds.push(((opt_ns, vector_name), all_items));
4692
+ }
4693
+
4694
+ // Phase 2a: build-from-scratch for slots that just crossed the
4695
+ // threshold.
4696
+ for ((opt_ns, vector_name), all_items) in fresh_builds {
4697
+ let records_for_build: Vec<(RecordKey, &Vec<f32>)> =
4698
+ all_items.iter().map(|(k, v)| (k.clone(), v)).collect();
4699
+ let new_index = build_ann_index(records_for_build, self.metric, &cfg);
4700
+ match opt_ns {
4701
+ None => {
4702
+ self.ann.global.insert(vector_name, new_index);
4703
+ }
4704
+ Some(ns) => {
4705
+ self.ann
4706
+ .namespaces
4707
+ .entry(ns)
4708
+ .or_default()
4709
+ .insert(vector_name, new_index);
4710
+ }
4711
+ }
4712
+ }
4713
+
4714
+ // Phase 2b: incremental appends into existing graphs.
4715
+ for ((opt_ns, vector_name), new_items) in incremental {
4716
+ let idx_opt = match &opt_ns {
4717
+ None => self.ann.global.get_mut(&vector_name),
4718
+ Some(ns) => self
4719
+ .ann
4720
+ .namespaces
4721
+ .get_mut(ns)
4722
+ .and_then(|m| m.get_mut(&vector_name)),
4723
+ };
4724
+ let Some(idx) = idx_opt else {
4725
+ continue;
4726
+ };
4727
+
4728
+ // hnsw_rs marks indexes that have been searched as "searching
4729
+ // mode" (a hint that skips some bookkeeping in the data layer).
4730
+ // Re-enable mutation mode before we insert — cheap toggle.
4731
+ idx.hnsw.set_searching_mode(false);
4732
+
4733
+ if new_items.len() >= cfg.parallel_insert_threshold {
4734
+ let start_id = idx.keys.len();
4735
+ let batch: Vec<(&Vec<f32>, usize)> = new_items
4736
+ .iter()
4737
+ .enumerate()
4738
+ .map(|(offset, (_, v))| (v, start_id + offset))
4739
+ .collect();
4740
+ idx.hnsw.parallel_insert_batch(&batch);
4741
+ for (offset, (k, _)) in new_items.into_iter().enumerate() {
4742
+ let origin_id = start_id + offset;
4743
+ idx.key_to_origin.insert(k.clone(), origin_id);
4744
+ idx.keys.push(k);
4745
+ }
4746
+ } else {
4747
+ for (key, vector) in new_items {
4748
+ let origin_id = idx.keys.len();
4749
+ idx.key_to_origin.insert(key.clone(), origin_id);
4750
+ idx.keys.push(key);
4751
+ idx.hnsw.insert_one(vector.as_slice(), origin_id);
4752
+ }
4753
+ }
4754
+ }
4755
+ }
4756
+
4757
+ /// Append newly-inserted vectors to the contiguous arena. Caller must
4758
+ /// have already inserted the records into `self.records` and confirmed
4759
+ /// the arena exists and isn't dirty.
4760
+ fn arena_apply_incremental(&mut self, new_keys: &[RecordKey]) {
4761
+ let Some(arena) = self.vector_arena.as_mut() else {
4762
+ return;
4763
+ };
4764
+ for key in new_keys {
4765
+ if let Some(record) = self.records.get(key) {
4766
+ arena.append(key.clone(), &record.vector);
4767
+ }
4768
+ }
4769
+ }
4770
+
4771
+ /// Ensure the contiguous arena is materialised and fresh. Cheap when
4772
+ /// already clean; rebuilds from `self.records` (in BTreeMap order) on
4773
+ /// first call, after a delete, or on a `dim` mismatch. Allocates `dim * N` f32s.
4774
+ fn ensure_vector_arena(&mut self) -> &VectorArena {
4775
+ let needs_build = self
4776
+ .vector_arena
4777
+ .as_ref()
4778
+ .map_or(true, |a| self.vector_arena_dirty || a.dim != self.dimension);
4779
+ if needs_build {
4780
+ self.vector_arena = Some(VectorArena::rebuild_from(&self.records, self.dimension));
4781
+ self.vector_arena_dirty = false;
4782
+ }
4783
+ self.vector_arena.as_ref().unwrap()
4784
+ }
4785
+
4786
+ /// Mark the given record keys as deleted in every HNSW graph they live
4787
+ /// in. The graph itself is not modified — search filters tombstoned
4788
+ /// `origin_id`s. A subsequent `compact()` will rebuild any graph whose
4789
+ /// dead ratio reaches or exceeds `IndexConfig.tombstone_rebuild_pct`.
4790
+ fn ann_apply_tombstones(&mut self, deleted_keys: &[RecordKey]) {
4791
+ if deleted_keys.is_empty() {
4792
+ return;
4793
+ }
4794
+ for key in deleted_keys {
4795
+ // Global graphs (per vector_name): every graph that contains
4796
+ // this key gets the corresponding origin_id tombstoned.
4797
+ for (_, idx) in self.ann.global.iter_mut() {
4798
+ if let Some(&origin_id) = idx.key_to_origin.get(key) {
4799
+ idx.tombstones.insert(origin_id);
4800
+ }
4801
+ }
4802
+ // Per-namespace graphs: only the namespace this key belongs to
4803
+ // has a chance of containing it, but checking all of them is
4804
+ // fine — `key_to_origin.get` is O(1) and misses immediately.
4805
+ for (_, indexes) in self.ann.namespaces.iter_mut() {
4806
+ for (_, idx) in indexes.iter_mut() {
4807
+ if let Some(&origin_id) = idx.key_to_origin.get(key) {
4808
+ idx.tombstones.insert(origin_id);
4809
+ }
4810
+ }
4811
+ }
4812
+ }
4813
+ }
4814
+
4025
4815
  fn rebuild_ann(&mut self) {
4026
4816
  self.ann = AnnCatalog::default();
4027
4817
  let mut global_by_vector: BTreeMap<String, Vec<(RecordKey, &Vec<f32>)>> = BTreeMap::new();
@@ -4109,6 +4899,24 @@ impl Database {
4109
4899
  return false;
4110
4900
  }
4111
4901
 
4902
+ // For ANN2 manifests, use the persisted keys verbatim — they
4903
+ // match the `origin_id`s baked into the HNSW graph file. For
4904
+ // ANN1 (no persisted keys), fall back to the recomputed
4905
+ // BTreeMap-ordered list, which matches the way ANN1 graphs were
4906
+ // always built.
4907
+ let keys = if manifest_entry.keys.is_empty() {
4908
+ expected_entry.keys.clone()
4909
+ } else {
4910
+ // Defensive: persisted keys length must agree with the
4911
+ // declared record_count and the live record set, else the
4912
+ // manifest is inconsistent and we'd rather rebuild than
4913
+ // serve wrong neighbours.
4914
+ if manifest_entry.keys.len() != manifest_entry.record_count {
4915
+ return false;
4916
+ }
4917
+ manifest_entry.keys.clone()
4918
+ };
4919
+
4112
4920
  let Some(index) = load_ann_index(
4113
4921
  parent,
4114
4922
  &ann_basename(
@@ -4116,7 +4924,7 @@ impl Database {
4116
4924
  expected_entry.namespace.as_deref(),
4117
4925
  &expected_entry.vector_name,
4118
4926
  ),
4119
- expected_entry.keys.clone(),
4927
+ keys,
4120
4928
  self.metric,
4121
4929
  ) else {
4122
4930
  return false;
@@ -4147,7 +4955,11 @@ impl Database {
4147
4955
  return Ok(());
4148
4956
  }
4149
4957
 
4150
- let entries = self.expected_ann_entries();
4958
+ // Use `actual_ann_entries` (NOT `expected_ann_entries`) so the
4959
+ // persisted keys array matches the order the HNSW graph stored its
4960
+ // `origin_id`s in. After incremental inserts the in-memory keys vec
4961
+ // is in insertion order, which usually differs from BTreeMap order.
4962
+ let entries = self.actual_ann_entries();
4151
4963
  for entry in &entries {
4152
4964
  let basename = ann_basename(&self.path, entry.namespace.as_deref(), &entry.vector_name);
4153
4965
  let graph_path = parent.join(format!("{basename}.hnsw.graph"));
@@ -4175,6 +4987,41 @@ impl Database {
4175
4987
  write_ann_manifest(&ann_manifest_path(&self.path), &entries)
4176
4988
  }
4177
4989
 
4990
+ /// Like `expected_ann_entries`, but populates each entry's `keys` field
4991
+ /// from the actual in-memory `AnnIndex.keys` array (insertion order).
4992
+ /// This is what gets serialised into the ANN2 manifest, and matches the
4993
+ /// `origin_id`s baked into the dumped HNSW graph files.
4994
+ fn actual_ann_entries(&self) -> Vec<AnnManifestEntry> {
4995
+ let mut entries = Vec::new();
4996
+ for (vector_name, index) in &self.ann.global {
4997
+ if index.keys.len() < ANN_MIN_POINTS {
4998
+ continue;
4999
+ }
5000
+ entries.push(AnnManifestEntry {
5001
+ namespace: None,
5002
+ vector_name: vector_name.clone(),
5003
+ record_count: index.keys.len(),
5004
+ key_signature: record_key_signature(&index.keys),
5005
+ keys: index.keys.clone(),
5006
+ });
5007
+ }
5008
+ for (namespace, indexes) in &self.ann.namespaces {
5009
+ for (vector_name, index) in indexes {
5010
+ if index.keys.len() < ANN_MIN_POINTS {
5011
+ continue;
5012
+ }
5013
+ entries.push(AnnManifestEntry {
5014
+ namespace: Some(namespace.clone()),
5015
+ vector_name: vector_name.clone(),
5016
+ record_count: index.keys.len(),
5017
+ key_signature: record_key_signature(&index.keys),
5018
+ keys: index.keys.clone(),
5019
+ });
5020
+ }
5021
+ }
5022
+ entries
5023
+ }
5024
+
4178
5025
  fn expected_ann_entries(&self) -> Vec<AnnManifestEntry> {
4179
5026
  let mut global: BTreeMap<String, Vec<RecordKey>> = BTreeMap::new();
4180
5027
  let mut by_namespace: BTreeMap<String, BTreeMap<String, Vec<RecordKey>>> = BTreeMap::new();
@@ -4386,11 +5233,14 @@ impl Database {
4386
5233
  .global
4387
5234
  .get(vector_name.unwrap_or(DEFAULT_VECTOR_NAME)),
4388
5235
  }?;
4389
- if index.keys.len() < ANN_SEARCH_MIN_POINTS {
5236
+ // Gate on live (non-tombstoned) record count: if half the graph is
5237
+ // dead, treat the live half as if it were the whole corpus.
5238
+ let live = index.live_count();
5239
+ if live < ANN_SEARCH_MIN_POINTS {
4390
5240
  return None;
4391
5241
  }
4392
5242
 
4393
- let candidate_count = candidate_count(top_k, index.keys.len());
5243
+ let candidate_count = candidate_count(top_k, live);
4394
5244
  if candidate_count == 0 {
4395
5245
  return None;
4396
5246
  }
@@ -4399,15 +5249,29 @@ impl Database {
4399
5249
  // explicitly sets `IndexConfig.ef_search`, honour it directly.
4400
5250
  // Otherwise default to max(candidate_count, ef_construction) which is
4401
5251
  // a conservative high-recall heuristic.
4402
- let ef_search = match self.index_config.ef_search {
5252
+ let mut ef_search = match self.index_config.ef_search {
4403
5253
  Some(ef) => ef.max(candidate_count),
4404
5254
  None => candidate_count.max(self.index_config.ef_construction),
4405
5255
  };
4406
- let neighbours = index.hnsw.search(query, candidate_count, ef_search);
5256
+ // Over-fetch to compensate for tombstoned candidates we'll drop. Cap
5257
+ // at the total node count (live + tombstoned) so we don't waste work;
5258
+ // we'd never get more distinct results than that anyway.
5259
+ if !index.tombstones.is_empty() {
5260
+ let dead = index.tombstones.len();
5261
+ ef_search = ef_search
5262
+ .saturating_add(dead.min(ef_search))
5263
+ .min(index.keys.len());
5264
+ }
5265
+ let fetch_count = candidate_count
5266
+ .saturating_add(index.tombstones.len().min(candidate_count))
5267
+ .min(index.keys.len());
5268
+ let neighbours = index.hnsw.search(query, fetch_count, ef_search);
4407
5269
  Some(
4408
5270
  neighbours
4409
5271
  .into_iter()
5272
+ .filter(|n| !index.tombstones.contains(&n.d_id))
4410
5273
  .filter_map(|neighbour| index.keys.get(neighbour.d_id).cloned())
5274
+ .take(candidate_count)
4411
5275
  .collect(),
4412
5276
  )
4413
5277
  }
@@ -4691,6 +5555,7 @@ fn build_ann_index(
4691
5555
  $dist_val,
4692
5556
  );
4693
5557
  let mut keys = Vec::with_capacity(count);
5558
+ let mut key_to_origin = HashMap::with_capacity(count);
4694
5559
  if use_parallel {
4695
5560
  // hnsw_rs's `parallel_insert` takes `&[(&Vec<T>, usize)]`
4696
5561
  // (the API is built around owned-Vec borrows) and uses Rayon
@@ -4699,12 +5564,14 @@ fn build_ann_index(
4699
5564
  let mut batch: Vec<(&Vec<f32>, usize)> = Vec::with_capacity(count);
4700
5565
  for (origin_id, (key, vector)) in records.into_iter().enumerate() {
4701
5566
  batch.push((vector, origin_id));
5567
+ key_to_origin.insert(key.clone(), origin_id);
4702
5568
  keys.push(key);
4703
5569
  }
4704
5570
  hnsw.parallel_insert(&batch);
4705
5571
  } else {
4706
5572
  for (origin_id, (key, vector)) in records.into_iter().enumerate() {
4707
5573
  hnsw.insert((vector.as_slice(), origin_id));
5574
+ key_to_origin.insert(key.clone(), origin_id);
4708
5575
  keys.push(key);
4709
5576
  }
4710
5577
  }
@@ -4712,6 +5579,8 @@ fn build_ann_index(
4712
5579
  AnnIndex {
4713
5580
  hnsw: AnnHnsw::$variant(hnsw),
4714
5581
  keys,
5582
+ key_to_origin,
5583
+ tombstones: HashSet::new(),
4715
5584
  }
4716
5585
  }};
4717
5586
  }
@@ -4906,9 +5775,21 @@ fn hex_encode(bytes: &[u8]) -> String {
4906
5775
  out
4907
5776
  }
4908
5777
 
5778
+ /// Order-independent FNV-1a hash over a set of record keys. We sort first so
5779
+ /// the signature only depends on the SET of keys, not the order they were
5780
+ /// inserted. Callers can use this to check whether a persisted ANN graph
5781
+ /// matches the live record set regardless of whether the live `keys` vec is
5782
+ /// BTreeMap-ordered (full rebuild) or insertion-ordered (incremental
5783
+ /// updates).
5784
+ ///
5785
+ /// Historical note: previously the input was always BTreeMap-iterated and
5786
+ /// therefore already sorted, so the sort step is a no-op for old ANN1
5787
+ /// manifests — backwards compatible.
4909
5788
  fn record_key_signature(keys: &[RecordKey]) -> u64 {
5789
+ let mut sorted: Vec<&RecordKey> = keys.iter().collect();
5790
+ sorted.sort();
4910
5791
  let mut state = 0xcbf29ce484222325_u64;
4911
- for (namespace, id) in keys {
5792
+ for (namespace, id) in sorted {
4912
5793
  for byte in namespace
4913
5794
  .as_bytes()
4914
5795
  .iter()
@@ -4935,9 +5816,16 @@ fn load_ann_index(
4935
5816
  ($dist_val:expr, $variant:ident) => {{
4936
5817
  let mut hnsw = reloader.load_hnsw_with_dist($dist_val).ok()?;
4937
5818
  hnsw.set_searching_mode(true);
5819
+ let key_to_origin = keys
5820
+ .iter()
5821
+ .enumerate()
5822
+ .map(|(i, k)| (k.clone(), i))
5823
+ .collect();
4938
5824
  Some(AnnIndex {
4939
5825
  hnsw: AnnHnsw::$variant(hnsw),
4940
5826
  keys,
5827
+ key_to_origin,
5828
+ tombstones: HashSet::new(),
4941
5829
  })
4942
5830
  }};
4943
5831
  }
@@ -4950,9 +5838,16 @@ fn load_ann_index(
4950
5838
  }
4951
5839
  }
4952
5840
 
5841
+ /// Write the ANN sidecar manifest. We use format `ANN2`, which (compared to
5842
+ /// the original `ANN1`) also serialises the actual key array per index in
5843
+ /// the order the HNSW knows its `origin_id`s. This is required for
5844
+ /// incremental insertion: without it, a reload would associate the wrong
5845
+ /// (BTreeMap-ordered) record key with each HNSW origin_id whenever the
5846
+ /// in-memory key array isn't sorted (which happens any time we incrementally
5847
+ /// append).
4953
5848
  fn write_ann_manifest(path: &Path, entries: &[AnnManifestEntry]) -> Result<()> {
4954
- let mut file = File::create(path)?;
4955
- file.write_all(b"ANN1")?;
5849
+ let mut file = BufWriter::new(File::create(path)?);
5850
+ file.write_all(b"ANN2")?;
4956
5851
  write_u32(&mut file, u32_from_usize(entries.len())?)?;
4957
5852
  for entry in entries {
4958
5853
  write_u8(&mut file, u8::from(entry.namespace.is_some()))?;
@@ -4962,8 +5857,15 @@ fn write_ann_manifest(path: &Path, entries: &[AnnManifestEntry]) -> Result<()> {
4962
5857
  write_string(&mut file, &entry.vector_name)?;
4963
5858
  write_u64(&mut file, u64_from_usize(entry.record_count)?)?;
4964
5859
  write_u64(&mut file, entry.key_signature)?;
5860
+ // ANN2 addition: the full keys array in insertion order.
5861
+ write_u64(&mut file, u64_from_usize(entry.keys.len())?)?;
5862
+ for (ns, id) in &entry.keys {
5863
+ write_string(&mut file, ns)?;
5864
+ write_string(&mut file, id)?;
5865
+ }
4965
5866
  }
4966
- file.sync_all()?;
5867
+ file.flush()?;
5868
+ file.get_ref().sync_all()?;
4967
5869
  Ok(())
4968
5870
  }
4969
5871
 
@@ -4971,11 +5873,15 @@ fn read_ann_manifest(path: &Path) -> Result<Vec<AnnManifestEntry>> {
4971
5873
  let mut file = BufReader::new(File::open(path)?);
4972
5874
  let mut magic = [0_u8; 4];
4973
5875
  file.read_exact(&mut magic)?;
4974
- if &magic != b"ANN1" {
4975
- return Err(VectLiteError::InvalidFormat(
4976
- "invalid ANN manifest".to_owned(),
4977
- ));
4978
- }
5876
+ let version = match &magic {
5877
+ b"ANN1" => 1u8,
5878
+ b"ANN2" => 2u8,
5879
+ _ => {
5880
+ return Err(VectLiteError::InvalidFormat(
5881
+ "invalid ANN manifest".to_owned(),
5882
+ ));
5883
+ }
5884
+ };
4979
5885
 
4980
5886
  let count = usize_from_u32(read_u32(&mut file)?)?;
4981
5887
  let mut entries = Vec::with_capacity(count);
@@ -4989,12 +5895,27 @@ fn read_ann_manifest(path: &Path) -> Result<Vec<AnnManifestEntry>> {
4989
5895
  let vector_name = read_string(&mut file)?;
4990
5896
  let record_count = usize_from_u64(read_u64(&mut file)?)?;
4991
5897
  let key_signature = read_u64(&mut file)?;
5898
+ let keys = if version >= 2 {
5899
+ let n = usize_from_u64(read_u64(&mut file)?)?;
5900
+ let mut keys = Vec::with_capacity(n);
5901
+ for _ in 0..n {
5902
+ let ns = read_string(&mut file)?;
5903
+ let id = read_string(&mut file)?;
5904
+ keys.push((ns, id));
5905
+ }
5906
+ keys
5907
+ } else {
5908
+ // ANN1 had no persisted keys; caller falls back to recomputing
5909
+ // them from `self.records` (which yields BTreeMap-sorted keys,
5910
+ // matching the order ANN1 indexes were always built in).
5911
+ Vec::new()
5912
+ };
4992
5913
  entries.push(AnnManifestEntry {
4993
5914
  namespace,
4994
5915
  vector_name,
4995
5916
  record_count,
4996
5917
  key_signature,
4997
- keys: Vec::new(),
5918
+ keys,
4998
5919
  });
4999
5920
  }
5000
5921
  Ok(entries)