vectlite 0.9.3 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -35,6 +35,10 @@ const ANN_OVERSAMPLE: usize = 8;
35
35
  const ANN_MIN_CANDIDATES: usize = 64;
36
36
  const ANN_M: usize = 16;
37
37
  const ANN_EF_CONSTRUCTION: usize = 200;
38
/// Minimum dataset size at which HNSW construction switches to the
/// Rayon-based parallel batch insert. Smaller datasets are inserted
/// sequentially, because thread setup overhead outweighs the gain.
const ANN_PARALLEL_INSERT_THRESHOLD: usize = 256;
38
42
  const BM25_K1: f32 = 1.2;
39
43
  const BM25_B: f32 = 0.75;
40
44
 
@@ -754,6 +758,151 @@ pub struct SearchOptions {
754
758
  pub truncate_dim: Option<usize>,
755
759
  }
756
760
 
761
/// Tuning knobs for the HNSW approximate-nearest-neighbour index, letting
/// callers balance recall, query latency, memory use and build time.
///
/// The `Default` value mirrors VectLite's historical built-in settings
/// (`m = 16`, `ef_construction = 200`). Leaving `ef_search` as `None` means
/// VectLite picks an `ef_search` derived from `top_k * ANN_OVERSAMPLE`.
///
/// Reference: Malkov & Yashunin, *Efficient and robust approximate nearest
/// neighbor search using Hierarchical Navigable Small World graphs*.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct IndexConfig {
    /// Maximum number of bidirectional links per node. Larger values give
    /// better recall at the cost of memory and build time.
    /// Typical range: 8..64.
    pub m: usize,
    /// Search width used while the graph is being built. Larger values
    /// give better recall but a slower build. Typical range: 64..800.
    pub ef_construction: usize,
    /// Search width used at query time; `None` selects it automatically
    /// (derived from `top_k`). Larger values give better recall but slower
    /// searches.
    pub ef_search: Option<usize>,
    /// Dataset size (in vectors) from which parallel, Rayon-backed HNSW
    /// insertion is used. Defaults to `ANN_PARALLEL_INSERT_THRESHOLD`; set
    /// it very high to disable parallel insert.
    pub parallel_insert_threshold: usize,
    /// Percentage (0..=100) of tombstoned nodes at which the HNSW graph is
    /// meant to be rebuilt during `compact()`. Deleting a record does not
    /// physically remove its node from the graph; the node is only marked
    /// dead and filtered out at search time, so recall and latency degrade
    /// as dead nodes pile up. Defaults to `30`.
    ///
    /// NOTE: at present `compact()` rebuilds as soon as *any* tombstone
    /// exists, so this value is validated and stored but does not yet
    /// change when the rebuild happens.
    pub tombstone_rebuild_pct: u8,
}
794
+
795
+ impl Default for IndexConfig {
796
+ fn default() -> Self {
797
+ Self {
798
+ m: ANN_M,
799
+ ef_construction: ANN_EF_CONSTRUCTION,
800
+ ef_search: None,
801
+ parallel_insert_threshold: ANN_PARALLEL_INSERT_THRESHOLD,
802
+ tombstone_rebuild_pct: 30,
803
+ }
804
+ }
805
+ }
806
+
807
+ impl IndexConfig {
808
+ /// A preset tuned for higher recall at the cost of build/search time.
809
+ /// Useful for benchmark comparisons where recall@10 must approach 1.0.
810
+ pub fn high_recall() -> Self {
811
+ Self {
812
+ m: 32,
813
+ ef_construction: 400,
814
+ ef_search: Some(200),
815
+ parallel_insert_threshold: ANN_PARALLEL_INSERT_THRESHOLD,
816
+ tombstone_rebuild_pct: 30,
817
+ }
818
+ }
819
+
820
+ /// A preset tuned for fast build & low latency, lower recall.
821
+ pub fn fast() -> Self {
822
+ Self {
823
+ m: 8,
824
+ ef_construction: 100,
825
+ ef_search: Some(40),
826
+ parallel_insert_threshold: ANN_PARALLEL_INSERT_THRESHOLD,
827
+ tombstone_rebuild_pct: 30,
828
+ }
829
+ }
830
+
831
+ fn validate(&self) -> Result<()> {
832
+ if self.m == 0 {
833
+ return Err(VectLiteError::InvalidFormat(
834
+ "IndexConfig.m must be >= 1".to_owned(),
835
+ ));
836
+ }
837
+ if self.ef_construction == 0 {
838
+ return Err(VectLiteError::InvalidFormat(
839
+ "IndexConfig.ef_construction must be >= 1".to_owned(),
840
+ ));
841
+ }
842
+ if let Some(ef) = self.ef_search {
843
+ if ef == 0 {
844
+ return Err(VectLiteError::InvalidFormat(
845
+ "IndexConfig.ef_search must be >= 1 when set".to_owned(),
846
+ ));
847
+ }
848
+ }
849
+ if self.tombstone_rebuild_pct > 100 {
850
+ return Err(VectLiteError::InvalidFormat(
851
+ "IndexConfig.tombstone_rebuild_pct must be in 0..=100".to_owned(),
852
+ ));
853
+ }
854
+ Ok(())
855
+ }
856
+ }
857
+
858
/// Policy deciding when the WAL file is `fsync`'d to disk.
///
/// The default (`PerOp`) gives per-record durability, but on macOS APFS —
/// and to a lesser extent on Linux ext4 — `fsync` is the dominant cost of a
/// single `insert`. Relaxing the policy can multiply ingestion throughput
/// by 5–10×, at the price of losing some recently-acknowledged records on
/// an unclean shutdown.
///
/// Whatever the mode, the WAL is *always* fully synced on `flush()`,
/// `compact()` and `close()`, so data that survives a clean shutdown is
/// durable. The exposure window is therefore bounded:
/// - `EveryN(n)`: at most the last `n - 1` inserts since the last fsync.
/// - `OnFlush`: every insert since the last `flush()` / `compact()`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum WalSyncMode {
    /// `fsync` after every WAL append: strongest durability, slowest. This
    /// is the default and matches pre-0.11 behaviour.
    PerOp,
    /// `fsync` once every `n` operations. After a crash, up to the last
    /// `n - 1` operations since the previous sync may be lost. A reasonable
    /// middle ground when streaming thousands of small records: choose `n`
    /// so the worst-case loss is tolerable (e.g. `64` ≈ a fraction of a
    /// second of data).
    EveryN(usize),
    /// Never `fsync` on the per-operation path; sync only at `flush()` /
    /// `compact()` / `close()`. Highest throughput, weakest durability —
    /// suited to bulk ingestion of data that can be regenerated.
    OnFlush,
}
886
+
887
+ impl Default for WalSyncMode {
888
+ fn default() -> Self {
889
+ WalSyncMode::PerOp
890
+ }
891
+ }
892
+
893
+ impl WalSyncMode {
894
+ fn validate(self) -> Result<()> {
895
+ if let WalSyncMode::EveryN(n) = self {
896
+ if n == 0 {
897
+ return Err(VectLiteError::InvalidFormat(
898
+ "WalSyncMode::EveryN must be >= 1".to_owned(),
899
+ ));
900
+ }
901
+ }
902
+ Ok(())
903
+ }
904
+ }
905
+
757
906
  impl Default for SearchOptions {
758
907
  fn default() -> Self {
759
908
  Self {
@@ -1214,6 +1363,29 @@ pub struct Database {
1214
1363
  /// Holds the lock file open for the lifetime of the database.
1215
1364
  /// Dropping this releases the advisory lock.
1216
1365
  _lock_file: Option<File>,
1366
+ /// Cached WAL writer: avoids paying the open() syscall on every insert.
1367
+ /// Reset whenever the WAL is rotated (compact, clear_wal).
1368
+ wal_writer: Option<BufWriter<File>>,
1369
+ /// Controls when `fsync` is issued against the WAL — see [`WalSyncMode`].
1370
+ wal_sync_mode: WalSyncMode,
1371
+ /// Number of ops appended to the WAL since the last fsync. Used by the
1372
+ /// `EveryN` sync mode to decide when to flush+sync.
1373
+ wal_ops_since_sync: usize,
1374
+ /// True if the in-memory ANN graph(s) have unsaved changes (incremental
1375
+ /// inserts, fresh build, or a full rebuild) that have not been written
1376
+ /// out via `persist_ann_to_disk`. Set on every mutation in
1377
+ /// `apply_wal_batch` / `bulk_ingest` and cleared by `compact_inner` or
1378
+ /// an explicit `persist_ann_to_disk`.
1379
+ ann_dirty: bool,
1380
+ /// True if the quantized PQ index needs to be rebuilt at the next flush
1381
+ /// (because records have been inserted/deleted since the last rebuild).
1382
+ /// While dirty, the in-memory `quantized` field is set to `None` so
1383
+ /// searches transparently fall back to the HNSW path instead of
1384
+ /// returning candidates from a stale codebook.
1385
+ quantized_dirty: bool,
1386
+ /// Same as `quantized_dirty`, but for multi-vector (ColBERT-style)
1387
+ /// quantization spaces. Lazy rebuild happens at flush time.
1388
+ multi_vector_quantized_dirty: bool,
1217
1389
  /// Optional quantized index for accelerated search.
1218
1390
  quantized: Option<QuantizedIndex>,
1219
1391
  /// Configuration used to build the quantized index (persisted).
@@ -1230,6 +1402,91 @@ pub struct Database {
1230
1402
  payload_index_defs: BTreeMap<String, PayloadIndexType>,
1231
1403
  /// Live payload indexes, populated from records.
1232
1404
  payload_indexes: BTreeMap<String, PayloadIndexData>,
1405
+ /// HNSW tuning parameters. Not persisted to disk: this is a per-session
1406
+ /// knob so callers can change recall/latency tradeoffs without migrating
1407
+ /// data files. A subsequent `set_index_config` triggers a rebuild.
1408
+ index_config: IndexConfig,
1409
+ /// Contiguous f32 mirror of the default dense vector for every record.
1410
+ /// Used by brute-force / rescoring scans for cache-friendly SIMD.
1411
+ /// `None` when the arena hasn't been materialised yet for this session.
1412
+ vector_arena: Option<VectorArena>,
1413
+ /// When true, `vector_arena` is stale (e.g. a delete happened) and must
1414
+ /// be rebuilt before use.
1415
+ vector_arena_dirty: bool,
1416
+ }
1417
+
1418
+ /// Contiguous-storage mirror of the default dense vector per record.
1419
+ ///
1420
+ /// In the original layout each `Record.vector` is a separately-allocated
1421
+ /// `Vec<f32>` and the records themselves live in `BTreeMap` nodes, so a
1422
+ /// brute-force or rescoring scan pays two pointer hops per record AND
1423
+ /// touches one cache line per vector — terrible for SIMD throughput.
1424
+ ///
1425
+ /// This arena stores every vector in a single flat `buf: Vec<f32>` so a scan
1426
+ /// is a straight contiguous walk (one cache miss per ~16 vectors, vs ~2 per
1427
+ /// vector). Lance / Arrow use the same trick — see the v0.11 CHANGELOG note.
1428
+ ///
1429
+ /// The arena is maintained incrementally on insert; deletes are too
1430
+ /// expensive to compact in place (would shift O(N) f32s) so they just mark
1431
+ /// the arena dirty and force a lazy full rebuild on next use.
1432
+ struct VectorArena {
1433
+ buf: Vec<f32>,
1434
+ keys: Vec<RecordKey>,
1435
+ key_to_index: HashMap<RecordKey, usize>,
1436
+ dim: usize,
1437
+ }
1438
+
1439
+ impl VectorArena {
1440
+ fn new(dim: usize) -> Self {
1441
+ Self {
1442
+ buf: Vec::new(),
1443
+ keys: Vec::new(),
1444
+ key_to_index: HashMap::new(),
1445
+ dim,
1446
+ }
1447
+ }
1448
+
1449
+ fn append(&mut self, key: RecordKey, vector: &[f32]) {
1450
+ // Defensive: ignore mismatched dims rather than panicking — this is
1451
+ // a perf cache, not the source of truth.
1452
+ if vector.len() != self.dim {
1453
+ return;
1454
+ }
1455
+ let idx = self.keys.len();
1456
+ self.buf.extend_from_slice(vector);
1457
+ self.key_to_index.insert(key.clone(), idx);
1458
+ self.keys.push(key);
1459
+ }
1460
+
1461
+ /// Rebuild from records in BTreeMap order. Called lazily when the arena
1462
+ /// is dirty (i.e. after a delete or a full ANN rebuild).
1463
+ fn rebuild_from(records: &BTreeMap<RecordKey, Record>, dim: usize) -> Self {
1464
+ let mut arena = Self::new(dim);
1465
+ arena.buf.reserve(records.len() * dim);
1466
+ arena.keys.reserve(records.len());
1467
+ arena.key_to_index.reserve(records.len());
1468
+ for (key, record) in records {
1469
+ if record.vector.len() == dim {
1470
+ arena.append(key.clone(), &record.vector);
1471
+ }
1472
+ }
1473
+ arena
1474
+ }
1475
+
1476
+ /// Iterator yielding `(key, vector_slice)` pairs. The slice references
1477
+ /// the contiguous `buf`, so consumers get cache-friendly SIMD scans.
1478
+ #[allow(dead_code)]
1479
+ fn iter(&self) -> impl Iterator<Item = (&RecordKey, &[f32])> {
1480
+ let dim = self.dim;
1481
+ self.keys.iter().enumerate().map(move |(i, k)| {
1482
+ let start = i * dim;
1483
+ (k, &self.buf[start..start + dim])
1484
+ })
1485
+ }
1486
+
1487
+ fn len(&self) -> usize {
1488
+ self.keys.len()
1489
+ }
1233
1490
  }
1234
1491
 
1235
1492
  #[derive(Default)]
@@ -1255,6 +1512,42 @@ impl AnnHnsw {
1255
1512
  }
1256
1513
  }
1257
1514
 
1515
+ /// Incrementally insert a single vector into an existing HNSW graph.
1516
+ /// `origin_id` must be unique within the graph and is used to map back
1517
+ /// to the caller's record key array.
1518
+ fn insert_one(&mut self, vector: &[f32], origin_id: usize) {
1519
+ match self {
1520
+ AnnHnsw::Cosine(h) => h.insert((vector, origin_id)),
1521
+ AnnHnsw::Euclidean(h) => h.insert((vector, origin_id)),
1522
+ AnnHnsw::DotProduct(h) => h.insert((vector, origin_id)),
1523
+ AnnHnsw::Manhattan(h) => h.insert((vector, origin_id)),
1524
+ }
1525
+ }
1526
+
1527
+ /// Bulk-insert a batch of vectors in parallel (Rayon-multithreaded).
1528
+ /// Significantly faster than repeated `insert_one` when the batch is
1529
+ /// large enough to amortise thread setup.
1530
+ fn parallel_insert_batch(&mut self, batch: &[(&Vec<f32>, usize)]) {
1531
+ match self {
1532
+ AnnHnsw::Cosine(h) => h.parallel_insert(batch),
1533
+ AnnHnsw::Euclidean(h) => h.parallel_insert(batch),
1534
+ AnnHnsw::DotProduct(h) => h.parallel_insert(batch),
1535
+ AnnHnsw::Manhattan(h) => h.parallel_insert(batch),
1536
+ }
1537
+ }
1538
+
1539
+ /// Toggle the `searching_mode` hint on the underlying HNSW. When `true`
1540
+ /// the graph is treated as read-only and lookups skip some bookkeeping;
1541
+ /// when `false` further inserts are allowed.
1542
+ fn set_searching_mode(&mut self, value: bool) {
1543
+ match self {
1544
+ AnnHnsw::Cosine(h) => h.set_searching_mode(value),
1545
+ AnnHnsw::Euclidean(h) => h.set_searching_mode(value),
1546
+ AnnHnsw::DotProduct(h) => h.set_searching_mode(value),
1547
+ AnnHnsw::Manhattan(h) => h.set_searching_mode(value),
1548
+ }
1549
+ }
1550
+
1258
1551
  fn file_dump(&self, directory: &Path, basename: &str) -> Result<()> {
1259
1552
  let result = match self {
1260
1553
  AnnHnsw::Cosine(h) => h.file_dump(directory, basename),
@@ -1270,7 +1563,38 @@ impl AnnHnsw {
1270
1563
 
1271
1564
  struct AnnIndex {
1272
1565
  hnsw: AnnHnsw,
1566
+ /// `keys[i]` is the record key for HNSW origin_id `i`. Always grows; we
1567
+ /// never shrink it (HNSW doesn't support compacted deletion). Tombstoned
1568
+ /// slots stay in the vec to keep origin_id ↔ key mapping stable.
1273
1569
  keys: Vec<RecordKey>,
1570
+ /// Reverse index: `key → origin_id`. Lets `delete` find a record's HNSW
1571
+ /// node in O(1). Built alongside `keys` on every (re)build.
1572
+ key_to_origin: HashMap<RecordKey, usize>,
1573
+ /// Origin_ids that have been logically deleted but are still part of the
1574
+ /// HNSW graph. Search filters these out by lookup; a `compact()` rebuilds
1575
+ /// the graph once the ratio exceeds `IndexConfig.tombstone_rebuild_pct`.
1576
+ tombstones: HashSet<usize>,
1577
+ }
1578
+
1579
+ impl AnnIndex {
1580
+ /// Number of live (non-tombstoned) records in the graph.
1581
+ fn live_count(&self) -> usize {
1582
+ self.keys.len().saturating_sub(self.tombstones.len())
1583
+ }
1584
+
1585
+ /// True when the fraction of dead nodes is at or above the configured
1586
+ /// rebuild threshold (`IndexConfig.tombstone_rebuild_pct`). Currently
1587
+ /// `compact_inner` rebuilds on *any* tombstones because the persisted
1588
+ /// manifest format only tracks live record counts — when we add a
1589
+ /// tombstone-aware manifest (planned), this becomes the trigger.
1590
+ #[allow(dead_code)]
1591
+ fn should_rebuild(&self, threshold_pct: u8) -> bool {
1592
+ if self.keys.is_empty() {
1593
+ return false;
1594
+ }
1595
+ let pct = (self.tombstones.len() * 100) / self.keys.len();
1596
+ pct >= threshold_pct as usize
1597
+ }
1274
1598
  }
1275
1599
 
1276
1600
  struct AnnManifestEntry {
@@ -1319,6 +1643,12 @@ impl Database {
1319
1643
  ann_loaded_from_disk: false,
1320
1644
  read_only: false,
1321
1645
  _lock_file: Some(lock),
1646
+ wal_writer: None,
1647
+ wal_sync_mode: WalSyncMode::default(),
1648
+ wal_ops_since_sync: 0,
1649
+ ann_dirty: false,
1650
+ quantized_dirty: false,
1651
+ multi_vector_quantized_dirty: false,
1322
1652
  quantized: None,
1323
1653
  quantization_config: None,
1324
1654
  quantized_keys: Vec::new(),
@@ -1327,6 +1657,9 @@ impl Database {
1327
1657
  multi_vector_quantized_keys: BTreeMap::new(),
1328
1658
  payload_index_defs: BTreeMap::new(),
1329
1659
  payload_indexes: BTreeMap::new(),
1660
+ index_config: IndexConfig::default(),
1661
+ vector_arena: None,
1662
+ vector_arena_dirty: false,
1330
1663
  };
1331
1664
 
1332
1665
  database.flush()?;
@@ -1432,6 +1765,8 @@ impl Database {
1432
1765
  if !self.read_only {
1433
1766
  self.compact_inner()?;
1434
1767
  }
1768
+ // Drop the cached WAL writer (also closes the underlying file handle).
1769
+ self.wal_writer = None;
1435
1770
  // Release the lock by dropping the file handle
1436
1771
  self._lock_file = None;
1437
1772
  // Clear in-memory state
@@ -1441,6 +1776,8 @@ impl Database {
1441
1776
  self.quantized = None;
1442
1777
  self.quantization_config = None;
1443
1778
  self.quantized_keys.clear();
1779
+ self.vector_arena = None;
1780
+ self.vector_arena_dirty = false;
1444
1781
  self.dimension = 0;
1445
1782
  Ok(())
1446
1783
  }
@@ -2038,8 +2375,12 @@ impl Database {
2038
2375
  self.rebuild_ann();
2039
2376
  self.ann_loaded_from_disk = false;
2040
2377
  self.persist_ann_to_disk()?;
2378
+ self.ann_dirty = false;
2379
+ self.vector_arena_dirty = true;
2041
2380
  self.rebuild_quantized_index();
2381
+ self.quantized_dirty = false;
2042
2382
  self.rebuild_all_multi_vector_quantized_indexes();
2383
+ self.multi_vector_quantized_dirty = false;
2043
2384
  Ok(count)
2044
2385
  }
2045
2386
 
@@ -2066,8 +2407,12 @@ impl Database {
2066
2407
  self.rebuild_ann();
2067
2408
  self.ann_loaded_from_disk = false;
2068
2409
  self.persist_ann_to_disk()?;
2410
+ self.ann_dirty = false;
2411
+ self.vector_arena_dirty = true;
2069
2412
  self.rebuild_quantized_index();
2413
+ self.quantized_dirty = false;
2070
2414
  self.rebuild_all_multi_vector_quantized_indexes();
2415
+ self.multi_vector_quantized_dirty = false;
2071
2416
  Ok(count)
2072
2417
  }
2073
2418
 
@@ -2271,8 +2616,12 @@ impl Database {
2271
2616
  self.rebuild_ann();
2272
2617
  self.ann_loaded_from_disk = false;
2273
2618
  self.persist_ann_to_disk()?;
2619
+ self.ann_dirty = false;
2620
+ self.vector_arena_dirty = true;
2274
2621
  self.rebuild_quantized_index();
2622
+ self.quantized_dirty = false;
2275
2623
  self.rebuild_all_multi_vector_quantized_indexes();
2624
+ self.multi_vector_quantized_dirty = false;
2276
2625
  Ok(())
2277
2626
  }
2278
2627
 
@@ -2483,15 +2832,119 @@ impl Database {
2483
2832
  self.compact_inner()
2484
2833
  }
2485
2834
 
2835
+ /// Configure WAL durability. See [`WalSyncMode`] for the safety / speed
2836
+ /// tradeoffs.
2837
+ ///
2838
+ /// Switching to a more relaxed mode while there are unsync'd bytes in
2839
+ /// the WAL is safe — the bytes simply stay in the BufWriter / OS cache
2840
+ /// until the next sync point (`flush()`, `compact()`, `close()`, or the
2841
+ /// counter reaching `EveryN(n)`). Switching to a *stricter* mode forces
2842
+ /// an immediate sync so there is no surprise loss window.
2843
+ pub fn set_wal_sync_mode(&mut self, mode: WalSyncMode) -> Result<()> {
2844
+ self.check_writable()?;
2845
+ mode.validate()?;
2846
+ let previous = self.wal_sync_mode;
2847
+ self.wal_sync_mode = mode;
2848
+ // If we just tightened durability (e.g. moved from OnFlush back to
2849
+ // PerOp) and there are pending ops, sync immediately so the user's
2850
+ // mental model — "after this call any acknowledged write is durable"
2851
+ // — holds.
2852
+ let became_stricter = matches!(
2853
+ (previous, mode),
2854
+ (
2855
+ WalSyncMode::OnFlush,
2856
+ WalSyncMode::PerOp | WalSyncMode::EveryN(_)
2857
+ ) | (WalSyncMode::EveryN(_), WalSyncMode::PerOp)
2858
+ );
2859
+ if became_stricter && self.wal_ops_since_sync > 0 {
2860
+ self.sync_wal()?;
2861
+ self.wal_ops_since_sync = 0;
2862
+ }
2863
+ Ok(())
2864
+ }
2865
+
2866
+ /// Return the current WAL sync mode.
2867
+ pub fn wal_sync_mode(&self) -> WalSyncMode {
2868
+ self.wal_sync_mode
2869
+ }
2870
+
2871
+ /// Materialise the contiguous-vector arena up front.
2872
+ ///
2873
+ /// The arena mirrors the default dense vector of every record in a
2874
+ /// single flat `Vec<f32>` — much more cache- and SIMD-friendly than the
2875
+ /// default `BTreeMap<Record>` layout. It's normally built lazily on
2876
+ /// first use, but if you know a heavy brute-force or rescoring scan is
2877
+ /// coming you can pay the build cost up front by calling this. Cheap
2878
+ /// when already fresh.
2879
+ pub fn prepare_for_scan(&mut self) {
2880
+ let _ = self.ensure_vector_arena();
2881
+ }
2882
+
2883
+ /// Number of vectors in the contiguous arena, or `None` if the arena
2884
+ /// hasn't been materialised yet for this session. Useful for tests and
2885
+ /// observability.
2886
+ pub fn vector_arena_len(&self) -> Option<usize> {
2887
+ self.vector_arena.as_ref().map(VectorArena::len)
2888
+ }
2889
+
2890
+ /// Return (live_count, tombstoned_count) summed across every HNSW graph
2891
+ /// (global + per-namespace). Useful for monitoring when a `compact()`
2892
+ /// would benefit from rebuilding the graph(s).
2893
+ pub fn tombstone_stats(&self) -> (usize, usize) {
2894
+ let mut live = 0usize;
2895
+ let mut dead = 0usize;
2896
+ for idx in self.ann.global.values() {
2897
+ live += idx.live_count();
2898
+ dead += idx.tombstones.len();
2899
+ }
2900
+ for indexes in self.ann.namespaces.values() {
2901
+ for idx in indexes.values() {
2902
+ live += idx.live_count();
2903
+ dead += idx.tombstones.len();
2904
+ }
2905
+ }
2906
+ (live, dead)
2907
+ }
2908
+
2486
2909
  /// Bulk-ingest many records efficiently. WAL writes happen in batches of
2487
2910
  /// `batch_size`, but the ANN index and sparse index are only rebuilt once
2488
2911
  /// at the very end, making this much faster than `upsert_many` for large
2489
2912
  /// imports.
2913
+ ///
2914
+ /// Performance notes:
2915
+ /// - The WAL is written without a per-batch `fsync` (each batch goes
2916
+ /// through `BufWriter` and is appended to the open file). A single
2917
+ /// `sync_all` is issued at the end. This avoids the per-batch fsync
2918
+ /// tax that dominates ingestion latency on macOS and modern SSDs.
2919
+ /// - The final ANN rebuild uses parallel HNSW insertion (Rayon) when
2920
+ /// the dataset is large enough (see
2921
+ /// `IndexConfig.parallel_insert_threshold`).
2490
2922
  pub fn bulk_ingest<I>(&mut self, records: I, batch_size: usize) -> Result<usize>
2923
+ where
2924
+ I: IntoIterator<Item = Record>,
2925
+ {
2926
+ self.bulk_ingest_with_config(records, batch_size, None)
2927
+ }
2928
+
2929
+ /// Bulk-ingest with an override for the HNSW index configuration. The
2930
+ /// override is applied for the rebuild step at the end, so the resulting
2931
+ /// graph uses the requested `m` / `ef_construction`. The new config is
2932
+ /// also stored on the database (so subsequent searches use the
2933
+ /// corresponding `ef_search`).
2934
+ pub fn bulk_ingest_with_config<I>(
2935
+ &mut self,
2936
+ records: I,
2937
+ batch_size: usize,
2938
+ config: Option<IndexConfig>,
2939
+ ) -> Result<usize>
2491
2940
  where
2492
2941
  I: IntoIterator<Item = Record>,
2493
2942
  {
2494
2943
  self.check_writable()?;
2944
+ if let Some(cfg) = config {
2945
+ cfg.validate()?;
2946
+ self.index_config = cfg;
2947
+ }
2495
2948
  let batch_size = batch_size.max(1);
2496
2949
  let mut total = 0_usize;
2497
2950
  let mut batch = Vec::with_capacity(batch_size);
@@ -2502,7 +2955,8 @@ impl Database {
2502
2955
 
2503
2956
  if batch.len() >= batch_size {
2504
2957
  total += batch.len();
2505
- self.append_wal_batch(&batch)?;
2958
+ // Coalesced WAL writes: append without per-batch fsync.
2959
+ self.append_wal_batch_unsynced(&batch)?;
2506
2960
  self.apply_ops_in_memory(batch);
2507
2961
  batch = Vec::with_capacity(batch_size);
2508
2962
  }
@@ -2510,22 +2964,71 @@ impl Database {
2510
2964
 
2511
2965
  if !batch.is_empty() {
2512
2966
  total += batch.len();
2513
- self.append_wal_batch(&batch)?;
2967
+ self.append_wal_batch_unsynced(&batch)?;
2514
2968
  self.apply_ops_in_memory(batch);
2515
2969
  }
2516
2970
 
2517
2971
  if total > 0 {
2972
+ // Single fsync at the very end to make all batches durable in
2973
+ // one shot. This is the major ingestion optimisation: instead
2974
+ // of paying fsync per batch (every `batch_size` records) we pay
2975
+ // it once for the whole bulk_ingest call.
2976
+ self.sync_wal()?;
2518
2977
  self.rebuild_sparse_index();
2519
2978
  self.rebuild_ann();
2520
2979
  self.ann_loaded_from_disk = false;
2980
+ // Persist the freshly-built ANN so a subsequent reopen can skip
2981
+ // the rebuild — bulk_ingest is a "batch" operation and callers
2982
+ // expect index state to be on disk afterwards.
2521
2983
  self.persist_ann_to_disk()?;
2984
+ self.ann_dirty = false;
2985
+ self.vector_arena_dirty = true;
2522
2986
  self.rebuild_quantized_index();
2987
+ self.quantized_dirty = false;
2523
2988
  self.rebuild_all_multi_vector_quantized_indexes();
2989
+ self.multi_vector_quantized_dirty = false;
2524
2990
  }
2525
2991
 
2526
2992
  Ok(total)
2527
2993
  }
2528
2994
 
2995
+ /// Replace the HNSW tuning parameters and rebuild the ANN index.
2996
+ /// Use this to trade off recall vs latency without re-ingesting data.
2997
+ pub fn set_index_config(&mut self, config: IndexConfig) -> Result<()> {
2998
+ self.check_writable()?;
2999
+ config.validate()?;
3000
+ let changed_build_params = self.index_config.m != config.m
3001
+ || self.index_config.ef_construction != config.ef_construction;
3002
+ self.index_config = config;
3003
+ if changed_build_params {
3004
+ // m / ef_construction affect graph structure → full rebuild.
3005
+ self.rebuild_ann();
3006
+ self.ann_loaded_from_disk = false;
3007
+ self.persist_ann_to_disk()?;
3008
+ self.ann_dirty = false;
3009
+ }
3010
+ Ok(())
3011
+ }
3012
+
3013
+ /// Return the current HNSW tuning parameters.
3014
+ pub fn index_config(&self) -> IndexConfig {
3015
+ self.index_config
3016
+ }
3017
+
3018
+ /// Convenience: update only the query-time `ef_search` without rebuilding
3019
+ /// the index. Higher = better recall, slower search.
3020
+ pub fn set_ef_search(&mut self, ef_search: Option<usize>) -> Result<()> {
3021
+ if let Some(ef) = ef_search {
3022
+ if ef == 0 {
3023
+ return Err(VectLiteError::InvalidFormat(
3024
+ "ef_search must be >= 1".to_owned(),
3025
+ ));
3026
+ }
3027
+ }
3028
+ self.index_config.ef_search = ef_search;
3029
+ Ok(())
3030
+ }
3031
+
2529
3032
  pub fn compact(&mut self) -> Result<()> {
2530
3033
  self.check_writable()?;
2531
3034
  self.compact_inner()
@@ -2548,6 +3051,7 @@ impl Database {
2548
3051
  validate_quantization_config(&config, self.dimension)?;
2549
3052
  self.quantization_config = Some(config);
2550
3053
  self.rebuild_quantized_index();
3054
+ self.quantized_dirty = false;
2551
3055
  self.persist_quantization_params()?;
2552
3056
  Ok(())
2553
3057
  }
@@ -2558,6 +3062,7 @@ impl Database {
2558
3062
  self.quantized = None;
2559
3063
  self.quantization_config = None;
2560
3064
  self.quantized_keys.clear();
3065
+ self.quantized_dirty = false;
2561
3066
  // Remove the sidecar file
2562
3067
  let params_path = quantization_params_path(&self.path);
2563
3068
  if params_path.exists() {
@@ -3239,6 +3744,54 @@ impl Database {
3239
3744
  self.records.remove(key);
3240
3745
  }
3241
3746
 
3747
+ // If any HNSW graph has tombstones, rebuild it before persisting.
3748
+ //
3749
+ // Two reasons:
3750
+ // 1. Crossing `tombstone_rebuild_pct` means search recall has
3751
+ // degraded enough that the user wants a clean graph.
3752
+ // 2. Even below the threshold, the persisted manifest's
3753
+ // `record_count` is derived from `self.records` (live only),
3754
+ // but the in-memory `keys` array includes dead slots — so a
3755
+ // persisted-with-tombstones graph would always fail the
3756
+ // record_count check on reopen and rebuild anyway. Rebuilding
3757
+ // *now* dumps a clean graph that survives reload.
3758
+ let threshold = self.index_config.tombstone_rebuild_pct;
3759
+ let any_tombstones = self
3760
+ .ann
3761
+ .global
3762
+ .values()
3763
+ .any(|idx| !idx.tombstones.is_empty())
3764
+ || self
3765
+ .ann
3766
+ .namespaces
3767
+ .values()
3768
+ .flat_map(|m| m.values())
3769
+ .any(|idx| !idx.tombstones.is_empty());
3770
+ // (We track `threshold` even though we currently rebuild on any
3771
+ // tombstones, so `should_rebuild` could later replace this when we
3772
+ // add tombstone persistence in the manifest.)
3773
+ let _ = threshold;
3774
+ if any_tombstones {
3775
+ self.rebuild_ann();
3776
+ }
3777
+
3778
+ // Rebuild any lazy indexes that were marked dirty during the session
3779
+ // before we persist. This is the point where we pay back the work
3780
+ // we deferred from the per-insert hot path:
3781
+ // - the HNSW graph is already up-to-date (incremental inserts),
3782
+ // we just need to dump it.
3783
+ // - the quantized PQ index was dropped on first insert and is
3784
+ // rebuilt now so search can use it again next session.
3785
+ // - same for multi-vector PQ.
3786
+ if self.quantized_dirty {
3787
+ self.rebuild_quantized_index();
3788
+ self.quantized_dirty = false;
3789
+ }
3790
+ if self.multi_vector_quantized_dirty {
3791
+ self.rebuild_all_multi_vector_quantized_indexes();
3792
+ self.multi_vector_quantized_dirty = false;
3793
+ }
3794
+
3242
3795
  if let Some(parent) = self.path.parent() {
3243
3796
  if !parent.as_os_str().is_empty() {
3244
3797
  fs::create_dir_all(parent)?;
@@ -3261,6 +3814,7 @@ impl Database {
3261
3814
  self.clear_wal()?;
3262
3815
  self.wal_entries_replayed = 0;
3263
3816
  self.persist_ann_to_disk()?;
3817
+ self.ann_dirty = false;
3264
3818
 
3265
3819
  Ok(())
3266
3820
  }
@@ -3401,6 +3955,65 @@ impl Database {
3401
3955
  .iter()
3402
3956
  .all(|op| matches!(op, WalOp::UpdateMetadata { .. } | WalOp::SetTtl { .. }));
3403
3957
 
3958
+ // Categorise each op so we can route to the fastest correct path:
3959
+ // incremental insert (Upsert with new key) → ann_apply_incremental
3960
+ // tombstone delete (Delete of present key) → ann_apply_tombstones
3961
+ // anything else (upsert of existing key, etc) → full rebuild
3962
+ let mut incremental_eligible = !metadata_only;
3963
+ let mut tombstone_only = !metadata_only;
3964
+ for op in &ops {
3965
+ match op {
3966
+ WalOp::Upsert(record) => {
3967
+ let exists = self
3968
+ .records
3969
+ .contains_key(&(record.namespace.clone(), record.id.clone()));
3970
+ if exists {
3971
+ incremental_eligible = false;
3972
+ tombstone_only = false;
3973
+ } else {
3974
+ // New upsert — fine for incremental, but not tombstone-only.
3975
+ tombstone_only = false;
3976
+ }
3977
+ }
3978
+ WalOp::Delete { namespace, id } => {
3979
+ let exists = self.records.contains_key(&(namespace.clone(), id.clone()));
3980
+ if exists {
3981
+ // OK for tombstone path, but not for incremental.
3982
+ incremental_eligible = false;
3983
+ }
3984
+ // (A delete of a non-existent key is a no-op for both
3985
+ // paths, but we still let it through.)
3986
+ }
3987
+ WalOp::UpdateMetadata { .. } | WalOp::SetTtl { .. } => {
3988
+ incremental_eligible = false;
3989
+ tombstone_only = false;
3990
+ }
3991
+ }
3992
+ }
3993
+
3994
+ // Collect the keys we'll need to feed to the relevant updater
3995
+ // before we move `ops` into `apply_ops_in_memory`.
3996
+ let new_keys: Vec<RecordKey> = if incremental_eligible {
3997
+ ops.iter()
3998
+ .filter_map(|op| match op {
3999
+ WalOp::Upsert(record) => Some((record.namespace.clone(), record.id.clone())),
4000
+ _ => None,
4001
+ })
4002
+ .collect()
4003
+ } else {
4004
+ Vec::new()
4005
+ };
4006
+ let deleted_keys: Vec<RecordKey> = if tombstone_only {
4007
+ ops.iter()
4008
+ .filter_map(|op| match op {
4009
+ WalOp::Delete { namespace, id } => Some((namespace.clone(), id.clone())),
4010
+ _ => None,
4011
+ })
4012
+ .collect()
4013
+ } else {
4014
+ Vec::new()
4015
+ };
4016
+
3404
4017
  self.append_wal_batch(&ops)?;
3405
4018
  self.apply_ops_in_memory(ops);
3406
4019
 
@@ -3409,11 +4022,55 @@ impl Database {
3409
4022
  if has_sparse {
3410
4023
  self.rebuild_sparse_index();
3411
4024
  }
3412
- self.rebuild_ann();
4025
+ if incremental_eligible {
4026
+ // Fast path: just append the new vectors into the existing
4027
+ // HNSW graph(s) instead of rebuilding from scratch. Converts
4028
+ // single-record ingestion from O(N log N) per insert to
4029
+ // amortised O(log N).
4030
+ self.ann_apply_incremental(&new_keys);
4031
+ // Keep the contiguous arena in sync. If it hasn't been
4032
+ // materialised yet, leave it alone — it'll be lazily built
4033
+ // on first read.
4034
+ if self.vector_arena.is_some() && !self.vector_arena_dirty {
4035
+ self.arena_apply_incremental(&new_keys);
4036
+ }
4037
+ } else if tombstone_only {
4038
+ // Delete-only fast path: tombstone the corresponding
4039
+ // `origin_id`s in each affected HNSW graph. No rebuild;
4040
+ // search filters out tombstoned candidates. The graph is
4041
+ // rebuilt automatically at the next `compact()` once the
4042
+ // tombstone ratio crosses `tombstone_rebuild_pct`.
4043
+ self.ann_apply_tombstones(&deleted_keys);
4044
+ // The arena can't compact in place without shifting O(N)
4045
+ // floats; mark dirty so it's lazily rebuilt on next scan.
4046
+ self.vector_arena_dirty = true;
4047
+ } else {
4048
+ // Slow path: a mixed-mode batch or an update-of-existing.
4049
+ // Rebuild the whole catalog.
4050
+ self.rebuild_ann();
4051
+ self.vector_arena_dirty = true;
4052
+ }
4053
+ // Defer persistence of the HNSW graph to disk: writing the graph
4054
+ // files is expensive (full re-dump + fsync) and is only required
4055
+ // for crash recovery on reopen. The WAL gives us that durability
4056
+ // already — on reopen, if the persisted graph is stale, it's
4057
+ // detected via the manifest signature check and rebuilt from
4058
+ // records in memory. Persistence happens at `flush` / `compact`.
3413
4059
  self.ann_loaded_from_disk = false;
3414
- self.persist_ann_to_disk()?;
3415
- self.rebuild_quantized_index();
3416
- self.rebuild_all_multi_vector_quantized_indexes();
4060
+ self.ann_dirty = true;
4061
+ // Lazy-rebuild quantized indexes too. Drop the in-memory
4062
+ // structures so callers get correct (HNSW-fallback) results
4063
+ // until the next flush, where we rebuild from the new corpus.
4064
+ if self.quantization_config.is_some() {
4065
+ self.quantized = None;
4066
+ self.quantized_keys.clear();
4067
+ self.quantized_dirty = true;
4068
+ }
4069
+ if !self.multi_vector_quantization_config.is_empty() {
4070
+ self.multi_vector_quantized.clear();
4071
+ self.multi_vector_quantized_keys.clear();
4072
+ self.multi_vector_quantized_dirty = true;
4073
+ }
3417
4074
  }
3418
4075
  Ok(())
3419
4076
  }
@@ -3497,32 +4154,109 @@ impl Database {
3497
4154
  }
3498
4155
  }
3499
4156
 
3500
- fn append_wal_batch(&self, ops: &[WalOp]) -> Result<()> {
4157
+ fn append_wal_batch(&mut self, ops: &[WalOp]) -> Result<()> {
4158
+ // Decide whether this batch should trigger an fsync. We use the
4159
+ // ops count in the batch (not 1) so `EveryN` semantics scale across
4160
+ // both single inserts and `insert_many` calls.
4161
+ let n_ops = ops.len();
4162
+ let should_sync = match self.wal_sync_mode {
4163
+ WalSyncMode::PerOp => true,
4164
+ WalSyncMode::EveryN(n) => {
4165
+ self.wal_ops_since_sync = self.wal_ops_since_sync.saturating_add(n_ops);
4166
+ if self.wal_ops_since_sync >= n {
4167
+ self.wal_ops_since_sync = 0;
4168
+ true
4169
+ } else {
4170
+ false
4171
+ }
4172
+ }
4173
+ WalSyncMode::OnFlush => {
4174
+ self.wal_ops_since_sync = self.wal_ops_since_sync.saturating_add(n_ops);
4175
+ false
4176
+ }
4177
+ };
4178
+ self.append_wal_batch_inner(ops, should_sync)
4179
+ }
4180
+
4181
+ /// Append a WAL batch without issuing an fsync. The caller is responsible
4182
+ /// for issuing `sync_wal` later (typically once at the end of a bulk
4183
+ /// ingest). This is the hot path for `bulk_ingest`.
4184
+ fn append_wal_batch_unsynced(&mut self, ops: &[WalOp]) -> Result<()> {
4185
+ // Track pending ops so future `sync_wal` / `compact_inner` calls
4186
+ // know to flush them.
4187
+ self.wal_ops_since_sync = self.wal_ops_since_sync.saturating_add(ops.len());
4188
+ self.append_wal_batch_inner(ops, false)
4189
+ }
4190
+
4191
+ /// Append a WAL batch. Reuses a cached `BufWriter<File>` across calls so
4192
+ /// the WAL file is only opened once per database session — saving the
4193
+ /// `open()` syscall on every single `insert` call, which matters when
4194
+ /// per-record overhead is the bottleneck.
4195
+ fn append_wal_batch_inner(&mut self, ops: &[WalOp], sync: bool) -> Result<()> {
3501
4196
  if let Some(parent) = self.wal_path.parent() {
3502
4197
  if !parent.as_os_str().is_empty() {
3503
4198
  fs::create_dir_all(parent)?;
3504
4199
  }
3505
4200
  }
3506
4201
 
3507
- let new_file = !self.wal_path.exists();
3508
- let mut file = OpenOptions::new()
3509
- .create(true)
3510
- .append(true)
3511
- .open(&self.wal_path)?;
3512
-
3513
- if new_file {
3514
- file.write_all(WAL_MAGIC)?;
4202
+ // Lazily create the cached BufWriter, writing the WAL_MAGIC header
4203
+ // on first use of a brand-new file.
4204
+ if self.wal_writer.is_none() {
4205
+ let new_file = !self.wal_path.exists();
4206
+ let file = OpenOptions::new()
4207
+ .create(true)
4208
+ .append(true)
4209
+ .open(&self.wal_path)?;
4210
+ let mut writer = BufWriter::with_capacity(64 * 1024, file);
4211
+ if new_file {
4212
+ writer.write_all(WAL_MAGIC)?;
4213
+ }
4214
+ self.wal_writer = Some(writer);
3515
4215
  }
3516
4216
 
4217
+ // Serialise the batch into a temporary buffer first, so that the
4218
+ // single `write_all` we issue to the cached writer is one contiguous
4219
+ // user-space copy (BufWriter then bunches everything up further).
3517
4220
  let mut buffer = Vec::new();
3518
4221
  write_u32(&mut buffer, u32_from_usize(ops.len())?)?;
3519
4222
  for op in ops {
3520
4223
  write_wal_op(&mut buffer, op)?;
3521
4224
  }
3522
4225
 
3523
- write_u32(&mut file, u32_from_usize(buffer.len())?)?;
3524
- file.write_all(&buffer)?;
4226
+ let writer = self.wal_writer.as_mut().unwrap();
4227
+ write_u32(writer, u32_from_usize(buffer.len())?)?;
4228
+ writer.write_all(&buffer)?;
4229
+
4230
+ if sync {
4231
+ // Flush BufWriter into the OS, then ask the kernel to make the
4232
+ // bytes durable. We must `flush()` before `sync_all()` — sync_all
4233
+ // only operates on what's already in the kernel's page cache.
4234
+ writer.flush()?;
4235
+ writer.get_ref().sync_all()?;
4236
+ }
4237
+ Ok(())
4238
+ }
4239
+
4240
+ /// Force a durability fence on the WAL file. Flushes any buffered bytes
4241
+ /// from the cached writer and asks the kernel to make them durable in a
4242
+ /// single `sync_all`. Used by `bulk_ingest`, `flush`, `close`, and as a
4243
+ /// manual fence when running in `EveryN` or `OnFlush` mode.
4244
+ fn sync_wal(&mut self) -> Result<()> {
4245
+ if let Some(writer) = self.wal_writer.as_mut() {
4246
+ writer.flush()?;
4247
+ writer.get_ref().sync_all()?;
4248
+ self.wal_ops_since_sync = 0;
4249
+ return Ok(());
4250
+ }
4251
+ // Fallback: no cached writer (e.g. WAL was opened externally). Open
4252
+ // the file briefly just to issue the sync.
4253
+ if !self.wal_path.exists() {
4254
+ self.wal_ops_since_sync = 0;
4255
+ return Ok(());
4256
+ }
4257
+ let file = OpenOptions::new().append(true).open(&self.wal_path)?;
3525
4258
  file.sync_all()?;
4259
+ self.wal_ops_since_sync = 0;
3526
4260
  Ok(())
3527
4261
  }
3528
4262
 
@@ -3576,7 +4310,12 @@ impl Database {
3576
4310
  Ok(())
3577
4311
  }
3578
4312
 
3579
- fn clear_wal(&self) -> Result<()> {
4313
+ fn clear_wal(&mut self) -> Result<()> {
4314
+ // Drop the cached writer first: on POSIX the file would survive the
4315
+ // unlink because we still hold an open handle, but we'd then keep
4316
+ // appending into the now-detached inode and never see those bytes on
4317
+ // disk after reopen.
4318
+ self.wal_writer = None;
3580
4319
  if self.wal_path.exists() {
3581
4320
  fs::remove_file(&self.wal_path)?;
3582
4321
  }
@@ -3688,6 +4427,12 @@ impl Database {
3688
4427
  ann_loaded_from_disk: false,
3689
4428
  read_only: false,
3690
4429
  _lock_file: None,
4430
+ wal_writer: None,
4431
+ wal_sync_mode: WalSyncMode::default(),
4432
+ wal_ops_since_sync: 0,
4433
+ ann_dirty: false,
4434
+ quantized_dirty: false,
4435
+ multi_vector_quantized_dirty: false,
3691
4436
  quantized: None,
3692
4437
  quantization_config: None,
3693
4438
  quantized_keys: Vec::new(),
@@ -3696,6 +4441,9 @@ impl Database {
3696
4441
  multi_vector_quantized_keys: BTreeMap::new(),
3697
4442
  payload_index_defs: BTreeMap::new(),
3698
4443
  payload_indexes: BTreeMap::new(),
4444
+ index_config: IndexConfig::default(),
4445
+ vector_arena: None,
4446
+ vector_arena_dirty: false,
3699
4447
  })
3700
4448
  }
3701
4449
 
@@ -3833,6 +4581,237 @@ impl Database {
3833
4581
  Ok(())
3834
4582
  }
3835
4583
 
4584
+ /// Incremental ANN update. Appends the given new records into the
4585
+ /// existing HNSW graph(s) without rebuilding them from scratch.
4586
+ ///
4587
+ /// Preconditions:
4588
+ /// - `new_keys` are keys that already live in `self.records` (caller
4589
+ /// must have applied the WAL ops to memory first).
4590
+ /// - Each key referenced by `new_keys` did NOT previously exist in
4591
+ /// `self.records` (i.e. it's a true insert, not an update).
4592
+ ///
4593
+ /// Behaviour per (namespace, vector_name) "slot":
4594
+ /// - If a graph already exists, the new vectors are appended to it
4595
+ /// via single-element `hnsw.insert` calls (or `parallel_insert` if
4596
+ /// the batch is large enough to amortise thread overhead).
4597
+ /// - If no graph exists but the total record count for that slot has
4598
+ /// now crossed `ANN_MIN_POINTS`, a fresh graph is built from all
4599
+ /// matching records.
4600
+ /// - Below `ANN_MIN_POINTS`, we skip — searches will brute-force
4601
+ /// without harm.
4602
+ fn ann_apply_incremental(&mut self, new_keys: &[RecordKey]) {
4603
+ if new_keys.is_empty() {
4604
+ return;
4605
+ }
4606
+ let cfg = self.index_config;
4607
+
4608
+ // Group the new records by (Option<namespace>, vector_name). Each
4609
+ // upserted record contributes to exactly one global slot and one
4610
+ // namespace-scoped slot per dense vector it owns.
4611
+ let mut groups: BTreeMap<(Option<String>, String), Vec<(RecordKey, Vec<f32>)>> =
4612
+ BTreeMap::new();
4613
+ for key in new_keys {
4614
+ let Some(record) = self.records.get(key) else {
4615
+ continue;
4616
+ };
4617
+ for (vector_name, vector) in record.dense_vectors() {
4618
+ let item = (key.clone(), vector.clone());
4619
+ groups
4620
+ .entry((None, vector_name.to_owned()))
4621
+ .or_default()
4622
+ .push(item.clone());
4623
+ groups
4624
+ .entry((Some(record.namespace.clone()), vector_name.to_owned()))
4625
+ .or_default()
4626
+ .push(item);
4627
+ }
4628
+ }
4629
+
4630
+ // Two-phase processing to keep the borrow checker happy:
4631
+ // phase 1: classify each slot (needs fresh build vs incremental
4632
+ // append), reading `self.records` only.
4633
+ // phase 2: mutate `self.ann` based on the classifications.
4634
+ let mut fresh_builds: Vec<((Option<String>, String), Vec<(RecordKey, Vec<f32>)>)> =
4635
+ Vec::new();
4636
+ let mut incremental: Vec<((Option<String>, String), Vec<(RecordKey, Vec<f32>)>)> =
4637
+ Vec::new();
4638
+
4639
+ for ((opt_ns, vector_name), new_items) in groups {
4640
+ let has_existing = match &opt_ns {
4641
+ None => self.ann.global.contains_key(&vector_name),
4642
+ Some(ns) => self
4643
+ .ann
4644
+ .namespaces
4645
+ .get(ns)
4646
+ .map_or(false, |m| m.contains_key(&vector_name)),
4647
+ };
4648
+
4649
+ if has_existing {
4650
+ incremental.push(((opt_ns, vector_name), new_items));
4651
+ continue;
4652
+ }
4653
+
4654
+ // Count matching records (post-insert state) to decide whether
4655
+ // we've crossed the build threshold.
4656
+ let total = self
4657
+ .records
4658
+ .iter()
4659
+ .filter(|(_, r)| match &opt_ns {
4660
+ Some(ns) => r.namespace == *ns,
4661
+ None => true,
4662
+ })
4663
+ .filter(|(_, r)| {
4664
+ r.dense_vectors()
4665
+ .any(|(name, _)| name == vector_name.as_str())
4666
+ })
4667
+ .count();
4668
+
4669
+ if total < ANN_MIN_POINTS {
4670
+ continue;
4671
+ }
4672
+
4673
+ // Need to build a fresh graph for this slot. Collect ALL matching
4674
+ // records (not just the new ones) — owned clones so the build
4675
+ // step doesn't borrow `self.records`.
4676
+ let mut all_items: Vec<(RecordKey, Vec<f32>)> = Vec::with_capacity(total);
4677
+ for (k, r) in &self.records {
4678
+ if let Some(ns) = &opt_ns {
4679
+ if r.namespace != *ns {
4680
+ continue;
4681
+ }
4682
+ }
4683
+ for (name, vec) in r.dense_vectors() {
4684
+ if name == vector_name.as_str() {
4685
+ all_items.push((k.clone(), vec.clone()));
4686
+ break;
4687
+ }
4688
+ }
4689
+ }
4690
+ let _ = new_items; // already folded into `all_items`
4691
+ fresh_builds.push(((opt_ns, vector_name), all_items));
4692
+ }
4693
+
4694
+ // Phase 2a: build-from-scratch for slots that just crossed the
4695
+ // threshold.
4696
+ for ((opt_ns, vector_name), all_items) in fresh_builds {
4697
+ let records_for_build: Vec<(RecordKey, &Vec<f32>)> =
4698
+ all_items.iter().map(|(k, v)| (k.clone(), v)).collect();
4699
+ let new_index = build_ann_index(records_for_build, self.metric, &cfg);
4700
+ match opt_ns {
4701
+ None => {
4702
+ self.ann.global.insert(vector_name, new_index);
4703
+ }
4704
+ Some(ns) => {
4705
+ self.ann
4706
+ .namespaces
4707
+ .entry(ns)
4708
+ .or_default()
4709
+ .insert(vector_name, new_index);
4710
+ }
4711
+ }
4712
+ }
4713
+
4714
+ // Phase 2b: incremental appends into existing graphs.
4715
+ for ((opt_ns, vector_name), new_items) in incremental {
4716
+ let idx_opt = match &opt_ns {
4717
+ None => self.ann.global.get_mut(&vector_name),
4718
+ Some(ns) => self
4719
+ .ann
4720
+ .namespaces
4721
+ .get_mut(ns)
4722
+ .and_then(|m| m.get_mut(&vector_name)),
4723
+ };
4724
+ let Some(idx) = idx_opt else {
4725
+ continue;
4726
+ };
4727
+
4728
+ // Index build and reload leave each hnsw_rs graph in "searching
4729
+ // mode" (a hint that skips some bookkeeping in the data layer).
4730
+ // Re-enable mutation mode before we insert — cheap toggle.
4731
+ idx.hnsw.set_searching_mode(false);
4732
+
4733
+ if new_items.len() >= cfg.parallel_insert_threshold {
4734
+ let start_id = idx.keys.len();
4735
+ let batch: Vec<(&Vec<f32>, usize)> = new_items
4736
+ .iter()
4737
+ .enumerate()
4738
+ .map(|(offset, (_, v))| (v, start_id + offset))
4739
+ .collect();
4740
+ idx.hnsw.parallel_insert_batch(&batch);
4741
+ for (offset, (k, _)) in new_items.into_iter().enumerate() {
4742
+ let origin_id = start_id + offset;
4743
+ idx.key_to_origin.insert(k.clone(), origin_id);
4744
+ idx.keys.push(k);
4745
+ }
4746
+ } else {
4747
+ for (key, vector) in new_items {
4748
+ let origin_id = idx.keys.len();
4749
+ idx.key_to_origin.insert(key.clone(), origin_id);
4750
+ idx.keys.push(key);
4751
+ idx.hnsw.insert_one(vector.as_slice(), origin_id);
4752
+ }
4753
+ }
4754
+ }
4755
+ }
4756
+
4757
+ /// Append newly-inserted vectors to the contiguous arena. Caller must
4758
+ /// have already inserted the records into `self.records` and confirmed
4759
+ /// the arena exists and isn't dirty.
4760
+ fn arena_apply_incremental(&mut self, new_keys: &[RecordKey]) {
4761
+ let Some(arena) = self.vector_arena.as_mut() else {
4762
+ return;
4763
+ };
4764
+ for key in new_keys {
4765
+ if let Some(record) = self.records.get(key) {
4766
+ arena.append(key.clone(), &record.vector);
4767
+ }
4768
+ }
4769
+ }
4770
+
4771
+ /// Ensure the contiguous arena is materialised and fresh. Cheap when
4772
+ /// already clean; rebuilds from `self.records` (in BTreeMap order) on
4773
+ /// first call, whenever it is marked dirty, or on a dimension mismatch. Allocates `dim * N` f32s.
4774
+ fn ensure_vector_arena(&mut self) -> &VectorArena {
4775
+ let needs_build = self
4776
+ .vector_arena
4777
+ .as_ref()
4778
+ .map_or(true, |a| self.vector_arena_dirty || a.dim != self.dimension);
4779
+ if needs_build {
4780
+ self.vector_arena = Some(VectorArena::rebuild_from(&self.records, self.dimension));
4781
+ self.vector_arena_dirty = false;
4782
+ }
4783
+ self.vector_arena.as_ref().unwrap()
4784
+ }
4785
+
4786
+ /// Mark the given record keys as deleted in every HNSW graph they live
4787
+ /// in. The graph itself is not modified — search filters tombstoned
4788
+ /// `origin_id`s. A subsequent `compact()` will rebuild any graph whose
4789
+ /// dead ratio exceeds `IndexConfig.tombstone_rebuild_pct`.
4790
+ fn ann_apply_tombstones(&mut self, deleted_keys: &[RecordKey]) {
4791
+ if deleted_keys.is_empty() {
4792
+ return;
4793
+ }
4794
+ for key in deleted_keys {
4795
+ // Global graphs (per vector_name): every graph that contains
4796
+ // this key gets the corresponding origin_id tombstoned.
4797
+ for (_, idx) in self.ann.global.iter_mut() {
4798
+ if let Some(&origin_id) = idx.key_to_origin.get(key) {
4799
+ idx.tombstones.insert(origin_id);
4800
+ }
4801
+ }
4802
+ // Per-namespace graphs: only the namespace this key belongs to
4803
+ // has a chance of containing it, but checking all of them is
4804
+ // fine — `key_to_origin.get` is O(1) and misses immediately.
4805
+ for (_, indexes) in self.ann.namespaces.iter_mut() {
4806
+ for (_, idx) in indexes.iter_mut() {
4807
+ if let Some(&origin_id) = idx.key_to_origin.get(key) {
4808
+ idx.tombstones.insert(origin_id);
4809
+ }
4810
+ }
4811
+ }
4812
+ }
4813
+ }
4814
+
3836
4815
  fn rebuild_ann(&mut self) {
3837
4816
  self.ann = AnnCatalog::default();
3838
4817
  let mut global_by_vector: BTreeMap<String, Vec<(RecordKey, &Vec<f32>)>> = BTreeMap::new();
@@ -3854,13 +4833,14 @@ impl Database {
3854
4833
  }
3855
4834
  }
3856
4835
 
4836
+ let cfg = self.index_config;
3857
4837
  self.ann.global = global_by_vector
3858
4838
  .into_iter()
3859
4839
  .filter_map(|(vector_name, records)| {
3860
4840
  if records.len() < ANN_MIN_POINTS {
3861
4841
  None
3862
4842
  } else {
3863
- Some((vector_name, build_ann_index(records, self.metric)))
4843
+ Some((vector_name, build_ann_index(records, self.metric, &cfg)))
3864
4844
  }
3865
4845
  })
3866
4846
  .collect();
@@ -3874,7 +4854,7 @@ impl Database {
3874
4854
  if records.len() < ANN_MIN_POINTS {
3875
4855
  None
3876
4856
  } else {
3877
- Some((vector_name, build_ann_index(records, self.metric)))
4857
+ Some((vector_name, build_ann_index(records, self.metric, &cfg)))
3878
4858
  }
3879
4859
  })
3880
4860
  .collect::<BTreeMap<_, _>>();
@@ -3919,6 +4899,24 @@ impl Database {
3919
4899
  return false;
3920
4900
  }
3921
4901
 
4902
+ // For ANN2 manifests, use the persisted keys verbatim — they
4903
+ // match the `origin_id`s baked into the HNSW graph file. For
4904
+ // ANN1 (no persisted keys), fall back to the recomputed
4905
+ // BTreeMap-ordered list, which matches the way ANN1 graphs were
4906
+ // always built.
4907
+ let keys = if manifest_entry.keys.is_empty() {
4908
+ expected_entry.keys.clone()
4909
+ } else {
4910
+ // Defensive: persisted keys length must agree with the
4911
+ // declared record_count and the live record set, else the
4912
+ // manifest is inconsistent and we'd rather rebuild than
4913
+ // serve wrong neighbours.
4914
+ if manifest_entry.keys.len() != manifest_entry.record_count {
4915
+ return false;
4916
+ }
4917
+ manifest_entry.keys.clone()
4918
+ };
4919
+
3922
4920
  let Some(index) = load_ann_index(
3923
4921
  parent,
3924
4922
  &ann_basename(
@@ -3926,7 +4924,7 @@ impl Database {
3926
4924
  expected_entry.namespace.as_deref(),
3927
4925
  &expected_entry.vector_name,
3928
4926
  ),
3929
- expected_entry.keys.clone(),
4927
+ keys,
3930
4928
  self.metric,
3931
4929
  ) else {
3932
4930
  return false;
@@ -3957,7 +4955,11 @@ impl Database {
3957
4955
  return Ok(());
3958
4956
  }
3959
4957
 
3960
- let entries = self.expected_ann_entries();
4958
+ // Use `actual_ann_entries` (NOT `expected_ann_entries`) so the
4959
+ // persisted keys array matches the order the HNSW graph stored its
4960
+ // `origin_id`s in. After incremental inserts the in-memory keys vec
4961
+ // is in insertion order, which usually differs from BTreeMap order.
4962
+ let entries = self.actual_ann_entries();
3961
4963
  for entry in &entries {
3962
4964
  let basename = ann_basename(&self.path, entry.namespace.as_deref(), &entry.vector_name);
3963
4965
  let graph_path = parent.join(format!("{basename}.hnsw.graph"));
@@ -3985,6 +4987,41 @@ impl Database {
3985
4987
  write_ann_manifest(&ann_manifest_path(&self.path), &entries)
3986
4988
  }
3987
4989
 
4990
+ /// Like `expected_ann_entries`, but populates each entry's `keys` field
4991
+ /// from the actual in-memory `AnnIndex.keys` array (insertion order).
4992
+ /// This is what gets serialised into the ANN2 manifest, and matches the
4993
+ /// `origin_id`s baked into the dumped HNSW graph files.
4994
+ fn actual_ann_entries(&self) -> Vec<AnnManifestEntry> {
4995
+ let mut entries = Vec::new();
4996
+ for (vector_name, index) in &self.ann.global {
4997
+ if index.keys.len() < ANN_MIN_POINTS {
4998
+ continue;
4999
+ }
5000
+ entries.push(AnnManifestEntry {
5001
+ namespace: None,
5002
+ vector_name: vector_name.clone(),
5003
+ record_count: index.keys.len(),
5004
+ key_signature: record_key_signature(&index.keys),
5005
+ keys: index.keys.clone(),
5006
+ });
5007
+ }
5008
+ for (namespace, indexes) in &self.ann.namespaces {
5009
+ for (vector_name, index) in indexes {
5010
+ if index.keys.len() < ANN_MIN_POINTS {
5011
+ continue;
5012
+ }
5013
+ entries.push(AnnManifestEntry {
5014
+ namespace: Some(namespace.clone()),
5015
+ vector_name: vector_name.clone(),
5016
+ record_count: index.keys.len(),
5017
+ key_signature: record_key_signature(&index.keys),
5018
+ keys: index.keys.clone(),
5019
+ });
5020
+ }
5021
+ }
5022
+ entries
5023
+ }
5024
+
3988
5025
  fn expected_ann_entries(&self) -> Vec<AnnManifestEntry> {
3989
5026
  let mut global: BTreeMap<String, Vec<RecordKey>> = BTreeMap::new();
3990
5027
  let mut by_namespace: BTreeMap<String, BTreeMap<String, Vec<RecordKey>>> = BTreeMap::new();
@@ -4196,21 +5233,45 @@ impl Database {
4196
5233
  .global
4197
5234
  .get(vector_name.unwrap_or(DEFAULT_VECTOR_NAME)),
4198
5235
  }?;
4199
- if index.keys.len() < ANN_SEARCH_MIN_POINTS {
5236
+ // Gate on live (non-tombstoned) record count: if half the graph is
5237
+ // dead, treat the live half as if it were the whole corpus.
5238
+ let live = index.live_count();
5239
+ if live < ANN_SEARCH_MIN_POINTS {
4200
5240
  return None;
4201
5241
  }
4202
5242
 
4203
- let candidate_count = candidate_count(top_k, index.keys.len());
5243
+ let candidate_count = candidate_count(top_k, live);
4204
5244
  if candidate_count == 0 {
4205
5245
  return None;
4206
5246
  }
4207
5247
 
4208
- let ef_search = candidate_count.max(ANN_EF_CONSTRUCTION);
4209
- let neighbours = index.hnsw.search(query, candidate_count, ef_search);
5248
+ // ef_search controls recall vs latency at query time. When the user
5249
+ // explicitly sets `IndexConfig.ef_search`, honour it (floored at candidate_count).
5250
+ // Otherwise default to max(candidate_count, ef_construction) which is
5251
+ // a conservative high-recall heuristic.
5252
+ let mut ef_search = match self.index_config.ef_search {
5253
+ Some(ef) => ef.max(candidate_count),
5254
+ None => candidate_count.max(self.index_config.ef_construction),
5255
+ };
5256
+ // Over-fetch to compensate for tombstoned candidates we'll drop. Cap
5257
+ // at the graph's total point count (`keys.len()`, tombstoned entries included) so we don't waste work; we'd never get more
5258
+ // distinct results than that anyway.
5259
+ if !index.tombstones.is_empty() {
5260
+ let dead = index.tombstones.len();
5261
+ ef_search = ef_search
5262
+ .saturating_add(dead.min(ef_search))
5263
+ .min(index.keys.len());
5264
+ }
5265
+ let fetch_count = candidate_count
5266
+ .saturating_add(index.tombstones.len().min(candidate_count))
5267
+ .min(index.keys.len());
5268
+ let neighbours = index.hnsw.search(query, fetch_count, ef_search);
4210
5269
  Some(
4211
5270
  neighbours
4212
5271
  .into_iter()
5272
+ .filter(|n| !index.tombstones.contains(&n.d_id))
4213
5273
  .filter_map(|neighbour| index.keys.get(neighbour.d_id).cloned())
5274
+ .take(candidate_count)
4214
5275
  .collect(),
4215
5276
  )
4216
5277
  }
@@ -4475,28 +5536,51 @@ fn score_dense_prefix(
4475
5536
  metric.score(&left[..dimension], &right[..dimension])
4476
5537
  }
4477
5538
 
4478
- fn build_ann_index(records: Vec<(RecordKey, &Vec<f32>)>, metric: DistanceMetric) -> AnnIndex {
5539
+ fn build_ann_index(
5540
+ records: Vec<(RecordKey, &Vec<f32>)>,
5541
+ metric: DistanceMetric,
5542
+ config: &IndexConfig,
5543
+ ) -> AnnIndex {
4479
5544
  let max_layer = compute_hnsw_layers(records.len());
4480
5545
  let count = records.len();
5546
+ let use_parallel = count >= config.parallel_insert_threshold;
4481
5547
 
4482
5548
  macro_rules! build_hnsw {
4483
5549
  ($dist_type:ty, $dist_val:expr, $variant:ident) => {{
4484
5550
  let mut hnsw = Hnsw::<f32, $dist_type>::new(
4485
- ANN_M,
4486
- count,
5551
+ config.m,
5552
+ count.max(1),
4487
5553
  max_layer,
4488
- ANN_EF_CONSTRUCTION,
5554
+ config.ef_construction,
4489
5555
  $dist_val,
4490
5556
  );
4491
5557
  let mut keys = Vec::with_capacity(count);
4492
- for (origin_id, (key, vector)) in records.into_iter().enumerate() {
4493
- hnsw.insert((vector.as_slice(), origin_id));
4494
- keys.push(key);
5558
+ let mut key_to_origin = HashMap::with_capacity(count);
5559
+ if use_parallel {
5560
+ // hnsw_rs's `parallel_insert` takes `&[(&Vec<T>, usize)]`
5561
+ // (the API is built around owned-Vec borrows) and uses Rayon
5562
+ // internally so the dominant cost (distance calculations
5563
+ // during graph neighbour selection) is multi-threaded.
5564
+ let mut batch: Vec<(&Vec<f32>, usize)> = Vec::with_capacity(count);
5565
+ for (origin_id, (key, vector)) in records.into_iter().enumerate() {
5566
+ batch.push((vector, origin_id));
5567
+ key_to_origin.insert(key.clone(), origin_id);
5568
+ keys.push(key);
5569
+ }
5570
+ hnsw.parallel_insert(&batch);
5571
+ } else {
5572
+ for (origin_id, (key, vector)) in records.into_iter().enumerate() {
5573
+ hnsw.insert((vector.as_slice(), origin_id));
5574
+ key_to_origin.insert(key.clone(), origin_id);
5575
+ keys.push(key);
5576
+ }
4495
5577
  }
4496
5578
  hnsw.set_searching_mode(true);
4497
5579
  AnnIndex {
4498
5580
  hnsw: AnnHnsw::$variant(hnsw),
4499
5581
  keys,
5582
+ key_to_origin,
5583
+ tombstones: HashSet::new(),
4500
5584
  }
4501
5585
  }};
4502
5586
  }
@@ -4691,9 +5775,21 @@ fn hex_encode(bytes: &[u8]) -> String {
4691
5775
  out
4692
5776
  }
4693
5777
 
5778
+ /// Order-independent FNV-1a hash over a set of record keys. We sort first so
5779
+ /// the signature only depends on the SET of keys, not the order they were
5780
+ /// inserted. Callers can use this to check whether a persisted ANN graph
5781
+ /// matches the live record set regardless of whether the live `keys` vec is
5782
+ /// BTreeMap-ordered (full rebuild) or insertion-ordered (incremental
5783
+ /// updates).
5784
+ ///
5785
+ /// Historical note: previously the input was always BTreeMap-iterated and
5786
+ /// therefore already sorted, so the sort step is a no-op for old ANN1
5787
+ /// manifests — backwards compatible.
4694
5788
  fn record_key_signature(keys: &[RecordKey]) -> u64 {
5789
+ let mut sorted: Vec<&RecordKey> = keys.iter().collect();
5790
+ sorted.sort();
4695
5791
  let mut state = 0xcbf29ce484222325_u64;
4696
- for (namespace, id) in keys {
5792
+ for (namespace, id) in sorted {
4697
5793
  for byte in namespace
4698
5794
  .as_bytes()
4699
5795
  .iter()
@@ -4720,9 +5816,16 @@ fn load_ann_index(
4720
5816
  ($dist_val:expr, $variant:ident) => {{
4721
5817
  let mut hnsw = reloader.load_hnsw_with_dist($dist_val).ok()?;
4722
5818
  hnsw.set_searching_mode(true);
5819
+ let key_to_origin = keys
5820
+ .iter()
5821
+ .enumerate()
5822
+ .map(|(i, k)| (k.clone(), i))
5823
+ .collect();
4723
5824
  Some(AnnIndex {
4724
5825
  hnsw: AnnHnsw::$variant(hnsw),
4725
5826
  keys,
5827
+ key_to_origin,
5828
+ tombstones: HashSet::new(),
4726
5829
  })
4727
5830
  }};
4728
5831
  }
@@ -4735,9 +5838,16 @@ fn load_ann_index(
4735
5838
  }
4736
5839
  }
4737
5840
 
5841
+ /// Write the ANN sidecar manifest. We use format `ANN2`, which (compared to
5842
+ /// the original `ANN1`) also serialises the actual key array per index in
5843
+ /// the order the HNSW knows its `origin_id`s. This is required for
5844
+ /// incremental insertion: without it, a reload would associate the wrong
5845
+ /// (BTreeMap-ordered) record key with each HNSW origin_id whenever the
5846
+ /// in-memory key array isn't sorted (which happens any time we incrementally
5847
+ /// append).
4738
5848
  fn write_ann_manifest(path: &Path, entries: &[AnnManifestEntry]) -> Result<()> {
4739
- let mut file = File::create(path)?;
4740
- file.write_all(b"ANN1")?;
5849
+ let mut file = BufWriter::new(File::create(path)?);
5850
+ file.write_all(b"ANN2")?;
4741
5851
  write_u32(&mut file, u32_from_usize(entries.len())?)?;
4742
5852
  for entry in entries {
4743
5853
  write_u8(&mut file, u8::from(entry.namespace.is_some()))?;
@@ -4747,8 +5857,15 @@ fn write_ann_manifest(path: &Path, entries: &[AnnManifestEntry]) -> Result<()> {
4747
5857
  write_string(&mut file, &entry.vector_name)?;
4748
5858
  write_u64(&mut file, u64_from_usize(entry.record_count)?)?;
4749
5859
  write_u64(&mut file, entry.key_signature)?;
5860
+ // ANN2 addition: the full keys array in insertion order.
5861
+ write_u64(&mut file, u64_from_usize(entry.keys.len())?)?;
5862
+ for (ns, id) in &entry.keys {
5863
+ write_string(&mut file, ns)?;
5864
+ write_string(&mut file, id)?;
5865
+ }
4750
5866
  }
4751
- file.sync_all()?;
5867
+ file.flush()?;
5868
+ file.get_ref().sync_all()?;
4752
5869
  Ok(())
4753
5870
  }
4754
5871
 
@@ -4756,11 +5873,15 @@ fn read_ann_manifest(path: &Path) -> Result<Vec<AnnManifestEntry>> {
4756
5873
  let mut file = BufReader::new(File::open(path)?);
4757
5874
  let mut magic = [0_u8; 4];
4758
5875
  file.read_exact(&mut magic)?;
4759
- if &magic != b"ANN1" {
4760
- return Err(VectLiteError::InvalidFormat(
4761
- "invalid ANN manifest".to_owned(),
4762
- ));
4763
- }
5876
+ let version = match &magic {
5877
+ b"ANN1" => 1u8,
5878
+ b"ANN2" => 2u8,
5879
+ _ => {
5880
+ return Err(VectLiteError::InvalidFormat(
5881
+ "invalid ANN manifest".to_owned(),
5882
+ ));
5883
+ }
5884
+ };
4764
5885
 
4765
5886
  let count = usize_from_u32(read_u32(&mut file)?)?;
4766
5887
  let mut entries = Vec::with_capacity(count);
@@ -4774,12 +5895,27 @@ fn read_ann_manifest(path: &Path) -> Result<Vec<AnnManifestEntry>> {
4774
5895
  let vector_name = read_string(&mut file)?;
4775
5896
  let record_count = usize_from_u64(read_u64(&mut file)?)?;
4776
5897
  let key_signature = read_u64(&mut file)?;
5898
+ let keys = if version >= 2 {
5899
+ let n = usize_from_u64(read_u64(&mut file)?)?;
5900
+ let mut keys = Vec::with_capacity(n);
5901
+ for _ in 0..n {
5902
+ let ns = read_string(&mut file)?;
5903
+ let id = read_string(&mut file)?;
5904
+ keys.push((ns, id));
5905
+ }
5906
+ keys
5907
+ } else {
5908
+ // ANN1 had no persisted keys; caller falls back to recomputing
5909
+ // them from `self.records` (which yields BTreeMap-sorted keys,
5910
+ // matching the order ANN1 indexes were always built in).
5911
+ Vec::new()
5912
+ };
4777
5913
  entries.push(AnnManifestEntry {
4778
5914
  namespace,
4779
5915
  vector_name,
4780
5916
  record_count,
4781
5917
  key_signature,
4782
- keys: Vec::new(),
5918
+ keys,
4783
5919
  });
4784
5920
  }
4785
5921
  Ok(entries)