vectlite 0.9.0 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +26 -14
- package/index.d.ts +56 -0
- package/index.js +132 -8
- package/native/Cargo.toml +1 -1
- package/native/src/lib.rs +80 -47
- package/native/vectlite-core/Cargo.toml +1 -1
- package/native/vectlite-core/src/lib.rs +512 -152
- package/native/vectlite-core/src/quantization.rs +234 -49
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/vectlite.node +0 -0
- package/prebuilds/darwin-x64/vectlite.node +0 -0
- package/prebuilds/linux-x64-gnu/vectlite.node +0 -0
- package/prebuilds/win32-x64-msvc/vectlite.node +0 -0
|
@@ -14,6 +14,7 @@ use simsimd::SpatialSimilarity;
|
|
|
14
14
|
|
|
15
15
|
use quantization::{
|
|
16
16
|
MultiVectorQuantizationConfig, MultiVectorQuantizedIndex, QuantizationConfig, QuantizedIndex,
|
|
17
|
+
valid_product_num_sub_vectors, validate_quantization_config,
|
|
17
18
|
};
|
|
18
19
|
|
|
19
20
|
const MAGIC: &[u8; 4] = b"VDB1";
|
|
@@ -233,7 +234,10 @@ fn scalar_manhattan_distance(left: &[f32], right: &[f32]) -> f32 {
|
|
|
233
234
|
#[derive(Clone, Debug)]
|
|
234
235
|
enum WalOp {
|
|
235
236
|
Upsert(Record),
|
|
236
|
-
Delete {
|
|
237
|
+
Delete {
|
|
238
|
+
namespace: String,
|
|
239
|
+
id: String,
|
|
240
|
+
},
|
|
237
241
|
UpdateMetadata {
|
|
238
242
|
namespace: String,
|
|
239
243
|
id: String,
|
|
@@ -1147,7 +1151,10 @@ impl NumericIndex {
|
|
|
1147
1151
|
/// Return keys where value > threshold.
|
|
1148
1152
|
fn range_gt(&self, threshold: f64) -> HashSet<RecordKey> {
|
|
1149
1153
|
let mut result = HashSet::new();
|
|
1150
|
-
for (_, set) in self.tree.range((
|
|
1154
|
+
for (_, set) in self.tree.range((
|
|
1155
|
+
std::ops::Bound::Excluded(OrdF64(threshold)),
|
|
1156
|
+
std::ops::Bound::Unbounded,
|
|
1157
|
+
)) {
|
|
1151
1158
|
result.extend(set.iter().cloned());
|
|
1152
1159
|
}
|
|
1153
1160
|
result
|
|
@@ -1255,9 +1262,9 @@ impl AnnHnsw {
|
|
|
1255
1262
|
AnnHnsw::DotProduct(h) => h.file_dump(directory, basename),
|
|
1256
1263
|
AnnHnsw::Manhattan(h) => h.file_dump(directory, basename),
|
|
1257
1264
|
};
|
|
1258
|
-
result
|
|
1259
|
-
|
|
1260
|
-
|
|
1265
|
+
result.map(|_| ()).map_err(|err| {
|
|
1266
|
+
VectLiteError::InvalidFormat(format!("failed to persist ANN index: {err}"))
|
|
1267
|
+
})
|
|
1261
1268
|
}
|
|
1262
1269
|
}
|
|
1263
1270
|
|
|
@@ -1514,9 +1521,7 @@ impl Database {
|
|
|
1514
1521
|
.filter_map(|key| self.records.get(key))
|
|
1515
1522
|
.filter(|record| {
|
|
1516
1523
|
!record.is_expired_at(now)
|
|
1517
|
-
&& filter
|
|
1518
|
-
.map(|f| f.matches(&record.metadata))
|
|
1519
|
-
.unwrap_or(true)
|
|
1524
|
+
&& filter.map(|f| f.matches(&record.metadata)).unwrap_or(true)
|
|
1520
1525
|
})
|
|
1521
1526
|
.count()
|
|
1522
1527
|
} else {
|
|
@@ -1563,9 +1568,7 @@ impl Database {
|
|
|
1563
1568
|
.filter_map(|key| self.records.get(*key))
|
|
1564
1569
|
.filter(|record| {
|
|
1565
1570
|
!record.is_expired_at(now)
|
|
1566
|
-
&& filter
|
|
1567
|
-
.map(|f| f.matches(&record.metadata))
|
|
1568
|
-
.unwrap_or(true)
|
|
1571
|
+
&& filter.map(|f| f.matches(&record.metadata)).unwrap_or(true)
|
|
1569
1572
|
})
|
|
1570
1573
|
.skip(offset)
|
|
1571
1574
|
.take(if limit == 0 { usize::MAX } else { limit })
|
|
@@ -1659,9 +1662,7 @@ impl Database {
|
|
|
1659
1662
|
|
|
1660
1663
|
let next_cursor = if results.len() > limit {
|
|
1661
1664
|
results.pop(); // remove the extra
|
|
1662
|
-
results
|
|
1663
|
-
.last()
|
|
1664
|
-
.map(|r| format!("{}\0{}", r.namespace, r.id))
|
|
1665
|
+
results.last().map(|r| format!("{}\0{}", r.namespace, r.id))
|
|
1665
1666
|
} else {
|
|
1666
1667
|
None
|
|
1667
1668
|
};
|
|
@@ -1710,11 +1711,7 @@ impl Database {
|
|
|
1710
1711
|
///
|
|
1711
1712
|
/// Returns `true` if the record exists and was updated, `false` if the
|
|
1712
1713
|
/// record was not found (no error is raised).
|
|
1713
|
-
pub fn update_metadata(
|
|
1714
|
-
&mut self,
|
|
1715
|
-
id: impl Into<String>,
|
|
1716
|
-
metadata: Metadata,
|
|
1717
|
-
) -> Result<bool> {
|
|
1714
|
+
pub fn update_metadata(&mut self, id: impl Into<String>, metadata: Metadata) -> Result<bool> {
|
|
1718
1715
|
self.update_metadata_in_namespace(DEFAULT_NAMESPACE, id, metadata)
|
|
1719
1716
|
}
|
|
1720
1717
|
|
|
@@ -1841,7 +1838,11 @@ impl Database {
|
|
|
1841
1838
|
PayloadIndexType::Numeric => {
|
|
1842
1839
|
let mut num = NumericIndex::default();
|
|
1843
1840
|
for (key, record) in &self.records {
|
|
1844
|
-
if let Some(val) = record
|
|
1841
|
+
if let Some(val) = record
|
|
1842
|
+
.metadata
|
|
1843
|
+
.get(&field)
|
|
1844
|
+
.and_then(MetadataValue::as_number)
|
|
1845
|
+
{
|
|
1845
1846
|
num.insert(val, key.clone());
|
|
1846
1847
|
}
|
|
1847
1848
|
}
|
|
@@ -2290,6 +2291,18 @@ impl Database {
|
|
|
2290
2291
|
"search requires a dense query, a sparse query, or both".to_owned(),
|
|
2291
2292
|
));
|
|
2292
2293
|
}
|
|
2294
|
+
// Reject zero-norm query vectors for metrics where similarity is undefined.
|
|
2295
|
+
if let Some(query) = dense_query {
|
|
2296
|
+
if self.metric.is_similarity() {
|
|
2297
|
+
let norm_sq: f32 = query.iter().map(|v| v * v).sum();
|
|
2298
|
+
if norm_sq == 0.0 {
|
|
2299
|
+
return Err(VectLiteError::InvalidFormat(
|
|
2300
|
+
"query vector has zero norm; cosine/dot-product similarity is undefined"
|
|
2301
|
+
.to_owned(),
|
|
2302
|
+
));
|
|
2303
|
+
}
|
|
2304
|
+
}
|
|
2305
|
+
}
|
|
2293
2306
|
if let Some(mmr_lambda) = options.mmr_lambda {
|
|
2294
2307
|
if !(0.0..=1.0).contains(&mmr_lambda) {
|
|
2295
2308
|
return Err(VectLiteError::InvalidFormat(
|
|
@@ -2326,14 +2339,13 @@ impl Database {
|
|
|
2326
2339
|
let dense_start = Instant::now();
|
|
2327
2340
|
// Use quantized index for candidate selection if available (2-stage pipeline).
|
|
2328
2341
|
// The quantized index operates on the default vector only and globally (not per-namespace).
|
|
2329
|
-
let quantized_candidates =
|
|
2330
|
-
|
|
2331
|
-
|
|
2332
|
-
|
|
2333
|
-
|
|
2334
|
-
|
|
2335
|
-
|
|
2336
|
-
};
|
|
2342
|
+
let quantized_candidates = if !matryoshka_truncated
|
|
2343
|
+
&& (vector_name.is_none() || vector_name == Some(DEFAULT_VECTOR_NAME))
|
|
2344
|
+
{
|
|
2345
|
+
dense_query.and_then(|query| self.quantized_candidate_keys(query, fetch_k))
|
|
2346
|
+
} else {
|
|
2347
|
+
None
|
|
2348
|
+
};
|
|
2337
2349
|
let ann_candidates = if quantized_candidates.is_some() {
|
|
2338
2350
|
// Skip HNSW if quantized index provided candidates
|
|
2339
2351
|
None
|
|
@@ -2363,13 +2375,18 @@ impl Database {
|
|
|
2363
2375
|
};
|
|
2364
2376
|
|
|
2365
2377
|
// Use payload indexes to narrow candidates when doing a full scan.
|
|
2366
|
-
let payload_candidates = options
|
|
2367
|
-
|
|
2368
|
-
|
|
2378
|
+
let payload_candidates = options
|
|
2379
|
+
.filter
|
|
2380
|
+
.as_ref()
|
|
2381
|
+
.and_then(|f| self.payload_index_candidates(f, namespace));
|
|
2369
2382
|
let candidate_keys = match (candidate_keys, payload_candidates) {
|
|
2370
2383
|
(Some(ck), Some(pc)) => {
|
|
2371
2384
|
// Intersect ANN/sparse candidates with payload index candidates.
|
|
2372
|
-
Some(
|
|
2385
|
+
Some(
|
|
2386
|
+
ck.into_iter()
|
|
2387
|
+
.filter(|k| pc.contains(k))
|
|
2388
|
+
.collect::<Vec<_>>(),
|
|
2389
|
+
)
|
|
2373
2390
|
}
|
|
2374
2391
|
(None, Some(pc)) => {
|
|
2375
2392
|
// No ANN candidates but payload index narrowed the set.
|
|
@@ -2403,8 +2420,14 @@ impl Database {
|
|
|
2403
2420
|
|
|
2404
2421
|
if effective_dense_candidates.is_some() && results.len() < fetch_k {
|
|
2405
2422
|
stats.exact_fallback = true;
|
|
2406
|
-
results =
|
|
2407
|
-
|
|
2423
|
+
results = self.collect_results(
|
|
2424
|
+
dense_query,
|
|
2425
|
+
sparse_query,
|
|
2426
|
+
&options,
|
|
2427
|
+
namespace,
|
|
2428
|
+
None,
|
|
2429
|
+
effective_dimension,
|
|
2430
|
+
);
|
|
2408
2431
|
stats.considered_count = results.len();
|
|
2409
2432
|
}
|
|
2410
2433
|
|
|
@@ -2522,6 +2545,7 @@ impl Database {
|
|
|
2522
2545
|
"cannot enable quantization on an empty database".to_owned(),
|
|
2523
2546
|
));
|
|
2524
2547
|
}
|
|
2548
|
+
validate_quantization_config(&config, self.dimension)?;
|
|
2525
2549
|
self.quantization_config = Some(config);
|
|
2526
2550
|
self.rebuild_quantized_index();
|
|
2527
2551
|
self.persist_quantization_params()?;
|
|
@@ -2552,6 +2576,11 @@ impl Database {
|
|
|
2552
2576
|
self.quantization_config.as_ref()
|
|
2553
2577
|
}
|
|
2554
2578
|
|
|
2579
|
+
/// Returns all valid Product Quantization `num_sub_vectors` values for this database.
|
|
2580
|
+
pub fn valid_num_sub_vectors(&self) -> Vec<usize> {
|
|
2581
|
+
valid_product_num_sub_vectors(self.dimension)
|
|
2582
|
+
}
|
|
2583
|
+
|
|
2555
2584
|
/// Rebuild the quantized index from current records.
|
|
2556
2585
|
fn rebuild_quantized_index(&mut self) {
|
|
2557
2586
|
let config = match &self.quantization_config {
|
|
@@ -2635,7 +2664,7 @@ impl Database {
|
|
|
2635
2664
|
return None;
|
|
2636
2665
|
}
|
|
2637
2666
|
|
|
2638
|
-
let candidate_indices = index.
|
|
2667
|
+
let candidate_indices = index.search_candidates_with_metric(query, top_k, self.metric);
|
|
2639
2668
|
Some(
|
|
2640
2669
|
candidate_indices
|
|
2641
2670
|
.into_iter()
|
|
@@ -2657,7 +2686,11 @@ impl Database {
|
|
|
2657
2686
|
multi_vectors: MultiVectors,
|
|
2658
2687
|
) -> Result<()> {
|
|
2659
2688
|
self.upsert_multi_vectors_in_namespace(
|
|
2660
|
-
DEFAULT_NAMESPACE,
|
|
2689
|
+
DEFAULT_NAMESPACE,
|
|
2690
|
+
id,
|
|
2691
|
+
vector,
|
|
2692
|
+
metadata,
|
|
2693
|
+
multi_vectors,
|
|
2661
2694
|
)
|
|
2662
2695
|
}
|
|
2663
2696
|
|
|
@@ -2716,10 +2749,8 @@ impl Database {
|
|
|
2716
2749
|
|
|
2717
2750
|
// Try quantized multi-vector search first for candidate selection
|
|
2718
2751
|
let query_refs: Vec<&[f32]> = query_tokens.iter().map(Vec::as_slice).collect();
|
|
2719
|
-
let candidate_keys: Option<Vec<RecordKey>> =
|
|
2720
|
-
.multi_vector_quantized
|
|
2721
|
-
.get(space)
|
|
2722
|
-
.and_then(|index| {
|
|
2752
|
+
let candidate_keys: Option<Vec<RecordKey>> =
|
|
2753
|
+
self.multi_vector_quantized.get(space).and_then(|index| {
|
|
2723
2754
|
let keys = self.multi_vector_quantized_keys.get(space)?;
|
|
2724
2755
|
let candidate_indices = index.search(&query_refs, top_k);
|
|
2725
2756
|
Some(
|
|
@@ -2740,9 +2771,7 @@ impl Database {
|
|
|
2740
2771
|
let mut scored: Vec<(f32, &Record)> = record_iter
|
|
2741
2772
|
.filter(|record| {
|
|
2742
2773
|
!record.is_expired_at(now)
|
|
2743
|
-
&& namespace
|
|
2744
|
-
.map(|ns| record.namespace == ns)
|
|
2745
|
-
.unwrap_or(true)
|
|
2774
|
+
&& namespace.map(|ns| record.namespace == ns).unwrap_or(true)
|
|
2746
2775
|
&& record.multi_vectors.contains_key(space)
|
|
2747
2776
|
&& options
|
|
2748
2777
|
.filter
|
|
@@ -2840,14 +2869,9 @@ impl Database {
|
|
|
2840
2869
|
return;
|
|
2841
2870
|
}
|
|
2842
2871
|
|
|
2843
|
-
let index = MultiVectorQuantizedIndex::build(
|
|
2844
|
-
&doc_token_vectors,
|
|
2845
|
-
token_dimension,
|
|
2846
|
-
&config,
|
|
2847
|
-
);
|
|
2872
|
+
let index = MultiVectorQuantizedIndex::build(&doc_token_vectors, token_dimension, &config);
|
|
2848
2873
|
|
|
2849
|
-
self.multi_vector_quantized
|
|
2850
|
-
.insert(space.to_owned(), index);
|
|
2874
|
+
self.multi_vector_quantized.insert(space.to_owned(), index);
|
|
2851
2875
|
self.multi_vector_quantized_keys
|
|
2852
2876
|
.insert(space.to_owned(), keys);
|
|
2853
2877
|
}
|
|
@@ -2928,15 +2952,15 @@ impl Database {
|
|
|
2928
2952
|
|
|
2929
2953
|
if !doc_token_vectors.is_empty() {
|
|
2930
2954
|
index.rebuild(&doc_token_vectors);
|
|
2931
|
-
let MultiVectorQuantizationConfig::TwoBit(ref cfg) =
|
|
2932
|
-
MultiVectorQuantizationConfig::TwoBit(index.quantizer.config.clone())
|
|
2933
|
-
|
|
2934
|
-
|
|
2935
|
-
|
|
2955
|
+
let MultiVectorQuantizationConfig::TwoBit(ref cfg) =
|
|
2956
|
+
{ MultiVectorQuantizationConfig::TwoBit(index.quantizer.config.clone()) };
|
|
2957
|
+
self.multi_vector_quantization_config.insert(
|
|
2958
|
+
space.to_owned(),
|
|
2959
|
+
MultiVectorQuantizationConfig::TwoBit(cfg.clone()),
|
|
2960
|
+
);
|
|
2936
2961
|
self.multi_vector_quantized_keys
|
|
2937
2962
|
.insert(space.to_owned(), keys);
|
|
2938
|
-
self.multi_vector_quantized
|
|
2939
|
-
.insert(space.to_owned(), index);
|
|
2963
|
+
self.multi_vector_quantized.insert(space.to_owned(), index);
|
|
2940
2964
|
}
|
|
2941
2965
|
}
|
|
2942
2966
|
}
|
|
@@ -3020,7 +3044,11 @@ impl Database {
|
|
|
3020
3044
|
PayloadIndexType::Numeric => {
|
|
3021
3045
|
let mut num = NumericIndex::default();
|
|
3022
3046
|
for (key, record) in &self.records {
|
|
3023
|
-
if let Some(val) = record
|
|
3047
|
+
if let Some(val) = record
|
|
3048
|
+
.metadata
|
|
3049
|
+
.get(field)
|
|
3050
|
+
.and_then(MetadataValue::as_number)
|
|
3051
|
+
{
|
|
3024
3052
|
num.insert(val, key.clone());
|
|
3025
3053
|
}
|
|
3026
3054
|
}
|
|
@@ -3071,14 +3099,22 @@ impl Database {
|
|
|
3071
3099
|
/// Use payload indexes to narrow down candidate keys for a filter.
|
|
3072
3100
|
/// Returns `None` if no indexes can help with this filter (fallback to scan).
|
|
3073
3101
|
/// Returns `Some(set)` with the set of record keys that *may* match the filter.
|
|
3074
|
-
fn payload_index_candidates(
|
|
3102
|
+
fn payload_index_candidates(
|
|
3103
|
+
&self,
|
|
3104
|
+
filter: &MetadataFilter,
|
|
3105
|
+
namespace: Option<&str>,
|
|
3106
|
+
) -> Option<HashSet<RecordKey>> {
|
|
3075
3107
|
if self.payload_indexes.is_empty() {
|
|
3076
3108
|
return None;
|
|
3077
3109
|
}
|
|
3078
3110
|
self.payload_index_candidates_inner(filter, namespace)
|
|
3079
3111
|
}
|
|
3080
3112
|
|
|
3081
|
-
fn payload_index_candidates_inner(
|
|
3113
|
+
fn payload_index_candidates_inner(
|
|
3114
|
+
&self,
|
|
3115
|
+
filter: &MetadataFilter,
|
|
3116
|
+
namespace: Option<&str>,
|
|
3117
|
+
) -> Option<HashSet<RecordKey>> {
|
|
3082
3118
|
match filter {
|
|
3083
3119
|
MetadataFilter::Eq { key, value } => {
|
|
3084
3120
|
// Try keyword index for string equality
|
|
@@ -3172,7 +3208,11 @@ impl Database {
|
|
|
3172
3208
|
}
|
|
3173
3209
|
}
|
|
3174
3210
|
|
|
3175
|
-
fn filter_by_namespace(
|
|
3211
|
+
fn filter_by_namespace(
|
|
3212
|
+
&self,
|
|
3213
|
+
keys: HashSet<RecordKey>,
|
|
3214
|
+
namespace: Option<&str>,
|
|
3215
|
+
) -> HashSet<RecordKey> {
|
|
3176
3216
|
match namespace {
|
|
3177
3217
|
Some(ns) => keys.into_iter().filter(|(n, _)| n == ns).collect(),
|
|
3178
3218
|
None => keys,
|
|
@@ -3357,7 +3397,9 @@ impl Database {
|
|
|
3357
3397
|
WalOp::UpdateMetadata { .. } | WalOp::SetTtl { .. } => false,
|
|
3358
3398
|
});
|
|
3359
3399
|
|
|
3360
|
-
let metadata_only = ops
|
|
3400
|
+
let metadata_only = ops
|
|
3401
|
+
.iter()
|
|
3402
|
+
.all(|op| matches!(op, WalOp::UpdateMetadata { .. } | WalOp::SetTtl { .. }));
|
|
3361
3403
|
|
|
3362
3404
|
self.append_wal_batch(&ops)?;
|
|
3363
3405
|
self.apply_ops_in_memory(ops);
|
|
@@ -3721,22 +3763,32 @@ impl Database {
|
|
|
3721
3763
|
Some(0) => {
|
|
3722
3764
|
return Err(VectLiteError::InvalidFormat(
|
|
3723
3765
|
"truncate_dim must be greater than zero".to_owned(),
|
|
3724
|
-
))
|
|
3766
|
+
));
|
|
3725
3767
|
}
|
|
3726
3768
|
Some(dim) if dim > self.dimension => {
|
|
3727
3769
|
return Err(VectLiteError::DimensionMismatch {
|
|
3728
3770
|
expected: self.dimension,
|
|
3729
3771
|
found: dim,
|
|
3730
|
-
})
|
|
3772
|
+
});
|
|
3731
3773
|
}
|
|
3732
3774
|
Some(dim) if dim > query.len() => {
|
|
3733
3775
|
return Err(VectLiteError::InvalidFormat(format!(
|
|
3734
3776
|
"truncate_dim ({dim}) cannot exceed query vector length ({})",
|
|
3735
3777
|
query.len()
|
|
3736
|
-
)))
|
|
3778
|
+
)));
|
|
3737
3779
|
}
|
|
3738
3780
|
Some(dim) => dim,
|
|
3739
|
-
None =>
|
|
3781
|
+
None => {
|
|
3782
|
+
// Without explicit truncate_dim, require exact dimension match.
|
|
3783
|
+
// Users must pass truncate_dim to opt into Matryoshka truncation.
|
|
3784
|
+
if query.len() != self.dimension {
|
|
3785
|
+
return Err(VectLiteError::DimensionMismatch {
|
|
3786
|
+
expected: self.dimension,
|
|
3787
|
+
found: query.len(),
|
|
3788
|
+
});
|
|
3789
|
+
}
|
|
3790
|
+
query.len()
|
|
3791
|
+
}
|
|
3740
3792
|
};
|
|
3741
3793
|
|
|
3742
3794
|
Ok(Some(effective))
|
|
@@ -4058,8 +4110,13 @@ impl Database {
|
|
|
4058
4110
|
let mut weighted_sum = 0.0_f32;
|
|
4059
4111
|
for (name, (query, weight)) in &options.multi_vector_queries {
|
|
4060
4112
|
if let Some(vector) = record.vector_for(Some(name.as_str())) {
|
|
4061
|
-
weighted_sum +=
|
|
4062
|
-
|
|
4113
|
+
weighted_sum += weight
|
|
4114
|
+
* score_dense_prefix(
|
|
4115
|
+
self.metric,
|
|
4116
|
+
query,
|
|
4117
|
+
vector,
|
|
4118
|
+
effective_dimension,
|
|
4119
|
+
);
|
|
4063
4120
|
}
|
|
4064
4121
|
}
|
|
4065
4122
|
(weighted_sum, None)
|
|
@@ -4375,11 +4432,7 @@ fn ensure_dimension(dimension: usize) -> Result<()> {
|
|
|
4375
4432
|
/// MaxSim scoring (ColBERT-style late interaction).
|
|
4376
4433
|
/// For each query token, find the maximum similarity against any document
|
|
4377
4434
|
/// token using the given metric, then sum those maxima across all query tokens.
|
|
4378
|
-
fn maxsim_score(
|
|
4379
|
-
query_tokens: &[&[f32]],
|
|
4380
|
-
doc_tokens: &[Vec<f32>],
|
|
4381
|
-
metric: DistanceMetric,
|
|
4382
|
-
) -> f32 {
|
|
4435
|
+
fn maxsim_score(query_tokens: &[&[f32]], doc_tokens: &[Vec<f32>], metric: DistanceMetric) -> f32 {
|
|
4383
4436
|
if query_tokens.is_empty() || doc_tokens.is_empty() {
|
|
4384
4437
|
return 0.0;
|
|
4385
4438
|
}
|
|
@@ -4428,8 +4481,13 @@ fn build_ann_index(records: Vec<(RecordKey, &Vec<f32>)>, metric: DistanceMetric)
|
|
|
4428
4481
|
|
|
4429
4482
|
macro_rules! build_hnsw {
|
|
4430
4483
|
($dist_type:ty, $dist_val:expr, $variant:ident) => {{
|
|
4431
|
-
let mut hnsw =
|
|
4432
|
-
|
|
4484
|
+
let mut hnsw = Hnsw::<f32, $dist_type>::new(
|
|
4485
|
+
ANN_M,
|
|
4486
|
+
count,
|
|
4487
|
+
max_layer,
|
|
4488
|
+
ANN_EF_CONSTRUCTION,
|
|
4489
|
+
$dist_val,
|
|
4490
|
+
);
|
|
4433
4491
|
let mut keys = Vec::with_capacity(count);
|
|
4434
4492
|
for (origin_id, (key, vector)) in records.into_iter().enumerate() {
|
|
4435
4493
|
hnsw.insert((vector.as_slice(), origin_id));
|
|
@@ -4616,11 +4674,13 @@ fn ann_basename(path: &Path, namespace: Option<&str>, vector_name: &str) -> Stri
|
|
|
4616
4674
|
.file_name()
|
|
4617
4675
|
.and_then(|name| name.to_str())
|
|
4618
4676
|
.unwrap_or("vectlite");
|
|
4619
|
-
|
|
4620
|
-
|
|
4621
|
-
|
|
4622
|
-
|
|
4623
|
-
)
|
|
4677
|
+
let ns_hex = hex_encode(namespace.unwrap_or(DEFAULT_NAMESPACE).as_bytes());
|
|
4678
|
+
let vn_hex = hex_encode(vector_name.as_bytes());
|
|
4679
|
+
// Use "_" sentinel for empty components to avoid triple-dot filenames
|
|
4680
|
+
// like "c.vdb.ann...hnsw.data".
|
|
4681
|
+
let ns_part = if ns_hex.is_empty() { "_" } else { &ns_hex };
|
|
4682
|
+
let vn_part = if vn_hex.is_empty() { "_" } else { &vn_hex };
|
|
4683
|
+
format!("{stem}.ann.{ns_part}.{vn_part}")
|
|
4624
4684
|
}
|
|
4625
4685
|
|
|
4626
4686
|
fn hex_encode(bytes: &[u8]) -> String {
|
|
@@ -5350,9 +5410,9 @@ fn usize_from_u64(value: u64) -> Result<usize> {
|
|
|
5350
5410
|
#[cfg(test)]
|
|
5351
5411
|
mod tests {
|
|
5352
5412
|
use super::{
|
|
5353
|
-
Database, HybridSearchOptions, Metadata, MetadataFilter, MetadataValue,
|
|
5354
|
-
MultiVectorSearchOptions, NamedVectors, PayloadIndexType, Record,
|
|
5355
|
-
SparseVector, VectLiteError,
|
|
5413
|
+
Database, DistanceMetric, HybridSearchOptions, Metadata, MetadataFilter, MetadataValue,
|
|
5414
|
+
MultiVectorSearchOptions, MultiVectors, NamedVectors, PayloadIndexType, Record,
|
|
5415
|
+
SearchOptions, SparseVector, VectLiteError,
|
|
5356
5416
|
};
|
|
5357
5417
|
use std::path::{Path, PathBuf};
|
|
5358
5418
|
use std::time::{SystemTime, UNIX_EPOCH};
|
|
@@ -6046,6 +6106,54 @@ mod tests {
|
|
|
6046
6106
|
cleanup(&path);
|
|
6047
6107
|
}
|
|
6048
6108
|
|
|
6109
|
+
#[test]
|
|
6110
|
+
fn scalar_quantization_keeps_signed_cosine_neighbor_in_candidate_set() {
|
|
6111
|
+
use super::quantization::{QuantizationConfig, ScalarQuantizationConfig};
|
|
6112
|
+
|
|
6113
|
+
let path = temp_file("quant-scalar-signed-recall");
|
|
6114
|
+
let dim = 146;
|
|
6115
|
+
|
|
6116
|
+
let mut query = vec![-1.0_f32; dim];
|
|
6117
|
+
for value in &mut query[..10] {
|
|
6118
|
+
*value = 1.0;
|
|
6119
|
+
}
|
|
6120
|
+
|
|
6121
|
+
let mut db = Database::create(&path, dim).expect("create");
|
|
6122
|
+
for i in 0..120 {
|
|
6123
|
+
db.upsert(format!("high{i:03}"), vec![2.0_f32; dim], Metadata::new())
|
|
6124
|
+
.expect("upsert high distractor");
|
|
6125
|
+
}
|
|
6126
|
+
|
|
6127
|
+
let mut calibration_low = vec![2.0_f32; dim];
|
|
6128
|
+
for value in &mut calibration_low[..10] {
|
|
6129
|
+
*value = -1.0;
|
|
6130
|
+
}
|
|
6131
|
+
db.upsert("calibration-low", calibration_low, Metadata::new())
|
|
6132
|
+
.expect("upsert calibration low");
|
|
6133
|
+
db.upsert("target", query.clone(), Metadata::new())
|
|
6134
|
+
.expect("upsert target");
|
|
6135
|
+
|
|
6136
|
+
db.enable_quantization(QuantizationConfig::Scalar(ScalarQuantizationConfig {
|
|
6137
|
+
rescore_multiplier: 1,
|
|
6138
|
+
}))
|
|
6139
|
+
.expect("enable quant");
|
|
6140
|
+
|
|
6141
|
+
let results = db
|
|
6142
|
+
.search(
|
|
6143
|
+
&query,
|
|
6144
|
+
SearchOptions {
|
|
6145
|
+
top_k: 1,
|
|
6146
|
+
filter: None,
|
|
6147
|
+
truncate_dim: None,
|
|
6148
|
+
},
|
|
6149
|
+
)
|
|
6150
|
+
.expect("search");
|
|
6151
|
+
|
|
6152
|
+
assert_eq!(results[0].id, "target");
|
|
6153
|
+
|
|
6154
|
+
cleanup(&path);
|
|
6155
|
+
}
|
|
6156
|
+
|
|
6049
6157
|
#[test]
|
|
6050
6158
|
fn binary_quantization_enables_search() {
|
|
6051
6159
|
use super::quantization::{BinaryQuantizationConfig, QuantizationConfig};
|
|
@@ -6187,6 +6295,40 @@ mod tests {
|
|
|
6187
6295
|
cleanup(&path);
|
|
6188
6296
|
}
|
|
6189
6297
|
|
|
6298
|
+
#[test]
|
|
6299
|
+
fn product_quantization_invalid_subvector_count_returns_error() {
|
|
6300
|
+
use super::quantization::{ProductQuantizationConfig, QuantizationConfig};
|
|
6301
|
+
|
|
6302
|
+
let path = temp_file("quant-pq-invalid-subvectors");
|
|
6303
|
+
let mut db = Database::create(&path, 146).expect("create");
|
|
6304
|
+
for i in 0..4 {
|
|
6305
|
+
db.upsert(
|
|
6306
|
+
format!("doc{i}"),
|
|
6307
|
+
vec![0.1_f32 + i as f32; 146],
|
|
6308
|
+
Metadata::new(),
|
|
6309
|
+
)
|
|
6310
|
+
.expect("upsert");
|
|
6311
|
+
}
|
|
6312
|
+
assert_eq!(db.valid_num_sub_vectors(), vec![1, 2, 73, 146]);
|
|
6313
|
+
|
|
6314
|
+
let result =
|
|
6315
|
+
db.enable_quantization(QuantizationConfig::Product(ProductQuantizationConfig {
|
|
6316
|
+
num_sub_vectors: 16,
|
|
6317
|
+
num_centroids: 4,
|
|
6318
|
+
training_iterations: 1,
|
|
6319
|
+
rescore_multiplier: 1,
|
|
6320
|
+
}));
|
|
6321
|
+
|
|
6322
|
+
assert!(matches!(
|
|
6323
|
+
result,
|
|
6324
|
+
Err(VectLiteError::InvalidFormat(message))
|
|
6325
|
+
if message.contains("dimension (146) must be divisible by num_sub_vectors (16)")
|
|
6326
|
+
));
|
|
6327
|
+
assert!(!db.is_quantized());
|
|
6328
|
+
|
|
6329
|
+
cleanup(&path);
|
|
6330
|
+
}
|
|
6331
|
+
|
|
6190
6332
|
// -----------------------------------------------------------------------
|
|
6191
6333
|
// Multi-vector / ColBERT-style integration tests
|
|
6192
6334
|
// -----------------------------------------------------------------------
|
|
@@ -6200,10 +6342,7 @@ mod tests {
|
|
|
6200
6342
|
let mut mv1 = MultiVectors::new();
|
|
6201
6343
|
mv1.insert(
|
|
6202
6344
|
"colbert".to_owned(),
|
|
6203
|
-
vec![
|
|
6204
|
-
vec![1.0, 0.0, 0.0],
|
|
6205
|
-
vec![0.0, 1.0, 0.0],
|
|
6206
|
-
],
|
|
6345
|
+
vec![vec![1.0, 0.0, 0.0], vec![0.0, 1.0, 0.0]],
|
|
6207
6346
|
);
|
|
6208
6347
|
db.upsert_multi_vectors("doc1", vec![1.0, 0.0, 0.0], Metadata::new(), mv1)
|
|
6209
6348
|
.expect("upsert doc1");
|
|
@@ -6211,10 +6350,7 @@ mod tests {
|
|
|
6211
6350
|
let mut mv2 = MultiVectors::new();
|
|
6212
6351
|
mv2.insert(
|
|
6213
6352
|
"colbert".to_owned(),
|
|
6214
|
-
vec![
|
|
6215
|
-
vec![0.0, 0.0, 1.0],
|
|
6216
|
-
vec![0.0, 1.0, 0.0],
|
|
6217
|
-
],
|
|
6353
|
+
vec![vec![0.0, 0.0, 1.0], vec![0.0, 1.0, 0.0]],
|
|
6218
6354
|
);
|
|
6219
6355
|
db.upsert_multi_vectors("doc2", vec![0.0, 0.0, 1.0], Metadata::new(), mv2)
|
|
6220
6356
|
.expect("upsert doc2");
|
|
@@ -6222,13 +6358,14 @@ mod tests {
|
|
|
6222
6358
|
assert_eq!(db.len(), 2);
|
|
6223
6359
|
|
|
6224
6360
|
// Search with query tokens that strongly match doc1
|
|
6225
|
-
let query_tokens = vec![
|
|
6226
|
-
vec![1.0, 0.0, 0.0],
|
|
6227
|
-
vec![0.0, 1.0, 0.0],
|
|
6228
|
-
];
|
|
6361
|
+
let query_tokens = vec![vec![1.0, 0.0, 0.0], vec![0.0, 1.0, 0.0]];
|
|
6229
6362
|
|
|
6230
6363
|
let results = db
|
|
6231
|
-
.search_multi_vector(
|
|
6364
|
+
.search_multi_vector(
|
|
6365
|
+
"colbert",
|
|
6366
|
+
&query_tokens,
|
|
6367
|
+
MultiVectorSearchOptions::default(),
|
|
6368
|
+
)
|
|
6232
6369
|
.expect("search");
|
|
6233
6370
|
|
|
6234
6371
|
assert_eq!(results.len(), 2);
|
|
@@ -6255,7 +6392,11 @@ mod tests {
|
|
|
6255
6392
|
let db = Database::create(&path, 3).expect("create");
|
|
6256
6393
|
|
|
6257
6394
|
let query_tokens: Vec<Vec<f32>> = vec![];
|
|
6258
|
-
let result = db.search_multi_vector(
|
|
6395
|
+
let result = db.search_multi_vector(
|
|
6396
|
+
"colbert",
|
|
6397
|
+
&query_tokens,
|
|
6398
|
+
MultiVectorSearchOptions::default(),
|
|
6399
|
+
);
|
|
6259
6400
|
assert!(result.is_err());
|
|
6260
6401
|
|
|
6261
6402
|
cleanup(&path);
|
|
@@ -6268,10 +6409,22 @@ mod tests {
|
|
|
6268
6409
|
|
|
6269
6410
|
let mut mv = MultiVectors::new();
|
|
6270
6411
|
mv.insert("colbert".to_owned(), vec![vec![1.0, 0.0, 0.0]]);
|
|
6271
|
-
db.upsert_multi_vectors_in_namespace(
|
|
6272
|
-
|
|
6273
|
-
|
|
6274
|
-
.
|
|
6412
|
+
db.upsert_multi_vectors_in_namespace(
|
|
6413
|
+
"ns1",
|
|
6414
|
+
"doc1",
|
|
6415
|
+
vec![1.0, 0.0, 0.0],
|
|
6416
|
+
Metadata::new(),
|
|
6417
|
+
mv.clone(),
|
|
6418
|
+
)
|
|
6419
|
+
.expect("upsert ns1");
|
|
6420
|
+
db.upsert_multi_vectors_in_namespace(
|
|
6421
|
+
"ns2",
|
|
6422
|
+
"doc2",
|
|
6423
|
+
vec![0.0, 1.0, 0.0],
|
|
6424
|
+
Metadata::new(),
|
|
6425
|
+
mv.clone(),
|
|
6426
|
+
)
|
|
6427
|
+
.expect("upsert ns2");
|
|
6275
6428
|
|
|
6276
6429
|
let query_tokens = vec![vec![1.0, 0.0, 0.0]];
|
|
6277
6430
|
let options = MultiVectorSearchOptions {
|
|
@@ -6279,7 +6432,9 @@ mod tests {
|
|
|
6279
6432
|
filter: None,
|
|
6280
6433
|
namespace: Some("ns1".to_owned()),
|
|
6281
6434
|
};
|
|
6282
|
-
let results = db
|
|
6435
|
+
let results = db
|
|
6436
|
+
.search_multi_vector("colbert", &query_tokens, options)
|
|
6437
|
+
.expect("search");
|
|
6283
6438
|
|
|
6284
6439
|
assert_eq!(results.len(), 1);
|
|
6285
6440
|
assert_eq!(results[0].id, "doc1");
|
|
@@ -6331,13 +6486,18 @@ mod tests {
|
|
|
6331
6486
|
// Search should still work
|
|
6332
6487
|
let query_tokens = vec![vec![9.0, 0.0, 0.0], vec![0.0, 9.0, 0.0]];
|
|
6333
6488
|
let results = db
|
|
6334
|
-
.search_multi_vector(
|
|
6489
|
+
.search_multi_vector(
|
|
6490
|
+
"colbert",
|
|
6491
|
+
&query_tokens,
|
|
6492
|
+
MultiVectorSearchOptions::default(),
|
|
6493
|
+
)
|
|
6335
6494
|
.expect("search");
|
|
6336
6495
|
|
|
6337
6496
|
assert!(!results.is_empty());
|
|
6338
6497
|
|
|
6339
6498
|
// Disable quantization
|
|
6340
|
-
db.disable_multi_vector_quantization("colbert")
|
|
6499
|
+
db.disable_multi_vector_quantization("colbert")
|
|
6500
|
+
.expect("disable");
|
|
6341
6501
|
assert!(!db.is_multi_vector_quantized("colbert"));
|
|
6342
6502
|
|
|
6343
6503
|
cleanup(&path);
|
|
@@ -6387,7 +6547,11 @@ mod tests {
|
|
|
6387
6547
|
// Search should work on reopened database
|
|
6388
6548
|
let query_tokens = vec![vec![0.9, 0.5, 0.5]];
|
|
6389
6549
|
let results = db
|
|
6390
|
-
.search_multi_vector(
|
|
6550
|
+
.search_multi_vector(
|
|
6551
|
+
"colbert",
|
|
6552
|
+
&query_tokens,
|
|
6553
|
+
MultiVectorSearchOptions::default(),
|
|
6554
|
+
)
|
|
6391
6555
|
.expect("search");
|
|
6392
6556
|
assert!(!results.is_empty());
|
|
6393
6557
|
|
|
@@ -6421,7 +6585,7 @@ mod tests {
|
|
|
6421
6585
|
|
|
6422
6586
|
#[test]
|
|
6423
6587
|
fn multi_vector_maxsim_scoring_correctness() {
|
|
6424
|
-
use super::{
|
|
6588
|
+
use super::{DistanceMetric, maxsim_score};
|
|
6425
6589
|
|
|
6426
6590
|
// Two identical sets: MaxSim should be sum of 1.0 per query token
|
|
6427
6591
|
let query = [&[1.0_f32, 0.0, 0.0][..], &[0.0, 1.0, 0.0]];
|
|
@@ -6479,17 +6643,44 @@ mod tests {
|
|
|
6479
6643
|
fn distance_metric_name_aliases() {
|
|
6480
6644
|
use super::DistanceMetric;
|
|
6481
6645
|
// Euclidean aliases
|
|
6482
|
-
assert_eq!(
|
|
6483
|
-
|
|
6484
|
-
|
|
6646
|
+
assert_eq!(
|
|
6647
|
+
DistanceMetric::from_name("l2").unwrap(),
|
|
6648
|
+
DistanceMetric::Euclidean
|
|
6649
|
+
);
|
|
6650
|
+
assert_eq!(
|
|
6651
|
+
DistanceMetric::from_name("L2").unwrap(),
|
|
6652
|
+
DistanceMetric::Euclidean
|
|
6653
|
+
);
|
|
6654
|
+
assert_eq!(
|
|
6655
|
+
DistanceMetric::from_name("EUCLIDEAN").unwrap(),
|
|
6656
|
+
DistanceMetric::Euclidean
|
|
6657
|
+
);
|
|
6485
6658
|
// DotProduct aliases
|
|
6486
|
-
assert_eq!(
|
|
6487
|
-
|
|
6488
|
-
|
|
6489
|
-
|
|
6659
|
+
assert_eq!(
|
|
6660
|
+
DistanceMetric::from_name("dot").unwrap(),
|
|
6661
|
+
DistanceMetric::DotProduct
|
|
6662
|
+
);
|
|
6663
|
+
assert_eq!(
|
|
6664
|
+
DistanceMetric::from_name("dot_product").unwrap(),
|
|
6665
|
+
DistanceMetric::DotProduct
|
|
6666
|
+
);
|
|
6667
|
+
assert_eq!(
|
|
6668
|
+
DistanceMetric::from_name("ip").unwrap(),
|
|
6669
|
+
DistanceMetric::DotProduct
|
|
6670
|
+
);
|
|
6671
|
+
assert_eq!(
|
|
6672
|
+
DistanceMetric::from_name("inner_product").unwrap(),
|
|
6673
|
+
DistanceMetric::DotProduct
|
|
6674
|
+
);
|
|
6490
6675
|
// Manhattan aliases
|
|
6491
|
-
assert_eq!(
|
|
6492
|
-
|
|
6676
|
+
assert_eq!(
|
|
6677
|
+
DistanceMetric::from_name("l1").unwrap(),
|
|
6678
|
+
DistanceMetric::Manhattan
|
|
6679
|
+
);
|
|
6680
|
+
assert_eq!(
|
|
6681
|
+
DistanceMetric::from_name("L1").unwrap(),
|
|
6682
|
+
DistanceMetric::Manhattan
|
|
6683
|
+
);
|
|
6493
6684
|
// Invalid
|
|
6494
6685
|
assert!(DistanceMetric::from_name("hamming").is_err());
|
|
6495
6686
|
}
|
|
@@ -6634,8 +6825,8 @@ mod tests {
|
|
|
6634
6825
|
fn search_with_euclidean_metric() {
|
|
6635
6826
|
use super::DistanceMetric;
|
|
6636
6827
|
let path = temp_file("metric-search-euclidean");
|
|
6637
|
-
let mut db =
|
|
6638
|
-
.expect("create");
|
|
6828
|
+
let mut db =
|
|
6829
|
+
Database::create_with_metric(&path, 3, DistanceMetric::Euclidean).expect("create");
|
|
6639
6830
|
|
|
6640
6831
|
// Insert vectors at known distances from query [0, 0, 0]
|
|
6641
6832
|
db.insert("close", vec![1.0, 0.0, 0.0], Metadata::new())
|
|
@@ -6671,8 +6862,8 @@ mod tests {
|
|
|
6671
6862
|
fn search_with_dotproduct_metric() {
|
|
6672
6863
|
use super::DistanceMetric;
|
|
6673
6864
|
let path = temp_file("metric-search-dot");
|
|
6674
|
-
let mut db =
|
|
6675
|
-
.expect("create");
|
|
6865
|
+
let mut db =
|
|
6866
|
+
Database::create_with_metric(&path, 3, DistanceMetric::DotProduct).expect("create");
|
|
6676
6867
|
|
|
6677
6868
|
// Vectors with different dot products with query [1, 0, 0]
|
|
6678
6869
|
db.insert("high", vec![10.0, 0.0, 0.0], Metadata::new())
|
|
@@ -6707,8 +6898,8 @@ mod tests {
|
|
|
6707
6898
|
fn search_with_manhattan_metric() {
|
|
6708
6899
|
use super::DistanceMetric;
|
|
6709
6900
|
let path = temp_file("metric-search-manhattan");
|
|
6710
|
-
let mut db =
|
|
6711
|
-
.expect("create");
|
|
6901
|
+
let mut db =
|
|
6902
|
+
Database::create_with_metric(&path, 3, DistanceMetric::Manhattan).expect("create");
|
|
6712
6903
|
|
|
6713
6904
|
// Vectors at known Manhattan distances from query [0, 0, 0]
|
|
6714
6905
|
db.insert("close", vec![1.0, 0.0, 0.0], Metadata::new())
|
|
@@ -6755,6 +6946,7 @@ mod tests {
|
|
|
6755
6946
|
None,
|
|
6756
6947
|
HybridSearchOptions {
|
|
6757
6948
|
top_k: 2,
|
|
6949
|
+
truncate_dim: Some(2),
|
|
6758
6950
|
..HybridSearchOptions::default()
|
|
6759
6951
|
},
|
|
6760
6952
|
)
|
|
@@ -6801,8 +6993,8 @@ mod tests {
|
|
|
6801
6993
|
fn search_with_cosine_metric_explicit() {
|
|
6802
6994
|
use super::DistanceMetric;
|
|
6803
6995
|
let path = temp_file("metric-search-cosine-explicit");
|
|
6804
|
-
let mut db =
|
|
6805
|
-
.expect("create");
|
|
6996
|
+
let mut db =
|
|
6997
|
+
Database::create_with_metric(&path, 3, DistanceMetric::Cosine).expect("create");
|
|
6806
6998
|
|
|
6807
6999
|
db.insert("aligned", vec![2.0, 0.0, 0.0], Metadata::new())
|
|
6808
7000
|
.expect("insert aligned"); // cosine = 1.0
|
|
@@ -6836,8 +7028,8 @@ mod tests {
|
|
|
6836
7028
|
use super::DistanceMetric;
|
|
6837
7029
|
let path = temp_file("metric-upsert-cycle");
|
|
6838
7030
|
{
|
|
6839
|
-
let mut db =
|
|
6840
|
-
.expect("create");
|
|
7031
|
+
let mut db =
|
|
7032
|
+
Database::create_with_metric(&path, 3, DistanceMetric::Manhattan).expect("create");
|
|
6841
7033
|
db.upsert("a", vec![1.0, 0.0, 0.0], Metadata::new())
|
|
6842
7034
|
.expect("upsert a");
|
|
6843
7035
|
db.upsert("b", vec![0.0, 5.0, 0.0], Metadata::new())
|
|
@@ -6916,7 +7108,8 @@ mod tests {
|
|
|
6916
7108
|
let mut meta = Metadata::new();
|
|
6917
7109
|
meta.insert("source".into(), "blog".into());
|
|
6918
7110
|
meta.insert("version".into(), MetadataValue::Integer(1));
|
|
6919
|
-
db.upsert("doc1", vec![1.0, 0.0, 0.0], meta)
|
|
7111
|
+
db.upsert("doc1", vec![1.0, 0.0, 0.0], meta)
|
|
7112
|
+
.expect("upsert");
|
|
6920
7113
|
|
|
6921
7114
|
// Patch: update version, add new key
|
|
6922
7115
|
let mut patch = Metadata::new();
|
|
@@ -6966,7 +7159,8 @@ mod tests {
|
|
|
6966
7159
|
|
|
6967
7160
|
let mut meta = Metadata::new();
|
|
6968
7161
|
meta.insert("source".into(), "blog".into());
|
|
6969
|
-
db.upsert("doc1", vec![1.0, 2.0, 3.0], meta)
|
|
7162
|
+
db.upsert("doc1", vec![1.0, 2.0, 3.0], meta)
|
|
7163
|
+
.expect("upsert");
|
|
6970
7164
|
|
|
6971
7165
|
let mut patch = Metadata::new();
|
|
6972
7166
|
patch.insert("source".into(), "updated".into());
|
|
@@ -6986,7 +7180,8 @@ mod tests {
|
|
|
6986
7180
|
let mut db = Database::create(&path, 3).expect("create");
|
|
6987
7181
|
let mut meta = Metadata::new();
|
|
6988
7182
|
meta.insert("source".into(), "blog".into());
|
|
6989
|
-
db.upsert("doc1", vec![1.0, 0.0, 0.0], meta)
|
|
7183
|
+
db.upsert("doc1", vec![1.0, 0.0, 0.0], meta)
|
|
7184
|
+
.expect("upsert");
|
|
6990
7185
|
|
|
6991
7186
|
let mut patch = Metadata::new();
|
|
6992
7187
|
patch.insert("source".into(), "updated".into());
|
|
@@ -7052,7 +7247,8 @@ mod tests {
|
|
|
7052
7247
|
|
|
7053
7248
|
let mut meta = Metadata::new();
|
|
7054
7249
|
meta.insert("status".into(), "draft".into());
|
|
7055
|
-
db.upsert("doc1", vec![1.0, 0.0, 0.0], meta)
|
|
7250
|
+
db.upsert("doc1", vec![1.0, 0.0, 0.0], meta)
|
|
7251
|
+
.expect("upsert");
|
|
7056
7252
|
|
|
7057
7253
|
// Before patch: filter matches
|
|
7058
7254
|
let count = db.count_filtered(None, Some(&MetadataFilter::eq("status", "draft")));
|
|
@@ -7320,7 +7516,8 @@ mod tests {
|
|
|
7320
7516
|
|
|
7321
7517
|
let mut meta = Metadata::new();
|
|
7322
7518
|
meta.insert("source".into(), "blog".into());
|
|
7323
|
-
db.upsert("doc1", vec![1.0, 0.0, 0.0], meta)
|
|
7519
|
+
db.upsert("doc1", vec![1.0, 0.0, 0.0], meta)
|
|
7520
|
+
.expect("upsert");
|
|
7324
7521
|
|
|
7325
7522
|
let mut meta2 = Metadata::new();
|
|
7326
7523
|
meta2.insert("source".into(), "docs".into());
|
|
@@ -7356,7 +7553,8 @@ mod tests {
|
|
|
7356
7553
|
// Now upsert records — they should be indexed incrementally
|
|
7357
7554
|
let mut meta = Metadata::new();
|
|
7358
7555
|
meta.insert("source".into(), "blog".into());
|
|
7359
|
-
db.upsert("doc1", vec![1.0, 0.0, 0.0], meta)
|
|
7556
|
+
db.upsert("doc1", vec![1.0, 0.0, 0.0], meta)
|
|
7557
|
+
.expect("upsert");
|
|
7360
7558
|
|
|
7361
7559
|
let count = db.count_filtered(None, Some(&MetadataFilter::eq("source", "blog")));
|
|
7362
7560
|
assert_eq!(count, 1);
|
|
@@ -7381,7 +7579,8 @@ mod tests {
|
|
|
7381
7579
|
|
|
7382
7580
|
let mut meta = Metadata::new();
|
|
7383
7581
|
meta.insert("source".into(), "blog".into());
|
|
7384
|
-
db.upsert("doc1", vec![1.0, 0.0, 0.0], meta)
|
|
7582
|
+
db.upsert("doc1", vec![1.0, 0.0, 0.0], meta)
|
|
7583
|
+
.expect("upsert");
|
|
7385
7584
|
|
|
7386
7585
|
let mut meta2 = Metadata::new();
|
|
7387
7586
|
meta2.insert("source".into(), "blog".into());
|
|
@@ -7454,7 +7653,8 @@ mod tests {
|
|
|
7454
7653
|
|
|
7455
7654
|
let mut meta = Metadata::new();
|
|
7456
7655
|
meta.insert("status".into(), "draft".into());
|
|
7457
|
-
db.upsert("doc1", vec![1.0, 0.0, 0.0], meta)
|
|
7656
|
+
db.upsert("doc1", vec![1.0, 0.0, 0.0], meta)
|
|
7657
|
+
.expect("upsert");
|
|
7458
7658
|
|
|
7459
7659
|
db.create_index("status", PayloadIndexType::Keyword)
|
|
7460
7660
|
.expect("create");
|
|
@@ -7490,7 +7690,10 @@ mod tests {
|
|
|
7490
7690
|
// Insert records with source and priority
|
|
7491
7691
|
for i in 0..30 {
|
|
7492
7692
|
let mut meta = Metadata::new();
|
|
7493
|
-
meta.insert(
|
|
7693
|
+
meta.insert(
|
|
7694
|
+
"source".into(),
|
|
7695
|
+
if i % 2 == 0 { "blog" } else { "docs" }.into(),
|
|
7696
|
+
);
|
|
7494
7697
|
meta.insert("priority".into(), MetadataValue::Float((i % 3) as f64));
|
|
7495
7698
|
db.upsert(format!("doc{}", i), vec![1.0, 0.0, 0.0], meta)
|
|
7496
7699
|
.expect("upsert");
|
|
@@ -7659,7 +7862,10 @@ mod tests {
|
|
|
7659
7862
|
|
|
7660
7863
|
for i in 0..20 {
|
|
7661
7864
|
let mut meta = Metadata::new();
|
|
7662
|
-
meta.insert(
|
|
7865
|
+
meta.insert(
|
|
7866
|
+
"type".into(),
|
|
7867
|
+
if i % 2 == 0 { "even" } else { "odd" }.into(),
|
|
7868
|
+
);
|
|
7663
7869
|
db.upsert(format!("doc{}", i), vec![1.0, 0.0, 0.0], meta)
|
|
7664
7870
|
.expect("upsert");
|
|
7665
7871
|
}
|
|
@@ -7929,7 +8135,10 @@ mod tests {
|
|
|
7929
8135
|
assert!(db.get_in_namespace("ns1", "doc1").is_none());
|
|
7930
8136
|
|
|
7931
8137
|
// Wrong namespace returns false
|
|
7932
|
-
assert!(
|
|
8138
|
+
assert!(
|
|
8139
|
+
!db.set_ttl_in_namespace("ns2", "doc1", 60.0)
|
|
8140
|
+
.expect("set wrong ns")
|
|
8141
|
+
);
|
|
7933
8142
|
|
|
7934
8143
|
cleanup(&path);
|
|
7935
8144
|
}
|
|
@@ -7943,12 +8152,8 @@ mod tests {
|
|
|
7943
8152
|
let path = temp_file("cursor-basic");
|
|
7944
8153
|
let mut db = Database::create(&path, 3).expect("create");
|
|
7945
8154
|
for i in 0..5 {
|
|
7946
|
-
db.upsert(
|
|
7947
|
-
|
|
7948
|
-
vec![1.0, 0.0, 0.0],
|
|
7949
|
-
Metadata::new(),
|
|
7950
|
-
)
|
|
7951
|
-
.expect("upsert");
|
|
8155
|
+
db.upsert(&format!("doc{i}"), vec![1.0, 0.0, 0.0], Metadata::new())
|
|
8156
|
+
.expect("upsert");
|
|
7952
8157
|
}
|
|
7953
8158
|
|
|
7954
8159
|
// First page of 2
|
|
@@ -7983,12 +8188,22 @@ mod tests {
|
|
|
7983
8188
|
let path = temp_file("cursor-ns");
|
|
7984
8189
|
let mut db = Database::create(&path, 3).expect("create");
|
|
7985
8190
|
for i in 0..3 {
|
|
7986
|
-
db.upsert_in_namespace(
|
|
7987
|
-
|
|
8191
|
+
db.upsert_in_namespace(
|
|
8192
|
+
"ns1",
|
|
8193
|
+
&format!("doc{i}"),
|
|
8194
|
+
vec![1.0, 0.0, 0.0],
|
|
8195
|
+
Metadata::new(),
|
|
8196
|
+
)
|
|
8197
|
+
.expect("upsert");
|
|
7988
8198
|
}
|
|
7989
8199
|
for i in 0..2 {
|
|
7990
|
-
db.upsert_in_namespace(
|
|
7991
|
-
|
|
8200
|
+
db.upsert_in_namespace(
|
|
8201
|
+
"ns2",
|
|
8202
|
+
&format!("doc{i}"),
|
|
8203
|
+
vec![0.0, 1.0, 0.0],
|
|
8204
|
+
Metadata::new(),
|
|
8205
|
+
)
|
|
8206
|
+
.expect("upsert");
|
|
7992
8207
|
}
|
|
7993
8208
|
|
|
7994
8209
|
let (page1, cursor1) = db.list_cursor(Some("ns1"), None, 2, None);
|
|
@@ -8007,12 +8222,8 @@ mod tests {
|
|
|
8007
8222
|
let path = temp_file("cursor-ttl");
|
|
8008
8223
|
let mut db = Database::create(&path, 3).expect("create");
|
|
8009
8224
|
for i in 0..5 {
|
|
8010
|
-
db.upsert(
|
|
8011
|
-
|
|
8012
|
-
vec![1.0, 0.0, 0.0],
|
|
8013
|
-
Metadata::new(),
|
|
8014
|
-
)
|
|
8015
|
-
.expect("upsert");
|
|
8225
|
+
db.upsert(&format!("doc{i}"), vec![1.0, 0.0, 0.0], Metadata::new())
|
|
8226
|
+
.expect("upsert");
|
|
8016
8227
|
}
|
|
8017
8228
|
|
|
8018
8229
|
// Expire doc1
|
|
@@ -8045,4 +8256,153 @@ mod tests {
|
|
|
8045
8256
|
assert!(cursor.is_none());
|
|
8046
8257
|
cleanup(&path);
|
|
8047
8258
|
}
|
|
8259
|
+
|
|
8260
|
+
// ---------------------------------------------------------------
|
|
8261
|
+
// Bug #14: zero-norm query vector should be rejected for cosine and dot product (Euclidean is still allowed)
|
|
8262
|
+
// ---------------------------------------------------------------
|
|
8263
|
+
|
|
8264
|
+
#[test]
|
|
8265
|
+
fn search_zero_norm_query_cosine_rejected() {
|
|
8266
|
+
let path = temp_file("zero-norm-cosine");
|
|
8267
|
+
let mut db = Database::create(&path, 3).expect("create");
|
|
8268
|
+
db.insert("a", vec![1.0, 0.0, 0.0], Metadata::new())
|
|
8269
|
+
.expect("insert");
|
|
8270
|
+
|
|
8271
|
+
let result = db.search(
|
|
8272
|
+
&[0.0, 0.0, 0.0],
|
|
8273
|
+
SearchOptions {
|
|
8274
|
+
top_k: 5,
|
|
8275
|
+
..Default::default()
|
|
8276
|
+
},
|
|
8277
|
+
);
|
|
8278
|
+
assert!(result.is_err(), "zero-norm cosine search should fail");
|
|
8279
|
+
let err_msg = result.unwrap_err().to_string();
|
|
8280
|
+
assert!(
|
|
8281
|
+
err_msg.contains("zero norm"),
|
|
8282
|
+
"error should mention zero norm: {err_msg}"
|
|
8283
|
+
);
|
|
8284
|
+
cleanup(&path);
|
|
8285
|
+
}
|
|
8286
|
+
|
|
8287
|
+
#[test]
|
|
8288
|
+
fn search_zero_norm_query_dotproduct_rejected() {
|
|
8289
|
+
let path = temp_file("zero-norm-dot");
|
|
8290
|
+
let mut db =
|
|
8291
|
+
Database::create_with_metric(&path, 3, DistanceMetric::DotProduct).expect("create");
|
|
8292
|
+
db.insert("a", vec![1.0, 0.0, 0.0], Metadata::new())
|
|
8293
|
+
.expect("insert");
|
|
8294
|
+
|
|
8295
|
+
let result = db.search(
|
|
8296
|
+
&[0.0, 0.0, 0.0],
|
|
8297
|
+
SearchOptions {
|
|
8298
|
+
top_k: 5,
|
|
8299
|
+
..Default::default()
|
|
8300
|
+
},
|
|
8301
|
+
);
|
|
8302
|
+
assert!(result.is_err(), "zero-norm dotproduct search should fail");
|
|
8303
|
+
cleanup(&path);
|
|
8304
|
+
}
|
|
8305
|
+
|
|
8306
|
+
#[test]
|
|
8307
|
+
fn search_zero_norm_query_euclidean_allowed() {
|
|
8308
|
+
let path = temp_file("zero-norm-euclidean");
|
|
8309
|
+
let mut db =
|
|
8310
|
+
Database::create_with_metric(&path, 3, DistanceMetric::Euclidean).expect("create");
|
|
8311
|
+
db.insert("a", vec![1.0, 0.0, 0.0], Metadata::new())
|
|
8312
|
+
.expect("insert");
|
|
8313
|
+
|
|
8314
|
+
// Euclidean distance from the origin is well-defined; should succeed.
|
|
8315
|
+
let result = db.search(
|
|
8316
|
+
&[0.0, 0.0, 0.0],
|
|
8317
|
+
SearchOptions {
|
|
8318
|
+
top_k: 5,
|
|
8319
|
+
..Default::default()
|
|
8320
|
+
},
|
|
8321
|
+
);
|
|
8322
|
+
assert!(
|
|
8323
|
+
result.is_ok(),
|
|
8324
|
+
"zero-norm euclidean search should succeed: {:?}",
|
|
8325
|
+
result.err()
|
|
8326
|
+
);
|
|
8327
|
+
cleanup(&path);
|
|
8328
|
+
}
|
|
8329
|
+
|
|
8330
|
+
// ---------------------------------------------------------------
|
|
8331
|
+
// Bug #15: dimension mismatch in search query should be rejected
|
|
8332
|
+
// ---------------------------------------------------------------
|
|
8333
|
+
|
|
8334
|
+
#[test]
|
|
8335
|
+
fn search_undersized_query_rejected() {
|
|
8336
|
+
let path = temp_file("dim-under");
|
|
8337
|
+
let mut db = Database::create(&path, 4).expect("create");
|
|
8338
|
+
db.insert("a", vec![1.0, 0.0, 0.0, 0.0], Metadata::new())
|
|
8339
|
+
.expect("insert");
|
|
8340
|
+
|
|
8341
|
+
// Query dim=2 on a dim=4 database without truncate_dim.
|
|
8342
|
+
let result = db.search(
|
|
8343
|
+
&[1.0, 0.0],
|
|
8344
|
+
SearchOptions {
|
|
8345
|
+
top_k: 5,
|
|
8346
|
+
..Default::default()
|
|
8347
|
+
},
|
|
8348
|
+
);
|
|
8349
|
+
assert!(result.is_err(), "undersized query should fail");
|
|
8350
|
+
match result.unwrap_err() {
|
|
8351
|
+
VectLiteError::DimensionMismatch { expected, found } => {
|
|
8352
|
+
assert_eq!(expected, 4);
|
|
8353
|
+
assert_eq!(found, 2);
|
|
8354
|
+
}
|
|
8355
|
+
other => panic!("expected DimensionMismatch, got: {other}"),
|
|
8356
|
+
}
|
|
8357
|
+
cleanup(&path);
|
|
8358
|
+
}
|
|
8359
|
+
|
|
8360
|
+
#[test]
|
|
8361
|
+
fn search_oversized_query_rejected() {
|
|
8362
|
+
let path = temp_file("dim-over");
|
|
8363
|
+
let mut db = Database::create(&path, 3).expect("create");
|
|
8364
|
+
db.insert("a", vec![1.0, 0.0, 0.0], Metadata::new())
|
|
8365
|
+
.expect("insert");
|
|
8366
|
+
|
|
8367
|
+
let result = db.search(
|
|
8368
|
+
&[1.0, 0.0, 0.0, 0.0, 0.0],
|
|
8369
|
+
SearchOptions {
|
|
8370
|
+
top_k: 5,
|
|
8371
|
+
..Default::default()
|
|
8372
|
+
},
|
|
8373
|
+
);
|
|
8374
|
+
assert!(result.is_err(), "oversized query should fail");
|
|
8375
|
+
match result.unwrap_err() {
|
|
8376
|
+
VectLiteError::DimensionMismatch { expected, found } => {
|
|
8377
|
+
assert_eq!(expected, 3);
|
|
8378
|
+
assert_eq!(found, 5);
|
|
8379
|
+
}
|
|
8380
|
+
other => panic!("expected DimensionMismatch, got: {other}"),
|
|
8381
|
+
}
|
|
8382
|
+
cleanup(&path);
|
|
8383
|
+
}
|
|
8384
|
+
|
|
8385
|
+
#[test]
|
|
8386
|
+
fn search_undersized_query_with_truncate_dim_allowed() {
|
|
8387
|
+
let path = temp_file("dim-matryoshka");
|
|
8388
|
+
let mut db = Database::create(&path, 4).expect("create");
|
|
8389
|
+
db.insert("a", vec![1.0, 0.0, 0.0, 0.0], Metadata::new())
|
|
8390
|
+
.expect("insert");
|
|
8391
|
+
|
|
8392
|
+
// With explicit truncate_dim, undersized queries are Matryoshka-truncated.
|
|
8393
|
+
let result = db.search(
|
|
8394
|
+
&[1.0, 0.0],
|
|
8395
|
+
SearchOptions {
|
|
8396
|
+
top_k: 5,
|
|
8397
|
+
truncate_dim: Some(2),
|
|
8398
|
+
..Default::default()
|
|
8399
|
+
},
|
|
8400
|
+
);
|
|
8401
|
+
assert!(
|
|
8402
|
+
result.is_ok(),
|
|
8403
|
+
"truncate_dim query should succeed: {:?}",
|
|
8404
|
+
result.err()
|
|
8405
|
+
);
|
|
8406
|
+
cleanup(&path);
|
|
8407
|
+
}
|
|
8048
8408
|
}
|