faiss 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/LICENSE.txt +1 -1
- data/README.md +1 -1
- data/ext/faiss/extconf.rb +9 -2
- data/ext/faiss/index.cpp +1 -1
- data/ext/faiss/index_binary.cpp +2 -2
- data/ext/faiss/product_quantizer.cpp +1 -1
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +7 -7
- data/vendor/faiss/faiss/AutoTune.h +0 -1
- data/vendor/faiss/faiss/Clustering.cpp +4 -18
- data/vendor/faiss/faiss/Clustering.h +31 -21
- data/vendor/faiss/faiss/IVFlib.cpp +22 -11
- data/vendor/faiss/faiss/Index.cpp +1 -1
- data/vendor/faiss/faiss/Index.h +20 -5
- data/vendor/faiss/faiss/Index2Layer.cpp +7 -7
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +176 -166
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +15 -15
- data/vendor/faiss/faiss/IndexBinary.cpp +9 -4
- data/vendor/faiss/faiss/IndexBinary.h +8 -19
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +2 -1
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +24 -31
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +25 -50
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +106 -187
- data/vendor/faiss/faiss/IndexFastScan.cpp +90 -159
- data/vendor/faiss/faiss/IndexFastScan.h +9 -8
- data/vendor/faiss/faiss/IndexFlat.cpp +195 -3
- data/vendor/faiss/faiss/IndexFlat.h +20 -1
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +11 -0
- data/vendor/faiss/faiss/IndexFlatCodes.h +3 -1
- data/vendor/faiss/faiss/IndexHNSW.cpp +112 -316
- data/vendor/faiss/faiss/IndexHNSW.h +12 -48
- data/vendor/faiss/faiss/IndexIDMap.cpp +69 -28
- data/vendor/faiss/faiss/IndexIDMap.h +24 -2
- data/vendor/faiss/faiss/IndexIVF.cpp +159 -53
- data/vendor/faiss/faiss/IndexIVF.h +37 -5
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +18 -26
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +3 -2
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +19 -46
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -3
- data/vendor/faiss/faiss/IndexIVFFastScan.cpp +433 -405
- data/vendor/faiss/faiss/IndexIVFFastScan.h +56 -26
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +15 -5
- data/vendor/faiss/faiss/IndexIVFFlat.h +3 -2
- data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +172 -0
- data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.h +56 -0
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +78 -122
- data/vendor/faiss/faiss/IndexIVFPQ.h +6 -7
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +18 -50
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +4 -3
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +45 -29
- data/vendor/faiss/faiss/IndexIVFPQR.h +5 -2
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +25 -27
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +6 -6
- data/vendor/faiss/faiss/IndexLSH.cpp +14 -16
- data/vendor/faiss/faiss/IndexNNDescent.cpp +3 -4
- data/vendor/faiss/faiss/IndexNSG.cpp +11 -27
- data/vendor/faiss/faiss/IndexNSG.h +10 -10
- data/vendor/faiss/faiss/IndexPQ.cpp +72 -88
- data/vendor/faiss/faiss/IndexPQ.h +1 -4
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +1 -1
- data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -31
- data/vendor/faiss/faiss/IndexRefine.cpp +49 -19
- data/vendor/faiss/faiss/IndexRefine.h +7 -0
- data/vendor/faiss/faiss/IndexReplicas.cpp +23 -26
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +22 -16
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +6 -4
- data/vendor/faiss/faiss/IndexShards.cpp +21 -29
- data/vendor/faiss/faiss/IndexShardsIVF.cpp +1 -2
- data/vendor/faiss/faiss/MatrixStats.cpp +17 -32
- data/vendor/faiss/faiss/MatrixStats.h +21 -9
- data/vendor/faiss/faiss/MetaIndexes.cpp +35 -35
- data/vendor/faiss/faiss/VectorTransform.cpp +13 -26
- data/vendor/faiss/faiss/VectorTransform.h +7 -7
- data/vendor/faiss/faiss/clone_index.cpp +15 -10
- data/vendor/faiss/faiss/clone_index.h +3 -0
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +87 -4
- data/vendor/faiss/faiss/gpu/GpuCloner.h +22 -0
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +7 -0
- data/vendor/faiss/faiss/gpu/GpuDistance.h +46 -38
- data/vendor/faiss/faiss/gpu/GpuIndex.h +28 -4
- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +4 -4
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +8 -9
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +18 -3
- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -11
- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +1 -3
- data/vendor/faiss/faiss/gpu/GpuResources.cpp +24 -3
- data/vendor/faiss/faiss/gpu/GpuResources.h +39 -11
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +117 -17
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +57 -3
- data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +1 -1
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +25 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +129 -9
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +267 -40
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +299 -208
- data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +1 -0
- data/vendor/faiss/faiss/gpu/utils/RaftUtils.h +75 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +3 -1
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +5 -5
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +1 -1
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +1 -2
- data/vendor/faiss/faiss/impl/DistanceComputer.h +24 -1
- data/vendor/faiss/faiss/impl/FaissException.h +13 -34
- data/vendor/faiss/faiss/impl/HNSW.cpp +321 -70
- data/vendor/faiss/faiss/impl/HNSW.h +9 -8
- data/vendor/faiss/faiss/impl/IDSelector.h +4 -4
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +3 -1
- data/vendor/faiss/faiss/impl/NNDescent.cpp +29 -19
- data/vendor/faiss/faiss/impl/NSG.h +1 -1
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +14 -12
- data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +1 -1
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +24 -22
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +1 -1
- data/vendor/faiss/faiss/impl/Quantizer.h +1 -1
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +27 -1015
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +5 -63
- data/vendor/faiss/faiss/impl/ResultHandler.h +232 -176
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +444 -104
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +0 -8
- data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +280 -42
- data/vendor/faiss/faiss/impl/code_distance/code_distance-generic.h +21 -14
- data/vendor/faiss/faiss/impl/code_distance/code_distance.h +22 -12
- data/vendor/faiss/faiss/impl/index_read.cpp +45 -19
- data/vendor/faiss/faiss/impl/index_write.cpp +60 -41
- data/vendor/faiss/faiss/impl/io.cpp +10 -10
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -1
- data/vendor/faiss/faiss/impl/platform_macros.h +18 -1
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +3 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +7 -6
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +52 -38
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +40 -49
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +960 -0
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +176 -0
- data/vendor/faiss/faiss/impl/simd_result_handlers.h +374 -202
- data/vendor/faiss/faiss/index_factory.cpp +10 -7
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +1 -1
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +27 -9
- data/vendor/faiss/faiss/invlists/InvertedLists.h +12 -3
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +3 -3
- data/vendor/faiss/faiss/python/python_callbacks.cpp +1 -1
- data/vendor/faiss/faiss/utils/Heap.cpp +3 -1
- data/vendor/faiss/faiss/utils/WorkerThread.h +1 -0
- data/vendor/faiss/faiss/utils/distances.cpp +128 -74
- data/vendor/faiss/faiss/utils/distances.h +81 -4
- data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +5 -5
- data/vendor/faiss/faiss/utils/distances_fused/avx512.h +2 -2
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +2 -2
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +1 -1
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +5 -5
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.h +1 -1
- data/vendor/faiss/faiss/utils/distances_simd.cpp +428 -70
- data/vendor/faiss/faiss/utils/fp16-arm.h +29 -0
- data/vendor/faiss/faiss/utils/fp16.h +2 -0
- data/vendor/faiss/faiss/utils/hamming.cpp +162 -110
- data/vendor/faiss/faiss/utils/hamming.h +58 -0
- data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +16 -89
- data/vendor/faiss/faiss/utils/hamming_distance/common.h +1 -0
- data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +15 -87
- data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +57 -0
- data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +14 -104
- data/vendor/faiss/faiss/utils/partitioning.cpp +3 -4
- data/vendor/faiss/faiss/utils/prefetch.h +77 -0
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +0 -14
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +0 -6
- data/vendor/faiss/faiss/utils/simdlib_neon.h +72 -77
- data/vendor/faiss/faiss/utils/sorting.cpp +140 -5
- data/vendor/faiss/faiss/utils/sorting.h +27 -0
- data/vendor/faiss/faiss/utils/utils.cpp +112 -6
- data/vendor/faiss/faiss/utils/utils.h +57 -20
- metadata +10 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e41b15bbcda6c4d2a250df5b98d86e9baf51b34b90fc2fccb6f0a37f486ef417
|
|
4
|
+
data.tar.gz: 768074275062ed45f1752e3a5c9d55a9695a6aa453e925aa0a6e607ce3215bab
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: cecc466dd24e03206219b63e750e48b554355c1c5dfc8e911879988a6f31eb628617133f5b584b3de29efcbe65d087cf5b4e219371cee959e8248c989a4dbffc
|
|
7
|
+
data.tar.gz: 3e0c6be53825949f9c51a0195d85cbed87bc198dd06852c88c537b13e6bcc8e7fa65a3e3c88667eefef44e95278fe2c73ece89d5f92bd24f8c0d27b543488b56
|
data/CHANGELOG.md
CHANGED
data/LICENSE.txt
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
MIT License
|
|
2
2
|
|
|
3
3
|
Copyright (c) Facebook, Inc. and its affiliates.
|
|
4
|
-
Copyright (c) 2020-
|
|
4
|
+
Copyright (c) 2020-2024 Andrew Kane
|
|
5
5
|
|
|
6
6
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
7
|
of this software and associated documentation files (the "Software"), to deal
|
data/README.md
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
Learn more about [Faiss](https://engineering.fb.com/data-infrastructure/faiss-a-library-for-efficient-similarity-search/)
|
|
6
6
|
|
|
7
|
-
[](https://github.com/ankane/faiss-ruby/actions)
|
|
8
8
|
|
|
9
9
|
## Installation
|
|
10
10
|
|
data/ext/faiss/extconf.rb
CHANGED
|
@@ -19,9 +19,16 @@ abort "Numo not found" unless find_header("numo/narray.h", numo)
|
|
|
19
19
|
# for https://bugs.ruby-lang.org/issues/19005
|
|
20
20
|
$LDFLAGS += " -Wl,-undefined,dynamic_lookup" if RbConfig::CONFIG["host_os"] =~ /darwin/i
|
|
21
21
|
|
|
22
|
+
$CXXFLAGS += " -std=c++17 $(optflags) -DFINTEGER=int"
|
|
23
|
+
$CXXFLAGS += " -Wall -Wno-unused-parameter -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-sign-compare"
|
|
24
|
+
|
|
22
25
|
# -march=native not supported with ARM Mac
|
|
23
|
-
default_optflags = RbConfig::CONFIG["host_os"] =~ /darwin/i && RbConfig::CONFIG["host_cpu"] =~ /arm|aarch64/i ? "" : "-march=native"
|
|
24
|
-
$CXXFLAGS
|
|
26
|
+
default_optflags = RbConfig::CONFIG["host_os"] =~ /darwin/i && RbConfig::CONFIG["host_cpu"] =~ /arm|aarch64/i ? "" : " -march=native"
|
|
27
|
+
$CXXFLAGS += with_config("optflags", default_optflags)
|
|
28
|
+
|
|
29
|
+
apple_clang = RbConfig::CONFIG["CC_VERSION_MESSAGE"] =~ /apple clang/i
|
|
30
|
+
$CXXFLAGS += " -Xclang" if apple_clang
|
|
31
|
+
$CXXFLAGS += " -fopenmp"
|
|
25
32
|
|
|
26
33
|
ext = File.expand_path(".", __dir__)
|
|
27
34
|
vendor = File.expand_path("../../vendor/faiss", __dir__)
|
data/ext/faiss/index.cpp
CHANGED
|
@@ -157,7 +157,7 @@ void init_index(Rice::Module& m) {
|
|
|
157
157
|
"load",
|
|
158
158
|
[](Rice::String fname) {
|
|
159
159
|
return faiss::read_index(fname.c_str());
|
|
160
|
-
});
|
|
160
|
+
}, Rice::Return().takeOwnership());
|
|
161
161
|
|
|
162
162
|
Rice::define_class_under<faiss::IndexFlatL2, faiss::Index>(m, "IndexFlatL2")
|
|
163
163
|
.define_constructor(Rice::Constructor<faiss::IndexFlatL2, int64_t>());
|
data/ext/faiss/index_binary.cpp
CHANGED
|
@@ -59,7 +59,7 @@ void init_index_binary(Rice::Module& m) {
|
|
|
59
59
|
"load",
|
|
60
60
|
[](Rice::String fname) {
|
|
61
61
|
return faiss::read_index_binary(fname.c_str());
|
|
62
|
-
});
|
|
62
|
+
}, Rice::Return().takeOwnership());
|
|
63
63
|
|
|
64
64
|
Rice::define_class_under<faiss::IndexBinaryFlat, faiss::IndexBinary>(m, "IndexBinaryFlat")
|
|
65
65
|
.define_constructor(Rice::Constructor<faiss::IndexBinaryFlat, int64_t>());
|
|
@@ -71,5 +71,5 @@ void init_index_binary(Rice::Module& m) {
|
|
|
71
71
|
"index_binary_factory",
|
|
72
72
|
[](int d, Rice::String description) {
|
|
73
73
|
return faiss::index_binary_factory(d, description.c_str());
|
|
74
|
-
});
|
|
74
|
+
}, Rice::Return().takeOwnership());
|
|
75
75
|
}
|
data/lib/faiss/version.rb
CHANGED
|
@@ -152,12 +152,10 @@ bool OperatingPoints::add(
|
|
|
152
152
|
return false;
|
|
153
153
|
}
|
|
154
154
|
}
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
a.erase(a.begin() + (i - 1));
|
|
160
|
-
i--;
|
|
155
|
+
// remove non-optimal points from array
|
|
156
|
+
for (int i = a.size() - 1; i > 0; --i) {
|
|
157
|
+
if (a[i].t < a[i - 1].t) {
|
|
158
|
+
a.erase(a.begin() + (i - 1));
|
|
161
159
|
}
|
|
162
160
|
}
|
|
163
161
|
return true;
|
|
@@ -286,6 +284,8 @@ std::string ParameterSpace::combination_name(size_t cno) const {
|
|
|
286
284
|
char buf[1000], *wp = buf;
|
|
287
285
|
*wp = 0;
|
|
288
286
|
for (int i = 0; i < parameter_ranges.size(); i++) {
|
|
287
|
+
FAISS_THROW_IF_NOT_MSG(
|
|
288
|
+
buf + 1000 - wp >= 0, "Overflow detected in snprintf");
|
|
289
289
|
const ParameterRange& pr = parameter_ranges[i];
|
|
290
290
|
size_t j = cno % pr.values.size();
|
|
291
291
|
cno /= pr.values.size();
|
|
@@ -334,7 +334,7 @@ ParameterRange& ParameterSpace::add_range(const std::string& name) {
|
|
|
334
334
|
return pr;
|
|
335
335
|
}
|
|
336
336
|
}
|
|
337
|
-
parameter_ranges.
|
|
337
|
+
parameter_ranges.emplace_back();
|
|
338
338
|
parameter_ranges.back().name = name;
|
|
339
339
|
return parameter_ranges.back();
|
|
340
340
|
}
|
|
@@ -27,20 +27,6 @@
|
|
|
27
27
|
|
|
28
28
|
namespace faiss {
|
|
29
29
|
|
|
30
|
-
ClusteringParameters::ClusteringParameters()
|
|
31
|
-
: niter(25),
|
|
32
|
-
nredo(1),
|
|
33
|
-
verbose(false),
|
|
34
|
-
spherical(false),
|
|
35
|
-
int_centroids(false),
|
|
36
|
-
update_index(false),
|
|
37
|
-
frozen_centroids(false),
|
|
38
|
-
min_points_per_centroid(39),
|
|
39
|
-
max_points_per_centroid(256),
|
|
40
|
-
seed(1234),
|
|
41
|
-
decode_block_size(32768) {}
|
|
42
|
-
// 39 corresponds to 10000 / 256 -> to avoid warnings on PQ tests with randu10k
|
|
43
|
-
|
|
44
30
|
Clustering::Clustering(int d, int k) : d(d), k(k) {}
|
|
45
31
|
|
|
46
32
|
Clustering::Clustering(int d, int k, const ClusteringParameters& cp)
|
|
@@ -231,7 +217,7 @@ int split_clusters(
|
|
|
231
217
|
for (size_t ci = 0; ci < k; ci++) {
|
|
232
218
|
if (hassign[ci] == 0) { /* need to redefine a centroid */
|
|
233
219
|
size_t cj;
|
|
234
|
-
for (cj = 0;
|
|
220
|
+
for (cj = 0; true; cj = (cj + 1) % k) {
|
|
235
221
|
/* probability to pick this cluster for split */
|
|
236
222
|
float p = (hassign[cj] - 1.0) / (float)(n - k);
|
|
237
223
|
float r = rng.rand_float();
|
|
@@ -264,7 +250,7 @@ int split_clusters(
|
|
|
264
250
|
return nsplit;
|
|
265
251
|
}
|
|
266
252
|
|
|
267
|
-
}
|
|
253
|
+
} // namespace
|
|
268
254
|
|
|
269
255
|
void Clustering::train_encoded(
|
|
270
256
|
idx_t nx,
|
|
@@ -590,7 +576,7 @@ float kmeans_clustering(
|
|
|
590
576
|
const float* x,
|
|
591
577
|
float* centroids) {
|
|
592
578
|
Clustering clus(d, k);
|
|
593
|
-
clus.verbose = d * n * k > (
|
|
579
|
+
clus.verbose = d * n * k > (size_t(1) << 30);
|
|
594
580
|
// display logs if > 1Gflop per iteration
|
|
595
581
|
IndexFlatL2 index(d);
|
|
596
582
|
clus.train(n, x, index);
|
|
@@ -631,7 +617,7 @@ void copy_columns(idx_t n, idx_t d1, const float* src, idx_t d2, float* dest) {
|
|
|
631
617
|
}
|
|
632
618
|
}
|
|
633
619
|
|
|
634
|
-
}
|
|
620
|
+
} // namespace
|
|
635
621
|
|
|
636
622
|
void ProgressiveDimClustering::train(
|
|
637
623
|
idx_t n,
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
* LICENSE file in the root directory of this source tree.
|
|
6
6
|
*/
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
/** Implementation of k-means clustering with many variants. */
|
|
9
9
|
|
|
10
10
|
#ifndef FAISS_CLUSTERING_H
|
|
11
11
|
#define FAISS_CLUSTERING_H
|
|
@@ -19,25 +19,35 @@ namespace faiss {
|
|
|
19
19
|
* constructor of the Clustering object.
|
|
20
20
|
*/
|
|
21
21
|
struct ClusteringParameters {
|
|
22
|
-
|
|
23
|
-
int
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
bool
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
22
|
+
/// number of clustering iterations
|
|
23
|
+
int niter = 25;
|
|
24
|
+
/// redo clustering this many times and keep the clusters with the best
|
|
25
|
+
/// objective
|
|
26
|
+
int nredo = 1;
|
|
27
|
+
|
|
28
|
+
bool verbose = false;
|
|
29
|
+
/// whether to normalize centroids after each iteration (useful for inner
|
|
30
|
+
/// product clustering)
|
|
31
|
+
bool spherical = false;
|
|
32
|
+
/// round centroids coordinates to integer after each iteration?
|
|
33
|
+
bool int_centroids = false;
|
|
34
|
+
/// re-train index after each iteration?
|
|
35
|
+
bool update_index = false;
|
|
36
|
+
|
|
37
|
+
/// Use the subset of centroids provided as input and do not change them
|
|
38
|
+
/// during iterations
|
|
39
|
+
bool frozen_centroids = false;
|
|
40
|
+
/// If fewer than this number of training vectors per centroid are provided,
|
|
41
|
+
/// writes a warning. Note that fewer than 1 point per centroid raises an
|
|
42
|
+
/// exception.
|
|
43
|
+
int min_points_per_centroid = 39;
|
|
44
|
+
/// to limit size of dataset, otherwise the training set is subsampled
|
|
45
|
+
int max_points_per_centroid = 256;
|
|
46
|
+
/// seed for the random number generator
|
|
47
|
+
int seed = 1234;
|
|
48
|
+
|
|
49
|
+
/// when the training set is encoded, batch size of the codec decoder
|
|
50
|
+
size_t decode_block_size = 32768;
|
|
41
51
|
};
|
|
42
52
|
|
|
43
53
|
struct ClusteringIterationStats {
|
|
@@ -94,7 +104,7 @@ struct Clustering : ClusteringParameters {
|
|
|
94
104
|
* to decode the input vectors.
|
|
95
105
|
*
|
|
96
106
|
* @param codec codec used to decode the vectors (nullptr =
|
|
97
|
-
* vectors are in fact floats)
|
|
107
|
+
* vectors are in fact floats)
|
|
98
108
|
*/
|
|
99
109
|
void train_encoded(
|
|
100
110
|
idx_t nx,
|
|
@@ -12,7 +12,9 @@
|
|
|
12
12
|
|
|
13
13
|
#include <faiss/IndexAdditiveQuantizer.h>
|
|
14
14
|
#include <faiss/IndexIVFAdditiveQuantizer.h>
|
|
15
|
+
#include <faiss/IndexIVFIndependentQuantizer.h>
|
|
15
16
|
#include <faiss/IndexPreTransform.h>
|
|
17
|
+
#include <faiss/IndexRefine.h>
|
|
16
18
|
#include <faiss/MetaIndexes.h>
|
|
17
19
|
#include <faiss/impl/FaissAssert.h>
|
|
18
20
|
#include <faiss/utils/distances.h>
|
|
@@ -57,20 +59,29 @@ void check_compatible_for_merge(const Index* index0, const Index* index1) {
|
|
|
57
59
|
}
|
|
58
60
|
|
|
59
61
|
const IndexIVF* try_extract_index_ivf(const Index* index) {
|
|
60
|
-
|
|
61
|
-
|
|
62
|
+
auto* ivf = dynamic_cast<const IndexIVF*>(index);
|
|
63
|
+
if (ivf != nullptr) {
|
|
64
|
+
return ivf;
|
|
62
65
|
}
|
|
63
66
|
|
|
67
|
+
if (auto* pt = dynamic_cast<const IndexPreTransform*>(index)) {
|
|
68
|
+
return try_extract_index_ivf(pt->index);
|
|
69
|
+
}
|
|
64
70
|
if (auto* idmap = dynamic_cast<const IndexIDMap*>(index)) {
|
|
65
|
-
|
|
71
|
+
return try_extract_index_ivf(idmap->index);
|
|
66
72
|
}
|
|
67
73
|
if (auto* idmap = dynamic_cast<const IndexIDMap2*>(index)) {
|
|
68
|
-
|
|
74
|
+
return try_extract_index_ivf(idmap->index);
|
|
75
|
+
}
|
|
76
|
+
if (auto* indep =
|
|
77
|
+
dynamic_cast<const IndexIVFIndependentQuantizer*>(index)) {
|
|
78
|
+
return try_extract_index_ivf(indep->index_ivf);
|
|
79
|
+
}
|
|
80
|
+
if (auto* refine = dynamic_cast<const IndexRefine*>(index)) {
|
|
81
|
+
return try_extract_index_ivf(refine->base_index);
|
|
69
82
|
}
|
|
70
83
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
return ivf;
|
|
84
|
+
return nullptr;
|
|
74
85
|
}
|
|
75
86
|
|
|
76
87
|
IndexIVF* try_extract_index_ivf(Index* index) {
|
|
@@ -321,14 +332,14 @@ void search_with_parameters(
|
|
|
321
332
|
double* ms_per_stage) {
|
|
322
333
|
FAISS_THROW_IF_NOT(params);
|
|
323
334
|
const float* prev_x = x;
|
|
324
|
-
|
|
335
|
+
std::unique_ptr<const float[]> del;
|
|
325
336
|
|
|
326
337
|
double t0 = getmillisecs();
|
|
327
338
|
|
|
328
339
|
if (auto ip = dynamic_cast<const IndexPreTransform*>(index)) {
|
|
329
340
|
x = ip->apply_chain(n, x);
|
|
330
341
|
if (x != prev_x) {
|
|
331
|
-
del.
|
|
342
|
+
del.reset(x);
|
|
332
343
|
}
|
|
333
344
|
index = ip->index;
|
|
334
345
|
}
|
|
@@ -371,14 +382,14 @@ void range_search_with_parameters(
|
|
|
371
382
|
double* ms_per_stage) {
|
|
372
383
|
FAISS_THROW_IF_NOT(params);
|
|
373
384
|
const float* prev_x = x;
|
|
374
|
-
|
|
385
|
+
std::unique_ptr<const float[]> del;
|
|
375
386
|
|
|
376
387
|
double t0 = getmillisecs();
|
|
377
388
|
|
|
378
389
|
if (auto ip = dynamic_cast<const IndexPreTransform*>(index)) {
|
|
379
390
|
x = ip->apply_chain(n, x);
|
|
380
391
|
if (x != prev_x) {
|
|
381
|
-
del.
|
|
392
|
+
del.reset(x);
|
|
382
393
|
}
|
|
383
394
|
index = ip->index;
|
|
384
395
|
}
|
data/vendor/faiss/faiss/Index.h
CHANGED
|
@@ -17,8 +17,8 @@
|
|
|
17
17
|
#include <typeinfo>
|
|
18
18
|
|
|
19
19
|
#define FAISS_VERSION_MAJOR 1
|
|
20
|
-
#define FAISS_VERSION_MINOR
|
|
21
|
-
#define FAISS_VERSION_PATCH
|
|
20
|
+
#define FAISS_VERSION_MINOR 8
|
|
21
|
+
#define FAISS_VERSION_PATCH 0
|
|
22
22
|
|
|
23
23
|
/**
|
|
24
24
|
* @namespace faiss
|
|
@@ -99,6 +99,7 @@ struct Index {
|
|
|
99
99
|
* Vectors are implicitly assigned labels ntotal .. ntotal + n - 1
|
|
100
100
|
* This function slices the input vectors in chunks smaller than
|
|
101
101
|
* blocksize_add and calls add_core.
|
|
102
|
+
* @param n number of vectors
|
|
102
103
|
* @param x input matrix, size n * d
|
|
103
104
|
*/
|
|
104
105
|
virtual void add(idx_t n, const float* x) = 0;
|
|
@@ -108,7 +109,9 @@ struct Index {
|
|
|
108
109
|
* The default implementation fails with an assertion, as it is
|
|
109
110
|
* not supported by all indexes.
|
|
110
111
|
*
|
|
111
|
-
* @param
|
|
112
|
+
* @param n number of vectors
|
|
113
|
+
* @param x input vectors, size n * d
|
|
114
|
+
* @param xids if non-null, ids to store for the vectors (size n)
|
|
112
115
|
*/
|
|
113
116
|
virtual void add_with_ids(idx_t n, const float* x, const idx_t* xids);
|
|
114
117
|
|
|
@@ -117,9 +120,11 @@ struct Index {
|
|
|
117
120
|
* return at most k vectors. If there are not enough results for a
|
|
118
121
|
* query, the result array is padded with -1s.
|
|
119
122
|
*
|
|
123
|
+
* @param n number of vectors
|
|
120
124
|
* @param x input vectors to search, size n * d
|
|
121
|
-
* @param
|
|
125
|
+
* @param k number of extracted vectors
|
|
122
126
|
* @param distances output pairwise distances, size n*k
|
|
127
|
+
* @param labels output labels of the NNs, size n*k
|
|
123
128
|
*/
|
|
124
129
|
virtual void search(
|
|
125
130
|
idx_t n,
|
|
@@ -135,6 +140,7 @@ struct Index {
|
|
|
135
140
|
* indexes do not implement the range_search (only the k-NN search
|
|
136
141
|
* is mandatory).
|
|
137
142
|
*
|
|
143
|
+
* @param n number of vectors
|
|
138
144
|
* @param x input vectors to search, size n * d
|
|
139
145
|
* @param radius search radius
|
|
140
146
|
* @param result result table
|
|
@@ -149,8 +155,10 @@ struct Index {
|
|
|
149
155
|
/** return the indexes of the k vectors closest to the query x.
|
|
150
156
|
*
|
|
151
157
|
* This function is identical to search but only returns labels of neighbors.
|
|
158
|
+
* @param n number of vectors
|
|
152
159
|
* @param x input vectors to search, size n * d
|
|
153
160
|
* @param labels output labels of the NNs, size n*k
|
|
161
|
+
* @param k number of nearest neighbours
|
|
154
162
|
*/
|
|
155
163
|
virtual void assign(idx_t n, const float* x, idx_t* labels, idx_t k = 1)
|
|
156
164
|
const;
|
|
@@ -174,7 +182,7 @@ struct Index {
|
|
|
174
182
|
/** Reconstruct several stored vectors (or an approximation if lossy coding)
|
|
175
183
|
*
|
|
176
184
|
* this function may not be defined for some indexes
|
|
177
|
-
* @param n
|
|
185
|
+
* @param n number of vectors to reconstruct
|
|
178
186
|
* @param keys ids of the vectors to reconstruct (size n)
|
|
179
187
|
* @param recons reconstructed vector (size n * d)
|
|
180
188
|
*/
|
|
@@ -184,6 +192,8 @@ struct Index {
|
|
|
184
192
|
/** Reconstruct vectors i0 to i0 + ni - 1
|
|
185
193
|
*
|
|
186
194
|
* this function may not be defined for some indexes
|
|
195
|
+
* @param i0 index of the first vector in the sequence
|
|
196
|
+
* @param ni number of vectors in the sequence
|
|
187
197
|
* @param recons reconstructed vector (size ni * d)
|
|
188
198
|
*/
|
|
189
199
|
virtual void reconstruct_n(idx_t i0, idx_t ni, float* recons) const;
|
|
@@ -194,6 +204,11 @@ struct Index {
|
|
|
194
204
|
* If there are not enough results for a query, the resulting array
|
|
195
205
|
* is padded with -1s.
|
|
196
206
|
*
|
|
207
|
+
* @param n number of vectors
|
|
208
|
+
* @param x input vectors to search, size n * d
|
|
209
|
+
* @param k number of extracted vectors
|
|
210
|
+
* @param distances output pairwise distances, size n*k
|
|
211
|
+
* @param labels output labels of the NNs, size n*k
|
|
197
212
|
* @param recons reconstructed vectors size (n, k, d)
|
|
198
213
|
**/
|
|
199
214
|
virtual void search_and_reconstruct(
|
|
@@ -10,10 +10,10 @@
|
|
|
10
10
|
#include <faiss/Index2Layer.h>
|
|
11
11
|
|
|
12
12
|
#include <faiss/impl/platform_macros.h>
|
|
13
|
-
#include <stdint.h>
|
|
14
13
|
#include <cassert>
|
|
15
14
|
#include <cinttypes>
|
|
16
15
|
#include <cmath>
|
|
16
|
+
#include <cstdint>
|
|
17
17
|
#include <cstdio>
|
|
18
18
|
|
|
19
19
|
#ifdef __SSE3__
|
|
@@ -47,7 +47,7 @@ Index2Layer::Index2Layer(
|
|
|
47
47
|
pq(quantizer->d, M, nbit) {
|
|
48
48
|
is_trained = false;
|
|
49
49
|
for (int nbyte = 0; nbyte < 7; nbyte++) {
|
|
50
|
-
if ((
|
|
50
|
+
if (((size_t)1 << (8 * nbyte)) >= nlist) {
|
|
51
51
|
code_size_1 = nbyte;
|
|
52
52
|
break;
|
|
53
53
|
}
|
|
@@ -60,7 +60,7 @@ Index2Layer::Index2Layer() {
|
|
|
60
60
|
code_size = code_size_1 = code_size_2 = 0;
|
|
61
61
|
}
|
|
62
62
|
|
|
63
|
-
Index2Layer::~Index2Layer()
|
|
63
|
+
Index2Layer::~Index2Layer() = default;
|
|
64
64
|
|
|
65
65
|
void Index2Layer::train(idx_t n, const float* x) {
|
|
66
66
|
if (verbose) {
|
|
@@ -83,7 +83,7 @@ void Index2Layer::train(idx_t n, const float* x) {
|
|
|
83
83
|
verbose,
|
|
84
84
|
pq.cp.seed);
|
|
85
85
|
|
|
86
|
-
|
|
86
|
+
std::unique_ptr<const float[]> del_x(x_in == x ? nullptr : x);
|
|
87
87
|
|
|
88
88
|
std::vector<idx_t> assign(n); // assignement to coarse centroids
|
|
89
89
|
q1.quantizer->assign(n, x, assign.data());
|
|
@@ -179,7 +179,7 @@ struct DistanceXPQ4 : Distance2Level {
|
|
|
179
179
|
float operator()(idx_t i) override {
|
|
180
180
|
#ifdef __SSE3__
|
|
181
181
|
const uint8_t* code = storage.codes.data() + i * storage.code_size;
|
|
182
|
-
|
|
182
|
+
idx_t key = 0;
|
|
183
183
|
memcpy(&key, code, storage.code_size_1);
|
|
184
184
|
code += storage.code_size_1;
|
|
185
185
|
|
|
@@ -225,7 +225,7 @@ struct Distance2xXPQ4 : Distance2Level {
|
|
|
225
225
|
|
|
226
226
|
float operator()(idx_t i) override {
|
|
227
227
|
const uint8_t* code = storage.codes.data() + i * storage.code_size;
|
|
228
|
-
|
|
228
|
+
int64_t key01 = 0;
|
|
229
229
|
memcpy(&key01, code, storage.code_size_1);
|
|
230
230
|
code += storage.code_size_1;
|
|
231
231
|
#ifdef __SSE3__
|
|
@@ -237,7 +237,7 @@ struct Distance2xXPQ4 : Distance2Level {
|
|
|
237
237
|
__m128 accu = _mm_setzero_ps();
|
|
238
238
|
|
|
239
239
|
for (int mi_m = 0; mi_m < 2; mi_m++) {
|
|
240
|
-
|
|
240
|
+
int64_t l1_idx = key01 & (((int64_t)1 << mi_nbits) - 1);
|
|
241
241
|
const __m128* pq_l1 = pq_l1_t + M_2 * l1_idx;
|
|
242
242
|
|
|
243
243
|
for (int m = 0; m < M_2; m++) {
|