faiss 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/LICENSE.txt +1 -1
- data/README.md +1 -1
- data/ext/faiss/extconf.rb +9 -2
- data/ext/faiss/index.cpp +1 -1
- data/ext/faiss/index_binary.cpp +2 -2
- data/ext/faiss/product_quantizer.cpp +1 -1
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +7 -7
- data/vendor/faiss/faiss/AutoTune.h +1 -2
- data/vendor/faiss/faiss/Clustering.cpp +39 -22
- data/vendor/faiss/faiss/Clustering.h +40 -21
- data/vendor/faiss/faiss/IVFlib.cpp +26 -12
- data/vendor/faiss/faiss/Index.cpp +1 -1
- data/vendor/faiss/faiss/Index.h +40 -10
- data/vendor/faiss/faiss/Index2Layer.cpp +7 -7
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +176 -166
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +15 -15
- data/vendor/faiss/faiss/IndexBinary.cpp +9 -4
- data/vendor/faiss/faiss/IndexBinary.h +8 -19
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +2 -1
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +24 -31
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +25 -50
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +107 -188
- data/vendor/faiss/faiss/IndexFastScan.cpp +95 -146
- data/vendor/faiss/faiss/IndexFastScan.h +9 -8
- data/vendor/faiss/faiss/IndexFlat.cpp +206 -10
- data/vendor/faiss/faiss/IndexFlat.h +20 -1
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +170 -5
- data/vendor/faiss/faiss/IndexFlatCodes.h +23 -4
- data/vendor/faiss/faiss/IndexHNSW.cpp +231 -382
- data/vendor/faiss/faiss/IndexHNSW.h +62 -49
- data/vendor/faiss/faiss/IndexIDMap.cpp +69 -28
- data/vendor/faiss/faiss/IndexIDMap.h +24 -2
- data/vendor/faiss/faiss/IndexIVF.cpp +162 -56
- data/vendor/faiss/faiss/IndexIVF.h +46 -6
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +33 -26
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +6 -2
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +19 -46
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -3
- data/vendor/faiss/faiss/IndexIVFFastScan.cpp +502 -401
- data/vendor/faiss/faiss/IndexIVFFastScan.h +63 -26
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +15 -5
- data/vendor/faiss/faiss/IndexIVFFlat.h +3 -2
- data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +172 -0
- data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.h +56 -0
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +79 -125
- data/vendor/faiss/faiss/IndexIVFPQ.h +6 -7
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +39 -52
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +4 -3
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +45 -29
- data/vendor/faiss/faiss/IndexIVFPQR.h +5 -2
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +25 -27
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +6 -6
- data/vendor/faiss/faiss/IndexLSH.cpp +14 -16
- data/vendor/faiss/faiss/IndexLattice.cpp +1 -19
- data/vendor/faiss/faiss/IndexLattice.h +3 -22
- data/vendor/faiss/faiss/IndexNNDescent.cpp +3 -33
- data/vendor/faiss/faiss/IndexNNDescent.h +1 -1
- data/vendor/faiss/faiss/IndexNSG.cpp +11 -27
- data/vendor/faiss/faiss/IndexNSG.h +11 -11
- data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +56 -0
- data/vendor/faiss/faiss/IndexNeuralNetCodec.h +49 -0
- data/vendor/faiss/faiss/IndexPQ.cpp +72 -88
- data/vendor/faiss/faiss/IndexPQ.h +1 -4
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +1 -1
- data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -31
- data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
- data/vendor/faiss/faiss/IndexRefine.cpp +54 -24
- data/vendor/faiss/faiss/IndexRefine.h +7 -0
- data/vendor/faiss/faiss/IndexReplicas.cpp +23 -26
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +25 -17
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +6 -4
- data/vendor/faiss/faiss/IndexShards.cpp +21 -29
- data/vendor/faiss/faiss/IndexShardsIVF.cpp +1 -2
- data/vendor/faiss/faiss/MatrixStats.cpp +17 -32
- data/vendor/faiss/faiss/MatrixStats.h +21 -9
- data/vendor/faiss/faiss/MetaIndexes.cpp +35 -35
- data/vendor/faiss/faiss/MetricType.h +7 -2
- data/vendor/faiss/faiss/VectorTransform.cpp +13 -26
- data/vendor/faiss/faiss/VectorTransform.h +7 -7
- data/vendor/faiss/faiss/clone_index.cpp +15 -10
- data/vendor/faiss/faiss/clone_index.h +3 -0
- data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +95 -17
- data/vendor/faiss/faiss/cppcontrib/factory_tools.cpp +152 -0
- data/vendor/faiss/faiss/cppcontrib/factory_tools.h +24 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +83 -30
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +123 -8
- data/vendor/faiss/faiss/gpu/GpuCloner.h +22 -0
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +13 -0
- data/vendor/faiss/faiss/gpu/GpuDistance.h +46 -38
- data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -1
- data/vendor/faiss/faiss/gpu/GpuIndex.h +30 -12
- data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +282 -0
- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +4 -4
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +14 -9
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +20 -3
- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -11
- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +1 -3
- data/vendor/faiss/faiss/gpu/GpuResources.cpp +24 -3
- data/vendor/faiss/faiss/gpu/GpuResources.h +39 -11
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +142 -17
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +57 -3
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +26 -21
- data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +7 -1
- data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +8 -5
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +25 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +129 -9
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +332 -40
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +299 -208
- data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +1 -0
- data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +1 -1
- data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +6 -0
- data/vendor/faiss/faiss/gpu/utils/RaftUtils.h +75 -0
- data/vendor/faiss/faiss/gpu/utils/Timer.cpp +4 -1
- data/vendor/faiss/faiss/gpu/utils/Timer.h +1 -1
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +3 -1
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +5 -5
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +26 -1
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +10 -3
- data/vendor/faiss/faiss/impl/DistanceComputer.h +70 -1
- data/vendor/faiss/faiss/impl/FaissAssert.h +4 -2
- data/vendor/faiss/faiss/impl/FaissException.h +13 -34
- data/vendor/faiss/faiss/impl/HNSW.cpp +605 -186
- data/vendor/faiss/faiss/impl/HNSW.h +52 -30
- data/vendor/faiss/faiss/impl/IDSelector.h +4 -4
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +11 -9
- data/vendor/faiss/faiss/impl/LookupTableScaler.h +34 -0
- data/vendor/faiss/faiss/impl/NNDescent.cpp +42 -27
- data/vendor/faiss/faiss/impl/NSG.cpp +0 -29
- data/vendor/faiss/faiss/impl/NSG.h +1 -1
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +14 -12
- data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +1 -1
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +25 -22
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +6 -2
- data/vendor/faiss/faiss/impl/Quantizer.h +1 -1
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +27 -1015
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +5 -63
- data/vendor/faiss/faiss/impl/ResultHandler.h +347 -172
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +1104 -147
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +3 -8
- data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +285 -42
- data/vendor/faiss/faiss/impl/code_distance/code_distance-avx512.h +248 -0
- data/vendor/faiss/faiss/impl/code_distance/code_distance-generic.h +21 -14
- data/vendor/faiss/faiss/impl/code_distance/code_distance.h +22 -12
- data/vendor/faiss/faiss/impl/index_read.cpp +74 -34
- data/vendor/faiss/faiss/impl/index_read_utils.h +37 -0
- data/vendor/faiss/faiss/impl/index_write.cpp +88 -51
- data/vendor/faiss/faiss/impl/io.cpp +23 -15
- data/vendor/faiss/faiss/impl/io.h +4 -4
- data/vendor/faiss/faiss/impl/io_macros.h +6 -0
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -1
- data/vendor/faiss/faiss/impl/platform_macros.h +40 -1
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +14 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +7 -6
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +52 -38
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +487 -49
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +960 -0
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +176 -0
- data/vendor/faiss/faiss/impl/simd_result_handlers.h +481 -225
- data/vendor/faiss/faiss/index_factory.cpp +41 -20
- data/vendor/faiss/faiss/index_io.h +12 -5
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +28 -8
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +3 -0
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +10 -2
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +73 -17
- data/vendor/faiss/faiss/invlists/InvertedLists.h +26 -8
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +24 -9
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +2 -1
- data/vendor/faiss/faiss/python/python_callbacks.cpp +4 -4
- data/vendor/faiss/faiss/utils/Heap.cpp +3 -1
- data/vendor/faiss/faiss/utils/Heap.h +105 -0
- data/vendor/faiss/faiss/utils/NeuralNet.cpp +342 -0
- data/vendor/faiss/faiss/utils/NeuralNet.h +147 -0
- data/vendor/faiss/faiss/utils/WorkerThread.h +1 -0
- data/vendor/faiss/faiss/utils/bf16.h +36 -0
- data/vendor/faiss/faiss/utils/distances.cpp +147 -123
- data/vendor/faiss/faiss/utils/distances.h +86 -9
- data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +5 -5
- data/vendor/faiss/faiss/utils/distances_fused/avx512.h +2 -2
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +2 -2
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +1 -1
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +5 -5
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.h +1 -1
- data/vendor/faiss/faiss/utils/distances_simd.cpp +1589 -243
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +70 -0
- data/vendor/faiss/faiss/utils/extra_distances.cpp +85 -137
- data/vendor/faiss/faiss/utils/extra_distances.h +3 -2
- data/vendor/faiss/faiss/utils/fp16-arm.h +29 -0
- data/vendor/faiss/faiss/utils/fp16.h +2 -0
- data/vendor/faiss/faiss/utils/hamming.cpp +163 -111
- data/vendor/faiss/faiss/utils/hamming.h +58 -0
- data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +16 -89
- data/vendor/faiss/faiss/utils/hamming_distance/common.h +1 -0
- data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +19 -88
- data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +58 -0
- data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +14 -104
- data/vendor/faiss/faiss/utils/partitioning.cpp +3 -4
- data/vendor/faiss/faiss/utils/prefetch.h +77 -0
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +0 -14
- data/vendor/faiss/faiss/utils/random.cpp +43 -0
- data/vendor/faiss/faiss/utils/random.h +25 -0
- data/vendor/faiss/faiss/utils/simdlib.h +10 -1
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +0 -6
- data/vendor/faiss/faiss/utils/simdlib_avx512.h +296 -0
- data/vendor/faiss/faiss/utils/simdlib_neon.h +77 -79
- data/vendor/faiss/faiss/utils/simdlib_ppc64.h +1084 -0
- data/vendor/faiss/faiss/utils/sorting.cpp +140 -5
- data/vendor/faiss/faiss/utils/sorting.h +27 -0
- data/vendor/faiss/faiss/utils/transpose/transpose-avx512-inl.h +176 -0
- data/vendor/faiss/faiss/utils/utils.cpp +120 -7
- data/vendor/faiss/faiss/utils/utils.h +60 -20
- metadata +23 -4
- data/vendor/faiss/faiss/impl/code_distance/code_distance_avx512.h +0 -102
|
@@ -9,9 +9,10 @@
|
|
|
9
9
|
|
|
10
10
|
#include <faiss/MatrixStats.h>
|
|
11
11
|
|
|
12
|
-
#include <
|
|
12
|
+
#include <cstdarg> /* va_list, va_start, va_arg, va_end */
|
|
13
13
|
|
|
14
14
|
#include <faiss/utils/utils.h>
|
|
15
|
+
#include <cinttypes>
|
|
15
16
|
#include <cmath>
|
|
16
17
|
#include <cstdio>
|
|
17
18
|
|
|
@@ -21,18 +22,6 @@ namespace faiss {
|
|
|
21
22
|
* MatrixStats
|
|
22
23
|
*********************************************************************/
|
|
23
24
|
|
|
24
|
-
MatrixStats::PerDimStats::PerDimStats()
|
|
25
|
-
: n(0),
|
|
26
|
-
n_nan(0),
|
|
27
|
-
n_inf(0),
|
|
28
|
-
n0(0),
|
|
29
|
-
min(HUGE_VALF),
|
|
30
|
-
max(-HUGE_VALF),
|
|
31
|
-
sum(0),
|
|
32
|
-
sum2(0),
|
|
33
|
-
mean(NAN),
|
|
34
|
-
stddev(NAN) {}
|
|
35
|
-
|
|
36
25
|
void MatrixStats::PerDimStats::add(float x) {
|
|
37
26
|
n++;
|
|
38
27
|
if (std::isnan(x)) {
|
|
@@ -74,19 +63,12 @@ void MatrixStats::do_comment(const char* fmt, ...) {
|
|
|
74
63
|
buf += size;
|
|
75
64
|
}
|
|
76
65
|
|
|
77
|
-
MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
|
|
78
|
-
: n(n),
|
|
79
|
-
d(d),
|
|
80
|
-
n_collision(0),
|
|
81
|
-
n_valid(0),
|
|
82
|
-
n0(0),
|
|
83
|
-
min_norm2(HUGE_VAL),
|
|
84
|
-
max_norm2(0) {
|
|
66
|
+
MatrixStats::MatrixStats(size_t n, size_t d, const float* x) : n(n), d(d) {
|
|
85
67
|
std::vector<char> comment_buf(10000);
|
|
86
68
|
buf = comment_buf.data();
|
|
87
69
|
nbuf = comment_buf.size();
|
|
88
70
|
|
|
89
|
-
do_comment("analyzing %
|
|
71
|
+
do_comment("analyzing %zd vectors of size %zd\n", n, d);
|
|
90
72
|
|
|
91
73
|
if (d > 1024) {
|
|
92
74
|
do_comment(
|
|
@@ -94,6 +76,9 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
|
|
|
94
76
|
"please consider dimensionality reducution (with PCAMatrix)\n");
|
|
95
77
|
}
|
|
96
78
|
|
|
79
|
+
hash_value = hash_bytes((const uint8_t*)x, n * d * sizeof(*x));
|
|
80
|
+
do_comment("hash value 0x%016" PRIx64 "\n", hash_value);
|
|
81
|
+
|
|
97
82
|
size_t nbytes = sizeof(x[0]) * d;
|
|
98
83
|
per_dim_stats.resize(d);
|
|
99
84
|
|
|
@@ -156,7 +141,7 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
|
|
|
156
141
|
|
|
157
142
|
if (n_collision > 0) {
|
|
158
143
|
do_comment(
|
|
159
|
-
"%
|
|
144
|
+
"%zd collisions in hash table, "
|
|
160
145
|
"counts may be invalid\n",
|
|
161
146
|
n_collision);
|
|
162
147
|
}
|
|
@@ -167,14 +152,14 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
|
|
|
167
152
|
max = it->second;
|
|
168
153
|
}
|
|
169
154
|
}
|
|
170
|
-
do_comment("vector %
|
|
155
|
+
do_comment("vector %zd has %zd copies\n", max.first, max.count);
|
|
171
156
|
}
|
|
172
157
|
|
|
173
158
|
{ // norm stats
|
|
174
159
|
min_norm2 = sqrt(min_norm2);
|
|
175
160
|
max_norm2 = sqrt(max_norm2);
|
|
176
161
|
do_comment(
|
|
177
|
-
"range of L2 norms=[%g, %g] (%
|
|
162
|
+
"range of L2 norms=[%g, %g] (%zd null vectors)\n",
|
|
178
163
|
min_norm2,
|
|
179
164
|
max_norm2,
|
|
180
165
|
n0);
|
|
@@ -182,7 +167,7 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
|
|
|
182
167
|
if (max_norm2 < min_norm2 * 1.0001) {
|
|
183
168
|
do_comment(
|
|
184
169
|
"vectors are normalized, inner product and "
|
|
185
|
-
"L2
|
|
170
|
+
"L2 search are equivalent\n");
|
|
186
171
|
}
|
|
187
172
|
|
|
188
173
|
if (max_norm2 > min_norm2 * 100) {
|
|
@@ -196,12 +181,12 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
|
|
|
196
181
|
|
|
197
182
|
double max_std = 0, min_std = HUGE_VAL;
|
|
198
183
|
|
|
199
|
-
size_t n_dangerous_range = 0, n_0_range = 0,
|
|
184
|
+
size_t n_dangerous_range = 0, n_0_range = 0, n0_2 = 0;
|
|
200
185
|
|
|
201
186
|
for (size_t j = 0; j < d; j++) {
|
|
202
187
|
PerDimStats& st = per_dim_stats[j];
|
|
203
188
|
st.compute_mean_std();
|
|
204
|
-
|
|
189
|
+
n0_2 += st.n0;
|
|
205
190
|
|
|
206
191
|
if (st.max == st.min) {
|
|
207
192
|
n_0_range++;
|
|
@@ -215,19 +200,19 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
|
|
|
215
200
|
min_std = st.stddev;
|
|
216
201
|
}
|
|
217
202
|
|
|
218
|
-
if (
|
|
203
|
+
if (n0_2 == 0) {
|
|
219
204
|
do_comment("matrix contains no 0s\n");
|
|
220
205
|
} else {
|
|
221
206
|
do_comment(
|
|
222
207
|
"matrix contains %.2f %% 0 entries\n",
|
|
223
|
-
|
|
208
|
+
n0_2 * 100.0 / (n * d));
|
|
224
209
|
}
|
|
225
210
|
|
|
226
211
|
if (n_0_range == 0) {
|
|
227
212
|
do_comment("no constant dimensions\n");
|
|
228
213
|
} else {
|
|
229
214
|
do_comment(
|
|
230
|
-
"%
|
|
215
|
+
"%zd dimensions are constant: they can be removed\n",
|
|
231
216
|
n_0_range);
|
|
232
217
|
}
|
|
233
218
|
|
|
@@ -235,7 +220,7 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
|
|
|
235
220
|
do_comment("no dimension has a too large mean\n");
|
|
236
221
|
} else {
|
|
237
222
|
do_comment(
|
|
238
|
-
"%
|
|
223
|
+
"%zd dimensions are too large "
|
|
239
224
|
"wrt. their variance, may loose precision "
|
|
240
225
|
"in IndexFlatL2 (use CenteringTransform)\n",
|
|
241
226
|
n_dangerous_range);
|
|
@@ -10,6 +10,7 @@
|
|
|
10
10
|
#pragma once
|
|
11
11
|
|
|
12
12
|
#include <stdint.h>
|
|
13
|
+
#include <cmath>
|
|
13
14
|
#include <string>
|
|
14
15
|
#include <unordered_map>
|
|
15
16
|
#include <vector>
|
|
@@ -26,20 +27,31 @@ struct MatrixStats {
|
|
|
26
27
|
std::string comments;
|
|
27
28
|
|
|
28
29
|
// raw statistics
|
|
29
|
-
size_t n, d;
|
|
30
|
-
size_t n_collision
|
|
31
|
-
|
|
30
|
+
size_t n = 0, d = 0;
|
|
31
|
+
size_t n_collision = 0;
|
|
32
|
+
size_t n_valid = 0;
|
|
33
|
+
size_t n0 = 0;
|
|
34
|
+
double min_norm2 = HUGE_VALF;
|
|
35
|
+
double max_norm2 = 0;
|
|
36
|
+
uint64_t hash_value = 0;
|
|
32
37
|
|
|
33
38
|
struct PerDimStats {
|
|
34
|
-
|
|
39
|
+
/// counts of various special entries
|
|
40
|
+
size_t n = 0;
|
|
41
|
+
size_t n_nan = 0;
|
|
42
|
+
size_t n_inf = 0;
|
|
43
|
+
size_t n0 = 0;
|
|
35
44
|
|
|
36
|
-
|
|
37
|
-
|
|
45
|
+
/// to get min/max and stddev values
|
|
46
|
+
float min = HUGE_VALF;
|
|
47
|
+
float max = -HUGE_VALF;
|
|
48
|
+
double sum = 0;
|
|
49
|
+
double sum2 = 0;
|
|
38
50
|
|
|
39
|
-
size_t n_valid;
|
|
40
|
-
double mean
|
|
51
|
+
size_t n_valid = 0;
|
|
52
|
+
double mean = NAN;
|
|
53
|
+
double stddev = NAN;
|
|
41
54
|
|
|
42
|
-
PerDimStats();
|
|
43
55
|
void add(float x);
|
|
44
56
|
void compute_mean_std();
|
|
45
57
|
};
|
|
@@ -9,8 +9,8 @@
|
|
|
9
9
|
|
|
10
10
|
#include <faiss/MetaIndexes.h>
|
|
11
11
|
|
|
12
|
-
#include <stdint.h>
|
|
13
12
|
#include <cinttypes>
|
|
13
|
+
#include <cstdint>
|
|
14
14
|
#include <cstdio>
|
|
15
15
|
#include <limits>
|
|
16
16
|
|
|
@@ -70,37 +70,37 @@ void IndexSplitVectors::search(
|
|
|
70
70
|
sum_d == d, "not enough indexes compared to # dimensions");
|
|
71
71
|
|
|
72
72
|
int64_t nshard = sub_indexes.size();
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
73
|
+
|
|
74
|
+
std::unique_ptr<float[]> all_distances(new float[nshard * k * n]);
|
|
75
|
+
std::unique_ptr<idx_t[]> all_labels(new idx_t[nshard * k * n]);
|
|
76
|
+
|
|
77
|
+
auto query_func =
|
|
78
|
+
[n, x, k, distances, labels, &all_distances, &all_labels, this](
|
|
79
|
+
int no) {
|
|
80
|
+
const IndexSplitVectors* index = this;
|
|
81
|
+
float* distances1 =
|
|
82
|
+
no == 0 ? distances : all_distances.get() + no * k * n;
|
|
83
|
+
idx_t* labels1 =
|
|
84
|
+
no == 0 ? labels : all_labels.get() + no * k * n;
|
|
85
|
+
if (index->verbose)
|
|
86
|
+
printf("begin query shard %d on %" PRId64 " points\n",
|
|
87
|
+
no,
|
|
88
|
+
n);
|
|
89
|
+
const Index* sub_index = index->sub_indexes[no];
|
|
90
|
+
int64_t sub_d = sub_index->d, d = index->d;
|
|
91
|
+
idx_t ofs = 0;
|
|
92
|
+
for (int i = 0; i < no; i++)
|
|
93
|
+
ofs += index->sub_indexes[i]->d;
|
|
94
|
+
|
|
95
|
+
std::unique_ptr<float[]> sub_x(new float[sub_d * n]);
|
|
96
|
+
for (idx_t i = 0; i < n; i++)
|
|
97
|
+
memcpy(sub_x.get() + i * sub_d,
|
|
98
|
+
x + ofs + i * d,
|
|
99
|
+
sub_d * sizeof(float));
|
|
100
|
+
sub_index->search(n, sub_x.get(), k, distances1, labels1);
|
|
101
|
+
if (index->verbose)
|
|
102
|
+
printf("end query shard %d\n", no);
|
|
103
|
+
};
|
|
104
104
|
|
|
105
105
|
if (!threaded) {
|
|
106
106
|
for (int i = 0; i < nshard; i++) {
|
|
@@ -125,8 +125,8 @@ void IndexSplitVectors::search(
|
|
|
125
125
|
int64_t factor = 1;
|
|
126
126
|
for (int i = 0; i < nshard; i++) {
|
|
127
127
|
if (i > 0) { // results of 0 are already in the table
|
|
128
|
-
const float* distances_i = all_distances + i * k * n;
|
|
129
|
-
const idx_t* labels_i = all_labels + i * k * n;
|
|
128
|
+
const float* distances_i = all_distances.get() + i * k * n;
|
|
129
|
+
const idx_t* labels_i = all_labels.get() + i * k * n;
|
|
130
130
|
for (int64_t j = 0; j < n; j++) {
|
|
131
131
|
if (labels[j] >= 0 && labels_i[j] >= 0) {
|
|
132
132
|
labels[j] += labels_i[j] * factor;
|
|
@@ -238,6 +238,6 @@ void IndexRandom::reset() {
|
|
|
238
238
|
ntotal = 0;
|
|
239
239
|
}
|
|
240
240
|
|
|
241
|
-
IndexRandom::~IndexRandom()
|
|
241
|
+
IndexRandom::~IndexRandom() = default;
|
|
242
242
|
|
|
243
243
|
} // namespace faiss
|
|
@@ -31,8 +31,13 @@ enum MetricType {
|
|
|
31
31
|
METRIC_Canberra = 20,
|
|
32
32
|
METRIC_BrayCurtis,
|
|
33
33
|
METRIC_JensenShannon,
|
|
34
|
-
|
|
35
|
-
|
|
34
|
+
|
|
35
|
+
/// sum_i(min(a_i, b_i)) / sum_i(max(a_i, b_i)) where a_i, b_i > 0
|
|
36
|
+
METRIC_Jaccard,
|
|
37
|
+
/// Squared Eucliden distance, ignoring NaNs
|
|
38
|
+
METRIC_NaNEuclidean,
|
|
39
|
+
/// abs(x | y): the distance to a hyperplane
|
|
40
|
+
METRIC_ABS_INNER_PRODUCT,
|
|
36
41
|
};
|
|
37
42
|
|
|
38
43
|
/// all vector indices are this type
|
|
@@ -441,13 +441,10 @@ void eig(size_t d_in, double* cov, double* eigenvalues, int verbose) {
|
|
|
441
441
|
|
|
442
442
|
} // namespace
|
|
443
443
|
|
|
444
|
-
void PCAMatrix::train(idx_t n, const float*
|
|
445
|
-
const float*
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
d_in, (size_t*)&n, max_points_per_d * d_in, x, verbose);
|
|
449
|
-
|
|
450
|
-
ScopeDeleter<float> del_x(x != x_in ? x : nullptr);
|
|
444
|
+
void PCAMatrix::train(idx_t n, const float* x_in) {
|
|
445
|
+
const float* x = fvecs_maybe_subsample(
|
|
446
|
+
d_in, (size_t*)&n, max_points_per_d * d_in, x_in, verbose);
|
|
447
|
+
TransformedVectors tv(x_in, x);
|
|
451
448
|
|
|
452
449
|
// compute mean
|
|
453
450
|
mean.clear();
|
|
@@ -884,14 +881,13 @@ ITQTransform::ITQTransform(int d_in, int d_out, bool do_pca)
|
|
|
884
881
|
is_trained = false;
|
|
885
882
|
}
|
|
886
883
|
|
|
887
|
-
void ITQTransform::train(idx_t n, const float*
|
|
884
|
+
void ITQTransform::train(idx_t n, const float* x_in) {
|
|
888
885
|
FAISS_THROW_IF_NOT(!is_trained);
|
|
889
886
|
|
|
890
|
-
const float* x_in = x;
|
|
891
887
|
size_t max_train_points = std::max(d_in * max_train_per_dim, 32768);
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
888
|
+
const float* x =
|
|
889
|
+
fvecs_maybe_subsample(d_in, (size_t*)&n, max_train_points, x_in);
|
|
890
|
+
TransformedVectors tv(x_in, x);
|
|
895
891
|
|
|
896
892
|
std::unique_ptr<float[]> x_norm(new float[n * d_in]);
|
|
897
893
|
{ // normalize
|
|
@@ -988,25 +984,16 @@ void ITQTransform::check_identical(const VectorTransform& other_in) const {
|
|
|
988
984
|
*********************************************/
|
|
989
985
|
|
|
990
986
|
OPQMatrix::OPQMatrix(int d, int M, int d2)
|
|
991
|
-
: LinearTransform(d, d2 == -1 ? d : d2, false),
|
|
992
|
-
M(M),
|
|
993
|
-
niter(50),
|
|
994
|
-
niter_pq(4),
|
|
995
|
-
niter_pq_0(40),
|
|
996
|
-
verbose(false),
|
|
997
|
-
pq(nullptr) {
|
|
987
|
+
: LinearTransform(d, d2 == -1 ? d : d2, false), M(M) {
|
|
998
988
|
is_trained = false;
|
|
999
989
|
// OPQ is quite expensive to train, so set this right.
|
|
1000
990
|
max_train_points = 256 * 256;
|
|
1001
|
-
pq = nullptr;
|
|
1002
991
|
}
|
|
1003
992
|
|
|
1004
|
-
void OPQMatrix::train(idx_t n, const float*
|
|
1005
|
-
const float*
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
ScopeDeleter<float> del_x(x != x_in ? x : nullptr);
|
|
993
|
+
void OPQMatrix::train(idx_t n, const float* x_in) {
|
|
994
|
+
const float* x = fvecs_maybe_subsample(
|
|
995
|
+
d_in, (size_t*)&n, max_train_points, x_in, verbose);
|
|
996
|
+
TransformedVectors tv(x_in, x);
|
|
1010
997
|
|
|
1011
998
|
// To support d_out > d_in, we pad input vectors with 0s to d_out
|
|
1012
999
|
size_t d = d_out <= d_in ? d_in : d_out;
|
|
@@ -230,18 +230,18 @@ struct ProductQuantizer;
|
|
|
230
230
|
*
|
|
231
231
|
*/
|
|
232
232
|
struct OPQMatrix : LinearTransform {
|
|
233
|
-
int M;
|
|
234
|
-
int niter; ///< Number of outer training iterations
|
|
235
|
-
int niter_pq;
|
|
236
|
-
int niter_pq_0; ///< same, for the first outer iteration
|
|
233
|
+
int M; ///< nb of subquantizers
|
|
234
|
+
int niter = 50; ///< Number of outer training iterations
|
|
235
|
+
int niter_pq = 4; ///< Number of training iterations for the PQ
|
|
236
|
+
int niter_pq_0 = 40; ///< same, for the first outer iteration
|
|
237
237
|
|
|
238
238
|
/// if there are too many training points, resample
|
|
239
|
-
size_t max_train_points;
|
|
240
|
-
bool verbose;
|
|
239
|
+
size_t max_train_points = 256 * 256;
|
|
240
|
+
bool verbose = false;
|
|
241
241
|
|
|
242
242
|
/// if non-NULL, use this product quantizer for training
|
|
243
243
|
/// should be constructed with (d_out, M, _)
|
|
244
|
-
ProductQuantizer* pq;
|
|
244
|
+
ProductQuantizer* pq = nullptr;
|
|
245
245
|
|
|
246
246
|
/// if d2 != -1, output vectors of this dimension
|
|
247
247
|
explicit OPQMatrix(int d = 0, int M = 1, int d2 = -1);
|
|
@@ -17,6 +17,8 @@
|
|
|
17
17
|
#include <faiss/Index2Layer.h>
|
|
18
18
|
#include <faiss/IndexAdditiveQuantizer.h>
|
|
19
19
|
#include <faiss/IndexAdditiveQuantizerFastScan.h>
|
|
20
|
+
#include <faiss/IndexBinary.h>
|
|
21
|
+
#include <faiss/IndexBinaryFlat.h>
|
|
20
22
|
#include <faiss/IndexFlat.h>
|
|
21
23
|
#include <faiss/IndexHNSW.h>
|
|
22
24
|
#include <faiss/IndexIVF.h>
|
|
@@ -35,6 +37,7 @@
|
|
|
35
37
|
#include <faiss/IndexRefine.h>
|
|
36
38
|
#include <faiss/IndexRowwiseMinMax.h>
|
|
37
39
|
#include <faiss/IndexScalarQuantizer.h>
|
|
40
|
+
|
|
38
41
|
#include <faiss/MetaIndexes.h>
|
|
39
42
|
#include <faiss/VectorTransform.h>
|
|
40
43
|
|
|
@@ -60,9 +63,10 @@ Index* clone_index(const Index* index) {
|
|
|
60
63
|
// assumes there is a copy constructor ready. Always try from most
|
|
61
64
|
// specific to most general. Most indexes don't have complicated
|
|
62
65
|
// structs, the default copy constructor often just works.
|
|
63
|
-
#define TRYCLONE(classname, obj)
|
|
64
|
-
if (const classname* clo =
|
|
65
|
-
|
|
66
|
+
#define TRYCLONE(classname, obj) \
|
|
67
|
+
if (const classname* clo##classname = \
|
|
68
|
+
dynamic_cast<const classname*>(obj)) { \
|
|
69
|
+
return new classname(*clo##classname); \
|
|
66
70
|
} else
|
|
67
71
|
|
|
68
72
|
VectorTransform* Cloner::clone_VectorTransform(const VectorTransform* vt) {
|
|
@@ -234,13 +238,6 @@ Index* clone_AdditiveQuantizerIndex(const Index* index) {
|
|
|
234
238
|
|
|
235
239
|
namespace {
|
|
236
240
|
|
|
237
|
-
IndexHNSW* clone_HNSW(const IndexHNSW* ihnsw) {
|
|
238
|
-
TRYCLONE(IndexHNSWFlat, ihnsw)
|
|
239
|
-
TRYCLONE(IndexHNSWPQ, ihnsw)
|
|
240
|
-
TRYCLONE(IndexHNSWSQ, ihnsw)
|
|
241
|
-
return new IndexHNSW(*ihnsw);
|
|
242
|
-
}
|
|
243
|
-
|
|
244
241
|
InvertedLists* clone_InvertedLists(const InvertedLists* invlists) {
|
|
245
242
|
if (auto* ails = dynamic_cast<const ArrayInvertedLists*>(invlists)) {
|
|
246
243
|
return new ArrayInvertedLists(*ails);
|
|
@@ -385,4 +382,12 @@ Quantizer* clone_Quantizer(const Quantizer* quant) {
|
|
|
385
382
|
FAISS_THROW_MSG("Did not recognize quantizer to clone");
|
|
386
383
|
}
|
|
387
384
|
|
|
385
|
+
IndexBinary* clone_binary_index(const IndexBinary* index) {
|
|
386
|
+
if (auto ii = dynamic_cast<const IndexBinaryFlat*>(index)) {
|
|
387
|
+
return new IndexBinaryFlat(*ii);
|
|
388
|
+
} else {
|
|
389
|
+
FAISS_THROW_MSG("cannot clone this type of index");
|
|
390
|
+
}
|
|
391
|
+
}
|
|
392
|
+
|
|
388
393
|
} // namespace faiss
|
|
@@ -17,6 +17,7 @@ struct Index;
|
|
|
17
17
|
struct IndexIVF;
|
|
18
18
|
struct VectorTransform;
|
|
19
19
|
struct Quantizer;
|
|
20
|
+
struct IndexBinary;
|
|
20
21
|
|
|
21
22
|
/* cloning functions */
|
|
22
23
|
Index* clone_index(const Index*);
|
|
@@ -33,4 +34,6 @@ struct Cloner {
|
|
|
33
34
|
|
|
34
35
|
Quantizer* clone_Quantizer(const Quantizer* quant);
|
|
35
36
|
|
|
37
|
+
IndexBinary* clone_binary_index(const IndexBinary* index);
|
|
38
|
+
|
|
36
39
|
} // namespace faiss
|