faiss 0.2.3 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/LICENSE.txt +1 -1
- data/README.md +23 -21
- data/ext/faiss/extconf.rb +11 -0
- data/ext/faiss/index.cpp +4 -4
- data/ext/faiss/index_binary.cpp +6 -6
- data/ext/faiss/product_quantizer.cpp +4 -4
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +13 -0
- data/vendor/faiss/faiss/Clustering.cpp +32 -0
- data/vendor/faiss/faiss/Clustering.h +14 -0
- data/vendor/faiss/faiss/IVFlib.cpp +101 -2
- data/vendor/faiss/faiss/IVFlib.h +26 -2
- data/vendor/faiss/faiss/Index.cpp +36 -3
- data/vendor/faiss/faiss/Index.h +43 -6
- data/vendor/faiss/faiss/Index2Layer.cpp +24 -93
- data/vendor/faiss/faiss/Index2Layer.h +8 -17
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +610 -0
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +253 -0
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +299 -0
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +199 -0
- data/vendor/faiss/faiss/IndexBinary.cpp +20 -4
- data/vendor/faiss/faiss/IndexBinary.h +18 -3
- data/vendor/faiss/faiss/IndexBinaryFlat.cpp +9 -2
- data/vendor/faiss/faiss/IndexBinaryFlat.h +4 -2
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -1
- data/vendor/faiss/faiss/IndexBinaryFromFloat.h +2 -1
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +5 -1
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +2 -1
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +17 -4
- data/vendor/faiss/faiss/IndexBinaryHash.h +8 -4
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +28 -13
- data/vendor/faiss/faiss/IndexBinaryIVF.h +10 -7
- data/vendor/faiss/faiss/IndexFastScan.cpp +626 -0
- data/vendor/faiss/faiss/IndexFastScan.h +145 -0
- data/vendor/faiss/faiss/IndexFlat.cpp +52 -69
- data/vendor/faiss/faiss/IndexFlat.h +16 -19
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +101 -0
- data/vendor/faiss/faiss/IndexFlatCodes.h +59 -0
- data/vendor/faiss/faiss/IndexHNSW.cpp +66 -138
- data/vendor/faiss/faiss/IndexHNSW.h +4 -2
- data/vendor/faiss/faiss/IndexIDMap.cpp +247 -0
- data/vendor/faiss/faiss/IndexIDMap.h +107 -0
- data/vendor/faiss/faiss/IndexIVF.cpp +200 -40
- data/vendor/faiss/faiss/IndexIVF.h +59 -22
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +393 -0
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +183 -0
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +590 -0
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +171 -0
- data/vendor/faiss/faiss/IndexIVFFastScan.cpp +1290 -0
- data/vendor/faiss/faiss/IndexIVFFastScan.h +213 -0
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +43 -26
- data/vendor/faiss/faiss/IndexIVFFlat.h +4 -2
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +238 -53
- data/vendor/faiss/faiss/IndexIVFPQ.h +6 -2
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +23 -852
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +7 -112
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +3 -3
- data/vendor/faiss/faiss/IndexIVFPQR.h +1 -1
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +63 -40
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +23 -7
- data/vendor/faiss/faiss/IndexLSH.cpp +8 -32
- data/vendor/faiss/faiss/IndexLSH.h +4 -16
- data/vendor/faiss/faiss/IndexLattice.cpp +7 -1
- data/vendor/faiss/faiss/IndexLattice.h +3 -1
- data/vendor/faiss/faiss/IndexNNDescent.cpp +4 -5
- data/vendor/faiss/faiss/IndexNNDescent.h +2 -1
- data/vendor/faiss/faiss/IndexNSG.cpp +37 -5
- data/vendor/faiss/faiss/IndexNSG.h +25 -1
- data/vendor/faiss/faiss/IndexPQ.cpp +108 -120
- data/vendor/faiss/faiss/IndexPQ.h +21 -22
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +15 -450
- data/vendor/faiss/faiss/IndexPQFastScan.h +15 -78
- data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -8
- data/vendor/faiss/faiss/IndexPreTransform.h +15 -3
- data/vendor/faiss/faiss/IndexRefine.cpp +36 -4
- data/vendor/faiss/faiss/IndexRefine.h +14 -2
- data/vendor/faiss/faiss/IndexReplicas.cpp +4 -2
- data/vendor/faiss/faiss/IndexReplicas.h +2 -1
- data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +438 -0
- data/vendor/faiss/faiss/IndexRowwiseMinMax.h +92 -0
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +28 -43
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +8 -23
- data/vendor/faiss/faiss/IndexShards.cpp +4 -1
- data/vendor/faiss/faiss/IndexShards.h +2 -1
- data/vendor/faiss/faiss/MetaIndexes.cpp +5 -178
- data/vendor/faiss/faiss/MetaIndexes.h +3 -81
- data/vendor/faiss/faiss/VectorTransform.cpp +45 -1
- data/vendor/faiss/faiss/VectorTransform.h +25 -4
- data/vendor/faiss/faiss/clone_index.cpp +26 -3
- data/vendor/faiss/faiss/clone_index.h +3 -0
- data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +300 -0
- data/vendor/faiss/faiss/cppcontrib/detail/CoarseBitType.h +24 -0
- data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +195 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +2058 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +408 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +2147 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMax-inl.h +460 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h +465 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +1618 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +251 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +1452 -0
- data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +1 -0
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +2 -6
- data/vendor/faiss/faiss/gpu/GpuIcmEncoder.h +60 -0
- data/vendor/faiss/faiss/gpu/GpuIndex.h +28 -4
- data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +2 -1
- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +10 -8
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +75 -14
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +19 -32
- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -31
- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +22 -28
- data/vendor/faiss/faiss/gpu/GpuResources.cpp +14 -0
- data/vendor/faiss/faiss/gpu/GpuResources.h +16 -3
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +3 -3
- data/vendor/faiss/faiss/gpu/impl/IndexUtils.h +32 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +1 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +311 -75
- data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +10 -0
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +3 -0
- data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +2 -2
- data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +5 -4
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +331 -29
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +110 -19
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +0 -54
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +0 -76
- data/vendor/faiss/faiss/impl/DistanceComputer.h +64 -0
- data/vendor/faiss/faiss/impl/HNSW.cpp +133 -32
- data/vendor/faiss/faiss/impl/HNSW.h +19 -16
- data/vendor/faiss/faiss/impl/IDSelector.cpp +125 -0
- data/vendor/faiss/faiss/impl/IDSelector.h +135 -0
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +378 -217
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +106 -29
- data/vendor/faiss/faiss/impl/LookupTableScaler.h +77 -0
- data/vendor/faiss/faiss/impl/NNDescent.cpp +1 -0
- data/vendor/faiss/faiss/impl/NSG.cpp +1 -4
- data/vendor/faiss/faiss/impl/NSG.h +1 -1
- data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +383 -0
- data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +154 -0
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +225 -145
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +29 -10
- data/vendor/faiss/faiss/impl/Quantizer.h +43 -0
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +521 -55
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +94 -16
- data/vendor/faiss/faiss/impl/ResultHandler.h +96 -0
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +108 -191
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +18 -18
- data/vendor/faiss/faiss/impl/index_read.cpp +338 -24
- data/vendor/faiss/faiss/impl/index_write.cpp +300 -18
- data/vendor/faiss/faiss/impl/io.cpp +1 -1
- data/vendor/faiss/faiss/impl/io_macros.h +20 -0
- data/vendor/faiss/faiss/impl/kmeans1d.cpp +303 -0
- data/vendor/faiss/faiss/impl/kmeans1d.h +48 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +56 -16
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +25 -8
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +66 -25
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +75 -27
- data/vendor/faiss/faiss/index_factory.cpp +772 -412
- data/vendor/faiss/faiss/index_factory.h +3 -0
- data/vendor/faiss/faiss/index_io.h +5 -0
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +1 -0
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +4 -1
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +2 -1
- data/vendor/faiss/faiss/python/python_callbacks.cpp +27 -0
- data/vendor/faiss/faiss/python/python_callbacks.h +15 -0
- data/vendor/faiss/faiss/utils/Heap.h +31 -15
- data/vendor/faiss/faiss/utils/distances.cpp +384 -58
- data/vendor/faiss/faiss/utils/distances.h +149 -18
- data/vendor/faiss/faiss/utils/distances_simd.cpp +776 -6
- data/vendor/faiss/faiss/utils/extra_distances.cpp +12 -7
- data/vendor/faiss/faiss/utils/extra_distances.h +3 -1
- data/vendor/faiss/faiss/utils/fp16-fp16c.h +21 -0
- data/vendor/faiss/faiss/utils/fp16-inl.h +101 -0
- data/vendor/faiss/faiss/utils/fp16.h +11 -0
- data/vendor/faiss/faiss/utils/hamming-inl.h +54 -0
- data/vendor/faiss/faiss/utils/hamming.cpp +0 -48
- data/vendor/faiss/faiss/utils/ordered_key_value.h +10 -0
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +62 -0
- data/vendor/faiss/faiss/utils/quantize_lut.h +20 -0
- data/vendor/faiss/faiss/utils/random.cpp +53 -0
- data/vendor/faiss/faiss/utils/random.h +5 -0
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +4 -0
- data/vendor/faiss/faiss/utils/simdlib_emulated.h +6 -1
- data/vendor/faiss/faiss/utils/simdlib_neon.h +7 -2
- data/vendor/faiss/faiss/utils/utils.h +1 -1
- metadata +46 -5
- data/vendor/faiss/faiss/IndexResidual.cpp +0 -291
- data/vendor/faiss/faiss/IndexResidual.h +0 -152
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
|
|
2
|
+
|
|
3
|
+
#pragma once
|
|
4
|
+
|
|
5
|
+
// This file contains a custom fast implementation of faiss::Index::sa_decode()
|
|
6
|
+
// function for the following index families:
|
|
7
|
+
// * IVF256,PQ[1]x8np
|
|
8
|
+
// * Residual[1]x8,PQ[2]x8
|
|
9
|
+
// * IVF[2^9-2^16 bit],PQ[1]x8 (such as IVF1024,PQ16np)
|
|
10
|
+
// * Residual1x[9-16 bit],PQ[1]x8 (such as Residual1x9,PQ8)
|
|
11
|
+
// * PQ[1]x8
|
|
12
|
+
// Additionally, AVX2 and ARM versions support
|
|
13
|
+
// * Residual[1]x8,PQ[2]x10
|
|
14
|
+
// * Residual[1]x8,PQ[2]x16
|
|
15
|
+
// * Residual[1]x10,PQ[2]x10
|
|
16
|
+
// * Residual[1]x10,PQ[2]x16
|
|
17
|
+
// * Residual[1]x16,PQ[2]x10
|
|
18
|
+
// * Residual[1]x16,PQ[2]x16
|
|
19
|
+
// * Residual1x[9-16 bit],PQ[1]x10 (such as Residual1x9,PQ16x10)
|
|
20
|
+
// * * (use with COARSE_BITS=16)
|
|
21
|
+
// * Residual1x[9-16 bit],PQ[1]x16 (such as Residual1x9,PQ16x16)
|
|
22
|
+
// * * (use with COARSE_BITS=16)
|
|
23
|
+
// * PQ[1]x10
|
|
24
|
+
// * PQ[1]x16
|
|
25
|
+
// Unfortunately, currently Faiss does not support something like
|
|
26
|
+
// IVF256,PQ16x10np
|
|
27
|
+
//
|
|
28
|
+
// The goal was to achieve the maximum performance, so the template version it
|
|
29
|
+
// is. The provided index families share the same code for sa_decode.
|
|
30
|
+
//
|
|
31
|
+
// The front-end code provides two high-level structures.
|
|
32
|
+
//
|
|
33
|
+
// First one:
|
|
34
|
+
// {
|
|
35
|
+
// template <
|
|
36
|
+
// intptr_t DIM,
|
|
37
|
+
// intptr_t COARSE_SIZE,
|
|
38
|
+
// intptr_t FINE_SIZE,
|
|
39
|
+
// intptr_t COARSE_BITS = 8
|
|
40
|
+
// intptr_t FINE_BITS = 8>
|
|
41
|
+
// struct Index2LevelDecoder { /*...*/ };
|
|
42
|
+
// }
|
|
43
|
+
// * DIM is the dimensionality of data
|
|
44
|
+
// * COARSE_SIZE is the dimensionality of the coarse quantizer (IVF, Residual)
|
|
45
|
+
// * FINE_SIZE is the dimensionality of the ProductQuantizer dsq
|
|
46
|
+
// * COARSE_BITS is the number of bits that are needed to represent a coarse
|
|
47
|
+
// quantizer code.
|
|
48
|
+
// * FINE_BITS is the number of bits that are needed to represent a fine
|
|
49
|
+
// quantizer code.
|
|
50
|
+
// For example, "IVF256,PQ8np" for 160-dim data translates into
|
|
51
|
+
// Index2LevelDecoder<160,160,20,8>
|
|
52
|
+
// For example, "Residual4x8,PQ16" for 256-dim data translates into
|
|
53
|
+
// Index2LevelDecoder<256,64,1,8>
|
|
54
|
+
// For example, "IVF1024,PQ16np" for 256-dim data translates into
|
|
55
|
+
// Index2LevelDecoder<256,256,16,10>. But as there are only 1 coarse code
|
|
56
|
+
// element, Index2LevelDecoder<256,256,16,16> can be used as a faster
|
|
57
|
+
// decoder.
|
|
58
|
+
// For example, "Residual4x10,PQ16x10np" for 256-dim data translates into
|
|
59
|
+
// Index2LevelDecoder<256,64,16,10,10>
|
|
60
|
+
//
|
|
61
|
+
// Additional supported values for COARSE_BITS and FINE_BITS may be added later.
|
|
62
|
+
//
|
|
63
|
+
// Second one:
|
|
64
|
+
// {
|
|
65
|
+
// template <
|
|
66
|
+
// intptr_t DIM,
|
|
67
|
+
// intptr_t FINE_SIZE,
|
|
68
|
+
// intptr_t FINE_BITS = 8>
|
|
69
|
+
// struct IndexPQDecoder { /*...*/ };
|
|
70
|
+
// }
|
|
71
|
+
// * DIM is the dimensionality of data
|
|
72
|
+
// * FINE_SIZE is the dimensionality of the ProductQuantizer dsq
|
|
73
|
+
// * FINE_BITS is the number of bits that are needed to represent a fine
|
|
74
|
+
// quantizer code.
|
|
75
|
+
// For example, "PQ8np" for 160-dim data translates into
|
|
76
|
+
// IndexPQDecoder<160,20>
|
|
77
|
+
//
|
|
78
|
+
// Unlike the general purpose version in faiss::Index::sa_decode(),
|
|
79
|
+
// this version provides the following functions (please note that
|
|
80
|
+
// pqCoarseCentroids params are not available for IndexPQDecoder,
|
|
81
|
+
// but the functionality is the same as for Index2LevelDecoder):
|
|
82
|
+
//
|
|
83
|
+
// * ::store(), which is similar to sa_decode(1, input, output),
|
|
84
|
+
// The method signature is the following:
|
|
85
|
+
// {
|
|
86
|
+
// void store(
|
|
87
|
+
// const float* const __restrict pqCoarseCentroids,
|
|
88
|
+
// const float* const __restrict pqFineCentroids,
|
|
89
|
+
// const uint8_t* const __restrict code,
|
|
90
|
+
// float* const __restrict outputStore);
|
|
91
|
+
// }
|
|
92
|
+
//
|
|
93
|
+
// * ::accum(), which is used to create a linear combination
|
|
94
|
+
// of decoded vectors:
|
|
95
|
+
// {
|
|
96
|
+
// const faiss::Index* const index;
|
|
97
|
+
// const uint8_t* const input;
|
|
98
|
+
// float weight;
|
|
99
|
+
//
|
|
100
|
+
// std::vector<float> buffer(d, 0);
|
|
101
|
+
//
|
|
102
|
+
// index->sa_decode(1, input, buffer.data());
|
|
103
|
+
// for (size_t iDim = 0; iDim < d; iDim++)
|
|
104
|
+
// output[iDim] += weight * buffer[iDim];
|
|
105
|
+
// }
|
|
106
|
+
// The method signature is the following:
|
|
107
|
+
// {
|
|
108
|
+
// static void accum(
|
|
109
|
+
// const float* const __restrict pqCoarseCentroids,
|
|
110
|
+
// const float* const __restrict pqFineCentroids,
|
|
111
|
+
// const uint8_t* const __restrict code,
|
|
112
|
+
// const float weight,
|
|
113
|
+
// float* const __restrict outputAccum);
|
|
114
|
+
// }
|
|
115
|
+
//
|
|
116
|
+
// * There is an additional overload for ::accum() that decodes two vectors
|
|
117
|
+
// per call. This provides an additional speedup because of a CPU
|
|
118
|
+
// superscalar architecture:
|
|
119
|
+
// {
|
|
120
|
+
// const faiss::Index* const index;
|
|
121
|
+
// const uint8_t* const input0;
|
|
122
|
+
// float weight0;
|
|
123
|
+
// const uint8_t* const input1;
|
|
124
|
+
// float weight1;
|
|
125
|
+
//
|
|
126
|
+
// std::vector<float> buffer(d, 0);
|
|
127
|
+
//
|
|
128
|
+
// index->sa_decode(1, input0, buffer.data());
|
|
129
|
+
// for (size_t iDim = 0; iDim < d; iDim++)
|
|
130
|
+
// output[iDim] += weight0 * buffer[iDim];
|
|
131
|
+
//
|
|
132
|
+
// index->sa_decode(1, input1, buffer.data());
|
|
133
|
+
// for (size_t iDim = 0; iDim < d; iDim++)
|
|
134
|
+
// output[iDim] += weight1 * buffer[iDim];
|
|
135
|
+
// }
|
|
136
|
+
// If each code uses its own coarse quantizer centroids table and its own fine
|
|
137
|
+
// quantizer centroids table, then the following overload can be used:
|
|
138
|
+
// {
|
|
139
|
+
// static void accum(
|
|
140
|
+
// const float* const __restrict pqCoarseCentroids0,
|
|
141
|
+
// const float* const __restrict pqFineCentroids0,
|
|
142
|
+
// const uint8_t* const __restrict code0,
|
|
143
|
+
// const float weight0,
|
|
144
|
+
// const float* const __restrict pqCoarseCentroids1,
|
|
145
|
+
// const float* const __restrict pqFineCentroids1,
|
|
146
|
+
// const uint8_t* const __restrict code1,
|
|
147
|
+
// const float weight1,
|
|
148
|
+
// float* const __restrict outputAccum);
|
|
149
|
+
// }
|
|
150
|
+
// If codes share the coarse quantizer centroids table and also share
|
|
151
|
+
// the fine quantizer centroids table, then the following overload can be
|
|
152
|
+
// used:
|
|
153
|
+
// {
|
|
154
|
+
// static void accum(
|
|
155
|
+
// const float* const __restrict pqCoarseCentroids,
|
|
156
|
+
// const float* const __restrict pqFineCentroids,
|
|
157
|
+
// const uint8_t* const __restrict code0,
|
|
158
|
+
// const float weight0,
|
|
159
|
+
// const uint8_t* const __restrict code1,
|
|
160
|
+
// const float weight1,
|
|
161
|
+
// float* const __restrict outputAccum);
|
|
162
|
+
// }
|
|
163
|
+
//
|
|
164
|
+
// * And one more overload for ::accum() that decodes and accumulates
|
|
165
|
+
// three vectors per call.
|
|
166
|
+
// {
|
|
167
|
+
// const faiss::Index* const index;
|
|
168
|
+
// const uint8_t* const input0;
|
|
169
|
+
// float weight0;
|
|
170
|
+
// const uint8_t* const input1;
|
|
171
|
+
// float weight1;
|
|
172
|
+
// const uint8_t* const input2;
|
|
173
|
+
// float weight2;
|
|
174
|
+
//
|
|
175
|
+
// std::vector<float> buffer(d, 0);
|
|
176
|
+
//
|
|
177
|
+
// index->sa_decode(1, input0, buffer.data());
|
|
178
|
+
// for (size_t iDim = 0; iDim < d; iDim++)
|
|
179
|
+
// output[iDim] += weight0 * buffer[iDim];
|
|
180
|
+
//
|
|
181
|
+
// index->sa_decode(1, input1, buffer.data());
|
|
182
|
+
// for (size_t iDim = 0; iDim < d; iDim++)
|
|
183
|
+
// output[iDim] += weight1 * buffer[iDim];
|
|
184
|
+
//
|
|
185
|
+
// index->sa_decode(1, input2, buffer.data());
|
|
186
|
+
// for (size_t iDim = 0; iDim < d; iDim++)
|
|
187
|
+
// output[iDim] += weight2 * buffer[iDim];
|
|
188
|
+
// }
|
|
189
|
+
//
|
|
190
|
+
// If each code uses its own coarse quantizer centroids table and its own fine
|
|
191
|
+
// quantizer centroids table, then the following overload can be used:
|
|
192
|
+
// {
|
|
193
|
+
// static void accum(
|
|
194
|
+
// const float* const __restrict pqCoarseCentroids0,
|
|
195
|
+
// const float* const __restrict pqFineCentroids0,
|
|
196
|
+
// const uint8_t* const __restrict code0,
|
|
197
|
+
// const float weight0,
|
|
198
|
+
// const float* const __restrict pqCoarseCentroids1,
|
|
199
|
+
// const float* const __restrict pqFineCentroids1,
|
|
200
|
+
// const uint8_t* const __restrict code1,
|
|
201
|
+
// const float weight1,
|
|
202
|
+
// const float* const __restrict pqCoarseCentroids2,
|
|
203
|
+
// const float* const __restrict pqFineCentroids2,
|
|
204
|
+
// const uint8_t* const __restrict code2,
|
|
205
|
+
// const float weight2,
|
|
206
|
+
// float* const __restrict outputAccum);
|
|
207
|
+
// }
|
|
208
|
+
// If codes share the coarse quantizer centroids table and also share
|
|
209
|
+
// the fine quantizer centroids table, then the following overload can be
|
|
210
|
+
// used:
|
|
211
|
+
// {
|
|
212
|
+
// static void accum(
|
|
213
|
+
// const float* const __restrict pqCoarseCentroids,
|
|
214
|
+
// const float* const __restrict pqFineCentroids,
|
|
215
|
+
// const uint8_t* const __restrict code0,
|
|
216
|
+
// const float weight0,
|
|
217
|
+
// const uint8_t* const __restrict code1,
|
|
218
|
+
// const float weight1,
|
|
219
|
+
// const uint8_t* const __restrict code2,
|
|
220
|
+
// const float weight2,
|
|
221
|
+
// float* const __restrict outputAccum);
|
|
222
|
+
// }
|
|
223
|
+
//
|
|
224
|
+
// The provided version is not multithreaded.
|
|
225
|
+
//
|
|
226
|
+
// Currently, an AVX2+FMA implementation is available. AVX512 version is also
|
|
227
|
+
// doable, but it was found to be slower than AVX2 for real world applications
|
|
228
|
+
// that I needed.
|
|
229
|
+
//
|
|
230
|
+
////////////////////////////////////////////////////////////////////////////////////
|
|
231
|
+
//
|
|
232
|
+
// It is possible to use an additional index wrapper on top of IVFPQ /
|
|
233
|
+
// Residual+PQ, known as IndexRowwiseMinMax / IndexRowwiseMinMaxFP16. Index
|
|
234
|
+
// wrapper that performs rowwise normalization to [0,1], preserving the
|
|
235
|
+
// coefficients. This is a vector codec index only.
|
|
236
|
+
// For more details please refer to the description in
|
|
237
|
+
// faiss/IndexRowwiseMinMax.h file.
|
|
238
|
+
//
|
|
239
|
+
// If such a wrapper is used, then the quantizer will look like, say,
|
|
240
|
+
// MinMaxFP16,IVF256,PQ32np
|
|
241
|
+
// or
|
|
242
|
+
// MinMax,PQ16np
|
|
243
|
+
// In this case, please use the following construction for the decoding,
|
|
244
|
+
// basically, wrapping a kernel in a kernel:
|
|
245
|
+
// {
|
|
246
|
+
// using SubT = faiss::cppcontrib::Index2LevelDecoder<128, 128, 2>;
|
|
247
|
+
// using T = faiss::cppcontrib::IndexMinMaxFP16Decoder<SubT>;
|
|
248
|
+
// // do T::store(...) or T::accum(...)
|
|
249
|
+
// }
|
|
250
|
+
//
|
|
251
|
+
// T::accum(...) contains an additional function variable which is
|
|
252
|
+
// used for accumulating scaling. Thus, the code pattern is the following:
|
|
253
|
+
// {
|
|
254
|
+
// const float* const __restrict pqCoarseCentroidsQ;
|
|
255
|
+
// const float* const __restrict pqFineCentroidsQ;
|
|
256
|
+
// const uint8_t* const __restrict input;
|
|
257
|
+
// const float* const __restrict weights;
|
|
258
|
+
// float* const __restrict output;
|
|
259
|
+
// float outputAccumMin = 0;
|
|
260
|
+
//
|
|
261
|
+
// for (size_t i = 0; i < n; i++) {
|
|
262
|
+
// T::accum(
|
|
263
|
+
// pqCoarseCentroidsQ,
|
|
264
|
+
// pqFineCentroidsQ,
|
|
265
|
+
// input + i * code_size,
|
|
266
|
+
// weights[i],
|
|
267
|
+
// output,
|
|
268
|
+
// outputAccumMin);
|
|
269
|
+
// }
|
|
270
|
+
// for (size_t j = 0; j < d; j++)
|
|
271
|
+
// output[j] += outputAccumMin;
|
|
272
|
+
// }
|
|
273
|
+
// This is similar to the following regular pseudo-code:
|
|
274
|
+
// {
|
|
275
|
+
// const faiss::Index* const index;
|
|
276
|
+
// const uint8_t* const __restrict input;
|
|
277
|
+
// const float* const __restrict weights;
|
|
278
|
+
// float* const __restrict output;
|
|
279
|
+
//
|
|
280
|
+
// for (size_t i = 0; i < n; i++) {
|
|
281
|
+
// std::vector<float> buffer(d, 0);
|
|
282
|
+
//
|
|
283
|
+
// index->sa_decode(1, input + i * code_size, buffer.data());
|
|
284
|
+
// for (size_t j = 0; j < d; j++)
|
|
285
|
+
// output[j] += weights[i] * buffer[j];
|
|
286
|
+
// }
|
|
287
|
+
|
|
288
|
+
#include <faiss/cppcontrib/sa_decode/MinMax-inl.h>
|
|
289
|
+
#include <faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h>
|
|
290
|
+
|
|
291
|
+
#ifdef __AVX2__
|
|
292
|
+
#include <faiss/cppcontrib/sa_decode/Level2-avx2-inl.h>
|
|
293
|
+
#include <faiss/cppcontrib/sa_decode/PQ-avx2-inl.h>
|
|
294
|
+
#elif defined(__ARM_NEON)
|
|
295
|
+
#include <faiss/cppcontrib/sa_decode/Level2-neon-inl.h>
|
|
296
|
+
#include <faiss/cppcontrib/sa_decode/PQ-neon-inl.h>
|
|
297
|
+
#else
|
|
298
|
+
#include <faiss/cppcontrib/sa_decode/Level2-inl.h>
|
|
299
|
+
#include <faiss/cppcontrib/sa_decode/PQ-inl.h>
|
|
300
|
+
#endif
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include <cstdint>
|
|
4
|
+
|
|
5
|
+
namespace faiss {
|
|
6
|
+
namespace cppcontrib {
|
|
7
|
+
namespace detail {
|
|
8
|
+
|
|
9
|
+
// Maps a coarse quantizer code width (in bits) to the narrowest unsigned
// integer type able to hold such a code. Only 8- and 16-bit widths are
// supported: instantiating the primary template with any other width leaves
// `bit_type` undefined, so misuse is caught at compile time.
template <int COARSE_BITS>
struct CoarseBitType {};

// 8-bit coarse codes fit in a single byte.
template <>
struct CoarseBitType<8> {
    using bit_type = uint8_t;
};

// 16-bit coarse codes need two bytes.
template <>
struct CoarseBitType<16> {
    using bit_type = uint16_t;
};
|
|
21
|
+
|
|
22
|
+
} // namespace detail
|
|
23
|
+
} // namespace cppcontrib
|
|
24
|
+
} // namespace faiss
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include <cstdint>
|
|
4
|
+
|
|
5
|
+
namespace faiss {
|
|
6
|
+
namespace cppcontrib {
|
|
7
|
+
namespace detail {
|
|
8
|
+
|
|
9
|
+
namespace {
|
|
10
|
+
|
|
11
|
+
// Reads one 8-bit element from an array of uint8 codes.
// A single 32-bit load is preferred whenever the whole 4-byte word that
// contains the element lies inside the buffer; this reduces the number of
// read operations from RAM.
template <intptr_t N_ELEMENTS, intptr_t CPOS>
struct Uint8Reader {
    static_assert(CPOS < N_ELEMENTS, "CPOS should be less than N_ELEMENTS");

    // Returns the CPOS-th byte of `codes`, widened to intptr_t.
    static intptr_t get(const uint8_t* const __restrict codes) {
        // Per the original implementation notes, an 8-byte load takes too
        // many registers, so 4-byte loads are the widest ones used here.
        constexpr intptr_t WORD = CPOS / 4;
        constexpr intptr_t LANE = CPOS % 4;

        if ((WORD + 1) * 4 <= N_ELEMENTS) {
            // The full 32-bit word is in bounds: load it once and extract
            // the requested byte lane.
            const uint32_t word32 =
                    *reinterpret_cast<const uint32_t*>(codes + WORD * 4);
            return (word32 >> (LANE * 8)) & 0xFF;
        } else {
            // Partial word at the end of the buffer: plain byte load.
            return codes[CPOS];
        }
    }
};
|
|
62
|
+
|
|
63
|
+
// Reads one 10-bit element out of a packed stream of 10-bit codes,
// reducing the number of read operations from RAM by using wide loads
// where the buffer bounds allow it.
// Four consecutive codes occupy exactly 5 bytes; bit layout (bit 0 is the
// lowest bit of the first byte):
///////////////////////////////////////////////
// 76543210 76543210 76543210 76543210 76543210
// 00000000 00
//    111111 1111
//         2222 222222
//             33 33333333
template <intptr_t N_ELEMENTS, intptr_t CPOS>
struct Uint10Reader {
    static_assert(CPOS < N_ELEMENTS, "CPOS should be less than N_ELEMENTS");

    // Returns the CPOS-th 10-bit code from `codes`, widened to intptr_t.
    static intptr_t get(const uint8_t* const __restrict codes) {
        // Read using 4-bytes or 2-bytes.

        // Each group of 4 codes spans 5 bytes.
        constexpr intptr_t ELEMENT_TO_READ = CPOS / 4;
        constexpr intptr_t SUB_ELEMENT = CPOS % 4;

        switch (SUB_ELEMENT) {
            case 0: {
                // Bits [0, 10) of the 5-byte group.
                if (N_ELEMENTS > CPOS + 2) {
                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
                            codes + ELEMENT_TO_READ * 5);
                    return (code32 & 0b0000001111111111);
                } else {
                    // Near the end of the buffer: use a 2-byte load that
                    // stays in bounds.
                    const uint16_t code16 = *reinterpret_cast<const uint16_t*>(
                            codes + ELEMENT_TO_READ * 5 + 0);
                    return (code16 & 0b0000001111111111);
                }
            }
            case 1: {
                // Bits [10, 20) of the 5-byte group.
                if (N_ELEMENTS > CPOS + 1) {
                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
                            codes + ELEMENT_TO_READ * 5);
                    return (code32 & 0b000011111111110000000000) >> 10;
                } else {
                    // Same bits, read as a 2-byte load starting at byte 1
                    // (the element sits at bit offset 2 within that load).
                    const uint16_t code16 = *reinterpret_cast<const uint16_t*>(
                            codes + ELEMENT_TO_READ * 5 + 1);
                    return (code16 & 0b0000111111111100) >> 2;
                }
            }
            case 2: {
                // Bits [20, 30) of the 5-byte group.
                // NOTE(review): this condition is always true given the
                // static_assert above, so the 4-byte path is always taken;
                // the else branch appears to be dead code kept for symmetry.
                if (N_ELEMENTS > CPOS) {
                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
                            codes + ELEMENT_TO_READ * 5);
                    return (code32 & 0b00111111111100000000000000000000) >> 20;
                } else {
                    const uint16_t code16 = *reinterpret_cast<const uint16_t*>(
                            codes + ELEMENT_TO_READ * 5 + 2);
                    return (code16 & 0b0011111111110000) >> 4;
                }
            }
            case 3: {
                // Bits [30, 40): always reachable with a 2-byte load at
                // byte 3, so no wide-load variant is needed.
                const uint16_t code16 = *reinterpret_cast<const uint16_t*>(
                        codes + ELEMENT_TO_READ * 5 + 3);
                return (code16 & 0b1111111111000000) >> 6;
            }
        }
        // All SUB_ELEMENT values 0..3 are handled above; control cannot
        // reach this point.
    }
};
|
|
122
|
+
|
|
123
|
+
// Reads one 16-bit element from a packed array of uint16 codes.
// A single 32-bit load is preferred whenever the whole 4-byte word that
// contains the element lies inside the buffer; this reduces the number of
// read operations from RAM.
template <intptr_t N_ELEMENTS, intptr_t CPOS>
struct Uint16Reader {
    static_assert(CPOS < N_ELEMENTS, "CPOS should be less than N_ELEMENTS");

    // Returns the CPOS-th 16-bit code from `codes`, widened to intptr_t.
    static intptr_t get(const uint8_t* const __restrict codes) {
        // Per the original implementation notes, an 8-byte load takes too
        // many registers, so 4-byte loads are the widest ones used here.
        constexpr intptr_t WORD = CPOS / 2;
        constexpr intptr_t LANE = CPOS % 2;

        if ((WORD + 1) * 2 <= N_ELEMENTS) {
            // The full 32-bit word is in bounds: load it once and pick
            // the requested 16-bit lane.
            const uint32_t word32 =
                    *reinterpret_cast<const uint32_t*>(codes + WORD * 4);
            return (word32 >> (LANE * 16)) & 0xFFFF;
        } else {
            // Tail of an odd-sized buffer: fall back to a plain 16-bit load.
            const uint16_t* const __restrict codes16 =
                    reinterpret_cast<const uint16_t*>(codes);
            return codes16[CPOS];
        }
    }
};
|
|
161
|
+
|
|
162
|
+
// Maps a code width (in bits) to the matching packed-stream reader type.
// Only 8-, 10- and 16-bit widths are supported: any other CODE_BITS value
// instantiates the primary template, which has no `reader_type` member and
// therefore fails at compile time.
template <intptr_t N_ELEMENTS, intptr_t CODE_BITS, intptr_t CPOS>
struct UintReaderImplType {};

template <intptr_t N_ELEMENTS, intptr_t CPOS>
struct UintReaderImplType<N_ELEMENTS, 8, CPOS> {
    using reader_type = Uint8Reader<N_ELEMENTS, CPOS>;
};

template <intptr_t N_ELEMENTS, intptr_t CPOS>
struct UintReaderImplType<N_ELEMENTS, 10, CPOS> {
    using reader_type = Uint10Reader<N_ELEMENTS, CPOS>;
};

template <intptr_t N_ELEMENTS, intptr_t CPOS>
struct UintReaderImplType<N_ELEMENTS, 16, CPOS> {
    using reader_type = Uint16Reader<N_ELEMENTS, CPOS>;
};
|
|
180
|
+
|
|
181
|
+
} // namespace
|
|
182
|
+
|
|
183
|
+
// Convenience alias: selects the reader for DIM-dimensional data split into
// sub-vectors of CODE_SIZE dimensions, each encoded with CODE_BITS bits
// (the number of packed elements is DIM / CODE_SIZE). The selected reader
// uses wide loads to reduce the number of read operations from RAM.
template <intptr_t DIM, intptr_t CODE_SIZE, intptr_t CODE_BITS, intptr_t CPOS>
using UintReader =
        typename UintReaderImplType<DIM / CODE_SIZE, CODE_BITS, CPOS>::
                reader_type;

// Same as UintReader, but parameterized directly by the number of packed
// elements instead of DIM / CODE_SIZE.
template <intptr_t N_ELEMENTS, intptr_t CODE_BITS, intptr_t CPOS>
using UintReaderRaw =
        typename UintReaderImplType<N_ELEMENTS, CODE_BITS, CPOS>::reader_type;
|
|
192
|
+
|
|
193
|
+
} // namespace detail
|
|
194
|
+
} // namespace cppcontrib
|
|
195
|
+
} // namespace faiss
|