faiss 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +23 -21
- data/ext/faiss/extconf.rb +11 -0
- data/ext/faiss/index.cpp +4 -4
- data/ext/faiss/index_binary.cpp +6 -6
- data/ext/faiss/product_quantizer.cpp +4 -4
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +13 -0
- data/vendor/faiss/faiss/IVFlib.cpp +101 -2
- data/vendor/faiss/faiss/IVFlib.h +26 -2
- data/vendor/faiss/faiss/Index.cpp +36 -3
- data/vendor/faiss/faiss/Index.h +43 -6
- data/vendor/faiss/faiss/Index2Layer.cpp +6 -2
- data/vendor/faiss/faiss/Index2Layer.h +6 -1
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +219 -16
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +63 -5
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +299 -0
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +199 -0
- data/vendor/faiss/faiss/IndexBinary.cpp +20 -4
- data/vendor/faiss/faiss/IndexBinary.h +18 -3
- data/vendor/faiss/faiss/IndexBinaryFlat.cpp +9 -2
- data/vendor/faiss/faiss/IndexBinaryFlat.h +4 -2
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -1
- data/vendor/faiss/faiss/IndexBinaryFromFloat.h +2 -1
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +5 -1
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +2 -1
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +17 -4
- data/vendor/faiss/faiss/IndexBinaryHash.h +8 -4
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +28 -13
- data/vendor/faiss/faiss/IndexBinaryIVF.h +10 -7
- data/vendor/faiss/faiss/IndexFastScan.cpp +626 -0
- data/vendor/faiss/faiss/IndexFastScan.h +145 -0
- data/vendor/faiss/faiss/IndexFlat.cpp +34 -21
- data/vendor/faiss/faiss/IndexFlat.h +7 -4
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +35 -1
- data/vendor/faiss/faiss/IndexFlatCodes.h +12 -0
- data/vendor/faiss/faiss/IndexHNSW.cpp +66 -138
- data/vendor/faiss/faiss/IndexHNSW.h +4 -2
- data/vendor/faiss/faiss/IndexIDMap.cpp +247 -0
- data/vendor/faiss/faiss/IndexIDMap.h +107 -0
- data/vendor/faiss/faiss/IndexIVF.cpp +121 -33
- data/vendor/faiss/faiss/IndexIVF.h +35 -16
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +84 -7
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +63 -1
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +590 -0
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +171 -0
- data/vendor/faiss/faiss/IndexIVFFastScan.cpp +1290 -0
- data/vendor/faiss/faiss/IndexIVFFastScan.h +213 -0
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +37 -17
- data/vendor/faiss/faiss/IndexIVFFlat.h +4 -2
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +234 -50
- data/vendor/faiss/faiss/IndexIVFPQ.h +5 -1
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +23 -852
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +7 -112
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +3 -3
- data/vendor/faiss/faiss/IndexIVFPQR.h +1 -1
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +3 -1
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +2 -1
- data/vendor/faiss/faiss/IndexLSH.cpp +4 -2
- data/vendor/faiss/faiss/IndexLSH.h +2 -1
- data/vendor/faiss/faiss/IndexLattice.cpp +7 -1
- data/vendor/faiss/faiss/IndexLattice.h +3 -1
- data/vendor/faiss/faiss/IndexNNDescent.cpp +4 -3
- data/vendor/faiss/faiss/IndexNNDescent.h +2 -1
- data/vendor/faiss/faiss/IndexNSG.cpp +37 -3
- data/vendor/faiss/faiss/IndexNSG.h +25 -1
- data/vendor/faiss/faiss/IndexPQ.cpp +106 -69
- data/vendor/faiss/faiss/IndexPQ.h +19 -5
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +15 -450
- data/vendor/faiss/faiss/IndexPQFastScan.h +15 -78
- data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -8
- data/vendor/faiss/faiss/IndexPreTransform.h +15 -3
- data/vendor/faiss/faiss/IndexRefine.cpp +8 -4
- data/vendor/faiss/faiss/IndexRefine.h +4 -2
- data/vendor/faiss/faiss/IndexReplicas.cpp +4 -2
- data/vendor/faiss/faiss/IndexReplicas.h +2 -1
- data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +438 -0
- data/vendor/faiss/faiss/IndexRowwiseMinMax.h +92 -0
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +26 -15
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +6 -7
- data/vendor/faiss/faiss/IndexShards.cpp +4 -1
- data/vendor/faiss/faiss/IndexShards.h +2 -1
- data/vendor/faiss/faiss/MetaIndexes.cpp +5 -178
- data/vendor/faiss/faiss/MetaIndexes.h +3 -81
- data/vendor/faiss/faiss/VectorTransform.cpp +43 -0
- data/vendor/faiss/faiss/VectorTransform.h +22 -4
- data/vendor/faiss/faiss/clone_index.cpp +23 -1
- data/vendor/faiss/faiss/clone_index.h +3 -0
- data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +300 -0
- data/vendor/faiss/faiss/cppcontrib/detail/CoarseBitType.h +24 -0
- data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +195 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +2058 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +408 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +2147 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMax-inl.h +460 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h +465 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +1618 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +251 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +1452 -0
- data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +1 -0
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +0 -4
- data/vendor/faiss/faiss/gpu/GpuIndex.h +28 -4
- data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +2 -1
- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +10 -8
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +75 -14
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +19 -32
- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -31
- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +22 -28
- data/vendor/faiss/faiss/gpu/GpuResources.cpp +14 -0
- data/vendor/faiss/faiss/gpu/GpuResources.h +16 -3
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +3 -3
- data/vendor/faiss/faiss/gpu/impl/IndexUtils.h +32 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +1 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +311 -75
- data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +10 -0
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +3 -0
- data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +2 -2
- data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +5 -4
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +116 -47
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +44 -13
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +0 -54
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +0 -76
- data/vendor/faiss/faiss/impl/DistanceComputer.h +64 -0
- data/vendor/faiss/faiss/impl/HNSW.cpp +123 -27
- data/vendor/faiss/faiss/impl/HNSW.h +19 -16
- data/vendor/faiss/faiss/impl/IDSelector.cpp +125 -0
- data/vendor/faiss/faiss/impl/IDSelector.h +135 -0
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +6 -28
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +6 -1
- data/vendor/faiss/faiss/impl/LookupTableScaler.h +77 -0
- data/vendor/faiss/faiss/impl/NNDescent.cpp +1 -0
- data/vendor/faiss/faiss/impl/NSG.cpp +1 -1
- data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +383 -0
- data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +154 -0
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +225 -145
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +29 -10
- data/vendor/faiss/faiss/impl/Quantizer.h +43 -0
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +192 -36
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +40 -20
- data/vendor/faiss/faiss/impl/ResultHandler.h +96 -0
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +97 -173
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +18 -18
- data/vendor/faiss/faiss/impl/index_read.cpp +240 -9
- data/vendor/faiss/faiss/impl/index_write.cpp +237 -5
- data/vendor/faiss/faiss/impl/kmeans1d.cpp +6 -4
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +56 -16
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +25 -8
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +66 -25
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +75 -27
- data/vendor/faiss/faiss/index_factory.cpp +196 -7
- data/vendor/faiss/faiss/index_io.h +5 -0
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +1 -0
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +4 -1
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +2 -1
- data/vendor/faiss/faiss/python/python_callbacks.cpp +27 -0
- data/vendor/faiss/faiss/python/python_callbacks.h +15 -0
- data/vendor/faiss/faiss/utils/Heap.h +31 -15
- data/vendor/faiss/faiss/utils/distances.cpp +380 -56
- data/vendor/faiss/faiss/utils/distances.h +113 -15
- data/vendor/faiss/faiss/utils/distances_simd.cpp +726 -6
- data/vendor/faiss/faiss/utils/extra_distances.cpp +12 -7
- data/vendor/faiss/faiss/utils/extra_distances.h +3 -1
- data/vendor/faiss/faiss/utils/fp16-fp16c.h +21 -0
- data/vendor/faiss/faiss/utils/fp16-inl.h +101 -0
- data/vendor/faiss/faiss/utils/fp16.h +11 -0
- data/vendor/faiss/faiss/utils/hamming-inl.h +54 -0
- data/vendor/faiss/faiss/utils/hamming.cpp +0 -48
- data/vendor/faiss/faiss/utils/ordered_key_value.h +10 -0
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +62 -0
- data/vendor/faiss/faiss/utils/quantize_lut.h +20 -0
- data/vendor/faiss/faiss/utils/random.cpp +53 -0
- data/vendor/faiss/faiss/utils/random.h +5 -0
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +4 -0
- data/vendor/faiss/faiss/utils/simdlib_emulated.h +6 -1
- data/vendor/faiss/faiss/utils/simdlib_neon.h +7 -2
- metadata +37 -3
|
@@ -43,19 +43,27 @@ struct VectorTransform {
|
|
|
43
43
|
*/
|
|
44
44
|
virtual void train(idx_t n, const float* x);
|
|
45
45
|
|
|
46
|
-
/** apply the
|
|
47
|
-
* @param
|
|
48
|
-
* @
|
|
46
|
+
/** apply the transformation and return the result in an allocated pointer
|
|
47
|
+
* @param n number of vectors to transform
|
|
48
|
+
* @param x input vectors, size n * d_in
|
|
49
|
+
* @return output vectors, size n * d_out
|
|
49
50
|
*/
|
|
50
51
|
float* apply(idx_t n, const float* x) const;
|
|
51
52
|
|
|
52
|
-
|
|
53
|
+
/** apply the transformation and return the result in a provided matrix
|
|
54
|
+
* @param n number of vectors to transform
|
|
55
|
+
* @param x input vectors, size n * d_in
|
|
56
|
+
* @param xt output vectors, size n * d_out
|
|
57
|
+
*/
|
|
53
58
|
virtual void apply_noalloc(idx_t n, const float* x, float* xt) const = 0;
|
|
54
59
|
|
|
55
60
|
/// reverse transformation. May not be implemented or may return
|
|
56
61
|
/// approximate result
|
|
57
62
|
virtual void reverse_transform(idx_t n, const float* xt, float* x) const;
|
|
58
63
|
|
|
64
|
+
// check that the two transforms are identical (to merge indexes)
|
|
65
|
+
virtual void check_identical(const VectorTransform& other) const = 0;
|
|
66
|
+
|
|
59
67
|
virtual ~VectorTransform() {}
|
|
60
68
|
};
|
|
61
69
|
|
|
@@ -100,6 +108,8 @@ struct LinearTransform : VectorTransform {
|
|
|
100
108
|
int n,
|
|
101
109
|
int d) const;
|
|
102
110
|
|
|
111
|
+
void check_identical(const VectorTransform& other) const override;
|
|
112
|
+
|
|
103
113
|
~LinearTransform() override {}
|
|
104
114
|
};
|
|
105
115
|
|
|
@@ -207,6 +217,8 @@ struct ITQTransform : VectorTransform {
|
|
|
207
217
|
void train(idx_t n, const float* x) override;
|
|
208
218
|
|
|
209
219
|
void apply_noalloc(idx_t n, const float* x, float* xt) const override;
|
|
220
|
+
|
|
221
|
+
void check_identical(const VectorTransform& other) const override;
|
|
210
222
|
};
|
|
211
223
|
|
|
212
224
|
struct ProductQuantizer;
|
|
@@ -260,6 +272,8 @@ struct RemapDimensionsTransform : VectorTransform {
|
|
|
260
272
|
void reverse_transform(idx_t n, const float* xt, float* x) const override;
|
|
261
273
|
|
|
262
274
|
RemapDimensionsTransform() {}
|
|
275
|
+
|
|
276
|
+
void check_identical(const VectorTransform& other) const override;
|
|
263
277
|
};
|
|
264
278
|
|
|
265
279
|
/** per-vector normalization */
|
|
@@ -273,6 +287,8 @@ struct NormalizationTransform : VectorTransform {
|
|
|
273
287
|
|
|
274
288
|
/// Identity transform since norm is not revertible
|
|
275
289
|
void reverse_transform(idx_t n, const float* xt, float* x) const override;
|
|
290
|
+
|
|
291
|
+
void check_identical(const VectorTransform& other) const override;
|
|
276
292
|
};
|
|
277
293
|
|
|
278
294
|
/** Subtract the mean of each component from the vectors. */
|
|
@@ -290,6 +306,8 @@ struct CenteringTransform : VectorTransform {
|
|
|
290
306
|
|
|
291
307
|
/// add the mean
|
|
292
308
|
void reverse_transform(idx_t n, const float* xt, float* x) const override;
|
|
309
|
+
|
|
310
|
+
void check_identical(const VectorTransform& other) const override;
|
|
293
311
|
};
|
|
294
312
|
|
|
295
313
|
} // namespace faiss
|
|
@@ -32,6 +32,11 @@
|
|
|
32
32
|
#include <faiss/MetaIndexes.h>
|
|
33
33
|
#include <faiss/VectorTransform.h>
|
|
34
34
|
|
|
35
|
+
#include <faiss/impl/LocalSearchQuantizer.h>
|
|
36
|
+
#include <faiss/impl/ProductQuantizer.h>
|
|
37
|
+
#include <faiss/impl/ResidualQuantizer.h>
|
|
38
|
+
#include <faiss/impl/ScalarQuantizer.h>
|
|
39
|
+
|
|
35
40
|
namespace faiss {
|
|
36
41
|
|
|
37
42
|
/*************************************************************
|
|
@@ -117,7 +122,9 @@ Index* Cloner::clone_Index(const Index* index) {
|
|
|
117
122
|
return res;
|
|
118
123
|
} else if (
|
|
119
124
|
const IndexIDMap* idmap = dynamic_cast<const IndexIDMap*>(index)) {
|
|
120
|
-
|
|
125
|
+
const IndexIDMap2* idmap2 = dynamic_cast<const IndexIDMap2*>(index);
|
|
126
|
+
IndexIDMap* res =
|
|
127
|
+
idmap2 ? new IndexIDMap2(*idmap2) : new IndexIDMap(*idmap);
|
|
121
128
|
res->own_fields = true;
|
|
122
129
|
res->index = clone_Index(idmap->index);
|
|
123
130
|
return res;
|
|
@@ -137,6 +144,13 @@ Index* Cloner::clone_Index(const Index* index) {
|
|
|
137
144
|
res->own_fields = true;
|
|
138
145
|
res->storage = clone_Index(insg->storage);
|
|
139
146
|
return res;
|
|
147
|
+
} else if (
|
|
148
|
+
const IndexNNDescent* innd =
|
|
149
|
+
dynamic_cast<const IndexNNDescent*>(index)) {
|
|
150
|
+
IndexNNDescent* res = new IndexNNDescent(*innd);
|
|
151
|
+
res->own_fields = true;
|
|
152
|
+
res->storage = clone_Index(innd->storage);
|
|
153
|
+
return res;
|
|
140
154
|
} else if (
|
|
141
155
|
const Index2Layer* i2l = dynamic_cast<const Index2Layer*>(index)) {
|
|
142
156
|
Index2Layer* res = new Index2Layer(*i2l);
|
|
@@ -149,4 +163,12 @@ Index* Cloner::clone_Index(const Index* index) {
|
|
|
149
163
|
return nullptr;
|
|
150
164
|
}
|
|
151
165
|
|
|
166
|
+
Quantizer* clone_Quantizer(const Quantizer* quant) {
|
|
167
|
+
TRYCLONE(ResidualQuantizer, quant)
|
|
168
|
+
TRYCLONE(LocalSearchQuantizer, quant)
|
|
169
|
+
TRYCLONE(ProductQuantizer, quant)
|
|
170
|
+
TRYCLONE(ScalarQuantizer, quant)
|
|
171
|
+
FAISS_THROW_MSG("Did not recognize quantizer to clone");
|
|
172
|
+
}
|
|
173
|
+
|
|
152
174
|
} // namespace faiss
|
|
@@ -16,6 +16,7 @@ namespace faiss {
|
|
|
16
16
|
struct Index;
|
|
17
17
|
struct IndexIVF;
|
|
18
18
|
struct VectorTransform;
|
|
19
|
+
struct Quantizer;
|
|
19
20
|
|
|
20
21
|
/* cloning functions */
|
|
21
22
|
Index* clone_index(const Index*);
|
|
@@ -30,4 +31,6 @@ struct Cloner {
|
|
|
30
31
|
virtual ~Cloner() {}
|
|
31
32
|
};
|
|
32
33
|
|
|
34
|
+
Quantizer* clone_Quantizer(const Quantizer* quant);
|
|
35
|
+
|
|
33
36
|
} // namespace faiss
|
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
|
|
2
|
+
|
|
3
|
+
#pragma once
|
|
4
|
+
|
|
5
|
+
// This file contains a custom fast implementation of faiss::Index::sa_decode()
|
|
6
|
+
// function for the following index families:
|
|
7
|
+
// * IVF256,PQ[1]x8np
|
|
8
|
+
// * Residual[1]x8,PQ[2]x8
|
|
9
|
+
// * IVF[2^9-2^16 bit],PQ[1]x8 (such as IVF1024,PQ16np)
|
|
10
|
+
// * Residual1x[9-16 bit],PQ[1]x8 (such as Residual1x9,PQ8)
|
|
11
|
+
// * PQ[1]x8
|
|
12
|
+
// Additionally, AVX2 and ARM versions support
|
|
13
|
+
// * Residual[1]x8,PQ[2]x10
|
|
14
|
+
// * Residual[1]x8,PQ[2]x16
|
|
15
|
+
// * Residual[1]x10,PQ[2]x10
|
|
16
|
+
// * Residual[1]x10,PQ[2]x16
|
|
17
|
+
// * Residual[1]x16,PQ[2]x10
|
|
18
|
+
// * Residual[1]x16,PQ[2]x16
|
|
19
|
+
// * Residual1x[9-16 bit],PQ[1]x10 (such as Residual1x9,PQ16x10)
|
|
20
|
+
// * * (use with COARSE_BITS=16)
|
|
21
|
+
// * Residual1x[9-16 bit],PQ[1]x16 (such as Residual1x9,PQ16x16)
|
|
22
|
+
// * * (use with COARSE_BITS=16)
|
|
23
|
+
// * PQ[1]x10
|
|
24
|
+
// * PQ[1]x16
|
|
25
|
+
// Unfortunately, currently Faiss does not support something like
|
|
26
|
+
// IVF256,PQ16x10np
|
|
27
|
+
//
|
|
28
|
+
// The goal was to achieve the maximum performance, so the template version it
|
|
29
|
+
// is. The provided index families share the same code for sa_decode.
|
|
30
|
+
//
|
|
31
|
+
// The front-end code provides two high-level structures.
|
|
32
|
+
//
|
|
33
|
+
// First one:
|
|
34
|
+
// {
|
|
35
|
+
// template <
|
|
36
|
+
// intptr_t DIM,
|
|
37
|
+
// intptr_t COARSE_SIZE,
|
|
38
|
+
// intptr_t FINE_SIZE,
|
|
39
|
+
// intptr_t COARSE_BITS = 8
|
|
40
|
+
// intptr_t FINE_BITS = 8>
|
|
41
|
+
// struct Index2LevelDecoder { /*...*/ };
|
|
42
|
+
// }
|
|
43
|
+
// * DIM is the dimensionality of data
|
|
44
|
+
// * COARSE_SIZE is the dimensionality of the coarse quantizer (IVF, Residual)
|
|
45
|
+
// * FINE_SIZE is the dimensionality of the ProductQuantizer dsq
|
|
46
|
+
// * COARSE_BITS is the number of bits that are needed to represent a coarse
|
|
47
|
+
// quantizer code.
|
|
48
|
+
// * FINE_BITS is the number of bits that are needed to represent a fine
|
|
49
|
+
// quantizer code.
|
|
50
|
+
// For example, "IVF256,PQ8np" for 160-dim data translates into
|
|
51
|
+
// Index2LevelDecoder<160,160,20,8>
|
|
52
|
+
// For example, "Residual4x8,PQ16" for 256-dim data translates into
|
|
53
|
+
// Index2LevelDecoder<256,64,1,8>
|
|
54
|
+
// For example, "IVF1024,PQ16np" for 256-dim data translates into
|
|
55
|
+
// Index2LevelDecoder<256,256,16,10>. But as there are only 1 coarse code
|
|
56
|
+
// element, Index2LevelDecoder<256,256,16,16> can be used as a faster
|
|
57
|
+
// decoder.
|
|
58
|
+
// For example, "Residual4x10,PQ16x10np" for 256-dim data translates into
|
|
59
|
+
// Index2LevelDecoder<256,64,16,10,10>
|
|
60
|
+
//
|
|
61
|
+
// Additional supported values for COARSE_BITS and FINE_BITS may be added later.
|
|
62
|
+
//
|
|
63
|
+
// Second one:
|
|
64
|
+
// {
|
|
65
|
+
// template <
|
|
66
|
+
// intptr_t DIM,
|
|
67
|
+
// intptr_t FINE_SIZE,
|
|
68
|
+
// intptr_t FINE_BITS = 8>
|
|
69
|
+
// struct IndexPQDecoder { /*...*/ };
|
|
70
|
+
// }
|
|
71
|
+
// * DIM is the dimensionality of data
|
|
72
|
+
// * FINE_SIZE is the dimensionality of the ProductQuantizer dsq
|
|
73
|
+
// * FINE_BITS is the number of bits that are needed to represent a fine
|
|
74
|
+
// quantizer code.
|
|
75
|
+
// For example, "PQ8np" for 160-dim data translates into
|
|
76
|
+
// IndexPQDecoder<160,20>
|
|
77
|
+
//
|
|
78
|
+
// Unlike the general purpose version in faiss::Index::sa_decode(),
|
|
79
|
+
// this version provides the following functions (please note that
|
|
80
|
+
// pqCoarseCentroids params are not available for IndexPQDecoder,
|
|
81
|
+
// but the functionality is the same as for Index2LevelDecoder):
|
|
82
|
+
//
|
|
83
|
+
// * ::store(), which is similar to sa_decode(1, input, output),
|
|
84
|
+
// The method signature is the following:
|
|
85
|
+
// {
|
|
86
|
+
// void store(
|
|
87
|
+
// const float* const __restrict pqCoarseCentroids,
|
|
88
|
+
// const float* const __restrict pqFineCentroids,
|
|
89
|
+
// const uint8_t* const __restrict code,
|
|
90
|
+
// float* const __restrict outputStore);
|
|
91
|
+
// }
|
|
92
|
+
//
|
|
93
|
+
// * ::accum(), which is used to create a linear combination
|
|
94
|
+
// of decoded vectors:
|
|
95
|
+
// {
|
|
96
|
+
// const faiss::Index* const index;
|
|
97
|
+
// const uint8_t* const input;
|
|
98
|
+
// float weight;
|
|
99
|
+
//
|
|
100
|
+
// std::vector<float> buffer(d, 0);
|
|
101
|
+
//
|
|
102
|
+
// index->sa_decode(1, input, buffer.data());
|
|
103
|
+
// for (size_t iDim = 0; iDim < d; iDim++)
|
|
104
|
+
// output[iDim] += weight * buffer[iDim];
|
|
105
|
+
// }
|
|
106
|
+
// The method signature is the following:
|
|
107
|
+
// {
|
|
108
|
+
// static void accum(
|
|
109
|
+
// const float* const __restrict pqCoarseCentroids,
|
|
110
|
+
// const float* const __restrict pqFineCentroids,
|
|
111
|
+
// const uint8_t* const __restrict code,
|
|
112
|
+
// const float weight,
|
|
113
|
+
// float* const __restrict outputAccum);
|
|
114
|
+
// }
|
|
115
|
+
//
|
|
116
|
+
// * There is an additional overload for ::accum() that decodes two vectors
|
|
117
|
+
// per call. This provides an additional speedup because of a CPU
|
|
118
|
+
// superscalar architecture:
|
|
119
|
+
// {
|
|
120
|
+
// const faiss::Index* const index;
|
|
121
|
+
// const uint8_t* const input0;
|
|
122
|
+
// float weight0;
|
|
123
|
+
// const uint8_t* const input1;
|
|
124
|
+
// float weight1;
|
|
125
|
+
//
|
|
126
|
+
// std::vector<float> buffer(d, 0);
|
|
127
|
+
//
|
|
128
|
+
// index->sa_decode(1, input0, buffer.data());
|
|
129
|
+
// for (size_t iDim = 0; iDim < d; iDim++)
|
|
130
|
+
// output[iDim] += weight0 * buffer[iDim];
|
|
131
|
+
//
|
|
132
|
+
// index->sa_decode(1, input1, buffer.data());
|
|
133
|
+
// for (size_t iDim = 0; iDim < d; iDim++)
|
|
134
|
+
// output[iDim] += weight1 * buffer[iDim];
|
|
135
|
+
// }
|
|
136
|
+
// If each code uses its own coarse quantizer centroids table and its own fine
|
|
137
|
+
// quantizer centroids table, then the following overload can be used:
|
|
138
|
+
// {
|
|
139
|
+
// static void accum(
|
|
140
|
+
// const float* const __restrict pqCoarseCentroids0,
|
|
141
|
+
// const float* const __restrict pqFineCentroids0,
|
|
142
|
+
// const uint8_t* const __restrict code0,
|
|
143
|
+
// const float weight0,
|
|
144
|
+
// const float* const __restrict pqCoarseCentroids1,
|
|
145
|
+
// const float* const __restrict pqFineCentroids1,
|
|
146
|
+
// const uint8_t* const __restrict code1,
|
|
147
|
+
// const float weight1,
|
|
148
|
+
// float* const __restrict outputAccum);
|
|
149
|
+
// }
|
|
150
|
+
// If codes share the coarse quantizer centroids table and also share
|
|
151
|
+
// the fine quantizer centroids table, then the following overload can be
|
|
152
|
+
// used:
|
|
153
|
+
// {
|
|
154
|
+
// static void accum(
|
|
155
|
+
// const float* const __restrict pqCoarseCentroids,
|
|
156
|
+
// const float* const __restrict pqFineCentroids,
|
|
157
|
+
// const uint8_t* const __restrict code0,
|
|
158
|
+
// const float weight0,
|
|
159
|
+
// const uint8_t* const __restrict code1,
|
|
160
|
+
// const float weight1,
|
|
161
|
+
// float* const __restrict outputAccum);
|
|
162
|
+
// }
|
|
163
|
+
//
|
|
164
|
+
// * And one more overload for ::accum() that decodes and accumulates
|
|
165
|
+
// three vectors per call.
|
|
166
|
+
// {
|
|
167
|
+
// const faiss::Index* const index;
|
|
168
|
+
// const uint8_t* const input0;
|
|
169
|
+
// float weight0;
|
|
170
|
+
// const uint8_t* const input1;
|
|
171
|
+
// float weight1;
|
|
172
|
+
// const uint8_t* const input2;
|
|
173
|
+
// float weight2;
|
|
174
|
+
//
|
|
175
|
+
// std::vector<float> buffer(d, 0);
|
|
176
|
+
//
|
|
177
|
+
// index->sa_decode(1, input0, buffer.data());
|
|
178
|
+
// for (size_t iDim = 0; iDim < d; iDim++)
|
|
179
|
+
// output[iDim] += weight0 * buffer[iDim];
|
|
180
|
+
//
|
|
181
|
+
// index->sa_decode(1, input1, buffer.data());
|
|
182
|
+
// for (size_t iDim = 0; iDim < d; iDim++)
|
|
183
|
+
// output[iDim] += weight1 * buffer[iDim];
|
|
184
|
+
//
|
|
185
|
+
// index->sa_decode(1, input2, buffer.data());
|
|
186
|
+
// for (size_t iDim = 0; iDim < d; iDim++)
|
|
187
|
+
// output[iDim] += weight2 * buffer[iDim];
|
|
188
|
+
// }
|
|
189
|
+
//
|
|
190
|
+
// If each code uses its own coarse quantizer centroids table and its own fine
|
|
191
|
+
// quantizer centroids table, then the following overload can be used:
|
|
192
|
+
// {
|
|
193
|
+
// static void accum(
|
|
194
|
+
// const float* const __restrict pqCoarseCentroids0,
|
|
195
|
+
// const float* const __restrict pqFineCentroids0,
|
|
196
|
+
// const uint8_t* const __restrict code0,
|
|
197
|
+
// const float weight0,
|
|
198
|
+
// const float* const __restrict pqCoarseCentroids1,
|
|
199
|
+
// const float* const __restrict pqFineCentroids1,
|
|
200
|
+
// const uint8_t* const __restrict code1,
|
|
201
|
+
// const float weight1,
|
|
202
|
+
// const float* const __restrict pqCoarseCentroids2,
|
|
203
|
+
// const float* const __restrict pqFineCentroids2,
|
|
204
|
+
// const uint8_t* const __restrict code2,
|
|
205
|
+
// const float weight2,
|
|
206
|
+
// float* const __restrict outputAccum);
|
|
207
|
+
// }
|
|
208
|
+
// If codes share the coarse quantizer centroids table and also share
|
|
209
|
+
// the fine quantizer centroids table, then the following overload can be
|
|
210
|
+
// used:
|
|
211
|
+
// {
|
|
212
|
+
// static void accum(
|
|
213
|
+
// const float* const __restrict pqCoarseCentroids,
|
|
214
|
+
// const float* const __restrict pqFineCentroids,
|
|
215
|
+
// const uint8_t* const __restrict code0,
|
|
216
|
+
// const float weight0,
|
|
217
|
+
// const uint8_t* const __restrict code1,
|
|
218
|
+
// const float weight1,
|
|
219
|
+
// const uint8_t* const __restrict code2,
|
|
220
|
+
// const float weight2,
|
|
221
|
+
// float* const __restrict outputAccum);
|
|
222
|
+
// }
|
|
223
|
+
//
|
|
224
|
+
// The provided version is not multithreaded.
|
|
225
|
+
//
|
|
226
|
+
// Currently, an AVX2+FMA implementation is available. AVX512 version is also
|
|
227
|
+
// doable, but it was found to be slower than AVX2 for real world applications
|
|
228
|
+
// that I needed.
|
|
229
|
+
//
|
|
230
|
+
////////////////////////////////////////////////////////////////////////////////////
|
|
231
|
+
//
|
|
232
|
+
// It is possible to use an additional index wrapper on top of IVFPQ /
|
|
233
|
+
// Residual+PQ, known as IndexRowwiseMinMax / IndexRowwiseMinMaxFP16. Index
|
|
234
|
+
// wrapper that performs rowwise normalization to [0,1], preserving the
|
|
235
|
+
// coefficients. This is a vector codec index only.
|
|
236
|
+
// For more details please refer to the description in
|
|
237
|
+
// faiss/IndexRowwiseMinMax.h file.
|
|
238
|
+
//
|
|
239
|
+
// If such a wrapper is used, then the quantizer will look like, say,
|
|
240
|
+
// MinMaxFP16,IVF256,PQ32np
|
|
241
|
+
// or
|
|
242
|
+
// MinMax,PQ16np
|
|
243
|
+
// In this case, please use the following contruction for the decoding,
|
|
244
|
+
// basically, wrapping a kernel in a kernel:
|
|
245
|
+
// {
|
|
246
|
+
// using SubT = faiss::cppcontrib::Index2LevelDecoder<128, 128, 2>;
|
|
247
|
+
// using T = faiss::cppcontrib::IndexMinMaxFP16Decoder<SubT>;
|
|
248
|
+
// // do T::store(...) or T::accum(...)
|
|
249
|
+
// }
|
|
250
|
+
//
|
|
251
|
+
// T::accum(...) contains an additional function variable which is
|
|
252
|
+
// used for accumulating scaling. Thus, the code pattern is the following:
|
|
253
|
+
// {
|
|
254
|
+
// const float* const __restrict pqCoarseCentroidsQ;
|
|
255
|
+
// const float* const __restrict pqFineCentroidsQ;
|
|
256
|
+
// const uint8_t* const __restrict input;
|
|
257
|
+
// const float* const __restrict weights;
|
|
258
|
+
// float* const __restrict output;
|
|
259
|
+
// float outputAccumMin = 0;
|
|
260
|
+
//
|
|
261
|
+
// for (size_t i = 0; i < n; i++) {
|
|
262
|
+
// T::accum(
|
|
263
|
+
// pqCoarseCentroidsQ,
|
|
264
|
+
// pqFineCentroidsQ,
|
|
265
|
+
// input + i * code_size,
|
|
266
|
+
// weights[i],
|
|
267
|
+
// output,
|
|
268
|
+
// outputAccumMin);
|
|
269
|
+
// }
|
|
270
|
+
// for (size_t j = 0; j < d; j++)
|
|
271
|
+
// output[j] += outputAccumMin;
|
|
272
|
+
// }
|
|
273
|
+
// This is similar to the following regular pseudo-code:
|
|
274
|
+
// {
|
|
275
|
+
// const faiss::Index* const index;
|
|
276
|
+
// const uint8_t* const __restrict input;
|
|
277
|
+
// const float* const __restrict weights;
|
|
278
|
+
// float* const __restrict output;
|
|
279
|
+
//
|
|
280
|
+
// for (size_t i = 0; i < n; i++) {
|
|
281
|
+
// std::vector<float> buffer(d, 0);
|
|
282
|
+
//
|
|
283
|
+
// index->sa_decode(1, input + i * code_size, buffer.data());
|
|
284
|
+
// for (size_t j = 0; j < d; j++)
|
|
285
|
+
// output[j] += weights[i] * buffer[j];
|
|
286
|
+
// }
|
|
287
|
+
|
|
288
|
+
#include <faiss/cppcontrib/sa_decode/MinMax-inl.h>
|
|
289
|
+
#include <faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h>
|
|
290
|
+
|
|
291
|
+
#ifdef __AVX2__
|
|
292
|
+
#include <faiss/cppcontrib/sa_decode/Level2-avx2-inl.h>
|
|
293
|
+
#include <faiss/cppcontrib/sa_decode/PQ-avx2-inl.h>
|
|
294
|
+
#elif defined(__ARM_NEON)
|
|
295
|
+
#include <faiss/cppcontrib/sa_decode/Level2-neon-inl.h>
|
|
296
|
+
#include <faiss/cppcontrib/sa_decode/PQ-neon-inl.h>
|
|
297
|
+
#else
|
|
298
|
+
#include <faiss/cppcontrib/sa_decode/Level2-inl.h>
|
|
299
|
+
#include <faiss/cppcontrib/sa_decode/PQ-inl.h>
|
|
300
|
+
#endif
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include <cstdint>
|
|
4
|
+
|
|
5
|
+
namespace faiss {
|
|
6
|
+
namespace cppcontrib {
|
|
7
|
+
namespace detail {
|
|
8
|
+
|
|
9
|
+
template <int COARSE_BITS>
|
|
10
|
+
struct CoarseBitType {};
|
|
11
|
+
|
|
12
|
+
template <>
|
|
13
|
+
struct CoarseBitType<8> {
|
|
14
|
+
using bit_type = uint8_t;
|
|
15
|
+
};
|
|
16
|
+
|
|
17
|
+
template <>
|
|
18
|
+
struct CoarseBitType<16> {
|
|
19
|
+
using bit_type = uint16_t;
|
|
20
|
+
};
|
|
21
|
+
|
|
22
|
+
} // namespace detail
|
|
23
|
+
} // namespace cppcontrib
|
|
24
|
+
} // namespace faiss
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include <cstdint>
|
|
4
|
+
|
|
5
|
+
namespace faiss {
|
|
6
|
+
namespace cppcontrib {
|
|
7
|
+
namespace detail {
|
|
8
|
+
|
|
9
|
+
namespace {
|
|
10
|
+
|
|
11
|
+
template <intptr_t N_ELEMENTS, intptr_t CPOS>
|
|
12
|
+
struct Uint8Reader {
|
|
13
|
+
static_assert(CPOS < N_ELEMENTS, "CPOS should be less than N_ELEMENTS");
|
|
14
|
+
|
|
15
|
+
static intptr_t get(const uint8_t* const __restrict codes) {
|
|
16
|
+
// Read using 4-bytes, if possible.
|
|
17
|
+
// Reading using 8-byte takes too many registers somewhy.
|
|
18
|
+
|
|
19
|
+
constexpr intptr_t ELEMENT_TO_READ = CPOS / 4;
|
|
20
|
+
constexpr intptr_t SUB_ELEMENT = CPOS % 4;
|
|
21
|
+
|
|
22
|
+
switch (SUB_ELEMENT) {
|
|
23
|
+
case 0: {
|
|
24
|
+
if (N_ELEMENTS > CPOS + 3) {
|
|
25
|
+
const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
|
|
26
|
+
codes + ELEMENT_TO_READ * 4);
|
|
27
|
+
return (code32 & 0x000000FF);
|
|
28
|
+
} else {
|
|
29
|
+
return codes[CPOS];
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
case 1: {
|
|
33
|
+
if (N_ELEMENTS > CPOS + 2) {
|
|
34
|
+
const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
|
|
35
|
+
codes + ELEMENT_TO_READ * 4);
|
|
36
|
+
return (code32 & 0x0000FF00) >> 8;
|
|
37
|
+
} else {
|
|
38
|
+
return codes[CPOS];
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
case 2: {
|
|
42
|
+
if (N_ELEMENTS > CPOS + 1) {
|
|
43
|
+
const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
|
|
44
|
+
codes + ELEMENT_TO_READ * 4);
|
|
45
|
+
return (code32 & 0x00FF0000) >> 16;
|
|
46
|
+
} else {
|
|
47
|
+
return codes[CPOS];
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
case 3: {
|
|
51
|
+
if (N_ELEMENTS > CPOS) {
|
|
52
|
+
const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
|
|
53
|
+
codes + ELEMENT_TO_READ * 4);
|
|
54
|
+
return (code32) >> 24;
|
|
55
|
+
} else {
|
|
56
|
+
return codes[CPOS];
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
};
|
|
62
|
+
|
|
63
|
+
// Fetches the CPOS-th 10-bit element out of a tightly packed
// little-endian stream of N_ELEMENTS codes, 4 elements per 5 bytes.
// Using the widest load that stays inside the buffer (4 bytes when
// enough elements follow, 2 bytes otherwise) reduces the number of
// read operations from RAM.
///////////////////////////////////////////////
// byte:   0        1        2        3        4
// bit:    76543210 76543210 76543210 76543210 76543210
// elt 0:  00000000 00
// elt 1:           111111 1111
// elt 2:                    2222 222222
// elt 3:                           33 33333333
template <intptr_t N_ELEMENTS, intptr_t CPOS>
struct Uint10Reader {
    static_assert(CPOS < N_ELEMENTS, "CPOS should be less than N_ELEMENTS");

    static intptr_t get(const uint8_t* const __restrict codes) {
        // Locate the 5-byte group and the slot within it.
        constexpr intptr_t GROUP = CPOS / 4;
        constexpr intptr_t LANE = CPOS % 4;
        const uint8_t* const group_base = codes + GROUP * 5;

        switch (LANE) {
            case 0: {
                // bits 0..9 of the group
                if (N_ELEMENTS > CPOS + 2) {
                    const uint32_t code32 =
                            *reinterpret_cast<const uint32_t*>(group_base);
                    return code32 & 0x3FF;
                } else {
                    // near the buffer end: a 2-byte load is enough
                    const uint16_t code16 =
                            *reinterpret_cast<const uint16_t*>(group_base + 0);
                    return code16 & 0x3FF;
                }
            }
            case 1: {
                // bits 10..19 of the group
                if (N_ELEMENTS > CPOS + 1) {
                    const uint32_t code32 =
                            *reinterpret_cast<const uint32_t*>(group_base);
                    return (code32 & 0xFFC00) >> 10;
                } else {
                    const uint16_t code16 =
                            *reinterpret_cast<const uint16_t*>(group_base + 1);
                    return (code16 & 0x0FFC) >> 2;
                }
            }
            case 2: {
                // bits 20..29 of the group
                if (N_ELEMENTS > CPOS) {
                    const uint32_t code32 =
                            *reinterpret_cast<const uint32_t*>(group_base);
                    return (code32 & 0x3FF00000) >> 20;
                } else {
                    const uint16_t code16 =
                            *reinterpret_cast<const uint16_t*>(group_base + 2);
                    return (code16 & 0x3FF0) >> 4;
                }
            }
            case 3: {
                // bits 30..39: bytes 3..4 always exist when this
                // element is present, so a 2-byte load is safe.
                const uint16_t code16 =
                        *reinterpret_cast<const uint16_t*>(group_base + 3);
                return (code16 & 0xFFC0) >> 6;
            }
        }
    }
};
|
|
122
|
+
|
|
123
|
+
// Fetches the CPOS-th uint16 element out of a packed little-endian
// stream of N_ELEMENTS codes. When both halves of the 4-byte pair that
// contains the element are present, one 32-bit load replaces a 16-bit
// one, which reduces the number of read operations from RAM; otherwise
// a plain 16-bit load is used so no bytes past the buffer are touched.
template <intptr_t N_ELEMENTS, intptr_t CPOS>
struct Uint16Reader {
    static_assert(CPOS < N_ELEMENTS, "CPOS should be less than N_ELEMENTS");

    static intptr_t get(const uint8_t* const __restrict codes) {
        // Prefer 4-byte loads over 2-byte ones; 8-byte loads were
        // observed to consume too many registers.
        constexpr intptr_t GROUP = CPOS / 2;
        constexpr intptr_t LANE = CPOS % 2;

        // The per-lane bounds checks of the element pair
        // (N_ELEMENTS > CPOS + 1 - LANE) both reduce to this single
        // condition.
        if (N_ELEMENTS > GROUP * 2 + 1) {
            const uint32_t code32 =
                    *reinterpret_cast<const uint32_t*>(codes + GROUP * 4);
            return (LANE == 0) ? (code32 & 0xFFFF) : (code32 >> 16);
        } else {
            const uint16_t* const __restrict codes16 =
                    reinterpret_cast<const uint16_t*>(codes);
            return codes16[CPOS];
        }
    }
};
|
|
161
|
+
|
|
162
|
+
// Maps a CODE_BITS value (8, 10 or 16) to the matching reader type.
// The primary template is intentionally left without a reader_type so
// that an unsupported bit width fails to compile at the point of use.
template <intptr_t N_ELEMENTS, intptr_t CODE_BITS, intptr_t CPOS>
struct UintReaderImplType {};

// 8 bits per code -> Uint8Reader
template <intptr_t N_ELEMENTS, intptr_t CPOS>
struct UintReaderImplType<N_ELEMENTS, 8, CPOS> {
    using reader_type = Uint8Reader<N_ELEMENTS, CPOS>;
};

// 10 bits per code -> Uint10Reader
template <intptr_t N_ELEMENTS, intptr_t CPOS>
struct UintReaderImplType<N_ELEMENTS, 10, CPOS> {
    using reader_type = Uint10Reader<N_ELEMENTS, CPOS>;
};

// 16 bits per code -> Uint16Reader
template <intptr_t N_ELEMENTS, intptr_t CPOS>
struct UintReaderImplType<N_ELEMENTS, 16, CPOS> {
    using reader_type = Uint16Reader<N_ELEMENTS, CPOS>;
};
|
|
180
|
+
|
|
181
|
+
} // namespace
|
|
182
|
+
|
|
183
|
+
// reduces the number of read operations from RAM
// Selects the reader for a stream of DIM / CODE_SIZE elements, each
// CODE_BITS (8, 10 or 16) bits wide, read at position CPOS.
template <intptr_t DIM, intptr_t CODE_SIZE, intptr_t CODE_BITS, intptr_t CPOS>
using UintReader =
        typename UintReaderImplType<DIM / CODE_SIZE, CODE_BITS, CPOS>::
                reader_type;

// Same as UintReader, but takes the number of elements directly
// instead of deriving it as DIM / CODE_SIZE.
template <intptr_t N_ELEMENTS, intptr_t CODE_BITS, intptr_t CPOS>
using UintReaderRaw =
        typename UintReaderImplType<N_ELEMENTS, CODE_BITS, CPOS>::reader_type;
|
|
192
|
+
|
|
193
|
+
} // namespace detail
|
|
194
|
+
} // namespace cppcontrib
|
|
195
|
+
} // namespace faiss
|