faiss 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/LICENSE.txt +1 -1
- data/README.md +1 -1
- data/ext/faiss/extconf.rb +1 -1
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +36 -33
- data/vendor/faiss/faiss/AutoTune.h +6 -3
- data/vendor/faiss/faiss/Clustering.cpp +16 -12
- data/vendor/faiss/faiss/Index.cpp +3 -4
- data/vendor/faiss/faiss/Index.h +3 -3
- data/vendor/faiss/faiss/IndexBinary.cpp +3 -4
- data/vendor/faiss/faiss/IndexBinary.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +2 -12
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +1 -2
- data/vendor/faiss/faiss/IndexFlat.cpp +0 -148
- data/vendor/faiss/faiss/IndexFlat.h +0 -51
- data/vendor/faiss/faiss/IndexHNSW.cpp +4 -5
- data/vendor/faiss/faiss/IndexIVF.cpp +118 -31
- data/vendor/faiss/faiss/IndexIVF.h +22 -15
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -3
- data/vendor/faiss/faiss/IndexIVFFlat.h +2 -1
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +39 -15
- data/vendor/faiss/faiss/IndexIVFPQ.h +25 -9
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +1116 -0
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +166 -0
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +8 -9
- data/vendor/faiss/faiss/IndexIVFPQR.h +2 -1
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +1 -2
- data/vendor/faiss/faiss/IndexPQ.cpp +34 -18
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +536 -0
- data/vendor/faiss/faiss/IndexPQFastScan.h +111 -0
- data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -0
- data/vendor/faiss/faiss/IndexPreTransform.h +2 -0
- data/vendor/faiss/faiss/IndexRefine.cpp +256 -0
- data/vendor/faiss/faiss/IndexRefine.h +73 -0
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +2 -2
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +1 -1
- data/vendor/faiss/faiss/gpu/GpuDistance.h +1 -1
- data/vendor/faiss/faiss/gpu/GpuIndex.h +16 -9
- data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +8 -1
- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +11 -11
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +19 -2
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +28 -2
- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +24 -14
- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +29 -2
- data/vendor/faiss/faiss/gpu/GpuResources.h +4 -0
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +60 -27
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +28 -6
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +547 -0
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +51 -0
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +3 -3
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +3 -2
- data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +274 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +7 -2
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +5 -1
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +231 -0
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +33 -0
- data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +1 -0
- data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +6 -0
- data/vendor/faiss/faiss/gpu/utils/Timer.cpp +5 -6
- data/vendor/faiss/faiss/gpu/utils/Timer.h +2 -2
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +5 -4
- data/vendor/faiss/faiss/impl/HNSW.cpp +2 -4
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +4 -4
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +22 -12
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +2 -0
- data/vendor/faiss/faiss/impl/ResultHandler.h +452 -0
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +29 -19
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +6 -0
- data/vendor/faiss/faiss/impl/index_read.cpp +64 -96
- data/vendor/faiss/faiss/impl/index_write.cpp +34 -25
- data/vendor/faiss/faiss/impl/io.cpp +33 -2
- data/vendor/faiss/faiss/impl/io.h +7 -2
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -15
- data/vendor/faiss/faiss/impl/platform_macros.h +44 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +272 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +169 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +180 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +354 -0
- data/vendor/faiss/faiss/impl/simd_result_handlers.h +559 -0
- data/vendor/faiss/faiss/index_factory.cpp +112 -7
- data/vendor/faiss/faiss/index_io.h +1 -48
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +151 -0
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +76 -0
- data/vendor/faiss/faiss/{DirectMap.cpp → invlists/DirectMap.cpp} +1 -1
- data/vendor/faiss/faiss/{DirectMap.h → invlists/DirectMap.h} +1 -1
- data/vendor/faiss/faiss/{InvertedLists.cpp → invlists/InvertedLists.cpp} +72 -1
- data/vendor/faiss/faiss/{InvertedLists.h → invlists/InvertedLists.h} +32 -1
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +107 -0
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +63 -0
- data/vendor/faiss/faiss/{OnDiskInvertedLists.cpp → invlists/OnDiskInvertedLists.cpp} +21 -6
- data/vendor/faiss/faiss/{OnDiskInvertedLists.h → invlists/OnDiskInvertedLists.h} +5 -2
- data/vendor/faiss/faiss/python/python_callbacks.h +8 -1
- data/vendor/faiss/faiss/utils/AlignedTable.h +141 -0
- data/vendor/faiss/faiss/utils/Heap.cpp +2 -4
- data/vendor/faiss/faiss/utils/Heap.h +61 -50
- data/vendor/faiss/faiss/utils/distances.cpp +164 -319
- data/vendor/faiss/faiss/utils/distances.h +28 -20
- data/vendor/faiss/faiss/utils/distances_simd.cpp +277 -49
- data/vendor/faiss/faiss/utils/extra_distances.cpp +1 -2
- data/vendor/faiss/faiss/utils/hamming-inl.h +4 -4
- data/vendor/faiss/faiss/utils/hamming.cpp +3 -6
- data/vendor/faiss/faiss/utils/hamming.h +2 -7
- data/vendor/faiss/faiss/utils/ordered_key_value.h +98 -0
- data/vendor/faiss/faiss/utils/partitioning.cpp +1256 -0
- data/vendor/faiss/faiss/utils/partitioning.h +69 -0
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +277 -0
- data/vendor/faiss/faiss/utils/quantize_lut.h +80 -0
- data/vendor/faiss/faiss/utils/simdlib.h +31 -0
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +461 -0
- data/vendor/faiss/faiss/utils/simdlib_emulated.h +589 -0
- metadata +43 -141
- data/vendor/faiss/benchs/bench_6bit_codec.cpp +0 -80
- data/vendor/faiss/c_api/AutoTune_c.cpp +0 -83
- data/vendor/faiss/c_api/AutoTune_c.h +0 -66
- data/vendor/faiss/c_api/Clustering_c.cpp +0 -145
- data/vendor/faiss/c_api/Clustering_c.h +0 -123
- data/vendor/faiss/c_api/IndexFlat_c.cpp +0 -140
- data/vendor/faiss/c_api/IndexFlat_c.h +0 -115
- data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +0 -64
- data/vendor/faiss/c_api/IndexIVFFlat_c.h +0 -58
- data/vendor/faiss/c_api/IndexIVF_c.cpp +0 -99
- data/vendor/faiss/c_api/IndexIVF_c.h +0 -142
- data/vendor/faiss/c_api/IndexLSH_c.cpp +0 -37
- data/vendor/faiss/c_api/IndexLSH_c.h +0 -40
- data/vendor/faiss/c_api/IndexPreTransform_c.cpp +0 -21
- data/vendor/faiss/c_api/IndexPreTransform_c.h +0 -32
- data/vendor/faiss/c_api/IndexShards_c.cpp +0 -38
- data/vendor/faiss/c_api/IndexShards_c.h +0 -39
- data/vendor/faiss/c_api/Index_c.cpp +0 -105
- data/vendor/faiss/c_api/Index_c.h +0 -183
- data/vendor/faiss/c_api/MetaIndexes_c.cpp +0 -49
- data/vendor/faiss/c_api/MetaIndexes_c.h +0 -49
- data/vendor/faiss/c_api/clone_index_c.cpp +0 -23
- data/vendor/faiss/c_api/clone_index_c.h +0 -32
- data/vendor/faiss/c_api/error_c.h +0 -42
- data/vendor/faiss/c_api/error_impl.cpp +0 -27
- data/vendor/faiss/c_api/error_impl.h +0 -16
- data/vendor/faiss/c_api/faiss_c.h +0 -58
- data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +0 -98
- data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +0 -56
- data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +0 -52
- data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +0 -68
- data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +0 -17
- data/vendor/faiss/c_api/gpu/GpuIndex_c.h +0 -30
- data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +0 -38
- data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +0 -86
- data/vendor/faiss/c_api/gpu/GpuResources_c.h +0 -66
- data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +0 -54
- data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +0 -53
- data/vendor/faiss/c_api/gpu/macros_impl.h +0 -42
- data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +0 -220
- data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +0 -149
- data/vendor/faiss/c_api/index_factory_c.cpp +0 -26
- data/vendor/faiss/c_api/index_factory_c.h +0 -30
- data/vendor/faiss/c_api/index_io_c.cpp +0 -42
- data/vendor/faiss/c_api/index_io_c.h +0 -50
- data/vendor/faiss/c_api/macros_impl.h +0 -110
- data/vendor/faiss/demos/demo_imi_flat.cpp +0 -154
- data/vendor/faiss/demos/demo_imi_pq.cpp +0 -203
- data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +0 -151
- data/vendor/faiss/demos/demo_sift1M.cpp +0 -252
- data/vendor/faiss/demos/demo_weighted_kmeans.cpp +0 -185
- data/vendor/faiss/misc/test_blas.cpp +0 -87
- data/vendor/faiss/tests/test_binary_flat.cpp +0 -62
- data/vendor/faiss/tests/test_dealloc_invlists.cpp +0 -188
- data/vendor/faiss/tests/test_ivfpq_codec.cpp +0 -70
- data/vendor/faiss/tests/test_ivfpq_indexing.cpp +0 -100
- data/vendor/faiss/tests/test_lowlevel_ivf.cpp +0 -573
- data/vendor/faiss/tests/test_merge.cpp +0 -260
- data/vendor/faiss/tests/test_omp_threads.cpp +0 -14
- data/vendor/faiss/tests/test_ondisk_ivf.cpp +0 -225
- data/vendor/faiss/tests/test_pairs_decoding.cpp +0 -193
- data/vendor/faiss/tests/test_params_override.cpp +0 -236
- data/vendor/faiss/tests/test_pq_encoding.cpp +0 -98
- data/vendor/faiss/tests/test_sliding_ivf.cpp +0 -246
- data/vendor/faiss/tests/test_threaded_index.cpp +0 -253
- data/vendor/faiss/tests/test_transfer_invlists.cpp +0 -159
- data/vendor/faiss/tutorial/cpp/1-Flat.cpp +0 -104
- data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +0 -85
- data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +0 -98
- data/vendor/faiss/tutorial/cpp/4-GPU.cpp +0 -122
- data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +0 -104
@@ -41,8 +41,22 @@ class StandardGpuResourcesImpl : public GpuResources {
|
|
41
41
|
/// transfers
|
42
42
|
void setPinnedMemory(size_t size);
|
43
43
|
|
44
|
-
/// Called to change the stream for work ordering
|
45
|
-
|
44
|
+
/// Called to change the stream for work ordering. We do not own `stream`;
|
45
|
+
/// i.e., it will not be destroyed when the GpuResources object gets cleaned
|
46
|
+
/// up.
|
47
|
+
/// We are guaranteed that all Faiss GPU work is ordered with respect to
|
48
|
+
/// this stream upon exit from an index or other Faiss GPU call.
|
49
|
+
void setDefaultStream(int device, cudaStream_t stream) override;
|
50
|
+
|
51
|
+
/// Revert the default stream to the original stream managed by this resources
|
52
|
+
/// object, in case someone called `setDefaultStream`.
|
53
|
+
void revertDefaultStream(int device);
|
54
|
+
|
55
|
+
/// Returns the stream for the given device on which all Faiss GPU work is
|
56
|
+
/// ordered.
|
57
|
+
/// We are guaranteed that all Faiss GPU work is ordered with respect to
|
58
|
+
/// this stream upon exit from an index or other Faiss GPU call.
|
59
|
+
cudaStream_t getDefaultStream(int device) override;
|
46
60
|
|
47
61
|
/// Called to change the work ordering streams to the null stream
|
48
62
|
/// for all devices
|
@@ -60,8 +74,6 @@ class StandardGpuResourcesImpl : public GpuResources {
|
|
60
74
|
|
61
75
|
cublasHandle_t getBlasHandle(int device) override;
|
62
76
|
|
63
|
-
cudaStream_t getDefaultStream(int device) override;
|
64
|
-
|
65
77
|
std::vector<cudaStream_t> getAlternateStreams(int device) override;
|
66
78
|
|
67
79
|
/// Allocate non-temporary GPU memory
|
@@ -128,7 +140,9 @@ class StandardGpuResourcesImpl : public GpuResources {
|
|
128
140
|
};
|
129
141
|
|
130
142
|
/// Default implementation of GpuResources that allocates a cuBLAS
|
131
|
-
/// stream and 2 streams for use, as well as temporary memory
|
143
|
+
/// stream and 2 streams for use, as well as temporary memory.
|
144
|
+
/// Internally, the Faiss GPU code uses the instance managed by getResources,
|
145
|
+
/// but this is the user-facing object that is internally reference counted.
|
132
146
|
class StandardGpuResources : public GpuResourcesProvider {
|
133
147
|
public:
|
134
148
|
StandardGpuResources();
|
@@ -151,9 +165,17 @@ class StandardGpuResources : public GpuResourcesProvider {
|
|
151
165
|
/// transfers
|
152
166
|
void setPinnedMemory(size_t size);
|
153
167
|
|
154
|
-
/// Called to change the stream for work ordering
|
168
|
+
/// Called to change the stream for work ordering. We do not own `stream`;
|
169
|
+
/// i.e., it will not be destroyed when the GpuResources object gets cleaned
|
170
|
+
/// up.
|
171
|
+
/// We are guaranteed that all Faiss GPU work is ordered with respect to
|
172
|
+
/// this stream upon exit from an index or other Faiss GPU call.
|
155
173
|
void setDefaultStream(int device, cudaStream_t stream);
|
156
174
|
|
175
|
+
/// Revert the default stream to the original stream managed by this resources
|
176
|
+
/// object, in case someone called `setDefaultStream`.
|
177
|
+
void revertDefaultStream(int device);
|
178
|
+
|
157
179
|
/// Called to change the work ordering streams to the null stream
|
158
180
|
/// for all devices
|
159
181
|
void setDefaultNullStreamAllDevices();
|
@@ -0,0 +1,547 @@
|
|
1
|
+
/**
|
2
|
+
* Copyright (c) Facebook, Inc. and its affiliates.
|
3
|
+
*
|
4
|
+
* This source code is licensed under the MIT license found in the
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
6
|
+
*/
|
7
|
+
|
8
|
+
#include <faiss/gpu/impl/InterleavedCodes.h>
|
9
|
+
#include <faiss/impl/FaissAssert.h>
|
10
|
+
#include <faiss/gpu/utils/StaticUtils.h>
|
11
|
+
|
12
|
+
namespace faiss { namespace gpu {
|
13
|
+
|
14
|
+
// Extract the i-th 5-bit code from a little-endian packed bit stream.
// `vLower` is the byte containing the code's low-order bits; `vUpper` is the
// following byte, consulted only when the code straddles a byte boundary.
inline uint8_t unpack5(int i, uint8_t vLower, uint8_t vUpper) {
  // Code j occupies bits [5j, 5j + 5) of the stream, so within its starting
  // byte the code begins at bit offset ((i * 5) mod 8). The pattern repeats
  // every 8 codes (= 5 bytes):
  // lsb ... msb
  // 0: 0 0 0 0 0 1 1 1
  // 1: 1 1 2 2 2 2 2 3
  // 2: 3 3 3 3 4 4 4 4
  // 3: 4 5 5 5 5 5 6 6
  // 4: 6 6 6 7 7 7 7 7
  int shift = (i % 8) * 5 % 8;

  // Low bits come from vLower; any spill-over comes from the bottom of
  // vUpper. The uint8_t cast truncates the promoted shift result back to a
  // byte (so a shift of 8 contributes nothing), and the final mask keeps
  // exactly 5 bits.
  uint8_t spill = (uint8_t)(vUpper << (8 - shift));
  return (uint8_t)(((vLower >> shift) | spill) & 0x1f);
}
|
64
|
+
|
65
|
+
// Extract the i-th 6-bit code from a little-endian packed bit stream.
// `vLower` holds the code's low-order bits; `vUpper` is the following byte,
// used only when the code crosses a byte boundary. The layout repeats every
// 4 codes (= 3 bytes).
inline uint8_t unpack6(int i, uint8_t vLower, uint8_t vUpper) {
  // Code j starts at stream bit 6j, i.e. bit offset ((i * 6) mod 8) within
  // its starting byte: 0, 6, 4, 2 for i % 4 = 0, 1, 2, 3.
  int shift = (i % 4) * 6 % 8;

  // Combine the tail of vLower with the spill-over from vUpper; the uint8_t
  // cast discards bits shifted past position 7 (a shift of 8 contributes
  // nothing), and the mask keeps exactly 6 bits.
  uint8_t spill = (uint8_t)(vUpper << (8 - shift));
  return (uint8_t)(((vLower >> shift) | spill) & 0x3f);
}
|
91
|
+
|
92
|
+
|
93
|
+
std::vector<uint8_t>
|
94
|
+
unpackNonInterleaved(std::vector<uint8_t> data,
|
95
|
+
int numVecs,
|
96
|
+
int dims,
|
97
|
+
int bitsPerCode) {
|
98
|
+
int srcVecSize = utils::divUp(dims * bitsPerCode, 8);
|
99
|
+
FAISS_ASSERT(data.size() == numVecs * srcVecSize);
|
100
|
+
|
101
|
+
if (bitsPerCode == 8 ||
|
102
|
+
bitsPerCode == 16 ||
|
103
|
+
bitsPerCode == 32) {
|
104
|
+
// nothing to do
|
105
|
+
return data;
|
106
|
+
}
|
107
|
+
|
108
|
+
// bit codes padded to whole bytes
|
109
|
+
std::vector<uint8_t> out(numVecs * dims * utils::divUp(bitsPerCode, 8));
|
110
|
+
|
111
|
+
if (bitsPerCode == 4) {
|
112
|
+
#pragma omp parallel for
|
113
|
+
for (int i = 0; i < numVecs; ++i) {
|
114
|
+
for (int j = 0; j < dims; ++j) {
|
115
|
+
int srcIdx = i * srcVecSize + (j / 2);
|
116
|
+
FAISS_ASSERT(srcIdx < data.size());
|
117
|
+
|
118
|
+
uint8_t v = data[srcIdx];
|
119
|
+
v = (j % 2 == 0) ? v & 0xf : v >> 4;
|
120
|
+
|
121
|
+
out[i * dims + j] = v;
|
122
|
+
}
|
123
|
+
}
|
124
|
+
} else if (bitsPerCode == 5) {
|
125
|
+
#pragma omp parallel for
|
126
|
+
for (int i = 0; i < numVecs; ++i) {
|
127
|
+
for (int j = 0; j < dims; ++j) {
|
128
|
+
int lo = i * srcVecSize + (j * 5) / 8;
|
129
|
+
int hi = lo + 1;
|
130
|
+
|
131
|
+
FAISS_ASSERT(lo < data.size());
|
132
|
+
FAISS_ASSERT(hi <= data.size());
|
133
|
+
|
134
|
+
auto vLower = data[lo];
|
135
|
+
auto vUpper = hi < data.size() ? data[hi] : 0;
|
136
|
+
|
137
|
+
out[i * dims + j] = unpack5(j, vLower, vUpper);
|
138
|
+
}
|
139
|
+
}
|
140
|
+
} else if (bitsPerCode == 6) {
|
141
|
+
#pragma omp parallel for
|
142
|
+
for (int i = 0; i < numVecs; ++i) {
|
143
|
+
for (int j = 0; j < dims; ++j) {
|
144
|
+
int lo = i * srcVecSize + (j * 6) / 8;
|
145
|
+
int hi = lo + 1;
|
146
|
+
|
147
|
+
FAISS_ASSERT(lo < data.size());
|
148
|
+
FAISS_ASSERT(hi <= data.size());
|
149
|
+
|
150
|
+
auto vLower = data[lo];
|
151
|
+
auto vUpper = hi < data.size() ? data[hi] : 0;
|
152
|
+
|
153
|
+
out[i * dims + j] = unpack6(j, vLower, vUpper);
|
154
|
+
}
|
155
|
+
}
|
156
|
+
} else {
|
157
|
+
// unhandled
|
158
|
+
FAISS_ASSERT(false);
|
159
|
+
}
|
160
|
+
|
161
|
+
return out;
|
162
|
+
}
|
163
|
+
|
164
|
+
template <typename T>
|
165
|
+
void
|
166
|
+
unpackInterleavedWord(const T* in,
|
167
|
+
T* out,
|
168
|
+
int numVecs,
|
169
|
+
int dims,
|
170
|
+
int bitsPerCode) {
|
171
|
+
int wordsPerDimBlock = 32 * bitsPerCode / (8 * sizeof(T));
|
172
|
+
int wordsPerBlock = wordsPerDimBlock * dims;
|
173
|
+
int numBlocks = utils::divUp(numVecs, 32);
|
174
|
+
|
175
|
+
#pragma omp parallel for
|
176
|
+
for (int i = 0; i < numVecs; ++i) {
|
177
|
+
int block = i / 32;
|
178
|
+
FAISS_ASSERT(block < numBlocks);
|
179
|
+
int lane = i % 32;
|
180
|
+
|
181
|
+
for (int j = 0; j < dims; ++j) {
|
182
|
+
int srcOffset = block * wordsPerBlock + j * wordsPerDimBlock + lane;
|
183
|
+
out[i * dims + j] = in[srcOffset];
|
184
|
+
}
|
185
|
+
}
|
186
|
+
}
|
187
|
+
|
188
|
+
std::vector<uint8_t>
|
189
|
+
unpackInterleaved(std::vector<uint8_t> data,
|
190
|
+
int numVecs,
|
191
|
+
int dims,
|
192
|
+
int bitsPerCode) {
|
193
|
+
int bytesPerDimBlock = 32 * bitsPerCode / 8;
|
194
|
+
int bytesPerBlock = bytesPerDimBlock * dims;
|
195
|
+
int numBlocks = utils::divUp(numVecs, 32);
|
196
|
+
size_t totalSize = (size_t) bytesPerBlock * numBlocks;
|
197
|
+
FAISS_ASSERT(data.size() == totalSize);
|
198
|
+
|
199
|
+
// bit codes padded to whole bytes
|
200
|
+
std::vector<uint8_t> out(numVecs * dims * utils::divUp(bitsPerCode, 8));
|
201
|
+
|
202
|
+
if (bitsPerCode == 8) {
|
203
|
+
unpackInterleavedWord<uint8_t>(data.data(), out.data(),
|
204
|
+
numVecs, dims, bitsPerCode);
|
205
|
+
} else if (bitsPerCode == 16) {
|
206
|
+
unpackInterleavedWord<uint16_t>((uint16_t*) data.data(),
|
207
|
+
(uint16_t*) out.data(),
|
208
|
+
numVecs, dims, bitsPerCode);
|
209
|
+
} else if (bitsPerCode == 32) {
|
210
|
+
unpackInterleavedWord<uint32_t>((uint32_t*) data.data(),
|
211
|
+
(uint32_t*) out.data(),
|
212
|
+
numVecs, dims, bitsPerCode);
|
213
|
+
} else if (bitsPerCode == 4) {
|
214
|
+
#pragma omp parallel for
|
215
|
+
for (int i = 0; i < numVecs; ++i) {
|
216
|
+
int block = i / 32;
|
217
|
+
int lane = i % 32;
|
218
|
+
|
219
|
+
int word = lane / 2;
|
220
|
+
int subWord = lane % 2;
|
221
|
+
|
222
|
+
for (int j = 0; j < dims; ++j) {
|
223
|
+
auto v =
|
224
|
+
data[block * bytesPerBlock + j * bytesPerDimBlock + word];
|
225
|
+
|
226
|
+
v = (subWord == 0) ? v & 0xf : v >> 4;
|
227
|
+
out[i * dims + j] = v;
|
228
|
+
}
|
229
|
+
}
|
230
|
+
} else if (bitsPerCode == 5) {
|
231
|
+
#pragma omp parallel for
|
232
|
+
for (int i = 0; i < numVecs; ++i) {
|
233
|
+
int block = i / 32;
|
234
|
+
int blockVector = i % 32;
|
235
|
+
|
236
|
+
for (int j = 0; j < dims; ++j) {
|
237
|
+
uint8_t* dimBlock =
|
238
|
+
&data[block * bytesPerBlock + j * bytesPerDimBlock];
|
239
|
+
|
240
|
+
int lo = (blockVector * 5) / 8;
|
241
|
+
int hi = lo + 1;
|
242
|
+
|
243
|
+
FAISS_ASSERT(lo < bytesPerDimBlock);
|
244
|
+
FAISS_ASSERT(hi <= bytesPerDimBlock);
|
245
|
+
|
246
|
+
auto vLower = dimBlock[lo];
|
247
|
+
auto vUpper = hi < bytesPerDimBlock ? dimBlock[hi] : 0;
|
248
|
+
|
249
|
+
out[i * dims + j] = unpack5(blockVector, vLower, vUpper);
|
250
|
+
}
|
251
|
+
}
|
252
|
+
} else if (bitsPerCode == 6) {
|
253
|
+
#pragma omp parallel for
|
254
|
+
for (int i = 0; i < numVecs; ++i) {
|
255
|
+
int block = i / 32;
|
256
|
+
int blockVector = i % 32;
|
257
|
+
|
258
|
+
for (int j = 0; j < dims; ++j) {
|
259
|
+
uint8_t* dimBlock =
|
260
|
+
&data[block * bytesPerBlock + j * bytesPerDimBlock];
|
261
|
+
|
262
|
+
int lo = (blockVector * 6) / 8;
|
263
|
+
int hi = lo + 1;
|
264
|
+
|
265
|
+
FAISS_ASSERT(lo < bytesPerDimBlock);
|
266
|
+
FAISS_ASSERT(hi <= bytesPerDimBlock);
|
267
|
+
|
268
|
+
auto vLower = dimBlock[lo];
|
269
|
+
auto vUpper = hi < bytesPerDimBlock ? dimBlock[hi] : 0;
|
270
|
+
|
271
|
+
out[i * dims + j] = unpack6(blockVector, vLower, vUpper);
|
272
|
+
}
|
273
|
+
}
|
274
|
+
} else {
|
275
|
+
// unimplemented
|
276
|
+
FAISS_ASSERT(false);
|
277
|
+
}
|
278
|
+
|
279
|
+
return out;
|
280
|
+
}
|
281
|
+
|
282
|
+
// Produce byte i (mod 5) of a 5-byte group packing 8 consecutive 5-bit
// codes. `lo` is the code whose low bits begin in this byte (or whose tail
// carries over from the previous byte); `hi` and `hi2` are the next two
// codes, whichever of them fall into this byte. All inputs must be 5-bit.
inline uint8_t pack5(int i, uint8_t lo, uint8_t hi, uint8_t hi2) {
  FAISS_ASSERT((lo & 0x1f) == lo);
  FAISS_ASSERT((hi & 0x1f) == hi);
  FAISS_ASSERT((hi2 & 0x1f) == hi2);

  // Output byte b starts at stream bit 8b; code dimLo = floor(8b / 5) starts
  // at bit 5*dimLo, so (8b mod 5) bits of `lo` were already emitted into the
  // previous byte. Layout (repeats every 5 bytes / 8 codes):
  // lsb ... msb
  // 0: 0 0 0 0 0 1 1 1
  // 1: 1 1 2 2 2 2 2 3
  // 2: 3 3 3 3 4 4 4 4
  // 3: 4 5 5 5 5 5 6 6
  // 4: 6 6 6 7 7 7 7 7
  int used = (8 * (i % 5)) % 5;

  // Remaining bits of lo, then hi 5 bits higher, then hi2 5 above that;
  // the final cast truncates anything past bit 7.
  unsigned v = ((unsigned)(lo & 0x1f) >> used) |
               ((unsigned)(hi & 0x1f) << (5 - used)) |
               ((unsigned)(hi2 & 0x1f) << (10 - used));
  return (uint8_t)v;
}
|
327
|
+
|
328
|
+
// Produce byte i (mod 3) of a 3-byte group packing 4 consecutive 6-bit
// codes. `lo` is the code whose bits begin in (or carry into) this byte;
// `hi` is the next code. Both inputs must be 6-bit values.
inline uint8_t pack6(int i, uint8_t lo, uint8_t hi) {
  FAISS_ASSERT((lo & 0x3f) == lo);
  FAISS_ASSERT((hi & 0x3f) == hi);

  // Output byte b starts at stream bit 8b; (8b mod 6) bits of `lo` were
  // already emitted into the previous byte. Layout (repeats every 3 bytes):
  // lsb ... msb
  // 0: 0 0 0 0 0 0 1 1
  // 1: 1 1 1 1 2 2 2 2
  // 2: 2 2 3 3 3 3 3 3
  int used = (8 * (i % 3)) % 6;

  // Remaining bits of lo, then hi 6 bits higher; the cast truncates
  // anything past bit 7.
  unsigned v = ((unsigned)(lo & 0x3f) >> used) |
               ((unsigned)(hi & 0x3f) << (6 - used));
  return (uint8_t)v;
}
|
358
|
+
|
359
|
+
|
360
|
+
std::vector<uint8_t>
|
361
|
+
packNonInterleaved(std::vector<uint8_t> data,
|
362
|
+
int numVecs,
|
363
|
+
int dims,
|
364
|
+
int bitsPerCode) {
|
365
|
+
// bit codes padded to whole bytes
|
366
|
+
FAISS_ASSERT(data.size() == numVecs * dims * utils::divUp(bitsPerCode, 8));
|
367
|
+
|
368
|
+
if (bitsPerCode == 8 ||
|
369
|
+
bitsPerCode == 16 ||
|
370
|
+
bitsPerCode == 32) {
|
371
|
+
// nothing to do, whole words are already where they need to be
|
372
|
+
return data;
|
373
|
+
}
|
374
|
+
|
375
|
+
// bits packed into a whole number of bytes
|
376
|
+
int bytesPerVec = utils::divUp(dims * bitsPerCode, 8);
|
377
|
+
|
378
|
+
std::vector<uint8_t> out(numVecs * bytesPerVec);
|
379
|
+
|
380
|
+
if (bitsPerCode == 4) {
|
381
|
+
#pragma omp parallel for
|
382
|
+
for (int i = 0; i < numVecs; ++i) {
|
383
|
+
for (int j = 0; j < bytesPerVec; ++j) {
|
384
|
+
int dimLo = j * 2;
|
385
|
+
int dimHi = dimLo + 1;
|
386
|
+
FAISS_ASSERT(dimLo < dims);
|
387
|
+
FAISS_ASSERT(dimHi <= dims);
|
388
|
+
|
389
|
+
uint8_t lo = data[i * dims + dimLo];
|
390
|
+
uint8_t hi = dimHi < dims ? data[i * dims + dimHi] : 0;
|
391
|
+
|
392
|
+
out[i * bytesPerVec + j] = (hi << 4) | (lo & 0xf);
|
393
|
+
}
|
394
|
+
}
|
395
|
+
} else if (bitsPerCode == 5) {
|
396
|
+
#pragma omp parallel for
|
397
|
+
for (int i = 0; i < numVecs; ++i) {
|
398
|
+
for (int j = 0; j < bytesPerVec; ++j) {
|
399
|
+
int dimLo = (j * 8) / 5;
|
400
|
+
int dimHi = dimLo + 1;
|
401
|
+
int dimHi2 = dimHi + 1;
|
402
|
+
FAISS_ASSERT(dimLo < dims);
|
403
|
+
FAISS_ASSERT(dimHi <= dims);
|
404
|
+
FAISS_ASSERT(dimHi <= dims + 1);
|
405
|
+
|
406
|
+
uint8_t lo = data[i * dims + dimLo];
|
407
|
+
uint8_t hi = dimHi < dims ? data[i * dims + dimHi] : 0;
|
408
|
+
uint8_t hi2 = dimHi2 < dims ? data[i * dims + dimHi2] : 0;
|
409
|
+
|
410
|
+
out[i * bytesPerVec + j] = pack5(j, lo, hi, hi2);
|
411
|
+
}
|
412
|
+
}
|
413
|
+
} else if (bitsPerCode == 6) {
|
414
|
+
#pragma omp parallel for
|
415
|
+
for (int i = 0; i < numVecs; ++i) {
|
416
|
+
for (int j = 0; j < bytesPerVec; ++j) {
|
417
|
+
int dimLo = (j * 8) / 6;
|
418
|
+
int dimHi = dimLo + 1;
|
419
|
+
FAISS_ASSERT(dimLo < dims);
|
420
|
+
FAISS_ASSERT(dimHi <= dims);
|
421
|
+
|
422
|
+
uint8_t lo = data[i * dims + dimLo];
|
423
|
+
uint8_t hi = dimHi < dims ? data[i * dims + dimHi] : 0;
|
424
|
+
|
425
|
+
out[i * bytesPerVec + j] = pack6(j, lo, hi);
|
426
|
+
}
|
427
|
+
}
|
428
|
+
} else {
|
429
|
+
// unhandled
|
430
|
+
FAISS_ASSERT(false);
|
431
|
+
}
|
432
|
+
|
433
|
+
return out;
|
434
|
+
}
|
435
|
+
|
436
|
+
template <typename T>
|
437
|
+
void
|
438
|
+
packInterleavedWord(const T* in,
|
439
|
+
T* out,
|
440
|
+
int numVecs,
|
441
|
+
int dims,
|
442
|
+
int bitsPerCode) {
|
443
|
+
int wordsPerDimBlock = 32 * bitsPerCode / (8 * sizeof(T));
|
444
|
+
int wordsPerBlock = wordsPerDimBlock * dims;
|
445
|
+
int numBlocks = utils::divUp(numVecs, 32);
|
446
|
+
|
447
|
+
// We're guaranteed that all other slots not filled by the vectors present are
|
448
|
+
// initialized to zero (from the vector constructor in packInterleaved)
|
449
|
+
#pragma omp parallel for
|
450
|
+
for (int i = 0; i < numVecs; ++i) {
|
451
|
+
int block = i / 32;
|
452
|
+
FAISS_ASSERT(block < numBlocks);
|
453
|
+
int lane = i % 32;
|
454
|
+
|
455
|
+
for (int j = 0; j < dims; ++j) {
|
456
|
+
int dstOffset = block * wordsPerBlock + j * wordsPerDimBlock + lane;
|
457
|
+
out[dstOffset] = in[i * dims + j];
|
458
|
+
}
|
459
|
+
}
|
460
|
+
}
|
461
|
+
|
462
|
+
std::vector<uint8_t>
|
463
|
+
packInterleaved(std::vector<uint8_t> data,
|
464
|
+
int numVecs,
|
465
|
+
int dims,
|
466
|
+
int bitsPerCode) {
|
467
|
+
int bytesPerDimBlock = 32 * bitsPerCode / 8;
|
468
|
+
int bytesPerBlock = bytesPerDimBlock * dims;
|
469
|
+
int numBlocks = utils::divUp(numVecs, 32);
|
470
|
+
size_t totalSize = (size_t) bytesPerBlock * numBlocks;
|
471
|
+
|
472
|
+
// bit codes padded to whole bytes
|
473
|
+
FAISS_ASSERT(data.size() == numVecs * dims * utils::divUp(bitsPerCode, 8));
|
474
|
+
|
475
|
+
// packs based on blocks
|
476
|
+
std::vector<uint8_t> out(totalSize, 0);
|
477
|
+
|
478
|
+
if (bitsPerCode == 8) {
|
479
|
+
packInterleavedWord<uint8_t>(data.data(), out.data(),
|
480
|
+
numVecs, dims, bitsPerCode);
|
481
|
+
} else if (bitsPerCode == 16) {
|
482
|
+
packInterleavedWord<uint16_t>((uint16_t*) data.data(),
|
483
|
+
(uint16_t*) out.data(),
|
484
|
+
numVecs, dims, bitsPerCode);
|
485
|
+
} else if (bitsPerCode == 32) {
|
486
|
+
packInterleavedWord<uint32_t>((uint32_t*) data.data(),
|
487
|
+
(uint32_t*) out.data(),
|
488
|
+
numVecs, dims, bitsPerCode);
|
489
|
+
} else if (bitsPerCode == 4) {
|
490
|
+
#pragma omp parallel for
|
491
|
+
for (int i = 0; i < numBlocks; ++i) {
|
492
|
+
for (int j = 0; j < dims; ++j) {
|
493
|
+
for (int k = 0; k < bytesPerDimBlock; ++k) {
|
494
|
+
int loVec = i * 32 + k * 2;
|
495
|
+
int hiVec = loVec + 1;
|
496
|
+
|
497
|
+
uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;
|
498
|
+
uint8_t hi = hiVec < numVecs ? data[hiVec * dims + j] : 0;
|
499
|
+
|
500
|
+
out[i * bytesPerBlock + j * bytesPerDimBlock + k] =
|
501
|
+
(hi << 4) | (lo & 0xf);
|
502
|
+
}
|
503
|
+
}
|
504
|
+
}
|
505
|
+
} else if (bitsPerCode == 5) {
|
506
|
+
#pragma omp parallel for
|
507
|
+
for (int i = 0; i < numBlocks; ++i) {
|
508
|
+
for (int j = 0; j < dims; ++j) {
|
509
|
+
for (int k = 0; k < bytesPerDimBlock; ++k) {
|
510
|
+
// What input vectors we are pulling from
|
511
|
+
int loVec = i * 32 + (k * 8) / 5;
|
512
|
+
int hiVec = loVec + 1;
|
513
|
+
int hiVec2 = hiVec + 1;
|
514
|
+
|
515
|
+
uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;
|
516
|
+
uint8_t hi = hiVec < numVecs ? data[hiVec * dims + j] : 0;
|
517
|
+
uint8_t hi2 = hiVec2 < numVecs ? data[hiVec2 * dims + j] : 0;
|
518
|
+
|
519
|
+
out[i * bytesPerBlock + j * bytesPerDimBlock + k] = pack5(k, lo, hi, hi2);
|
520
|
+
}
|
521
|
+
}
|
522
|
+
}
|
523
|
+
} else if (bitsPerCode == 6) {
|
524
|
+
#pragma omp parallel for
|
525
|
+
for (int i = 0; i < numBlocks; ++i) {
|
526
|
+
for (int j = 0; j < dims; ++j) {
|
527
|
+
for (int k = 0; k < bytesPerDimBlock; ++k) {
|
528
|
+
// What input vectors we are pulling from
|
529
|
+
int loVec = i * 32 + (k * 8) / 6;
|
530
|
+
int hiVec = loVec + 1;
|
531
|
+
|
532
|
+
uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;
|
533
|
+
uint8_t hi = hiVec < numVecs ? data[hiVec * dims + j] : 0;
|
534
|
+
|
535
|
+
out[i * bytesPerBlock + j * bytesPerDimBlock + k] = pack6(k, lo, hi);
|
536
|
+
}
|
537
|
+
}
|
538
|
+
}
|
539
|
+
} else {
|
540
|
+
// unimplemented
|
541
|
+
FAISS_ASSERT(false);
|
542
|
+
}
|
543
|
+
|
544
|
+
return out;
|
545
|
+
}
|
546
|
+
|
547
|
+
} } // namespace
|