faiss 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/LICENSE.txt +1 -1
- data/README.md +1 -1
- data/ext/faiss/extconf.rb +1 -1
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +36 -33
- data/vendor/faiss/faiss/AutoTune.h +6 -3
- data/vendor/faiss/faiss/Clustering.cpp +16 -12
- data/vendor/faiss/faiss/Index.cpp +3 -4
- data/vendor/faiss/faiss/Index.h +3 -3
- data/vendor/faiss/faiss/IndexBinary.cpp +3 -4
- data/vendor/faiss/faiss/IndexBinary.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +2 -12
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +1 -2
- data/vendor/faiss/faiss/IndexFlat.cpp +0 -148
- data/vendor/faiss/faiss/IndexFlat.h +0 -51
- data/vendor/faiss/faiss/IndexHNSW.cpp +4 -5
- data/vendor/faiss/faiss/IndexIVF.cpp +118 -31
- data/vendor/faiss/faiss/IndexIVF.h +22 -15
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -3
- data/vendor/faiss/faiss/IndexIVFFlat.h +2 -1
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +39 -15
- data/vendor/faiss/faiss/IndexIVFPQ.h +25 -9
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +1116 -0
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +166 -0
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +8 -9
- data/vendor/faiss/faiss/IndexIVFPQR.h +2 -1
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +1 -2
- data/vendor/faiss/faiss/IndexPQ.cpp +34 -18
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +536 -0
- data/vendor/faiss/faiss/IndexPQFastScan.h +111 -0
- data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -0
- data/vendor/faiss/faiss/IndexPreTransform.h +2 -0
- data/vendor/faiss/faiss/IndexRefine.cpp +256 -0
- data/vendor/faiss/faiss/IndexRefine.h +73 -0
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +2 -2
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +1 -1
- data/vendor/faiss/faiss/gpu/GpuDistance.h +1 -1
- data/vendor/faiss/faiss/gpu/GpuIndex.h +16 -9
- data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +8 -1
- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +11 -11
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +19 -2
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +28 -2
- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +24 -14
- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +29 -2
- data/vendor/faiss/faiss/gpu/GpuResources.h +4 -0
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +60 -27
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +28 -6
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +547 -0
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +51 -0
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +3 -3
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +3 -2
- data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +274 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +7 -2
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +5 -1
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +231 -0
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +33 -0
- data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +1 -0
- data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +6 -0
- data/vendor/faiss/faiss/gpu/utils/Timer.cpp +5 -6
- data/vendor/faiss/faiss/gpu/utils/Timer.h +2 -2
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +5 -4
- data/vendor/faiss/faiss/impl/HNSW.cpp +2 -4
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +4 -4
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +22 -12
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +2 -0
- data/vendor/faiss/faiss/impl/ResultHandler.h +452 -0
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +29 -19
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +6 -0
- data/vendor/faiss/faiss/impl/index_read.cpp +64 -96
- data/vendor/faiss/faiss/impl/index_write.cpp +34 -25
- data/vendor/faiss/faiss/impl/io.cpp +33 -2
- data/vendor/faiss/faiss/impl/io.h +7 -2
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -15
- data/vendor/faiss/faiss/impl/platform_macros.h +44 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +272 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +169 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +180 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +354 -0
- data/vendor/faiss/faiss/impl/simd_result_handlers.h +559 -0
- data/vendor/faiss/faiss/index_factory.cpp +112 -7
- data/vendor/faiss/faiss/index_io.h +1 -48
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +151 -0
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +76 -0
- data/vendor/faiss/faiss/{DirectMap.cpp → invlists/DirectMap.cpp} +1 -1
- data/vendor/faiss/faiss/{DirectMap.h → invlists/DirectMap.h} +1 -1
- data/vendor/faiss/faiss/{InvertedLists.cpp → invlists/InvertedLists.cpp} +72 -1
- data/vendor/faiss/faiss/{InvertedLists.h → invlists/InvertedLists.h} +32 -1
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +107 -0
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +63 -0
- data/vendor/faiss/faiss/{OnDiskInvertedLists.cpp → invlists/OnDiskInvertedLists.cpp} +21 -6
- data/vendor/faiss/faiss/{OnDiskInvertedLists.h → invlists/OnDiskInvertedLists.h} +5 -2
- data/vendor/faiss/faiss/python/python_callbacks.h +8 -1
- data/vendor/faiss/faiss/utils/AlignedTable.h +141 -0
- data/vendor/faiss/faiss/utils/Heap.cpp +2 -4
- data/vendor/faiss/faiss/utils/Heap.h +61 -50
- data/vendor/faiss/faiss/utils/distances.cpp +164 -319
- data/vendor/faiss/faiss/utils/distances.h +28 -20
- data/vendor/faiss/faiss/utils/distances_simd.cpp +277 -49
- data/vendor/faiss/faiss/utils/extra_distances.cpp +1 -2
- data/vendor/faiss/faiss/utils/hamming-inl.h +4 -4
- data/vendor/faiss/faiss/utils/hamming.cpp +3 -6
- data/vendor/faiss/faiss/utils/hamming.h +2 -7
- data/vendor/faiss/faiss/utils/ordered_key_value.h +98 -0
- data/vendor/faiss/faiss/utils/partitioning.cpp +1256 -0
- data/vendor/faiss/faiss/utils/partitioning.h +69 -0
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +277 -0
- data/vendor/faiss/faiss/utils/quantize_lut.h +80 -0
- data/vendor/faiss/faiss/utils/simdlib.h +31 -0
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +461 -0
- data/vendor/faiss/faiss/utils/simdlib_emulated.h +589 -0
- metadata +43 -141
- data/vendor/faiss/benchs/bench_6bit_codec.cpp +0 -80
- data/vendor/faiss/c_api/AutoTune_c.cpp +0 -83
- data/vendor/faiss/c_api/AutoTune_c.h +0 -66
- data/vendor/faiss/c_api/Clustering_c.cpp +0 -145
- data/vendor/faiss/c_api/Clustering_c.h +0 -123
- data/vendor/faiss/c_api/IndexFlat_c.cpp +0 -140
- data/vendor/faiss/c_api/IndexFlat_c.h +0 -115
- data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +0 -64
- data/vendor/faiss/c_api/IndexIVFFlat_c.h +0 -58
- data/vendor/faiss/c_api/IndexIVF_c.cpp +0 -99
- data/vendor/faiss/c_api/IndexIVF_c.h +0 -142
- data/vendor/faiss/c_api/IndexLSH_c.cpp +0 -37
- data/vendor/faiss/c_api/IndexLSH_c.h +0 -40
- data/vendor/faiss/c_api/IndexPreTransform_c.cpp +0 -21
- data/vendor/faiss/c_api/IndexPreTransform_c.h +0 -32
- data/vendor/faiss/c_api/IndexShards_c.cpp +0 -38
- data/vendor/faiss/c_api/IndexShards_c.h +0 -39
- data/vendor/faiss/c_api/Index_c.cpp +0 -105
- data/vendor/faiss/c_api/Index_c.h +0 -183
- data/vendor/faiss/c_api/MetaIndexes_c.cpp +0 -49
- data/vendor/faiss/c_api/MetaIndexes_c.h +0 -49
- data/vendor/faiss/c_api/clone_index_c.cpp +0 -23
- data/vendor/faiss/c_api/clone_index_c.h +0 -32
- data/vendor/faiss/c_api/error_c.h +0 -42
- data/vendor/faiss/c_api/error_impl.cpp +0 -27
- data/vendor/faiss/c_api/error_impl.h +0 -16
- data/vendor/faiss/c_api/faiss_c.h +0 -58
- data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +0 -98
- data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +0 -56
- data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +0 -52
- data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +0 -68
- data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +0 -17
- data/vendor/faiss/c_api/gpu/GpuIndex_c.h +0 -30
- data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +0 -38
- data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +0 -86
- data/vendor/faiss/c_api/gpu/GpuResources_c.h +0 -66
- data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +0 -54
- data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +0 -53
- data/vendor/faiss/c_api/gpu/macros_impl.h +0 -42
- data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +0 -220
- data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +0 -149
- data/vendor/faiss/c_api/index_factory_c.cpp +0 -26
- data/vendor/faiss/c_api/index_factory_c.h +0 -30
- data/vendor/faiss/c_api/index_io_c.cpp +0 -42
- data/vendor/faiss/c_api/index_io_c.h +0 -50
- data/vendor/faiss/c_api/macros_impl.h +0 -110
- data/vendor/faiss/demos/demo_imi_flat.cpp +0 -154
- data/vendor/faiss/demos/demo_imi_pq.cpp +0 -203
- data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +0 -151
- data/vendor/faiss/demos/demo_sift1M.cpp +0 -252
- data/vendor/faiss/demos/demo_weighted_kmeans.cpp +0 -185
- data/vendor/faiss/misc/test_blas.cpp +0 -87
- data/vendor/faiss/tests/test_binary_flat.cpp +0 -62
- data/vendor/faiss/tests/test_dealloc_invlists.cpp +0 -188
- data/vendor/faiss/tests/test_ivfpq_codec.cpp +0 -70
- data/vendor/faiss/tests/test_ivfpq_indexing.cpp +0 -100
- data/vendor/faiss/tests/test_lowlevel_ivf.cpp +0 -573
- data/vendor/faiss/tests/test_merge.cpp +0 -260
- data/vendor/faiss/tests/test_omp_threads.cpp +0 -14
- data/vendor/faiss/tests/test_ondisk_ivf.cpp +0 -225
- data/vendor/faiss/tests/test_pairs_decoding.cpp +0 -193
- data/vendor/faiss/tests/test_params_override.cpp +0 -236
- data/vendor/faiss/tests/test_pq_encoding.cpp +0 -98
- data/vendor/faiss/tests/test_sliding_ivf.cpp +0 -246
- data/vendor/faiss/tests/test_threaded_index.cpp +0 -253
- data/vendor/faiss/tests/test_transfer_invlists.cpp +0 -159
- data/vendor/faiss/tutorial/cpp/1-Flat.cpp +0 -104
- data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +0 -85
- data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +0 -98
- data/vendor/faiss/tutorial/cpp/4-GPU.cpp +0 -122
- data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +0 -104
@@ -242,16 +242,47 @@ BufferedIOWriter::~BufferedIOWriter()
|
|
242
242
|
|
243
243
|
|
244
244
|
/// Pack a 4-character tag into a uint32_t, first character in the low byte.
///
/// @param sx  NUL-terminated string; the FAISS_THROW_IF_NOT check below
///            requires its strlen() to be exactly 4
/// @return    sx[0] | sx[1] << 8 | sx[2] << 16 | sx[3] << 24
uint32_t fourcc (const char sx[4]) {
    FAISS_THROW_IF_NOT (4 == strlen(sx));
    const unsigned char *x = (const unsigned char*)sx;
    // Widen each byte to uint32_t before shifting: a plain `x[3] << 24`
    // operates on a promoted (signed) int and shifts into the sign bit
    // whenever x[3] >= 0x80.
    return uint32_t(x[0]) |
           uint32_t(x[1]) << 8 |
           uint32_t(x[2]) << 16 |
           uint32_t(x[3]) << 24;
}
|
249
249
|
|
250
250
|
/// Pack a 4-character tag into a uint32_t, first character in the low byte.
///
/// @param sx  string whose length() must be exactly 4 (enforced by the
///            FAISS_THROW_IF_NOT check below)
/// @return    sx[0] | sx[1] << 8 | sx[2] << 16 | sx[3] << 24
uint32_t fourcc (const std::string & sx) {
    FAISS_THROW_IF_NOT (sx.length() == 4);
    const unsigned char *x = (const unsigned char*)sx.c_str();
    // Widen each byte to uint32_t before shifting: a plain `x[3] << 24`
    // operates on a promoted (signed) int and shifts into the sign bit
    // whenever x[3] >= 0x80.
    return uint32_t(x[0]) |
           uint32_t(x[1]) << 8 |
           uint32_t(x[2]) << 16 |
           uint32_t(x[3]) << 24;
}
|
255
255
|
|
256
|
+
/// Decode a fourcc value back into its 4 characters (inverse of fourcc()).
///
/// @param x    value to decode; the first character sits in the low byte
/// @param str  output buffer of at least 5 bytes: receives the 4 characters
///             followed by a terminating NUL
void fourcc_inv(uint32_t x, char str[5]) {
    // Extract byte by byte instead of storing through a uint32_t*: this
    // mirrors the shifts used by fourcc() whatever the host byte order is,
    // and does not require str to be 4-byte aligned.
    for (int i = 0; i < 4; i++) {
        str[i] = (char)((x >> (8 * i)) & 0xff);
    }
    // The terminator belongs at index 4; index 5 is one past the end of a
    // 5-byte buffer.
    str[4] = 0;
}

/// Decode a fourcc value into a std::string.
///
/// The string is built from a NUL-terminated buffer, so it stops at the
/// first zero byte of x if there is one.
std::string fourcc_inv(uint32_t x) {
    char str[5];
    fourcc_inv(x, str);
    return std::string(str);
}
|
266
|
+
|
267
|
+
|
268
|
+
/// Decode a fourcc value into a human-readable string.
///
/// Bytes are taken from x starting at the low byte (the order used by
/// fourcc()). Bytes in the range [32, 127) are appended as-is; any other
/// byte is rendered as a literal "\xNN" escape with two lowercase hex digits.
///
/// @param x  fourcc value to decode
/// @return   string of 4 to 16 characters
std::string fourcc_inv_printable(uint32_t x) {
    std::string str;
    for (int i = 0; i < 4; i++) {
        // Read the byte straight out of x: no intermediate char buffer is
        // needed, and the result does not depend on host byte order.
        const uint8_t c = (uint8_t)((x >> (8 * i)) & 0xff);
        if (32 <= c && c < 127) {
            str += (char)c;
        } else {
            char buf[10];
            snprintf(buf, sizeof(buf), "\\x%02x", c);
            str += buf;
        }
    }
    return str;
}
|
284
|
+
|
285
|
+
|
286
|
+
|
256
287
|
|
257
288
|
} // namespace faiss
|
@@ -50,7 +50,7 @@ struct IOWriter {
|
|
50
50
|
// return a file number that can be memory-mapped
|
51
51
|
virtual int fileno ();
|
52
52
|
|
53
|
-
virtual ~IOWriter() {}
|
53
|
+
virtual ~IOWriter() noexcept(false) {}
|
54
54
|
};
|
55
55
|
|
56
56
|
|
@@ -139,12 +139,17 @@ struct BufferedIOWriter: IOWriter {
|
|
139
139
|
size_t operator()(const void *ptr, size_t size, size_t nitems) override;
|
140
140
|
|
141
141
|
// flushes
|
142
|
-
~BufferedIOWriter();
|
142
|
+
~BufferedIOWriter() override;
|
143
143
|
};
|
144
144
|
|
145
145
|
/// cast a 4-character string to a uint32_t that can be written and read easily
|
146
146
|
uint32_t fourcc (const char sx[4]);
|
147
147
|
uint32_t fourcc (const std::string & sx);
|
148
148
|
|
149
|
+
// decoding of fourcc (int32 -> string)
|
150
|
+
void fourcc_inv(uint32_t x, char str[5]);
|
151
|
+
std::string fourcc_inv(uint32_t x);
|
152
|
+
std::string fourcc_inv_printable(uint32_t x);
|
153
|
+
|
149
154
|
|
150
155
|
} // namespace faiss
|
@@ -20,22 +20,8 @@
|
|
20
20
|
#include <algorithm>
|
21
21
|
|
22
22
|
#include <faiss/utils/distances.h>
|
23
|
+
#include <faiss/impl/platform_macros.h>
|
23
24
|
|
24
|
-
#ifdef _MSC_VER
|
25
|
-
|
26
|
-
#include <intrin.h>
|
27
|
-
|
28
|
-
static inline int __builtin_ctzll(uint64_t x) {
|
29
|
-
unsigned long ret;
|
30
|
-
_BitScanForward64(&ret, x);
|
31
|
-
return (int)ret;
|
32
|
-
}
|
33
|
-
|
34
|
-
static inline int __builtin_clzll(uint64_t x) {
|
35
|
-
return (int)__lzcnt64(x);
|
36
|
-
}
|
37
|
-
|
38
|
-
#endif // _MSC_VER
|
39
25
|
|
40
26
|
namespace faiss {
|
41
27
|
|
@@ -7,8 +7,14 @@
|
|
7
7
|
|
8
8
|
#pragma once
|
9
9
|
|
10
|
+
|
10
11
|
#ifdef _MSC_VER
|
11
12
|
|
13
|
+
/*******************************************************
|
14
|
+
* Windows specific macros
|
15
|
+
*******************************************************/
|
16
|
+
|
17
|
+
|
12
18
|
#ifdef FAISS_MAIN_LIB
|
13
19
|
#define FAISS_API __declspec(dllexport)
|
14
20
|
#else // _FAISS_MAIN_LIB
|
@@ -17,8 +23,46 @@
|
|
17
23
|
|
18
24
|
#define __PRETTY_FUNCTION__ __FUNCSIG__
|
19
25
|
|
26
|
+
#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno)
|
27
|
+
#define posix_memalign_free _aligned_free
|
28
|
+
|
29
|
+
// aligned should be in front of the declaration
|
30
|
+
#define ALIGNED(x) __declspec(align(x))
|
31
|
+
|
32
|
+
// redefine the GCC intrinsics with Windows equivalents
|
33
|
+
|
34
|
+
#include <intrin.h>
|
35
|
+
|
36
|
+
/// MSVC stand-in for the GCC intrinsic of the same name: index of the
/// lowest set bit of a 64-bit value, obtained from _BitScanForward64.
/// NOTE(review): the index is not meaningful when x == 0 — presumably
/// callers never pass 0; confirm.
inline int __builtin_ctzll(uint64_t x) {
    unsigned long bit_index;
    _BitScanForward64(&bit_index, x);
    return static_cast<int>(bit_index);
}
|
41
|
+
|
42
|
+
/// MSVC stand-in for the GCC intrinsic of the same name: index of the
/// lowest set bit of x, obtained from _BitScanForward.
/// NOTE(review): the index is not meaningful when x == 0 — presumably
/// callers never pass 0; confirm.
inline int __builtin_ctz(unsigned long x) {
    unsigned long bit_index;
    _BitScanForward(&bit_index, x);
    return static_cast<int>(bit_index);
}
|
47
|
+
|
48
|
+
inline int __builtin_clzll(uint64_t x) {
|
49
|
+
return (int)__lzcnt64(x);
|
50
|
+
}
|
51
|
+
|
52
|
+
#define __builtin_popcountl __popcnt64
|
53
|
+
|
20
54
|
#else
|
55
|
+
/*******************************************************
|
56
|
+
* Linux and OSX
|
57
|
+
*******************************************************/
|
21
58
|
|
22
59
|
#define FAISS_API
|
60
|
+
#define posix_memalign_free free
|
61
|
+
|
62
|
+
// aligned should be *in front* of the declaration, for compatibility with windows
|
63
|
+
#define ALIGNED(x) __attribute__ ((aligned(x)))
|
23
64
|
|
24
65
|
#endif // _MSC_VER
|
66
|
+
|
67
|
+
|
68
|
+
|
@@ -0,0 +1,272 @@
|
|
1
|
+
/**
|
2
|
+
* Copyright (c) Facebook, Inc. and its affiliates.
|
3
|
+
*
|
4
|
+
* This source code is licensed under the MIT license found in the
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
6
|
+
*/
|
7
|
+
|
8
|
+
#include <faiss/impl/pq4_fast_scan.h>
|
9
|
+
#include <faiss/impl/FaissAssert.h>
|
10
|
+
#include <faiss/impl/simd_result_handlers.h>
|
11
|
+
|
12
|
+
#include <array>
|
13
|
+
|
14
|
+
|
15
|
+
namespace faiss {
|
16
|
+
|
17
|
+
|
18
|
+
using namespace simd_result_handlers;
|
19
|
+
|
20
|
+
|
21
|
+
|
22
|
+
/***************************************************************
|
23
|
+
* Packing functions for codes
|
24
|
+
***************************************************************/
|
25
|
+
|
26
|
+
|
27
|
+
|
28
|
+
namespace {
|
29
|
+
|
30
|
+
/* Copy dest.size() consecutive entries of column j, starting at row i,
 * out of the row-major matrix src of size (m, n).
 * Entries whose row falls outside [0, m) are set to 0, so i may be
 * negative or reach past the end of the matrix. */
template<typename T, class TA>
void get_matrix_column(
        T * src,
        size_t m, size_t n,
        int64_t i, int64_t j,
        TA & dest) {
    for (int64_t k = 0; k < dest.size(); k++) {
        const int64_t row = i + k;
        if (row < 0 || row >= m) {
            // outside the matrix: pad with zeros
            dest[k] = 0;
            continue;
        }
        dest[k] = src[row * n + j];
    }
}
|
46
|
+
|
47
|
+
} // anonymous namespace
|
48
|
+
|
49
|
+
|
50
|
+
void pq4_pack_codes(
|
51
|
+
const uint8_t *codes,
|
52
|
+
size_t ntotal, size_t M,
|
53
|
+
size_t nb, size_t bbs, size_t nsq,
|
54
|
+
uint8_t *blocks
|
55
|
+
)
|
56
|
+
{
|
57
|
+
FAISS_THROW_IF_NOT(bbs % 32 == 0);
|
58
|
+
FAISS_THROW_IF_NOT(nb % bbs == 0);
|
59
|
+
FAISS_THROW_IF_NOT(nsq % 2 == 0);
|
60
|
+
|
61
|
+
memset(blocks, 0, nb * nsq / 2);
|
62
|
+
const uint8_t perm0[16] =
|
63
|
+
{0, 8, 1, 9, 2, 10, 3, 11,
|
64
|
+
4, 12, 5, 13, 6, 14, 7, 15};
|
65
|
+
|
66
|
+
uint8_t *codes2 = blocks;
|
67
|
+
for(size_t i0 = 0; i0 < nb; i0 += bbs) {
|
68
|
+
for(int sq = 0; sq < nsq; sq += 2) {
|
69
|
+
for(size_t i = 0; i < bbs; i += 32) {
|
70
|
+
std::array<uint8_t, 32> c, c0, c1;
|
71
|
+
get_matrix_column(
|
72
|
+
codes, ntotal,
|
73
|
+
(M + 1) / 2,
|
74
|
+
i0 + i, sq / 2, c
|
75
|
+
);
|
76
|
+
for(int j = 0; j < 32; j++) {
|
77
|
+
c0[j] = c[j] & 15;
|
78
|
+
c1[j] = c[j] >> 4;
|
79
|
+
}
|
80
|
+
for(int j = 0; j < 16; j++) {
|
81
|
+
uint8_t d0, d1;
|
82
|
+
d0 = c0[perm0[j]] | (c0[perm0[j] + 16] << 4);
|
83
|
+
d1 = c1[perm0[j]] | (c1[perm0[j] + 16] << 4);
|
84
|
+
codes2[j] = d0;
|
85
|
+
codes2[j + 16] = d1;
|
86
|
+
}
|
87
|
+
codes2 += 32;
|
88
|
+
}
|
89
|
+
}
|
90
|
+
}
|
91
|
+
}
|
92
|
+
|
93
|
+
void pq4_pack_codes_range(
|
94
|
+
const uint8_t *codes,
|
95
|
+
size_t M,
|
96
|
+
size_t i0, size_t i1,
|
97
|
+
size_t bbs, size_t M2,
|
98
|
+
uint8_t * blocks
|
99
|
+
) {
|
100
|
+
const uint8_t perm0[16] =
|
101
|
+
{0, 8, 1, 9, 2, 10, 3, 11,
|
102
|
+
4, 12, 5, 13, 6, 14, 7, 15};
|
103
|
+
|
104
|
+
// range of affected blocks
|
105
|
+
size_t block0 = i0 / bbs;
|
106
|
+
size_t block1 = ((i1 - 1) / bbs) + 1;
|
107
|
+
|
108
|
+
for (size_t b = block0; b < block1; b++) {
|
109
|
+
uint8_t *codes2 = blocks + b * bbs * M2 / 2;
|
110
|
+
int64_t i_base = b * bbs - i0;
|
111
|
+
for(int sq = 0; sq < M2; sq += 2) {
|
112
|
+
for(size_t i = 0; i < bbs; i += 32) {
|
113
|
+
std::array<uint8_t, 32> c, c0, c1;
|
114
|
+
get_matrix_column(
|
115
|
+
codes, i1 - i0,
|
116
|
+
(M + 1) / 2,
|
117
|
+
i_base + i, sq / 2, c
|
118
|
+
);
|
119
|
+
for(int j = 0; j < 32; j++) {
|
120
|
+
c0[j] = c[j] & 15;
|
121
|
+
c1[j] = c[j] >> 4;
|
122
|
+
}
|
123
|
+
for(int j = 0; j < 16; j++) {
|
124
|
+
uint8_t d0, d1;
|
125
|
+
d0 = c0[perm0[j]] | (c0[perm0[j] + 16] << 4);
|
126
|
+
d1 = c1[perm0[j]] | (c1[perm0[j] + 16] << 4);
|
127
|
+
codes2[j] |= d0;
|
128
|
+
codes2[j + 16] |= d1;
|
129
|
+
}
|
130
|
+
codes2 += 32;
|
131
|
+
}
|
132
|
+
}
|
133
|
+
}
|
134
|
+
|
135
|
+
}
|
136
|
+
|
137
|
+
|
138
|
+
/* Read back one 4-bit code from a packed codes array.
 *
 * The array is addressed as: block of bbs vectors -> pair of
 * sub-quantizers -> 32-byte group of 32 vectors -> 16-byte half (even or
 * odd sub-quantizer). Inside a half, vectors 0..15 of the group sit in the
 * low nibbles and vectors 16..31 in the high nibbles, at byte iperm0[v % 16].
 */
uint8_t pq4_get_packed_element(
        const uint8_t *data, size_t bbs, size_t nsq,
        size_t i, size_t sq
) {
    const uint8_t iperm0[16] =
        {0, 2, 4, 6, 8, 10, 12, 14,
         1, 3, 5, 7, 9, 11, 13, 15};

    const size_t block = i / bbs;
    const size_t in_block = i % bbs;
    const size_t lane = in_block % 32;

    const uint8_t *half = data
        + (block * (nsq / 2) + sq / 2) * bbs   // bbs-sized block
        + (in_block / 32) * 32                 // 32-byte group
        + ((sq & 1) == 1 ? 16 : 0);            // odd sub-quantizer half

    const uint8_t packed = half[iperm0[lane % 16]];
    return lane < 16 ? (packed & 15) : (packed >> 4);
}
|
164
|
+
|
165
|
+
/***************************************************************
|
166
|
+
* Packing functions for Look-Up Tables (LUT)
|
167
|
+
***************************************************************/
|
168
|
+
|
169
|
+
|
170
|
+
|
171
|
+
|
172
|
+
/* Re-order look-up tables from query-major to sub-quantizer-pair-major.
 *
 * src is indexed as (q, sq, 16): one 16-byte table per query and
 * sub-quantizer. dest is indexed as (sq / 2, q, 32): the two tables of a
 * sub-quantizer pair are stored back to back for each query.
 */
void pq4_pack_LUT(
        int nq, int nsq,
        const uint8_t *src,
        uint8_t *dest)
{
    for (int sq = 0; sq < nsq; sq += 2) {
        for (int q = 0; q < nq; q++) {
            // tables sq and sq + 1 of query q are adjacent in src and stay
            // adjacent in dest, so the pair moves as a single 32-byte run
            memcpy(
                dest + (sq / 2 * nq + q) * 32,
                src + (q * nsq + sq) * 16,
                32
            );
        }
    }
}
|
193
|
+
|
194
|
+
|
195
|
+
int pq4_pack_LUT_qbs(
|
196
|
+
int qbs, int nsq,
|
197
|
+
const uint8_t *src,
|
198
|
+
uint8_t *dest)
|
199
|
+
{
|
200
|
+
FAISS_THROW_IF_NOT(nsq % 2 == 0);
|
201
|
+
size_t dim12 = 16 * nsq;
|
202
|
+
int i0 = 0;
|
203
|
+
int qi = qbs;
|
204
|
+
while(qi) {
|
205
|
+
int nq = qi & 15;
|
206
|
+
qi >>= 4;
|
207
|
+
pq4_pack_LUT(
|
208
|
+
nq, nsq,
|
209
|
+
src + i0 * dim12,
|
210
|
+
dest + i0 * dim12
|
211
|
+
);
|
212
|
+
i0 += nq;
|
213
|
+
}
|
214
|
+
return i0;
|
215
|
+
}
|
216
|
+
|
217
|
+
|
218
|
+
namespace {
|
219
|
+
|
220
|
+
/* Same re-ordering as pq4_pack_LUT for a single query block, except that
 * the qi-th output query is read from source query q_map[qi].
 *
 * src is indexed as (q, sq, 16); dest is indexed as (sq / 2, qi, 32).
 */
void pack_LUT_1_q_map(
        int nq, const int *q_map,
        int nsq,
        const uint8_t *src,
        uint8_t *dest)
{
    for (int sq = 0; sq < nsq; sq += 2) {
        for (int qi = 0; qi < nq; qi++) {
            const int q = q_map[qi];
            // tables sq and sq + 1 of query q are adjacent in src and stay
            // adjacent in dest, so the pair moves as a single 32-byte run
            memcpy(
                dest + (sq / 2 * nq + qi) * 32,
                src + (q * nsq + sq) * 16,
                32
            );
        }
    }
}
|
244
|
+
|
245
|
+
} // anonymous namespace
|
246
|
+
|
247
|
+
int pq4_pack_LUT_qbs_q_map(
|
248
|
+
int qbs, int nsq,
|
249
|
+
const uint8_t *src,
|
250
|
+
const int * q_map,
|
251
|
+
uint8_t *dest)
|
252
|
+
{
|
253
|
+
FAISS_THROW_IF_NOT(nsq % 2 == 0);
|
254
|
+
size_t dim12 = 16 * nsq;
|
255
|
+
int i0 = 0;
|
256
|
+
int qi = qbs;
|
257
|
+
while(qi) {
|
258
|
+
int nq = qi & 15;
|
259
|
+
qi >>= 4;
|
260
|
+
pack_LUT_1_q_map(
|
261
|
+
nq, q_map + i0, nsq,
|
262
|
+
src,
|
263
|
+
dest + i0 * dim12
|
264
|
+
);
|
265
|
+
i0 += nq;
|
266
|
+
}
|
267
|
+
return i0;
|
268
|
+
}
|
269
|
+
|
270
|
+
|
271
|
+
|
272
|
+
} // namespace faiss
|
@@ -0,0 +1,169 @@
|
|
1
|
+
/**
|
2
|
+
* Copyright (c) Facebook, Inc. and its affiliates.
|
3
|
+
*
|
4
|
+
* This source code is licensed under the MIT license found in the
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
6
|
+
*/
|
7
|
+
|
8
|
+
#pragma once
|
9
|
+
|
10
|
+
#include <cstdint>
|
11
|
+
#include <cstdlib>
|
12
|
+
|
13
|
+
/** PQ4 SIMD packing and accumulation functions
|
14
|
+
*
|
15
|
+
* The basic kernel accumulates nq query vectors with bbs = nb * 2 * 16 vectors
|
16
|
+
* and produces an output matrix for that. It is interesting for nq * nb <= 4,
|
17
|
+
* otherwise register spilling becomes too large.
|
18
|
+
*
|
19
|
+
* The implementation of these functions is spread over 3 cpp files to reduce
|
20
|
+
* parallel compile times. Templates are instantiated explicitly.
|
21
|
+
*/
|
22
|
+
|
23
|
+
|
24
|
+
namespace faiss {
|
25
|
+
|
26
|
+
|
27
|
+
/** Pack codes for consumption by the SIMD kernels.
|
28
|
+
* The unused bytes are set to 0.
|
29
|
+
*
|
30
|
+
* @param codes input codes, size (ntotal, ceil(M / 2))
|
31
|
+
* @param ntotal number of input codes
|
32
|
+
* @param nb output number of codes (ntotal rounded up to a multiple of
|
33
|
+
* bbs)
|
34
|
+
* @param M2 number of sub-quantizers (=M rounded up to a multiple of 2)
|
35
|
+
* @param bbs size of database blocks (multiple of 32)
|
36
|
+
* @param blocks output array, size nb * nsq / 2.
|
37
|
+
*/
|
38
|
+
void pq4_pack_codes(
|
39
|
+
const uint8_t *codes,
|
40
|
+
size_t ntotal, size_t M,
|
41
|
+
size_t nb, size_t bbs, size_t M2,
|
42
|
+
uint8_t * blocks
|
43
|
+
);
|
44
|
+
|
45
|
+
/** Same as pack_codes but write in a given range of the output,
|
46
|
+
* leaving the rest untouched. Assumes allocated entries are 0 on input.
|
47
|
+
*
|
48
|
+
* @param codes input codes, size (i1 - i0, ceil(M / 2))
|
49
|
+
* @param i0 first output code to write
|
50
|
+
* @param i1 last output code to write
|
51
|
+
* @param blocks output array, size at least ceil(i1 / bbs) * bbs * nsq / 2
|
52
|
+
*/
|
53
|
+
void pq4_pack_codes_range(
|
54
|
+
const uint8_t *codes,
|
55
|
+
size_t M,
|
56
|
+
size_t i0, size_t i1,
|
57
|
+
size_t bbs, size_t M2,
|
58
|
+
uint8_t * blocks
|
59
|
+
);
|
60
|
+
|
61
|
+
/** get a single element from a packed codes table
|
62
|
+
*
|
63
|
+
* @param i vector id
|
64
|
+
* @param sq subquantizer (< nsq)
|
65
|
+
*/
|
66
|
+
uint8_t pq4_get_packed_element(
|
67
|
+
const uint8_t *data, size_t bbs, size_t nsq,
|
68
|
+
size_t i, size_t sq
|
69
|
+
);
|
70
|
+
|
71
|
+
/** Pack Look-up table for consumption by the kernel.
|
72
|
+
*
|
73
|
+
* @param nq number of queries
|
74
|
+
* @param nsq number of sub-quantizers (multiple of 2)
|
75
|
+
* @param src input array, size (nq, nsq, 16)
|
76
|
+
* @param dest output array, size (nsq / 2, nq, 32)
|
77
|
+
*/
|
78
|
+
void pq4_pack_LUT(
|
79
|
+
int nq, int nsq,
|
80
|
+
const uint8_t *src,
|
81
|
+
uint8_t *dest
|
82
|
+
);
|
83
|
+
|
84
|
+
|
85
|
+
|
86
|
+
/** Loop over database elements and accumulate results into result handler
|
87
|
+
*
|
88
|
+
* @param nq number of queries
|
89
|
+
* @param nb number of database elements
|
90
|
+
* @param bbs size of database blocks (multiple of 32)
|
91
|
+
* @param nsq number of sub-quantizers (multiple of 2)
|
92
|
+
* @param codes packed codes array
|
93
|
+
* @param LUT packed look-up table
|
94
|
+
*/
|
95
|
+
template<class ResultHandler>
|
96
|
+
void pq4_accumulate_loop(
|
97
|
+
int nq,
|
98
|
+
size_t nb, int bbs,
|
99
|
+
int nsq,
|
100
|
+
const uint8_t *codes,
|
101
|
+
const uint8_t *LUT,
|
102
|
+
ResultHandler & res);
|
103
|
+
|
104
|
+
|
105
|
+
|
106
|
+
/* qbs versions, supported only for bbs=32.
|
107
|
+
*
|
108
|
+
* The kernel function runs the kernel for *several* query blocks
|
109
|
+
* and bbs database vectors. The sizes of the blocks are encoded in qbs as
|
110
|
+
* base-16 digits.
|
111
|
+
*
|
112
|
+
* For example, qbs = 0x1223 means that the kernel will be run 4 times, the
|
113
|
+
* first time with 3 query vectors, second time with 2 query vectors, then 2
|
114
|
+
* vectors again and finally with 1 query vector. The output block will thus be
|
115
|
+
* nq = 3 + 2 + 2 + 1 = 8 queries. For a given total block size, the optimal
|
116
|
+
* decomposition into sub-blocks (measured empirically) is given by
|
117
|
+
* preferred_qbs().
|
118
|
+
*/
|
119
|
+
|
120
|
+
|
121
|
+
/* compute the number of queries from a base-16 decomposition */
|
122
|
+
int pq4_qbs_to_nq(int qbs);
|
123
|
+
|
124
|
+
/** return the preferred decomposition in blocks for a nb of queries. */
|
125
|
+
int pq4_preferred_qbs(int nq);
|
126
|
+
|
127
|
+
/** Pack Look-up table for consumption by the kernel.
|
128
|
+
*
|
129
|
+
* @param qbs 4-bit encoded number of query blocks, the total number of
|
130
|
+
* queries handled (nq) is deduced from it
|
131
|
+
* @param nsq number of sub-quantizers (multiple of 2)
|
132
|
+
* @param src input array, size (nq, 16)
|
133
|
+
* @param dest output array, size (nq, 16)
|
134
|
+
* @return nq
|
135
|
+
*/
|
136
|
+
int pq4_pack_LUT_qbs(
|
137
|
+
int fqbs, int nsq,
|
138
|
+
const uint8_t *src,
|
139
|
+
uint8_t *dest
|
140
|
+
);
|
141
|
+
|
142
|
+
/** Same as pq4_pack_LUT_qbs, except the source vectors are remapped with q_map */
|
143
|
+
int pq4_pack_LUT_qbs_q_map(
|
144
|
+
int qbs, int nsq,
|
145
|
+
const uint8_t *src,
|
146
|
+
const int * q_map,
|
147
|
+
uint8_t *dest);
|
148
|
+
|
149
|
+
/** Run accumulation loop.
|
150
|
+
*
|
151
|
+
* @param qbs 4-bit encoded number of queries
|
152
|
+
* @param nb number of database codes (multiple of bbs)
|
153
|
+
* @param nsq number of sub-quantizers
|
154
|
+
* @param codes encoded database vectors (packed)
|
155
|
+
* @param LUT look-up table (packed)
|
156
|
+
* @param res call-back for the results
|
157
|
+
*/
|
158
|
+
template<class ResultHandler>
|
159
|
+
void pq4_accumulate_loop_qbs(
|
160
|
+
int qbs,
|
161
|
+
size_t nb,
|
162
|
+
int nsq,
|
163
|
+
const uint8_t *codes,
|
164
|
+
const uint8_t *LUT,
|
165
|
+
ResultHandler & res);
|
166
|
+
|
167
|
+
|
168
|
+
|
169
|
+
} // namespace faiss
|