faiss 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/LICENSE.txt +1 -1
- data/README.md +1 -1
- data/ext/faiss/extconf.rb +1 -1
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +36 -33
- data/vendor/faiss/faiss/AutoTune.h +6 -3
- data/vendor/faiss/faiss/Clustering.cpp +16 -12
- data/vendor/faiss/faiss/Index.cpp +3 -4
- data/vendor/faiss/faiss/Index.h +3 -3
- data/vendor/faiss/faiss/IndexBinary.cpp +3 -4
- data/vendor/faiss/faiss/IndexBinary.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +2 -12
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +1 -2
- data/vendor/faiss/faiss/IndexFlat.cpp +0 -148
- data/vendor/faiss/faiss/IndexFlat.h +0 -51
- data/vendor/faiss/faiss/IndexHNSW.cpp +4 -5
- data/vendor/faiss/faiss/IndexIVF.cpp +118 -31
- data/vendor/faiss/faiss/IndexIVF.h +22 -15
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -3
- data/vendor/faiss/faiss/IndexIVFFlat.h +2 -1
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +39 -15
- data/vendor/faiss/faiss/IndexIVFPQ.h +25 -9
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +1116 -0
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +166 -0
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +8 -9
- data/vendor/faiss/faiss/IndexIVFPQR.h +2 -1
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +1 -2
- data/vendor/faiss/faiss/IndexPQ.cpp +34 -18
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +536 -0
- data/vendor/faiss/faiss/IndexPQFastScan.h +111 -0
- data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -0
- data/vendor/faiss/faiss/IndexPreTransform.h +2 -0
- data/vendor/faiss/faiss/IndexRefine.cpp +256 -0
- data/vendor/faiss/faiss/IndexRefine.h +73 -0
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +2 -2
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +1 -1
- data/vendor/faiss/faiss/gpu/GpuDistance.h +1 -1
- data/vendor/faiss/faiss/gpu/GpuIndex.h +16 -9
- data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +8 -1
- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +11 -11
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +19 -2
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +28 -2
- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +24 -14
- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +29 -2
- data/vendor/faiss/faiss/gpu/GpuResources.h +4 -0
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +60 -27
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +28 -6
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +547 -0
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +51 -0
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +3 -3
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +3 -2
- data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +274 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +7 -2
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +5 -1
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +231 -0
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +33 -0
- data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +1 -0
- data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +6 -0
- data/vendor/faiss/faiss/gpu/utils/Timer.cpp +5 -6
- data/vendor/faiss/faiss/gpu/utils/Timer.h +2 -2
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +5 -4
- data/vendor/faiss/faiss/impl/HNSW.cpp +2 -4
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +4 -4
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +22 -12
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +2 -0
- data/vendor/faiss/faiss/impl/ResultHandler.h +452 -0
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +29 -19
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +6 -0
- data/vendor/faiss/faiss/impl/index_read.cpp +64 -96
- data/vendor/faiss/faiss/impl/index_write.cpp +34 -25
- data/vendor/faiss/faiss/impl/io.cpp +33 -2
- data/vendor/faiss/faiss/impl/io.h +7 -2
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -15
- data/vendor/faiss/faiss/impl/platform_macros.h +44 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +272 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +169 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +180 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +354 -0
- data/vendor/faiss/faiss/impl/simd_result_handlers.h +559 -0
- data/vendor/faiss/faiss/index_factory.cpp +112 -7
- data/vendor/faiss/faiss/index_io.h +1 -48
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +151 -0
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +76 -0
- data/vendor/faiss/faiss/{DirectMap.cpp → invlists/DirectMap.cpp} +1 -1
- data/vendor/faiss/faiss/{DirectMap.h → invlists/DirectMap.h} +1 -1
- data/vendor/faiss/faiss/{InvertedLists.cpp → invlists/InvertedLists.cpp} +72 -1
- data/vendor/faiss/faiss/{InvertedLists.h → invlists/InvertedLists.h} +32 -1
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +107 -0
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +63 -0
- data/vendor/faiss/faiss/{OnDiskInvertedLists.cpp → invlists/OnDiskInvertedLists.cpp} +21 -6
- data/vendor/faiss/faiss/{OnDiskInvertedLists.h → invlists/OnDiskInvertedLists.h} +5 -2
- data/vendor/faiss/faiss/python/python_callbacks.h +8 -1
- data/vendor/faiss/faiss/utils/AlignedTable.h +141 -0
- data/vendor/faiss/faiss/utils/Heap.cpp +2 -4
- data/vendor/faiss/faiss/utils/Heap.h +61 -50
- data/vendor/faiss/faiss/utils/distances.cpp +164 -319
- data/vendor/faiss/faiss/utils/distances.h +28 -20
- data/vendor/faiss/faiss/utils/distances_simd.cpp +277 -49
- data/vendor/faiss/faiss/utils/extra_distances.cpp +1 -2
- data/vendor/faiss/faiss/utils/hamming-inl.h +4 -4
- data/vendor/faiss/faiss/utils/hamming.cpp +3 -6
- data/vendor/faiss/faiss/utils/hamming.h +2 -7
- data/vendor/faiss/faiss/utils/ordered_key_value.h +98 -0
- data/vendor/faiss/faiss/utils/partitioning.cpp +1256 -0
- data/vendor/faiss/faiss/utils/partitioning.h +69 -0
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +277 -0
- data/vendor/faiss/faiss/utils/quantize_lut.h +80 -0
- data/vendor/faiss/faiss/utils/simdlib.h +31 -0
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +461 -0
- data/vendor/faiss/faiss/utils/simdlib_emulated.h +589 -0
- metadata +43 -141
- data/vendor/faiss/benchs/bench_6bit_codec.cpp +0 -80
- data/vendor/faiss/c_api/AutoTune_c.cpp +0 -83
- data/vendor/faiss/c_api/AutoTune_c.h +0 -66
- data/vendor/faiss/c_api/Clustering_c.cpp +0 -145
- data/vendor/faiss/c_api/Clustering_c.h +0 -123
- data/vendor/faiss/c_api/IndexFlat_c.cpp +0 -140
- data/vendor/faiss/c_api/IndexFlat_c.h +0 -115
- data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +0 -64
- data/vendor/faiss/c_api/IndexIVFFlat_c.h +0 -58
- data/vendor/faiss/c_api/IndexIVF_c.cpp +0 -99
- data/vendor/faiss/c_api/IndexIVF_c.h +0 -142
- data/vendor/faiss/c_api/IndexLSH_c.cpp +0 -37
- data/vendor/faiss/c_api/IndexLSH_c.h +0 -40
- data/vendor/faiss/c_api/IndexPreTransform_c.cpp +0 -21
- data/vendor/faiss/c_api/IndexPreTransform_c.h +0 -32
- data/vendor/faiss/c_api/IndexShards_c.cpp +0 -38
- data/vendor/faiss/c_api/IndexShards_c.h +0 -39
- data/vendor/faiss/c_api/Index_c.cpp +0 -105
- data/vendor/faiss/c_api/Index_c.h +0 -183
- data/vendor/faiss/c_api/MetaIndexes_c.cpp +0 -49
- data/vendor/faiss/c_api/MetaIndexes_c.h +0 -49
- data/vendor/faiss/c_api/clone_index_c.cpp +0 -23
- data/vendor/faiss/c_api/clone_index_c.h +0 -32
- data/vendor/faiss/c_api/error_c.h +0 -42
- data/vendor/faiss/c_api/error_impl.cpp +0 -27
- data/vendor/faiss/c_api/error_impl.h +0 -16
- data/vendor/faiss/c_api/faiss_c.h +0 -58
- data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +0 -98
- data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +0 -56
- data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +0 -52
- data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +0 -68
- data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +0 -17
- data/vendor/faiss/c_api/gpu/GpuIndex_c.h +0 -30
- data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +0 -38
- data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +0 -86
- data/vendor/faiss/c_api/gpu/GpuResources_c.h +0 -66
- data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +0 -54
- data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +0 -53
- data/vendor/faiss/c_api/gpu/macros_impl.h +0 -42
- data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +0 -220
- data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +0 -149
- data/vendor/faiss/c_api/index_factory_c.cpp +0 -26
- data/vendor/faiss/c_api/index_factory_c.h +0 -30
- data/vendor/faiss/c_api/index_io_c.cpp +0 -42
- data/vendor/faiss/c_api/index_io_c.h +0 -50
- data/vendor/faiss/c_api/macros_impl.h +0 -110
- data/vendor/faiss/demos/demo_imi_flat.cpp +0 -154
- data/vendor/faiss/demos/demo_imi_pq.cpp +0 -203
- data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +0 -151
- data/vendor/faiss/demos/demo_sift1M.cpp +0 -252
- data/vendor/faiss/demos/demo_weighted_kmeans.cpp +0 -185
- data/vendor/faiss/misc/test_blas.cpp +0 -87
- data/vendor/faiss/tests/test_binary_flat.cpp +0 -62
- data/vendor/faiss/tests/test_dealloc_invlists.cpp +0 -188
- data/vendor/faiss/tests/test_ivfpq_codec.cpp +0 -70
- data/vendor/faiss/tests/test_ivfpq_indexing.cpp +0 -100
- data/vendor/faiss/tests/test_lowlevel_ivf.cpp +0 -573
- data/vendor/faiss/tests/test_merge.cpp +0 -260
- data/vendor/faiss/tests/test_omp_threads.cpp +0 -14
- data/vendor/faiss/tests/test_ondisk_ivf.cpp +0 -225
- data/vendor/faiss/tests/test_pairs_decoding.cpp +0 -193
- data/vendor/faiss/tests/test_params_override.cpp +0 -236
- data/vendor/faiss/tests/test_pq_encoding.cpp +0 -98
- data/vendor/faiss/tests/test_sliding_ivf.cpp +0 -246
- data/vendor/faiss/tests/test_threaded_index.cpp +0 -253
- data/vendor/faiss/tests/test_transfer_invlists.cpp +0 -159
- data/vendor/faiss/tutorial/cpp/1-Flat.cpp +0 -104
- data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +0 -85
- data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +0 -98
- data/vendor/faiss/tutorial/cpp/4-GPU.cpp +0 -122
- data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +0 -104
@@ -242,16 +242,47 @@ BufferedIOWriter::~BufferedIOWriter()
|
|
242
242
|
|
243
243
|
|
244
244
|
/// Pack a 4-character, NUL-terminated tag into a uint32_t.
///
/// Character i of the tag is stored in bits [8 * i, 8 * i + 8) of the result,
/// independently of the host byte order.
///
/// @param sx  tag; the FAISS_THROW_IF_NOT check rejects any string whose
///            strlen() is not exactly 4
/// @return    the packed 32-bit code
uint32_t fourcc (const char sx[4]) {
    FAISS_THROW_IF_NOT (4 == strlen(sx));
    const unsigned char *x = reinterpret_cast<const unsigned char*>(sx);
    // Widen each byte to uint32_t *before* shifting: shifting the promoted
    // (signed) int left by 24 would move a byte >= 0x80 into the sign bit.
    return static_cast<uint32_t>(x[0])
         | static_cast<uint32_t>(x[1]) << 8
         | static_cast<uint32_t>(x[2]) << 16
         | static_cast<uint32_t>(x[3]) << 24;
}
|
249
249
|
|
250
250
|
/// Pack a 4-character tag held in a std::string into a uint32_t.
///
/// Character i of the tag is stored in bits [8 * i, 8 * i + 8) of the result,
/// independently of the host byte order.
///
/// @param sx  tag; the FAISS_THROW_IF_NOT check rejects any string whose
///            length() is not exactly 4
/// @return    the packed 32-bit code
uint32_t fourcc (const std::string & sx) {
    FAISS_THROW_IF_NOT (sx.length() == 4);
    const unsigned char *x =
            reinterpret_cast<const unsigned char*>(sx.c_str());
    // Widen each byte to uint32_t *before* shifting: shifting the promoted
    // (signed) int left by 24 would move a byte >= 0x80 into the sign bit.
    return static_cast<uint32_t>(x[0])
         | static_cast<uint32_t>(x[1]) << 8
         | static_cast<uint32_t>(x[2]) << 16
         | static_cast<uint32_t>(x[3]) << 24;
}
|
255
255
|
|
256
|
+
/// Decode a fourcc code into a NUL-terminated 4-character string.
///
/// Bits [8 * i, 8 * i + 8) of x become str[i], which is the exact inverse of
/// the packing done by fourcc() whatever the host byte order is.
///
/// @param x    packed 32-bit code
/// @param str  output buffer of at least 5 bytes; str[0..3] receive the
///             characters and str[4] the terminating NUL
void fourcc_inv(uint32_t x, char str[5]) {
    for (int i = 0; i < 4; i++) {
        str[i] = static_cast<char>((x >> (8 * i)) & 0xff);
    }
    // the terminator belongs at index 4: index 5 is outside a 5-byte buffer
    str[4] = 0;
}
|
260
|
+
|
261
|
+
/// Decode a fourcc code into a std::string.
///
/// Bits [8 * i, 8 * i + 8) of x give character i. The string is built from a
/// NUL-terminated buffer, so it stops at the first zero byte of the code.
///
/// @param x  packed 32-bit code
/// @return   the decoded tag (at most 4 characters)
std::string fourcc_inv(uint32_t x) {
    // Decode locally with an explicit terminator so the result never depends
    // on an unset or out-of-bounds terminating byte.
    const char str[5] = {
        static_cast<char>(x & 0xff),
        static_cast<char>((x >> 8) & 0xff),
        static_cast<char>((x >> 16) & 0xff),
        static_cast<char>((x >> 24) & 0xff),
        0
    };
    return std::string(str);
}
|
266
|
+
|
267
|
+
|
268
|
+
/// Decode a fourcc code into a string that is safe to print.
///
/// The 4 bytes are taken from the least significant to the most significant.
/// Bytes in [32, 127) are appended as-is; any other byte is appended as a
/// 4-character escape of the form \xNN (two lowercase hex digits).
///
/// @param x  packed 32-bit code
/// @return   printable representation, between 4 and 16 characters long
std::string fourcc_inv_printable(uint32_t x) {
    std::string str;
    for (int i = 0; i < 4; i++) {
        const uint8_t c = static_cast<uint8_t>((x >> (8 * i)) & 0xff);
        if (32 <= c && c < 127) {
            str += static_cast<char>(c);
        } else {
            char buf[10];
            // bounded formatting: "\xNN" plus terminator needs 5 bytes
            snprintf(buf, sizeof(buf), "\\x%02x", c);
            str += buf;
        }
    }
    return str;
}
|
284
|
+
|
285
|
+
|
286
|
+
|
256
287
|
|
257
288
|
} // namespace faiss
|
@@ -50,7 +50,7 @@ struct IOWriter {
|
|
50
50
|
// return a file number that can be memory-mapped
|
51
51
|
virtual int fileno ();
|
52
52
|
|
53
|
-
virtual ~IOWriter() {}
|
53
|
+
virtual ~IOWriter() noexcept(false) {}
|
54
54
|
};
|
55
55
|
|
56
56
|
|
@@ -139,12 +139,17 @@ struct BufferedIOWriter: IOWriter {
|
|
139
139
|
size_t operator()(const void *ptr, size_t size, size_t nitems) override;
|
140
140
|
|
141
141
|
// flushes
|
142
|
-
~BufferedIOWriter();
|
142
|
+
~BufferedIOWriter() override;
|
143
143
|
};
|
144
144
|
|
145
145
|
/// cast a 4-character string to a uint32_t that can be written and read easily
|
146
146
|
uint32_t fourcc (const char sx[4]);
|
147
147
|
uint32_t fourcc (const std::string & sx);
|
148
148
|
|
149
|
+
// decoding of fourcc (int32 -> string)
|
150
|
+
void fourcc_inv(uint32_t x, char str[5]);
|
151
|
+
std::string fourcc_inv(uint32_t x);
|
152
|
+
std::string fourcc_inv_printable(uint32_t x);
|
153
|
+
|
149
154
|
|
150
155
|
} // namespace faiss
|
@@ -20,22 +20,8 @@
|
|
20
20
|
#include <algorithm>
|
21
21
|
|
22
22
|
#include <faiss/utils/distances.h>
|
23
|
+
#include <faiss/impl/platform_macros.h>
|
23
24
|
|
24
|
-
#ifdef _MSC_VER
|
25
|
-
|
26
|
-
#include <intrin.h>
|
27
|
-
|
28
|
-
static inline int __builtin_ctzll(uint64_t x) {
|
29
|
-
unsigned long ret;
|
30
|
-
_BitScanForward64(&ret, x);
|
31
|
-
return (int)ret;
|
32
|
-
}
|
33
|
-
|
34
|
-
static inline int __builtin_clzll(uint64_t x) {
|
35
|
-
return (int)__lzcnt64(x);
|
36
|
-
}
|
37
|
-
|
38
|
-
#endif // _MSC_VER
|
39
25
|
|
40
26
|
namespace faiss {
|
41
27
|
|
@@ -7,8 +7,14 @@
|
|
7
7
|
|
8
8
|
#pragma once
|
9
9
|
|
10
|
+
|
10
11
|
#ifdef _MSC_VER
|
11
12
|
|
13
|
+
/*******************************************************
|
14
|
+
* Windows specific macros
|
15
|
+
*******************************************************/
|
16
|
+
|
17
|
+
|
12
18
|
#ifdef FAISS_MAIN_LIB
|
13
19
|
#define FAISS_API __declspec(dllexport)
|
14
20
|
#else // _FAISS_MAIN_LIB
|
@@ -17,8 +23,46 @@
|
|
17
23
|
|
18
24
|
#define __PRETTY_FUNCTION__ __FUNCSIG__
|
19
25
|
|
26
|
+
#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno)
|
27
|
+
#define posix_memalign_free _aligned_free
|
28
|
+
|
29
|
+
// aligned should be in front of the declaration
|
30
|
+
#define ALIGNED(x) __declspec(align(x))
|
31
|
+
|
32
|
+
// redefine the GCC intrinsics with Windows equivalents
|
33
|
+
|
34
|
+
#include <intrin.h>
|
35
|
+
|
36
|
+
/// MSVC replacement for the GCC intrinsic of the same name: returns the bit
/// index that _BitScanForward64 reports for x.
inline int __builtin_ctzll(uint64_t x) {
    unsigned long bit_index;
    _BitScanForward64(&bit_index, x);
    return static_cast<int>(bit_index);
}
|
41
|
+
|
42
|
+
/// MSVC replacement for the GCC intrinsic of the same name: returns the bit
/// index that _BitScanForward reports for x.
inline int __builtin_ctz(unsigned long x) {
    unsigned long bit_index;
    _BitScanForward(&bit_index, x);
    return static_cast<int>(bit_index);
}
|
47
|
+
|
48
|
+
inline int __builtin_clzll(uint64_t x) {
|
49
|
+
return (int)__lzcnt64(x);
|
50
|
+
}
|
51
|
+
|
52
|
+
#define __builtin_popcountl __popcnt64
|
53
|
+
|
20
54
|
#else
|
55
|
+
/*******************************************************
|
56
|
+
* Linux and OSX
|
57
|
+
*******************************************************/
|
21
58
|
|
22
59
|
#define FAISS_API
|
60
|
+
#define posix_memalign_free free
|
61
|
+
|
62
|
+
// aligned should be *in front* of the declaration, for compatibility with windows
|
63
|
+
#define ALIGNED(x) __attribute__ ((aligned(x)))
|
23
64
|
|
24
65
|
#endif // _MSC_VER
|
66
|
+
|
67
|
+
|
68
|
+
|
@@ -0,0 +1,272 @@
|
|
1
|
+
/**
|
2
|
+
* Copyright (c) Facebook, Inc. and its affiliates.
|
3
|
+
*
|
4
|
+
* This source code is licensed under the MIT license found in the
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
6
|
+
*/
|
7
|
+
|
8
|
+
#include <faiss/impl/pq4_fast_scan.h>
|
9
|
+
#include <faiss/impl/FaissAssert.h>
|
10
|
+
#include <faiss/impl/simd_result_handlers.h>
|
11
|
+
|
12
|
+
#include <array>
|
13
|
+
|
14
|
+
|
15
|
+
namespace faiss {
|
16
|
+
|
17
|
+
|
18
|
+
using namespace simd_result_handlers;
|
19
|
+
|
20
|
+
|
21
|
+
|
22
|
+
/***************************************************************
|
23
|
+
* Packing functions for codes
|
24
|
+
***************************************************************/
|
25
|
+
|
26
|
+
|
27
|
+
|
28
|
+
namespace {
|
29
|
+
|
30
|
+
/* extract the column starting at (i, j)
|
31
|
+
* from packed matrix src of size (m, n)*/
|
32
|
+
/* Copy dest.size() consecutive entries of column j of the row-major
 * (m, n) matrix src into dest, starting at row i.
 * Rows that fall outside [0, m) are filled with 0, so i may be negative
 * and the window may run past the last row. */
template<typename T, class TA>
void get_matrix_column(
        T * src,
        size_t m, size_t n,
        int64_t i, int64_t j,
        TA & dest) {
    int64_t row = i;
    for (int64_t k = 0; k < dest.size(); ++k, ++row) {
        // the sign test comes first so the comparison against the unsigned
        // m is only ever evaluated for non-negative rows
        if (row < 0 || !(row < m)) {
            dest[k] = 0;
            continue;
        }
        dest[k] = src[row * n + j];
    }
}
|
46
|
+
|
47
|
+
} // anonymous namespace
|
48
|
+
|
49
|
+
|
50
|
+
void pq4_pack_codes(
|
51
|
+
const uint8_t *codes,
|
52
|
+
size_t ntotal, size_t M,
|
53
|
+
size_t nb, size_t bbs, size_t nsq,
|
54
|
+
uint8_t *blocks
|
55
|
+
)
|
56
|
+
{
|
57
|
+
FAISS_THROW_IF_NOT(bbs % 32 == 0);
|
58
|
+
FAISS_THROW_IF_NOT(nb % bbs == 0);
|
59
|
+
FAISS_THROW_IF_NOT(nsq % 2 == 0);
|
60
|
+
|
61
|
+
memset(blocks, 0, nb * nsq / 2);
|
62
|
+
const uint8_t perm0[16] =
|
63
|
+
{0, 8, 1, 9, 2, 10, 3, 11,
|
64
|
+
4, 12, 5, 13, 6, 14, 7, 15};
|
65
|
+
|
66
|
+
uint8_t *codes2 = blocks;
|
67
|
+
for(size_t i0 = 0; i0 < nb; i0 += bbs) {
|
68
|
+
for(int sq = 0; sq < nsq; sq += 2) {
|
69
|
+
for(size_t i = 0; i < bbs; i += 32) {
|
70
|
+
std::array<uint8_t, 32> c, c0, c1;
|
71
|
+
get_matrix_column(
|
72
|
+
codes, ntotal,
|
73
|
+
(M + 1) / 2,
|
74
|
+
i0 + i, sq / 2, c
|
75
|
+
);
|
76
|
+
for(int j = 0; j < 32; j++) {
|
77
|
+
c0[j] = c[j] & 15;
|
78
|
+
c1[j] = c[j] >> 4;
|
79
|
+
}
|
80
|
+
for(int j = 0; j < 16; j++) {
|
81
|
+
uint8_t d0, d1;
|
82
|
+
d0 = c0[perm0[j]] | (c0[perm0[j] + 16] << 4);
|
83
|
+
d1 = c1[perm0[j]] | (c1[perm0[j] + 16] << 4);
|
84
|
+
codes2[j] = d0;
|
85
|
+
codes2[j + 16] = d1;
|
86
|
+
}
|
87
|
+
codes2 += 32;
|
88
|
+
}
|
89
|
+
}
|
90
|
+
}
|
91
|
+
}
|
92
|
+
|
93
|
+
void pq4_pack_codes_range(
|
94
|
+
const uint8_t *codes,
|
95
|
+
size_t M,
|
96
|
+
size_t i0, size_t i1,
|
97
|
+
size_t bbs, size_t M2,
|
98
|
+
uint8_t * blocks
|
99
|
+
) {
|
100
|
+
const uint8_t perm0[16] =
|
101
|
+
{0, 8, 1, 9, 2, 10, 3, 11,
|
102
|
+
4, 12, 5, 13, 6, 14, 7, 15};
|
103
|
+
|
104
|
+
// range of affected blocks
|
105
|
+
size_t block0 = i0 / bbs;
|
106
|
+
size_t block1 = ((i1 - 1) / bbs) + 1;
|
107
|
+
|
108
|
+
for (size_t b = block0; b < block1; b++) {
|
109
|
+
uint8_t *codes2 = blocks + b * bbs * M2 / 2;
|
110
|
+
int64_t i_base = b * bbs - i0;
|
111
|
+
for(int sq = 0; sq < M2; sq += 2) {
|
112
|
+
for(size_t i = 0; i < bbs; i += 32) {
|
113
|
+
std::array<uint8_t, 32> c, c0, c1;
|
114
|
+
get_matrix_column(
|
115
|
+
codes, i1 - i0,
|
116
|
+
(M + 1) / 2,
|
117
|
+
i_base + i, sq / 2, c
|
118
|
+
);
|
119
|
+
for(int j = 0; j < 32; j++) {
|
120
|
+
c0[j] = c[j] & 15;
|
121
|
+
c1[j] = c[j] >> 4;
|
122
|
+
}
|
123
|
+
for(int j = 0; j < 16; j++) {
|
124
|
+
uint8_t d0, d1;
|
125
|
+
d0 = c0[perm0[j]] | (c0[perm0[j] + 16] << 4);
|
126
|
+
d1 = c1[perm0[j]] | (c1[perm0[j] + 16] << 4);
|
127
|
+
codes2[j] |= d0;
|
128
|
+
codes2[j + 16] |= d1;
|
129
|
+
}
|
130
|
+
codes2 += 32;
|
131
|
+
}
|
132
|
+
}
|
133
|
+
}
|
134
|
+
|
135
|
+
}
|
136
|
+
|
137
|
+
|
138
|
+
uint8_t pq4_get_packed_element(
|
139
|
+
const uint8_t *data, size_t bbs, size_t nsq,
|
140
|
+
size_t i, size_t sq
|
141
|
+
) {
|
142
|
+
// move to correct bbs-sized block
|
143
|
+
data += (i / bbs * (nsq / 2) + sq / 2) * bbs;
|
144
|
+
sq = sq & 1;
|
145
|
+
i = i % bbs;
|
146
|
+
|
147
|
+
// another step
|
148
|
+
data += (i / 32) * 32;
|
149
|
+
i = i % 32;
|
150
|
+
|
151
|
+
if (sq == 1) {
|
152
|
+
data += 16;
|
153
|
+
}
|
154
|
+
const uint8_t iperm0[16] =
|
155
|
+
{0, 2, 4, 6, 8, 10, 12, 14,
|
156
|
+
1, 3, 5, 7, 9, 11, 13, 15};
|
157
|
+
if (i < 16) {
|
158
|
+
return data[iperm0[i]] & 15;
|
159
|
+
} else {
|
160
|
+
return data[iperm0[i - 16]] >> 4;
|
161
|
+
}
|
162
|
+
|
163
|
+
}
|
164
|
+
|
165
|
+
/***************************************************************
|
166
|
+
* Packing functions for Look-Up Tables (LUT)
|
167
|
+
***************************************************************/
|
168
|
+
|
169
|
+
|
170
|
+
|
171
|
+
|
172
|
+
/* Interleave the look-up tables of nq queries by pairs of sub-quantizers.
 *
 * src holds 16 bytes per (query, sub-quantizer), query-major. For every
 * query q and even sub-quantizer sq, the tables of sq and sq + 1 are
 * written as one 32-byte entry at index (sq / 2 * nq + q) of dest. */
void pq4_pack_LUT(
        int nq, int nsq,
        const uint8_t *src,
        uint8_t *dest)
{
    for (int q = 0; q < nq; q++) {
        for (int sq = 0; sq < nsq; sq += 2) {
            const uint8_t *table_pair = src + (q * nsq + sq) * 16;
            uint8_t *entry = dest + (sq / 2 * nq + q) * 32;
            // table of sub-quantizer sq, then table of sq + 1
            memcpy(entry, table_pair, 16);
            memcpy(entry + 16, table_pair + 16, 16);
        }
    }
}
|
193
|
+
|
194
|
+
|
195
|
+
int pq4_pack_LUT_qbs(
|
196
|
+
int qbs, int nsq,
|
197
|
+
const uint8_t *src,
|
198
|
+
uint8_t *dest)
|
199
|
+
{
|
200
|
+
FAISS_THROW_IF_NOT(nsq % 2 == 0);
|
201
|
+
size_t dim12 = 16 * nsq;
|
202
|
+
int i0 = 0;
|
203
|
+
int qi = qbs;
|
204
|
+
while(qi) {
|
205
|
+
int nq = qi & 15;
|
206
|
+
qi >>= 4;
|
207
|
+
pq4_pack_LUT(
|
208
|
+
nq, nsq,
|
209
|
+
src + i0 * dim12,
|
210
|
+
dest + i0 * dim12
|
211
|
+
);
|
212
|
+
i0 += nq;
|
213
|
+
}
|
214
|
+
return i0;
|
215
|
+
}
|
216
|
+
|
217
|
+
|
218
|
+
namespace {
|
219
|
+
|
220
|
+
/* Same interleaving as pq4_pack_LUT for one block of nq queries, except
 * that the tables of output query qi are read from source query q_map[qi].
 *
 * src holds 16 bytes per (query, sub-quantizer), query-major; for every
 * even sub-quantizer sq the tables of sq and sq + 1 form the 32-byte entry
 * at index (sq / 2 * nq + qi) of dest. */
void pack_LUT_1_q_map(
        int nq, const int *q_map,
        int nsq,
        const uint8_t *src,
        uint8_t *dest)
{
    for (int qi = 0; qi < nq; qi++) {
        const int source_q = q_map[qi];
        for (int sq = 0; sq < nsq; sq += 2) {
            const uint8_t *table_pair = src + (source_q * nsq + sq) * 16;
            uint8_t *entry = dest + (sq / 2 * nq + qi) * 32;
            // table of sub-quantizer sq, then table of sq + 1
            memcpy(entry, table_pair, 16);
            memcpy(entry + 16, table_pair + 16, 16);
        }
    }
}
|
244
|
+
|
245
|
+
} // anonymous namespace
|
246
|
+
|
247
|
+
int pq4_pack_LUT_qbs_q_map(
|
248
|
+
int qbs, int nsq,
|
249
|
+
const uint8_t *src,
|
250
|
+
const int * q_map,
|
251
|
+
uint8_t *dest)
|
252
|
+
{
|
253
|
+
FAISS_THROW_IF_NOT(nsq % 2 == 0);
|
254
|
+
size_t dim12 = 16 * nsq;
|
255
|
+
int i0 = 0;
|
256
|
+
int qi = qbs;
|
257
|
+
while(qi) {
|
258
|
+
int nq = qi & 15;
|
259
|
+
qi >>= 4;
|
260
|
+
pack_LUT_1_q_map(
|
261
|
+
nq, q_map + i0, nsq,
|
262
|
+
src,
|
263
|
+
dest + i0 * dim12
|
264
|
+
);
|
265
|
+
i0 += nq;
|
266
|
+
}
|
267
|
+
return i0;
|
268
|
+
}
|
269
|
+
|
270
|
+
|
271
|
+
|
272
|
+
} // namespace faiss
|
@@ -0,0 +1,169 @@
|
|
1
|
+
/**
|
2
|
+
* Copyright (c) Facebook, Inc. and its affiliates.
|
3
|
+
*
|
4
|
+
* This source code is licensed under the MIT license found in the
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
6
|
+
*/
|
7
|
+
|
8
|
+
#pragma once
|
9
|
+
|
10
|
+
#include <cstdint>
|
11
|
+
#include <cstdlib>
|
12
|
+
|
13
|
+
/** PQ4 SIMD packing and accumulation functions
|
14
|
+
*
|
15
|
+
* The basic kernel accumulates nq query vectors with bbs = nb * 2 * 16 vectors
|
16
|
+
* and produces an output matrix for that. It is interesting for nq * nb <= 4,
|
17
|
+
* otherwise register spilling becomes too large.
|
18
|
+
*
|
19
|
+
* The implementation of these functions is spread over 3 cpp files to reduce
|
20
|
+
* parallel compile times. Templates are instantiated explicitly.
|
21
|
+
*/
|
22
|
+
|
23
|
+
|
24
|
+
namespace faiss {
|
25
|
+
|
26
|
+
|
27
|
+
/** Pack codes for consumption by the SIMD kernels.
|
28
|
+
* The unused bytes are set to 0.
|
29
|
+
*
|
30
|
+
* @param codes input codes, size (ntotal, ceil(M / 2))
|
31
|
+
* @param ntotal number of input codes
|
32
|
+
* @param nb output number of codes (ntotal rounded up to a multiple of
|
33
|
+
* bbs)
|
34
|
+
* @param M2 number of sub-quantizers (=M rounded up to a multiple of 2)
|
35
|
+
* @param bbs size of database blocks (multiple of 32)
|
36
|
+
* @param blocks output array, size nb * nsq / 2.
|
37
|
+
*/
|
38
|
+
void pq4_pack_codes(
|
39
|
+
const uint8_t *codes,
|
40
|
+
size_t ntotal, size_t M,
|
41
|
+
size_t nb, size_t bbs, size_t M2,
|
42
|
+
uint8_t * blocks
|
43
|
+
);
|
44
|
+
|
45
|
+
/** Same as pack_codes but write in a given range of the output,
|
46
|
+
* leaving the rest untouched. Assumes allocated entries are 0 on input.
|
47
|
+
*
|
48
|
+
* @param codes input codes, size (i1 - i0, ceil(M / 2))
|
49
|
+
* @param i0 first output code to write
|
50
|
+
* @param i1 end of the output range to write (exclusive: codes has i1 - i0 rows)
|
51
|
+
* @param blocks output array, size at least ceil(i1 / bbs) * bbs * nsq / 2
|
52
|
+
*/
|
53
|
+
void pq4_pack_codes_range(
|
54
|
+
const uint8_t *codes,
|
55
|
+
size_t M,
|
56
|
+
size_t i0, size_t i1,
|
57
|
+
size_t bbs, size_t M2,
|
58
|
+
uint8_t * blocks
|
59
|
+
);
|
60
|
+
|
61
|
+
/** get a single element from a packed codes table
|
62
|
+
*
|
63
|
+
* @param i vector id
|
64
|
+
* @param sq subquantizer (< nsq)
|
65
|
+
*/
|
66
|
+
uint8_t pq4_get_packed_element(
|
67
|
+
const uint8_t *data, size_t bbs, size_t nsq,
|
68
|
+
size_t i, size_t sq
|
69
|
+
);
|
70
|
+
|
71
|
+
/** Pack Look-up table for consumption by the kernel.
|
72
|
+
*
|
73
|
+
* @param nq number of queries
|
74
|
+
* @param nsq number of sub-quantizers (multiple of 2)
|
75
|
+
* @param src input array, size (nq, nsq, 16)
|
76
|
+
* @param dest output array, size (nsq / 2, nq, 32)
|
77
|
+
*/
|
78
|
+
void pq4_pack_LUT(
|
79
|
+
int nq, int nsq,
|
80
|
+
const uint8_t *src,
|
81
|
+
uint8_t *dest
|
82
|
+
);
|
83
|
+
|
84
|
+
|
85
|
+
|
86
|
+
/** Loop over database elements and accumulate results into result handler
|
87
|
+
*
|
88
|
+
* @param nq number of queries
|
89
|
+
* @param nb number of database elements
|
90
|
+
* @param bbs size of database blocks (multiple of 32)
|
91
|
+
* @param nsq number of sub-quantizers (multiple of 2)
|
92
|
+
* @param codes packed codes array
|
93
|
+
* @param LUT packed look-up table
|
94
|
+
*/
|
95
|
+
template<class ResultHandler>
|
96
|
+
void pq4_accumulate_loop(
|
97
|
+
int nq,
|
98
|
+
size_t nb, int bbs,
|
99
|
+
int nsq,
|
100
|
+
const uint8_t *codes,
|
101
|
+
const uint8_t *LUT,
|
102
|
+
ResultHandler & res);
|
103
|
+
|
104
|
+
|
105
|
+
|
106
|
+
/* qbs versions, supported only for bbs=32.
|
107
|
+
*
|
108
|
+
* The kernel function runs the kernel for *several* query blocks
|
109
|
+
* and bbs database vectors. The sizes of the blocks are encoded in qbs as
|
110
|
+
* base-16 digits.
|
111
|
+
*
|
112
|
+
* For example, qbs = 0x1223 means that the kernel will be run 4 times, the
|
113
|
+
* first time with 3 query vectors, second time with 2 query vectors, then 2
|
114
|
+
* vectors again and finally with 1 query vector. The output block will thus be
|
115
|
+
* nq = 3 + 2 + 2 + 1 = 8 queries. For a given total block size, the optimal
|
116
|
+
* decomposition into sub-blocks (measured empirically) is given by
|
117
|
+
* pq4_preferred_qbs().
|
118
|
+
*/
|
119
|
+
|
120
|
+
|
121
|
+
/* compute the number of queries from a base-16 decomposition */
|
122
|
+
int pq4_qbs_to_nq(int qbs);
|
123
|
+
|
124
|
+
/** return the preferred decomposition in blocks for a nb of queries. */
|
125
|
+
int pq4_preferred_qbs(int nq);
|
126
|
+
|
127
|
+
/** Pack Look-up table for consumption by the kernel.
|
128
|
+
*
|
129
|
+
* @param qbs 4-bit encoded number of query blocks, the total number of
|
130
|
+
* queries handled (nq) is deduced from it
|
131
|
+
* @param nsq number of sub-quantizers (multiple of 2)
|
132
|
+
* @param src input array, size (nq, nsq, 16)
|
133
|
+
* @param dest output array, nq * nsq * 16 bytes, packed one query block after the other
|
134
|
+
* @return nq
|
135
|
+
*/
|
136
|
+
int pq4_pack_LUT_qbs(
|
137
|
+
int fqbs, int nsq,
|
138
|
+
const uint8_t *src,
|
139
|
+
uint8_t *dest
|
140
|
+
);
|
141
|
+
|
142
|
+
/** Same as pq4_pack_LUT_qbs, except the source vectors are remapped with q_map */
|
143
|
+
int pq4_pack_LUT_qbs_q_map(
|
144
|
+
int qbs, int nsq,
|
145
|
+
const uint8_t *src,
|
146
|
+
const int * q_map,
|
147
|
+
uint8_t *dest);
|
148
|
+
|
149
|
+
/** Run accumulation loop.
|
150
|
+
*
|
151
|
+
* @param qbs 4-bit encoded number of queries
|
152
|
+
* @param nb number of database codes (multiple of bbs)
|
153
|
+
* @param nsq number of sub-quantizers
|
154
|
+
* @param codes encoded database vectors (packed)
|
155
|
+
* @param LUT look-up table (packed)
|
156
|
+
* @param res call-back for the results
|
157
|
+
*/
|
158
|
+
template<class ResultHandler>
|
159
|
+
void pq4_accumulate_loop_qbs(
|
160
|
+
int qbs,
|
161
|
+
size_t nb,
|
162
|
+
int nsq,
|
163
|
+
const uint8_t *codes,
|
164
|
+
const uint8_t *LUT,
|
165
|
+
ResultHandler & res);
|
166
|
+
|
167
|
+
|
168
|
+
|
169
|
+
} // namespace faiss
|