faiss 0.1.3 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (199) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +25 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +16 -4
  5. data/ext/faiss/ext.cpp +12 -308
  6. data/ext/faiss/extconf.rb +6 -3
  7. data/ext/faiss/index.cpp +189 -0
  8. data/ext/faiss/index_binary.cpp +75 -0
  9. data/ext/faiss/kmeans.cpp +40 -0
  10. data/ext/faiss/numo.hpp +867 -0
  11. data/ext/faiss/pca_matrix.cpp +33 -0
  12. data/ext/faiss/product_quantizer.cpp +53 -0
  13. data/ext/faiss/utils.cpp +13 -0
  14. data/ext/faiss/utils.h +5 -0
  15. data/lib/faiss.rb +0 -5
  16. data/lib/faiss/version.rb +1 -1
  17. data/vendor/faiss/faiss/AutoTune.cpp +36 -33
  18. data/vendor/faiss/faiss/AutoTune.h +6 -3
  19. data/vendor/faiss/faiss/Clustering.cpp +16 -12
  20. data/vendor/faiss/faiss/Index.cpp +3 -4
  21. data/vendor/faiss/faiss/Index.h +3 -3
  22. data/vendor/faiss/faiss/IndexBinary.cpp +3 -4
  23. data/vendor/faiss/faiss/IndexBinary.h +1 -1
  24. data/vendor/faiss/faiss/IndexBinaryHash.cpp +2 -12
  25. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +1 -2
  26. data/vendor/faiss/faiss/IndexFlat.cpp +0 -148
  27. data/vendor/faiss/faiss/IndexFlat.h +0 -51
  28. data/vendor/faiss/faiss/IndexHNSW.cpp +4 -5
  29. data/vendor/faiss/faiss/IndexIVF.cpp +118 -31
  30. data/vendor/faiss/faiss/IndexIVF.h +22 -15
  31. data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -3
  32. data/vendor/faiss/faiss/IndexIVFFlat.h +2 -1
  33. data/vendor/faiss/faiss/IndexIVFPQ.cpp +39 -15
  34. data/vendor/faiss/faiss/IndexIVFPQ.h +25 -9
  35. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +1116 -0
  36. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +166 -0
  37. data/vendor/faiss/faiss/IndexIVFPQR.cpp +8 -9
  38. data/vendor/faiss/faiss/IndexIVFPQR.h +2 -1
  39. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +1 -2
  40. data/vendor/faiss/faiss/IndexPQ.cpp +34 -18
  41. data/vendor/faiss/faiss/IndexPQFastScan.cpp +536 -0
  42. data/vendor/faiss/faiss/IndexPQFastScan.h +111 -0
  43. data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -0
  44. data/vendor/faiss/faiss/IndexPreTransform.h +2 -0
  45. data/vendor/faiss/faiss/IndexRefine.cpp +256 -0
  46. data/vendor/faiss/faiss/IndexRefine.h +73 -0
  47. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +2 -2
  48. data/vendor/faiss/faiss/IndexScalarQuantizer.h +1 -1
  49. data/vendor/faiss/faiss/gpu/GpuDistance.h +1 -1
  50. data/vendor/faiss/faiss/gpu/GpuIndex.h +16 -9
  51. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +8 -1
  52. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +11 -11
  53. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +19 -2
  54. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +28 -2
  55. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +24 -14
  56. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +29 -2
  57. data/vendor/faiss/faiss/gpu/GpuResources.h +4 -0
  58. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +60 -27
  59. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +28 -6
  60. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +547 -0
  61. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +51 -0
  62. data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +3 -3
  63. data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +3 -2
  64. data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +274 -0
  65. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +7 -2
  66. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +5 -1
  67. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +231 -0
  68. data/vendor/faiss/faiss/gpu/test/TestUtils.h +33 -0
  69. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +1 -0
  70. data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +6 -0
  71. data/vendor/faiss/faiss/gpu/utils/Timer.cpp +5 -6
  72. data/vendor/faiss/faiss/gpu/utils/Timer.h +2 -2
  73. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +5 -4
  74. data/vendor/faiss/faiss/impl/HNSW.cpp +2 -4
  75. data/vendor/faiss/faiss/impl/PolysemousTraining.h +4 -4
  76. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +22 -12
  77. data/vendor/faiss/faiss/impl/ProductQuantizer.h +2 -0
  78. data/vendor/faiss/faiss/impl/ResultHandler.h +452 -0
  79. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +29 -19
  80. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +6 -0
  81. data/vendor/faiss/faiss/impl/index_read.cpp +64 -96
  82. data/vendor/faiss/faiss/impl/index_write.cpp +34 -25
  83. data/vendor/faiss/faiss/impl/io.cpp +33 -2
  84. data/vendor/faiss/faiss/impl/io.h +7 -2
  85. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -15
  86. data/vendor/faiss/faiss/impl/platform_macros.h +44 -0
  87. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +272 -0
  88. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +169 -0
  89. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +180 -0
  90. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +354 -0
  91. data/vendor/faiss/faiss/impl/simd_result_handlers.h +559 -0
  92. data/vendor/faiss/faiss/index_factory.cpp +112 -7
  93. data/vendor/faiss/faiss/index_io.h +1 -48
  94. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +151 -0
  95. data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +76 -0
  96. data/vendor/faiss/faiss/{DirectMap.cpp → invlists/DirectMap.cpp} +1 -1
  97. data/vendor/faiss/faiss/{DirectMap.h → invlists/DirectMap.h} +1 -1
  98. data/vendor/faiss/faiss/{InvertedLists.cpp → invlists/InvertedLists.cpp} +72 -1
  99. data/vendor/faiss/faiss/{InvertedLists.h → invlists/InvertedLists.h} +32 -1
  100. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +107 -0
  101. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +63 -0
  102. data/vendor/faiss/faiss/{OnDiskInvertedLists.cpp → invlists/OnDiskInvertedLists.cpp} +21 -6
  103. data/vendor/faiss/faiss/{OnDiskInvertedLists.h → invlists/OnDiskInvertedLists.h} +5 -2
  104. data/vendor/faiss/faiss/python/python_callbacks.h +8 -1
  105. data/vendor/faiss/faiss/utils/AlignedTable.h +141 -0
  106. data/vendor/faiss/faiss/utils/Heap.cpp +2 -4
  107. data/vendor/faiss/faiss/utils/Heap.h +61 -50
  108. data/vendor/faiss/faiss/utils/distances.cpp +164 -319
  109. data/vendor/faiss/faiss/utils/distances.h +28 -20
  110. data/vendor/faiss/faiss/utils/distances_simd.cpp +277 -49
  111. data/vendor/faiss/faiss/utils/extra_distances.cpp +1 -2
  112. data/vendor/faiss/faiss/utils/hamming-inl.h +4 -4
  113. data/vendor/faiss/faiss/utils/hamming.cpp +3 -6
  114. data/vendor/faiss/faiss/utils/hamming.h +2 -7
  115. data/vendor/faiss/faiss/utils/ordered_key_value.h +98 -0
  116. data/vendor/faiss/faiss/utils/partitioning.cpp +1256 -0
  117. data/vendor/faiss/faiss/utils/partitioning.h +69 -0
  118. data/vendor/faiss/faiss/utils/quantize_lut.cpp +277 -0
  119. data/vendor/faiss/faiss/utils/quantize_lut.h +80 -0
  120. data/vendor/faiss/faiss/utils/simdlib.h +31 -0
  121. data/vendor/faiss/faiss/utils/simdlib_avx2.h +461 -0
  122. data/vendor/faiss/faiss/utils/simdlib_emulated.h +589 -0
  123. metadata +54 -149
  124. data/lib/faiss/index.rb +0 -20
  125. data/lib/faiss/index_binary.rb +0 -20
  126. data/lib/faiss/kmeans.rb +0 -15
  127. data/lib/faiss/pca_matrix.rb +0 -15
  128. data/lib/faiss/product_quantizer.rb +0 -22
  129. data/vendor/faiss/benchs/bench_6bit_codec.cpp +0 -80
  130. data/vendor/faiss/c_api/AutoTune_c.cpp +0 -83
  131. data/vendor/faiss/c_api/AutoTune_c.h +0 -66
  132. data/vendor/faiss/c_api/Clustering_c.cpp +0 -145
  133. data/vendor/faiss/c_api/Clustering_c.h +0 -123
  134. data/vendor/faiss/c_api/IndexFlat_c.cpp +0 -140
  135. data/vendor/faiss/c_api/IndexFlat_c.h +0 -115
  136. data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +0 -64
  137. data/vendor/faiss/c_api/IndexIVFFlat_c.h +0 -58
  138. data/vendor/faiss/c_api/IndexIVF_c.cpp +0 -99
  139. data/vendor/faiss/c_api/IndexIVF_c.h +0 -142
  140. data/vendor/faiss/c_api/IndexLSH_c.cpp +0 -37
  141. data/vendor/faiss/c_api/IndexLSH_c.h +0 -40
  142. data/vendor/faiss/c_api/IndexPreTransform_c.cpp +0 -21
  143. data/vendor/faiss/c_api/IndexPreTransform_c.h +0 -32
  144. data/vendor/faiss/c_api/IndexShards_c.cpp +0 -38
  145. data/vendor/faiss/c_api/IndexShards_c.h +0 -39
  146. data/vendor/faiss/c_api/Index_c.cpp +0 -105
  147. data/vendor/faiss/c_api/Index_c.h +0 -183
  148. data/vendor/faiss/c_api/MetaIndexes_c.cpp +0 -49
  149. data/vendor/faiss/c_api/MetaIndexes_c.h +0 -49
  150. data/vendor/faiss/c_api/clone_index_c.cpp +0 -23
  151. data/vendor/faiss/c_api/clone_index_c.h +0 -32
  152. data/vendor/faiss/c_api/error_c.h +0 -42
  153. data/vendor/faiss/c_api/error_impl.cpp +0 -27
  154. data/vendor/faiss/c_api/error_impl.h +0 -16
  155. data/vendor/faiss/c_api/faiss_c.h +0 -58
  156. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +0 -98
  157. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +0 -56
  158. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +0 -52
  159. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +0 -68
  160. data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +0 -17
  161. data/vendor/faiss/c_api/gpu/GpuIndex_c.h +0 -30
  162. data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +0 -38
  163. data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +0 -86
  164. data/vendor/faiss/c_api/gpu/GpuResources_c.h +0 -66
  165. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +0 -54
  166. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +0 -53
  167. data/vendor/faiss/c_api/gpu/macros_impl.h +0 -42
  168. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +0 -220
  169. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +0 -149
  170. data/vendor/faiss/c_api/index_factory_c.cpp +0 -26
  171. data/vendor/faiss/c_api/index_factory_c.h +0 -30
  172. data/vendor/faiss/c_api/index_io_c.cpp +0 -42
  173. data/vendor/faiss/c_api/index_io_c.h +0 -50
  174. data/vendor/faiss/c_api/macros_impl.h +0 -110
  175. data/vendor/faiss/demos/demo_imi_flat.cpp +0 -154
  176. data/vendor/faiss/demos/demo_imi_pq.cpp +0 -203
  177. data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +0 -151
  178. data/vendor/faiss/demos/demo_sift1M.cpp +0 -252
  179. data/vendor/faiss/demos/demo_weighted_kmeans.cpp +0 -185
  180. data/vendor/faiss/misc/test_blas.cpp +0 -87
  181. data/vendor/faiss/tests/test_binary_flat.cpp +0 -62
  182. data/vendor/faiss/tests/test_dealloc_invlists.cpp +0 -188
  183. data/vendor/faiss/tests/test_ivfpq_codec.cpp +0 -70
  184. data/vendor/faiss/tests/test_ivfpq_indexing.cpp +0 -100
  185. data/vendor/faiss/tests/test_lowlevel_ivf.cpp +0 -573
  186. data/vendor/faiss/tests/test_merge.cpp +0 -260
  187. data/vendor/faiss/tests/test_omp_threads.cpp +0 -14
  188. data/vendor/faiss/tests/test_ondisk_ivf.cpp +0 -225
  189. data/vendor/faiss/tests/test_pairs_decoding.cpp +0 -193
  190. data/vendor/faiss/tests/test_params_override.cpp +0 -236
  191. data/vendor/faiss/tests/test_pq_encoding.cpp +0 -98
  192. data/vendor/faiss/tests/test_sliding_ivf.cpp +0 -246
  193. data/vendor/faiss/tests/test_threaded_index.cpp +0 -253
  194. data/vendor/faiss/tests/test_transfer_invlists.cpp +0 -159
  195. data/vendor/faiss/tutorial/cpp/1-Flat.cpp +0 -104
  196. data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +0 -85
  197. data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +0 -98
  198. data/vendor/faiss/tutorial/cpp/4-GPU.cpp +0 -122
  199. data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +0 -104
@@ -242,16 +242,47 @@ BufferedIOWriter::~BufferedIOWriter()
242
242
 
243
243
 
244
244
  /// Pack a 4-character tag into a uint32_t: sx[0] ends up in the lowest
  /// byte and sx[3] in the highest one.
  /// The length check below calls strlen, so sx must be a NUL-terminated
  /// string of exactly 4 characters (5 readable bytes), even though the
  /// parameter is spelled char[4].
  uint32_t fourcc (const char sx[4]) {
245
- assert(4 == strlen(sx));
245
+ FAISS_THROW_IF_NOT (4 == strlen(sx));
246
246
  const unsigned char *x = (unsigned char*)sx;
247
247
  return x[0] | x[1] << 8 | x[2] << 16 | x[3] << 24;
248
248
  }
249
249
 
250
250
  /// Pack a 4-character std::string into a uint32_t: sx[0] ends up in the
  /// lowest byte and sx[3] in the highest one. The length of sx must be
  /// exactly 4; this is checked before the bytes are read.
  uint32_t fourcc (const std::string & sx) {
251
- assert(sx.length() == 4);
251
+ FAISS_THROW_IF_NOT (sx.length() == 4);
252
252
  const unsigned char *x = (unsigned char*)sx.c_str();
253
253
  return x[0] | x[1] << 8 | x[2] << 16 | x[3] << 24;
254
254
  }
255
255
 
256
/// Decode a fourcc value back into its 4 characters.
///
/// @param x    packed value; the lowest byte is the first character
/// @param str  output buffer of at least 5 bytes: receives the 4 decoded
///             characters followed by a terminating NUL
void fourcc_inv(uint32_t x, char str[5]) {
    // Pull the bytes out with shifts instead of storing x through a casted
    // pointer: the result then does not depend on the alignment of str or on
    // the byte order of the host, and it mirrors the shifts used to build
    // the packed value.
    for (int i = 0; i < 4; i++) {
        str[i] = (char)((x >> (8 * i)) & 0xff);
    }
    // The terminator belongs at index 4; index 5 is already past the end of
    // a 5-byte buffer.
    str[4] = 0;
}
260
+
261
+ std::string fourcc_inv(uint32_t x) {
262
+ char str[5];
263
+ fourcc_inv(x, str);
264
+ return std::string(str);
265
+ }
266
+
267
+
268
+ std::string fourcc_inv_printable(uint32_t x) {
269
+ char cstr[5];
270
+ fourcc_inv(x, cstr);
271
+ std::string str = "";
272
+ for (int i = 0; i < 4; i++) {
273
+ uint8_t c = cstr[i];
274
+ if (32 <= c && c < 127) {
275
+ str += c;
276
+ } else {
277
+ char buf[10];
278
+ sprintf(buf, "\\x%02x", c);
279
+ str += buf;
280
+ }
281
+ }
282
+ return str;
283
+ }
284
+
285
+
286
+
256
287
 
257
288
  } // namespace faiss
@@ -50,7 +50,7 @@ struct IOWriter {
50
50
  // return a file number that can be memory-mapped
51
51
  virtual int fileno ();
52
52
 
53
- virtual ~IOWriter() {}
53
+ virtual ~IOWriter() noexcept(false) {}
54
54
  };
55
55
 
56
56
 
@@ -139,12 +139,17 @@ struct BufferedIOWriter: IOWriter {
139
139
  size_t operator()(const void *ptr, size_t size, size_t nitems) override;
140
140
 
141
141
  // flushes
142
- ~BufferedIOWriter();
142
+ ~BufferedIOWriter() override;
143
143
  };
144
144
 
145
145
  /// cast a 4-character string to a uint32_t that can be written and read easily
146
146
  uint32_t fourcc (const char sx[4]);
147
147
  uint32_t fourcc (const std::string & sx);
148
148
 
149
+ // decoding of fourcc (int32 -> string)
150
+ void fourcc_inv(uint32_t x, char str[5]);
151
+ std::string fourcc_inv(uint32_t x);
152
+ std::string fourcc_inv_printable(uint32_t x);
153
+
149
154
 
150
155
  } // namespace faiss
@@ -20,22 +20,8 @@
20
20
  #include <algorithm>
21
21
 
22
22
  #include <faiss/utils/distances.h>
23
+ #include <faiss/impl/platform_macros.h>
23
24
 
24
- #ifdef _MSC_VER
25
-
26
- #include <intrin.h>
27
-
28
- static inline int __builtin_ctzll(uint64_t x) {
29
- unsigned long ret;
30
- _BitScanForward64(&ret, x);
31
- return (int)ret;
32
- }
33
-
34
- static inline int __builtin_clzll(uint64_t x) {
35
- return (int)__lzcnt64(x);
36
- }
37
-
38
- #endif // _MSC_VER
39
25
 
40
26
  namespace faiss {
41
27
 
@@ -7,8 +7,14 @@
7
7
 
8
8
  #pragma once
9
9
 
10
+
10
11
  #ifdef _MSC_VER
11
12
 
13
+ /*******************************************************
14
+ * Windows specific macros
15
+ *******************************************************/
16
+
17
+
12
18
  #ifdef FAISS_MAIN_LIB
13
19
  #define FAISS_API __declspec(dllexport)
14
20
  #else // _FAISS_MAIN_LIB
@@ -17,8 +23,46 @@
17
23
 
18
24
  #define __PRETTY_FUNCTION__ __FUNCSIG__
19
25
 
26
+ #define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno)
27
+ #define posix_memalign_free _aligned_free
28
+
29
+ // aligned should be in front of the declaration
30
+ #define ALIGNED(x) __declspec(align(x))
31
+
32
+ // redefine the GCC intrinsics with Windows equivalents
33
+
34
+ #include <intrin.h>
35
+
36
// Stand-in for the GCC intrinsic of the same name when building with MSVC:
// returns the bit index that _BitScanForward64 stores into ret.
// NOTE(review): ret is not initialised before the call and the intrinsic's
// return value is ignored, so the result for x == 0 is presumably
// unspecified -- confirm against the _BitScanForward64 documentation.
+ inline int __builtin_ctzll(uint64_t x) {
37
+ unsigned long ret;
38
+ _BitScanForward64(&ret, x);
39
+ return (int)ret;
40
+ }
41
+
42
// Stand-in for the GCC intrinsic of the same name when building with MSVC:
// returns the bit index that _BitScanForward stores into ret.
// NOTE(review): as ret is not initialised and the intrinsic's return value
// is ignored, the result for x == 0 is presumably unspecified -- confirm.
+ inline int __builtin_ctz(unsigned long x) {
43
+ unsigned long ret;
44
+ _BitScanForward(&ret, x);
45
+ return (int)ret;
46
+ }
47
+
48
// Stand-in for the GCC intrinsic of the same name when building with MSVC:
// forwards to __lzcnt64 and narrows the result to int.
// NOTE(review): __lzcnt64 presumably requires hardware LZCNT support --
// confirm what it returns on CPUs that lack the instruction.
+ inline int __builtin_clzll(uint64_t x) {
49
+ return (int)__lzcnt64(x);
50
+ }
51
+
52
+ #define __builtin_popcountl __popcnt64
53
+
20
54
  #else
55
+ /*******************************************************
56
+ * Linux and OSX
57
+ *******************************************************/
21
58
 
22
59
  #define FAISS_API
60
+ #define posix_memalign_free free
61
+
62
+ // aligned should be *in front* of the declaration, for compatibility with windows
63
+ #define ALIGNED(x) __attribute__ ((aligned(x)))
23
64
 
24
65
  #endif // _MSC_VER
66
+
67
+
68
+
@@ -0,0 +1,272 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ #include <faiss/impl/pq4_fast_scan.h>
9
+ #include <faiss/impl/FaissAssert.h>
10
+ #include <faiss/impl/simd_result_handlers.h>
11
+
12
+ #include <array>
13
+
14
+
15
+ namespace faiss {
16
+
17
+
18
+ using namespace simd_result_handlers;
19
+
20
+
21
+
22
+ /***************************************************************
23
+ * Packing functions for codes
24
+ ***************************************************************/
25
+
26
+
27
+
28
+ namespace {
29
+
30
/* Copy dest.size() consecutive entries of column j of the row-major matrix
 * src (m rows, n columns) into dest, starting at row i.
 * Rows that fall outside [0, m) -- i may be negative -- are written as 0. */
template<typename T, class TA>
void get_matrix_column(
        T * src,
        size_t m, size_t n,
        int64_t i, int64_t j,
        TA & dest) {
    const int64_t count = dest.size();
    for (int64_t k = 0; k != count; ++k) {
        const int64_t row = i + k;
        if (row < 0 || row >= m) {
            // outside of the matrix: pad with zeros
            dest[k] = 0;
            continue;
        }
        dest[k] = src[row * n + j];
    }
}
46
+
47
+ } // anonymous namespace
48
+
49
+
50
+ void pq4_pack_codes(
51
+ const uint8_t *codes,
52
+ size_t ntotal, size_t M,
53
+ size_t nb, size_t bbs, size_t nsq,
54
+ uint8_t *blocks
55
+ )
56
+ {
57
+ FAISS_THROW_IF_NOT(bbs % 32 == 0);
58
+ FAISS_THROW_IF_NOT(nb % bbs == 0);
59
+ FAISS_THROW_IF_NOT(nsq % 2 == 0);
60
+
61
+ memset(blocks, 0, nb * nsq / 2);
62
+ const uint8_t perm0[16] =
63
+ {0, 8, 1, 9, 2, 10, 3, 11,
64
+ 4, 12, 5, 13, 6, 14, 7, 15};
65
+
66
+ uint8_t *codes2 = blocks;
67
+ for(size_t i0 = 0; i0 < nb; i0 += bbs) {
68
+ for(int sq = 0; sq < nsq; sq += 2) {
69
+ for(size_t i = 0; i < bbs; i += 32) {
70
+ std::array<uint8_t, 32> c, c0, c1;
71
+ get_matrix_column(
72
+ codes, ntotal,
73
+ (M + 1) / 2,
74
+ i0 + i, sq / 2, c
75
+ );
76
+ for(int j = 0; j < 32; j++) {
77
+ c0[j] = c[j] & 15;
78
+ c1[j] = c[j] >> 4;
79
+ }
80
+ for(int j = 0; j < 16; j++) {
81
+ uint8_t d0, d1;
82
+ d0 = c0[perm0[j]] | (c0[perm0[j] + 16] << 4);
83
+ d1 = c1[perm0[j]] | (c1[perm0[j] + 16] << 4);
84
+ codes2[j] = d0;
85
+ codes2[j + 16] = d1;
86
+ }
87
+ codes2 += 32;
88
+ }
89
+ }
90
+ }
91
+ }
92
+
93
+ void pq4_pack_codes_range(
94
+ const uint8_t *codes,
95
+ size_t M,
96
+ size_t i0, size_t i1,
97
+ size_t bbs, size_t M2,
98
+ uint8_t * blocks
99
+ ) {
100
+ const uint8_t perm0[16] =
101
+ {0, 8, 1, 9, 2, 10, 3, 11,
102
+ 4, 12, 5, 13, 6, 14, 7, 15};
103
+
104
+ // range of affected blocks
105
+ size_t block0 = i0 / bbs;
106
+ size_t block1 = ((i1 - 1) / bbs) + 1;
107
+
108
+ for (size_t b = block0; b < block1; b++) {
109
+ uint8_t *codes2 = blocks + b * bbs * M2 / 2;
110
+ int64_t i_base = b * bbs - i0;
111
+ for(int sq = 0; sq < M2; sq += 2) {
112
+ for(size_t i = 0; i < bbs; i += 32) {
113
+ std::array<uint8_t, 32> c, c0, c1;
114
+ get_matrix_column(
115
+ codes, i1 - i0,
116
+ (M + 1) / 2,
117
+ i_base + i, sq / 2, c
118
+ );
119
+ for(int j = 0; j < 32; j++) {
120
+ c0[j] = c[j] & 15;
121
+ c1[j] = c[j] >> 4;
122
+ }
123
+ for(int j = 0; j < 16; j++) {
124
+ uint8_t d0, d1;
125
+ d0 = c0[perm0[j]] | (c0[perm0[j] + 16] << 4);
126
+ d1 = c1[perm0[j]] | (c1[perm0[j] + 16] << 4);
127
+ codes2[j] |= d0;
128
+ codes2[j + 16] |= d1;
129
+ }
130
+ codes2 += 32;
131
+ }
132
+ }
133
+ }
134
+
135
+ }
136
+
137
+
138
/* Read one 4-bit code back from the blocked layout.
 *
 * The byte is located in three steps: the bbs-sized block that holds vector
 * i and sub-quantizer pair sq / 2, then the 32-byte group inside that block,
 * then the 16-byte half selected by the parity of sq. Inside a half, the
 * first 16 vectors of the group use the low nibble and the last 16 the high
 * nibble of the byte given by slot_to_byte.
 */
uint8_t pq4_get_packed_element(
        const uint8_t *data, size_t bbs, size_t nsq,
        size_t i, size_t sq
) {
    static const uint8_t slot_to_byte[16] =
        {0, 2, 4, 6, 8, 10, 12, 14,
         1, 3, 5, 7, 9, 11, 13, 15};

    const size_t block = i / bbs;
    const size_t in_block = i % bbs;
    const size_t group = in_block / 32;
    const size_t lane = in_block % 32;
    const size_t half = (sq & 1) ? 16 : 0;

    const uint8_t *p =
        data + (block * (nsq / 2) + sq / 2) * bbs + group * 32 + half;
    const uint8_t packed = p[slot_to_byte[lane % 16]];

    return lane < 16 ? (packed & 15) : (packed >> 4);
}
164
+
165
+ /***************************************************************
166
+ * Packing functions for Look-Up Tables (LUT)
167
+ ***************************************************************/
168
+
169
+
170
+
171
+
172
/* Re-order look-up tables from (query, sub-quantizer) order to
 * (sub-quantizer pair, query) order.
 *
 * For query q and even sub-quantizer sq, the two 16-byte tables sq and
 * sq + 1 are adjacent in src, and they are stored adjacently in dest as
 * well, so each pair is moved with one 32-byte copy.
 */
void pq4_pack_LUT(
        int nq, int nsq,
        const uint8_t *src,
        uint8_t *dest)
{
    for (int q = 0; q < nq; q++) {
        for (int sq = 0; sq < nsq; sq += 2) {
            const uint8_t *pair_in = src + (q * nsq + sq) * 16;
            uint8_t *pair_out = dest + (sq / 2 * nq + q) * 32;
            memcpy(pair_out, pair_in, 32);
        }
    }
}
193
+
194
+
195
+ int pq4_pack_LUT_qbs(
196
+ int qbs, int nsq,
197
+ const uint8_t *src,
198
+ uint8_t *dest)
199
+ {
200
+ FAISS_THROW_IF_NOT(nsq % 2 == 0);
201
+ size_t dim12 = 16 * nsq;
202
+ int i0 = 0;
203
+ int qi = qbs;
204
+ while(qi) {
205
+ int nq = qi & 15;
206
+ qi >>= 4;
207
+ pq4_pack_LUT(
208
+ nq, nsq,
209
+ src + i0 * dim12,
210
+ dest + i0 * dim12
211
+ );
212
+ i0 += nq;
213
+ }
214
+ return i0;
215
+ }
216
+
217
+
218
+ namespace {
219
+
220
/* Re-order the look-up tables of one query block, picking the source
 * queries through q_map: output slot qi is filled from query q_map[qi].
 *
 * For each even sub-quantizer sq, tables sq and sq + 1 of the selected
 * query are adjacent in src and stay adjacent in dest, so each pair is
 * moved with one 32-byte copy.
 */
void pack_LUT_1_q_map(
        int nq, const int *q_map,
        int nsq,
        const uint8_t *src,
        uint8_t *dest)
{
    for (int qi = 0; qi < nq; qi++) {
        const int q = q_map[qi];
        for (int sq = 0; sq < nsq; sq += 2) {
            const uint8_t *pair_in = src + (q * nsq + sq) * 16;
            uint8_t *pair_out = dest + (sq / 2 * nq + qi) * 32;
            memcpy(pair_out, pair_in, 32);
        }
    }
}
244
+
245
+ } // anonymous namespace
246
+
247
+ int pq4_pack_LUT_qbs_q_map(
248
+ int qbs, int nsq,
249
+ const uint8_t *src,
250
+ const int * q_map,
251
+ uint8_t *dest)
252
+ {
253
+ FAISS_THROW_IF_NOT(nsq % 2 == 0);
254
+ size_t dim12 = 16 * nsq;
255
+ int i0 = 0;
256
+ int qi = qbs;
257
+ while(qi) {
258
+ int nq = qi & 15;
259
+ qi >>= 4;
260
+ pack_LUT_1_q_map(
261
+ nq, q_map + i0, nsq,
262
+ src,
263
+ dest + i0 * dim12
264
+ );
265
+ i0 += nq;
266
+ }
267
+ return i0;
268
+ }
269
+
270
+
271
+
272
+ } // namespace faiss
@@ -0,0 +1,169 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ #pragma once
9
+
10
+ #include <cstdint>
11
+ #include <cstdlib>
12
+
13
+ /** PQ4 SIMD packing and accumulation functions
14
+ *
15
+ * The basic kernel accumulates nq query vectors with bbs = nb * 2 * 16 vectors
16
+ * and produces an output matrix for that. It is interesting for nq * nb <= 4,
17
+ * otherwise register spilling becomes too large.
18
+ *
19
+ * The implementation of these functions is spread over 3 cpp files to reduce
20
+ * parallel compile times. Templates are instantiated explicitly.
21
+ */
22
+
23
+
24
+ namespace faiss {
25
+
26
+
27
+ /** Pack codes for consumption by the SIMD kernels.
28
+ * The unused bytes are set to 0.
29
+ *
30
+ * @param codes input codes, size (ntotal, ceil(M / 2))
31
+ * @param ntotal number of input codes
32
+ * @param nb output number of codes (ntotal rounded up to a multiple of
33
+ * bbs)
34
+ * @param M2 number of sub-quantizers (=M rounded up to a multiple of 2)
35
+ * @param bbs size of database blocks (multiple of 32)
36
+ * @param blocks output array, size nb * M2 / 2.
37
+ */
38
+ void pq4_pack_codes(
39
+ const uint8_t *codes,
40
+ size_t ntotal, size_t M,
41
+ size_t nb, size_t bbs, size_t M2,
42
+ uint8_t * blocks
43
+ );
44
+
45
+ /** Same as pack_codes but write in a given range of the output,
46
+ * leaving the rest untouched. Assumes allocated entries are 0 on input.
47
+ *
48
+ * @param codes input codes, size (i1 - i0, ceil(M / 2))
49
+ * @param i0 first output code to write
50
+ * @param i1 one past the last output code to write (codes holds i1 - i0 entries)
51
+ * @param blocks output array, size at least ceil(i1 / bbs) * bbs * nsq / 2
52
+ */
53
+ void pq4_pack_codes_range(
54
+ const uint8_t *codes,
55
+ size_t M,
56
+ size_t i0, size_t i1,
57
+ size_t bbs, size_t M2,
58
+ uint8_t * blocks
59
+ );
60
+
61
+ /** get a single element from a packed codes table
62
+ *
63
+ * @param i vector id
64
+ * @param sq subquantizer (< nsq)
65
+ */
66
+ uint8_t pq4_get_packed_element(
67
+ const uint8_t *data, size_t bbs, size_t nsq,
68
+ size_t i, size_t sq
69
+ );
70
+
71
+ /** Pack Look-up table for consumption by the kernel.
72
+ *
73
+ * @param nq number of queries
74
+ * @param nsq number of sub-quantizers (multiple of 2)
75
+ * @param src input array, size (nq, nsq, 16)
76
+ * @param dest output array, size (nsq / 2, nq, 32)
77
+ */
78
+ void pq4_pack_LUT(
79
+ int nq, int nsq,
80
+ const uint8_t *src,
81
+ uint8_t *dest
82
+ );
83
+
84
+
85
+
86
+ /** Loop over database elements and accumulate results into result handler
87
+ *
88
+ * @param nq number of queries
89
+ * @param nb number of database elements
90
+ * @param bbs size of database blocks (multiple of 32)
91
+ * @param nsq number of sub-quantizers (multiple of 2)
92
+ * @param codes packed codes array
93
+ * @param LUT packed look-up table
94
+ */
95
+ template<class ResultHandler>
96
+ void pq4_accumulate_loop(
97
+ int nq,
98
+ size_t nb, int bbs,
99
+ int nsq,
100
+ const uint8_t *codes,
101
+ const uint8_t *LUT,
102
+ ResultHandler & res);
103
+
104
+
105
+
106
+ /* qbs versions, supported only for bbs=32.
107
+ *
108
+ * The kernel function runs the kernel for *several* query blocks
109
+ * and bbs database vectors. The sizes of the blocks are encoded in qbs as
110
+ * base-16 digits.
111
+ *
112
+ * For example, qbs = 0x1223 means that the kernel will be run 4 times, the
113
+ * first time with 3 query vectors, second time with 2 query vectors, then 2
114
+ * vectors again and finally with 1 query vector. The output block will thus be
115
+ * nq = 3 + 2 + 2 + 1 = 6 queries. For a given total block size, the optimal
116
+ * decomposition into sub-blocks (measured empirically) is given by
117
+ * preferred_qbs().
118
+ */
119
+
120
+
121
+ /* compute the number of queries from a base-16 decomposition */
122
+ int pq4_qbs_to_nq(int qbs);
123
+
124
+ /** return the preferred decomposition in blocks for a nb of queries. */
125
+ int pq4_preferred_qbs(int nq);
126
+
127
+ /** Pack Look-up table for consumption by the kernel.
128
+ *
129
+ * @param qbs 4-bit encoded number of query blocks, the total number of
130
+ * queries handled (nq) is deduced from it
131
+ * @param nsq number of sub-quantizers (multiple of 2)
132
+ * @param src input array, size (nq, nsq, 16)
133
+ * @param dest output array, nq * nsq * 16 bytes, packed one query block after the other
134
+ * @return nq
135
+ */
136
+ int pq4_pack_LUT_qbs(
137
+ int fqbs, int nsq,
138
+ const uint8_t *src,
139
+ uint8_t *dest
140
+ );
141
+
142
+ /** Same as pq4_pack_LUT_qbs, except the source vectors are remapped with q_map */
143
+ int pq4_pack_LUT_qbs_q_map(
144
+ int qbs, int nsq,
145
+ const uint8_t *src,
146
+ const int * q_map,
147
+ uint8_t *dest);
148
+
149
+ /** Run accumulation loop.
150
+ *
151
+ * @param qbs 4-bit encoded number of queries
152
+ * @param nb number of database codes (multiple of bbs)
153
+ * @param nsq number of sub-quantizers
154
+ * @param codes encoded database vectors (packed)
155
+ * @param LUT look-up table (packed)
156
+ * @param res call-back for the results
157
+ */
158
+ template<class ResultHandler>
159
+ void pq4_accumulate_loop_qbs(
160
+ int qbs,
161
+ size_t nb,
162
+ int nsq,
163
+ const uint8_t *codes,
164
+ const uint8_t *LUT,
165
+ ResultHandler & res);
166
+
167
+
168
+
169
+ } // namespace faiss