faiss 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +1 -1
  5. data/ext/faiss/extconf.rb +1 -1
  6. data/lib/faiss/version.rb +1 -1
  7. data/vendor/faiss/faiss/AutoTune.cpp +36 -33
  8. data/vendor/faiss/faiss/AutoTune.h +6 -3
  9. data/vendor/faiss/faiss/Clustering.cpp +16 -12
  10. data/vendor/faiss/faiss/Index.cpp +3 -4
  11. data/vendor/faiss/faiss/Index.h +3 -3
  12. data/vendor/faiss/faiss/IndexBinary.cpp +3 -4
  13. data/vendor/faiss/faiss/IndexBinary.h +1 -1
  14. data/vendor/faiss/faiss/IndexBinaryHash.cpp +2 -12
  15. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +1 -2
  16. data/vendor/faiss/faiss/IndexFlat.cpp +0 -148
  17. data/vendor/faiss/faiss/IndexFlat.h +0 -51
  18. data/vendor/faiss/faiss/IndexHNSW.cpp +4 -5
  19. data/vendor/faiss/faiss/IndexIVF.cpp +118 -31
  20. data/vendor/faiss/faiss/IndexIVF.h +22 -15
  21. data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -3
  22. data/vendor/faiss/faiss/IndexIVFFlat.h +2 -1
  23. data/vendor/faiss/faiss/IndexIVFPQ.cpp +39 -15
  24. data/vendor/faiss/faiss/IndexIVFPQ.h +25 -9
  25. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +1116 -0
  26. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +166 -0
  27. data/vendor/faiss/faiss/IndexIVFPQR.cpp +8 -9
  28. data/vendor/faiss/faiss/IndexIVFPQR.h +2 -1
  29. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +1 -2
  30. data/vendor/faiss/faiss/IndexPQ.cpp +34 -18
  31. data/vendor/faiss/faiss/IndexPQFastScan.cpp +536 -0
  32. data/vendor/faiss/faiss/IndexPQFastScan.h +111 -0
  33. data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -0
  34. data/vendor/faiss/faiss/IndexPreTransform.h +2 -0
  35. data/vendor/faiss/faiss/IndexRefine.cpp +256 -0
  36. data/vendor/faiss/faiss/IndexRefine.h +73 -0
  37. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +2 -2
  38. data/vendor/faiss/faiss/IndexScalarQuantizer.h +1 -1
  39. data/vendor/faiss/faiss/gpu/GpuDistance.h +1 -1
  40. data/vendor/faiss/faiss/gpu/GpuIndex.h +16 -9
  41. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +8 -1
  42. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +11 -11
  43. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +19 -2
  44. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +28 -2
  45. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +24 -14
  46. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +29 -2
  47. data/vendor/faiss/faiss/gpu/GpuResources.h +4 -0
  48. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +60 -27
  49. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +28 -6
  50. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +547 -0
  51. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +51 -0
  52. data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +3 -3
  53. data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +3 -2
  54. data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +274 -0
  55. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +7 -2
  56. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +5 -1
  57. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +231 -0
  58. data/vendor/faiss/faiss/gpu/test/TestUtils.h +33 -0
  59. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +1 -0
  60. data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +6 -0
  61. data/vendor/faiss/faiss/gpu/utils/Timer.cpp +5 -6
  62. data/vendor/faiss/faiss/gpu/utils/Timer.h +2 -2
  63. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +5 -4
  64. data/vendor/faiss/faiss/impl/HNSW.cpp +2 -4
  65. data/vendor/faiss/faiss/impl/PolysemousTraining.h +4 -4
  66. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +22 -12
  67. data/vendor/faiss/faiss/impl/ProductQuantizer.h +2 -0
  68. data/vendor/faiss/faiss/impl/ResultHandler.h +452 -0
  69. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +29 -19
  70. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +6 -0
  71. data/vendor/faiss/faiss/impl/index_read.cpp +64 -96
  72. data/vendor/faiss/faiss/impl/index_write.cpp +34 -25
  73. data/vendor/faiss/faiss/impl/io.cpp +33 -2
  74. data/vendor/faiss/faiss/impl/io.h +7 -2
  75. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -15
  76. data/vendor/faiss/faiss/impl/platform_macros.h +44 -0
  77. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +272 -0
  78. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +169 -0
  79. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +180 -0
  80. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +354 -0
  81. data/vendor/faiss/faiss/impl/simd_result_handlers.h +559 -0
  82. data/vendor/faiss/faiss/index_factory.cpp +112 -7
  83. data/vendor/faiss/faiss/index_io.h +1 -48
  84. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +151 -0
  85. data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +76 -0
  86. data/vendor/faiss/faiss/{DirectMap.cpp → invlists/DirectMap.cpp} +1 -1
  87. data/vendor/faiss/faiss/{DirectMap.h → invlists/DirectMap.h} +1 -1
  88. data/vendor/faiss/faiss/{InvertedLists.cpp → invlists/InvertedLists.cpp} +72 -1
  89. data/vendor/faiss/faiss/{InvertedLists.h → invlists/InvertedLists.h} +32 -1
  90. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +107 -0
  91. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +63 -0
  92. data/vendor/faiss/faiss/{OnDiskInvertedLists.cpp → invlists/OnDiskInvertedLists.cpp} +21 -6
  93. data/vendor/faiss/faiss/{OnDiskInvertedLists.h → invlists/OnDiskInvertedLists.h} +5 -2
  94. data/vendor/faiss/faiss/python/python_callbacks.h +8 -1
  95. data/vendor/faiss/faiss/utils/AlignedTable.h +141 -0
  96. data/vendor/faiss/faiss/utils/Heap.cpp +2 -4
  97. data/vendor/faiss/faiss/utils/Heap.h +61 -50
  98. data/vendor/faiss/faiss/utils/distances.cpp +164 -319
  99. data/vendor/faiss/faiss/utils/distances.h +28 -20
  100. data/vendor/faiss/faiss/utils/distances_simd.cpp +277 -49
  101. data/vendor/faiss/faiss/utils/extra_distances.cpp +1 -2
  102. data/vendor/faiss/faiss/utils/hamming-inl.h +4 -4
  103. data/vendor/faiss/faiss/utils/hamming.cpp +3 -6
  104. data/vendor/faiss/faiss/utils/hamming.h +2 -7
  105. data/vendor/faiss/faiss/utils/ordered_key_value.h +98 -0
  106. data/vendor/faiss/faiss/utils/partitioning.cpp +1256 -0
  107. data/vendor/faiss/faiss/utils/partitioning.h +69 -0
  108. data/vendor/faiss/faiss/utils/quantize_lut.cpp +277 -0
  109. data/vendor/faiss/faiss/utils/quantize_lut.h +80 -0
  110. data/vendor/faiss/faiss/utils/simdlib.h +31 -0
  111. data/vendor/faiss/faiss/utils/simdlib_avx2.h +461 -0
  112. data/vendor/faiss/faiss/utils/simdlib_emulated.h +589 -0
  113. metadata +43 -141
  114. data/vendor/faiss/benchs/bench_6bit_codec.cpp +0 -80
  115. data/vendor/faiss/c_api/AutoTune_c.cpp +0 -83
  116. data/vendor/faiss/c_api/AutoTune_c.h +0 -66
  117. data/vendor/faiss/c_api/Clustering_c.cpp +0 -145
  118. data/vendor/faiss/c_api/Clustering_c.h +0 -123
  119. data/vendor/faiss/c_api/IndexFlat_c.cpp +0 -140
  120. data/vendor/faiss/c_api/IndexFlat_c.h +0 -115
  121. data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +0 -64
  122. data/vendor/faiss/c_api/IndexIVFFlat_c.h +0 -58
  123. data/vendor/faiss/c_api/IndexIVF_c.cpp +0 -99
  124. data/vendor/faiss/c_api/IndexIVF_c.h +0 -142
  125. data/vendor/faiss/c_api/IndexLSH_c.cpp +0 -37
  126. data/vendor/faiss/c_api/IndexLSH_c.h +0 -40
  127. data/vendor/faiss/c_api/IndexPreTransform_c.cpp +0 -21
  128. data/vendor/faiss/c_api/IndexPreTransform_c.h +0 -32
  129. data/vendor/faiss/c_api/IndexShards_c.cpp +0 -38
  130. data/vendor/faiss/c_api/IndexShards_c.h +0 -39
  131. data/vendor/faiss/c_api/Index_c.cpp +0 -105
  132. data/vendor/faiss/c_api/Index_c.h +0 -183
  133. data/vendor/faiss/c_api/MetaIndexes_c.cpp +0 -49
  134. data/vendor/faiss/c_api/MetaIndexes_c.h +0 -49
  135. data/vendor/faiss/c_api/clone_index_c.cpp +0 -23
  136. data/vendor/faiss/c_api/clone_index_c.h +0 -32
  137. data/vendor/faiss/c_api/error_c.h +0 -42
  138. data/vendor/faiss/c_api/error_impl.cpp +0 -27
  139. data/vendor/faiss/c_api/error_impl.h +0 -16
  140. data/vendor/faiss/c_api/faiss_c.h +0 -58
  141. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +0 -98
  142. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +0 -56
  143. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +0 -52
  144. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +0 -68
  145. data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +0 -17
  146. data/vendor/faiss/c_api/gpu/GpuIndex_c.h +0 -30
  147. data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +0 -38
  148. data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +0 -86
  149. data/vendor/faiss/c_api/gpu/GpuResources_c.h +0 -66
  150. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +0 -54
  151. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +0 -53
  152. data/vendor/faiss/c_api/gpu/macros_impl.h +0 -42
  153. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +0 -220
  154. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +0 -149
  155. data/vendor/faiss/c_api/index_factory_c.cpp +0 -26
  156. data/vendor/faiss/c_api/index_factory_c.h +0 -30
  157. data/vendor/faiss/c_api/index_io_c.cpp +0 -42
  158. data/vendor/faiss/c_api/index_io_c.h +0 -50
  159. data/vendor/faiss/c_api/macros_impl.h +0 -110
  160. data/vendor/faiss/demos/demo_imi_flat.cpp +0 -154
  161. data/vendor/faiss/demos/demo_imi_pq.cpp +0 -203
  162. data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +0 -151
  163. data/vendor/faiss/demos/demo_sift1M.cpp +0 -252
  164. data/vendor/faiss/demos/demo_weighted_kmeans.cpp +0 -185
  165. data/vendor/faiss/misc/test_blas.cpp +0 -87
  166. data/vendor/faiss/tests/test_binary_flat.cpp +0 -62
  167. data/vendor/faiss/tests/test_dealloc_invlists.cpp +0 -188
  168. data/vendor/faiss/tests/test_ivfpq_codec.cpp +0 -70
  169. data/vendor/faiss/tests/test_ivfpq_indexing.cpp +0 -100
  170. data/vendor/faiss/tests/test_lowlevel_ivf.cpp +0 -573
  171. data/vendor/faiss/tests/test_merge.cpp +0 -260
  172. data/vendor/faiss/tests/test_omp_threads.cpp +0 -14
  173. data/vendor/faiss/tests/test_ondisk_ivf.cpp +0 -225
  174. data/vendor/faiss/tests/test_pairs_decoding.cpp +0 -193
  175. data/vendor/faiss/tests/test_params_override.cpp +0 -236
  176. data/vendor/faiss/tests/test_pq_encoding.cpp +0 -98
  177. data/vendor/faiss/tests/test_sliding_ivf.cpp +0 -246
  178. data/vendor/faiss/tests/test_threaded_index.cpp +0 -253
  179. data/vendor/faiss/tests/test_transfer_invlists.cpp +0 -159
  180. data/vendor/faiss/tutorial/cpp/1-Flat.cpp +0 -104
  181. data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +0 -85
  182. data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +0 -98
  183. data/vendor/faiss/tutorial/cpp/4-GPU.cpp +0 -122
  184. data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +0 -104
@@ -242,16 +242,47 @@ BufferedIOWriter::~BufferedIOWriter()
242
242
 
243
243
 
244
244
  uint32_t fourcc (const char sx[4]) {
245
- assert(4 == strlen(sx));
245
+ FAISS_THROW_IF_NOT (4 == strlen(sx));
246
246
  const unsigned char *x = (unsigned char*)sx;
247
247
  return x[0] | x[1] << 8 | x[2] << 16 | x[3] << 24;
248
248
  }
249
249
 
250
250
  uint32_t fourcc (const std::string & sx) {
251
- assert(sx.length() == 4);
251
+ FAISS_THROW_IF_NOT (sx.length() == 4);
252
252
  const unsigned char *x = (unsigned char*)sx.c_str();
253
253
  return x[0] | x[1] << 8 | x[2] << 16 | x[3] << 24;
254
254
  }
255
255
 
256
/** Decode a fourcc value back into its 4 characters.
 *
 * Inverse of fourcc(): character i of the output is byte i of x, least
 * significant byte first. The bytes are extracted with shifts, so the result
 * does not depend on the host byte order or on the alignment of str.
 *
 * @param x    value to decode
 * @param str  output buffer of 5 chars; str[4] receives the terminating NUL
 */
void fourcc_inv(uint32_t x, char str[5]) {
    for (int i = 0; i < 4; i++) {
        str[i] = (char)((x >> (8 * i)) & 0xff);
    }
    // the terminator goes at index 4: index 5 is past the end of the buffer
    str[4] = 0;
}
260
+
261
+ std::string fourcc_inv(uint32_t x) {
262
+ char str[5];
263
+ fourcc_inv(x, str);
264
+ return std::string(str);
265
+ }
266
+
267
+
268
/** Decode a fourcc value into a string that is safe to print.
 *
 * The 4 bytes of x are visited least significant first (the order in which
 * fourcc() packs them). Bytes in the printable ASCII range 32..126 are copied
 * as-is; every other byte is rendered as a \xNN hexadecimal escape.
 *
 * The bytes are extracted directly from x, so no intermediate fixed-size
 * buffer has to be NUL-terminated.
 */
std::string fourcc_inv_printable(uint32_t x) {
    std::string str;
    for (int i = 0; i < 4; i++) {
        uint8_t c = (uint8_t)((x >> (8 * i)) & 0xff);
        if (32 <= c && c < 127) {
            str += (char)c;
        } else {
            char buf[10];
            // bounded formatting: "\xNN" needs 5 bytes including the NUL
            snprintf(buf, sizeof(buf), "\\x%02x", c);
            str += buf;
        }
    }
    return str;
}
284
+
285
+
286
+
256
287
 
257
288
  } // namespace faiss
@@ -50,7 +50,7 @@ struct IOWriter {
50
50
  // return a file number that can be memory-mapped
51
51
  virtual int fileno ();
52
52
 
53
- virtual ~IOWriter() {}
53
+ virtual ~IOWriter() noexcept(false) {}
54
54
  };
55
55
 
56
56
 
@@ -139,12 +139,17 @@ struct BufferedIOWriter: IOWriter {
139
139
  size_t operator()(const void *ptr, size_t size, size_t nitems) override;
140
140
 
141
141
  // flushes
142
- ~BufferedIOWriter();
142
+ ~BufferedIOWriter() override;
143
143
  };
144
144
 
145
145
  /// cast a 4-character string to a uint32_t that can be written and read easily
146
146
  uint32_t fourcc (const char sx[4]);
147
147
  uint32_t fourcc (const std::string & sx);
148
148
 
149
+ // decoding of fourcc (int32 -> string)
150
+ void fourcc_inv(uint32_t x, char str[5]);
151
+ std::string fourcc_inv(uint32_t x);
152
+ std::string fourcc_inv_printable(uint32_t x);
153
+
149
154
 
150
155
  } // namespace faiss
@@ -20,22 +20,8 @@
20
20
  #include <algorithm>
21
21
 
22
22
  #include <faiss/utils/distances.h>
23
+ #include <faiss/impl/platform_macros.h>
23
24
 
24
- #ifdef _MSC_VER
25
-
26
- #include <intrin.h>
27
-
28
- static inline int __builtin_ctzll(uint64_t x) {
29
- unsigned long ret;
30
- _BitScanForward64(&ret, x);
31
- return (int)ret;
32
- }
33
-
34
- static inline int __builtin_clzll(uint64_t x) {
35
- return (int)__lzcnt64(x);
36
- }
37
-
38
- #endif // _MSC_VER
39
25
 
40
26
  namespace faiss {
41
27
 
@@ -7,8 +7,14 @@
7
7
 
8
8
  #pragma once
9
9
 
10
+
10
11
  #ifdef _MSC_VER
11
12
 
13
+ /*******************************************************
14
+ * Windows specific macros
15
+ *******************************************************/
16
+
17
+
12
18
  #ifdef FAISS_MAIN_LIB
13
19
  #define FAISS_API __declspec(dllexport)
14
20
  #else // _FAISS_MAIN_LIB
@@ -17,8 +23,46 @@
17
23
 
18
24
  #define __PRETTY_FUNCTION__ __FUNCSIG__
19
25
 
26
+ #define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno)
27
+ #define posix_memalign_free _aligned_free
28
+
29
+ // aligned should be in front of the declaration
30
+ #define ALIGNED(x) __declspec(align(x))
31
+
32
+ // redefine the GCC intrinsics with Windows equivalents
33
+
34
+ #include <intrin.h>
35
+
36
+ inline int __builtin_ctzll(uint64_t x) {
37
+ unsigned long ret;
38
+ _BitScanForward64(&ret, x);
39
+ return (int)ret;
40
+ }
41
+
42
+ inline int __builtin_ctz(unsigned long x) {
43
+ unsigned long ret;
44
+ _BitScanForward(&ret, x);
45
+ return (int)ret;
46
+ }
47
+
48
+ inline int __builtin_clzll(uint64_t x) {
49
+ return (int)__lzcnt64(x);
50
+ }
51
+
52
+ #define __builtin_popcountl __popcnt64
53
+
20
54
  #else
55
+ /*******************************************************
56
+ * Linux and OSX
57
+ *******************************************************/
21
58
 
22
59
  #define FAISS_API
60
+ #define posix_memalign_free free
61
+
62
+ // aligned should be *in front* of the declaration, for compatibility with windows
63
+ #define ALIGNED(x) __attribute__ ((aligned(x)))
23
64
 
24
65
  #endif // _MSC_VER
66
+
67
+
68
+
@@ -0,0 +1,272 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ #include <faiss/impl/pq4_fast_scan.h>
9
+ #include <faiss/impl/FaissAssert.h>
10
+ #include <faiss/impl/simd_result_handlers.h>
11
+
12
+ #include <array>
13
+
14
+
15
+ namespace faiss {
16
+
17
+
18
+ using namespace simd_result_handlers;
19
+
20
+
21
+
22
+ /***************************************************************
23
+ * Packing functions for codes
24
+ ***************************************************************/
25
+
26
+
27
+
28
+ namespace {
29
+
30
/* Copy dest.size() consecutive entries of column j, starting at row i, out of
 * the row-major matrix src of size (m, n). Rows that fall outside [0, m) --
 * i may be negative -- are written as 0. */
template <typename T, class TA>
void get_matrix_column(
        T* src,
        size_t m,
        size_t n,
        int64_t i,
        int64_t j,
        TA& dest) {
    const int64_t count = (int64_t)dest.size();
    for (int64_t k = 0; k < count; k++) {
        const int64_t row = i + k;
        const bool inside = row >= 0 && (size_t)row < m;
        if (inside) {
            dest[k] = src[row * n + j];
        } else {
            dest[k] = 0;
        }
    }
}
46
+
47
+ } // anonymous namespace
48
+
49
+
50
+ void pq4_pack_codes(
51
+ const uint8_t *codes,
52
+ size_t ntotal, size_t M,
53
+ size_t nb, size_t bbs, size_t nsq,
54
+ uint8_t *blocks
55
+ )
56
+ {
57
+ FAISS_THROW_IF_NOT(bbs % 32 == 0);
58
+ FAISS_THROW_IF_NOT(nb % bbs == 0);
59
+ FAISS_THROW_IF_NOT(nsq % 2 == 0);
60
+
61
+ memset(blocks, 0, nb * nsq / 2);
62
+ const uint8_t perm0[16] =
63
+ {0, 8, 1, 9, 2, 10, 3, 11,
64
+ 4, 12, 5, 13, 6, 14, 7, 15};
65
+
66
+ uint8_t *codes2 = blocks;
67
+ for(size_t i0 = 0; i0 < nb; i0 += bbs) {
68
+ for(int sq = 0; sq < nsq; sq += 2) {
69
+ for(size_t i = 0; i < bbs; i += 32) {
70
+ std::array<uint8_t, 32> c, c0, c1;
71
+ get_matrix_column(
72
+ codes, ntotal,
73
+ (M + 1) / 2,
74
+ i0 + i, sq / 2, c
75
+ );
76
+ for(int j = 0; j < 32; j++) {
77
+ c0[j] = c[j] & 15;
78
+ c1[j] = c[j] >> 4;
79
+ }
80
+ for(int j = 0; j < 16; j++) {
81
+ uint8_t d0, d1;
82
+ d0 = c0[perm0[j]] | (c0[perm0[j] + 16] << 4);
83
+ d1 = c1[perm0[j]] | (c1[perm0[j] + 16] << 4);
84
+ codes2[j] = d0;
85
+ codes2[j + 16] = d1;
86
+ }
87
+ codes2 += 32;
88
+ }
89
+ }
90
+ }
91
+ }
92
+
93
+ void pq4_pack_codes_range(
94
+ const uint8_t *codes,
95
+ size_t M,
96
+ size_t i0, size_t i1,
97
+ size_t bbs, size_t M2,
98
+ uint8_t * blocks
99
+ ) {
100
+ const uint8_t perm0[16] =
101
+ {0, 8, 1, 9, 2, 10, 3, 11,
102
+ 4, 12, 5, 13, 6, 14, 7, 15};
103
+
104
+ // range of affected blocks
105
+ size_t block0 = i0 / bbs;
106
+ size_t block1 = ((i1 - 1) / bbs) + 1;
107
+
108
+ for (size_t b = block0; b < block1; b++) {
109
+ uint8_t *codes2 = blocks + b * bbs * M2 / 2;
110
+ int64_t i_base = b * bbs - i0;
111
+ for(int sq = 0; sq < M2; sq += 2) {
112
+ for(size_t i = 0; i < bbs; i += 32) {
113
+ std::array<uint8_t, 32> c, c0, c1;
114
+ get_matrix_column(
115
+ codes, i1 - i0,
116
+ (M + 1) / 2,
117
+ i_base + i, sq / 2, c
118
+ );
119
+ for(int j = 0; j < 32; j++) {
120
+ c0[j] = c[j] & 15;
121
+ c1[j] = c[j] >> 4;
122
+ }
123
+ for(int j = 0; j < 16; j++) {
124
+ uint8_t d0, d1;
125
+ d0 = c0[perm0[j]] | (c0[perm0[j] + 16] << 4);
126
+ d1 = c1[perm0[j]] | (c1[perm0[j] + 16] << 4);
127
+ codes2[j] |= d0;
128
+ codes2[j + 16] |= d1;
129
+ }
130
+ codes2 += 32;
131
+ }
132
+ }
133
+ }
134
+
135
+ }
136
+
137
+
138
/** Read one 4-bit code back from a blocked code array.
 *
 * The byte is located in three steps: the 32-byte group is found from the
 * block (i / bbs), the sub-quantizer pair (sq / 2) and the position of i
 * inside its block; an odd sq selects the second half of the group; the
 * lane of i inside the group selects the byte and which nibble to return.
 *
 * @param data  blocked code array
 * @param bbs   block size
 * @param nsq   number of sub-quantizers of the blocked array
 * @param i     vector id
 * @param sq    sub-quantizer
 * @return the code, in 0..15
 */
uint8_t pq4_get_packed_element(
        const uint8_t* data,
        size_t bbs,
        size_t nsq,
        size_t i,
        size_t sq) {
    // byte holding lane v (v < 16) in its low nibble and lane v + 16 in its
    // high nibble
    static const uint8_t byte_of_lane[16] = {
            0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15};

    const size_t block = i / bbs;
    const size_t in_block = i % bbs;
    const size_t lane = in_block % 32;

    const uint8_t* group = data + (block * (nsq / 2) + sq / 2) * bbs +
            (in_block / 32) * 32 + ((sq & 1) == 1 ? 16 : 0);

    const uint8_t packed = group[byte_of_lane[lane % 16]];
    return lane < 16 ? (packed & 15) : (packed >> 4);
}
164
+
165
+ /***************************************************************
166
+ * Packing functions for Look-Up Tables (LUT)
167
+ ***************************************************************/
168
+
169
+
170
+
171
+
172
/** Reorder look-up tables from query-major to sub-quantizer-pair-major.
 *
 * In src, table (q, sq) is the 16 bytes at offset (q * nsq + sq) * 16. In
 * dest, the two tables of pair p = sq / 2 for query q are the 32 bytes at
 * offset (p * nq + q) * 32. The two tables of a pair are adjacent in src,
 * so each pair is moved with a single 32-byte copy.
 *
 * @param nq    number of queries
 * @param nsq   number of sub-quantizers
 * @param src   input tables
 * @param dest  output tables
 */
void pq4_pack_LUT(int nq, int nsq, const uint8_t* src, uint8_t* dest) {
    for (int pair = 0; 2 * pair < nsq; pair++) {
        for (int q = 0; q < nq; q++) {
            memcpy(dest + (pair * nq + q) * 32,
                   src + (q * nsq + 2 * pair) * 16,
                   32);
        }
    }
}
193
+
194
+
195
+ int pq4_pack_LUT_qbs(
196
+ int qbs, int nsq,
197
+ const uint8_t *src,
198
+ uint8_t *dest)
199
+ {
200
+ FAISS_THROW_IF_NOT(nsq % 2 == 0);
201
+ size_t dim12 = 16 * nsq;
202
+ int i0 = 0;
203
+ int qi = qbs;
204
+ while(qi) {
205
+ int nq = qi & 15;
206
+ qi >>= 4;
207
+ pq4_pack_LUT(
208
+ nq, nsq,
209
+ src + i0 * dim12,
210
+ dest + i0 * dim12
211
+ );
212
+ i0 += nq;
213
+ }
214
+ return i0;
215
+ }
216
+
217
+
218
+ namespace {
219
+
220
/** Same reordering as pq4_pack_LUT, with the source query taken from q_map.
 *
 * Output slot qi is filled from source query q_map[qi]: the two tables of
 * each sub-quantizer pair are adjacent in src and are moved as one 32-byte
 * copy to offset (pair * nq + qi) * 32 of dest.
 *
 * @param nq     number of queries to pack
 * @param q_map  source query index for each of the nq output slots
 * @param nsq    number of sub-quantizers
 * @param src    input tables, 16 bytes per (query, sub-quantizer)
 * @param dest   output tables
 */
void pack_LUT_1_q_map(
        int nq,
        const int* q_map,
        int nsq,
        const uint8_t* src,
        uint8_t* dest) {
    for (int qi = 0; qi < nq; qi++) {
        const uint8_t* query_tables = src + q_map[qi] * nsq * 16;
        for (int sq = 0; sq < nsq; sq += 2) {
            uint8_t* slot = dest + (sq / 2 * nq + qi) * 32;
            memcpy(slot, query_tables + sq * 16, 32);
        }
    }
}
244
+
245
+ } // anonymous namespace
246
+
247
+ int pq4_pack_LUT_qbs_q_map(
248
+ int qbs, int nsq,
249
+ const uint8_t *src,
250
+ const int * q_map,
251
+ uint8_t *dest)
252
+ {
253
+ FAISS_THROW_IF_NOT(nsq % 2 == 0);
254
+ size_t dim12 = 16 * nsq;
255
+ int i0 = 0;
256
+ int qi = qbs;
257
+ while(qi) {
258
+ int nq = qi & 15;
259
+ qi >>= 4;
260
+ pack_LUT_1_q_map(
261
+ nq, q_map + i0, nsq,
262
+ src,
263
+ dest + i0 * dim12
264
+ );
265
+ i0 += nq;
266
+ }
267
+ return i0;
268
+ }
269
+
270
+
271
+
272
+ } // namespace faiss
@@ -0,0 +1,169 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ #pragma once
9
+
10
+ #include <cstdint>
11
+ #include <cstdlib>
12
+
13
+ /** PQ4 SIMD packing and accumulation functions
14
+ *
15
+ * The basic kernel accumulates nq query vectors with bbs = nb * 2 * 16 vectors
16
+ * and produces an output matrix for that. It is interesting for nq * nb <= 4,
17
+ * otherwise register spilling becomes too large.
18
+ *
19
+ * The implementation of these functions is spread over 3 cpp files to reduce
20
+ * parallel compile times. Templates are instantiated explicitly.
21
+ */
22
+
23
+
24
+ namespace faiss {
25
+
26
+
27
+ /** Pack codes for consumption by the SIMD kernels.
28
+ * The unused bytes are set to 0.
29
+ *
30
+ * @param codes input codes, size (ntotal, ceil(M / 2))
31
+ * @param ntotal number of input codes
32
+ * @param nb output number of codes (ntotal rounded up to a multiple of
33
+ * bbs)
34
+ * @param M2 number of sub-quantizers (=M rounded up to a multiple of 2)
35
+ * @param bbs size of database blocks (multiple of 32)
36
+ * @param blocks output array, size nb * M2 / 2.
37
+ */
38
+ void pq4_pack_codes(
39
+ const uint8_t *codes,
40
+ size_t ntotal, size_t M,
41
+ size_t nb, size_t bbs, size_t M2,
42
+ uint8_t * blocks
43
+ );
44
+
45
+ /** Same as pack_codes but write in a given range of the output,
46
+ * leaving the rest untouched. Assumes allocated entries are 0 on input.
47
+ *
48
+ * @param codes input codes, size (i1 - i0, ceil(M / 2))
49
+ * @param i0 first output code to write
50
+ * @param i1 one past the last output code to write (the range is [i0, i1))
51
+ * @param blocks output array, size at least ceil(i1 / bbs) * bbs * M2 / 2
52
+ */
53
+ void pq4_pack_codes_range(
54
+ const uint8_t *codes,
55
+ size_t M,
56
+ size_t i0, size_t i1,
57
+ size_t bbs, size_t M2,
58
+ uint8_t * blocks
59
+ );
60
+
61
+ /** get a single element from a packed codes table
62
+ *
63
+ * @param i vector id
64
+ * @param sq subquantizer (< nsq)
65
+ */
66
+ uint8_t pq4_get_packed_element(
67
+ const uint8_t *data, size_t bbs, size_t nsq,
68
+ size_t i, size_t sq
69
+ );
70
+
71
+ /** Pack Look-up table for consumption by the kernel.
72
+ *
73
+ * @param nq number of queries
74
+ * @param nsq number of sub-quantizers (multiple of 2)
75
+ * @param src input array, size (nq, nsq, 16)
76
+ * @param dest output array, size (nsq / 2, nq, 32)
77
+ */
78
+ void pq4_pack_LUT(
79
+ int nq, int nsq,
80
+ const uint8_t *src,
81
+ uint8_t *dest
82
+ );
83
+
84
+
85
+
86
+ /** Loop over database elements and accumulate results into result handler
87
+ *
88
+ * @param nq number of queries
89
+ * @param nb number of database elements
90
+ * @param bbs size of database blocks (multiple of 32)
91
+ * @param nsq number of sub-quantizers (multiple of 2)
92
+ * @param codes packed codes array
93
+ * @param LUT packed look-up table
94
+ */
95
+ template<class ResultHandler>
96
+ void pq4_accumulate_loop(
97
+ int nq,
98
+ size_t nb, int bbs,
99
+ int nsq,
100
+ const uint8_t *codes,
101
+ const uint8_t *LUT,
102
+ ResultHandler & res);
103
+
104
+
105
+
106
+ /* qbs versions, supported only for bbs=32.
107
+ *
108
+ * The kernel function runs the kernel for *several* query blocks
109
+ * and bbs database vectors. The sizes of the blocks are encoded in qbs as
110
+ * base-16 digits.
111
+ *
112
+ * For example, qbs = 0x1223 means that the kernel will be run 4 times, the
113
+ * first time with 3 query vectors, second time with 2 query vectors, then 2
114
+ * vectors again and finally with 1 query vector. The output block will thus be
115
+ * nq = 3 + 2 + 2 + 1 = 8 queries. For a given total block size, the optimal
116
+ * decomposition into sub-blocks (measured empirically) is given by
117
+ * preferred_qbs().
118
+ */
119
+
120
+
121
+ /* compute the number of queries from a base-16 decomposition */
122
+ int pq4_qbs_to_nq(int qbs);
123
+
124
+ /** return the preferred decomposition in blocks for a nb of queries. */
125
+ int pq4_preferred_qbs(int nq);
126
+
127
+ /** Pack Look-up table for consumption by the kernel.
128
+ *
129
+ * @param qbs 4-bit encoded number of query blocks, the total number of
130
+ * queries handled (nq) is deduced from it
131
+ * @param nsq number of sub-quantizers (multiple of 2)
132
+ * @param src input array, size (nq, nsq, 16)
133
+ * @param dest output array, nq * nsq * 16 bytes, packed one query block after the other
134
+ * @return nq
135
+ */
136
+ int pq4_pack_LUT_qbs(
137
+ int fqbs, int nsq,
138
+ const uint8_t *src,
139
+ uint8_t *dest
140
+ );
141
+
142
+ /** Same as pq4_pack_LUT_qbs, except the source vectors are remapped with q_map */
143
+ int pq4_pack_LUT_qbs_q_map(
144
+ int qbs, int nsq,
145
+ const uint8_t *src,
146
+ const int * q_map,
147
+ uint8_t *dest);
148
+
149
+ /** Run accumulation loop.
150
+ *
151
+ * @param qbs 4-bit encoded number of queries
152
+ * @param nb number of database codes (multiple of bbs)
153
+ * @param nsq number of sub-quantizers
154
+ * @param codes encoded database vectors (packed)
155
+ * @param LUT look-up table (packed)
156
+ * @param res call-back for the results
157
+ */
158
+ template<class ResultHandler>
159
+ void pq4_accumulate_loop_qbs(
160
+ int qbs,
161
+ size_t nb,
162
+ int nsq,
163
+ const uint8_t *codes,
164
+ const uint8_t *LUT,
165
+ ResultHandler & res);
166
+
167
+
168
+
169
+ } // namespace faiss