faiss 0.1.3 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (199) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +25 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +16 -4
  5. data/ext/faiss/ext.cpp +12 -308
  6. data/ext/faiss/extconf.rb +6 -3
  7. data/ext/faiss/index.cpp +189 -0
  8. data/ext/faiss/index_binary.cpp +75 -0
  9. data/ext/faiss/kmeans.cpp +40 -0
  10. data/ext/faiss/numo.hpp +867 -0
  11. data/ext/faiss/pca_matrix.cpp +33 -0
  12. data/ext/faiss/product_quantizer.cpp +53 -0
  13. data/ext/faiss/utils.cpp +13 -0
  14. data/ext/faiss/utils.h +5 -0
  15. data/lib/faiss.rb +0 -5
  16. data/lib/faiss/version.rb +1 -1
  17. data/vendor/faiss/faiss/AutoTune.cpp +36 -33
  18. data/vendor/faiss/faiss/AutoTune.h +6 -3
  19. data/vendor/faiss/faiss/Clustering.cpp +16 -12
  20. data/vendor/faiss/faiss/Index.cpp +3 -4
  21. data/vendor/faiss/faiss/Index.h +3 -3
  22. data/vendor/faiss/faiss/IndexBinary.cpp +3 -4
  23. data/vendor/faiss/faiss/IndexBinary.h +1 -1
  24. data/vendor/faiss/faiss/IndexBinaryHash.cpp +2 -12
  25. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +1 -2
  26. data/vendor/faiss/faiss/IndexFlat.cpp +0 -148
  27. data/vendor/faiss/faiss/IndexFlat.h +0 -51
  28. data/vendor/faiss/faiss/IndexHNSW.cpp +4 -5
  29. data/vendor/faiss/faiss/IndexIVF.cpp +118 -31
  30. data/vendor/faiss/faiss/IndexIVF.h +22 -15
  31. data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -3
  32. data/vendor/faiss/faiss/IndexIVFFlat.h +2 -1
  33. data/vendor/faiss/faiss/IndexIVFPQ.cpp +39 -15
  34. data/vendor/faiss/faiss/IndexIVFPQ.h +25 -9
  35. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +1116 -0
  36. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +166 -0
  37. data/vendor/faiss/faiss/IndexIVFPQR.cpp +8 -9
  38. data/vendor/faiss/faiss/IndexIVFPQR.h +2 -1
  39. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +1 -2
  40. data/vendor/faiss/faiss/IndexPQ.cpp +34 -18
  41. data/vendor/faiss/faiss/IndexPQFastScan.cpp +536 -0
  42. data/vendor/faiss/faiss/IndexPQFastScan.h +111 -0
  43. data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -0
  44. data/vendor/faiss/faiss/IndexPreTransform.h +2 -0
  45. data/vendor/faiss/faiss/IndexRefine.cpp +256 -0
  46. data/vendor/faiss/faiss/IndexRefine.h +73 -0
  47. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +2 -2
  48. data/vendor/faiss/faiss/IndexScalarQuantizer.h +1 -1
  49. data/vendor/faiss/faiss/gpu/GpuDistance.h +1 -1
  50. data/vendor/faiss/faiss/gpu/GpuIndex.h +16 -9
  51. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +8 -1
  52. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +11 -11
  53. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +19 -2
  54. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +28 -2
  55. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +24 -14
  56. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +29 -2
  57. data/vendor/faiss/faiss/gpu/GpuResources.h +4 -0
  58. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +60 -27
  59. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +28 -6
  60. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +547 -0
  61. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +51 -0
  62. data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +3 -3
  63. data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +3 -2
  64. data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +274 -0
  65. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +7 -2
  66. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +5 -1
  67. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +231 -0
  68. data/vendor/faiss/faiss/gpu/test/TestUtils.h +33 -0
  69. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +1 -0
  70. data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +6 -0
  71. data/vendor/faiss/faiss/gpu/utils/Timer.cpp +5 -6
  72. data/vendor/faiss/faiss/gpu/utils/Timer.h +2 -2
  73. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +5 -4
  74. data/vendor/faiss/faiss/impl/HNSW.cpp +2 -4
  75. data/vendor/faiss/faiss/impl/PolysemousTraining.h +4 -4
  76. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +22 -12
  77. data/vendor/faiss/faiss/impl/ProductQuantizer.h +2 -0
  78. data/vendor/faiss/faiss/impl/ResultHandler.h +452 -0
  79. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +29 -19
  80. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +6 -0
  81. data/vendor/faiss/faiss/impl/index_read.cpp +64 -96
  82. data/vendor/faiss/faiss/impl/index_write.cpp +34 -25
  83. data/vendor/faiss/faiss/impl/io.cpp +33 -2
  84. data/vendor/faiss/faiss/impl/io.h +7 -2
  85. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -15
  86. data/vendor/faiss/faiss/impl/platform_macros.h +44 -0
  87. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +272 -0
  88. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +169 -0
  89. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +180 -0
  90. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +354 -0
  91. data/vendor/faiss/faiss/impl/simd_result_handlers.h +559 -0
  92. data/vendor/faiss/faiss/index_factory.cpp +112 -7
  93. data/vendor/faiss/faiss/index_io.h +1 -48
  94. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +151 -0
  95. data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +76 -0
  96. data/vendor/faiss/faiss/{DirectMap.cpp → invlists/DirectMap.cpp} +1 -1
  97. data/vendor/faiss/faiss/{DirectMap.h → invlists/DirectMap.h} +1 -1
  98. data/vendor/faiss/faiss/{InvertedLists.cpp → invlists/InvertedLists.cpp} +72 -1
  99. data/vendor/faiss/faiss/{InvertedLists.h → invlists/InvertedLists.h} +32 -1
  100. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +107 -0
  101. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +63 -0
  102. data/vendor/faiss/faiss/{OnDiskInvertedLists.cpp → invlists/OnDiskInvertedLists.cpp} +21 -6
  103. data/vendor/faiss/faiss/{OnDiskInvertedLists.h → invlists/OnDiskInvertedLists.h} +5 -2
  104. data/vendor/faiss/faiss/python/python_callbacks.h +8 -1
  105. data/vendor/faiss/faiss/utils/AlignedTable.h +141 -0
  106. data/vendor/faiss/faiss/utils/Heap.cpp +2 -4
  107. data/vendor/faiss/faiss/utils/Heap.h +61 -50
  108. data/vendor/faiss/faiss/utils/distances.cpp +164 -319
  109. data/vendor/faiss/faiss/utils/distances.h +28 -20
  110. data/vendor/faiss/faiss/utils/distances_simd.cpp +277 -49
  111. data/vendor/faiss/faiss/utils/extra_distances.cpp +1 -2
  112. data/vendor/faiss/faiss/utils/hamming-inl.h +4 -4
  113. data/vendor/faiss/faiss/utils/hamming.cpp +3 -6
  114. data/vendor/faiss/faiss/utils/hamming.h +2 -7
  115. data/vendor/faiss/faiss/utils/ordered_key_value.h +98 -0
  116. data/vendor/faiss/faiss/utils/partitioning.cpp +1256 -0
  117. data/vendor/faiss/faiss/utils/partitioning.h +69 -0
  118. data/vendor/faiss/faiss/utils/quantize_lut.cpp +277 -0
  119. data/vendor/faiss/faiss/utils/quantize_lut.h +80 -0
  120. data/vendor/faiss/faiss/utils/simdlib.h +31 -0
  121. data/vendor/faiss/faiss/utils/simdlib_avx2.h +461 -0
  122. data/vendor/faiss/faiss/utils/simdlib_emulated.h +589 -0
  123. metadata +54 -149
  124. data/lib/faiss/index.rb +0 -20
  125. data/lib/faiss/index_binary.rb +0 -20
  126. data/lib/faiss/kmeans.rb +0 -15
  127. data/lib/faiss/pca_matrix.rb +0 -15
  128. data/lib/faiss/product_quantizer.rb +0 -22
  129. data/vendor/faiss/benchs/bench_6bit_codec.cpp +0 -80
  130. data/vendor/faiss/c_api/AutoTune_c.cpp +0 -83
  131. data/vendor/faiss/c_api/AutoTune_c.h +0 -66
  132. data/vendor/faiss/c_api/Clustering_c.cpp +0 -145
  133. data/vendor/faiss/c_api/Clustering_c.h +0 -123
  134. data/vendor/faiss/c_api/IndexFlat_c.cpp +0 -140
  135. data/vendor/faiss/c_api/IndexFlat_c.h +0 -115
  136. data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +0 -64
  137. data/vendor/faiss/c_api/IndexIVFFlat_c.h +0 -58
  138. data/vendor/faiss/c_api/IndexIVF_c.cpp +0 -99
  139. data/vendor/faiss/c_api/IndexIVF_c.h +0 -142
  140. data/vendor/faiss/c_api/IndexLSH_c.cpp +0 -37
  141. data/vendor/faiss/c_api/IndexLSH_c.h +0 -40
  142. data/vendor/faiss/c_api/IndexPreTransform_c.cpp +0 -21
  143. data/vendor/faiss/c_api/IndexPreTransform_c.h +0 -32
  144. data/vendor/faiss/c_api/IndexShards_c.cpp +0 -38
  145. data/vendor/faiss/c_api/IndexShards_c.h +0 -39
  146. data/vendor/faiss/c_api/Index_c.cpp +0 -105
  147. data/vendor/faiss/c_api/Index_c.h +0 -183
  148. data/vendor/faiss/c_api/MetaIndexes_c.cpp +0 -49
  149. data/vendor/faiss/c_api/MetaIndexes_c.h +0 -49
  150. data/vendor/faiss/c_api/clone_index_c.cpp +0 -23
  151. data/vendor/faiss/c_api/clone_index_c.h +0 -32
  152. data/vendor/faiss/c_api/error_c.h +0 -42
  153. data/vendor/faiss/c_api/error_impl.cpp +0 -27
  154. data/vendor/faiss/c_api/error_impl.h +0 -16
  155. data/vendor/faiss/c_api/faiss_c.h +0 -58
  156. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +0 -98
  157. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +0 -56
  158. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +0 -52
  159. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +0 -68
  160. data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +0 -17
  161. data/vendor/faiss/c_api/gpu/GpuIndex_c.h +0 -30
  162. data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +0 -38
  163. data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +0 -86
  164. data/vendor/faiss/c_api/gpu/GpuResources_c.h +0 -66
  165. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +0 -54
  166. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +0 -53
  167. data/vendor/faiss/c_api/gpu/macros_impl.h +0 -42
  168. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +0 -220
  169. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +0 -149
  170. data/vendor/faiss/c_api/index_factory_c.cpp +0 -26
  171. data/vendor/faiss/c_api/index_factory_c.h +0 -30
  172. data/vendor/faiss/c_api/index_io_c.cpp +0 -42
  173. data/vendor/faiss/c_api/index_io_c.h +0 -50
  174. data/vendor/faiss/c_api/macros_impl.h +0 -110
  175. data/vendor/faiss/demos/demo_imi_flat.cpp +0 -154
  176. data/vendor/faiss/demos/demo_imi_pq.cpp +0 -203
  177. data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +0 -151
  178. data/vendor/faiss/demos/demo_sift1M.cpp +0 -252
  179. data/vendor/faiss/demos/demo_weighted_kmeans.cpp +0 -185
  180. data/vendor/faiss/misc/test_blas.cpp +0 -87
  181. data/vendor/faiss/tests/test_binary_flat.cpp +0 -62
  182. data/vendor/faiss/tests/test_dealloc_invlists.cpp +0 -188
  183. data/vendor/faiss/tests/test_ivfpq_codec.cpp +0 -70
  184. data/vendor/faiss/tests/test_ivfpq_indexing.cpp +0 -100
  185. data/vendor/faiss/tests/test_lowlevel_ivf.cpp +0 -573
  186. data/vendor/faiss/tests/test_merge.cpp +0 -260
  187. data/vendor/faiss/tests/test_omp_threads.cpp +0 -14
  188. data/vendor/faiss/tests/test_ondisk_ivf.cpp +0 -225
  189. data/vendor/faiss/tests/test_pairs_decoding.cpp +0 -193
  190. data/vendor/faiss/tests/test_params_override.cpp +0 -236
  191. data/vendor/faiss/tests/test_pq_encoding.cpp +0 -98
  192. data/vendor/faiss/tests/test_sliding_ivf.cpp +0 -246
  193. data/vendor/faiss/tests/test_threaded_index.cpp +0 -253
  194. data/vendor/faiss/tests/test_transfer_invlists.cpp +0 -159
  195. data/vendor/faiss/tutorial/cpp/1-Flat.cpp +0 -104
  196. data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +0 -85
  197. data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +0 -98
  198. data/vendor/faiss/tutorial/cpp/4-GPU.cpp +0 -122
  199. data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +0 -104
@@ -41,8 +41,22 @@ class StandardGpuResourcesImpl : public GpuResources {
41
41
  /// transfers
42
42
  void setPinnedMemory(size_t size);
43
43
 
44
- /// Called to change the stream for work ordering
45
- void setDefaultStream(int device, cudaStream_t stream);
44
+ /// Called to change the stream for work ordering. We do not own `stream`;
45
+ /// i.e., it will not be destroyed when the GpuResources object gets cleaned
46
+ /// up.
47
+ /// We are guaranteed that all Faiss GPU work is ordered with respect to
48
+ /// this stream upon exit from an index or other Faiss GPU call.
49
+ void setDefaultStream(int device, cudaStream_t stream) override;
50
+
51
+ /// Revert the default stream to the original stream managed by this resources
52
+ /// object, in case someone called `setDefaultStream`.
53
+ void revertDefaultStream(int device);
54
+
55
+ /// Returns the stream for the given device on which all Faiss GPU work is
56
+ /// ordered.
57
+ /// We are guaranteed that all Faiss GPU work is ordered with respect to
58
+ /// this stream upon exit from an index or other Faiss GPU call.
59
+ cudaStream_t getDefaultStream(int device) override;
46
60
 
47
61
  /// Called to change the work ordering streams to the null stream
48
62
  /// for all devices
@@ -60,8 +74,6 @@ class StandardGpuResourcesImpl : public GpuResources {
60
74
 
61
75
  cublasHandle_t getBlasHandle(int device) override;
62
76
 
63
- cudaStream_t getDefaultStream(int device) override;
64
-
65
77
  std::vector<cudaStream_t> getAlternateStreams(int device) override;
66
78
 
67
79
  /// Allocate non-temporary GPU memory
@@ -128,7 +140,9 @@ class StandardGpuResourcesImpl : public GpuResources {
128
140
  };
129
141
 
130
142
  /// Default implementation of GpuResources that allocates a cuBLAS
131
- /// stream and 2 streams for use, as well as temporary memory
143
+ /// stream and 2 streams for use, as well as temporary memory.
144
+ /// Internally, the Faiss GPU code uses the instance managed by getResources,
145
+ /// but this is the user-facing object that is internally reference counted.
132
146
  class StandardGpuResources : public GpuResourcesProvider {
133
147
  public:
134
148
  StandardGpuResources();
@@ -151,9 +165,17 @@ class StandardGpuResources : public GpuResourcesProvider {
151
165
  /// transfers
152
166
  void setPinnedMemory(size_t size);
153
167
 
154
- /// Called to change the stream for work ordering
168
+ /// Called to change the stream for work ordering. We do not own `stream`;
169
+ /// i.e., it will not be destroyed when the GpuResources object gets cleaned
170
+ /// up.
171
+ /// We are guaranteed that all Faiss GPU work is ordered with respect to
172
+ /// this stream upon exit from an index or other Faiss GPU call.
155
173
  void setDefaultStream(int device, cudaStream_t stream);
156
174
 
175
+ /// Revert the default stream to the original stream managed by this resources
176
+ /// object, in case someone called `setDefaultStream`.
177
+ void revertDefaultStream(int device);
178
+
157
179
  /// Called to change the work ordering streams to the null stream
158
180
  /// for all devices
159
181
  void setDefaultNullStreamAllDevices();
@@ -0,0 +1,547 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ #include <faiss/gpu/impl/InterleavedCodes.h>
9
+ #include <faiss/impl/FaissAssert.h>
10
+ #include <faiss/gpu/utils/StaticUtils.h>
11
+
12
+ namespace faiss { namespace gpu {
13
+
14
/// Extracts the i-th 5-bit code from a bit-packed stream.
///
/// Eight 5-bit codes occupy five bytes. Layout, one row per byte, bits listed
/// lsb ... msb, digits are the index of the code that owns the bit:
///   0: 0 0 0 0 0 1 1 1
///   1: 1 1 2 2 2 2 2 3
///   2: 3 3 3 3 4 4 4 4
///   3: 4 5 5 5 5 5 6 6
///   4: 6 6 6 7 7 7 7 7
///
/// @param i      index of the code; only `i % 8` is used
/// @param vLower byte that holds the first (least significant) bit of the code
/// @param vUpper byte following vLower; its low bits supply the remaining
///               high bits of the code when the code crosses the byte boundary
/// @return the code, in the 5 least significant bits of the result
inline uint8_t unpack5(int i, uint8_t vLower, uint8_t vUpper) {
    // Bit position inside vLower at which each of the eight codes starts,
    // i.e. (slot * 5) % 8.
    static const int kStartBit[8] = {0, 5, 2, 7, 4, 1, 6, 3};

    const int slot = i % 8;
    if (slot < 0) {
        // A negative index selects none of the eight slots; the result is 0.
        return 0;
    }

    // View the two bytes as a single 16-bit window (vLower in the low half)
    // so that a code crossing the byte boundary comes out with one shift.
    const unsigned window = static_cast<unsigned>(vLower) |
            (static_cast<unsigned>(vUpper) << 8);

    return static_cast<uint8_t>((window >> kStartBit[slot]) & 0x1f);
}
64
+
65
/// Extracts the i-th 6-bit code from a bit-packed stream.
///
/// Four 6-bit codes occupy three bytes. Layout, one row per byte, bits listed
/// lsb ... msb, digits are the index of the code that owns the bit:
///   0: 0 0 0 0 0 0 1 1
///   1: 1 1 1 1 2 2 2 2
///   2: 2 2 3 3 3 3 3 3
///
/// @param i      index of the code; only `i % 4` is used
/// @param vLower byte that holds the first (least significant) bit of the code
/// @param vUpper byte following vLower; its low bits supply the remaining
///               high bits of the code when the code crosses the byte boundary
/// @return the code, in the 6 least significant bits of the result
inline uint8_t unpack6(int i, uint8_t vLower, uint8_t vUpper) {
    // Bit position inside vLower at which each of the four codes starts,
    // i.e. (slot * 6) % 8.
    static const int kStartBit[4] = {0, 6, 4, 2};

    const int slot = i % 4;
    if (slot < 0) {
        // A negative index selects none of the four slots; the result is 0.
        return 0;
    }

    // View the two bytes as a single 16-bit window (vLower in the low half)
    // so that a code crossing the byte boundary comes out with one shift.
    const unsigned window = static_cast<unsigned>(vLower) |
            (static_cast<unsigned>(vUpper) << 8);

    return static_cast<uint8_t>((window >> kStartBit[slot]) & 0x3f);
}
91
+
92
+
93
+ std::vector<uint8_t>
94
+ unpackNonInterleaved(std::vector<uint8_t> data,
95
+ int numVecs,
96
+ int dims,
97
+ int bitsPerCode) {
98
+ int srcVecSize = utils::divUp(dims * bitsPerCode, 8);
99
+ FAISS_ASSERT(data.size() == numVecs * srcVecSize);
100
+
101
+ if (bitsPerCode == 8 ||
102
+ bitsPerCode == 16 ||
103
+ bitsPerCode == 32) {
104
+ // nothing to do
105
+ return data;
106
+ }
107
+
108
+ // bit codes padded to whole bytes
109
+ std::vector<uint8_t> out(numVecs * dims * utils::divUp(bitsPerCode, 8));
110
+
111
+ if (bitsPerCode == 4) {
112
+ #pragma omp parallel for
113
+ for (int i = 0; i < numVecs; ++i) {
114
+ for (int j = 0; j < dims; ++j) {
115
+ int srcIdx = i * srcVecSize + (j / 2);
116
+ FAISS_ASSERT(srcIdx < data.size());
117
+
118
+ uint8_t v = data[srcIdx];
119
+ v = (j % 2 == 0) ? v & 0xf : v >> 4;
120
+
121
+ out[i * dims + j] = v;
122
+ }
123
+ }
124
+ } else if (bitsPerCode == 5) {
125
+ #pragma omp parallel for
126
+ for (int i = 0; i < numVecs; ++i) {
127
+ for (int j = 0; j < dims; ++j) {
128
+ int lo = i * srcVecSize + (j * 5) / 8;
129
+ int hi = lo + 1;
130
+
131
+ FAISS_ASSERT(lo < data.size());
132
+ FAISS_ASSERT(hi <= data.size());
133
+
134
+ auto vLower = data[lo];
135
+ auto vUpper = hi < data.size() ? data[hi] : 0;
136
+
137
+ out[i * dims + j] = unpack5(j, vLower, vUpper);
138
+ }
139
+ }
140
+ } else if (bitsPerCode == 6) {
141
+ #pragma omp parallel for
142
+ for (int i = 0; i < numVecs; ++i) {
143
+ for (int j = 0; j < dims; ++j) {
144
+ int lo = i * srcVecSize + (j * 6) / 8;
145
+ int hi = lo + 1;
146
+
147
+ FAISS_ASSERT(lo < data.size());
148
+ FAISS_ASSERT(hi <= data.size());
149
+
150
+ auto vLower = data[lo];
151
+ auto vUpper = hi < data.size() ? data[hi] : 0;
152
+
153
+ out[i * dims + j] = unpack6(j, vLower, vUpper);
154
+ }
155
+ }
156
+ } else {
157
+ // unhandled
158
+ FAISS_ASSERT(false);
159
+ }
160
+
161
+ return out;
162
+ }
163
+
164
/// Gathers whole-word codes from the interleaved layout into vector-major
/// order.
///
/// In the interleaved layout, vectors are grouped into blocks of 32. Inside a
/// block the codes are stored dimension by dimension: `wordsPerDimBlock` words
/// for dimension 0, then for dimension 1, and so on, with vector `lane` of the
/// block at word `lane` of each dimension group.
///
/// @tparam T          word type holding one code; the callers in this file
///                    use a T that is exactly `bitsPerCode` bits wide
/// @param in          interleaved codes; the caller is expected to supply
///                    ceil(numVecs / 32) complete blocks
/// @param out         receives `numVecs * dims` codes, vector-major
/// @param numVecs     number of vectors to extract
/// @param dims        number of codes per vector
/// @param bitsPerCode bits per code
template <typename T>
void
unpackInterleavedWord(const T* in,
                      T* out,
                      int numVecs,
                      int dims,
                      int bitsPerCode) {
    // Offsets are computed in size_t so that inputs holding more than INT_MAX
    // words in total do not overflow int arithmetic.
    const size_t wordsPerDimBlock = 32 * bitsPerCode / (8 * sizeof(T));
    const size_t wordsPerBlock = wordsPerDimBlock * dims;

    #pragma omp parallel for
    for (int i = 0; i < numVecs; ++i) {
        // i / 32 < ceil(numVecs / 32) holds for every i < numVecs, so the
        // block index needs no separate range check
        const size_t block = static_cast<size_t>(i) / 32;
        const size_t lane = static_cast<size_t>(i) % 32;

        const T* src = in + block * wordsPerBlock + lane;
        T* dst = out + static_cast<size_t>(i) * dims;

        for (int j = 0; j < dims; ++j) {
            dst[j] = src[j * wordsPerDimBlock];
        }
    }
}
187
+
188
+ std::vector<uint8_t>
189
+ unpackInterleaved(std::vector<uint8_t> data,
190
+ int numVecs,
191
+ int dims,
192
+ int bitsPerCode) {
193
+ int bytesPerDimBlock = 32 * bitsPerCode / 8;
194
+ int bytesPerBlock = bytesPerDimBlock * dims;
195
+ int numBlocks = utils::divUp(numVecs, 32);
196
+ size_t totalSize = (size_t) bytesPerBlock * numBlocks;
197
+ FAISS_ASSERT(data.size() == totalSize);
198
+
199
+ // bit codes padded to whole bytes
200
+ std::vector<uint8_t> out(numVecs * dims * utils::divUp(bitsPerCode, 8));
201
+
202
+ if (bitsPerCode == 8) {
203
+ unpackInterleavedWord<uint8_t>(data.data(), out.data(),
204
+ numVecs, dims, bitsPerCode);
205
+ } else if (bitsPerCode == 16) {
206
+ unpackInterleavedWord<uint16_t>((uint16_t*) data.data(),
207
+ (uint16_t*) out.data(),
208
+ numVecs, dims, bitsPerCode);
209
+ } else if (bitsPerCode == 32) {
210
+ unpackInterleavedWord<uint32_t>((uint32_t*) data.data(),
211
+ (uint32_t*) out.data(),
212
+ numVecs, dims, bitsPerCode);
213
+ } else if (bitsPerCode == 4) {
214
+ #pragma omp parallel for
215
+ for (int i = 0; i < numVecs; ++i) {
216
+ int block = i / 32;
217
+ int lane = i % 32;
218
+
219
+ int word = lane / 2;
220
+ int subWord = lane % 2;
221
+
222
+ for (int j = 0; j < dims; ++j) {
223
+ auto v =
224
+ data[block * bytesPerBlock + j * bytesPerDimBlock + word];
225
+
226
+ v = (subWord == 0) ? v & 0xf : v >> 4;
227
+ out[i * dims + j] = v;
228
+ }
229
+ }
230
+ } else if (bitsPerCode == 5) {
231
+ #pragma omp parallel for
232
+ for (int i = 0; i < numVecs; ++i) {
233
+ int block = i / 32;
234
+ int blockVector = i % 32;
235
+
236
+ for (int j = 0; j < dims; ++j) {
237
+ uint8_t* dimBlock =
238
+ &data[block * bytesPerBlock + j * bytesPerDimBlock];
239
+
240
+ int lo = (blockVector * 5) / 8;
241
+ int hi = lo + 1;
242
+
243
+ FAISS_ASSERT(lo < bytesPerDimBlock);
244
+ FAISS_ASSERT(hi <= bytesPerDimBlock);
245
+
246
+ auto vLower = dimBlock[lo];
247
+ auto vUpper = hi < bytesPerDimBlock ? dimBlock[hi] : 0;
248
+
249
+ out[i * dims + j] = unpack5(blockVector, vLower, vUpper);
250
+ }
251
+ }
252
+ } else if (bitsPerCode == 6) {
253
+ #pragma omp parallel for
254
+ for (int i = 0; i < numVecs; ++i) {
255
+ int block = i / 32;
256
+ int blockVector = i % 32;
257
+
258
+ for (int j = 0; j < dims; ++j) {
259
+ uint8_t* dimBlock =
260
+ &data[block * bytesPerBlock + j * bytesPerDimBlock];
261
+
262
+ int lo = (blockVector * 6) / 8;
263
+ int hi = lo + 1;
264
+
265
+ FAISS_ASSERT(lo < bytesPerDimBlock);
266
+ FAISS_ASSERT(hi <= bytesPerDimBlock);
267
+
268
+ auto vLower = dimBlock[lo];
269
+ auto vUpper = hi < bytesPerDimBlock ? dimBlock[hi] : 0;
270
+
271
+ out[i * dims + j] = unpack6(blockVector, vLower, vUpper);
272
+ }
273
+ }
274
+ } else {
275
+ // unimplemented
276
+ FAISS_ASSERT(false);
277
+ }
278
+
279
+ return out;
280
+ }
281
+
282
+ inline uint8_t pack5(int i, uint8_t lo, uint8_t hi, uint8_t hi2) {
283
+ FAISS_ASSERT((lo & 0x1f) == lo);
284
+ FAISS_ASSERT((hi & 0x1f) == hi);
285
+ FAISS_ASSERT((hi2 & 0x1f) == hi2);
286
+
287
+ uint8_t v = 0;
288
+
289
+ // lsb ... msb
290
+ // 0: 0 0 0 0 0 1 1 1
291
+ // 1: 1 1 2 2 2 2 2 3
292
+ // 2: 3 3 3 3 4 4 4 4
293
+ // 3: 4 5 5 5 5 5 6 6
294
+ // 4: 6 6 6 7 7 7 7 7
295
+ switch (i % 5) {
296
+ case 0:
297
+ // 5 msbs of lower as vOut lsbs
298
+ // 3 lsbs of upper as vOut msbs
299
+ v = (lo & 0x1f) | (hi << 5);
300
+ break;
301
+ case 1:
302
+ // 2 msbs of lower as vOut lsbs
303
+ // 5 lsbs of upper as vOut msbs
304
+ // 1 lsbs of upper2 as vOut msb
305
+ v = (lo >> 3) | (hi << 2) | (hi2 << 7);
306
+ break;
307
+ case 2:
308
+ // 4 msbs of lower as vOut lsbs
309
+ // 4 lsbs of upper as vOut msbs
310
+ v = (lo >> 1) | (hi << 4);
311
+ break;
312
+ case 3:
313
+ // 1 msbs of lower as vOut lsbs
314
+ // 5 lsbs of upper as vOut msbs
315
+ // 2 lsbs of upper2 as vOut msb
316
+ v = (lo >> 4) | (hi << 1) | (hi2 << 6);
317
+ break;
318
+ case 4:
319
+ // 3 msbs of lower as vOut lsbs
320
+ // 5 lsbs of upper as vOut msbs
321
+ v = (lo >> 2) | (hi << 3);
322
+ break;
323
+ }
324
+
325
+ return v;
326
+ }
327
+
328
+ inline uint8_t pack6(int i, uint8_t lo, uint8_t hi) {
329
+ FAISS_ASSERT((lo & 0x3f) == lo);
330
+ FAISS_ASSERT((hi & 0x3f) == hi);
331
+
332
+ uint8_t v = 0;
333
+
334
+ // lsb ... msb
335
+ // 0: 0 0 0 0 0 0 1 1
336
+ // 1: 1 1 1 1 2 2 2 2
337
+ // 2: 2 2 3 3 3 3 3 3
338
+ switch (i % 3) {
339
+ case 0:
340
+ // 6 msbs of lower as vOut lsbs
341
+ // 2 lsbs of upper as vOut msbs
342
+ v = (lo & 0x3f) | (hi << 6);
343
+ break;
344
+ case 1:
345
+ // 4 msbs of lower as vOut lsbs
346
+ // 4 lsbs of upper as vOut msbs
347
+ v = (lo >> 2) | (hi << 4);
348
+ break;
349
+ case 2:
350
+ // 2 msbs of lower as vOut lsbs
351
+ // 6 lsbs of upper as vOut msbs
352
+ v = (lo >> 4) | (hi << 2);
353
+ break;
354
+ }
355
+
356
+ return v;
357
+ }
358
+
359
+
360
+ std::vector<uint8_t>
361
+ packNonInterleaved(std::vector<uint8_t> data,
362
+ int numVecs,
363
+ int dims,
364
+ int bitsPerCode) {
365
+ // bit codes padded to whole bytes
366
+ FAISS_ASSERT(data.size() == numVecs * dims * utils::divUp(bitsPerCode, 8));
367
+
368
+ if (bitsPerCode == 8 ||
369
+ bitsPerCode == 16 ||
370
+ bitsPerCode == 32) {
371
+ // nothing to do, whole words are already where they need to be
372
+ return data;
373
+ }
374
+
375
+ // bits packed into a whole number of bytes
376
+ int bytesPerVec = utils::divUp(dims * bitsPerCode, 8);
377
+
378
+ std::vector<uint8_t> out(numVecs * bytesPerVec);
379
+
380
+ if (bitsPerCode == 4) {
381
+ #pragma omp parallel for
382
+ for (int i = 0; i < numVecs; ++i) {
383
+ for (int j = 0; j < bytesPerVec; ++j) {
384
+ int dimLo = j * 2;
385
+ int dimHi = dimLo + 1;
386
+ FAISS_ASSERT(dimLo < dims);
387
+ FAISS_ASSERT(dimHi <= dims);
388
+
389
+ uint8_t lo = data[i * dims + dimLo];
390
+ uint8_t hi = dimHi < dims ? data[i * dims + dimHi] : 0;
391
+
392
+ out[i * bytesPerVec + j] = (hi << 4) | (lo & 0xf);
393
+ }
394
+ }
395
+ } else if (bitsPerCode == 5) {
396
+ #pragma omp parallel for
397
+ for (int i = 0; i < numVecs; ++i) {
398
+ for (int j = 0; j < bytesPerVec; ++j) {
399
+ int dimLo = (j * 8) / 5;
400
+ int dimHi = dimLo + 1;
401
+ int dimHi2 = dimHi + 1;
402
+ FAISS_ASSERT(dimLo < dims);
403
+ FAISS_ASSERT(dimHi <= dims);
404
+ FAISS_ASSERT(dimHi <= dims + 1);
405
+
406
+ uint8_t lo = data[i * dims + dimLo];
407
+ uint8_t hi = dimHi < dims ? data[i * dims + dimHi] : 0;
408
+ uint8_t hi2 = dimHi2 < dims ? data[i * dims + dimHi2] : 0;
409
+
410
+ out[i * bytesPerVec + j] = pack5(j, lo, hi, hi2);
411
+ }
412
+ }
413
+ } else if (bitsPerCode == 6) {
414
+ #pragma omp parallel for
415
+ for (int i = 0; i < numVecs; ++i) {
416
+ for (int j = 0; j < bytesPerVec; ++j) {
417
+ int dimLo = (j * 8) / 6;
418
+ int dimHi = dimLo + 1;
419
+ FAISS_ASSERT(dimLo < dims);
420
+ FAISS_ASSERT(dimHi <= dims);
421
+
422
+ uint8_t lo = data[i * dims + dimLo];
423
+ uint8_t hi = dimHi < dims ? data[i * dims + dimHi] : 0;
424
+
425
+ out[i * bytesPerVec + j] = pack6(j, lo, hi);
426
+ }
427
+ }
428
+ } else {
429
+ // unhandled
430
+ FAISS_ASSERT(false);
431
+ }
432
+
433
+ return out;
434
+ }
435
+
436
/// Scatters whole-word codes from vector-major order into the interleaved
/// layout.
///
/// In the interleaved layout, vectors are grouped into blocks of 32. Inside a
/// block the codes are stored dimension by dimension: `wordsPerDimBlock` words
/// for dimension 0, then for dimension 1, and so on, with vector `lane` of the
/// block at word `lane` of each dimension group.
///
/// Only the slots belonging to the `numVecs` input vectors are written; the
/// remaining slots of a partially filled last block keep whatever `out`
/// already contains (packInterleaved passes a zero-initialized buffer).
///
/// @tparam T          word type holding one code; the callers in this file
///                    use a T that is exactly `bitsPerCode` bits wide
/// @param in          `numVecs * dims` codes, vector-major
/// @param out         destination; the caller is expected to provide room for
///                    ceil(numVecs / 32) complete blocks
/// @param numVecs     number of vectors to store
/// @param dims        number of codes per vector
/// @param bitsPerCode bits per code
template <typename T>
void
packInterleavedWord(const T* in,
                    T* out,
                    int numVecs,
                    int dims,
                    int bitsPerCode) {
    // Offsets are computed in size_t so that outputs holding more than
    // INT_MAX words in total do not overflow int arithmetic.
    const size_t wordsPerDimBlock = 32 * bitsPerCode / (8 * sizeof(T));
    const size_t wordsPerBlock = wordsPerDimBlock * dims;

    #pragma omp parallel for
    for (int i = 0; i < numVecs; ++i) {
        // i / 32 < ceil(numVecs / 32) holds for every i < numVecs, so the
        // block index needs no separate range check
        const size_t block = static_cast<size_t>(i) / 32;
        const size_t lane = static_cast<size_t>(i) % 32;

        const T* src = in + static_cast<size_t>(i) * dims;
        T* dst = out + block * wordsPerBlock + lane;

        for (int j = 0; j < dims; ++j) {
            dst[j * wordsPerDimBlock] = src[j];
        }
    }
}
461
+
462
+ std::vector<uint8_t>
463
+ packInterleaved(std::vector<uint8_t> data,
464
+ int numVecs,
465
+ int dims,
466
+ int bitsPerCode) {
467
+ int bytesPerDimBlock = 32 * bitsPerCode / 8;
468
+ int bytesPerBlock = bytesPerDimBlock * dims;
469
+ int numBlocks = utils::divUp(numVecs, 32);
470
+ size_t totalSize = (size_t) bytesPerBlock * numBlocks;
471
+
472
+ // bit codes padded to whole bytes
473
+ FAISS_ASSERT(data.size() == numVecs * dims * utils::divUp(bitsPerCode, 8));
474
+
475
+ // packs based on blocks
476
+ std::vector<uint8_t> out(totalSize, 0);
477
+
478
+ if (bitsPerCode == 8) {
479
+ packInterleavedWord<uint8_t>(data.data(), out.data(),
480
+ numVecs, dims, bitsPerCode);
481
+ } else if (bitsPerCode == 16) {
482
+ packInterleavedWord<uint16_t>((uint16_t*) data.data(),
483
+ (uint16_t*) out.data(),
484
+ numVecs, dims, bitsPerCode);
485
+ } else if (bitsPerCode == 32) {
486
+ packInterleavedWord<uint32_t>((uint32_t*) data.data(),
487
+ (uint32_t*) out.data(),
488
+ numVecs, dims, bitsPerCode);
489
+ } else if (bitsPerCode == 4) {
490
+ #pragma omp parallel for
491
+ for (int i = 0; i < numBlocks; ++i) {
492
+ for (int j = 0; j < dims; ++j) {
493
+ for (int k = 0; k < bytesPerDimBlock; ++k) {
494
+ int loVec = i * 32 + k * 2;
495
+ int hiVec = loVec + 1;
496
+
497
+ uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;
498
+ uint8_t hi = hiVec < numVecs ? data[hiVec * dims + j] : 0;
499
+
500
+ out[i * bytesPerBlock + j * bytesPerDimBlock + k] =
501
+ (hi << 4) | (lo & 0xf);
502
+ }
503
+ }
504
+ }
505
+ } else if (bitsPerCode == 5) {
506
+ #pragma omp parallel for
507
+ for (int i = 0; i < numBlocks; ++i) {
508
+ for (int j = 0; j < dims; ++j) {
509
+ for (int k = 0; k < bytesPerDimBlock; ++k) {
510
+ // What input vectors we are pulling from
511
+ int loVec = i * 32 + (k * 8) / 5;
512
+ int hiVec = loVec + 1;
513
+ int hiVec2 = hiVec + 1;
514
+
515
+ uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;
516
+ uint8_t hi = hiVec < numVecs ? data[hiVec * dims + j] : 0;
517
+ uint8_t hi2 = hiVec2 < numVecs ? data[hiVec2 * dims + j] : 0;
518
+
519
+ out[i * bytesPerBlock + j * bytesPerDimBlock + k] = pack5(k, lo, hi, hi2);
520
+ }
521
+ }
522
+ }
523
+ } else if (bitsPerCode == 6) {
524
+ #pragma omp parallel for
525
+ for (int i = 0; i < numBlocks; ++i) {
526
+ for (int j = 0; j < dims; ++j) {
527
+ for (int k = 0; k < bytesPerDimBlock; ++k) {
528
+ // What input vectors we are pulling from
529
+ int loVec = i * 32 + (k * 8) / 6;
530
+ int hiVec = loVec + 1;
531
+
532
+ uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;
533
+ uint8_t hi = hiVec < numVecs ? data[hiVec * dims + j] : 0;
534
+
535
+ out[i * bytesPerBlock + j * bytesPerDimBlock + k] = pack6(k, lo, hi);
536
+ }
537
+ }
538
+ }
539
+ } else {
540
+ // unimplemented
541
+ FAISS_ASSERT(false);
542
+ }
543
+
544
+ return out;
545
+ }
546
+
547
+ } } // namespace