faiss 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184):
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +1 -1
  5. data/ext/faiss/extconf.rb +1 -1
  6. data/lib/faiss/version.rb +1 -1
  7. data/vendor/faiss/faiss/AutoTune.cpp +36 -33
  8. data/vendor/faiss/faiss/AutoTune.h +6 -3
  9. data/vendor/faiss/faiss/Clustering.cpp +16 -12
  10. data/vendor/faiss/faiss/Index.cpp +3 -4
  11. data/vendor/faiss/faiss/Index.h +3 -3
  12. data/vendor/faiss/faiss/IndexBinary.cpp +3 -4
  13. data/vendor/faiss/faiss/IndexBinary.h +1 -1
  14. data/vendor/faiss/faiss/IndexBinaryHash.cpp +2 -12
  15. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +1 -2
  16. data/vendor/faiss/faiss/IndexFlat.cpp +0 -148
  17. data/vendor/faiss/faiss/IndexFlat.h +0 -51
  18. data/vendor/faiss/faiss/IndexHNSW.cpp +4 -5
  19. data/vendor/faiss/faiss/IndexIVF.cpp +118 -31
  20. data/vendor/faiss/faiss/IndexIVF.h +22 -15
  21. data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -3
  22. data/vendor/faiss/faiss/IndexIVFFlat.h +2 -1
  23. data/vendor/faiss/faiss/IndexIVFPQ.cpp +39 -15
  24. data/vendor/faiss/faiss/IndexIVFPQ.h +25 -9
  25. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +1116 -0
  26. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +166 -0
  27. data/vendor/faiss/faiss/IndexIVFPQR.cpp +8 -9
  28. data/vendor/faiss/faiss/IndexIVFPQR.h +2 -1
  29. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +1 -2
  30. data/vendor/faiss/faiss/IndexPQ.cpp +34 -18
  31. data/vendor/faiss/faiss/IndexPQFastScan.cpp +536 -0
  32. data/vendor/faiss/faiss/IndexPQFastScan.h +111 -0
  33. data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -0
  34. data/vendor/faiss/faiss/IndexPreTransform.h +2 -0
  35. data/vendor/faiss/faiss/IndexRefine.cpp +256 -0
  36. data/vendor/faiss/faiss/IndexRefine.h +73 -0
  37. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +2 -2
  38. data/vendor/faiss/faiss/IndexScalarQuantizer.h +1 -1
  39. data/vendor/faiss/faiss/gpu/GpuDistance.h +1 -1
  40. data/vendor/faiss/faiss/gpu/GpuIndex.h +16 -9
  41. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +8 -1
  42. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +11 -11
  43. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +19 -2
  44. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +28 -2
  45. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +24 -14
  46. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +29 -2
  47. data/vendor/faiss/faiss/gpu/GpuResources.h +4 -0
  48. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +60 -27
  49. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +28 -6
  50. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +547 -0
  51. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +51 -0
  52. data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +3 -3
  53. data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +3 -2
  54. data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +274 -0
  55. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +7 -2
  56. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +5 -1
  57. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +231 -0
  58. data/vendor/faiss/faiss/gpu/test/TestUtils.h +33 -0
  59. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +1 -0
  60. data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +6 -0
  61. data/vendor/faiss/faiss/gpu/utils/Timer.cpp +5 -6
  62. data/vendor/faiss/faiss/gpu/utils/Timer.h +2 -2
  63. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +5 -4
  64. data/vendor/faiss/faiss/impl/HNSW.cpp +2 -4
  65. data/vendor/faiss/faiss/impl/PolysemousTraining.h +4 -4
  66. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +22 -12
  67. data/vendor/faiss/faiss/impl/ProductQuantizer.h +2 -0
  68. data/vendor/faiss/faiss/impl/ResultHandler.h +452 -0
  69. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +29 -19
  70. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +6 -0
  71. data/vendor/faiss/faiss/impl/index_read.cpp +64 -96
  72. data/vendor/faiss/faiss/impl/index_write.cpp +34 -25
  73. data/vendor/faiss/faiss/impl/io.cpp +33 -2
  74. data/vendor/faiss/faiss/impl/io.h +7 -2
  75. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -15
  76. data/vendor/faiss/faiss/impl/platform_macros.h +44 -0
  77. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +272 -0
  78. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +169 -0
  79. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +180 -0
  80. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +354 -0
  81. data/vendor/faiss/faiss/impl/simd_result_handlers.h +559 -0
  82. data/vendor/faiss/faiss/index_factory.cpp +112 -7
  83. data/vendor/faiss/faiss/index_io.h +1 -48
  84. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +151 -0
  85. data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +76 -0
  86. data/vendor/faiss/faiss/{DirectMap.cpp → invlists/DirectMap.cpp} +1 -1
  87. data/vendor/faiss/faiss/{DirectMap.h → invlists/DirectMap.h} +1 -1
  88. data/vendor/faiss/faiss/{InvertedLists.cpp → invlists/InvertedLists.cpp} +72 -1
  89. data/vendor/faiss/faiss/{InvertedLists.h → invlists/InvertedLists.h} +32 -1
  90. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +107 -0
  91. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +63 -0
  92. data/vendor/faiss/faiss/{OnDiskInvertedLists.cpp → invlists/OnDiskInvertedLists.cpp} +21 -6
  93. data/vendor/faiss/faiss/{OnDiskInvertedLists.h → invlists/OnDiskInvertedLists.h} +5 -2
  94. data/vendor/faiss/faiss/python/python_callbacks.h +8 -1
  95. data/vendor/faiss/faiss/utils/AlignedTable.h +141 -0
  96. data/vendor/faiss/faiss/utils/Heap.cpp +2 -4
  97. data/vendor/faiss/faiss/utils/Heap.h +61 -50
  98. data/vendor/faiss/faiss/utils/distances.cpp +164 -319
  99. data/vendor/faiss/faiss/utils/distances.h +28 -20
  100. data/vendor/faiss/faiss/utils/distances_simd.cpp +277 -49
  101. data/vendor/faiss/faiss/utils/extra_distances.cpp +1 -2
  102. data/vendor/faiss/faiss/utils/hamming-inl.h +4 -4
  103. data/vendor/faiss/faiss/utils/hamming.cpp +3 -6
  104. data/vendor/faiss/faiss/utils/hamming.h +2 -7
  105. data/vendor/faiss/faiss/utils/ordered_key_value.h +98 -0
  106. data/vendor/faiss/faiss/utils/partitioning.cpp +1256 -0
  107. data/vendor/faiss/faiss/utils/partitioning.h +69 -0
  108. data/vendor/faiss/faiss/utils/quantize_lut.cpp +277 -0
  109. data/vendor/faiss/faiss/utils/quantize_lut.h +80 -0
  110. data/vendor/faiss/faiss/utils/simdlib.h +31 -0
  111. data/vendor/faiss/faiss/utils/simdlib_avx2.h +461 -0
  112. data/vendor/faiss/faiss/utils/simdlib_emulated.h +589 -0
  113. metadata +43 -141
  114. data/vendor/faiss/benchs/bench_6bit_codec.cpp +0 -80
  115. data/vendor/faiss/c_api/AutoTune_c.cpp +0 -83
  116. data/vendor/faiss/c_api/AutoTune_c.h +0 -66
  117. data/vendor/faiss/c_api/Clustering_c.cpp +0 -145
  118. data/vendor/faiss/c_api/Clustering_c.h +0 -123
  119. data/vendor/faiss/c_api/IndexFlat_c.cpp +0 -140
  120. data/vendor/faiss/c_api/IndexFlat_c.h +0 -115
  121. data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +0 -64
  122. data/vendor/faiss/c_api/IndexIVFFlat_c.h +0 -58
  123. data/vendor/faiss/c_api/IndexIVF_c.cpp +0 -99
  124. data/vendor/faiss/c_api/IndexIVF_c.h +0 -142
  125. data/vendor/faiss/c_api/IndexLSH_c.cpp +0 -37
  126. data/vendor/faiss/c_api/IndexLSH_c.h +0 -40
  127. data/vendor/faiss/c_api/IndexPreTransform_c.cpp +0 -21
  128. data/vendor/faiss/c_api/IndexPreTransform_c.h +0 -32
  129. data/vendor/faiss/c_api/IndexShards_c.cpp +0 -38
  130. data/vendor/faiss/c_api/IndexShards_c.h +0 -39
  131. data/vendor/faiss/c_api/Index_c.cpp +0 -105
  132. data/vendor/faiss/c_api/Index_c.h +0 -183
  133. data/vendor/faiss/c_api/MetaIndexes_c.cpp +0 -49
  134. data/vendor/faiss/c_api/MetaIndexes_c.h +0 -49
  135. data/vendor/faiss/c_api/clone_index_c.cpp +0 -23
  136. data/vendor/faiss/c_api/clone_index_c.h +0 -32
  137. data/vendor/faiss/c_api/error_c.h +0 -42
  138. data/vendor/faiss/c_api/error_impl.cpp +0 -27
  139. data/vendor/faiss/c_api/error_impl.h +0 -16
  140. data/vendor/faiss/c_api/faiss_c.h +0 -58
  141. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +0 -98
  142. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +0 -56
  143. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +0 -52
  144. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +0 -68
  145. data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +0 -17
  146. data/vendor/faiss/c_api/gpu/GpuIndex_c.h +0 -30
  147. data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +0 -38
  148. data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +0 -86
  149. data/vendor/faiss/c_api/gpu/GpuResources_c.h +0 -66
  150. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +0 -54
  151. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +0 -53
  152. data/vendor/faiss/c_api/gpu/macros_impl.h +0 -42
  153. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +0 -220
  154. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +0 -149
  155. data/vendor/faiss/c_api/index_factory_c.cpp +0 -26
  156. data/vendor/faiss/c_api/index_factory_c.h +0 -30
  157. data/vendor/faiss/c_api/index_io_c.cpp +0 -42
  158. data/vendor/faiss/c_api/index_io_c.h +0 -50
  159. data/vendor/faiss/c_api/macros_impl.h +0 -110
  160. data/vendor/faiss/demos/demo_imi_flat.cpp +0 -154
  161. data/vendor/faiss/demos/demo_imi_pq.cpp +0 -203
  162. data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +0 -151
  163. data/vendor/faiss/demos/demo_sift1M.cpp +0 -252
  164. data/vendor/faiss/demos/demo_weighted_kmeans.cpp +0 -185
  165. data/vendor/faiss/misc/test_blas.cpp +0 -87
  166. data/vendor/faiss/tests/test_binary_flat.cpp +0 -62
  167. data/vendor/faiss/tests/test_dealloc_invlists.cpp +0 -188
  168. data/vendor/faiss/tests/test_ivfpq_codec.cpp +0 -70
  169. data/vendor/faiss/tests/test_ivfpq_indexing.cpp +0 -100
  170. data/vendor/faiss/tests/test_lowlevel_ivf.cpp +0 -573
  171. data/vendor/faiss/tests/test_merge.cpp +0 -260
  172. data/vendor/faiss/tests/test_omp_threads.cpp +0 -14
  173. data/vendor/faiss/tests/test_ondisk_ivf.cpp +0 -225
  174. data/vendor/faiss/tests/test_pairs_decoding.cpp +0 -193
  175. data/vendor/faiss/tests/test_params_override.cpp +0 -236
  176. data/vendor/faiss/tests/test_pq_encoding.cpp +0 -98
  177. data/vendor/faiss/tests/test_sliding_ivf.cpp +0 -246
  178. data/vendor/faiss/tests/test_threaded_index.cpp +0 -253
  179. data/vendor/faiss/tests/test_transfer_invlists.cpp +0 -159
  180. data/vendor/faiss/tutorial/cpp/1-Flat.cpp +0 -104
  181. data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +0 -85
  182. data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +0 -98
  183. data/vendor/faiss/tutorial/cpp/4-GPU.cpp +0 -122
  184. data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +0 -104
@@ -41,8 +41,22 @@ class StandardGpuResourcesImpl : public GpuResources {
41
41
  /// transfers
42
42
  void setPinnedMemory(size_t size);
43
43
 
44
- /// Called to change the stream for work ordering
45
- void setDefaultStream(int device, cudaStream_t stream);
44
+ /// Called to change the stream for work ordering. We do not own `stream`;
45
+ /// i.e., it will not be destroyed when the GpuResources object gets cleaned
46
+ /// up.
47
+ /// We are guaranteed that all Faiss GPU work is ordered with respect to
48
+ /// this stream upon exit from an index or other Faiss GPU call.
49
+ void setDefaultStream(int device, cudaStream_t stream) override;
50
+
51
+ /// Revert the default stream to the original stream managed by this resources
52
+ /// object, in case someone called `setDefaultStream`.
53
+ void revertDefaultStream(int device);
54
+
55
+ /// Returns the stream for the given device on which all Faiss GPU work is
56
+ /// ordered.
57
+ /// We are guaranteed that all Faiss GPU work is ordered with respect to
58
+ /// this stream upon exit from an index or other Faiss GPU call.
59
+ cudaStream_t getDefaultStream(int device) override;
46
60
 
47
61
  /// Called to change the work ordering streams to the null stream
48
62
  /// for all devices
@@ -60,8 +74,6 @@ class StandardGpuResourcesImpl : public GpuResources {
60
74
 
61
75
  cublasHandle_t getBlasHandle(int device) override;
62
76
 
63
- cudaStream_t getDefaultStream(int device) override;
64
-
65
77
  std::vector<cudaStream_t> getAlternateStreams(int device) override;
66
78
 
67
79
  /// Allocate non-temporary GPU memory
@@ -128,7 +140,9 @@ class StandardGpuResourcesImpl : public GpuResources {
128
140
  };
129
141
 
130
142
  /// Default implementation of GpuResources that allocates a cuBLAS
131
- /// stream and 2 streams for use, as well as temporary memory
143
+ /// stream and 2 streams for use, as well as temporary memory.
144
+ /// Internally, the Faiss GPU code uses the instance managed by getResources,
145
+ /// but this is the user-facing object that is internally reference counted.
132
146
  class StandardGpuResources : public GpuResourcesProvider {
133
147
  public:
134
148
  StandardGpuResources();
@@ -151,9 +165,17 @@ class StandardGpuResources : public GpuResourcesProvider {
151
165
  /// transfers
152
166
  void setPinnedMemory(size_t size);
153
167
 
154
- /// Called to change the stream for work ordering
168
+ /// Called to change the stream for work ordering. We do not own `stream`;
169
+ /// i.e., it will not be destroyed when the GpuResources object gets cleaned
170
+ /// up.
171
+ /// We are guaranteed that all Faiss GPU work is ordered with respect to
172
+ /// this stream upon exit from an index or other Faiss GPU call.
155
173
  void setDefaultStream(int device, cudaStream_t stream);
156
174
 
175
+ /// Revert the default stream to the original stream managed by this resources
176
+ /// object, in case someone called `setDefaultStream`.
177
+ void revertDefaultStream(int device);
178
+
157
179
  /// Called to change the work ordering streams to the null stream
158
180
  /// for all devices
159
181
  void setDefaultNullStreamAllDevices();
@@ -0,0 +1,547 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ #include <faiss/gpu/impl/InterleavedCodes.h>
9
+ #include <faiss/impl/FaissAssert.h>
10
+ #include <faiss/gpu/utils/StaticUtils.h>
11
+
12
+ namespace faiss { namespace gpu {
13
+
14
+ inline uint8_t unpack5(int i, uint8_t vLower, uint8_t vUpper) {
15
+ uint8_t v = 0;
16
+
17
+ // lsb ... msb
18
+ // 0: 0 0 0 0 0 1 1 1
19
+ // 1: 1 1 2 2 2 2 2 3
20
+ // 2: 3 3 3 3 4 4 4 4
21
+ // 3: 4 5 5 5 5 5 6 6
22
+ // 4: 6 6 6 7 7 7 7 7
23
+ switch (i % 8) {
24
+ case 0:
25
+ // 5 lsbs of lower
26
+ v = vLower & 0x1f;
27
+ break;
28
+ case 1:
29
+ // 3 msbs of lower as v lsbs
30
+ // 2 msbs of upper as v msbs
31
+ v = (vLower >> 5) | ((vUpper & 0x3) << 3);
32
+ break;
33
+ case 2:
34
+ // 5 of lower
35
+ v = (vLower >> 2) & 0x1f;
36
+ break;
37
+ case 3:
38
+ // 1 msbs of lower as v lsbs
39
+ // 4 lsbs of upper as v msbs
40
+ v = (vLower >> 7) | ((vUpper & 0xf) << 1);
41
+ break;
42
+ case 4:
43
+ // 4 msbs of lower as v lsbs
44
+ // 1 lsbs of upper as v msbs
45
+ v = (vLower >> 4) | ((vUpper & 0x1) << 4);
46
+ break;
47
+ case 5:
48
+ // 5 of lower
49
+ v = (vLower >> 1) & 0x1f;
50
+ break;
51
+ case 6:
52
+ // 2 msbs of lower as v lsbs
53
+ // 3 lsbs of upper as v msbs
54
+ v = (vLower >> 6) | ((vUpper & 0x7) << 2);
55
+ break;
56
+ case 7:
57
+ // 5 of lower
58
+ v = (vLower >> 3);
59
+ break;
60
+ }
61
+
62
+ return v;
63
+ }
64
+
65
// Extract code i (mod 4) of a 6-bit-packed stream from the byte pair that
// holds it: `vLower` contains the code's least-significant bit, `vUpper` is
// the following byte (pass 0 when the code does not spill over).
// Codes repeat with period 4 (4 codes span exactly 3 bytes).
inline uint8_t unpack6(int i, uint8_t vLower, uint8_t vUpper) {
  // Bit offset of code (i % 4) within its low byte; the code occupies bits
  // [shift, shift + 6), possibly crossing into vUpper.
  int shift = (i * 6) % 8;
  int bits = (vLower >> shift) | (int(vUpper) << (8 - shift));

  return uint8_t(bits & 0x3f);
}
91
+
92
+
93
+ std::vector<uint8_t>
94
+ unpackNonInterleaved(std::vector<uint8_t> data,
95
+ int numVecs,
96
+ int dims,
97
+ int bitsPerCode) {
98
+ int srcVecSize = utils::divUp(dims * bitsPerCode, 8);
99
+ FAISS_ASSERT(data.size() == numVecs * srcVecSize);
100
+
101
+ if (bitsPerCode == 8 ||
102
+ bitsPerCode == 16 ||
103
+ bitsPerCode == 32) {
104
+ // nothing to do
105
+ return data;
106
+ }
107
+
108
+ // bit codes padded to whole bytes
109
+ std::vector<uint8_t> out(numVecs * dims * utils::divUp(bitsPerCode, 8));
110
+
111
+ if (bitsPerCode == 4) {
112
+ #pragma omp parallel for
113
+ for (int i = 0; i < numVecs; ++i) {
114
+ for (int j = 0; j < dims; ++j) {
115
+ int srcIdx = i * srcVecSize + (j / 2);
116
+ FAISS_ASSERT(srcIdx < data.size());
117
+
118
+ uint8_t v = data[srcIdx];
119
+ v = (j % 2 == 0) ? v & 0xf : v >> 4;
120
+
121
+ out[i * dims + j] = v;
122
+ }
123
+ }
124
+ } else if (bitsPerCode == 5) {
125
+ #pragma omp parallel for
126
+ for (int i = 0; i < numVecs; ++i) {
127
+ for (int j = 0; j < dims; ++j) {
128
+ int lo = i * srcVecSize + (j * 5) / 8;
129
+ int hi = lo + 1;
130
+
131
+ FAISS_ASSERT(lo < data.size());
132
+ FAISS_ASSERT(hi <= data.size());
133
+
134
+ auto vLower = data[lo];
135
+ auto vUpper = hi < data.size() ? data[hi] : 0;
136
+
137
+ out[i * dims + j] = unpack5(j, vLower, vUpper);
138
+ }
139
+ }
140
+ } else if (bitsPerCode == 6) {
141
+ #pragma omp parallel for
142
+ for (int i = 0; i < numVecs; ++i) {
143
+ for (int j = 0; j < dims; ++j) {
144
+ int lo = i * srcVecSize + (j * 6) / 8;
145
+ int hi = lo + 1;
146
+
147
+ FAISS_ASSERT(lo < data.size());
148
+ FAISS_ASSERT(hi <= data.size());
149
+
150
+ auto vLower = data[lo];
151
+ auto vUpper = hi < data.size() ? data[hi] : 0;
152
+
153
+ out[i * dims + j] = unpack6(j, vLower, vUpper);
154
+ }
155
+ }
156
+ } else {
157
+ // unhandled
158
+ FAISS_ASSERT(false);
159
+ }
160
+
161
+ return out;
162
+ }
163
+
164
+ template <typename T>
165
+ void
166
+ unpackInterleavedWord(const T* in,
167
+ T* out,
168
+ int numVecs,
169
+ int dims,
170
+ int bitsPerCode) {
171
+ int wordsPerDimBlock = 32 * bitsPerCode / (8 * sizeof(T));
172
+ int wordsPerBlock = wordsPerDimBlock * dims;
173
+ int numBlocks = utils::divUp(numVecs, 32);
174
+
175
+ #pragma omp parallel for
176
+ for (int i = 0; i < numVecs; ++i) {
177
+ int block = i / 32;
178
+ FAISS_ASSERT(block < numBlocks);
179
+ int lane = i % 32;
180
+
181
+ for (int j = 0; j < dims; ++j) {
182
+ int srcOffset = block * wordsPerBlock + j * wordsPerDimBlock + lane;
183
+ out[i * dims + j] = in[srcOffset];
184
+ }
185
+ }
186
+ }
187
+
188
+ std::vector<uint8_t>
189
+ unpackInterleaved(std::vector<uint8_t> data,
190
+ int numVecs,
191
+ int dims,
192
+ int bitsPerCode) {
193
+ int bytesPerDimBlock = 32 * bitsPerCode / 8;
194
+ int bytesPerBlock = bytesPerDimBlock * dims;
195
+ int numBlocks = utils::divUp(numVecs, 32);
196
+ size_t totalSize = (size_t) bytesPerBlock * numBlocks;
197
+ FAISS_ASSERT(data.size() == totalSize);
198
+
199
+ // bit codes padded to whole bytes
200
+ std::vector<uint8_t> out(numVecs * dims * utils::divUp(bitsPerCode, 8));
201
+
202
+ if (bitsPerCode == 8) {
203
+ unpackInterleavedWord<uint8_t>(data.data(), out.data(),
204
+ numVecs, dims, bitsPerCode);
205
+ } else if (bitsPerCode == 16) {
206
+ unpackInterleavedWord<uint16_t>((uint16_t*) data.data(),
207
+ (uint16_t*) out.data(),
208
+ numVecs, dims, bitsPerCode);
209
+ } else if (bitsPerCode == 32) {
210
+ unpackInterleavedWord<uint32_t>((uint32_t*) data.data(),
211
+ (uint32_t*) out.data(),
212
+ numVecs, dims, bitsPerCode);
213
+ } else if (bitsPerCode == 4) {
214
+ #pragma omp parallel for
215
+ for (int i = 0; i < numVecs; ++i) {
216
+ int block = i / 32;
217
+ int lane = i % 32;
218
+
219
+ int word = lane / 2;
220
+ int subWord = lane % 2;
221
+
222
+ for (int j = 0; j < dims; ++j) {
223
+ auto v =
224
+ data[block * bytesPerBlock + j * bytesPerDimBlock + word];
225
+
226
+ v = (subWord == 0) ? v & 0xf : v >> 4;
227
+ out[i * dims + j] = v;
228
+ }
229
+ }
230
+ } else if (bitsPerCode == 5) {
231
+ #pragma omp parallel for
232
+ for (int i = 0; i < numVecs; ++i) {
233
+ int block = i / 32;
234
+ int blockVector = i % 32;
235
+
236
+ for (int j = 0; j < dims; ++j) {
237
+ uint8_t* dimBlock =
238
+ &data[block * bytesPerBlock + j * bytesPerDimBlock];
239
+
240
+ int lo = (blockVector * 5) / 8;
241
+ int hi = lo + 1;
242
+
243
+ FAISS_ASSERT(lo < bytesPerDimBlock);
244
+ FAISS_ASSERT(hi <= bytesPerDimBlock);
245
+
246
+ auto vLower = dimBlock[lo];
247
+ auto vUpper = hi < bytesPerDimBlock ? dimBlock[hi] : 0;
248
+
249
+ out[i * dims + j] = unpack5(blockVector, vLower, vUpper);
250
+ }
251
+ }
252
+ } else if (bitsPerCode == 6) {
253
+ #pragma omp parallel for
254
+ for (int i = 0; i < numVecs; ++i) {
255
+ int block = i / 32;
256
+ int blockVector = i % 32;
257
+
258
+ for (int j = 0; j < dims; ++j) {
259
+ uint8_t* dimBlock =
260
+ &data[block * bytesPerBlock + j * bytesPerDimBlock];
261
+
262
+ int lo = (blockVector * 6) / 8;
263
+ int hi = lo + 1;
264
+
265
+ FAISS_ASSERT(lo < bytesPerDimBlock);
266
+ FAISS_ASSERT(hi <= bytesPerDimBlock);
267
+
268
+ auto vLower = dimBlock[lo];
269
+ auto vUpper = hi < bytesPerDimBlock ? dimBlock[hi] : 0;
270
+
271
+ out[i * dims + j] = unpack6(blockVector, vLower, vUpper);
272
+ }
273
+ }
274
+ } else {
275
+ // unimplemented
276
+ FAISS_ASSERT(false);
277
+ }
278
+
279
+ return out;
280
+ }
281
+
282
+ inline uint8_t pack5(int i, uint8_t lo, uint8_t hi, uint8_t hi2) {
283
+ FAISS_ASSERT((lo & 0x1f) == lo);
284
+ FAISS_ASSERT((hi & 0x1f) == hi);
285
+ FAISS_ASSERT((hi2 & 0x1f) == hi2);
286
+
287
+ uint8_t v = 0;
288
+
289
+ // lsb ... msb
290
+ // 0: 0 0 0 0 0 1 1 1
291
+ // 1: 1 1 2 2 2 2 2 3
292
+ // 2: 3 3 3 3 4 4 4 4
293
+ // 3: 4 5 5 5 5 5 6 6
294
+ // 4: 6 6 6 7 7 7 7 7
295
+ switch (i % 5) {
296
+ case 0:
297
+ // 5 msbs of lower as vOut lsbs
298
+ // 3 lsbs of upper as vOut msbs
299
+ v = (lo & 0x1f) | (hi << 5);
300
+ break;
301
+ case 1:
302
+ // 2 msbs of lower as vOut lsbs
303
+ // 5 lsbs of upper as vOut msbs
304
+ // 1 lsbs of upper2 as vOut msb
305
+ v = (lo >> 3) | (hi << 2) | (hi2 << 7);
306
+ break;
307
+ case 2:
308
+ // 4 msbs of lower as vOut lsbs
309
+ // 4 lsbs of upper as vOut msbs
310
+ v = (lo >> 1) | (hi << 4);
311
+ break;
312
+ case 3:
313
+ // 1 msbs of lower as vOut lsbs
314
+ // 5 lsbs of upper as vOut msbs
315
+ // 2 lsbs of upper2 as vOut msb
316
+ v = (lo >> 4) | (hi << 1) | (hi2 << 6);
317
+ break;
318
+ case 4:
319
+ // 3 msbs of lower as vOut lsbs
320
+ // 5 lsbs of upper as vOut msbs
321
+ v = (lo >> 2) | (hi << 3);
322
+ break;
323
+ }
324
+
325
+ return v;
326
+ }
327
+
328
+ inline uint8_t pack6(int i, uint8_t lo, uint8_t hi) {
329
+ FAISS_ASSERT((lo & 0x3f) == lo);
330
+ FAISS_ASSERT((hi & 0x3f) == hi);
331
+
332
+ uint8_t v = 0;
333
+
334
+ // lsb ... msb
335
+ // 0: 0 0 0 0 0 0 1 1
336
+ // 1: 1 1 1 1 2 2 2 2
337
+ // 2: 2 2 3 3 3 3 3 3
338
+ switch (i % 3) {
339
+ case 0:
340
+ // 6 msbs of lower as vOut lsbs
341
+ // 2 lsbs of upper as vOut msbs
342
+ v = (lo & 0x3f) | (hi << 6);
343
+ break;
344
+ case 1:
345
+ // 4 msbs of lower as vOut lsbs
346
+ // 4 lsbs of upper as vOut msbs
347
+ v = (lo >> 2) | (hi << 4);
348
+ break;
349
+ case 2:
350
+ // 2 msbs of lower as vOut lsbs
351
+ // 6 lsbs of upper as vOut msbs
352
+ v = (lo >> 4) | (hi << 2);
353
+ break;
354
+ }
355
+
356
+ return v;
357
+ }
358
+
359
+
360
+ std::vector<uint8_t>
361
+ packNonInterleaved(std::vector<uint8_t> data,
362
+ int numVecs,
363
+ int dims,
364
+ int bitsPerCode) {
365
+ // bit codes padded to whole bytes
366
+ FAISS_ASSERT(data.size() == numVecs * dims * utils::divUp(bitsPerCode, 8));
367
+
368
+ if (bitsPerCode == 8 ||
369
+ bitsPerCode == 16 ||
370
+ bitsPerCode == 32) {
371
+ // nothing to do, whole words are already where they need to be
372
+ return data;
373
+ }
374
+
375
+ // bits packed into a whole number of bytes
376
+ int bytesPerVec = utils::divUp(dims * bitsPerCode, 8);
377
+
378
+ std::vector<uint8_t> out(numVecs * bytesPerVec);
379
+
380
+ if (bitsPerCode == 4) {
381
+ #pragma omp parallel for
382
+ for (int i = 0; i < numVecs; ++i) {
383
+ for (int j = 0; j < bytesPerVec; ++j) {
384
+ int dimLo = j * 2;
385
+ int dimHi = dimLo + 1;
386
+ FAISS_ASSERT(dimLo < dims);
387
+ FAISS_ASSERT(dimHi <= dims);
388
+
389
+ uint8_t lo = data[i * dims + dimLo];
390
+ uint8_t hi = dimHi < dims ? data[i * dims + dimHi] : 0;
391
+
392
+ out[i * bytesPerVec + j] = (hi << 4) | (lo & 0xf);
393
+ }
394
+ }
395
+ } else if (bitsPerCode == 5) {
396
+ #pragma omp parallel for
397
+ for (int i = 0; i < numVecs; ++i) {
398
+ for (int j = 0; j < bytesPerVec; ++j) {
399
+ int dimLo = (j * 8) / 5;
400
+ int dimHi = dimLo + 1;
401
+ int dimHi2 = dimHi + 1;
402
+ FAISS_ASSERT(dimLo < dims);
403
+ FAISS_ASSERT(dimHi <= dims);
404
+ FAISS_ASSERT(dimHi <= dims + 1);
405
+
406
+ uint8_t lo = data[i * dims + dimLo];
407
+ uint8_t hi = dimHi < dims ? data[i * dims + dimHi] : 0;
408
+ uint8_t hi2 = dimHi2 < dims ? data[i * dims + dimHi2] : 0;
409
+
410
+ out[i * bytesPerVec + j] = pack5(j, lo, hi, hi2);
411
+ }
412
+ }
413
+ } else if (bitsPerCode == 6) {
414
+ #pragma omp parallel for
415
+ for (int i = 0; i < numVecs; ++i) {
416
+ for (int j = 0; j < bytesPerVec; ++j) {
417
+ int dimLo = (j * 8) / 6;
418
+ int dimHi = dimLo + 1;
419
+ FAISS_ASSERT(dimLo < dims);
420
+ FAISS_ASSERT(dimHi <= dims);
421
+
422
+ uint8_t lo = data[i * dims + dimLo];
423
+ uint8_t hi = dimHi < dims ? data[i * dims + dimHi] : 0;
424
+
425
+ out[i * bytesPerVec + j] = pack6(j, lo, hi);
426
+ }
427
+ }
428
+ } else {
429
+ // unhandled
430
+ FAISS_ASSERT(false);
431
+ }
432
+
433
+ return out;
434
+ }
435
+
436
+ template <typename T>
437
+ void
438
+ packInterleavedWord(const T* in,
439
+ T* out,
440
+ int numVecs,
441
+ int dims,
442
+ int bitsPerCode) {
443
+ int wordsPerDimBlock = 32 * bitsPerCode / (8 * sizeof(T));
444
+ int wordsPerBlock = wordsPerDimBlock * dims;
445
+ int numBlocks = utils::divUp(numVecs, 32);
446
+
447
+ // We're guaranteed that all other slots not filled by the vectors present are
448
+ // initialized to zero (from the vector constructor in packInterleaved)
449
+ #pragma omp parallel for
450
+ for (int i = 0; i < numVecs; ++i) {
451
+ int block = i / 32;
452
+ FAISS_ASSERT(block < numBlocks);
453
+ int lane = i % 32;
454
+
455
+ for (int j = 0; j < dims; ++j) {
456
+ int dstOffset = block * wordsPerBlock + j * wordsPerDimBlock + lane;
457
+ out[dstOffset] = in[i * dims + j];
458
+ }
459
+ }
460
+ }
461
+
462
+ std::vector<uint8_t>
463
+ packInterleaved(std::vector<uint8_t> data,
464
+ int numVecs,
465
+ int dims,
466
+ int bitsPerCode) {
467
+ int bytesPerDimBlock = 32 * bitsPerCode / 8;
468
+ int bytesPerBlock = bytesPerDimBlock * dims;
469
+ int numBlocks = utils::divUp(numVecs, 32);
470
+ size_t totalSize = (size_t) bytesPerBlock * numBlocks;
471
+
472
+ // bit codes padded to whole bytes
473
+ FAISS_ASSERT(data.size() == numVecs * dims * utils::divUp(bitsPerCode, 8));
474
+
475
+ // packs based on blocks
476
+ std::vector<uint8_t> out(totalSize, 0);
477
+
478
+ if (bitsPerCode == 8) {
479
+ packInterleavedWord<uint8_t>(data.data(), out.data(),
480
+ numVecs, dims, bitsPerCode);
481
+ } else if (bitsPerCode == 16) {
482
+ packInterleavedWord<uint16_t>((uint16_t*) data.data(),
483
+ (uint16_t*) out.data(),
484
+ numVecs, dims, bitsPerCode);
485
+ } else if (bitsPerCode == 32) {
486
+ packInterleavedWord<uint32_t>((uint32_t*) data.data(),
487
+ (uint32_t*) out.data(),
488
+ numVecs, dims, bitsPerCode);
489
+ } else if (bitsPerCode == 4) {
490
+ #pragma omp parallel for
491
+ for (int i = 0; i < numBlocks; ++i) {
492
+ for (int j = 0; j < dims; ++j) {
493
+ for (int k = 0; k < bytesPerDimBlock; ++k) {
494
+ int loVec = i * 32 + k * 2;
495
+ int hiVec = loVec + 1;
496
+
497
+ uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;
498
+ uint8_t hi = hiVec < numVecs ? data[hiVec * dims + j] : 0;
499
+
500
+ out[i * bytesPerBlock + j * bytesPerDimBlock + k] =
501
+ (hi << 4) | (lo & 0xf);
502
+ }
503
+ }
504
+ }
505
+ } else if (bitsPerCode == 5) {
506
+ #pragma omp parallel for
507
+ for (int i = 0; i < numBlocks; ++i) {
508
+ for (int j = 0; j < dims; ++j) {
509
+ for (int k = 0; k < bytesPerDimBlock; ++k) {
510
+ // What input vectors we are pulling from
511
+ int loVec = i * 32 + (k * 8) / 5;
512
+ int hiVec = loVec + 1;
513
+ int hiVec2 = hiVec + 1;
514
+
515
+ uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;
516
+ uint8_t hi = hiVec < numVecs ? data[hiVec * dims + j] : 0;
517
+ uint8_t hi2 = hiVec2 < numVecs ? data[hiVec2 * dims + j] : 0;
518
+
519
+ out[i * bytesPerBlock + j * bytesPerDimBlock + k] = pack5(k, lo, hi, hi2);
520
+ }
521
+ }
522
+ }
523
+ } else if (bitsPerCode == 6) {
524
+ #pragma omp parallel for
525
+ for (int i = 0; i < numBlocks; ++i) {
526
+ for (int j = 0; j < dims; ++j) {
527
+ for (int k = 0; k < bytesPerDimBlock; ++k) {
528
+ // What input vectors we are pulling from
529
+ int loVec = i * 32 + (k * 8) / 6;
530
+ int hiVec = loVec + 1;
531
+
532
+ uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;
533
+ uint8_t hi = hiVec < numVecs ? data[hiVec * dims + j] : 0;
534
+
535
+ out[i * bytesPerBlock + j * bytesPerDimBlock + k] = pack6(k, lo, hi);
536
+ }
537
+ }
538
+ }
539
+ } else {
540
+ // unimplemented
541
+ FAISS_ASSERT(false);
542
+ }
543
+
544
+ return out;
545
+ }
546
+
547
+ } } // namespace