RubyGems - faiss - Versions diffs - 0.1.3 → 0.2.0 - Mend

faiss 0.1.3 → 0.2.0

Files changed (199) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +25 -0
data/LICENSE.txt +1 -1
data/README.md +16 -4
data/ext/faiss/ext.cpp +12 -308
data/ext/faiss/extconf.rb +6 -3
data/ext/faiss/index.cpp +189 -0
data/ext/faiss/index_binary.cpp +75 -0
data/ext/faiss/kmeans.cpp +40 -0
data/ext/faiss/numo.hpp +867 -0
data/ext/faiss/pca_matrix.cpp +33 -0
data/ext/faiss/product_quantizer.cpp +53 -0
data/ext/faiss/utils.cpp +13 -0
data/ext/faiss/utils.h +5 -0
data/lib/faiss.rb +0 -5
data/lib/faiss/version.rb +1 -1
data/vendor/faiss/faiss/AutoTune.cpp +36 -33
data/vendor/faiss/faiss/AutoTune.h +6 -3
data/vendor/faiss/faiss/Clustering.cpp +16 -12
data/vendor/faiss/faiss/Index.cpp +3 -4
data/vendor/faiss/faiss/Index.h +3 -3
data/vendor/faiss/faiss/IndexBinary.cpp +3 -4
data/vendor/faiss/faiss/IndexBinary.h +1 -1
data/vendor/faiss/faiss/IndexBinaryHash.cpp +2 -12
data/vendor/faiss/faiss/IndexBinaryIVF.cpp +1 -2
data/vendor/faiss/faiss/IndexFlat.cpp +0 -148
data/vendor/faiss/faiss/IndexFlat.h +0 -51
data/vendor/faiss/faiss/IndexHNSW.cpp +4 -5
data/vendor/faiss/faiss/IndexIVF.cpp +118 -31
data/vendor/faiss/faiss/IndexIVF.h +22 -15
data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -3
data/vendor/faiss/faiss/IndexIVFFlat.h +2 -1
data/vendor/faiss/faiss/IndexIVFPQ.cpp +39 -15
data/vendor/faiss/faiss/IndexIVFPQ.h +25 -9
data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +1116 -0
data/vendor/faiss/faiss/IndexIVFPQFastScan.h +166 -0
data/vendor/faiss/faiss/IndexIVFPQR.cpp +8 -9
data/vendor/faiss/faiss/IndexIVFPQR.h +2 -1
data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +1 -2
data/vendor/faiss/faiss/IndexPQ.cpp +34 -18
data/vendor/faiss/faiss/IndexPQFastScan.cpp +536 -0
data/vendor/faiss/faiss/IndexPQFastScan.h +111 -0
data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -0
data/vendor/faiss/faiss/IndexPreTransform.h +2 -0
data/vendor/faiss/faiss/IndexRefine.cpp +256 -0
data/vendor/faiss/faiss/IndexRefine.h +73 -0
data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +2 -2
data/vendor/faiss/faiss/IndexScalarQuantizer.h +1 -1
data/vendor/faiss/faiss/gpu/GpuDistance.h +1 -1
data/vendor/faiss/faiss/gpu/GpuIndex.h +16 -9
data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +8 -1
data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +11 -11
data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +19 -2
data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +28 -2
data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +24 -14
data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +29 -2
data/vendor/faiss/faiss/gpu/GpuResources.h +4 -0
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +60 -27
data/vendor/faiss/faiss/gpu/StandardGpuResources.h +28 -6
data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +547 -0
data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +51 -0
data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +3 -3
data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +3 -2
data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +274 -0
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +7 -2
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +5 -1
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +231 -0
data/vendor/faiss/faiss/gpu/test/TestUtils.h +33 -0
data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +1 -0
data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +6 -0
data/vendor/faiss/faiss/gpu/utils/Timer.cpp +5 -6
data/vendor/faiss/faiss/gpu/utils/Timer.h +2 -2
data/vendor/faiss/faiss/impl/AuxIndexStructures.h +5 -4
data/vendor/faiss/faiss/impl/HNSW.cpp +2 -4
data/vendor/faiss/faiss/impl/PolysemousTraining.h +4 -4
data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +22 -12
data/vendor/faiss/faiss/impl/ProductQuantizer.h +2 -0
data/vendor/faiss/faiss/impl/ResultHandler.h +452 -0
data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +29 -19
data/vendor/faiss/faiss/impl/ScalarQuantizer.h +6 -0
data/vendor/faiss/faiss/impl/index_read.cpp +64 -96
data/vendor/faiss/faiss/impl/index_write.cpp +34 -25
data/vendor/faiss/faiss/impl/io.cpp +33 -2
data/vendor/faiss/faiss/impl/io.h +7 -2
data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -15
data/vendor/faiss/faiss/impl/platform_macros.h +44 -0
data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +272 -0
data/vendor/faiss/faiss/impl/pq4_fast_scan.h +169 -0
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +180 -0
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +354 -0
data/vendor/faiss/faiss/impl/simd_result_handlers.h +559 -0
data/vendor/faiss/faiss/index_factory.cpp +112 -7
data/vendor/faiss/faiss/index_io.h +1 -48
data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +151 -0
data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +76 -0
data/vendor/faiss/faiss/{DirectMap.cpp → invlists/DirectMap.cpp} +1 -1
data/vendor/faiss/faiss/{DirectMap.h → invlists/DirectMap.h} +1 -1
data/vendor/faiss/faiss/{InvertedLists.cpp → invlists/InvertedLists.cpp} +72 -1
data/vendor/faiss/faiss/{InvertedLists.h → invlists/InvertedLists.h} +32 -1
data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +107 -0
data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +63 -0
data/vendor/faiss/faiss/{OnDiskInvertedLists.cpp → invlists/OnDiskInvertedLists.cpp} +21 -6
data/vendor/faiss/faiss/{OnDiskInvertedLists.h → invlists/OnDiskInvertedLists.h} +5 -2
data/vendor/faiss/faiss/python/python_callbacks.h +8 -1
data/vendor/faiss/faiss/utils/AlignedTable.h +141 -0
data/vendor/faiss/faiss/utils/Heap.cpp +2 -4
data/vendor/faiss/faiss/utils/Heap.h +61 -50
data/vendor/faiss/faiss/utils/distances.cpp +164 -319
data/vendor/faiss/faiss/utils/distances.h +28 -20
data/vendor/faiss/faiss/utils/distances_simd.cpp +277 -49
data/vendor/faiss/faiss/utils/extra_distances.cpp +1 -2
data/vendor/faiss/faiss/utils/hamming-inl.h +4 -4
data/vendor/faiss/faiss/utils/hamming.cpp +3 -6
data/vendor/faiss/faiss/utils/hamming.h +2 -7
data/vendor/faiss/faiss/utils/ordered_key_value.h +98 -0
data/vendor/faiss/faiss/utils/partitioning.cpp +1256 -0
data/vendor/faiss/faiss/utils/partitioning.h +69 -0
data/vendor/faiss/faiss/utils/quantize_lut.cpp +277 -0
data/vendor/faiss/faiss/utils/quantize_lut.h +80 -0
data/vendor/faiss/faiss/utils/simdlib.h +31 -0
data/vendor/faiss/faiss/utils/simdlib_avx2.h +461 -0
data/vendor/faiss/faiss/utils/simdlib_emulated.h +589 -0
metadata +54 -149
data/lib/faiss/index.rb +0 -20
data/lib/faiss/index_binary.rb +0 -20
data/lib/faiss/kmeans.rb +0 -15
data/lib/faiss/pca_matrix.rb +0 -15
data/lib/faiss/product_quantizer.rb +0 -22
data/vendor/faiss/benchs/bench_6bit_codec.cpp +0 -80
data/vendor/faiss/c_api/AutoTune_c.cpp +0 -83
data/vendor/faiss/c_api/AutoTune_c.h +0 -66
data/vendor/faiss/c_api/Clustering_c.cpp +0 -145
data/vendor/faiss/c_api/Clustering_c.h +0 -123
data/vendor/faiss/c_api/IndexFlat_c.cpp +0 -140
data/vendor/faiss/c_api/IndexFlat_c.h +0 -115
data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +0 -64
data/vendor/faiss/c_api/IndexIVFFlat_c.h +0 -58
data/vendor/faiss/c_api/IndexIVF_c.cpp +0 -99
data/vendor/faiss/c_api/IndexIVF_c.h +0 -142
data/vendor/faiss/c_api/IndexLSH_c.cpp +0 -37
data/vendor/faiss/c_api/IndexLSH_c.h +0 -40
data/vendor/faiss/c_api/IndexPreTransform_c.cpp +0 -21
data/vendor/faiss/c_api/IndexPreTransform_c.h +0 -32
data/vendor/faiss/c_api/IndexShards_c.cpp +0 -38
data/vendor/faiss/c_api/IndexShards_c.h +0 -39
data/vendor/faiss/c_api/Index_c.cpp +0 -105
data/vendor/faiss/c_api/Index_c.h +0 -183
data/vendor/faiss/c_api/MetaIndexes_c.cpp +0 -49
data/vendor/faiss/c_api/MetaIndexes_c.h +0 -49
data/vendor/faiss/c_api/clone_index_c.cpp +0 -23
data/vendor/faiss/c_api/clone_index_c.h +0 -32
data/vendor/faiss/c_api/error_c.h +0 -42
data/vendor/faiss/c_api/error_impl.cpp +0 -27
data/vendor/faiss/c_api/error_impl.h +0 -16
data/vendor/faiss/c_api/faiss_c.h +0 -58
data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +0 -98
data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +0 -56
data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +0 -52
data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +0 -68
data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +0 -17
data/vendor/faiss/c_api/gpu/GpuIndex_c.h +0 -30
data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +0 -38
data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +0 -86
data/vendor/faiss/c_api/gpu/GpuResources_c.h +0 -66
data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +0 -54
data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +0 -53
data/vendor/faiss/c_api/gpu/macros_impl.h +0 -42
data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +0 -220
data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +0 -149
data/vendor/faiss/c_api/index_factory_c.cpp +0 -26
data/vendor/faiss/c_api/index_factory_c.h +0 -30
data/vendor/faiss/c_api/index_io_c.cpp +0 -42
data/vendor/faiss/c_api/index_io_c.h +0 -50
data/vendor/faiss/c_api/macros_impl.h +0 -110
data/vendor/faiss/demos/demo_imi_flat.cpp +0 -154
data/vendor/faiss/demos/demo_imi_pq.cpp +0 -203
data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +0 -151
data/vendor/faiss/demos/demo_sift1M.cpp +0 -252
data/vendor/faiss/demos/demo_weighted_kmeans.cpp +0 -185
data/vendor/faiss/misc/test_blas.cpp +0 -87
data/vendor/faiss/tests/test_binary_flat.cpp +0 -62
data/vendor/faiss/tests/test_dealloc_invlists.cpp +0 -188
data/vendor/faiss/tests/test_ivfpq_codec.cpp +0 -70
data/vendor/faiss/tests/test_ivfpq_indexing.cpp +0 -100
data/vendor/faiss/tests/test_lowlevel_ivf.cpp +0 -573
data/vendor/faiss/tests/test_merge.cpp +0 -260
data/vendor/faiss/tests/test_omp_threads.cpp +0 -14
data/vendor/faiss/tests/test_ondisk_ivf.cpp +0 -225
data/vendor/faiss/tests/test_pairs_decoding.cpp +0 -193
data/vendor/faiss/tests/test_params_override.cpp +0 -236
data/vendor/faiss/tests/test_pq_encoding.cpp +0 -98
data/vendor/faiss/tests/test_sliding_ivf.cpp +0 -246
data/vendor/faiss/tests/test_threaded_index.cpp +0 -253
data/vendor/faiss/tests/test_transfer_invlists.cpp +0 -159
data/vendor/faiss/tutorial/cpp/1-Flat.cpp +0 -104
data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +0 -85
data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +0 -98
data/vendor/faiss/tutorial/cpp/4-GPU.cpp +0 -122
data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +0 -104

data/vendor/faiss/faiss/gpu/StandardGpuResources.h CHANGED Viewed

@@ -41,8 +41,22 @@ class StandardGpuResourcesImpl : public GpuResources {
   /// transfers
   void setPinnedMemory(size_t size);
-  /// Called to change the stream for work ordering
-  void setDefaultStream(int device, cudaStream_t stream);
+  /// Called to change the stream for work ordering. We do not own `stream`;
+  /// i.e., it will not be destroyed when the GpuResources object gets cleaned
+  /// up.
+  /// We are guaranteed that all Faiss GPU work is ordered with respect to
+  /// this stream upon exit from an index or other Faiss GPU call.
+  void setDefaultStream(int device, cudaStream_t stream) override;
+  /// Revert the default stream to the original stream managed by this resources
+  /// object, in case someone called `setDefaultStream`.
+  void revertDefaultStream(int device);
+  /// Returns the stream for the given device on which all Faiss GPU work is
+  /// ordered.
+  /// We are guaranteed that all Faiss GPU work is ordered with respect to
+  /// this stream upon exit from an index or other Faiss GPU call.
+  cudaStream_t getDefaultStream(int device) override;
   /// Called to change the work ordering streams to the null stream
   /// for all devices
@@ -60,8 +74,6 @@ class StandardGpuResourcesImpl : public GpuResources {
   cublasHandle_t getBlasHandle(int device) override;
-  cudaStream_t getDefaultStream(int device) override;
   std::vector<cudaStream_t> getAlternateStreams(int device) override;
   /// Allocate non-temporary GPU memory
@@ -128,7 +140,9 @@ class StandardGpuResourcesImpl : public GpuResources {
 };
 /// Default implementation of GpuResources that allocates a cuBLAS
-/// stream and 2 streams for use, as well as temporary memory
+/// stream and 2 streams for use, as well as temporary memory.
+/// Internally, the Faiss GPU code uses the instance managed by getResources,
+/// but this is the user-facing object that is internally reference counted.
 class StandardGpuResources : public GpuResourcesProvider {
  public:
   StandardGpuResources();
@@ -151,9 +165,17 @@ class StandardGpuResources : public GpuResourcesProvider {
   /// transfers
   void setPinnedMemory(size_t size);
-  /// Called to change the stream for work ordering
+  /// Called to change the stream for work ordering. We do not own `stream`;
+  /// i.e., it will not be destroyed when the GpuResources object gets cleaned
+  /// up.
+  /// We are guaranteed that all Faiss GPU work is ordered with respect to
+  /// this stream upon exit from an index or other Faiss GPU call.
   void setDefaultStream(int device, cudaStream_t stream);
+  /// Revert the default stream to the original stream managed by this resources
+  /// object, in case someone called `setDefaultStream`.
+  void revertDefaultStream(int device);
   /// Called to change the work ordering streams to the null stream
   /// for all devices
   void setDefaultNullStreamAllDevices();

data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp ADDED Viewed

@@ -0,0 +1,547 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <faiss/gpu/impl/InterleavedCodes.h>
+#include <faiss/impl/FaissAssert.h>
+#include <faiss/gpu/utils/StaticUtils.h>
+namespace faiss { namespace gpu {
+inline uint8_t unpack5(int i, uint8_t vLower, uint8_t vUpper) {  // returns the 5-bit code at position i % 8 of its 8-code group; callers pass the byte the code starts in (vLower) and the byte after it (vUpper)
+  uint8_t v = 0;  // stays 0 if no case below matches, which can only happen for a negative i
+  // packed layout, lsb ... msb (row = byte of the 5-byte group, digit = index of the code owning that bit)
+  // 0: 0 0 0 0 0 1 1 1
+  // 1: 1 1 2 2 2 2 2 3
+  // 2: 3 3 3 3 4 4 4 4
+  // 3: 4 5 5 5 5 5 6 6
+  // 4: 6 6 6 7 7 7 7 7
+  switch (i % 8) {  // 8 codes of 5 bits repeat every 5 bytes
+    case 0:
+      // 5 lsbs of lower
+      v = vLower & 0x1f;
+      break;
+    case 1:
+      // 3 msbs of lower as v lsbs
+      // 2 lsbs of upper as v msbs
+      v = (vLower >> 5) | ((vUpper & 0x3) << 3);
+      break;
+    case 2:
+      // bits 2..6 of lower; upper is not used
+      v = (vLower >> 2) & 0x1f;
+      break;
+    case 3:
+      // 1 msb of lower as v lsb
+      // 4 lsbs of upper as v msbs
+      v = (vLower >> 7) | ((vUpper & 0xf) << 1);
+      break;
+    case 4:
+      // 4 msbs of lower as v lsbs
+      // 1 lsb of upper as v msb
+      v = (vLower >> 4) | ((vUpper & 0x1) << 4);
+      break;
+    case 5:
+      // bits 1..5 of lower; upper is not used
+      v = (vLower >> 1) & 0x1f;
+      break;
+    case 6:
+      // 2 msbs of lower as v lsbs
+      // 3 lsbs of upper as v msbs
+      v = (vLower >> 6) | ((vUpper & 0x7) << 2);
+      break;
+    case 7:
+      // 5 msbs of lower; upper is not used
+      v = (vLower >> 3);
+      break;
+  }
+  return v;
+}
+inline uint8_t unpack6(int i, uint8_t vLower, uint8_t vUpper) {  // returns the 6-bit code at position i % 4 of its 4-code group; callers pass the byte the code starts in (vLower) and the byte after it (vUpper)
+  uint8_t v = 0;  // stays 0 if no case below matches, which can only happen for a negative i
+  switch (i % 4) {  // 4 codes of 6 bits repeat every 3 bytes
+    case 0:
+      // 6 lsbs of lower; upper is not used
+      v = vLower & 0x3f;
+      break;
+    case 1:
+      // 2 msbs of lower as v lsbs
+      // 4 lsbs of upper as v msbs
+      v = (vLower >> 6) | ((vUpper & 0xf) << 2);
+      break;
+    case 2:
+      // 4 msbs of lower as v lsbs
+      // 2 lsbs of upper as v msbs
+      v = (vLower >> 4) | ((vUpper & 0x3) << 4);
+      break;
+    case 3:
+      // 6 msbs of lower; upper is not used
+      v = (vLower >> 2);
+      break;
+  }
+  return v;
+}
+std::vector<uint8_t>  // returns the codes vector-major, each code padded to divUp(bitsPerCode, 8) bytes
+unpackNonInterleaved(std::vector<uint8_t> data,  // bit-packed codes; must hold exactly numVecs * divUp(dims * bitsPerCode, 8) bytes
+                     int numVecs,  // number of packed vectors
+                     int dims,  // number of codes per vector
+                     int bitsPerCode) {  // 4, 5, 6, 8, 16 or 32; anything else hits FAISS_ASSERT(false)
+  int srcVecSize = utils::divUp(dims * bitsPerCode, 8);  // bytes per packed vector (divUp presumably rounds the division up -- helper not visible here)
+  FAISS_ASSERT(data.size() == numVecs * srcVecSize);
+  if (bitsPerCode == 8 ||
+      bitsPerCode == 16 ||
+      bitsPerCode == 32) {
+    // nothing to do: whole-byte codes are returned exactly as they were passed in
+    return data;
+  }
+  // bit codes padded to whole bytes
+  std::vector<uint8_t> out(numVecs * dims * utils::divUp(bitsPerCode, 8));
+  if (bitsPerCode == 4) {
+#pragma omp parallel for
+    for (int i = 0; i < numVecs; ++i) {
+      for (int j = 0; j < dims; ++j) {
+        int srcIdx = i * srcVecSize + (j / 2);  // two 4-bit codes share one source byte
+        FAISS_ASSERT(srcIdx < data.size());
+        uint8_t v = data[srcIdx];
+        v = (j % 2 == 0) ? v & 0xf : v >> 4;  // even j lives in the low nibble, odd j in the high nibble
+        out[i * dims + j] = v;
+      }
+    }
+  } else if (bitsPerCode == 5) {
+#pragma omp parallel for
+    for (int i = 0; i < numVecs; ++i) {
+      for (int j = 0; j < dims; ++j) {
+        int lo = i * srcVecSize + (j * 5) / 8;  // byte in which code j starts
+        int hi = lo + 1;  // following byte; may be one past the end of data
+        FAISS_ASSERT(lo < data.size());
+        FAISS_ASSERT(hi <= data.size());
+        auto vLower = data[lo];
+        auto vUpper = hi < data.size() ? data[hi] : 0;  // substitute 0 instead of reading past the end
+        out[i * dims + j] = unpack5(j, vLower, vUpper);
+      }
+    }
+  } else if (bitsPerCode == 6) {
+#pragma omp parallel for
+    for (int i = 0; i < numVecs; ++i) {
+      for (int j = 0; j < dims; ++j) {
+        int lo = i * srcVecSize + (j * 6) / 8;  // byte in which code j starts
+        int hi = lo + 1;  // following byte; may be one past the end of data
+        FAISS_ASSERT(lo < data.size());
+        FAISS_ASSERT(hi <= data.size());
+        auto vLower = data[lo];
+        auto vUpper = hi < data.size() ? data[hi] : 0;  // substitute 0 instead of reading past the end
+        out[i * dims + j] = unpack6(j, vLower, vUpper);
+      }
+    }
+  } else {
+    // unhandled bitsPerCode
+    FAISS_ASSERT(false);
+  }
+  return out;
+}
+template <typename T>  // T is the word type holding one code; the visible callers use uint8_t, uint16_t and uint32_t
+void  // copies whole-word codes from the 32-vector interleaved layout of `in` to the vector-major layout of `out`
+unpackInterleavedWord(const T* in,  // interleaved source, read at block * wordsPerBlock + j * wordsPerDimBlock + lane
+                      T* out,  // destination, written at i * dims + j
+                      int numVecs,  // number of vectors to extract
+                      int dims,  // number of codes per vector
+                      int bitsPerCode) {  // bits per code; the visible callers pass 8 * sizeof(T)
+  int wordsPerDimBlock = 32 * bitsPerCode / (8 * sizeof(T));  // words one dimension occupies within a block of 32 vectors
+  int wordsPerBlock = wordsPerDimBlock * dims;  // words in one full block (all dimensions)
+  int numBlocks = utils::divUp(numVecs, 32);  // only used by the assertion below
+#pragma omp parallel for
+  for (int i = 0; i < numVecs; ++i) {
+    int block = i / 32;  // which group of 32 vectors
+    FAISS_ASSERT(block < numBlocks);
+    int lane = i % 32;  // position of the vector inside its block
+    for (int j = 0; j < dims; ++j) {
+      int srcOffset = block * wordsPerBlock + j * wordsPerDimBlock + lane;
+      out[i * dims + j] = in[srcOffset];
+    }
+  }
+}
+std::vector<uint8_t>  // returns the codes vector-major, each code padded to divUp(bitsPerCode, 8) bytes
+unpackInterleaved(std::vector<uint8_t> data,  // codes interleaved in blocks of 32 vectors; must hold exactly bytesPerBlock * numBlocks bytes
+                  int numVecs,  // number of vectors to extract
+                  int dims,  // number of codes per vector
+                  int bitsPerCode) {  // 4, 5, 6, 8, 16 or 32; anything else hits FAISS_ASSERT(false)
+  int bytesPerDimBlock = 32 * bitsPerCode / 8;  // bytes holding one dimension for the 32 vectors of a block
+  int bytesPerBlock = bytesPerDimBlock * dims;  // bytes of one full block (all dimensions)
+  int numBlocks = utils::divUp(numVecs, 32);
+  size_t totalSize = (size_t) bytesPerBlock * numBlocks;
+  FAISS_ASSERT(data.size() == totalSize);
+  // bit codes padded to whole bytes
+  std::vector<uint8_t> out(numVecs * dims * utils::divUp(bitsPerCode, 8));
+  if (bitsPerCode == 8) {
+    unpackInterleavedWord<uint8_t>(data.data(), out.data(),
+                                   numVecs, dims, bitsPerCode);
+  } else if (bitsPerCode == 16) {
+    unpackInterleavedWord<uint16_t>((uint16_t*) data.data(),
+                                    (uint16_t*) out.data(),
+                                    numVecs, dims, bitsPerCode);
+  } else if (bitsPerCode == 32) {
+    unpackInterleavedWord<uint32_t>((uint32_t*) data.data(),
+                                    (uint32_t*) out.data(),
+                                    numVecs, dims, bitsPerCode);
+  } else if (bitsPerCode == 4) {
+#pragma omp parallel for
+    for (int i = 0; i < numVecs; ++i) {
+      int block = i / 32;  // which group of 32 vectors
+      int lane = i % 32;  // position of the vector inside its block
+      int word = lane / 2;  // byte of the dimension block shared by two neighbouring vectors
+      int subWord = lane % 2;  // 0 selects the low nibble, 1 the high nibble
+      for (int j = 0; j < dims; ++j) {
+        auto v =
+          data[block * bytesPerBlock + j * bytesPerDimBlock + word];
+        v = (subWord == 0) ? v & 0xf : v >> 4;
+        out[i * dims + j] = v;
+      }
+    }
+  } else if (bitsPerCode == 5) {
+#pragma omp parallel for
+    for (int i = 0; i < numVecs; ++i) {
+      int block = i / 32;  // which group of 32 vectors
+      int blockVector = i % 32;  // position of the vector inside its block
+      for (int j = 0; j < dims; ++j) {
+        uint8_t* dimBlock =
+          &data[block * bytesPerBlock + j * bytesPerDimBlock];  // start of dimension j's bytes for this block
+        int lo = (blockVector * 5) / 8;  // byte in which this vector's code starts
+        int hi = lo + 1;  // following byte; may be one past the dimension block
+        FAISS_ASSERT(lo < bytesPerDimBlock);
+        FAISS_ASSERT(hi <= bytesPerDimBlock);
+        auto vLower = dimBlock[lo];
+        auto vUpper = hi < bytesPerDimBlock ? dimBlock[hi] : 0;  // substitute 0 instead of reading past the dimension block
+        out[i * dims + j] = unpack5(blockVector, vLower, vUpper);
+      }
+    }
+  } else if (bitsPerCode == 6) {
+#pragma omp parallel for
+    for (int i = 0; i < numVecs; ++i) {
+      int block = i / 32;  // which group of 32 vectors
+      int blockVector = i % 32;  // position of the vector inside its block
+      for (int j = 0; j < dims; ++j) {
+        uint8_t* dimBlock =
+          &data[block * bytesPerBlock + j * bytesPerDimBlock];  // start of dimension j's bytes for this block
+        int lo = (blockVector * 6) / 8;  // byte in which this vector's code starts
+        int hi = lo + 1;  // following byte; may be one past the dimension block
+        FAISS_ASSERT(lo < bytesPerDimBlock);
+        FAISS_ASSERT(hi <= bytesPerDimBlock);
+        auto vLower = dimBlock[lo];
+        auto vUpper = hi < bytesPerDimBlock ? dimBlock[hi] : 0;  // substitute 0 instead of reading past the dimension block
+        out[i * dims + j] = unpack6(blockVector, vLower, vUpper);
+      }
+    }
+  } else {
+    // unimplemented bitsPerCode
+    FAISS_ASSERT(false);
+  }
+  return out;
+}
+inline uint8_t pack5(int i, uint8_t lo, uint8_t hi, uint8_t hi2) {  // builds packed output byte i; callers pass the first code touching that byte (lo) and the next two codes (hi, hi2)
+  FAISS_ASSERT((lo & 0x1f) == lo);  // every input code must fit in 5 bits
+  FAISS_ASSERT((hi & 0x1f) == hi);
+  FAISS_ASSERT((hi2 & 0x1f) == hi2);
+  uint8_t v = 0;  // stays 0 if no case below matches, which can only happen for a negative i
+  // packed layout, lsb ... msb (row = byte of the 5-byte group, digit = index of the code owning that bit)
+  // 0: 0 0 0 0 0 1 1 1
+  // 1: 1 1 2 2 2 2 2 3
+  // 2: 3 3 3 3 4 4 4 4
+  // 3: 4 5 5 5 5 5 6 6
+  // 4: 6 6 6 7 7 7 7 7
+  switch (i % 5) {  // i is the output byte index; the layout repeats every 5 bytes
+    case 0:
+      // all 5 bits of lower as vOut lsbs
+      // 3 lsbs of upper as vOut msbs (higher bits are dropped by the uint8_t assignment)
+      v = (lo & 0x1f) | (hi << 5);
+      break;
+    case 1:
+      // 2 msbs of lower as vOut lsbs
+      // all 5 bits of upper in the middle
+      // 1 lsb of upper2 as vOut msb
+      v = (lo >> 3) | (hi << 2) | (hi2 << 7);
+      break;
+    case 2:
+      // 4 msbs of lower as vOut lsbs
+      // 4 lsbs of upper as vOut msbs
+      v = (lo >> 1) | (hi << 4);
+      break;
+    case 3:
+      // 1 msb of lower as vOut lsb
+      // all 5 bits of upper in the middle
+      // 2 lsbs of upper2 as vOut msbs
+      v = (lo >> 4) | (hi << 1) | (hi2 << 6);
+      break;
+    case 4:
+      // 3 msbs of lower as vOut lsbs
+      // all 5 bits of upper as vOut msbs
+      v = (lo >> 2) | (hi << 3);
+      break;
+  }
+  return v;
+}
+inline uint8_t pack6(int i, uint8_t lo, uint8_t hi) {  // builds packed output byte i; callers pass the first code touching that byte (lo) and the code after it (hi)
+  FAISS_ASSERT((lo & 0x3f) == lo);  // every input code must fit in 6 bits
+  FAISS_ASSERT((hi & 0x3f) == hi);
+  uint8_t v = 0;  // stays 0 if no case below matches, which can only happen for a negative i
+  // packed layout, lsb ... msb (row = byte of the 3-byte group, digit = index of the code owning that bit)
+  // 0: 0 0 0 0 0 0 1 1
+  // 1: 1 1 1 1 2 2 2 2
+  // 2: 2 2 3 3 3 3 3 3
+  switch (i % 3) {  // i is the output byte index; the layout repeats every 3 bytes
+    case 0:
+      // all 6 bits of lower as vOut lsbs
+      // 2 lsbs of upper as vOut msbs (higher bits are dropped by the uint8_t assignment)
+      v = (lo & 0x3f) | (hi << 6);
+      break;
+    case 1:
+      // 4 msbs of lower as vOut lsbs
+      // 4 lsbs of upper as vOut msbs
+      v = (lo >> 2) | (hi << 4);
+      break;
+    case 2:
+      // 2 msbs of lower as vOut lsbs
+      // all 6 bits of upper as vOut msbs
+      v = (lo >> 4) | (hi << 2);
+      break;
+  }
+  return v;
+}
+/// Packs byte-padded codes into a dense, vector-major bit stream.
+///
+/// `data` holds numVecs * dims codes, vector-major, each code padded to
+/// divUp(bitsPerCode, 8) bytes. In the result every vector occupies
+/// divUp(dims * bitsPerCode, 8) bytes; bits of the last byte of a vector that
+/// are not covered by a code are zero.
+///
+/// Supported values of bitsPerCode are 4, 5 and 6 (bit packed) and 8, 16 and
+/// 32 (returned unchanged). Any other value trips FAISS_ASSERT(false), as does
+/// a `data` size that does not match numVecs, dims and bitsPerCode.
+std::vector<uint8_t>
+packNonInterleaved(std::vector<uint8_t> data,
+                   int numVecs,
+                   int dims,
+                   int bitsPerCode) {
+  // bit codes padded to whole bytes; the product is formed in size_t so that
+  // large inputs cannot overflow int
+  FAISS_ASSERT(data.size() ==
+               (size_t) numVecs * dims * utils::divUp(bitsPerCode, 8));
+  if (bitsPerCode == 8 ||
+      bitsPerCode == 16 ||
+      bitsPerCode == 32) {
+    // nothing to do, whole words are already where they need to be
+    return data;
+  }
+  // bits packed into a whole number of bytes
+  int bytesPerVec = utils::divUp(dims * bitsPerCode, 8);
+  std::vector<uint8_t> out((size_t) numVecs * bytesPerVec);
+  if (bitsPerCode == 4) {
+#pragma omp parallel for
+    for (int i = 0; i < numVecs; ++i) {
+      size_t srcBase = (size_t) i * dims;
+      size_t dstBase = (size_t) i * bytesPerVec;
+      for (int j = 0; j < bytesPerVec; ++j) {
+        // output byte j holds codes 2j (low nibble) and 2j + 1 (high nibble)
+        int dimLo = j * 2;
+        int dimHi = dimLo + 1;
+        FAISS_ASSERT(dimLo < dims);
+        FAISS_ASSERT(dimHi <= dims);
+        uint8_t lo = data[srcBase + dimLo];
+        // an odd dims leaves the last high nibble without a code: use 0
+        uint8_t hi = dimHi < dims ? data[srcBase + dimHi] : 0;
+        out[dstBase + j] = (hi << 4) | (lo & 0xf);
+      }
+    }
+  } else if (bitsPerCode == 5) {
+#pragma omp parallel for
+    for (int i = 0; i < numVecs; ++i) {
+      size_t srcBase = (size_t) i * dims;
+      size_t dstBase = (size_t) i * bytesPerVec;
+      for (int j = 0; j < bytesPerVec; ++j) {
+        // first code with bits in output byte j, plus the two codes after it
+        // (a byte can receive bits from up to three 5-bit codes)
+        int dimLo = (j * 8) / 5;
+        int dimHi = dimLo + 1;
+        int dimHi2 = dimHi + 1;
+        FAISS_ASSERT(dimLo < dims);
+        FAISS_ASSERT(dimHi <= dims);
+        // was a second check of dimHi, which the assert above already implies
+        FAISS_ASSERT(dimHi2 <= dims + 1);
+        uint8_t lo = data[srcBase + dimLo];
+        // codes past the end of the vector contribute zero bits
+        uint8_t hi = dimHi < dims ? data[srcBase + dimHi] : 0;
+        uint8_t hi2 = dimHi2 < dims ? data[srcBase + dimHi2] : 0;
+        out[dstBase + j] = pack5(j, lo, hi, hi2);
+      }
+    }
+  } else if (bitsPerCode == 6) {
+#pragma omp parallel for
+    for (int i = 0; i < numVecs; ++i) {
+      size_t srcBase = (size_t) i * dims;
+      size_t dstBase = (size_t) i * bytesPerVec;
+      for (int j = 0; j < bytesPerVec; ++j) {
+        // first code with bits in output byte j, plus the code after it
+        int dimLo = (j * 8) / 6;
+        int dimHi = dimLo + 1;
+        FAISS_ASSERT(dimLo < dims);
+        FAISS_ASSERT(dimHi <= dims);
+        uint8_t lo = data[srcBase + dimLo];
+        // a code past the end of the vector contributes zero bits
+        uint8_t hi = dimHi < dims ? data[srcBase + dimHi] : 0;
+        out[dstBase + j] = pack6(j, lo, hi);
+      }
+    }
+  } else {
+    // unhandled
+    FAISS_ASSERT(false);
+  }
+  return out;
+}
+template <typename T>  // T is the word type holding one code; the visible callers use uint8_t, uint16_t and uint32_t
+void  // copies whole-word codes from the vector-major layout of `in` to the 32-vector interleaved layout of `out`
+packInterleavedWord(const T* in,  // vector-major source, read at i * dims + j
+                    T* out,  // interleaved destination, written at block * wordsPerBlock + j * wordsPerDimBlock + lane
+                    int numVecs,  // number of vectors to store
+                    int dims,  // number of codes per vector
+                    int bitsPerCode) {  // bits per code; the visible callers pass 8 * sizeof(T)
+  int wordsPerDimBlock = 32 * bitsPerCode / (8 * sizeof(T));  // words one dimension occupies within a block of 32 vectors
+  int wordsPerBlock = wordsPerDimBlock * dims;  // words in one full block (all dimensions)
+  int numBlocks = utils::divUp(numVecs, 32);  // only used by the assertion below
+  // We're guaranteed that all other slots not filled by the vectors present are
+  // initialized to zero (from the vector constructor in packInterleaved)
+#pragma omp parallel for
+  for (int i = 0; i < numVecs; ++i) {
+    int block = i / 32;  // which group of 32 vectors
+    FAISS_ASSERT(block < numBlocks);
+    int lane = i % 32;  // position of the vector inside its block
+    for (int j = 0; j < dims; ++j) {
+      int dstOffset = block * wordsPerBlock + j * wordsPerDimBlock + lane;
+      out[dstOffset] = in[i * dims + j];
+    }
+  }
+}
+std::vector<uint8_t>  // returns bytesPerBlock * numBlocks bytes: the codes interleaved in blocks of 32 vectors
+packInterleaved(std::vector<uint8_t> data,  // vector-major codes, each padded to divUp(bitsPerCode, 8) bytes
+                int numVecs,  // number of vectors in data
+                int dims,  // number of codes per vector
+                int bitsPerCode) {  // 4, 5, 6, 8, 16 or 32; anything else hits FAISS_ASSERT(false)
+  int bytesPerDimBlock = 32 * bitsPerCode / 8;  // bytes holding one dimension for the 32 vectors of a block
+  int bytesPerBlock = bytesPerDimBlock * dims;  // bytes of one full block (all dimensions)
+  int numBlocks = utils::divUp(numVecs, 32);
+  size_t totalSize = (size_t) bytesPerBlock * numBlocks;
+  // bit codes padded to whole bytes
+  FAISS_ASSERT(data.size() == numVecs * dims * utils::divUp(bitsPerCode, 8));
+  // packs based on blocks; zero-filled, so slots of a partial last block stay 0
+  std::vector<uint8_t> out(totalSize, 0);
+  if (bitsPerCode == 8) {
+    packInterleavedWord<uint8_t>(data.data(), out.data(),
+                                 numVecs, dims, bitsPerCode);
+  } else if (bitsPerCode == 16) {
+    packInterleavedWord<uint16_t>((uint16_t*) data.data(),
+                                  (uint16_t*) out.data(),
+                                  numVecs, dims, bitsPerCode);
+  } else if (bitsPerCode == 32) {
+    packInterleavedWord<uint32_t>((uint32_t*) data.data(),
+                                  (uint32_t*) out.data(),
+                                  numVecs, dims, bitsPerCode);
+  } else if (bitsPerCode == 4) {
+#pragma omp parallel for
+    for (int i = 0; i < numBlocks; ++i) {
+      for (int j = 0; j < dims; ++j) {
+        for (int k = 0; k < bytesPerDimBlock; ++k) {
+          int loVec = i * 32 + k * 2;  // vector stored in the low nibble of output byte k
+          int hiVec = loVec + 1;  // vector stored in the high nibble
+          uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;  // vectors past numVecs contribute 0
+          uint8_t hi = hiVec < numVecs ? data[hiVec * dims + j] : 0;
+          out[i * bytesPerBlock + j * bytesPerDimBlock + k] =
+            (hi << 4) | (lo & 0xf);
+        }
+      }
+    }
+  } else if (bitsPerCode == 5) {
+#pragma omp parallel for
+    for (int i = 0; i < numBlocks; ++i) {
+      for (int j = 0; j < dims; ++j) {
+        for (int k = 0; k < bytesPerDimBlock; ++k) {
+          // What input vectors we are pulling from: the first one with bits in output byte k and the two after it
+          int loVec = i * 32 + (k * 8) / 5;
+          int hiVec = loVec + 1;
+          int hiVec2 = hiVec + 1;
+          uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;  // vectors past numVecs contribute 0
+          uint8_t hi = hiVec < numVecs ? data[hiVec * dims + j] : 0;
+          uint8_t hi2 = hiVec2 < numVecs ? data[hiVec2 * dims + j] : 0;
+          out[i * bytesPerBlock + j * bytesPerDimBlock + k] = pack5(k, lo, hi, hi2);
+        }
+      }
+    }
+  } else if (bitsPerCode == 6) {
+#pragma omp parallel for
+    for (int i = 0; i < numBlocks; ++i) {
+      for (int j = 0; j < dims; ++j) {
+        for (int k = 0; k < bytesPerDimBlock; ++k) {
+          // What input vectors we are pulling from: the first one with bits in output byte k and the one after it
+          int loVec = i * 32 + (k * 8) / 6;
+          int hiVec = loVec + 1;
+          uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;  // vectors past numVecs contribute 0
+          uint8_t hi = hiVec < numVecs ? data[hiVec * dims + j] : 0;
+          out[i * bytesPerBlock + j * bytesPerDimBlock + k] = pack6(k, lo, hi);
+        }
+      }
+    }
+  } else {
+    // unimplemented bitsPerCode
+    FAISS_ASSERT(false);
+  }
+  return out;
+}
+} } // namespace