RubyGems - faiss - Versions diffs - 0.1.3 → 0.1.4 - Mend

faiss 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (184) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +4 -0
data/LICENSE.txt +1 -1
data/README.md +1 -1
data/ext/faiss/extconf.rb +1 -1
data/lib/faiss/version.rb +1 -1
data/vendor/faiss/faiss/AutoTune.cpp +36 -33
data/vendor/faiss/faiss/AutoTune.h +6 -3
data/vendor/faiss/faiss/Clustering.cpp +16 -12
data/vendor/faiss/faiss/Index.cpp +3 -4
data/vendor/faiss/faiss/Index.h +3 -3
data/vendor/faiss/faiss/IndexBinary.cpp +3 -4
data/vendor/faiss/faiss/IndexBinary.h +1 -1
data/vendor/faiss/faiss/IndexBinaryHash.cpp +2 -12
data/vendor/faiss/faiss/IndexBinaryIVF.cpp +1 -2
data/vendor/faiss/faiss/IndexFlat.cpp +0 -148
data/vendor/faiss/faiss/IndexFlat.h +0 -51
data/vendor/faiss/faiss/IndexHNSW.cpp +4 -5
data/vendor/faiss/faiss/IndexIVF.cpp +118 -31
data/vendor/faiss/faiss/IndexIVF.h +22 -15
data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -3
data/vendor/faiss/faiss/IndexIVFFlat.h +2 -1
data/vendor/faiss/faiss/IndexIVFPQ.cpp +39 -15
data/vendor/faiss/faiss/IndexIVFPQ.h +25 -9
data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +1116 -0
data/vendor/faiss/faiss/IndexIVFPQFastScan.h +166 -0
data/vendor/faiss/faiss/IndexIVFPQR.cpp +8 -9
data/vendor/faiss/faiss/IndexIVFPQR.h +2 -1
data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +1 -2
data/vendor/faiss/faiss/IndexPQ.cpp +34 -18
data/vendor/faiss/faiss/IndexPQFastScan.cpp +536 -0
data/vendor/faiss/faiss/IndexPQFastScan.h +111 -0
data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -0
data/vendor/faiss/faiss/IndexPreTransform.h +2 -0
data/vendor/faiss/faiss/IndexRefine.cpp +256 -0
data/vendor/faiss/faiss/IndexRefine.h +73 -0
data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +2 -2
data/vendor/faiss/faiss/IndexScalarQuantizer.h +1 -1
data/vendor/faiss/faiss/gpu/GpuDistance.h +1 -1
data/vendor/faiss/faiss/gpu/GpuIndex.h +16 -9
data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +8 -1
data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +11 -11
data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +19 -2
data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +28 -2
data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +24 -14
data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +29 -2
data/vendor/faiss/faiss/gpu/GpuResources.h +4 -0
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +60 -27
data/vendor/faiss/faiss/gpu/StandardGpuResources.h +28 -6
data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +547 -0
data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +51 -0
data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +3 -3
data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +3 -2
data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +274 -0
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +7 -2
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +5 -1
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +231 -0
data/vendor/faiss/faiss/gpu/test/TestUtils.h +33 -0
data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +1 -0
data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +6 -0
data/vendor/faiss/faiss/gpu/utils/Timer.cpp +5 -6
data/vendor/faiss/faiss/gpu/utils/Timer.h +2 -2
data/vendor/faiss/faiss/impl/AuxIndexStructures.h +5 -4
data/vendor/faiss/faiss/impl/HNSW.cpp +2 -4
data/vendor/faiss/faiss/impl/PolysemousTraining.h +4 -4
data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +22 -12
data/vendor/faiss/faiss/impl/ProductQuantizer.h +2 -0
data/vendor/faiss/faiss/impl/ResultHandler.h +452 -0
data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +29 -19
data/vendor/faiss/faiss/impl/ScalarQuantizer.h +6 -0
data/vendor/faiss/faiss/impl/index_read.cpp +64 -96
data/vendor/faiss/faiss/impl/index_write.cpp +34 -25
data/vendor/faiss/faiss/impl/io.cpp +33 -2
data/vendor/faiss/faiss/impl/io.h +7 -2
data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -15
data/vendor/faiss/faiss/impl/platform_macros.h +44 -0
data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +272 -0
data/vendor/faiss/faiss/impl/pq4_fast_scan.h +169 -0
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +180 -0
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +354 -0
data/vendor/faiss/faiss/impl/simd_result_handlers.h +559 -0
data/vendor/faiss/faiss/index_factory.cpp +112 -7
data/vendor/faiss/faiss/index_io.h +1 -48
data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +151 -0
data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +76 -0
data/vendor/faiss/faiss/{DirectMap.cpp → invlists/DirectMap.cpp} +1 -1
data/vendor/faiss/faiss/{DirectMap.h → invlists/DirectMap.h} +1 -1
data/vendor/faiss/faiss/{InvertedLists.cpp → invlists/InvertedLists.cpp} +72 -1
data/vendor/faiss/faiss/{InvertedLists.h → invlists/InvertedLists.h} +32 -1
data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +107 -0
data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +63 -0
data/vendor/faiss/faiss/{OnDiskInvertedLists.cpp → invlists/OnDiskInvertedLists.cpp} +21 -6
data/vendor/faiss/faiss/{OnDiskInvertedLists.h → invlists/OnDiskInvertedLists.h} +5 -2
data/vendor/faiss/faiss/python/python_callbacks.h +8 -1
data/vendor/faiss/faiss/utils/AlignedTable.h +141 -0
data/vendor/faiss/faiss/utils/Heap.cpp +2 -4
data/vendor/faiss/faiss/utils/Heap.h +61 -50
data/vendor/faiss/faiss/utils/distances.cpp +164 -319
data/vendor/faiss/faiss/utils/distances.h +28 -20
data/vendor/faiss/faiss/utils/distances_simd.cpp +277 -49
data/vendor/faiss/faiss/utils/extra_distances.cpp +1 -2
data/vendor/faiss/faiss/utils/hamming-inl.h +4 -4
data/vendor/faiss/faiss/utils/hamming.cpp +3 -6
data/vendor/faiss/faiss/utils/hamming.h +2 -7
data/vendor/faiss/faiss/utils/ordered_key_value.h +98 -0
data/vendor/faiss/faiss/utils/partitioning.cpp +1256 -0
data/vendor/faiss/faiss/utils/partitioning.h +69 -0
data/vendor/faiss/faiss/utils/quantize_lut.cpp +277 -0
data/vendor/faiss/faiss/utils/quantize_lut.h +80 -0
data/vendor/faiss/faiss/utils/simdlib.h +31 -0
data/vendor/faiss/faiss/utils/simdlib_avx2.h +461 -0
data/vendor/faiss/faiss/utils/simdlib_emulated.h +589 -0
metadata +43 -141
data/vendor/faiss/benchs/bench_6bit_codec.cpp +0 -80
data/vendor/faiss/c_api/AutoTune_c.cpp +0 -83
data/vendor/faiss/c_api/AutoTune_c.h +0 -66
data/vendor/faiss/c_api/Clustering_c.cpp +0 -145
data/vendor/faiss/c_api/Clustering_c.h +0 -123
data/vendor/faiss/c_api/IndexFlat_c.cpp +0 -140
data/vendor/faiss/c_api/IndexFlat_c.h +0 -115
data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +0 -64
data/vendor/faiss/c_api/IndexIVFFlat_c.h +0 -58
data/vendor/faiss/c_api/IndexIVF_c.cpp +0 -99
data/vendor/faiss/c_api/IndexIVF_c.h +0 -142
data/vendor/faiss/c_api/IndexLSH_c.cpp +0 -37
data/vendor/faiss/c_api/IndexLSH_c.h +0 -40
data/vendor/faiss/c_api/IndexPreTransform_c.cpp +0 -21
data/vendor/faiss/c_api/IndexPreTransform_c.h +0 -32
data/vendor/faiss/c_api/IndexShards_c.cpp +0 -38
data/vendor/faiss/c_api/IndexShards_c.h +0 -39
data/vendor/faiss/c_api/Index_c.cpp +0 -105
data/vendor/faiss/c_api/Index_c.h +0 -183
data/vendor/faiss/c_api/MetaIndexes_c.cpp +0 -49
data/vendor/faiss/c_api/MetaIndexes_c.h +0 -49
data/vendor/faiss/c_api/clone_index_c.cpp +0 -23
data/vendor/faiss/c_api/clone_index_c.h +0 -32
data/vendor/faiss/c_api/error_c.h +0 -42
data/vendor/faiss/c_api/error_impl.cpp +0 -27
data/vendor/faiss/c_api/error_impl.h +0 -16
data/vendor/faiss/c_api/faiss_c.h +0 -58
data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +0 -98
data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +0 -56
data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +0 -52
data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +0 -68
data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +0 -17
data/vendor/faiss/c_api/gpu/GpuIndex_c.h +0 -30
data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +0 -38
data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +0 -86
data/vendor/faiss/c_api/gpu/GpuResources_c.h +0 -66
data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +0 -54
data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +0 -53
data/vendor/faiss/c_api/gpu/macros_impl.h +0 -42
data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +0 -220
data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +0 -149
data/vendor/faiss/c_api/index_factory_c.cpp +0 -26
data/vendor/faiss/c_api/index_factory_c.h +0 -30
data/vendor/faiss/c_api/index_io_c.cpp +0 -42
data/vendor/faiss/c_api/index_io_c.h +0 -50
data/vendor/faiss/c_api/macros_impl.h +0 -110
data/vendor/faiss/demos/demo_imi_flat.cpp +0 -154
data/vendor/faiss/demos/demo_imi_pq.cpp +0 -203
data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +0 -151
data/vendor/faiss/demos/demo_sift1M.cpp +0 -252
data/vendor/faiss/demos/demo_weighted_kmeans.cpp +0 -185
data/vendor/faiss/misc/test_blas.cpp +0 -87
data/vendor/faiss/tests/test_binary_flat.cpp +0 -62
data/vendor/faiss/tests/test_dealloc_invlists.cpp +0 -188
data/vendor/faiss/tests/test_ivfpq_codec.cpp +0 -70
data/vendor/faiss/tests/test_ivfpq_indexing.cpp +0 -100
data/vendor/faiss/tests/test_lowlevel_ivf.cpp +0 -573
data/vendor/faiss/tests/test_merge.cpp +0 -260
data/vendor/faiss/tests/test_omp_threads.cpp +0 -14
data/vendor/faiss/tests/test_ondisk_ivf.cpp +0 -225
data/vendor/faiss/tests/test_pairs_decoding.cpp +0 -193
data/vendor/faiss/tests/test_params_override.cpp +0 -236
data/vendor/faiss/tests/test_pq_encoding.cpp +0 -98
data/vendor/faiss/tests/test_sliding_ivf.cpp +0 -246
data/vendor/faiss/tests/test_threaded_index.cpp +0 -253
data/vendor/faiss/tests/test_transfer_invlists.cpp +0 -159
data/vendor/faiss/tutorial/cpp/1-Flat.cpp +0 -104
data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +0 -85
data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +0 -98
data/vendor/faiss/tutorial/cpp/4-GPU.cpp +0 -122
data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +0 -104

data/vendor/faiss/faiss/gpu/StandardGpuResources.h CHANGED Viewed

@@ -41,8 +41,22 @@ class StandardGpuResourcesImpl : public GpuResources {
   /// transfers
   void setPinnedMemory(size_t size);
-  /// Called to change the stream for work ordering
-  void setDefaultStream(int device, cudaStream_t stream);
+  /// Called to change the stream for work ordering. We do not own `stream`;
+  /// i.e., it will not be destroyed when the GpuResources object gets cleaned
+  /// up.
+  /// We are guaranteed that all Faiss GPU work is ordered with respect to
+  /// this stream upon exit from an index or other Faiss GPU call.
+  void setDefaultStream(int device, cudaStream_t stream) override;
+  /// Revert the default stream to the original stream managed by this resources
+  /// object, in case someone called `setDefaultStream`.
+  void revertDefaultStream(int device);
+  /// Returns the stream for the given device on which all Faiss GPU work is
+  /// ordered.
+  /// We are guaranteed that all Faiss GPU work is ordered with respect to
+  /// this stream upon exit from an index or other Faiss GPU call.
+  cudaStream_t getDefaultStream(int device) override;
   /// Called to change the work ordering streams to the null stream
   /// for all devices
@@ -60,8 +74,6 @@ class StandardGpuResourcesImpl : public GpuResources {
   cublasHandle_t getBlasHandle(int device) override;
-  cudaStream_t getDefaultStream(int device) override;
   std::vector<cudaStream_t> getAlternateStreams(int device) override;
   /// Allocate non-temporary GPU memory
@@ -128,7 +140,9 @@ class StandardGpuResourcesImpl : public GpuResources {
 };
 /// Default implementation of GpuResources that allocates a cuBLAS
-/// stream and 2 streams for use, as well as temporary memory
+/// stream and 2 streams for use, as well as temporary memory.
+/// Internally, the Faiss GPU code uses the instance managed by getResources,
+/// but this is the user-facing object that is internally reference counted.
 class StandardGpuResources : public GpuResourcesProvider {
  public:
   StandardGpuResources();
@@ -151,9 +165,17 @@ class StandardGpuResources : public GpuResourcesProvider {
   /// transfers
   void setPinnedMemory(size_t size);
-  /// Called to change the stream for work ordering
+  /// Called to change the stream for work ordering. We do not own `stream`;
+  /// i.e., it will not be destroyed when the GpuResources object gets cleaned
+  /// up.
+  /// We are guaranteed that all Faiss GPU work is ordered with respect to
+  /// this stream upon exit from an index or other Faiss GPU call.
   void setDefaultStream(int device, cudaStream_t stream);
+  /// Revert the default stream to the original stream managed by this resources
+  /// object, in case someone called `setDefaultStream`.
+  void revertDefaultStream(int device);
   /// Called to change the work ordering streams to the null stream
   /// for all devices
   void setDefaultNullStreamAllDevices();

data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp ADDED Viewed

@@ -0,0 +1,547 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <faiss/gpu/impl/InterleavedCodes.h>
+#include <faiss/impl/FaissAssert.h>
+#include <faiss/gpu/utils/StaticUtils.h>
+namespace faiss { namespace gpu {
+// Extracts one 5-bit code from a byte stream in which eight 5-bit codes are
+// packed into five consecutive bytes (see the layout table below).
+//   i      - index of the code; only i % 8 is used, to pick the bit position
+//   vLower - the byte holding the least significant bits of the code
+//   vUpper - the byte after vLower; only read when the code straddles a byte
+//            boundary (cases 1, 3, 4 and 6)
+// Returns the code in the low 5 bits of the result.
+inline uint8_t unpack5(int i, uint8_t vLower, uint8_t vUpper) {
+  uint8_t v = 0;
+  // Each row is one packed byte with its bits listed from lsb to msb; the
+  // digit is the index (mod 8) of the code that owns that bit.
+  // lsb     ...    msb
+  // 0: 0 0 0 0 0 1 1 1
+  // 1: 1 1 2 2 2 2 2 3
+  // 2: 3 3 3 3 4 4 4 4
+  // 3: 4 5 5 5 5 5 6 6
+  // 4: 6 6 6 7 7 7 7 7
+  switch (i % 8) {
+    case 0:
+      // 5 lsbs of lower
+      v = vLower & 0x1f;
+      break;
+    case 1:
+      // 3 msbs of lower as v lsbs
+      // 2 lsbs of upper as v msbs
+      v = (vLower >> 5) | ((vUpper & 0x3) << 3);
+      break;
+    case 2:
+      // bits 2..6 of lower
+      v = (vLower >> 2) & 0x1f;
+      break;
+    case 3:
+      // 1 msb of lower as v lsb
+      // 4 lsbs of upper as v msbs
+      v = (vLower >> 7) | ((vUpper & 0xf) << 1);
+      break;
+    case 4:
+      // 4 msbs of lower as v lsbs
+      // 1 lsb of upper as v msb
+      v = (vLower >> 4) | ((vUpper & 0x1) << 4);
+      break;
+    case 5:
+      // bits 1..5 of lower
+      v = (vLower >> 1) & 0x1f;
+      break;
+    case 6:
+      // 2 msbs of lower as v lsbs
+      // 3 lsbs of upper as v msbs
+      v = (vLower >> 6) | ((vUpper & 0x7) << 2);
+      break;
+    case 7:
+      // 5 msbs of lower
+      v = (vLower >> 3);
+      break;
+  }
+  return v;
+}
+// Extracts one 6-bit code from a byte stream in which four 6-bit codes are
+// packed into three consecutive bytes.
+//   i      - index of the code; only i % 4 is used, to pick the bit position
+//   vLower - the byte holding the least significant bits of the code
+//   vUpper - the byte after vLower; only read when the code straddles a byte
+//            boundary (cases 1 and 2)
+// Returns the code in the low 6 bits of the result.
+inline uint8_t unpack6(int i, uint8_t vLower, uint8_t vUpper) {
+  uint8_t v = 0;
+  switch (i % 4) {
+    case 0:
+      // 6 lsbs of lower
+      v = vLower & 0x3f;
+      break;
+    case 1:
+      // 2 msbs of lower as v lsbs
+      // 4 lsbs of upper as v msbs
+      v = (vLower >> 6) | ((vUpper & 0xf) << 2);
+      break;
+    case 2:
+      // 4 msbs of lower as v lsbs
+      // 2 lsbs of upper as v msbs
+      v = (vLower >> 4) | ((vUpper & 0x3) << 4);
+      break;
+    case 3:
+      // 6 msbs of lower
+      v = (vLower >> 2);
+      break;
+  }
+  return v;
+}
+// Expands vector-major, bit-packed codes into one code per byte.
+//   data        - packed codes; its size must equal
+//                 numVecs * divUp(dims * bitsPerCode, 8)
+//   numVecs     - number of encoded vectors
+//   dims        - number of codes per vector
+//   bitsPerCode - 4, 5, 6, 8, 16 or 32; any other value trips FAISS_ASSERT
+// For 8, 16 and 32 bits the input is returned unchanged. Otherwise the result
+// has numVecs * dims bytes, with the code for (vector i, dimension j) stored
+// at index i * dims + j.
+// NOTE(review): utils::divUp is presumably a round-up integer division; its
+// definition is not visible here.
+std::vector<uint8_t>
+unpackNonInterleaved(std::vector<uint8_t> data,
+                     int numVecs,
+                     int dims,
+                     int bitsPerCode) {
+  // bytes occupied by one packed vector in the input
+  int srcVecSize = utils::divUp(dims * bitsPerCode, 8);
+  FAISS_ASSERT(data.size() == numVecs * srcVecSize);
+  if (bitsPerCode == 8 ||
+      bitsPerCode == 16 ||
+      bitsPerCode == 32) {
+    // nothing to do
+    return data;
+  }
+  // bit codes padded to whole bytes
+  std::vector<uint8_t> out(numVecs * dims * utils::divUp(bitsPerCode, 8));
+  if (bitsPerCode == 4) {
+#pragma omp parallel for
+    for (int i = 0; i < numVecs; ++i) {
+      for (int j = 0; j < dims; ++j) {
+        // two codes per source byte
+        int srcIdx = i * srcVecSize + (j / 2);
+        FAISS_ASSERT(srcIdx < data.size());
+        uint8_t v = data[srcIdx];
+        // even dimensions live in the low nibble, odd ones in the high nibble
+        v = (j % 2 == 0) ? v & 0xf : v >> 4;
+        out[i * dims + j] = v;
+      }
+    }
+  } else if (bitsPerCode == 5) {
+#pragma omp parallel for
+    for (int i = 0; i < numVecs; ++i) {
+      for (int j = 0; j < dims; ++j) {
+        // byte in which code j starts, and the byte after it
+        int lo = i * srcVecSize + (j * 5) / 8;
+        int hi = lo + 1;
+        FAISS_ASSERT(lo < data.size());
+        FAISS_ASSERT(hi <= data.size());
+        auto vLower = data[lo];
+        // hi may be one past the end of the buffer; use 0 rather than read it
+        auto vUpper = hi < data.size() ? data[hi] : 0;
+        out[i * dims + j] = unpack5(j, vLower, vUpper);
+      }
+    }
+  } else if (bitsPerCode == 6) {
+#pragma omp parallel for
+    for (int i = 0; i < numVecs; ++i) {
+      for (int j = 0; j < dims; ++j) {
+        // byte in which code j starts, and the byte after it
+        int lo = i * srcVecSize + (j * 6) / 8;
+        int hi = lo + 1;
+        FAISS_ASSERT(lo < data.size());
+        FAISS_ASSERT(hi <= data.size());
+        auto vLower = data[lo];
+        // hi may be one past the end of the buffer; use 0 rather than read it
+        auto vUpper = hi < data.size() ? data[hi] : 0;
+        out[i * dims + j] = unpack6(j, vLower, vUpper);
+      }
+    }
+  } else {
+    // unhandled
+    FAISS_ASSERT(false);
+  }
+  return out;
+}
+// Converts whole-word codes from the interleaved layout to vector-major order.
+// Vectors are grouped into blocks of 32; inside a block the 32 words of one
+// dimension are contiguous, followed by the 32 words of the next dimension.
+//   in          - interleaved input, read at
+//                 block * wordsPerBlock + j * wordsPerDimBlock + lane
+//   out         - output, written at i * dims + j for vector i, dimension j
+//   numVecs     - number of vectors to extract
+//   dims        - number of codes per vector
+//   bitsPerCode - code width in bits; the callers visible in this file pass
+//                 8 * sizeof(T)
+template <typename T>
+void
+unpackInterleavedWord(const T* in,
+                      T* out,
+                      int numVecs,
+                      int dims,
+                      int bitsPerCode) {
+  int wordsPerDimBlock = 32 * bitsPerCode / (8 * sizeof(T));
+  int wordsPerBlock = wordsPerDimBlock * dims;
+  int numBlocks = utils::divUp(numVecs, 32);
+#pragma omp parallel for
+  for (int i = 0; i < numVecs; ++i) {
+    int block = i / 32;
+    FAISS_ASSERT(block < numBlocks);
+    // position of vector i within its block of 32
+    int lane = i % 32;
+    for (int j = 0; j < dims; ++j) {
+      int srcOffset = block * wordsPerBlock + j * wordsPerDimBlock + lane;
+      out[i * dims + j] = in[srcOffset];
+    }
+  }
+}
+// Converts codes from the 32-vector interleaved block layout into
+// vector-major form with every code padded to whole bytes.
+//   data        - interleaved codes; its size must equal
+//                 (32 * bitsPerCode / 8) * dims * divUp(numVecs, 32)
+//   numVecs     - number of vectors to extract
+//   dims        - number of codes per vector
+//   bitsPerCode - 4, 5, 6, 8, 16 or 32; any other value trips FAISS_ASSERT
+// Returns numVecs * dims * divUp(bitsPerCode, 8) bytes; the code for
+// (vector i, dimension j) is element i * dims + j.
+std::vector<uint8_t>
+unpackInterleaved(std::vector<uint8_t> data,
+                  int numVecs,
+                  int dims,
+                  int bitsPerCode) {
+  // bytes holding the 32 codes of one dimension within a block
+  int bytesPerDimBlock = 32 * bitsPerCode / 8;
+  int bytesPerBlock = bytesPerDimBlock * dims;
+  int numBlocks = utils::divUp(numVecs, 32);
+  size_t totalSize = (size_t) bytesPerBlock * numBlocks;
+  FAISS_ASSERT(data.size() == totalSize);
+  // bit codes padded to whole bytes
+  std::vector<uint8_t> out(numVecs * dims * utils::divUp(bitsPerCode, 8));
+  if (bitsPerCode == 8) {
+    unpackInterleavedWord<uint8_t>(data.data(), out.data(),
+                                   numVecs, dims, bitsPerCode);
+  } else if (bitsPerCode == 16) {
+    // NOTE(review): the byte buffers are reinterpreted as wider words here and
+    // in the 32-bit branch; this assumes the vector storage is suitably
+    // aligned - confirm.
+    unpackInterleavedWord<uint16_t>((uint16_t*) data.data(),
+                                    (uint16_t*) out.data(),
+                                    numVecs, dims, bitsPerCode);
+  } else if (bitsPerCode == 32) {
+    unpackInterleavedWord<uint32_t>((uint32_t*) data.data(),
+                                    (uint32_t*) out.data(),
+                                    numVecs, dims, bitsPerCode);
+  } else if (bitsPerCode == 4) {
+#pragma omp parallel for
+    for (int i = 0; i < numVecs; ++i) {
+      int block = i / 32;
+      int lane = i % 32;
+      // two vectors share each byte of a dimension block
+      int word = lane / 2;
+      int subWord = lane % 2;
+      for (int j = 0; j < dims; ++j) {
+        auto v =
+          data[block * bytesPerBlock + j * bytesPerDimBlock + word];
+        // even lanes use the low nibble, odd lanes the high nibble
+        v = (subWord == 0) ? v & 0xf : v >> 4;
+        out[i * dims + j] = v;
+      }
+    }
+  } else if (bitsPerCode == 5) {
+#pragma omp parallel for
+    for (int i = 0; i < numVecs; ++i) {
+      int block = i / 32;
+      // position of vector i within its block of 32
+      int blockVector = i % 32;
+      for (int j = 0; j < dims; ++j) {
+        // start of the bytes holding dimension j for this block
+        uint8_t* dimBlock =
+          &data[block * bytesPerBlock + j * bytesPerDimBlock];
+        int lo = (blockVector * 5) / 8;
+        int hi = lo + 1;
+        FAISS_ASSERT(lo < bytesPerDimBlock);
+        FAISS_ASSERT(hi <= bytesPerDimBlock);
+        auto vLower = dimBlock[lo];
+        // never read past the end of this dimension block
+        auto vUpper = hi < bytesPerDimBlock ? dimBlock[hi] : 0;
+        out[i * dims + j] = unpack5(blockVector, vLower, vUpper);
+      }
+    }
+  } else if (bitsPerCode == 6) {
+#pragma omp parallel for
+    for (int i = 0; i < numVecs; ++i) {
+      int block = i / 32;
+      // position of vector i within its block of 32
+      int blockVector = i % 32;
+      for (int j = 0; j < dims; ++j) {
+        // start of the bytes holding dimension j for this block
+        uint8_t* dimBlock =
+          &data[block * bytesPerBlock + j * bytesPerDimBlock];
+        int lo = (blockVector * 6) / 8;
+        int hi = lo + 1;
+        FAISS_ASSERT(lo < bytesPerDimBlock);
+        FAISS_ASSERT(hi <= bytesPerDimBlock);
+        auto vLower = dimBlock[lo];
+        // never read past the end of this dimension block
+        auto vUpper = hi < bytesPerDimBlock ? dimBlock[hi] : 0;
+        out[i * dims + j] = unpack6(blockVector, vLower, vUpper);
+      }
+    }
+  } else {
+    // unimplemented
+    FAISS_ASSERT(false);
+  }
+  return out;
+}
+// Builds one output byte of a stream in which eight 5-bit codes are packed
+// into five consecutive bytes (see the layout table below).
+//   i   - index of the output byte; only i % 5 is used, to pick the bit layout
+//   lo  - the first code that contributes bits to this byte
+//   hi  - the code after lo
+//   hi2 - the code after hi; only used when three codes touch the byte
+//         (cases 1 and 3)
+// Every code must fit in 5 bits (checked with FAISS_ASSERT). Bits shifted
+// beyond bit 7 are dropped when the result is stored in the uint8_t.
+inline uint8_t pack5(int i, uint8_t lo, uint8_t hi, uint8_t hi2) {
+  FAISS_ASSERT((lo & 0x1f) == lo);
+  FAISS_ASSERT((hi & 0x1f) == hi);
+  FAISS_ASSERT((hi2 & 0x1f) == hi2);
+  uint8_t v = 0;
+  // Each row is one packed byte with its bits listed from lsb to msb; the
+  // digit is the index (mod 8) of the code that owns that bit.
+  // lsb     ...    msb
+  // 0: 0 0 0 0 0 1 1 1
+  // 1: 1 1 2 2 2 2 2 3
+  // 2: 3 3 3 3 4 4 4 4
+  // 3: 4 5 5 5 5 5 6 6
+  // 4: 6 6 6 7 7 7 7 7
+  switch (i % 5) {
+    case 0:
+      // all 5 bits of lower as vOut lsbs
+      // 3 lsbs of upper as vOut msbs
+      v = (lo & 0x1f) | (hi << 5);
+      break;
+    case 1:
+      // 2 msbs of lower as vOut lsbs
+      // all 5 bits of upper in the middle of vOut
+      // 1 lsb of upper2 as vOut msb
+      v = (lo >> 3) | (hi << 2) | (hi2 << 7);
+      break;
+    case 2:
+      // 4 msbs of lower as vOut lsbs
+      // 4 lsbs of upper as vOut msbs
+      v = (lo >> 1) | (hi << 4);
+      break;
+    case 3:
+      // 1 msb of lower as vOut lsb
+      // all 5 bits of upper in the middle of vOut
+      // 2 lsbs of upper2 as vOut msbs
+      v = (lo >> 4) | (hi << 1) | (hi2 << 6);
+      break;
+    case 4:
+      // 3 msbs of lower as vOut lsbs
+      // all 5 bits of upper as vOut msbs
+      v = (lo >> 2) | (hi << 3);
+      break;
+  }
+  return v;
+}
+// Builds one output byte of a stream in which four 6-bit codes are packed
+// into three consecutive bytes (see the layout table below).
+//   i  - index of the output byte; only i % 3 is used, to pick the bit layout
+//   lo - the first code that contributes bits to this byte
+//   hi - the code after lo
+// Both codes must fit in 6 bits (checked with FAISS_ASSERT). Bits shifted
+// beyond bit 7 are dropped when the result is stored in the uint8_t.
+inline uint8_t pack6(int i, uint8_t lo, uint8_t hi) {
+  FAISS_ASSERT((lo & 0x3f) == lo);
+  FAISS_ASSERT((hi & 0x3f) == hi);
+  uint8_t v = 0;
+  // Each row is one packed byte with its bits listed from lsb to msb; the
+  // digit is the index (mod 4) of the code that owns that bit.
+  // lsb     ...    msb
+  // 0: 0 0 0 0 0 0 1 1
+  // 1: 1 1 1 1 2 2 2 2
+  // 2: 2 2 3 3 3 3 3 3
+  switch (i % 3) {
+    case 0:
+      // all 6 bits of lower as vOut lsbs
+      // 2 lsbs of upper as vOut msbs
+      v = (lo & 0x3f) | (hi << 6);
+      break;
+    case 1:
+      // 4 msbs of lower as vOut lsbs
+      // 4 lsbs of upper as vOut msbs
+      v = (lo >> 2) | (hi << 4);
+      break;
+    case 2:
+      // 2 msbs of lower as vOut lsbs
+      // all 6 bits of upper as vOut msbs
+      v = (lo >> 4) | (hi << 2);
+      break;
+  }
+  return v;
+}
+// Packs one-code-per-byte input into a vector-major, bit-packed byte stream.
+//   data        - unpacked codes; its size must equal
+//                 numVecs * dims * divUp(bitsPerCode, 8)
+//   numVecs     - number of vectors
+//   dims        - number of codes per vector
+//   bitsPerCode - 4, 5, 6, 8, 16 or 32; any other value trips FAISS_ASSERT
+// For 8, 16 and 32 bits the input is returned unchanged. Otherwise each
+// vector occupies divUp(dims * bitsPerCode, 8) bytes in the result, and the
+// unused high bits of a vector's last byte are zero.
+std::vector<uint8_t>
+packNonInterleaved(std::vector<uint8_t> data,
+                   int numVecs,
+                   int dims,
+                   int bitsPerCode) {
+  // bit codes padded to whole bytes
+  FAISS_ASSERT(data.size() == numVecs * dims * utils::divUp(bitsPerCode, 8));
+  if (bitsPerCode == 8 ||
+      bitsPerCode == 16 ||
+      bitsPerCode == 32) {
+    // nothing to do, whole words are already where they need to be
+    return data;
+  }
+  // bits packed into a whole number of bytes
+  int bytesPerVec = utils::divUp(dims * bitsPerCode, 8);
+  std::vector<uint8_t> out(numVecs * bytesPerVec);
+  if (bitsPerCode == 4) {
+#pragma omp parallel for
+    for (int i = 0; i < numVecs; ++i) {
+      for (int j = 0; j < bytesPerVec; ++j) {
+        // the two dimensions that share output byte j
+        int dimLo = j * 2;
+        int dimHi = dimLo + 1;
+        FAISS_ASSERT(dimLo < dims);
+        FAISS_ASSERT(dimHi <= dims);
+        uint8_t lo = data[i * dims + dimLo];
+        // past the last dimension: pad with zero bits
+        uint8_t hi = dimHi < dims ? data[i * dims + dimHi] : 0;
+        out[i * bytesPerVec + j] = (hi << 4) | (lo & 0xf);
+      }
+    }
+  } else if (bitsPerCode == 5) {
+#pragma omp parallel for
+    for (int i = 0; i < numVecs; ++i) {
+      for (int j = 0; j < bytesPerVec; ++j) {
+        // first dimension contributing to output byte j, and the next two
+        int dimLo = (j * 8) / 5;
+        int dimHi = dimLo + 1;
+        int dimHi2 = dimHi + 1;
+        FAISS_ASSERT(dimLo < dims);
+        FAISS_ASSERT(dimHi <= dims);
+        // check the third index; the original re-checked dimHi by mistake
+        FAISS_ASSERT(dimHi2 <= dims + 1);
+        uint8_t lo = data[i * dims + dimLo];
+        // past the last dimension: pad with zero bits
+        uint8_t hi = dimHi < dims ? data[i * dims + dimHi] : 0;
+        uint8_t hi2 = dimHi2 < dims ? data[i * dims + dimHi2] : 0;
+        out[i * bytesPerVec + j] = pack5(j, lo, hi, hi2);
+      }
+    }
+  } else if (bitsPerCode == 6) {
+#pragma omp parallel for
+    for (int i = 0; i < numVecs; ++i) {
+      for (int j = 0; j < bytesPerVec; ++j) {
+        // first dimension contributing to output byte j, and the next one
+        int dimLo = (j * 8) / 6;
+        int dimHi = dimLo + 1;
+        FAISS_ASSERT(dimLo < dims);
+        FAISS_ASSERT(dimHi <= dims);
+        uint8_t lo = data[i * dims + dimLo];
+        // past the last dimension: pad with zero bits
+        uint8_t hi = dimHi < dims ? data[i * dims + dimHi] : 0;
+        out[i * bytesPerVec + j] = pack6(j, lo, hi);
+      }
+    }
+  } else {
+    // unhandled
+    FAISS_ASSERT(false);
+  }
+  return out;
+}
+// Scatters whole-word codes from vector-major order into the interleaved
+// layout. Vectors are grouped into blocks of 32; inside a block the 32 words
+// of one dimension are contiguous, followed by those of the next dimension.
+//   in          - input, read at i * dims + j for vector i, dimension j
+//   out         - interleaved output, written at
+//                 block * wordsPerBlock + j * wordsPerDimBlock + lane
+//   numVecs     - number of vectors to store
+//   dims        - number of codes per vector
+//   bitsPerCode - code width in bits; the callers visible in this file pass
+//                 8 * sizeof(T)
+template <typename T>
+void
+packInterleavedWord(const T* in,
+                    T* out,
+                    int numVecs,
+                    int dims,
+                    int bitsPerCode) {
+  int wordsPerDimBlock = 32 * bitsPerCode / (8 * sizeof(T));
+  int wordsPerBlock = wordsPerDimBlock * dims;
+  int numBlocks = utils::divUp(numVecs, 32);
+  // We're guaranteed that all other slots not filled by the vectors present are
+  // initialized to zero (from the vector constructor in packInterleaved)
+#pragma omp parallel for
+  for (int i = 0; i < numVecs; ++i) {
+    int block = i / 32;
+    FAISS_ASSERT(block < numBlocks);
+    // position of vector i within its block of 32
+    int lane = i % 32;
+    for (int j = 0; j < dims; ++j) {
+      int dstOffset = block * wordsPerBlock + j * wordsPerDimBlock + lane;
+      out[dstOffset] = in[i * dims + j];
+    }
+  }
+}
+// Packs one-code-per-byte (or whole-word) input into the 32-vector
+// interleaved block layout.
+//   data        - vector-major codes; its size must equal
+//                 numVecs * dims * divUp(bitsPerCode, 8)
+//   numVecs     - number of vectors
+//   dims        - number of codes per vector
+//   bitsPerCode - 4, 5, 6, 8, 16 or 32; any other value trips FAISS_ASSERT
+// Returns (32 * bitsPerCode / 8) * dims * divUp(numVecs, 32) bytes. Slots for
+// vectors beyond numVecs in the last block are zero.
+std::vector<uint8_t>
+packInterleaved(std::vector<uint8_t> data,
+                int numVecs,
+                int dims,
+                int bitsPerCode) {
+  // bytes holding the 32 codes of one dimension within a block
+  int bytesPerDimBlock = 32 * bitsPerCode / 8;
+  int bytesPerBlock = bytesPerDimBlock * dims;
+  int numBlocks = utils::divUp(numVecs, 32);
+  size_t totalSize = (size_t) bytesPerBlock * numBlocks;
+  // bit codes padded to whole bytes
+  FAISS_ASSERT(data.size() == numVecs * dims * utils::divUp(bitsPerCode, 8));
+  // packs based on blocks
+  std::vector<uint8_t> out(totalSize, 0);
+  if (bitsPerCode == 8) {
+    packInterleavedWord<uint8_t>(data.data(), out.data(),
+                                 numVecs, dims, bitsPerCode);
+  } else if (bitsPerCode == 16) {
+    // NOTE(review): the byte buffers are reinterpreted as wider words here and
+    // in the 32-bit branch; this assumes the vector storage is suitably
+    // aligned - confirm.
+    packInterleavedWord<uint16_t>((uint16_t*) data.data(),
+                                  (uint16_t*) out.data(),
+                                  numVecs, dims, bitsPerCode);
+  } else if (bitsPerCode == 32) {
+    packInterleavedWord<uint32_t>((uint32_t*) data.data(),
+                                  (uint32_t*) out.data(),
+                                  numVecs, dims, bitsPerCode);
+  } else if (bitsPerCode == 4) {
+#pragma omp parallel for
+    for (int i = 0; i < numBlocks; ++i) {
+      for (int j = 0; j < dims; ++j) {
+        for (int k = 0; k < bytesPerDimBlock; ++k) {
+          // the two input vectors that share output byte k
+          int loVec = i * 32 + k * 2;
+          int hiVec = loVec + 1;
+          // vectors past numVecs contribute zero bits
+          uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;
+          uint8_t hi = hiVec < numVecs ? data[hiVec * dims + j] : 0;
+          out[i * bytesPerBlock + j * bytesPerDimBlock + k] =
+            (hi << 4) | (lo & 0xf);
+        }
+      }
+    }
+  } else if (bitsPerCode == 5) {
+#pragma omp parallel for
+    for (int i = 0; i < numBlocks; ++i) {
+      for (int j = 0; j < dims; ++j) {
+        for (int k = 0; k < bytesPerDimBlock; ++k) {
+          // What input vectors we are pulling from
+          int loVec = i * 32 + (k * 8) / 5;
+          int hiVec = loVec + 1;
+          int hiVec2 = hiVec + 1;
+          // vectors past numVecs contribute zero bits
+          uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;
+          uint8_t hi = hiVec < numVecs ? data[hiVec * dims + j] : 0;
+          uint8_t hi2 = hiVec2 < numVecs ? data[hiVec2 * dims + j] : 0;
+          out[i * bytesPerBlock + j * bytesPerDimBlock + k] = pack5(k, lo, hi, hi2);
+        }
+      }
+    }
+  } else if (bitsPerCode == 6) {
+#pragma omp parallel for
+    for (int i = 0; i < numBlocks; ++i) {
+      for (int j = 0; j < dims; ++j) {
+        for (int k = 0; k < bytesPerDimBlock; ++k) {
+          // What input vectors we are pulling from
+          int loVec = i * 32 + (k * 8) / 6;
+          int hiVec = loVec + 1;
+          // vectors past numVecs contribute zero bits
+          uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;
+          uint8_t hi = hiVec < numVecs ? data[hiVec * dims + j] : 0;
+          out[i * bytesPerBlock + j * bytesPerDimBlock + k] = pack6(k, lo, hi);
+        }
+      }
+    }
+  } else {
+    // unimplemented
+    FAISS_ASSERT(false);
+  }
+  return out;
+}
+} } // namespace