RubyGems - faiss - Versions diffs - 0.2.0 → 0.2.1 - Mend

faiss 0.2.0 → 0.2.1

Files changed (202) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +4 -0
data/lib/faiss/version.rb +1 -1
data/vendor/faiss/faiss/AutoTune.cpp +292 -291
data/vendor/faiss/faiss/AutoTune.h +55 -56
data/vendor/faiss/faiss/Clustering.cpp +334 -195
data/vendor/faiss/faiss/Clustering.h +88 -35
data/vendor/faiss/faiss/IVFlib.cpp +171 -195
data/vendor/faiss/faiss/IVFlib.h +48 -51
data/vendor/faiss/faiss/Index.cpp +85 -103
data/vendor/faiss/faiss/Index.h +54 -48
data/vendor/faiss/faiss/Index2Layer.cpp +139 -164
data/vendor/faiss/faiss/Index2Layer.h +22 -22
data/vendor/faiss/faiss/IndexBinary.cpp +45 -37
data/vendor/faiss/faiss/IndexBinary.h +140 -132
data/vendor/faiss/faiss/IndexBinaryFlat.cpp +73 -53
data/vendor/faiss/faiss/IndexBinaryFlat.h +29 -24
data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +46 -43
data/vendor/faiss/faiss/IndexBinaryFromFloat.h +16 -15
data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +215 -232
data/vendor/faiss/faiss/IndexBinaryHNSW.h +25 -24
data/vendor/faiss/faiss/IndexBinaryHash.cpp +182 -177
data/vendor/faiss/faiss/IndexBinaryHash.h +41 -34
data/vendor/faiss/faiss/IndexBinaryIVF.cpp +489 -461
data/vendor/faiss/faiss/IndexBinaryIVF.h +97 -68
data/vendor/faiss/faiss/IndexFlat.cpp +116 -147
data/vendor/faiss/faiss/IndexFlat.h +35 -46
data/vendor/faiss/faiss/IndexHNSW.cpp +372 -348
data/vendor/faiss/faiss/IndexHNSW.h +57 -41
data/vendor/faiss/faiss/IndexIVF.cpp +474 -454
data/vendor/faiss/faiss/IndexIVF.h +146 -113
data/vendor/faiss/faiss/IndexIVFFlat.cpp +248 -250
data/vendor/faiss/faiss/IndexIVFFlat.h +48 -51
data/vendor/faiss/faiss/IndexIVFPQ.cpp +457 -516
data/vendor/faiss/faiss/IndexIVFPQ.h +74 -66
data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +406 -372
data/vendor/faiss/faiss/IndexIVFPQFastScan.h +82 -57
data/vendor/faiss/faiss/IndexIVFPQR.cpp +104 -102
data/vendor/faiss/faiss/IndexIVFPQR.h +33 -28
data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +125 -133
data/vendor/faiss/faiss/IndexIVFSpectralHash.h +19 -21
data/vendor/faiss/faiss/IndexLSH.cpp +75 -96
data/vendor/faiss/faiss/IndexLSH.h +21 -26
data/vendor/faiss/faiss/IndexLattice.cpp +42 -56
data/vendor/faiss/faiss/IndexLattice.h +11 -16
data/vendor/faiss/faiss/IndexNNDescent.cpp +231 -0
data/vendor/faiss/faiss/IndexNNDescent.h +72 -0
data/vendor/faiss/faiss/IndexNSG.cpp +303 -0
data/vendor/faiss/faiss/IndexNSG.h +85 -0
data/vendor/faiss/faiss/IndexPQ.cpp +405 -464
data/vendor/faiss/faiss/IndexPQ.h +64 -67
data/vendor/faiss/faiss/IndexPQFastScan.cpp +143 -170
data/vendor/faiss/faiss/IndexPQFastScan.h +46 -32
data/vendor/faiss/faiss/IndexPreTransform.cpp +120 -150
data/vendor/faiss/faiss/IndexPreTransform.h +33 -36
data/vendor/faiss/faiss/IndexRefine.cpp +115 -131
data/vendor/faiss/faiss/IndexRefine.h +22 -23
data/vendor/faiss/faiss/IndexReplicas.cpp +147 -153
data/vendor/faiss/faiss/IndexReplicas.h +62 -56
data/vendor/faiss/faiss/IndexResidual.cpp +291 -0
data/vendor/faiss/faiss/IndexResidual.h +152 -0
data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +120 -155
data/vendor/faiss/faiss/IndexScalarQuantizer.h +41 -45
data/vendor/faiss/faiss/IndexShards.cpp +256 -240
data/vendor/faiss/faiss/IndexShards.h +85 -73
data/vendor/faiss/faiss/MatrixStats.cpp +112 -97
data/vendor/faiss/faiss/MatrixStats.h +7 -10
data/vendor/faiss/faiss/MetaIndexes.cpp +135 -157
data/vendor/faiss/faiss/MetaIndexes.h +40 -34
data/vendor/faiss/faiss/MetricType.h +7 -7
data/vendor/faiss/faiss/VectorTransform.cpp +652 -474
data/vendor/faiss/faiss/VectorTransform.h +61 -89
data/vendor/faiss/faiss/clone_index.cpp +77 -73
data/vendor/faiss/faiss/clone_index.h +4 -9
data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +33 -38
data/vendor/faiss/faiss/gpu/GpuAutoTune.h +11 -9
data/vendor/faiss/faiss/gpu/GpuCloner.cpp +197 -170
data/vendor/faiss/faiss/gpu/GpuCloner.h +53 -35
data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +12 -14
data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +27 -25
data/vendor/faiss/faiss/gpu/GpuDistance.h +116 -112
data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -2
data/vendor/faiss/faiss/gpu/GpuIndex.h +134 -137
data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +76 -73
data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +173 -162
data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +67 -64
data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +89 -86
data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +150 -141
data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +101 -103
data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +17 -16
data/vendor/faiss/faiss/gpu/GpuResources.cpp +116 -128
data/vendor/faiss/faiss/gpu/GpuResources.h +182 -186
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +433 -422
data/vendor/faiss/faiss/gpu/StandardGpuResources.h +131 -130
data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +468 -456
data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +25 -19
data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +22 -20
data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +9 -8
data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +39 -44
data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +16 -14
data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +77 -71
data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +109 -88
data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +75 -64
data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +230 -215
data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +80 -86
data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +284 -277
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +416 -416
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +611 -517
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +166 -164
data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +61 -53
data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +274 -238
data/vendor/faiss/faiss/gpu/test/TestUtils.h +73 -57
data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +47 -50
data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +79 -72
data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +140 -146
data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +69 -71
data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +21 -16
data/vendor/faiss/faiss/gpu/utils/Timer.cpp +25 -29
data/vendor/faiss/faiss/gpu/utils/Timer.h +30 -29
data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +270 -0
data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +115 -0
data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +90 -120
data/vendor/faiss/faiss/impl/AuxIndexStructures.h +81 -65
data/vendor/faiss/faiss/impl/FaissAssert.h +73 -58
data/vendor/faiss/faiss/impl/FaissException.cpp +56 -48
data/vendor/faiss/faiss/impl/FaissException.h +41 -29
data/vendor/faiss/faiss/impl/HNSW.cpp +595 -611
data/vendor/faiss/faiss/impl/HNSW.h +179 -200
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +672 -0
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +172 -0
data/vendor/faiss/faiss/impl/NNDescent.cpp +487 -0
data/vendor/faiss/faiss/impl/NNDescent.h +154 -0
data/vendor/faiss/faiss/impl/NSG.cpp +682 -0
data/vendor/faiss/faiss/impl/NSG.h +199 -0
data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +484 -454
data/vendor/faiss/faiss/impl/PolysemousTraining.h +52 -55
data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +26 -47
data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +469 -459
data/vendor/faiss/faiss/impl/ProductQuantizer.h +76 -87
data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +448 -0
data/vendor/faiss/faiss/impl/ResidualQuantizer.h +130 -0
data/vendor/faiss/faiss/impl/ResultHandler.h +96 -132
data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +648 -701
data/vendor/faiss/faiss/impl/ScalarQuantizer.h +48 -46
data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +129 -131
data/vendor/faiss/faiss/impl/ThreadedIndex.h +61 -55
data/vendor/faiss/faiss/impl/index_read.cpp +547 -479
data/vendor/faiss/faiss/impl/index_write.cpp +497 -407
data/vendor/faiss/faiss/impl/io.cpp +75 -94
data/vendor/faiss/faiss/impl/io.h +31 -41
data/vendor/faiss/faiss/impl/io_macros.h +40 -29
data/vendor/faiss/faiss/impl/lattice_Zn.cpp +137 -186
data/vendor/faiss/faiss/impl/lattice_Zn.h +40 -51
data/vendor/faiss/faiss/impl/platform_macros.h +29 -8
data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +77 -124
data/vendor/faiss/faiss/impl/pq4_fast_scan.h +39 -48
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +41 -52
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +80 -117
data/vendor/faiss/faiss/impl/simd_result_handlers.h +109 -137
data/vendor/faiss/faiss/index_factory.cpp +269 -218
data/vendor/faiss/faiss/index_factory.h +6 -7
data/vendor/faiss/faiss/index_io.h +23 -26
data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +67 -75
data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +22 -24
data/vendor/faiss/faiss/invlists/DirectMap.cpp +96 -112
data/vendor/faiss/faiss/invlists/DirectMap.h +29 -33
data/vendor/faiss/faiss/invlists/InvertedLists.cpp +307 -364
data/vendor/faiss/faiss/invlists/InvertedLists.h +151 -151
data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +29 -34
data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +17 -18
data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +257 -293
data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +50 -45
data/vendor/faiss/faiss/python/python_callbacks.cpp +23 -26
data/vendor/faiss/faiss/python/python_callbacks.h +9 -16
data/vendor/faiss/faiss/utils/AlignedTable.h +79 -44
data/vendor/faiss/faiss/utils/Heap.cpp +40 -48
data/vendor/faiss/faiss/utils/Heap.h +186 -209
data/vendor/faiss/faiss/utils/WorkerThread.cpp +67 -76
data/vendor/faiss/faiss/utils/WorkerThread.h +32 -33
data/vendor/faiss/faiss/utils/distances.cpp +301 -310
data/vendor/faiss/faiss/utils/distances.h +133 -118
data/vendor/faiss/faiss/utils/distances_simd.cpp +456 -516
data/vendor/faiss/faiss/utils/extra_distances-inl.h +117 -0
data/vendor/faiss/faiss/utils/extra_distances.cpp +113 -232
data/vendor/faiss/faiss/utils/extra_distances.h +30 -29
data/vendor/faiss/faiss/utils/hamming-inl.h +260 -209
data/vendor/faiss/faiss/utils/hamming.cpp +375 -469
data/vendor/faiss/faiss/utils/hamming.h +62 -85
data/vendor/faiss/faiss/utils/ordered_key_value.h +16 -18
data/vendor/faiss/faiss/utils/partitioning.cpp +393 -318
data/vendor/faiss/faiss/utils/partitioning.h +26 -21
data/vendor/faiss/faiss/utils/quantize_lut.cpp +78 -66
data/vendor/faiss/faiss/utils/quantize_lut.h +22 -20
data/vendor/faiss/faiss/utils/random.cpp +39 -63
data/vendor/faiss/faiss/utils/random.h +13 -16
data/vendor/faiss/faiss/utils/simdlib.h +4 -2
data/vendor/faiss/faiss/utils/simdlib_avx2.h +88 -85
data/vendor/faiss/faiss/utils/simdlib_emulated.h +226 -165
data/vendor/faiss/faiss/utils/simdlib_neon.h +832 -0
data/vendor/faiss/faiss/utils/utils.cpp +304 -287
data/vendor/faiss/faiss/utils/utils.h +53 -48
metadata +20 -2

data/vendor/faiss/faiss/gpu/StandardGpuResources.h CHANGED Viewed

@@ -5,138 +5,138 @@
  * LICENSE file in the root directory of this source tree.
  */
 #pragma once
 #include <faiss/gpu/GpuResources.h>
-#include <faiss/gpu/utils/StackDeviceMemory.h>
 #include <faiss/gpu/utils/DeviceUtils.h>
+#include <faiss/gpu/utils/StackDeviceMemory.h>
 #include <functional>
 #include <map>
 #include <unordered_map>
 #include <vector>
-namespace faiss { namespace gpu {
+namespace faiss {
+namespace gpu {
 /// Standard implementation of the GpuResources object that provides for a
 /// temporary memory manager
 class StandardGpuResourcesImpl : public GpuResources {
- public:
-  StandardGpuResourcesImpl();
+   public:
+    StandardGpuResourcesImpl();
-  ~StandardGpuResourcesImpl() override;
+    ~StandardGpuResourcesImpl() override;
-  /// Disable allocation of temporary memory; all temporary memory
-  /// requests will call cudaMalloc / cudaFree at the point of use
-  void noTempMemory();
+    /// Disable allocation of temporary memory; all temporary memory
+    /// requests will call cudaMalloc / cudaFree at the point of use
+    void noTempMemory();
-  /// Specify that we wish to use a certain fixed size of memory on
-  /// all devices as temporary memory. This is the upper bound for the GPU
-  /// memory that we will reserve. We will never go above 1.5 GiB on any GPU;
-  /// smaller GPUs (with <= 4 GiB or <= 8 GiB) will use less memory than that.
-  /// To avoid any temporary memory allocation, pass 0.
-  void setTempMemory(size_t size);
+    /// Specify that we wish to use a certain fixed size of memory on
+    /// all devices as temporary memory. This is the upper bound for the GPU
+    /// memory that we will reserve. We will never go above 1.5 GiB on any GPU;
+    /// smaller GPUs (with <= 4 GiB or <= 8 GiB) will use less memory than that.
+    /// To avoid any temporary memory allocation, pass 0.
+    void setTempMemory(size_t size);
-  /// Set amount of pinned memory to allocate, for async GPU <-> CPU
-  /// transfers
-  void setPinnedMemory(size_t size);
+    /// Set amount of pinned memory to allocate, for async GPU <-> CPU
+    /// transfers
+    void setPinnedMemory(size_t size);
-  /// Called to change the stream for work ordering. We do not own `stream`;
-  /// i.e., it will not be destroyed when the GpuResources object gets cleaned
-  /// up.
-  /// We are guaranteed that all Faiss GPU work is ordered with respect to
-  /// this stream upon exit from an index or other Faiss GPU call.
-  void setDefaultStream(int device, cudaStream_t stream) override;
+    /// Called to change the stream for work ordering. We do not own `stream`;
+    /// i.e., it will not be destroyed when the GpuResources object gets cleaned
+    /// up.
+    /// We are guaranteed that all Faiss GPU work is ordered with respect to
+    /// this stream upon exit from an index or other Faiss GPU call.
+    void setDefaultStream(int device, cudaStream_t stream) override;
-  /// Revert the default stream to the original stream managed by this resources
-  /// object, in case someone called `setDefaultStream`.
-  void revertDefaultStream(int device);
+    /// Revert the default stream to the original stream managed by this
+    /// resources object, in case someone called `setDefaultStream`.
+    void revertDefaultStream(int device);
-  /// Returns the stream for the given device on which all Faiss GPU work is
-  /// ordered.
-  /// We are guaranteed that all Faiss GPU work is ordered with respect to
-  /// this stream upon exit from an index or other Faiss GPU call.
-  cudaStream_t getDefaultStream(int device) override;
+    /// Returns the stream for the given device on which all Faiss GPU work is
+    /// ordered.
+    /// We are guaranteed that all Faiss GPU work is ordered with respect to
+    /// this stream upon exit from an index or other Faiss GPU call.
+    cudaStream_t getDefaultStream(int device) override;
-  /// Called to change the work ordering streams to the null stream
-  /// for all devices
-  void setDefaultNullStreamAllDevices();
+    /// Called to change the work ordering streams to the null stream
+    /// for all devices
+    void setDefaultNullStreamAllDevices();
-  /// If enabled, will print every GPU memory allocation and deallocation to
-  /// standard output
-  void setLogMemoryAllocations(bool enable);
+    /// If enabled, will print every GPU memory allocation and deallocation to
+    /// standard output
+    void setLogMemoryAllocations(bool enable);
- public:
-  /// Internal system calls
+   public:
+    /// Internal system calls
-  /// Initialize resources for this device
-  void initializeForDevice(int device) override;
+    /// Initialize resources for this device
+    void initializeForDevice(int device) override;
-  cublasHandle_t getBlasHandle(int device) override;
+    cublasHandle_t getBlasHandle(int device) override;
-  std::vector<cudaStream_t> getAlternateStreams(int device) override;
+    std::vector<cudaStream_t> getAlternateStreams(int device) override;
-  /// Allocate non-temporary GPU memory
-  void* allocMemory(const AllocRequest& req) override;
+    /// Allocate non-temporary GPU memory
+    void* allocMemory(const AllocRequest& req) override;
-  /// Returns a previous allocation
-  void deallocMemory(int device, void* in) override;
+    /// Returns a previous allocation
+    void deallocMemory(int device, void* in) override;
-  size_t getTempMemoryAvailable(int device) const override;
+    size_t getTempMemoryAvailable(int device) const override;
-  /// Export a description of memory used for Python
-  std::map<int, std::map<std::string, std::pair<int, size_t>>>
-  getMemoryInfo() const;
+    /// Export a description of memory used for Python
+    std::map<int, std::map<std::string, std::pair<int, size_t>>> getMemoryInfo()
+            const;
-  std::pair<void*, size_t> getPinnedMemory() override;
+    std::pair<void*, size_t> getPinnedMemory() override;
-  cudaStream_t getAsyncCopyStream(int device) override;
+    cudaStream_t getAsyncCopyStream(int device) override;
- private:
-  /// Have GPU resources been initialized for this device yet?
-  bool isInitialized(int device) const;
+   private:
+    /// Have GPU resources been initialized for this device yet?
+    bool isInitialized(int device) const;
-  /// Adjust the default temporary memory allocation based on the total GPU
-  /// memory size
-  static size_t getDefaultTempMemForGPU(int device, size_t requested);
+    /// Adjust the default temporary memory allocation based on the total GPU
+    /// memory size
+    static size_t getDefaultTempMemForGPU(int device, size_t requested);
- private:
-  /// Set of currently outstanding memory allocations per device
-  /// device -> (alloc request, allocated ptr)
-  std::unordered_map<int, std::unordered_map<void*, AllocRequest>> allocs_;
+   private:
+    /// Set of currently outstanding memory allocations per device
+    /// device -> (alloc request, allocated ptr)
+    std::unordered_map<int, std::unordered_map<void*, AllocRequest>> allocs_;
-  /// Temporary memory provider, per each device
-  std::unordered_map<int, std::unique_ptr<StackDeviceMemory>> tempMemory_;
+    /// Temporary memory provider, per each device
+    std::unordered_map<int, std::unique_ptr<StackDeviceMemory>> tempMemory_;
-  /// Our default stream that work is ordered on, one per each device
-  std::unordered_map<int, cudaStream_t> defaultStreams_;
+    /// Our default stream that work is ordered on, one per each device
+    std::unordered_map<int, cudaStream_t> defaultStreams_;
-  /// This contains particular streams as set by the user for
-  /// ordering, if any
-  std::unordered_map<int, cudaStream_t> userDefaultStreams_;
+    /// This contains particular streams as set by the user for
+    /// ordering, if any
+    std::unordered_map<int, cudaStream_t> userDefaultStreams_;
-  /// Other streams we can use, per each device
-  std::unordered_map<int, std::vector<cudaStream_t>> alternateStreams_;
+    /// Other streams we can use, per each device
+    std::unordered_map<int, std::vector<cudaStream_t>> alternateStreams_;
-  /// Async copy stream to use for GPU <-> CPU pinned memory copies
-  std::unordered_map<int, cudaStream_t> asyncCopyStreams_;
+    /// Async copy stream to use for GPU <-> CPU pinned memory copies
+    std::unordered_map<int, cudaStream_t> asyncCopyStreams_;
-  /// cuBLAS handle for each device
-  std::unordered_map<int, cublasHandle_t> blasHandles_;
+    /// cuBLAS handle for each device
+    std::unordered_map<int, cublasHandle_t> blasHandles_;
-  /// Pinned memory allocation for use with this GPU
-  void* pinnedMemAlloc_;
-  size_t pinnedMemAllocSize_;
+    /// Pinned memory allocation for use with this GPU
+    void* pinnedMemAlloc_;
+    size_t pinnedMemAllocSize_;
-  /// Another option is to use a specified amount of memory on all
-  /// devices
-  size_t tempMemSize_;
+    /// Another option is to use a specified amount of memory on all
+    /// devices
+    size_t tempMemSize_;
-  /// Amount of pinned memory we should allocate
-  size_t pinnedMemSize_;
+    /// Amount of pinned memory we should allocate
+    size_t pinnedMemSize_;
-  /// Whether or not we log every GPU memory allocation and deallocation
-  bool allocLogging_;
+    /// Whether or not we log every GPU memory allocation and deallocation
+    bool allocLogging_;
 };
 /// Default implementation of GpuResources that allocates a cuBLAS
@@ -144,61 +144,62 @@ class StandardGpuResourcesImpl : public GpuResources {
 /// Internally, the Faiss GPU code uses the instance managed by getResources,
 /// but this is the user-facing object that is internally reference counted.
 class StandardGpuResources : public GpuResourcesProvider {
- public:
-  StandardGpuResources();
-  ~StandardGpuResources() override;
+   public:
+    StandardGpuResources();
+    ~StandardGpuResources() override;
-  std::shared_ptr<GpuResources> getResources() override;
+    std::shared_ptr<GpuResources> getResources() override;
-  /// Disable allocation of temporary memory; all temporary memory
-  /// requests will call cudaMalloc / cudaFree at the point of use
-  void noTempMemory();
+    /// Disable allocation of temporary memory; all temporary memory
+    /// requests will call cudaMalloc / cudaFree at the point of use
+    void noTempMemory();
-  /// Specify that we wish to use a certain fixed size of memory on
-  /// all devices as temporary memory. This is the upper bound for the GPU
-  /// memory that we will reserve. We will never go above 1.5 GiB on any GPU;
-  /// smaller GPUs (with <= 4 GiB or <= 8 GiB) will use less memory than that.
-  /// To avoid any temporary memory allocation, pass 0.
-  void setTempMemory(size_t size);
+    /// Specify that we wish to use a certain fixed size of memory on
+    /// all devices as temporary memory. This is the upper bound for the GPU
+    /// memory that we will reserve. We will never go above 1.5 GiB on any GPU;
+    /// smaller GPUs (with <= 4 GiB or <= 8 GiB) will use less memory than that.
+    /// To avoid any temporary memory allocation, pass 0.
+    void setTempMemory(size_t size);
-  /// Set amount of pinned memory to allocate, for async GPU <-> CPU
-  /// transfers
-  void setPinnedMemory(size_t size);
+    /// Set amount of pinned memory to allocate, for async GPU <-> CPU
+    /// transfers
+    void setPinnedMemory(size_t size);
-  /// Called to change the stream for work ordering. We do not own `stream`;
-  /// i.e., it will not be destroyed when the GpuResources object gets cleaned
-  /// up.
-  /// We are guaranteed that all Faiss GPU work is ordered with respect to
-  /// this stream upon exit from an index or other Faiss GPU call.
-  void setDefaultStream(int device, cudaStream_t stream);
+    /// Called to change the stream for work ordering. We do not own `stream`;
+    /// i.e., it will not be destroyed when the GpuResources object gets cleaned
+    /// up.
+    /// We are guaranteed that all Faiss GPU work is ordered with respect to
+    /// this stream upon exit from an index or other Faiss GPU call.
+    void setDefaultStream(int device, cudaStream_t stream);
-  /// Revert the default stream to the original stream managed by this resources
-  /// object, in case someone called `setDefaultStream`.
-  void revertDefaultStream(int device);
+    /// Revert the default stream to the original stream managed by this
+    /// resources object, in case someone called `setDefaultStream`.
+    void revertDefaultStream(int device);
-  /// Called to change the work ordering streams to the null stream
-  /// for all devices
-  void setDefaultNullStreamAllDevices();
+    /// Called to change the work ordering streams to the null stream
+    /// for all devices
+    void setDefaultNullStreamAllDevices();
-  /// Export a description of memory used for Python
-  std::map<int, std::map<std::string, std::pair<int, size_t>>>
-  getMemoryInfo() const;
+    /// Export a description of memory used for Python
+    std::map<int, std::map<std::string, std::pair<int, size_t>>> getMemoryInfo()
+            const;
-  /// Returns the current default stream
-  cudaStream_t getDefaultStream(int device);
+    /// Returns the current default stream
+    cudaStream_t getDefaultStream(int device);
-  /// Returns the current amount of temp memory available
-  size_t getTempMemoryAvailable(int device) const;
+    /// Returns the current amount of temp memory available
+    size_t getTempMemoryAvailable(int device) const;
-  /// Synchronize our default stream with the CPU
-  void syncDefaultStreamCurrentDevice();
+    /// Synchronize our default stream with the CPU
+    void syncDefaultStreamCurrentDevice();
-  /// If enabled, will print every GPU memory allocation and deallocation to
-  /// standard output
-  void setLogMemoryAllocations(bool enable);
+    /// If enabled, will print every GPU memory allocation and deallocation to
+    /// standard output
+    void setLogMemoryAllocations(bool enable);
- private:
-  std::shared_ptr<StandardGpuResourcesImpl> res_;
+   private:
+    std::shared_ptr<StandardGpuResourcesImpl> res_;
 };
-} } // namespace
+} // namespace gpu
+} // namespace faiss

data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp CHANGED Viewed

@@ -6,542 +6,554 @@
  */
 #include <faiss/gpu/impl/InterleavedCodes.h>
-#include <faiss/impl/FaissAssert.h>
 #include <faiss/gpu/utils/StaticUtils.h>
+#include <faiss/impl/FaissAssert.h>
-namespace faiss { namespace gpu {
+namespace faiss {
+namespace gpu {
 inline uint8_t unpack5(int i, uint8_t vLower, uint8_t vUpper) {
-  uint8_t v = 0;
-  // lsb     ...    msb
-  // 0: 0 0 0 0 0 1 1 1
-  // 1: 1 1 2 2 2 2 2 3
-  // 2: 3 3 3 3 4 4 4 4
-  // 3: 4 5 5 5 5 5 6 6
-  // 4: 6 6 6 7 7 7 7 7
-  switch (i % 8) {
-    case 0:
-      // 5 lsbs of lower
-      v = vLower & 0x1f;
-      break;
-    case 1:
-      // 3 msbs of lower as v lsbs
-      // 2 msbs of upper as v msbs
-      v = (vLower >> 5) | ((vUpper & 0x3) << 3);
-      break;
-    case 2:
-      // 5 of lower
-      v = (vLower >> 2) & 0x1f;
-      break;
-    case 3:
-      // 1 msbs of lower as v lsbs
-      // 4 lsbs of upper as v msbs
-      v = (vLower >> 7) | ((vUpper & 0xf) << 1);
-      break;
-    case 4:
-      // 4 msbs of lower as v lsbs
-      // 1 lsbs of upper as v msbs
-      v = (vLower >> 4) | ((vUpper & 0x1) << 4);
-      break;
-    case 5:
-      // 5 of lower
-      v = (vLower >> 1) & 0x1f;
-      break;
-    case 6:
-      // 2 msbs of lower as v lsbs
-      // 3 lsbs of upper as v msbs
-      v = (vLower >> 6) | ((vUpper & 0x7) << 2);
-      break;
-    case 7:
-      // 5 of lower
-      v = (vLower >> 3);
-      break;
-  }
-  return v;
-}
+    uint8_t v = 0;
+    // lsb     ...    msb
+    // 0: 0 0 0 0 0 1 1 1
+    // 1: 1 1 2 2 2 2 2 3
+    // 2: 3 3 3 3 4 4 4 4
+    // 3: 4 5 5 5 5 5 6 6
+    // 4: 6 6 6 7 7 7 7 7
+    switch (i % 8) {
+        case 0:
+            // 5 lsbs of lower
+            v = vLower & 0x1f;
+            break;
+        case 1:
+            // 3 msbs of lower as v lsbs
+            // 2 lsbs of upper as v msbs
+            v = (vLower >> 5) | ((vUpper & 0x3) << 3);
+            break;
+        case 2:
+            // 5 of lower
+            v = (vLower >> 2) & 0x1f;
+            break;
+        case 3:
+            // 1 msbs of lower as v lsbs
+            // 4 lsbs of upper as v msbs
+            v = (vLower >> 7) | ((vUpper & 0xf) << 1);
+            break;
+        case 4:
+            // 4 msbs of lower as v lsbs
+            // 1 lsbs of upper as v msbs
+            v = (vLower >> 4) | ((vUpper & 0x1) << 4);
+            break;
+        case 5:
+            // 5 of lower
+            v = (vLower >> 1) & 0x1f;
+            break;
+        case 6:
+            // 2 msbs of lower as v lsbs
+            // 3 lsbs of upper as v msbs
+            v = (vLower >> 6) | ((vUpper & 0x7) << 2);
+            break;
+        case 7:
+            // 5 of lower
+            v = (vLower >> 3);
+            break;
+    }
-inline uint8_t unpack6(int i, uint8_t vLower, uint8_t vUpper) {
-  uint8_t v = 0;
-  switch (i % 4) {
-    case 0:
-      // 6 lsbs of lower
-      v = vLower & 0x3f;
-      break;
-    case 1:
-      // 2 msbs of lower as v lsbs
-      // 4 lsbs of upper as v msbs
-      v = (vLower >> 6) | ((vUpper & 0xf) << 2);
-      break;
-    case 2:
-      // 4 msbs of lower as v lsbs
-      // 2 lsbs of upper as v msbs
-      v = (vLower >> 4) | ((vUpper & 0x3) << 4);
-      break;
-    case 3:
-      // 6 msbs of lower
-      v = (vLower >> 2);
-      break;
-  }
-  return v;
+    return v;
 }
+inline uint8_t unpack6(int i, uint8_t vLower, uint8_t vUpper) {
+    uint8_t v = 0;
+    switch (i % 4) {
+        case 0:
+            // 6 lsbs of lower
+            v = vLower & 0x3f;
+            break;
+        case 1:
+            // 2 msbs of lower as v lsbs
+            // 4 lsbs of upper as v msbs
+            v = (vLower >> 6) | ((vUpper & 0xf) << 2);
+            break;
+        case 2:
+            // 4 msbs of lower as v lsbs
+            // 2 lsbs of upper as v msbs
+            v = (vLower >> 4) | ((vUpper & 0x3) << 4);
+            break;
+        case 3:
+            // 6 msbs of lower
+            v = (vLower >> 2);
+            break;
+    }
-std::vector<uint8_t>
-unpackNonInterleaved(std::vector<uint8_t> data,
-                     int numVecs,
-                     int dims,
-                     int bitsPerCode) {
-  int srcVecSize = utils::divUp(dims * bitsPerCode, 8);
-  FAISS_ASSERT(data.size() == numVecs * srcVecSize);
+    return v;
+}
-  if (bitsPerCode == 8 ||
-      bitsPerCode == 16 ||
-      bitsPerCode == 32) {
-    // nothing to do
-    return data;
-  }
+std::vector<uint8_t> unpackNonInterleaved(
+        std::vector<uint8_t> data,
+        int numVecs,
+        int dims,
+        int bitsPerCode) {
+    int srcVecSize = utils::divUp(dims * bitsPerCode, 8);
+    FAISS_ASSERT(data.size() == numVecs * srcVecSize);
+    if (bitsPerCode == 8 || bitsPerCode == 16 || bitsPerCode == 32) {
+        // nothing to do
+        return data;
+    }
-  // bit codes padded to whole bytes
-  std::vector<uint8_t> out(numVecs * dims * utils::divUp(bitsPerCode, 8));
+    // bit codes padded to whole bytes
+    std::vector<uint8_t> out(numVecs * dims * utils::divUp(bitsPerCode, 8));
-  if (bitsPerCode == 4) {
+    if (bitsPerCode == 4) {
 #pragma omp parallel for
-    for (int i = 0; i < numVecs; ++i) {
-      for (int j = 0; j < dims; ++j) {
-        int srcIdx = i * srcVecSize + (j / 2);
-        FAISS_ASSERT(srcIdx < data.size());
+        for (int i = 0; i < numVecs; ++i) {
+            for (int j = 0; j < dims; ++j) {
+                int srcIdx = i * srcVecSize + (j / 2);
+                FAISS_ASSERT(srcIdx < data.size());
-        uint8_t v = data[srcIdx];
-        v = (j % 2 == 0) ? v & 0xf : v >> 4;
+                uint8_t v = data[srcIdx];
+                v = (j % 2 == 0) ? v & 0xf : v >> 4;
-        out[i * dims + j] = v;
-      }
-    }
-  } else if (bitsPerCode == 5) {
+                out[i * dims + j] = v;
+            }
+        }
+    } else if (bitsPerCode == 5) {
 #pragma omp parallel for
-    for (int i = 0; i < numVecs; ++i) {
-      for (int j = 0; j < dims; ++j) {
-        int lo = i * srcVecSize + (j * 5) / 8;
-        int hi = lo + 1;
+        for (int i = 0; i < numVecs; ++i) {
+            for (int j = 0; j < dims; ++j) {
+                int lo = i * srcVecSize + (j * 5) / 8;
+                int hi = lo + 1;
-        FAISS_ASSERT(lo < data.size());
-        FAISS_ASSERT(hi <= data.size());
+                FAISS_ASSERT(lo < data.size());
+                FAISS_ASSERT(hi <= data.size());
-        auto vLower = data[lo];
-        auto vUpper = hi < data.size() ? data[hi] : 0;
+                auto vLower = data[lo];
+                auto vUpper = hi < data.size() ? data[hi] : 0;
-        out[i * dims + j] = unpack5(j, vLower, vUpper);
-      }
-    }
-  } else if (bitsPerCode == 6) {
+                out[i * dims + j] = unpack5(j, vLower, vUpper);
+            }
+        }
+    } else if (bitsPerCode == 6) {
 #pragma omp parallel for
-    for (int i = 0; i < numVecs; ++i) {
-      for (int j = 0; j < dims; ++j) {
-        int lo = i * srcVecSize + (j * 6) / 8;
-        int hi = lo + 1;
+        for (int i = 0; i < numVecs; ++i) {
+            for (int j = 0; j < dims; ++j) {
+                int lo = i * srcVecSize + (j * 6) / 8;
+                int hi = lo + 1;
-        FAISS_ASSERT(lo < data.size());
-        FAISS_ASSERT(hi <= data.size());
+                FAISS_ASSERT(lo < data.size());
+                FAISS_ASSERT(hi <= data.size());
-        auto vLower = data[lo];
-        auto vUpper = hi < data.size() ? data[hi] : 0;
+                auto vLower = data[lo];
+                auto vUpper = hi < data.size() ? data[hi] : 0;
-        out[i * dims + j] = unpack6(j, vLower, vUpper);
-      }
+                out[i * dims + j] = unpack6(j, vLower, vUpper);
+            }
+        }
+    } else {
+        // unhandled
+        FAISS_ASSERT(false);
     }
-  } else {
-    // unhandled
-    FAISS_ASSERT(false);
-  }
-  return out;
+    return out;
 }
 template <typename T>
-void
-unpackInterleavedWord(const T* in,
-                      T* out,
-                      int numVecs,
-                      int dims,
-                      int bitsPerCode) {
-  int wordsPerDimBlock = 32 * bitsPerCode / (8 * sizeof(T));
-  int wordsPerBlock = wordsPerDimBlock * dims;
-  int numBlocks = utils::divUp(numVecs, 32);
+void unpackInterleavedWord(
+        const T* in,
+        T* out,
+        int numVecs,
+        int dims,
+        int bitsPerCode) {
+    int wordsPerDimBlock = 32 * bitsPerCode / (8 * sizeof(T));
+    int wordsPerBlock = wordsPerDimBlock * dims;
+    int numBlocks = utils::divUp(numVecs, 32);
 #pragma omp parallel for
-  for (int i = 0; i < numVecs; ++i) {
-    int block = i / 32;
-    FAISS_ASSERT(block < numBlocks);
-    int lane = i % 32;
-    for (int j = 0; j < dims; ++j) {
-      int srcOffset = block * wordsPerBlock + j * wordsPerDimBlock + lane;
-      out[i * dims + j] = in[srcOffset];
+    for (int i = 0; i < numVecs; ++i) {
+        int block = i / 32;
+        FAISS_ASSERT(block < numBlocks);
+        int lane = i % 32;
+        for (int j = 0; j < dims; ++j) {
+            int srcOffset = block * wordsPerBlock + j * wordsPerDimBlock + lane;
+            out[i * dims + j] = in[srcOffset];
+        }
     }
-  }
 }
-std::vector<uint8_t>
-unpackInterleaved(std::vector<uint8_t> data,
-                  int numVecs,
-                  int dims,
-                  int bitsPerCode) {
-  int bytesPerDimBlock = 32 * bitsPerCode / 8;
-  int bytesPerBlock = bytesPerDimBlock * dims;
-  int numBlocks = utils::divUp(numVecs, 32);
-  size_t totalSize = (size_t) bytesPerBlock * numBlocks;
-  FAISS_ASSERT(data.size() == totalSize);
-  // bit codes padded to whole bytes
-  std::vector<uint8_t> out(numVecs * dims * utils::divUp(bitsPerCode, 8));
-  if (bitsPerCode == 8) {
-    unpackInterleavedWord<uint8_t>(data.data(), out.data(),
-                                   numVecs, dims, bitsPerCode);
-  } else if (bitsPerCode == 16) {
-    unpackInterleavedWord<uint16_t>((uint16_t*) data.data(),
-                                    (uint16_t*) out.data(),
-                                    numVecs, dims, bitsPerCode);
-  } else if (bitsPerCode == 32) {
-    unpackInterleavedWord<uint32_t>((uint32_t*) data.data(),
-                                    (uint32_t*) out.data(),
-                                    numVecs, dims, bitsPerCode);
-  } else if (bitsPerCode == 4) {
+std::vector<uint8_t> unpackInterleaved(
+        std::vector<uint8_t> data,
+        int numVecs,
+        int dims,
+        int bitsPerCode) {
+    int bytesPerDimBlock = 32 * bitsPerCode / 8;
+    int bytesPerBlock = bytesPerDimBlock * dims;
+    int numBlocks = utils::divUp(numVecs, 32);
+    size_t totalSize = (size_t)bytesPerBlock * numBlocks;
+    FAISS_ASSERT(data.size() == totalSize);
+    // bit codes padded to whole bytes
+    std::vector<uint8_t> out(numVecs * dims * utils::divUp(bitsPerCode, 8));
+    if (bitsPerCode == 8) {
+        unpackInterleavedWord<uint8_t>(
+                data.data(), out.data(), numVecs, dims, bitsPerCode);
+    } else if (bitsPerCode == 16) {
+        unpackInterleavedWord<uint16_t>(
+                (uint16_t*)data.data(),
+                (uint16_t*)out.data(),
+                numVecs,
+                dims,
+                bitsPerCode);
+    } else if (bitsPerCode == 32) {
+        unpackInterleavedWord<uint32_t>(
+                (uint32_t*)data.data(),
+                (uint32_t*)out.data(),
+                numVecs,
+                dims,
+                bitsPerCode);
+    } else if (bitsPerCode == 4) {
 #pragma omp parallel for
-    for (int i = 0; i < numVecs; ++i) {
-      int block = i / 32;
-      int lane = i % 32;
+        for (int i = 0; i < numVecs; ++i) {
+            int block = i / 32;
+            int lane = i % 32;
-      int word = lane / 2;
-      int subWord = lane % 2;
+            int word = lane / 2;
+            int subWord = lane % 2;
-      for (int j = 0; j < dims; ++j) {
-        auto v =
-          data[block * bytesPerBlock + j * bytesPerDimBlock + word];
+            for (int j = 0; j < dims; ++j) {
+                auto v =
+                        data[block * bytesPerBlock + j * bytesPerDimBlock +
+                             word];
-        v = (subWord == 0) ? v & 0xf : v >> 4;
-        out[i * dims + j] = v;
-      }
-    }
-  } else if (bitsPerCode == 5) {
+                v = (subWord == 0) ? v & 0xf : v >> 4;
+                out[i * dims + j] = v;
+            }
+        }
+    } else if (bitsPerCode == 5) {
 #pragma omp parallel for
-    for (int i = 0; i < numVecs; ++i) {
-      int block = i / 32;
-      int blockVector = i % 32;
+        for (int i = 0; i < numVecs; ++i) {
+            int block = i / 32;
+            int blockVector = i % 32;
-      for (int j = 0; j < dims; ++j) {
-        uint8_t* dimBlock =
-          &data[block * bytesPerBlock + j * bytesPerDimBlock];
+            for (int j = 0; j < dims; ++j) {
+                uint8_t* dimBlock =
+                        &data[block * bytesPerBlock + j * bytesPerDimBlock];
-        int lo = (blockVector * 5) / 8;
-        int hi = lo + 1;
+                int lo = (blockVector * 5) / 8;
+                int hi = lo + 1;
-        FAISS_ASSERT(lo < bytesPerDimBlock);
-        FAISS_ASSERT(hi <= bytesPerDimBlock);
+                FAISS_ASSERT(lo < bytesPerDimBlock);
+                FAISS_ASSERT(hi <= bytesPerDimBlock);
-        auto vLower = dimBlock[lo];
-        auto vUpper = hi < bytesPerDimBlock ? dimBlock[hi] : 0;
+                auto vLower = dimBlock[lo];
+                auto vUpper = hi < bytesPerDimBlock ? dimBlock[hi] : 0;
-        out[i * dims + j] = unpack5(blockVector, vLower, vUpper);
-      }
-    }
-  } else if (bitsPerCode == 6) {
+                out[i * dims + j] = unpack5(blockVector, vLower, vUpper);
+            }
+        }
+    } else if (bitsPerCode == 6) {
 #pragma omp parallel for
-    for (int i = 0; i < numVecs; ++i) {
-      int block = i / 32;
-      int blockVector = i % 32;
+        for (int i = 0; i < numVecs; ++i) {
+            int block = i / 32;
+            int blockVector = i % 32;
-      for (int j = 0; j < dims; ++j) {
-        uint8_t* dimBlock =
-          &data[block * bytesPerBlock + j * bytesPerDimBlock];
+            for (int j = 0; j < dims; ++j) {
+                uint8_t* dimBlock =
+                        &data[block * bytesPerBlock + j * bytesPerDimBlock];
-        int lo = (blockVector * 6) / 8;
-        int hi = lo + 1;
+                int lo = (blockVector * 6) / 8;
+                int hi = lo + 1;
-        FAISS_ASSERT(lo < bytesPerDimBlock);
-        FAISS_ASSERT(hi <= bytesPerDimBlock);
+                FAISS_ASSERT(lo < bytesPerDimBlock);
+                FAISS_ASSERT(hi <= bytesPerDimBlock);
-        auto vLower = dimBlock[lo];
-        auto vUpper = hi < bytesPerDimBlock ? dimBlock[hi] : 0;
+                auto vLower = dimBlock[lo];
+                auto vUpper = hi < bytesPerDimBlock ? dimBlock[hi] : 0;
-        out[i * dims + j] = unpack6(blockVector, vLower, vUpper);
-      }
+                out[i * dims + j] = unpack6(blockVector, vLower, vUpper);
+            }
+        }
+    } else {
+        // unimplemented
+        FAISS_ASSERT(false);
     }
-  } else {
-    // unimplemented
-    FAISS_ASSERT(false);
-  }
-  return out;
+    return out;
 }
 inline uint8_t pack5(int i, uint8_t lo, uint8_t hi, uint8_t hi2) {
-  FAISS_ASSERT((lo & 0x1f) == lo);
-  FAISS_ASSERT((hi & 0x1f) == hi);
-  FAISS_ASSERT((hi2 & 0x1f) == hi2);
-  uint8_t v = 0;
-  // lsb     ...    msb
-  // 0: 0 0 0 0 0 1 1 1
-  // 1: 1 1 2 2 2 2 2 3
-  // 2: 3 3 3 3 4 4 4 4
-  // 3: 4 5 5 5 5 5 6 6
-  // 4: 6 6 6 7 7 7 7 7
-  switch (i % 5) {
-    case 0:
-      // 5 msbs of lower as vOut lsbs
-      // 3 lsbs of upper as vOut msbs
-      v = (lo & 0x1f) | (hi << 5);
-      break;
-    case 1:
-      // 2 msbs of lower as vOut lsbs
-      // 5 lsbs of upper as vOut msbs
-      // 1 lsbs of upper2 as vOut msb
-      v = (lo >> 3) | (hi << 2) | (hi2 << 7);
-      break;
-    case 2:
-      // 4 msbs of lower as vOut lsbs
-      // 4 lsbs of upper as vOut msbs
-      v = (lo >> 1) | (hi << 4);
-      break;
-    case 3:
-      // 1 msbs of lower as vOut lsbs
-      // 5 lsbs of upper as vOut msbs
-      // 2 lsbs of upper2 as vOut msb
-      v = (lo >> 4) | (hi << 1) | (hi2 << 6);
-      break;
-    case 4:
-      // 3 msbs of lower as vOut lsbs
-      // 5 lsbs of upper as vOut msbs
-      v = (lo >> 2) | (hi << 3);
-      break;
-  }
-  return v;
-}
+    FAISS_ASSERT((lo & 0x1f) == lo);
+    FAISS_ASSERT((hi & 0x1f) == hi);
+    FAISS_ASSERT((hi2 & 0x1f) == hi2);
+    uint8_t v = 0;
+    // lsb     ...    msb
+    // 0: 0 0 0 0 0 1 1 1
+    // 1: 1 1 2 2 2 2 2 3
+    // 2: 3 3 3 3 4 4 4 4
+    // 3: 4 5 5 5 5 5 6 6
+    // 4: 6 6 6 7 7 7 7 7
+    switch (i % 5) {
+        case 0:
+            // 5 msbs of lower as vOut lsbs
+            // 3 lsbs of upper as vOut msbs
+            v = (lo & 0x1f) | (hi << 5);
+            break;
+        case 1:
+            // 2 msbs of lower as vOut lsbs
+            // 5 lsbs of upper as vOut msbs
+            // 1 lsbs of upper2 as vOut msb
+            v = (lo >> 3) | (hi << 2) | (hi2 << 7);
+            break;
+        case 2:
+            // 4 msbs of lower as vOut lsbs
+            // 4 lsbs of upper as vOut msbs
+            v = (lo >> 1) | (hi << 4);
+            break;
+        case 3:
+            // 1 msbs of lower as vOut lsbs
+            // 5 lsbs of upper as vOut msbs
+            // 2 lsbs of upper2 as vOut msb
+            v = (lo >> 4) | (hi << 1) | (hi2 << 6);
+            break;
+        case 4:
+            // 3 msbs of lower as vOut lsbs
+            // 5 lsbs of upper as vOut msbs
+            v = (lo >> 2) | (hi << 3);
+            break;
+    }
-inline uint8_t pack6(int i, uint8_t lo, uint8_t hi) {
-  FAISS_ASSERT((lo & 0x3f) == lo);
-  FAISS_ASSERT((hi & 0x3f) == hi);
-  uint8_t v = 0;
-  // lsb     ...    msb
-  // 0: 0 0 0 0 0 0 1 1
-  // 1: 1 1 1 1 2 2 2 2
-  // 2: 2 2 3 3 3 3 3 3
-  switch (i % 3) {
-    case 0:
-      // 6 msbs of lower as vOut lsbs
-      // 2 lsbs of upper as vOut msbs
-      v = (lo & 0x3f) | (hi << 6);
-      break;
-    case 1:
-      // 4 msbs of lower as vOut lsbs
-      // 4 lsbs of upper as vOut msbs
-      v = (lo >> 2) | (hi << 4);
-      break;
-    case 2:
-      // 2 msbs of lower as vOut lsbs
-      // 6 lsbs of upper as vOut msbs
-      v = (lo >> 4) | (hi << 2);
-      break;
-  }
-  return v;
+    return v;
 }
+inline uint8_t pack6(int i, uint8_t lo, uint8_t hi) {
+    FAISS_ASSERT((lo & 0x3f) == lo);
+    FAISS_ASSERT((hi & 0x3f) == hi);
+    uint8_t v = 0;
+    // lsb     ...    msb
+    // 0: 0 0 0 0 0 0 1 1
+    // 1: 1 1 1 1 2 2 2 2
+    // 2: 2 2 3 3 3 3 3 3
+    switch (i % 3) {
+        case 0:
+            // 6 msbs of lower as vOut lsbs
+            // 2 lsbs of upper as vOut msbs
+            v = (lo & 0x3f) | (hi << 6);
+            break;
+        case 1:
+            // 4 msbs of lower as vOut lsbs
+            // 4 lsbs of upper as vOut msbs
+            v = (lo >> 2) | (hi << 4);
+            break;
+        case 2:
+            // 2 msbs of lower as vOut lsbs
+            // 6 lsbs of upper as vOut msbs
+            v = (lo >> 4) | (hi << 2);
+            break;
+    }
-std::vector<uint8_t>
-packNonInterleaved(std::vector<uint8_t> data,
-                   int numVecs,
-                   int dims,
-                   int bitsPerCode) {
-  // bit codes padded to whole bytes
-  FAISS_ASSERT(data.size() == numVecs * dims * utils::divUp(bitsPerCode, 8));
+    return v;
+}
-  if (bitsPerCode == 8 ||
-      bitsPerCode == 16 ||
-      bitsPerCode == 32) {
-    // nothing to do, whole words are already where they need to be
-    return data;
-  }
+std::vector<uint8_t> packNonInterleaved(
+        std::vector<uint8_t> data,
+        int numVecs,
+        int dims,
+        int bitsPerCode) {
+    // bit codes padded to whole bytes
+    FAISS_ASSERT(data.size() == numVecs * dims * utils::divUp(bitsPerCode, 8));
+    if (bitsPerCode == 8 || bitsPerCode == 16 || bitsPerCode == 32) {
+        // nothing to do, whole words are already where they need to be
+        return data;
+    }
-  // bits packed into a whole number of bytes
-  int bytesPerVec = utils::divUp(dims * bitsPerCode, 8);
+    // bits packed into a whole number of bytes
+    int bytesPerVec = utils::divUp(dims * bitsPerCode, 8);
-  std::vector<uint8_t> out(numVecs * bytesPerVec);
+    std::vector<uint8_t> out(numVecs * bytesPerVec);
-  if (bitsPerCode == 4) {
+    if (bitsPerCode == 4) {
 #pragma omp parallel for
-    for (int i = 0; i < numVecs; ++i) {
-      for (int j = 0; j < bytesPerVec; ++j) {
-        int dimLo = j * 2;
-        int dimHi = dimLo + 1;
-        FAISS_ASSERT(dimLo < dims);
-        FAISS_ASSERT(dimHi <= dims);
-        uint8_t lo = data[i * dims + dimLo];
-        uint8_t hi = dimHi < dims ? data[i * dims + dimHi] : 0;
-        out[i * bytesPerVec + j] = (hi << 4) | (lo & 0xf);
-      }
-    }
-  } else if (bitsPerCode == 5) {
+        for (int i = 0; i < numVecs; ++i) {
+            for (int j = 0; j < bytesPerVec; ++j) {
+                int dimLo = j * 2;
+                int dimHi = dimLo + 1;
+                FAISS_ASSERT(dimLo < dims);
+                FAISS_ASSERT(dimHi <= dims);
+                uint8_t lo = data[i * dims + dimLo];
+                uint8_t hi = dimHi < dims ? data[i * dims + dimHi] : 0;
+                out[i * bytesPerVec + j] = (hi << 4) | (lo & 0xf);
+            }
+        }
+    } else if (bitsPerCode == 5) {
 #pragma omp parallel for
-    for (int i = 0; i < numVecs; ++i) {
-      for (int j = 0; j < bytesPerVec; ++j) {
-        int dimLo = (j * 8) / 5;
-        int dimHi = dimLo + 1;
-        int dimHi2 = dimHi + 1;
-        FAISS_ASSERT(dimLo < dims);
-        FAISS_ASSERT(dimHi <= dims);
-        FAISS_ASSERT(dimHi <= dims + 1);
-        uint8_t lo = data[i * dims + dimLo];
-        uint8_t hi = dimHi < dims ? data[i * dims + dimHi] : 0;
-        uint8_t hi2 = dimHi2 < dims ? data[i * dims + dimHi2] : 0;
-        out[i * bytesPerVec + j] = pack5(j, lo, hi, hi2);
-      }
-    }
-  } else if (bitsPerCode == 6) {
+        for (int i = 0; i < numVecs; ++i) {
+            for (int j = 0; j < bytesPerVec; ++j) {
+                int dimLo = (j * 8) / 5;
+                int dimHi = dimLo + 1;
+                int dimHi2 = dimHi + 1;
+                FAISS_ASSERT(dimLo < dims);
+                FAISS_ASSERT(dimHi <= dims);
+                FAISS_ASSERT(dimHi <= dims + 1);
+                uint8_t lo = data[i * dims + dimLo];
+                uint8_t hi = dimHi < dims ? data[i * dims + dimHi] : 0;
+                uint8_t hi2 = dimHi2 < dims ? data[i * dims + dimHi2] : 0;
+                out[i * bytesPerVec + j] = pack5(j, lo, hi, hi2);
+            }
+        }
+    } else if (bitsPerCode == 6) {
 #pragma omp parallel for
-    for (int i = 0; i < numVecs; ++i) {
-      for (int j = 0; j < bytesPerVec; ++j) {
-        int dimLo = (j * 8) / 6;
-        int dimHi = dimLo + 1;
-        FAISS_ASSERT(dimLo < dims);
-        FAISS_ASSERT(dimHi <= dims);
-        uint8_t lo = data[i * dims + dimLo];
-        uint8_t hi = dimHi < dims ? data[i * dims + dimHi] : 0;
-        out[i * bytesPerVec + j] = pack6(j, lo, hi);
-      }
+        for (int i = 0; i < numVecs; ++i) {
+            for (int j = 0; j < bytesPerVec; ++j) {
+                int dimLo = (j * 8) / 6;
+                int dimHi = dimLo + 1;
+                FAISS_ASSERT(dimLo < dims);
+                FAISS_ASSERT(dimHi <= dims);
+                uint8_t lo = data[i * dims + dimLo];
+                uint8_t hi = dimHi < dims ? data[i * dims + dimHi] : 0;
+                out[i * bytesPerVec + j] = pack6(j, lo, hi);
+            }
+        }
+    } else {
+        // unhandled
+        FAISS_ASSERT(false);
     }
-  } else {
-    // unhandled
-    FAISS_ASSERT(false);
-  }
-  return out;
+    return out;
 }
 template <typename T>
-void
-packInterleavedWord(const T* in,
-                    T* out,
-                    int numVecs,
-                    int dims,
-                    int bitsPerCode) {
-  int wordsPerDimBlock = 32 * bitsPerCode / (8 * sizeof(T));
-  int wordsPerBlock = wordsPerDimBlock * dims;
-  int numBlocks = utils::divUp(numVecs, 32);
-  // We're guaranteed that all other slots not filled by the vectors present are
-  // initialized to zero (from the vector constructor in packInterleaved)
+void packInterleavedWord(
+        const T* in,
+        T* out,
+        int numVecs,
+        int dims,
+        int bitsPerCode) {
+    int wordsPerDimBlock = 32 * bitsPerCode / (8 * sizeof(T));
+    int wordsPerBlock = wordsPerDimBlock * dims;
+    int numBlocks = utils::divUp(numVecs, 32);
+    // We're guaranteed that all other slots not filled by the vectors present
+    // are initialized to zero (from the vector constructor in packInterleaved)
 #pragma omp parallel for
-  for (int i = 0; i < numVecs; ++i) {
-    int block = i / 32;
-    FAISS_ASSERT(block < numBlocks);
-    int lane = i % 32;
-    for (int j = 0; j < dims; ++j) {
-      int dstOffset = block * wordsPerBlock + j * wordsPerDimBlock + lane;
-      out[dstOffset] = in[i * dims + j];
+    for (int i = 0; i < numVecs; ++i) {
+        int block = i / 32;
+        FAISS_ASSERT(block < numBlocks);
+        int lane = i % 32;
+        for (int j = 0; j < dims; ++j) {
+            int dstOffset = block * wordsPerBlock + j * wordsPerDimBlock + lane;
+            out[dstOffset] = in[i * dims + j];
+        }
     }
-  }
 }
-std::vector<uint8_t>
-packInterleaved(std::vector<uint8_t> data,
-                int numVecs,
-                int dims,
-                int bitsPerCode) {
-  int bytesPerDimBlock = 32 * bitsPerCode / 8;
-  int bytesPerBlock = bytesPerDimBlock * dims;
-  int numBlocks = utils::divUp(numVecs, 32);
-  size_t totalSize = (size_t) bytesPerBlock * numBlocks;
-  // bit codes padded to whole bytes
-  FAISS_ASSERT(data.size() == numVecs * dims * utils::divUp(bitsPerCode, 8));
-  // packs based on blocks
-  std::vector<uint8_t> out(totalSize, 0);
-  if (bitsPerCode == 8) {
-    packInterleavedWord<uint8_t>(data.data(), out.data(),
-                                 numVecs, dims, bitsPerCode);
-  } else if (bitsPerCode == 16) {
-    packInterleavedWord<uint16_t>((uint16_t*) data.data(),
-                                  (uint16_t*) out.data(),
-                                  numVecs, dims, bitsPerCode);
-  } else if (bitsPerCode == 32) {
-    packInterleavedWord<uint32_t>((uint32_t*) data.data(),
-                                  (uint32_t*) out.data(),
-                                  numVecs, dims, bitsPerCode);
-  } else if (bitsPerCode == 4) {
+std::vector<uint8_t> packInterleaved(
+        std::vector<uint8_t> data,
+        int numVecs,
+        int dims,
+        int bitsPerCode) {
+    int bytesPerDimBlock = 32 * bitsPerCode / 8;
+    int bytesPerBlock = bytesPerDimBlock * dims;
+    int numBlocks = utils::divUp(numVecs, 32);
+    size_t totalSize = (size_t)bytesPerBlock * numBlocks;
+    // bit codes padded to whole bytes
+    FAISS_ASSERT(data.size() == numVecs * dims * utils::divUp(bitsPerCode, 8));
+    // packs based on blocks
+    std::vector<uint8_t> out(totalSize, 0);
+    if (bitsPerCode == 8) {
+        packInterleavedWord<uint8_t>(
+                data.data(), out.data(), numVecs, dims, bitsPerCode);
+    } else if (bitsPerCode == 16) {
+        packInterleavedWord<uint16_t>(
+                (uint16_t*)data.data(),
+                (uint16_t*)out.data(),
+                numVecs,
+                dims,
+                bitsPerCode);
+    } else if (bitsPerCode == 32) {
+        packInterleavedWord<uint32_t>(
+                (uint32_t*)data.data(),
+                (uint32_t*)out.data(),
+                numVecs,
+                dims,
+                bitsPerCode);
+    } else if (bitsPerCode == 4) {
 #pragma omp parallel for
-    for (int i = 0; i < numBlocks; ++i) {
-      for (int j = 0; j < dims; ++j) {
-        for (int k = 0; k < bytesPerDimBlock; ++k) {
-          int loVec = i * 32 + k * 2;
-          int hiVec = loVec + 1;
-          uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;
-          uint8_t hi = hiVec < numVecs ? data[hiVec * dims + j] : 0;
-          out[i * bytesPerBlock + j * bytesPerDimBlock + k] =
-            (hi << 4) | (lo & 0xf);
+        for (int i = 0; i < numBlocks; ++i) {
+            for (int j = 0; j < dims; ++j) {
+                for (int k = 0; k < bytesPerDimBlock; ++k) {
+                    int loVec = i * 32 + k * 2;
+                    int hiVec = loVec + 1;
+                    uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;
+                    uint8_t hi = hiVec < numVecs ? data[hiVec * dims + j] : 0;
+                    out[i * bytesPerBlock + j * bytesPerDimBlock + k] =
+                            (hi << 4) | (lo & 0xf);
+                }
+            }
         }
-      }
-    }
-  } else if (bitsPerCode == 5) {
+    } else if (bitsPerCode == 5) {
 #pragma omp parallel for
-    for (int i = 0; i < numBlocks; ++i) {
-      for (int j = 0; j < dims; ++j) {
-        for (int k = 0; k < bytesPerDimBlock; ++k) {
-          // What input vectors we are pulling from
-          int loVec = i * 32 + (k * 8) / 5;
-          int hiVec = loVec + 1;
-          int hiVec2 = hiVec + 1;
-          uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;
-          uint8_t hi = hiVec < numVecs ? data[hiVec * dims + j] : 0;
-          uint8_t hi2 = hiVec2 < numVecs ? data[hiVec2 * dims + j] : 0;
-          out[i * bytesPerBlock + j * bytesPerDimBlock + k] = pack5(k, lo, hi, hi2);
+        for (int i = 0; i < numBlocks; ++i) {
+            for (int j = 0; j < dims; ++j) {
+                for (int k = 0; k < bytesPerDimBlock; ++k) {
+                    // What input vectors we are pulling from
+                    int loVec = i * 32 + (k * 8) / 5;
+                    int hiVec = loVec + 1;
+                    int hiVec2 = hiVec + 1;
+                    uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;
+                    uint8_t hi = hiVec < numVecs ? data[hiVec * dims + j] : 0;
+                    uint8_t hi2 =
+                            hiVec2 < numVecs ? data[hiVec2 * dims + j] : 0;
+                    out[i * bytesPerBlock + j * bytesPerDimBlock + k] =
+                            pack5(k, lo, hi, hi2);
+                }
+            }
         }
-      }
-    }
-  } else if (bitsPerCode == 6) {
+    } else if (bitsPerCode == 6) {
 #pragma omp parallel for
-    for (int i = 0; i < numBlocks; ++i) {
-      for (int j = 0; j < dims; ++j) {
-        for (int k = 0; k < bytesPerDimBlock; ++k) {
-          // What input vectors we are pulling from
-          int loVec = i * 32 + (k * 8) / 6;
-          int hiVec = loVec + 1;
-          uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;
-          uint8_t hi = hiVec < numVecs ? data[hiVec * dims + j] : 0;
-          out[i * bytesPerBlock + j * bytesPerDimBlock + k] = pack6(k, lo, hi);
+        for (int i = 0; i < numBlocks; ++i) {
+            for (int j = 0; j < dims; ++j) {
+                for (int k = 0; k < bytesPerDimBlock; ++k) {
+                    // What input vectors we are pulling from
+                    int loVec = i * 32 + (k * 8) / 6;
+                    int hiVec = loVec + 1;
+                    uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;
+                    uint8_t hi = hiVec < numVecs ? data[hiVec * dims + j] : 0;
+                    out[i * bytesPerBlock + j * bytesPerDimBlock + k] =
+                            pack6(k, lo, hi);
+                }
+            }
         }
-      }
+    } else {
+        // unimplemented
+        FAISS_ASSERT(false);
     }
-  } else {
-    // unimplemented
-    FAISS_ASSERT(false);
-  }
-  return out;
+    return out;
 }
-} } // namespace
+} // namespace gpu
+} // namespace faiss