faiss 0.2.0 → 0.2.1

Files changed (202)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/lib/faiss/version.rb +1 -1
  4. data/vendor/faiss/faiss/AutoTune.cpp +292 -291
  5. data/vendor/faiss/faiss/AutoTune.h +55 -56
  6. data/vendor/faiss/faiss/Clustering.cpp +334 -195
  7. data/vendor/faiss/faiss/Clustering.h +88 -35
  8. data/vendor/faiss/faiss/IVFlib.cpp +171 -195
  9. data/vendor/faiss/faiss/IVFlib.h +48 -51
  10. data/vendor/faiss/faiss/Index.cpp +85 -103
  11. data/vendor/faiss/faiss/Index.h +54 -48
  12. data/vendor/faiss/faiss/Index2Layer.cpp +139 -164
  13. data/vendor/faiss/faiss/Index2Layer.h +22 -22
  14. data/vendor/faiss/faiss/IndexBinary.cpp +45 -37
  15. data/vendor/faiss/faiss/IndexBinary.h +140 -132
  16. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +73 -53
  17. data/vendor/faiss/faiss/IndexBinaryFlat.h +29 -24
  18. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +46 -43
  19. data/vendor/faiss/faiss/IndexBinaryFromFloat.h +16 -15
  20. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +215 -232
  21. data/vendor/faiss/faiss/IndexBinaryHNSW.h +25 -24
  22. data/vendor/faiss/faiss/IndexBinaryHash.cpp +182 -177
  23. data/vendor/faiss/faiss/IndexBinaryHash.h +41 -34
  24. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +489 -461
  25. data/vendor/faiss/faiss/IndexBinaryIVF.h +97 -68
  26. data/vendor/faiss/faiss/IndexFlat.cpp +116 -147
  27. data/vendor/faiss/faiss/IndexFlat.h +35 -46
  28. data/vendor/faiss/faiss/IndexHNSW.cpp +372 -348
  29. data/vendor/faiss/faiss/IndexHNSW.h +57 -41
  30. data/vendor/faiss/faiss/IndexIVF.cpp +474 -454
  31. data/vendor/faiss/faiss/IndexIVF.h +146 -113
  32. data/vendor/faiss/faiss/IndexIVFFlat.cpp +248 -250
  33. data/vendor/faiss/faiss/IndexIVFFlat.h +48 -51
  34. data/vendor/faiss/faiss/IndexIVFPQ.cpp +457 -516
  35. data/vendor/faiss/faiss/IndexIVFPQ.h +74 -66
  36. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +406 -372
  37. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +82 -57
  38. data/vendor/faiss/faiss/IndexIVFPQR.cpp +104 -102
  39. data/vendor/faiss/faiss/IndexIVFPQR.h +33 -28
  40. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +125 -133
  41. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +19 -21
  42. data/vendor/faiss/faiss/IndexLSH.cpp +75 -96
  43. data/vendor/faiss/faiss/IndexLSH.h +21 -26
  44. data/vendor/faiss/faiss/IndexLattice.cpp +42 -56
  45. data/vendor/faiss/faiss/IndexLattice.h +11 -16
  46. data/vendor/faiss/faiss/IndexNNDescent.cpp +231 -0
  47. data/vendor/faiss/faiss/IndexNNDescent.h +72 -0
  48. data/vendor/faiss/faiss/IndexNSG.cpp +303 -0
  49. data/vendor/faiss/faiss/IndexNSG.h +85 -0
  50. data/vendor/faiss/faiss/IndexPQ.cpp +405 -464
  51. data/vendor/faiss/faiss/IndexPQ.h +64 -67
  52. data/vendor/faiss/faiss/IndexPQFastScan.cpp +143 -170
  53. data/vendor/faiss/faiss/IndexPQFastScan.h +46 -32
  54. data/vendor/faiss/faiss/IndexPreTransform.cpp +120 -150
  55. data/vendor/faiss/faiss/IndexPreTransform.h +33 -36
  56. data/vendor/faiss/faiss/IndexRefine.cpp +115 -131
  57. data/vendor/faiss/faiss/IndexRefine.h +22 -23
  58. data/vendor/faiss/faiss/IndexReplicas.cpp +147 -153
  59. data/vendor/faiss/faiss/IndexReplicas.h +62 -56
  60. data/vendor/faiss/faiss/IndexResidual.cpp +291 -0
  61. data/vendor/faiss/faiss/IndexResidual.h +152 -0
  62. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +120 -155
  63. data/vendor/faiss/faiss/IndexScalarQuantizer.h +41 -45
  64. data/vendor/faiss/faiss/IndexShards.cpp +256 -240
  65. data/vendor/faiss/faiss/IndexShards.h +85 -73
  66. data/vendor/faiss/faiss/MatrixStats.cpp +112 -97
  67. data/vendor/faiss/faiss/MatrixStats.h +7 -10
  68. data/vendor/faiss/faiss/MetaIndexes.cpp +135 -157
  69. data/vendor/faiss/faiss/MetaIndexes.h +40 -34
  70. data/vendor/faiss/faiss/MetricType.h +7 -7
  71. data/vendor/faiss/faiss/VectorTransform.cpp +652 -474
  72. data/vendor/faiss/faiss/VectorTransform.h +61 -89
  73. data/vendor/faiss/faiss/clone_index.cpp +77 -73
  74. data/vendor/faiss/faiss/clone_index.h +4 -9
  75. data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +33 -38
  76. data/vendor/faiss/faiss/gpu/GpuAutoTune.h +11 -9
  77. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +197 -170
  78. data/vendor/faiss/faiss/gpu/GpuCloner.h +53 -35
  79. data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +12 -14
  80. data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +27 -25
  81. data/vendor/faiss/faiss/gpu/GpuDistance.h +116 -112
  82. data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -2
  83. data/vendor/faiss/faiss/gpu/GpuIndex.h +134 -137
  84. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +76 -73
  85. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +173 -162
  86. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +67 -64
  87. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +89 -86
  88. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +150 -141
  89. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +101 -103
  90. data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +17 -16
  91. data/vendor/faiss/faiss/gpu/GpuResources.cpp +116 -128
  92. data/vendor/faiss/faiss/gpu/GpuResources.h +182 -186
  93. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +433 -422
  94. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +131 -130
  95. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +468 -456
  96. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +25 -19
  97. data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +22 -20
  98. data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +9 -8
  99. data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +39 -44
  100. data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +16 -14
  101. data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +77 -71
  102. data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +109 -88
  103. data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +75 -64
  104. data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +230 -215
  105. data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +80 -86
  106. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +284 -277
  107. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +416 -416
  108. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +611 -517
  109. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +166 -164
  110. data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +61 -53
  111. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +274 -238
  112. data/vendor/faiss/faiss/gpu/test/TestUtils.h +73 -57
  113. data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +47 -50
  114. data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +79 -72
  115. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +140 -146
  116. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +69 -71
  117. data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +21 -16
  118. data/vendor/faiss/faiss/gpu/utils/Timer.cpp +25 -29
  119. data/vendor/faiss/faiss/gpu/utils/Timer.h +30 -29
  120. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +270 -0
  121. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +115 -0
  122. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +90 -120
  123. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +81 -65
  124. data/vendor/faiss/faiss/impl/FaissAssert.h +73 -58
  125. data/vendor/faiss/faiss/impl/FaissException.cpp +56 -48
  126. data/vendor/faiss/faiss/impl/FaissException.h +41 -29
  127. data/vendor/faiss/faiss/impl/HNSW.cpp +595 -611
  128. data/vendor/faiss/faiss/impl/HNSW.h +179 -200
  129. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +672 -0
  130. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +172 -0
  131. data/vendor/faiss/faiss/impl/NNDescent.cpp +487 -0
  132. data/vendor/faiss/faiss/impl/NNDescent.h +154 -0
  133. data/vendor/faiss/faiss/impl/NSG.cpp +682 -0
  134. data/vendor/faiss/faiss/impl/NSG.h +199 -0
  135. data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +484 -454
  136. data/vendor/faiss/faiss/impl/PolysemousTraining.h +52 -55
  137. data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +26 -47
  138. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +469 -459
  139. data/vendor/faiss/faiss/impl/ProductQuantizer.h +76 -87
  140. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +448 -0
  141. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +130 -0
  142. data/vendor/faiss/faiss/impl/ResultHandler.h +96 -132
  143. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +648 -701
  144. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +48 -46
  145. data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +129 -131
  146. data/vendor/faiss/faiss/impl/ThreadedIndex.h +61 -55
  147. data/vendor/faiss/faiss/impl/index_read.cpp +547 -479
  148. data/vendor/faiss/faiss/impl/index_write.cpp +497 -407
  149. data/vendor/faiss/faiss/impl/io.cpp +75 -94
  150. data/vendor/faiss/faiss/impl/io.h +31 -41
  151. data/vendor/faiss/faiss/impl/io_macros.h +40 -29
  152. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +137 -186
  153. data/vendor/faiss/faiss/impl/lattice_Zn.h +40 -51
  154. data/vendor/faiss/faiss/impl/platform_macros.h +29 -8
  155. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +77 -124
  156. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +39 -48
  157. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +41 -52
  158. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +80 -117
  159. data/vendor/faiss/faiss/impl/simd_result_handlers.h +109 -137
  160. data/vendor/faiss/faiss/index_factory.cpp +269 -218
  161. data/vendor/faiss/faiss/index_factory.h +6 -7
  162. data/vendor/faiss/faiss/index_io.h +23 -26
  163. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +67 -75
  164. data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +22 -24
  165. data/vendor/faiss/faiss/invlists/DirectMap.cpp +96 -112
  166. data/vendor/faiss/faiss/invlists/DirectMap.h +29 -33
  167. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +307 -364
  168. data/vendor/faiss/faiss/invlists/InvertedLists.h +151 -151
  169. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +29 -34
  170. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +17 -18
  171. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +257 -293
  172. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +50 -45
  173. data/vendor/faiss/faiss/python/python_callbacks.cpp +23 -26
  174. data/vendor/faiss/faiss/python/python_callbacks.h +9 -16
  175. data/vendor/faiss/faiss/utils/AlignedTable.h +79 -44
  176. data/vendor/faiss/faiss/utils/Heap.cpp +40 -48
  177. data/vendor/faiss/faiss/utils/Heap.h +186 -209
  178. data/vendor/faiss/faiss/utils/WorkerThread.cpp +67 -76
  179. data/vendor/faiss/faiss/utils/WorkerThread.h +32 -33
  180. data/vendor/faiss/faiss/utils/distances.cpp +301 -310
  181. data/vendor/faiss/faiss/utils/distances.h +133 -118
  182. data/vendor/faiss/faiss/utils/distances_simd.cpp +456 -516
  183. data/vendor/faiss/faiss/utils/extra_distances-inl.h +117 -0
  184. data/vendor/faiss/faiss/utils/extra_distances.cpp +113 -232
  185. data/vendor/faiss/faiss/utils/extra_distances.h +30 -29
  186. data/vendor/faiss/faiss/utils/hamming-inl.h +260 -209
  187. data/vendor/faiss/faiss/utils/hamming.cpp +375 -469
  188. data/vendor/faiss/faiss/utils/hamming.h +62 -85
  189. data/vendor/faiss/faiss/utils/ordered_key_value.h +16 -18
  190. data/vendor/faiss/faiss/utils/partitioning.cpp +393 -318
  191. data/vendor/faiss/faiss/utils/partitioning.h +26 -21
  192. data/vendor/faiss/faiss/utils/quantize_lut.cpp +78 -66
  193. data/vendor/faiss/faiss/utils/quantize_lut.h +22 -20
  194. data/vendor/faiss/faiss/utils/random.cpp +39 -63
  195. data/vendor/faiss/faiss/utils/random.h +13 -16
  196. data/vendor/faiss/faiss/utils/simdlib.h +4 -2
  197. data/vendor/faiss/faiss/utils/simdlib_avx2.h +88 -85
  198. data/vendor/faiss/faiss/utils/simdlib_emulated.h +226 -165
  199. data/vendor/faiss/faiss/utils/simdlib_neon.h +832 -0
  200. data/vendor/faiss/faiss/utils/utils.cpp +304 -287
  201. data/vendor/faiss/faiss/utils/utils.h +53 -48
  202. metadata +20 -2
data/vendor/faiss/faiss/gpu/GpuResources.h

@@ -5,55 +5,59 @@
  * LICENSE file in the root directory of this source tree.
  */

-
  #pragma once

- #include <faiss/impl/FaissAssert.h>
- #include <cuda_runtime.h>
  #include <cublas_v2.h>
+ #include <cuda_runtime.h>
+ #include <faiss/impl/FaissAssert.h>
  #include <memory>
  #include <utility>
  #include <vector>

- namespace faiss { namespace gpu {
+ namespace faiss {
+ namespace gpu {

  class GpuResources;

  enum AllocType {
- /// Unknown allocation type or miscellaneous (not currently categorized)
- Other = 0,
-
- /// Primary data storage for GpuIndexFlat (the raw matrix of vectors and
- /// vector norms if needed)
- FlatData = 1,
-
- /// Primary data storage for GpuIndexIVF* (the storage for each individual IVF
- /// list)
- IVFLists = 2,
-
- /// Quantizer (PQ, SQ) dictionary information
- Quantizer = 3,
-
- /// For GpuIndexIVFPQ, "precomputed codes" for more efficient PQ lookup
- /// require the use of possibly large tables. These are marked separately from
- /// Quantizer as these can frequently be 100s - 1000s of MiB in size
- QuantizerPrecomputedCodes = 4,
-
- ///
- /// StandardGpuResources implementation specific types
- ///
-
- /// When using StandardGpuResources, temporary memory allocations
- /// (MemorySpace::Temporary) come out of a stack region of memory that is
- /// allocated up front for each gpu (e.g., 1.5 GiB upon initialization). This
- /// allocation by StandardGpuResources is marked with this AllocType.
- TemporaryMemoryBuffer = 10,
-
- /// When using StandardGpuResources, any MemorySpace::Temporary allocations
- /// that cannot be satisfied within the TemporaryMemoryBuffer region fall back
- /// to calling cudaMalloc which are sized to just the request at hand. These
- /// "overflow" temporary allocations are marked with this AllocType.
- TemporaryMemoryOverflow = 11,
+ /// Unknown allocation type or miscellaneous (not currently categorized)
+ Other = 0,
+
+ /// Primary data storage for GpuIndexFlat (the raw matrix of vectors and
+ /// vector norms if needed)
+ FlatData = 1,
+
+ /// Primary data storage for GpuIndexIVF* (the storage for each individual
+ /// IVF
+ /// list)
+ IVFLists = 2,
+
+ /// Quantizer (PQ, SQ) dictionary information
+ Quantizer = 3,
+
+ /// For GpuIndexIVFPQ, "precomputed codes" for more efficient PQ lookup
+ /// require the use of possibly large tables. These are marked separately
+ /// from
+ /// Quantizer as these can frequently be 100s - 1000s of MiB in size
+ QuantizerPrecomputedCodes = 4,
+
+ ///
+ /// StandardGpuResources implementation specific types
+ ///
+
+ /// When using StandardGpuResources, temporary memory allocations
+ /// (MemorySpace::Temporary) come out of a stack region of memory that is
+ /// allocated up front for each gpu (e.g., 1.5 GiB upon initialization).
+ /// This
+ /// allocation by StandardGpuResources is marked with this AllocType.
+ TemporaryMemoryBuffer = 10,
+
+ /// When using StandardGpuResources, any MemorySpace::Temporary allocations
+ /// that cannot be satisfied within the TemporaryMemoryBuffer region fall
+ /// back
+ /// to calling cudaMalloc which are sized to just the request at hand. These
+ /// "overflow" temporary allocations are marked with this AllocType.
+ TemporaryMemoryOverflow = 11,
  };

  /// Convert an AllocType to string
@@ -61,16 +65,17 @@ std::string allocTypeToString(AllocType t);

  /// Memory regions accessible to the GPU
  enum MemorySpace {
- /// Temporary device memory (guaranteed to no longer be used upon exit of a
- /// top-level index call, and where the streams using it have completed GPU
- /// work). Typically backed by Device memory (cudaMalloc/cudaFree).
- Temporary = 0,
+ /// Temporary device memory (guaranteed to no longer be used upon exit of a
+ /// top-level index call, and where the streams using it have completed GPU
+ /// work). Typically backed by Device memory (cudaMalloc/cudaFree).
+ Temporary = 0,

- /// Managed using cudaMalloc/cudaFree (typical GPU device memory)
- Device = 1,
+ /// Managed using cudaMalloc/cudaFree (typical GPU device memory)
+ Device = 1,

- /// Managed using cudaMallocManaged/cudaFree (typical Unified CPU/GPU memory)
- Unified = 2,
+ /// Managed using cudaMallocManaged/cudaFree (typical Unified CPU/GPU
+ /// memory)
+ Unified = 2,
  };

  /// Convert a MemorySpace to string
@@ -78,44 +83,36 @@ std::string memorySpaceToString(MemorySpace s);

  /// Information on what/where an allocation is
  struct AllocInfo {
- inline AllocInfo()
- : type(AllocType::Other),
- device(0),
- space(MemorySpace::Device),
- stream(nullptr) {
- }
-
- inline AllocInfo(AllocType at,
- int dev,
- MemorySpace sp,
- cudaStream_t st)
- : type(at),
- device(dev),
- space(sp),
- stream(st) {
- }
-
- /// Returns a string representation of this info
- std::string toString() const;
-
- /// The internal category of the allocation
- AllocType type;
-
- /// The device on which the allocation is happening
- int device;
-
- /// The memory space of the allocation
- MemorySpace space;
-
- /// The stream on which new work on the memory will be ordered (e.g., if a
- /// piece of memory cached and to be returned for this call was last used on
- /// stream 3 and a new memory request is for stream 4, the memory manager will
- /// synchronize stream 4 to wait for the completion of stream 3 via events or
- /// other stream synchronization.
- ///
- /// The memory manager guarantees that the returned memory is free to use
- /// without data races on this stream specified.
- cudaStream_t stream;
+ inline AllocInfo()
+ : type(AllocType::Other),
+ device(0),
+ space(MemorySpace::Device),
+ stream(nullptr) {}
+
+ inline AllocInfo(AllocType at, int dev, MemorySpace sp, cudaStream_t st)
+ : type(at), device(dev), space(sp), stream(st) {}
+
+ /// Returns a string representation of this info
+ std::string toString() const;
+
+ /// The internal category of the allocation
+ AllocType type;
+
+ /// The device on which the allocation is happening
+ int device;
+
+ /// The memory space of the allocation
+ MemorySpace space;
+
+ /// The stream on which new work on the memory will be ordered (e.g., if a
+ /// piece of memory cached and to be returned for this call was last used on
+ /// stream 3 and a new memory request is for stream 4, the memory manager
+ /// will synchronize stream 4 to wait for the completion of stream 3 via
+ /// events or other stream synchronization.
+ ///
+ /// The memory manager guarantees that the returned memory is free to use
+ /// without data races on this stream specified.
+ cudaStream_t stream;
  };

  /// Create an AllocInfo for the current device with MemorySpace::Device
@@ -129,140 +126,139 @@ AllocInfo makeSpaceAlloc(AllocType at, MemorySpace sp, cudaStream_t st);

  /// Information on what/where an allocation is, along with how big it should be
  struct AllocRequest : public AllocInfo {
- inline AllocRequest()
- : AllocInfo(),
- size(0) {
- }
-
- inline AllocRequest(const AllocInfo& info,
- size_t sz)
- : AllocInfo(info),
- size(sz) {
- }
-
- inline AllocRequest(AllocType at,
- int dev,
- MemorySpace sp,
- cudaStream_t st,
- size_t sz)
- : AllocInfo(at, dev, sp, st),
- size(sz) {
- }
-
- /// Returns a string representation of this request
- std::string toString() const;
-
- /// The size in bytes of the allocation
- size_t size;
+ inline AllocRequest() : AllocInfo(), size(0) {}
+
+ inline AllocRequest(const AllocInfo& info, size_t sz)
+ : AllocInfo(info), size(sz) {}
+
+ inline AllocRequest(
+ AllocType at,
+ int dev,
+ MemorySpace sp,
+ cudaStream_t st,
+ size_t sz)
+ : AllocInfo(at, dev, sp, st), size(sz) {}
+
+ /// Returns a string representation of this request
+ std::string toString() const;
+
+ /// The size in bytes of the allocation
+ size_t size;
  };

  /// A RAII object that manages a temporary memory request
  struct GpuMemoryReservation {
- GpuMemoryReservation();
- GpuMemoryReservation(GpuResources* r,
- int dev,
- cudaStream_t str,
- void* p,
- size_t sz);
- GpuMemoryReservation(GpuMemoryReservation&& m) noexcept;
- ~GpuMemoryReservation();
-
- GpuMemoryReservation& operator=(GpuMemoryReservation&& m);
-
- inline void* get() { return data; }
-
- void release();
-
- GpuResources* res;
- int device;
- cudaStream_t stream;
- void* data;
- size_t size;
+ GpuMemoryReservation();
+ GpuMemoryReservation(
+ GpuResources* r,
+ int dev,
+ cudaStream_t str,
+ void* p,
+ size_t sz);
+ GpuMemoryReservation(GpuMemoryReservation&& m) noexcept;
+ ~GpuMemoryReservation();
+
+ GpuMemoryReservation& operator=(GpuMemoryReservation&& m);
+
+ inline void* get() {
+ return data;
+ }
+
+ void release();
+
+ GpuResources* res;
+ int device;
+ cudaStream_t stream;
+ void* data;
+ size_t size;
  };

  /// Base class of GPU-side resource provider; hides provision of
  /// cuBLAS handles, CUDA streams and all device memory allocation performed
  class GpuResources {
- public:
- virtual ~GpuResources();
+ public:
+ virtual ~GpuResources();

- /// Call to pre-allocate resources for a particular device. If this is
- /// not called, then resources will be allocated at the first time
- /// of demand
- virtual void initializeForDevice(int device) = 0;
+ /// Call to pre-allocate resources for a particular device. If this is
+ /// not called, then resources will be allocated at the first time
+ /// of demand
+ virtual void initializeForDevice(int device) = 0;

- /// Returns the cuBLAS handle that we use for the given device
- virtual cublasHandle_t getBlasHandle(int device) = 0;
+ /// Returns the cuBLAS handle that we use for the given device
+ virtual cublasHandle_t getBlasHandle(int device) = 0;

- /// Returns the stream that we order all computation on for the
- /// given device
- virtual cudaStream_t getDefaultStream(int device) = 0;
+ /// Returns the stream that we order all computation on for the
+ /// given device
+ virtual cudaStream_t getDefaultStream(int device) = 0;

- /// Overrides the default stream for a device to the user-supplied stream. The
- /// resources object does not own this stream (i.e., it will not destroy it).
- virtual void setDefaultStream(int device, cudaStream_t stream) = 0;
+ /// Overrides the default stream for a device to the user-supplied stream.
+ /// The resources object does not own this stream (i.e., it will not destroy
+ /// it).
+ virtual void setDefaultStream(int device, cudaStream_t stream) = 0;

- /// Returns the set of alternative streams that we use for the given device
- virtual std::vector<cudaStream_t> getAlternateStreams(int device) = 0;
+ /// Returns the set of alternative streams that we use for the given device
+ virtual std::vector<cudaStream_t> getAlternateStreams(int device) = 0;

- /// Memory management
- /// Returns an allocation from the given memory space, ordered with respect to
- /// the given stream (i.e., the first user will be a kernel in this stream).
- /// All allocations are sized internally to be the next highest multiple of 16
- /// bytes, and all allocations returned are guaranteed to be 16 byte aligned.
- virtual void* allocMemory(const AllocRequest& req) = 0;
+ /// Memory management
+ /// Returns an allocation from the given memory space, ordered with respect
+ /// to the given stream (i.e., the first user will be a kernel in this
+ /// stream). All allocations are sized internally to be the next highest
+ /// multiple of 16 bytes, and all allocations returned are guaranteed to be
+ /// 16 byte aligned.
+ virtual void* allocMemory(const AllocRequest& req) = 0;

- /// Returns a previous allocation
- virtual void deallocMemory(int device, void* in) = 0;
+ /// Returns a previous allocation
+ virtual void deallocMemory(int device, void* in) = 0;

- /// For MemorySpace::Temporary, how much space is immediately available
- /// without cudaMalloc allocation?
- virtual size_t getTempMemoryAvailable(int device) const = 0;
+ /// For MemorySpace::Temporary, how much space is immediately available
+ /// without cudaMalloc allocation?
+ virtual size_t getTempMemoryAvailable(int device) const = 0;

- /// Returns the available CPU pinned memory buffer
- virtual std::pair<void*, size_t> getPinnedMemory() = 0;
+ /// Returns the available CPU pinned memory buffer
+ virtual std::pair<void*, size_t> getPinnedMemory() = 0;

- /// Returns the stream on which we perform async CPU <-> GPU copies
- virtual cudaStream_t getAsyncCopyStream(int device) = 0;
+ /// Returns the stream on which we perform async CPU <-> GPU copies
+ virtual cudaStream_t getAsyncCopyStream(int device) = 0;

- ///
- /// Functions provided by default
- ///
+ ///
+ /// Functions provided by default
+ ///

- /// Calls getBlasHandle with the current device
- cublasHandle_t getBlasHandleCurrentDevice();
+ /// Calls getBlasHandle with the current device
+ cublasHandle_t getBlasHandleCurrentDevice();

- /// Calls getDefaultStream with the current device
- cudaStream_t getDefaultStreamCurrentDevice();
+ /// Calls getDefaultStream with the current device
+ cudaStream_t getDefaultStreamCurrentDevice();

- /// Calls getTempMemoryAvailable with the current device
- size_t getTempMemoryAvailableCurrentDevice() const;
+ /// Calls getTempMemoryAvailable with the current device
+ size_t getTempMemoryAvailableCurrentDevice() const;

- /// Returns a temporary memory allocation via a RAII object
- GpuMemoryReservation allocMemoryHandle(const AllocRequest& req);
+ /// Returns a temporary memory allocation via a RAII object
+ GpuMemoryReservation allocMemoryHandle(const AllocRequest& req);

- /// Synchronizes the CPU with respect to the default stream for the
- /// given device
- // equivalent to cudaDeviceSynchronize(getDefaultStream(device))
- void syncDefaultStream(int device);
+ /// Synchronizes the CPU with respect to the default stream for the
+ /// given device
+ // equivalent to cudaDeviceSynchronize(getDefaultStream(device))
+ void syncDefaultStream(int device);

- /// Calls syncDefaultStream for the current device
- void syncDefaultStreamCurrentDevice();
+ /// Calls syncDefaultStream for the current device
+ void syncDefaultStreamCurrentDevice();

- /// Calls getAlternateStreams for the current device
- std::vector<cudaStream_t> getAlternateStreamsCurrentDevice();
+ /// Calls getAlternateStreams for the current device
+ std::vector<cudaStream_t> getAlternateStreamsCurrentDevice();

- /// Calls getAsyncCopyStream for the current device
- cudaStream_t getAsyncCopyStreamCurrentDevice();
+ /// Calls getAsyncCopyStream for the current device
+ cudaStream_t getAsyncCopyStreamCurrentDevice();
  };

  /// Interface for a provider of a shared resources object
  class GpuResourcesProvider {
- public:
- virtual ~GpuResourcesProvider();
+ public:
+ virtual ~GpuResourcesProvider();

- /// Returns the shared resources object
- virtual std::shared_ptr<GpuResources> getResources() = 0;
+ /// Returns the shared resources object
+ virtual std::shared_ptr<GpuResources> getResources() = 0;
  };

- } } // namespace
+ } // namespace gpu
+ } // namespace faiss
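
Taken together, the declarations above form the vendored FAISS GPU allocator API: an AllocInfo/AllocRequest describes what is being allocated and on which device and stream, and allocMemoryHandle() hands back a RAII GpuMemoryReservation. A minimal usage sketch follows (illustrative only, not code shipped in this gem; it assumes a GpuResources object obtained from a provider such as StandardGpuResources):

    #include <faiss/gpu/GpuResources.h>

    void scratchExample(faiss::gpu::GpuResources* res) {
        // Stream that all work for the current device is ordered on
        cudaStream_t stream = res->getDefaultStreamCurrentDevice();

        // Describe a temporary, stream-ordered allocation on the current device
        faiss::gpu::AllocInfo info = faiss::gpu::makeSpaceAlloc(
                faiss::gpu::AllocType::Other,
                faiss::gpu::MemorySpace::Temporary,
                stream);

        // Request 1 MiB; the reservation hands the memory back when it goes out of scope
        faiss::gpu::GpuMemoryReservation scratch =
                res->allocMemoryHandle(faiss::gpu::AllocRequest(info, 1 << 20));

        void* p = scratch.get(); // safe to use on `stream` without extra synchronization
        (void)p;
    }
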
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp

@@ -5,16 +5,16 @@
  * LICENSE file in the root directory of this source tree.
  */

-
  #include <faiss/gpu/StandardGpuResources.h>
  #include <faiss/gpu/utils/DeviceUtils.h>
  #include <faiss/gpu/utils/StaticUtils.h>
  #include <faiss/impl/FaissAssert.h>
- #include <limits>
  #include <iostream>
+ #include <limits>
  #include <sstream>

- namespace faiss { namespace gpu {
+ namespace faiss {
+ namespace gpu {

  namespace {

@@ -22,513 +22,536 @@ namespace {
  constexpr int kNumStreams = 2;

  // Use 256 MiB of pinned memory for async CPU <-> GPU copies by default
- constexpr size_t kDefaultPinnedMemoryAllocation = (size_t) 256 * 1024 * 1024;
+ constexpr size_t kDefaultPinnedMemoryAllocation = (size_t)256 * 1024 * 1024;

  // Default temporary memory allocation for <= 4 GiB memory GPUs
- constexpr size_t k4GiBTempMem = (size_t) 512 * 1024 * 1024;
+ constexpr size_t k4GiBTempMem = (size_t)512 * 1024 * 1024;

  // Default temporary memory allocation for <= 8 GiB memory GPUs
- constexpr size_t k8GiBTempMem = (size_t) 1024 * 1024 * 1024;
+ constexpr size_t k8GiBTempMem = (size_t)1024 * 1024 * 1024;

  // Maximum temporary memory allocation for all GPUs
- constexpr size_t kMaxTempMem = (size_t) 1536 * 1024 * 1024;
+ constexpr size_t kMaxTempMem = (size_t)1536 * 1024 * 1024;

  std::string allocsToString(const std::unordered_map<void*, AllocRequest>& map) {
- // Produce a sorted list of all outstanding allocations by type
- std::unordered_map<AllocType, std::pair<int, size_t>> stats;
-
- for (auto& entry : map) {
- auto& a = entry.second;
-
- auto it = stats.find(a.type);
- if (it != stats.end()) {
- stats[a.type].first++;
- stats[a.type].second += a.size;
- } else {
- stats[a.type] = std::make_pair(1, a.size);
+ // Produce a sorted list of all outstanding allocations by type
+ std::unordered_map<AllocType, std::pair<int, size_t>> stats;
+
+ for (auto& entry : map) {
+ auto& a = entry.second;
+
+ auto it = stats.find(a.type);
+ if (it != stats.end()) {
+ stats[a.type].first++;
+ stats[a.type].second += a.size;
+ } else {
+ stats[a.type] = std::make_pair(1, a.size);
+ }
  }
- }

- std::stringstream ss;
- for (auto& entry : stats) {
- ss << "Alloc type " << allocTypeToString(entry.first) << ": "
- << entry.second.first << " allocations, "
- << entry.second.second << " bytes\n";
- }
+ std::stringstream ss;
+ for (auto& entry : stats) {
+ ss << "Alloc type " << allocTypeToString(entry.first) << ": "
+ << entry.second.first << " allocations, " << entry.second.second
+ << " bytes\n";
+ }

- return ss.str();
+ return ss.str();
  }

- }
+ } // namespace

  //
  // StandardGpuResourcesImpl
  //

- StandardGpuResourcesImpl::StandardGpuResourcesImpl() :
- pinnedMemAlloc_(nullptr),
- pinnedMemAllocSize_(0),
- // let the adjustment function determine the memory size for us by passing
- // in a huge value that will then be adjusted
- tempMemSize_(getDefaultTempMemForGPU(-1,
- std::numeric_limits<size_t>::max())),
- pinnedMemSize_(kDefaultPinnedMemoryAllocation),
- allocLogging_(false) {
- }
+ StandardGpuResourcesImpl::StandardGpuResourcesImpl()
+ : pinnedMemAlloc_(nullptr),
+ pinnedMemAllocSize_(0),
+ // let the adjustment function determine the memory size for us by
+ // passing in a huge value that will then be adjusted
+ tempMemSize_(getDefaultTempMemForGPU(
+ -1,
+ std::numeric_limits<size_t>::max())),
+ pinnedMemSize_(kDefaultPinnedMemoryAllocation),
+ allocLogging_(false) {}

  StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
- // The temporary memory allocator has allocated memory through us, so clean
- // that up before we finish fully de-initializing ourselves
- tempMemory_.clear();
-
- // Make sure all allocations have been freed
- bool allocError = false;
-
- for (auto& entry : allocs_) {
- auto& map = entry.second;
-
- if (!map.empty()) {
- std::cerr
- << "StandardGpuResources destroyed with allocations outstanding:\n"
- << "Device " << entry.first << " outstanding allocations:\n";
- std::cerr << allocsToString(map);
- allocError = true;
+ // The temporary memory allocator has allocated memory through us, so clean
+ // that up before we finish fully de-initializing ourselves
+ tempMemory_.clear();
+
+ // Make sure all allocations have been freed
+ bool allocError = false;
+
+ for (auto& entry : allocs_) {
+ auto& map = entry.second;
+
+ if (!map.empty()) {
+ std::cerr
+ << "StandardGpuResources destroyed with allocations outstanding:\n"
+ << "Device " << entry.first
+ << " outstanding allocations:\n";
+ std::cerr << allocsToString(map);
+ allocError = true;
+ }
  }
- }

- FAISS_ASSERT_MSG(!allocError, "GPU memory allocations not properly cleaned up");
+ FAISS_ASSERT_MSG(
+ !allocError, "GPU memory allocations not properly cleaned up");

- for (auto& entry : defaultStreams_) {
- DeviceScope scope(entry.first);
+ for (auto& entry : defaultStreams_) {
+ DeviceScope scope(entry.first);

- // We created these streams, so are responsible for destroying them
- CUDA_VERIFY(cudaStreamDestroy(entry.second));
- }
+ // We created these streams, so are responsible for destroying them
+ CUDA_VERIFY(cudaStreamDestroy(entry.second));
+ }

- for (auto& entry : alternateStreams_) {
- DeviceScope scope(entry.first);
+ for (auto& entry : alternateStreams_) {
+ DeviceScope scope(entry.first);

- for (auto stream : entry.second) {
- CUDA_VERIFY(cudaStreamDestroy(stream));
+ for (auto stream : entry.second) {
+ CUDA_VERIFY(cudaStreamDestroy(stream));
+ }
  }
- }

- for (auto& entry : asyncCopyStreams_) {
- DeviceScope scope(entry.first);
+ for (auto& entry : asyncCopyStreams_) {
+ DeviceScope scope(entry.first);

- CUDA_VERIFY(cudaStreamDestroy(entry.second));
- }
+ CUDA_VERIFY(cudaStreamDestroy(entry.second));
+ }

- for (auto& entry : blasHandles_) {
- DeviceScope scope(entry.first);
+ for (auto& entry : blasHandles_) {
+ DeviceScope scope(entry.first);

- auto blasStatus = cublasDestroy(entry.second);
- FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
- }
+ auto blasStatus = cublasDestroy(entry.second);
+ FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
+ }

- if (pinnedMemAlloc_) {
- auto err = cudaFreeHost(pinnedMemAlloc_);
- FAISS_ASSERT_FMT(err == cudaSuccess,
- "Failed to cudaFreeHost pointer %p (error %d %s)",
- pinnedMemAlloc_, (int) err, cudaGetErrorString(err));
- }
+ if (pinnedMemAlloc_) {
+ auto err = cudaFreeHost(pinnedMemAlloc_);
+ FAISS_ASSERT_FMT(
+ err == cudaSuccess,
+ "Failed to cudaFreeHost pointer %p (error %d %s)",
+ pinnedMemAlloc_,
+ (int)err,
+ cudaGetErrorString(err));
+ }
  }

- size_t
- StandardGpuResourcesImpl::getDefaultTempMemForGPU(int device,
- size_t requested) {
- auto totalMem = device != -1 ?
- getDeviceProperties(device).totalGlobalMem :
- std::numeric_limits<size_t>::max();
+ size_t StandardGpuResourcesImpl::getDefaultTempMemForGPU(
+ int device,
+ size_t requested) {
+ auto totalMem = device != -1 ? getDeviceProperties(device).totalGlobalMem
+ : std::numeric_limits<size_t>::max();

- if (totalMem <= (size_t) 4 * 1024 * 1024 * 1024) {
- // If the GPU has <= 4 GiB of memory, reserve 512 MiB
+ if (totalMem <= (size_t)4 * 1024 * 1024 * 1024) {
+ // If the GPU has <= 4 GiB of memory, reserve 512 MiB

- if (requested > k4GiBTempMem) {
- return k4GiBTempMem;
- }
- } else if (totalMem <= (size_t) 8 * 1024 * 1024 * 1024) {
- // If the GPU has <= 8 GiB of memory, reserve 1 GiB
+ if (requested > k4GiBTempMem) {
+ return k4GiBTempMem;
+ }
+ } else if (totalMem <= (size_t)8 * 1024 * 1024 * 1024) {
+ // If the GPU has <= 8 GiB of memory, reserve 1 GiB

- if (requested > k8GiBTempMem) {
- return k8GiBTempMem;
- }
- } else {
- // Never use more than 1.5 GiB
- if (requested > kMaxTempMem) {
- return kMaxTempMem;
+ if (requested > k8GiBTempMem) {
+ return k8GiBTempMem;
+ }
+ } else {
+ // Never use more than 1.5 GiB
+ if (requested > kMaxTempMem) {
+ return kMaxTempMem;
+ }
  }
- }
-
- // use whatever lower limit the user requested
- return requested;
- }
-
- void
- StandardGpuResourcesImpl::noTempMemory() {
- setTempMemory(0);
- }
-
- void
- StandardGpuResourcesImpl::setTempMemory(size_t size) {
- if (tempMemSize_ != size) {
- // adjust based on general limits
- tempMemSize_ = getDefaultTempMemForGPU(-1, size);
-
- // We need to re-initialize memory resources for all current devices that
- // have been initialized.
- // This should be safe to do, even if we are currently running work, because
- // the cudaFree call that this implies will force-synchronize all GPUs with
- // the CPU
- for (auto& p : tempMemory_) {
- int device = p.first;
- // Free the existing memory first
- p.second.reset();
-
- // Allocate new
- p.second = std::unique_ptr<StackDeviceMemory>(
- new StackDeviceMemory(this,
- p.first,
- // adjust for this specific device
- getDefaultTempMemForGPU(device, tempMemSize_)));
+
+ // use whatever lower limit the user requested
+ return requested;
+ }
+
+ void StandardGpuResourcesImpl::noTempMemory() {
+ setTempMemory(0);
+ }
+
+ void StandardGpuResourcesImpl::setTempMemory(size_t size) {
+ if (tempMemSize_ != size) {
+ // adjust based on general limits
+ tempMemSize_ = getDefaultTempMemForGPU(-1, size);
+
+ // We need to re-initialize memory resources for all current devices
+ // that have been initialized. This should be safe to do, even if we are
+ // currently running work, because the cudaFree call that this implies
+ // will force-synchronize all GPUs with the CPU
+ for (auto& p : tempMemory_) {
+ int device = p.first;
+ // Free the existing memory first
+ p.second.reset();
+
+ // Allocate new
+ p.second = std::unique_ptr<StackDeviceMemory>(new StackDeviceMemory(
+ this,
+ p.first,
+ // adjust for this specific device
+ getDefaultTempMemForGPU(device, tempMemSize_)));
+ }
  }
- }
  }

- void
- StandardGpuResourcesImpl::setPinnedMemory(size_t size) {
- // Should not call this after devices have been initialized
- FAISS_ASSERT(defaultStreams_.size() == 0);
- FAISS_ASSERT(!pinnedMemAlloc_);
+ void StandardGpuResourcesImpl::setPinnedMemory(size_t size) {
+ // Should not call this after devices have been initialized
+ FAISS_ASSERT(defaultStreams_.size() == 0);
+ FAISS_ASSERT(!pinnedMemAlloc_);

- pinnedMemSize_ = size;
+ pinnedMemSize_ = size;
  }

- void
- StandardGpuResourcesImpl::setDefaultStream(int device, cudaStream_t stream) {
- if (isInitialized(device)) {
- // A new series of calls may not be ordered with what was the previous
- // stream, so if the stream being specified is different, then we need to
- // ensure ordering between the two (new stream waits on old).
- auto it = userDefaultStreams_.find(device);
- cudaStream_t prevStream = nullptr;
+ void StandardGpuResourcesImpl::setDefaultStream(
+ int device,
+ cudaStream_t stream) {
+ if (isInitialized(device)) {
+ // A new series of calls may not be ordered with what was the previous
+ // stream, so if the stream being specified is different, then we need
+ // to ensure ordering between the two (new stream waits on old).
+ auto it = userDefaultStreams_.find(device);
+ cudaStream_t prevStream = nullptr;

- if (it != userDefaultStreams_.end()) {
- prevStream = it->second;
- } else {
- FAISS_ASSERT(defaultStreams_.count(device));
- prevStream = defaultStreams_[device];
- }
+ if (it != userDefaultStreams_.end()) {
+ prevStream = it->second;
+ } else {
+ FAISS_ASSERT(defaultStreams_.count(device));
+ prevStream = defaultStreams_[device];
+ }

- if (prevStream != stream) {
- streamWait({stream}, {prevStream});
+ if (prevStream != stream) {
+ streamWait({stream}, {prevStream});
+ }
  }
- }

- userDefaultStreams_[device] = stream;
+ userDefaultStreams_[device] = stream;
  }

- void
- StandardGpuResourcesImpl::revertDefaultStream(int device) {
- if (isInitialized(device)) {
- auto it = userDefaultStreams_.find(device);
+ void StandardGpuResourcesImpl::revertDefaultStream(int device) {
+ if (isInitialized(device)) {
+ auto it = userDefaultStreams_.find(device);

- if (it != userDefaultStreams_.end()) {
- // There was a user stream set that we need to synchronize against
- cudaStream_t prevStream = userDefaultStreams_[device];
+ if (it != userDefaultStreams_.end()) {
+ // There was a user stream set that we need to synchronize against
+ cudaStream_t prevStream = userDefaultStreams_[device];

- FAISS_ASSERT(defaultStreams_.count(device));
- cudaStream_t newStream = defaultStreams_[device];
+ FAISS_ASSERT(defaultStreams_.count(device));
+ cudaStream_t newStream = defaultStreams_[device];

- streamWait({newStream}, {prevStream});
+ streamWait({newStream}, {prevStream});
+ }
  }
- }

- userDefaultStreams_.erase(device);
+ userDefaultStreams_.erase(device);
  }

- void
- StandardGpuResourcesImpl::setDefaultNullStreamAllDevices() {
- for (int dev = 0; dev < getNumDevices(); ++dev) {
- setDefaultStream(dev, nullptr);
- }
+ void StandardGpuResourcesImpl::setDefaultNullStreamAllDevices() {
+ for (int dev = 0; dev < getNumDevices(); ++dev) {
+ setDefaultStream(dev, nullptr);
+ }
  }

- void
- StandardGpuResourcesImpl::setLogMemoryAllocations(bool enable) {
- allocLogging_ = enable;
+ void StandardGpuResourcesImpl::setLogMemoryAllocations(bool enable) {
+ allocLogging_ = enable;
  }

- bool
- StandardGpuResourcesImpl::isInitialized(int device) const {
- // Use default streams as a marker for whether or not a certain
- // device has been initialized
- return defaultStreams_.count(device) != 0;
+ bool StandardGpuResourcesImpl::isInitialized(int device) const {
+ // Use default streams as a marker for whether or not a certain
+ // device has been initialized
+ return defaultStreams_.count(device) != 0;
  }

- void
- StandardGpuResourcesImpl::initializeForDevice(int device) {
- if (isInitialized(device)) {
- return;
- }
-
- // If this is the first device that we're initializing, create our
- // pinned memory allocation
- if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
- auto err =
- cudaHostAlloc(&pinnedMemAlloc_, pinnedMemSize_, cudaHostAllocDefault);
+ void StandardGpuResourcesImpl::initializeForDevice(int device) {
+ if (isInitialized(device)) {
+ return;
+ }

- FAISS_THROW_IF_NOT_FMT(
- err == cudaSuccess,
- "failed to cudaHostAlloc %zu bytes for CPU <-> GPU "
- "async copy buffer (error %d %s)",
- pinnedMemSize_, (int) err, cudaGetErrorString(err));
+ // If this is the first device that we're initializing, create our
+ // pinned memory allocation
+ if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
+ auto err = cudaHostAlloc(
+ &pinnedMemAlloc_, pinnedMemSize_, cudaHostAllocDefault);
+
+ FAISS_THROW_IF_NOT_FMT(
+ err == cudaSuccess,
+ "failed to cudaHostAlloc %zu bytes for CPU <-> GPU "
+ "async copy buffer (error %d %s)",
+ pinnedMemSize_,
+ (int)err,
+ cudaGetErrorString(err));
+
+ pinnedMemAllocSize_ = pinnedMemSize_;
+ }

- pinnedMemAllocSize_ = pinnedMemSize_;
- }
+ FAISS_ASSERT(device < getNumDevices());
+ DeviceScope scope(device);

- FAISS_ASSERT(device < getNumDevices());
- DeviceScope scope(device);
+ // Make sure that device properties for all devices are cached
+ auto& prop = getDeviceProperties(device);

- // Make sure that device properties for all devices are cached
- auto& prop = getDeviceProperties(device);
+ // Also check to make sure we meet our minimum compute capability (3.0)
+ FAISS_ASSERT_FMT(
+ prop.major >= 3,
+ "Device id %d with CC %d.%d not supported, "
+ "need 3.0+ compute capability",
+ device,
+ prop.major,
+ prop.minor);

- // Also check to make sure we meet our minimum compute capability (3.0)
- FAISS_ASSERT_FMT(prop.major >= 3,
- "Device id %d with CC %d.%d not supported, "
- "need 3.0+ compute capability",
- device, prop.major, prop.minor);
+ // Our code is pre-built with and expects warpSize == 32, validate that
+ FAISS_ASSERT_FMT(
+ prop.warpSize == 32,
+ "Device id %d does not have expected warpSize of 32",
+ device);

- // Create streams
- cudaStream_t defaultStream = 0;
- CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream,
- cudaStreamNonBlocking));
+ // Create streams
+ cudaStream_t defaultStream = 0;
+ CUDA_VERIFY(
+ cudaStreamCreateWithFlags(&defaultStream, cudaStreamNonBlocking));

- defaultStreams_[device] = defaultStream;
+ defaultStreams_[device] = defaultStream;

- cudaStream_t asyncCopyStream = 0;
- CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream,
- cudaStreamNonBlocking));
+ cudaStream_t asyncCopyStream = 0;
+ CUDA_VERIFY(
+ cudaStreamCreateWithFlags(&asyncCopyStream, cudaStreamNonBlocking));

- asyncCopyStreams_[device] = asyncCopyStream;
+ asyncCopyStreams_[device] = asyncCopyStream;

- std::vector<cudaStream_t> deviceStreams;
- for (int j = 0; j < kNumStreams; ++j) {
- cudaStream_t stream = 0;
- CUDA_VERIFY(cudaStreamCreateWithFlags(&stream,
- cudaStreamNonBlocking));
+ std::vector<cudaStream_t> deviceStreams;
+ for (int j = 0; j < kNumStreams; ++j) {
+ cudaStream_t stream = 0;
+ CUDA_VERIFY(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

- deviceStreams.push_back(stream);
- }
+ deviceStreams.push_back(stream);
+ }

- alternateStreams_[device] = std::move(deviceStreams);
+ alternateStreams_[device] = std::move(deviceStreams);

- // Create cuBLAS handle
- cublasHandle_t blasHandle = 0;
- auto blasStatus = cublasCreate(&blasHandle);
- FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
- blasHandles_[device] = blasHandle;
+ // Create cuBLAS handle
+ cublasHandle_t blasHandle = 0;
+ auto blasStatus = cublasCreate(&blasHandle);
+ FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
+ blasHandles_[device] = blasHandle;

- // For CUDA 10 on V100, enabling tensor core usage would enable automatic
- // rounding down of inputs to f16 (though accumulate in f32) which results in
- // unacceptable loss of precision in general.
- // For CUDA 11 / A100, only enable tensor core support if it doesn't result in
- // a loss of precision.
+ // For CUDA 10 on V100, enabling tensor core usage would enable automatic
+ // rounding down of inputs to f16 (though accumulate in f32) which results
+ // in unacceptable loss of precision in general. For CUDA 11 / A100, only
+ // enable tensor core support if it doesn't result in a loss of precision.
  #if CUDA_VERSION >= 11000
- cublasSetMathMode(blasHandle,
- CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
+ cublasSetMathMode(
+ blasHandle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
  #endif

- FAISS_ASSERT(allocs_.count(device) == 0);
- allocs_[device] = std::unordered_map<void*, AllocRequest>();
+ FAISS_ASSERT(allocs_.count(device) == 0);
+ allocs_[device] = std::unordered_map<void*, AllocRequest>();

- FAISS_ASSERT(tempMemory_.count(device) == 0);
- auto mem = std::unique_ptr<StackDeviceMemory>(
- new StackDeviceMemory(this,
- device,
- // adjust for this specific device
- getDefaultTempMemForGPU(device, tempMemSize_)));
+ FAISS_ASSERT(tempMemory_.count(device) == 0);
+ auto mem = std::unique_ptr<StackDeviceMemory>(new StackDeviceMemory(
+ this,
+ device,
+ // adjust for this specific device
+ getDefaultTempMemForGPU(device, tempMemSize_)));

- tempMemory_.emplace(device, std::move(mem));
+ tempMemory_.emplace(device, std::move(mem));
  }

- cublasHandle_t
- StandardGpuResourcesImpl::getBlasHandle(int device) {
- initializeForDevice(device);
- return blasHandles_[device];
+ cublasHandle_t StandardGpuResourcesImpl::getBlasHandle(int device) {
+ initializeForDevice(device);
+ return blasHandles_[device];
  }

- cudaStream_t
- StandardGpuResourcesImpl::getDefaultStream(int device) {
- initializeForDevice(device);
+ cudaStream_t StandardGpuResourcesImpl::getDefaultStream(int device) {
+ initializeForDevice(device);

- auto it = userDefaultStreams_.find(device);
- if (it != userDefaultStreams_.end()) {
- // There is a user override stream set
- return it->second;
- }
+ auto it = userDefaultStreams_.find(device);
+ if (it != userDefaultStreams_.end()) {
+ // There is a user override stream set
+ return it->second;
+ }

- // Otherwise, our base default stream
- return defaultStreams_[device];
+ // Otherwise, our base default stream
+ return defaultStreams_[device];
  }

- std::vector<cudaStream_t>
- StandardGpuResourcesImpl::getAlternateStreams(int device) {
- initializeForDevice(device);
- return alternateStreams_[device];
+ std::vector<cudaStream_t> StandardGpuResourcesImpl::getAlternateStreams(
+ int device) {
+ initializeForDevice(device);
+ return alternateStreams_[device];
  }

- std::pair<void*, size_t>
- StandardGpuResourcesImpl::getPinnedMemory() {
- return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
+ std::pair<void*, size_t> StandardGpuResourcesImpl::getPinnedMemory() {
+ return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
  }

- cudaStream_t
- StandardGpuResourcesImpl::getAsyncCopyStream(int device) {
- initializeForDevice(device);
- return asyncCopyStreams_[device];
+ cudaStream_t StandardGpuResourcesImpl::getAsyncCopyStream(int device) {
+ initializeForDevice(device);
+ return asyncCopyStreams_[device];
  }

- void*
- StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
- initializeForDevice(req.device);
-
- // We don't allocate a placeholder for zero-sized allocations
- if (req.size == 0) {
- return nullptr;
- }
-
- // Make sure that the allocation is a multiple of 16 bytes for alignment
- // purposes
- auto adjReq = req;
- adjReq.size = utils::roundUp(adjReq.size, (size_t) 16);
-
- void* p = nullptr;
-
- if (allocLogging_) {
- std::cout << "StandardGpuResources: alloc " << adjReq.toString() << "\n";
- }
-
- if (adjReq.space == MemorySpace::Temporary) {
- // If we don't have enough space in our temporary memory manager, we need
- // to allocate this request separately
- auto& tempMem = tempMemory_[adjReq.device];
-
- if (adjReq.size > tempMem->getSizeAvailable()) {
- // We need to allocate this ourselves
- AllocRequest newReq = adjReq;
- newReq.space = MemorySpace::Device;
- newReq.type = AllocType::TemporaryMemoryOverflow;
+ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
+ initializeForDevice(req.device);

- return allocMemory(newReq);
+ // We don't allocate a placeholder for zero-sized allocations
+ if (req.size == 0) {
+ return nullptr;
  }

- // Otherwise, we can handle this locally
- p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size);
-
- } else if (adjReq.space == MemorySpace::Device) {
- auto err = cudaMalloc(&p, adjReq.size);
-
- // Throw if we fail to allocate
- if (err != cudaSuccess) {
- auto& map = allocs_[req.device];
-
- std::stringstream ss;
- ss << "Failed to cudaMalloc " << adjReq.size << " bytes "
- << "on device " << adjReq.device << " (error "
- << (int) err << " " << cudaGetErrorString(err)
- << "\nOutstanding allocations:\n" << allocsToString(map);
- auto str = ss.str();
-
- FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
+ // cudaMalloc guarantees allocation alignment to 256 bytes; do the same here
+ // for alignment purposes (to reduce memory transaction overhead etc)
+ auto adjReq = req;
+ adjReq.size = utils::roundUp(adjReq.size, (size_t)256);
+
+ void* p = nullptr;
+
+ if (adjReq.space == MemorySpace::Temporary) {
+ // If we don't have enough space in our temporary memory manager, we
+ // need to allocate this request separately
+ auto& tempMem = tempMemory_[adjReq.device];
+
+ if (adjReq.size > tempMem->getSizeAvailable()) {
+ // We need to allocate this ourselves
+ AllocRequest newReq = adjReq;
+ newReq.space = MemorySpace::Device;
+ newReq.type = AllocType::TemporaryMemoryOverflow;
+
+ if (allocLogging_) {
+ std::cout
+ << "StandardGpuResources: alloc fail "
+ << adjReq.toString()
+ << " (no temp space); retrying as MemorySpace::Device\n";
+ }
+
+ return allocMemory(newReq);
+ }
+
+ // Otherwise, we can handle this locally
+ p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size);
+
+ } else if (adjReq.space == MemorySpace::Device) {
+ auto err = cudaMalloc(&p, adjReq.size);
+
+ // Throw if we fail to allocate
+ if (err != cudaSuccess) {
+ // FIXME: as of CUDA 11, a memory allocation error appears to be
+ // presented via cudaGetLastError as well, and needs to be cleared.
+ // Just call the function to clear it
+ cudaGetLastError();
+
+ std::stringstream ss;
+ ss << "StandardGpuResources: alloc fail " << adjReq.toString()
+ << " (cudaMalloc error " << cudaGetErrorString(err) << " ["
+ << (int)err << "])\n";
+ auto str = ss.str();
+
+ if (allocLogging_) {
+ std::cout << str;
+ }
+
+ FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
+ }
+ } else if (adjReq.space == MemorySpace::Unified) {
+ auto err = cudaMallocManaged(&p, adjReq.size);
+
+ if (err != cudaSuccess) {
+ // FIXME: as of CUDA 11, a memory allocation error appears to be
+ // presented via cudaGetLastError as well, and needs to be cleared.
+ // Just call the function to clear it
+ cudaGetLastError();
+
+ std::stringstream ss;
+ ss << "StandardGpuResources: alloc fail " << adjReq.toString()
+ << " failed (cudaMallocManaged error " << cudaGetErrorString(err)
+ << " [" << (int)err << "])\n";
+ auto str = ss.str();
+
+ if (allocLogging_) {
+ std::cout << str;
+ }
+
+ FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
+ }
+ } else {
+ FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)adjReq.space);
  }
- } else if (adjReq.space == MemorySpace::Unified) {
- auto err = cudaMallocManaged(&p, adjReq.size);
-
- if (err != cudaSuccess) {
- auto& map = allocs_[req.device];

- std::stringstream ss;
- ss << "Failed to cudaMallocManaged " << adjReq.size << " bytes "
- << "(error " << (int) err << " " << cudaGetErrorString(err)
- << "\nOutstanding allocations:\n" << allocsToString(map);
- auto str = ss.str();
-
- FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
+ if (allocLogging_) {
+ std::cout << "StandardGpuResources: alloc ok " << adjReq.toString()
+ << " ptr 0x" << p << "\n";
  }
- } else {
- FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int) adjReq.space);
- }

- allocs_[adjReq.device][p] = adjReq;
+ allocs_[adjReq.device][p] = adjReq;

- return p;
+ return p;
  }

- void
- StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
- FAISS_ASSERT(isInitialized(device));
+ void StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
+ FAISS_ASSERT(isInitialized(device));

- if (!p) {
- return;
- }
+ if (!p) {
+ return;
+ }

- auto& a = allocs_[device];
- auto it = a.find(p);
- FAISS_ASSERT(it != a.end());
+ auto& a = allocs_[device];
+ auto it = a.find(p);
+ FAISS_ASSERT(it != a.end());

- auto& req = it->second;
+ auto& req = it->second;

- if (allocLogging_) {
- std::cout << "StandardGpuResources: dealloc " << req.toString() << "\n";
- }
+ if (allocLogging_) {
+ std::cout << "StandardGpuResources: dealloc " << req.toString() << "\n";
+ }

- if (req.space == MemorySpace::Temporary) {
- tempMemory_[device]->deallocMemory(device, req.stream, req.size, p);
+ if (req.space == MemorySpace::Temporary) {
+ tempMemory_[device]->deallocMemory(device, req.stream, req.size, p);

- } else if (req.space == MemorySpace::Device ||
- req.space == MemorySpace::Unified) {
- auto err = cudaFree(p);
- FAISS_ASSERT_FMT(err == cudaSuccess,
- "Failed to cudaFree pointer %p (error %d %s)",
- p, (int) err, cudaGetErrorString(err));
+ } else if (
+ req.space == MemorySpace::Device ||
+ req.space == MemorySpace::Unified) {
+ auto err = cudaFree(p);
+ FAISS_ASSERT_FMT(
+ err == cudaSuccess,
+ "Failed to cudaFree pointer %p (error %d %s)",
+ p,
+ (int)err,
+ cudaGetErrorString(err));

- } else {
- FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int) req.space);
- }
+ } else {
+ FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)req.space);
+ }

- a.erase(it);
+ a.erase(it);
  }

- size_t
- StandardGpuResourcesImpl::getTempMemoryAvailable(int device) const {
- FAISS_ASSERT(isInitialized(device));
527
+ size_t StandardGpuResourcesImpl::getTempMemoryAvailable(int device) const {
528
+ FAISS_ASSERT(isInitialized(device));
506
529
 
507
- auto it = tempMemory_.find(device);
508
- FAISS_ASSERT(it != tempMemory_.end());
530
+ auto it = tempMemory_.find(device);
531
+ FAISS_ASSERT(it != tempMemory_.end());
509
532
 
510
- return it->second->getSizeAvailable();
533
+ return it->second->getSizeAvailable();
511
534
  }
512
535
 
513
536
  std::map<int, std::map<std::string, std::pair<int, size_t>>>
514
537
  StandardGpuResourcesImpl::getMemoryInfo() const {
515
- using AT = std::map<std::string, std::pair<int, size_t>>;
538
+ using AT = std::map<std::string, std::pair<int, size_t>>;
516
539
 
517
- std::map<int, AT> out;
540
+ std::map<int, AT> out;
518
541
 
519
- for (auto& entry : allocs_) {
520
- AT outDevice;
542
+ for (auto& entry : allocs_) {
543
+ AT outDevice;
521
544
 
522
- for (auto& a : entry.second) {
523
- auto& v = outDevice[allocTypeToString(a.second.type)];
524
- v.first++;
525
- v.second += a.second.size;
526
- }
545
+ for (auto& a : entry.second) {
546
+ auto& v = outDevice[allocTypeToString(a.second.type)];
547
+ v.first++;
548
+ v.second += a.second.size;
549
+ }
527
550
 
528
- out[entry.first] = std::move(outDevice);
529
- }
551
+ out[entry.first] = std::move(outDevice);
552
+ }
530
553
 
531
- return out;
554
+ return out;
532
555
  }
533
556
 
534
557
  //
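The reworked allocMemory above first pads every request up to a 256-byte multiple (via utils::roundUp) before routing it to the temporary-memory arena, cudaMalloc, or cudaMallocManaged, and retries as MemorySpace::Device when the arena is too small. A minimal standalone sketch of that padding step follows; roundUp256 is a hypothetical local helper used only for illustration, not faiss's own utils::roundUp.

#include <cstddef>
#include <cstdio>

// Hypothetical stand-in for the 256-byte round-up performed in allocMemory
static size_t roundUp256(size_t size) {
    // Next multiple of 256; exact multiples are left unchanged
    return ((size + 255) / 256) * 256;
}

int main() {
    // A 1000-byte request is padded to 1024 bytes; 4096 stays 4096
    std::printf("%zu -> %zu\n", (size_t)1000, roundUp256(1000));
    std::printf("%zu -> %zu\n", (size_t)4096, roundUp256(4096));
    return 0;
}
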
@@ -536,70 +559,58 @@ StandardGpuResourcesImpl::getMemoryInfo() const {
 //
 
 StandardGpuResources::StandardGpuResources()
-    : res_(new StandardGpuResourcesImpl) {
-}
+        : res_(new StandardGpuResourcesImpl) {}
 
-StandardGpuResources::~StandardGpuResources() {
-}
+StandardGpuResources::~StandardGpuResources() {}
 
-std::shared_ptr<GpuResources>
-StandardGpuResources::getResources() {
-  return res_;
+std::shared_ptr<GpuResources> StandardGpuResources::getResources() {
+    return res_;
 }
 
-void
-StandardGpuResources::noTempMemory() {
-  res_->noTempMemory();
+void StandardGpuResources::noTempMemory() {
+    res_->noTempMemory();
 }
 
-void
-StandardGpuResources::setTempMemory(size_t size) {
-  res_->setTempMemory(size);
+void StandardGpuResources::setTempMemory(size_t size) {
+    res_->setTempMemory(size);
 }
 
-void
-StandardGpuResources::setPinnedMemory(size_t size) {
-  res_->setPinnedMemory(size);
+void StandardGpuResources::setPinnedMemory(size_t size) {
+    res_->setPinnedMemory(size);
 }
 
-void
-StandardGpuResources::setDefaultStream(int device, cudaStream_t stream) {
-  res_->setDefaultStream(device, stream);
+void StandardGpuResources::setDefaultStream(int device, cudaStream_t stream) {
+    res_->setDefaultStream(device, stream);
 }
 
-void
-StandardGpuResources::revertDefaultStream(int device) {
-  res_->revertDefaultStream(device);
+void StandardGpuResources::revertDefaultStream(int device) {
+    res_->revertDefaultStream(device);
 }
 
-void
-StandardGpuResources::setDefaultNullStreamAllDevices() {
-  res_->setDefaultNullStreamAllDevices();
+void StandardGpuResources::setDefaultNullStreamAllDevices() {
+    res_->setDefaultNullStreamAllDevices();
 }
 
 std::map<int, std::map<std::string, std::pair<int, size_t>>>
 StandardGpuResources::getMemoryInfo() const {
-  return res_->getMemoryInfo();
+    return res_->getMemoryInfo();
 }
 
-cudaStream_t
-StandardGpuResources::getDefaultStream(int device) {
-  return res_->getDefaultStream(device);
+cudaStream_t StandardGpuResources::getDefaultStream(int device) {
+    return res_->getDefaultStream(device);
 }
 
-size_t
-StandardGpuResources::getTempMemoryAvailable(int device) const {
-  return res_->getTempMemoryAvailable(device);
+size_t StandardGpuResources::getTempMemoryAvailable(int device) const {
+    return res_->getTempMemoryAvailable(device);
 }
 
-void
-StandardGpuResources::syncDefaultStreamCurrentDevice() {
-  res_->syncDefaultStreamCurrentDevice();
+void StandardGpuResources::syncDefaultStreamCurrentDevice() {
+    res_->syncDefaultStreamCurrentDevice();
 }
 
-void
-StandardGpuResources::setLogMemoryAllocations(bool enable) {
-  res_->setLogMemoryAllocations(enable);
+void StandardGpuResources::setLogMemoryAllocations(bool enable) {
+    res_->setLogMemoryAllocations(enable);
 }
 
-} } // namespace
+} // namespace gpu
+} // namespace faiss
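
For context, a hypothetical usage sketch of the StandardGpuResources entry points touched by this diff (setTempMemory, setLogMemoryAllocations, getMemoryInfo). It assumes a GPU-enabled faiss build with the usual faiss/gpu/StandardGpuResources.h header and a CUDA-capable device; the index-building step is elided.

#include <faiss/gpu/StandardGpuResources.h>

#include <cstdio>

int main() {
    faiss::gpu::StandardGpuResources res;

    // Cap the temporary-memory arena and log allocations, so the
    // "alloc fail ... retrying as MemorySpace::Device" overflow path in the
    // diff above becomes visible whenever a request exceeds the arena.
    res.setTempMemory(256 * 1024 * 1024); // 256 MiB
    res.setLogMemoryAllocations(true);

    // ... build/search a GPU index with `res` here ...

    // Per-device, per-AllocType summary: (allocation count, total bytes)
    for (const auto& device : res.getMemoryInfo()) {
        for (const auto& type : device.second) {
            std::printf("device %d, %s: %d allocations, %zu bytes\n",
                        device.first,
                        type.first.c_str(),
                        type.second.first,
                        type.second.second);
        }
    }
    return 0;
}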