faiss 0.1.5 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +24 -0
- data/README.md +12 -0
- data/ext/faiss/ext.cpp +1 -1
- data/ext/faiss/extconf.rb +6 -2
- data/ext/faiss/index.cpp +114 -43
- data/ext/faiss/index_binary.cpp +24 -30
- data/ext/faiss/kmeans.cpp +20 -16
- data/ext/faiss/numo.hpp +867 -0
- data/ext/faiss/pca_matrix.cpp +13 -14
- data/ext/faiss/product_quantizer.cpp +23 -24
- data/ext/faiss/utils.cpp +10 -37
- data/ext/faiss/utils.h +2 -13
- data/lib/faiss.rb +0 -5
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +292 -291
- data/vendor/faiss/faiss/AutoTune.h +55 -56
- data/vendor/faiss/faiss/Clustering.cpp +334 -195
- data/vendor/faiss/faiss/Clustering.h +88 -35
- data/vendor/faiss/faiss/IVFlib.cpp +171 -195
- data/vendor/faiss/faiss/IVFlib.h +48 -51
- data/vendor/faiss/faiss/Index.cpp +85 -103
- data/vendor/faiss/faiss/Index.h +54 -48
- data/vendor/faiss/faiss/Index2Layer.cpp +139 -164
- data/vendor/faiss/faiss/Index2Layer.h +22 -22
- data/vendor/faiss/faiss/IndexBinary.cpp +45 -37
- data/vendor/faiss/faiss/IndexBinary.h +140 -132
- data/vendor/faiss/faiss/IndexBinaryFlat.cpp +73 -53
- data/vendor/faiss/faiss/IndexBinaryFlat.h +29 -24
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +46 -43
- data/vendor/faiss/faiss/IndexBinaryFromFloat.h +16 -15
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +215 -232
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +25 -24
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +182 -177
- data/vendor/faiss/faiss/IndexBinaryHash.h +41 -34
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +489 -461
- data/vendor/faiss/faiss/IndexBinaryIVF.h +97 -68
- data/vendor/faiss/faiss/IndexFlat.cpp +116 -147
- data/vendor/faiss/faiss/IndexFlat.h +35 -46
- data/vendor/faiss/faiss/IndexHNSW.cpp +372 -348
- data/vendor/faiss/faiss/IndexHNSW.h +57 -41
- data/vendor/faiss/faiss/IndexIVF.cpp +474 -454
- data/vendor/faiss/faiss/IndexIVF.h +146 -113
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +248 -250
- data/vendor/faiss/faiss/IndexIVFFlat.h +48 -51
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +457 -516
- data/vendor/faiss/faiss/IndexIVFPQ.h +74 -66
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +406 -372
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +82 -57
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +104 -102
- data/vendor/faiss/faiss/IndexIVFPQR.h +33 -28
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +125 -133
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +19 -21
- data/vendor/faiss/faiss/IndexLSH.cpp +75 -96
- data/vendor/faiss/faiss/IndexLSH.h +21 -26
- data/vendor/faiss/faiss/IndexLattice.cpp +42 -56
- data/vendor/faiss/faiss/IndexLattice.h +11 -16
- data/vendor/faiss/faiss/IndexNNDescent.cpp +231 -0
- data/vendor/faiss/faiss/IndexNNDescent.h +72 -0
- data/vendor/faiss/faiss/IndexNSG.cpp +303 -0
- data/vendor/faiss/faiss/IndexNSG.h +85 -0
- data/vendor/faiss/faiss/IndexPQ.cpp +405 -464
- data/vendor/faiss/faiss/IndexPQ.h +64 -67
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +143 -170
- data/vendor/faiss/faiss/IndexPQFastScan.h +46 -32
- data/vendor/faiss/faiss/IndexPreTransform.cpp +120 -150
- data/vendor/faiss/faiss/IndexPreTransform.h +33 -36
- data/vendor/faiss/faiss/IndexRefine.cpp +115 -131
- data/vendor/faiss/faiss/IndexRefine.h +22 -23
- data/vendor/faiss/faiss/IndexReplicas.cpp +147 -153
- data/vendor/faiss/faiss/IndexReplicas.h +62 -56
- data/vendor/faiss/faiss/IndexResidual.cpp +291 -0
- data/vendor/faiss/faiss/IndexResidual.h +152 -0
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +120 -155
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +41 -45
- data/vendor/faiss/faiss/IndexShards.cpp +256 -240
- data/vendor/faiss/faiss/IndexShards.h +85 -73
- data/vendor/faiss/faiss/MatrixStats.cpp +112 -97
- data/vendor/faiss/faiss/MatrixStats.h +7 -10
- data/vendor/faiss/faiss/MetaIndexes.cpp +135 -157
- data/vendor/faiss/faiss/MetaIndexes.h +40 -34
- data/vendor/faiss/faiss/MetricType.h +7 -7
- data/vendor/faiss/faiss/VectorTransform.cpp +652 -474
- data/vendor/faiss/faiss/VectorTransform.h +61 -89
- data/vendor/faiss/faiss/clone_index.cpp +77 -73
- data/vendor/faiss/faiss/clone_index.h +4 -9
- data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +33 -38
- data/vendor/faiss/faiss/gpu/GpuAutoTune.h +11 -9
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +197 -170
- data/vendor/faiss/faiss/gpu/GpuCloner.h +53 -35
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +12 -14
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +27 -25
- data/vendor/faiss/faiss/gpu/GpuDistance.h +116 -112
- data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -2
- data/vendor/faiss/faiss/gpu/GpuIndex.h +134 -137
- data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +76 -73
- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +173 -162
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +67 -64
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +89 -86
- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +150 -141
- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +101 -103
- data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +17 -16
- data/vendor/faiss/faiss/gpu/GpuResources.cpp +116 -128
- data/vendor/faiss/faiss/gpu/GpuResources.h +182 -186
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +433 -422
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +131 -130
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +468 -456
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +25 -19
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +22 -20
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +9 -8
- data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +39 -44
- data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +16 -14
- data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +77 -71
- data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +109 -88
- data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +75 -64
- data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +230 -215
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +80 -86
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +284 -277
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +416 -416
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +611 -517
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +166 -164
- data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +61 -53
- data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +274 -238
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +73 -57
- data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +47 -50
- data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +79 -72
- data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +140 -146
- data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +69 -71
- data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +21 -16
- data/vendor/faiss/faiss/gpu/utils/Timer.cpp +25 -29
- data/vendor/faiss/faiss/gpu/utils/Timer.h +30 -29
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +270 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +115 -0
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +90 -120
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +81 -65
- data/vendor/faiss/faiss/impl/FaissAssert.h +73 -58
- data/vendor/faiss/faiss/impl/FaissException.cpp +56 -48
- data/vendor/faiss/faiss/impl/FaissException.h +41 -29
- data/vendor/faiss/faiss/impl/HNSW.cpp +595 -611
- data/vendor/faiss/faiss/impl/HNSW.h +179 -200
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +672 -0
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +172 -0
- data/vendor/faiss/faiss/impl/NNDescent.cpp +487 -0
- data/vendor/faiss/faiss/impl/NNDescent.h +154 -0
- data/vendor/faiss/faiss/impl/NSG.cpp +682 -0
- data/vendor/faiss/faiss/impl/NSG.h +199 -0
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +484 -454
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +52 -55
- data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +26 -47
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +469 -459
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +76 -87
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +448 -0
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +130 -0
- data/vendor/faiss/faiss/impl/ResultHandler.h +96 -132
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +648 -701
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +48 -46
- data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +129 -131
- data/vendor/faiss/faiss/impl/ThreadedIndex.h +61 -55
- data/vendor/faiss/faiss/impl/index_read.cpp +547 -479
- data/vendor/faiss/faiss/impl/index_write.cpp +497 -407
- data/vendor/faiss/faiss/impl/io.cpp +75 -94
- data/vendor/faiss/faiss/impl/io.h +31 -41
- data/vendor/faiss/faiss/impl/io_macros.h +40 -29
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +137 -186
- data/vendor/faiss/faiss/impl/lattice_Zn.h +40 -51
- data/vendor/faiss/faiss/impl/platform_macros.h +29 -8
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +77 -124
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +39 -48
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +41 -52
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +80 -117
- data/vendor/faiss/faiss/impl/simd_result_handlers.h +109 -137
- data/vendor/faiss/faiss/index_factory.cpp +269 -218
- data/vendor/faiss/faiss/index_factory.h +6 -7
- data/vendor/faiss/faiss/index_io.h +23 -26
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +67 -75
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +22 -24
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +96 -112
- data/vendor/faiss/faiss/invlists/DirectMap.h +29 -33
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +307 -364
- data/vendor/faiss/faiss/invlists/InvertedLists.h +151 -151
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +29 -34
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +17 -18
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +257 -293
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +50 -45
- data/vendor/faiss/faiss/python/python_callbacks.cpp +23 -26
- data/vendor/faiss/faiss/python/python_callbacks.h +9 -16
- data/vendor/faiss/faiss/utils/AlignedTable.h +79 -44
- data/vendor/faiss/faiss/utils/Heap.cpp +40 -48
- data/vendor/faiss/faiss/utils/Heap.h +186 -209
- data/vendor/faiss/faiss/utils/WorkerThread.cpp +67 -76
- data/vendor/faiss/faiss/utils/WorkerThread.h +32 -33
- data/vendor/faiss/faiss/utils/distances.cpp +301 -310
- data/vendor/faiss/faiss/utils/distances.h +133 -118
- data/vendor/faiss/faiss/utils/distances_simd.cpp +456 -516
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +117 -0
- data/vendor/faiss/faiss/utils/extra_distances.cpp +113 -232
- data/vendor/faiss/faiss/utils/extra_distances.h +30 -29
- data/vendor/faiss/faiss/utils/hamming-inl.h +260 -209
- data/vendor/faiss/faiss/utils/hamming.cpp +375 -469
- data/vendor/faiss/faiss/utils/hamming.h +62 -85
- data/vendor/faiss/faiss/utils/ordered_key_value.h +16 -18
- data/vendor/faiss/faiss/utils/partitioning.cpp +393 -318
- data/vendor/faiss/faiss/utils/partitioning.h +26 -21
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +78 -66
- data/vendor/faiss/faiss/utils/quantize_lut.h +22 -20
- data/vendor/faiss/faiss/utils/random.cpp +39 -63
- data/vendor/faiss/faiss/utils/random.h +13 -16
- data/vendor/faiss/faiss/utils/simdlib.h +4 -2
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +88 -85
- data/vendor/faiss/faiss/utils/simdlib_emulated.h +226 -165
- data/vendor/faiss/faiss/utils/simdlib_neon.h +832 -0
- data/vendor/faiss/faiss/utils/utils.cpp +304 -287
- data/vendor/faiss/faiss/utils/utils.h +53 -48
- metadata +24 -10
- data/lib/faiss/index.rb +0 -20
- data/lib/faiss/index_binary.rb +0 -20
- data/lib/faiss/kmeans.rb +0 -15
- data/lib/faiss/pca_matrix.rb +0 -15
- data/lib/faiss/product_quantizer.rb +0 -22
--- data/vendor/faiss/faiss/gpu/GpuResources.h
+++ data/vendor/faiss/faiss/gpu/GpuResources.h
@@ -5,55 +5,59 @@
  * LICENSE file in the root directory of this source tree.
  */

 #pragma once

-#include <faiss/impl/FaissAssert.h>
-#include <cuda_runtime.h>
 #include <cublas_v2.h>
+#include <cuda_runtime.h>
+#include <faiss/impl/FaissAssert.h>
 #include <memory>
 #include <utility>
 #include <vector>

-namespace faiss {
+namespace faiss {
+namespace gpu {

 class GpuResources;

 enum AllocType {
+    /// Unknown allocation type or miscellaneous (not currently categorized)
+    Other = 0,
+
+    /// Primary data storage for GpuIndexFlat (the raw matrix of vectors and
+    /// vector norms if needed)
+    FlatData = 1,
+
+    /// Primary data storage for GpuIndexIVF* (the storage for each individual
+    /// IVF
+    /// list)
+    IVFLists = 2,
+
+    /// Quantizer (PQ, SQ) dictionary information
+    Quantizer = 3,
+
+    /// For GpuIndexIVFPQ, "precomputed codes" for more efficient PQ lookup
+    /// require the use of possibly large tables. These are marked separately
+    /// from
+    /// Quantizer as these can frequently be 100s - 1000s of MiB in size
+    QuantizerPrecomputedCodes = 4,
+
+    ///
+    /// StandardGpuResources implementation specific types
+    ///
+
+    /// When using StandardGpuResources, temporary memory allocations
+    /// (MemorySpace::Temporary) come out of a stack region of memory that is
+    /// allocated up front for each gpu (e.g., 1.5 GiB upon initialization).
+    /// This
+    /// allocation by StandardGpuResources is marked with this AllocType.
+    TemporaryMemoryBuffer = 10,
+
+    /// When using StandardGpuResources, any MemorySpace::Temporary allocations
+    /// that cannot be satisfied within the TemporaryMemoryBuffer region fall
+    /// back
+    /// to calling cudaMalloc which are sized to just the request at hand. These
+    /// "overflow" temporary allocations are marked with this AllocType.
+    TemporaryMemoryOverflow = 11,
 };

 /// Convert an AllocType to string
@@ -61,16 +65,17 @@ std::string allocTypeToString(AllocType t);

 /// Memory regions accessible to the GPU
 enum MemorySpace {
+    /// Temporary device memory (guaranteed to no longer be used upon exit of a
+    /// top-level index call, and where the streams using it have completed GPU
+    /// work). Typically backed by Device memory (cudaMalloc/cudaFree).
+    Temporary = 0,

+    /// Managed using cudaMalloc/cudaFree (typical GPU device memory)
+    Device = 1,

+    /// Managed using cudaMallocManaged/cudaFree (typical Unified CPU/GPU
+    /// memory)
+    Unified = 2,
 };

 /// Convert a MemorySpace to string
@@ -78,44 +83,36 @@ std::string memorySpaceToString(MemorySpace s);

 /// Information on what/where an allocation is
 struct AllocInfo {
+    inline AllocInfo()
+            : type(AllocType::Other),
+              device(0),
+              space(MemorySpace::Device),
+              stream(nullptr) {}
+
+    inline AllocInfo(AllocType at, int dev, MemorySpace sp, cudaStream_t st)
+            : type(at), device(dev), space(sp), stream(st) {}
+
+    /// Returns a string representation of this info
+    std::string toString() const;
+
+    /// The internal category of the allocation
+    AllocType type;
+
+    /// The device on which the allocation is happening
+    int device;
+
+    /// The memory space of the allocation
+    MemorySpace space;
+
+    /// The stream on which new work on the memory will be ordered (e.g., if a
+    /// piece of memory cached and to be returned for this call was last used on
+    /// stream 3 and a new memory request is for stream 4, the memory manager
+    /// will synchronize stream 4 to wait for the completion of stream 3 via
+    /// events or other stream synchronization.
+    ///
+    /// The memory manager guarantees that the returned memory is free to use
+    /// without data races on this stream specified.
+    cudaStream_t stream;
 };

 /// Create an AllocInfo for the current device with MemorySpace::Device
@@ -129,140 +126,139 @@ AllocInfo makeSpaceAlloc(AllocType at, MemorySpace sp, cudaStream_t st);

 /// Information on what/where an allocation is, along with how big it should be
 struct AllocRequest : public AllocInfo {
+    inline AllocRequest() : AllocInfo(), size(0) {}
+
+    inline AllocRequest(const AllocInfo& info, size_t sz)
+            : AllocInfo(info), size(sz) {}
+
+    inline AllocRequest(
+            AllocType at,
+            int dev,
+            MemorySpace sp,
+            cudaStream_t st,
+            size_t sz)
+            : AllocInfo(at, dev, sp, st), size(sz) {}
+
+    /// Returns a string representation of this request
+    std::string toString() const;
+
+    /// The size in bytes of the allocation
+    size_t size;
 };

 /// A RAII object that manages a temporary memory request
 struct GpuMemoryReservation {
+    GpuMemoryReservation();
+    GpuMemoryReservation(
+            GpuResources* r,
+            int dev,
+            cudaStream_t str,
+            void* p,
+            size_t sz);
+    GpuMemoryReservation(GpuMemoryReservation&& m) noexcept;
+    ~GpuMemoryReservation();
+
+    GpuMemoryReservation& operator=(GpuMemoryReservation&& m);
+
+    inline void* get() {
+        return data;
+    }
+
+    void release();
+
+    GpuResources* res;
+    int device;
+    cudaStream_t stream;
+    void* data;
+    size_t size;
 };

 /// Base class of GPU-side resource provider; hides provision of
 /// cuBLAS handles, CUDA streams and all device memory allocation performed
 class GpuResources {
+   public:
+    virtual ~GpuResources();

+    /// Call to pre-allocate resources for a particular device. If this is
+    /// not called, then resources will be allocated at the first time
+    /// of demand
+    virtual void initializeForDevice(int device) = 0;

+    /// Returns the cuBLAS handle that we use for the given device
+    virtual cublasHandle_t getBlasHandle(int device) = 0;

+    /// Returns the stream that we order all computation on for the
+    /// given device
+    virtual cudaStream_t getDefaultStream(int device) = 0;

+    /// Overrides the default stream for a device to the user-supplied stream.
+    /// The resources object does not own this stream (i.e., it will not destroy
+    /// it).
+    virtual void setDefaultStream(int device, cudaStream_t stream) = 0;

+    /// Returns the set of alternative streams that we use for the given device
+    virtual std::vector<cudaStream_t> getAlternateStreams(int device) = 0;

+    /// Memory management
+    /// Returns an allocation from the given memory space, ordered with respect
+    /// to the given stream (i.e., the first user will be a kernel in this
+    /// stream). All allocations are sized internally to be the next highest
+    /// multiple of 16 bytes, and all allocations returned are guaranteed to be
+    /// 16 byte aligned.
+    virtual void* allocMemory(const AllocRequest& req) = 0;

+    /// Returns a previous allocation
+    virtual void deallocMemory(int device, void* in) = 0;

+    /// For MemorySpace::Temporary, how much space is immediately available
+    /// without cudaMalloc allocation?
+    virtual size_t getTempMemoryAvailable(int device) const = 0;

+    /// Returns the available CPU pinned memory buffer
+    virtual std::pair<void*, size_t> getPinnedMemory() = 0;

+    /// Returns the stream on which we perform async CPU <-> GPU copies
+    virtual cudaStream_t getAsyncCopyStream(int device) = 0;

+    ///
+    /// Functions provided by default
+    ///

+    /// Calls getBlasHandle with the current device
+    cublasHandle_t getBlasHandleCurrentDevice();

+    /// Calls getDefaultStream with the current device
+    cudaStream_t getDefaultStreamCurrentDevice();

+    /// Calls getTempMemoryAvailable with the current device
+    size_t getTempMemoryAvailableCurrentDevice() const;

+    /// Returns a temporary memory allocation via a RAII object
+    GpuMemoryReservation allocMemoryHandle(const AllocRequest& req);

+    /// Synchronizes the CPU with respect to the default stream for the
+    /// given device
+    // equivalent to cudaDeviceSynchronize(getDefaultStream(device))
+    void syncDefaultStream(int device);

+    /// Calls syncDefaultStream for the current device
+    void syncDefaultStreamCurrentDevice();

+    /// Calls getAlternateStreams for the current device
+    std::vector<cudaStream_t> getAlternateStreamsCurrentDevice();

+    /// Calls getAsyncCopyStream for the current device
+    cudaStream_t getAsyncCopyStreamCurrentDevice();
 };

 /// Interface for a provider of a shared resources object
 class GpuResourcesProvider {
+   public:
+    virtual ~GpuResourcesProvider();

+    /// Returns the shared resources object
+    virtual std::shared_ptr<GpuResources> getResources() = 0;
 };

-}
+} // namespace gpu
+} // namespace faiss
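For orientation, the block below is a minimal usage sketch (not part of the packaged sources) of the AllocRequest / GpuMemoryReservation API that this version introduces in GpuResources.h; the resource pointer, device id, and sizes are illustrative assumptions.

// Sketch only: exercising the allocation API declared above.
// `res` is assumed to be a valid GpuResources* (e.g. obtained from
// StandardGpuResources::getResources()).
#include <faiss/gpu/GpuResources.h>

void tempAllocSketch(faiss::gpu::GpuResources* res, int device) {
    cudaStream_t stream = res->getDefaultStream(device);

    // Describe a 1 MiB temporary allocation ordered on the default stream.
    faiss::gpu::AllocRequest req(
            faiss::gpu::AllocType::Other,
            device,
            faiss::gpu::MemorySpace::Temporary,
            stream,
            (size_t)1024 * 1024);

    // RAII reservation: the memory is returned when `mem` goes out of scope.
    faiss::gpu::GpuMemoryReservation mem = res->allocMemoryHandle(req);
    void* p = mem.get();
    (void)p; // hand p to kernels launched on `stream`
}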
--- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp
+++ data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp
@@ -5,16 +5,16 @@
  * LICENSE file in the root directory of this source tree.
  */

 #include <faiss/gpu/StandardGpuResources.h>
 #include <faiss/gpu/utils/DeviceUtils.h>
 #include <faiss/gpu/utils/StaticUtils.h>
 #include <faiss/impl/FaissAssert.h>
-#include <limits>
 #include <iostream>
+#include <limits>
 #include <sstream>

-namespace faiss {
+namespace faiss {
+namespace gpu {

 namespace {

@@ -22,513 +22,536 @@ namespace {
 constexpr int kNumStreams = 2;

 // Use 256 MiB of pinned memory for async CPU <-> GPU copies by default
+constexpr size_t kDefaultPinnedMemoryAllocation = (size_t)256 * 1024 * 1024;

 // Default temporary memory allocation for <= 4 GiB memory GPUs
+constexpr size_t k4GiBTempMem = (size_t)512 * 1024 * 1024;

 // Default temporary memory allocation for <= 8 GiB memory GPUs
+constexpr size_t k8GiBTempMem = (size_t)1024 * 1024 * 1024;

 // Maximum temporary memory allocation for all GPUs
+constexpr size_t kMaxTempMem = (size_t)1536 * 1024 * 1024;

 std::string allocsToString(const std::unordered_map<void*, AllocRequest>& map) {
+    // Produce a sorted list of all outstanding allocations by type
+    std::unordered_map<AllocType, std::pair<int, size_t>> stats;
+
+    for (auto& entry : map) {
+        auto& a = entry.second;
+
+        auto it = stats.find(a.type);
+        if (it != stats.end()) {
+            stats[a.type].first++;
+            stats[a.type].second += a.size;
+        } else {
+            stats[a.type] = std::make_pair(1, a.size);
+        }
     }

+    std::stringstream ss;
+    for (auto& entry : stats) {
+        ss << "Alloc type " << allocTypeToString(entry.first) << ": "
+           << entry.second.first << " allocations, " << entry.second.second
+           << " bytes\n";
+    }

+    return ss.str();
 }

-}
+} // namespace

 //
 // StandardGpuResourcesImpl
 //

-StandardGpuResourcesImpl::StandardGpuResourcesImpl()
+StandardGpuResourcesImpl::StandardGpuResourcesImpl()
+        : pinnedMemAlloc_(nullptr),
+          pinnedMemAllocSize_(0),
+          // let the adjustment function determine the memory size for us by
+          // passing in a huge value that will then be adjusted
+          tempMemSize_(getDefaultTempMemForGPU(
+                  -1,
+                  std::numeric_limits<size_t>::max())),
+          pinnedMemSize_(kDefaultPinnedMemoryAllocation),
+          allocLogging_(false) {}

 StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
+    // The temporary memory allocator has allocated memory through us, so clean
+    // that up before we finish fully de-initializing ourselves
+    tempMemory_.clear();
+
+    // Make sure all allocations have been freed
+    bool allocError = false;
+
+    for (auto& entry : allocs_) {
+        auto& map = entry.second;
+
+        if (!map.empty()) {
+            std::cerr
+                    << "StandardGpuResources destroyed with allocations outstanding:\n"
+                    << "Device " << entry.first
+                    << " outstanding allocations:\n";
+            std::cerr << allocsToString(map);
+            allocError = true;
+        }
     }

+    FAISS_ASSERT_MSG(
+            !allocError, "GPU memory allocations not properly cleaned up");

+    for (auto& entry : defaultStreams_) {
+        DeviceScope scope(entry.first);

+        // We created these streams, so are responsible for destroying them
+        CUDA_VERIFY(cudaStreamDestroy(entry.second));
+    }

+    for (auto& entry : alternateStreams_) {
+        DeviceScope scope(entry.first);

+        for (auto stream : entry.second) {
+            CUDA_VERIFY(cudaStreamDestroy(stream));
+        }
     }

+    for (auto& entry : asyncCopyStreams_) {
+        DeviceScope scope(entry.first);

+        CUDA_VERIFY(cudaStreamDestroy(entry.second));
+    }

+    for (auto& entry : blasHandles_) {
+        DeviceScope scope(entry.first);

+        auto blasStatus = cublasDestroy(entry.second);
+        FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
+    }

+    if (pinnedMemAlloc_) {
+        auto err = cudaFreeHost(pinnedMemAlloc_);
+        FAISS_ASSERT_FMT(
+                err == cudaSuccess,
+                "Failed to cudaFreeHost pointer %p (error %d %s)",
+                pinnedMemAlloc_,
+                (int)err,
+                cudaGetErrorString(err));
+    }
 }

+size_t StandardGpuResourcesImpl::getDefaultTempMemForGPU(
+        int device,
+        size_t requested) {
+    auto totalMem = device != -1 ? getDeviceProperties(device).totalGlobalMem
+                                 : std::numeric_limits<size_t>::max();

+    if (totalMem <= (size_t)4 * 1024 * 1024 * 1024) {
+        // If the GPU has <= 4 GiB of memory, reserve 512 MiB

+        if (requested > k4GiBTempMem) {
+            return k4GiBTempMem;
+        }
+    } else if (totalMem <= (size_t)8 * 1024 * 1024 * 1024) {
+        // If the GPU has <= 8 GiB of memory, reserve 1 GiB

+        if (requested > k8GiBTempMem) {
+            return k8GiBTempMem;
+        }
+    } else {
+        // Never use more than 1.5 GiB
+        if (requested > kMaxTempMem) {
+            return kMaxTempMem;
+        }
     }
+
+    // use whatever lower limit the user requested
+    return requested;
+}
+
+void StandardGpuResourcesImpl::noTempMemory() {
+    setTempMemory(0);
+}
+
+void StandardGpuResourcesImpl::setTempMemory(size_t size) {
+    if (tempMemSize_ != size) {
+        // adjust based on general limits
+        tempMemSize_ = getDefaultTempMemForGPU(-1, size);
+
+        // We need to re-initialize memory resources for all current devices
+        // that have been initialized. This should be safe to do, even if we are
+        // currently running work, because the cudaFree call that this implies
+        // will force-synchronize all GPUs with the CPU
+        for (auto& p : tempMemory_) {
+            int device = p.first;
+            // Free the existing memory first
+            p.second.reset();
+
+            // Allocate new
+            p.second = std::unique_ptr<StackDeviceMemory>(new StackDeviceMemory(
+                    this,
+                    p.first,
+                    // adjust for this specific device
+                    getDefaultTempMemForGPU(device, tempMemSize_)));
+        }
     }
 }

-  FAISS_ASSERT(!pinnedMemAlloc_);
+void StandardGpuResourcesImpl::setPinnedMemory(size_t size) {
+    // Should not call this after devices have been initialized
+    FAISS_ASSERT(defaultStreams_.size() == 0);
+    FAISS_ASSERT(!pinnedMemAlloc_);

+    pinnedMemSize_ = size;
 }

+void StandardGpuResourcesImpl::setDefaultStream(
+        int device,
+        cudaStream_t stream) {
+    if (isInitialized(device)) {
+        // A new series of calls may not be ordered with what was the previous
+        // stream, so if the stream being specified is different, then we need
+        // to ensure ordering between the two (new stream waits on old).
+        auto it = userDefaultStreams_.find(device);
+        cudaStream_t prevStream = nullptr;

+        if (it != userDefaultStreams_.end()) {
+            prevStream = it->second;
+        } else {
+            FAISS_ASSERT(defaultStreams_.count(device));
+            prevStream = defaultStreams_[device];
+        }

+        if (prevStream != stream) {
+            streamWait({stream}, {prevStream});
+        }
     }

+    userDefaultStreams_[device] = stream;
 }

+void StandardGpuResourcesImpl::revertDefaultStream(int device) {
+    if (isInitialized(device)) {
+        auto it = userDefaultStreams_.find(device);

+        if (it != userDefaultStreams_.end()) {
+            // There was a user stream set that we need to synchronize against
+            cudaStream_t prevStream = userDefaultStreams_[device];

+            FAISS_ASSERT(defaultStreams_.count(device));
+            cudaStream_t newStream = defaultStreams_[device];

+            streamWait({newStream}, {prevStream});
+        }
     }

+    userDefaultStreams_.erase(device);
 }

+void StandardGpuResourcesImpl::setDefaultNullStreamAllDevices() {
+    for (int dev = 0; dev < getNumDevices(); ++dev) {
+        setDefaultStream(dev, nullptr);
+    }
 }

-  allocLogging_ = enable;
+void StandardGpuResourcesImpl::setLogMemoryAllocations(bool enable) {
+    allocLogging_ = enable;
 }

-  return defaultStreams_.count(device) != 0;
+bool StandardGpuResourcesImpl::isInitialized(int device) const {
+    // Use default streams as a marker for whether or not a certain
+    // device has been initialized
+    return defaultStreams_.count(device) != 0;
 }

-  // If this is the first device that we're initializing, create our
-  // pinned memory allocation
-  if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
-    auto err =
-      cudaHostAlloc(&pinnedMemAlloc_, pinnedMemSize_, cudaHostAllocDefault);
+void StandardGpuResourcesImpl::initializeForDevice(int device) {
+    if (isInitialized(device)) {
+        return;
+    }

+    // If this is the first device that we're initializing, create our
+    // pinned memory allocation
+    if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
+        auto err = cudaHostAlloc(
+                &pinnedMemAlloc_, pinnedMemSize_, cudaHostAllocDefault);
+
+        FAISS_THROW_IF_NOT_FMT(
+                err == cudaSuccess,
+                "failed to cudaHostAlloc %zu bytes for CPU <-> GPU "
+                "async copy buffer (error %d %s)",
+                pinnedMemSize_,
+                (int)err,
+                cudaGetErrorString(err));
+
+        pinnedMemAllocSize_ = pinnedMemSize_;
+    }

+    FAISS_ASSERT(device < getNumDevices());
+    DeviceScope scope(device);

+    // Make sure that device properties for all devices are cached
+    auto& prop = getDeviceProperties(device);

+    // Also check to make sure we meet our minimum compute capability (3.0)
+    FAISS_ASSERT_FMT(
+            prop.major >= 3,
+            "Device id %d with CC %d.%d not supported, "
+            "need 3.0+ compute capability",
+            device,
+            prop.major,
+            prop.minor);

+    // Our code is pre-built with and expects warpSize == 32, validate that
+    FAISS_ASSERT_FMT(
+            prop.warpSize == 32,
+            "Device id %d does not have expected warpSize of 32",
+            device);

+    // Create streams
+    cudaStream_t defaultStream = 0;
+    CUDA_VERIFY(
+            cudaStreamCreateWithFlags(&defaultStream, cudaStreamNonBlocking));

+    defaultStreams_[device] = defaultStream;

+    cudaStream_t asyncCopyStream = 0;
+    CUDA_VERIFY(
+            cudaStreamCreateWithFlags(&asyncCopyStream, cudaStreamNonBlocking));

+    asyncCopyStreams_[device] = asyncCopyStream;

+    std::vector<cudaStream_t> deviceStreams;
+    for (int j = 0; j < kNumStreams; ++j) {
+        cudaStream_t stream = 0;
+        CUDA_VERIFY(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

+        deviceStreams.push_back(stream);
+    }

+    alternateStreams_[device] = std::move(deviceStreams);

+    // Create cuBLAS handle
+    cublasHandle_t blasHandle = 0;
+    auto blasStatus = cublasCreate(&blasHandle);
+    FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
+    blasHandles_[device] = blasHandle;

+    // For CUDA 10 on V100, enabling tensor core usage would enable automatic
+    // rounding down of inputs to f16 (though accumulate in f32) which results
+    // in unacceptable loss of precision in general. For CUDA 11 / A100, only
+    // enable tensor core support if it doesn't result in a loss of precision.
 #if CUDA_VERSION >= 11000
+    cublasSetMathMode(
+            blasHandle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
 #endif

+    FAISS_ASSERT(allocs_.count(device) == 0);
+    allocs_[device] = std::unordered_map<void*, AllocRequest>();

+    FAISS_ASSERT(tempMemory_.count(device) == 0);
+    auto mem = std::unique_ptr<StackDeviceMemory>(new StackDeviceMemory(
+            this,
+            device,
+            // adjust for this specific device
+            getDefaultTempMemForGPU(device, tempMemSize_)));

+    tempMemory_.emplace(device, std::move(mem));
 }

-  return blasHandles_[device];
+cublasHandle_t StandardGpuResourcesImpl::getBlasHandle(int device) {
+    initializeForDevice(device);
+    return blasHandles_[device];
 }

-  initializeForDevice(device);
+cudaStream_t StandardGpuResourcesImpl::getDefaultStream(int device) {
+    initializeForDevice(device);

+    auto it = userDefaultStreams_.find(device);
+    if (it != userDefaultStreams_.end()) {
+        // There is a user override stream set
+        return it->second;
+    }

+    // Otherwise, our base default stream
+    return defaultStreams_[device];
 }

+std::vector<cudaStream_t> StandardGpuResourcesImpl::getAlternateStreams(
+        int device) {
+    initializeForDevice(device);
+    return alternateStreams_[device];
 }

-  return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
+std::pair<void*, size_t> StandardGpuResourcesImpl::getPinnedMemory() {
+    return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
 }

-  return asyncCopyStreams_[device];
+cudaStream_t StandardGpuResourcesImpl::getAsyncCopyStream(int device) {
+    initializeForDevice(device);
+    return asyncCopyStreams_[device];
 }

-  initializeForDevice(req.device);
-
-  // We don't allocate a placeholder for zero-sized allocations
-  if (req.size == 0) {
-    return nullptr;
-  }
-
-  // Make sure that the allocation is a multiple of 16 bytes for alignment
-  // purposes
-  auto adjReq = req;
-  adjReq.size = utils::roundUp(adjReq.size, (size_t) 16);
-
-  void* p = nullptr;
-
-  if (allocLogging_) {
-    std::cout << "StandardGpuResources: alloc " << adjReq.toString() << "\n";
-  }
-
-  if (adjReq.space == MemorySpace::Temporary) {
-    // If we don't have enough space in our temporary memory manager, we need
-    // to allocate this request separately
-    auto& tempMem = tempMemory_[adjReq.device];
-
-    if (adjReq.size > tempMem->getSizeAvailable()) {
-      // We need to allocate this ourselves
-      AllocRequest newReq = adjReq;
-      newReq.space = MemorySpace::Device;
-      newReq.type = AllocType::TemporaryMemoryOverflow;
-  } else if (adjReq.space == MemorySpace::Unified) {
-    auto err = cudaMallocManaged(&p, adjReq.size);
-
-    if (err != cudaSuccess) {
-      auto& map = allocs_[req.device];
-        << "\nOutstanding allocations:\n" << allocsToString(map);
-      auto str = ss.str();
-      FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
-    }
-  } else {
-    FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int) adjReq.space);
-  }
+void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
+    initializeForDevice(req.device);

+    // We don't allocate a placeholder for zero-sized allocations
+    if (req.size == 0) {
+        return nullptr;
     }

+    // cudaMalloc guarantees allocation alignment to 256 bytes; do the same here
+    // for alignment purposes (to reduce memory transaction overhead etc)
+    auto adjReq = req;
+    adjReq.size = utils::roundUp(adjReq.size, (size_t)256);
+
+    void* p = nullptr;
+
+    if (adjReq.space == MemorySpace::Temporary) {
+        // If we don't have enough space in our temporary memory manager, we
+        // need to allocate this request separately
+        auto& tempMem = tempMemory_[adjReq.device];
+
+        if (adjReq.size > tempMem->getSizeAvailable()) {
+            // We need to allocate this ourselves
+            AllocRequest newReq = adjReq;
+            newReq.space = MemorySpace::Device;
+            newReq.type = AllocType::TemporaryMemoryOverflow;
+
+            if (allocLogging_) {
+                std::cout
+                        << "StandardGpuResources: alloc fail "
+                        << adjReq.toString()
+                        << " (no temp space); retrying as MemorySpace::Device\n";
+            }
+
+            return allocMemory(newReq);
+        }
+
+        // Otherwise, we can handle this locally
+        p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size);
+
+    } else if (adjReq.space == MemorySpace::Device) {
+        auto err = cudaMalloc(&p, adjReq.size);
+
+        // Throw if we fail to allocate
+        if (err != cudaSuccess) {
+            // FIXME: as of CUDA 11, a memory allocation error appears to be
+            // presented via cudaGetLastError as well, and needs to be cleared.
+            // Just call the function to clear it
+            cudaGetLastError();
+
+            std::stringstream ss;
+            ss << "StandardGpuResources: alloc fail " << adjReq.toString()
+               << " (cudaMalloc error " << cudaGetErrorString(err) << " ["
+               << (int)err << "])\n";
+            auto str = ss.str();
+
+            if (allocLogging_) {
+                std::cout << str;
+            }
+
+            FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
+        }
+    } else if (adjReq.space == MemorySpace::Unified) {
+        auto err = cudaMallocManaged(&p, adjReq.size);
+
+        if (err != cudaSuccess) {
+            // FIXME: as of CUDA 11, a memory allocation error appears to be
+            // presented via cudaGetLastError as well, and needs to be cleared.
+            // Just call the function to clear it
+            cudaGetLastError();
+
+            std::stringstream ss;
+            ss << "StandardGpuResources: alloc fail " << adjReq.toString()
+               << " failed (cudaMallocManaged error " << cudaGetErrorString(err)
+               << " [" << (int)err << "])\n";
+            auto str = ss.str();
+
+            if (allocLogging_) {
+                std::cout << str;
+            }
+
+            FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
+        }
+    } else {
+        FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)adjReq.space);
     }

+    if (allocLogging_) {
+        std::cout << "StandardGpuResources: alloc ok " << adjReq.toString()
+                  << " ptr 0x" << p << "\n";
     }

+    allocs_[adjReq.device][p] = adjReq;

+    return p;
 }

-  FAISS_ASSERT(isInitialized(device));
+void StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
+    FAISS_ASSERT(isInitialized(device));

+    if (!p) {
+        return;
+    }

+    auto& a = allocs_[device];
+    auto it = a.find(p);
+    FAISS_ASSERT(it != a.end());

+    auto& req = it->second;

+    if (allocLogging_) {
+        std::cout << "StandardGpuResources: dealloc " << req.toString() << "\n";
+    }

+    if (req.space == MemorySpace::Temporary) {
+        tempMemory_[device]->deallocMemory(device, req.stream, req.size, p);

+    } else if (
+            req.space == MemorySpace::Device ||
+            req.space == MemorySpace::Unified) {
+        auto err = cudaFree(p);
+        FAISS_ASSERT_FMT(
+                err == cudaSuccess,
+                "Failed to cudaFree pointer %p (error %d %s)",
+                p,
+                (int)err,
+                cudaGetErrorString(err));

+    } else {
+        FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)req.space);
+    }

+    a.erase(it);
 }

-  FAISS_ASSERT(isInitialized(device));
+size_t StandardGpuResourcesImpl::getTempMemoryAvailable(int device) const {
+    FAISS_ASSERT(isInitialized(device));

+    auto it = tempMemory_.find(device);
+    FAISS_ASSERT(it != tempMemory_.end());

+    return it->second->getSizeAvailable();
 }

 std::map<int, std::map<std::string, std::pair<int, size_t>>>
 StandardGpuResourcesImpl::getMemoryInfo() const {
+    using AT = std::map<std::string, std::pair<int, size_t>>;

+    std::map<int, AT> out;

+    for (auto& entry : allocs_) {
+        AT outDevice;

+        for (auto& a : entry.second) {
+            auto& v = outDevice[allocTypeToString(a.second.type)];
+            v.first++;
+            v.second += a.second.size;
+        }

+        out[entry.first] = std::move(outDevice);
+    }

+    return out;
 }

 //
@@ -536,70 +559,58 @@ StandardGpuResourcesImpl::getMemoryInfo() const {
 //

 StandardGpuResources::StandardGpuResources()
-}
+        : res_(new StandardGpuResourcesImpl) {}

-StandardGpuResources::~StandardGpuResources() {
-}
+StandardGpuResources::~StandardGpuResources() {}

-  return res_;
+std::shared_ptr<GpuResources> StandardGpuResources::getResources() {
+    return res_;
 }

-  res_->noTempMemory();
+void StandardGpuResources::noTempMemory() {
+    res_->noTempMemory();
 }

-  res_->setTempMemory(size);
+void StandardGpuResources::setTempMemory(size_t size) {
+    res_->setTempMemory(size);
 }

-  res_->setPinnedMemory(size);
+void StandardGpuResources::setPinnedMemory(size_t size) {
+    res_->setPinnedMemory(size);
 }

-  res_->setDefaultStream(device, stream);
+void StandardGpuResources::setDefaultStream(int device, cudaStream_t stream) {
+    res_->setDefaultStream(device, stream);
 }

-  res_->revertDefaultStream(device);
+void StandardGpuResources::revertDefaultStream(int device) {
+    res_->revertDefaultStream(device);
 }

-  res_->setDefaultNullStreamAllDevices();
+void StandardGpuResources::setDefaultNullStreamAllDevices() {
+    res_->setDefaultNullStreamAllDevices();
 }

 std::map<int, std::map<std::string, std::pair<int, size_t>>>
 StandardGpuResources::getMemoryInfo() const {
+    return res_->getMemoryInfo();
 }

-  return res_->getDefaultStream(device);
+cudaStream_t StandardGpuResources::getDefaultStream(int device) {
+    return res_->getDefaultStream(device);
 }

-  return res_->getTempMemoryAvailable(device);
+size_t StandardGpuResources::getTempMemoryAvailable(int device) const {
+    return res_->getTempMemoryAvailable(device);
 }

-  res_->syncDefaultStreamCurrentDevice();
+void StandardGpuResources::syncDefaultStreamCurrentDevice() {
+    res_->syncDefaultStreamCurrentDevice();
 }

-  res_->setLogMemoryAllocations(enable);
+void StandardGpuResources::setLogMemoryAllocations(bool enable) {
+    res_->setLogMemoryAllocations(enable);
 }

-}
+} // namespace gpu
+} // namespace faiss
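As a usage-level counterpart to the implementation above, the short sketch below (not part of the packaged sources; the 256 MiB figure is just an example) shows the StandardGpuResources knobs whose behaviour the diffed code defines.

// Sketch only: configuring the temporary-memory and logging behaviour
// implemented by StandardGpuResourcesImpl above.
#include <faiss/gpu/StandardGpuResources.h>

int main() {
    faiss::gpu::StandardGpuResources res;

    // Request a 256 MiB temporary memory stack; larger requests are clamped
    // per GPU by getDefaultTempMemForGPU (512 MiB / 1 GiB / 1.5 GiB tiers).
    res.setTempMemory((size_t)256 * 1024 * 1024);

    // Or disable the up-front temporary stack entirely:
    // res.noTempMemory();

    // Print each allocation and deallocation handled by the resource object.
    res.setLogMemoryAllocations(true);
    return 0;
}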