faiss 0.2.0 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/LICENSE.txt +1 -1
- data/README.md +7 -7
- data/ext/faiss/extconf.rb +6 -3
- data/ext/faiss/numo.hpp +4 -4
- data/ext/faiss/utils.cpp +1 -1
- data/ext/faiss/utils.h +1 -1
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +292 -291
- data/vendor/faiss/faiss/AutoTune.h +55 -56
- data/vendor/faiss/faiss/Clustering.cpp +365 -194
- data/vendor/faiss/faiss/Clustering.h +102 -35
- data/vendor/faiss/faiss/IVFlib.cpp +171 -195
- data/vendor/faiss/faiss/IVFlib.h +48 -51
- data/vendor/faiss/faiss/Index.cpp +85 -103
- data/vendor/faiss/faiss/Index.h +54 -48
- data/vendor/faiss/faiss/Index2Layer.cpp +126 -224
- data/vendor/faiss/faiss/Index2Layer.h +22 -36
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +407 -0
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +195 -0
- data/vendor/faiss/faiss/IndexBinary.cpp +45 -37
- data/vendor/faiss/faiss/IndexBinary.h +140 -132
- data/vendor/faiss/faiss/IndexBinaryFlat.cpp +73 -53
- data/vendor/faiss/faiss/IndexBinaryFlat.h +29 -24
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +46 -43
- data/vendor/faiss/faiss/IndexBinaryFromFloat.h +16 -15
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +215 -232
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +25 -24
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +182 -177
- data/vendor/faiss/faiss/IndexBinaryHash.h +41 -34
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +489 -461
- data/vendor/faiss/faiss/IndexBinaryIVF.h +97 -68
- data/vendor/faiss/faiss/IndexFlat.cpp +115 -176
- data/vendor/faiss/faiss/IndexFlat.h +42 -59
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +67 -0
- data/vendor/faiss/faiss/IndexFlatCodes.h +47 -0
- data/vendor/faiss/faiss/IndexHNSW.cpp +372 -348
- data/vendor/faiss/faiss/IndexHNSW.h +57 -41
- data/vendor/faiss/faiss/IndexIVF.cpp +545 -453
- data/vendor/faiss/faiss/IndexIVF.h +169 -118
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +316 -0
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +121 -0
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +247 -252
- data/vendor/faiss/faiss/IndexIVFFlat.h +48 -51
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +459 -517
- data/vendor/faiss/faiss/IndexIVFPQ.h +75 -67
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +406 -372
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +82 -57
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +104 -102
- data/vendor/faiss/faiss/IndexIVFPQR.h +33 -28
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +163 -150
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +38 -25
- data/vendor/faiss/faiss/IndexLSH.cpp +66 -113
- data/vendor/faiss/faiss/IndexLSH.h +20 -38
- data/vendor/faiss/faiss/IndexLattice.cpp +42 -56
- data/vendor/faiss/faiss/IndexLattice.h +11 -16
- data/vendor/faiss/faiss/IndexNNDescent.cpp +229 -0
- data/vendor/faiss/faiss/IndexNNDescent.h +72 -0
- data/vendor/faiss/faiss/IndexNSG.cpp +301 -0
- data/vendor/faiss/faiss/IndexNSG.h +85 -0
- data/vendor/faiss/faiss/IndexPQ.cpp +387 -495
- data/vendor/faiss/faiss/IndexPQ.h +64 -82
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +143 -170
- data/vendor/faiss/faiss/IndexPQFastScan.h +46 -32
- data/vendor/faiss/faiss/IndexPreTransform.cpp +120 -150
- data/vendor/faiss/faiss/IndexPreTransform.h +33 -36
- data/vendor/faiss/faiss/IndexRefine.cpp +139 -127
- data/vendor/faiss/faiss/IndexRefine.h +32 -23
- data/vendor/faiss/faiss/IndexReplicas.cpp +147 -153
- data/vendor/faiss/faiss/IndexReplicas.h +62 -56
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +111 -172
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +41 -59
- data/vendor/faiss/faiss/IndexShards.cpp +256 -240
- data/vendor/faiss/faiss/IndexShards.h +85 -73
- data/vendor/faiss/faiss/MatrixStats.cpp +112 -97
- data/vendor/faiss/faiss/MatrixStats.h +7 -10
- data/vendor/faiss/faiss/MetaIndexes.cpp +135 -157
- data/vendor/faiss/faiss/MetaIndexes.h +40 -34
- data/vendor/faiss/faiss/MetricType.h +7 -7
- data/vendor/faiss/faiss/VectorTransform.cpp +654 -475
- data/vendor/faiss/faiss/VectorTransform.h +64 -89
- data/vendor/faiss/faiss/clone_index.cpp +78 -73
- data/vendor/faiss/faiss/clone_index.h +4 -9
- data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +33 -38
- data/vendor/faiss/faiss/gpu/GpuAutoTune.h +11 -9
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +198 -171
- data/vendor/faiss/faiss/gpu/GpuCloner.h +53 -35
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +12 -14
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +27 -25
- data/vendor/faiss/faiss/gpu/GpuDistance.h +116 -112
- data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -2
- data/vendor/faiss/faiss/gpu/GpuIcmEncoder.h +60 -0
- data/vendor/faiss/faiss/gpu/GpuIndex.h +134 -137
- data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +76 -73
- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +173 -162
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +67 -64
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +89 -86
- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +150 -141
- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +101 -103
- data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +17 -16
- data/vendor/faiss/faiss/gpu/GpuResources.cpp +116 -128
- data/vendor/faiss/faiss/gpu/GpuResources.h +182 -186
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +433 -422
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +131 -130
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +468 -456
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +25 -19
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +22 -20
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +9 -8
- data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +39 -44
- data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +16 -14
- data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +77 -71
- data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +109 -88
- data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +75 -64
- data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +230 -215
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +80 -86
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +284 -277
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +416 -416
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +611 -517
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +166 -164
- data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +61 -53
- data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +274 -238
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +73 -57
- data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +47 -50
- data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +79 -72
- data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +140 -146
- data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +69 -71
- data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +21 -16
- data/vendor/faiss/faiss/gpu/utils/Timer.cpp +25 -29
- data/vendor/faiss/faiss/gpu/utils/Timer.h +30 -29
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +503 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +175 -0
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +90 -120
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +81 -65
- data/vendor/faiss/faiss/impl/FaissAssert.h +73 -58
- data/vendor/faiss/faiss/impl/FaissException.cpp +56 -48
- data/vendor/faiss/faiss/impl/FaissException.h +41 -29
- data/vendor/faiss/faiss/impl/HNSW.cpp +606 -617
- data/vendor/faiss/faiss/impl/HNSW.h +179 -200
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +855 -0
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +244 -0
- data/vendor/faiss/faiss/impl/NNDescent.cpp +487 -0
- data/vendor/faiss/faiss/impl/NNDescent.h +154 -0
- data/vendor/faiss/faiss/impl/NSG.cpp +679 -0
- data/vendor/faiss/faiss/impl/NSG.h +199 -0
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +484 -454
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +52 -55
- data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +26 -47
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +469 -459
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +76 -87
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +758 -0
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +188 -0
- data/vendor/faiss/faiss/impl/ResultHandler.h +96 -132
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +647 -707
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +48 -46
- data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +129 -131
- data/vendor/faiss/faiss/impl/ThreadedIndex.h +61 -55
- data/vendor/faiss/faiss/impl/index_read.cpp +631 -480
- data/vendor/faiss/faiss/impl/index_write.cpp +547 -407
- data/vendor/faiss/faiss/impl/io.cpp +76 -95
- data/vendor/faiss/faiss/impl/io.h +31 -41
- data/vendor/faiss/faiss/impl/io_macros.h +60 -29
- data/vendor/faiss/faiss/impl/kmeans1d.cpp +301 -0
- data/vendor/faiss/faiss/impl/kmeans1d.h +48 -0
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +137 -186
- data/vendor/faiss/faiss/impl/lattice_Zn.h +40 -51
- data/vendor/faiss/faiss/impl/platform_macros.h +29 -8
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +77 -124
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +39 -48
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +41 -52
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +80 -117
- data/vendor/faiss/faiss/impl/simd_result_handlers.h +109 -137
- data/vendor/faiss/faiss/index_factory.cpp +619 -397
- data/vendor/faiss/faiss/index_factory.h +8 -6
- data/vendor/faiss/faiss/index_io.h +23 -26
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +67 -75
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +22 -24
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +96 -112
- data/vendor/faiss/faiss/invlists/DirectMap.h +29 -33
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +307 -364
- data/vendor/faiss/faiss/invlists/InvertedLists.h +151 -151
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +29 -34
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +17 -18
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +257 -293
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +50 -45
- data/vendor/faiss/faiss/python/python_callbacks.cpp +23 -26
- data/vendor/faiss/faiss/python/python_callbacks.h +9 -16
- data/vendor/faiss/faiss/utils/AlignedTable.h +79 -44
- data/vendor/faiss/faiss/utils/Heap.cpp +40 -48
- data/vendor/faiss/faiss/utils/Heap.h +186 -209
- data/vendor/faiss/faiss/utils/WorkerThread.cpp +67 -76
- data/vendor/faiss/faiss/utils/WorkerThread.h +32 -33
- data/vendor/faiss/faiss/utils/distances.cpp +305 -312
- data/vendor/faiss/faiss/utils/distances.h +170 -122
- data/vendor/faiss/faiss/utils/distances_simd.cpp +498 -508
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +117 -0
- data/vendor/faiss/faiss/utils/extra_distances.cpp +113 -232
- data/vendor/faiss/faiss/utils/extra_distances.h +30 -29
- data/vendor/faiss/faiss/utils/hamming-inl.h +260 -209
- data/vendor/faiss/faiss/utils/hamming.cpp +375 -469
- data/vendor/faiss/faiss/utils/hamming.h +62 -85
- data/vendor/faiss/faiss/utils/ordered_key_value.h +16 -18
- data/vendor/faiss/faiss/utils/partitioning.cpp +393 -318
- data/vendor/faiss/faiss/utils/partitioning.h +26 -21
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +78 -66
- data/vendor/faiss/faiss/utils/quantize_lut.h +22 -20
- data/vendor/faiss/faiss/utils/random.cpp +39 -63
- data/vendor/faiss/faiss/utils/random.h +13 -16
- data/vendor/faiss/faiss/utils/simdlib.h +4 -2
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +88 -85
- data/vendor/faiss/faiss/utils/simdlib_emulated.h +226 -165
- data/vendor/faiss/faiss/utils/simdlib_neon.h +832 -0
- data/vendor/faiss/faiss/utils/utils.cpp +304 -287
- data/vendor/faiss/faiss/utils/utils.h +54 -49
- metadata +29 -4
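
The newly added vendor sources above (IndexNSG, IndexNNDescent, IndexAdditiveQuantizer, ResidualQuantizer, LocalSearchQuantizer, kmeans1d, simdlib_neon) correspond to an upstream faiss 1.7.x bump of the vendored library. As a hedged illustration only, the sketch below exercises one of the newly vendored index types through the C++ headers listed above; it assumes the upstream faiss 1.7.x API (the `IndexNSGFlat` constructor and `Index::search` signature) and does not show this gem's Ruby surface.

```cpp
// Hedged sketch: exercises the newly vendored graph-based NSG index via the
// C++ headers added above (assumes the upstream faiss 1.7.x API).
#include <faiss/IndexNSG.h>

#include <cstdint>
#include <random>
#include <vector>

int main() {
    int d = 32;        // vector dimensionality
    int64_t nb = 1000; // database size

    std::mt19937 rng(123);
    std::uniform_real_distribution<float> dist(0.0f, 1.0f);
    std::vector<float> xb(nb * d);
    for (auto& v : xb) {
        v = dist(rng);
    }

    // IndexNSGFlat(d, R): NSG graph with out-degree R over flat (uncompressed)
    // vectors; flat storage needs no training phase.
    faiss::IndexNSGFlat index(d, 32);
    index.add(nb, xb.data());

    // Query the first vector against the graph; the k nearest neighbors come
    // back as parallel arrays of distances and ids.
    int64_t k = 5;
    std::vector<float> distances(k);
    std::vector<faiss::Index::idx_t> labels(k);
    index.search(1, xb.data(), k, distances.data(), labels.data());

    return 0;
}
```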
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp, hunks `@@ -5,16 +5,16 @@`, `@@ -22,513 +22,536 @@` and `@@ -536,70 +559,58 @@`. The 0.2.4 side of the changed region reads:

```cpp
#include <faiss/gpu/StandardGpuResources.h>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/gpu/utils/StaticUtils.h>
#include <faiss/impl/FaissAssert.h>
#include <iostream>
#include <limits>
#include <sstream>

namespace faiss {
namespace gpu {

namespace {

// ...
constexpr int kNumStreams = 2;

// Use 256 MiB of pinned memory for async CPU <-> GPU copies by default
constexpr size_t kDefaultPinnedMemoryAllocation = (size_t)256 * 1024 * 1024;

// Default temporary memory allocation for <= 4 GiB memory GPUs
constexpr size_t k4GiBTempMem = (size_t)512 * 1024 * 1024;

// Default temporary memory allocation for <= 8 GiB memory GPUs
constexpr size_t k8GiBTempMem = (size_t)1024 * 1024 * 1024;

// Maximum temporary memory allocation for all GPUs
constexpr size_t kMaxTempMem = (size_t)1536 * 1024 * 1024;

std::string allocsToString(const std::unordered_map<void*, AllocRequest>& map) {
    // Produce a sorted list of all outstanding allocations by type
    std::unordered_map<AllocType, std::pair<int, size_t>> stats;

    for (auto& entry : map) {
        auto& a = entry.second;

        auto it = stats.find(a.type);
        if (it != stats.end()) {
            stats[a.type].first++;
            stats[a.type].second += a.size;
        } else {
            stats[a.type] = std::make_pair(1, a.size);
        }
    }

    std::stringstream ss;
    for (auto& entry : stats) {
        ss << "Alloc type " << allocTypeToString(entry.first) << ": "
           << entry.second.first << " allocations, " << entry.second.second
           << " bytes\n";
    }

    return ss.str();
}

} // namespace

//
// StandardGpuResourcesImpl
//

StandardGpuResourcesImpl::StandardGpuResourcesImpl()
        : pinnedMemAlloc_(nullptr),
          pinnedMemAllocSize_(0),
          // let the adjustment function determine the memory size for us by
          // passing in a huge value that will then be adjusted
          tempMemSize_(getDefaultTempMemForGPU(
                  -1,
                  std::numeric_limits<size_t>::max())),
          pinnedMemSize_(kDefaultPinnedMemoryAllocation),
          allocLogging_(false) {}

StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
    // The temporary memory allocator has allocated memory through us, so clean
    // that up before we finish fully de-initializing ourselves
    tempMemory_.clear();

    // Make sure all allocations have been freed
    bool allocError = false;

    for (auto& entry : allocs_) {
        auto& map = entry.second;

        if (!map.empty()) {
            std::cerr
                    << "StandardGpuResources destroyed with allocations outstanding:\n"
                    << "Device " << entry.first
                    << " outstanding allocations:\n";
            std::cerr << allocsToString(map);
            allocError = true;
        }
    }

    FAISS_ASSERT_MSG(
            !allocError, "GPU memory allocations not properly cleaned up");

    for (auto& entry : defaultStreams_) {
        DeviceScope scope(entry.first);

        // We created these streams, so are responsible for destroying them
        CUDA_VERIFY(cudaStreamDestroy(entry.second));
    }

    for (auto& entry : alternateStreams_) {
        DeviceScope scope(entry.first);

        for (auto stream : entry.second) {
            CUDA_VERIFY(cudaStreamDestroy(stream));
        }
    }

    for (auto& entry : asyncCopyStreams_) {
        DeviceScope scope(entry.first);

        CUDA_VERIFY(cudaStreamDestroy(entry.second));
    }

    for (auto& entry : blasHandles_) {
        DeviceScope scope(entry.first);

        auto blasStatus = cublasDestroy(entry.second);
        FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
    }

    if (pinnedMemAlloc_) {
        auto err = cudaFreeHost(pinnedMemAlloc_);
        FAISS_ASSERT_FMT(
                err == cudaSuccess,
                "Failed to cudaFreeHost pointer %p (error %d %s)",
                pinnedMemAlloc_,
                (int)err,
                cudaGetErrorString(err));
    }
}

size_t StandardGpuResourcesImpl::getDefaultTempMemForGPU(
        int device,
        size_t requested) {
    auto totalMem = device != -1 ? getDeviceProperties(device).totalGlobalMem
                                 : std::numeric_limits<size_t>::max();

    if (totalMem <= (size_t)4 * 1024 * 1024 * 1024) {
        // If the GPU has <= 4 GiB of memory, reserve 512 MiB
        if (requested > k4GiBTempMem) {
            return k4GiBTempMem;
        }
    } else if (totalMem <= (size_t)8 * 1024 * 1024 * 1024) {
        // If the GPU has <= 8 GiB of memory, reserve 1 GiB
        if (requested > k8GiBTempMem) {
            return k8GiBTempMem;
        }
    } else {
        // Never use more than 1.5 GiB
        if (requested > kMaxTempMem) {
            return kMaxTempMem;
        }
    }

    // use whatever lower limit the user requested
    return requested;
}

void StandardGpuResourcesImpl::noTempMemory() {
    setTempMemory(0);
}

void StandardGpuResourcesImpl::setTempMemory(size_t size) {
    if (tempMemSize_ != size) {
        // adjust based on general limits
        tempMemSize_ = getDefaultTempMemForGPU(-1, size);

        // We need to re-initialize memory resources for all current devices
        // that have been initialized. This should be safe to do, even if we are
        // currently running work, because the cudaFree call that this implies
        // will force-synchronize all GPUs with the CPU
        for (auto& p : tempMemory_) {
            int device = p.first;
            // Free the existing memory first
            p.second.reset();

            // Allocate new
            p.second = std::unique_ptr<StackDeviceMemory>(new StackDeviceMemory(
                    this,
                    p.first,
                    // adjust for this specific device
                    getDefaultTempMemForGPU(device, tempMemSize_)));
        }
    }
}

void StandardGpuResourcesImpl::setPinnedMemory(size_t size) {
    // Should not call this after devices have been initialized
    FAISS_ASSERT(defaultStreams_.size() == 0);
    FAISS_ASSERT(!pinnedMemAlloc_);

    pinnedMemSize_ = size;
}

void StandardGpuResourcesImpl::setDefaultStream(
        int device,
        cudaStream_t stream) {
    if (isInitialized(device)) {
        // A new series of calls may not be ordered with what was the previous
        // stream, so if the stream being specified is different, then we need
        // to ensure ordering between the two (new stream waits on old).
        auto it = userDefaultStreams_.find(device);
        cudaStream_t prevStream = nullptr;

        if (it != userDefaultStreams_.end()) {
            prevStream = it->second;
        } else {
            FAISS_ASSERT(defaultStreams_.count(device));
            prevStream = defaultStreams_[device];
        }

        if (prevStream != stream) {
            streamWait({stream}, {prevStream});
        }
    }

    userDefaultStreams_[device] = stream;
}

void StandardGpuResourcesImpl::revertDefaultStream(int device) {
    if (isInitialized(device)) {
        auto it = userDefaultStreams_.find(device);

        if (it != userDefaultStreams_.end()) {
            // There was a user stream set that we need to synchronize against
            cudaStream_t prevStream = userDefaultStreams_[device];

            FAISS_ASSERT(defaultStreams_.count(device));
            cudaStream_t newStream = defaultStreams_[device];

            streamWait({newStream}, {prevStream});
        }
    }

    userDefaultStreams_.erase(device);
}

void StandardGpuResourcesImpl::setDefaultNullStreamAllDevices() {
    for (int dev = 0; dev < getNumDevices(); ++dev) {
        setDefaultStream(dev, nullptr);
    }
}

void StandardGpuResourcesImpl::setLogMemoryAllocations(bool enable) {
    allocLogging_ = enable;
}

bool StandardGpuResourcesImpl::isInitialized(int device) const {
    // Use default streams as a marker for whether or not a certain
    // device has been initialized
    return defaultStreams_.count(device) != 0;
}

void StandardGpuResourcesImpl::initializeForDevice(int device) {
    if (isInitialized(device)) {
        return;
    }

    // If this is the first device that we're initializing, create our
    // pinned memory allocation
    if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
        auto err = cudaHostAlloc(
                &pinnedMemAlloc_, pinnedMemSize_, cudaHostAllocDefault);

        FAISS_THROW_IF_NOT_FMT(
                err == cudaSuccess,
                "failed to cudaHostAlloc %zu bytes for CPU <-> GPU "
                "async copy buffer (error %d %s)",
                pinnedMemSize_,
                (int)err,
                cudaGetErrorString(err));

        pinnedMemAllocSize_ = pinnedMemSize_;
    }

    FAISS_ASSERT(device < getNumDevices());
    DeviceScope scope(device);

    // Make sure that device properties for all devices are cached
    auto& prop = getDeviceProperties(device);

    // Also check to make sure we meet our minimum compute capability (3.0)
    FAISS_ASSERT_FMT(
            prop.major >= 3,
            "Device id %d with CC %d.%d not supported, "
            "need 3.0+ compute capability",
            device,
            prop.major,
            prop.minor);

    // Our code is pre-built with and expects warpSize == 32, validate that
    FAISS_ASSERT_FMT(
            prop.warpSize == 32,
            "Device id %d does not have expected warpSize of 32",
            device);

    // Create streams
    cudaStream_t defaultStream = 0;
    CUDA_VERIFY(
            cudaStreamCreateWithFlags(&defaultStream, cudaStreamNonBlocking));

    defaultStreams_[device] = defaultStream;

    cudaStream_t asyncCopyStream = 0;
    CUDA_VERIFY(
            cudaStreamCreateWithFlags(&asyncCopyStream, cudaStreamNonBlocking));

    asyncCopyStreams_[device] = asyncCopyStream;

    std::vector<cudaStream_t> deviceStreams;
    for (int j = 0; j < kNumStreams; ++j) {
        cudaStream_t stream = 0;
        CUDA_VERIFY(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

        deviceStreams.push_back(stream);
    }

    alternateStreams_[device] = std::move(deviceStreams);

    // Create cuBLAS handle
    cublasHandle_t blasHandle = 0;
    auto blasStatus = cublasCreate(&blasHandle);
    FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
    blasHandles_[device] = blasHandle;

    // For CUDA 10 on V100, enabling tensor core usage would enable automatic
    // rounding down of inputs to f16 (though accumulate in f32) which results
    // in unacceptable loss of precision in general. For CUDA 11 / A100, only
    // enable tensor core support if it doesn't result in a loss of precision.
#if CUDA_VERSION >= 11000
    cublasSetMathMode(
            blasHandle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
#endif

    FAISS_ASSERT(allocs_.count(device) == 0);
    allocs_[device] = std::unordered_map<void*, AllocRequest>();

    FAISS_ASSERT(tempMemory_.count(device) == 0);
    auto mem = std::unique_ptr<StackDeviceMemory>(new StackDeviceMemory(
            this,
            device,
            // adjust for this specific device
            getDefaultTempMemForGPU(device, tempMemSize_)));

    tempMemory_.emplace(device, std::move(mem));
}

cublasHandle_t StandardGpuResourcesImpl::getBlasHandle(int device) {
    initializeForDevice(device);
    return blasHandles_[device];
}

cudaStream_t StandardGpuResourcesImpl::getDefaultStream(int device) {
    initializeForDevice(device);

    auto it = userDefaultStreams_.find(device);
    if (it != userDefaultStreams_.end()) {
        // There is a user override stream set
        return it->second;
    }

    // Otherwise, our base default stream
    return defaultStreams_[device];
}

std::vector<cudaStream_t> StandardGpuResourcesImpl::getAlternateStreams(
        int device) {
    initializeForDevice(device);
    return alternateStreams_[device];
}

std::pair<void*, size_t> StandardGpuResourcesImpl::getPinnedMemory() {
    return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
}

cudaStream_t StandardGpuResourcesImpl::getAsyncCopyStream(int device) {
    initializeForDevice(device);
    return asyncCopyStreams_[device];
}

void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
    initializeForDevice(req.device);

    // We don't allocate a placeholder for zero-sized allocations
    if (req.size == 0) {
        return nullptr;
    }

    // cudaMalloc guarantees allocation alignment to 256 bytes; do the same here
    // for alignment purposes (to reduce memory transaction overhead etc)
    auto adjReq = req;
    adjReq.size = utils::roundUp(adjReq.size, (size_t)256);

    void* p = nullptr;

    if (adjReq.space == MemorySpace::Temporary) {
        // If we don't have enough space in our temporary memory manager, we
        // need to allocate this request separately
        auto& tempMem = tempMemory_[adjReq.device];

        if (adjReq.size > tempMem->getSizeAvailable()) {
            // We need to allocate this ourselves
            AllocRequest newReq = adjReq;
            newReq.space = MemorySpace::Device;
            newReq.type = AllocType::TemporaryMemoryOverflow;

            if (allocLogging_) {
                std::cout
                        << "StandardGpuResources: alloc fail "
                        << adjReq.toString()
                        << " (no temp space); retrying as MemorySpace::Device\n";
            }

            return allocMemory(newReq);
        }

        // Otherwise, we can handle this locally
        p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size);

    } else if (adjReq.space == MemorySpace::Device) {
        auto err = cudaMalloc(&p, adjReq.size);

        // Throw if we fail to allocate
        if (err != cudaSuccess) {
            // FIXME: as of CUDA 11, a memory allocation error appears to be
            // presented via cudaGetLastError as well, and needs to be cleared.
            // Just call the function to clear it
            cudaGetLastError();

            std::stringstream ss;
            ss << "StandardGpuResources: alloc fail " << adjReq.toString()
               << " (cudaMalloc error " << cudaGetErrorString(err) << " ["
               << (int)err << "])\n";
            auto str = ss.str();

            if (allocLogging_) {
                std::cout << str;
            }

            FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
        }
    } else if (adjReq.space == MemorySpace::Unified) {
        auto err = cudaMallocManaged(&p, adjReq.size);

        if (err != cudaSuccess) {
            // FIXME: as of CUDA 11, a memory allocation error appears to be
            // presented via cudaGetLastError as well, and needs to be cleared.
            // Just call the function to clear it
            cudaGetLastError();

            std::stringstream ss;
            ss << "StandardGpuResources: alloc fail " << adjReq.toString()
               << " failed (cudaMallocManaged error " << cudaGetErrorString(err)
               << " [" << (int)err << "])\n";
            auto str = ss.str();

            if (allocLogging_) {
                std::cout << str;
            }

            FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
        }
    } else {
        FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)adjReq.space);
    }

    if (allocLogging_) {
        std::cout << "StandardGpuResources: alloc ok " << adjReq.toString()
                  << " ptr 0x" << p << "\n";
    }

    allocs_[adjReq.device][p] = adjReq;

    return p;
}

void StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
    FAISS_ASSERT(isInitialized(device));

    if (!p) {
        return;
    }

    auto& a = allocs_[device];
    auto it = a.find(p);
    FAISS_ASSERT(it != a.end());

    auto& req = it->second;

    if (allocLogging_) {
        std::cout << "StandardGpuResources: dealloc " << req.toString() << "\n";
    }

    if (req.space == MemorySpace::Temporary) {
        tempMemory_[device]->deallocMemory(device, req.stream, req.size, p);

    } else if (
            req.space == MemorySpace::Device ||
            req.space == MemorySpace::Unified) {
        auto err = cudaFree(p);
        FAISS_ASSERT_FMT(
                err == cudaSuccess,
                "Failed to cudaFree pointer %p (error %d %s)",
                p,
                (int)err,
                cudaGetErrorString(err));

    } else {
        FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)req.space);
    }

    a.erase(it);
}

size_t StandardGpuResourcesImpl::getTempMemoryAvailable(int device) const {
    FAISS_ASSERT(isInitialized(device));

    auto it = tempMemory_.find(device);
    FAISS_ASSERT(it != tempMemory_.end());

    return it->second->getSizeAvailable();
}

std::map<int, std::map<std::string, std::pair<int, size_t>>>
StandardGpuResourcesImpl::getMemoryInfo() const {
    using AT = std::map<std::string, std::pair<int, size_t>>;

    std::map<int, AT> out;

    for (auto& entry : allocs_) {
        AT outDevice;

        for (auto& a : entry.second) {
            auto& v = outDevice[allocTypeToString(a.second.type)];
            v.first++;
            v.second += a.second.size;
        }

        out[entry.first] = std::move(outDevice);
    }

    return out;
}

//
// StandardGpuResources
//

StandardGpuResources::StandardGpuResources()
        : res_(new StandardGpuResourcesImpl) {}

StandardGpuResources::~StandardGpuResources() {}

std::shared_ptr<GpuResources> StandardGpuResources::getResources() {
    return res_;
}

void StandardGpuResources::noTempMemory() {
    res_->noTempMemory();
}

void StandardGpuResources::setTempMemory(size_t size) {
    res_->setTempMemory(size);
}

void StandardGpuResources::setPinnedMemory(size_t size) {
    res_->setPinnedMemory(size);
}

void StandardGpuResources::setDefaultStream(int device, cudaStream_t stream) {
    res_->setDefaultStream(device, stream);
}

void StandardGpuResources::revertDefaultStream(int device) {
    res_->revertDefaultStream(device);
}

void StandardGpuResources::setDefaultNullStreamAllDevices() {
    res_->setDefaultNullStreamAllDevices();
}

std::map<int, std::map<std::string, std::pair<int, size_t>>>
StandardGpuResources::getMemoryInfo() const {
    return res_->getMemoryInfo();
}

cudaStream_t StandardGpuResources::getDefaultStream(int device) {
    return res_->getDefaultStream(device);
}

size_t StandardGpuResources::getTempMemoryAvailable(int device) const {
    return res_->getTempMemoryAvailable(device);
}

void StandardGpuResources::syncDefaultStreamCurrentDevice() {
    res_->syncDefaultStreamCurrentDevice();
}

void StandardGpuResources::setLogMemoryAllocations(bool enable) {
    res_->setLogMemoryAllocations(enable);
}

} // namespace gpu
} // namespace faiss
```