RubyGems - faiss - Versions diffs - 0.3.0 → 0.3.2 - Mend

faiss 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (216) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +9 -0
data/LICENSE.txt +1 -1
data/README.md +1 -1
data/ext/faiss/extconf.rb +9 -2
data/ext/faiss/index.cpp +1 -1
data/ext/faiss/index_binary.cpp +2 -2
data/ext/faiss/product_quantizer.cpp +1 -1
data/lib/faiss/version.rb +1 -1
data/vendor/faiss/faiss/AutoTune.cpp +7 -7
data/vendor/faiss/faiss/AutoTune.h +1 -2
data/vendor/faiss/faiss/Clustering.cpp +39 -22
data/vendor/faiss/faiss/Clustering.h +40 -21
data/vendor/faiss/faiss/IVFlib.cpp +26 -12
data/vendor/faiss/faiss/Index.cpp +1 -1
data/vendor/faiss/faiss/Index.h +40 -10
data/vendor/faiss/faiss/Index2Layer.cpp +7 -7
data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +176 -166
data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +15 -15
data/vendor/faiss/faiss/IndexBinary.cpp +9 -4
data/vendor/faiss/faiss/IndexBinary.h +8 -19
data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +2 -1
data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +24 -31
data/vendor/faiss/faiss/IndexBinaryHNSW.h +1 -1
data/vendor/faiss/faiss/IndexBinaryHash.cpp +25 -50
data/vendor/faiss/faiss/IndexBinaryIVF.cpp +107 -188
data/vendor/faiss/faiss/IndexFastScan.cpp +95 -146
data/vendor/faiss/faiss/IndexFastScan.h +9 -8
data/vendor/faiss/faiss/IndexFlat.cpp +206 -10
data/vendor/faiss/faiss/IndexFlat.h +20 -1
data/vendor/faiss/faiss/IndexFlatCodes.cpp +170 -5
data/vendor/faiss/faiss/IndexFlatCodes.h +23 -4
data/vendor/faiss/faiss/IndexHNSW.cpp +231 -382
data/vendor/faiss/faiss/IndexHNSW.h +62 -49
data/vendor/faiss/faiss/IndexIDMap.cpp +69 -28
data/vendor/faiss/faiss/IndexIDMap.h +24 -2
data/vendor/faiss/faiss/IndexIVF.cpp +162 -56
data/vendor/faiss/faiss/IndexIVF.h +46 -6
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +33 -26
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +6 -2
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +19 -46
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -3
data/vendor/faiss/faiss/IndexIVFFastScan.cpp +502 -401
data/vendor/faiss/faiss/IndexIVFFastScan.h +63 -26
data/vendor/faiss/faiss/IndexIVFFlat.cpp +15 -5
data/vendor/faiss/faiss/IndexIVFFlat.h +3 -2
data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +172 -0
data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.h +56 -0
data/vendor/faiss/faiss/IndexIVFPQ.cpp +79 -125
data/vendor/faiss/faiss/IndexIVFPQ.h +6 -7
data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +39 -52
data/vendor/faiss/faiss/IndexIVFPQFastScan.h +4 -3
data/vendor/faiss/faiss/IndexIVFPQR.cpp +45 -29
data/vendor/faiss/faiss/IndexIVFPQR.h +5 -2
data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +25 -27
data/vendor/faiss/faiss/IndexIVFSpectralHash.h +6 -6
data/vendor/faiss/faiss/IndexLSH.cpp +14 -16
data/vendor/faiss/faiss/IndexLattice.cpp +1 -19
data/vendor/faiss/faiss/IndexLattice.h +3 -22
data/vendor/faiss/faiss/IndexNNDescent.cpp +3 -33
data/vendor/faiss/faiss/IndexNNDescent.h +1 -1
data/vendor/faiss/faiss/IndexNSG.cpp +11 -27
data/vendor/faiss/faiss/IndexNSG.h +11 -11
data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +56 -0
data/vendor/faiss/faiss/IndexNeuralNetCodec.h +49 -0
data/vendor/faiss/faiss/IndexPQ.cpp +72 -88
data/vendor/faiss/faiss/IndexPQ.h +1 -4
data/vendor/faiss/faiss/IndexPQFastScan.cpp +1 -1
data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -31
data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
data/vendor/faiss/faiss/IndexRefine.cpp +54 -24
data/vendor/faiss/faiss/IndexRefine.h +7 -0
data/vendor/faiss/faiss/IndexReplicas.cpp +23 -26
data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +25 -17
data/vendor/faiss/faiss/IndexScalarQuantizer.h +6 -4
data/vendor/faiss/faiss/IndexShards.cpp +21 -29
data/vendor/faiss/faiss/IndexShardsIVF.cpp +1 -2
data/vendor/faiss/faiss/MatrixStats.cpp +17 -32
data/vendor/faiss/faiss/MatrixStats.h +21 -9
data/vendor/faiss/faiss/MetaIndexes.cpp +35 -35
data/vendor/faiss/faiss/MetricType.h +7 -2
data/vendor/faiss/faiss/VectorTransform.cpp +13 -26
data/vendor/faiss/faiss/VectorTransform.h +7 -7
data/vendor/faiss/faiss/clone_index.cpp +15 -10
data/vendor/faiss/faiss/clone_index.h +3 -0
data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +95 -17
data/vendor/faiss/faiss/cppcontrib/factory_tools.cpp +152 -0
data/vendor/faiss/faiss/cppcontrib/factory_tools.h +24 -0
data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +83 -30
data/vendor/faiss/faiss/gpu/GpuCloner.cpp +123 -8
data/vendor/faiss/faiss/gpu/GpuCloner.h +22 -0
data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +13 -0
data/vendor/faiss/faiss/gpu/GpuDistance.h +46 -38
data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -1
data/vendor/faiss/faiss/gpu/GpuIndex.h +30 -12
data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +282 -0
data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +4 -4
data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +14 -9
data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +20 -3
data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -11
data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +1 -3
data/vendor/faiss/faiss/gpu/GpuResources.cpp +24 -3
data/vendor/faiss/faiss/gpu/GpuResources.h +39 -11
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +142 -17
data/vendor/faiss/faiss/gpu/StandardGpuResources.h +57 -3
data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +26 -21
data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +7 -1
data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +8 -5
data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +25 -0
data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +129 -9
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +332 -40
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +299 -208
data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +1 -0
data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +1 -1
data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +6 -0
data/vendor/faiss/faiss/gpu/utils/RaftUtils.h +75 -0
data/vendor/faiss/faiss/gpu/utils/Timer.cpp +4 -1
data/vendor/faiss/faiss/gpu/utils/Timer.h +1 -1
data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +3 -1
data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +5 -5
data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +26 -1
data/vendor/faiss/faiss/impl/AuxIndexStructures.h +10 -3
data/vendor/faiss/faiss/impl/DistanceComputer.h +70 -1
data/vendor/faiss/faiss/impl/FaissAssert.h +4 -2
data/vendor/faiss/faiss/impl/FaissException.h +13 -34
data/vendor/faiss/faiss/impl/HNSW.cpp +605 -186
data/vendor/faiss/faiss/impl/HNSW.h +52 -30
data/vendor/faiss/faiss/impl/IDSelector.h +4 -4
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +11 -9
data/vendor/faiss/faiss/impl/LookupTableScaler.h +34 -0
data/vendor/faiss/faiss/impl/NNDescent.cpp +42 -27
data/vendor/faiss/faiss/impl/NSG.cpp +0 -29
data/vendor/faiss/faiss/impl/NSG.h +1 -1
data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +14 -12
data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +1 -1
data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +25 -22
data/vendor/faiss/faiss/impl/ProductQuantizer.h +6 -2
data/vendor/faiss/faiss/impl/Quantizer.h +1 -1
data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +27 -1015
data/vendor/faiss/faiss/impl/ResidualQuantizer.h +5 -63
data/vendor/faiss/faiss/impl/ResultHandler.h +347 -172
data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +1104 -147
data/vendor/faiss/faiss/impl/ScalarQuantizer.h +3 -8
data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +285 -42
data/vendor/faiss/faiss/impl/code_distance/code_distance-avx512.h +248 -0
data/vendor/faiss/faiss/impl/code_distance/code_distance-generic.h +21 -14
data/vendor/faiss/faiss/impl/code_distance/code_distance.h +22 -12
data/vendor/faiss/faiss/impl/index_read.cpp +74 -34
data/vendor/faiss/faiss/impl/index_read_utils.h +37 -0
data/vendor/faiss/faiss/impl/index_write.cpp +88 -51
data/vendor/faiss/faiss/impl/io.cpp +23 -15
data/vendor/faiss/faiss/impl/io.h +4 -4
data/vendor/faiss/faiss/impl/io_macros.h +6 -0
data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -1
data/vendor/faiss/faiss/impl/platform_macros.h +40 -1
data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +14 -0
data/vendor/faiss/faiss/impl/pq4_fast_scan.h +7 -6
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +52 -38
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +487 -49
data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +960 -0
data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +176 -0
data/vendor/faiss/faiss/impl/simd_result_handlers.h +481 -225
data/vendor/faiss/faiss/index_factory.cpp +41 -20
data/vendor/faiss/faiss/index_io.h +12 -5
data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +28 -8
data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +3 -0
data/vendor/faiss/faiss/invlists/DirectMap.cpp +10 -2
data/vendor/faiss/faiss/invlists/InvertedLists.cpp +73 -17
data/vendor/faiss/faiss/invlists/InvertedLists.h +26 -8
data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +24 -9
data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +2 -1
data/vendor/faiss/faiss/python/python_callbacks.cpp +4 -4
data/vendor/faiss/faiss/utils/Heap.cpp +3 -1
data/vendor/faiss/faiss/utils/Heap.h +105 -0
data/vendor/faiss/faiss/utils/NeuralNet.cpp +342 -0
data/vendor/faiss/faiss/utils/NeuralNet.h +147 -0
data/vendor/faiss/faiss/utils/WorkerThread.h +1 -0
data/vendor/faiss/faiss/utils/bf16.h +36 -0
data/vendor/faiss/faiss/utils/distances.cpp +147 -123
data/vendor/faiss/faiss/utils/distances.h +86 -9
data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +5 -5
data/vendor/faiss/faiss/utils/distances_fused/avx512.h +2 -2
data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +2 -2
data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +1 -1
data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +5 -5
data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.h +1 -1
data/vendor/faiss/faiss/utils/distances_simd.cpp +1589 -243
data/vendor/faiss/faiss/utils/extra_distances-inl.h +70 -0
data/vendor/faiss/faiss/utils/extra_distances.cpp +85 -137
data/vendor/faiss/faiss/utils/extra_distances.h +3 -2
data/vendor/faiss/faiss/utils/fp16-arm.h +29 -0
data/vendor/faiss/faiss/utils/fp16.h +2 -0
data/vendor/faiss/faiss/utils/hamming.cpp +163 -111
data/vendor/faiss/faiss/utils/hamming.h +58 -0
data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +16 -89
data/vendor/faiss/faiss/utils/hamming_distance/common.h +1 -0
data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +19 -88
data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +58 -0
data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +14 -104
data/vendor/faiss/faiss/utils/partitioning.cpp +3 -4
data/vendor/faiss/faiss/utils/prefetch.h +77 -0
data/vendor/faiss/faiss/utils/quantize_lut.cpp +0 -14
data/vendor/faiss/faiss/utils/random.cpp +43 -0
data/vendor/faiss/faiss/utils/random.h +25 -0
data/vendor/faiss/faiss/utils/simdlib.h +10 -1
data/vendor/faiss/faiss/utils/simdlib_avx2.h +0 -6
data/vendor/faiss/faiss/utils/simdlib_avx512.h +296 -0
data/vendor/faiss/faiss/utils/simdlib_neon.h +77 -79
data/vendor/faiss/faiss/utils/simdlib_ppc64.h +1084 -0
data/vendor/faiss/faiss/utils/sorting.cpp +140 -5
data/vendor/faiss/faiss/utils/sorting.h +27 -0
data/vendor/faiss/faiss/utils/transpose/transpose-avx512-inl.h +176 -0
data/vendor/faiss/faiss/utils/utils.cpp +120 -7
data/vendor/faiss/faiss/utils/utils.h +60 -20
metadata +23 -4
data/vendor/faiss/faiss/impl/code_distance/code_distance_avx512.h +0 -102

data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp CHANGED Viewed

@@ -4,6 +4,29 @@
  * This source code is licensed under the MIT license found in the
  * LICENSE file in the root directory of this source tree.
  */
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#if defined USE_NVIDIA_RAFT
+#include <raft/core/device_resources.hpp>
+#include <rmm/mr/device/managed_memory_resource.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/mr/host/pinned_memory_resource.hpp>
+#include <memory>
+#endif
 #include <faiss/gpu/StandardGpuResources.h>
 #include <faiss/gpu/utils/DeviceUtils.h>
@@ -66,7 +89,12 @@ std::string allocsToString(const std::unordered_map<void*, AllocRequest>& map) {
 //
 StandardGpuResourcesImpl::StandardGpuResourcesImpl()
-        : pinnedMemAlloc_(nullptr),
+        :
+#if defined USE_NVIDIA_RAFT
+          mmr_(new rmm::mr::managed_memory_resource),
+          pmr_(new rmm::mr::pinned_memory_resource),
+#endif
+          pinnedMemAlloc_(nullptr),
           pinnedMemAllocSize_(0),
           // let the adjustment function determine the memory size for us by
           // passing in a huge value that will then be adjusted
@@ -74,7 +102,8 @@ StandardGpuResourcesImpl::StandardGpuResourcesImpl()
                   -1,
                   std::numeric_limits<size_t>::max())),
           pinnedMemSize_(kDefaultPinnedMemoryAllocation),
-          allocLogging_(false) {}
+          allocLogging_(false) {
+}
 StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
     // The temporary memory allocator has allocated memory through us, so clean
@@ -129,6 +158,9 @@ StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
     }
     if (pinnedMemAlloc_) {
+#if defined USE_NVIDIA_RAFT
+        pmr_->deallocate(pinnedMemAlloc_, pinnedMemAllocSize_);
+#else
         auto err = cudaFreeHost(pinnedMemAlloc_);
         FAISS_ASSERT_FMT(
                 err == cudaSuccess,
@@ -136,6 +168,7 @@ StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
                 pinnedMemAlloc_,
                 (int)err,
                 cudaGetErrorString(err));
+#endif
     }
 }
@@ -187,11 +220,11 @@ void StandardGpuResourcesImpl::setTempMemory(size_t size) {
             p.second.reset();
             // Allocate new
-            p.second = std::unique_ptr<StackDeviceMemory>(new StackDeviceMemory(
+            p.second = std::make_unique<StackDeviceMemory>(
                     this,
                     p.first,
                     // adjust for this specific device
-                    getDefaultTempMemForGPU(device, tempMemSize_)));
+                    getDefaultTempMemForGPU(device, tempMemSize_));
         }
     }
 }
@@ -224,6 +257,14 @@ void StandardGpuResourcesImpl::setDefaultStream(
         if (prevStream != stream) {
             streamWait({stream}, {prevStream});
         }
+#if defined USE_NVIDIA_RAFT
+        // delete the raft handle for this device, which will be initialized
+        // with the updated stream during any subsequent calls to getRaftHandle
+        auto it2 = raftHandles_.find(device);
+        if (it2 != raftHandles_.end()) {
+            raftHandles_.erase(it2);
+        }
+#endif
     }
     userDefaultStreams_[device] = stream;
@@ -242,6 +283,14 @@ void StandardGpuResourcesImpl::revertDefaultStream(int device) {
             streamWait({newStream}, {prevStream});
         }
+#if defined USE_NVIDIA_RAFT
+        // delete the raft handle for this device, which will be initialized
+        // with the updated stream during any subsequent calls to getRaftHandle
+        auto it2 = raftHandles_.find(device);
+        if (it2 != raftHandles_.end()) {
+            raftHandles_.erase(it2);
+        }
+#endif
     }
     userDefaultStreams_.erase(device);
@@ -274,6 +323,19 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
     // If this is the first device that we're initializing, create our
     // pinned memory allocation
     if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
+#if defined USE_NVIDIA_RAFT
+        // If this is the first device that we're initializing, create our
+        // pinned memory allocation
+        if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
+            try {
+                pinnedMemAlloc_ = pmr_->allocate(pinnedMemSize_);
+            } catch (const std::bad_alloc& rmm_ex) {
+                FAISS_THROW_MSG("CUDA memory allocation error");
+            }
+            pinnedMemAllocSize_ = pinnedMemSize_;
+        }
+#else
         auto err = cudaHostAlloc(
                 &pinnedMemAlloc_, pinnedMemSize_, cudaHostAllocDefault);
@@ -286,6 +348,7 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
                 cudaGetErrorString(err));
         pinnedMemAllocSize_ = pinnedMemSize_;
+#endif
     }
     // Make sure that device properties for all devices are cached
@@ -300,19 +363,32 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
             prop.major,
             prop.minor);
+#if USE_AMD_ROCM
+    // Our code is pre-built with and expects warpSize == 32 or 64, validate
+    // that
+    FAISS_ASSERT_FMT(
+            prop.warpSize == 32 || prop.warpSize == 64,
+            "Device id %d does not have expected warpSize of 32 or 64",
+            device);
+#else
     // Our code is pre-built with and expects warpSize == 32, validate that
     FAISS_ASSERT_FMT(
             prop.warpSize == 32,
             "Device id %d does not have expected warpSize of 32",
             device);
+#endif
     // Create streams
-    cudaStream_t defaultStream = 0;
+    cudaStream_t defaultStream = nullptr;
     CUDA_VERIFY(
             cudaStreamCreateWithFlags(&defaultStream, cudaStreamNonBlocking));
     defaultStreams_[device] = defaultStream;
+#if defined USE_NVIDIA_RAFT
+    raftHandles_.emplace(std::make_pair(device, defaultStream));
+#endif
     cudaStream_t asyncCopyStream = 0;
     CUDA_VERIFY(
             cudaStreamCreateWithFlags(&asyncCopyStream, cudaStreamNonBlocking));
@@ -321,7 +397,7 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
     std::vector<cudaStream_t> deviceStreams;
     for (int j = 0; j < kNumStreams; ++j) {
-        cudaStream_t stream = 0;
+        cudaStream_t stream = nullptr;
         CUDA_VERIFY(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
         deviceStreams.push_back(stream);
@@ -330,7 +406,7 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
     alternateStreams_[device] = std::move(deviceStreams);
     // Create cuBLAS handle
-    cublasHandle_t blasHandle = 0;
+    cublasHandle_t blasHandle = nullptr;
     auto blasStatus = cublasCreate(&blasHandle);
     FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
     blasHandles_[device] = blasHandle;
@@ -348,11 +424,11 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
     allocs_[device] = std::unordered_map<void*, AllocRequest>();
     FAISS_ASSERT(tempMemory_.count(device) == 0);
-    auto mem = std::unique_ptr<StackDeviceMemory>(new StackDeviceMemory(
+    auto mem = std::make_unique<StackDeviceMemory>(
             this,
             device,
             // adjust for this specific device
-            getDefaultTempMemForGPU(device, tempMemSize_)));
+            getDefaultTempMemForGPU(device, tempMemSize_));
     tempMemory_.emplace(device, std::move(mem));
 }
@@ -375,6 +451,25 @@ cudaStream_t StandardGpuResourcesImpl::getDefaultStream(int device) {
     return defaultStreams_[device];
 }
+#if defined USE_NVIDIA_RAFT
+raft::device_resources& StandardGpuResourcesImpl::getRaftHandle(int device) {
+    initializeForDevice(device);
+    auto it = raftHandles_.find(device);
+    if (it == raftHandles_.end()) {
+        // Make sure we are using the stream the user may have already assigned
+        // to the current GpuResources
+        raftHandles_.emplace(device, getDefaultStream(device));
+        // Initialize cublas handle
+        raftHandles_[device].get_cublas_handle();
+    }
+    // Otherwise, our base default handle
+    return raftHandles_[device];
+}
+#endif
 std::vector<cudaStream_t> StandardGpuResourcesImpl::getAlternateStreams(
         int device) {
     initializeForDevice(device);
@@ -406,8 +501,6 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
     void* p = nullptr;
     if (adjReq.space == MemorySpace::Temporary) {
-        // If we don't have enough space in our temporary memory manager, we
-        // need to allocate this request separately
         auto& tempMem = tempMemory_[adjReq.device];
         if (adjReq.size > tempMem->getSizeAvailable()) {
@@ -428,15 +521,25 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
         // Otherwise, we can handle this locally
         p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size);
     } else if (adjReq.space == MemorySpace::Device) {
+#if defined USE_NVIDIA_RAFT
+        try {
+            rmm::mr::device_memory_resource* current_mr =
+                    rmm::mr::get_per_device_resource(
+                            rmm::cuda_device_id{adjReq.device});
+            p = current_mr->allocate_async(adjReq.size, adjReq.stream);
+            adjReq.mr = current_mr;
+        } catch (const std::bad_alloc& rmm_ex) {
+            FAISS_THROW_MSG("CUDA memory allocation error");
+        }
+#else
         auto err = cudaMalloc(&p, adjReq.size);
         // Throw if we fail to allocate
         if (err != cudaSuccess) {
             // FIXME: as of CUDA 11, a memory allocation error appears to be
-            // presented via cudaGetLastError as well, and needs to be cleared.
-            // Just call the function to clear it
+            // presented via cudaGetLastError as well, and needs to be
+            // cleared. Just call the function to clear it
             cudaGetLastError();
             std::stringstream ss;
@@ -451,7 +554,20 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
             FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
         }
+#endif
     } else if (adjReq.space == MemorySpace::Unified) {
+#if defined USE_NVIDIA_RAFT
+        try {
+            // for now, use our own managed MR to do Unified Memory allocations.
+            // TODO: change this to use the current device resource once RMM has
+            // a way to retrieve a "guaranteed" managed memory resource for a
+            // device.
+            p = mmr_->allocate_async(adjReq.size, adjReq.stream);
+            adjReq.mr = mmr_.get();
+        } catch (const std::bad_alloc& rmm_ex) {
+            FAISS_THROW_MSG("CUDA memory allocation error");
+        }
+#else
         auto err = cudaMallocManaged(&p, adjReq.size);
         if (err != cudaSuccess) {
@@ -472,6 +588,7 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
             FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
         }
+#endif
     } else {
         FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)adjReq.space);
     }
@@ -505,10 +622,12 @@ void StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
     if (req.space == MemorySpace::Temporary) {
         tempMemory_[device]->deallocMemory(device, req.stream, req.size, p);
     } else if (
             req.space == MemorySpace::Device ||
             req.space == MemorySpace::Unified) {
+#if defined USE_NVIDIA_RAFT
+        req.mr->deallocate_async(p, req.size, req.stream);
+#else
         auto err = cudaFree(p);
         FAISS_ASSERT_FMT(
                 err == cudaSuccess,
@@ -516,7 +635,7 @@ void StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
                 p,
                 (int)err,
                 cudaGetErrorString(err));
+#endif
     } else {
         FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)req.space);
     }
@@ -561,7 +680,7 @@ StandardGpuResourcesImpl::getMemoryInfo() const {
 StandardGpuResources::StandardGpuResources()
         : res_(new StandardGpuResourcesImpl) {}
-StandardGpuResources::~StandardGpuResources() {}
+StandardGpuResources::~StandardGpuResources() = default;
 std::shared_ptr<GpuResources> StandardGpuResources::getResources() {
     return res_;
@@ -600,6 +719,12 @@ cudaStream_t StandardGpuResources::getDefaultStream(int device) {
     return res_->getDefaultStream(device);
 }
+#if defined USE_NVIDIA_RAFT
+raft::device_resources& StandardGpuResources::getRaftHandle(int device) {
+    return res_->getRaftHandle(device);
+}
+#endif
 size_t StandardGpuResources::getTempMemoryAvailable(int device) const {
     return res_->getTempMemoryAvailable(device);
 }

data/vendor/faiss/faiss/gpu/StandardGpuResources.h CHANGED Viewed

@@ -4,9 +4,29 @@
  * This source code is licensed under the MIT license found in the
  * LICENSE file in the root directory of this source tree.
  */
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 #pragma once
+#if defined USE_NVIDIA_RAFT
+#include <raft/core/device_resources.hpp>
+#include <rmm/mr/host/pinned_memory_resource.hpp>
+#endif
 #include <faiss/gpu/GpuResources.h>
 #include <faiss/gpu/utils/DeviceUtils.h>
 #include <faiss/gpu/utils/StackDeviceMemory.h>
@@ -15,6 +35,7 @@
 #include <unordered_map>
 #include <vector>
+#pragma GCC visibility push(default)
 namespace faiss {
 namespace gpu {
@@ -58,6 +79,12 @@ class StandardGpuResourcesImpl : public GpuResources {
     /// this stream upon exit from an index or other Faiss GPU call.
     cudaStream_t getDefaultStream(int device) override;
+#if defined USE_NVIDIA_RAFT
+    /// Returns the raft handle for the given device which can be used to
+    /// make calls to other raft primitives.
+    raft::device_resources& getRaftHandle(int device) override;
+#endif
     /// Called to change the work ordering streams to the null stream
     /// for all devices
     void setDefaultNullStreamAllDevices();
@@ -92,7 +119,7 @@ class StandardGpuResourcesImpl : public GpuResources {
     cudaStream_t getAsyncCopyStream(int device) override;
-   private:
+   protected:
     /// Have GPU resources been initialized for this device yet?
     bool isInitialized(int device) const;
@@ -100,7 +127,7 @@ class StandardGpuResourcesImpl : public GpuResources {
     /// memory size
     static size_t getDefaultTempMemForGPU(int device, size_t requested);
-   private:
+   protected:
     /// Set of currently outstanding memory allocations per device
     /// device -> (alloc request, allocated ptr)
     std::unordered_map<int, std::unordered_map<void*, AllocRequest>> allocs_;
@@ -124,6 +151,27 @@ class StandardGpuResourcesImpl : public GpuResources {
     /// cuBLAS handle for each device
     std::unordered_map<int, cublasHandle_t> blasHandles_;
+#if defined USE_NVIDIA_RAFT
+    /// raft handle for each device
+    std::unordered_map<int, raft::device_resources> raftHandles_;
+    /**
+     * FIXME: Integrating these in a separate code path for now. Ultimately,
+     * it would be nice if we use a simple memory resource abstraction
+     * in FAISS so we could plug in whether to use RMM's memory resources
+     * or the default.
+     *
+     * There's enough duplicated logic that it doesn't *seem* to make sense
+     * to create a subclass only for the RMM memory resources.
+     */
+    // managed_memory_resource
+    std::unique_ptr<rmm::mr::device_memory_resource> mmr_;
+    // pinned_memory_resource
+    std::unique_ptr<rmm::mr::host_memory_resource> pmr_;
+#endif
     /// Pinned memory allocation for use with this GPU
     void* pinnedMemAlloc_;
     size_t pinnedMemAllocSize_;
@@ -183,10 +231,15 @@ class StandardGpuResources : public GpuResourcesProvider {
     /// Export a description of memory used for Python
     std::map<int, std::map<std::string, std::pair<int, size_t>>> getMemoryInfo()
             const;
     /// Returns the current default stream
     cudaStream_t getDefaultStream(int device);
+#if defined USE_NVIDIA_RAFT
+    /// Returns the raft handle for the given device which can be used to
+    /// make calls to other raft primitives.
+    raft::device_resources& getRaftHandle(int device);
+#endif
     /// Returns the current amount of temp memory available
     size_t getTempMemoryAvailable(int device) const;
@@ -203,3 +256,4 @@ class StandardGpuResources : public GpuResourcesProvider {
 } // namespace gpu
 } // namespace faiss
+#pragma GCC visibility pop

data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp CHANGED Viewed

@@ -6,6 +6,7 @@
  */
 #include <faiss/gpu/impl/InterleavedCodes.h>
+#include <faiss/gpu/utils/DeviceUtils.h>
 #include <faiss/gpu/utils/StaticUtils.h>
 #include <faiss/impl/FaissAssert.h>
@@ -166,15 +167,16 @@ void unpackInterleavedWord(
         int numVecs,
         int dims,
         int bitsPerCode) {
-    int wordsPerDimBlock = 32 * bitsPerCode / (8 * sizeof(T));
+    int warpSize = getWarpSizeCurrentDevice();
+    int wordsPerDimBlock = (size_t)warpSize * bitsPerCode / (8 * sizeof(T));
     int wordsPerBlock = wordsPerDimBlock * dims;
-    int numBlocks = utils::divUp(numVecs, 32);
+    int numBlocks = utils::divUp(numVecs, warpSize);
 #pragma omp parallel for
     for (int i = 0; i < numVecs; ++i) {
-        int block = i / 32;
+        int block = i / warpSize;
         FAISS_ASSERT(block < numBlocks);
-        int lane = i % 32;
+        int lane = i % warpSize;
         for (int j = 0; j < dims; ++j) {
             int srcOffset = block * wordsPerBlock + j * wordsPerDimBlock + lane;
@@ -188,9 +190,10 @@ std::vector<uint8_t> unpackInterleaved(
         int numVecs,
         int dims,
         int bitsPerCode) {
-    int bytesPerDimBlock = 32 * bitsPerCode / 8;
+    int warpSize = getWarpSizeCurrentDevice();
+    int bytesPerDimBlock = warpSize * bitsPerCode / 8;
     int bytesPerBlock = bytesPerDimBlock * dims;
-    int numBlocks = utils::divUp(numVecs, 32);
+    int numBlocks = utils::divUp(numVecs, warpSize);
     size_t totalSize = (size_t)bytesPerBlock * numBlocks;
     FAISS_ASSERT(data.size() == totalSize);
@@ -217,8 +220,8 @@ std::vector<uint8_t> unpackInterleaved(
     } else if (bitsPerCode == 4) {
 #pragma omp parallel for
         for (int i = 0; i < numVecs; ++i) {
-            int block = i / 32;
-            int lane = i % 32;
+            int block = i / warpSize;
+            int lane = i % warpSize;
             int word = lane / 2;
             int subWord = lane % 2;
@@ -235,8 +238,8 @@ std::vector<uint8_t> unpackInterleaved(
     } else if (bitsPerCode == 5) {
 #pragma omp parallel for
         for (int i = 0; i < numVecs; ++i) {
-            int block = i / 32;
-            int blockVector = i % 32;
+            int block = i / warpSize;
+            int blockVector = i % warpSize;
             for (int j = 0; j < dims; ++j) {
                 uint8_t* dimBlock =
@@ -257,8 +260,8 @@ std::vector<uint8_t> unpackInterleaved(
     } else if (bitsPerCode == 6) {
 #pragma omp parallel for
         for (int i = 0; i < numVecs; ++i) {
-            int block = i / 32;
-            int blockVector = i % 32;
+            int block = i / warpSize;
+            int blockVector = i % warpSize;
             for (int j = 0; j < dims; ++j) {
                 uint8_t* dimBlock =
@@ -442,17 +445,18 @@ void packInterleavedWord(
         int numVecs,
         int dims,
         int bitsPerCode) {
-    int wordsPerDimBlock = 32 * bitsPerCode / (8 * sizeof(T));
+    int warpSize = getWarpSizeCurrentDevice();
+    int wordsPerDimBlock = (size_t)warpSize * bitsPerCode / (8 * sizeof(T));
     int wordsPerBlock = wordsPerDimBlock * dims;
-    int numBlocks = utils::divUp(numVecs, 32);
+    int numBlocks = utils::divUp(numVecs, warpSize);
     // We're guaranteed that all other slots not filled by the vectors present
     // are initialized to zero (from the vector constructor in packInterleaved)
 #pragma omp parallel for
     for (int i = 0; i < numVecs; ++i) {
-        int block = i / 32;
+        int block = i / warpSize;
         FAISS_ASSERT(block < numBlocks);
-        int lane = i % 32;
+        int lane = i % warpSize;
         for (int j = 0; j < dims; ++j) {
             int dstOffset = block * wordsPerBlock + j * wordsPerDimBlock + lane;
@@ -466,9 +470,10 @@ std::vector<uint8_t> packInterleaved(
         int numVecs,
         int dims,
         int bitsPerCode) {
-    int bytesPerDimBlock = 32 * bitsPerCode / 8;
+    int warpSize = getWarpSizeCurrentDevice();
+    int bytesPerDimBlock = warpSize * bitsPerCode / 8;
     int bytesPerBlock = bytesPerDimBlock * dims;
-    int numBlocks = utils::divUp(numVecs, 32);
+    int numBlocks = utils::divUp(numVecs, warpSize);
     size_t totalSize = (size_t)bytesPerBlock * numBlocks;
     // bit codes padded to whole bytes
@@ -499,7 +504,7 @@ std::vector<uint8_t> packInterleaved(
         for (int i = 0; i < numBlocks; ++i) {
             for (int j = 0; j < dims; ++j) {
                 for (int k = 0; k < bytesPerDimBlock; ++k) {
-                    int loVec = i * 32 + k * 2;
+                    int loVec = i * warpSize + k * 2;
                     int hiVec = loVec + 1;
                     uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;
@@ -516,7 +521,7 @@ std::vector<uint8_t> packInterleaved(
             for (int j = 0; j < dims; ++j) {
                 for (int k = 0; k < bytesPerDimBlock; ++k) {
                     // What input vectors we are pulling from
-                    int loVec = i * 32 + (k * 8) / 5;
+                    int loVec = i * warpSize + (k * 8) / 5;
                     int hiVec = loVec + 1;
                     int hiVec2 = hiVec + 1;
@@ -536,7 +541,7 @@ std::vector<uint8_t> packInterleaved(
             for (int j = 0; j < dims; ++j) {
                 for (int k = 0; k < bytesPerDimBlock; ++k) {
                     // What input vectors we are pulling from
-                    int loVec = i * 32 + (k * 8) / 6;
+                    int loVec = i * warpSize + (k * 8) / 6;
                     int hiVec = loVec + 1;
                     uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;

data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp CHANGED Viewed

@@ -17,6 +17,7 @@
 #include <vector>
 #include <cuda_profiler_api.h>
+#include <faiss/impl/AuxIndexStructures.h>
 DEFINE_int32(num, 10000, "# of vecs");
 DEFINE_int32(k, 100, "# of clusters");
@@ -34,6 +35,7 @@ DEFINE_int64(
         "minimum size to use CPU -> GPU paged copies");
 DEFINE_int64(pinned_mem, -1, "pinned memory allocation to use");
 DEFINE_int32(max_points, -1, "max points per centroid");
+DEFINE_double(timeout, 0, "timeout in seconds");
 using namespace faiss::gpu;
@@ -42,7 +44,7 @@ int main(int argc, char** argv) {
     cudaProfilerStop();
-    auto seed = FLAGS_seed != -1L ? FLAGS_seed : time(nullptr);
+    auto seed = FLAGS_seed != -1 ? FLAGS_seed : time(nullptr);
     printf("using seed %ld\n", seed);
     std::vector<float> vecs((size_t)FLAGS_num * FLAGS_dim);
@@ -99,10 +101,14 @@ int main(int argc, char** argv) {
         cp.max_points_per_centroid = FLAGS_max_points;
     }
+    auto tc = new faiss::TimeoutCallback();
+    faiss::InterruptCallback::instance.reset(tc);
     faiss::Clustering kmeans(FLAGS_dim, FLAGS_k, cp);
     // Time k-means
     {
+        tc->set_timeout(FLAGS_timeout);
         CpuTimer timer;
         kmeans.train(FLAGS_num, vecs.data(), *(gpuIndex.getIndex()));

data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp CHANGED Viewed

@@ -7,6 +7,7 @@
 #include <faiss/gpu/impl/InterleavedCodes.h>
 #include <faiss/gpu/test/TestUtils.h>
+#include <faiss/gpu/utils/DeviceUtils.h>
 #include <faiss/gpu/utils/StaticUtils.h>
 #include <gtest/gtest.h>
 #include <cmath>
@@ -119,8 +120,9 @@ TEST(TestCodePacking, InterleavedCodes_UnpackPack) {
                 std::cout << bitsPerCode << " " << dims << " " << numVecs
                           << "\n";
-                int blocks = utils::divUp(numVecs, 32);
-                int bytesPerDimBlock = 32 * bitsPerCode / 8;
+                int warpSize = getWarpSizeCurrentDevice();
+                int blocks = utils::divUp(numVecs, warpSize);
+                int bytesPerDimBlock = warpSize * bitsPerCode / 8;
                 int bytesPerBlock = bytesPerDimBlock * dims;
                 int size = blocks * bytesPerBlock;
@@ -132,9 +134,9 @@ TEST(TestCodePacking, InterleavedCodes_UnpackPack) {
                     for (int i = 0; i < blocks; ++i) {
                         for (int j = 0; j < dims; ++j) {
-                            for (int k = 0; k < 32; ++k) {
+                            for (int k = 0; k < warpSize; ++k) {
                                 for (int l = 0; l < bytesPerCode; ++l) {
-                                    int vec = i * 32 + k;
+                                    int vec = i * warpSize + k;
                                     if (vec < numVecs) {
                                         data[i * bytesPerBlock +
                                              j * bytesPerDimBlock +
@@ -148,7 +150,8 @@ TEST(TestCodePacking, InterleavedCodes_UnpackPack) {
                     for (int i = 0; i < blocks; ++i) {
                         for (int j = 0; j < dims; ++j) {
                             for (int k = 0; k < bytesPerDimBlock; ++k) {
-                                int loVec = i * 32 + (k * 8) / bitsPerCode;
+                                int loVec =
+                                        i * warpSize + (k * 8) / bitsPerCode;
                                 int hiVec = loVec + 1;
                                 int hiVec2 = hiVec + 1;