faiss 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/lib/faiss/version.rb +1 -1
  4. data/vendor/faiss/faiss/AutoTune.h +1 -1
  5. data/vendor/faiss/faiss/Clustering.cpp +35 -4
  6. data/vendor/faiss/faiss/Clustering.h +10 -1
  7. data/vendor/faiss/faiss/IVFlib.cpp +4 -1
  8. data/vendor/faiss/faiss/Index.h +21 -6
  9. data/vendor/faiss/faiss/IndexBinaryHNSW.h +1 -1
  10. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +1 -1
  11. data/vendor/faiss/faiss/IndexFastScan.cpp +22 -4
  12. data/vendor/faiss/faiss/IndexFlat.cpp +11 -7
  13. data/vendor/faiss/faiss/IndexFlatCodes.cpp +159 -5
  14. data/vendor/faiss/faiss/IndexFlatCodes.h +20 -3
  15. data/vendor/faiss/faiss/IndexHNSW.cpp +143 -90
  16. data/vendor/faiss/faiss/IndexHNSW.h +52 -3
  17. data/vendor/faiss/faiss/IndexIVF.cpp +3 -3
  18. data/vendor/faiss/faiss/IndexIVF.h +9 -1
  19. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +15 -0
  20. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +3 -0
  21. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +130 -57
  22. data/vendor/faiss/faiss/IndexIVFFastScan.h +14 -7
  23. data/vendor/faiss/faiss/IndexIVFPQ.cpp +1 -3
  24. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +21 -2
  25. data/vendor/faiss/faiss/IndexLattice.cpp +1 -19
  26. data/vendor/faiss/faiss/IndexLattice.h +3 -22
  27. data/vendor/faiss/faiss/IndexNNDescent.cpp +0 -29
  28. data/vendor/faiss/faiss/IndexNNDescent.h +1 -1
  29. data/vendor/faiss/faiss/IndexNSG.h +1 -1
  30. data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +56 -0
  31. data/vendor/faiss/faiss/IndexNeuralNetCodec.h +49 -0
  32. data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
  33. data/vendor/faiss/faiss/IndexRefine.cpp +5 -5
  34. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +3 -1
  35. data/vendor/faiss/faiss/MetricType.h +7 -2
  36. data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +95 -17
  37. data/vendor/faiss/faiss/cppcontrib/factory_tools.cpp +152 -0
  38. data/vendor/faiss/faiss/cppcontrib/factory_tools.h +24 -0
  39. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +83 -30
  40. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +36 -4
  41. data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +6 -0
  42. data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -1
  43. data/vendor/faiss/faiss/gpu/GpuIndex.h +2 -8
  44. data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +282 -0
  45. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +6 -0
  46. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +2 -0
  47. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +25 -0
  48. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +26 -21
  49. data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +6 -0
  50. data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +8 -5
  51. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +65 -0
  52. data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +1 -1
  53. data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +6 -0
  54. data/vendor/faiss/faiss/gpu/utils/Timer.cpp +4 -1
  55. data/vendor/faiss/faiss/gpu/utils/Timer.h +1 -1
  56. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +25 -0
  57. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +9 -1
  58. data/vendor/faiss/faiss/impl/DistanceComputer.h +46 -0
  59. data/vendor/faiss/faiss/impl/FaissAssert.h +4 -2
  60. data/vendor/faiss/faiss/impl/HNSW.cpp +358 -190
  61. data/vendor/faiss/faiss/impl/HNSW.h +43 -22
  62. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +8 -8
  63. data/vendor/faiss/faiss/impl/LookupTableScaler.h +34 -0
  64. data/vendor/faiss/faiss/impl/NNDescent.cpp +13 -8
  65. data/vendor/faiss/faiss/impl/NSG.cpp +0 -29
  66. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +1 -0
  67. data/vendor/faiss/faiss/impl/ProductQuantizer.h +5 -1
  68. data/vendor/faiss/faiss/impl/ResultHandler.h +151 -32
  69. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +719 -102
  70. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +3 -0
  71. data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +5 -0
  72. data/vendor/faiss/faiss/impl/code_distance/code_distance-avx512.h +248 -0
  73. data/vendor/faiss/faiss/impl/index_read.cpp +29 -15
  74. data/vendor/faiss/faiss/impl/index_read_utils.h +37 -0
  75. data/vendor/faiss/faiss/impl/index_write.cpp +28 -10
  76. data/vendor/faiss/faiss/impl/io.cpp +13 -5
  77. data/vendor/faiss/faiss/impl/io.h +4 -4
  78. data/vendor/faiss/faiss/impl/io_macros.h +6 -0
  79. data/vendor/faiss/faiss/impl/platform_macros.h +22 -0
  80. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +11 -0
  81. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +1 -1
  82. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +448 -1
  83. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +5 -5
  84. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +1 -1
  85. data/vendor/faiss/faiss/impl/simd_result_handlers.h +143 -59
  86. data/vendor/faiss/faiss/index_factory.cpp +31 -13
  87. data/vendor/faiss/faiss/index_io.h +12 -5
  88. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +28 -8
  89. data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +3 -0
  90. data/vendor/faiss/faiss/invlists/DirectMap.cpp +9 -1
  91. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +55 -17
  92. data/vendor/faiss/faiss/invlists/InvertedLists.h +18 -9
  93. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +21 -6
  94. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +2 -1
  95. data/vendor/faiss/faiss/python/python_callbacks.cpp +3 -3
  96. data/vendor/faiss/faiss/utils/Heap.h +105 -0
  97. data/vendor/faiss/faiss/utils/NeuralNet.cpp +342 -0
  98. data/vendor/faiss/faiss/utils/NeuralNet.h +147 -0
  99. data/vendor/faiss/faiss/utils/bf16.h +36 -0
  100. data/vendor/faiss/faiss/utils/distances.cpp +58 -88
  101. data/vendor/faiss/faiss/utils/distances.h +5 -5
  102. data/vendor/faiss/faiss/utils/distances_simd.cpp +997 -9
  103. data/vendor/faiss/faiss/utils/extra_distances-inl.h +70 -0
  104. data/vendor/faiss/faiss/utils/extra_distances.cpp +85 -137
  105. data/vendor/faiss/faiss/utils/extra_distances.h +3 -2
  106. data/vendor/faiss/faiss/utils/hamming.cpp +1 -1
  107. data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +4 -1
  108. data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +2 -1
  109. data/vendor/faiss/faiss/utils/random.cpp +43 -0
  110. data/vendor/faiss/faiss/utils/random.h +25 -0
  111. data/vendor/faiss/faiss/utils/simdlib.h +10 -1
  112. data/vendor/faiss/faiss/utils/simdlib_avx512.h +296 -0
  113. data/vendor/faiss/faiss/utils/simdlib_neon.h +5 -2
  114. data/vendor/faiss/faiss/utils/simdlib_ppc64.h +1084 -0
  115. data/vendor/faiss/faiss/utils/transpose/transpose-avx512-inl.h +176 -0
  116. data/vendor/faiss/faiss/utils/utils.cpp +10 -3
  117. data/vendor/faiss/faiss/utils/utils.h +3 -0
  118. metadata +16 -4
  119. data/vendor/faiss/faiss/impl/code_distance/code_distance_avx512.h +0 -102
data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h
@@ -12,10 +12,19 @@
 #include <cstdint>
 
 #include <faiss/cppcontrib/detail/CoarseBitType.h>
+#include <faiss/impl/platform_macros.h>
 
 namespace faiss {
 namespace cppcontrib {
 
+bool isBigEndian() {
+#ifdef FAISS_BIG_ENDIAN
+    return true;
+#else
+    return false;
+#endif
+}
+
 ////////////////////////////////////////////////////////////////////////////////////
 /// Index2LevelDecoder
 ////////////////////////////////////////////////////////////////////////////////////
@@ -72,9 +81,14 @@ struct Index2LevelDecoder {
         const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
         const intptr_t fineCentroidIdx = i / FINE_SIZE;
         const intptr_t fineCentroidOffset = i % FINE_SIZE;
-
-        const intptr_t coarseCode = coarse[coarseCentroidIdx];
-        const intptr_t fineCode = fine[fineCentroidIdx];
+        intptr_t coarseCode, fineCode;
+        if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+            coarseCode = Swap2Bytes(coarse[coarseCentroidIdx]);
+            fineCode = Swap2Bytes(fine[fineCentroidIdx]);
+        } else {
+            coarseCode = coarse[coarseCentroidIdx];
+            fineCode = fine[fineCentroidIdx];
+        }
 
         const float* const __restrict coarsePtr = pqCoarseCentroids +
                 (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode) *
@@ -112,9 +126,14 @@ struct Index2LevelDecoder {
         const intptr_t fineCentroidIdx = i / FINE_SIZE;
         const intptr_t fineCentroidOffset = i % FINE_SIZE;
 
-        const intptr_t coarseCode = coarse[coarseCentroidIdx];
-        const intptr_t fineCode = fine[fineCentroidIdx];
-
+        intptr_t coarseCode, fineCode;
+        if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+            coarseCode = Swap2Bytes(coarse[coarseCentroidIdx]);
+            fineCode = Swap2Bytes(fine[fineCentroidIdx]);
+        } else {
+            coarseCode = coarse[coarseCentroidIdx];
+            fineCode = fine[fineCentroidIdx];
+        }
         const float* const __restrict coarsePtr = pqCoarseCentroids +
                 (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode) *
                 COARSE_SIZE +
@@ -162,11 +181,18 @@ struct Index2LevelDecoder {
         const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
         const intptr_t fineCentroidIdx = i / FINE_SIZE;
         const intptr_t fineCentroidOffset = i % FINE_SIZE;
-
-        const intptr_t coarseCode0 = coarse0[coarseCentroidIdx];
-        const intptr_t fineCode0 = fine0[fineCentroidIdx];
-        const intptr_t coarseCode1 = coarse1[coarseCentroidIdx];
-        const intptr_t fineCode1 = fine1[fineCentroidIdx];
+        intptr_t coarseCode0, coarseCode1, fineCode0, fineCode1;
+        if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+            coarseCode0 = Swap2Bytes(coarse0[coarseCentroidIdx]);
+            fineCode0 = Swap2Bytes(fine0[fineCentroidIdx]);
+            coarseCode1 = Swap2Bytes(coarse1[coarseCentroidIdx]);
+            fineCode1 = Swap2Bytes(fine1[fineCentroidIdx]);
+        } else {
+            coarseCode0 = coarse0[coarseCentroidIdx];
+            fineCode0 = fine0[fineCentroidIdx];
+            coarseCode1 = coarse1[coarseCentroidIdx];
+            fineCode1 = fine1[fineCentroidIdx];
+        }
 
         const float* const __restrict coarsePtr0 = pqCoarseCentroids0 +
                 (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
@@ -222,11 +248,18 @@ struct Index2LevelDecoder {
         const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
        const intptr_t fineCentroidIdx = i / FINE_SIZE;
         const intptr_t fineCentroidOffset = i % FINE_SIZE;
-
-        const intptr_t coarseCode0 = coarse0[coarseCentroidIdx];
-        const intptr_t fineCode0 = fine0[fineCentroidIdx];
-        const intptr_t coarseCode1 = coarse1[coarseCentroidIdx];
-        const intptr_t fineCode1 = fine1[fineCentroidIdx];
+        intptr_t coarseCode0, coarseCode1, fineCode0, fineCode1;
+        if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+            coarseCode0 = Swap2Bytes(coarse0[coarseCentroidIdx]);
+            fineCode0 = Swap2Bytes(fine0[fineCentroidIdx]);
+            coarseCode1 = Swap2Bytes(coarse1[coarseCentroidIdx]);
+            fineCode1 = Swap2Bytes(fine1[fineCentroidIdx]);
+        } else {
+            coarseCode0 = coarse0[coarseCentroidIdx];
+            fineCode0 = fine0[fineCentroidIdx];
+            coarseCode1 = coarse1[coarseCentroidIdx];
+            fineCode1 = fine1[fineCentroidIdx];
+        }
 
         const float* const __restrict coarsePtr0 = pqCoarseCentroids +
                 (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
@@ -292,13 +325,23 @@ struct Index2LevelDecoder {
         const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
         const intptr_t fineCentroidIdx = i / FINE_SIZE;
         const intptr_t fineCentroidOffset = i % FINE_SIZE;
-
-        const intptr_t coarseCode0 = coarse0[coarseCentroidIdx];
-        const intptr_t fineCode0 = fine0[fineCentroidIdx];
-        const intptr_t coarseCode1 = coarse1[coarseCentroidIdx];
-        const intptr_t fineCode1 = fine1[fineCentroidIdx];
-        const intptr_t coarseCode2 = coarse2[coarseCentroidIdx];
-        const intptr_t fineCode2 = fine2[fineCentroidIdx];
+        intptr_t coarseCode0, coarseCode1, fineCode0, fineCode1;
+        intptr_t coarseCode2, fineCode2;
+        if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+            coarseCode0 = Swap2Bytes(coarse0[coarseCentroidIdx]);
+            fineCode0 = Swap2Bytes(fine0[fineCentroidIdx]);
+            coarseCode1 = Swap2Bytes(coarse1[coarseCentroidIdx]);
+            fineCode1 = Swap2Bytes(fine1[fineCentroidIdx]);
+            coarseCode2 = Swap2Bytes(coarse2[coarseCentroidIdx]);
+            fineCode2 = Swap2Bytes(fine2[fineCentroidIdx]);
+        } else {
+            coarseCode0 = coarse0[coarseCentroidIdx];
+            fineCode0 = fine0[fineCentroidIdx];
+            coarseCode1 = coarse1[coarseCentroidIdx];
+            fineCode1 = fine1[fineCentroidIdx];
+            coarseCode2 = coarse2[coarseCentroidIdx];
+            fineCode2 = fine2[fineCentroidIdx];
+        }
 
         const float* const __restrict coarsePtr0 = pqCoarseCentroids0 +
                 (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
@@ -369,13 +412,23 @@ struct Index2LevelDecoder {
         const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
         const intptr_t fineCentroidIdx = i / FINE_SIZE;
         const intptr_t fineCentroidOffset = i % FINE_SIZE;
-
-        const intptr_t coarseCode0 = coarse0[coarseCentroidIdx];
-        const intptr_t fineCode0 = fine0[fineCentroidIdx];
-        const intptr_t coarseCode1 = coarse1[coarseCentroidIdx];
-        const intptr_t fineCode1 = fine1[fineCentroidIdx];
-        const intptr_t coarseCode2 = coarse2[coarseCentroidIdx];
-        const intptr_t fineCode2 = fine2[fineCentroidIdx];
+        intptr_t coarseCode0, fineCode0, coarseCode1, fineCode1;
+        intptr_t coarseCode2, fineCode2;
+        if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+            coarseCode0 = Swap2Bytes(coarse0[coarseCentroidIdx]);
+            fineCode0 = Swap2Bytes(fine0[fineCentroidIdx]);
+            coarseCode1 = Swap2Bytes(coarse1[coarseCentroidIdx]);
+            fineCode1 = Swap2Bytes(fine1[fineCentroidIdx]);
+            coarseCode2 = Swap2Bytes(coarse2[coarseCentroidIdx]);
+            fineCode2 = Swap2Bytes(fine2[fineCentroidIdx]);
+        } else {
+            coarseCode0 = coarse0[coarseCentroidIdx];
+            fineCode0 = fine0[fineCentroidIdx];
+            coarseCode1 = coarse1[coarseCentroidIdx];
+            fineCode1 = fine1[fineCentroidIdx];
+            coarseCode2 = coarse2[coarseCentroidIdx];
+            fineCode2 = fine2[fineCentroidIdx];
+        }
 
         const float* const __restrict coarsePtr0 = pqCoarseCentroids +
                 (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
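The Level2-inl.h changes above make the two-level PQ decoder endian-aware: 16-bit codes are stored in little-endian order, so a big-endian host must byte-swap them before using them as centroid-table offsets. Below is a minimal standalone sketch of the same guard; `Swap2Bytes` here is a hypothetical stand-in for the macro that faiss/impl/platform_macros.h is expected to provide, and `FAISS_BIG_ENDIAN` is assumed to be defined by the build system on big-endian targets.

#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for the 16-bit byte-swap macro from platform_macros.h.
#define Swap2Bytes(x) \
    ((uint16_t)((((x) & 0x00ffU) << 8) | (((x) & 0xff00U) >> 8)))

// Mirrors the helper added in the diff; FAISS_BIG_ENDIAN is assumed to be
// set by the build system.
bool isBigEndian() {
#ifdef FAISS_BIG_ENDIAN
    return true;
#else
    return false;
#endif
}

int main() {
    // A 16-bit coarse code as laid out in a file written on a
    // little-endian machine.
    uint16_t stored = 0x0102;

    // On a big-endian reader the bytes arrive reversed, so they are
    // swapped back before being used as a table offset.
    uint16_t code = isBigEndian() ? Swap2Bytes(stored) : stored;
    std::printf("decoded code: 0x%04x\n", code);
    return 0;
}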
data/vendor/faiss/faiss/gpu/GpuCloner.cpp
@@ -14,6 +14,9 @@
 
 #include <faiss/IndexBinaryFlat.h>
 #include <faiss/IndexFlat.h>
+#if defined USE_NVIDIA_RAFT
+#include <faiss/IndexHNSW.h>
+#endif
 #include <faiss/IndexIVF.h>
 #include <faiss/IndexIVFFlat.h>
 #include <faiss/IndexIVFPQ.h>
@@ -24,6 +27,9 @@
 #include <faiss/MetaIndexes.h>
 #include <faiss/gpu/GpuIndex.h>
 #include <faiss/gpu/GpuIndexBinaryFlat.h>
+#if defined USE_NVIDIA_RAFT
+#include <faiss/gpu/GpuIndexCagra.h>
+#endif
 #include <faiss/gpu/GpuIndexFlat.h>
 #include <faiss/gpu/GpuIndexIVFFlat.h>
 #include <faiss/gpu/GpuIndexIVFPQ.h>
@@ -85,7 +91,15 @@ Index* ToCPUCloner::clone_Index(const Index* index) {
         // objective is to make a single component out of them
         // (inverse op of ToGpuClonerMultiple)
 
-    } else if (auto ish = dynamic_cast<const IndexShards*>(index)) {
+    }
+#if defined USE_NVIDIA_RAFT
+    else if (auto icg = dynamic_cast<const GpuIndexCagra*>(index)) {
+        IndexHNSWCagra* res = new IndexHNSWCagra();
+        icg->copyTo(res);
+        return res;
+    }
+#endif
+    else if (auto ish = dynamic_cast<const IndexShards*>(index)) {
         int nshard = ish->count();
         FAISS_ASSERT(nshard > 0);
         Index* res = clone_Index(ish->at(0));
@@ -153,6 +167,7 @@ Index* ToGpuCloner::clone_Index(const Index* index) {
         config.indicesOptions = indicesOptions;
         config.flatConfig.useFloat16 = useFloat16CoarseQuantizer;
         config.use_raft = use_raft;
+        config.allowCpuCoarseQuantizer = allowCpuCoarseQuantizer;
 
         GpuIndexIVFFlat* res = new GpuIndexIVFFlat(
                 provider, ifl->d, ifl->nlist, ifl->metric_type, config);
@@ -205,6 +220,7 @@ Index* ToGpuCloner::clone_Index(const Index* index) {
         config.usePrecomputedTables = usePrecomputed;
         config.use_raft = use_raft;
         config.interleavedLayout = use_raft;
+        config.allowCpuCoarseQuantizer = allowCpuCoarseQuantizer;
 
         GpuIndexIVFPQ* res = new GpuIndexIVFPQ(provider, ipq, config);
 
@@ -213,9 +229,25 @@ Index* ToGpuCloner::clone_Index(const Index* index) {
         }
 
         return res;
-    } else {
-        // default: use CPU cloner
-        return Cloner::clone_Index(index);
+    }
+#if defined USE_NVIDIA_RAFT
+    else if (auto icg = dynamic_cast<const faiss::IndexHNSWCagra*>(index)) {
+        GpuIndexCagraConfig config;
+        config.device = device;
+        GpuIndexCagra* res =
+                new GpuIndexCagra(provider, icg->d, icg->metric_type, config);
+        res->copyFrom(icg);
+        return res;
+    }
+#endif
+    else {
+        // use CPU cloner for IDMap and PreTransform
+        auto index_idmap = dynamic_cast<const IndexIDMap*>(index);
+        auto index_pt = dynamic_cast<const IndexPreTransform*>(index);
+        if (index_idmap || index_pt) {
+            return Cloner::clone_Index(index);
+        }
+        FAISS_THROW_MSG("This index type is not implemented on GPU.");
    }
 }
 
data/vendor/faiss/faiss/gpu/GpuClonerOptions.h
@@ -43,6 +43,12 @@ struct GpuClonerOptions {
 #else
     bool use_raft = false;
 #endif
+
+    /// This flag controls the CPU fallback logic for coarse quantizer
+    /// component of the index. When set to false (default), the cloner will
+    /// throw an exception for indices not implemented on GPU. When set to
+    /// true, it will fallback to a CPU implementation.
+    bool allowCpuCoarseQuantizer = false;
 };
 
 struct GpuMultipleClonerOptions : public GpuClonerOptions {
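Taken together with the GpuCloner.cpp change above, cloning a CPU index with components that have no GPU implementation now throws unless this flag opts in to a CPU fallback (IDMap and PreTransform wrappers still fall through to the CPU cloner). A hedged sketch of how the flag might be used; the HNSW coarse quantizer is an illustrative choice of a component without a GPU counterpart:

#include <faiss/IndexHNSW.h>
#include <faiss/IndexIVFFlat.h>
#include <faiss/gpu/GpuCloner.h>
#include <faiss/gpu/StandardGpuResources.h>

int main() {
    faiss::gpu::StandardGpuResources res;

    // An IVF index whose coarse quantizer (HNSW) stays CPU-only.
    faiss::IndexHNSWFlat quantizer(64, 32);
    faiss::IndexIVFFlat cpu_index(&quantizer, 64, 1024);

    faiss::gpu::GpuClonerOptions options;
    // Opt in to keeping the unsupported coarse quantizer on the CPU
    // instead of letting the cloner throw.
    options.allowCpuCoarseQuantizer = true;

    faiss::Index* gpu_index = faiss::gpu::index_cpu_to_gpu(
            &res, /*device=*/0, &cpu_index, &options);
    delete gpu_index;
    return 0;
}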
data/vendor/faiss/faiss/gpu/GpuFaissAssert.h
@@ -15,7 +15,7 @@
 /// Assertions
 ///
 
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(USE_AMD_ROCM)
 #define GPU_FAISS_ASSERT(X) assert(X)
 #define GPU_FAISS_ASSERT_MSG(X, MSG) assert(X)
 #define GPU_FAISS_ASSERT_FMT(X, FMT, ...) assert(X)
data/vendor/faiss/faiss/gpu/GpuIndex.h
@@ -84,19 +84,14 @@ class GpuIndex : public faiss::Index {
 
     /// `x` and `labels` can be resident on the CPU or any GPU; copies are
     /// performed as needed
-    void assign(
-            idx_t n,
-            const float* x,
-            idx_t* labels,
-            // faiss::Index has idx_t for k
-            idx_t k = 1) const override;
+    void assign(idx_t n, const float* x, idx_t* labels, idx_t k = 1)
+            const override;
 
     /// `x`, `distances` and `labels` can be resident on the CPU or any
     /// GPU; copies are performed as needed
     void search(
             idx_t n,
             const float* x,
-            // faiss::Index has idx_t for k
             idx_t k,
             float* distances,
             idx_t* labels,
@@ -107,7 +102,6 @@ class GpuIndex : public faiss::Index {
     void search_and_reconstruct(
             idx_t n,
             const float* x,
-            // faiss::Index has idx_t for k
             idx_t k,
             float* distances,
             idx_t* labels,
data/vendor/faiss/faiss/gpu/GpuIndexCagra.h (new file)
@@ -0,0 +1,282 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <faiss/IndexIVF.h>
+#include <faiss/gpu/GpuIndex.h>
+#include <faiss/gpu/GpuIndexIVFPQ.h>
+
+namespace faiss {
+struct IndexHNSWCagra;
+}
+
+namespace faiss {
+namespace gpu {
+
+class RaftCagra;
+
+enum class graph_build_algo {
+    /// Use IVF-PQ to build all-neighbors knn graph
+    IVF_PQ,
+    /// Experimental, use NN-Descent to build all-neighbors knn graph
+    NN_DESCENT
+};
+
+/// A type for specifying how PQ codebooks are created.
+enum class codebook_gen { // NOLINT
+    PER_SUBSPACE = 0, // NOLINT
+    PER_CLUSTER = 1, // NOLINT
+};
+
+struct IVFPQBuildCagraConfig {
+    ///
+    /// The number of inverted lists (clusters)
+    ///
+    /// Hint: the number of vectors per cluster (`n_rows/n_lists`) should be
+    /// approximately 1,000 to 10,000.
+
+    uint32_t n_lists = 1024;
+    /// The number of iterations searching for kmeans centers (index building).
+    uint32_t kmeans_n_iters = 20;
+    /// The fraction of data to use during iterative kmeans building.
+    double kmeans_trainset_fraction = 0.5;
+    ///
+    /// The bit length of the vector element after compression by PQ.
+    ///
+    /// Possible values: [4, 5, 6, 7, 8].
+    ///
+    /// Hint: the smaller the 'pq_bits', the smaller the index size and the
+    /// better the search performance, but the lower the recall.
+
+    uint32_t pq_bits = 8;
+    ///
+    /// The dimensionality of the vector after compression by PQ. When zero, an
+    /// optimal value is selected using a heuristic.
+    ///
+    /// NB: `pq_dim /// pq_bits` must be a multiple of 8.
+    ///
+    /// Hint: a smaller 'pq_dim' results in a smaller index size and better
+    /// search performance, but lower recall. If 'pq_bits' is 8, 'pq_dim' can be
+    /// set to any number, but multiple of 8 are desirable for good performance.
+    /// If 'pq_bits' is not 8, 'pq_dim' should be a multiple of 8. For good
+    /// performance, it is desirable that 'pq_dim' is a multiple of 32. Ideally,
+    /// 'pq_dim' should be also a divisor of the dataset dim.
+
+    uint32_t pq_dim = 0;
+    /// How PQ codebooks are created.
+    codebook_gen codebook_kind = codebook_gen::PER_SUBSPACE;
+    ///
+    /// Apply a random rotation matrix on the input data and queries even if
+    /// `dim % pq_dim == 0`.
+    ///
+    /// Note: if `dim` is not multiple of `pq_dim`, a random rotation is always
+    /// applied to the input data and queries to transform the working space
+    /// from `dim` to `rot_dim`, which may be slightly larger than the original
+    /// space and and is a multiple of `pq_dim` (`rot_dim % pq_dim == 0`).
+    /// However, this transform is not necessary when `dim` is multiple of
+    /// `pq_dim`
+    /// (`dim == rot_dim`, hence no need in adding "extra" data columns /
+    /// features).
+    ///
+    /// By default, if `dim == rot_dim`, the rotation transform is initialized
+    /// with the identity matrix. When `force_random_rotation == true`, a random
+    /// orthogonal transform matrix is generated regardless of the values of
+    /// `dim` and `pq_dim`.
+
+    bool force_random_rotation = false;
+    ///
+    /// By default, the algorithm allocates more space than necessary for
+    /// individual clusters
+    /// (`list_data`). This allows to amortize the cost of memory allocation and
+    /// reduce the number of data copies during repeated calls to `extend`
+    /// (extending the database).
+    ///
+    /// The alternative is the conservative allocation behavior; when enabled,
+    /// the algorithm always allocates the minimum amount of memory required to
+    /// store the given number of records. Set this flag to `true` if you prefer
+    /// to use as little GPU memory for the database as possible.
+
+    bool conservative_memory_allocation = false;
+};
+
+struct IVFPQSearchCagraConfig {
+    /// The number of clusters to search.
+    uint32_t n_probes = 20;
+    ///
+    /// Data type of look up table to be created dynamically at search time.
+    ///
+    /// Possible values: [CUDA_R_32F, CUDA_R_16F, CUDA_R_8U]
+    ///
+    /// The use of low-precision types reduces the amount of shared memory
+    /// required at search time, so fast shared memory kernels can be used even
+    /// for datasets with large dimansionality. Note that the recall is slightly
+    /// degraded when low-precision type is selected.
+
+    cudaDataType_t lut_dtype = CUDA_R_32F;
+    ///
+    /// Storage data type for distance/similarity computed at search time.
+    ///
+    /// Possible values: [CUDA_R_16F, CUDA_R_32F]
+    ///
+    /// If the performance limiter at search time is device memory access,
+    /// selecting FP16 will improve performance slightly.
+
+    cudaDataType_t internal_distance_dtype = CUDA_R_32F;
+    ///
+    /// Preferred fraction of SM's unified memory / L1 cache to be used as
+    /// shared memory.
+    ///
+    /// Possible values: [0.0 - 1.0] as a fraction of the
+    /// `sharedMemPerMultiprocessor`.
+    ///
+    /// One wants to increase the carveout to make sure a good GPU occupancy for
+    /// the main search kernel, but not to keep it too high to leave some memory
+    /// to be used as L1 cache. Note, this value is interpreted only as a hint.
+    /// Moreover, a GPU usually allows only a fixed set of cache configurations,
+    /// so the provided value is rounded up to the nearest configuration. Refer
+    /// to the NVIDIA tuning guide for the target GPU architecture.
+    ///
+    /// Note, this is a low-level tuning parameter that can have drastic
+    /// negative effects on the search performance if tweaked incorrectly.
+
+    double preferred_shmem_carveout = 1.0;
+};
+
+struct GpuIndexCagraConfig : public GpuIndexConfig {
+    /// Degree of input graph for pruning.
+    size_t intermediate_graph_degree = 128;
+    /// Degree of output graph.
+    size_t graph_degree = 64;
+    /// ANN algorithm to build knn graph.
+    graph_build_algo build_algo = graph_build_algo::IVF_PQ;
+    /// Number of Iterations to run if building with NN_DESCENT
+    size_t nn_descent_niter = 20;
+
+    IVFPQBuildCagraConfig* ivf_pq_params = nullptr;
+    IVFPQSearchCagraConfig* ivf_pq_search_params = nullptr;
+};
+
+enum class search_algo {
+    /// For large batch sizes.
+    SINGLE_CTA,
+    /// For small batch sizes.
+    MULTI_CTA,
+    MULTI_KERNEL,
+    AUTO
+};
+
+enum class hash_mode { HASH, SMALL, AUTO };
+
+struct SearchParametersCagra : SearchParameters {
+    /// Maximum number of queries to search at the same time (batch size). Auto
+    /// select when 0.
+    size_t max_queries = 0;
+
+    /// Number of intermediate search results retained during the search.
+    ///
+    /// This is the main knob to adjust trade off between accuracy and search
+    /// speed. Higher values improve the search accuracy.
+
+    size_t itopk_size = 64;
+
+    /// Upper limit of search iterations. Auto select when 0.
+    size_t max_iterations = 0;
+
+    // In the following we list additional search parameters for fine tuning.
+    // Reasonable default values are automatically chosen.
+
+    /// Which search implementation to use.
+    search_algo algo = search_algo::AUTO;
+
+    /// Number of threads used to calculate a single distance. 4, 8, 16, or 32.
+
+    size_t team_size = 0;
+
+    /// Number of graph nodes to select as the starting point for the search in
+    /// each iteration. aka search width?
+    size_t search_width = 1;
+    /// Lower limit of search iterations.
+    size_t min_iterations = 0;
+
+    /// Thread block size. 0, 64, 128, 256, 512, 1024. Auto selection when 0.
+    size_t thread_block_size = 0;
+    /// Hashmap type. Auto selection when AUTO.
+    hash_mode hashmap_mode = hash_mode::AUTO;
+    /// Lower limit of hashmap bit length. More than 8.
+    size_t hashmap_min_bitlen = 0;
+    /// Upper limit of hashmap fill rate. More than 0.1, less than 0.9.
+    float hashmap_max_fill_rate = 0.5;
+
+    /// Number of iterations of initial random seed node selection. 1 or more.
+
+    uint32_t num_random_samplings = 1;
+    /// Bit mask used for initial random seed node selection.
+    uint64_t seed = 0x128394;
+};
+
+struct GpuIndexCagra : public GpuIndex {
+   public:
+    GpuIndexCagra(
+            GpuResourcesProvider* provider,
+            int dims,
+            faiss::MetricType metric = faiss::METRIC_L2,
+            GpuIndexCagraConfig config = GpuIndexCagraConfig());
+
+    /// Trains CAGRA based on the given vector data
+    void train(idx_t n, const float* x) override;
+
+    /// Initialize ourselves from the given CPU index; will overwrite
+    /// all data in ourselves
+    void copyFrom(const faiss::IndexHNSWCagra* index);
+
+    /// Copy ourselves to the given CPU index; will overwrite all data
+    /// in the index instance
+    void copyTo(faiss::IndexHNSWCagra* index) const;
+
+    void reset() override;
+
+    std::vector<idx_t> get_knngraph() const;
+
+   protected:
+    bool addImplRequiresIDs_() const override;
+
+    void addImpl_(idx_t n, const float* x, const idx_t* ids) override;
+
+    /// Called from GpuIndex for search
+    void searchImpl_(
+            idx_t n,
+            const float* x,
+            int k,
+            float* distances,
+            idx_t* labels,
+            const SearchParameters* search_params) const override;
+
+    /// Our configuration options
+    const GpuIndexCagraConfig cagraConfig_;
+
+    /// Instance that we own; contains the inverted lists
+    std::shared_ptr<RaftCagra> index_;
+};
+
+} // namespace gpu
+} // namespace faiss
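GpuIndexCagra is the headline addition of this release: a GPU graph index (RAFT CAGRA) that can be converted to and from the new CPU IndexHNSWCagra via the cloners shown earlier. A sketch of plausible usage based on the header above, assuming a build with USE_NVIDIA_RAFT; the parameter values are illustrative, not recommendations:

#include <faiss/gpu/GpuIndexCagra.h>
#include <faiss/gpu/StandardGpuResources.h>
#include <vector>

int main() {
    faiss::gpu::StandardGpuResources res;

    faiss::gpu::GpuIndexCagraConfig config;
    config.device = 0;
    config.graph_degree = 64;
    config.intermediate_graph_degree = 128;
    config.build_algo = faiss::gpu::graph_build_algo::IVF_PQ;

    int d = 128;
    faiss::gpu::GpuIndexCagra index(&res, d, faiss::METRIC_L2, config);

    // CAGRA builds its knn graph from the vectors passed to train().
    std::vector<float> xb(10000 * d, 0.0f); // fill with real data
    index.train(10000, xb.data());

    // Search with the CAGRA-specific parameters declared above.
    faiss::gpu::SearchParametersCagra params;
    params.itopk_size = 64;

    int k = 10;
    std::vector<float> distances(k);
    std::vector<faiss::idx_t> labels(k);
    index.search(1, xb.data(), k, distances.data(), labels.data(), &params);
    return 0;
}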
data/vendor/faiss/faiss/gpu/GpuIndexIVF.h
@@ -26,6 +26,12 @@ struct GpuIndexIVFConfig : public GpuIndexConfig {
 
     /// Configuration for the coarse quantizer object
     GpuIndexFlatConfig flatConfig;
+
+    /// This flag controls the CPU fallback logic for coarse quantizer
+    /// component of the index. When set to false (default), the cloner will
+    /// throw an exception for indices not implemented on GPU. When set to
+    /// true, it will fallback to a CPU implementation.
+    bool allowCpuCoarseQuantizer = false;
 };
 
 /// Base class of all GPU IVF index types. This (for now) deliberately does not
data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h
@@ -87,6 +87,8 @@ class GpuIndexIVFFlat : public GpuIndexIVF {
     /// Trains the coarse quantizer based on the given vector data
     void train(idx_t n, const float* x) override;
 
+    void reconstruct_n(idx_t i0, idx_t n, float* out) const override;
+
   protected:
     /// Initialize appropriate index
    void setIndex_(
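The new reconstruct_n override lets stored vectors be decoded back out of a GpuIndexIVFFlat over a contiguous id range (exercised by the new tests in TestGpuIndexIVFFlat.cpp). A hedged sketch; it assumes vectors were added with default sequential ids, which is what makes a range request well-defined:

#include <faiss/gpu/GpuIndexIVFFlat.h>
#include <faiss/gpu/StandardGpuResources.h>
#include <vector>

int main() {
    faiss::gpu::StandardGpuResources res;
    int d = 32;
    faiss::gpu::GpuIndexIVFFlat index(&res, d, /*nlist=*/64, faiss::METRIC_L2);

    std::vector<float> xb(1000 * d, 0.5f); // placeholder data
    index.train(1000, xb.data());
    index.add(1000, xb.data()); // default ids 0..999

    // Recover the first ten stored vectors from the GPU index.
    std::vector<float> recons(10 * d);
    index.reconstruct_n(/*i0=*/0, /*n=*/10, recons.data());
    return 0;
}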
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp
@@ -257,6 +257,14 @@ void StandardGpuResourcesImpl::setDefaultStream(
         if (prevStream != stream) {
             streamWait({stream}, {prevStream});
         }
+#if defined USE_NVIDIA_RAFT
+        // delete the raft handle for this device, which will be initialized
+        // with the updated stream during any subsequent calls to getRaftHandle
+        auto it2 = raftHandles_.find(device);
+        if (it2 != raftHandles_.end()) {
+            raftHandles_.erase(it2);
+        }
+#endif
     }
 
     userDefaultStreams_[device] = stream;
@@ -275,6 +283,14 @@ void StandardGpuResourcesImpl::revertDefaultStream(int device) {
 
            streamWait({newStream}, {prevStream});
        }
+#if defined USE_NVIDIA_RAFT
+        // delete the raft handle for this device, which will be initialized
+        // with the updated stream during any subsequent calls to getRaftHandle
+        auto it2 = raftHandles_.find(device);
+        if (it2 != raftHandles_.end()) {
+            raftHandles_.erase(it2);
+        }
+#endif
     }
 
     userDefaultStreams_.erase(device);
@@ -347,11 +363,20 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
             prop.major,
             prop.minor);
 
+#if USE_AMD_ROCM
+    // Our code is pre-built with and expects warpSize == 32 or 64, validate
+    // that
+    FAISS_ASSERT_FMT(
+            prop.warpSize == 32 || prop.warpSize == 64,
+            "Device id %d does not have expected warpSize of 32 or 64",
+            device);
+#else
     // Our code is pre-built with and expects warpSize == 32, validate that
     FAISS_ASSERT_FMT(
             prop.warpSize == 32,
             "Device id %d does not have expected warpSize of 32",
             device);
+#endif
 
     // Create streams
     cudaStream_t defaultStream = nullptr;
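Two behavioral notes fall out of these hunks: on RAFT builds, replacing or reverting a device's default stream now also discards the cached RAFT handle, so the handle is lazily rebuilt against the new stream on the next use; and ROCm builds additionally accept a warpSize of 64. A sketch of the stream-swap path, assuming a CUDA build:

#include <cuda_runtime.h>
#include <faiss/gpu/StandardGpuResources.h>

int main() {
    faiss::gpu::StandardGpuResources res;

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // With USE_NVIDIA_RAFT, this also drops the cached RAFT handle for
    // device 0; it is recreated on the new stream when next requested.
    res.setDefaultStream(/*device=*/0, stream);

    // ... issue index operations ordered on `stream` ...

    res.revertDefaultStream(0); // same handle-refresh logic applies
    cudaStreamDestroy(stream);
    return 0;
}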