faiss 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/LICENSE.txt +1 -1
- data/README.md +1 -1
- data/ext/faiss/extconf.rb +9 -2
- data/ext/faiss/index.cpp +1 -1
- data/ext/faiss/index_binary.cpp +2 -2
- data/ext/faiss/product_quantizer.cpp +1 -1
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +7 -7
- data/vendor/faiss/faiss/AutoTune.h +1 -2
- data/vendor/faiss/faiss/Clustering.cpp +39 -22
- data/vendor/faiss/faiss/Clustering.h +40 -21
- data/vendor/faiss/faiss/IVFlib.cpp +26 -12
- data/vendor/faiss/faiss/Index.cpp +1 -1
- data/vendor/faiss/faiss/Index.h +40 -10
- data/vendor/faiss/faiss/Index2Layer.cpp +7 -7
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +176 -166
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +15 -15
- data/vendor/faiss/faiss/IndexBinary.cpp +9 -4
- data/vendor/faiss/faiss/IndexBinary.h +8 -19
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +2 -1
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +24 -31
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +25 -50
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +107 -188
- data/vendor/faiss/faiss/IndexFastScan.cpp +95 -146
- data/vendor/faiss/faiss/IndexFastScan.h +9 -8
- data/vendor/faiss/faiss/IndexFlat.cpp +206 -10
- data/vendor/faiss/faiss/IndexFlat.h +20 -1
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +170 -5
- data/vendor/faiss/faiss/IndexFlatCodes.h +23 -4
- data/vendor/faiss/faiss/IndexHNSW.cpp +231 -382
- data/vendor/faiss/faiss/IndexHNSW.h +62 -49
- data/vendor/faiss/faiss/IndexIDMap.cpp +69 -28
- data/vendor/faiss/faiss/IndexIDMap.h +24 -2
- data/vendor/faiss/faiss/IndexIVF.cpp +162 -56
- data/vendor/faiss/faiss/IndexIVF.h +46 -6
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +33 -26
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +6 -2
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +19 -46
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -3
- data/vendor/faiss/faiss/IndexIVFFastScan.cpp +502 -401
- data/vendor/faiss/faiss/IndexIVFFastScan.h +63 -26
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +15 -5
- data/vendor/faiss/faiss/IndexIVFFlat.h +3 -2
- data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +172 -0
- data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.h +56 -0
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +79 -125
- data/vendor/faiss/faiss/IndexIVFPQ.h +6 -7
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +39 -52
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +4 -3
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +45 -29
- data/vendor/faiss/faiss/IndexIVFPQR.h +5 -2
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +25 -27
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +6 -6
- data/vendor/faiss/faiss/IndexLSH.cpp +14 -16
- data/vendor/faiss/faiss/IndexLattice.cpp +1 -19
- data/vendor/faiss/faiss/IndexLattice.h +3 -22
- data/vendor/faiss/faiss/IndexNNDescent.cpp +3 -33
- data/vendor/faiss/faiss/IndexNNDescent.h +1 -1
- data/vendor/faiss/faiss/IndexNSG.cpp +11 -27
- data/vendor/faiss/faiss/IndexNSG.h +11 -11
- data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +56 -0
- data/vendor/faiss/faiss/IndexNeuralNetCodec.h +49 -0
- data/vendor/faiss/faiss/IndexPQ.cpp +72 -88
- data/vendor/faiss/faiss/IndexPQ.h +1 -4
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +1 -1
- data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -31
- data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
- data/vendor/faiss/faiss/IndexRefine.cpp +54 -24
- data/vendor/faiss/faiss/IndexRefine.h +7 -0
- data/vendor/faiss/faiss/IndexReplicas.cpp +23 -26
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +25 -17
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +6 -4
- data/vendor/faiss/faiss/IndexShards.cpp +21 -29
- data/vendor/faiss/faiss/IndexShardsIVF.cpp +1 -2
- data/vendor/faiss/faiss/MatrixStats.cpp +17 -32
- data/vendor/faiss/faiss/MatrixStats.h +21 -9
- data/vendor/faiss/faiss/MetaIndexes.cpp +35 -35
- data/vendor/faiss/faiss/MetricType.h +7 -2
- data/vendor/faiss/faiss/VectorTransform.cpp +13 -26
- data/vendor/faiss/faiss/VectorTransform.h +7 -7
- data/vendor/faiss/faiss/clone_index.cpp +15 -10
- data/vendor/faiss/faiss/clone_index.h +3 -0
- data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +95 -17
- data/vendor/faiss/faiss/cppcontrib/factory_tools.cpp +152 -0
- data/vendor/faiss/faiss/cppcontrib/factory_tools.h +24 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +83 -30
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +123 -8
- data/vendor/faiss/faiss/gpu/GpuCloner.h +22 -0
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +13 -0
- data/vendor/faiss/faiss/gpu/GpuDistance.h +46 -38
- data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -1
- data/vendor/faiss/faiss/gpu/GpuIndex.h +30 -12
- data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +282 -0
- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +4 -4
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +14 -9
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +20 -3
- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -11
- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +1 -3
- data/vendor/faiss/faiss/gpu/GpuResources.cpp +24 -3
- data/vendor/faiss/faiss/gpu/GpuResources.h +39 -11
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +142 -17
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +57 -3
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +26 -21
- data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +7 -1
- data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +8 -5
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +25 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +129 -9
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +332 -40
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +299 -208
- data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +1 -0
- data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +1 -1
- data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +6 -0
- data/vendor/faiss/faiss/gpu/utils/RaftUtils.h +75 -0
- data/vendor/faiss/faiss/gpu/utils/Timer.cpp +4 -1
- data/vendor/faiss/faiss/gpu/utils/Timer.h +1 -1
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +3 -1
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +5 -5
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +26 -1
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +10 -3
- data/vendor/faiss/faiss/impl/DistanceComputer.h +70 -1
- data/vendor/faiss/faiss/impl/FaissAssert.h +4 -2
- data/vendor/faiss/faiss/impl/FaissException.h +13 -34
- data/vendor/faiss/faiss/impl/HNSW.cpp +605 -186
- data/vendor/faiss/faiss/impl/HNSW.h +52 -30
- data/vendor/faiss/faiss/impl/IDSelector.h +4 -4
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +11 -9
- data/vendor/faiss/faiss/impl/LookupTableScaler.h +34 -0
- data/vendor/faiss/faiss/impl/NNDescent.cpp +42 -27
- data/vendor/faiss/faiss/impl/NSG.cpp +0 -29
- data/vendor/faiss/faiss/impl/NSG.h +1 -1
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +14 -12
- data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +1 -1
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +25 -22
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +6 -2
- data/vendor/faiss/faiss/impl/Quantizer.h +1 -1
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +27 -1015
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +5 -63
- data/vendor/faiss/faiss/impl/ResultHandler.h +347 -172
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +1104 -147
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +3 -8
- data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +285 -42
- data/vendor/faiss/faiss/impl/code_distance/code_distance-avx512.h +248 -0
- data/vendor/faiss/faiss/impl/code_distance/code_distance-generic.h +21 -14
- data/vendor/faiss/faiss/impl/code_distance/code_distance.h +22 -12
- data/vendor/faiss/faiss/impl/index_read.cpp +74 -34
- data/vendor/faiss/faiss/impl/index_read_utils.h +37 -0
- data/vendor/faiss/faiss/impl/index_write.cpp +88 -51
- data/vendor/faiss/faiss/impl/io.cpp +23 -15
- data/vendor/faiss/faiss/impl/io.h +4 -4
- data/vendor/faiss/faiss/impl/io_macros.h +6 -0
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -1
- data/vendor/faiss/faiss/impl/platform_macros.h +40 -1
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +14 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +7 -6
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +52 -38
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +487 -49
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +960 -0
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +176 -0
- data/vendor/faiss/faiss/impl/simd_result_handlers.h +481 -225
- data/vendor/faiss/faiss/index_factory.cpp +41 -20
- data/vendor/faiss/faiss/index_io.h +12 -5
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +28 -8
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +3 -0
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +10 -2
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +73 -17
- data/vendor/faiss/faiss/invlists/InvertedLists.h +26 -8
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +24 -9
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +2 -1
- data/vendor/faiss/faiss/python/python_callbacks.cpp +4 -4
- data/vendor/faiss/faiss/utils/Heap.cpp +3 -1
- data/vendor/faiss/faiss/utils/Heap.h +105 -0
- data/vendor/faiss/faiss/utils/NeuralNet.cpp +342 -0
- data/vendor/faiss/faiss/utils/NeuralNet.h +147 -0
- data/vendor/faiss/faiss/utils/WorkerThread.h +1 -0
- data/vendor/faiss/faiss/utils/bf16.h +36 -0
- data/vendor/faiss/faiss/utils/distances.cpp +147 -123
- data/vendor/faiss/faiss/utils/distances.h +86 -9
- data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +5 -5
- data/vendor/faiss/faiss/utils/distances_fused/avx512.h +2 -2
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +2 -2
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +1 -1
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +5 -5
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.h +1 -1
- data/vendor/faiss/faiss/utils/distances_simd.cpp +1589 -243
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +70 -0
- data/vendor/faiss/faiss/utils/extra_distances.cpp +85 -137
- data/vendor/faiss/faiss/utils/extra_distances.h +3 -2
- data/vendor/faiss/faiss/utils/fp16-arm.h +29 -0
- data/vendor/faiss/faiss/utils/fp16.h +2 -0
- data/vendor/faiss/faiss/utils/hamming.cpp +163 -111
- data/vendor/faiss/faiss/utils/hamming.h +58 -0
- data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +16 -89
- data/vendor/faiss/faiss/utils/hamming_distance/common.h +1 -0
- data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +19 -88
- data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +58 -0
- data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +14 -104
- data/vendor/faiss/faiss/utils/partitioning.cpp +3 -4
- data/vendor/faiss/faiss/utils/prefetch.h +77 -0
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +0 -14
- data/vendor/faiss/faiss/utils/random.cpp +43 -0
- data/vendor/faiss/faiss/utils/random.h +25 -0
- data/vendor/faiss/faiss/utils/simdlib.h +10 -1
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +0 -6
- data/vendor/faiss/faiss/utils/simdlib_avx512.h +296 -0
- data/vendor/faiss/faiss/utils/simdlib_neon.h +77 -79
- data/vendor/faiss/faiss/utils/simdlib_ppc64.h +1084 -0
- data/vendor/faiss/faiss/utils/sorting.cpp +140 -5
- data/vendor/faiss/faiss/utils/sorting.h +27 -0
- data/vendor/faiss/faiss/utils/transpose/transpose-avx512-inl.h +176 -0
- data/vendor/faiss/faiss/utils/utils.cpp +120 -7
- data/vendor/faiss/faiss/utils/utils.h +60 -20
- metadata +23 -4
- data/vendor/faiss/faiss/impl/code_distance/code_distance_avx512.h +0 -102
|
@@ -123,7 +123,7 @@ void parallel_merge(
|
|
|
123
123
|
}
|
|
124
124
|
}
|
|
125
125
|
|
|
126
|
-
}
|
|
126
|
+
} // namespace
|
|
127
127
|
|
|
128
128
|
void fvec_argsort(size_t n, const float* vals, size_t* perm) {
|
|
129
129
|
for (size_t i = 0; i < n; i++) {
|
|
@@ -544,7 +544,6 @@ void bucket_sort_inplace_parallel(
|
|
|
544
544
|
|
|
545
545
|
// in this loop, we write elements collected in the previous round
|
|
546
546
|
// and collect the elements that are overwritten for the next round
|
|
547
|
-
size_t tot_written = 0;
|
|
548
547
|
int round = 0;
|
|
549
548
|
for (;;) {
|
|
550
549
|
#pragma omp barrier
|
|
@@ -554,9 +553,6 @@ void bucket_sort_inplace_parallel(
|
|
|
554
553
|
n_to_write += to_write_2.lims.back();
|
|
555
554
|
}
|
|
556
555
|
|
|
557
|
-
tot_written += n_to_write;
|
|
558
|
-
// assert(tot_written <= nval);
|
|
559
|
-
|
|
560
556
|
#pragma omp master
|
|
561
557
|
{
|
|
562
558
|
if (verbose >= 1) {
|
|
@@ -689,4 +685,143 @@ void matrix_bucket_sort_inplace(
|
|
|
689
685
|
}
|
|
690
686
|
}
|
|
691
687
|
|
|
688
|
+
/** Hashtable implementation for int64 -> int64 with external storage
|
|
689
|
+
* implemented for speed and parallel processing.
|
|
690
|
+
*/
|
|
691
|
+
|
|
692
|
+
namespace {
|
|
693
|
+
|
|
694
|
+
int log2_capacity_to_log2_nbucket(int log2_capacity) {
|
|
695
|
+
return log2_capacity < 12 ? 0
|
|
696
|
+
: log2_capacity < 20 ? log2_capacity - 12
|
|
697
|
+
: 10;
|
|
698
|
+
}
|
|
699
|
+
|
|
700
|
+
// https://bigprimes.org/
|
|
701
|
+
int64_t bigprime = 8955327411143;
|
|
702
|
+
|
|
703
|
+
inline int64_t hash_function(int64_t x) {
|
|
704
|
+
return (x * 1000003) % bigprime;
|
|
705
|
+
}
|
|
706
|
+
|
|
707
|
+
} // anonymous namespace
|
|
708
|
+
|
|
709
|
+
void hashtable_int64_to_int64_init(int log2_capacity, int64_t* tab) {
|
|
710
|
+
size_t capacity = (size_t)1 << log2_capacity;
|
|
711
|
+
#pragma omp parallel for
|
|
712
|
+
for (int64_t i = 0; i < capacity; i++) {
|
|
713
|
+
tab[2 * i] = -1;
|
|
714
|
+
tab[2 * i + 1] = -1;
|
|
715
|
+
}
|
|
716
|
+
}
|
|
717
|
+
|
|
718
|
+
void hashtable_int64_to_int64_add(
|
|
719
|
+
int log2_capacity,
|
|
720
|
+
int64_t* tab,
|
|
721
|
+
size_t n,
|
|
722
|
+
const int64_t* keys,
|
|
723
|
+
const int64_t* vals) {
|
|
724
|
+
size_t capacity = (size_t)1 << log2_capacity;
|
|
725
|
+
std::vector<int64_t> hk(n);
|
|
726
|
+
std::vector<uint64_t> bucket_no(n);
|
|
727
|
+
int64_t mask = capacity - 1;
|
|
728
|
+
int log2_nbucket = log2_capacity_to_log2_nbucket(log2_capacity);
|
|
729
|
+
size_t nbucket = (size_t)1 << log2_nbucket;
|
|
730
|
+
|
|
731
|
+
#pragma omp parallel for
|
|
732
|
+
for (int64_t i = 0; i < n; i++) {
|
|
733
|
+
hk[i] = hash_function(keys[i]) & mask;
|
|
734
|
+
bucket_no[i] = hk[i] >> (log2_capacity - log2_nbucket);
|
|
735
|
+
}
|
|
736
|
+
|
|
737
|
+
std::vector<int64_t> lims(nbucket + 1);
|
|
738
|
+
std::vector<int64_t> perm(n);
|
|
739
|
+
bucket_sort(
|
|
740
|
+
n,
|
|
741
|
+
bucket_no.data(),
|
|
742
|
+
nbucket,
|
|
743
|
+
lims.data(),
|
|
744
|
+
perm.data(),
|
|
745
|
+
omp_get_max_threads());
|
|
746
|
+
|
|
747
|
+
int num_errors = 0;
|
|
748
|
+
#pragma omp parallel for reduction(+ : num_errors)
|
|
749
|
+
for (int64_t bucket = 0; bucket < nbucket; bucket++) {
|
|
750
|
+
size_t k0 = bucket << (log2_capacity - log2_nbucket);
|
|
751
|
+
size_t k1 = (bucket + 1) << (log2_capacity - log2_nbucket);
|
|
752
|
+
|
|
753
|
+
for (size_t i = lims[bucket]; i < lims[bucket + 1]; i++) {
|
|
754
|
+
int64_t j = perm[i];
|
|
755
|
+
assert(bucket_no[j] == bucket);
|
|
756
|
+
assert(hk[j] >= k0 && hk[j] < k1);
|
|
757
|
+
size_t slot = hk[j];
|
|
758
|
+
for (;;) {
|
|
759
|
+
if (tab[slot * 2] == -1) { // found!
|
|
760
|
+
tab[slot * 2] = keys[j];
|
|
761
|
+
tab[slot * 2 + 1] = vals[j];
|
|
762
|
+
break;
|
|
763
|
+
} else if (tab[slot * 2] == keys[j]) { // overwrite!
|
|
764
|
+
tab[slot * 2 + 1] = vals[j];
|
|
765
|
+
break;
|
|
766
|
+
}
|
|
767
|
+
slot++;
|
|
768
|
+
if (slot == k1) {
|
|
769
|
+
slot = k0;
|
|
770
|
+
}
|
|
771
|
+
if (slot == hk[j]) { // no free slot left in bucket
|
|
772
|
+
num_errors++;
|
|
773
|
+
break;
|
|
774
|
+
}
|
|
775
|
+
}
|
|
776
|
+
if (num_errors > 0) {
|
|
777
|
+
break;
|
|
778
|
+
}
|
|
779
|
+
}
|
|
780
|
+
}
|
|
781
|
+
FAISS_THROW_IF_NOT_MSG(num_errors == 0, "hashtable capacity exhausted");
|
|
782
|
+
}
|
|
783
|
+
|
|
784
|
+
void hashtable_int64_to_int64_lookup(
|
|
785
|
+
int log2_capacity,
|
|
786
|
+
const int64_t* tab,
|
|
787
|
+
size_t n,
|
|
788
|
+
const int64_t* keys,
|
|
789
|
+
int64_t* vals) {
|
|
790
|
+
size_t capacity = (size_t)1 << log2_capacity;
|
|
791
|
+
std::vector<int64_t> hk(n), bucket_no(n);
|
|
792
|
+
int64_t mask = capacity - 1;
|
|
793
|
+
int log2_nbucket = log2_capacity_to_log2_nbucket(log2_capacity);
|
|
794
|
+
|
|
795
|
+
#pragma omp parallel for
|
|
796
|
+
for (int64_t i = 0; i < n; i++) {
|
|
797
|
+
int64_t k = keys[i];
|
|
798
|
+
int64_t hk = hash_function(k) & mask;
|
|
799
|
+
size_t slot = hk;
|
|
800
|
+
|
|
801
|
+
if (tab[2 * slot] == -1) { // not in table
|
|
802
|
+
vals[i] = -1;
|
|
803
|
+
} else if (tab[2 * slot] == k) { // found!
|
|
804
|
+
vals[i] = tab[2 * slot + 1];
|
|
805
|
+
} else { // need to search in [k0, k1)
|
|
806
|
+
size_t bucket = hk >> (log2_capacity - log2_nbucket);
|
|
807
|
+
size_t k0 = bucket << (log2_capacity - log2_nbucket);
|
|
808
|
+
size_t k1 = (bucket + 1) << (log2_capacity - log2_nbucket);
|
|
809
|
+
for (;;) {
|
|
810
|
+
if (tab[slot * 2] == k) { // found!
|
|
811
|
+
vals[i] = tab[2 * slot + 1];
|
|
812
|
+
break;
|
|
813
|
+
}
|
|
814
|
+
slot++;
|
|
815
|
+
if (slot == k1) {
|
|
816
|
+
slot = k0;
|
|
817
|
+
}
|
|
818
|
+
if (slot == hk) { // bucket is full and not found
|
|
819
|
+
vals[i] = -1;
|
|
820
|
+
break;
|
|
821
|
+
}
|
|
822
|
+
}
|
|
823
|
+
}
|
|
824
|
+
}
|
|
825
|
+
}
|
|
826
|
+
|
|
692
827
|
} // namespace faiss
|
|
@@ -68,4 +68,31 @@ void matrix_bucket_sort_inplace(
|
|
|
68
68
|
int64_t* lims,
|
|
69
69
|
int nt = 0);
|
|
70
70
|
|
|
71
|
+
/** Hashtable implementation for int64 -> int64 with external storage
|
|
72
|
+
* implemented for fast batch add and lookup.
|
|
73
|
+
*
|
|
74
|
+
* tab is of size 2 * (1 << log2_capacity)
|
|
75
|
+
* n is the number of elements to add or search
|
|
76
|
+
*
|
|
77
|
+
* adding several values in a same batch: an arbitrary one gets added
|
|
78
|
+
* in different batches: the newer batch overwrites.
|
|
79
|
+
* raises an exception if capacity is exhausted.
|
|
80
|
+
*/
|
|
81
|
+
|
|
82
|
+
void hashtable_int64_to_int64_init(int log2_capacity, int64_t* tab);
|
|
83
|
+
|
|
84
|
+
void hashtable_int64_to_int64_add(
|
|
85
|
+
int log2_capacity,
|
|
86
|
+
int64_t* tab,
|
|
87
|
+
size_t n,
|
|
88
|
+
const int64_t* keys,
|
|
89
|
+
const int64_t* vals);
|
|
90
|
+
|
|
91
|
+
void hashtable_int64_to_int64_lookup(
|
|
92
|
+
int log2_capacity,
|
|
93
|
+
const int64_t* tab,
|
|
94
|
+
size_t n,
|
|
95
|
+
const int64_t* keys,
|
|
96
|
+
int64_t* vals);
|
|
97
|
+
|
|
71
98
|
} // namespace faiss
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) Facebook, Inc. and its affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
#pragma once
|
|
9
|
+
|
|
10
|
+
// This file contains transposing kernels for AVX512 for // tiny float/int32
|
|
11
|
+
// matrices, such as 16x2.
|
|
12
|
+
|
|
13
|
+
#ifdef __AVX512F__
|
|
14
|
+
|
|
15
|
+
#include <immintrin.h>
|
|
16
|
+
|
|
17
|
+
namespace faiss {
|
|
18
|
+
|
|
19
|
+
// 16x2 -> 2x16
|
|
20
|
+
inline void transpose_16x2(
|
|
21
|
+
const __m512 i0,
|
|
22
|
+
const __m512 i1,
|
|
23
|
+
__m512& o0,
|
|
24
|
+
__m512& o1) {
|
|
25
|
+
// assume we have the following input:
|
|
26
|
+
// i0: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
|
|
27
|
+
// i1: 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
|
|
28
|
+
|
|
29
|
+
// 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27
|
|
30
|
+
const __m512 r0 = _mm512_shuffle_f32x4(i0, i1, _MM_SHUFFLE(2, 0, 2, 0));
|
|
31
|
+
// 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31
|
|
32
|
+
const __m512 r1 = _mm512_shuffle_f32x4(i0, i1, _MM_SHUFFLE(3, 1, 3, 1));
|
|
33
|
+
|
|
34
|
+
// 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30
|
|
35
|
+
o0 = _mm512_shuffle_ps(r0, r1, _MM_SHUFFLE(2, 0, 2, 0));
|
|
36
|
+
// 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31
|
|
37
|
+
o1 = _mm512_shuffle_ps(r0, r1, _MM_SHUFFLE(3, 1, 3, 1));
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
// 16x4 -> 4x16
|
|
41
|
+
inline void transpose_16x4(
|
|
42
|
+
const __m512 i0,
|
|
43
|
+
const __m512 i1,
|
|
44
|
+
const __m512 i2,
|
|
45
|
+
const __m512 i3,
|
|
46
|
+
__m512& o0,
|
|
47
|
+
__m512& o1,
|
|
48
|
+
__m512& o2,
|
|
49
|
+
__m512& o3) {
|
|
50
|
+
// assume that we have the following input:
|
|
51
|
+
// i0: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
|
|
52
|
+
// i1: 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
|
|
53
|
+
// i2: 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
|
|
54
|
+
// i3: 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
|
|
55
|
+
|
|
56
|
+
// 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27
|
|
57
|
+
const __m512 r0 = _mm512_shuffle_f32x4(i0, i1, _MM_SHUFFLE(2, 0, 2, 0));
|
|
58
|
+
// 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31
|
|
59
|
+
const __m512 r1 = _mm512_shuffle_f32x4(i0, i1, _MM_SHUFFLE(3, 1, 3, 1));
|
|
60
|
+
// 32 33 34 35 40 41 42 43 48 49 50 51 56 57 58 59
|
|
61
|
+
const __m512 r2 = _mm512_shuffle_f32x4(i2, i3, _MM_SHUFFLE(2, 0, 2, 0));
|
|
62
|
+
// 52 53 54 55 60 61 62 63 52 53 54 55 60 61 62 63
|
|
63
|
+
const __m512 r3 = _mm512_shuffle_f32x4(i2, i3, _MM_SHUFFLE(3, 1, 3, 1));
|
|
64
|
+
|
|
65
|
+
// 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30
|
|
66
|
+
const __m512 t0 = _mm512_shuffle_ps(r0, r1, _MM_SHUFFLE(2, 0, 2, 0));
|
|
67
|
+
// 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31
|
|
68
|
+
const __m512 t1 = _mm512_shuffle_ps(r0, r1, _MM_SHUFFLE(3, 1, 3, 1));
|
|
69
|
+
// 32 34 52 54 40 42 60 62 48 50 52 54 56 58 60 62
|
|
70
|
+
const __m512 t2 = _mm512_shuffle_ps(r2, r3, _MM_SHUFFLE(2, 0, 2, 0));
|
|
71
|
+
// 33 35 53 55 41 43 61 63 49 51 53 55 57 59 61 63
|
|
72
|
+
const __m512 t3 = _mm512_shuffle_ps(r2, r3, _MM_SHUFFLE(3, 1, 3, 1));
|
|
73
|
+
|
|
74
|
+
const __m512i idx0 = _mm512_set_epi32(
|
|
75
|
+
30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
|
|
76
|
+
const __m512i idx1 = _mm512_set_epi32(
|
|
77
|
+
31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
|
|
78
|
+
|
|
79
|
+
// 0 4 8 12 16 20 24 28 32 52 40 60 48 52 56 60
|
|
80
|
+
o0 = _mm512_permutex2var_ps(t0, idx0, t2);
|
|
81
|
+
// 1 5 9 13 17 21 25 29 33 53 41 61 49 53 57 61
|
|
82
|
+
o1 = _mm512_permutex2var_ps(t1, idx0, t3);
|
|
83
|
+
// 2 6 10 14 18 22 26 30 34 54 42 62 50 54 58 62
|
|
84
|
+
o2 = _mm512_permutex2var_ps(t0, idx1, t2);
|
|
85
|
+
// 3 7 11 15 19 23 27 31 35 55 43 63 51 55 59 63
|
|
86
|
+
o3 = _mm512_permutex2var_ps(t1, idx1, t3);
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
// 16x8 -> 8x16 transpose
|
|
90
|
+
inline void transpose_16x8(
|
|
91
|
+
const __m512 i0,
|
|
92
|
+
const __m512 i1,
|
|
93
|
+
const __m512 i2,
|
|
94
|
+
const __m512 i3,
|
|
95
|
+
const __m512 i4,
|
|
96
|
+
const __m512 i5,
|
|
97
|
+
const __m512 i6,
|
|
98
|
+
const __m512 i7,
|
|
99
|
+
__m512& o0,
|
|
100
|
+
__m512& o1,
|
|
101
|
+
__m512& o2,
|
|
102
|
+
__m512& o3,
|
|
103
|
+
__m512& o4,
|
|
104
|
+
__m512& o5,
|
|
105
|
+
__m512& o6,
|
|
106
|
+
__m512& o7) {
|
|
107
|
+
// assume that we have the following input:
|
|
108
|
+
// i0: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
|
|
109
|
+
// i1: 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
|
|
110
|
+
// i2: 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
|
|
111
|
+
// i3: 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
|
|
112
|
+
// i4: 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
|
|
113
|
+
// i5: 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
|
|
114
|
+
// i6: 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
|
|
115
|
+
// i7: 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
|
|
116
|
+
|
|
117
|
+
// 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29
|
|
118
|
+
const __m512 r0 = _mm512_unpacklo_ps(i0, i1);
|
|
119
|
+
// 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31
|
|
120
|
+
const __m512 r1 = _mm512_unpackhi_ps(i0, i1);
|
|
121
|
+
// 32 48 33 49 36 52 37 53 40 56 41 57 44 60 45 61
|
|
122
|
+
const __m512 r2 = _mm512_unpacklo_ps(i2, i3);
|
|
123
|
+
// 34 50 35 51 38 54 39 55 42 58 43 59 46 62 47 63
|
|
124
|
+
const __m512 r3 = _mm512_unpackhi_ps(i2, i3);
|
|
125
|
+
// 64 80 65 81 68 84 69 85 72 88 73 89 76 92 77 93
|
|
126
|
+
const __m512 r4 = _mm512_unpacklo_ps(i4, i5);
|
|
127
|
+
// 66 82 67 83 70 86 71 87 74 90 75 91 78 94 79 95
|
|
128
|
+
const __m512 r5 = _mm512_unpackhi_ps(i4, i5);
|
|
129
|
+
// 96 112 97 113 100 116 101 117 104 120 105 121 108 124 109 125
|
|
130
|
+
const __m512 r6 = _mm512_unpacklo_ps(i6, i7);
|
|
131
|
+
// 98 114 99 115 102 118 103 119 106 122 107 123 110 126 111 127
|
|
132
|
+
const __m512 r7 = _mm512_unpackhi_ps(i6, i7);
|
|
133
|
+
|
|
134
|
+
// 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60
|
|
135
|
+
const __m512 t0 = _mm512_shuffle_ps(r0, r2, _MM_SHUFFLE(1, 0, 1, 0));
|
|
136
|
+
// 1 17 33 49 5 21 37 53 9 25 41 57 13 29 45 61
|
|
137
|
+
const __m512 t1 = _mm512_shuffle_ps(r0, r2, _MM_SHUFFLE(3, 2, 3, 2));
|
|
138
|
+
// 2 18 34 50 6 22 38 54 10 26 42 58 14 30 46 62
|
|
139
|
+
const __m512 t2 = _mm512_shuffle_ps(r1, r3, _MM_SHUFFLE(1, 0, 1, 0));
|
|
140
|
+
// 3 19 35 51 7 23 39 55 11 27 43 59 15 31 47 63
|
|
141
|
+
const __m512 t3 = _mm512_shuffle_ps(r1, r3, _MM_SHUFFLE(3, 2, 3, 2));
|
|
142
|
+
// 64 80 96 112 68 84 100 116 72 88 104 120 76 92 108 124
|
|
143
|
+
const __m512 t4 = _mm512_shuffle_ps(r4, r6, _MM_SHUFFLE(1, 0, 1, 0));
|
|
144
|
+
// 65 81 97 113 69 85 101 117 73 89 105 121 77 93 109 125
|
|
145
|
+
const __m512 t5 = _mm512_shuffle_ps(r4, r6, _MM_SHUFFLE(3, 2, 3, 2));
|
|
146
|
+
// 66 82 98 114 70 86 102 118 74 90 106 122 78 94 110 126
|
|
147
|
+
const __m512 t6 = _mm512_shuffle_ps(r5, r7, _MM_SHUFFLE(1, 0, 1, 0));
|
|
148
|
+
// 67 83 99 115 71 87 103 119 75 91 107 123 79 95 111 127
|
|
149
|
+
const __m512 t7 = _mm512_shuffle_ps(r5, r7, _MM_SHUFFLE(3, 2, 3, 2));
|
|
150
|
+
|
|
151
|
+
const __m512i idx0 = _mm512_set_epi32(
|
|
152
|
+
27, 19, 26, 18, 25, 17, 24, 16, 11, 3, 10, 2, 9, 1, 8, 0);
|
|
153
|
+
const __m512i idx1 = _mm512_set_epi32(
|
|
154
|
+
31, 23, 30, 22, 29, 21, 28, 20, 15, 7, 14, 6, 13, 5, 12, 4);
|
|
155
|
+
|
|
156
|
+
// 0 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120
|
|
157
|
+
o0 = _mm512_permutex2var_ps(t0, idx0, t4);
|
|
158
|
+
// 1 9 17 25 33 41 49 57 65 73 81 89 97 105 113 121
|
|
159
|
+
o1 = _mm512_permutex2var_ps(t1, idx0, t5);
|
|
160
|
+
// 2 10 18 26 34 42 50 58 66 74 82 90 98 106 114 122
|
|
161
|
+
o2 = _mm512_permutex2var_ps(t2, idx0, t6);
|
|
162
|
+
// 3 11 19 27 35 43 51 59 67 75 83 91 99 107 115 123
|
|
163
|
+
o3 = _mm512_permutex2var_ps(t3, idx0, t7);
|
|
164
|
+
// 4 12 20 28 36 44 52 60 68 76 84 92 100 108 116 124
|
|
165
|
+
o4 = _mm512_permutex2var_ps(t0, idx1, t4);
|
|
166
|
+
// 5 13 21 29 37 45 53 61 69 77 85 93 101 109 117 125
|
|
167
|
+
o5 = _mm512_permutex2var_ps(t1, idx1, t5);
|
|
168
|
+
// 6 14 22 30 38 46 54 62 70 78 86 94 102 110 118 126
|
|
169
|
+
o6 = _mm512_permutex2var_ps(t2, idx1, t6);
|
|
170
|
+
// 7 15 23 31 39 47 55 63 71 79 87 95 103 111 119 127
|
|
171
|
+
o7 = _mm512_permutex2var_ps(t3, idx1, t7);
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
} // namespace faiss
|
|
175
|
+
|
|
176
|
+
#endif
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
|
|
8
8
|
// -*- c++ -*-
|
|
9
9
|
|
|
10
|
+
#include <faiss/Index.h>
|
|
10
11
|
#include <faiss/utils/utils.h>
|
|
11
12
|
|
|
12
13
|
#include <cassert>
|
|
@@ -28,6 +29,8 @@
|
|
|
28
29
|
#include <omp.h>
|
|
29
30
|
|
|
30
31
|
#include <algorithm>
|
|
32
|
+
#include <set>
|
|
33
|
+
#include <type_traits>
|
|
31
34
|
#include <vector>
|
|
32
35
|
|
|
33
36
|
#include <faiss/impl/AuxIndexStructures.h>
|
|
@@ -101,6 +104,9 @@ int sgemv_(
|
|
|
101
104
|
|
|
102
105
|
namespace faiss {
|
|
103
106
|
|
|
107
|
+
// this will be set at load time from GPU Faiss
|
|
108
|
+
std::string gpu_compile_options;
|
|
109
|
+
|
|
104
110
|
std::string get_compile_options() {
|
|
105
111
|
std::string options;
|
|
106
112
|
|
|
@@ -109,17 +115,27 @@ std::string get_compile_options() {
|
|
|
109
115
|
options += "OPTIMIZE ";
|
|
110
116
|
#endif
|
|
111
117
|
|
|
112
|
-
#ifdef
|
|
113
|
-
options += "
|
|
118
|
+
#ifdef __AVX512F__
|
|
119
|
+
options += "AVX512 ";
|
|
120
|
+
#elif defined(__AVX2__)
|
|
121
|
+
options += "AVX2 ";
|
|
122
|
+
#elif defined(__ARM_FEATURE_SVE)
|
|
123
|
+
options += "SVE NEON ";
|
|
114
124
|
#elif defined(__aarch64__)
|
|
115
|
-
options += "NEON";
|
|
125
|
+
options += "NEON ";
|
|
116
126
|
#else
|
|
117
|
-
options += "GENERIC";
|
|
127
|
+
options += "GENERIC ";
|
|
118
128
|
#endif
|
|
119
129
|
|
|
130
|
+
options += gpu_compile_options;
|
|
131
|
+
|
|
120
132
|
return options;
|
|
121
133
|
}
|
|
122
134
|
|
|
135
|
+
std::string get_version() {
|
|
136
|
+
return VERSION_STRING;
|
|
137
|
+
}
|
|
138
|
+
|
|
123
139
|
#ifdef _MSC_VER
|
|
124
140
|
double getmillisecs() {
|
|
125
141
|
LARGE_INTEGER ts;
|
|
@@ -423,15 +439,35 @@ void bincode_hist(size_t n, size_t nbits, const uint8_t* codes, int* hist) {
|
|
|
423
439
|
}
|
|
424
440
|
}
|
|
425
441
|
|
|
426
|
-
|
|
427
|
-
const uint32_t* a = reinterpret_cast<const uint32_t*>(
|
|
428
|
-
|
|
442
|
+
uint64_t ivec_checksum(size_t n, const int32_t* assigned) {
|
|
443
|
+
const uint32_t* a = reinterpret_cast<const uint32_t*>(assigned);
|
|
444
|
+
uint64_t cs = 112909;
|
|
429
445
|
while (n--) {
|
|
430
446
|
cs = cs * 65713 + a[n] * 1686049;
|
|
431
447
|
}
|
|
432
448
|
return cs;
|
|
433
449
|
}
|
|
434
450
|
|
|
451
|
+
uint64_t bvec_checksum(size_t n, const uint8_t* a) {
|
|
452
|
+
uint64_t cs = ivec_checksum(n / 4, (const int32_t*)a);
|
|
453
|
+
for (size_t i = n / 4 * 4; i < n; i++) {
|
|
454
|
+
cs = cs * 65713 + a[n] * 1686049;
|
|
455
|
+
}
|
|
456
|
+
return cs;
|
|
457
|
+
}
|
|
458
|
+
|
|
459
|
+
void bvecs_checksum(size_t n, size_t d, const uint8_t* a, uint64_t* cs) {
|
|
460
|
+
// MSVC can't accept unsigned index for #pragma omp parallel for
|
|
461
|
+
// so below codes only accept n <= std::numeric_limits<ssize_t>::max()
|
|
462
|
+
using ssize_t = std::make_signed<std::size_t>::type;
|
|
463
|
+
const ssize_t size = n;
|
|
464
|
+
#pragma omp parallel for if (size > 1000)
|
|
465
|
+
for (ssize_t i_ = 0; i_ < size; i_++) {
|
|
466
|
+
const auto i = static_cast<std::size_t>(i_);
|
|
467
|
+
cs[i] = bvec_checksum(d, a + i * d);
|
|
468
|
+
}
|
|
469
|
+
}
|
|
470
|
+
|
|
435
471
|
const float* fvecs_maybe_subsample(
|
|
436
472
|
size_t d,
|
|
437
473
|
size_t* n,
|
|
@@ -528,4 +564,81 @@ bool check_openmp() {
|
|
|
528
564
|
return true;
|
|
529
565
|
}
|
|
530
566
|
|
|
567
|
+
namespace {
|
|
568
|
+
|
|
569
|
+
template <typename T>
|
|
570
|
+
int64_t count_lt(int64_t n, const T* row, T threshold) {
|
|
571
|
+
for (int64_t i = 0; i < n; i++) {
|
|
572
|
+
if (!(row[i] < threshold)) {
|
|
573
|
+
return i;
|
|
574
|
+
}
|
|
575
|
+
}
|
|
576
|
+
return n;
|
|
577
|
+
}
|
|
578
|
+
|
|
579
|
+
template <typename T>
|
|
580
|
+
int64_t count_gt(int64_t n, const T* row, T threshold) {
|
|
581
|
+
for (int64_t i = 0; i < n; i++) {
|
|
582
|
+
if (!(row[i] > threshold)) {
|
|
583
|
+
return i;
|
|
584
|
+
}
|
|
585
|
+
}
|
|
586
|
+
return n;
|
|
587
|
+
}
|
|
588
|
+
|
|
589
|
+
} // namespace
|
|
590
|
+
|
|
591
|
+
template <typename T>
|
|
592
|
+
void CombinerRangeKNN<T>::compute_sizes(int64_t* L_res_2) {
|
|
593
|
+
this->L_res = L_res_2;
|
|
594
|
+
L_res_2[0] = 0;
|
|
595
|
+
int64_t j = 0;
|
|
596
|
+
for (int64_t i = 0; i < nq; i++) {
|
|
597
|
+
int64_t n_in;
|
|
598
|
+
if (!mask || !mask[i]) {
|
|
599
|
+
const T* row = D + i * k;
|
|
600
|
+
n_in = keep_max ? count_gt(k, row, r2) : count_lt(k, row, r2);
|
|
601
|
+
} else {
|
|
602
|
+
n_in = lim_remain[j + 1] - lim_remain[j];
|
|
603
|
+
j++;
|
|
604
|
+
}
|
|
605
|
+
L_res_2[i + 1] = n_in; // L_res_2[i] + n_in;
|
|
606
|
+
}
|
|
607
|
+
// cumsum
|
|
608
|
+
for (int64_t i = 0; i < nq; i++) {
|
|
609
|
+
L_res_2[i + 1] += L_res_2[i];
|
|
610
|
+
}
|
|
611
|
+
}
|
|
612
|
+
|
|
613
|
+
template <typename T>
|
|
614
|
+
void CombinerRangeKNN<T>::write_result(T* D_res, int64_t* I_res) {
|
|
615
|
+
FAISS_THROW_IF_NOT(L_res);
|
|
616
|
+
int64_t j = 0;
|
|
617
|
+
for (int64_t i = 0; i < nq; i++) {
|
|
618
|
+
int64_t n_in = L_res[i + 1] - L_res[i];
|
|
619
|
+
T* D_row = D_res + L_res[i];
|
|
620
|
+
int64_t* I_row = I_res + L_res[i];
|
|
621
|
+
if (!mask || !mask[i]) {
|
|
622
|
+
memcpy(D_row, D + i * k, n_in * sizeof(*D_row));
|
|
623
|
+
memcpy(I_row, I + i * k, n_in * sizeof(*I_row));
|
|
624
|
+
} else {
|
|
625
|
+
memcpy(D_row, D_remain + lim_remain[j], n_in * sizeof(*D_row));
|
|
626
|
+
memcpy(I_row, I_remain + lim_remain[j], n_in * sizeof(*I_row));
|
|
627
|
+
j++;
|
|
628
|
+
}
|
|
629
|
+
}
|
|
630
|
+
}
|
|
631
|
+
|
|
632
|
+
// explicit template instantiations: float and int16_t are the only
// distance types this translation unit provides to callers
template struct CombinerRangeKNN<float>;
template struct CombinerRangeKNN<int16_t>;
|
|
635
|
+
|
|
636
|
+
void CodeSet::insert(size_t n, const uint8_t* codes, bool* inserted) {
|
|
637
|
+
for (size_t i = 0; i < n; i++) {
|
|
638
|
+
auto res = s.insert(
|
|
639
|
+
std::vector<uint8_t>(codes + i * d, codes + i * d + d));
|
|
640
|
+
inserted[i] = res.second;
|
|
641
|
+
}
|
|
642
|
+
}
|
|
643
|
+
|
|
531
644
|
} // namespace faiss
|
|
@@ -17,7 +17,9 @@
|
|
|
17
17
|
#define FAISS_utils_h
|
|
18
18
|
|
|
19
19
|
#include <stdint.h>
|
|
20
|
+
#include <set>
|
|
20
21
|
#include <string>
|
|
22
|
+
#include <vector>
|
|
21
23
|
|
|
22
24
|
#include <faiss/impl/platform_macros.h>
|
|
23
25
|
#include <faiss/utils/Heap.h>
|
|
@@ -35,6 +37,9 @@ std::string get_compile_options();
|
|
|
35
37
|
* Get some stats about the system
|
|
36
38
|
**************************************************/
|
|
37
39
|
|
|
40
|
+
// Expose FAISS version as a string
|
|
41
|
+
std::string get_version();
|
|
42
|
+
|
|
38
43
|
/// ms elapsed since some arbitrary epoch
|
|
39
44
|
double getmillisecs();
|
|
40
45
|
|
|
@@ -47,25 +52,6 @@ uint64_t get_cycles();
|
|
|
47
52
|
* Misc matrix and vector manipulation functions
|
|
48
53
|
***************************************************************************/
|
|
49
54
|
|
|
50
|
-
/** compute c := a + bf * b for a, b and c tables
|
|
51
|
-
*
|
|
52
|
-
* @param n size of the tables
|
|
53
|
-
* @param a size n
|
|
54
|
-
* @param b size n
|
|
55
|
-
* @param c restult table, size n
|
|
56
|
-
*/
|
|
57
|
-
void fvec_madd(size_t n, const float* a, float bf, const float* b, float* c);
|
|
58
|
-
|
|
59
|
-
/** same as fvec_madd, also return index of the min of the result table
|
|
60
|
-
* @return index of the min of table c
|
|
61
|
-
*/
|
|
62
|
-
int fvec_madd_and_argmin(
|
|
63
|
-
size_t n,
|
|
64
|
-
const float* a,
|
|
65
|
-
float bf,
|
|
66
|
-
const float* b,
|
|
67
|
-
float* c);
|
|
68
|
-
|
|
69
55
|
/* perform a reflection (not an efficient implementation, just for test ) */
|
|
70
56
|
void reflection(const float* u, float* x, size_t n, size_t d, size_t nu);
|
|
71
57
|
|
|
@@ -121,7 +107,19 @@ int ivec_hist(size_t n, const int* v, int vmax, int* hist);
|
|
|
121
107
|
void bincode_hist(size_t n, size_t nbits, const uint8_t* codes, int* hist);
|
|
122
108
|
|
|
123
109
|
/// compute a checksum on a table.
|
|
124
|
-
|
|
110
|
+
uint64_t ivec_checksum(size_t n, const int32_t* a);
|
|
111
|
+
|
|
112
|
+
/// compute a checksum on a table.
|
|
113
|
+
uint64_t bvec_checksum(size_t n, const uint8_t* a);
|
|
114
|
+
|
|
115
|
+
/** compute checksums for the rows of a matrix
|
|
116
|
+
*
|
|
117
|
+
* @param n number of rows
|
|
118
|
+
* @param d size per row
|
|
119
|
+
* @param a matrix to handle, size n * d
|
|
120
|
+
* @param cs output checksums, size n
|
|
121
|
+
*/
|
|
122
|
+
void bvecs_checksum(size_t n, size_t d, const uint8_t* a, uint64_t* cs);
|
|
125
123
|
|
|
126
124
|
/** random subsamples a set of vectors if there are too many of them
|
|
127
125
|
*
|
|
@@ -163,6 +161,48 @@ uint64_t hash_bytes(const uint8_t* bytes, int64_t n);
|
|
|
163
161
|
/** Whether OpenMP annotations were respected. */
|
|
164
162
|
bool check_openmp();
|
|
165
163
|
|
|
164
|
+
/** This class is used to combine range and knn search results
 * in contrib.exhaustive_search.range_search_gpu.
 *
 * Usage: fill in the knn result fields (I, D) and optionally the
 * range-search fields (mask, lim_remain, D_remain, I_remain), then
 * call compute_sizes() and finally write_result(). */

template <typename T>
struct CombinerRangeKNN {
    int64_t nq;    ///< nb of queries
    size_t k;      ///< number of neighbors for the knn search part
    T r2;          ///< range search radius
    bool keep_max; ///< whether to keep max values instead of min.

    CombinerRangeKNN(int64_t nq, size_t k, T r2, bool keep_max)
            : nq(nq), k(k), r2(r2), keep_max(keep_max) {}

    /// Knn search results
    const int64_t* I = nullptr; ///< size nq * k
    const T* D = nullptr;       ///< size nq * k

    /// optional: range search results (ignored if mask is NULL)
    const bool* mask =
            nullptr; ///< mask for where knn results are valid, size nq
    // range search results for remaining entries nrange = sum(mask)
    const int64_t* lim_remain = nullptr; ///< size nrange + 1
    const T* D_remain = nullptr;         ///< size lim_remain[nrange]
    const int64_t* I_remain = nullptr;   ///< size lim_remain[nrange]

    const int64_t* L_res = nullptr; ///< size nq + 1
    // Phase 1: compute sizes into limits array (of size nq + 1)
    void compute_sizes(int64_t* L_res);

    /// Phase 2: caller allocates D_res and I_res (size L_res[nq])
    /// Phase 3: fill in D_res and I_res
    void write_result(T* D_res, int64_t* I_res);
};
|
|
197
|
+
|
|
198
|
+
/** Set of fixed-size binary codes, used to detect duplicates
 * (see insert() in the .cpp: inserted[i] reports whether code i
 * was new). */
struct CodeSet {
    // size of each code, in bytes
    size_t d;
    // codes seen so far, each stored as a d-byte vector
    std::set<std::vector<uint8_t>> s;

    explicit CodeSet(size_t d) : d(d) {}
    /** Insert n codes of size d; inserted[i] (size n) is set to true
     * iff code i was not already present. */
    void insert(size_t n, const uint8_t* codes, bool* inserted);
};
|
|
205
|
+
|
|
166
206
|
} // namespace faiss
|
|
167
207
|
|
|
168
208
|
#endif /* FAISS_utils_h */
|