RubyGems - faiss - Versions diffs - 0.2.3 → 0.2.4 - Mend

faiss 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (63)

checksums.yaml +4 -4
data/CHANGELOG.md +4 -0
data/LICENSE.txt +1 -1
data/lib/faiss/version.rb +1 -1
data/vendor/faiss/faiss/Clustering.cpp +32 -0
data/vendor/faiss/faiss/Clustering.h +14 -0
data/vendor/faiss/faiss/Index.h +1 -1
data/vendor/faiss/faiss/Index2Layer.cpp +19 -92
data/vendor/faiss/faiss/Index2Layer.h +2 -16
data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +407 -0
data/vendor/faiss/faiss/{IndexResidual.h → IndexAdditiveQuantizer.h} +101 -58
data/vendor/faiss/faiss/IndexFlat.cpp +22 -52
data/vendor/faiss/faiss/IndexFlat.h +9 -15
data/vendor/faiss/faiss/IndexFlatCodes.cpp +67 -0
data/vendor/faiss/faiss/IndexFlatCodes.h +47 -0
data/vendor/faiss/faiss/IndexIVF.cpp +79 -7
data/vendor/faiss/faiss/IndexIVF.h +25 -7
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +316 -0
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +121 -0
data/vendor/faiss/faiss/IndexIVFFlat.cpp +9 -12
data/vendor/faiss/faiss/IndexIVFPQ.cpp +5 -4
data/vendor/faiss/faiss/IndexIVFPQ.h +1 -1
data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +60 -39
data/vendor/faiss/faiss/IndexIVFSpectralHash.h +21 -6
data/vendor/faiss/faiss/IndexLSH.cpp +4 -30
data/vendor/faiss/faiss/IndexLSH.h +2 -15
data/vendor/faiss/faiss/IndexNNDescent.cpp +0 -2
data/vendor/faiss/faiss/IndexNSG.cpp +0 -2
data/vendor/faiss/faiss/IndexPQ.cpp +2 -51
data/vendor/faiss/faiss/IndexPQ.h +2 -17
data/vendor/faiss/faiss/IndexRefine.cpp +28 -0
data/vendor/faiss/faiss/IndexRefine.h +10 -0
data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +2 -28
data/vendor/faiss/faiss/IndexScalarQuantizer.h +2 -16
data/vendor/faiss/faiss/VectorTransform.cpp +2 -1
data/vendor/faiss/faiss/VectorTransform.h +3 -0
data/vendor/faiss/faiss/clone_index.cpp +3 -2
data/vendor/faiss/faiss/gpu/GpuCloner.cpp +2 -2
data/vendor/faiss/faiss/gpu/GpuIcmEncoder.h +60 -0
data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +257 -24
data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +69 -9
data/vendor/faiss/faiss/impl/HNSW.cpp +10 -5
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +393 -210
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +100 -28
data/vendor/faiss/faiss/impl/NSG.cpp +0 -3
data/vendor/faiss/faiss/impl/NSG.h +1 -1
data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +357 -47
data/vendor/faiss/faiss/impl/ResidualQuantizer.h +65 -7
data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +12 -19
data/vendor/faiss/faiss/impl/index_read.cpp +102 -19
data/vendor/faiss/faiss/impl/index_write.cpp +66 -16
data/vendor/faiss/faiss/impl/io.cpp +1 -1
data/vendor/faiss/faiss/impl/io_macros.h +20 -0
data/vendor/faiss/faiss/impl/kmeans1d.cpp +301 -0
data/vendor/faiss/faiss/impl/kmeans1d.h +48 -0
data/vendor/faiss/faiss/index_factory.cpp +585 -414
data/vendor/faiss/faiss/index_factory.h +3 -0
data/vendor/faiss/faiss/utils/distances.cpp +4 -2
data/vendor/faiss/faiss/utils/distances.h +36 -3
data/vendor/faiss/faiss/utils/distances_simd.cpp +50 -0
data/vendor/faiss/faiss/utils/utils.h +1 -1
metadata +12 -5
data/vendor/faiss/faiss/IndexResidual.cpp +0 -291

data/vendor/faiss/faiss/index_factory.h CHANGED

@@ -19,6 +19,9 @@ Index* index_factory(
         const char* description,
         MetricType metric = METRIC_L2);
+/// set to > 0 to get more logs from index_factory
+FAISS_API extern int index_factory_verbose;
 IndexBinary* index_binary_factory(int d, const char* description);
 } // namespace faiss

data/vendor/faiss/faiss/utils/distances.cpp CHANGED

@@ -105,8 +105,9 @@ void exhaustive_inner_product_seq(
         size_t ny,
         ResultHandler& res) {
     using SingleResultHandler = typename ResultHandler::SingleResultHandler;
+    int nt = std::min(int(nx), omp_get_max_threads());
-#pragma omp parallel
+#pragma omp parallel num_threads(nt)
     {
         SingleResultHandler resi(res);
 #pragma omp for
@@ -135,8 +136,9 @@ void exhaustive_L2sqr_seq(
         size_t ny,
         ResultHandler& res) {
     using SingleResultHandler = typename ResultHandler::SingleResultHandler;
+    int nt = std::min(int(nx), omp_get_max_threads());
-#pragma omp parallel
+#pragma omp parallel num_threads(nt)
     {
         SingleResultHandler resi(res);
 #pragma omp for

data/vendor/faiss/faiss/utils/distances.h CHANGED

@@ -40,7 +40,7 @@ float fvec_Linf(const float* x, const float* y, size_t d);
  * @param nq    nb of query vectors
  * @param nb    nb of database vectors
  * @param xq    query vectors (size nq * d)
- * @param xb    database vectros (size nb * d)
+ * @param xb    database vectors (size nb * d)
  * @param dis   output distances (size nq * nb)
  * @param ldq,ldb, ldd strides for the matrices
  */
@@ -63,7 +63,7 @@ void fvec_inner_products_ny(
         size_t d,
         size_t ny);
-/* compute ny square L2 distance bewteen x and a set of contiguous y vectors */
+/* compute ny square L2 distance between x and a set of contiguous y vectors */
 void fvec_L2sqr_ny(
         float* dis,
         const float* x,
@@ -87,7 +87,7 @@ void fvec_norms_L2sqr(float* norms, const float* x, size_t d, size_t nx);
 /* L2-renormalize a set of vector. Nothing done if the vector is 0-normed */
 void fvec_renorm_L2(size_t d, size_t nx, float* x);
-/* This function exists because the Torch counterpart is extremly slow
+/* This function exists because the Torch counterpart is extremely slow
    (not multi-threaded + unexpected overhead even in single thread).
    It is here to implement the usual property |x-y|^2=|x|^2+|y|^2-2<x|y>  */
 void inner_product_to_L2sqr(
@@ -97,6 +97,39 @@ void inner_product_to_L2sqr(
         size_t n1,
         size_t n2);
+/*********************************************************
+ * Vector to vector functions
+ *********************************************************/
+/** compute c := a + b for vectors
+ *
+ * c and a can overlap, c and b can overlap
+ *
+ * @param a size d
+ * @param b size d
+ * @param c size d
+ */
+void fvec_add(size_t d, const float* a, const float* b, float* c);
+/** compute c := a + b for a, c vectors and b a scalar
+ *
+ * c and a can overlap
+ *
+ * @param a size d
+ * @param c size d
+ */
+void fvec_add(size_t d, const float* a, float b, float* c);
+/** compute c := a - b for vectors
+ *
+ * c and a can overlap, c and b can overlap
+ *
+ * @param a size d
+ * @param b size d
+ * @param c size d
+ */
+void fvec_sub(size_t d, const float* a, const float* b, float* c);
 /***************************************************************************
  * Compute a subset of  distances
  ***************************************************************************/

data/vendor/faiss/faiss/utils/distances_simd.cpp CHANGED

@@ -9,6 +9,7 @@
 #include <faiss/utils/distances.h>
+#include <algorithm>
 #include <cassert>
 #include <cmath>
 #include <cstdio>
@@ -973,4 +974,53 @@ void compute_PQ_dis_tables_dsub2(
     }
 }
+/*********************************************************
+ * Vector to vector functions
+ *********************************************************/
+void fvec_sub(size_t d, const float* a, const float* b, float* c) {
+    size_t i;
+    for (i = 0; i + 7 < d; i += 8) {
+        simd8float32 ci, ai, bi;
+        ai.loadu(a + i);
+        bi.loadu(b + i);
+        ci = ai - bi;
+        ci.storeu(c + i);
+    }
+    // finish non-multiple of 8 remainder
+    for (; i < d; i++) {
+        c[i] = a[i] - b[i];
+    }
+}
+void fvec_add(size_t d, const float* a, const float* b, float* c) {
+    size_t i;
+    for (i = 0; i + 7 < d; i += 8) {
+        simd8float32 ci, ai, bi;
+        ai.loadu(a + i);
+        bi.loadu(b + i);
+        ci = ai + bi;
+        ci.storeu(c + i);
+    }
+    // finish non-multiple of 8 remainder
+    for (; i < d; i++) {
+        c[i] = a[i] + b[i];
+    }
+}
+void fvec_add(size_t d, const float* a, float b, float* c) {
+    size_t i;
+    simd8float32 bv(b);
+    for (i = 0; i + 7 < d; i += 8) {
+        simd8float32 ci, ai, bi;
+        ai.loadu(a + i);
+        ci = ai + bv;
+        ci.storeu(c + i);
+    }
+    // finish non-multiple of 8 remainder
+    for (; i < d; i++) {
+        c[i] = a[i] + b;
+    }
+}
 } // namespace faiss

data/vendor/faiss/faiss/utils/utils.h CHANGED

@@ -80,7 +80,7 @@ void matrix_qr(int m, int n, float* a);
 /** distances are supposed to be sorted. Sorts indices with same distance*/
 void ranklist_handle_ties(int k, int64_t* idx, const float* dis);
-/** count the number of comon elements between v1 and v2
+/** count the number of common elements between v1 and v2
  * algorithm = sorting + bissection to avoid double-counting duplicates
  */
 size_t ranklist_intersection_size(

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: faiss
 version: !ruby/object:Gem::Version
-  version: 0.2.3
+  version: 0.2.4
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-12-17 00:00:00.000000000 Z
+date: 2022-01-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rice
@@ -71,6 +71,8 @@ files:
 - vendor/faiss/faiss/Index.h
 - vendor/faiss/faiss/Index2Layer.cpp
 - vendor/faiss/faiss/Index2Layer.h
+- vendor/faiss/faiss/IndexAdditiveQuantizer.cpp
+- vendor/faiss/faiss/IndexAdditiveQuantizer.h
 - vendor/faiss/faiss/IndexBinary.cpp
 - vendor/faiss/faiss/IndexBinary.h
 - vendor/faiss/faiss/IndexBinaryFlat.cpp
@@ -85,10 +87,14 @@ files:
 - vendor/faiss/faiss/IndexBinaryIVF.h
 - vendor/faiss/faiss/IndexFlat.cpp
 - vendor/faiss/faiss/IndexFlat.h
+- vendor/faiss/faiss/IndexFlatCodes.cpp
+- vendor/faiss/faiss/IndexFlatCodes.h
 - vendor/faiss/faiss/IndexHNSW.cpp
 - vendor/faiss/faiss/IndexHNSW.h
 - vendor/faiss/faiss/IndexIVF.cpp
 - vendor/faiss/faiss/IndexIVF.h
+- vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp
+- vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h
 - vendor/faiss/faiss/IndexIVFFlat.cpp
 - vendor/faiss/faiss/IndexIVFFlat.h
 - vendor/faiss/faiss/IndexIVFPQ.cpp
@@ -117,8 +123,6 @@ files:
 - vendor/faiss/faiss/IndexRefine.h
 - vendor/faiss/faiss/IndexReplicas.cpp
 - vendor/faiss/faiss/IndexReplicas.h
-- vendor/faiss/faiss/IndexResidual.cpp
-- vendor/faiss/faiss/IndexResidual.h
 - vendor/faiss/faiss/IndexScalarQuantizer.cpp
 - vendor/faiss/faiss/IndexScalarQuantizer.h
 - vendor/faiss/faiss/IndexShards.cpp
@@ -140,6 +144,7 @@ files:
 - vendor/faiss/faiss/gpu/GpuClonerOptions.h
 - vendor/faiss/faiss/gpu/GpuDistance.h
 - vendor/faiss/faiss/gpu/GpuFaissAssert.h
+- vendor/faiss/faiss/gpu/GpuIcmEncoder.h
 - vendor/faiss/faiss/gpu/GpuIndex.h
 - vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h
 - vendor/faiss/faiss/gpu/GpuIndexFlat.h
@@ -209,6 +214,8 @@ files:
 - vendor/faiss/faiss/impl/io.cpp
 - vendor/faiss/faiss/impl/io.h
 - vendor/faiss/faiss/impl/io_macros.h
+- vendor/faiss/faiss/impl/kmeans1d.cpp
+- vendor/faiss/faiss/impl/kmeans1d.h
 - vendor/faiss/faiss/impl/lattice_Zn.cpp
 - vendor/faiss/faiss/impl/lattice_Zn.h
 - vendor/faiss/faiss/impl/platform_macros.h
@@ -278,7 +285,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.2.32
+rubygems_version: 3.3.3
 signing_key:
 specification_version: 4
 summary: Efficient similarity search and clustering for Ruby

data/vendor/faiss/faiss/IndexResidual.cpp DELETED

@@ -1,291 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <faiss/IndexResidual.h>
-#include <algorithm>
-#include <cmath>
-#include <cstring>
-#include <faiss/impl/FaissAssert.h>
-#include <faiss/impl/ResultHandler.h>
-#include <faiss/utils/distances.h>
-#include <faiss/utils/extra_distances.h>
-#include <faiss/utils/utils.h>
-namespace faiss {
-/**************************************************************************************
- * IndexResidual
- **************************************************************************************/
-IndexResidual::IndexResidual(
-        int d,        ///< dimensionality of the input vectors
-        size_t M,     ///< number of subquantizers
-        size_t nbits, ///< number of bit per subvector index
-        MetricType metric,
-        Search_type_t search_type_in)
-        : Index(d, metric), rq(d, M, nbits), search_type(ST_decompress) {
-    is_trained = false;
-    norm_max = norm_min = NAN;
-    set_search_type(search_type_in);
-}
-IndexResidual::IndexResidual(
-        int d,
-        const std::vector<size_t>& nbits,
-        MetricType metric,
-        Search_type_t search_type_in)
-        : Index(d, metric), rq(d, nbits), search_type(ST_decompress) {
-    is_trained = false;
-    norm_max = norm_min = NAN;
-    set_search_type(search_type_in);
-}
-IndexResidual::IndexResidual() : IndexResidual(0, 0, 0) {}
-void IndexResidual::set_search_type(Search_type_t new_search_type) {
-    int norm_bits = new_search_type == ST_norm_float ? 32
-            : new_search_type == ST_norm_qint8       ? 8
-                                                     : 0;
-    FAISS_THROW_IF_NOT(ntotal == 0);
-    search_type = new_search_type;
-    code_size = (rq.tot_bits + norm_bits + 7) / 8;
-}
-void IndexResidual::train(idx_t n, const float* x) {
-    rq.train(n, x);
-    std::vector<float> norms(n);
-    fvec_norms_L2sqr(norms.data(), x, d, n);
-    norm_min = HUGE_VALF;
-    norm_max = -HUGE_VALF;
-    for (idx_t i = 0; i < n; i++) {
-        if (norms[i] < norm_min) {
-            norm_min = norms[i];
-        }
-        if (norms[i] > norm_min) {
-            norm_max = norms[i];
-        }
-    }
-    is_trained = true;
-}
-void IndexResidual::add(idx_t n, const float* x) {
-    FAISS_THROW_IF_NOT(is_trained);
-    codes.resize((n + ntotal) * rq.code_size);
-    if (search_type == ST_decompress || search_type == ST_LUT_nonorm) {
-        rq.compute_codes(x, &codes[ntotal * rq.code_size], n);
-    } else {
-        // should compute codes + compute and quantize norms
-        FAISS_THROW_MSG("not implemented");
-    }
-    ntotal += n;
-}
-namespace {
-template <class VectorDistance, class ResultHandler>
-void search_with_decompress(
-        const IndexResidual& ir,
-        const float* xq,
-        VectorDistance& vd,
-        ResultHandler& res) {
-    const uint8_t* codes = ir.codes.data();
-    size_t ntotal = ir.ntotal;
-    size_t code_size = ir.code_size;
-    using SingleResultHandler = typename ResultHandler::SingleResultHandler;
-#pragma omp parallel for
-    for (int64_t q = 0; q < res.nq; q++) {
-        SingleResultHandler resi(res);
-        resi.begin(q);
-        std::vector<float> tmp(ir.d);
-        const float* x = xq + ir.d * q;
-        for (size_t i = 0; i < ntotal; i++) {
-            ir.rq.decode(codes + i * code_size, tmp.data(), 1);
-            float dis = vd(x, tmp.data());
-            resi.add_result(dis, i);
-        }
-        resi.end();
-    }
-}
-} // anonymous namespace
-void IndexResidual::search(
-        idx_t n,
-        const float* x,
-        idx_t k,
-        float* distances,
-        idx_t* labels) const {
-    if (search_type == ST_decompress) {
-        if (metric_type == METRIC_L2) {
-            using VD = VectorDistance<METRIC_L2>;
-            VD vd = {size_t(d), metric_arg};
-            HeapResultHandler<VD::C> rh(n, distances, labels, k);
-            search_with_decompress(*this, x, vd, rh);
-        } else if (metric_type == METRIC_INNER_PRODUCT) {
-            using VD = VectorDistance<METRIC_INNER_PRODUCT>;
-            VD vd = {size_t(d), metric_arg};
-            HeapResultHandler<VD::C> rh(n, distances, labels, k);
-            search_with_decompress(*this, x, vd, rh);
-        }
-    } else {
-        FAISS_THROW_MSG("not implemented");
-    }
-}
-void IndexResidual::reset() {
-    codes.clear();
-    ntotal = 0;
-}
-size_t IndexResidual::sa_code_size() const {
-    return code_size;
-}
-void IndexResidual::sa_encode(idx_t n, const float* x, uint8_t* bytes) const {
-    return rq.compute_codes(x, bytes, n);
-}
-void IndexResidual::sa_decode(idx_t n, const uint8_t* bytes, float* x) const {
-    return rq.decode(bytes, x, n);
-}
-/**************************************************************************************
- * ResidualCoarseQuantizer
- **************************************************************************************/
-ResidualCoarseQuantizer::ResidualCoarseQuantizer(
-        int d,        ///< dimensionality of the input vectors
-        size_t M,     ///< number of subquantizers
-        size_t nbits, ///< number of bit per subvector index
-        MetricType metric)
-        : Index(d, metric), rq(d, M, nbits), beam_factor(4.0) {
-    FAISS_THROW_IF_NOT(rq.tot_bits <= 63);
-    is_trained = false;
-}
-ResidualCoarseQuantizer::ResidualCoarseQuantizer(
-        int d,
-        const std::vector<size_t>& nbits,
-        MetricType metric)
-        : Index(d, metric), rq(d, nbits), beam_factor(4.0) {
-    FAISS_THROW_IF_NOT(rq.tot_bits <= 63);
-    is_trained = false;
-}
-ResidualCoarseQuantizer::ResidualCoarseQuantizer() {}
-void ResidualCoarseQuantizer::train(idx_t n, const float* x) {
-    rq.train(n, x);
-    is_trained = true;
-    ntotal = (idx_t)1 << rq.tot_bits;
-}
-void ResidualCoarseQuantizer::add(idx_t, const float*) {
-    FAISS_THROW_MSG("not applicable");
-}
-void ResidualCoarseQuantizer::set_beam_factor(float new_beam_factor) {
-    centroid_norms.resize(0);
-    beam_factor = new_beam_factor;
-    if (new_beam_factor > 0) {
-        FAISS_THROW_IF_NOT(new_beam_factor >= 1.0);
-        return;
-    }
-    if (metric_type == METRIC_L2) {
-        centroid_norms.resize((size_t)1 << rq.tot_bits);
-        rq.compute_centroid_norms(centroid_norms.data());
-    }
-}
-void ResidualCoarseQuantizer::search(
-        idx_t n,
-        const float* x,
-        idx_t k,
-        float* distances,
-        idx_t* labels) const {
-    if (beam_factor < 0) {
-        if (metric_type == METRIC_INNER_PRODUCT) {
-            rq.knn_exact_inner_product(n, x, k, distances, labels);
-        } else if (metric_type == METRIC_L2) {
-            FAISS_THROW_IF_NOT(centroid_norms.size() == ntotal);
-            rq.knn_exact_L2(n, x, k, distances, labels, centroid_norms.data());
-        }
-        return;
-    }
-    int beam_size = int(k * beam_factor);
-    size_t memory_per_point = rq.memory_per_point(beam_size);
-    /*
-    printf("mem per point %ld n=%d max_mem_distance=%ld mem_kb=%zd\n",
-        memory_per_point, int(n), rq.max_mem_distances, get_mem_usage_kb());
-    */
-    if (n > 1 && memory_per_point * n > rq.max_mem_distances) {
-        // then split queries to reduce temp memory
-        idx_t bs = rq.max_mem_distances / memory_per_point;
-        if (bs == 0) {
-            bs = 1; // otherwise we can't do much
-        }
-        if (verbose) {
-            printf("ResidualCoarseQuantizer::search: run %d searches in batches of size %d\n",
-                   int(n),
-                   int(bs));
-        }
-        for (idx_t i0 = 0; i0 < n; i0 += bs) {
-            idx_t i1 = std::min(n, i0 + bs);
-            search(i1 - i0, x + i0 * d, k, distances + i0 * k, labels + i0 * k);
-            InterruptCallback::check();
-        }
-        return;
-    }
-    std::vector<int32_t> codes(beam_size * rq.M * n);
-    std::vector<float> beam_distances(n * beam_size);
-    rq.refine_beam(
-            n, 1, x, beam_size, codes.data(), nullptr, beam_distances.data());
-#pragma omp parallel for if (n > 4000)
-    for (idx_t i = 0; i < n; i++) {
-        memcpy(distances + i * k,
-               beam_distances.data() + beam_size * i,
-               k * sizeof(distances[0]));
-        const int32_t* codes_i = codes.data() + beam_size * i * rq.M;
-        for (idx_t j = 0; j < k; j++) {
-            idx_t l = 0;
-            int shift = 0;
-            for (int m = 0; m < rq.M; m++) {
-                l |= (*codes_i++) << shift;
-                shift += rq.nbits[m];
-            }
-            labels[i * k + j] = l;
-        }
-    }
-}
-void ResidualCoarseQuantizer::reconstruct(idx_t key, float* recons) const {
-    rq.decode_64bit(key, recons);
-}
-void ResidualCoarseQuantizer::reset() {
-    FAISS_THROW_MSG("not applicable");
-}
-} // namespace faiss