faiss 0.2.7 → 0.3.1

Files changed (172)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +1 -1
  5. data/ext/faiss/extconf.rb +9 -2
  6. data/ext/faiss/index.cpp +1 -1
  7. data/ext/faiss/index_binary.cpp +2 -2
  8. data/ext/faiss/product_quantizer.cpp +1 -1
  9. data/lib/faiss/version.rb +1 -1
  10. data/lib/faiss.rb +1 -1
  11. data/vendor/faiss/faiss/AutoTune.cpp +7 -7
  12. data/vendor/faiss/faiss/AutoTune.h +0 -1
  13. data/vendor/faiss/faiss/Clustering.cpp +4 -18
  14. data/vendor/faiss/faiss/Clustering.h +31 -21
  15. data/vendor/faiss/faiss/IVFlib.cpp +22 -11
  16. data/vendor/faiss/faiss/Index.cpp +1 -1
  17. data/vendor/faiss/faiss/Index.h +20 -5
  18. data/vendor/faiss/faiss/Index2Layer.cpp +7 -7
  19. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +176 -166
  20. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +15 -15
  21. data/vendor/faiss/faiss/IndexBinary.cpp +9 -4
  22. data/vendor/faiss/faiss/IndexBinary.h +8 -19
  23. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +2 -1
  24. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +24 -31
  25. data/vendor/faiss/faiss/IndexBinaryHash.cpp +25 -50
  26. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +106 -187
  27. data/vendor/faiss/faiss/IndexFastScan.cpp +90 -159
  28. data/vendor/faiss/faiss/IndexFastScan.h +9 -8
  29. data/vendor/faiss/faiss/IndexFlat.cpp +195 -3
  30. data/vendor/faiss/faiss/IndexFlat.h +20 -1
  31. data/vendor/faiss/faiss/IndexFlatCodes.cpp +11 -0
  32. data/vendor/faiss/faiss/IndexFlatCodes.h +3 -1
  33. data/vendor/faiss/faiss/IndexHNSW.cpp +112 -316
  34. data/vendor/faiss/faiss/IndexHNSW.h +12 -48
  35. data/vendor/faiss/faiss/IndexIDMap.cpp +69 -28
  36. data/vendor/faiss/faiss/IndexIDMap.h +24 -2
  37. data/vendor/faiss/faiss/IndexIVF.cpp +159 -53
  38. data/vendor/faiss/faiss/IndexIVF.h +37 -5
  39. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +18 -26
  40. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +3 -2
  41. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +19 -46
  42. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -3
  43. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +433 -405
  44. data/vendor/faiss/faiss/IndexIVFFastScan.h +56 -26
  45. data/vendor/faiss/faiss/IndexIVFFlat.cpp +15 -5
  46. data/vendor/faiss/faiss/IndexIVFFlat.h +3 -2
  47. data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +172 -0
  48. data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.h +56 -0
  49. data/vendor/faiss/faiss/IndexIVFPQ.cpp +78 -122
  50. data/vendor/faiss/faiss/IndexIVFPQ.h +6 -7
  51. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +18 -50
  52. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +4 -3
  53. data/vendor/faiss/faiss/IndexIVFPQR.cpp +45 -29
  54. data/vendor/faiss/faiss/IndexIVFPQR.h +5 -2
  55. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +25 -27
  56. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +6 -6
  57. data/vendor/faiss/faiss/IndexLSH.cpp +14 -16
  58. data/vendor/faiss/faiss/IndexNNDescent.cpp +3 -4
  59. data/vendor/faiss/faiss/IndexNSG.cpp +11 -27
  60. data/vendor/faiss/faiss/IndexNSG.h +10 -10
  61. data/vendor/faiss/faiss/IndexPQ.cpp +72 -88
  62. data/vendor/faiss/faiss/IndexPQ.h +1 -4
  63. data/vendor/faiss/faiss/IndexPQFastScan.cpp +1 -1
  64. data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -31
  65. data/vendor/faiss/faiss/IndexRefine.cpp +49 -19
  66. data/vendor/faiss/faiss/IndexRefine.h +7 -0
  67. data/vendor/faiss/faiss/IndexReplicas.cpp +23 -26
  68. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +22 -16
  69. data/vendor/faiss/faiss/IndexScalarQuantizer.h +6 -4
  70. data/vendor/faiss/faiss/IndexShards.cpp +21 -29
  71. data/vendor/faiss/faiss/IndexShardsIVF.cpp +1 -2
  72. data/vendor/faiss/faiss/MatrixStats.cpp +17 -32
  73. data/vendor/faiss/faiss/MatrixStats.h +21 -9
  74. data/vendor/faiss/faiss/MetaIndexes.cpp +35 -35
  75. data/vendor/faiss/faiss/VectorTransform.cpp +13 -26
  76. data/vendor/faiss/faiss/VectorTransform.h +7 -7
  77. data/vendor/faiss/faiss/clone_index.cpp +15 -10
  78. data/vendor/faiss/faiss/clone_index.h +3 -0
  79. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +87 -4
  80. data/vendor/faiss/faiss/gpu/GpuCloner.h +22 -0
  81. data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +7 -0
  82. data/vendor/faiss/faiss/gpu/GpuDistance.h +46 -38
  83. data/vendor/faiss/faiss/gpu/GpuIndex.h +28 -4
  84. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +4 -4
  85. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +8 -9
  86. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +18 -3
  87. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -11
  88. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +1 -3
  89. data/vendor/faiss/faiss/gpu/GpuResources.cpp +24 -3
  90. data/vendor/faiss/faiss/gpu/GpuResources.h +39 -11
  91. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +117 -17
  92. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +57 -3
  93. data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +1 -1
  94. data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +25 -0
  95. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +129 -9
  96. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +267 -40
  97. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +299 -208
  98. data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +1 -0
  99. data/vendor/faiss/faiss/gpu/utils/RaftUtils.h +75 -0
  100. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +3 -1
  101. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +5 -5
  102. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +1 -1
  103. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +1 -2
  104. data/vendor/faiss/faiss/impl/DistanceComputer.h +24 -1
  105. data/vendor/faiss/faiss/impl/FaissException.h +13 -34
  106. data/vendor/faiss/faiss/impl/HNSW.cpp +321 -70
  107. data/vendor/faiss/faiss/impl/HNSW.h +9 -8
  108. data/vendor/faiss/faiss/impl/IDSelector.h +4 -4
  109. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +3 -1
  110. data/vendor/faiss/faiss/impl/NNDescent.cpp +29 -19
  111. data/vendor/faiss/faiss/impl/NSG.h +1 -1
  112. data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +14 -12
  113. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +1 -1
  114. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +24 -22
  115. data/vendor/faiss/faiss/impl/ProductQuantizer.h +1 -1
  116. data/vendor/faiss/faiss/impl/Quantizer.h +1 -1
  117. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +27 -1015
  118. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +5 -63
  119. data/vendor/faiss/faiss/impl/ResultHandler.h +232 -176
  120. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +444 -104
  121. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +0 -8
  122. data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +280 -42
  123. data/vendor/faiss/faiss/impl/code_distance/code_distance-generic.h +21 -14
  124. data/vendor/faiss/faiss/impl/code_distance/code_distance.h +22 -12
  125. data/vendor/faiss/faiss/impl/index_read.cpp +45 -19
  126. data/vendor/faiss/faiss/impl/index_write.cpp +60 -41
  127. data/vendor/faiss/faiss/impl/io.cpp +10 -10
  128. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -1
  129. data/vendor/faiss/faiss/impl/platform_macros.h +18 -1
  130. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +3 -0
  131. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +7 -6
  132. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +52 -38
  133. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +40 -49
  134. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +960 -0
  135. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +176 -0
  136. data/vendor/faiss/faiss/impl/simd_result_handlers.h +374 -202
  137. data/vendor/faiss/faiss/index_factory.cpp +10 -7
  138. data/vendor/faiss/faiss/invlists/DirectMap.cpp +1 -1
  139. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +27 -9
  140. data/vendor/faiss/faiss/invlists/InvertedLists.h +12 -3
  141. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +3 -3
  142. data/vendor/faiss/faiss/python/python_callbacks.cpp +1 -1
  143. data/vendor/faiss/faiss/utils/Heap.cpp +3 -1
  144. data/vendor/faiss/faiss/utils/WorkerThread.h +1 -0
  145. data/vendor/faiss/faiss/utils/distances.cpp +128 -74
  146. data/vendor/faiss/faiss/utils/distances.h +81 -4
  147. data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +5 -5
  148. data/vendor/faiss/faiss/utils/distances_fused/avx512.h +2 -2
  149. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +2 -2
  150. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +1 -1
  151. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +5 -5
  152. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.h +1 -1
  153. data/vendor/faiss/faiss/utils/distances_simd.cpp +428 -70
  154. data/vendor/faiss/faiss/utils/fp16-arm.h +29 -0
  155. data/vendor/faiss/faiss/utils/fp16.h +2 -0
  156. data/vendor/faiss/faiss/utils/hamming.cpp +162 -110
  157. data/vendor/faiss/faiss/utils/hamming.h +58 -0
  158. data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +16 -89
  159. data/vendor/faiss/faiss/utils/hamming_distance/common.h +1 -0
  160. data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +15 -87
  161. data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +57 -0
  162. data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +14 -104
  163. data/vendor/faiss/faiss/utils/partitioning.cpp +3 -4
  164. data/vendor/faiss/faiss/utils/prefetch.h +77 -0
  165. data/vendor/faiss/faiss/utils/quantize_lut.cpp +0 -14
  166. data/vendor/faiss/faiss/utils/simdlib_avx2.h +0 -6
  167. data/vendor/faiss/faiss/utils/simdlib_neon.h +72 -77
  168. data/vendor/faiss/faiss/utils/sorting.cpp +140 -5
  169. data/vendor/faiss/faiss/utils/sorting.h +27 -0
  170. data/vendor/faiss/faiss/utils/utils.cpp +112 -6
  171. data/vendor/faiss/faiss/utils/utils.h +57 -20
  172. metadata +11 -4
data/vendor/faiss/faiss/IndexReplicas.cpp

@@ -12,17 +12,34 @@

 namespace faiss {

+namespace {
+
+// IndexBinary needs to update the code_size when d is set...
+
+void sync_d(Index* index) {}
+
+void sync_d(IndexBinary* index) {
+    FAISS_THROW_IF_NOT(index->d % 8 == 0);
+    index->code_size = index->d / 8;
+}
+
+} // anonymous namespace
+
 template <typename IndexT>
 IndexReplicasTemplate<IndexT>::IndexReplicasTemplate(bool threaded)
         : ThreadedIndex<IndexT>(threaded) {}

 template <typename IndexT>
 IndexReplicasTemplate<IndexT>::IndexReplicasTemplate(idx_t d, bool threaded)
-        : ThreadedIndex<IndexT>(d, threaded) {}
+        : ThreadedIndex<IndexT>(d, threaded) {
+    sync_d(this);
+}

 template <typename IndexT>
 IndexReplicasTemplate<IndexT>::IndexReplicasTemplate(int d, bool threaded)
-        : ThreadedIndex<IndexT>(d, threaded) {}
+        : ThreadedIndex<IndexT>(d, threaded) {
+    sync_d(this);
+}

 template <typename IndexT>
 void IndexReplicasTemplate<IndexT>::onAfterAddIndex(IndexT* index) {
@@ -168,6 +185,8 @@ void IndexReplicasTemplate<IndexT>::syncWithSubIndexes() {
     }

     auto firstIndex = this->at(0);
+    this->d = firstIndex->d;
+    sync_d(this);
     this->metric_type = firstIndex->metric_type;
     this->is_trained = firstIndex->is_trained;
     this->ntotal = firstIndex->ntotal;
@@ -181,30 +200,8 @@ void IndexReplicasTemplate<IndexT>::syncWithSubIndexes() {
     }
 }

-// No metric_type for IndexBinary
-template <>
-void IndexReplicasTemplate<IndexBinary>::syncWithSubIndexes() {
-    if (!this->count()) {
-        this->is_trained = false;
-        this->ntotal = 0;
-
-        return;
-    }
-
-    auto firstIndex = this->at(0);
-    this->is_trained = firstIndex->is_trained;
-    this->ntotal = firstIndex->ntotal;
-
-    for (int i = 1; i < this->count(); ++i) {
-        auto index = this->at(i);
-        FAISS_THROW_IF_NOT(this->d == index->d);
-        FAISS_THROW_IF_NOT(this->is_trained == index->is_trained);
-        FAISS_THROW_IF_NOT(this->ntotal == index->ntotal);
-    }
-}
-
 // explicit instantiations
-template struct IndexReplicasTemplate<Index>;
-template struct IndexReplicasTemplate<IndexBinary>;
+template class IndexReplicasTemplate<Index>;
+template class IndexReplicasTemplate<IndexBinary>;

 } // namespace faiss
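The sync_d overloads added above replace the deleted IndexBinary specialization of syncWithSubIndexes: plain overload resolution picks the binary variant when the template is instantiated with IndexBinary, keeping code_size consistent with d. A self-contained sketch of the same dispatch idea (the *Like structs are illustrative stand-ins, not the faiss classes):

#include <cassert>

// Illustrative stand-ins for faiss::Index and faiss::IndexBinary.
struct FloatIndexLike {
    int d = 0; // dimensionality; nothing else to keep in sync
};
struct BinaryIndexLike {
    int d = 0;         // dimensionality in bits
    int code_size = 0; // bytes per vector, must track d
};

namespace {
// Overload resolution at template-instantiation time picks the right variant.
void sync_d(FloatIndexLike*) {}
void sync_d(BinaryIndexLike* index) {
    assert(index->d % 8 == 0);
    index->code_size = index->d / 8;
}
} // namespace

template <typename IndexT>
struct ReplicasLike {
    IndexT self;
    explicit ReplicasLike(int d) {
        self.d = d;
        sync_d(&self); // no-op for the float case, updates code_size for binary
    }
};

int main() {
    ReplicasLike<BinaryIndexLike> r(64);
    assert(r.self.code_size == 8);
    ReplicasLike<FloatIndexLike> f(64); // compiles, nothing extra to update
    (void)f;
}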
data/vendor/faiss/faiss/IndexScalarQuantizer.cpp

@@ -60,10 +60,9 @@ void IndexScalarQuantizer::search(

 #pragma omp parallel
     {
-        InvertedListScanner* scanner =
-                sq.select_InvertedListScanner(metric_type, nullptr, true, sel);
+        std::unique_ptr<InvertedListScanner> scanner(
+                sq.select_InvertedListScanner(metric_type, nullptr, true, sel));

-        ScopeDeleter1<InvertedListScanner> del(scanner);
         scanner->list_no = 0; // directly the list number

 #pragma omp for
@@ -122,21 +121,28 @@ IndexIVFScalarQuantizer::IndexIVFScalarQuantizer(
         size_t nlist,
         ScalarQuantizer::QuantizerType qtype,
         MetricType metric,
-        bool encode_residual)
-        : IndexIVF(quantizer, d, nlist, 0, metric),
-          sq(d, qtype),
-          by_residual(encode_residual) {
+        bool by_residual)
+        : IndexIVF(quantizer, d, nlist, 0, metric), sq(d, qtype) {
     code_size = sq.code_size;
+    this->by_residual = by_residual;
     // was not known at construction time
     invlists->code_size = code_size;
     is_trained = false;
 }

-IndexIVFScalarQuantizer::IndexIVFScalarQuantizer()
-        : IndexIVF(), by_residual(true) {}
+IndexIVFScalarQuantizer::IndexIVFScalarQuantizer() : IndexIVF() {
+    by_residual = true;
+}

-void IndexIVFScalarQuantizer::train_residual(idx_t n, const float* x) {
-    sq.train_residual(n, x, quantizer, by_residual, verbose);
+void IndexIVFScalarQuantizer::train_encoder(
+        idx_t n,
+        const float* x,
+        const idx_t* assign) {
+    sq.train(n, x);
+}
+
+idx_t IndexIVFScalarQuantizer::train_encoder_num_vectors() const {
+    return 100000;
 }

 void IndexIVFScalarQuantizer::encode_vectors(
@@ -201,15 +207,15 @@ void IndexIVFScalarQuantizer::add_core(
         idx_t n,
         const float* x,
         const idx_t* xids,
-        const idx_t* coarse_idx) {
+        const idx_t* coarse_idx,
+        void* inverted_list_context) {
     FAISS_THROW_IF_NOT(is_trained);

-    size_t nadd = 0;
     std::unique_ptr<ScalarQuantizer::SQuantizer> squant(sq.select_quantizer());

     DirectMapAdd dm_add(direct_map, n, xids);

-#pragma omp parallel reduction(+ : nadd)
+#pragma omp parallel
     {
         std::vector<float> residual(d);
         std::vector<uint8_t> one_code(code_size);
@@ -231,10 +237,10 @@ void IndexIVFScalarQuantizer::add_core(
                 memset(one_code.data(), 0, code_size);
                 squant->encode_vector(xi, one_code.data());

-                size_t ofs = invlists->add_entry(list_no, id, one_code.data());
+                size_t ofs = invlists->add_entry(
+                        list_no, id, one_code.data(), inverted_list_context);

                 dm_add.add(i, list_no, ofs);
-                nadd++;

             } else if (rank == 0 && list_no == -1) {
                 dm_add.add(i, -1, 0);
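The ScopeDeleter1 removal in IndexScalarQuantizer::search follows a pattern applied throughout this release: owning raw pointers returned by factory functions are wrapped in std::unique_ptr instead of being paired with a separate scope-guard object. A generic before/after sketch (make_scanner is a hypothetical factory, not a faiss function):

#include <memory>

struct Scanner {
    int list_no = 0;
    void scan() {}
};

// Hypothetical factory returning an owning raw pointer, in the spirit of
// ScalarQuantizer::select_InvertedListScanner.
Scanner* make_scanner() {
    return new Scanner();
}

void before_style() {
    Scanner* scanner = make_scanner();
    // old style: ScopeDeleter1<Scanner> del(scanner); freed the object at scope exit
    scanner->scan();
    delete scanner; // standing in for the scope guard here
}

void after_style() {
    std::unique_ptr<Scanner> scanner(make_scanner());
    scanner->scan();
    // freed automatically when scanner goes out of scope
}

int main() {
    before_style();
    after_style();
}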
data/vendor/faiss/faiss/IndexScalarQuantizer.h

@@ -65,7 +65,6 @@ struct IndexScalarQuantizer : IndexFlatCodes {

 struct IndexIVFScalarQuantizer : IndexIVF {
     ScalarQuantizer sq;
-    bool by_residual;

     IndexIVFScalarQuantizer(
             Index* quantizer,
@@ -73,11 +72,13 @@ struct IndexIVFScalarQuantizer : IndexIVF {
             size_t nlist,
             ScalarQuantizer::QuantizerType qtype,
             MetricType metric = METRIC_L2,
-            bool encode_residual = true);
+            bool by_residual = true);

     IndexIVFScalarQuantizer();

-    void train_residual(idx_t n, const float* x) override;
+    void train_encoder(idx_t n, const float* x, const idx_t* assign) override;
+
+    idx_t train_encoder_num_vectors() const override;

     void encode_vectors(
             idx_t n,
@@ -90,7 +91,8 @@ struct IndexIVFScalarQuantizer : IndexIVF {
             idx_t n,
             const float* x,
             const idx_t* xids,
-            const idx_t* precomputed_idx) override;
+            const idx_t* precomputed_idx,
+            void* inverted_list_context = nullptr) override;

     InvertedListScanner* get_InvertedListScanner(
             bool store_pairs,
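For reference, a minimal usage sketch against the updated declaration: the constructor argument is now named by_residual (formerly encode_residual), and training dispatches to the new train_encoder hook instead of train_residual. The dimensions and hyperparameters below are arbitrary:

#include <faiss/IndexFlat.h>
#include <faiss/IndexScalarQuantizer.h>
#include <random>
#include <vector>

int main() {
    int d = 64;
    size_t nb = 10000, nlist = 16;

    faiss::IndexFlatL2 quantizer(d);
    faiss::IndexIVFScalarQuantizer index(
            &quantizer, d, nlist, faiss::ScalarQuantizer::QT_8bit,
            faiss::METRIC_L2, /*by_residual=*/true);

    // toy training / database data
    std::vector<float> xb(nb * d);
    std::mt19937 rng(123);
    std::uniform_real_distribution<float> dist;
    for (auto& v : xb) v = dist(rng);

    index.train(nb, xb.data()); // goes through the train_encoder hook
    index.add(nb, xb.data());

    std::vector<float> D(5);
    std::vector<faiss::idx_t> I(5);
    index.search(1, xb.data(), 5, D.data(), I.data());
}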
data/vendor/faiss/faiss/IndexShards.cpp

@@ -5,8 +5,6 @@
  * LICENSE file in the root directory of this source tree.
  */

-// -*- c++ -*-
-
 #include <faiss/IndexShards.h>

 #include <cinttypes>
@@ -22,6 +20,15 @@ namespace faiss {
 // subroutines
 namespace {

+// IndexBinary needs to update the code_size when d is set...
+
+void sync_d(Index* index) {}
+
+void sync_d(IndexBinary* index) {
+    FAISS_THROW_IF_NOT(index->d % 8 == 0);
+    index->code_size = index->d / 8;
+}
+
 // add translation to all valid labels
 void translate_labels(int64_t n, idx_t* labels, int64_t translation) {
     if (translation == 0)
@@ -40,20 +47,26 @@ IndexShardsTemplate<IndexT>::IndexShardsTemplate(
         idx_t d,
         bool threaded,
         bool successive_ids)
-        : ThreadedIndex<IndexT>(d, threaded), successive_ids(successive_ids) {}
+        : ThreadedIndex<IndexT>(d, threaded), successive_ids(successive_ids) {
+    sync_d(this);
+}

 template <typename IndexT>
 IndexShardsTemplate<IndexT>::IndexShardsTemplate(
         int d,
         bool threaded,
         bool successive_ids)
-        : ThreadedIndex<IndexT>(d, threaded), successive_ids(successive_ids) {}
+        : ThreadedIndex<IndexT>(d, threaded), successive_ids(successive_ids) {
+    sync_d(this);
+}

 template <typename IndexT>
 IndexShardsTemplate<IndexT>::IndexShardsTemplate(
         bool threaded,
         bool successive_ids)
-        : ThreadedIndex<IndexT>(threaded), successive_ids(successive_ids) {}
+        : ThreadedIndex<IndexT>(threaded), successive_ids(successive_ids) {
+    sync_d(this);
+}

 template <typename IndexT>
 void IndexShardsTemplate<IndexT>::onAfterAddIndex(IndexT* index /* unused */) {
@@ -78,6 +91,8 @@ void IndexShardsTemplate<IndexT>::syncWithSubIndexes() {
     }

     auto firstIndex = this->at(0);
+    this->d = firstIndex->d;
+    sync_d(this);
     this->metric_type = firstIndex->metric_type;
     this->is_trained = firstIndex->is_trained;
     this->ntotal = firstIndex->ntotal;
@@ -92,29 +107,6 @@ void IndexShardsTemplate<IndexT>::syncWithSubIndexes() {
     }
 }

-// No metric_type for IndexBinary
-template <>
-void IndexShardsTemplate<IndexBinary>::syncWithSubIndexes() {
-    if (!this->count()) {
-        this->is_trained = false;
-        this->ntotal = 0;
-
-        return;
-    }
-
-    auto firstIndex = this->at(0);
-    this->is_trained = firstIndex->is_trained;
-    this->ntotal = firstIndex->ntotal;
-
-    for (int i = 1; i < this->count(); ++i) {
-        auto index = this->at(i);
-        FAISS_THROW_IF_NOT(this->d == index->d);
-        FAISS_THROW_IF_NOT(this->is_trained == index->is_trained);
-
-        this->ntotal += index->ntotal;
-    }
-}
-
 template <typename IndexT>
 void IndexShardsTemplate<IndexT>::train(idx_t n, const component_t* x) {
     auto fn = [n, x](int no, IndexT* index) {
@@ -155,7 +147,7 @@ void IndexShardsTemplate<IndexT>::add_with_ids(
                 "request them to be shifted");
         FAISS_THROW_IF_NOT_MSG(
                 this->ntotal == 0,
-                "when adding to IndexShards with sucessive_ids, "
+                "when adding to IndexShards with successive_ids, "
                 "only add() in a single pass is supported");
     }

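A small usage sketch of the successive_ids mode whose error message is corrected above; as the message says, only a single add() pass is supported in that mode. Shard types and sizes here are arbitrary, and this is a sketch of typical IndexShards usage rather than code from the release:

#include <faiss/IndexFlat.h>
#include <faiss/IndexShards.h>
#include <vector>

int main() {
    int d = 32;
    faiss::IndexShards shards(d, /*threaded=*/false, /*successive_ids=*/true);

    faiss::IndexFlatL2 shard0(d), shard1(d);
    shards.add_shard(&shard0);
    shards.add_shard(&shard1);

    std::vector<float> xb(1000 * d, 0.0f);
    // With successive_ids, ids are assigned 0..n-1 across the shards,
    // so everything must be added in one call.
    shards.add(1000, xb.data());
}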
data/vendor/faiss/faiss/IndexShardsIVF.cpp

@@ -111,7 +111,7 @@ void IndexShardsIVF::add_with_ids(
                 "request them to be shifted");
         FAISS_THROW_IF_NOT_MSG(
                 this->ntotal == 0,
-                "when adding to IndexShards with sucessive_ids, "
+                "when adding to IndexShards with successive_ids, "
                 "only add() in a single pass is supported");
     }

@@ -137,7 +137,6 @@ void IndexShardsIVF::add_with_ids(
     auto fn = [n, ids, x, nshard, d, Iq](int no, Index* index) {
         idx_t i0 = (idx_t)no * n / nshard;
         idx_t i1 = ((idx_t)no + 1) * n / nshard;
-        const float* x0 = x + i0 * d;
         auto index_ivf = dynamic_cast<IndexIVF*>(index);

         if (index->verbose) {
data/vendor/faiss/faiss/MatrixStats.cpp

@@ -9,9 +9,10 @@

 #include <faiss/MatrixStats.h>

-#include <stdarg.h> /* va_list, va_start, va_arg, va_end */
+#include <cstdarg> /* va_list, va_start, va_arg, va_end */

 #include <faiss/utils/utils.h>
+#include <cinttypes>
 #include <cmath>
 #include <cstdio>

@@ -21,18 +22,6 @@ namespace faiss {
  * MatrixStats
  *********************************************************************/

-MatrixStats::PerDimStats::PerDimStats()
-        : n(0),
-          n_nan(0),
-          n_inf(0),
-          n0(0),
-          min(HUGE_VALF),
-          max(-HUGE_VALF),
-          sum(0),
-          sum2(0),
-          mean(NAN),
-          stddev(NAN) {}
-
 void MatrixStats::PerDimStats::add(float x) {
     n++;
     if (std::isnan(x)) {
@@ -74,19 +63,12 @@ void MatrixStats::do_comment(const char* fmt, ...) {
     buf += size;
 }

-MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
-        : n(n),
-          d(d),
-          n_collision(0),
-          n_valid(0),
-          n0(0),
-          min_norm2(HUGE_VAL),
-          max_norm2(0) {
+MatrixStats::MatrixStats(size_t n, size_t d, const float* x) : n(n), d(d) {
     std::vector<char> comment_buf(10000);
     buf = comment_buf.data();
     nbuf = comment_buf.size();

-    do_comment("analyzing %ld vectors of size %ld\n", n, d);
+    do_comment("analyzing %zd vectors of size %zd\n", n, d);

     if (d > 1024) {
         do_comment(
@@ -94,6 +76,9 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
                 "please consider dimensionality reducution (with PCAMatrix)\n");
     }

+    hash_value = hash_bytes((const uint8_t*)x, n * d * sizeof(*x));
+    do_comment("hash value 0x%016" PRIx64 "\n", hash_value);
+
     size_t nbytes = sizeof(x[0]) * d;
     per_dim_stats.resize(d);

@@ -156,7 +141,7 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)

     if (n_collision > 0) {
         do_comment(
-                "%ld collisions in hash table, "
+                "%zd collisions in hash table, "
                 "counts may be invalid\n",
                 n_collision);
     }
@@ -167,14 +152,14 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
                 max = it->second;
             }
         }
-        do_comment("vector %ld has %ld copies\n", max.first, max.count);
+        do_comment("vector %zd has %zd copies\n", max.first, max.count);
     }

     { // norm stats
         min_norm2 = sqrt(min_norm2);
         max_norm2 = sqrt(max_norm2);
         do_comment(
-                "range of L2 norms=[%g, %g] (%ld null vectors)\n",
+                "range of L2 norms=[%g, %g] (%zd null vectors)\n",
                 min_norm2,
                 max_norm2,
                 n0);
@@ -182,7 +167,7 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
     if (max_norm2 < min_norm2 * 1.0001) {
         do_comment(
                 "vectors are normalized, inner product and "
-                "L2 search are equivalent\n");
+                "L2 search are equivalent\n");
     }

     if (max_norm2 > min_norm2 * 100) {
@@ -196,12 +181,12 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)

     double max_std = 0, min_std = HUGE_VAL;

-    size_t n_dangerous_range = 0, n_0_range = 0, n0 = 0;
+    size_t n_dangerous_range = 0, n_0_range = 0, n0_2 = 0;

     for (size_t j = 0; j < d; j++) {
         PerDimStats& st = per_dim_stats[j];
         st.compute_mean_std();
-        n0 += st.n0;
+        n0_2 += st.n0;

         if (st.max == st.min) {
             n_0_range++;
@@ -215,19 +200,19 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
         min_std = st.stddev;
     }

-    if (n0 == 0) {
+    if (n0_2 == 0) {
         do_comment("matrix contains no 0s\n");
     } else {
         do_comment(
                 "matrix contains %.2f %% 0 entries\n",
-                n0 * 100.0 / (n * d));
+                n0_2 * 100.0 / (n * d));
     }

     if (n_0_range == 0) {
         do_comment("no constant dimensions\n");
     } else {
         do_comment(
-                "%ld dimensions are constant: they can be removed\n",
+                "%zd dimensions are constant: they can be removed\n",
                 n_0_range);
     }

@@ -235,7 +220,7 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
         do_comment("no dimension has a too large mean\n");
     } else {
         do_comment(
-                "%ld dimensions are too large "
+                "%zd dimensions are too large "
                 "wrt. their variance, may loose precision "
                 "in IndexFlatL2 (use CenteringTransform)\n",
                 n_dangerous_range);
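Since MatrixStats now also records a hash of the input data (the hash_value field reported through the new do_comment line above), here is a short usage sketch; the report text accumulates in the comments member:

#include <faiss/MatrixStats.h>
#include <cstdio>
#include <vector>

int main() {
    size_t n = 1000, d = 32;
    std::vector<float> x(n * d, 0.5f); // toy data: a constant matrix
    faiss::MatrixStats stats(n, d, x.data());
    // comments includes the new "hash value 0x..." line among the usual report
    printf("%s", stats.comments.c_str());
    printf("hash = 0x%016llx\n", (unsigned long long)stats.hash_value);
}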
data/vendor/faiss/faiss/MatrixStats.h

@@ -10,6 +10,7 @@
 #pragma once

 #include <stdint.h>
+#include <cmath>
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -26,20 +27,31 @@ struct MatrixStats {
     std::string comments;

     // raw statistics
-    size_t n, d;
-    size_t n_collision, n_valid, n0;
-    double min_norm2, max_norm2;
+    size_t n = 0, d = 0;
+    size_t n_collision = 0;
+    size_t n_valid = 0;
+    size_t n0 = 0;
+    double min_norm2 = HUGE_VALF;
+    double max_norm2 = 0;
+    uint64_t hash_value = 0;

     struct PerDimStats {
-        size_t n, n_nan, n_inf, n0;
+        /// counts of various special entries
+        size_t n = 0;
+        size_t n_nan = 0;
+        size_t n_inf = 0;
+        size_t n0 = 0;

-        float min, max;
-        double sum, sum2;
+        /// to get min/max and stddev values
+        float min = HUGE_VALF;
+        float max = -HUGE_VALF;
+        double sum = 0;
+        double sum2 = 0;

-        size_t n_valid;
-        double mean, stddev;
+        size_t n_valid = 0;
+        double mean = NAN;
+        double stddev = NAN;

-        PerDimStats();
         void add(float x);
         void compute_mean_std();
     };
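The header change swaps the hand-written default constructors for in-class default member initializers; the resulting state is the same, with the defaults now visible at the point of declaration. A minimal illustration of the equivalence (the struct names are illustrative, not the faiss types):

#include <cmath>

// Before: defaults assigned in a user-written constructor.
struct PerDimStatsOld {
    double mean, stddev;
    PerDimStatsOld() : mean(NAN), stddev(NAN) {}
};

// After: in-class default member initializers, no constructor needed.
struct PerDimStatsNew {
    double mean = NAN;
    double stddev = NAN;
};

int main() {
    PerDimStatsOld a;
    PerDimStatsNew b;
    // Both start out with NaN mean/stddev.
    return (std::isnan(a.mean) && std::isnan(b.stddev)) ? 0 : 1;
}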
data/vendor/faiss/faiss/MetaIndexes.cpp

@@ -9,8 +9,8 @@

 #include <faiss/MetaIndexes.h>

-#include <stdint.h>
 #include <cinttypes>
+#include <cstdint>
 #include <cstdio>
 #include <limits>

@@ -70,37 +70,37 @@ void IndexSplitVectors::search(
             sum_d == d, "not enough indexes compared to # dimensions");

     int64_t nshard = sub_indexes.size();
-    float* all_distances = new float[nshard * k * n];
-    idx_t* all_labels = new idx_t[nshard * k * n];
-    ScopeDeleter<float> del(all_distances);
-    ScopeDeleter<idx_t> del2(all_labels);
-
-    auto query_func = [n,
-                       x,
-                       k,
-                       distances,
-                       labels,
-                       all_distances,
-                       all_labels,
-                       this](int no) {
-        const IndexSplitVectors* index = this;
-        float* distances1 = no == 0 ? distances : all_distances + no * k * n;
-        idx_t* labels1 = no == 0 ? labels : all_labels + no * k * n;
-        if (index->verbose)
-            printf("begin query shard %d on %" PRId64 " points\n", no, n);
-        const Index* sub_index = index->sub_indexes[no];
-        int64_t sub_d = sub_index->d, d = index->d;
-        idx_t ofs = 0;
-        for (int i = 0; i < no; i++)
-            ofs += index->sub_indexes[i]->d;
-        float* sub_x = new float[sub_d * n];
-        ScopeDeleter<float> del1(sub_x);
-        for (idx_t i = 0; i < n; i++)
-            memcpy(sub_x + i * sub_d, x + ofs + i * d, sub_d * sizeof(sub_x));
-        sub_index->search(n, sub_x, k, distances1, labels1);
-        if (index->verbose)
-            printf("end query shard %d\n", no);
-    };
+
+    std::unique_ptr<float[]> all_distances(new float[nshard * k * n]);
+    std::unique_ptr<idx_t[]> all_labels(new idx_t[nshard * k * n]);
+
+    auto query_func =
+            [n, x, k, distances, labels, &all_distances, &all_labels, this](
+                    int no) {
+                const IndexSplitVectors* index = this;
+                float* distances1 =
+                        no == 0 ? distances : all_distances.get() + no * k * n;
+                idx_t* labels1 =
+                        no == 0 ? labels : all_labels.get() + no * k * n;
+                if (index->verbose)
+                    printf("begin query shard %d on %" PRId64 " points\n",
+                           no,
+                           n);
+                const Index* sub_index = index->sub_indexes[no];
+                int64_t sub_d = sub_index->d, d = index->d;
+                idx_t ofs = 0;
+                for (int i = 0; i < no; i++)
+                    ofs += index->sub_indexes[i]->d;
+
+                std::unique_ptr<float[]> sub_x(new float[sub_d * n]);
+                for (idx_t i = 0; i < n; i++)
+                    memcpy(sub_x.get() + i * sub_d,
+                           x + ofs + i * d,
+                           sub_d * sizeof(float));
+                sub_index->search(n, sub_x.get(), k, distances1, labels1);
+                if (index->verbose)
+                    printf("end query shard %d\n", no);
+            };

     if (!threaded) {
         for (int i = 0; i < nshard; i++) {
@@ -125,8 +125,8 @@ void IndexSplitVectors::search(
     int64_t factor = 1;
     for (int i = 0; i < nshard; i++) {
         if (i > 0) { // results of 0 are already in the table
-            const float* distances_i = all_distances + i * k * n;
-            const idx_t* labels_i = all_labels + i * k * n;
+            const float* distances_i = all_distances.get() + i * k * n;
+            const idx_t* labels_i = all_labels.get() + i * k * n;
             for (int64_t j = 0; j < n; j++) {
                 if (labels[j] >= 0 && labels_i[j] >= 0) {
                     labels[j] += labels_i[j] * factor;
@@ -238,6 +238,6 @@ void IndexRandom::reset() {
     ntotal = 0;
 }

-IndexRandom::~IndexRandom() {}
+IndexRandom::~IndexRandom() = default;

 } // namespace faiss
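Beyond the unique_ptr conversion, note the memcpy size fix inside the rewritten lambda: the old argument sub_d * sizeof(sub_x) measured the pointer type rather than a float element, so each row copied more bytes than intended on a 64-bit build; the new code uses sizeof(float). A standalone illustration of the difference:

#include <cstdio>

int main() {
    float* sub_x = nullptr;
    // sizeof(sub_x) is the size of the pointer type, not of the pointed-to element.
    printf("sizeof(sub_x) = %zu, sizeof(float) = %zu\n",
           sizeof(sub_x), sizeof(float));
    // On a typical 64-bit platform this prints 8 and 4.
}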
data/vendor/faiss/faiss/VectorTransform.cpp

@@ -441,13 +441,10 @@ void eig(size_t d_in, double* cov, double* eigenvalues, int verbose) {

 } // namespace

-void PCAMatrix::train(idx_t n, const float* x) {
-    const float* x_in = x;
-
-    x = fvecs_maybe_subsample(
-            d_in, (size_t*)&n, max_points_per_d * d_in, x, verbose);
-
-    ScopeDeleter<float> del_x(x != x_in ? x : nullptr);
+void PCAMatrix::train(idx_t n, const float* x_in) {
+    const float* x = fvecs_maybe_subsample(
+            d_in, (size_t*)&n, max_points_per_d * d_in, x_in, verbose);
+    TransformedVectors tv(x_in, x);

     // compute mean
     mean.clear();
@@ -884,14 +881,13 @@ ITQTransform::ITQTransform(int d_in, int d_out, bool do_pca)
     is_trained = false;
 }

-void ITQTransform::train(idx_t n, const float* x) {
+void ITQTransform::train(idx_t n, const float* x_in) {
     FAISS_THROW_IF_NOT(!is_trained);

-    const float* x_in = x;
     size_t max_train_points = std::max(d_in * max_train_per_dim, 32768);
-    x = fvecs_maybe_subsample(d_in, (size_t*)&n, max_train_points, x);
-
-    ScopeDeleter<float> del_x(x != x_in ? x : nullptr);
+    const float* x =
+            fvecs_maybe_subsample(d_in, (size_t*)&n, max_train_points, x_in);
+    TransformedVectors tv(x_in, x);

     std::unique_ptr<float[]> x_norm(new float[n * d_in]);
     { // normalize
@@ -988,25 +984,16 @@ void ITQTransform::check_identical(const VectorTransform& other_in) const {
 *********************************************/

 OPQMatrix::OPQMatrix(int d, int M, int d2)
-        : LinearTransform(d, d2 == -1 ? d : d2, false),
-          M(M),
-          niter(50),
-          niter_pq(4),
-          niter_pq_0(40),
-          verbose(false),
-          pq(nullptr) {
+        : LinearTransform(d, d2 == -1 ? d : d2, false), M(M) {
     is_trained = false;
     // OPQ is quite expensive to train, so set this right.
     max_train_points = 256 * 256;
-    pq = nullptr;
 }

-void OPQMatrix::train(idx_t n, const float* x) {
-    const float* x_in = x;
-
-    x = fvecs_maybe_subsample(d_in, (size_t*)&n, max_train_points, x, verbose);
-
-    ScopeDeleter<float> del_x(x != x_in ? x : nullptr);
+void OPQMatrix::train(idx_t n, const float* x_in) {
+    const float* x = fvecs_maybe_subsample(
+            d_in, (size_t*)&n, max_train_points, x_in, verbose);
+    TransformedVectors tv(x_in, x);

     // To support d_out > d_in, we pad input vectors with 0s to d_out
     size_t d = d_out <= d_in ? d_in : d_out;
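The TransformedVectors guard used in the rewritten train() methods replaces the x_in copy plus ScopeDeleter pattern: it frees the subsampled buffer only when fvecs_maybe_subsample actually allocated one. A sketch of what such a guard amounts to; this is an illustrative re-implementation with a hypothetical subsampler, not the faiss definitions:

#include <cstddef>
#include <cstring>

// Illustrative guard: owns `x` only when it differs from the original pointer.
struct TransformedVectorsSketch {
    const float* x;
    bool own_x;
    TransformedVectorsSketch(const float* x_orig, const float* x_new)
            : x(x_new), own_x(x_new != x_orig) {}
    ~TransformedVectorsSketch() {
        if (own_x) {
            delete[] x;
        }
    }
};

// Hypothetical subsampler in the spirit of fvecs_maybe_subsample: returns the
// input unchanged if it is small enough, otherwise a newly allocated copy of
// the first nmax vectors (the real function samples at random).
const float* maybe_subsample(size_t d, size_t* n, size_t nmax, const float* x) {
    if (*n <= nmax) {
        return x;
    }
    float* sub = new float[nmax * d];
    memcpy(sub, x, nmax * d * sizeof(float));
    *n = nmax;
    return sub;
}

void train_like(size_t d, size_t n, const float* x_in) {
    const float* x = maybe_subsample(d, &n, 100000, x_in);
    TransformedVectorsSketch tv(x_in, x); // frees x only if it was reallocated
    // ... use x and the possibly reduced n for training ...
}

int main() {
    float data[4 * 8] = {};
    train_like(8, 4, data); // small input: no copy is made or freed
}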