faiss 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +1 -1
  5. data/ext/faiss/extconf.rb +9 -2
  6. data/ext/faiss/index.cpp +1 -1
  7. data/ext/faiss/index_binary.cpp +2 -2
  8. data/ext/faiss/product_quantizer.cpp +1 -1
  9. data/lib/faiss/version.rb +1 -1
  10. data/vendor/faiss/faiss/AutoTune.cpp +7 -7
  11. data/vendor/faiss/faiss/AutoTune.h +0 -1
  12. data/vendor/faiss/faiss/Clustering.cpp +4 -18
  13. data/vendor/faiss/faiss/Clustering.h +31 -21
  14. data/vendor/faiss/faiss/IVFlib.cpp +22 -11
  15. data/vendor/faiss/faiss/Index.cpp +1 -1
  16. data/vendor/faiss/faiss/Index.h +20 -5
  17. data/vendor/faiss/faiss/Index2Layer.cpp +7 -7
  18. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +176 -166
  19. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +15 -15
  20. data/vendor/faiss/faiss/IndexBinary.cpp +9 -4
  21. data/vendor/faiss/faiss/IndexBinary.h +8 -19
  22. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +2 -1
  23. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +24 -31
  24. data/vendor/faiss/faiss/IndexBinaryHash.cpp +25 -50
  25. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +106 -187
  26. data/vendor/faiss/faiss/IndexFastScan.cpp +90 -159
  27. data/vendor/faiss/faiss/IndexFastScan.h +9 -8
  28. data/vendor/faiss/faiss/IndexFlat.cpp +195 -3
  29. data/vendor/faiss/faiss/IndexFlat.h +20 -1
  30. data/vendor/faiss/faiss/IndexFlatCodes.cpp +11 -0
  31. data/vendor/faiss/faiss/IndexFlatCodes.h +3 -1
  32. data/vendor/faiss/faiss/IndexHNSW.cpp +112 -316
  33. data/vendor/faiss/faiss/IndexHNSW.h +12 -48
  34. data/vendor/faiss/faiss/IndexIDMap.cpp +69 -28
  35. data/vendor/faiss/faiss/IndexIDMap.h +24 -2
  36. data/vendor/faiss/faiss/IndexIVF.cpp +159 -53
  37. data/vendor/faiss/faiss/IndexIVF.h +37 -5
  38. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +18 -26
  39. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +3 -2
  40. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +19 -46
  41. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -3
  42. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +433 -405
  43. data/vendor/faiss/faiss/IndexIVFFastScan.h +56 -26
  44. data/vendor/faiss/faiss/IndexIVFFlat.cpp +15 -5
  45. data/vendor/faiss/faiss/IndexIVFFlat.h +3 -2
  46. data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +172 -0
  47. data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.h +56 -0
  48. data/vendor/faiss/faiss/IndexIVFPQ.cpp +78 -122
  49. data/vendor/faiss/faiss/IndexIVFPQ.h +6 -7
  50. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +18 -50
  51. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +4 -3
  52. data/vendor/faiss/faiss/IndexIVFPQR.cpp +45 -29
  53. data/vendor/faiss/faiss/IndexIVFPQR.h +5 -2
  54. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +25 -27
  55. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +6 -6
  56. data/vendor/faiss/faiss/IndexLSH.cpp +14 -16
  57. data/vendor/faiss/faiss/IndexNNDescent.cpp +3 -4
  58. data/vendor/faiss/faiss/IndexNSG.cpp +11 -27
  59. data/vendor/faiss/faiss/IndexNSG.h +10 -10
  60. data/vendor/faiss/faiss/IndexPQ.cpp +72 -88
  61. data/vendor/faiss/faiss/IndexPQ.h +1 -4
  62. data/vendor/faiss/faiss/IndexPQFastScan.cpp +1 -1
  63. data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -31
  64. data/vendor/faiss/faiss/IndexRefine.cpp +49 -19
  65. data/vendor/faiss/faiss/IndexRefine.h +7 -0
  66. data/vendor/faiss/faiss/IndexReplicas.cpp +23 -26
  67. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +22 -16
  68. data/vendor/faiss/faiss/IndexScalarQuantizer.h +6 -4
  69. data/vendor/faiss/faiss/IndexShards.cpp +21 -29
  70. data/vendor/faiss/faiss/IndexShardsIVF.cpp +1 -2
  71. data/vendor/faiss/faiss/MatrixStats.cpp +17 -32
  72. data/vendor/faiss/faiss/MatrixStats.h +21 -9
  73. data/vendor/faiss/faiss/MetaIndexes.cpp +35 -35
  74. data/vendor/faiss/faiss/VectorTransform.cpp +13 -26
  75. data/vendor/faiss/faiss/VectorTransform.h +7 -7
  76. data/vendor/faiss/faiss/clone_index.cpp +15 -10
  77. data/vendor/faiss/faiss/clone_index.h +3 -0
  78. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +87 -4
  79. data/vendor/faiss/faiss/gpu/GpuCloner.h +22 -0
  80. data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +7 -0
  81. data/vendor/faiss/faiss/gpu/GpuDistance.h +46 -38
  82. data/vendor/faiss/faiss/gpu/GpuIndex.h +28 -4
  83. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +4 -4
  84. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +8 -9
  85. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +18 -3
  86. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -11
  87. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +1 -3
  88. data/vendor/faiss/faiss/gpu/GpuResources.cpp +24 -3
  89. data/vendor/faiss/faiss/gpu/GpuResources.h +39 -11
  90. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +117 -17
  91. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +57 -3
  92. data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +1 -1
  93. data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +25 -0
  94. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +129 -9
  95. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +267 -40
  96. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +299 -208
  97. data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +1 -0
  98. data/vendor/faiss/faiss/gpu/utils/RaftUtils.h +75 -0
  99. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +3 -1
  100. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +5 -5
  101. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +1 -1
  102. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +1 -2
  103. data/vendor/faiss/faiss/impl/DistanceComputer.h +24 -1
  104. data/vendor/faiss/faiss/impl/FaissException.h +13 -34
  105. data/vendor/faiss/faiss/impl/HNSW.cpp +321 -70
  106. data/vendor/faiss/faiss/impl/HNSW.h +9 -8
  107. data/vendor/faiss/faiss/impl/IDSelector.h +4 -4
  108. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +3 -1
  109. data/vendor/faiss/faiss/impl/NNDescent.cpp +29 -19
  110. data/vendor/faiss/faiss/impl/NSG.h +1 -1
  111. data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +14 -12
  112. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +1 -1
  113. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +24 -22
  114. data/vendor/faiss/faiss/impl/ProductQuantizer.h +1 -1
  115. data/vendor/faiss/faiss/impl/Quantizer.h +1 -1
  116. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +27 -1015
  117. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +5 -63
  118. data/vendor/faiss/faiss/impl/ResultHandler.h +232 -176
  119. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +444 -104
  120. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +0 -8
  121. data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +280 -42
  122. data/vendor/faiss/faiss/impl/code_distance/code_distance-generic.h +21 -14
  123. data/vendor/faiss/faiss/impl/code_distance/code_distance.h +22 -12
  124. data/vendor/faiss/faiss/impl/index_read.cpp +45 -19
  125. data/vendor/faiss/faiss/impl/index_write.cpp +60 -41
  126. data/vendor/faiss/faiss/impl/io.cpp +10 -10
  127. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -1
  128. data/vendor/faiss/faiss/impl/platform_macros.h +18 -1
  129. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +3 -0
  130. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +7 -6
  131. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +52 -38
  132. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +40 -49
  133. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +960 -0
  134. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +176 -0
  135. data/vendor/faiss/faiss/impl/simd_result_handlers.h +374 -202
  136. data/vendor/faiss/faiss/index_factory.cpp +10 -7
  137. data/vendor/faiss/faiss/invlists/DirectMap.cpp +1 -1
  138. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +27 -9
  139. data/vendor/faiss/faiss/invlists/InvertedLists.h +12 -3
  140. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +3 -3
  141. data/vendor/faiss/faiss/python/python_callbacks.cpp +1 -1
  142. data/vendor/faiss/faiss/utils/Heap.cpp +3 -1
  143. data/vendor/faiss/faiss/utils/WorkerThread.h +1 -0
  144. data/vendor/faiss/faiss/utils/distances.cpp +128 -74
  145. data/vendor/faiss/faiss/utils/distances.h +81 -4
  146. data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +5 -5
  147. data/vendor/faiss/faiss/utils/distances_fused/avx512.h +2 -2
  148. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +2 -2
  149. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +1 -1
  150. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +5 -5
  151. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.h +1 -1
  152. data/vendor/faiss/faiss/utils/distances_simd.cpp +428 -70
  153. data/vendor/faiss/faiss/utils/fp16-arm.h +29 -0
  154. data/vendor/faiss/faiss/utils/fp16.h +2 -0
  155. data/vendor/faiss/faiss/utils/hamming.cpp +162 -110
  156. data/vendor/faiss/faiss/utils/hamming.h +58 -0
  157. data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +16 -89
  158. data/vendor/faiss/faiss/utils/hamming_distance/common.h +1 -0
  159. data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +15 -87
  160. data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +57 -0
  161. data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +14 -104
  162. data/vendor/faiss/faiss/utils/partitioning.cpp +3 -4
  163. data/vendor/faiss/faiss/utils/prefetch.h +77 -0
  164. data/vendor/faiss/faiss/utils/quantize_lut.cpp +0 -14
  165. data/vendor/faiss/faiss/utils/simdlib_avx2.h +0 -6
  166. data/vendor/faiss/faiss/utils/simdlib_neon.h +72 -77
  167. data/vendor/faiss/faiss/utils/sorting.cpp +140 -5
  168. data/vendor/faiss/faiss/utils/sorting.h +27 -0
  169. data/vendor/faiss/faiss/utils/utils.cpp +112 -6
  170. data/vendor/faiss/faiss/utils/utils.h +57 -20
  171. metadata +10 -3
@@ -12,17 +12,34 @@
12
12
 
13
13
  namespace faiss {
14
14
 
15
+ namespace {
16
+
17
+ // IndexBinary needs to update the code_size when d is set...
18
+
19
+ void sync_d(Index* index) {}
20
+
21
+ void sync_d(IndexBinary* index) {
22
+ FAISS_THROW_IF_NOT(index->d % 8 == 0);
23
+ index->code_size = index->d / 8;
24
+ }
25
+
26
+ } // anonymous namespace
27
+
15
28
  template <typename IndexT>
16
29
  IndexReplicasTemplate<IndexT>::IndexReplicasTemplate(bool threaded)
17
30
  : ThreadedIndex<IndexT>(threaded) {}
18
31
 
19
32
  template <typename IndexT>
20
33
  IndexReplicasTemplate<IndexT>::IndexReplicasTemplate(idx_t d, bool threaded)
21
- : ThreadedIndex<IndexT>(d, threaded) {}
34
+ : ThreadedIndex<IndexT>(d, threaded) {
35
+ sync_d(this);
36
+ }
22
37
 
23
38
  template <typename IndexT>
24
39
  IndexReplicasTemplate<IndexT>::IndexReplicasTemplate(int d, bool threaded)
25
- : ThreadedIndex<IndexT>(d, threaded) {}
40
+ : ThreadedIndex<IndexT>(d, threaded) {
41
+ sync_d(this);
42
+ }
26
43
 
27
44
  template <typename IndexT>
28
45
  void IndexReplicasTemplate<IndexT>::onAfterAddIndex(IndexT* index) {
@@ -168,6 +185,8 @@ void IndexReplicasTemplate<IndexT>::syncWithSubIndexes() {
168
185
  }
169
186
 
170
187
  auto firstIndex = this->at(0);
188
+ this->d = firstIndex->d;
189
+ sync_d(this);
171
190
  this->metric_type = firstIndex->metric_type;
172
191
  this->is_trained = firstIndex->is_trained;
173
192
  this->ntotal = firstIndex->ntotal;
@@ -181,30 +200,8 @@ void IndexReplicasTemplate<IndexT>::syncWithSubIndexes() {
181
200
  }
182
201
  }
183
202
 
184
- // No metric_type for IndexBinary
185
- template <>
186
- void IndexReplicasTemplate<IndexBinary>::syncWithSubIndexes() {
187
- if (!this->count()) {
188
- this->is_trained = false;
189
- this->ntotal = 0;
190
-
191
- return;
192
- }
193
-
194
- auto firstIndex = this->at(0);
195
- this->is_trained = firstIndex->is_trained;
196
- this->ntotal = firstIndex->ntotal;
197
-
198
- for (int i = 1; i < this->count(); ++i) {
199
- auto index = this->at(i);
200
- FAISS_THROW_IF_NOT(this->d == index->d);
201
- FAISS_THROW_IF_NOT(this->is_trained == index->is_trained);
202
- FAISS_THROW_IF_NOT(this->ntotal == index->ntotal);
203
- }
204
- }
205
-
206
203
  // explicit instantiations
207
- template struct IndexReplicasTemplate<Index>;
208
- template struct IndexReplicasTemplate<IndexBinary>;
204
+ template class IndexReplicasTemplate<Index>;
205
+ template class IndexReplicasTemplate<IndexBinary>;
209
206
 
210
207
  } // namespace faiss
@@ -60,10 +60,9 @@ void IndexScalarQuantizer::search(
60
60
 
61
61
  #pragma omp parallel
62
62
  {
63
- InvertedListScanner* scanner =
64
- sq.select_InvertedListScanner(metric_type, nullptr, true, sel);
63
+ std::unique_ptr<InvertedListScanner> scanner(
64
+ sq.select_InvertedListScanner(metric_type, nullptr, true, sel));
65
65
 
66
- ScopeDeleter1<InvertedListScanner> del(scanner);
67
66
  scanner->list_no = 0; // directly the list number
68
67
 
69
68
  #pragma omp for
@@ -122,21 +121,28 @@ IndexIVFScalarQuantizer::IndexIVFScalarQuantizer(
122
121
  size_t nlist,
123
122
  ScalarQuantizer::QuantizerType qtype,
124
123
  MetricType metric,
125
- bool encode_residual)
126
- : IndexIVF(quantizer, d, nlist, 0, metric),
127
- sq(d, qtype),
128
- by_residual(encode_residual) {
124
+ bool by_residual)
125
+ : IndexIVF(quantizer, d, nlist, 0, metric), sq(d, qtype) {
129
126
  code_size = sq.code_size;
127
+ this->by_residual = by_residual;
130
128
  // was not known at construction time
131
129
  invlists->code_size = code_size;
132
130
  is_trained = false;
133
131
  }
134
132
 
135
- IndexIVFScalarQuantizer::IndexIVFScalarQuantizer()
136
- : IndexIVF(), by_residual(true) {}
133
+ IndexIVFScalarQuantizer::IndexIVFScalarQuantizer() : IndexIVF() {
134
+ by_residual = true;
135
+ }
137
136
 
138
- void IndexIVFScalarQuantizer::train_residual(idx_t n, const float* x) {
139
- sq.train_residual(n, x, quantizer, by_residual, verbose);
137
+ void IndexIVFScalarQuantizer::train_encoder(
138
+ idx_t n,
139
+ const float* x,
140
+ const idx_t* assign) {
141
+ sq.train(n, x);
142
+ }
143
+
144
+ idx_t IndexIVFScalarQuantizer::train_encoder_num_vectors() const {
145
+ return 100000;
140
146
  }
141
147
 
142
148
  void IndexIVFScalarQuantizer::encode_vectors(
@@ -201,15 +207,15 @@ void IndexIVFScalarQuantizer::add_core(
201
207
  idx_t n,
202
208
  const float* x,
203
209
  const idx_t* xids,
204
- const idx_t* coarse_idx) {
210
+ const idx_t* coarse_idx,
211
+ void* inverted_list_context) {
205
212
  FAISS_THROW_IF_NOT(is_trained);
206
213
 
207
- size_t nadd = 0;
208
214
  std::unique_ptr<ScalarQuantizer::SQuantizer> squant(sq.select_quantizer());
209
215
 
210
216
  DirectMapAdd dm_add(direct_map, n, xids);
211
217
 
212
- #pragma omp parallel reduction(+ : nadd)
218
+ #pragma omp parallel
213
219
  {
214
220
  std::vector<float> residual(d);
215
221
  std::vector<uint8_t> one_code(code_size);
@@ -231,10 +237,10 @@ void IndexIVFScalarQuantizer::add_core(
231
237
  memset(one_code.data(), 0, code_size);
232
238
  squant->encode_vector(xi, one_code.data());
233
239
 
234
- size_t ofs = invlists->add_entry(list_no, id, one_code.data());
240
+ size_t ofs = invlists->add_entry(
241
+ list_no, id, one_code.data(), inverted_list_context);
235
242
 
236
243
  dm_add.add(i, list_no, ofs);
237
- nadd++;
238
244
 
239
245
  } else if (rank == 0 && list_no == -1) {
240
246
  dm_add.add(i, -1, 0);
@@ -65,7 +65,6 @@ struct IndexScalarQuantizer : IndexFlatCodes {
65
65
 
66
66
  struct IndexIVFScalarQuantizer : IndexIVF {
67
67
  ScalarQuantizer sq;
68
- bool by_residual;
69
68
 
70
69
  IndexIVFScalarQuantizer(
71
70
  Index* quantizer,
@@ -73,11 +72,13 @@ struct IndexIVFScalarQuantizer : IndexIVF {
73
72
  size_t nlist,
74
73
  ScalarQuantizer::QuantizerType qtype,
75
74
  MetricType metric = METRIC_L2,
76
- bool encode_residual = true);
75
+ bool by_residual = true);
77
76
 
78
77
  IndexIVFScalarQuantizer();
79
78
 
80
- void train_residual(idx_t n, const float* x) override;
79
+ void train_encoder(idx_t n, const float* x, const idx_t* assign) override;
80
+
81
+ idx_t train_encoder_num_vectors() const override;
81
82
 
82
83
  void encode_vectors(
83
84
  idx_t n,
@@ -90,7 +91,8 @@ struct IndexIVFScalarQuantizer : IndexIVF {
90
91
  idx_t n,
91
92
  const float* x,
92
93
  const idx_t* xids,
93
- const idx_t* precomputed_idx) override;
94
+ const idx_t* precomputed_idx,
95
+ void* inverted_list_context = nullptr) override;
94
96
 
95
97
  InvertedListScanner* get_InvertedListScanner(
96
98
  bool store_pairs,
@@ -5,8 +5,6 @@
5
5
  * LICENSE file in the root directory of this source tree.
6
6
  */
7
7
 
8
- // -*- c++ -*-
9
-
10
8
  #include <faiss/IndexShards.h>
11
9
 
12
10
  #include <cinttypes>
@@ -22,6 +20,15 @@ namespace faiss {
22
20
  // subroutines
23
21
  namespace {
24
22
 
23
+ // IndexBinary needs to update the code_size when d is set...
24
+
25
+ void sync_d(Index* index) {}
26
+
27
+ void sync_d(IndexBinary* index) {
28
+ FAISS_THROW_IF_NOT(index->d % 8 == 0);
29
+ index->code_size = index->d / 8;
30
+ }
31
+
25
32
  // add translation to all valid labels
26
33
  void translate_labels(int64_t n, idx_t* labels, int64_t translation) {
27
34
  if (translation == 0)
@@ -40,20 +47,26 @@ IndexShardsTemplate<IndexT>::IndexShardsTemplate(
40
47
  idx_t d,
41
48
  bool threaded,
42
49
  bool successive_ids)
43
- : ThreadedIndex<IndexT>(d, threaded), successive_ids(successive_ids) {}
50
+ : ThreadedIndex<IndexT>(d, threaded), successive_ids(successive_ids) {
51
+ sync_d(this);
52
+ }
44
53
 
45
54
  template <typename IndexT>
46
55
  IndexShardsTemplate<IndexT>::IndexShardsTemplate(
47
56
  int d,
48
57
  bool threaded,
49
58
  bool successive_ids)
50
- : ThreadedIndex<IndexT>(d, threaded), successive_ids(successive_ids) {}
59
+ : ThreadedIndex<IndexT>(d, threaded), successive_ids(successive_ids) {
60
+ sync_d(this);
61
+ }
51
62
 
52
63
  template <typename IndexT>
53
64
  IndexShardsTemplate<IndexT>::IndexShardsTemplate(
54
65
  bool threaded,
55
66
  bool successive_ids)
56
- : ThreadedIndex<IndexT>(threaded), successive_ids(successive_ids) {}
67
+ : ThreadedIndex<IndexT>(threaded), successive_ids(successive_ids) {
68
+ sync_d(this);
69
+ }
57
70
 
58
71
  template <typename IndexT>
59
72
  void IndexShardsTemplate<IndexT>::onAfterAddIndex(IndexT* index /* unused */) {
@@ -78,6 +91,8 @@ void IndexShardsTemplate<IndexT>::syncWithSubIndexes() {
78
91
  }
79
92
 
80
93
  auto firstIndex = this->at(0);
94
+ this->d = firstIndex->d;
95
+ sync_d(this);
81
96
  this->metric_type = firstIndex->metric_type;
82
97
  this->is_trained = firstIndex->is_trained;
83
98
  this->ntotal = firstIndex->ntotal;
@@ -92,29 +107,6 @@ void IndexShardsTemplate<IndexT>::syncWithSubIndexes() {
92
107
  }
93
108
  }
94
109
 
95
- // No metric_type for IndexBinary
96
- template <>
97
- void IndexShardsTemplate<IndexBinary>::syncWithSubIndexes() {
98
- if (!this->count()) {
99
- this->is_trained = false;
100
- this->ntotal = 0;
101
-
102
- return;
103
- }
104
-
105
- auto firstIndex = this->at(0);
106
- this->is_trained = firstIndex->is_trained;
107
- this->ntotal = firstIndex->ntotal;
108
-
109
- for (int i = 1; i < this->count(); ++i) {
110
- auto index = this->at(i);
111
- FAISS_THROW_IF_NOT(this->d == index->d);
112
- FAISS_THROW_IF_NOT(this->is_trained == index->is_trained);
113
-
114
- this->ntotal += index->ntotal;
115
- }
116
- }
117
-
118
110
  template <typename IndexT>
119
111
  void IndexShardsTemplate<IndexT>::train(idx_t n, const component_t* x) {
120
112
  auto fn = [n, x](int no, IndexT* index) {
@@ -155,7 +147,7 @@ void IndexShardsTemplate<IndexT>::add_with_ids(
155
147
  "request them to be shifted");
156
148
  FAISS_THROW_IF_NOT_MSG(
157
149
  this->ntotal == 0,
158
- "when adding to IndexShards with sucessive_ids, "
150
+ "when adding to IndexShards with successive_ids, "
159
151
  "only add() in a single pass is supported");
160
152
  }
161
153
 
@@ -111,7 +111,7 @@ void IndexShardsIVF::add_with_ids(
111
111
  "request them to be shifted");
112
112
  FAISS_THROW_IF_NOT_MSG(
113
113
  this->ntotal == 0,
114
- "when adding to IndexShards with sucessive_ids, "
114
+ "when adding to IndexShards with successive_ids, "
115
115
  "only add() in a single pass is supported");
116
116
  }
117
117
 
@@ -137,7 +137,6 @@ void IndexShardsIVF::add_with_ids(
137
137
  auto fn = [n, ids, x, nshard, d, Iq](int no, Index* index) {
138
138
  idx_t i0 = (idx_t)no * n / nshard;
139
139
  idx_t i1 = ((idx_t)no + 1) * n / nshard;
140
- const float* x0 = x + i0 * d;
141
140
  auto index_ivf = dynamic_cast<IndexIVF*>(index);
142
141
 
143
142
  if (index->verbose) {
@@ -9,9 +9,10 @@
9
9
 
10
10
  #include <faiss/MatrixStats.h>
11
11
 
12
- #include <stdarg.h> /* va_list, va_start, va_arg, va_end */
12
+ #include <cstdarg> /* va_list, va_start, va_arg, va_end */
13
13
 
14
14
  #include <faiss/utils/utils.h>
15
+ #include <cinttypes>
15
16
  #include <cmath>
16
17
  #include <cstdio>
17
18
 
@@ -21,18 +22,6 @@ namespace faiss {
21
22
  * MatrixStats
22
23
  *********************************************************************/
23
24
 
24
- MatrixStats::PerDimStats::PerDimStats()
25
- : n(0),
26
- n_nan(0),
27
- n_inf(0),
28
- n0(0),
29
- min(HUGE_VALF),
30
- max(-HUGE_VALF),
31
- sum(0),
32
- sum2(0),
33
- mean(NAN),
34
- stddev(NAN) {}
35
-
36
25
  void MatrixStats::PerDimStats::add(float x) {
37
26
  n++;
38
27
  if (std::isnan(x)) {
@@ -74,19 +63,12 @@ void MatrixStats::do_comment(const char* fmt, ...) {
74
63
  buf += size;
75
64
  }
76
65
 
77
- MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
78
- : n(n),
79
- d(d),
80
- n_collision(0),
81
- n_valid(0),
82
- n0(0),
83
- min_norm2(HUGE_VAL),
84
- max_norm2(0) {
66
+ MatrixStats::MatrixStats(size_t n, size_t d, const float* x) : n(n), d(d) {
85
67
  std::vector<char> comment_buf(10000);
86
68
  buf = comment_buf.data();
87
69
  nbuf = comment_buf.size();
88
70
 
89
- do_comment("analyzing %ld vectors of size %ld\n", n, d);
71
+ do_comment("analyzing %zd vectors of size %zd\n", n, d);
90
72
 
91
73
  if (d > 1024) {
92
74
  do_comment(
@@ -94,6 +76,9 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
94
76
  "please consider dimensionality reducution (with PCAMatrix)\n");
95
77
  }
96
78
 
79
+ hash_value = hash_bytes((const uint8_t*)x, n * d * sizeof(*x));
80
+ do_comment("hash value 0x%016" PRIx64 "\n", hash_value);
81
+
97
82
  size_t nbytes = sizeof(x[0]) * d;
98
83
  per_dim_stats.resize(d);
99
84
 
@@ -156,7 +141,7 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
156
141
 
157
142
  if (n_collision > 0) {
158
143
  do_comment(
159
- "%ld collisions in hash table, "
144
+ "%zd collisions in hash table, "
160
145
  "counts may be invalid\n",
161
146
  n_collision);
162
147
  }
@@ -167,14 +152,14 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
167
152
  max = it->second;
168
153
  }
169
154
  }
170
- do_comment("vector %ld has %ld copies\n", max.first, max.count);
155
+ do_comment("vector %zd has %zd copies\n", max.first, max.count);
171
156
  }
172
157
 
173
158
  { // norm stats
174
159
  min_norm2 = sqrt(min_norm2);
175
160
  max_norm2 = sqrt(max_norm2);
176
161
  do_comment(
177
- "range of L2 norms=[%g, %g] (%ld null vectors)\n",
162
+ "range of L2 norms=[%g, %g] (%zd null vectors)\n",
178
163
  min_norm2,
179
164
  max_norm2,
180
165
  n0);
@@ -182,7 +167,7 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
182
167
  if (max_norm2 < min_norm2 * 1.0001) {
183
168
  do_comment(
184
169
  "vectors are normalized, inner product and "
185
- "L2 search are equivalent\n");
170
+ "L2 search are equivalent\n");
186
171
  }
187
172
 
188
173
  if (max_norm2 > min_norm2 * 100) {
@@ -196,12 +181,12 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
196
181
 
197
182
  double max_std = 0, min_std = HUGE_VAL;
198
183
 
199
- size_t n_dangerous_range = 0, n_0_range = 0, n0 = 0;
184
+ size_t n_dangerous_range = 0, n_0_range = 0, n0_2 = 0;
200
185
 
201
186
  for (size_t j = 0; j < d; j++) {
202
187
  PerDimStats& st = per_dim_stats[j];
203
188
  st.compute_mean_std();
204
- n0 += st.n0;
189
+ n0_2 += st.n0;
205
190
 
206
191
  if (st.max == st.min) {
207
192
  n_0_range++;
@@ -215,19 +200,19 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
215
200
  min_std = st.stddev;
216
201
  }
217
202
 
218
- if (n0 == 0) {
203
+ if (n0_2 == 0) {
219
204
  do_comment("matrix contains no 0s\n");
220
205
  } else {
221
206
  do_comment(
222
207
  "matrix contains %.2f %% 0 entries\n",
223
- n0 * 100.0 / (n * d));
208
+ n0_2 * 100.0 / (n * d));
224
209
  }
225
210
 
226
211
  if (n_0_range == 0) {
227
212
  do_comment("no constant dimensions\n");
228
213
  } else {
229
214
  do_comment(
230
- "%ld dimensions are constant: they can be removed\n",
215
+ "%zd dimensions are constant: they can be removed\n",
231
216
  n_0_range);
232
217
  }
233
218
 
@@ -235,7 +220,7 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
235
220
  do_comment("no dimension has a too large mean\n");
236
221
  } else {
237
222
  do_comment(
238
- "%ld dimensions are too large "
223
+ "%zd dimensions are too large "
239
224
  "wrt. their variance, may loose precision "
240
225
  "in IndexFlatL2 (use CenteringTransform)\n",
241
226
  n_dangerous_range);
@@ -10,6 +10,7 @@
10
10
  #pragma once
11
11
 
12
12
  #include <stdint.h>
13
+ #include <cmath>
13
14
  #include <string>
14
15
  #include <unordered_map>
15
16
  #include <vector>
@@ -26,20 +27,31 @@ struct MatrixStats {
26
27
  std::string comments;
27
28
 
28
29
  // raw statistics
29
- size_t n, d;
30
- size_t n_collision, n_valid, n0;
31
- double min_norm2, max_norm2;
30
+ size_t n = 0, d = 0;
31
+ size_t n_collision = 0;
32
+ size_t n_valid = 0;
33
+ size_t n0 = 0;
34
+ double min_norm2 = HUGE_VALF;
35
+ double max_norm2 = 0;
36
+ uint64_t hash_value = 0;
32
37
 
33
38
  struct PerDimStats {
34
- size_t n, n_nan, n_inf, n0;
39
+ /// counts of various special entries
40
+ size_t n = 0;
41
+ size_t n_nan = 0;
42
+ size_t n_inf = 0;
43
+ size_t n0 = 0;
35
44
 
36
- float min, max;
37
- double sum, sum2;
45
+ /// to get min/max and stddev values
46
+ float min = HUGE_VALF;
47
+ float max = -HUGE_VALF;
48
+ double sum = 0;
49
+ double sum2 = 0;
38
50
 
39
- size_t n_valid;
40
- double mean, stddev;
51
+ size_t n_valid = 0;
52
+ double mean = NAN;
53
+ double stddev = NAN;
41
54
 
42
- PerDimStats();
43
55
  void add(float x);
44
56
  void compute_mean_std();
45
57
  };
@@ -9,8 +9,8 @@
9
9
 
10
10
  #include <faiss/MetaIndexes.h>
11
11
 
12
- #include <stdint.h>
13
12
  #include <cinttypes>
13
+ #include <cstdint>
14
14
  #include <cstdio>
15
15
  #include <limits>
16
16
 
@@ -70,37 +70,37 @@ void IndexSplitVectors::search(
70
70
  sum_d == d, "not enough indexes compared to # dimensions");
71
71
 
72
72
  int64_t nshard = sub_indexes.size();
73
- float* all_distances = new float[nshard * k * n];
74
- idx_t* all_labels = new idx_t[nshard * k * n];
75
- ScopeDeleter<float> del(all_distances);
76
- ScopeDeleter<idx_t> del2(all_labels);
77
-
78
- auto query_func = [n,
79
- x,
80
- k,
81
- distances,
82
- labels,
83
- all_distances,
84
- all_labels,
85
- this](int no) {
86
- const IndexSplitVectors* index = this;
87
- float* distances1 = no == 0 ? distances : all_distances + no * k * n;
88
- idx_t* labels1 = no == 0 ? labels : all_labels + no * k * n;
89
- if (index->verbose)
90
- printf("begin query shard %d on %" PRId64 " points\n", no, n);
91
- const Index* sub_index = index->sub_indexes[no];
92
- int64_t sub_d = sub_index->d, d = index->d;
93
- idx_t ofs = 0;
94
- for (int i = 0; i < no; i++)
95
- ofs += index->sub_indexes[i]->d;
96
- float* sub_x = new float[sub_d * n];
97
- ScopeDeleter<float> del1(sub_x);
98
- for (idx_t i = 0; i < n; i++)
99
- memcpy(sub_x + i * sub_d, x + ofs + i * d, sub_d * sizeof(sub_x));
100
- sub_index->search(n, sub_x, k, distances1, labels1);
101
- if (index->verbose)
102
- printf("end query shard %d\n", no);
103
- };
73
+
74
+ std::unique_ptr<float[]> all_distances(new float[nshard * k * n]);
75
+ std::unique_ptr<idx_t[]> all_labels(new idx_t[nshard * k * n]);
76
+
77
+ auto query_func =
78
+ [n, x, k, distances, labels, &all_distances, &all_labels, this](
79
+ int no) {
80
+ const IndexSplitVectors* index = this;
81
+ float* distances1 =
82
+ no == 0 ? distances : all_distances.get() + no * k * n;
83
+ idx_t* labels1 =
84
+ no == 0 ? labels : all_labels.get() + no * k * n;
85
+ if (index->verbose)
86
+ printf("begin query shard %d on %" PRId64 " points\n",
87
+ no,
88
+ n);
89
+ const Index* sub_index = index->sub_indexes[no];
90
+ int64_t sub_d = sub_index->d, d = index->d;
91
+ idx_t ofs = 0;
92
+ for (int i = 0; i < no; i++)
93
+ ofs += index->sub_indexes[i]->d;
94
+
95
+ std::unique_ptr<float[]> sub_x(new float[sub_d * n]);
96
+ for (idx_t i = 0; i < n; i++)
97
+ memcpy(sub_x.get() + i * sub_d,
98
+ x + ofs + i * d,
99
+ sub_d * sizeof(float));
100
+ sub_index->search(n, sub_x.get(), k, distances1, labels1);
101
+ if (index->verbose)
102
+ printf("end query shard %d\n", no);
103
+ };
104
104
 
105
105
  if (!threaded) {
106
106
  for (int i = 0; i < nshard; i++) {
@@ -125,8 +125,8 @@ void IndexSplitVectors::search(
125
125
  int64_t factor = 1;
126
126
  for (int i = 0; i < nshard; i++) {
127
127
  if (i > 0) { // results of 0 are already in the table
128
- const float* distances_i = all_distances + i * k * n;
129
- const idx_t* labels_i = all_labels + i * k * n;
128
+ const float* distances_i = all_distances.get() + i * k * n;
129
+ const idx_t* labels_i = all_labels.get() + i * k * n;
130
130
  for (int64_t j = 0; j < n; j++) {
131
131
  if (labels[j] >= 0 && labels_i[j] >= 0) {
132
132
  labels[j] += labels_i[j] * factor;
@@ -238,6 +238,6 @@ void IndexRandom::reset() {
238
238
  ntotal = 0;
239
239
  }
240
240
 
241
- IndexRandom::~IndexRandom() {}
241
+ IndexRandom::~IndexRandom() = default;
242
242
 
243
243
  } // namespace faiss
@@ -441,13 +441,10 @@ void eig(size_t d_in, double* cov, double* eigenvalues, int verbose) {
441
441
 
442
442
  } // namespace
443
443
 
444
- void PCAMatrix::train(idx_t n, const float* x) {
445
- const float* x_in = x;
446
-
447
- x = fvecs_maybe_subsample(
448
- d_in, (size_t*)&n, max_points_per_d * d_in, x, verbose);
449
-
450
- ScopeDeleter<float> del_x(x != x_in ? x : nullptr);
444
+ void PCAMatrix::train(idx_t n, const float* x_in) {
445
+ const float* x = fvecs_maybe_subsample(
446
+ d_in, (size_t*)&n, max_points_per_d * d_in, x_in, verbose);
447
+ TransformedVectors tv(x_in, x);
451
448
 
452
449
  // compute mean
453
450
  mean.clear();
@@ -884,14 +881,13 @@ ITQTransform::ITQTransform(int d_in, int d_out, bool do_pca)
884
881
  is_trained = false;
885
882
  }
886
883
 
887
- void ITQTransform::train(idx_t n, const float* x) {
884
+ void ITQTransform::train(idx_t n, const float* x_in) {
888
885
  FAISS_THROW_IF_NOT(!is_trained);
889
886
 
890
- const float* x_in = x;
891
887
  size_t max_train_points = std::max(d_in * max_train_per_dim, 32768);
892
- x = fvecs_maybe_subsample(d_in, (size_t*)&n, max_train_points, x);
893
-
894
- ScopeDeleter<float> del_x(x != x_in ? x : nullptr);
888
+ const float* x =
889
+ fvecs_maybe_subsample(d_in, (size_t*)&n, max_train_points, x_in);
890
+ TransformedVectors tv(x_in, x);
895
891
 
896
892
  std::unique_ptr<float[]> x_norm(new float[n * d_in]);
897
893
  { // normalize
@@ -988,25 +984,16 @@ void ITQTransform::check_identical(const VectorTransform& other_in) const {
988
984
  *********************************************/
989
985
 
990
986
  OPQMatrix::OPQMatrix(int d, int M, int d2)
991
- : LinearTransform(d, d2 == -1 ? d : d2, false),
992
- M(M),
993
- niter(50),
994
- niter_pq(4),
995
- niter_pq_0(40),
996
- verbose(false),
997
- pq(nullptr) {
987
+ : LinearTransform(d, d2 == -1 ? d : d2, false), M(M) {
998
988
  is_trained = false;
999
989
  // OPQ is quite expensive to train, so set this right.
1000
990
  max_train_points = 256 * 256;
1001
- pq = nullptr;
1002
991
  }
1003
992
 
1004
- void OPQMatrix::train(idx_t n, const float* x) {
1005
- const float* x_in = x;
1006
-
1007
- x = fvecs_maybe_subsample(d_in, (size_t*)&n, max_train_points, x, verbose);
1008
-
1009
- ScopeDeleter<float> del_x(x != x_in ? x : nullptr);
993
+ void OPQMatrix::train(idx_t n, const float* x_in) {
994
+ const float* x = fvecs_maybe_subsample(
995
+ d_in, (size_t*)&n, max_train_points, x_in, verbose);
996
+ TransformedVectors tv(x_in, x);
1010
997
 
1011
998
  // To support d_out > d_in, we pad input vectors with 0s to d_out
1012
999
  size_t d = d_out <= d_in ? d_in : d_out;