RubyGems - faiss - Versions diffs - 0.1.7 → 0.2.3 - Mend

faiss 0.1.7 → 0.2.3

Files changed (219) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +18 -0
data/README.md +7 -7
data/ext/faiss/ext.cpp +1 -1
data/ext/faiss/extconf.rb +8 -2
data/ext/faiss/index.cpp +102 -69
data/ext/faiss/index_binary.cpp +24 -30
data/ext/faiss/kmeans.cpp +20 -16
data/ext/faiss/numo.hpp +867 -0
data/ext/faiss/pca_matrix.cpp +13 -14
data/ext/faiss/product_quantizer.cpp +23 -24
data/ext/faiss/utils.cpp +10 -37
data/ext/faiss/utils.h +2 -13
data/lib/faiss/version.rb +1 -1
data/lib/faiss.rb +0 -5
data/vendor/faiss/faiss/AutoTune.cpp +292 -291
data/vendor/faiss/faiss/AutoTune.h +55 -56
data/vendor/faiss/faiss/Clustering.cpp +334 -195
data/vendor/faiss/faiss/Clustering.h +88 -35
data/vendor/faiss/faiss/IVFlib.cpp +171 -195
data/vendor/faiss/faiss/IVFlib.h +48 -51
data/vendor/faiss/faiss/Index.cpp +85 -103
data/vendor/faiss/faiss/Index.h +54 -48
data/vendor/faiss/faiss/Index2Layer.cpp +139 -164
data/vendor/faiss/faiss/Index2Layer.h +22 -22
data/vendor/faiss/faiss/IndexBinary.cpp +45 -37
data/vendor/faiss/faiss/IndexBinary.h +140 -132
data/vendor/faiss/faiss/IndexBinaryFlat.cpp +73 -53
data/vendor/faiss/faiss/IndexBinaryFlat.h +29 -24
data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +46 -43
data/vendor/faiss/faiss/IndexBinaryFromFloat.h +16 -15
data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +215 -232
data/vendor/faiss/faiss/IndexBinaryHNSW.h +25 -24
data/vendor/faiss/faiss/IndexBinaryHash.cpp +182 -177
data/vendor/faiss/faiss/IndexBinaryHash.h +41 -34
data/vendor/faiss/faiss/IndexBinaryIVF.cpp +489 -461
data/vendor/faiss/faiss/IndexBinaryIVF.h +97 -68
data/vendor/faiss/faiss/IndexFlat.cpp +116 -147
data/vendor/faiss/faiss/IndexFlat.h +35 -46
data/vendor/faiss/faiss/IndexHNSW.cpp +372 -348
data/vendor/faiss/faiss/IndexHNSW.h +57 -41
data/vendor/faiss/faiss/IndexIVF.cpp +474 -454
data/vendor/faiss/faiss/IndexIVF.h +146 -113
data/vendor/faiss/faiss/IndexIVFFlat.cpp +248 -250
data/vendor/faiss/faiss/IndexIVFFlat.h +48 -51
data/vendor/faiss/faiss/IndexIVFPQ.cpp +457 -516
data/vendor/faiss/faiss/IndexIVFPQ.h +74 -66
data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +406 -372
data/vendor/faiss/faiss/IndexIVFPQFastScan.h +82 -57
data/vendor/faiss/faiss/IndexIVFPQR.cpp +104 -102
data/vendor/faiss/faiss/IndexIVFPQR.h +33 -28
data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +125 -133
data/vendor/faiss/faiss/IndexIVFSpectralHash.h +19 -21
data/vendor/faiss/faiss/IndexLSH.cpp +75 -96
data/vendor/faiss/faiss/IndexLSH.h +21 -26
data/vendor/faiss/faiss/IndexLattice.cpp +42 -56
data/vendor/faiss/faiss/IndexLattice.h +11 -16
data/vendor/faiss/faiss/IndexNNDescent.cpp +231 -0
data/vendor/faiss/faiss/IndexNNDescent.h +72 -0
data/vendor/faiss/faiss/IndexNSG.cpp +303 -0
data/vendor/faiss/faiss/IndexNSG.h +85 -0
data/vendor/faiss/faiss/IndexPQ.cpp +405 -464
data/vendor/faiss/faiss/IndexPQ.h +64 -67
data/vendor/faiss/faiss/IndexPQFastScan.cpp +143 -170
data/vendor/faiss/faiss/IndexPQFastScan.h +46 -32
data/vendor/faiss/faiss/IndexPreTransform.cpp +120 -150
data/vendor/faiss/faiss/IndexPreTransform.h +33 -36
data/vendor/faiss/faiss/IndexRefine.cpp +115 -131
data/vendor/faiss/faiss/IndexRefine.h +22 -23
data/vendor/faiss/faiss/IndexReplicas.cpp +147 -153
data/vendor/faiss/faiss/IndexReplicas.h +62 -56
data/vendor/faiss/faiss/IndexResidual.cpp +291 -0
data/vendor/faiss/faiss/IndexResidual.h +152 -0
data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +120 -155
data/vendor/faiss/faiss/IndexScalarQuantizer.h +41 -45
data/vendor/faiss/faiss/IndexShards.cpp +256 -240
data/vendor/faiss/faiss/IndexShards.h +85 -73
data/vendor/faiss/faiss/MatrixStats.cpp +112 -97
data/vendor/faiss/faiss/MatrixStats.h +7 -10
data/vendor/faiss/faiss/MetaIndexes.cpp +135 -157
data/vendor/faiss/faiss/MetaIndexes.h +40 -34
data/vendor/faiss/faiss/MetricType.h +7 -7
data/vendor/faiss/faiss/VectorTransform.cpp +652 -474
data/vendor/faiss/faiss/VectorTransform.h +61 -89
data/vendor/faiss/faiss/clone_index.cpp +77 -73
data/vendor/faiss/faiss/clone_index.h +4 -9
data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +33 -38
data/vendor/faiss/faiss/gpu/GpuAutoTune.h +11 -9
data/vendor/faiss/faiss/gpu/GpuCloner.cpp +197 -170
data/vendor/faiss/faiss/gpu/GpuCloner.h +53 -35
data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +12 -14
data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +27 -25
data/vendor/faiss/faiss/gpu/GpuDistance.h +116 -112
data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -2
data/vendor/faiss/faiss/gpu/GpuIndex.h +134 -137
data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +76 -73
data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +173 -162
data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +67 -64
data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +89 -86
data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +150 -141
data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +101 -103
data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +17 -16
data/vendor/faiss/faiss/gpu/GpuResources.cpp +116 -128
data/vendor/faiss/faiss/gpu/GpuResources.h +182 -186
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +433 -422
data/vendor/faiss/faiss/gpu/StandardGpuResources.h +131 -130
data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +468 -456
data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +25 -19
data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +22 -20
data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +9 -8
data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +39 -44
data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +16 -14
data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +77 -71
data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +109 -88
data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +75 -64
data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +230 -215
data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +80 -86
data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +284 -277
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +416 -416
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +611 -517
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +166 -164
data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +61 -53
data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +274 -238
data/vendor/faiss/faiss/gpu/test/TestUtils.h +73 -57
data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +47 -50
data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +79 -72
data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +140 -146
data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +69 -71
data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +21 -16
data/vendor/faiss/faiss/gpu/utils/Timer.cpp +25 -29
data/vendor/faiss/faiss/gpu/utils/Timer.h +30 -29
data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +270 -0
data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +115 -0
data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +90 -120
data/vendor/faiss/faiss/impl/AuxIndexStructures.h +81 -65
data/vendor/faiss/faiss/impl/FaissAssert.h +73 -58
data/vendor/faiss/faiss/impl/FaissException.cpp +56 -48
data/vendor/faiss/faiss/impl/FaissException.h +41 -29
data/vendor/faiss/faiss/impl/HNSW.cpp +595 -611
data/vendor/faiss/faiss/impl/HNSW.h +179 -200
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +672 -0
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +172 -0
data/vendor/faiss/faiss/impl/NNDescent.cpp +487 -0
data/vendor/faiss/faiss/impl/NNDescent.h +154 -0
data/vendor/faiss/faiss/impl/NSG.cpp +682 -0
data/vendor/faiss/faiss/impl/NSG.h +199 -0
data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +484 -454
data/vendor/faiss/faiss/impl/PolysemousTraining.h +52 -55
data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +26 -47
data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +469 -459
data/vendor/faiss/faiss/impl/ProductQuantizer.h +76 -87
data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +448 -0
data/vendor/faiss/faiss/impl/ResidualQuantizer.h +130 -0
data/vendor/faiss/faiss/impl/ResultHandler.h +96 -132
data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +648 -701
data/vendor/faiss/faiss/impl/ScalarQuantizer.h +48 -46
data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +129 -131
data/vendor/faiss/faiss/impl/ThreadedIndex.h +61 -55
data/vendor/faiss/faiss/impl/index_read.cpp +547 -479
data/vendor/faiss/faiss/impl/index_write.cpp +497 -407
data/vendor/faiss/faiss/impl/io.cpp +75 -94
data/vendor/faiss/faiss/impl/io.h +31 -41
data/vendor/faiss/faiss/impl/io_macros.h +40 -29
data/vendor/faiss/faiss/impl/lattice_Zn.cpp +137 -186
data/vendor/faiss/faiss/impl/lattice_Zn.h +40 -51
data/vendor/faiss/faiss/impl/platform_macros.h +29 -8
data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +77 -124
data/vendor/faiss/faiss/impl/pq4_fast_scan.h +39 -48
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +41 -52
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +80 -117
data/vendor/faiss/faiss/impl/simd_result_handlers.h +109 -137
data/vendor/faiss/faiss/index_factory.cpp +269 -218
data/vendor/faiss/faiss/index_factory.h +6 -7
data/vendor/faiss/faiss/index_io.h +23 -26
data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +67 -75
data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +22 -24
data/vendor/faiss/faiss/invlists/DirectMap.cpp +96 -112
data/vendor/faiss/faiss/invlists/DirectMap.h +29 -33
data/vendor/faiss/faiss/invlists/InvertedLists.cpp +307 -364
data/vendor/faiss/faiss/invlists/InvertedLists.h +151 -151
data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +29 -34
data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +17 -18
data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +257 -293
data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +50 -45
data/vendor/faiss/faiss/python/python_callbacks.cpp +23 -26
data/vendor/faiss/faiss/python/python_callbacks.h +9 -16
data/vendor/faiss/faiss/utils/AlignedTable.h +79 -44
data/vendor/faiss/faiss/utils/Heap.cpp +40 -48
data/vendor/faiss/faiss/utils/Heap.h +186 -209
data/vendor/faiss/faiss/utils/WorkerThread.cpp +67 -76
data/vendor/faiss/faiss/utils/WorkerThread.h +32 -33
data/vendor/faiss/faiss/utils/distances.cpp +301 -310
data/vendor/faiss/faiss/utils/distances.h +133 -118
data/vendor/faiss/faiss/utils/distances_simd.cpp +456 -516
data/vendor/faiss/faiss/utils/extra_distances-inl.h +117 -0
data/vendor/faiss/faiss/utils/extra_distances.cpp +113 -232
data/vendor/faiss/faiss/utils/extra_distances.h +30 -29
data/vendor/faiss/faiss/utils/hamming-inl.h +260 -209
data/vendor/faiss/faiss/utils/hamming.cpp +375 -469
data/vendor/faiss/faiss/utils/hamming.h +62 -85
data/vendor/faiss/faiss/utils/ordered_key_value.h +16 -18
data/vendor/faiss/faiss/utils/partitioning.cpp +393 -318
data/vendor/faiss/faiss/utils/partitioning.h +26 -21
data/vendor/faiss/faiss/utils/quantize_lut.cpp +78 -66
data/vendor/faiss/faiss/utils/quantize_lut.h +22 -20
data/vendor/faiss/faiss/utils/random.cpp +39 -63
data/vendor/faiss/faiss/utils/random.h +13 -16
data/vendor/faiss/faiss/utils/simdlib.h +4 -2
data/vendor/faiss/faiss/utils/simdlib_avx2.h +88 -85
data/vendor/faiss/faiss/utils/simdlib_emulated.h +226 -165
data/vendor/faiss/faiss/utils/simdlib_neon.h +832 -0
data/vendor/faiss/faiss/utils/utils.cpp +304 -287
data/vendor/faiss/faiss/utils/utils.h +53 -48
metadata +26 -12
data/lib/faiss/index.rb +0 -20
data/lib/faiss/index_binary.rb +0 -20
data/lib/faiss/kmeans.rb +0 -15
data/lib/faiss/pca_matrix.rb +0 -15
data/lib/faiss/product_quantizer.rb +0 -22

data/vendor/faiss/faiss/IndexShards.h CHANGED Viewed

@@ -18,82 +18,94 @@ namespace faiss {
  */
 template <typename IndexT>
 struct IndexShardsTemplate : public ThreadedIndex<IndexT> {
-  using idx_t = typename IndexT::idx_t;
-  using component_t = typename IndexT::component_t;
-  using distance_t = typename IndexT::distance_t;
-  /**
-   * The dimension that all sub-indices must share will be the dimension of the
-   * first sub-index added
-   *
-   * @param threaded     do we use one thread per sub_index or do
-   *                     queries sequentially?
-   * @param successive_ids should we shift the returned ids by
-   *                     the size of each sub-index or return them
-   *                     as they are?
-   */
-  explicit IndexShardsTemplate(bool threaded = false,
-                               bool successive_ids = true);
-  /**
-   * @param threaded     do we use one thread per sub_index or do
-   *                     queries sequentially?
-   * @param successive_ids should we shift the returned ids by
-   *                     the size of each sub-index or return them
-   *                     as they are?
-   */
-  explicit IndexShardsTemplate(idx_t d,
-                               bool threaded = false,
-                               bool successive_ids = true);
-  /// int version due to the implicit bool conversion ambiguity of int as
-  /// dimension
-  explicit IndexShardsTemplate(int d,
-                               bool threaded = false,
-                               bool successive_ids = true);
-  /// Alias for addIndex()
-  void add_shard(IndexT* index) { this->addIndex(index); }
-  /// Alias for removeIndex()
-  void remove_shard(IndexT* index) { this->removeIndex(index); }
-  /// supported only for sub-indices that implement add_with_ids
-  void add(idx_t n, const component_t* x) override;
-  /**
-   * Cases (successive_ids, xids):
-   * - true, non-NULL       ERROR: it makes no sense to pass in ids and
-   *                        request them to be shifted
-   * - true, NULL           OK, but should be called only once (calls add()
-   *                        on sub-indexes).
-   * - false, non-NULL      OK: will call add_with_ids with passed in xids
-   *                        distributed evenly over shards
-   * - false, NULL          OK: will call add_with_ids on each sub-index,
-   *                        starting at ntotal
-   */
-  void add_with_ids(idx_t n, const component_t* x, const idx_t* xids) override;
-  void search(idx_t n, const component_t* x, idx_t k,
-              distance_t* distances, idx_t* labels) const override;
-  void train(idx_t n, const component_t* x) override;
-  bool successive_ids;
-  /// Synchronize the top-level index (IndexShards) with data in the sub-indices
-  void syncWithSubIndexes();
- protected:
-  /// Called just after an index is added
-  void onAfterAddIndex(IndexT* index) override;
-  /// Called just after an index is removed
-  void onAfterRemoveIndex(IndexT* index) override;
+    using idx_t = typename IndexT::idx_t;
+    using component_t = typename IndexT::component_t;
+    using distance_t = typename IndexT::distance_t;
+    /**
+     * The dimension that all sub-indices must share will be the dimension of
+     * the first sub-index added
+     *
+     * @param threaded     do we use one thread per sub_index or do
+     *                     queries sequentially?
+     * @param successive_ids should we shift the returned ids by
+     *                     the size of each sub-index or return them
+     *                     as they are?
+     */
+    explicit IndexShardsTemplate(
+            bool threaded = false,
+            bool successive_ids = true);
+    /**
+     * @param threaded     do we use one thread per sub_index or do
+     *                     queries sequentially?
+     * @param successive_ids should we shift the returned ids by
+     *                     the size of each sub-index or return them
+     *                     as they are?
+     */
+    explicit IndexShardsTemplate(
+            idx_t d,
+            bool threaded = false,
+            bool successive_ids = true);
+    /// int version due to the implicit bool conversion ambiguity of int as
+    /// dimension
+    explicit IndexShardsTemplate(
+            int d,
+            bool threaded = false,
+            bool successive_ids = true);
+    /// Alias for addIndex()
+    void add_shard(IndexT* index) {
+        this->addIndex(index);
+    }
+    /// Alias for removeIndex()
+    void remove_shard(IndexT* index) {
+        this->removeIndex(index);
+    }
+    /// supported only for sub-indices that implement add_with_ids
+    void add(idx_t n, const component_t* x) override;
+    /**
+     * Cases (successive_ids, xids):
+     * - true, non-NULL       ERROR: it makes no sense to pass in ids and
+     *                        request them to be shifted
+     * - true, NULL           OK, but should be called only once (calls add()
+     *                        on sub-indexes).
+     * - false, non-NULL      OK: will call add_with_ids with passed in xids
+     *                        distributed evenly over shards
+     * - false, NULL          OK: will call add_with_ids on each sub-index,
+     *                        starting at ntotal
+     */
+    void add_with_ids(idx_t n, const component_t* x, const idx_t* xids)
+            override;
+    void search(
+            idx_t n,
+            const component_t* x,
+            idx_t k,
+            distance_t* distances,
+            idx_t* labels) const override;
+    void train(idx_t n, const component_t* x) override;
+    bool successive_ids;
+    /// Synchronize the top-level index (IndexShards) with data in the
+    /// sub-indices
+    void syncWithSubIndexes();
+   protected:
+    /// Called just after an index is added
+    void onAfterAddIndex(IndexT* index) override;
+    /// Called just after an index is removed
+    void onAfterRemoveIndex(IndexT* index) override;
 };
 using IndexShards = IndexShardsTemplate<Index>;
 using IndexBinaryShards = IndexShardsTemplate<IndexBinary>;
 } // namespace faiss

data/vendor/faiss/faiss/MatrixStats.cpp CHANGED Viewed

@@ -7,15 +7,13 @@
 // -*- c++ -*-
 #include <faiss/MatrixStats.h>
+#include <stdarg.h> /* va_list, va_start, va_arg, va_end */
-#include <stdarg.h>     /* va_list, va_start, va_arg, va_end */
+#include <faiss/utils/utils.h>
 #include <cmath>
 #include <cstdio>
-#include <faiss/utils/utils.h>
 namespace faiss {
@@ -23,16 +21,19 @@ namespace faiss {
  * MatrixStats
  *********************************************************************/
-MatrixStats::PerDimStats::PerDimStats():
-    n(0), n_nan(0), n_inf(0), n0(0),
-    min(HUGE_VALF), max(-HUGE_VALF),
-    sum(0), sum2(0),
-    mean(NAN), stddev(NAN)
-{}
-void MatrixStats::PerDimStats::add (float x)
-{
+MatrixStats::PerDimStats::PerDimStats()
+        : n(0),
+          n_nan(0),
+          n_inf(0),
+          n0(0),
+          min(HUGE_VALF),
+          max(-HUGE_VALF),
+          sum(0),
+          sum2(0),
+          mean(NAN),
+          stddev(NAN) {}
+void MatrixStats::PerDimStats::add(float x) {
     n++;
     if (std::isnan(x)) {
         n_nan++;
@@ -42,25 +43,26 @@ void MatrixStats::PerDimStats::add (float x)
         n_inf++;
         return;
     }
-    if (x == 0) n0++;
-    if (x < min) min = x;
-    if (x > max) max = x;
+    if (x == 0)
+        n0++;
+    if (x < min)
+        min = x;
+    if (x > max)
+        max = x;
     sum += x;
     sum2 += (double)x * (double)x;
 }
-void MatrixStats::PerDimStats::compute_mean_std ()
-{
+void MatrixStats::PerDimStats::compute_mean_std() {
     n_valid = n - n_nan - n_inf;
     mean = sum / n_valid;
     double var = sum2 / n_valid - mean * mean;
-    if (var < 0) var = 0;
+    if (var < 0)
+        var = 0;
     stddev = sqrt(var);
 }
-void MatrixStats::do_comment (const char *fmt, ...)
-{
+void MatrixStats::do_comment(const char* fmt, ...) {
     va_list ap;
     /* Determine required size */
@@ -72,57 +74,60 @@ void MatrixStats::do_comment (const char *fmt, ...)
     buf += size;
 }
-MatrixStats::MatrixStats (size_t n, size_t d, const float *x):
-    n(n), d(d),
-    n_collision(0), n_valid(0), n0(0),
-    min_norm2(HUGE_VAL), max_norm2(0)
-{
-    std::vector<char> comment_buf (10000);
-    buf = comment_buf.data ();
+MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
+        : n(n),
+          d(d),
+          n_collision(0),
+          n_valid(0),
+          n0(0),
+          min_norm2(HUGE_VAL),
+          max_norm2(0) {
+    std::vector<char> comment_buf(10000);
+    buf = comment_buf.data();
     nbuf = comment_buf.size();
-    do_comment ("analyzing %ld vectors of size %ld\n", n, d);
+    do_comment("analyzing %ld vectors of size %ld\n", n, d);
     if (d > 1024) {
-        do_comment (
-           "indexing this many dimensions is hard, "
-           "please consider dimensionality reducution (with PCAMatrix)\n");
+        do_comment(
+                "indexing this many dimensions is hard, "
+                "please consider dimensionality reducution (with PCAMatrix)\n");
     }
-    size_t nbytes = sizeof (x[0]) * d;
-    per_dim_stats.resize (d);
+    size_t nbytes = sizeof(x[0]) * d;
+    per_dim_stats.resize(d);
     for (size_t i = 0; i < n; i++) {
-        const float *xi = x + d * i;
+        const float* xi = x + d * i;
         double sum2 = 0;
         for (size_t j = 0; j < d; j++) {
-            per_dim_stats[j].add (xi[j]);
+            per_dim_stats[j].add(xi[j]);
             sum2 += xi[j] * (double)xi[j];
         }
-        if (std::isfinite (sum2)) {
+        if (std::isfinite(sum2)) {
             n_valid++;
             if (sum2 == 0) {
-                n0 ++;
+                n0++;
             } else {
-                if (sum2 < min_norm2) min_norm2 = sum2;
-                if (sum2 > max_norm2) max_norm2 = sum2;
+                if (sum2 < min_norm2)
+                    min_norm2 = sum2;
+                if (sum2 > max_norm2)
+                    max_norm2 = sum2;
             }
         }
         { // check hash
             uint64_t hash = hash_bytes((const uint8_t*)xi, nbytes);
-            auto elt = occurrences.find (hash);
+            auto elt = occurrences.find(hash);
             if (elt == occurrences.end()) {
                 Occurrence occ = {i, 1};
                 occurrences[hash] = occ;
             } else {
-                if (!memcmp (xi, x + elt->second.first * d, nbytes)) {
-                    elt->second.count ++;
+                if (!memcmp(xi, x + elt->second.first * d, nbytes)) {
+                    elt->second.count++;
                 } else {
-                    n_collision ++;
+                    n_collision++;
                     // we should use a list of collisions but overkill
                 }
             }
@@ -131,50 +136,59 @@ MatrixStats::MatrixStats (size_t n, size_t d, const float *x):
     // invalid vecor stats
     if (n_valid == n) {
-        do_comment ("no NaN or Infs in data\n");
+        do_comment("no NaN or Infs in data\n");
     } else {
-        do_comment ("%ld vectors contain NaN or Inf "
-                 "(or have too large components), "
-                 "expect bad results with indexing!\n", n - n_valid);
+        do_comment(
+                "%ld vectors contain NaN or Inf "
+                "(or have too large components), "
+                "expect bad results with indexing!\n",
+                n - n_valid);
     }
     // copies in dataset
     if (occurrences.size() == n) {
-        do_comment ("all vectors are distinct\n");
+        do_comment("all vectors are distinct\n");
     } else {
-        do_comment ("%ld vectors are distinct (%.2f%%)\n",
-                 occurrences.size(),
-                 occurrences.size() * 100.0 / n);
+        do_comment(
+                "%ld vectors are distinct (%.2f%%)\n",
+                occurrences.size(),
+                occurrences.size() * 100.0 / n);
         if (n_collision > 0) {
-            do_comment ("%ld collisions in hash table, "
-                     "counts may be invalid\n", n_collision);
+            do_comment(
+                    "%ld collisions in hash table, "
+                    "counts may be invalid\n",
+                    n_collision);
         }
         Occurrence max = {0, 0};
-        for (auto it = occurrences.begin();
-             it != occurrences.end(); ++it) {
+        for (auto it = occurrences.begin(); it != occurrences.end(); ++it) {
             if (it->second.count > max.count) {
                 max = it->second;
             }
         }
-        do_comment ("vector %ld has %ld copies\n", max.first, max.count);
+        do_comment("vector %ld has %ld copies\n", max.first, max.count);
     }
     { // norm stats
-        min_norm2 = sqrt (min_norm2);
-        max_norm2 = sqrt (max_norm2);
-        do_comment ("range of L2 norms=[%g, %g] (%ld null vectors)\n",
-                 min_norm2, max_norm2, n0);
+        min_norm2 = sqrt(min_norm2);
+        max_norm2 = sqrt(max_norm2);
+        do_comment(
+                "range of L2 norms=[%g, %g] (%ld null vectors)\n",
+                min_norm2,
+                max_norm2,
+                n0);
         if (max_norm2 < min_norm2 * 1.0001) {
-            do_comment ("vectors are normalized, inner product and "
-                     "L2  search are equivalent\n");
+            do_comment(
+                    "vectors are normalized, inner product and "
+                    "L2  search are equivalent\n");
         }
         if (max_norm2 > min_norm2 * 100) {
-            do_comment ("vectors have very large differences in norms, "
-                     "is this normal?\n");
+            do_comment(
+                    "vectors have very large differences in norms, "
+                    "is this normal?\n");
         }
     }
@@ -185,68 +199,69 @@ MatrixStats::MatrixStats (size_t n, size_t d, const float *x):
         size_t n_dangerous_range = 0, n_0_range = 0, n0 = 0;
         for (size_t j = 0; j < d; j++) {
-            PerDimStats &st = per_dim_stats[j];
-            st.compute_mean_std ();
+            PerDimStats& st = per_dim_stats[j];
+            st.compute_mean_std();
             n0 += st.n0;
             if (st.max == st.min) {
-                n_0_range ++;
+                n_0_range++;
             } else if (st.max < 1.001 * st.min) {
-                n_dangerous_range ++;
+                n_dangerous_range++;
             }
-            if (st.stddev > max_std) max_std = st.stddev;
-            if (st.stddev < min_std) min_std = st.stddev;
+            if (st.stddev > max_std)
+                max_std = st.stddev;
+            if (st.stddev < min_std)
+                min_std = st.stddev;
         }
         if (n0 == 0) {
-            do_comment ("matrix contains no 0s\n");
+            do_comment("matrix contains no 0s\n");
         } else {
-            do_comment ("matrix contains %.2f %% 0 entries\n",
-                     n0 * 100.0 / (n * d));
+            do_comment(
+                    "matrix contains %.2f %% 0 entries\n",
+                    n0 * 100.0 / (n * d));
         }
         if (n_0_range == 0) {
-            do_comment ("no constant dimensions\n");
+            do_comment("no constant dimensions\n");
         } else {
-            do_comment ("%ld dimensions are constant: they can be removed\n",
-                     n_0_range);
+            do_comment(
+                    "%ld dimensions are constant: they can be removed\n",
+                    n_0_range);
         }
         if (n_dangerous_range == 0) {
-            do_comment ("no dimension has a too large mean\n");
+            do_comment("no dimension has a too large mean\n");
         } else {
-            do_comment ("%ld dimensions are too large "
-                     "wrt. their variance, may loose precision "
-                     "in IndexFlatL2 (use CenteringTransform)\n",
-                     n_dangerous_range);
+            do_comment(
+                    "%ld dimensions are too large "
+                    "wrt. their variance, may loose precision "
+                    "in IndexFlatL2 (use CenteringTransform)\n",
+                    n_dangerous_range);
         }
-        do_comment ("stddevs per dimension are in [%g %g]\n", min_std, max_std);
+        do_comment("stddevs per dimension are in [%g %g]\n", min_std, max_std);
         size_t n_small_var = 0;
         for (size_t j = 0; j < d; j++) {
-            const PerDimStats &st = per_dim_stats[j];
+            const PerDimStats& st = per_dim_stats[j];
             if (st.stddev < max_std * 1e-4) {
                 n_small_var++;
             }
         }
         if (n_small_var > 0) {
-            do_comment ("%ld dimensions have negligible stddev wrt. "
-                     "the largest dimension, they could be ignored",
-                     n_small_var);
+            do_comment(
+                    "%ld dimensions have negligible stddev wrt. "
+                    "the largest dimension, they could be ignored",
+                    n_small_var);
         }
     }
-    comments = comment_buf.data ();
+    comments = comment_buf.data();
     buf = nullptr;
     nbuf = 0;
 }
 } // namespace faiss

data/vendor/faiss/faiss/MatrixStats.h CHANGED Viewed

@@ -9,22 +9,20 @@
 #pragma once
-#include <vector>
+#include <stdint.h>
 #include <string>
 #include <unordered_map>
-#include <stdint.h>
+#include <vector>
 namespace faiss {
 /** Reports some statistics on a dataset and comments on them.
  *
  * It is a class rather than a function so that all stats can also be
  * accessed from code */
 struct MatrixStats {
-    MatrixStats (size_t n, size_t d, const float *x);
+    MatrixStats(size_t n, size_t d, const float* x);
     std::string comments;
     // raw statistics
@@ -42,8 +40,8 @@ struct MatrixStats {
         double mean, stddev;
         PerDimStats();
-        void add (float x);
-        void compute_mean_std ();
+        void add(float x);
+        void compute_mean_std();
     };
     std::vector<PerDimStats> per_dim_stats;
@@ -53,10 +51,9 @@ struct MatrixStats {
     };
     std::unordered_map<uint64_t, Occurrence> occurrences;
-    char *buf;
+    char* buf;
     size_t nbuf;
-    void do_comment (const char *fmt, ...);
+    void do_comment(const char* fmt, ...);
 };
 } // namespace faiss