RubyGems - faiss - Versions diffs - 0.2.0 → 0.2.4 - Mend

faiss 0.2.0 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (215)

checksums.yaml +4 -4
data/CHANGELOG.md +16 -0
data/LICENSE.txt +1 -1
data/README.md +7 -7
data/ext/faiss/extconf.rb +6 -3
data/ext/faiss/numo.hpp +4 -4
data/ext/faiss/utils.cpp +1 -1
data/ext/faiss/utils.h +1 -1
data/lib/faiss/version.rb +1 -1
data/vendor/faiss/faiss/AutoTune.cpp +292 -291
data/vendor/faiss/faiss/AutoTune.h +55 -56
data/vendor/faiss/faiss/Clustering.cpp +365 -194
data/vendor/faiss/faiss/Clustering.h +102 -35
data/vendor/faiss/faiss/IVFlib.cpp +171 -195
data/vendor/faiss/faiss/IVFlib.h +48 -51
data/vendor/faiss/faiss/Index.cpp +85 -103
data/vendor/faiss/faiss/Index.h +54 -48
data/vendor/faiss/faiss/Index2Layer.cpp +126 -224
data/vendor/faiss/faiss/Index2Layer.h +22 -36
data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +407 -0
data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +195 -0
data/vendor/faiss/faiss/IndexBinary.cpp +45 -37
data/vendor/faiss/faiss/IndexBinary.h +140 -132
data/vendor/faiss/faiss/IndexBinaryFlat.cpp +73 -53
data/vendor/faiss/faiss/IndexBinaryFlat.h +29 -24
data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +46 -43
data/vendor/faiss/faiss/IndexBinaryFromFloat.h +16 -15
data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +215 -232
data/vendor/faiss/faiss/IndexBinaryHNSW.h +25 -24
data/vendor/faiss/faiss/IndexBinaryHash.cpp +182 -177
data/vendor/faiss/faiss/IndexBinaryHash.h +41 -34
data/vendor/faiss/faiss/IndexBinaryIVF.cpp +489 -461
data/vendor/faiss/faiss/IndexBinaryIVF.h +97 -68
data/vendor/faiss/faiss/IndexFlat.cpp +115 -176
data/vendor/faiss/faiss/IndexFlat.h +42 -59
data/vendor/faiss/faiss/IndexFlatCodes.cpp +67 -0
data/vendor/faiss/faiss/IndexFlatCodes.h +47 -0
data/vendor/faiss/faiss/IndexHNSW.cpp +372 -348
data/vendor/faiss/faiss/IndexHNSW.h +57 -41
data/vendor/faiss/faiss/IndexIVF.cpp +545 -453
data/vendor/faiss/faiss/IndexIVF.h +169 -118
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +316 -0
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +121 -0
data/vendor/faiss/faiss/IndexIVFFlat.cpp +247 -252
data/vendor/faiss/faiss/IndexIVFFlat.h +48 -51
data/vendor/faiss/faiss/IndexIVFPQ.cpp +459 -517
data/vendor/faiss/faiss/IndexIVFPQ.h +75 -67
data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +406 -372
data/vendor/faiss/faiss/IndexIVFPQFastScan.h +82 -57
data/vendor/faiss/faiss/IndexIVFPQR.cpp +104 -102
data/vendor/faiss/faiss/IndexIVFPQR.h +33 -28
data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +163 -150
data/vendor/faiss/faiss/IndexIVFSpectralHash.h +38 -25
data/vendor/faiss/faiss/IndexLSH.cpp +66 -113
data/vendor/faiss/faiss/IndexLSH.h +20 -38
data/vendor/faiss/faiss/IndexLattice.cpp +42 -56
data/vendor/faiss/faiss/IndexLattice.h +11 -16
data/vendor/faiss/faiss/IndexNNDescent.cpp +229 -0
data/vendor/faiss/faiss/IndexNNDescent.h +72 -0
data/vendor/faiss/faiss/IndexNSG.cpp +301 -0
data/vendor/faiss/faiss/IndexNSG.h +85 -0
data/vendor/faiss/faiss/IndexPQ.cpp +387 -495
data/vendor/faiss/faiss/IndexPQ.h +64 -82
data/vendor/faiss/faiss/IndexPQFastScan.cpp +143 -170
data/vendor/faiss/faiss/IndexPQFastScan.h +46 -32
data/vendor/faiss/faiss/IndexPreTransform.cpp +120 -150
data/vendor/faiss/faiss/IndexPreTransform.h +33 -36
data/vendor/faiss/faiss/IndexRefine.cpp +139 -127
data/vendor/faiss/faiss/IndexRefine.h +32 -23
data/vendor/faiss/faiss/IndexReplicas.cpp +147 -153
data/vendor/faiss/faiss/IndexReplicas.h +62 -56
data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +111 -172
data/vendor/faiss/faiss/IndexScalarQuantizer.h +41 -59
data/vendor/faiss/faiss/IndexShards.cpp +256 -240
data/vendor/faiss/faiss/IndexShards.h +85 -73
data/vendor/faiss/faiss/MatrixStats.cpp +112 -97
data/vendor/faiss/faiss/MatrixStats.h +7 -10
data/vendor/faiss/faiss/MetaIndexes.cpp +135 -157
data/vendor/faiss/faiss/MetaIndexes.h +40 -34
data/vendor/faiss/faiss/MetricType.h +7 -7
data/vendor/faiss/faiss/VectorTransform.cpp +654 -475
data/vendor/faiss/faiss/VectorTransform.h +64 -89
data/vendor/faiss/faiss/clone_index.cpp +78 -73
data/vendor/faiss/faiss/clone_index.h +4 -9
data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +33 -38
data/vendor/faiss/faiss/gpu/GpuAutoTune.h +11 -9
data/vendor/faiss/faiss/gpu/GpuCloner.cpp +198 -171
data/vendor/faiss/faiss/gpu/GpuCloner.h +53 -35
data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +12 -14
data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +27 -25
data/vendor/faiss/faiss/gpu/GpuDistance.h +116 -112
data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -2
data/vendor/faiss/faiss/gpu/GpuIcmEncoder.h +60 -0
data/vendor/faiss/faiss/gpu/GpuIndex.h +134 -137
data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +76 -73
data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +173 -162
data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +67 -64
data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +89 -86
data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +150 -141
data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +101 -103
data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +17 -16
data/vendor/faiss/faiss/gpu/GpuResources.cpp +116 -128
data/vendor/faiss/faiss/gpu/GpuResources.h +182 -186
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +433 -422
data/vendor/faiss/faiss/gpu/StandardGpuResources.h +131 -130
data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +468 -456
data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +25 -19
data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +22 -20
data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +9 -8
data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +39 -44
data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +16 -14
data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +77 -71
data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +109 -88
data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +75 -64
data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +230 -215
data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +80 -86
data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +284 -277
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +416 -416
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +611 -517
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +166 -164
data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +61 -53
data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +274 -238
data/vendor/faiss/faiss/gpu/test/TestUtils.h +73 -57
data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +47 -50
data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +79 -72
data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +140 -146
data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +69 -71
data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +21 -16
data/vendor/faiss/faiss/gpu/utils/Timer.cpp +25 -29
data/vendor/faiss/faiss/gpu/utils/Timer.h +30 -29
data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +503 -0
data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +175 -0
data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +90 -120
data/vendor/faiss/faiss/impl/AuxIndexStructures.h +81 -65
data/vendor/faiss/faiss/impl/FaissAssert.h +73 -58
data/vendor/faiss/faiss/impl/FaissException.cpp +56 -48
data/vendor/faiss/faiss/impl/FaissException.h +41 -29
data/vendor/faiss/faiss/impl/HNSW.cpp +606 -617
data/vendor/faiss/faiss/impl/HNSW.h +179 -200
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +855 -0
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +244 -0
data/vendor/faiss/faiss/impl/NNDescent.cpp +487 -0
data/vendor/faiss/faiss/impl/NNDescent.h +154 -0
data/vendor/faiss/faiss/impl/NSG.cpp +679 -0
data/vendor/faiss/faiss/impl/NSG.h +199 -0
data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +484 -454
data/vendor/faiss/faiss/impl/PolysemousTraining.h +52 -55
data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +26 -47
data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +469 -459
data/vendor/faiss/faiss/impl/ProductQuantizer.h +76 -87
data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +758 -0
data/vendor/faiss/faiss/impl/ResidualQuantizer.h +188 -0
data/vendor/faiss/faiss/impl/ResultHandler.h +96 -132
data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +647 -707
data/vendor/faiss/faiss/impl/ScalarQuantizer.h +48 -46
data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +129 -131
data/vendor/faiss/faiss/impl/ThreadedIndex.h +61 -55
data/vendor/faiss/faiss/impl/index_read.cpp +631 -480
data/vendor/faiss/faiss/impl/index_write.cpp +547 -407
data/vendor/faiss/faiss/impl/io.cpp +76 -95
data/vendor/faiss/faiss/impl/io.h +31 -41
data/vendor/faiss/faiss/impl/io_macros.h +60 -29
data/vendor/faiss/faiss/impl/kmeans1d.cpp +301 -0
data/vendor/faiss/faiss/impl/kmeans1d.h +48 -0
data/vendor/faiss/faiss/impl/lattice_Zn.cpp +137 -186
data/vendor/faiss/faiss/impl/lattice_Zn.h +40 -51
data/vendor/faiss/faiss/impl/platform_macros.h +29 -8
data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +77 -124
data/vendor/faiss/faiss/impl/pq4_fast_scan.h +39 -48
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +41 -52
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +80 -117
data/vendor/faiss/faiss/impl/simd_result_handlers.h +109 -137
data/vendor/faiss/faiss/index_factory.cpp +619 -397
data/vendor/faiss/faiss/index_factory.h +8 -6
data/vendor/faiss/faiss/index_io.h +23 -26
data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +67 -75
data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +22 -24
data/vendor/faiss/faiss/invlists/DirectMap.cpp +96 -112
data/vendor/faiss/faiss/invlists/DirectMap.h +29 -33
data/vendor/faiss/faiss/invlists/InvertedLists.cpp +307 -364
data/vendor/faiss/faiss/invlists/InvertedLists.h +151 -151
data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +29 -34
data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +17 -18
data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +257 -293
data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +50 -45
data/vendor/faiss/faiss/python/python_callbacks.cpp +23 -26
data/vendor/faiss/faiss/python/python_callbacks.h +9 -16
data/vendor/faiss/faiss/utils/AlignedTable.h +79 -44
data/vendor/faiss/faiss/utils/Heap.cpp +40 -48
data/vendor/faiss/faiss/utils/Heap.h +186 -209
data/vendor/faiss/faiss/utils/WorkerThread.cpp +67 -76
data/vendor/faiss/faiss/utils/WorkerThread.h +32 -33
data/vendor/faiss/faiss/utils/distances.cpp +305 -312
data/vendor/faiss/faiss/utils/distances.h +170 -122
data/vendor/faiss/faiss/utils/distances_simd.cpp +498 -508
data/vendor/faiss/faiss/utils/extra_distances-inl.h +117 -0
data/vendor/faiss/faiss/utils/extra_distances.cpp +113 -232
data/vendor/faiss/faiss/utils/extra_distances.h +30 -29
data/vendor/faiss/faiss/utils/hamming-inl.h +260 -209
data/vendor/faiss/faiss/utils/hamming.cpp +375 -469
data/vendor/faiss/faiss/utils/hamming.h +62 -85
data/vendor/faiss/faiss/utils/ordered_key_value.h +16 -18
data/vendor/faiss/faiss/utils/partitioning.cpp +393 -318
data/vendor/faiss/faiss/utils/partitioning.h +26 -21
data/vendor/faiss/faiss/utils/quantize_lut.cpp +78 -66
data/vendor/faiss/faiss/utils/quantize_lut.h +22 -20
data/vendor/faiss/faiss/utils/random.cpp +39 -63
data/vendor/faiss/faiss/utils/random.h +13 -16
data/vendor/faiss/faiss/utils/simdlib.h +4 -2
data/vendor/faiss/faiss/utils/simdlib_avx2.h +88 -85
data/vendor/faiss/faiss/utils/simdlib_emulated.h +226 -165
data/vendor/faiss/faiss/utils/simdlib_neon.h +832 -0
data/vendor/faiss/faiss/utils/utils.cpp +304 -287
data/vendor/faiss/faiss/utils/utils.h +54 -49
metadata +29 -4

data/vendor/faiss/faiss/IndexShards.h CHANGED

@@ -18,82 +18,94 @@ namespace faiss {
  */
 template <typename IndexT>
 struct IndexShardsTemplate : public ThreadedIndex<IndexT> {
-  using idx_t = typename IndexT::idx_t;
-  using component_t = typename IndexT::component_t;
-  using distance_t = typename IndexT::distance_t;
-  /**
-   * The dimension that all sub-indices must share will be the dimension of the
-   * first sub-index added
-   *
-   * @param threaded     do we use one thread per sub_index or do
-   *                     queries sequentially?
-   * @param successive_ids should we shift the returned ids by
-   *                     the size of each sub-index or return them
-   *                     as they are?
-   */
-  explicit IndexShardsTemplate(bool threaded = false,
-                               bool successive_ids = true);
-  /**
-   * @param threaded     do we use one thread per sub_index or do
-   *                     queries sequentially?
-   * @param successive_ids should we shift the returned ids by
-   *                     the size of each sub-index or return them
-   *                     as they are?
-   */
-  explicit IndexShardsTemplate(idx_t d,
-                               bool threaded = false,
-                               bool successive_ids = true);
-  /// int version due to the implicit bool conversion ambiguity of int as
-  /// dimension
-  explicit IndexShardsTemplate(int d,
-                               bool threaded = false,
-                               bool successive_ids = true);
-  /// Alias for addIndex()
-  void add_shard(IndexT* index) { this->addIndex(index); }
-  /// Alias for removeIndex()
-  void remove_shard(IndexT* index) { this->removeIndex(index); }
-  /// supported only for sub-indices that implement add_with_ids
-  void add(idx_t n, const component_t* x) override;
-  /**
-   * Cases (successive_ids, xids):
-   * - true, non-NULL       ERROR: it makes no sense to pass in ids and
-   *                        request them to be shifted
-   * - true, NULL           OK, but should be called only once (calls add()
-   *                        on sub-indexes).
-   * - false, non-NULL      OK: will call add_with_ids with passed in xids
-   *                        distributed evenly over shards
-   * - false, NULL          OK: will call add_with_ids on each sub-index,
-   *                        starting at ntotal
-   */
-  void add_with_ids(idx_t n, const component_t* x, const idx_t* xids) override;
-  void search(idx_t n, const component_t* x, idx_t k,
-              distance_t* distances, idx_t* labels) const override;
-  void train(idx_t n, const component_t* x) override;
-  bool successive_ids;
-  /// Synchronize the top-level index (IndexShards) with data in the sub-indices
-  void syncWithSubIndexes();
- protected:
-  /// Called just after an index is added
-  void onAfterAddIndex(IndexT* index) override;
-  /// Called just after an index is removed
-  void onAfterRemoveIndex(IndexT* index) override;
+    using idx_t = typename IndexT::idx_t;
+    using component_t = typename IndexT::component_t;
+    using distance_t = typename IndexT::distance_t;
+    /**
+     * The dimension that all sub-indices must share will be the dimension of
+     * the first sub-index added
+     *
+     * @param threaded     do we use one thread per sub_index or do
+     *                     queries sequentially?
+     * @param successive_ids should we shift the returned ids by
+     *                     the size of each sub-index or return them
+     *                     as they are?
+     */
+    explicit IndexShardsTemplate(
+            bool threaded = false,
+            bool successive_ids = true);
+    /**
+     * @param threaded     do we use one thread per sub_index or do
+     *                     queries sequentially?
+     * @param successive_ids should we shift the returned ids by
+     *                     the size of each sub-index or return them
+     *                     as they are?
+     */
+    explicit IndexShardsTemplate(
+            idx_t d,
+            bool threaded = false,
+            bool successive_ids = true);
+    /// int version due to the implicit bool conversion ambiguity of int as
+    /// dimension
+    explicit IndexShardsTemplate(
+            int d,
+            bool threaded = false,
+            bool successive_ids = true);
+    /// Alias for addIndex()
+    void add_shard(IndexT* index) {
+        this->addIndex(index);
+    }
+    /// Alias for removeIndex()
+    void remove_shard(IndexT* index) {
+        this->removeIndex(index);
+    }
+    /// supported only for sub-indices that implement add_with_ids
+    void add(idx_t n, const component_t* x) override;
+    /**
+     * Cases (successive_ids, xids):
+     * - true, non-NULL       ERROR: it makes no sense to pass in ids and
+     *                        request them to be shifted
+     * - true, NULL           OK, but should be called only once (calls add()
+     *                        on sub-indexes).
+     * - false, non-NULL      OK: will call add_with_ids with passed in xids
+     *                        distributed evenly over shards
+     * - false, NULL          OK: will call add_with_ids on each sub-index,
+     *                        starting at ntotal
+     */
+    void add_with_ids(idx_t n, const component_t* x, const idx_t* xids)
+            override;
+    void search(
+            idx_t n,
+            const component_t* x,
+            idx_t k,
+            distance_t* distances,
+            idx_t* labels) const override;
+    void train(idx_t n, const component_t* x) override;
+    bool successive_ids;
+    /// Synchronize the top-level index (IndexShards) with data in the
+    /// sub-indices
+    void syncWithSubIndexes();
+   protected:
+    /// Called just after an index is added
+    void onAfterAddIndex(IndexT* index) override;
+    /// Called just after an index is removed
+    void onAfterRemoveIndex(IndexT* index) override;
 };
 using IndexShards = IndexShardsTemplate<Index>;
 using IndexBinaryShards = IndexShardsTemplate<IndexBinary>;
 } // namespace faiss

data/vendor/faiss/faiss/MatrixStats.cpp CHANGED

@@ -7,15 +7,13 @@
 // -*- c++ -*-
 #include <faiss/MatrixStats.h>
+#include <stdarg.h> /* va_list, va_start, va_arg, va_end */
-#include <stdarg.h>     /* va_list, va_start, va_arg, va_end */
+#include <faiss/utils/utils.h>
 #include <cmath>
 #include <cstdio>
-#include <faiss/utils/utils.h>
 namespace faiss {
@@ -23,16 +21,19 @@ namespace faiss {
  * MatrixStats
  *********************************************************************/
-MatrixStats::PerDimStats::PerDimStats():
-    n(0), n_nan(0), n_inf(0), n0(0),
-    min(HUGE_VALF), max(-HUGE_VALF),
-    sum(0), sum2(0),
-    mean(NAN), stddev(NAN)
-{}
-void MatrixStats::PerDimStats::add (float x)
-{
+MatrixStats::PerDimStats::PerDimStats()
+        : n(0),
+          n_nan(0),
+          n_inf(0),
+          n0(0),
+          min(HUGE_VALF),
+          max(-HUGE_VALF),
+          sum(0),
+          sum2(0),
+          mean(NAN),
+          stddev(NAN) {}
+void MatrixStats::PerDimStats::add(float x) {
     n++;
     if (std::isnan(x)) {
         n_nan++;
@@ -42,25 +43,26 @@ void MatrixStats::PerDimStats::add (float x)
         n_inf++;
         return;
     }
-    if (x == 0) n0++;
-    if (x < min) min = x;
-    if (x > max) max = x;
+    if (x == 0)
+        n0++;
+    if (x < min)
+        min = x;
+    if (x > max)
+        max = x;
     sum += x;
     sum2 += (double)x * (double)x;
 }
-void MatrixStats::PerDimStats::compute_mean_std ()
-{
+void MatrixStats::PerDimStats::compute_mean_std() {
     n_valid = n - n_nan - n_inf;
     mean = sum / n_valid;
     double var = sum2 / n_valid - mean * mean;
-    if (var < 0) var = 0;
+    if (var < 0)
+        var = 0;
     stddev = sqrt(var);
 }
-void MatrixStats::do_comment (const char *fmt, ...)
-{
+void MatrixStats::do_comment(const char* fmt, ...) {
     va_list ap;
     /* Determine required size */
@@ -72,57 +74,60 @@ void MatrixStats::do_comment (const char *fmt, ...)
     buf += size;
 }
-MatrixStats::MatrixStats (size_t n, size_t d, const float *x):
-    n(n), d(d),
-    n_collision(0), n_valid(0), n0(0),
-    min_norm2(HUGE_VAL), max_norm2(0)
-{
-    std::vector<char> comment_buf (10000);
-    buf = comment_buf.data ();
+MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
+        : n(n),
+          d(d),
+          n_collision(0),
+          n_valid(0),
+          n0(0),
+          min_norm2(HUGE_VAL),
+          max_norm2(0) {
+    std::vector<char> comment_buf(10000);
+    buf = comment_buf.data();
     nbuf = comment_buf.size();
-    do_comment ("analyzing %ld vectors of size %ld\n", n, d);
+    do_comment("analyzing %ld vectors of size %ld\n", n, d);
     if (d > 1024) {
-        do_comment (
-           "indexing this many dimensions is hard, "
-           "please consider dimensionality reducution (with PCAMatrix)\n");
+        do_comment(
+                "indexing this many dimensions is hard, "
+                "please consider dimensionality reducution (with PCAMatrix)\n");
     }
-    size_t nbytes = sizeof (x[0]) * d;
-    per_dim_stats.resize (d);
+    size_t nbytes = sizeof(x[0]) * d;
+    per_dim_stats.resize(d);
     for (size_t i = 0; i < n; i++) {
-        const float *xi = x + d * i;
+        const float* xi = x + d * i;
         double sum2 = 0;
         for (size_t j = 0; j < d; j++) {
-            per_dim_stats[j].add (xi[j]);
+            per_dim_stats[j].add(xi[j]);
             sum2 += xi[j] * (double)xi[j];
         }
-        if (std::isfinite (sum2)) {
+        if (std::isfinite(sum2)) {
             n_valid++;
             if (sum2 == 0) {
-                n0 ++;
+                n0++;
             } else {
-                if (sum2 < min_norm2) min_norm2 = sum2;
-                if (sum2 > max_norm2) max_norm2 = sum2;
+                if (sum2 < min_norm2)
+                    min_norm2 = sum2;
+                if (sum2 > max_norm2)
+                    max_norm2 = sum2;
             }
         }
         { // check hash
             uint64_t hash = hash_bytes((const uint8_t*)xi, nbytes);
-            auto elt = occurrences.find (hash);
+            auto elt = occurrences.find(hash);
             if (elt == occurrences.end()) {
                 Occurrence occ = {i, 1};
                 occurrences[hash] = occ;
             } else {
-                if (!memcmp (xi, x + elt->second.first * d, nbytes)) {
-                    elt->second.count ++;
+                if (!memcmp(xi, x + elt->second.first * d, nbytes)) {
+                    elt->second.count++;
                 } else {
-                    n_collision ++;
+                    n_collision++;
                     // we should use a list of collisions but overkill
                 }
             }
@@ -131,50 +136,59 @@ MatrixStats::MatrixStats (size_t n, size_t d, const float *x):
     // invalid vecor stats
     if (n_valid == n) {
-        do_comment ("no NaN or Infs in data\n");
+        do_comment("no NaN or Infs in data\n");
     } else {
-        do_comment ("%ld vectors contain NaN or Inf "
-                 "(or have too large components), "
-                 "expect bad results with indexing!\n", n - n_valid);
+        do_comment(
+                "%ld vectors contain NaN or Inf "
+                "(or have too large components), "
+                "expect bad results with indexing!\n",
+                n - n_valid);
     }
     // copies in dataset
     if (occurrences.size() == n) {
-        do_comment ("all vectors are distinct\n");
+        do_comment("all vectors are distinct\n");
     } else {
-        do_comment ("%ld vectors are distinct (%.2f%%)\n",
-                 occurrences.size(),
-                 occurrences.size() * 100.0 / n);
+        do_comment(
+                "%ld vectors are distinct (%.2f%%)\n",
+                occurrences.size(),
+                occurrences.size() * 100.0 / n);
         if (n_collision > 0) {
-            do_comment ("%ld collisions in hash table, "
-                     "counts may be invalid\n", n_collision);
+            do_comment(
+                    "%ld collisions in hash table, "
+                    "counts may be invalid\n",
+                    n_collision);
         }
         Occurrence max = {0, 0};
-        for (auto it = occurrences.begin();
-             it != occurrences.end(); ++it) {
+        for (auto it = occurrences.begin(); it != occurrences.end(); ++it) {
             if (it->second.count > max.count) {
                 max = it->second;
             }
         }
-        do_comment ("vector %ld has %ld copies\n", max.first, max.count);
+        do_comment("vector %ld has %ld copies\n", max.first, max.count);
     }
     { // norm stats
-        min_norm2 = sqrt (min_norm2);
-        max_norm2 = sqrt (max_norm2);
-        do_comment ("range of L2 norms=[%g, %g] (%ld null vectors)\n",
-                 min_norm2, max_norm2, n0);
+        min_norm2 = sqrt(min_norm2);
+        max_norm2 = sqrt(max_norm2);
+        do_comment(
+                "range of L2 norms=[%g, %g] (%ld null vectors)\n",
+                min_norm2,
+                max_norm2,
+                n0);
         if (max_norm2 < min_norm2 * 1.0001) {
-            do_comment ("vectors are normalized, inner product and "
-                     "L2  search are equivalent\n");
+            do_comment(
+                    "vectors are normalized, inner product and "
+                    "L2  search are equivalent\n");
         }
         if (max_norm2 > min_norm2 * 100) {
-            do_comment ("vectors have very large differences in norms, "
-                     "is this normal?\n");
+            do_comment(
+                    "vectors have very large differences in norms, "
+                    "is this normal?\n");
         }
     }
@@ -185,68 +199,69 @@ MatrixStats::MatrixStats (size_t n, size_t d, const float *x):
         size_t n_dangerous_range = 0, n_0_range = 0, n0 = 0;
         for (size_t j = 0; j < d; j++) {
-            PerDimStats &st = per_dim_stats[j];
-            st.compute_mean_std ();
+            PerDimStats& st = per_dim_stats[j];
+            st.compute_mean_std();
             n0 += st.n0;
             if (st.max == st.min) {
-                n_0_range ++;
+                n_0_range++;
             } else if (st.max < 1.001 * st.min) {
-                n_dangerous_range ++;
+                n_dangerous_range++;
             }
-            if (st.stddev > max_std) max_std = st.stddev;
-            if (st.stddev < min_std) min_std = st.stddev;
+            if (st.stddev > max_std)
+                max_std = st.stddev;
+            if (st.stddev < min_std)
+                min_std = st.stddev;
         }
         if (n0 == 0) {
-            do_comment ("matrix contains no 0s\n");
+            do_comment("matrix contains no 0s\n");
         } else {
-            do_comment ("matrix contains %.2f %% 0 entries\n",
-                     n0 * 100.0 / (n * d));
+            do_comment(
+                    "matrix contains %.2f %% 0 entries\n",
+                    n0 * 100.0 / (n * d));
         }
         if (n_0_range == 0) {
-            do_comment ("no constant dimensions\n");
+            do_comment("no constant dimensions\n");
         } else {
-            do_comment ("%ld dimensions are constant: they can be removed\n",
-                     n_0_range);
+            do_comment(
+                    "%ld dimensions are constant: they can be removed\n",
+                    n_0_range);
         }
         if (n_dangerous_range == 0) {
-            do_comment ("no dimension has a too large mean\n");
+            do_comment("no dimension has a too large mean\n");
         } else {
-            do_comment ("%ld dimensions are too large "
-                     "wrt. their variance, may loose precision "
-                     "in IndexFlatL2 (use CenteringTransform)\n",
-                     n_dangerous_range);
+            do_comment(
+                    "%ld dimensions are too large "
+                    "wrt. their variance, may loose precision "
+                    "in IndexFlatL2 (use CenteringTransform)\n",
+                    n_dangerous_range);
         }
-        do_comment ("stddevs per dimension are in [%g %g]\n", min_std, max_std);
+        do_comment("stddevs per dimension are in [%g %g]\n", min_std, max_std);
         size_t n_small_var = 0;
         for (size_t j = 0; j < d; j++) {
-            const PerDimStats &st = per_dim_stats[j];
+            const PerDimStats& st = per_dim_stats[j];
             if (st.stddev < max_std * 1e-4) {
                 n_small_var++;
             }
         }
         if (n_small_var > 0) {
-            do_comment ("%ld dimensions have negligible stddev wrt. "
-                     "the largest dimension, they could be ignored",
-                     n_small_var);
+            do_comment(
+                    "%ld dimensions have negligible stddev wrt. "
+                    "the largest dimension, they could be ignored",
+                    n_small_var);
         }
     }
-    comments = comment_buf.data ();
+    comments = comment_buf.data();
     buf = nullptr;
     nbuf = 0;
 }
 } // namespace faiss

data/vendor/faiss/faiss/MatrixStats.h CHANGED

@@ -9,22 +9,20 @@
 #pragma once
-#include <vector>
+#include <stdint.h>
 #include <string>
 #include <unordered_map>
-#include <stdint.h>
+#include <vector>
 namespace faiss {
 /** Reports some statistics on a dataset and comments on them.
  *
  * It is a class rather than a function so that all stats can also be
  * accessed from code */
 struct MatrixStats {
-    MatrixStats (size_t n, size_t d, const float *x);
+    MatrixStats(size_t n, size_t d, const float* x);
     std::string comments;
     // raw statistics
@@ -42,8 +40,8 @@ struct MatrixStats {
         double mean, stddev;
         PerDimStats();
-        void add (float x);
-        void compute_mean_std ();
+        void add(float x);
+        void compute_mean_std();
     };
     std::vector<PerDimStats> per_dim_stats;
@@ -53,10 +51,9 @@ struct MatrixStats {
     };
     std::unordered_map<uint64_t, Occurrence> occurrences;
-    char *buf;
+    char* buf;
     size_t nbuf;
-    void do_comment (const char *fmt, ...);
+    void do_comment(const char* fmt, ...);
 };
 } // namespace faiss