umappp 0.1.6 → 0.2.1
This diff shows the changes between the two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/README.md +22 -16
- data/ext/umappp/numo.hpp +957 -833
- data/ext/umappp/umappp.cpp +39 -45
- data/lib/umappp/version.rb +1 -1
- data/lib/umappp.rb +5 -4
- data/vendor/aarand/aarand.hpp +141 -28
- data/vendor/annoy/annoylib.h +1 -1
- data/vendor/hnswlib/bruteforce.h +142 -127
- data/vendor/hnswlib/hnswalg.h +1018 -939
- data/vendor/hnswlib/hnswlib.h +149 -58
- data/vendor/hnswlib/space_ip.h +322 -229
- data/vendor/hnswlib/space_l2.h +283 -240
- data/vendor/hnswlib/visited_list_pool.h +54 -55
- data/vendor/irlba/irlba.hpp +12 -27
- data/vendor/irlba/lanczos.hpp +30 -31
- data/vendor/irlba/parallel.hpp +37 -38
- data/vendor/irlba/utils.hpp +12 -23
- data/vendor/irlba/wrappers.hpp +239 -70
- data/vendor/kmeans/Details.hpp +1 -1
- data/vendor/kmeans/HartiganWong.hpp +28 -2
- data/vendor/kmeans/InitializeKmeansPP.hpp +29 -1
- data/vendor/kmeans/Kmeans.hpp +25 -2
- data/vendor/kmeans/Lloyd.hpp +29 -2
- data/vendor/kmeans/MiniBatch.hpp +48 -8
- data/vendor/knncolle/Annoy/Annoy.hpp +3 -0
- data/vendor/knncolle/Hnsw/Hnsw.hpp +3 -0
- data/vendor/knncolle/Kmknn/Kmknn.hpp +11 -1
- data/vendor/knncolle/utils/find_nearest_neighbors.hpp +8 -6
- data/vendor/umappp/Umap.hpp +85 -43
- data/vendor/umappp/optimize_layout.hpp +410 -133
- data/vendor/umappp/spectral_init.hpp +4 -1
- metadata +7 -10
data/vendor/kmeans/MiniBatch.hpp
CHANGED
@@ -53,29 +53,34 @@ public:
      */
     struct Defaults {
         /**
-         * See `
+         * See `set_max_iterations()` for more details.
         */
        static constexpr int max_iterations = 100;
 
        /**
-         * See `
+         * See `set_batch_size()` for more details.
         */
        static constexpr INDEX_t batch_size = 500;
 
        /**
-         * See `
+         * See `set_max_change_proportion()` for more details.
         */
        static constexpr double max_change_proportion = 0.01;
 
        /**
-         * See `
+         * See `set_convergence_history()` for more details.
         */
        static constexpr int convergence_history = 10;
 
        /**
-         * See `
+         * See `set_seed()` for more details.
         */
        static constexpr uint64_t seed = 1234567890;
+
+        /**
+         * See `set_num_threads()` for more details.
+         */
+        static constexpr int num_threads = 1;
    };
 
 private:
@@ -88,6 +93,8 @@ private:
     double max_change = Defaults::max_change_proportion;
 
     uint64_t seed = Defaults::seed;
+
+    int nthreads = Defaults::num_threads;
 public:
     /**
      * @param i Maximum number of iterations.
@@ -143,6 +150,16 @@ public:
         return *this;
     }
 
+    /**
+     * @param n Number of threads to use.
+     *
+     * @return A reference to this `MiniBatch` object.
+     */
+    MiniBatch& set_num_threads(int n = Defaults::num_threads) {
+        nthreads = n;
+        return *this;
+    }
+
 public:
     /**
      * @param ndim Number of dimensions.
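The new `set_num_threads()` slots into the existing builder-style setters. Below is a minimal configuration sketch; the template arguments for `kmeans::MiniBatch` are assumptions for illustration, not taken from the diff.

```cpp
#include "kmeans/MiniBatch.hpp"

void configure_minibatch() {
    // DATA_t, CLUSTER_t, INDEX_t are guesses at the template parameters.
    kmeans::MiniBatch<double, int, int> runner;
    runner.set_max_iterations(200)   // Defaults::max_iterations is 100
          .set_batch_size(1000)      // Defaults::batch_size is 500
          .set_seed(42)              // Defaults::seed
          .set_num_threads(4);       // new in this release; drives the assignment loops below
}
```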
@@ -183,10 +200,22 @@ public:
        }
 
        QuickSearch<DATA_t, CLUSTER_t> index(ndim, ncenters, centers);
-
-
+        size_t nchosen = chosen.size();
+
+#ifndef KMEANS_CUSTOM_PARALLEL
+        #pragma omp parallel for num_threads(nthreads)
+        for (size_t i = 0; i < nchosen; ++i) {
+#else
+        KMEANS_CUSTOM_PARALLEL(nchosen, [&](size_t first, size_t last) -> void {
+        for (size_t i = first; i < last; ++i) {
+#endif
            clusters[chosen[i]] = index.find(data + chosen[i] * ndim);
+#ifndef KMEANS_CUSTOM_PARALLEL
+        }
+#else
        }
+        }, nthreads);
+#endif
 
        // Updating the means for each cluster.
        for (auto o : chosen) {
@@ -236,10 +265,21 @@ public:
 
        // Run through all observations to make sure they have the latest cluster assignments.
        QuickSearch<DATA_t, CLUSTER_t> index(ndim, ncenters, centers);
-
+
+#ifndef KMEANS_CUSTOM_PARALLEL
+        #pragma omp parallel for num_threads(nthreads)
        for (INDEX_t o = 0; o < nobs; ++o) {
+#else
+        KMEANS_CUSTOM_PARALLEL(nobs, [&](INDEX_t first, INDEX_t last) -> void {
+        for (INDEX_t o = first; o < last; ++o) {
+#endif
            clusters[o] = index.find(data + o * ndim);
+#ifndef KMEANS_CUSTOM_PARALLEL
        }
+#else
+        }
+        }, nthreads);
+#endif
 
        std::fill(total_sampled.begin(), total_sampled.end(), 0);
        for (INDEX_t o = 0; o < nobs; ++o) {
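Both assignment loops follow the same pattern: an OpenMP `parallel for` when `KMEANS_CUSTOM_PARALLEL` is undefined, otherwise a call to that macro with the job count, a functor over a `[first, last)` range, and the thread count. A sketch of a compatible scheduler built on `std::thread` follows; the helper name `run_in_chunks` is ours, not part of the vendored code.

```cpp
#include <algorithm>
#include <cstddef>
#include <thread>
#include <vector>

// Splits [0, njobs) into contiguous chunks and runs each chunk on its own thread,
// matching the (njobs, body, nthreads) calling convention used above.
template<class Body>
void run_in_chunks(size_t njobs, Body body, int nthreads) {
    size_t per_thread = (njobs + nthreads - 1) / nthreads; // ceiling division
    std::vector<std::thread> workers;
    for (size_t first = 0; first < njobs; first += per_thread) {
        size_t last = std::min(njobs, first + per_thread);
        workers.emplace_back([&body, first, last]() { body(first, last); });
    }
    for (auto& w : workers) {
        w.join();
    }
}

// A build could then pass -DKMEANS_CUSTOM_PARALLEL=run_in_chunks to bypass OpenMP.
```

Kmknn.hpp below forwards `KNNCOLLE_CUSTOM_PARALLEL` to `KMEANS_CUSTOM_PARALLEL`, so a build that already defines the knncolle hook reuses the same scheduler here.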
data/vendor/knncolle/Annoy/Annoy.hpp
CHANGED
@@ -25,6 +25,9 @@ namespace knncolle {
  * For a given query point, each tree is searched to identify the subset of all points in the same leaf node as the query point.
  * The union of these subsets across all trees is exhaustively searched to identify the actual nearest neighbors to the query.
  *
+ * Note that, to improve reproducibility across architectures, we have disabled manual vectorization of the distance calculations by default.
+ * This can be restored by defining the `KNNCOLLE_MANUAL_VECTORIZATION` macro.
+ *
  * @see
  * Bernhardsson E (2018).
  * Annoy.
data/vendor/knncolle/Hnsw/Hnsw.hpp
CHANGED
@@ -25,6 +25,9 @@ namespace knncolle {
  * The HNSW algorithm extends this idea by using a hierarchy of such graphs containing links of different lengths,
  * which avoids wasting time on small steps in the early stages of the search where the current node position is far from the query.
  *
+ * Note that, to improve reproducibility across architectures, we have disabled manual vectorization of the distance calculations by default.
+ * This can be restored by defining the `KNNCOLLE_MANUAL_VECTORIZATION` macro.
+ *
  * @see
  * Malkov YA, Yashunin DA (2016).
  * Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs.
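Both wrappers document the same escape hatch: define `KNNCOLLE_MANUAL_VECTORIZATION` to restore the hand-written SIMD distance kernels. A sketch of how a consumer might opt back in; the include paths follow the vendored layout and are otherwise an assumption.

```cpp
// Accept architecture-dependent results in exchange for the vectorized kernels.
#define KNNCOLLE_MANUAL_VECTORIZATION
#include "knncolle/Annoy/Annoy.hpp"
#include "knncolle/Hnsw/Hnsw.hpp"
```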
data/vendor/knncolle/Kmknn/Kmknn.hpp
CHANGED
@@ -16,6 +16,12 @@
 #include <iostream>
 #endif
 
+#ifndef KMEANS_CUSTOM_PARALLEL
+#ifdef KNNCOLLE_CUSTOM_PARALLEL
+#define KMEANS_CUSTOM_PARALLEL KNNCOLLE_CUSTOM_PARALLEL
+#endif
+#endif
+
 /**
  * @file Kmknn.hpp
  *
@@ -75,11 +81,12 @@ public:
     * i.e., contiguous elements belong to the same observation.
     * @param power Power of `nobs` to define the number of cluster centers.
     * By default, a square root is performed.
+     * @param nthreads Number of threads to use for the k-means clustering.
     *
     * @tparam INPUT_t Floating-point type of the input data.
     */
    template<typename INPUT_t>
-    Kmknn(INDEX_t ndim, INDEX_t nobs, const INPUT_t* vals, double power = 0.5) :
+    Kmknn(INDEX_t ndim, INDEX_t nobs, const INPUT_t* vals, double power = 0.5, int nthreads = 1) :
        num_dim(ndim),
        num_obs(nobs),
        data(ndim * nobs),
@@ -103,6 +110,9 @@ public:
            std::copy(vals, vals + data.size(), data.data());
            host = data.data();
        }
+
+        kmeans::Kmeans<INTERNAL_t, int> krunner;
+        krunner.set_num_threads(nthreads);
        auto output = kmeans::Kmeans<INTERNAL_t, int>().run(ndim, nobs, host, ncenters, centers.data(), clusters.data());
        std::swap(sizes, output.sizes);
 
data/vendor/knncolle/utils/find_nearest_neighbors.hpp
CHANGED
@@ -36,17 +36,18 @@ using NeighborList = std::vector<std::vector<std::pair<INDEX_t, DISTANCE_t> > >;
  *
  * @param ptr Pointer to a `Base` index.
  * @param k Number of nearest neighbors.
+ * @param nthreads Number of threads to use.
  *
  * @return A `NeighborList` of length equal to the number of observations in `ptr->nobs()`.
  * Each entry contains the `k` nearest neighbors for each observation, sorted by increasing distance.
  */
 template<typename INDEX_t = int, typename DISTANCE_t = double, typename InputINDEX_t, typename InputDISTANCE_t, typename InputQUERY_t>
-NeighborList<INDEX_t, DISTANCE_t> find_nearest_neighbors(const Base<InputINDEX_t, InputDISTANCE_t, InputQUERY_t>* ptr, int k) {
+NeighborList<INDEX_t, DISTANCE_t> find_nearest_neighbors(const Base<InputINDEX_t, InputDISTANCE_t, InputQUERY_t>* ptr, int k, int nthreads) {
    auto n = ptr->nobs();
    NeighborList<INDEX_t, DISTANCE_t> output(n);
 
#ifndef KNNCOLLE_CUSTOM_PARALLEL
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(nthreads)
    for (size_t i = 0; i < n; ++i) {
#else
    KNNCOLLE_CUSTOM_PARALLEL(n, [&](size_t first, size_t last) -> void {
@@ -62,7 +63,7 @@ NeighborList<INDEX_t, DISTANCE_t> find_nearest_neighbors(const Base<InputINDEX_t
        }
    }
#ifdef KNNCOLLE_CUSTOM_PARALLEL
-    });
+    }, nthreads);
#endif
 
    return output;
@@ -79,17 +80,18 @@ NeighborList<INDEX_t, DISTANCE_t> find_nearest_neighbors(const Base<InputINDEX_t
  *
  * @param ptr Pointer to a `Base` index.
  * @param k Number of nearest neighbors.
+ * @param nthreads Number of threads to use.
  *
  * @return A vector of vectors of length equal to the number of observations in `ptr->nobs()`.
  * Each vector contains the indices of the `k` nearest neighbors for each observation, sorted by increasing distance.
  */
 template<typename INDEX_t = int, typename InputINDEX_t, typename InputDISTANCE_t, typename InputQUERY_t>
-std::vector<std::vector<INDEX_t> > find_nearest_neighbors_index_only(const Base<InputINDEX_t, InputDISTANCE_t, InputQUERY_t>* ptr, int k) {
+std::vector<std::vector<INDEX_t> > find_nearest_neighbors_index_only(const Base<InputINDEX_t, InputDISTANCE_t, InputQUERY_t>* ptr, int k, int nthreads) {
    auto n = ptr->nobs();
    std::vector<std::vector<INDEX_t> > output(n);
 
#ifndef KNNCOLLE_CUSTOM_PARALLEL
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(nthreads)
    for (size_t i = 0; i < n; ++i) {
#else
    KNNCOLLE_CUSTOM_PARALLEL(n, [&](size_t first, size_t last) -> void {
@@ -101,7 +103,7 @@ std::vector<std::vector<INDEX_t> > find_nearest_neighbors_index_only(const Base<
        }
    }
#ifdef KNNCOLLE_CUSTOM_PARALLEL
-    });
+    }, nthreads);
#endif
 
    return output;
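Callers now pass the thread count explicitly. A sketch of a wrapper that queries any `knncolle::Base` index for 15 neighbors on 4 threads; the include path mirrors the vendored layout, and the rest follows the signatures shown above.

```cpp
#include "knncolle/utils/find_nearest_neighbors.hpp"

template<typename I, typename D, typename Q>
knncolle::NeighborList<int, double> neighbors15(const knncolle::Base<I, D, Q>* index) {
    // k = 15 nearest neighbors per observation, computed across 4 threads.
    return knncolle::find_nearest_neighbors<int, double>(index, 15, 4);
}
```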
data/vendor/umappp/Umap.hpp
CHANGED
@@ -157,14 +157,14 @@ public:
        static constexpr uint64_t seed = 1234567890;
 
        /**
-         * See `
+         * See `set_num_threads()`.
         */
-        static constexpr
+        static constexpr int num_threads = 1;
 
        /**
-         * See `
+         * See `set_parallel_optimization()`.
         */
-        static constexpr int
+        static constexpr int parallel_optimization = false;
    };
 
private:
@@ -184,8 +184,8 @@ private:
        Float b = Defaults::b;
        Float repulsion_strength = Defaults::repulsion_strength;
        Float learning_rate = Defaults::learning_rate;
-        bool batch = Defaults::batch;
        int nthreads = Defaults::num_threads;
+        bool parallel_optimization = Defaults::parallel_optimization;
    };
 
    RuntimeParameters rparams;
@@ -359,29 +359,13 @@ public:
        return *this;
    }
 
-    /**
-     * @param b Whether to optimize in batch mode.
-     * Batch mode is required for effective parallelization via OpenMP but may reduce the stability of the gradient descent.
-     *
-     * Batch mode involves computing forces for all observations and applying them simultaneously.
-     * This is in contrast to the default where the location of observation is updated before the forces are computed for the next observation.
-     * As each observation's forces are computed independently, batch mode is more amenable to parallelization;
-     * however, this comes at the cost of stability as the force calculations for later observations are not aware of updates to the positions of earlier observations.
-     *
-     * @return A reference to this `Umap` object.
-     */
-    Umap& set_batch(bool b = Defaults::batch) {
-        rparams.batch = b;
-        return *this;
-    }
-
    /**
     * @param n Number of threads to use.
     *
     * @return A reference to this `Umap` object.
     *
     * This setting affects nearest neighbor detection (if an existing list of neighbors is not supplied in `initialize()` or `run()`) and spectral initialization.
-     * If `
+     * If `set_parallel_optimization()` is true, it will also affect the layout optimization, i.e., the gradient descent iterations.
     *
     * The `UMAPPP_CUSTOM_PARALLEL` macro can be set to a function that specifies a custom parallelization scheme.
     * This function should be a template that accept three arguments:
@@ -404,6 +388,26 @@ public:
        return *this;
    }
 
+    /**
+     * @param p Whether to enable parallel optimization.
+     * If set to `true`, this will use the number of threads specified in `set_num_threads()` for the layout optimization step.
+     *
+     * @return A reference to this `Umap` object.
+     *
+     * By default, this is set to `false` as the increase in the number of threads is usually not cost-effective for layout optimization.
+     * Specifically, while CPU usage scales with the number of threads, the time spent does not decrease by the same factor.
+     * We also expect that the number of available CPUs is at least equal to the requested number of threads, otherwise contention will greatly degrade performance.
+     * Nonetheless, users can enable parallel optimization if cost is no issue - usually a higher number of threads (above 4) is required to see a reduction in time.
+     *
+     * If the `UMAPPP_NO_PARALLEL_OPTIMIZATION` macro is defined, **umappp** will not be compiled with support for parallel optimization.
+     * This may be desirable in environments that have no support for threading or atomics, or to reduce the binary size if parallelization is not of interest.
+     * In such cases, enabling parallel optimization and calling `Status::run()` will raise an error.
+     */
+    Umap& set_parallel_optimization(bool p = Defaults::parallel_optimization) {
+        rparams.parallel_optimization = p;
+        return *this;
+    }
+
public:
    /**
     * @brief Status of the UMAP optimization iterations.
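Putting the two new settings together: the thread count always applies to neighbor search and spectral initialization, and additionally to the gradient descent once parallel optimization is switched on. A minimal sketch, assuming `umappp::Umap` is templated on the floating-point type and using the `run()` overload shown further down:

```cpp
#include <vector>
#include "umappp/Umap.hpp"

std::vector<double> embed_2d(const double* data, int ndim, size_t nobs) {
    std::vector<double> embedding(2 * nobs);   // column-major, 2 rows x nobs columns

    umappp::Umap<double> runner;
    runner.set_num_threads(4)                  // neighbor search + spectral initialization
          .set_parallel_optimization(true);    // opt in: also parallelize the layout optimization

    runner.run(ndim, nobs, data, 2, embedding.data());
    return embedding;
}
```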
@@ -412,15 +416,51 @@ public:
        /**
         * @cond
         */
-        Status(EpochData<Float> e, uint64_t seed, RuntimeParameters p
+        Status(EpochData<Float> e, uint64_t seed, RuntimeParameters p, int n, Float* embed) :
+            epochs(std::move(e)), engine(seed), rparams(std::move(p)), ndim_(n), embedding_(embed) {}
 
        EpochData<Float> epochs;
        std::mt19937_64 engine;
        RuntimeParameters rparams;
+        int ndim_;
+        Float* embedding_;
        /**
         * @endcond
         */
 
+        /**
+         * @return Number of dimensions of the embedding.
+         */
+        int ndim() const {
+            return ndim_;
+        }
+
+        /**
+         * @return Pointer to a two-dimensional column-major array where rows are dimensions (`ndim`) and columns are observations.
+         * This is updated by `initialize()` to store the final embedding.
+         */
+        const Float* embedding() const {
+            return embedding_;
+        }
+
+        /**
+         * @param ptr Pointer to a two-dimensional array as described in `embedding()`.
+         * @param copy Whether the contents of the previous array should be copied into `ptr`.
+         *
+         * By default, the `Status` objects returned by `Umap` methods will operate on embeddings in an array specified at `Status` construction time.
+         * This method will change the embedding array for an existing `Status` object, which can be helpful in some situations,
+         * e.g., to clone a `Status` object and to store its embeddings in a different array than the object.
+         *
+         * Note that the contents of the new array in `ptr` should be the same as the array that it replaces, as `run()` will continue the iteration from the coordinates inside the array.
+         * If a copy was already performed from the old array to the new array, the caller may set `copy = false` to avoid an extra copy.
+         */
+        void set_embedding(Float* ptr, bool copy = true) {
+            if (copy) {
+                std::copy(embedding_, embedding_ + static_cast<size_t>(ndim()) * nobs(), ptr);
+            }
+            embedding_ = ptr;
+        }
+
        /**
         * @return Current epoch.
         */
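The new accessors make the coordinate buffer swappable after construction, which is what enables the cloning workflow described above. A sketch under the assumption of `Umap<double>` (so `Float` is `double`):

```cpp
#include <vector>
#include "umappp/Umap.hpp"

// Clone an in-progress Status and give the clone its own buffer, so the original
// and the copy can be iterated independently from the same intermediate state.
void fork_and_finish(const umappp::Umap<double>::Status& status, std::vector<double>& scratch) {
    auto clone = status;                                              // shares the embedding pointer at first
    scratch.resize(static_cast<size_t>(clone.ndim()) * clone.nobs());
    clone.set_embedding(scratch.data());                              // copy = true: current coordinates copied over
    clone.run();                                                      // iterate the clone to the maximum number of epochs
}
```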
@@ -444,21 +484,22 @@ public:
        }
 
        /**
-         *
-         *
-         * This contains the initial coordinates and is updated to store the final embedding.
+         * The status of the algorithm and the coordinates in `embedding()` are updated to the specified number of epochs.
+         *
         * @param epoch_limit Number of epochs to run to.
         * The actual number of epochs performed is equal to the difference between `epoch_limit` and the current number of epochs in `epoch()`.
-         * `epoch_limit` should be not less than `epoch()` and no greater than the maximum number of epochs specified in `Umap::set_num_epochs()`.
+         * `epoch_limit` should be not less than `epoch()` and be no greater than the maximum number of epochs specified in `Umap::set_num_epochs()`.
         * If zero, defaults to the maximum number of epochs.
-         *
-         * @return The status of the algorithm and the coordinates in `embedding` are updated to the specified number of epochs.
         */
-        void run(int
-            if (
+        void run(int epoch_limit = 0) {
+            if (epoch_limit == 0) {
+                epoch_limit = epochs.total_epochs;
+            }
+
+            if (rparams.nthreads == 1 || !rparams.parallel_optimization) {
                optimize_layout(
-
-
+                    ndim_,
+                    embedding_,
                    epochs,
                    rparams.a,
                    rparams.b,
@@ -468,16 +509,15 @@ public:
                    epoch_limit
                );
            } else {
-
-
-
+                optimize_layout_parallel(
+                    ndim_,
+                    embedding_,
                    epochs,
                    rparams.a,
                    rparams.b,
                    rparams.repulsion_strength,
                    rparams.learning_rate,
-
-                    [](decltype(engine()) s) -> auto { return std::mt19937_64(s); },
+                    engine,
                    epoch_limit,
                    rparams.nthreads
                );
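Because `run()` now takes only the epoch limit and always writes into the stored `embedding()` buffer, the optimization can be driven in stages, e.g. to snapshot intermediate layouts. A sketch; the checkpointing itself is left to the caller.

```cpp
// Advance any umappp Status object in two stages.
template<class Status>
void staged_run(Status& status) {
    status.run(100);   // iterate from the current epoch() up to epoch 100
    // ... inspect or copy status.embedding() here for a checkpoint ...
    status.run(0);     // an epoch_limit of zero means "run to the maximum number of epochs"
}
```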
@@ -524,7 +564,9 @@ public:
        return Status(
            similarities_to_epochs(x, num_epochs_to_do, negative_sample_rate),
            seed,
-            std::move(pcopy)
+            std::move(pcopy),
+            ndim,
+            embedding
        );
    }
 
@@ -587,7 +629,7 @@ public:
     */
    template<typename Input = Float>
    Status initialize(int ndim_in, size_t nobs, const Input* input, int ndim_out, Float* embedding) {
-        knncolle::VpTreeEuclidean
+        knncolle::VpTreeEuclidean<int, Input, Input, Input> searcher(ndim_in, nobs, input);
        return initialize(&searcher, ndim_out, embedding);
    }
#endif
@@ -609,7 +651,7 @@ public:
    template<class Algorithm>
    Status run(const Algorithm* searcher, int ndim, Float* embedding, int epoch_limit = 0) {
        auto status = initialize(searcher, ndim, embedding);
-        status.run(
+        status.run(epoch_limit);
        return status;
    }
 
@@ -627,7 +669,7 @@
     */
    Status run(NeighborList<Float> x, int ndim, Float* embedding, int epoch_limit = 0) const {
        auto status = initialize(std::move(x), ndim, embedding);
-        status.run(
+        status.run(epoch_limit);
        return status;
    }
 
@@ -651,7 +693,7 @@
    template<typename Input = Float>
    Status run(int ndim_in, size_t nobs, const Input* input, int ndim_out, Float* embedding, int epoch_limit = 0) {
        auto status = initialize(ndim_in, nobs, input, ndim_out, embedding);
-        status.run(
+        status.run(epoch_limit);
        return status;
    }
#endif