RubyGems - umappp - Versions diffs - 0.1.5 → 0.2.0 - Mend

umappp 0.1.5 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32)

checksums.yaml +4 -4
data/README.md +11 -4
data/ext/umappp/umappp.cpp +41 -43
data/lib/umappp/version.rb +1 -1
data/lib/umappp.rb +5 -4
data/vendor/aarand/aarand.hpp +141 -28
data/vendor/annoy/annoylib.h +1 -1
data/vendor/hnswlib/bruteforce.h +142 -127
data/vendor/hnswlib/hnswalg.h +1018 -939
data/vendor/hnswlib/hnswlib.h +149 -58
data/vendor/hnswlib/space_ip.h +322 -229
data/vendor/hnswlib/space_l2.h +283 -240
data/vendor/hnswlib/visited_list_pool.h +54 -55
data/vendor/irlba/irlba.hpp +12 -27
data/vendor/irlba/lanczos.hpp +30 -31
data/vendor/irlba/parallel.hpp +37 -38
data/vendor/irlba/utils.hpp +12 -23
data/vendor/irlba/wrappers.hpp +239 -70
data/vendor/kmeans/Details.hpp +1 -1
data/vendor/kmeans/HartiganWong.hpp +28 -2
data/vendor/kmeans/InitializeKmeansPP.hpp +29 -1
data/vendor/kmeans/Kmeans.hpp +25 -2
data/vendor/kmeans/Lloyd.hpp +29 -2
data/vendor/kmeans/MiniBatch.hpp +48 -8
data/vendor/knncolle/Annoy/Annoy.hpp +3 -0
data/vendor/knncolle/Hnsw/Hnsw.hpp +3 -0
data/vendor/knncolle/Kmknn/Kmknn.hpp +11 -1
data/vendor/knncolle/utils/find_nearest_neighbors.hpp +8 -6
data/vendor/umappp/Umap.hpp +85 -43
data/vendor/umappp/optimize_layout.hpp +410 -133
data/vendor/umappp/spectral_init.hpp +4 -1
metadata +6 -6

data/vendor/irlba/irlba.hpp CHANGED Viewed

@@ -16,6 +16,9 @@
  * @brief Implements the main user-visible class for running IRLBA.
  */
+/**
+ * @brief Implements the IRLBA algorithm for approximate SVD.
+ */
 namespace irlba {
 /**
@@ -89,10 +92,9 @@ public:
     }
     /**
-     * Set the maximum number of restart iterations.
-     * In most cases, convergence will occur before reaching this limit.
+     * Set the seed for the creation of random vectors, primarily during initialization of the IRLBA algorithm.
      *
-     * @param m Maximum number of iterations.
+     * @param s Seed value.
      *
      * @return A reference to the `Irlba` instance.
      */
@@ -155,7 +157,7 @@ public:
      * Run IRLBA on an input matrix to perform an approximate SVD, with arbitrary centering and scaling operations.
      *
      * @tparam M Matrix class, typically from the **Eigen** matrix manipulation library.
-     * However, other classes are also supported, see the other `run()` methods for details.
+     * However, other classes are also supported, see `wrappers.hpp` for details.
      * @tparam Engine A (pseudo-)random number generator class, returning a randomly sampled value when called as a functor with no arguments.
      *
      * @param[in] mat Input matrix.
@@ -273,24 +275,7 @@ public:
      * and the second entry indicates the number of restart iterations performed.
      *
      * Custom classes can be used to define modified matrices that cannot be efficiently realized into the standard **Eigen** classes.
-     * We expect:
-     * - A `rows()` method that returns the number of rows.
-     * - A `cols()` method that returns the number of columns.
-     * - One of the following for matrix-vector multiplication:
-     *   - `multiply(rhs, out)`, which should compute the product of the matrix with `rhs`, a `Eigen::VectorXd`-equivalent of length equal to the number of columns;
-     *     and stores the result in `out`, an `Eigen::VectorXd` of length equal to the number of rows.
-     *   - A `*` method where the right-hand side is an `Eigen::VectorXd` (or equivalent expression) of length equal to the number of columns,
-     *     and returns an `Eigen::VectorXd`-equivalent of length equal to the number of rows.
-     * - One of the following for matrix transpose-vector multiplication:
-     *   - `adjoint_multiply(rhs, out)`, which should compute the product of the matrix transpose with `rhs`, a `Eigen::VectorXd`-equivalent of length equal to the number of rows;
-     *     and stores the result in `out`, an `Eigen::VectorXd` of length equal to the number of columns.
-     *   - An `adjoint()` method that returns an instance of any class that has a `*` method for matrix-vector multiplication.
-     *     The method should accept an `Eigen::VectorXd`-equivalent of length equal to the number of rows,
-     *     and return an `Eigen::VectorXd`-equivalent of length equal to the number of columns.
-     * - A `realize()` method that returns an `Eigen::MatrixXd` object representing the modified matrix.
-     *   This can be omitted if an `Eigen::MatrixXd` can be copy-constructed from the class.
-     *
-     * See the `Centered` and `Scaled` classes for more details.
+     * See the `wrappers.hpp` file for more details, along with the `Centered` and `Scaled` classes.
      *
      * If the smallest dimension of `mat` is below 6, this method falls back to performing an exact SVD.
      */
@@ -485,7 +470,7 @@ private:
 public:
     /**
-     * Result of the IRLBA-based decomposition.
+     * @brief Result of the IRLBA-based decomposition.
      */
     struct Results {
         /**
@@ -509,12 +494,12 @@ public:
         Eigen::VectorXd D;
         /**
-         * Whether the algorithm converged.
+         * The number of restart iterations performed.
          */
         int iterations;
         /**
-         * The number of restart iterations performed.
+         * Whether the algorithm converged.
          */
         bool converged;
     };
@@ -523,7 +508,7 @@ public:
      * Run IRLBA on an input matrix to perform an approximate SVD with centering and scaling.
      *
      * @tparam M Matrix class, most typically from the **Eigen** matrix manipulation library.
-     * However, other classes are also supported, see the other `run()` methods for details.
+     * However, other classes are also supported, see `wrappers.hpp` for details.
      * @tparam Engine A (pseudo-)random number generator class, returning a randomly sampled value when called as a functor with no arguments.
      *
      * @param[in] mat Input matrix.
@@ -549,7 +534,7 @@ public:
      * Run IRLBA on an input matrix to perform an approximate SVD, see the `run()` method for more details.
      *
      * @tparam M Matrix class,  most typically from the **Eigen** matrix manipulation library.
-     * However, other classes are also supported, see the other `run()` methods for details.
+     * However, other classes are also supported, see `wrappers.hpp` for details.
      * @tparam Engine A (pseudo-)random number generator class, returning a randomly sampled value when called as a functor with no arguments.
      *
      * @param[in] mat Input matrix.

data/vendor/irlba/lanczos.hpp CHANGED Viewed

@@ -44,16 +44,22 @@ public:
 public:
     /**
-     * @brief Intermediate data structures to avoid repeated allocations.
+     * @tparam M Some kind of matrix class, either from the **Eigen** library or one of **irlba**'s wrappers.
+     *
+     * @brief Intermediate data structures to avoid repeated allocations on `run()`.
      */
+    template<class M>
     struct Intermediates {
         /**
-         * @tparam M Matrix class, most typically from the **Eigen** library.
-         *
          * @param mat Instance of a matrix class `M`.
          */
-        template<class M>
-        Intermediates(const M& mat) : F(mat.cols()), W_next(mat.rows()), orthog_tmp(mat.cols()) {}
+        Intermediates(const M& mat) :
+            F(mat.cols()),
+            W_next(mat.rows()),
+            orthog_tmp(mat.cols()),
+            work(wrapped_workspace(&mat)),
+            awork(wrapped_adjoint_workspace(&mat))
+        {}
         /**
          * Obtain the residual vector, see algorithm 2.1 of Baglama and Reichel (2005).
@@ -70,13 +76,19 @@ public:
         Eigen::VectorXd F;
         Eigen::VectorXd W_next;
         Eigen::VectorXd orthog_tmp;
+        WrappedWorkspace<M> work;
+        WrappedAdjointWorkspace<M> awork;
         /**
          * @endcond
          */
     };
+    /**
+     * @tparam M Some matrix class, either from the **Eigen** library or one of **irlba**'s wrappers.
+     * @return An `Intermediates` object for subsequent calls to `run()` on `mat`.
+     */
     template<class M>
-    Intermediates initialize(const M& mat) const {
+    Intermediates<M> initialize(const M& mat) const {
         return Intermediates(mat);
     }
@@ -92,20 +104,19 @@ public:
      * @tparam Engine A functor that, when called with no arguments, returns a random integer from a discrete uniform distribution.
      *
      * @param mat Input matrix.
-     * @param W Output matrix with number of rows equal to `mat.rows()`.
+     * @param[in, out] W Output matrix with number of rows equal to `mat.rows()`.
      * The size of the working subspace is defined from the number of columns.
      * The first `start` columns should contain orthonormal column vectors with non-zero L2 norms.
-     * @param V Matrix with number of rows equal to `mat.cols()` and number of columns equal to `W.cols()`.
+     * On output, the rest of `W` is filled with orthonormal vectors.
+     * @param[in, out] V Matrix with number of rows equal to `mat.cols()` and number of columns equal to `W.cols()`.
      * The first `start + 1` columns should contain orthonormal column vectors with non-zero L2 norms.
-     * @param B Square matrix with number of rows and columns equal to the size of the working subspace.
+     * On output, the rest of `V` is filled with orthonormal vectors.
+     * @param[in, out] B Square matrix with number of rows and columns equal to the size of the working subspace.
      * Number of values is defined by `set_number()`.
+     * On output, `B` is filled with upper diagonal entries, starting from the `start`-th row/column.
      * @param eng An instance of a random number `Engine`.
      * @param inter Collection of intermediate data structures generated by calling `initialize()` on `mat`.
      * @param start The dimension from which to start the bidiagonalization.
-     *
-     * @return
-     * `W` is filled with orthonormal vectors, as is `V`.
-     * `B` is filled with upper diagonal entries.
      */
     template<class M, class Engine>
     void run(
@@ -114,7 +125,7 @@ public:
         Eigen::MatrixXd& V,
         Eigen::MatrixXd& B,
         Engine& eng,
-        Intermediates& inter,
+        Intermediates<M>& inter,
         int start = 0)
     const {
         const double eps = (epsilon < 0 ? std::pow(std::numeric_limits<double>::epsilon(), 0.8) : epsilon);
@@ -125,31 +136,23 @@ public:
         auto& otmp = inter.orthog_tmp;
         F = V.col(start);
-        if constexpr(has_multiply_method<M>::value) {
-            W_next.noalias() = mat * F;
-        } else {
-            mat.multiply(F, W_next);
-        }
+        wrapped_multiply(&mat, F, inter.work, W_next); // i.e., W_next = mat * F;
-        // If start = 0, we assume that it's already normalized, see argument description for 'V'.
+        // If start = 0, there's nothing to orthogonalize against.
         if (start) {
             orthogonalize_vector(W, W_next, start, otmp);
         }
         double S = W_next.norm();
         if (S < eps) {
-            throw -4;
+            throw std::runtime_error("starting vector near the null space of the input matrix");
         }
         W_next /= S;
         W.col(start) = W_next;
         // The Lanczos iterations themselves.
         for (int j = start; j < work; ++j) {
-            if constexpr(has_adjoint_multiply_method<M>::value) {
-                F.noalias() = mat.adjoint() * W.col(j);
-            } else {
-                mat.adjoint_multiply(W.col(j), F);
-            }
+            wrapped_adjoint_multiply(&mat, W.col(j), inter.awork, F); // i.e., F = mat.adjoint() * W.col(j);
             F -= S * V.col(j); // equivalent to daxpy.
             orthogonalize_vector(V, F, j + 1, otmp);
@@ -172,11 +175,7 @@ public:
                 B(j, j) = S;
                 B(j, j + 1) = R_F;
-                if constexpr(has_multiply_method<M>::value) {
-                    W_next.noalias() = mat * F;
-                } else {
-                    mat.multiply(F, W_next);
-                }
+                wrapped_multiply(&mat, F, inter.work, W_next); // i.e., W_next = mat * F;
                 // Full re-orthogonalization, using the left-most 'j +  1' columns of W.
                 // Recall that W_next will be the 'j + 2'-th column, i.e., W.col(j + 1) in

data/vendor/irlba/parallel.hpp CHANGED Viewed

@@ -235,20 +235,6 @@ private:
 private:
     template<class Right>
     void indirect_multiply(const Right& rhs, Eigen::VectorXd& output) const {
-        if constexpr(has_data_method<Right>::value) {
-            // If it has a .data() method, the data values are already computed
-            // and sitting in memory, so we just use that directly.
-            indirect_multiply_internal(rhs, output);
-        } else {
-            // Otherwise, it is presumably an expression that involves some work
-            // to get the values. We realize it into a VectorXd to ensure that
-            // it is not repeatedly evaluated on each access to 'rhs'.
-            indirect_multiply_internal(Eigen::VectorXd(rhs), output);
-        }
-    }
-    template<class Right>
-    void indirect_multiply_internal(const Right& rhs, Eigen::VectorXd& output) const {
         output.setZero();
         if (nthreads == 1) {
@@ -270,8 +256,8 @@ private:
         IRLBA_CUSTOM_PARALLEL(nthreads, [&](int t) -> void {
 #endif
-            auto starts = secondary_nonzero_starts[t];
-            auto ends = secondary_nonzero_starts[t + 1];
+            const auto& starts = secondary_nonzero_starts[t];
+            const auto& ends = secondary_nonzero_starts[t + 1];
             for (size_t c = 0; c < primary_dim; ++c) {
                 auto start = starts[c];
                 auto end = ends[c];
@@ -293,20 +279,6 @@ private:
 private:
     template<class Right>
     void direct_multiply(const Right& rhs, Eigen::VectorXd& output) const {
-        if constexpr(has_data_method<Right>::value) {
-            // If it has a .data() method, the data values are already computed
-            // and sitting in memory, so we just use that directly.
-            direct_multiply_internal(rhs, output);
-        } else {
-            // Otherwise, it is presumably an expression that involves some work
-            // to get the values. We realize it into a VectorXd to ensure that
-            // it is not repeatedly evaluated on each access to 'rhs'.
-            direct_multiply_internal(Eigen::VectorXd(rhs), output);
-        }
-    }
-    template<class Right>
-    void direct_multiply_internal(const Right& rhs, Eigen::VectorXd& output) const {
         if (nthreads == 1) {
             for (size_t c = 0; c < primary_dim; ++c) {
                 output.coeffRef(c) = column_dot_product(c, rhs);
@@ -346,18 +318,45 @@ private:
         return dot;
     }
+public:
+    /**
+     * Workspace type for `multiply()`.
+     * Currently this is a placeholder.
+     */
+    typedef bool Workspace;
+    /**
+     * @return Workspace for use in `multiply()`.
+     */
+    bool workspace() const {
+        return false;
+    }
+    /**
+     * Workspace type for `adjoint_multiply()`.
+     * Currently this is a placeholder.
+     */
+    typedef bool AdjointWorkspace;
+    /**
+     * @return Workspace for use in `adjoint_multiply()`.
+     */
+    bool adjoint_workspace() const {
+        return false;
+    }
 public:
     /**
      * @tparam Right An `Eigen::VectorXd` or equivalent expression.
      *
      * @param[in] rhs The right-hand side of the matrix product.
      * This should be a vector or have only one column.
-     * @param[out] out The output vector to store the matrix product.
-     *
-     * @return `out` is filled with the product of this matrix and `rhs`.
+     * @param work The return value of `workspace()`.
+     * @param[out] output The output vector to store the matrix product.
+     * This is filled with the product of this matrix and `rhs`.
      */
     template<class Right>
-    void multiply(const Right& rhs, Eigen::VectorXd& output) const {
+    void multiply(const Right& rhs, Workspace& work, Eigen::VectorXd& output) const {
         if constexpr(column_major) {
             indirect_multiply(rhs, output);
         } else {
@@ -370,12 +369,12 @@ public:
      *
      * @param[in] rhs The right-hand side of the matrix product.
      * This should be a vector or have only one column.
-     * @param[out] out The output vector to store the matrix product.
-     *
-     * @return `out` is filled with the product of the transpose of this matrix and `rhs`.
+     * @param work The return value of `adjoint_workspace()`.
+     * @param[out] output The output vector to store the matrix product.
+     * This is filled with the product of the transpose of this matrix and `rhs`.
      */
     template<class Right>
-    void adjoint_multiply(const Right& rhs, Eigen::VectorXd& output) const {
+    void adjoint_multiply(const Right& rhs, AdjointWorkspace& work, Eigen::VectorXd& output) const {
         if constexpr(column_major) {
             direct_multiply(rhs, output);
         } else {

data/vendor/irlba/utils.hpp CHANGED Viewed

@@ -18,12 +18,11 @@ namespace irlba {
  * Orthogonalize a vector against a set of orthonormal column vectors.
  *
  * @param mat A matrix where the left-most `ncols` columns are orthonormal vectors.
- * @param vec The vector of interest, of length equal to the number of rows in `mat`.
+ * @param[in, out] vec The vector of interest, of length equal to the number of rows in `mat`.
+ * On output, this is modified to contain `vec - mat0 * t(mat0) * vec`, where `mat0` is defined as the first `ncols` columns of `mat`.
+ * This ensures that it is orthogonal to each column of `mat0`.
  * @param tmp A vector of length equal to `mat.cols()`, used to store intermediate matrix products.
  * @param ncols Number of left-most columns of `mat` to use.
- *
- * @return `vec` is modified to contain `vec - mat0 * t(mat0) * vec`, where `mat0` is defined as the first `ncols` columns of `mat`.
- * This ensures that it is orthogonal to each column of `mat0`.
  */
 inline void orthogonalize_vector(const Eigen::MatrixXd& mat, Eigen::VectorXd& vec, size_t ncols, Eigen::VectorXd& tmp) {
     tmp.head(ncols).noalias() = mat.leftCols(ncols).adjoint() * vec;
@@ -34,13 +33,12 @@ inline void orthogonalize_vector(const Eigen::MatrixXd& mat, Eigen::VectorXd& ve
 /**
  * Fill an **Eigen** vector with random normals via **aarand**.
  *
- * @param Vec Any **Eigen** vector class or equivalent proxy object.
- * @param Engine A (pseudo-)random number generator class that returns a random number when called with no arguments.
+ * @tparam Vec Any **Eigen** vector class or equivalent proxy object.
+ * @tparam Engine A (pseudo-)random number generator class that returns a random number when called with no arguments.
  *
- * @param vec Instance of a `Vec` class.
+ * @param[out] vec Instance of a `Vec` class.
+ * This is filled with random draws from a standard normal distribution.
  * @param eng Instance of an `Engine` class.
- *
- * @return `vec` is filled with random draws from a standard normal distribution.
  */
 template<class Vec, class Engine>
 void fill_with_random_normals(Vec& vec, Engine& eng) {
@@ -77,13 +75,13 @@ struct ColumnVectorProxy {
 /**
  * Fill a column of an **Eigen** matrix with random normals via **aarand**.
  *
- * @param Matrix Any **Eigen** matrix class or equivalent proxy object.
- * @param Engine A (pseudo-)random number generator class that returns a random number when called with no arguments.
+ * @tparam Matrix Any **Eigen** matrix class or equivalent proxy object.
+ * @tparam Engine A (pseudo-)random number generator class that returns a random number when called with no arguments.
  *
  * @param mat Instance of a `Matrix` class.
+ * The `column` column of this matrix is filled with random draws from a standard normal distribution.
+ * @param column Column of `mat` to be filled.
  * @param eng Instance of an `Engine` class.
- *
- * @return The `column` column of `mat` is filled with random draws from a standard normal distribution.
  */
 template<class Matrix, class Engine>
 void fill_with_random_normals(Matrix& mat, int column, Engine& eng) {
@@ -145,6 +143,7 @@ public:
      *
      * @param sv Vector of singular values.
      * @param residuals Vector of residuals for each singular value/vector.
+     * @param last Vector of singular values from the previous iteration.
      *
      * @return The number of singular values/vectors that have achieved convergence.
      */
@@ -205,16 +204,6 @@ template<class M>
 struct has_realize_method<M, decltype((void) std::declval<M>().realize(), 0)> {
     static constexpr bool value = std::is_same<decltype(std::declval<M>().realize()), Eigen::MatrixXd>::value;
 };
-template<class M, typename = int>
-struct has_data_method {
-    static constexpr bool value = false;
-};
-template<class M>
-struct has_data_method<M, decltype((void) (std::declval<M>().data()), 0)> {
-    static constexpr bool value = true;
-};
 /**
  * @endcond
  */