RubyGems - umappp - Versions diffs - 0.1.6 → 0.2.1 - Mend

umappp 0.1.6 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33)

checksums.yaml +4 -4
data/README.md +22 -16
data/ext/umappp/numo.hpp +957 -833
data/ext/umappp/umappp.cpp +39 -45
data/lib/umappp/version.rb +1 -1
data/lib/umappp.rb +5 -4
data/vendor/aarand/aarand.hpp +141 -28
data/vendor/annoy/annoylib.h +1 -1
data/vendor/hnswlib/bruteforce.h +142 -127
data/vendor/hnswlib/hnswalg.h +1018 -939
data/vendor/hnswlib/hnswlib.h +149 -58
data/vendor/hnswlib/space_ip.h +322 -229
data/vendor/hnswlib/space_l2.h +283 -240
data/vendor/hnswlib/visited_list_pool.h +54 -55
data/vendor/irlba/irlba.hpp +12 -27
data/vendor/irlba/lanczos.hpp +30 -31
data/vendor/irlba/parallel.hpp +37 -38
data/vendor/irlba/utils.hpp +12 -23
data/vendor/irlba/wrappers.hpp +239 -70
data/vendor/kmeans/Details.hpp +1 -1
data/vendor/kmeans/HartiganWong.hpp +28 -2
data/vendor/kmeans/InitializeKmeansPP.hpp +29 -1
data/vendor/kmeans/Kmeans.hpp +25 -2
data/vendor/kmeans/Lloyd.hpp +29 -2
data/vendor/kmeans/MiniBatch.hpp +48 -8
data/vendor/knncolle/Annoy/Annoy.hpp +3 -0
data/vendor/knncolle/Hnsw/Hnsw.hpp +3 -0
data/vendor/knncolle/Kmknn/Kmknn.hpp +11 -1
data/vendor/knncolle/utils/find_nearest_neighbors.hpp +8 -6
data/vendor/umappp/Umap.hpp +85 -43
data/vendor/umappp/optimize_layout.hpp +410 -133
data/vendor/umappp/spectral_init.hpp +4 -1
metadata +7 -10

data/ext/umappp/umappp.cpp CHANGED Viewed

@@ -9,10 +9,6 @@
 typedef float Float;
 typedef typename umappp::Umap<Float> Umap;
-#ifdef _OPENMP
-#include <omp.h>
-#endif
 using namespace Rice;
 // This function is used to view default parameters from Ruby.
@@ -28,13 +24,14 @@ Hash umappp_default_parameters(Object self)
   d[Symbol("a")] = Umap::Defaults::a;
   d[Symbol("b")] = Umap::Defaults::b;
   d[Symbol("repulsion_strength")] = Umap::Defaults::repulsion_strength;
+  d[Symbol("initialize")] = Umap::Defaults::initialize;
   d[Symbol("num_epochs")] = Umap::Defaults::num_epochs;
   d[Symbol("learning_rate")] = Umap::Defaults::learning_rate;
   d[Symbol("negative_sample_rate")] = Umap::Defaults::negative_sample_rate;
   d[Symbol("num_neighbors")] = Umap::Defaults::num_neighbors;
   d[Symbol("seed")] = Umap::Defaults::seed;
-  d[Symbol("batch")] = Umap::Defaults::batch;
   d[Symbol("num_threads")] = Umap::Defaults::num_threads;
+  d[Symbol("parallel_optimization")] = Umap::Defaults::parallel_optimization;
   return d;
 }
@@ -46,13 +43,16 @@ Object umappp_run(
     Hash params,
     numo::SFloat data,
     int ndim,
-    int nn_method,
-    int tick = 0)
+    int nn_method)
 {
   // Parameters are taken from a Ruby Hash object.
   // If a key is present in the Hash, its value overrides the default.
+  if (ndim < 1)
+  {
+    throw std::runtime_error("ndim is less than 1");
+  }
-  auto umap_ptr = new Umap;
+  std::unique_ptr<Umap> umap_ptr(new Umap);
   double local_connectivity = Umap::Defaults::local_connectivity;
   if (RTEST(params.call("has_key?", Symbol("local_connectivity"))))
@@ -110,6 +110,13 @@ Object umappp_run(
     umap_ptr->set_repulsion_strength(repulsion_strength);
   }
+  umappp::InitMethod initialize = Umap::Defaults::initialize;
+  if (RTEST(params.call("has_key?", Symbol("initialize"))))
+  {
+    initialize = params.get<umappp::InitMethod>(Symbol("initialize"));
+    umap_ptr->set_initialize(initialize);
+  }
   int num_epochs = Umap::Defaults::num_epochs;
   if (RTEST(params.call("has_key?", Symbol("num_epochs"))))
   {
@@ -145,13 +152,6 @@ Object umappp_run(
     umap_ptr->set_seed(seed);
   }
-  bool batch = Umap::Defaults::batch;
-  if (RTEST(params.call("has_key?", Symbol("batch"))))
-  {
-    batch = params.get<bool>(Symbol("batch"));
-    umap_ptr->set_batch(batch);
-  }
   int num_threads = Umap::Defaults::num_threads;
   if (RTEST(params.call("has_key?", Symbol("num_threads"))))
   {
@@ -159,6 +159,13 @@ Object umappp_run(
     umap_ptr->set_num_threads(num_threads);
   }
+  bool parallel_optimization = Umap::Defaults::parallel_optimization;
+  if (RTEST(params.call("has_key?", Symbol("parallel_optimization"))))
+  {
+    parallel_optimization = params.get<bool>(Symbol("parallel_optimization"));
+    umap_ptr->set_parallel_optimization(parallel_optimization);
+  }
   // initialize_from_matrix
   const float *y = data.read_ptr();
@@ -166,6 +173,10 @@ Object umappp_run(
   int nd = shape[1];
   int nobs = shape[0];
+  if (nobs < 0)
+  {
+    throw std::runtime_error("nobs is negative");
+  }
   std::unique_ptr<knncolle::Base<int, Float>> knncolle_ptr;
   if (nn_method == 0)
@@ -180,39 +191,16 @@ Object umappp_run(
   std::vector<Float> embedding(ndim * nobs);
   auto status = umap_ptr->initialize(knncolle_ptr.get(), ndim, embedding.data());
-  if (nobs < 0 || ndim < 0)
-  {
-    throw std::runtime_error("nobs or ndim is negative");
-  }
-  if (tick == 0)
-  {
-    status.run(ndim, embedding.data(), 0);
-    // it is safe to cast to unsigned int
-    auto na = numo::SFloat({(unsigned int)nobs, (unsigned int)ndim});
-    std::copy(embedding.begin(), embedding.end(), na.write_ptr());
-    return na;
-  }
-  else
-  {
-    VALUE ret = rb_ary_new();
-    while (status.epoch() < status.num_epochs())
-    {
-      int epoch_limit = status.epoch() + tick;
+  int epoch_limit = 0;
+  // tick is not implemented yet
+  status.run(epoch_limit);
-      status.run(ndim, embedding.data(), epoch_limit);
+  // it is safe to cast to unsigned int
+  auto na = numo::SFloat({(unsigned int)nobs, (unsigned int)ndim});
+  std::copy(embedding.begin(), embedding.end(), na.write_ptr());
-      //it is safe to cast to unsigned int
-      auto na = numo::SFloat({(unsigned int)nobs, (unsigned int)ndim});
-      std::copy(embedding.begin(), embedding.end(), na.write_ptr());
-      rb_ary_push(ret, na.value());
-    }
-    return ret;
-  }
+  return na;
 }
 extern "C" void Init_umappp()
@@ -221,4 +209,10 @@ extern "C" void Init_umappp()
       define_module("Umappp")
           .define_singleton_method("umappp_run", &umappp_run)
           .define_singleton_method("umappp_default_parameters", &umappp_default_parameters);
+  Enum<umappp::InitMethod> init_method =
+      define_enum<umappp::InitMethod>("InitMethod")
+          .define_value("SPECTRAL", umappp::InitMethod::SPECTRAL)
+          .define_value("SPECTRAL_ONLY", umappp::InitMethod::SPECTRAL_ONLY)
+          .define_value("RANDOM", umappp::InitMethod::RANDOM)
+          .define_value("NONE", umappp::InitMethod::NONE);
 }

data/lib/umappp/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Umappp
-  VERSION = "0.1.6"
+  VERSION = "0.2.1"
 end

data/lib/umappp.rb CHANGED Viewed

@@ -12,7 +12,7 @@ module Umappp
   # View the default parameters defined within the Umappp C++ library structure.
   def self.default_parameters
-    # {method: :annoy, ndim: 2, tick: 0}.merge
+    # {method: :annoy, ndim: 2}.merge
     umappp_default_parameters
   end
@@ -30,16 +30,17 @@ module Umappp
   # @param a [Numeric]
   # @param b [Numeric]
   # @param repulsion_strength [Numeric]
+  # @param initialize [Umappp::InitMethod]
   # @param num_epochs [Integer]
   # @param learning_rate [Numeric]
   # @param negative_sample_rate [Numeric]
   # @param num_neighbors [Integer]
   # @param seed [Integer]
-  # @param batch [Boolean]
   # @param num_threads [Integer]
+  # @param parallel_optimization [Boolean]
   # @return [Numo::SFloat] the final embedding
-  def self.run(embedding, method: :annoy, ndim: 2, tick: 0, **params)
+  def self.run(embedding, method: :annoy, ndim: 2, **params)
     unless (u = (params.keys - default_parameters.keys)).empty?
       raise ArgumentError, "[umappp.rb] unknown option : #{u.inspect}"
     end
@@ -50,6 +51,6 @@ module Umappp
     embedding2 = Numo::SFloat.cast(embedding)
     raise ArgumentError, "embedding must be a 2D array" if embedding2.ndim <= 1
-    umappp_run(params, embedding2, ndim, nnmethod, tick)
+    umappp_run(params, embedding2, ndim, nnmethod)
   end
 end

data/vendor/aarand/aarand.hpp CHANGED Viewed

@@ -5,10 +5,17 @@
 #include <limits>
 #include <stdexcept>
+/**
+ * @file aarand.hpp
+ *
+ * @brief Collection of random distribution functions.
+ */
 namespace aarand {
 /**
- * @tparam T Floating point type.
+ * @tparam T Floating point type to return.
+ * This is also used for intermediate calculations, so it is usually safest to provide a type that is at least as precise as a `double`.
  * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
  * where the `result_type` is an unsigned integer value.
  *
@@ -18,22 +25,34 @@ namespace aarand {
  */
 template<typename T = double, class Engine>
 T standard_uniform(Engine& eng) {
-    static_assert(!std::numeric_limits<typename Engine::result_type>::is_signed);
-    static_assert(std::numeric_limits<typename Engine::result_type>::is_integer);
+    typedef typename Engine::result_type R;
+    static_assert(std::numeric_limits<R>::is_integer, "RNG engine must yield integer results");
+    // Can't be bothered to figure out whether the range fits into 'R' for signed values.
+    // So instead, we just require unsigned integers, where the range will always fit.
+    static_assert(!std::numeric_limits<R>::is_signed, "RNG engine must yield unsigned integers");
+    // Make sure we get the right type to avoid inadvertent promotions.
+    constexpr T ONE_ = 1;
     // Stolen from Boost, see https://www.boost.org/doc/libs/1_67_0/boost/random/uniform_01.hpp
     // The +1 probably doesn't matter for 64-bit generators, but is helpful for engines with
     // fewer output bits, to reduce the (small) probability of sampling 1's.
-    constexpr double factor = 1.0 / (static_cast<T>(Engine::max() - Engine::min()) + 1.0);
-    double result;
+    constexpr T factor = ONE_ / (static_cast<T>(Engine::max() - Engine::min()) + ONE_);
+    // Note that it still might be possible to get a result = 1, depending on
+    // the numerical precision used to compute the product; hence the loop.
+    T result;
     do {
         result = static_cast<T>(eng() - Engine::min()) * factor;
-    } while (result == 1.0);
+    } while (result == ONE_);
     return result;
 }
 /**
- * @tparam T Floating point type.
+ * @tparam T Floating point type to return.
+ * This is also used for intermediate calculations, so it is usually safest to provide a type that is at least as precise as a `double`.
  * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
  * where the `result_type` is an unsigned integer value.
  *
@@ -43,16 +62,18 @@ T standard_uniform(Engine& eng) {
  */
 template<typename T = double, class Engine>
 std::pair<T, T> standard_normal(Engine& eng) {
-    constexpr double pi = 3.14159265358979323846;
+    constexpr T PI_ = 3.14159265358979323846;
+    constexpr T TWO_ = 2;
     // Box-Muller gives us two random values at a time.
-    double constant = std::sqrt(-2 * std::log(standard_uniform<T>(eng)));
-    double angle = 2 * pi * standard_uniform<T>(eng);
+    T constant = std::sqrt(-TWO_ * std::log(standard_uniform<T>(eng)));
+    T angle = TWO_ * PI_ * standard_uniform<T>(eng);
     return std::make_pair(constant * std::sin(angle), constant * std::cos(angle));
 }
 /**
- * @tparam T Floating point type.
+ * @tparam T Floating point type to return.
+ * This is also used for intermediate calculations, so it is usually safest to provide a type that is at least as precise as a `double`.
  * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
  * where the `result_type` is an unsigned integer value.
  *
@@ -62,7 +83,11 @@ std::pair<T, T> standard_normal(Engine& eng) {
  */
 template<typename T = double, class Engine>
 T standard_exponential(Engine& eng) {
-    return -std::log(standard_uniform(eng));
+    T val;
+    do {
+        val = standard_uniform<T>(eng);
+    } while (val == static_cast<T>(0));
+    return -std::log(val);
 }
 /**
@@ -79,7 +104,7 @@ template<typename T = int, class Engine>
 T discrete_uniform(Engine& eng, T bound) {
     typedef typename Engine::result_type R;
     static_assert(std::numeric_limits<R>::is_integer);
-    static_assert(!std::numeric_limits<R>::is_signed);
+    static_assert(!std::numeric_limits<R>::is_signed); // don't want to figure out how to store the range.
     constexpr R range = Engine::max() - Engine::min();
     if (bound > range) {
@@ -91,22 +116,110 @@ T discrete_uniform(Engine& eng, T bound) {
         throw std::runtime_error("'bound' should be a positive integer");
     }
-    // The limit is necessary to provide uniformity in the presence of the
-    // modulus. The idea is to re-sample if we get a draw above the limit.
-    // Technically this can have problems as bound approaches range, in which
-    // case we might end up discarding a lot of the sample space... but this
-    // is unlikely to happen in practice, so whatever. Note that the +1 is
-    // necessary because range is inclusive but bound is not.
-    const R limit = range - (range % bound + 1);
-    // In addition, we don't have to deal with the crap about combining draws
-    // to get enough entropy, which is 90% of the Boost implementation.
-    T draw;
-    do {
-        draw = (eng() - Engine::min()) % bound;
-    } while (draw > limit);
+    R draw = eng() - Engine::min();
+    // Conservative shortcut to avoid an extra modulo operation in computing
+    // 'limit' if 'draw' is below 'limit'. This is based on the observation
+    // that 'range - bound <= limit', so any condition that triggers the loop
+    // will also pass this check. Allows early return when 'range >> bound'.
+    if (draw > range - bound) {
-    return draw;
+        // The limit is necessary to provide uniformity in the presence of the
+        // modulus. The idea is to re-sample if we get a draw above the limit.
+        // Technically this can have problems as bound approaches range, in which
+        // case we might end up discarding a lot of the sample space... but this
+        // is unlikely to happen in practice, and even if it does, it's a rejection
+        // rate that's guaranteed to be less than 50%, so whatever.
+        //
+        // Note that the +1 is necessary because range is inclusive but bound is not.
+        const R limit = range - ((range % bound) + 1);
+        // In addition, we don't have to deal with the crap about combining draws
+        // to get enough entropy, which is 90% of the Boost implementation.
+        while (draw > limit) {
+            draw = eng() - Engine::min();
+        }
+    }
+    return draw % bound;
+}
+/**
+ * @tparam In Random-access iterator or pointer.
+ * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
+ * where the `result_type` is an unsigned integer value.
+ *
+ * @param values Iterator or pointer to an array of values to shuffle.
+ * @param n Number of values in the array pointed to by `values`.
+ * @param eng Instance of an RNG class like `std::mt19937_64`.
+ *
+ * @return Contents of `values` are randomly permuted in place using the Fisher-Yates algorithm.
+ */
+template<class In, class Engine>
+void shuffle(In values, size_t n, Engine& eng) {
+    if (n) {
+        using std::swap;
+        for (size_t i = 0; i < n - 1; ++i) {
+            auto chosen = discrete_uniform(eng, n - i);
+            swap(*(values + i), *(values + i + chosen));
+        }
+    }
+    return;
+}
+/**
+ * @tparam In Random-access iterator or pointer for the inputs.
+ * @tparam Out Random-access iterator or pointer for the outputs.
+ * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
+ * where the `result_type` is an unsigned integer value.
+ *
+ * @param values Iterator or pointer to an array of values to sample from.
+ * @param n Number of values in the array pointed to by `values`.
+ * @param s Number of values to sample.
+ * @param output Iterator or pointer to an array of length `s`, to store the sampled values.
+ * @param eng Instance of an RNG class like `std::mt19937_64`.
+ *
+ * @return `output` is filled with `s` sampled values from `values`.
+ *
+ * If `s > n`, `values` is copied into the first `n` elements of `output` and the remaining values of `output` are undefined.
+ */
+template<class In, class Out, class Engine>
+void sample(In values, size_t n, size_t s, Out output, Engine& eng) {
+    for (size_t i = 0; i < n && s; ++i, ++values) {
+        const double threshold = static_cast<double>(s)/(n - i);
+        if (threshold >= 1 || standard_uniform(eng) <= threshold) {
+            *output = *values;
+            ++output;
+            --s;
+        }
+    }
+}
+/**
+ * @tparam Out Random-access iterator or pointer for the outputs.
+ * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
+ * where the `result_type` is an unsigned integer value.
+ *
+ * @param bound Upper bound of the indices to sample from.
+ * @param s Number of values to sample.
+ * @param output Iterator or pointer to an array of length `s`, to store the sampled values.
+ * @param eng Instance of an RNG class like `std::mt19937_64`.
+ *
+ * @return `output` is filled with `s` sampled values from the sequence of integers in `{0, 1, ..., bound - 1}`.
+ *
+ * If `s > bound`, the first `bound` elements of `output` will contain the sequence of integers from `0` to `bound - 1`.
+ * The remaining values of `output` are undefined.
+ */
+template<class Out, class Engine>
+void sample(size_t bound, size_t s, Out output, Engine& eng) {
+    for (size_t i = 0; i < bound && s; ++i) {
+        const double threshold = static_cast<double>(s)/(bound - i);
+        if (threshold >= 1 || standard_uniform(eng) <= threshold) {
+            *output = i;
+            ++output;
+            --s;
+        }
+    }
 }
 }

data/vendor/annoy/annoylib.h CHANGED Viewed

@@ -128,7 +128,7 @@ inline void set_error_from_errno(char **error, const char* msg) {
   annoylib_showUpdate("%s: %s (%d)\n", msg, strerror(errno), errno);
   if (error) {
     *error = (char *)malloc(256);  // TODO: win doesn't support snprintf
-    sprintf(*error, "%s: %s (%d)", msg, strerror(errno), errno);
+    snprintf(*error, 255, "%s: %s (%d)", msg, strerror(errno), errno);
   }
 }