RubyGems - umappp - Versions diffs - 0.1.6 → 0.2.0 - Mend

umappp 0.1.6 → 0.2.0

Files changed (32) hide show

checksums.yaml +4 -4
data/README.md +11 -3
data/ext/umappp/umappp.cpp +39 -45
data/lib/umappp/version.rb +1 -1
data/lib/umappp.rb +5 -4
data/vendor/aarand/aarand.hpp +141 -28
data/vendor/annoy/annoylib.h +1 -1
data/vendor/hnswlib/bruteforce.h +142 -127
data/vendor/hnswlib/hnswalg.h +1018 -939
data/vendor/hnswlib/hnswlib.h +149 -58
data/vendor/hnswlib/space_ip.h +322 -229
data/vendor/hnswlib/space_l2.h +283 -240
data/vendor/hnswlib/visited_list_pool.h +54 -55
data/vendor/irlba/irlba.hpp +12 -27
data/vendor/irlba/lanczos.hpp +30 -31
data/vendor/irlba/parallel.hpp +37 -38
data/vendor/irlba/utils.hpp +12 -23
data/vendor/irlba/wrappers.hpp +239 -70
data/vendor/kmeans/Details.hpp +1 -1
data/vendor/kmeans/HartiganWong.hpp +28 -2
data/vendor/kmeans/InitializeKmeansPP.hpp +29 -1
data/vendor/kmeans/Kmeans.hpp +25 -2
data/vendor/kmeans/Lloyd.hpp +29 -2
data/vendor/kmeans/MiniBatch.hpp +48 -8
data/vendor/knncolle/Annoy/Annoy.hpp +3 -0
data/vendor/knncolle/Hnsw/Hnsw.hpp +3 -0
data/vendor/knncolle/Kmknn/Kmknn.hpp +11 -1
data/vendor/knncolle/utils/find_nearest_neighbors.hpp +8 -6
data/vendor/umappp/Umap.hpp +85 -43
data/vendor/umappp/optimize_layout.hpp +410 -133
data/vendor/umappp/spectral_init.hpp +4 -1
metadata +6 -6

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 4268767d4aa68fb7795e72e48a9822b629390091cd5214d460e1d99a3127a3c3
-  data.tar.gz: 684fdcf60aa7dc40a061927103692dda85fc2e40bb1680b0c61f014980197292
+  metadata.gz: 4e329dda5fe3d577f175b6c55059f165c4e9569ed14208785cd0a9184b5d14df
+  data.tar.gz: 3e0a7ed9a3a7a08109019adef9dc5c1f9a88c82e77d28307875a719c4bb0551e
 SHA512:
-  metadata.gz: 7a9d5181ec290f40b5b22079c36d90a2fe07e1072b35b933ee30da253fc0f734431081334b6fc7510317df330e7c321386bc46da742d4515a646437e70602835
-  data.tar.gz: 05d42582e0d559591bd22b767a6d7df7b821288fa8b44da86d8255822b49e629c15dd33e719d383a08d933ff1dfdd3fdf839e3eed7f2131dc0bf868c3ad467b4
+  metadata.gz: 1838cffb49dcac3e8429d7d112bff2dca7d6a72608d1ca19889533c1d394d332da24457cade7179845901712176d3ae9af626ce372e1f6e444ab490203180b65
+  data.tar.gz: bff7628b13e053fe337d9cf3b0ba37e68012440b943147303d0208c16491f82f45c7069e83e48084999f24b0b4880a0c510cac551059ff45d7302f4167428c28

data/README.md CHANGED Viewed

@@ -41,7 +41,6 @@ Available parameters and their default values
 |----------------------|------------------------------------|
 | method               | :annoy (another option is :vptree) |
 | ndim                 | 2                                  |
-| tick                 | 0 (Not yet implemented)            |
 | local_connectivity   | 1.0                                |
 | bandwidth            | 1                                  |
 | mix_ratio            | 1                                  |
@@ -50,12 +49,12 @@ Available parameters and their default values
 | a                    | 0                                  |
 | b                    | 0                                  |
 | repulsion_strength   | 1                                  |
+| initialize           | Umappp::InitMethod::SPECTRAL       |
 | num_epochs           | 500                                |
 | learning_rate        | 1                                  |
 | negative_sample_rate | 5                                  |
 | num_neighbors        | 15                                 |
 | seed                 | 1234567890                         |
-| batch                | false                              |
 | num_threads          | 1 (OpenMP required)                |
 ## Development
@@ -63,10 +62,19 @@ Available parameters and their default values
 ```
 git clone https://github.com/kojix2/ruby-umappp
 cd umap
-b ndle dle exec rake compile
+bundle exec rake compile
 bundle exec rake test
 ```
+Update LTLA/umappp
+Requires cmake to run
+```
+cd script
+./vendor.sh
+```
 ### Ruby dependencies
 * [rice](https://github.com/jasonroelofs/rice) - Ruby Interface for C++ Extensions

data/ext/umappp/umappp.cpp CHANGED Viewed

@@ -9,10 +9,6 @@
 typedef float Float;
 typedef typename umappp::Umap<Float> Umap;
-#ifdef _OPENMP
-#include <omp.h>
-#endif
 using namespace Rice;
 // This function is used to view default parameters from Ruby.
@@ -28,13 +24,14 @@ Hash umappp_default_parameters(Object self)
   d[Symbol("a")] = Umap::Defaults::a;
   d[Symbol("b")] = Umap::Defaults::b;
   d[Symbol("repulsion_strength")] = Umap::Defaults::repulsion_strength;
+  d[Symbol("initialize")] = Umap::Defaults::initialize;
   d[Symbol("num_epochs")] = Umap::Defaults::num_epochs;
   d[Symbol("learning_rate")] = Umap::Defaults::learning_rate;
   d[Symbol("negative_sample_rate")] = Umap::Defaults::negative_sample_rate;
   d[Symbol("num_neighbors")] = Umap::Defaults::num_neighbors;
   d[Symbol("seed")] = Umap::Defaults::seed;
-  d[Symbol("batch")] = Umap::Defaults::batch;
   d[Symbol("num_threads")] = Umap::Defaults::num_threads;
+  d[Symbol("parallel_optimization")] = Umap::Defaults::parallel_optimization;
   return d;
 }
@@ -46,13 +43,16 @@ Object umappp_run(
     Hash params,
     numo::SFloat data,
     int ndim,
-    int nn_method,
-    int tick = 0)
+    int nn_method)
 {
   // Parameters are taken from a Ruby Hash object.
   // If there is key, set the value.
+  if (ndim < 1)
+  {
+    throw std::runtime_error("ndim is less than 1");
+  }
-  auto umap_ptr = new Umap;
+  std::unique_ptr<Umap> umap_ptr(new Umap);
   double local_connectivity = Umap::Defaults::local_connectivity;
   if (RTEST(params.call("has_key?", Symbol("local_connectivity"))))
@@ -110,6 +110,13 @@ Object umappp_run(
     umap_ptr->set_repulsion_strength(repulsion_strength);
   }
+  umappp::InitMethod initialize = Umap::Defaults::initialize;
+  if (RTEST(params.call("has_key?", Symbol("initialize"))))
+  {
+    initialize = params.get<umappp::InitMethod>(Symbol("initialize"));
+    umap_ptr->set_initialize(initialize);
+  }
   int num_epochs = Umap::Defaults::num_epochs;
   if (RTEST(params.call("has_key?", Symbol("num_epochs"))))
   {
@@ -145,13 +152,6 @@ Object umappp_run(
     umap_ptr->set_seed(seed);
   }
-  bool batch = Umap::Defaults::batch;
-  if (RTEST(params.call("has_key?", Symbol("batch"))))
-  {
-    batch = params.get<bool>(Symbol("batch"));
-    umap_ptr->set_batch(batch);
-  }
   int num_threads = Umap::Defaults::num_threads;
   if (RTEST(params.call("has_key?", Symbol("num_threads"))))
   {
@@ -159,6 +159,13 @@ Object umappp_run(
     umap_ptr->set_num_threads(num_threads);
   }
+  bool parallel_optimization = Umap::Defaults::parallel_optimization;
+  if (RTEST(params.call("has_key?", Symbol("parallel_optimization"))))
+  {
+    parallel_optimization = params.get<bool>(Symbol("parallel_optimization"));
+    umap_ptr->set_parallel_optimization(parallel_optimization);
+  }
   // initialize_from_matrix
   const float *y = data.read_ptr();
@@ -166,6 +173,10 @@ Object umappp_run(
   int nd = shape[1];
   int nobs = shape[0];
+  if (nobs < 0)
+  {
+    throw std::runtime_error("nobs is negative");
+  }
   std::unique_ptr<knncolle::Base<int, Float>> knncolle_ptr;
   if (nn_method == 0)
@@ -180,39 +191,16 @@ Object umappp_run(
   std::vector<Float> embedding(ndim * nobs);
   auto status = umap_ptr->initialize(knncolle_ptr.get(), ndim, embedding.data());
-  if (nobs < 0 || ndim < 0)
-  {
-    throw std::runtime_error("nobs or ndim is negative");
-  }
-  if (tick == 0)
-  {
-    status.run(ndim, embedding.data(), 0);
-    // it is safe to cast to unsigned int
-    auto na = numo::SFloat({(unsigned int)nobs, (unsigned int)ndim});
-    std::copy(embedding.begin(), embedding.end(), na.write_ptr());
-    return na;
-  }
-  else
-  {
-    VALUE ret = rb_ary_new();
-    while (status.epoch() < status.num_epochs())
-    {
-      int epoch_limit = status.epoch() + tick;
+  int epoch_limit = 0;
+  // tick is not implemented yet
+  status.run(epoch_limit);
-      status.run(ndim, embedding.data(), epoch_limit);
+  // it is safe to cast to unsigned int
+  auto na = numo::SFloat({(unsigned int)nobs, (unsigned int)ndim});
+  std::copy(embedding.begin(), embedding.end(), na.write_ptr());
-      //it is safe to cast to unsigned int
-      auto na = numo::SFloat({(unsigned int)nobs, (unsigned int)ndim});
-      std::copy(embedding.begin(), embedding.end(), na.write_ptr());
-      rb_ary_push(ret, na.value());
-    }
-    return ret;
-  }
+  return na;
 }
 extern "C" void Init_umappp()
@@ -221,4 +209,10 @@ extern "C" void Init_umappp()
       define_module("Umappp")
           .define_singleton_method("umappp_run", &umappp_run)
           .define_singleton_method("umappp_default_parameters", &umappp_default_parameters);
+  Enum<umappp::InitMethod> init_method =
+      define_enum<umappp::InitMethod>("InitMethod", rb_mUmappp)
+          .define_value("SPECTRAL", umappp::InitMethod::SPECTRAL)
+          .define_value("SPECTRAL_ONLY", umappp::InitMethod::SPECTRAL_ONLY)
+          .define_value("RANDOM", umappp::InitMethod::RANDOM)
+          .define_value("NONE", umappp::InitMethod::NONE);
 }

data/lib/umappp/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Umappp
-  VERSION = "0.1.6"
+  VERSION = "0.2.0"
 end

data/lib/umappp.rb CHANGED Viewed

@@ -12,7 +12,7 @@ module Umappp
   # View the default parameters defined within the Umappp C++ library structure.
   def self.default_parameters
-    # {method: :annoy, ndim: 2, tick: 0}.merge
+    # {method: :annoy, ndim: 2}.merge
     umappp_default_parameters
   end
@@ -30,16 +30,17 @@ module Umappp
   # @param a [Numeric]
   # @param b [Numeric]
   # @param repulsion_strength [Numeric]
+  # @param initialize [Umappp::InitMethod]
   # @param num_epochs [Integer]
   # @param learning_rate [Numeric]
   # @param negative_sample_rate [Numeric]
   # @param num_neighbors [Integer]
   # @param seed [Integer]
-  # @param batch [Boolean]
   # @param num_threads [Integer]
+  # @param parallel_optimization [Boolean]
   # @return [Numo::SFloat] the final embedding
-  def self.run(embedding, method: :annoy, ndim: 2, tick: 0, **params)
+  def self.run(embedding, method: :annoy, ndim: 2, **params)
     unless (u = (params.keys - default_parameters.keys)).empty?
       raise ArgumentError, "[umappp.rb] unknown option : #{u.inspect}"
     end
@@ -50,6 +51,6 @@ module Umappp
     embedding2 = Numo::SFloat.cast(embedding)
     raise ArgumentError, "embedding must be a 2D array" if embedding2.ndim <= 1
-    umappp_run(params, embedding2, ndim, nnmethod, tick)
+    umappp_run(params, embedding2, ndim, nnmethod)
   end
 end

data/vendor/aarand/aarand.hpp CHANGED Viewed

@@ -5,10 +5,17 @@
 #include <limits>
 #include <stdexcept>
+/**
+ * @file aarand.hpp
+ *
+ * @brief Collection of random distribution functions.
+ */
 namespace aarand {
 /**
- * @tparam T Floating point type.
+ * @tparam T Floating point type to return.
+ * This is also used for intermediate calculations, so it is usually safest to provide a type that is at least as precise as a `double`.
  * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
  * where the `result_type` is an unsigned integer value.
  *
@@ -18,22 +25,34 @@ namespace aarand {
  */
 template<typename T = double, class Engine>
 T standard_uniform(Engine& eng) {
-    static_assert(!std::numeric_limits<typename Engine::result_type>::is_signed);
-    static_assert(std::numeric_limits<typename Engine::result_type>::is_integer);
+    typedef typename Engine::result_type R;
+    static_assert(std::numeric_limits<R>::is_integer, "RNG engine must yield integer results");
+    // Can't be bothered to figure out whether the range fits into 'R' for signed values.
+    // So instead, we just require unsigned integers, where the range will always fit.
+    static_assert(!std::numeric_limits<R>::is_signed, "RNG engine must yield unsigned integers");
+    // Make sure we get the right type to avoid inadvertent promotions.
+    constexpr T ONE_ = 1;
     // Stolen from Boost, see https://www.boost.org/doc/libs/1_67_0/boost/random/uniform_01.hpp
     // The +1 probably doesn't matter for 64-bit generators, but is helpful for engines with
     // fewer output bits, to reduce the (small) probability of sampling 1's.
-    constexpr double factor = 1.0 / (static_cast<T>(Engine::max() - Engine::min()) + 1.0);
-    double result;
+    constexpr T factor = ONE_ / (static_cast<T>(Engine::max() - Engine::min()) + ONE_);
+    // Note that it still might be possible to get a result = 1, depending on
+    // the numerical precision used to compute the product; hence the loop.
+    T result;
     do {
         result = static_cast<T>(eng() - Engine::min()) * factor;
-    } while (result == 1.0);
+    } while (result == ONE_);
     return result;
 }
 /**
- * @tparam T Floating point type.
+ * @tparam T Floating point type to return.
+ * This is also used for intermediate calculations, so it is usually safest to provide a type that is at least as precise as a `double`.
  * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
  * where the `result_type` is an unsigned integer value.
  *
@@ -43,16 +62,18 @@ T standard_uniform(Engine& eng) {
  */
 template<typename T = double, class Engine>
 std::pair<T, T> standard_normal(Engine& eng) {
-    constexpr double pi = 3.14159265358979323846;
+    constexpr T PI_ = 3.14159265358979323846;
+    constexpr T TWO_ = 2;
     // Box-Muller gives us two random values at a time.
-    double constant = std::sqrt(-2 * std::log(standard_uniform<T>(eng)));
-    double angle = 2 * pi * standard_uniform<T>(eng);
+    T constant = std::sqrt(-TWO_ * std::log(standard_uniform<T>(eng)));
+    T angle = TWO_ * PI_ * standard_uniform<T>(eng);
     return std::make_pair(constant * std::sin(angle), constant * std::cos(angle));
 }
 /**
- * @tparam T Floating point type.
+ * @tparam T Floating point type to return.
+ * This is also used for intermediate calculations, so it is usually safest to provide a type that is at least as precise as a `double`.
  * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
  * where the `result_type` is an unsigned integer value.
  *
@@ -62,7 +83,11 @@ std::pair<T, T> standard_normal(Engine& eng) {
  */
 template<typename T = double, class Engine>
 T standard_exponential(Engine& eng) {
-    return -std::log(standard_uniform(eng));
+    T val;
+    do {
+        val = standard_uniform<T>(eng);
+    } while (val == static_cast<T>(0));
+    return -std::log(val);
 }
 /**
@@ -79,7 +104,7 @@ template<typename T = int, class Engine>
 T discrete_uniform(Engine& eng, T bound) {
     typedef typename Engine::result_type R;
     static_assert(std::numeric_limits<R>::is_integer);
-    static_assert(!std::numeric_limits<R>::is_signed);
+    static_assert(!std::numeric_limits<R>::is_signed); // don't want to figure out how to store the range.
     constexpr R range = Engine::max() - Engine::min();
     if (bound > range) {
@@ -91,22 +116,110 @@ T discrete_uniform(Engine& eng, T bound) {
         throw std::runtime_error("'bound' should be a positive integer");
     }
-    // The limit is necessary to provide uniformity in the presence of the
-    // modulus. The idea is to re-sample if we get a draw above the limit.
-    // Technically this can have problems as bound approaches range, in which
-    // case we might end up discarding a lot of the sample space... but this
-    // is unlikely to happen in practice, so whatever. Note that the +1 is
-    // necessary because range is inclusive but bound is not.
-    const R limit = range - (range % bound + 1);
-    // In addition, we don't have to deal with the crap about combining draws
-    // to get enough entropy, which is 90% of the Boost implementation.
-    T draw;
-    do {
-        draw = (eng() - Engine::min()) % bound;
-    } while (draw > limit);
+    R draw = eng() - Engine::min();
+    // Conservative shortcut to avoid an extra modulo operation in computing
+    // 'limit' if 'draw' is below 'limit'. This is based on the observation
+    // that 'range - bound <= limit', so any condition that triggers the loop
+    // will also pass this check. Allows early return when 'range >> bound'.
+    if (draw > range - bound) {
-    return draw;
+        // The limit is necessary to provide uniformity in the presence of the
+        // modulus. The idea is to re-sample if we get a draw above the limit.
+        // Technically this can have problems as bound approaches range, in which
+        // case we might end up discarding a lot of the sample space... but this
+        // is unlikely to happen in practice, and even if it does, it's a rejection
+        // rate that's guaranteed to be less than 50%, so whatever.
+        //
+        // Note that the +1 is necessary because range is inclusive but bound is not.
+        const R limit = range - ((range % bound) + 1);
+        // In addition, we don't have to deal with the crap about combining draws
+        // to get enough entropy, which is 90% of the Boost implementation.
+        while (draw > limit) {
+            draw = eng() - Engine::min();
+        }
+    }
+    return draw % bound;
+}
+/**
+ * @tparam In Random-access iterator or pointer.
+ * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
+ * where the `result_type` is an unsigned integer value.
+ *
+ * @param values Iterator or pointer to an array of values to shuffle.
+ * @param n Number of values in the array pointed to by `values`.
+ * @param eng Instance of an RNG class like `std::mt19937_64`.
+ *
+ * @return Contents of `values` are randomly permuted in place using the Fisher-Yates algorithm.
+ */
+template<class In, class Engine>
+void shuffle(In values, size_t n, Engine& eng) {
+    if (n) {
+        using std::swap;
+        for (size_t i = 0; i < n - 1; ++i) {
+            auto chosen = discrete_uniform(eng, n - i);
+            swap(*(values + i), *(values + i + chosen));
+        }
+    }
+    return;
+}
+/**
+ * @tparam In Random-access iterator or pointer for the inputs.
+ * @tparam Out Random-access iterator or pointer for the outputs.
+ * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
+ * where the `result_type` is an unsigned integer value.
+ *
+ * @param values Iterator or pointer to an array of values to sample from.
+ * @param n Number of values in the array pointed to by `values`.
+ * @param s Number of values to sample.
+ * @param output Iterator or pointer to an array of length `s`, to store the sampled values.
+ * @param eng Instance of an RNG class like `std::mt19937_64`.
+ *
+ * @return `output` is filled with `s` sampled values from `values`.
+ *
+ * If `s > n`, `values` is copied into the first `n` elements of `output` and the remaining values of `output` are undefined.
+ */
+template<class In, class Out, class Engine>
+void sample(In values, size_t n, size_t s, Out output, Engine& eng) {
+    for (size_t i = 0; i < n && s; ++i, ++values) {
+        const double threshold = static_cast<double>(s)/(n - i);
+        if (threshold >= 1 || standard_uniform(eng) <= threshold) {
+            *output = *values;
+            ++output;
+            --s;
+        }
+    }
+}
+/**
+ * @tparam Out Random-access iterator or pointer for the outputs.
+ * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
+ * where the `result_type` is an unsigned integer value.
+ *
+ * @param bound Upper bound of the indices to sample from.
+ * @param s Number of values to sample.
+ * @param output Iterator or pointer to an array of length `s`, to store the sampled values.
+ * @param eng Instance of an RNG class like `std::mt19937_64`.
+ *
+ * @return `output` is filled with `s` sampled values from the sequence of integers in `{0, 1, ..., bound - 1}`.
+ *
+ * If `s > bound`, the first `bound` elements of `output` will contain the sequence of integers from `0` to `bound - 1`.
+ * The remaining values of `output` are undefined.
+ */
+template<class Out, class Engine>
+void sample(size_t bound, size_t s, Out output, Engine& eng) {
+    for (size_t i = 0; i < bound && s; ++i) {
+        const double threshold = static_cast<double>(s)/(bound - i);
+        if (threshold >= 1 || standard_uniform(eng) <= threshold) {
+            *output = i;
+            ++output;
+            --s;
+        }
+    }
 }
 }

data/vendor/annoy/annoylib.h CHANGED Viewed

@@ -128,7 +128,7 @@ inline void set_error_from_errno(char **error, const char* msg) {
   annoylib_showUpdate("%s: %s (%d)\n", msg, strerror(errno), errno);
   if (error) {
     *error = (char *)malloc(256);  // TODO: win doesn't support snprintf
-    sprintf(*error, "%s: %s (%d)", msg, strerror(errno), errno);
+    snprintf(*error, 255, "%s: %s (%d)", msg, strerror(errno), errno);
   }
 }