umappp 0.1.5 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cc70f43249cb1993d3fdb810a0cc542e9da383c113f3119414c43723e613ccf0
4
- data.tar.gz: ec95857557261b5d1aa066dae830ffb7648e7758dd50a96c14af5332a2563e9f
3
+ metadata.gz: 4e329dda5fe3d577f175b6c55059f165c4e9569ed14208785cd0a9184b5d14df
4
+ data.tar.gz: 3e0a7ed9a3a7a08109019adef9dc5c1f9a88c82e77d28307875a719c4bb0551e
5
5
  SHA512:
6
- metadata.gz: b0d0be70caca251226572ae920adab0938451383a90444235ffd939ad23e58c73034cdce95fa4639d26da9f5517453f0aa2b78fcb8d254d03685726b55ceb10d
7
- data.tar.gz: 4cebd1fb124c6d61bf083af58283f75a9e6a714c2ad4425a2c72871fa1bee579bb7377859691187ecfb6f845d6e2b8d4bb670e1837813fe7e6dbb33975aa2c70
6
+ metadata.gz: 1838cffb49dcac3e8429d7d112bff2dca7d6a72608d1ca19889533c1d394d332da24457cade7179845901712176d3ae9af626ce372e1f6e444ab490203180b65
7
+ data.tar.gz: bff7628b13e053fe337d9cf3b0ba37e68012440b943147303d0208c16491f82f45c7069e83e48084999f24b0b4880a0c510cac551059ff45d7302f4167428c28
data/README.md CHANGED
@@ -16,7 +16,6 @@
16
16
 
17
17
  ## Installation
18
18
 
19
-
20
19
  ```
21
20
  gem install umappp
22
21
  ```
@@ -42,7 +41,6 @@ Available parameters and their default values
42
41
  |----------------------|------------------------------------|
43
42
  | method | :annoy (another option is :vptree) |
44
43
  | ndim | 2 |
45
- | tick | 0 (Not yet implemented) |
46
44
  | local_connectivity | 1.0 |
47
45
  | bandwidth | 1 |
48
46
  | mix_ratio | 1 |
@@ -51,12 +49,12 @@ Available parameters and their default values
51
49
  | a | 0 |
52
50
  | b | 0 |
53
51
  | repulsion_strength | 1 |
52
+ | initialize | Umappp::InitMethod::SPECTRAL |
54
53
  | num_epochs | 500 |
55
54
  | learning_rate | 1 |
56
55
  | negative_sample_rate | 5 |
57
56
  | num_neighbors | 15 |
58
57
  | seed | 1234567890 |
59
- | batch | false |
60
58
  | num_threads | 1 (OpenMP required) |
61
59
 
62
60
  ## Development
@@ -64,10 +62,19 @@ Available parameters and their default values
64
62
  ```
65
63
  git clone https://github.com/kojix2/ruby-umappp
66
64
  cd ruby-umappp
67
- b ndle dle exec rake compile
65
+ bundle exec rake compile
68
66
  bundle exec rake test
69
67
  ```
70
68
 
69
+ Update LTLA/umappp
70
+
71
+ Requires cmake to run
72
+
73
+ ```
74
+ cd script
75
+ ./vendor.sh
76
+ ```
77
+
71
78
  ### Ruby dependencies
72
79
 
73
80
  * [rice](https://github.com/jasonroelofs/rice) - Ruby Interface for C++ Extensions
@@ -1,7 +1,6 @@
1
1
  // Uniform Manifold Approximation and Projection for Ruby
2
2
  // https://github.com/kojix2/ruby-umappp
3
3
 
4
-
5
4
  #include <rice/rice.hpp>
6
5
  #include <rice/stl.hpp>
7
6
  #include "numo.hpp"
@@ -10,10 +9,6 @@
10
9
  typedef float Float;
11
10
  typedef typename umappp::Umap<Float> Umap;
12
11
 
13
- #ifdef _OPENMP
14
- #include <omp.h>
15
- #endif
16
-
17
12
  using namespace Rice;
18
13
 
19
14
  // This function is used to view default parameters from Ruby.
@@ -29,13 +24,14 @@ Hash umappp_default_parameters(Object self)
29
24
  d[Symbol("a")] = Umap::Defaults::a;
30
25
  d[Symbol("b")] = Umap::Defaults::b;
31
26
  d[Symbol("repulsion_strength")] = Umap::Defaults::repulsion_strength;
27
+ d[Symbol("initialize")] = Umap::Defaults::initialize;
32
28
  d[Symbol("num_epochs")] = Umap::Defaults::num_epochs;
33
29
  d[Symbol("learning_rate")] = Umap::Defaults::learning_rate;
34
30
  d[Symbol("negative_sample_rate")] = Umap::Defaults::negative_sample_rate;
35
31
  d[Symbol("num_neighbors")] = Umap::Defaults::num_neighbors;
36
32
  d[Symbol("seed")] = Umap::Defaults::seed;
37
- d[Symbol("batch")] = Umap::Defaults::batch;
38
33
  d[Symbol("num_threads")] = Umap::Defaults::num_threads;
34
+ d[Symbol("parallel_optimization")] = Umap::Defaults::parallel_optimization;
39
35
 
40
36
  return d;
41
37
  }
@@ -47,13 +43,16 @@ Object umappp_run(
47
43
  Hash params,
48
44
  numo::SFloat data,
49
45
  int ndim,
50
- int nn_method,
51
- int tick = 0)
46
+ int nn_method)
52
47
  {
53
48
  // Parameters are taken from a Ruby Hash object.
54
49
  // If there is key, set the value.
55
-
56
- auto umap_ptr = new Umap;
50
+ if (ndim < 1)
51
+ {
52
+ throw std::runtime_error("ndim is less than 1");
53
+ }
54
+
55
+ std::unique_ptr<Umap> umap_ptr(new Umap);
57
56
 
58
57
  double local_connectivity = Umap::Defaults::local_connectivity;
59
58
  if (RTEST(params.call("has_key?", Symbol("local_connectivity"))))
@@ -111,6 +110,13 @@ Object umappp_run(
111
110
  umap_ptr->set_repulsion_strength(repulsion_strength);
112
111
  }
113
112
 
113
+ umappp::InitMethod initialize = Umap::Defaults::initialize;
114
+ if (RTEST(params.call("has_key?", Symbol("initialize"))))
115
+ {
116
+ initialize = params.get<umappp::InitMethod>(Symbol("initialize"));
117
+ umap_ptr->set_initialize(initialize);
118
+ }
119
+
114
120
  int num_epochs = Umap::Defaults::num_epochs;
115
121
  if (RTEST(params.call("has_key?", Symbol("num_epochs"))))
116
122
  {
@@ -146,20 +152,20 @@ Object umappp_run(
146
152
  umap_ptr->set_seed(seed);
147
153
  }
148
154
 
149
- bool batch = Umap::Defaults::batch;
150
- if (RTEST(params.call("has_key?", Symbol("batch"))))
151
- {
152
- batch = params.get<bool>(Symbol("batch"));
153
- umap_ptr->set_batch(batch);
154
- }
155
-
156
155
  int num_threads = Umap::Defaults::num_threads;
157
156
  if (RTEST(params.call("has_key?", Symbol("num_threads"))))
158
157
  {
159
158
  num_threads = params.get<int>(Symbol("num_threads"));
160
159
  umap_ptr->set_num_threads(num_threads);
161
160
  }
162
-
161
+
162
+ bool parallel_optimization = Umap::Defaults::parallel_optimization;
163
+ if (RTEST(params.call("has_key?", Symbol("parallel_optimization"))))
164
+ {
165
+ parallel_optimization = params.get<bool>(Symbol("parallel_optimization"));
166
+ umap_ptr->set_parallel_optimization(parallel_optimization);
167
+ }
168
+
163
169
  // initialize_from_matrix
164
170
 
165
171
  const float *y = data.read_ptr();
@@ -167,6 +173,10 @@ Object umappp_run(
167
173
 
168
174
  int nd = shape[1];
169
175
  int nobs = shape[0];
176
+ if (nobs < 0)
177
+ {
178
+ throw std::runtime_error("nobs is negative");
179
+ }
170
180
 
171
181
  std::unique_ptr<knncolle::Base<int, Float>> knncolle_ptr;
172
182
  if (nn_method == 0)
@@ -182,33 +192,15 @@ Object umappp_run(
182
192
 
183
193
  auto status = umap_ptr->initialize(knncolle_ptr.get(), ndim, embedding.data());
184
194
 
185
- if (tick == 0)
186
- {
187
- status.run(ndim, embedding.data(), 0);
195
+ int epoch_limit = 0;
196
+ // tick is not implemented yet
197
+ status.run(epoch_limit);
188
198
 
189
- auto na = numo::SFloat({(uint)nobs, (uint)ndim});
190
- std::copy(embedding.begin(), embedding.end(), na.write_ptr());
199
+ // it is safe to cast to unsigned int
200
+ auto na = numo::SFloat({(unsigned int)nobs, (unsigned int)ndim});
201
+ std::copy(embedding.begin(), embedding.end(), na.write_ptr());
191
202
 
192
- return na;
193
- }
194
- else
195
- {
196
- VALUE ret = rb_ary_new();
197
-
198
- while (status.epoch() < status.num_epochs())
199
- {
200
- int epoch_limit = status.epoch() + tick;
201
-
202
- status.run(ndim, embedding.data(), epoch_limit);
203
-
204
- auto na = numo::SFloat({(uint)nobs, (uint)ndim});
205
- std::copy(embedding.begin(), embedding.end(), na.write_ptr());
206
-
207
- rb_ary_push(ret, na.value());
208
- }
209
-
210
- return ret;
211
- }
203
+ return na;
212
204
  }
213
205
 
214
206
  extern "C" void Init_umappp()
@@ -217,4 +209,10 @@ extern "C" void Init_umappp()
217
209
  define_module("Umappp")
218
210
  .define_singleton_method("umappp_run", &umappp_run)
219
211
  .define_singleton_method("umappp_default_parameters", &umappp_default_parameters);
212
+ Enum<umappp::InitMethod> init_method =
213
+ define_enum<umappp::InitMethod>("InitMethod", rb_mUmappp)
214
+ .define_value("SPECTRAL", umappp::InitMethod::SPECTRAL)
215
+ .define_value("SPECTRAL_ONLY", umappp::InitMethod::SPECTRAL_ONLY)
216
+ .define_value("RANDOM", umappp::InitMethod::RANDOM)
217
+ .define_value("NONE", umappp::InitMethod::NONE);
220
218
  }
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Umappp
4
- VERSION = "0.1.5"
4
+ VERSION = "0.2.0"
5
5
  end
data/lib/umappp.rb CHANGED
@@ -12,7 +12,7 @@ module Umappp
12
12
 
13
13
  # View the default parameters defined within the Umappp C++ library structure.
14
14
  def self.default_parameters
15
- # {method: :annoy, ndim: 2, tick: 0}.merge
15
+ # {method: :annoy, ndim: 2}.merge
16
16
  umappp_default_parameters
17
17
  end
18
18
 
@@ -30,16 +30,17 @@ module Umappp
30
30
  # @param a [Numeric]
31
31
  # @param b [Numeric]
32
32
  # @param repulsion_strength [Numeric]
33
+ # @param initialize [Umappp::InitMethod]
33
34
  # @param num_epochs [Integer]
34
35
  # @param learning_rate [Numeric]
35
36
  # @param negative_sample_rate [Numeric]
36
37
  # @param num_neighbors [Integer]
37
38
  # @param seed [Integer]
38
- # @param batch [Boolean]
39
39
  # @param num_threads [Integer]
40
+ # @param parallel_optimization [Boolean]
40
41
  # @return [Numo::SFloat] the final embedding
41
42
 
42
- def self.run(embedding, method: :annoy, ndim: 2, tick: 0, **params)
43
+ def self.run(embedding, method: :annoy, ndim: 2, **params)
43
44
  unless (u = (params.keys - default_parameters.keys)).empty?
44
45
  raise ArgumentError, "[umappp.rb] unknown option : #{u.inspect}"
45
46
  end
@@ -50,6 +51,6 @@ module Umappp
50
51
  embedding2 = Numo::SFloat.cast(embedding)
51
52
  raise ArgumentError, "embedding must be a 2D array" if embedding2.ndim <= 1
52
53
 
53
- umappp_run(params, embedding2, ndim, nnmethod, tick)
54
+ umappp_run(params, embedding2, ndim, nnmethod)
54
55
  end
55
56
  end
@@ -5,10 +5,17 @@
5
5
  #include <limits>
6
6
  #include <stdexcept>
7
7
 
8
+ /**
9
+ * @file aarand.hpp
10
+ *
11
+ * @brief Collection of random distribution functions.
12
+ */
13
+
8
14
  namespace aarand {
9
15
 
10
16
  /**
11
- * @tparam T Floating point type.
17
+ * @tparam T Floating point type to return.
18
+ * This is also used for intermediate calculations, so it is usually safest to provide a type that is at least as precise as a `double`.
12
19
  * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
13
20
  * where the `result_type` is an unsigned integer value.
14
21
  *
@@ -18,22 +25,34 @@ namespace aarand {
18
25
  */
19
26
  template<typename T = double, class Engine>
20
27
  T standard_uniform(Engine& eng) {
21
- static_assert(!std::numeric_limits<typename Engine::result_type>::is_signed);
22
- static_assert(std::numeric_limits<typename Engine::result_type>::is_integer);
28
+ typedef typename Engine::result_type R;
29
+ static_assert(std::numeric_limits<R>::is_integer, "RNG engine must yield integer results");
30
+
31
+ // Can't be bothered to figure out whether the range fits into 'R' for signed values.
32
+ // So instead, we just require unsigned integers, where the range will always fit.
33
+ static_assert(!std::numeric_limits<R>::is_signed, "RNG engine must yield unsigned integers");
34
+
35
+ // Make sure we get the right type to avoid inadvertent promotions.
36
+ constexpr T ONE_ = 1;
23
37
 
24
38
  // Stolen from Boost, see https://www.boost.org/doc/libs/1_67_0/boost/random/uniform_01.hpp
25
39
  // The +1 probably doesn't matter for 64-bit generators, but is helpful for engines with
26
40
  // fewer output bits, to reduce the (small) probability of sampling 1's.
27
- constexpr double factor = 1.0 / (static_cast<T>(Engine::max() - Engine::min()) + 1.0);
28
- double result;
41
+ constexpr T factor = ONE_ / (static_cast<T>(Engine::max() - Engine::min()) + ONE_);
42
+
43
+ // Note that it still might be possible to get a result = 1, depending on
44
+ // the numerical precision used to compute the product; hence the loop.
45
+ T result;
29
46
  do {
30
47
  result = static_cast<T>(eng() - Engine::min()) * factor;
31
- } while (result == 1.0);
48
+ } while (result == ONE_);
49
+
32
50
  return result;
33
51
  }
34
52
 
35
53
  /**
36
- * @tparam T Floating point type.
54
+ * @tparam T Floating point type to return.
55
+ * This is also used for intermediate calculations, so it is usually safest to provide a type that is at least as precise as a `double`.
37
56
  * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
38
57
  * where the `result_type` is an unsigned integer value.
39
58
  *
@@ -43,16 +62,18 @@ T standard_uniform(Engine& eng) {
43
62
  */
44
63
  template<typename T = double, class Engine>
45
64
  std::pair<T, T> standard_normal(Engine& eng) {
46
- constexpr double pi = 3.14159265358979323846;
65
+ constexpr T PI_ = 3.14159265358979323846;
66
+ constexpr T TWO_ = 2;
47
67
 
48
68
  // Box-Muller gives us two random values at a time.
49
- double constant = std::sqrt(-2 * std::log(standard_uniform<T>(eng)));
50
- double angle = 2 * pi * standard_uniform<T>(eng);
69
+ T constant = std::sqrt(-TWO_ * std::log(standard_uniform<T>(eng)));
70
+ T angle = TWO_ * PI_ * standard_uniform<T>(eng);
51
71
  return std::make_pair(constant * std::sin(angle), constant * std::cos(angle));
52
72
  }
53
73
 
54
74
  /**
55
- * @tparam T Floating point type.
75
+ * @tparam T Floating point type to return.
76
+ * This is also used for intermediate calculations, so it is usually safest to provide a type that is at least as precise as a `double`.
56
77
  * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
57
78
  * where the `result_type` is an unsigned integer value.
58
79
  *
@@ -62,7 +83,11 @@ std::pair<T, T> standard_normal(Engine& eng) {
62
83
  */
63
84
  template<typename T = double, class Engine>
64
85
  T standard_exponential(Engine& eng) {
65
- return -std::log(standard_uniform(eng));
86
+ T val;
87
+ do {
88
+ val = standard_uniform<T>(eng);
89
+ } while (val == static_cast<T>(0));
90
+ return -std::log(val);
66
91
  }
67
92
 
68
93
  /**
@@ -79,7 +104,7 @@ template<typename T = int, class Engine>
79
104
  T discrete_uniform(Engine& eng, T bound) {
80
105
  typedef typename Engine::result_type R;
81
106
  static_assert(std::numeric_limits<R>::is_integer);
82
- static_assert(!std::numeric_limits<R>::is_signed);
107
+ static_assert(!std::numeric_limits<R>::is_signed); // don't want to figure out how to store the range.
83
108
 
84
109
  constexpr R range = Engine::max() - Engine::min();
85
110
  if (bound > range) {
@@ -91,22 +116,110 @@ T discrete_uniform(Engine& eng, T bound) {
91
116
  throw std::runtime_error("'bound' should be a positive integer");
92
117
  }
93
118
 
94
- // The limit is necessary to provide uniformity in the presence of the
95
- // modulus. The idea is to re-sample if we get a draw above the limit.
96
- // Technically this can have problems as bound approaches range, in which
97
- // case we might end up discarding a lot of the sample space... but this
98
- // is unlikely to happen in practice, so whatever. Note that the +1 is
99
- // necessary because range is inclusive but bound is not.
100
- const R limit = range - (range % bound + 1);
101
-
102
- // In addition, we don't have to deal with the crap about combining draws
103
- // to get enough entropy, which is 90% of the Boost implementation.
104
- T draw;
105
- do {
106
- draw = (eng() - Engine::min()) % bound;
107
- } while (draw > limit);
119
+ R draw = eng() - Engine::min();
120
+
121
+ // Conservative shortcut to avoid an extra modulo operation in computing
122
+ // 'limit' if 'draw' is below 'limit'. This is based on the observation
123
+ // that 'range - bound <= limit', so any condition that triggers the loop
124
+ // will also pass this check. Allows early return when 'range >> bound'.
125
+ if (draw > range - bound) {
108
126
 
109
- return draw;
127
+ // The limit is necessary to provide uniformity in the presence of the
128
+ // modulus. The idea is to re-sample if we get a draw above the limit.
129
+ // Technically this can have problems as bound approaches range, in which
130
+ // case we might end up discarding a lot of the sample space... but this
131
+ // is unlikely to happen in practice, and even if it does, it's a rejection
132
+ // rate that's guaranteed to be less than 50%, so whatever.
133
+ //
134
+ // Note that the +1 is necessary because range is inclusive but bound is not.
135
+ const R limit = range - ((range % bound) + 1);
136
+
137
+ // In addition, we don't have to deal with the crap about combining draws
138
+ // to get enough entropy, which is 90% of the Boost implementation.
139
+ while (draw > limit) {
140
+ draw = eng() - Engine::min();
141
+ }
142
+ }
143
+
144
+ return draw % bound;
145
+ }
146
+
147
+ /**
148
+ * @tparam In Random-access iterator or pointer.
149
+ * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
150
+ * where the `result_type` is an unsigned integer value.
151
+ *
152
+ * @param values Iterator or pointer to an array of values to shuffle.
153
+ * @param n Number of values in the array pointed to by `values`.
154
+ * @param eng Instance of an RNG class like `std::mt19937_64`.
155
+ *
156
+ * @return Contents of `values` are randomly permuted in place using the Fisher-Yates algorithm.
157
+ */
158
+ template<class In, class Engine>
159
+ void shuffle(In values, size_t n, Engine& eng) {
160
+ if (n) {
161
+ using std::swap;
162
+ for (size_t i = 0; i < n - 1; ++i) {
163
+ auto chosen = discrete_uniform(eng, n - i);
164
+ swap(*(values + i), *(values + i + chosen));
165
+ }
166
+ }
167
+ return;
168
+ }
169
+
170
+ /**
171
+ * @tparam In Random-access iterator or pointer for the inputs.
172
+ * @tparam Out Random-access iterator or pointer for the outputs.
173
+ * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
174
+ * where the `result_type` is an unsigned integer value.
175
+ *
176
+ * @param values Iterator or pointer to an array of values to sample from.
177
+ * @param n Number of values in the array pointed to by `values`.
178
+ * @param s Number of values to sample.
179
+ * @param output Iterator or pointer to an array of length `s`, to store the sampled values.
180
+ * @param eng Instance of an RNG class like `std::mt19937_64`.
181
+ *
182
+ * @return `output` is filled with `s` sampled values from `values`.
183
+ *
184
+ * If `s > n`, `values` is copied into the first `n` elements of `output` and the remaining values of `output` are undefined.
185
+ */
186
+ template<class In, class Out, class Engine>
187
+ void sample(In values, size_t n, size_t s, Out output, Engine& eng) {
188
+ for (size_t i = 0; i < n && s; ++i, ++values) {
189
+ const double threshold = static_cast<double>(s)/(n - i);
190
+ if (threshold >= 1 || standard_uniform(eng) <= threshold) {
191
+ *output = *values;
192
+ ++output;
193
+ --s;
194
+ }
195
+ }
196
+ }
197
+
198
+ /**
199
+ * @tparam Out Random-access iterator or pointer for the outputs.
200
+ * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
201
+ * where the `result_type` is an unsigned integer value.
202
+ *
203
+ * @param bound Upper bound of the indices to sample from.
204
+ * @param s Number of values to sample.
205
+ * @param output Iterator or pointer to an array of length `s`, to store the sampled values.
206
+ * @param eng Instance of an RNG class like `std::mt19937_64`.
207
+ *
208
+ * @return `output` is filled with `s` sampled values from the sequence of integers in `{0, 1, ..., bound - 1}`.
209
+ *
210
+ * If `s > bound`, the first `bound` elements of `output` will contain the sequence of integers from `0` to `bound - 1`.
211
+ * The remaining values of `output` are undefined.
212
+ */
213
+ template<class Out, class Engine>
214
+ void sample(size_t bound, size_t s, Out output, Engine& eng) {
215
+ for (size_t i = 0; i < bound && s; ++i) {
216
+ const double threshold = static_cast<double>(s)/(bound - i);
217
+ if (threshold >= 1 || standard_uniform(eng) <= threshold) {
218
+ *output = i;
219
+ ++output;
220
+ --s;
221
+ }
222
+ }
110
223
  }
111
224
 
112
225
  }
@@ -128,7 +128,7 @@ inline void set_error_from_errno(char **error, const char* msg) {
128
128
  annoylib_showUpdate("%s: %s (%d)\n", msg, strerror(errno), errno);
129
129
  if (error) {
130
130
  *error = (char *)malloc(256); // TODO: win doesn't support snprintf
131
- sprintf(*error, "%s: %s (%d)", msg, strerror(errno), errno);
131
+ snprintf(*error, 255, "%s: %s (%d)", msg, strerror(errno), errno);
132
132
  }
133
133
  }
134
134