umappp 0.1.6 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,10 +9,6 @@
9
9
  typedef float Float;
10
10
  typedef typename umappp::Umap<Float> Umap;
11
11
 
12
- #ifdef _OPENMP
13
- #include <omp.h>
14
- #endif
15
-
16
12
  using namespace Rice;
17
13
 
18
14
  // This function is used to view default parameters from Ruby.
@@ -28,13 +24,14 @@ Hash umappp_default_parameters(Object self)
28
24
  d[Symbol("a")] = Umap::Defaults::a;
29
25
  d[Symbol("b")] = Umap::Defaults::b;
30
26
  d[Symbol("repulsion_strength")] = Umap::Defaults::repulsion_strength;
27
+ d[Symbol("initialize")] = Umap::Defaults::initialize;
31
28
  d[Symbol("num_epochs")] = Umap::Defaults::num_epochs;
32
29
  d[Symbol("learning_rate")] = Umap::Defaults::learning_rate;
33
30
  d[Symbol("negative_sample_rate")] = Umap::Defaults::negative_sample_rate;
34
31
  d[Symbol("num_neighbors")] = Umap::Defaults::num_neighbors;
35
32
  d[Symbol("seed")] = Umap::Defaults::seed;
36
- d[Symbol("batch")] = Umap::Defaults::batch;
37
33
  d[Symbol("num_threads")] = Umap::Defaults::num_threads;
34
+ d[Symbol("parallel_optimization")] = Umap::Defaults::parallel_optimization;
38
35
 
39
36
  return d;
40
37
  }
@@ -46,13 +43,16 @@ Object umappp_run(
46
43
  Hash params,
47
44
  numo::SFloat data,
48
45
  int ndim,
49
- int nn_method,
50
- int tick = 0)
46
+ int nn_method)
51
47
  {
52
48
  // Parameters are taken from a Ruby Hash object.
53
49
  // If a key is present in the Hash, set the corresponding value.
50
+ if (ndim < 1)
51
+ {
52
+ throw std::runtime_error("ndim is less than 1");
53
+ }
54
54
 
55
- auto umap_ptr = new Umap;
55
+ std::unique_ptr<Umap> umap_ptr(new Umap);
56
56
 
57
57
  double local_connectivity = Umap::Defaults::local_connectivity;
58
58
  if (RTEST(params.call("has_key?", Symbol("local_connectivity"))))
@@ -110,6 +110,13 @@ Object umappp_run(
110
110
  umap_ptr->set_repulsion_strength(repulsion_strength);
111
111
  }
112
112
 
113
+ umappp::InitMethod initialize = Umap::Defaults::initialize;
114
+ if (RTEST(params.call("has_key?", Symbol("initialize"))))
115
+ {
116
+ initialize = params.get<umappp::InitMethod>(Symbol("initialize"));
117
+ umap_ptr->set_initialize(initialize);
118
+ }
119
+
113
120
  int num_epochs = Umap::Defaults::num_epochs;
114
121
  if (RTEST(params.call("has_key?", Symbol("num_epochs"))))
115
122
  {
@@ -145,13 +152,6 @@ Object umappp_run(
145
152
  umap_ptr->set_seed(seed);
146
153
  }
147
154
 
148
- bool batch = Umap::Defaults::batch;
149
- if (RTEST(params.call("has_key?", Symbol("batch"))))
150
- {
151
- batch = params.get<bool>(Symbol("batch"));
152
- umap_ptr->set_batch(batch);
153
- }
154
-
155
155
  int num_threads = Umap::Defaults::num_threads;
156
156
  if (RTEST(params.call("has_key?", Symbol("num_threads"))))
157
157
  {
@@ -159,6 +159,13 @@ Object umappp_run(
159
159
  umap_ptr->set_num_threads(num_threads);
160
160
  }
161
161
 
162
+ bool parallel_optimization = Umap::Defaults::parallel_optimization;
163
+ if (RTEST(params.call("has_key?", Symbol("parallel_optimization"))))
164
+ {
165
+ parallel_optimization = params.get<bool>(Symbol("parallel_optimization"));
166
+ umap_ptr->set_parallel_optimization(parallel_optimization);
167
+ }
168
+
162
169
  // initialize_from_matrix
163
170
 
164
171
  const float *y = data.read_ptr();
@@ -166,6 +173,10 @@ Object umappp_run(
166
173
 
167
174
  int nd = shape[1];
168
175
  int nobs = shape[0];
176
+ if (nobs < 0)
177
+ {
178
+ throw std::runtime_error("nobs is negative");
179
+ }
169
180
 
170
181
  std::unique_ptr<knncolle::Base<int, Float>> knncolle_ptr;
171
182
  if (nn_method == 0)
@@ -180,39 +191,16 @@ Object umappp_run(
180
191
  std::vector<Float> embedding(ndim * nobs);
181
192
 
182
193
  auto status = umap_ptr->initialize(knncolle_ptr.get(), ndim, embedding.data());
183
- if (nobs < 0 || ndim < 0)
184
- {
185
- throw std::runtime_error("nobs or ndim is negative");
186
- }
187
- if (tick == 0)
188
- {
189
- status.run(ndim, embedding.data(), 0);
190
-
191
- // it is safe to cast to unsigned int
192
- auto na = numo::SFloat({(unsigned int)nobs, (unsigned int)ndim});
193
- std::copy(embedding.begin(), embedding.end(), na.write_ptr());
194
194
 
195
- return na;
196
- }
197
- else
198
- {
199
- VALUE ret = rb_ary_new();
200
-
201
- while (status.epoch() < status.num_epochs())
202
- {
203
- int epoch_limit = status.epoch() + tick;
195
+ int epoch_limit = 0;
196
+ // tick is not implemented yet
197
+ status.run(epoch_limit);
204
198
 
205
- status.run(ndim, embedding.data(), epoch_limit);
199
+ // it is safe to cast to unsigned int
200
+ auto na = numo::SFloat({(unsigned int)nobs, (unsigned int)ndim});
201
+ std::copy(embedding.begin(), embedding.end(), na.write_ptr());
206
202
 
207
- //it is safe to cast to unsigned int
208
- auto na = numo::SFloat({(unsigned int)nobs, (unsigned int)ndim});
209
- std::copy(embedding.begin(), embedding.end(), na.write_ptr());
210
-
211
- rb_ary_push(ret, na.value());
212
- }
213
-
214
- return ret;
215
- }
203
+ return na;
216
204
  }
217
205
 
218
206
  extern "C" void Init_umappp()
@@ -221,4 +209,10 @@ extern "C" void Init_umappp()
221
209
  define_module("Umappp")
222
210
  .define_singleton_method("umappp_run", &umappp_run)
223
211
  .define_singleton_method("umappp_default_parameters", &umappp_default_parameters);
212
+ Enum<umappp::InitMethod> init_method =
213
+ define_enum<umappp::InitMethod>("InitMethod")
214
+ .define_value("SPECTRAL", umappp::InitMethod::SPECTRAL)
215
+ .define_value("SPECTRAL_ONLY", umappp::InitMethod::SPECTRAL_ONLY)
216
+ .define_value("RANDOM", umappp::InitMethod::RANDOM)
217
+ .define_value("NONE", umappp::InitMethod::NONE);
224
218
  }
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Umappp
4
- VERSION = "0.1.6"
4
+ VERSION = "0.2.1"
5
5
  end
data/lib/umappp.rb CHANGED
@@ -12,7 +12,7 @@ module Umappp
12
12
 
13
13
  # View the default parameters defined within the Umappp C++ library structure.
14
14
  def self.default_parameters
15
- # {method: :annoy, ndim: 2, tick: 0}.merge
15
+ # {method: :annoy, ndim: 2}.merge
16
16
  umappp_default_parameters
17
17
  end
18
18
 
@@ -30,16 +30,17 @@ module Umappp
30
30
  # @param a [Numeric]
31
31
  # @param b [Numeric]
32
32
  # @param repulsion_strength [Numeric]
33
+ # @param initialize [Umappp::InitMethod]
33
34
  # @param num_epochs [Integer]
34
35
  # @param learning_rate [Numeric]
35
36
  # @param negative_sample_rate [Numeric]
36
37
  # @param num_neighbors [Integer]
37
38
  # @param seed [Integer]
38
- # @param batch [Boolean]
39
39
  # @param num_threads [Integer]
40
+ # @param parallel_optimization [Boolean]
40
41
  # @return [Numo::SFloat] the final embedding
41
42
 
42
- def self.run(embedding, method: :annoy, ndim: 2, tick: 0, **params)
43
+ def self.run(embedding, method: :annoy, ndim: 2, **params)
43
44
  unless (u = (params.keys - default_parameters.keys)).empty?
44
45
  raise ArgumentError, "[umappp.rb] unknown option : #{u.inspect}"
45
46
  end
@@ -50,6 +51,6 @@ module Umappp
50
51
  embedding2 = Numo::SFloat.cast(embedding)
51
52
  raise ArgumentError, "embedding must be a 2D array" if embedding2.ndim <= 1
52
53
 
53
- umappp_run(params, embedding2, ndim, nnmethod, tick)
54
+ umappp_run(params, embedding2, ndim, nnmethod)
54
55
  end
55
56
  end
@@ -5,10 +5,17 @@
5
5
  #include <limits>
6
6
  #include <stdexcept>
7
7
 
8
+ /**
9
+ * @file aarand.hpp
10
+ *
11
+ * @brief Collection of random distribution functions.
12
+ */
13
+
8
14
  namespace aarand {
9
15
 
10
16
  /**
11
- * @tparam T Floating point type.
17
+ * @tparam T Floating point type to return.
18
+ * This is also used for intermediate calculations, so it is usually safest to provide a type that is at least as precise as a `double`.
12
19
  * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
13
20
  * where the `result_type` is an unsigned integer value.
14
21
  *
@@ -18,22 +25,34 @@ namespace aarand {
18
25
  */
19
26
  template<typename T = double, class Engine>
20
27
  T standard_uniform(Engine& eng) {
21
- static_assert(!std::numeric_limits<typename Engine::result_type>::is_signed);
22
- static_assert(std::numeric_limits<typename Engine::result_type>::is_integer);
28
+ typedef typename Engine::result_type R;
29
+ static_assert(std::numeric_limits<R>::is_integer, "RNG engine must yield integer results");
30
+
31
+ // Can't be bothered to figure out whether the range fits into 'R' for signed values.
32
+ // So instead, we just require unsigned integers, where the range will always fit.
33
+ static_assert(!std::numeric_limits<R>::is_signed, "RNG engine must yield unsigned integers");
34
+
35
+ // Make sure we get the right type to avoid inadvertent promotions.
36
+ constexpr T ONE_ = 1;
23
37
 
24
38
  // Stolen from Boost, see https://www.boost.org/doc/libs/1_67_0/boost/random/uniform_01.hpp
25
39
  // The +1 probably doesn't matter for 64-bit generators, but is helpful for engines with
26
40
  // fewer output bits, to reduce the (small) probability of sampling 1's.
27
- constexpr double factor = 1.0 / (static_cast<T>(Engine::max() - Engine::min()) + 1.0);
28
- double result;
41
+ constexpr T factor = ONE_ / (static_cast<T>(Engine::max() - Engine::min()) + ONE_);
42
+
43
+ // Note that it still might be possible to get a result = 1, depending on
44
+ // the numerical precision used to compute the product; hence the loop.
45
+ T result;
29
46
  do {
30
47
  result = static_cast<T>(eng() - Engine::min()) * factor;
31
- } while (result == 1.0);
48
+ } while (result == ONE_);
49
+
32
50
  return result;
33
51
  }
34
52
 
35
53
  /**
36
- * @tparam T Floating point type.
54
+ * @tparam T Floating point type to return.
55
+ * This is also used for intermediate calculations, so it is usually safest to provide a type that is at least as precise as a `double`.
37
56
  * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
38
57
  * where the `result_type` is an unsigned integer value.
39
58
  *
@@ -43,16 +62,18 @@ T standard_uniform(Engine& eng) {
43
62
  */
44
63
  template<typename T = double, class Engine>
45
64
  std::pair<T, T> standard_normal(Engine& eng) {
46
- constexpr double pi = 3.14159265358979323846;
65
+ constexpr T PI_ = 3.14159265358979323846;
66
+ constexpr T TWO_ = 2;
47
67
 
48
68
  // Box-Muller gives us two random values at a time.
49
- double constant = std::sqrt(-2 * std::log(standard_uniform<T>(eng)));
50
- double angle = 2 * pi * standard_uniform<T>(eng);
69
+ T constant = std::sqrt(-TWO_ * std::log(standard_uniform<T>(eng)));
70
+ T angle = TWO_ * PI_ * standard_uniform<T>(eng);
51
71
  return std::make_pair(constant * std::sin(angle), constant * std::cos(angle));
52
72
  }
53
73
 
54
74
  /**
55
- * @tparam T Floating point type.
75
+ * @tparam T Floating point type to return.
76
+ * This is also used for intermediate calculations, so it is usually safest to provide a type that is at least as precise as a `double`.
56
77
  * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
57
78
  * where the `result_type` is an unsigned integer value.
58
79
  *
@@ -62,7 +83,11 @@ std::pair<T, T> standard_normal(Engine& eng) {
62
83
  */
63
84
  template<typename T = double, class Engine>
64
85
  T standard_exponential(Engine& eng) {
65
- return -std::log(standard_uniform(eng));
86
+ T val;
87
+ do {
88
+ val = standard_uniform<T>(eng);
89
+ } while (val == static_cast<T>(0));
90
+ return -std::log(val);
66
91
  }
67
92
 
68
93
  /**
@@ -79,7 +104,7 @@ template<typename T = int, class Engine>
79
104
  T discrete_uniform(Engine& eng, T bound) {
80
105
  typedef typename Engine::result_type R;
81
106
  static_assert(std::numeric_limits<R>::is_integer);
82
- static_assert(!std::numeric_limits<R>::is_signed);
107
+ static_assert(!std::numeric_limits<R>::is_signed); // don't want to figure out how to store the range.
83
108
 
84
109
  constexpr R range = Engine::max() - Engine::min();
85
110
  if (bound > range) {
@@ -91,22 +116,110 @@ T discrete_uniform(Engine& eng, T bound) {
91
116
  throw std::runtime_error("'bound' should be a positive integer");
92
117
  }
93
118
 
94
- // The limit is necessary to provide uniformity in the presence of the
95
- // modulus. The idea is to re-sample if we get a draw above the limit.
96
- // Technically this can have problems as bound approaches range, in which
97
- // case we might end up discarding a lot of the sample space... but this
98
- // is unlikely to happen in practice, so whatever. Note that the +1 is
99
- // necessary because range is inclusive but bound is not.
100
- const R limit = range - (range % bound + 1);
101
-
102
- // In addition, we don't have to deal with the crap about combining draws
103
- // to get enough entropy, which is 90% of the Boost implementation.
104
- T draw;
105
- do {
106
- draw = (eng() - Engine::min()) % bound;
107
- } while (draw > limit);
119
+ R draw = eng() - Engine::min();
120
+
121
+ // Conservative shortcut to avoid an extra modulo operation in computing
122
+ // 'limit' if 'draw' is below 'limit'. This is based on the observation
123
+ // that 'range - bound <= limit', so any condition that triggers the loop
124
+ // will also pass this check. Allows early return when 'range >> bound'.
125
+ if (draw > range - bound) {
108
126
 
109
- return draw;
127
+ // The limit is necessary to provide uniformity in the presence of the
128
+ // modulus. The idea is to re-sample if we get a draw above the limit.
129
+ // Technically this can have problems as bound approaches range, in which
130
+ // case we might end up discarding a lot of the sample space... but this
131
+ // is unlikely to happen in practice, and even if it does, it's a rejection
132
+ // rate that's guaranteed to be less than 50%, so whatever.
133
+ //
134
+ // Note that the +1 is necessary because range is inclusive but bound is not.
135
+ const R limit = range - ((range % bound) + 1);
136
+
137
+ // In addition, we don't have to deal with the crap about combining draws
138
+ // to get enough entropy, which is 90% of the Boost implementation.
139
+ while (draw > limit) {
140
+ draw = eng() - Engine::min();
141
+ }
142
+ }
143
+
144
+ return draw % bound;
145
+ }
146
+
147
+ /**
148
+ * @tparam In Random-access iterator or pointer.
149
+ * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
150
+ * where the `result_type` is an unsigned integer value.
151
+ *
152
+ * @param values Iterator or pointer to an array of values to shuffle.
153
+ * @param n Number of values in the array pointed to by `values`.
154
+ * @param eng Instance of an RNG class like `std::mt19937_64`.
155
+ *
156
+ * @return Contents of `values` are randomly permuted in place using the Fisher-Yates algorithm.
157
+ */
158
+ template<class In, class Engine>
159
+ void shuffle(In values, size_t n, Engine& eng) {
160
+ if (n) {
161
+ using std::swap;
162
+ for (size_t i = 0; i < n - 1; ++i) {
163
+ auto chosen = discrete_uniform(eng, n - i);
164
+ swap(*(values + i), *(values + i + chosen));
165
+ }
166
+ }
167
+ return;
168
+ }
169
+
170
+ /**
171
+ * @tparam In Random-access iterator or pointer for the inputs.
172
+ * @tparam Out Random-access iterator or pointer for the outputs.
173
+ * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
174
+ * where the `result_type` is an unsigned integer value.
175
+ *
176
+ * @param values Iterator or pointer to an array of values to sample from.
177
+ * @param n Number of values in the array pointed to by `values`.
178
+ * @param s Number of values to sample.
179
+ * @param output Iterator or pointer to an array of length `s`, to store the sampled values.
180
+ * @param eng Instance of an RNG class like `std::mt19937_64`.
181
+ *
182
+ * @return `output` is filled with `s` sampled values from `values`.
183
+ *
184
+ * If `s > n`, `values` is copied into the first `n` elements of `output` and the remaining values of `output` are undefined.
185
+ */
186
+ template<class In, class Out, class Engine>
187
+ void sample(In values, size_t n, size_t s, Out output, Engine& eng) {
188
+ for (size_t i = 0; i < n && s; ++i, ++values) {
189
+ const double threshold = static_cast<double>(s)/(n - i);
190
+ if (threshold >= 1 || standard_uniform(eng) <= threshold) {
191
+ *output = *values;
192
+ ++output;
193
+ --s;
194
+ }
195
+ }
196
+ }
197
+
198
+ /**
199
+ * @tparam Out Random-access iterator or pointer for the outputs.
200
+ * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
201
+ * where the `result_type` is an unsigned integer value.
202
+ *
203
+ * @param bound Upper bound of the indices to sample from.
204
+ * @param s Number of values to sample.
205
+ * @param output Iterator or pointer to an array of length `s`, to store the sampled values.
206
+ * @param eng Instance of an RNG class like `std::mt19937_64`.
207
+ *
208
+ * @return `output` is filled with `s` sampled values from the sequence of integers in `{0, 1, ..., bound - 1}`.
209
+ *
210
+ * If `s > bound`, the first `bound` elements of `output` will contain the sequence of integers from `0` to `bound - 1`.
211
+ * The remaining values of `output` are undefined.
212
+ */
213
+ template<class Out, class Engine>
214
+ void sample(size_t bound, size_t s, Out output, Engine& eng) {
215
+ for (size_t i = 0; i < bound && s; ++i) {
216
+ const double threshold = static_cast<double>(s)/(bound - i);
217
+ if (threshold >= 1 || standard_uniform(eng) <= threshold) {
218
+ *output = i;
219
+ ++output;
220
+ --s;
221
+ }
222
+ }
110
223
  }
111
224
 
112
225
  }
@@ -128,7 +128,7 @@ inline void set_error_from_errno(char **error, const char* msg) {
128
128
  annoylib_showUpdate("%s: %s (%d)\n", msg, strerror(errno), errno);
129
129
  if (error) {
130
130
  *error = (char *)malloc(256); // TODO: win doesn't support snprintf
131
- sprintf(*error, "%s: %s (%d)", msg, strerror(errno), errno);
131
+ snprintf(*error, 255, "%s: %s (%d)", msg, strerror(errno), errno);
132
132
  }
133
133
  }
134
134