umappp 0.1.6 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4268767d4aa68fb7795e72e48a9822b629390091cd5214d460e1d99a3127a3c3
4
- data.tar.gz: 684fdcf60aa7dc40a061927103692dda85fc2e40bb1680b0c61f014980197292
3
+ metadata.gz: 4e329dda5fe3d577f175b6c55059f165c4e9569ed14208785cd0a9184b5d14df
4
+ data.tar.gz: 3e0a7ed9a3a7a08109019adef9dc5c1f9a88c82e77d28307875a719c4bb0551e
5
5
  SHA512:
6
- metadata.gz: 7a9d5181ec290f40b5b22079c36d90a2fe07e1072b35b933ee30da253fc0f734431081334b6fc7510317df330e7c321386bc46da742d4515a646437e70602835
7
- data.tar.gz: 05d42582e0d559591bd22b767a6d7df7b821288fa8b44da86d8255822b49e629c15dd33e719d383a08d933ff1dfdd3fdf839e3eed7f2131dc0bf868c3ad467b4
6
+ metadata.gz: 1838cffb49dcac3e8429d7d112bff2dca7d6a72608d1ca19889533c1d394d332da24457cade7179845901712176d3ae9af626ce372e1f6e444ab490203180b65
7
+ data.tar.gz: bff7628b13e053fe337d9cf3b0ba37e68012440b943147303d0208c16491f82f45c7069e83e48084999f24b0b4880a0c510cac551059ff45d7302f4167428c28
data/README.md CHANGED
@@ -41,7 +41,6 @@ Available parameters and their default values
41
41
  |----------------------|------------------------------------|
42
42
  | method | :annoy (another option is :vptree) |
43
43
  | ndim | 2 |
44
- | tick | 0 (Not yet implemented) |
45
44
  | local_connectivity | 1.0 |
46
45
  | bandwidth | 1 |
47
46
  | mix_ratio | 1 |
@@ -50,12 +49,12 @@ Available parameters and their default values
50
49
  | a | 0 |
51
50
  | b | 0 |
52
51
  | repulsion_strength | 1 |
52
+ | initialize | Umappp::InitMethod::SPECTRAL |
53
53
  | num_epochs | 500 |
54
54
  | learning_rate | 1 |
55
55
  | negative_sample_rate | 5 |
56
56
  | num_neighbors | 15 |
57
57
  | seed | 1234567890 |
58
- | batch | false |
59
58
  | num_threads | 1 (OpenMP required) |
60
59
 
61
60
  ## Development
@@ -63,10 +62,19 @@ Available parameters and their default values
63
62
  ```
64
63
  git clone https://github.com/kojix2/ruby-umappp
65
64
  cd ruby-umappp
66
- b ndle dle exec rake compile
65
+ bundle exec rake compile
67
66
  bundle exec rake test
68
67
  ```
69
68
 
69
+ Update LTLA/umappp
70
+
71
+ Requires cmake to run
72
+
73
+ ```
74
+ cd script
75
+ ./vendor.sh
76
+ ```
77
+
70
78
  ### Ruby dependencies
71
79
 
72
80
  * [rice](https://github.com/jasonroelofs/rice) - Ruby Interface for C++ Extensions
@@ -9,10 +9,6 @@
9
9
  typedef float Float;
10
10
  typedef typename umappp::Umap<Float> Umap;
11
11
 
12
- #ifdef _OPENMP
13
- #include <omp.h>
14
- #endif
15
-
16
12
  using namespace Rice;
17
13
 
18
14
  // This function is used to view default parameters from Ruby.
@@ -28,13 +24,14 @@ Hash umappp_default_parameters(Object self)
28
24
  d[Symbol("a")] = Umap::Defaults::a;
29
25
  d[Symbol("b")] = Umap::Defaults::b;
30
26
  d[Symbol("repulsion_strength")] = Umap::Defaults::repulsion_strength;
27
+ d[Symbol("initialize")] = Umap::Defaults::initialize;
31
28
  d[Symbol("num_epochs")] = Umap::Defaults::num_epochs;
32
29
  d[Symbol("learning_rate")] = Umap::Defaults::learning_rate;
33
30
  d[Symbol("negative_sample_rate")] = Umap::Defaults::negative_sample_rate;
34
31
  d[Symbol("num_neighbors")] = Umap::Defaults::num_neighbors;
35
32
  d[Symbol("seed")] = Umap::Defaults::seed;
36
- d[Symbol("batch")] = Umap::Defaults::batch;
37
33
  d[Symbol("num_threads")] = Umap::Defaults::num_threads;
34
+ d[Symbol("parallel_optimization")] = Umap::Defaults::parallel_optimization;
38
35
 
39
36
  return d;
40
37
  }
@@ -46,13 +43,16 @@ Object umappp_run(
46
43
  Hash params,
47
44
  numo::SFloat data,
48
45
  int ndim,
49
- int nn_method,
50
- int tick = 0)
46
+ int nn_method)
51
47
  {
52
48
  // Parameters are taken from a Ruby Hash object.
53
49
  // If a key is present, set the corresponding value.
50
+ if (ndim < 1)
51
+ {
52
+ throw std::runtime_error("ndim is less than 1");
53
+ }
54
54
 
55
- auto umap_ptr = new Umap;
55
+ std::unique_ptr<Umap> umap_ptr(new Umap);
56
56
 
57
57
  double local_connectivity = Umap::Defaults::local_connectivity;
58
58
  if (RTEST(params.call("has_key?", Symbol("local_connectivity"))))
@@ -110,6 +110,13 @@ Object umappp_run(
110
110
  umap_ptr->set_repulsion_strength(repulsion_strength);
111
111
  }
112
112
 
113
+ umappp::InitMethod initialize = Umap::Defaults::initialize;
114
+ if (RTEST(params.call("has_key?", Symbol("initialize"))))
115
+ {
116
+ initialize = params.get<umappp::InitMethod>(Symbol("initialize"));
117
+ umap_ptr->set_initialize(initialize);
118
+ }
119
+
113
120
  int num_epochs = Umap::Defaults::num_epochs;
114
121
  if (RTEST(params.call("has_key?", Symbol("num_epochs"))))
115
122
  {
@@ -145,13 +152,6 @@ Object umappp_run(
145
152
  umap_ptr->set_seed(seed);
146
153
  }
147
154
 
148
- bool batch = Umap::Defaults::batch;
149
- if (RTEST(params.call("has_key?", Symbol("batch"))))
150
- {
151
- batch = params.get<bool>(Symbol("batch"));
152
- umap_ptr->set_batch(batch);
153
- }
154
-
155
155
  int num_threads = Umap::Defaults::num_threads;
156
156
  if (RTEST(params.call("has_key?", Symbol("num_threads"))))
157
157
  {
@@ -159,6 +159,13 @@ Object umappp_run(
159
159
  umap_ptr->set_num_threads(num_threads);
160
160
  }
161
161
 
162
+ bool parallel_optimization = Umap::Defaults::parallel_optimization;
163
+ if (RTEST(params.call("has_key?", Symbol("parallel_optimization"))))
164
+ {
165
+ parallel_optimization = params.get<bool>(Symbol("parallel_optimization"));
166
+ umap_ptr->set_parallel_optimization(parallel_optimization);
167
+ }
168
+
162
169
  // initialize_from_matrix
163
170
 
164
171
  const float *y = data.read_ptr();
@@ -166,6 +173,10 @@ Object umappp_run(
166
173
 
167
174
  int nd = shape[1];
168
175
  int nobs = shape[0];
176
+ if (nobs < 0)
177
+ {
178
+ throw std::runtime_error("nobs is negative");
179
+ }
169
180
 
170
181
  std::unique_ptr<knncolle::Base<int, Float>> knncolle_ptr;
171
182
  if (nn_method == 0)
@@ -180,39 +191,16 @@ Object umappp_run(
180
191
  std::vector<Float> embedding(ndim * nobs);
181
192
 
182
193
  auto status = umap_ptr->initialize(knncolle_ptr.get(), ndim, embedding.data());
183
- if (nobs < 0 || ndim < 0)
184
- {
185
- throw std::runtime_error("nobs or ndim is negative");
186
- }
187
- if (tick == 0)
188
- {
189
- status.run(ndim, embedding.data(), 0);
190
-
191
- // it is safe to cast to unsigned int
192
- auto na = numo::SFloat({(unsigned int)nobs, (unsigned int)ndim});
193
- std::copy(embedding.begin(), embedding.end(), na.write_ptr());
194
194
 
195
- return na;
196
- }
197
- else
198
- {
199
- VALUE ret = rb_ary_new();
200
-
201
- while (status.epoch() < status.num_epochs())
202
- {
203
- int epoch_limit = status.epoch() + tick;
195
+ int epoch_limit = 0;
196
+ // tick is not implemented yet
197
+ status.run(epoch_limit);
204
198
 
205
- status.run(ndim, embedding.data(), epoch_limit);
199
+ // it is safe to cast to unsigned int
200
+ auto na = numo::SFloat({(unsigned int)nobs, (unsigned int)ndim});
201
+ std::copy(embedding.begin(), embedding.end(), na.write_ptr());
206
202
 
207
- //it is safe to cast to unsigned int
208
- auto na = numo::SFloat({(unsigned int)nobs, (unsigned int)ndim});
209
- std::copy(embedding.begin(), embedding.end(), na.write_ptr());
210
-
211
- rb_ary_push(ret, na.value());
212
- }
213
-
214
- return ret;
215
- }
203
+ return na;
216
204
  }
217
205
 
218
206
  extern "C" void Init_umappp()
@@ -221,4 +209,10 @@ extern "C" void Init_umappp()
221
209
  define_module("Umappp")
222
210
  .define_singleton_method("umappp_run", &umappp_run)
223
211
  .define_singleton_method("umappp_default_parameters", &umappp_default_parameters);
212
+ Enum<umappp::InitMethod> init_method =
213
+ define_enum<umappp::InitMethod>("InitMethod", rb_mUmappp)
214
+ .define_value("SPECTRAL", umappp::InitMethod::SPECTRAL)
215
+ .define_value("SPECTRAL_ONLY", umappp::InitMethod::SPECTRAL_ONLY)
216
+ .define_value("RANDOM", umappp::InitMethod::RANDOM)
217
+ .define_value("NONE", umappp::InitMethod::NONE);
224
218
  }
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Umappp
4
- VERSION = "0.1.6"
4
+ VERSION = "0.2.0"
5
5
  end
data/lib/umappp.rb CHANGED
@@ -12,7 +12,7 @@ module Umappp
12
12
 
13
13
  # View the default parameters defined within the Umappp C++ library structure.
14
14
  def self.default_parameters
15
- # {method: :annoy, ndim: 2, tick: 0}.merge
15
+ # {method: :annoy, ndim: 2}.merge
16
16
  umappp_default_parameters
17
17
  end
18
18
 
@@ -30,16 +30,17 @@ module Umappp
30
30
  # @param a [Numeric]
31
31
  # @param b [Numeric]
32
32
  # @param repulsion_strength [Numeric]
33
+ # @param initialize [Umappp::InitMethod]
33
34
  # @param num_epochs [Integer]
34
35
  # @param learning_rate [Numeric]
35
36
  # @param negative_sample_rate [Numeric]
36
37
  # @param num_neighbors [Integer]
37
38
  # @param seed [Integer]
38
- # @param batch [Boolean]
39
39
  # @param num_threads [Integer]
40
+ # @param parallel_optimization [Boolean]
40
41
  # @return [Numo::SFloat] the final embedding
41
42
 
42
- def self.run(embedding, method: :annoy, ndim: 2, tick: 0, **params)
43
+ def self.run(embedding, method: :annoy, ndim: 2, **params)
43
44
  unless (u = (params.keys - default_parameters.keys)).empty?
44
45
  raise ArgumentError, "[umappp.rb] unknown option : #{u.inspect}"
45
46
  end
@@ -50,6 +51,6 @@ module Umappp
50
51
  embedding2 = Numo::SFloat.cast(embedding)
51
52
  raise ArgumentError, "embedding must be a 2D array" if embedding2.ndim <= 1
52
53
 
53
- umappp_run(params, embedding2, ndim, nnmethod, tick)
54
+ umappp_run(params, embedding2, ndim, nnmethod)
54
55
  end
55
56
  end
@@ -5,10 +5,17 @@
5
5
  #include <limits>
6
6
  #include <stdexcept>
7
7
 
8
+ /**
9
+ * @file aarand.hpp
10
+ *
11
+ * @brief Collection of random distribution functions.
12
+ */
13
+
8
14
  namespace aarand {
9
15
 
10
16
  /**
11
- * @tparam T Floating point type.
17
+ * @tparam T Floating point type to return.
18
+ * This is also used for intermediate calculations, so it is usually safest to provide a type that is at least as precise as a `double`.
12
19
  * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
13
20
  * where the `result_type` is an unsigned integer value.
14
21
  *
@@ -18,22 +25,34 @@ namespace aarand {
18
25
  */
19
26
  template<typename T = double, class Engine>
20
27
  T standard_uniform(Engine& eng) {
21
- static_assert(!std::numeric_limits<typename Engine::result_type>::is_signed);
22
- static_assert(std::numeric_limits<typename Engine::result_type>::is_integer);
28
+ typedef typename Engine::result_type R;
29
+ static_assert(std::numeric_limits<R>::is_integer, "RNG engine must yield integer results");
30
+
31
+ // Can't be bothered to figure out whether the range fits into 'R' for signed values.
32
+ // So instead, we just require unsigned integers, where the range will always fit.
33
+ static_assert(!std::numeric_limits<R>::is_signed, "RNG engine must yield unsigned integers");
34
+
35
+ // Make sure we get the right type to avoid inadvertent promotions.
36
+ constexpr T ONE_ = 1;
23
37
 
24
38
  // Stolen from Boost, see https://www.boost.org/doc/libs/1_67_0/boost/random/uniform_01.hpp
25
39
  // The +1 probably doesn't matter for 64-bit generators, but is helpful for engines with
26
40
  // fewer output bits, to reduce the (small) probability of sampling 1's.
27
- constexpr double factor = 1.0 / (static_cast<T>(Engine::max() - Engine::min()) + 1.0);
28
- double result;
41
+ constexpr T factor = ONE_ / (static_cast<T>(Engine::max() - Engine::min()) + ONE_);
42
+
43
+ // Note that it still might be possible to get a result = 1, depending on
44
+ // the numerical precision used to compute the product; hence the loop.
45
+ T result;
29
46
  do {
30
47
  result = static_cast<T>(eng() - Engine::min()) * factor;
31
- } while (result == 1.0);
48
+ } while (result == ONE_);
49
+
32
50
  return result;
33
51
  }
34
52
 
35
53
  /**
36
- * @tparam T Floating point type.
54
+ * @tparam T Floating point type to return.
55
+ * This is also used for intermediate calculations, so it is usually safest to provide a type that is at least as precise as a `double`.
37
56
  * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
38
57
  * where the `result_type` is an unsigned integer value.
39
58
  *
@@ -43,16 +62,18 @@ T standard_uniform(Engine& eng) {
43
62
  */
44
63
  template<typename T = double, class Engine>
45
64
  std::pair<T, T> standard_normal(Engine& eng) {
46
- constexpr double pi = 3.14159265358979323846;
65
+ constexpr T PI_ = 3.14159265358979323846;
66
+ constexpr T TWO_ = 2;
47
67
 
48
68
  // Box-Muller gives us two random values at a time.
49
- double constant = std::sqrt(-2 * std::log(standard_uniform<T>(eng)));
50
- double angle = 2 * pi * standard_uniform<T>(eng);
69
+ T constant = std::sqrt(-TWO_ * std::log(standard_uniform<T>(eng)));
70
+ T angle = TWO_ * PI_ * standard_uniform<T>(eng);
51
71
  return std::make_pair(constant * std::sin(angle), constant * std::cos(angle));
52
72
  }
53
73
 
54
74
  /**
55
- * @tparam T Floating point type.
75
+ * @tparam T Floating point type to return.
76
+ * This is also used for intermediate calculations, so it is usually safest to provide a type that is at least as precise as a `double`.
56
77
  * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
57
78
  * where the `result_type` is an unsigned integer value.
58
79
  *
@@ -62,7 +83,11 @@ std::pair<T, T> standard_normal(Engine& eng) {
62
83
  */
63
84
  template<typename T = double, class Engine>
64
85
  T standard_exponential(Engine& eng) {
65
- return -std::log(standard_uniform(eng));
86
+ T val;
87
+ do {
88
+ val = standard_uniform<T>(eng);
89
+ } while (val == static_cast<T>(0));
90
+ return -std::log(val);
66
91
  }
67
92
 
68
93
  /**
@@ -79,7 +104,7 @@ template<typename T = int, class Engine>
79
104
  T discrete_uniform(Engine& eng, T bound) {
80
105
  typedef typename Engine::result_type R;
81
106
  static_assert(std::numeric_limits<R>::is_integer);
82
- static_assert(!std::numeric_limits<R>::is_signed);
107
+ static_assert(!std::numeric_limits<R>::is_signed); // don't want to figure out how to store the range.
83
108
 
84
109
  constexpr R range = Engine::max() - Engine::min();
85
110
  if (bound > range) {
@@ -91,22 +116,110 @@ T discrete_uniform(Engine& eng, T bound) {
91
116
  throw std::runtime_error("'bound' should be a positive integer");
92
117
  }
93
118
 
94
- // The limit is necessary to provide uniformity in the presence of the
95
- // modulus. The idea is to re-sample if we get a draw above the limit.
96
- // Technically this can have problems as bound approaches range, in which
97
- // case we might end up discarding a lot of the sample space... but this
98
- // is unlikely to happen in practice, so whatever. Note that the +1 is
99
- // necessary because range is inclusive but bound is not.
100
- const R limit = range - (range % bound + 1);
101
-
102
- // In addition, we don't have to deal with the crap about combining draws
103
- // to get enough entropy, which is 90% of the Boost implementation.
104
- T draw;
105
- do {
106
- draw = (eng() - Engine::min()) % bound;
107
- } while (draw > limit);
119
+ R draw = eng() - Engine::min();
120
+
121
+ // Conservative shortcut to avoid an extra modulo operation in computing
122
+ // 'limit' if 'draw' is below 'limit'. This is based on the observation
123
+ // that 'range - bound <= limit', so any condition that triggers the loop
124
+ // will also pass this check. Allows early return when 'range >> bound'.
125
+ if (draw > range - bound) {
108
126
 
109
- return draw;
127
+ // The limit is necessary to provide uniformity in the presence of the
128
+ // modulus. The idea is to re-sample if we get a draw above the limit.
129
+ // Technically this can have problems as bound approaches range, in which
130
+ // case we might end up discarding a lot of the sample space... but this
131
+ // is unlikely to happen in practice, and even if it does, it's a rejection
132
+ // rate that's guaranteed to be less than 50%, so whatever.
133
+ //
134
+ // Note that the +1 is necessary because range is inclusive but bound is not.
135
+ const R limit = range - ((range % bound) + 1);
136
+
137
+ // In addition, we don't have to deal with the crap about combining draws
138
+ // to get enough entropy, which is 90% of the Boost implementation.
139
+ while (draw > limit) {
140
+ draw = eng() - Engine::min();
141
+ }
142
+ }
143
+
144
+ return draw % bound;
145
+ }
146
+
147
+ /**
148
+ * @tparam In Random-access iterator or pointer.
149
+ * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
150
+ * where the `result_type` is an unsigned integer value.
151
+ *
152
+ * @param values Iterator or pointer to an array of values to shuffle.
153
+ * @param n Number of values in the array pointed to by `values`.
154
+ * @param eng Instance of an RNG class like `std::mt19937_64`.
155
+ *
156
+ * @return Contents of `values` are randomly permuted in place using the Fisher-Yates algorithm.
157
+ */
158
+ template<class In, class Engine>
159
+ void shuffle(In values, size_t n, Engine& eng) {
160
+ if (n) {
161
+ using std::swap;
162
+ for (size_t i = 0; i < n - 1; ++i) {
163
+ auto chosen = discrete_uniform(eng, n - i);
164
+ swap(*(values + i), *(values + i + chosen));
165
+ }
166
+ }
167
+ return;
168
+ }
169
+
170
+ /**
171
+ * @tparam In Random-access iterator or pointer for the inputs.
172
+ * @tparam Out Random-access iterator or pointer for the outputs.
173
+ * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
174
+ * where the `result_type` is an unsigned integer value.
175
+ *
176
+ * @param values Iterator or pointer to an array of values to sample from.
177
+ * @param n Number of values in the array pointed to by `values`.
178
+ * @param s Number of values to sample.
179
+ * @param output Iterator or pointer to an array of length `s`, to store the sampled values.
180
+ * @param eng Instance of an RNG class like `std::mt19937_64`.
181
+ *
182
+ * @return `output` is filled with `s` sampled values from `values`.
183
+ *
184
+ * If `s > n`, `values` is copied into the first `n` elements of `output` and the remaining values of `output` are undefined.
185
+ */
186
+ template<class In, class Out, class Engine>
187
+ void sample(In values, size_t n, size_t s, Out output, Engine& eng) {
188
+ for (size_t i = 0; i < n && s; ++i, ++values) {
189
+ const double threshold = static_cast<double>(s)/(n - i);
190
+ if (threshold >= 1 || standard_uniform(eng) <= threshold) {
191
+ *output = *values;
192
+ ++output;
193
+ --s;
194
+ }
195
+ }
196
+ }
197
+
198
+ /**
199
+ * @tparam Out Random-access iterator or pointer for the outputs.
200
+ * @tparam Engine A random number generator class with `operator()`, `min()` (static) and `max()` (static) methods,
201
+ * where the `result_type` is an unsigned integer value.
202
+ *
203
+ * @param bound Upper bound of the indices to sample from.
204
+ * @param s Number of values to sample.
205
+ * @param output Iterator or pointer to an array of length `s`, to store the sampled values.
206
+ * @param eng Instance of an RNG class like `std::mt19937_64`.
207
+ *
208
+ * @return `output` is filled with `s` sampled values from the sequence of integers in `{0, 1, ..., bound - 1}`.
209
+ *
210
+ * If `s > bound`, the first `bound` elements of `output` will contain the sequence of integers from `0` to `bound - 1`.
211
+ * The remaining values of `output` are undefined.
212
+ */
213
+ template<class Out, class Engine>
214
+ void sample(size_t bound, size_t s, Out output, Engine& eng) {
215
+ for (size_t i = 0; i < bound && s; ++i) {
216
+ const double threshold = static_cast<double>(s)/(bound - i);
217
+ if (threshold >= 1 || standard_uniform(eng) <= threshold) {
218
+ *output = i;
219
+ ++output;
220
+ --s;
221
+ }
222
+ }
110
223
  }
111
224
 
112
225
  }
@@ -128,7 +128,7 @@ inline void set_error_from_errno(char **error, const char* msg) {
128
128
  annoylib_showUpdate("%s: %s (%d)\n", msg, strerror(errno), errno);
129
129
  if (error) {
130
130
  *error = (char *)malloc(256); // TODO: win doesn't support snprintf
131
- sprintf(*error, "%s: %s (%d)", msg, strerror(errno), errno);
131
+ snprintf(*error, 255, "%s: %s (%d)", msg, strerror(errno), errno);
132
132
  }
133
133
  }
134
134