rumale 0.11.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +22 -0
- data/README.md +100 -4
- data/lib/rumale/clustering/k_means.rb +3 -2
- data/lib/rumale/decomposition/nmf.rb +3 -2
- data/lib/rumale/decomposition/pca.rb +2 -5
- data/lib/rumale/ensemble/ada_boost_classifier.rb +3 -2
- data/lib/rumale/ensemble/ada_boost_regressor.rb +3 -2
- data/lib/rumale/ensemble/extra_trees_classifier.rb +2 -1
- data/lib/rumale/ensemble/extra_trees_regressor.rb +2 -1
- data/lib/rumale/ensemble/gradient_boosting_classifier.rb +2 -1
- data/lib/rumale/ensemble/gradient_boosting_regressor.rb +2 -1
- data/lib/rumale/ensemble/random_forest_classifier.rb +5 -4
- data/lib/rumale/ensemble/random_forest_regressor.rb +5 -4
- data/lib/rumale/kernel_approximation/rbf.rb +3 -2
- data/lib/rumale/kernel_machine/kernel_svc.rb +2 -1
- data/lib/rumale/linear_model/base_linear_model.rb +1 -1
- data/lib/rumale/manifold/tsne.rb +2 -1
- data/lib/rumale/model_selection/k_fold.rb +2 -1
- data/lib/rumale/model_selection/shuffle_split.rb +3 -2
- data/lib/rumale/model_selection/stratified_k_fold.rb +4 -3
- data/lib/rumale/model_selection/stratified_shuffle_split.rb +3 -2
- data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +1 -1
- data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +1 -1
- data/lib/rumale/pipeline/pipeline.rb +1 -1
- data/lib/rumale/polynomial_model/base_factorization_machine.rb +1 -1
- data/lib/rumale/tree/base_decision_tree.rb +1 -1
- data/lib/rumale/tree/decision_tree_classifier.rb +1 -0
- data/lib/rumale/tree/decision_tree_regressor.rb +1 -0
- data/lib/rumale/tree/extra_tree_classifier.rb +1 -1
- data/lib/rumale/tree/extra_tree_regressor.rb +1 -1
- data/lib/rumale/tree/gradient_tree_regressor.rb +2 -1
- data/lib/rumale/utils.rb +6 -2
- data/lib/rumale/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f662b1bf4abdb9aba9c978362094d80f59fcb390
+  data.tar.gz: eb5087ce4b4f2dfdc8e789c340139dd7d36693e0
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8418aa3932962b135c3a9725262e84b741825ed2491e98dba508e04ea4104d7abe0a9938d6248d9405bc7d9793d1f128a7e37c89804a79367626a47d3fa6a773
+  data.tar.gz: 40fff97c335d5720eaf1c90b45ed39531c61827777797a3a4bbc5c8b5f4b8df9b3814290686c959f9f7a9726966187a064b521850139124c9be2c64189d1d29f
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,25 @@
+# 0.12.0
+## Breaking changes
+- For reproducibility, Rumale has changed to not reuse the same random number generator repeatedly within an estimator.
+In the training phase, estimators use a copy of the random number generator created in the initialize method.
+Even with the same algorithm and the same data, the order of random number generation
+can cause slight differences in learning results.
+With this change, executing the fit method multiple times
+on the same data always produces the same learning result.
+
+```ruby
+svc = Rumale::LinearModel::SVC.new(random_seed: 0)
+svc.fit(x, y)
+a = svc.weight_vec
+svc.fit(x, y)
+b = svc.weight_vec
+err = ((a - b)**2).mean
+
+# In version 0.11.0 or earlier, this may print false,
+# but from this version it always prints true.
+puts(err < 1e-4)
+```
+
 # 0.11.0
 - Introduce [Parallel gem](https://github.com/grosser/parallel) to improve execution speed for one-vs-the-rest and bagging methods.
 - Add the n_jobs parameter that specifies the number of jobs for parallel processing in some estimators belonging to Rumale::LinearModel, Rumale::PolynomialModel, and Rumale::Ensemble.
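The pattern behind this change appears in every per-file hunk below: fit duplicates the estimator's generator and draws from the copy. A minimal sketch of the idea using only the Ruby standard library (ShuffleEstimator is a hypothetical class, not Rumale code):

```ruby
# Hypothetical example: Random#dup copies the generator's internal state,
# so every fit call replays the same random sequence while the
# estimator's own @rng is never advanced.
class ShuffleEstimator
  def initialize(random_seed: 0)
    @rng = Random.new(random_seed)
  end

  def fit(n_samples)
    sub_rng = @rng.dup # the 0.12.0 pattern: draw from a copy
    [*0...n_samples].shuffle(random: sub_rng)
  end
end

est = ShuffleEstimator.new(random_seed: 0)
puts(est.fit(5) == est.fit(5)) # => true; shuffling with @rng directly would print false
```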
data/README.md
CHANGED
@@ -6,7 +6,7 @@
 [](https://coveralls.io/github/yoshoku/rumale?branch=master)
 [](https://badge.fury.io/rb/rumale)
 [](https://github.com/yoshoku/rumale/blob/master/LICENSE.txt)
-[](https://www.rubydoc.info/gems/rumale/0.11.0)
+[](https://www.rubydoc.info/gems/rumale/0.12.0)
 
 Rumale (**Ru**by **ma**chine **le**arning) is a machine learning library in Ruby.
 Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
@@ -36,7 +36,43 @@ Or install it yourself as:
 
 ## Usage
 
-### Example 1.
+### Example 1. XOR data
+First, let's classify simple XOR data.
+In Rumale, feature vectors and labels are represented by [Numo::NArray](https://github.com/ruby-numo/numo-narray).
+
+```ruby
+require 'rumale'
+
+# Prepare XOR data.
+features = [[0, 0], [0, 1], [1, 0], [1, 1]]
+labels = [0, 1, 1, 0]
+
+# Convert Ruby Array into Numo::NArray.
+x = Numo::DFloat.asarray(features)
+y = Numo::Int32.asarray(labels)
+
+# Train classifier with nearest neighbor rule.
+estimator = Rumale::NearestNeighbors::KNeighborsClassifier.new(n_neighbors: 1)
+estimator.fit(x, y)
+
+# Predict labels.
+p y
+p estimator.predict(x)
+```
+
+Executing the above script results in the following output.
+
+```ruby
+Numo::Int32#shape=[4]
+[0, 1, 1, 0]
+Numo::Int32#shape=[4]
+[0, 1, 1, 0]
+```
+
+The basic usage of Rumale is to first train the model with the fit method
+and then estimate with the predict method.
+
+### Example 2. Pendigits dataset classification
 
 Rumale provides a function for loading a libsvm-format dataset file.
 We start by downloading the pendigits dataset from the LIBSVM Data website.
@@ -99,7 +135,7 @@ $ ruby test.rb
 Accuracy: 98.4%
 ```
 
-### Example
+### Example 3. Cross-validation
 
 ```ruby
 require 'rumale'
@@ -130,7 +166,7 @@ $ ruby cross_validation.rb
 5-CV mean log-loss: 0.476
 ```
 
-### Example
+### Example 4. Pipeline
 
 ```ruby
 require 'rumale'
@@ -162,6 +198,66 @@ $ ruby pipeline.rb
 5-CV mean accuracy: 99.2 %
 ```
 
+## Speeding up
+
+### Numo::Linalg
+Loading [Numo::Linalg](https://github.com/ruby-numo/numo-linalg) allows Numo::NArray to perform matrix products using BLAS libraries.
+For example, using [OpenBLAS](https://github.com/xianyi/OpenBLAS) speeds up many estimators in Rumale.
+
+Install the OpenBLAS library.
+
+Mac:
+
+```bash
+$ brew install openblas --with-openmp
+```
+
+Ubuntu:
+
+```bash
+$ sudo apt-get install gcc gfortran
+$ wget https://github.com/xianyi/OpenBLAS/archive/v0.3.5.tar.gz
+$ tar xzf v0.3.5.tar.gz
+$ cd OpenBLAS-0.3.5
+$ make USE_OPENMP=1
+$ sudo make PREFIX=/usr/local install
+```
+
+Install the Numo::Linalg gem.
+
+```bash
+$ gem install numo-linalg
+```
+
+In your Ruby script, you only need to require the autoloader module of Numo::Linalg.
+
+```ruby
+require 'numo/linalg/autoloader'
+require 'rumale'
+```
+
+### Parallel
+Several estimators in Rumale support parallel processing.
+Parallel processing in Rumale is realized with the [Parallel](https://github.com/grosser/parallel) gem, so install and load it.
+
+```bash
+$ gem install parallel
+```
+
+```ruby
+require 'parallel'
+require 'rumale'
+```
+
+Estimators that support parallel processing have an n_jobs parameter.
+When -1 is given to the n_jobs parameter, all processors are used.
+
+```ruby
+estimator = Rumale::Ensemble::RandomForestClassifier.new(n_jobs: -1, random_seed: 1)
+```
+
+
 ## Development
 
 After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
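Combining the two tips above: a sketch assuming the parallel gem is installed, with toy data standing in for the pendigits arrays (adding `require 'numo/linalg/autoloader'` would additionally enable the BLAS speed-up):

```ruby
require 'parallel' # enables the n_jobs parameter
require 'rumale'

# Toy data in place of a real dataset.
x = Numo::DFloat.new(100, 4).rand
y = Numo::Int32.asarray(Array.new(100) { |i| i < 50 ? 0 : 1 })

estimator = Rumale::Ensemble::RandomForestClassifier.new(n_jobs: -1, random_seed: 1)
estimator.fit(x, y)
puts(estimator.score(x, y)) # mean accuracy on the training data
```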
data/lib/rumale/clustering/k_means.rb
CHANGED
@@ -120,7 +120,8 @@ module Rumale
       def init_cluster_centers(x)
         # random initialize
         n_samples = x.shape[0]
-        rand_id = [*0...n_samples].sample(@params[:n_clusters], random: @rng)
+        sub_rng = @rng.dup
+        rand_id = [*0...n_samples].sample(@params[:n_clusters], random: sub_rng)
         @cluster_centers = x[rand_id, true].dup
         return unless @params[:init] == 'k-means++'
         # k-means++ initialize
@@ -129,7 +130,7 @@ module Rumale
           min_distances = distance_matrix.flatten[distance_matrix.min_index(axis: 1)]
           probs = min_distances**2 / (min_distances**2).sum
           cum_probs = probs.cumsum
-          selected_id = cum_probs.gt(@rng.rand).where.to_a.first
+          selected_id = cum_probs.gt(sub_rng.rand).where.to_a.first
           @cluster_centers[n, true] = x[selected_id, true].dup
         end
       end
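For reference, the k-means++ draw in the second hunk is an inverse-CDF sample: one uniform draw picks the first index whose cumulative probability exceeds it. A runnable sketch with toy distances:

```ruby
require 'numo/narray'

# Pick index i with probability proportional to min_distances[i]**2,
# using a single uniform draw against the cumulative probabilities.
min_distances = Numo::DFloat[3.0, 1.0, 2.0]
probs = min_distances**2 / (min_distances**2).sum
cum_probs = probs.cumsum
sub_rng = Random.new(1)
selected_id = cum_probs.gt(sub_rng.rand).where.to_a.first
p selected_id # index 0 is most likely, since it has the largest distance
```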
data/lib/rumale/decomposition/nmf.rb
CHANGED
@@ -113,8 +113,9 @@ module Rumale
       # initialize some variables.
       n_samples, n_features = x.shape
       scale = Math.sqrt(x.mean / @params[:n_components])
-      @components = Rumale::Utils.rand_uniform([@params[:n_components], n_features], @rng) * scale if update_comps
-      coefficients = Rumale::Utils.rand_uniform([n_samples, @params[:n_components]], @rng) * scale
+      sub_rng = @rng.dup
+      @components = Rumale::Utils.rand_uniform([@params[:n_components], n_features], sub_rng) * scale if update_comps
+      coefficients = Rumale::Utils.rand_uniform([n_samples, @params[:n_components]], sub_rng) * scale
       # optimization.
       @params[:max_iter].times do
         # update
data/lib/rumale/decomposition/pca.rb
CHANGED
@@ -63,13 +63,14 @@ module Rumale
       # initialize some variables.
       @components = nil
       n_samples, n_features = x.shape
+      sub_rng = @rng.dup
       # centering.
       @mean = x.mean(0)
       centered_x = x - @mean
       # optimization.
       covariance_mat = centered_x.transpose.dot(centered_x) / (n_samples - 1)
       @params[:n_components].times do
-        comp_vec = random_vec(n_features)
+        comp_vec = Rumale::Utils.rand_uniform(n_features, sub_rng)
         @params[:max_iter].times do
           updated = orthogonalize(covariance_mat.dot(comp_vec))
           break if (updated.dot(comp_vec) - 1).abs < @params[:tol]
@@ -139,10 +140,6 @@ module Rumale
       end
       pcvec / Math.sqrt((pcvec**2).sum.abs) + 1.0e-12
     end
-
-    def random_vec(n_features)
-      Numo::DFloat[*(Array.new(n_features) { @rng.rand })]
-    end
   end
 end
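The removed private helper random_vec is replaced by the shared utility, which takes an explicit generator so fit can pass a duplicated RNG. A small sketch of the replacement call (toy size):

```ruby
require 'rumale'

# Builds a length-8 vector of uniform draws, as the power-iteration
# loop above does for each principal component.
sub_rng = Random.new(1)
comp_vec = Rumale::Utils.rand_uniform(8, sub_rng)
p comp_vec.shape # => [8]
```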
data/lib/rumale/ensemble/ada_boost_classifier.rb
CHANGED
@@ -95,6 +95,7 @@ module Rumale
       @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
       @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
       n_classes = @classes.shape[0]
+      sub_rng = @rng.dup
       ## Boosting.
       classes_arr = @classes.to_a
       y_codes = Numo::DFloat.zeros(n_samples, n_classes) - 1.fdiv(n_classes - 1)
@@ -102,12 +103,12 @@ module Rumale
       observation_weights = Numo::DFloat.zeros(n_samples) + 1.fdiv(n_samples)
       @params[:n_estimators].times do |_t|
         # Fit classifier.
-        ids = Rumale::Utils.choice_ids(n_samples, observation_weights, @rng)
+        ids = Rumale::Utils.choice_ids(n_samples, observation_weights, sub_rng)
         break if y[ids].to_a.uniq.size != n_classes
         tree = Tree::DecisionTreeClassifier.new(
           criterion: @params[:criterion], max_depth: @params[:max_depth],
           max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
-          max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
+          max_features: @params[:max_features], random_seed: sub_rng.rand(Rumale::Values.int_max)
         )
         tree.fit(x[ids, true], y[ids])
         # Calculate estimator error.
data/lib/rumale/ensemble/ada_boost_regressor.rb
CHANGED
@@ -102,14 +102,15 @@ module Rumale
       @estimators = []
       @estimator_weights = []
       @feature_importances = Numo::DFloat.zeros(n_features)
+      sub_rng = @rng.dup
       # Construct forest.
       @params[:n_estimators].times do |_t|
         # Fit weak learner.
-        ids = Rumale::Utils.choice_ids(n_samples, observation_weights, @rng)
+        ids = Rumale::Utils.choice_ids(n_samples, observation_weights, sub_rng)
         tree = Tree::DecisionTreeRegressor.new(
           criterion: @params[:criterion], max_depth: @params[:max_depth],
           max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
-          max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
+          max_features: @params[:max_features], random_seed: sub_rng.rand(Rumale::Values.int_max)
         )
         tree.fit(x[ids, true], y[ids])
         p = tree.predict(x)
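AdaBoost's resampling step draws training indices in proportion to the observation weights. A sketch of the helper call with toy weights (the sampled ids vary with the seed):

```ruby
require 'rumale'

# Weighted resampling: indices are drawn in proportion to the weights,
# here from a duplicated generator as in the hunks above.
weights = Numo::DFloat[0.1, 0.6, 0.3]
sub_rng = Random.new(1)
ids = Rumale::Utils.choice_ids(3, weights, sub_rng)
p ids # e.g. [1, 1, 2]; index 1 carries most of the weight
```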
data/lib/rumale/ensemble/extra_trees_classifier.rb
CHANGED
@@ -80,8 +80,9 @@ module Rumale
       @params[:max_features] = Math.sqrt(n_features).to_i unless @params[:max_features].is_a?(Integer)
       @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
       @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
+      sub_rng = @rng.dup
       # Construct trees.
-      rng_seeds = Array.new(@params[:n_estimators]) { @rng.rand(Rumale::Values.int_max) }
+      rng_seeds = Array.new(@params[:n_estimators]) { sub_rng.rand(Rumale::Values.int_max) }
       @estimators = if enable_parallel?
                       parallel_map(@params[:n_estimators]) { |n| plant_tree(rng_seeds[n]).fit(x, y) }
                     else
data/lib/rumale/ensemble/extra_trees_regressor.rb
CHANGED
@@ -75,8 +75,9 @@ module Rumale
       n_features = x.shape[1]
       @params[:max_features] = Math.sqrt(n_features).to_i unless @params[:max_features].is_a?(Integer)
       @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+      sub_rng = @rng.dup
       # Construct forest.
-      rng_seeds = Array.new(@params[:n_estimators]) { @rng.rand(Rumale::Values.int_max) }
+      rng_seeds = Array.new(@params[:n_estimators]) { sub_rng.rand(Rumale::Values.int_max) }
       @estimators = if enable_parallel?
                       parallel_map(@params[:n_estimators]) { |n| plant_tree(rng_seeds[n]).fit(x, y) }
                     else
data/lib/rumale/ensemble/gradient_boosting_classifier.rb
CHANGED
@@ -216,10 +216,11 @@ module Rumale
       n_sub_samples = [n_samples, [(n_samples * @params[:subsample]).to_i, 1].max].min
       whole_ids = Array.new(n_samples) { |v| v }
       y_pred = Numo::DFloat.ones(n_samples) * init_pred
+      sub_rng = @rng.dup
       # grow trees.
       @params[:n_estimators].times do |_t|
         # subsampling
-        ids = whole_ids.sample(n_sub_samples, random: @rng)
+        ids = whole_ids.sample(n_sub_samples, random: sub_rng)
         x_sub = x[ids, true]
         y_sub = y[ids]
         y_pred_sub = y_pred[ids]
data/lib/rumale/ensemble/gradient_boosting_regressor.rb
CHANGED
@@ -178,10 +178,11 @@ module Rumale
       n_sub_samples = [n_samples, [(n_samples * @params[:subsample]).to_i, 1].max].min
       whole_ids = Array.new(n_samples) { |v| v }
       y_pred = Numo::DFloat.ones(n_samples) * init_pred
+      sub_rng = @rng.dup
       # grow trees.
       @params[:n_estimators].times do |_t|
         # subsampling
-        ids = whole_ids.sample(n_sub_samples, random: @rng)
+        ids = whole_ids.sample(n_sub_samples, random: sub_rng)
         x_sub = x[ids, true]
         y_sub = y[ids]
         y_pred_sub = y_pred[ids]
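The subsampling step above is plain Array#sample without replacement, made reproducible by the explicit generator. A stdlib-only sketch:

```ruby
# Stochastic gradient boosting subsamples row indices each round;
# passing the generator explicitly keeps the rounds reproducible.
whole_ids = Array.new(8) { |v| v }
sub_rng = Random.new(1)
ids = whole_ids.sample(4, random: sub_rng)
p ids
```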
data/lib/rumale/ensemble/random_forest_classifier.rb
CHANGED
@@ -94,10 +94,11 @@ module Rumale
       @params[:max_features] = Math.sqrt(n_features).to_i unless @params[:max_features].is_a?(Integer)
       @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
       @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
+      sub_rng = @rng.dup
+      rngs = Array.new(@params[:n_estimators]) { Random.new(sub_rng.rand(Rumale::Values.int_max)) }
       # Construct forest.
       @estimators =
         if enable_parallel?
-          rngs = Array.new(@params[:n_estimators]) { Random.new(@rng.rand(Rumale::Values.int_max)) }
           # :nocov:
           parallel_map(@params[:n_estimators]) do |n|
             bootstrap_ids = Array.new(n_samples) { rngs[n].rand(0...n_samples) }
@@ -105,9 +106,9 @@ module Rumale
           end
           # :nocov:
         else
-          Array.new(@params[:n_estimators]) do
-            bootstrap_ids = Array.new(n_samples) { @rng.rand(0...n_samples) }
-            plant_tree(@rng.rand(Rumale::Values.int_max)).fit(x[bootstrap_ids, true], y[bootstrap_ids])
+          Array.new(@params[:n_estimators]) do |n|
+            bootstrap_ids = Array.new(n_samples) { rngs[n].rand(0...n_samples) }
+            plant_tree(rngs[n].rand(Rumale::Values.int_max)).fit(x[bootstrap_ids, true], y[bootstrap_ids])
           end
         end
       @feature_importances =
data/lib/rumale/ensemble/random_forest_regressor.rb
CHANGED
@@ -88,10 +88,11 @@ module Rumale
       @params[:max_features] = Math.sqrt(n_features).to_i unless @params[:max_features].is_a?(Integer)
       @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
       single_target = y.shape[1].nil?
+      sub_rng = @rng.dup
+      rngs = Array.new(@params[:n_estimators]) { Random.new(sub_rng.rand(Rumale::Values.int_max)) }
       # Construct forest.
       @estimators =
         if enable_parallel?
-          rngs = Array.new(@params[:n_estimators]) { Random.new(@rng.rand(Rumale::Values.int_max)) }
           # :nocov:
           parallel_map(@params[:n_estimators]) do |n|
             bootstrap_ids = Array.new(n_samples) { rngs[n].rand(0...n_samples) }
@@ -100,9 +101,9 @@ module Rumale
           end
           # :nocov:
         else
-          Array.new(@params[:n_estimators]) do
-            bootstrap_ids = Array.new(n_samples) { @rng.rand(0...n_samples) }
-            tree = plant_tree(@rng.rand(Rumale::Values.int_max))
+          Array.new(@params[:n_estimators]) do |n|
+            bootstrap_ids = Array.new(n_samples) { rngs[n].rand(0...n_samples) }
+            tree = plant_tree(rngs[n].rand(Rumale::Values.int_max))
             tree.fit(x[bootstrap_ids, true], single_target ? y[bootstrap_ids] : y[bootstrap_ids, true])
           end
         end
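The forests now pre-derive one child generator per tree from a single parent (in Rumale, a dup of the estimator's @rng), so the serial and parallel branches draw identical bootstrap samples. A stdlib-only sketch, where 2**31 - 1 stands in for Rumale::Values.int_max:

```ruby
# One seeded child generator per tree, all derived from one parent,
# so the per-tree bootstrap draws are independent of execution order.
sub_rng = Random.new(1)
rngs = Array.new(3) { Random.new(sub_rng.rand(2**31 - 1)) }
bootstraps = rngs.map { |r| Array.new(5) { r.rand(0...5) } }
p bootstraps
```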
data/lib/rumale/kernel_approximation/rbf.rb
CHANGED
@@ -10,7 +10,7 @@ module Rumale
     # Class for RBF kernel feature mapping.
     #
     # @example
-    #   transformer = Rumale::KernelApproximation::RBF.new(gamma: 1.0,
+    #   transformer = Rumale::KernelApproximation::RBF.new(gamma: 1.0, n_components: 128, random_seed: 1)
     #   new_training_samples = transformer.fit_transform(training_samples)
     #   new_testing_samples = transformer.transform(testing_samples)
     #
@@ -63,8 +63,9 @@ module Rumale
       check_sample_array(x)
 
       n_features = x.shape[1]
+      sub_rng = @rng.dup
       @params[:n_components] = 2 * n_features if @params[:n_components] <= 0
-      @random_mat = Rumale::Utils.rand_normal([n_features, @params[:n_components]], @rng) * (2.0 * @params[:gamma])**0.5
+      @random_mat = Rumale::Utils.rand_normal([n_features, @params[:n_components]], sub_rng) * (2.0 * @params[:gamma])**0.5
       n_half_components = @params[:n_components] / 2
       @random_vec = Numo::DFloat.zeros(@params[:n_components] - n_half_components).concatenate(
         Numo::DFloat.ones(n_half_components) * (0.5 * Math::PI)
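The fit above builds a random Fourier feature map: @random_mat holds draws from N(0, 2 * gamma), obtained by scaling standard normals by sqrt(2 * gamma). A sketch of just the sampling step (toy sizes):

```ruby
require 'rumale'

# Standard normals from rand_normal, scaled to variance 2 * gamma.
gamma = 0.5
sub_rng = Random.new(1)
random_mat = Rumale::Utils.rand_normal([4, 8], sub_rng) * (2.0 * gamma)**0.5
p random_mat.shape # => [4, 8]
```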
data/lib/rumale/kernel_machine/kernel_svc.rb
CHANGED
@@ -202,10 +202,11 @@ module Rumale
       n_training_samples = x.shape[0]
       rand_ids = []
       weight_vec = Numo::DFloat.zeros(n_training_samples)
+      sub_rng = @rng.dup
       # Start optimization.
       @params[:max_iter].times do |t|
         # random sampling
-        rand_ids = [*0...n_training_samples].shuffle(random: @rng) if rand_ids.empty?
+        rand_ids = [*0...n_training_samples].shuffle(random: sub_rng) if rand_ids.empty?
         target_id = rand_ids.shift
         # update the weight vector
         func = (weight_vec * bin_y).dot(x[target_id, true].transpose).to_f
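The optimizer visits samples in random order and reshuffles only when the id queue empties, so each epoch touches every sample exactly once. A stdlib-only sketch:

```ruby
# Epoch-wise random visiting order over four samples.
sub_rng = Random.new(1)
rand_ids = []
8.times do
  rand_ids = [*0...4].shuffle(random: sub_rng) if rand_ids.empty?
  print rand_ids.shift, ' '
end
# e.g. "2 1 0 3 0 3 1 2": two full passes over the four samples
```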
data/lib/rumale/linear_model/base_linear_model.rb
CHANGED
@@ -49,7 +49,7 @@ module Rumale
       samples = @params[:fit_bias] ? expand_feature(x) : x
       # Initialize some variables.
       n_samples, n_features = samples.shape
-      rand_ids = [*0...n_samples].shuffle(random: @rng)
+      rand_ids = [*0...n_samples].shuffle(random: @rng.dup)
       weight = Numo::DFloat.zeros(n_features)
       optimizer = @params[:optimizer].dup
       # Optimization.
data/lib/rumale/manifold/tsne.rb
CHANGED
@@ -155,7 +155,8 @@ module Rumale
         pca.fit_transform(x)
       else
         n_samples = x.shape[0]
-        Rumale::Utils.rand_normal([n_samples, @params[:n_components]], @rng, 0, 0.0001)
+        sub_rng = @rng.dup
+        Rumale::Utils.rand_normal([n_samples, @params[:n_components]], sub_rng, 0, 0.0001)
       end
     end
 
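The non-PCA branch above initializes the t-SNE embedding with tiny Gaussian noise via rand_normal(shape, rng, mean, sd). A sketch with toy sizes:

```ruby
require 'rumale'

# Five 2-D embedding points drawn from N(0, 0.0001**2).
sub_rng = Random.new(1)
y_init = Rumale::Utils.rand_normal([5, 2], sub_rng, 0, 0.0001)
p y_init.shape # => [5, 2]
```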
data/lib/rumale/model_selection/k_fold.rb
CHANGED
@@ -60,9 +60,10 @@ module Rumale
         raise ArgumentError,
               'The value of n_splits must be not less than 2 and not more than the number of samples.'
       end
+      sub_rng = @rng.dup
       # Splits dataset ids to each fold.
       dataset_ids = [*0...n_samples]
-      dataset_ids.shuffle!(random: @rng) if @shuffle
+      dataset_ids.shuffle!(random: sub_rng) if @shuffle
       fold_sets = Array.new(@n_splits) do |n|
         n_fold_samples = n_samples / @n_splits
         n_fold_samples += 1 if n < n_samples % @n_splits
data/lib/rumale/model_selection/shuffle_split.rb
CHANGED
@@ -74,14 +74,15 @@ module Rumale
         raise RangeError,
               'The total number of samples in test split and train split must be not more than the number of samples.'
       end
+      sub_rng = @rng.dup
       # Returns array consisting of the training and testing ids for each fold.
       dataset_ids = [*0...n_samples]
       Array.new(@n_splits) do
-        test_ids = dataset_ids.sample(n_test_samples, random: @rng)
+        test_ids = dataset_ids.sample(n_test_samples, random: sub_rng)
         train_ids = if @train_size.nil?
                       dataset_ids - test_ids
                     else
-                      (dataset_ids - test_ids).sample(n_train_samples, random: @rng)
+                      (dataset_ids - test_ids).sample(n_train_samples, random: sub_rng)
                     end
         [train_ids, test_ids]
       end
data/lib/rumale/model_selection/stratified_k_fold.rb
CHANGED
@@ -65,7 +65,8 @@ module Rumale
               'The value of n_splits must be not less than 2 and not more than the number of samples in each class.'
       end
       # Splits dataset ids of each class to each fold.
-      fold_sets_each_class = y.to_a.uniq.map { |label| fold_sets(y, label) }
+      sub_rng = @rng.dup
+      fold_sets_each_class = y.to_a.uniq.map { |label| fold_sets(y, label, sub_rng) }
       # Returns array consisting of the training and testing ids for each fold.
       Array.new(@n_splits) { |fold_id| train_test_sets(fold_sets_each_class, fold_id) }
     end
@@ -76,9 +77,9 @@ module Rumale
       y.to_a.uniq.map { |label| y.eq(label).where.size }.all? { |n_samples| @n_splits.between?(2, n_samples) }
     end
 
-    def fold_sets(y, label)
+    def fold_sets(y, label, sub_rng)
       sample_ids = y.eq(label).where.to_a
-      sample_ids.shuffle!(random: @rng) if @shuffle
+      sample_ids.shuffle!(random: sub_rng) if @shuffle
       n_samples = sample_ids.size
       Array.new(@n_splits) do |n|
         n_fold_samples = n_samples / @n_splits
data/lib/rumale/model_selection/stratified_shuffle_split.rb
CHANGED
@@ -62,6 +62,7 @@ module Rumale
       check_sample_label_size(x, y)
       # Initialize and check some variables.
       train_sz = @train_size.nil? ? 1.0 - @test_size : @train_size
+      sub_rng = @rng.dup
       # Check the number of samples in each class.
       unless valid_n_splits?(y)
         raise ArgumentError,
@@ -88,11 +89,11 @@ module Rumale
         n_samples = sample_ids.size
         n_test_samples = (@test_size * n_samples).to_i
         n_train_samples = (train_sz * n_samples).to_i
-        test_ids += sample_ids.sample(n_test_samples, random: @rng)
+        test_ids += sample_ids.sample(n_test_samples, random: sub_rng)
         train_ids += if @train_size.nil?
                        sample_ids - test_ids
                      else
-                       (sample_ids - test_ids).sample(n_train_samples, random: @rng)
+                       (sample_ids - test_ids).sample(n_train_samples, random: sub_rng)
                      end
       end
       [train_ids, test_ids]
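A consequence of the duplicated generators in the splitters above: calling split twice on the same splitter now returns identical folds. A sketch:

```ruby
require 'rumale'

# Ten samples, two balanced classes, shuffled stratified 2-fold CV.
x = Numo::DFloat.new(10, 2).rand
y = Numo::Int32.asarray([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
kf = Rumale::ModelSelection::StratifiedKFold.new(n_splits: 2, shuffle: true, random_seed: 1)
puts(kf.split(x, y) == kf.split(x, y)) # => true as of 0.12.0
```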
data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb
CHANGED
@@ -11,7 +11,7 @@ module Rumale
     #
     # @example
     #   estimator =
-    #     Rumale::NearestNeighbors::KNeighborsClassifier.new(n_neighbors
+    #     Rumale::NearestNeighbors::KNeighborsClassifier.new(n_neighbors: 5)
     #   estimator.fit(training_samples, training_labels)
     #   results = estimator.predict(testing_samples)
     #
data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb
CHANGED
@@ -10,7 +10,7 @@ module Rumale
     #
     # @example
     #   estimator =
-    #     Rumale::NearestNeighbors::KNeighborsRegressor.new(n_neighbors
+    #     Rumale::NearestNeighbors::KNeighborsRegressor.new(n_neighbors: 5)
     #   estimator.fit(training_samples, training_target_values)
     #   results = estimator.predict(testing_samples)
     #
data/lib/rumale/pipeline/pipeline.rb
CHANGED
@@ -9,7 +9,7 @@ module Rumale
     # Pipeline is a class that implements the function to perform the transformers and estimators sequentially.
     #
     # @example
-    #   rbf = Rumale::KernelApproximation::RBF.new(gamma: 1.0,
+    #   rbf = Rumale::KernelApproximation::RBF.new(gamma: 1.0, n_components: 128, random_seed: 1)
     #   svc = Rumale::LinearModel::SVC.new(reg_param: 1.0, fit_bias: true, max_iter: 5000, random_seed: 1)
     #   pipeline = Rumale::Pipeline::Pipeline.new(steps: { trs: rbf, est: svc })
     #   pipeline.fit(training_samples, training_labels)
data/lib/rumale/polynomial_model/base_factorization_machine.rb
CHANGED
@@ -51,7 +51,7 @@ module Rumale
       def partial_fit(x, y)
         # Initialize some variables.
         n_samples, n_features = x.shape
-        rand_ids = [*0...n_samples].shuffle(random: @rng)
+        rand_ids = [*0...n_samples].shuffle(random: @rng.dup)
         weight_vec = Numo::DFloat.zeros(n_features + 1)
         factor_mat = Numo::DFloat.zeros(@params[:n_factors], n_features)
         weight_optimizer = @params[:optimizer].dup
data/lib/rumale/tree/extra_tree_classifier.rb
CHANGED
@@ -104,7 +104,7 @@ module Rumale
       private
 
       def best_split(features, y, whole_impurity)
-        threshold = @rng.rand(features.min..features.max)
+        threshold = @sub_rng.rand(features.min..features.max)
         l_ids = features.le(threshold).where
         r_ids = features.gt(threshold).where
         l_impurity = l_ids.empty? ? 0.0 : impurity(y[l_ids, true])
data/lib/rumale/tree/extra_tree_regressor.rb
CHANGED
@@ -91,7 +91,7 @@ module Rumale
       private
 
       def best_split(features, y, whole_impurity)
-        threshold = @rng.rand(features.min..features.max)
+        threshold = @sub_rng.rand(features.min..features.max)
         l_ids = features.le(threshold).where
         r_ids = features.gt(threshold).where
         l_impurity = l_ids.empty? ? 0.0 : impurity(y[l_ids, true])
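An extra-tree split draws a single threshold uniformly from the feature's observed range and partitions around it. A runnable sketch with toy values:

```ruby
require 'numo/narray'

# One uniform threshold over [features.min, features.max], then a
# boolean partition into left (<= threshold) and right (> threshold).
features = Numo::DFloat[0.2, 0.8, 0.5, 0.9]
sub_rng = Random.new(1)
threshold = sub_rng.rand(features.min..features.max)
l_ids = features.le(threshold).where
r_ids = features.gt(threshold).where
p [threshold, l_ids.to_a, r_ids.to_a]
```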
data/lib/rumale/tree/gradient_tree_regressor.rb
CHANGED
@@ -93,6 +93,7 @@ module Rumale
       @n_leaves = 0
       @leaf_weights = []
       @feature_importances = Numo::DFloat.zeros(n_features)
+      @sub_rng = @rng.dup
       # Build tree.
       build_tree(x, y, g, h)
       @leaf_weights = Numo::DFloat[*@leaf_weights]
@@ -221,7 +222,7 @@ module Rumale
       end
 
       def rand_ids(n)
-        [*0...n].sample(@params[:max_features], random: @rng)
+        [*0...n].sample(@params[:max_features], random: @sub_rng)
       end
     end
   end
data/lib/rumale/utils.rb
CHANGED
@@ -22,8 +22,12 @@ module Rumale
     # @!visibility private
     def rand_uniform(shape, rng = nil)
       rng ||= Random.new
-      rnd_vals = Array.new(shape.inject(:*)) { rng.rand }
-      Numo::DFloat.asarray(rnd_vals).reshape(shape[0], shape[1])
+      if shape.is_a?(Array)
+        rnd_vals = Array.new(shape.inject(:*)) { rng.rand }
+        Numo::DFloat.asarray(rnd_vals).reshape(shape[0], shape[1])
+      else
+        Numo::DFloat.asarray(Array.new(shape) { rng.rand })
+      end
     end
 
     # @!visibility private
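A sketch of the patched helper (callable despite the @!visibility private tag, since Rumale::Utils uses module_function): copies of one generator return identical matrices, and an Integer shape now yields a vector:

```ruby
require 'rumale'

# Two calls on duplicated copies of the same RNG produce equal matrices.
rng = Random.new(42)
a = Rumale::Utils.rand_uniform([2, 3], rng.dup)
b = Rumale::Utils.rand_uniform([2, 3], rng.dup)
puts((a - b).abs.max) # => 0.0
p Rumale::Utils.rand_uniform(4, rng).shape # => [4], via the new else branch
```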
data/lib/rumale/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rumale
 version: !ruby/object:Gem::Version
-  version: 0.11.0
+  version: 0.12.0
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-
+date: 2019-06-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: numo-narray
|