rumale 0.8.1 → 0.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +5 -0
- data/CHANGELOG.md +6 -0
- data/README.md +7 -0
- data/lib/rumale.rb +3 -0
- data/lib/rumale/model_selection/k_fold.rb +4 -0
- data/lib/rumale/model_selection/shuffle_split.rb +91 -0
- data/lib/rumale/model_selection/stratified_k_fold.rb +4 -0
- data/lib/rumale/model_selection/stratified_shuffle_split.rb +115 -0
- data/lib/rumale/optimizer/adam.rb +77 -0
- data/lib/rumale/version.rb +2 -1
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dba389e77a984b46e5352a2b4aae15f8eec2362d
|
4
|
+
data.tar.gz: 2eab0f18fc0e4b16af317bfa7b81db8203c62a20
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 034b0fc6f79ed66af3a50d025e66f17a3815c0c0e0634bd3eccec19546d585b17f158e376700aafa3ae89d52a895efefd79ff048d61b9f87dabe51f72393b75f
|
7
|
+
data.tar.gz: 4124f95f72392af658b342d7c21526717417de246ce99f4ee857cc4d26e799dc4a7ea0bbb59aa45c6e8621178ee84a4a4ead426aa79f09e62bf903e273cdf05b
|
data/.rubocop.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
# 0.8.2
|
2
|
+
- Add class for Adam optimizer.
|
3
|
+
- Add data splitter classes for random permutation cross validation.
|
4
|
+
- Add accessor method for number of splits to K-fold splitter classes.
|
5
|
+
- Add execution result of example script on README ([#3](https://github.com/yoshoku/rumale/pull/3)).
|
6
|
+
|
1
7
|
# 0.8.1
|
2
8
|
- Add some evaluator classes.
|
3
9
|
- MeanSquaredLogError
|
data/README.md
CHANGED
@@ -121,6 +121,13 @@ mean_logloss = report[:test_score].inject(:+) / kf.n_splits
|
|
121
121
|
puts("5-CV mean log-loss: %.3f" % mean_logloss)
|
122
122
|
```
|
123
123
|
|
124
|
+
Executing the above script produces the following result.
|
125
|
+
|
126
|
+
```bash
|
127
|
+
$ ruby cross_validation.rb
|
128
|
+
5-CV mean log-loss: 0.476
|
129
|
+
```
|
130
|
+
|
124
131
|
### Example 3. Pipeline
|
125
132
|
|
126
133
|
```ruby
|
data/lib/rumale.rb
CHANGED
@@ -18,6 +18,7 @@ require 'rumale/base/splitter'
|
|
18
18
|
require 'rumale/base/evaluator'
|
19
19
|
require 'rumale/optimizer/sgd'
|
20
20
|
require 'rumale/optimizer/rmsprop'
|
21
|
+
require 'rumale/optimizer/adam'
|
21
22
|
require 'rumale/optimizer/nadam'
|
22
23
|
require 'rumale/optimizer/yellow_fin'
|
23
24
|
require 'rumale/pipeline/pipeline'
|
@@ -56,6 +57,8 @@ require 'rumale/preprocessing/label_encoder'
|
|
56
57
|
require 'rumale/preprocessing/one_hot_encoder'
|
57
58
|
require 'rumale/model_selection/k_fold'
|
58
59
|
require 'rumale/model_selection/stratified_k_fold'
|
60
|
+
require 'rumale/model_selection/shuffle_split'
|
61
|
+
require 'rumale/model_selection/stratified_shuffle_split'
|
59
62
|
require 'rumale/model_selection/cross_validation'
|
60
63
|
require 'rumale/model_selection/grid_search_cv'
|
61
64
|
require 'rumale/evaluation_measure/accuracy'
|
@@ -0,0 +1,91 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/splitter'
|
4
|
+
|
5
|
+
module Rumale
|
6
|
+
module ModelSelection
|
7
|
+
# ShuffleSplit is a class that generates the set of data indices for random permutation cross-validation.
|
8
|
+
#
|
9
|
+
# @example
|
10
|
+
# ss = Rumale::ModelSelection::ShuffleSplit.new(n_splits: 3, test_size: 0.2, random_seed: 1)
|
11
|
+
# ss.split(samples, labels).each do |train_ids, test_ids|
|
12
|
+
# train_samples = samples[train_ids, true]
|
13
|
+
# test_samples = samples[test_ids, true]
|
14
|
+
# ...
|
15
|
+
# end
|
16
|
+
#
|
17
|
+
class ShuffleSplit
|
18
|
+
include Base::Splitter
|
19
|
+
|
20
|
+
# Return the number of folds.
|
21
|
+
# @return [Integer]
|
22
|
+
attr_reader :n_splits
|
23
|
+
|
24
|
+
# Return the random generator for shuffling the dataset.
|
25
|
+
# @return [Random]
|
26
|
+
attr_reader :rng
|
27
|
+
|
28
|
+
# Create a new data splitter for random permutation cross validation.
|
29
|
+
#
|
30
|
+
# @param n_splits [Integer] The number of folds.
|
31
|
+
# @param test_size [Float] The ratio of number of samples for test data.
|
32
|
+
# @param train_size [Float] The ratio of number of samples for train data.
|
33
|
+
# @param random_seed [Integer] The seed value used to initialize the random generator.
|
34
|
+
def initialize(n_splits: 3, test_size: 0.1, train_size: nil, random_seed: nil)
|
35
|
+
check_params_integer(n_splits: n_splits)
|
36
|
+
check_params_float(test_size: test_size)
|
37
|
+
check_params_type_or_nil(Float, train_size: train_size)
|
38
|
+
check_params_type_or_nil(Integer, random_seed: random_seed)
|
39
|
+
check_params_positive(n_splits: n_splits)
|
40
|
+
check_params_positive(test_size: test_size)
|
41
|
+
check_params_positive(train_size: train_size) unless train_size.nil?
|
42
|
+
@n_splits = n_splits
|
43
|
+
@test_size = test_size
|
44
|
+
@train_size = train_size
|
45
|
+
@random_seed = random_seed
|
46
|
+
@random_seed ||= srand
|
47
|
+
@rng = Random.new(@random_seed)
|
48
|
+
end
|
49
|
+
|
50
|
+
# Generate data indices for random permutation cross validation.
|
51
|
+
#
|
52
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features])
|
53
|
+
# The dataset to be used to generate data indices for random permutation cross validation.
|
54
|
+
# @return [Array] The set of data indices for constructing the training and testing dataset in each fold.
|
55
|
+
def split(x, _y = nil)
|
56
|
+
check_sample_array(x)
|
57
|
+
# Initialize and check some variables.
|
58
|
+
n_samples = x.shape[0]
|
59
|
+
n_test_samples = (@test_size * n_samples).to_i
|
60
|
+
n_train_samples = @train_size.nil? ? n_samples - n_test_samples : (@train_size * n_samples).to_i
|
61
|
+
unless @n_splits.between?(1, n_samples)
|
62
|
+
raise ArgumentError,
|
63
|
+
'The value of n_splits must be not less than 1 and not more than the number of samples.'
|
64
|
+
end
|
65
|
+
unless n_test_samples.between?(1, n_samples)
|
66
|
+
raise RangeError,
|
67
|
+
'The number of sample in test split must be not less than 1 and not more than the number of samples.'
|
68
|
+
end
|
69
|
+
unless n_train_samples.between?(1, n_samples)
|
70
|
+
raise RangeError,
|
71
|
+
'The number of sample in train split must be not less than 1 and not more than the number of samples.'
|
72
|
+
end
|
73
|
+
if (n_test_samples + n_train_samples) > n_samples
|
74
|
+
raise RangeError,
|
75
|
+
'The total number of samples in test split and train split must be not more than the number of samples.'
|
76
|
+
end
|
77
|
+
# Returns array consisting of the training and testing ids for each fold.
|
78
|
+
dataset_ids = [*0...n_samples]
|
79
|
+
Array.new(@n_splits) do
|
80
|
+
test_ids = dataset_ids.sample(n_test_samples, random: @rng)
|
81
|
+
train_ids = if @train_size.nil?
|
82
|
+
dataset_ids - test_ids
|
83
|
+
else
|
84
|
+
(dataset_ids - test_ids).sample(n_train_samples, random: @rng)
|
85
|
+
end
|
86
|
+
[train_ids, test_ids]
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
@@ -18,6 +18,10 @@ module Rumale
|
|
18
18
|
class StratifiedKFold
|
19
19
|
include Base::Splitter
|
20
20
|
|
21
|
+
# Return the number of folds.
|
22
|
+
# @return [Integer]
|
23
|
+
attr_reader :n_splits
|
24
|
+
|
21
25
|
# Return the flag indicating whether to shuffle the dataset.
|
22
26
|
# @return [Boolean]
|
23
27
|
attr_reader :shuffle
|
@@ -0,0 +1,115 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/splitter'
|
4
|
+
|
5
|
+
module Rumale
|
6
|
+
module ModelSelection
|
7
|
+
# StratifiedShuffleSplit is a class that generates the set of data indices for random permutation cross-validation.
|
8
|
+
# The proportion of the number of samples in each class will be almost equal for each fold.
|
9
|
+
#
|
10
|
+
# @example
|
11
|
+
# ss = Rumale::ModelSelection::StratifiedShuffleSplit.new(n_splits: 3, test_size: 0.2, random_seed: 1)
|
12
|
+
# ss.split(samples, labels).each do |train_ids, test_ids|
|
13
|
+
# train_samples = samples[train_ids, true]
|
14
|
+
# test_samples = samples[test_ids, true]
|
15
|
+
# ...
|
16
|
+
# end
|
17
|
+
#
|
18
|
+
class StratifiedShuffleSplit
|
19
|
+
include Base::Splitter
|
20
|
+
|
21
|
+
# Return the number of folds.
|
22
|
+
# @return [Integer]
|
23
|
+
attr_reader :n_splits
|
24
|
+
|
25
|
+
# Return the random generator for shuffling the dataset.
|
26
|
+
# @return [Random]
|
27
|
+
attr_reader :rng
|
28
|
+
|
29
|
+
# Create a new data splitter for random permutation cross validation.
|
30
|
+
#
|
31
|
+
# @param n_splits [Integer] The number of folds.
|
32
|
+
# @param test_size [Float] The ratio of number of samples for test data.
|
33
|
+
# @param train_size [Float] The ratio of number of samples for train data.
|
34
|
+
# @param random_seed [Integer] The seed value used to initialize the random generator.
|
35
|
+
def initialize(n_splits: 3, test_size: 0.1, train_size: nil, random_seed: nil)
|
36
|
+
check_params_integer(n_splits: n_splits)
|
37
|
+
check_params_float(test_size: test_size)
|
38
|
+
check_params_type_or_nil(Float, train_size: train_size)
|
39
|
+
check_params_type_or_nil(Integer, random_seed: random_seed)
|
40
|
+
check_params_positive(n_splits: n_splits)
|
41
|
+
check_params_positive(test_size: test_size)
|
42
|
+
check_params_positive(train_size: train_size) unless train_size.nil?
|
43
|
+
@n_splits = n_splits
|
44
|
+
@test_size = test_size
|
45
|
+
@train_size = train_size
|
46
|
+
@random_seed = random_seed
|
47
|
+
@random_seed ||= srand
|
48
|
+
@rng = Random.new(@random_seed)
|
49
|
+
end
|
50
|
+
|
51
|
+
# Generate data indices for stratified random permutation cross validation.
|
52
|
+
#
|
53
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features])
|
54
|
+
# The dataset to be used to generate data indices for stratified random permutation cross validation.
|
55
|
+
# This argument exists to unify the interface with the K-fold methods; it is only validated and is not used to generate the indices.
|
56
|
+
# @param y [Numo::Int32] (shape: [n_samples])
|
57
|
+
# The labels to be used to generate data indices for stratified random permutation cross validation.
|
58
|
+
# @return [Array] The set of data indices for constructing the training and testing dataset in each fold.
|
59
|
+
def split(x, y)
|
60
|
+
check_sample_array(x)
|
61
|
+
check_label_array(y)
|
62
|
+
check_sample_label_size(x, y)
|
63
|
+
# Initialize and check some variables.
|
64
|
+
train_sz = @train_size.nil? ? 1.0 - @test_size : @train_size
|
65
|
+
# Check the number of samples in each class.
|
66
|
+
unless valid_n_splits?(y)
|
67
|
+
raise ArgumentError,
|
68
|
+
'The value of n_splits must be not less than 1 and not more than the number of samples in each class.'
|
69
|
+
end
|
70
|
+
unless enough_data_size_each_class?(y, @test_size)
|
71
|
+
raise RangeError,
|
72
|
+
'The number of sample in test split must be not less than 1 and not more than the number of samples in each class.'
|
73
|
+
end
|
74
|
+
unless enough_data_size_each_class?(y, train_sz)
|
75
|
+
raise RangeError,
|
76
|
+
'The number of sample in train split must be not less than 1 and not more than the number of samples in each class.'
|
77
|
+
end
|
78
|
+
unless enough_data_size_each_class?(y, train_sz + @test_size)
|
79
|
+
raise RangeError,
|
80
|
+
'The total number of samples in test split and train split must be not more than the number of samples in each class.'
|
81
|
+
end
|
82
|
+
# Returns array consisting of the training and testing ids for each fold.
|
83
|
+
sample_ids_each_class = y.to_a.uniq.map { |label| y.eq(label).where.to_a }
|
84
|
+
Array.new(@n_splits) do
|
85
|
+
train_ids = []
|
86
|
+
test_ids = []
|
87
|
+
sample_ids_each_class.each do |sample_ids|
|
88
|
+
n_samples = sample_ids.size
|
89
|
+
n_test_samples = (@test_size * n_samples).to_i
|
90
|
+
n_train_samples = (train_sz * n_samples).to_i
|
91
|
+
test_ids += sample_ids.sample(n_test_samples, random: @rng)
|
92
|
+
train_ids += if @train_size.nil?
|
93
|
+
sample_ids - test_ids
|
94
|
+
else
|
95
|
+
(sample_ids - test_ids).sample(n_train_samples, random: @rng)
|
96
|
+
end
|
97
|
+
end
|
98
|
+
[train_ids, test_ids]
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
private
|
103
|
+
|
104
|
+
def valid_n_splits?(y)
|
105
|
+
y.to_a.uniq.map { |label| y.eq(label).where.size }.all? { |n_samples| @n_splits.between?(1, n_samples) }
|
106
|
+
end
|
107
|
+
|
108
|
+
def enough_data_size_each_class?(y, data_size)
|
109
|
+
y.to_a.uniq.map { |label| y.eq(label).where.size }.all? do |n_samples|
|
110
|
+
(data_size * n_samples).to_i.between?(1, n_samples)
|
111
|
+
end
|
112
|
+
end
|
113
|
+
end
|
114
|
+
end
|
115
|
+
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/validation'
|
4
|
+
require 'rumale/base/base_estimator'
|
5
|
+
|
6
|
+
module Rumale
|
7
|
+
module Optimizer
|
8
|
+
# Adam is a class that implements Adam optimizer.
|
9
|
+
#
|
10
|
+
# @example
|
11
|
+
# optimizer = Rumale::Optimizer::Adam.new(learning_rate: 0.01, decay1: 0.9, decay2: 0.999)
|
12
|
+
# estimator = Rumale::LinearModel::LinearRegression.new(optimizer: optimizer, random_seed: 1)
|
13
|
+
# estimator.fit(samples, values)
|
14
|
+
#
|
15
|
+
# *Reference*
|
16
|
+
# - D. P. Kingma and J. Ba, "Adam: A Method for Stochastic Optimization," Proc. ICLR'15, 2015.
|
17
|
+
class Adam
|
18
|
+
include Base::BaseEstimator
|
19
|
+
include Validation
|
20
|
+
|
21
|
+
# Create a new optimizer with Adam
|
22
|
+
#
|
23
|
+
# @param learning_rate [Float] The initial value of learning rate.
|
24
|
+
# @param decay1 [Float] The smoothing parameter for the first moment.
|
25
|
+
# @param decay2 [Float] The smoothing parameter for the second moment.
|
26
|
+
def initialize(learning_rate: 0.001, decay1: 0.9, decay2: 0.999)
|
27
|
+
check_params_float(learning_rate: learning_rate, decay1: decay1, decay2: decay2)
|
28
|
+
check_params_positive(learning_rate: learning_rate, decay1: decay1, decay2: decay2)
|
29
|
+
@params = {}
|
30
|
+
@params[:learning_rate] = learning_rate
|
31
|
+
@params[:decay1] = decay1
|
32
|
+
@params[:decay2] = decay2
|
33
|
+
@fst_moment = nil
|
34
|
+
@sec_moment = nil
|
35
|
+
@iter = 0
|
36
|
+
end
|
37
|
+
|
38
|
+
# Calculate the updated weight with Adam adaptive learning rate.
|
39
|
+
#
|
40
|
+
# @param weight [Numo::DFloat] (shape: [n_features]) The weight to be updated.
|
41
|
+
# @param gradient [Numo::DFloat] (shape: [n_features]) The gradient for updating the weight.
|
42
|
+
# @return [Numo::DFloat] (shape: [n_features]) The updated weight.
|
43
|
+
def call(weight, gradient)
|
44
|
+
@fst_moment ||= Numo::DFloat.zeros(weight.shape[0])
|
45
|
+
@sec_moment ||= Numo::DFloat.zeros(weight.shape[0])
|
46
|
+
|
47
|
+
@iter += 1
|
48
|
+
|
49
|
+
@fst_moment = @params[:decay1] * @fst_moment + (1.0 - @params[:decay1]) * gradient
|
50
|
+
@sec_moment = @params[:decay2] * @sec_moment + (1.0 - @params[:decay2]) * gradient**2
|
51
|
+
nm_fst_moment = @fst_moment / (1.0 - @params[:decay1]**@iter)
|
52
|
+
nm_sec_moment = @sec_moment / (1.0 - @params[:decay2]**@iter)
|
53
|
+
|
54
|
+
weight - @params[:learning_rate] * nm_fst_moment / (nm_sec_moment**0.5 + 1e-8)
|
55
|
+
end
|
56
|
+
|
57
|
+
# Dump marshal data.
|
58
|
+
# @return [Hash] The marshal data.
|
59
|
+
def marshal_dump
|
60
|
+
{ params: @params,
|
61
|
+
fst_moment: @fst_moment,
|
62
|
+
sec_moment: @sec_moment,
|
63
|
+
iter: @iter }
|
64
|
+
end
|
65
|
+
|
66
|
+
# Load marshal data.
|
67
|
+
# @return [nil]
|
68
|
+
def marshal_load(obj)
|
69
|
+
@params = obj[:params]
|
70
|
+
@fst_moment = obj[:fst_moment]
|
71
|
+
@sec_moment = obj[:sec_moment]
|
72
|
+
@iter = obj[:iter]
|
73
|
+
nil
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
data/lib/rumale/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rumale
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.
|
4
|
+
version: 0.8.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-03-
|
11
|
+
date: 2019-03-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: numo-narray
|
@@ -152,11 +152,14 @@ files:
|
|
152
152
|
- lib/rumale/model_selection/cross_validation.rb
|
153
153
|
- lib/rumale/model_selection/grid_search_cv.rb
|
154
154
|
- lib/rumale/model_selection/k_fold.rb
|
155
|
+
- lib/rumale/model_selection/shuffle_split.rb
|
155
156
|
- lib/rumale/model_selection/stratified_k_fold.rb
|
157
|
+
- lib/rumale/model_selection/stratified_shuffle_split.rb
|
156
158
|
- lib/rumale/multiclass/one_vs_rest_classifier.rb
|
157
159
|
- lib/rumale/naive_bayes/naive_bayes.rb
|
158
160
|
- lib/rumale/nearest_neighbors/k_neighbors_classifier.rb
|
159
161
|
- lib/rumale/nearest_neighbors/k_neighbors_regressor.rb
|
162
|
+
- lib/rumale/optimizer/adam.rb
|
160
163
|
- lib/rumale/optimizer/nadam.rb
|
161
164
|
- lib/rumale/optimizer/rmsprop.rb
|
162
165
|
- lib/rumale/optimizer/sgd.rb
|