rumale 0.8.1 → 0.8.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: dab9c67aa39f19e73859d41013363b4f3811142e
4
- data.tar.gz: 49b1d14b9261f2ede4dc97b4353efdf9032872d2
3
+ metadata.gz: dba389e77a984b46e5352a2b4aae15f8eec2362d
4
+ data.tar.gz: 2eab0f18fc0e4b16af317bfa7b81db8203c62a20
5
5
  SHA512:
6
- metadata.gz: 1d2b62e0660586f4ace811f06bdd73e9ff5adb877682a9ae53e29c76188b6fe1215d1305953d292668dcc4f0b5fba71399ad746e8976572a19bbd6a4c1153829
7
- data.tar.gz: ce075327208560af72f0b54d7113b39a23c98c1828a3f5c3f5b28b3d987be3d81af465aae943aa27f2883cc671337b1952b4f8d8981a970c3727e0c94affbef5
6
+ metadata.gz: 034b0fc6f79ed66af3a50d025e66f17a3815c0c0e0634bd3eccec19546d585b17f158e376700aafa3ae89d52a895efefd79ff048d61b9f87dabe51f72393b75f
7
+ data.tar.gz: 4124f95f72392af658b342d7c21526717417de246ce99f4ee857cc4d26e799dc4a7ea0bbb59aa45c6e8621178ee84a4a4ead426aa79f09e62bf903e273cdf05b
@@ -4,6 +4,11 @@ AllCops:
4
4
  TargetRubyVersion: 2.3
5
5
  DisplayCopNames: true
6
6
  DisplayStyleGuide: true
7
+ Exclude:
8
+ - 'bin/*'
9
+ - 'rumale.gemspec'
10
+ - 'Rakefile'
11
+ - 'Gemfile'
7
12
 
8
13
  Documentation:
9
14
  Enabled: false
@@ -1,3 +1,9 @@
1
+ # 0.8.2
2
+ - Add class for Adam optimizer.
3
+ - Add data splitter classes for random permutation cross validation.
4
+ - Add accessor method for number of splits to K-fold splitter classes.
5
+ - Add execution result of example script on README ([#3](https://github.com/yoshoku/rumale/pull/3)).
6
+
1
7
  # 0.8.1
2
8
  - Add some evaluator classes.
3
9
  - MeanSquaredLogError
data/README.md CHANGED
@@ -121,6 +121,13 @@ mean_logloss = report[:test_score].inject(:+) / kf.n_splits
121
121
  puts("5-CV mean log-loss: %.3f" % mean_logloss)
122
122
  ```
123
123
 
124
+ Executing the above script produces the following result.
125
+
126
+ ```bash
127
+ $ ruby cross_validation.rb
128
+ 5-CV mean log-loss: 0.476
129
+ ```
130
+
124
131
  ### Example 3. Pipeline
125
132
 
126
133
  ```ruby
@@ -18,6 +18,7 @@ require 'rumale/base/splitter'
18
18
  require 'rumale/base/evaluator'
19
19
  require 'rumale/optimizer/sgd'
20
20
  require 'rumale/optimizer/rmsprop'
21
+ require 'rumale/optimizer/adam'
21
22
  require 'rumale/optimizer/nadam'
22
23
  require 'rumale/optimizer/yellow_fin'
23
24
  require 'rumale/pipeline/pipeline'
@@ -56,6 +57,8 @@ require 'rumale/preprocessing/label_encoder'
56
57
  require 'rumale/preprocessing/one_hot_encoder'
57
58
  require 'rumale/model_selection/k_fold'
58
59
  require 'rumale/model_selection/stratified_k_fold'
60
+ require 'rumale/model_selection/shuffle_split'
61
+ require 'rumale/model_selection/stratified_shuffle_split'
59
62
  require 'rumale/model_selection/cross_validation'
60
63
  require 'rumale/model_selection/grid_search_cv'
61
64
  require 'rumale/evaluation_measure/accuracy'
@@ -18,6 +18,10 @@ module Rumale
18
18
  class KFold
19
19
  include Base::Splitter
20
20
 
21
+ # Return the number of folds.
22
+ # @return [Integer]
23
+ attr_reader :n_splits
24
+
21
25
  # Return the flag indicating whether to shuffle the dataset.
22
26
  # @return [Boolean]
23
27
  attr_reader :shuffle
@@ -0,0 +1,91 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/base/splitter'
4
+
5
+ module Rumale
6
+ module ModelSelection
7
+ # ShuffleSplit is a class that generates the set of data indices for random permutation cross-validation.
8
+ #
9
+ # @example
10
+ # ss = Rumale::ModelSelection::ShuffleSplit.new(n_splits: 3, test_size: 0.2, random_seed: 1)
11
+ # ss.split(samples, labels).each do |train_ids, test_ids|
12
+ # train_samples = samples[train_ids, true]
13
+ # test_samples = samples[test_ids, true]
14
+ # ...
15
+ # end
16
+ #
17
+ class ShuffleSplit
18
+ include Base::Splitter
19
+
20
+ # Return the number of folds.
21
+ # @return [Integer]
22
+ attr_reader :n_splits
23
+
24
+ # Return the random generator for shuffling the dataset.
25
+ # @return [Random]
26
+ attr_reader :rng
27
+
28
+ # Create a new data splitter for random permutation cross validation.
29
+ #
30
+ # @param n_splits [Integer] The number of folds.
31
+ # @param test_size [Float] The ratio of number of samples for test data.
32
+ # @param train_size [Float] The ratio of number of samples for train data.
33
+ # @param random_seed [Integer] The seed value used to initialize the random generator.
34
+ def initialize(n_splits: 3, test_size: 0.1, train_size: nil, random_seed: nil)
35
+ check_params_integer(n_splits: n_splits)
36
+ check_params_float(test_size: test_size)
37
+ check_params_type_or_nil(Float, train_size: train_size)
38
+ check_params_type_or_nil(Integer, random_seed: random_seed)
39
+ check_params_positive(n_splits: n_splits)
40
+ check_params_positive(test_size: test_size)
41
+ check_params_positive(train_size: train_size) unless train_size.nil?
42
+ @n_splits = n_splits
43
+ @test_size = test_size
44
+ @train_size = train_size
45
+ @random_seed = random_seed
46
+ @random_seed ||= srand
47
+ @rng = Random.new(@random_seed)
48
+ end
49
+
50
+ # Generate data indices for random permutation cross validation.
51
+ #
52
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features])
53
+ # The dataset to be used to generate data indices for random permutation cross validation.
54
+ # @return [Array] The set of data indices for constructing the training and testing dataset in each fold.
55
+ def split(x, _y = nil)
56
+ check_sample_array(x)
57
+ # Initialize and check some variables.
58
+ n_samples = x.shape[0]
59
+ n_test_samples = (@test_size * n_samples).to_i
60
+ n_train_samples = @train_size.nil? ? n_samples - n_test_samples : (@train_size * n_samples).to_i
61
+ unless @n_splits.between?(1, n_samples)
62
+ raise ArgumentError,
63
+ 'The value of n_splits must be not less than 1 and not more than the number of samples.'
64
+ end
65
+ unless n_test_samples.between?(1, n_samples)
66
+ raise RangeError,
67
+ 'The number of sample in test split must be not less than 1 and not more than the number of samples.'
68
+ end
69
+ unless n_train_samples.between?(1, n_samples)
70
+ raise RangeError,
71
+ 'The number of sample in train split must be not less than 1 and not more than the number of samples.'
72
+ end
73
+ if (n_test_samples + n_train_samples) > n_samples
74
+ raise RangeError,
75
+ 'The total number of samples in test split and train split must be not more than the number of samples.'
76
+ end
77
+ # Returns array consisting of the training and testing ids for each fold.
78
+ dataset_ids = [*0...n_samples]
79
+ Array.new(@n_splits) do
80
+ test_ids = dataset_ids.sample(n_test_samples, random: @rng)
81
+ train_ids = if @train_size.nil?
82
+ dataset_ids - test_ids
83
+ else
84
+ (dataset_ids - test_ids).sample(n_train_samples, random: @rng)
85
+ end
86
+ [train_ids, test_ids]
87
+ end
88
+ end
89
+ end
90
+ end
91
+ end
@@ -18,6 +18,10 @@ module Rumale
18
18
  class StratifiedKFold
19
19
  include Base::Splitter
20
20
 
21
+ # Return the number of folds.
22
+ # @return [Integer]
23
+ attr_reader :n_splits
24
+
21
25
  # Return the flag indicating whether to shuffle the dataset.
22
26
  # @return [Boolean]
23
27
  attr_reader :shuffle
@@ -0,0 +1,115 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/base/splitter'
4
+
5
+ module Rumale
6
+ module ModelSelection
7
+ # StratifiedShuffleSplit is a class that generates the set of data indices for random permutation cross-validation.
8
+ # The proportion of the number of samples in each class will be almost equal for each fold.
9
+ #
10
+ # @example
11
+ # ss = Rumale::ModelSelection::StratifiedShuffleSplit.new(n_splits: 3, test_size: 0.2, random_seed: 1)
12
+ # ss.split(samples, labels).each do |train_ids, test_ids|
13
+ # train_samples = samples[train_ids, true]
14
+ # test_samples = samples[test_ids, true]
15
+ # ...
16
+ # end
17
+ #
18
+ class StratifiedShuffleSplit
19
+ include Base::Splitter
20
+
21
+ # Return the number of folds.
22
+ # @return [Integer]
23
+ attr_reader :n_splits
24
+
25
+ # Return the random generator for shuffling the dataset.
26
+ # @return [Random]
27
+ attr_reader :rng
28
+
29
+ # Create a new data splitter for random permutation cross validation.
30
+ #
31
+ # @param n_splits [Integer] The number of folds.
32
+ # @param test_size [Float] The ratio of number of samples for test data.
33
+ # @param train_size [Float] The ratio of number of samples for train data.
34
+ # @param random_seed [Integer] The seed value used to initialize the random generator.
35
+ def initialize(n_splits: 3, test_size: 0.1, train_size: nil, random_seed: nil)
36
+ check_params_integer(n_splits: n_splits)
37
+ check_params_float(test_size: test_size)
38
+ check_params_type_or_nil(Float, train_size: train_size)
39
+ check_params_type_or_nil(Integer, random_seed: random_seed)
40
+ check_params_positive(n_splits: n_splits)
41
+ check_params_positive(test_size: test_size)
42
+ check_params_positive(train_size: train_size) unless train_size.nil?
43
+ @n_splits = n_splits
44
+ @test_size = test_size
45
+ @train_size = train_size
46
+ @random_seed = random_seed
47
+ @random_seed ||= srand
48
+ @rng = Random.new(@random_seed)
49
+ end
50
+
51
+ # Generate data indices for stratified random permutation cross validation.
52
+ #
53
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features])
54
+ # The dataset to be used to generate data indices for stratified random permutation cross validation.
55
+ # This argument exists to unify the interface with the other splitter classes; it is only validated and is not used to generate the indices.
56
+ # @param y [Numo::Int32] (shape: [n_samples])
57
+ # The labels to be used to generate data indices for stratified random permutation cross validation.
58
+ # @return [Array] The set of data indices for constructing the training and testing dataset in each fold.
59
+ def split(x, y)
60
+ check_sample_array(x)
61
+ check_label_array(y)
62
+ check_sample_label_size(x, y)
63
+ # Initialize and check some variables.
64
+ train_sz = @train_size.nil? ? 1.0 - @test_size : @train_size
65
+ # Check the number of samples in each class.
66
+ unless valid_n_splits?(y)
67
+ raise ArgumentError,
68
+ 'The value of n_splits must be not less than 1 and not more than the number of samples in each class.'
69
+ end
70
+ unless enough_data_size_each_class?(y, @test_size)
71
+ raise RangeError,
72
+ 'The number of sample in test split must be not less than 1 and not more than the number of samples in each class.'
73
+ end
74
+ unless enough_data_size_each_class?(y, train_sz)
75
+ raise RangeError,
76
+ 'The number of sample in train split must be not less than 1 and not more than the number of samples in each class.'
77
+ end
78
+ unless enough_data_size_each_class?(y, train_sz + @test_size)
79
+ raise RangeError,
80
+ 'The total number of samples in test split and train split must be not more than the number of samples in each class.'
81
+ end
82
+ # Returns array consisting of the training and testing ids for each fold.
83
+ sample_ids_each_class = y.to_a.uniq.map { |label| y.eq(label).where.to_a }
84
+ Array.new(@n_splits) do
85
+ train_ids = []
86
+ test_ids = []
87
+ sample_ids_each_class.each do |sample_ids|
88
+ n_samples = sample_ids.size
89
+ n_test_samples = (@test_size * n_samples).to_i
90
+ n_train_samples = (train_sz * n_samples).to_i
91
+ test_ids += sample_ids.sample(n_test_samples, random: @rng)
92
+ train_ids += if @train_size.nil?
93
+ sample_ids - test_ids
94
+ else
95
+ (sample_ids - test_ids).sample(n_train_samples, random: @rng)
96
+ end
97
+ end
98
+ [train_ids, test_ids]
99
+ end
100
+ end
101
+
102
+ private
103
+
104
+ def valid_n_splits?(y)
105
+ y.to_a.uniq.map { |label| y.eq(label).where.size }.all? { |n_samples| @n_splits.between?(1, n_samples) }
106
+ end
107
+
108
+ def enough_data_size_each_class?(y, data_size)
109
+ y.to_a.uniq.map { |label| y.eq(label).where.size }.all? do |n_samples|
110
+ (data_size * n_samples).to_i.between?(1, n_samples)
111
+ end
112
+ end
113
+ end
114
+ end
115
+ end
@@ -0,0 +1,77 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/validation'
4
+ require 'rumale/base/base_estimator'
5
+
6
+ module Rumale
7
+ module Optimizer
8
+ # Adam is a class that implements Adam optimizer.
9
+ #
10
+ # @example
11
+ # optimizer = Rumale::Optimizer::Adam.new(learning_rate: 0.01, decay1: 0.9, decay2: 0.999)
12
+ # estimator = Rumale::LinearModel::LinearRegression.new(optimizer: optimizer, random_seed: 1)
13
+ # estimator.fit(samples, values)
14
+ #
15
+ # *Reference*
16
+ # - D. P. Kingma and J. Ba, "Adam: A Method for Stochastic Optimization," Proc. ICLR'15, 2015.
17
+ class Adam
18
+ include Base::BaseEstimator
19
+ include Validation
20
+
21
+ # Create a new optimizer with Adam
22
+ #
23
+ # @param learning_rate [Float] The initial value of learning rate.
24
+ # @param decay1 [Float] The smoothing parameter for the first moment.
25
+ # @param decay2 [Float] The smoothing parameter for the second moment.
26
+ def initialize(learning_rate: 0.001, decay1: 0.9, decay2: 0.999)
27
+ check_params_float(learning_rate: learning_rate, decay1: decay1, decay2: decay2)
28
+ check_params_positive(learning_rate: learning_rate, decay1: decay1, decay2: decay2)
29
+ @params = {}
30
+ @params[:learning_rate] = learning_rate
31
+ @params[:decay1] = decay1
32
+ @params[:decay2] = decay2
33
+ @fst_moment = nil
34
+ @sec_moment = nil
35
+ @iter = 0
36
+ end
37
+
38
+ # Calculate the updated weight with Adam adaptive learning rate.
39
+ #
40
+ # @param weight [Numo::DFloat] (shape: [n_features]) The weight to be updated.
41
+ # @param gradient [Numo::DFloat] (shape: [n_features]) The gradient for updating the weight.
42
+ # @return [Numo::DFloat] (shape: [n_features]) The updated weight.
43
+ def call(weight, gradient)
44
+ @fst_moment ||= Numo::DFloat.zeros(weight.shape[0])
45
+ @sec_moment ||= Numo::DFloat.zeros(weight.shape[0])
46
+
47
+ @iter += 1
48
+
49
+ @fst_moment = @params[:decay1] * @fst_moment + (1.0 - @params[:decay1]) * gradient
50
+ @sec_moment = @params[:decay2] * @sec_moment + (1.0 - @params[:decay2]) * gradient**2
51
+ nm_fst_moment = @fst_moment / (1.0 - @params[:decay1]**@iter)
52
+ nm_sec_moment = @sec_moment / (1.0 - @params[:decay2]**@iter)
53
+
54
+ weight - @params[:learning_rate] * nm_fst_moment / (nm_sec_moment**0.5 + 1e-8)
55
+ end
56
+
57
+ # Dump marshal data.
58
+ # @return [Hash] The marshal data.
59
+ def marshal_dump
60
+ { params: @params,
61
+ fst_moment: @fst_moment,
62
+ sec_moment: @sec_moment,
63
+ iter: @iter }
64
+ end
65
+
66
+ # Load marshal data.
67
+ # @return [nil]
68
+ def marshal_load(obj)
69
+ @params = obj[:params]
70
+ @fst_moment = obj[:fst_moment]
71
+ @sec_moment = obj[:sec_moment]
72
+ @iter = obj[:iter]
73
+ nil
74
+ end
75
+ end
76
+ end
77
+ end
@@ -2,5 +2,6 @@
2
2
 
3
3
  # Rumale is a machine learning library in Ruby.
4
4
  module Rumale
5
- VERSION = '0.8.1'
5
+ # The version of Rumale you are using.
6
+ VERSION = '0.8.2'
6
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rumale
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.1
4
+ version: 0.8.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-03-08 00:00:00.000000000 Z
11
+ date: 2019-03-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: numo-narray
@@ -152,11 +152,14 @@ files:
152
152
  - lib/rumale/model_selection/cross_validation.rb
153
153
  - lib/rumale/model_selection/grid_search_cv.rb
154
154
  - lib/rumale/model_selection/k_fold.rb
155
+ - lib/rumale/model_selection/shuffle_split.rb
155
156
  - lib/rumale/model_selection/stratified_k_fold.rb
157
+ - lib/rumale/model_selection/stratified_shuffle_split.rb
156
158
  - lib/rumale/multiclass/one_vs_rest_classifier.rb
157
159
  - lib/rumale/naive_bayes/naive_bayes.rb
158
160
  - lib/rumale/nearest_neighbors/k_neighbors_classifier.rb
159
161
  - lib/rumale/nearest_neighbors/k_neighbors_regressor.rb
162
+ - lib/rumale/optimizer/adam.rb
160
163
  - lib/rumale/optimizer/nadam.rb
161
164
  - lib/rumale/optimizer/rmsprop.rb
162
165
  - lib/rumale/optimizer/sgd.rb