svmkit 0.3.1 → 0.3.2

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 558b550a373cb5cbe7c295dc589c57b6b37a697b8309fa5497e5b0da6fd83336
- data.tar.gz: ab8241d5e35446f1e7342a08fcb915bed0542a65fcaf4837c358d19699b791e9
+ metadata.gz: 93ce9c2e79ac158b4a3e988afc547b1891419eb6e6b1845156cf98eaa3cdd578
+ data.tar.gz: 4e677653deebd035cbdcd5c98529b7f4fee6804075ecab1113dbccc0bf9c65ed
  SHA512:
- metadata.gz: a4739788d141bae29fdf1baba602ba76c51299cd8f8536e1a919084d94601b5ddeb02f9128b289965c4c821a925f063da0c2eec9b360dd5190ef9b9c9f2daae5
- data.tar.gz: 1fdee6fec50ee3d995639d8c78f6e7c259456e1c0b1a916c40ec1e16ff578ecb63b4f7029029c129e6723c69bf19161e9e48589bd9954c8e8ab1ea90b777a870
+ metadata.gz: 7518039557e3c991c4a0cc112764198ed6340c8be1fa9c3fb746be21ffbb5518dd35651149cda5aba8ef52a36dfa6b17f47e1335893ae0cd1dfc5776a0e6bf8e
+ data.tar.gz: c062d9c2a7c04be82787a4d76a970855c2dc8ce0d4bf5531b6196b96873f4f8e679ee434af9a56c2ebf56ae9a8adb33387925050eb9b8964da613318f2a0e430
data/HISTORY.md CHANGED
@@ -1,3 +1,11 @@
+ # 0.3.2
+ - Add class for Factorization Machine regressor.
+ - Add class for Decision Tree regressor.
+ - Add class for Random Forest regressor.
+ - Support loading and dumping of libsvm files with multi-target variables.
+ - Fix RandomForestClassifier to require DecisionTreeClassifier.
+ - Fix some mistakes in the documentation.
+
  # 0.3.1
  - Fix bug on decision function calculation of FactorizationMachineClassifier.
  - Fix bug on weight updating process of KernelSVC.
data/lib/svmkit/dataset.rb CHANGED
@@ -33,11 +33,13 @@ module SVMKit
  # @param zero_based [Boolean] Whether the column index starts from 0 (true) or 1 (false).
  def dump_libsvm_file(data, labels, filename, zero_based: false)
  n_samples = [data.shape[0], labels.shape[0]].min
+ single_label = labels.shape[1].nil?
  label_type = detect_dtype(labels)
  value_type = detect_dtype(data)
  File.open(filename, 'w') do |file|
  n_samples.times do |n|
- file.puts(dump_libsvm_line(labels[n], data[n, true],
+ label = single_label ? labels[n] : labels[n, true].to_a
+ file.puts(dump_libsvm_line(label, data[n, true],
  label_type, value_type, zero_based))
  end
  end
@@ -47,8 +49,7 @@ module SVMKit
 
  def parse_libsvm_line(line, zero_based)
  tokens = line.split
- label = tokens.shift
- label = label.to_i.to_s == label ? label.to_i : label.to_f
+ label = parse_label(tokens.shift)
  ftvec = tokens.map do |el|
  idx, val = el.split(':')
  idx = idx.to_i - (zero_based == false ? 1 : 0)
@@ -60,6 +61,11 @@ module SVMKit
  [label, ftvec, max_idx]
  end
 
+ def parse_label(label)
+ lbl_arr = label.split(',').map { |lbl| lbl.to_i.to_s == lbl ? lbl.to_i : lbl.to_f }
+ lbl_arr.size > 1 ? lbl_arr : lbl_arr[0]
+ end
+
  def convert_to_matrix(data, n_features)
  mat = []
  data.each do |ft|
@@ -80,13 +86,21 @@ module SVMKit
  end
 
  def dump_libsvm_line(label, ftvec, label_type, value_type, zero_based)
- line = format(label_type.to_s, label)
+ line = dump_label(label, label_type.to_s)
  ftvec.to_a.each_with_index do |val, n|
  idx = n + (zero_based == false ? 1 : 0)
  line += format(" %d:#{value_type}", idx, val) if val != 0.0
  end
  line
  end
+
+ def dump_label(label, label_type_str)
+ if label.is_a?(Array)
+ label.map { |lbl| format(label_type_str, lbl) }.join(',')
+ else
+ format(label_type_str, label)
+ end
+ end
  end
  end
  end
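
Taken together, the dataset changes serialize a multi-target sample with comma-separated labels via `dump_label`, and `parse_label` splits them back into an array on load. A minimal round-trip sketch, assuming a writable working directory (the file name and values are illustrative):

```ruby
require 'svmkit'

samples = Numo::DFloat[[1.0, 2.0], [3.0, 4.0]]
targets = Numo::DFloat[[0.5, 1.5], [2.5, 3.5]] # two target variables per sample

# dump_libsvm_file now joins multi-target labels with commas,
# producing lines like "0.5,1.5 1:1 2:2".
SVMKit::Dataset.dump_libsvm_file(samples, targets, 'multi_target.t')

# load_libsvm_file routes each label through parse_label,
# recovering one row of target values per sample.
data, labels = SVMKit::Dataset.load_libsvm_file('multi_target.t')
```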
data/lib/svmkit/ensemble/random_forest_classifier.rb CHANGED
@@ -3,6 +3,7 @@
  require 'svmkit/validation'
  require 'svmkit/base/base_estimator'
  require 'svmkit/base/classifier'
+ require 'svmkit/tree/decision_tree_classifier'
 
  module SVMKit
  # This module consists of the classes that implement ensemble-based methods.
@@ -32,7 +33,7 @@ module SVMKit
  # @return [Numo::DFloat] (size: n_features)
  attr_reader :feature_importances
 
- # Return the random generator for performing random sampling in the Pegasos algorithm.
+ # Return the random generator for random selection of feature index.
  # @return [Random]
  attr_reader :rng
 
data/lib/svmkit/ensemble/random_forest_regressor.rb ADDED
@@ -0,0 +1,141 @@
+ # frozen_string_literal: true
+
+ require 'pp'
+ require 'svmkit/validation'
+ require 'svmkit/base/base_estimator'
+ require 'svmkit/base/regressor'
+ require 'svmkit/tree/decision_tree_regressor'
+
+ module SVMKit
+ module Ensemble
+ # RandomForestRegressor is a class that implements random forest for regression.
+ #
+ # @example
+ # estimator =
+ # SVMKit::Ensemble::RandomForestRegressor.new(
+ # n_estimators: 10, criterion: 'mse', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+ # estimator.fit(training_samples, training_values)
+ # results = estimator.predict(testing_samples)
+ #
+ class RandomForestRegressor
+ include Base::BaseEstimator
+ include Base::Regressor
+ include Validation
+
+ # Return the set of estimators.
+ # @return [Array<DecisionTreeRegressor>]
+ attr_reader :estimators
+
+ # Return the importance for each feature.
+ # @return [Numo::DFloat] (size: n_features)
+ attr_reader :feature_importances
+
+ # Return the random generator for random selection of feature index.
+ # @return [Random]
+ attr_reader :rng
+
+ # Create a new regressor with random forest.
+ #
+ # @param n_estimators [Integer] The number of decision trees for constructing the random forest.
+ # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'mse' and 'mae'.
+ # @param max_depth [Integer] The maximum depth of the tree.
+ # If nil is given, decision tree grows without concern for depth.
+ # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
+ # If nil is given, number of leaves is not limited.
+ # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+ # @param max_features [Integer] The number of features to consider when searching optimal split point.
+ # If nil is given, split process considers all features.
+ # @param random_seed [Integer] The seed value used to initialize the random generator.
+ # It is used to randomly determine the order of features when deciding splitting point.
+ def initialize(n_estimators: 10, criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+ max_features: nil, random_seed: nil)
+ check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+ max_features: max_features, random_seed: random_seed)
+ check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
+ check_params_string(criterion: criterion)
+ check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
+ max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
+ max_features: max_features)
+ @params = {}
+ @params[:n_estimators] = n_estimators
+ @params[:criterion] = criterion
+ @params[:max_depth] = max_depth
+ @params[:max_leaf_nodes] = max_leaf_nodes
+ @params[:min_samples_leaf] = min_samples_leaf
+ @params[:max_features] = max_features
+ @params[:random_seed] = random_seed
+ @params[:random_seed] ||= srand
+ @estimators = nil
+ @feature_importances = nil
+ @rng = Random.new(@params[:random_seed])
+ end
+
+ # Fit the model with given training data.
+ #
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+ # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+ # @return [RandomForestRegressor] The learned regressor itself.
+ def fit(x, y)
+ check_sample_array(x)
+ check_tvalue_array(y)
+ check_sample_tvalue_size(x, y)
+ # Initialize some variables.
+ n_samples, n_features = x.shape
+ @params[:max_features] ||= n_features
+ @params[:max_features] = [[1, @params[:max_features]].max, Math.sqrt(n_features).to_i].min
+ single_target = y.shape[1].nil?
+ # Construct forest.
+ @estimators = Array.new(@params[:n_estimators]) do |_n|
+ tree = Tree::DecisionTreeRegressor.new(
+ criterion: @params[:criterion], max_depth: @params[:max_depth],
+ max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+ max_features: @params[:max_features], random_seed: @params[:random_seed]
+ )
+ bootstrap_ids = Array.new(n_samples) { @rng.rand(0...n_samples) }
+ tree.fit(x[bootstrap_ids, true], single_target ? y[bootstrap_ids] : y[bootstrap_ids, true])
+ end
+ # Calculate feature importances.
+ @feature_importances = @estimators.map(&:feature_importances).reduce(&:+)
+ @feature_importances /= @feature_importances.sum
+ self
+ end
+
+ # Predict values for samples.
+ #
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+ # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted value per sample.
+ def predict(x)
+ check_sample_array(x)
+ @estimators.map { |est| est.predict(x) }.reduce(&:+) / @params[:n_estimators]
+ end
+
+ # Return the index of the leaf that each sample reached.
+ #
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to assign each leaf.
+ # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
+ def apply(x)
+ SVMKit::Validation.check_sample_array(x)
+ Numo::Int32[*Array.new(@params[:n_estimators]) { |n| @estimators[n].apply(x) }].transpose
+ end
+
+ # Dump marshal data.
+ # @return [Hash] The marshal data about RandomForestRegressor
+ def marshal_dump
+ { params: @params,
+ estimators: @estimators,
+ feature_importances: @feature_importances,
+ rng: @rng }
+ end
+
+ # Load marshal data.
+ # @return [nil]
+ def marshal_load(obj)
+ @params = obj[:params]
+ @estimators = obj[:estimators]
+ @feature_importances = obj[:feature_importances]
+ @rng = obj[:rng]
+ nil
+ end
+ end
+ end
+ end
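
Because the new regressor defines `marshal_dump` and `marshal_load`, a trained forest can be persisted with Ruby's `Marshal`. A short sketch on toy data (sizes, values, and the file name are illustrative):

```ruby
require 'svmkit'

x = Numo::DFloat.new(100, 3).rand
y = 2.0 * x[true, 0] + x[true, 1] - x[true, 2] # single-target toy values

forest = SVMKit::Ensemble::RandomForestRegressor.new(n_estimators: 5, max_depth: 4, random_seed: 1)
# Each tree is trained on a bootstrap sample; predict averages the trees' outputs.
forest.fit(x, y)

File.binwrite('forest.dat', Marshal.dump(forest))
restored = Marshal.load(File.binread('forest.dat'))
restored.predict(x[0...5, true])
```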
data/lib/svmkit/polynomial_model/factorization_machine_regressor.rb ADDED
@@ -0,0 +1,231 @@
+ # frozen_string_literal: true
+
+ require 'svmkit/validation'
+ require 'svmkit/base/base_estimator'
+ require 'svmkit/base/regressor'
+
+ module SVMKit
+ module PolynomialModel
+ # FactorizationMachineRegressor is a class that implements Factorization Machine
+ # with stochastic gradient descent (SGD) optimization.
+ #
+ # @example
+ # estimator =
+ # SVMKit::PolynomialModel::FactorizationMachineRegressor.new(
+ # n_factors: 10, reg_param_bias: 0.1, reg_param_weight: 0.1, reg_param_factor: 0.1,
+ # max_iter: 5000, batch_size: 50, random_seed: 1)
+ # estimator.fit(training_samples, training_values)
+ # results = estimator.predict(testing_samples)
+ #
+ # *Reference*
+ # - S. Rendle, "Factorization Machines with libFM," ACM Transactions on Intelligent Systems and Technology, vol. 3 (3), pp. 57:1--57:22, 2012.
+ # - S. Rendle, "Factorization Machines," Proc. the 10th IEEE International Conference on Data Mining (ICDM'10), pp. 995--1000, 2010.
+ # - I. Sutskever, J. Martens, G. Dahl, and G. Hinton, "On the importance of initialization and momentum in deep learning," Proc. the 30th International Conference on Machine Learning (ICML' 13), pp. 1139--1147, 2013.
+ # - G. Hinton, N. Srivastava, and K. Swersky, "Lecture 6e rmsprop," Neural Networks for Machine Learning, 2012.
+ class FactorizationMachineRegressor
+ include Base::BaseEstimator
+ include Base::Regressor
+ include Validation
+
+ # Return the factor matrix for Factorization Machine.
+ # @return [Numo::DFloat] (shape: [n_outputs, n_factors, n_features])
+ attr_reader :factor_mat
+
+ # Return the weight vector for Factorization Machine.
+ # @return [Numo::DFloat] (shape: [n_outputs, n_features])
+ attr_reader :weight_vec
+
+ # Return the bias term for Factorization Machine.
+ # @return [Numo::DFloat] (shape: [n_outputs])
+ attr_reader :bias_term
+
+ # Return the random generator for random sampling.
+ # @return [Random]
+ attr_reader :rng
+
+ # Create a new regressor with Factorization Machine.
+ #
+ # @param n_factors [Integer] The maximum number of factors.
+ # @param reg_param_bias [Float] The regularization parameter for bias term.
+ # @param reg_param_weight [Float] The regularization parameter for weight vector.
+ # @param reg_param_factor [Float] The regularization parameter for factor matrix.
+ # @param init_std [Float] The standard deviation of normal random number for initialization of factor matrix.
+ # @param learning_rate [Float] The learning rate for optimization.
+ # @param decay [Float] The discounting factor for RMS prop optimization.
+ # @param momentum [Float] The Nesterov momentum for optimization.
+ # @param max_iter [Integer] The maximum number of iterations.
+ # @param batch_size [Integer] The size of the mini batches.
+ # @param random_seed [Integer] The seed value used to initialize the random generator.
+ def initialize(n_factors: 2,
+ reg_param_bias: 1.0, reg_param_weight: 1.0, reg_param_factor: 1.0, init_std: 0.01,
+ learning_rate: 0.01, decay: 0.9, momentum: 0.9,
+ max_iter: 1000, batch_size: 10, random_seed: nil)
+ check_params_float(reg_param_bias: reg_param_bias, reg_param_weight: reg_param_weight,
+ reg_param_factor: reg_param_factor, init_std: init_std,
+ learning_rate: learning_rate, decay: decay, momentum: momentum)
+ check_params_integer(n_factors: n_factors, max_iter: max_iter, batch_size: batch_size)
+ check_params_type_or_nil(Integer, random_seed: random_seed)
+ check_params_positive(n_factors: n_factors, reg_param_bias: reg_param_bias,
+ reg_param_weight: reg_param_weight, reg_param_factor: reg_param_factor,
+ learning_rate: learning_rate, decay: decay, momentum: momentum,
+ max_iter: max_iter, batch_size: batch_size)
+ @params = {}
+ @params[:n_factors] = n_factors
+ @params[:reg_param_bias] = reg_param_bias
+ @params[:reg_param_weight] = reg_param_weight
+ @params[:reg_param_factor] = reg_param_factor
+ @params[:init_std] = init_std
+ @params[:learning_rate] = learning_rate
+ @params[:decay] = decay
+ @params[:momentum] = momentum
+ @params[:max_iter] = max_iter
+ @params[:batch_size] = batch_size
+ @params[:random_seed] = random_seed
+ @params[:random_seed] ||= srand
+ @factor_mat = nil
+ @weight_vec = nil
+ @bias_term = nil
+ @rng = Random.new(@params[:random_seed])
+ end
+
+ # Fit the model with given training data.
+ #
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+ # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+ # @return [FactorizationMachineRegressor] The learned regressor itself.
+ def fit(x, y)
+ check_sample_array(x)
+ check_tvalue_array(y)
+ check_sample_tvalue_size(x, y)
+
+ n_outputs = y.shape[1].nil? ? 1 : y.shape[1]
+ _n_samples, n_features = x.shape
+
+ if n_outputs > 1
+ @factor_mat = Numo::DFloat.zeros(n_outputs, @params[:n_factors], n_features)
+ @weight_vec = Numo::DFloat.zeros(n_outputs, n_features)
+ @bias_term = Numo::DFloat.zeros(n_outputs)
+ n_outputs.times do |n|
+ factor, weight, bias = single_fit(x, y[true, n])
+ @factor_mat[n, true, true] = factor
+ @weight_vec[n, true] = weight
+ @bias_term[n] = bias
+ end
+ else
+ @factor_mat, @weight_vec, @bias_term = single_fit(x, y)
+ end
+
+ self
+ end
+
+ # Predict values for samples.
+ #
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+ # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
+ def predict(x)
+ check_sample_array(x)
+ linear_term = @bias_term + x.dot(@weight_vec.transpose)
+ factor_term = if @weight_vec.shape[1].nil?
+ 0.5 * (@factor_mat.dot(x.transpose)**2 - (@factor_mat**2).dot(x.transpose**2)).sum(0)
+ else
+ 0.5 * (@factor_mat.dot(x.transpose)**2 - (@factor_mat**2).dot(x.transpose**2)).sum(1).transpose
+ end
+ linear_term + factor_term
+ end
+
+ # Dump marshal data.
+ # @return [Hash] The marshal data about FactorizationMachineRegressor
+ def marshal_dump
+ { params: @params,
+ factor_mat: @factor_mat,
+ weight_vec: @weight_vec,
+ bias_term: @bias_term,
+ rng: @rng }
+ end
+
+ # Load marshal data.
+ # @return [nil]
+ def marshal_load(obj)
+ @params = obj[:params]
+ @factor_mat = obj[:factor_mat]
+ @weight_vec = obj[:weight_vec]
+ @bias_term = obj[:bias_term]
+ @rng = obj[:rng]
+ nil
+ end
+
+ private
+
+ def single_fit(x, y)
+ # Initialize some variables.
+ n_samples, n_features = x.shape
+ rand_ids = [*0...n_samples].shuffle(random: @rng)
+ factor_mat = rand_normal([@params[:n_factors], n_features], 0, @params[:init_std])
+ factor_sqrsum = Numo::DFloat.zeros(factor_mat.shape)
+ factor_update = Numo::DFloat.zeros(factor_mat.shape)
+ weight_vec = Numo::DFloat.zeros(n_features)
+ weight_sqrsum = Numo::DFloat.zeros(n_features)
+ weight_update = Numo::DFloat.zeros(n_features)
+ bias_term = 0.0
+ bias_sqrsum = 0.0
+ bias_update = 0.0
+ # Start optimization.
+ @params[:max_iter].times do |_t|
+ # Random sampling.
+ subset_ids = rand_ids.shift(@params[:batch_size])
+ rand_ids.concat(subset_ids)
+ data = x[subset_ids, true]
+ values = y[subset_ids]
+ # Calculate gradients for loss function.
+ loss_grad = loss_gradient(data, values, factor_mat, weight_vec, bias_term)
+ next if loss_grad.ne(0.0).count.zero?
+ # Update each parameter.
+ bias_term, bias_sqrsum, bias_update =
+ update_param(bias_term, bias_sqrsum, bias_update,
+ bias_gradient(loss_grad, bias_term - @params[:momentum] * bias_update))
+ weight_vec, weight_sqrsum, weight_update =
+ update_param(weight_vec, weight_sqrsum, weight_update,
+ weight_gradient(loss_grad, data, weight_vec - @params[:momentum] * weight_update))
+ @params[:n_factors].times do |n|
+ factor_mat[n, true], factor_sqrsum[n, true], factor_update[n, true] =
+ update_param(factor_mat[n, true], factor_sqrsum[n, true], factor_update[n, true],
+ factor_gradient(loss_grad, data, factor_mat[n, true] - @params[:momentum] * factor_update[n, true]))
+ end
+ end
+ [factor_mat, weight_vec, bias_term]
+ end
+
+ def loss_gradient(x, y, factor, weight, bias)
+ z = bias + x.dot(weight) + 0.5 * (factor.dot(x.transpose)**2 - (factor**2).dot(x.transpose**2)).sum(0)
+ 2.0 * (z - y)
+ end
+
+ def bias_gradient(loss_grad, bias)
+ loss_grad.mean + @params[:reg_param_bias] * bias
+ end
+
+ def weight_gradient(loss_grad, data, weight)
+ (loss_grad.expand_dims(1) * data).mean(0) + @params[:reg_param_weight] * weight
+ end
+
+ def factor_gradient(loss_grad, data, factor)
+ (loss_grad.expand_dims(1) * (data * data.dot(factor).expand_dims(1) - factor * (data**2))).mean(0) + @params[:reg_param_factor] * factor
+ end
+
+ def update_param(param, sqrsum, update, gr)
+ new_sqrsum = @params[:decay] * sqrsum + (1.0 - @params[:decay]) * gr**2
+ new_update = (@params[:learning_rate] / ((new_sqrsum + 1.0e-8)**0.5)) * gr
+ new_param = param - (new_update + @params[:momentum] * update)
+ [new_param, new_sqrsum, new_update]
+ end
+
+ def rand_uniform(shape)
+ Numo::DFloat[*Array.new(shape.inject(&:*)) { @rng.rand }].reshape(*shape)
+ end
+
+ def rand_normal(shape, mu, sigma)
+ mu + sigma * (Numo::NMath.sqrt(-2.0 * Numo::NMath.log(rand_uniform(shape))) * Numo::NMath.sin(2.0 * Math::PI * rand_uniform(shape)))
+ end
+ end
+ end
+ end
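
`update_param` combines RMSProp gradient scaling with Nesterov-style momentum: the running average of squared gradients rescales the step, and the previous update feeds back through the momentum term. A scalar walk-through of one step under the defaults (`learning_rate: 0.01`, `decay: 0.9`, `momentum: 0.9`; the gradient value is illustrative):

```ruby
learning_rate = 0.01
decay = 0.9
momentum = 0.9

param, sqrsum, update = 1.0, 0.0, 0.0
gr = -1.2 # e.g. loss_gradient's 2.0 * (z - y) for z = 0.4, y = 1.0

# One step of the rule in update_param:
new_sqrsum = decay * sqrsum + (1.0 - decay) * gr**2            # 0.144
new_update = (learning_rate / (new_sqrsum + 1.0e-8)**0.5) * gr # ~ -0.0316
new_param  = param - (new_update + momentum * update)          # ~ 1.0316
```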
data/lib/svmkit/tree/decision_tree_classifier.rb CHANGED
@@ -3,74 +3,11 @@
  require 'svmkit/validation'
  require 'svmkit/base/base_estimator'
  require 'svmkit/base/classifier'
+ require 'svmkit/tree/node'
 
  module SVMKit
  # This module consists of the classes that implement tree models.
  module Tree
- # Node is a class that implements node used for construction of decision tree.
- # This class is used for internal data structures.
- class Node
- # @!visibility private
- attr_accessor :depth, :impurity, :n_samples, :probs, :leaf, :leaf_id, :left, :right, :feature_id, :threshold
-
- # Create a new node for decision tree.
- #
- # @param depth [Integer] The depth of the node in tree.
- # @param impurity [Float] The impurity of the node.
- # @param n_samples [Integer] The number of the samples in the node.
- # @param probs [Float] The probability of the node.
- # @param leaf [Boolean] The flag indicating whether the node is a leaf.
- # @param leaf_id [Integer] The leaf index of the node.
- # @param left [Node] The left node.
- # @param right [Node] The right node.
- # @param feature_id [Integer] The feature index used for evaluation.
- # @param threshold [Float] The threshold value of the feature for splitting the node.
- def initialize(depth: 0, impurity: 0.0, n_samples: 0, probs: 0.0,
- leaf: true, leaf_id: 0,
- left: nil, right: nil, feature_id: 0, threshold: 0.0)
- @depth = depth
- @impurity = impurity
- @n_samples = n_samples
- @probs = probs
- @leaf = leaf
- @leaf_id = leaf_id
- @left = left
- @right = right
- @feature_id = feature_id
- @threshold = threshold
- end
-
- # Dump marshal data.
- # @return [Hash] The marshal data about Node
- def marshal_dump
- { depth: @depth,
- impurity: @impurity,
- n_samples: @n_samples,
- probs: @probs,
- leaf: @leaf,
- leaf_id: @leaf_id,
- left: @left,
- right: @right,
- feature_id: @feature_id,
- threshold: @threshold }
- end
-
- # Load marshal data.
- # @return [nil]
- def marshal_load(obj)
- @depth = obj[:depth]
- @impurity = obj[:impurity]
- @n_samples = obj[:n_samples]
- @probs = obj[:probs]
- @leaf = obj[:leaf]
- @leaf_id = obj[:leaf_id]
- @left = obj[:left]
- @right = obj[:right]
- @feature_id = obj[:feature_id]
- @threshold = obj[:threshold]
- end
- end
-
  # DecisionTreeClassifier is a class that implements decision tree for classification.
  #
  # @example
@@ -96,7 +33,7 @@ module SVMKit
  # @return [Node]
  attr_reader :tree
 
- # Return the random generator for performing random sampling in the Pegasos algorithm.
+ # Return the random generator for random selection of feature index.
  # @return [Random]
  attr_reader :rng
 
data/lib/svmkit/tree/decision_tree_regressor.rb ADDED
@@ -0,0 +1,252 @@
+ # frozen_string_literal: true
+
+ require 'svmkit/validation'
+ require 'svmkit/base/base_estimator'
+ require 'svmkit/base/regressor'
+ require 'svmkit/tree/node'
+
+ module SVMKit
+ module Tree
+ # DecisionTreeRegressor is a class that implements decision tree for regression.
+ #
+ # @example
+ # estimator =
+ # SVMKit::Tree::DecisionTreeRegressor.new(
+ # max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+ # estimator.fit(training_samples, training_values)
+ # results = estimator.predict(testing_samples)
+ #
+ class DecisionTreeRegressor
+ include Base::BaseEstimator
+ include Base::Regressor
+ include Validation
+
+ # Return the importance for each feature.
+ # @return [Numo::DFloat] (size: n_features)
+ attr_reader :feature_importances
+
+ # Return the learned tree.
+ # @return [Node]
+ attr_reader :tree
+
+ # Return the random generator for random selection of feature index.
+ # @return [Random]
+ attr_reader :rng
+
+ # Return the values assigned to each leaf.
+ # @return [Numo::DFloat] (shape: [n_leaves, n_outputs])
+ attr_reader :leaf_values
+
+ # Create a new regressor with decision tree algorithm.
+ #
+ # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'mae' and 'mse'.
+ # @param max_depth [Integer] The maximum depth of the tree.
+ # If nil is given, decision tree grows without concern for depth.
+ # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
+ # If nil is given, number of leaves is not limited.
+ # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+ # @param max_features [Integer] The number of features to consider when searching optimal split point.
+ # If nil is given, split process considers all features.
+ # @param random_seed [Integer] The seed value used to initialize the random generator.
+ # It is used to randomly determine the order of features when deciding splitting point.
+ def initialize(criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
+ random_seed: nil)
+ check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+ max_features: max_features, random_seed: random_seed)
+ check_params_integer(min_samples_leaf: min_samples_leaf)
+ check_params_string(criterion: criterion)
+ check_params_positive(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+ min_samples_leaf: min_samples_leaf, max_features: max_features)
+ @params = {}
+ @params[:criterion] = criterion
+ @params[:max_depth] = max_depth
+ @params[:max_leaf_nodes] = max_leaf_nodes
+ @params[:min_samples_leaf] = min_samples_leaf
+ @params[:max_features] = max_features
+ @params[:random_seed] = random_seed
+ @params[:random_seed] ||= srand
+ @criterion = :mse
+ @criterion = :mae if @params[:criterion] == 'mae'
+ @tree = nil
+ @feature_importances = nil
+ @n_leaves = nil
+ @leaf_values = nil
+ @rng = Random.new(@params[:random_seed])
+ end
+
+ # Fit the model with given training data.
+ #
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+ # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+ # @return [DecisionTreeRegressor] The learned regressor itself.
+ def fit(x, y)
+ check_sample_array(x)
+ check_tvalue_array(y)
+ check_sample_tvalue_size(x, y)
+ single_target = y.shape[1].nil?
+ y = y.expand_dims(1) if single_target
+ n_samples, n_features = x.shape
+ @params[:max_features] = n_features if @params[:max_features].nil?
+ @params[:max_features] = [@params[:max_features], n_features].min
+ build_tree(x, y)
+ @leaf_values = @leaf_values[true] if single_target
+ eval_importance(n_samples, n_features)
+ self
+ end
+
+ # Predict values for samples.
+ #
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+ # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
+ def predict(x)
+ check_sample_array(x)
+ @leaf_values.shape[1].nil? ? @leaf_values[apply(x)] : @leaf_values[apply(x), true]
+ end
+
+ # Return the index of the leaf that each sample reached.
+ #
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to assign each leaf.
+ # @return [Numo::Int32] (shape: [n_samples]) Leaf index for sample.
+ def apply(x)
+ check_sample_array(x)
+ Numo::Int32[*(Array.new(x.shape[0]) { |n| apply_at_node(@tree, x[n, true]) })]
+ end
+
+ # Dump marshal data.
+ # @return [Hash] The marshal data about DecisionTreeRegressor
+ def marshal_dump
+ { params: @params,
+ criterion: @criterion,
+ tree: @tree,
+ feature_importances: @feature_importances,
+ leaf_values: @leaf_values,
+ rng: @rng }
+ end
+
+ # Load marshal data.
+ # @return [nil]
+ def marshal_load(obj)
+ @params = obj[:params]
+ @criterion = obj[:criterion]
+ @tree = obj[:tree]
+ @feature_importances = obj[:feature_importances]
+ @leaf_values = obj[:leaf_values]
+ @rng = obj[:rng]
+ nil
+ end
+
+ private
+
+ def apply_at_node(node, sample)
+ return node.leaf_id if node.leaf
+ return apply_at_node(node.left, sample) if node.right.nil?
+ return apply_at_node(node.right, sample) if node.left.nil?
+ if sample[node.feature_id] <= node.threshold
+ apply_at_node(node.left, sample)
+ else
+ apply_at_node(node.right, sample)
+ end
+ end
+
+ def build_tree(x, y)
+ @n_leaves = 0
+ @leaf_values = []
+ @tree = grow_node(0, x, y)
+ @leaf_values = Numo::DFloat.cast(@leaf_values)
+ nil
+ end
+
+ def grow_node(depth, x, y)
+ unless @params[:max_leaf_nodes].nil?
+ return nil if @n_leaves >= @params[:max_leaf_nodes]
+ end
+
+ n_samples, n_features = x.shape
+ return nil if n_samples <= @params[:min_samples_leaf]
+
+ node = Node.new(depth: depth, impurity: impurity(y), n_samples: n_samples)
+
+ return put_leaf(node, y) if (y - y.mean(0)).sum.abs.zero?
+
+ unless @params[:max_depth].nil?
+ return put_leaf(node, y) if depth == @params[:max_depth]
+ end
+
+ feature_id, threshold, left_ids, right_ids, max_gain =
+ rand_ids(n_features).map { |f_id| [f_id, *best_split(x[true, f_id], y)] }.max_by(&:last)
+ return put_leaf(node, y) if max_gain.nil? || max_gain.zero?
+
+ node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids, true])
+ node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids, true])
+ return put_leaf(node, y) if node.left.nil? && node.right.nil?
+
+ node.feature_id = feature_id
+ node.threshold = threshold
+ node.leaf = false
+ node
+ end
+
+ def put_leaf(node, values)
+ node.probs = nil
+ node.leaf = true
+ node.leaf_id = @n_leaves
+ @n_leaves += 1
+ @leaf_values.push(values.mean(0))
+ node
+ end
+
+ def rand_ids(n)
+ [*0...n].sample(@params[:max_features], random: @rng)
+ end
+
+ def best_split(features, values)
+ features.to_a.uniq.sort.each_cons(2).map do |l, r|
+ threshold = 0.5 * (l + r)
+ left_ids, right_ids = splited_ids(features, threshold)
+ [threshold, left_ids, right_ids, gain(values, values[left_ids], values[right_ids])]
+ end.max_by(&:last)
+ end
+
+ def splited_ids(features, threshold)
+ [features.le(threshold).where.to_a, features.gt(threshold).where.to_a]
+ end
+
+ def gain(values, values_left, values_right)
+ prob_left = values_left.shape[0].fdiv(values.shape[0])
+ prob_right = values_right.shape[0].fdiv(values.shape[0])
+ impurity(values) - prob_left * impurity(values_left) - prob_right * impurity(values_right)
+ end
+
+ def impurity(values)
+ send(@criterion, values)
+ end
+
+ def mse(values)
+ ((values - values.mean(0))**2).mean
+ end
+
+ def mae(values)
+ (values - values.mean(0)).abs.mean
+ end
+
+ def eval_importance(n_samples, n_features)
+ @feature_importances = Numo::DFloat.zeros(n_features)
+ eval_importance_at_node(@tree)
+ @feature_importances /= n_samples
+ normalizer = @feature_importances.sum
+ @feature_importances /= normalizer if normalizer > 0.0
+ nil
+ end
+
+ def eval_importance_at_node(node)
+ return nil if node.leaf
+ return nil if node.left.nil? || node.right.nil?
+ gain = node.n_samples * node.impurity -
+ node.left.n_samples * node.left.impurity - node.right.n_samples * node.right.impurity
+ @feature_importances[node.feature_id] += gain
+ eval_importance_at_node(node.left)
+ eval_importance_at_node(node.right)
+ end
+ end
+ end
+ end
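
`fit` accepts either a 1-D target vector or a 2-D `[n_samples, n_outputs]` matrix: in the multi-target case each leaf stores the per-output mean of its samples, and `predict` returns one row per sample. A brief sketch (data is illustrative):

```ruby
require 'svmkit'

x = Numo::DFloat.new(50, 2).rand
y = Numo::DFloat.new(50, 2).rand # two target variables per sample

tree = SVMKit::Tree::DecisionTreeRegressor.new(max_depth: 3, random_seed: 1)
tree.fit(x, y)

predictions = tree.predict(x) # shape: [50, 2], the leaf mean per output
leaf_ids = tree.apply(x)      # shape: [50], index of the leaf each sample reached
```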
data/lib/svmkit/tree/node.rb ADDED
@@ -0,0 +1,70 @@
+ # frozen_string_literal: true
+
+ module SVMKit
+ module Tree
+ # Node is a class that implements node used for construction of decision tree.
+ # This class is used for internal data structures.
+ class Node
+ # @!visibility private
+ attr_accessor :depth, :impurity, :n_samples, :probs, :leaf, :leaf_id, :left, :right, :feature_id, :threshold
+
+ # Create a new node for decision tree.
+ #
+ # @param depth [Integer] The depth of the node in tree.
+ # @param impurity [Float] The impurity of the node.
+ # @param n_samples [Integer] The number of the samples in the node.
+ # @param probs [Float] The probability of the node.
+ # @param leaf [Boolean] The flag indicating whether the node is a leaf.
+ # @param leaf_id [Integer] The leaf index of the node.
+ # @param left [Node] The left node.
+ # @param right [Node] The right node.
+ # @param feature_id [Integer] The feature index used for evaluation.
+ # @param threshold [Float] The threshold value of the feature for splitting the node.
+ def initialize(depth: 0, impurity: 0.0, n_samples: 0, probs: 0.0,
+ leaf: true, leaf_id: 0,
+ left: nil, right: nil, feature_id: 0, threshold: 0.0)
+ @depth = depth
+ @impurity = impurity
+ @n_samples = n_samples
+ @probs = probs
+ @leaf = leaf
+ @leaf_id = leaf_id
+ @left = left
+ @right = right
+ @feature_id = feature_id
+ @threshold = threshold
+ end
+
+ # Dump marshal data.
+ # @return [Hash] The marshal data about Node
+ def marshal_dump
+ { depth: @depth,
+ impurity: @impurity,
+ n_samples: @n_samples,
+ probs: @probs,
+ leaf: @leaf,
+ leaf_id: @leaf_id,
+ left: @left,
+ right: @right,
+ feature_id: @feature_id,
+ threshold: @threshold }
+ end
+
+ # Load marshal data.
+ # @return [nil]
+ def marshal_load(obj)
+ @depth = obj[:depth]
+ @impurity = obj[:impurity]
+ @n_samples = obj[:n_samples]
+ @probs = obj[:probs]
+ @leaf = obj[:leaf]
+ @leaf_id = obj[:leaf_id]
+ @left = obj[:left]
+ @right = obj[:right]
+ @feature_id = obj[:feature_id]
+ @threshold = obj[:threshold]
+ nil
+ end
+ end
+ end
+ end
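
With `Node` extracted into its own file, the classifier and the new regressor share one tree structure. For illustration only, a hand-built two-leaf stump showing how `leaf`, `left`, `right`, `feature_id`, and `threshold` fit together, traversed the same way `apply_at_node` does:

```ruby
require 'svmkit'

# A hypothetical stump that splits on feature 0 at threshold 0.5.
left  = SVMKit::Tree::Node.new(leaf: true, leaf_id: 0, n_samples: 3)
right = SVMKit::Tree::Node.new(leaf: true, leaf_id: 1, n_samples: 7)
root  = SVMKit::Tree::Node.new(leaf: false, feature_id: 0, threshold: 0.5,
                               left: left, right: right, n_samples: 10)

# Descend left when the feature value is <= threshold, right otherwise.
sample = [0.3]
reached = sample[root.feature_id] <= root.threshold ? root.left : root.right
reached.leaf_id # => 0
```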
data/lib/svmkit/version.rb CHANGED
@@ -3,5 +3,5 @@
  # SVMKit is a machine learning library in Ruby.
  module SVMKit
  # @!visibility private
- VERSION = '0.3.1'.freeze
+ VERSION = '0.3.2'.freeze
  end
data/lib/svmkit.rb CHANGED
@@ -19,12 +19,16 @@ require 'svmkit/linear_model/svr'
  require 'svmkit/linear_model/logistic_regression'
  require 'svmkit/kernel_machine/kernel_svc'
  require 'svmkit/polynomial_model/factorization_machine_classifier'
+ require 'svmkit/polynomial_model/factorization_machine_regressor'
  require 'svmkit/multiclass/one_vs_rest_classifier'
  require 'svmkit/nearest_neighbors/k_neighbors_classifier'
  require 'svmkit/nearest_neighbors/k_neighbors_regressor'
  require 'svmkit/naive_bayes/naive_bayes'
+ require 'svmkit/tree/node'
  require 'svmkit/tree/decision_tree_classifier'
+ require 'svmkit/tree/decision_tree_regressor'
  require 'svmkit/ensemble/random_forest_classifier'
+ require 'svmkit/ensemble/random_forest_regressor'
  require 'svmkit/preprocessing/l2_normalizer'
  require 'svmkit/preprocessing/min_max_scaler'
  require 'svmkit/preprocessing/standard_scaler'
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: svmkit
  version: !ruby/object:Gem::Version
- version: 0.3.1
+ version: 0.3.2
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2018-05-16 00:00:00.000000000 Z
+ date: 2018-05-23 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: numo-narray
@@ -115,6 +115,7 @@ files:
  - lib/svmkit/base/transformer.rb
  - lib/svmkit/dataset.rb
  - lib/svmkit/ensemble/random_forest_classifier.rb
+ - lib/svmkit/ensemble/random_forest_regressor.rb
  - lib/svmkit/evaluation_measure/accuracy.rb
  - lib/svmkit/evaluation_measure/f_score.rb
  - lib/svmkit/evaluation_measure/log_loss.rb
@@ -138,6 +139,7 @@ files:
  - lib/svmkit/nearest_neighbors/k_neighbors_regressor.rb
  - lib/svmkit/pairwise_metric.rb
  - lib/svmkit/polynomial_model/factorization_machine_classifier.rb
+ - lib/svmkit/polynomial_model/factorization_machine_regressor.rb
  - lib/svmkit/preprocessing/l2_normalizer.rb
  - lib/svmkit/preprocessing/label_encoder.rb
  - lib/svmkit/preprocessing/min_max_scaler.rb
@@ -145,6 +147,8 @@ files:
  - lib/svmkit/preprocessing/standard_scaler.rb
  - lib/svmkit/probabilistic_output.rb
  - lib/svmkit/tree/decision_tree_classifier.rb
+ - lib/svmkit/tree/decision_tree_regressor.rb
+ - lib/svmkit/tree/node.rb
  - lib/svmkit/validation.rb
  - lib/svmkit/version.rb
  - svmkit.gemspec