svmkit 0.3.1 → 0.3.2

This diff shows the changes between the publicly released versions of the svmkit gem as they appear in its public registry. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 558b550a373cb5cbe7c295dc589c57b6b37a697b8309fa5497e5b0da6fd83336
-  data.tar.gz: ab8241d5e35446f1e7342a08fcb915bed0542a65fcaf4837c358d19699b791e9
+  metadata.gz: 93ce9c2e79ac158b4a3e988afc547b1891419eb6e6b1845156cf98eaa3cdd578
+  data.tar.gz: 4e677653deebd035cbdcd5c98529b7f4fee6804075ecab1113dbccc0bf9c65ed
 SHA512:
-  metadata.gz: a4739788d141bae29fdf1baba602ba76c51299cd8f8536e1a919084d94601b5ddeb02f9128b289965c4c821a925f063da0c2eec9b360dd5190ef9b9c9f2daae5
-  data.tar.gz: 1fdee6fec50ee3d995639d8c78f6e7c259456e1c0b1a916c40ec1e16ff578ecb63b4f7029029c129e6723c69bf19161e9e48589bd9954c8e8ab1ea90b777a870
+  metadata.gz: 7518039557e3c991c4a0cc112764198ed6340c8be1fa9c3fb746be21ffbb5518dd35651149cda5aba8ef52a36dfa6b17f47e1335893ae0cd1dfc5776a0e6bf8e
+  data.tar.gz: c062d9c2a7c04be82787a4d76a970855c2dc8ce0d4bf5531b6196b96873f4f8e679ee434af9a56c2ebf56ae9a8adb33387925050eb9b8964da613318f2a0e430
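
These checksums cover the two members of the `.gem` archive, so a downloaded copy can be verified before installation. A minimal sketch, assuming the gem was fetched locally (e.g. with `gem fetch svmkit -v 0.3.2`); `Gem::Package::TarReader` ships with RubyGems:

```ruby
require 'digest'
require 'rubygems/package'

File.open('svmkit-0.3.2.gem', 'rb') do |io|
  Gem::Package::TarReader.new(io) do |tar|
    tar.each do |entry|
      next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
      # Compare against the SHA256 entries in checksums.yaml above.
      puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
    end
  end
end
```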
data/HISTORY.md CHANGED
@@ -1,3 +1,11 @@
+# 0.3.2
+- Add class for Factorization Machine regressor.
+- Add class for Decision Tree regressor.
+- Add class for Random Forest regressor.
+- Fix to support loading and dumping libsvm file with multi-target variables.
+- Fix to require DecisionTreeClassifier on RandomForestClassifier.
+- Fix some mistakes on document.
+
 # 0.3.1
 - Fix bug on decision function calculation of FactorizationMachineClassifier.
 - Fix bug on weight updating process of KernelSVC.
data/lib/svmkit/dataset.rb CHANGED
@@ -33,11 +33,13 @@ module SVMKit
       # @param zero_based [Boolean] Whether the column index starts from 0 (true) or 1 (false).
       def dump_libsvm_file(data, labels, filename, zero_based: false)
         n_samples = [data.shape[0], labels.shape[0]].min
+        single_label = labels.shape[1].nil?
         label_type = detect_dtype(labels)
         value_type = detect_dtype(data)
         File.open(filename, 'w') do |file|
           n_samples.times do |n|
-            file.puts(dump_libsvm_line(labels[n], data[n, true],
+            label = single_label ? labels[n] : labels[n, true].to_a
+            file.puts(dump_libsvm_line(label, data[n, true],
                                        label_type, value_type, zero_based))
           end
         end
@@ -47,8 +49,7 @@ module SVMKit

       def parse_libsvm_line(line, zero_based)
         tokens = line.split
-        label = tokens.shift
-        label = label.to_i.to_s == label ? label.to_i : label.to_f
+        label = parse_label(tokens.shift)
         ftvec = tokens.map do |el|
           idx, val = el.split(':')
           idx = idx.to_i - (zero_based == false ? 1 : 0)
@@ -60,6 +61,11 @@ module SVMKit
         [label, ftvec, max_idx]
       end

+      def parse_label(label)
+        lbl_arr = label.split(',').map { |lbl| lbl.to_i.to_s == lbl ? lbl.to_i : lbl.to_f }
+        lbl_arr.size > 1 ? lbl_arr : lbl_arr[0]
+      end
+
       def convert_to_matrix(data, n_features)
         mat = []
         data.each do |ft|
@@ -80,13 +86,21 @@ module SVMKit
       end

       def dump_libsvm_line(label, ftvec, label_type, value_type, zero_based)
-        line = format(label_type.to_s, label)
+        line = dump_label(label, label_type.to_s)
         ftvec.to_a.each_with_index do |val, n|
           idx = n + (zero_based == false ? 1 : 0)
           line += format(" %d:#{value_type}", idx, val) if val != 0.0
         end
         line
       end
+
+      def dump_label(label, label_type_str)
+        if label.is_a?(Array)
+          label.map { |lbl| format(label_type_str, lbl) }.join(',')
+        else
+          format(label_type_str, label)
+        end
+      end
     end
   end
 end
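
With these changes, `labels` may be a one-dimensional vector (single target) or a two-dimensional matrix (multi-target); multi-target values are written and parsed as a comma-separated list in the label column. A minimal round-trip sketch on toy data (the file name `multi.t` is arbitrary):

```ruby
require 'svmkit'

x = Numo::DFloat[[1.0, 2.0], [3.0, 4.0]]
y = Numo::DFloat[[0.5, 1.5], [2.5, 3.5]] # two target variables per sample

SVMKit::Dataset.dump_libsvm_file(x, y, 'multi.t')
# Each line has the form "<target1>,<target2> 1:<val> 2:<val>".

samples, targets = SVMKit::Dataset.load_libsvm_file('multi.t')
# targets comes back as a 2-d array, so the multi-target matrix survives the round trip.
```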
data/lib/svmkit/ensemble/random_forest_classifier.rb CHANGED
@@ -3,6 +3,7 @@
 require 'svmkit/validation'
 require 'svmkit/base/base_estimator'
 require 'svmkit/base/classifier'
+require 'svmkit/tree/decision_tree_classifier'

 module SVMKit
   # This module consists of the classes that implement ensemble-based methods.
@@ -32,7 +33,7 @@ module SVMKit
       # @return [Numo::DFloat] (size: n_features)
       attr_reader :feature_importances

-      # Return the random generator for performing random sampling in the Pegasos algorithm.
+      # Return the random generator for random selection of feature index.
       # @return [Random]
       attr_reader :rng

data/lib/svmkit/ensemble/random_forest_regressor.rb ADDED
@@ -0,0 +1,141 @@
+# frozen_string_literal: true
+
+require 'pp'
+require 'svmkit/validation'
+require 'svmkit/base/base_estimator'
+require 'svmkit/base/regressor'
+require 'svmkit/tree/decision_tree_regressor'
+
+module SVMKit
+  module Ensemble
+    # RandomForestRegressor is a class that implements random forest for regression.
+    #
+    # @example
+    #   estimator =
+    #     SVMKit::Ensemble::RandomForestRegressor.new(
+    #       n_estimators: 10, criterion: 'mse', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+    #   estimator.fit(training_samples, training_values)
+    #   results = estimator.predict(testing_samples)
+    #
+    class RandomForestRegressor
+      include Base::BaseEstimator
+      include Base::Regressor
+      include Validation
+
+      # Return the set of estimators.
+      # @return [Array<DecisionTreeRegressor>]
+      attr_reader :estimators
+
+      # Return the importance for each feature.
+      # @return [Numo::DFloat] (size: n_features)
+      attr_reader :feature_importances
+
+      # Return the random generator for random selection of feature index.
+      # @return [Random]
+      attr_reader :rng
+
+      # Create a new regressor with random forest.
+      #
+      # @param n_estimators [Integer] The number of decision trees for constructing the random forest.
+      # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'mae' and 'mse'.
+      # @param max_depth [Integer] The maximum depth of the tree.
+      #   If nil is given, decision tree grows without concern for depth.
+      # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
+      #   If nil is given, number of leaves is not limited.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      # @param max_features [Integer] The number of features to consider when searching optimal split point.
+      #   If nil is given, split process considers all features.
+      # @param random_seed [Integer] The seed value used to initialize the random generator.
+      #   It is used to randomly determine the order of features when deciding the splitting point.
+      def initialize(n_estimators: 10, criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                     max_features: nil, random_seed: nil)
+        check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                          max_features: max_features, random_seed: random_seed)
+        check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
+        check_params_string(criterion: criterion)
+        check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
+                              max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
+                              max_features: max_features)
+        @params = {}
+        @params[:n_estimators] = n_estimators
+        @params[:criterion] = criterion
+        @params[:max_depth] = max_depth
+        @params[:max_leaf_nodes] = max_leaf_nodes
+        @params[:min_samples_leaf] = min_samples_leaf
+        @params[:max_features] = max_features
+        @params[:random_seed] = random_seed
+        @params[:random_seed] ||= srand
+        @estimators = nil
+        @feature_importances = nil
+        @rng = Random.new(@params[:random_seed])
+      end
+
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+      # @return [RandomForestRegressor] The learned regressor itself.
+      def fit(x, y)
+        check_sample_array(x)
+        check_tvalue_array(y)
+        check_sample_tvalue_size(x, y)
+        # Initialize some variables.
+        n_samples, n_features = x.shape
+        @params[:max_features] ||= n_features
+        @params[:max_features] = [[1, @params[:max_features]].max, Math.sqrt(n_features).to_i].min
+        single_target = y.shape[1].nil?
+        # Construct forest.
+        @estimators = Array.new(@params[:n_estimators]) do |_n|
+          tree = Tree::DecisionTreeRegressor.new(
+            criterion: @params[:criterion], max_depth: @params[:max_depth],
+            max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+            max_features: @params[:max_features], random_seed: @params[:random_seed]
+          )
+          bootstrap_ids = Array.new(n_samples) { @rng.rand(0...n_samples) }
+          tree.fit(x[bootstrap_ids, true], single_target ? y[bootstrap_ids] : y[bootstrap_ids, true])
+        end
+        # Calculate feature importances.
+        @feature_importances = @estimators.map(&:feature_importances).reduce(&:+)
+        @feature_importances /= @feature_importances.sum
+        self
+      end
+
+      # Predict values for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+      # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted value per sample.
+      def predict(x)
+        check_sample_array(x)
+        @estimators.map { |est| est.predict(x) }.reduce(&:+) / @params[:n_estimators]
+      end
+
+      # Return the index of the leaf that each sample reached.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to assign each leaf.
+      # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
+      def apply(x)
+        SVMKit::Validation.check_sample_array(x)
+        Numo::Int32[*Array.new(@params[:n_estimators]) { |n| @estimators[n].apply(x) }].transpose
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about RandomForestRegressor.
+      def marshal_dump
+        { params: @params,
+          estimators: @estimators,
+          feature_importances: @feature_importances,
+          rng: @rng }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @params = obj[:params]
+        @estimators = obj[:estimators]
+        @feature_importances = obj[:feature_importances]
+        @rng = obj[:rng]
+        nil
+      end
+    end
+  end
+end
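
The new class bags bootstrap samples of the training set, fits one DecisionTreeRegressor per bootstrap, and averages the trees' predictions. A minimal usage sketch on synthetic data (names and data are illustrative):

```ruby
require 'svmkit'

x = Numo::DFloat.new(100, 4).rand
y = x.dot(Numo::DFloat[1.0, -2.0, 0.5, 0.0]) # noiseless linear target

forest = SVMKit::Ensemble::RandomForestRegressor.new(
  n_estimators: 10, max_depth: 3, random_seed: 1
)
forest.fit(x, y)

forest.predict(x[0...5, true])  # mean of the ten trees' predictions
forest.feature_importances      # normalized impurity-based importances
forest.apply(x[0...5, true])    # leaf index per sample per tree, shape [5, 10]
```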
data/lib/svmkit/polynomial_model/factorization_machine_regressor.rb ADDED
@@ -0,0 +1,231 @@
+# frozen_string_literal: true
+
+require 'svmkit/validation'
+require 'svmkit/base/base_estimator'
+require 'svmkit/base/regressor'
+
+module SVMKit
+  module PolynomialModel
+    # FactorizationMachineRegressor is a class that implements Factorization Machine
+    # with stochastic gradient descent (SGD) optimization.
+    #
+    # @example
+    #   estimator =
+    #     SVMKit::PolynomialModel::FactorizationMachineRegressor.new(
+    #       n_factors: 10, reg_param_bias: 0.1, reg_param_weight: 0.1, reg_param_factor: 0.1,
+    #       max_iter: 5000, batch_size: 50, random_seed: 1)
+    #   estimator.fit(training_samples, training_values)
+    #   results = estimator.predict(testing_samples)
+    #
+    # *Reference*
+    # - S. Rendle, "Factorization Machines with libFM," ACM Transactions on Intelligent Systems and Technology, vol. 3 (3), pp. 57:1--57:22, 2012.
+    # - S. Rendle, "Factorization Machines," Proc. the 10th IEEE International Conference on Data Mining (ICDM'10), pp. 995--1000, 2010.
+    # - I. Sutskever, J. Martens, G. Dahl, and G. Hinton, "On the importance of initialization and momentum in deep learning," Proc. the 30th International Conference on Machine Learning (ICML'13), pp. 1139--1147, 2013.
+    # - G. Hinton, N. Srivastava, and K. Swersky, "Lecture 6e rmsprop," Neural Networks for Machine Learning, 2012.
+    class FactorizationMachineRegressor
+      include Base::BaseEstimator
+      include Base::Regressor
+      include Validation
+
+      # Return the factor matrix for Factorization Machine.
+      # @return [Numo::DFloat] (shape: [n_outputs, n_factors, n_features])
+      attr_reader :factor_mat
+
+      # Return the weight vector for Factorization Machine.
+      # @return [Numo::DFloat] (shape: [n_outputs, n_features])
+      attr_reader :weight_vec
+
+      # Return the bias term for Factorization Machine.
+      # @return [Numo::DFloat] (shape: [n_outputs])
+      attr_reader :bias_term
+
+      # Return the random generator for random sampling.
+      # @return [Random]
+      attr_reader :rng
+
+      # Create a new regressor with Factorization Machine.
+      #
+      # @param n_factors [Integer] The number of factors.
+      # @param reg_param_bias [Float] The regularization parameter for bias term.
+      # @param reg_param_weight [Float] The regularization parameter for weight vector.
+      # @param reg_param_factor [Float] The regularization parameter for factor matrix.
+      # @param init_std [Float] The standard deviation of normal random number for initialization of factor matrix.
+      # @param learning_rate [Float] The learning rate for optimization.
+      # @param decay [Float] The discounting factor for RMS prop optimization.
+      # @param momentum [Float] The Nesterov momentum for optimization.
+      # @param max_iter [Integer] The maximum number of iterations.
+      # @param batch_size [Integer] The size of the mini batches.
+      # @param random_seed [Integer] The seed value used to initialize the random generator.
+      def initialize(n_factors: 2,
+                     reg_param_bias: 1.0, reg_param_weight: 1.0, reg_param_factor: 1.0, init_std: 0.01,
+                     learning_rate: 0.01, decay: 0.9, momentum: 0.9,
+                     max_iter: 1000, batch_size: 10, random_seed: nil)
+        check_params_float(reg_param_bias: reg_param_bias, reg_param_weight: reg_param_weight,
+                           reg_param_factor: reg_param_factor, init_std: init_std,
+                           learning_rate: learning_rate, decay: decay, momentum: momentum)
+        check_params_integer(n_factors: n_factors, max_iter: max_iter, batch_size: batch_size)
+        check_params_type_or_nil(Integer, random_seed: random_seed)
+        check_params_positive(n_factors: n_factors, reg_param_bias: reg_param_bias,
+                              reg_param_weight: reg_param_weight, reg_param_factor: reg_param_factor,
+                              learning_rate: learning_rate, decay: decay, momentum: momentum,
+                              max_iter: max_iter, batch_size: batch_size)
+        @params = {}
+        @params[:n_factors] = n_factors
+        @params[:reg_param_bias] = reg_param_bias
+        @params[:reg_param_weight] = reg_param_weight
+        @params[:reg_param_factor] = reg_param_factor
+        @params[:init_std] = init_std
+        @params[:learning_rate] = learning_rate
+        @params[:decay] = decay
+        @params[:momentum] = momentum
+        @params[:max_iter] = max_iter
+        @params[:batch_size] = batch_size
+        @params[:random_seed] = random_seed
+        @params[:random_seed] ||= srand
+        @factor_mat = nil
+        @weight_vec = nil
+        @bias_term = nil
+        @rng = Random.new(@params[:random_seed])
+      end
+
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+      # @return [FactorizationMachineRegressor] The learned regressor itself.
+      def fit(x, y)
+        check_sample_array(x)
+        check_tvalue_array(y)
+        check_sample_tvalue_size(x, y)
+
+        n_outputs = y.shape[1].nil? ? 1 : y.shape[1]
+        _n_samples, n_features = x.shape
+
+        if n_outputs > 1
+          @factor_mat = Numo::DFloat.zeros(n_outputs, @params[:n_factors], n_features)
+          @weight_vec = Numo::DFloat.zeros(n_outputs, n_features)
+          @bias_term = Numo::DFloat.zeros(n_outputs)
+          n_outputs.times do |n|
+            factor, weight, bias = single_fit(x, y[true, n])
+            @factor_mat[n, true, true] = factor
+            @weight_vec[n, true] = weight
+            @bias_term[n] = bias
+          end
+        else
+          @factor_mat, @weight_vec, @bias_term = single_fit(x, y)
+        end
+
+        self
+      end
+
+      # Predict values for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+      # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
+      def predict(x)
+        check_sample_array(x)
+        linear_term = @bias_term + x.dot(@weight_vec.transpose)
+        factor_term = if @weight_vec.shape[1].nil?
+                        0.5 * (@factor_mat.dot(x.transpose)**2 - (@factor_mat**2).dot(x.transpose**2)).sum(0)
+                      else
+                        0.5 * (@factor_mat.dot(x.transpose)**2 - (@factor_mat**2).dot(x.transpose**2)).sum(1).transpose
+                      end
+        linear_term + factor_term
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about FactorizationMachineRegressor.
+      def marshal_dump
+        { params: @params,
+          factor_mat: @factor_mat,
+          weight_vec: @weight_vec,
+          bias_term: @bias_term,
+          rng: @rng }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @params = obj[:params]
+        @factor_mat = obj[:factor_mat]
+        @weight_vec = obj[:weight_vec]
+        @bias_term = obj[:bias_term]
+        @rng = obj[:rng]
+        nil
+      end
+
+      private
+
+      def single_fit(x, y)
+        # Initialize some variables.
+        n_samples, n_features = x.shape
+        rand_ids = [*0...n_samples].shuffle(random: @rng)
+        factor_mat = rand_normal([@params[:n_factors], n_features], 0, @params[:init_std])
+        factor_sqrsum = Numo::DFloat.zeros(factor_mat.shape)
+        factor_update = Numo::DFloat.zeros(factor_mat.shape)
+        weight_vec = Numo::DFloat.zeros(n_features)
+        weight_sqrsum = Numo::DFloat.zeros(n_features)
+        weight_update = Numo::DFloat.zeros(n_features)
+        bias_term = 0.0
+        bias_sqrsum = 0.0
+        bias_update = 0.0
+        # Start optimization.
+        @params[:max_iter].times do |_t|
+          # Random sampling.
+          subset_ids = rand_ids.shift(@params[:batch_size])
+          rand_ids.concat(subset_ids)
+          data = x[subset_ids, true]
+          values = y[subset_ids]
+          # Calculate gradients for loss function.
+          loss_grad = loss_gradient(data, values, factor_mat, weight_vec, bias_term)
+          next if loss_grad.ne(0.0).count.zero?
+          # Update each parameter.
+          bias_term, bias_sqrsum, bias_update =
+            update_param(bias_term, bias_sqrsum, bias_update,
+                         bias_gradient(loss_grad, bias_term - @params[:momentum] * bias_update))
+          weight_vec, weight_sqrsum, weight_update =
+            update_param(weight_vec, weight_sqrsum, weight_update,
+                         weight_gradient(loss_grad, data, weight_vec - @params[:momentum] * weight_update))
+          @params[:n_factors].times do |n|
+            factor_mat[n, true], factor_sqrsum[n, true], factor_update[n, true] =
+              update_param(factor_mat[n, true], factor_sqrsum[n, true], factor_update[n, true],
+                           factor_gradient(loss_grad, data, factor_mat[n, true] - @params[:momentum] * factor_update[n, true]))
+          end
+        end
+        [factor_mat, weight_vec, bias_term]
+      end
+
+      def loss_gradient(x, y, factor, weight, bias)
+        z = bias + x.dot(weight) + 0.5 * (factor.dot(x.transpose)**2 - (factor**2).dot(x.transpose**2)).sum(0)
+        2.0 * (z - y)
+      end
+
+      def bias_gradient(loss_grad, bias)
+        loss_grad.mean + @params[:reg_param_bias] * bias
+      end
+
+      def weight_gradient(loss_grad, data, weight)
+        (loss_grad.expand_dims(1) * data).mean(0) + @params[:reg_param_weight] * weight
+      end
+
+      def factor_gradient(loss_grad, data, factor)
+        (loss_grad.expand_dims(1) * (data * data.dot(factor).expand_dims(1) - factor * (data**2))).mean(0) + @params[:reg_param_factor] * factor
+      end
+
+      def update_param(param, sqrsum, update, gr)
+        new_sqrsum = @params[:decay] * sqrsum + (1.0 - @params[:decay]) * gr**2
+        new_update = (@params[:learning_rate] / ((new_sqrsum + 1.0e-8)**0.5)) * gr
+        new_param = param - (new_update + @params[:momentum] * update)
+        [new_param, new_sqrsum, new_update]
+      end
+
+      def rand_uniform(shape)
+        Numo::DFloat[*Array.new(shape.inject(&:*)) { @rng.rand }].reshape(*shape)
+      end
+
+      def rand_normal(shape, mu, sigma)
+        mu + sigma * (Numo::NMath.sqrt(-2.0 * Numo::NMath.log(rand_uniform(shape))) * Numo::NMath.sin(2.0 * Math::PI * rand_uniform(shape)))
+      end
+    end
+  end
+end
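
The factor term in `predict` and `loss_gradient` relies on the standard Factorization Machine identity from Rendle's papers, which reduces the pairwise interaction sum to linear time in the number of features. A quick numeric sanity check of that identity with numo-narray (toy sizes, arbitrary names):

```ruby
require 'numo/narray'

k, n = 3, 5
v = Numo::DFloat.new(k, n).rand # factor matrix, one row per factor
x = Numo::DFloat.new(n).rand    # a single sample

# Naive pairwise sum: sum_{i<j} (v_i . v_j) * x_i * x_j
naive = 0.0
n.times do |i|
  (i + 1).upto(n - 1) { |j| naive += (v[true, i] * v[true, j]).sum * x[i] * x[j] }
end

# Factorized form used by the code: 0.5 * sum_f ((v_f . x)^2 - (v_f**2) . (x**2))
fast = 0.5 * ((v.dot(x))**2 - (v**2).dot(x**2)).sum

puts((naive - fast).abs < 1e-9) # => true
```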
data/lib/svmkit/tree/decision_tree_classifier.rb CHANGED
@@ -3,74 +3,11 @@
 require 'svmkit/validation'
 require 'svmkit/base/base_estimator'
 require 'svmkit/base/classifier'
+require 'svmkit/tree/node'

 module SVMKit
   # This module consists of the classes that implement tree models.
   module Tree
-    # Node is a class that implements node used for construction of decision tree.
-    # This class is used for internal data structures.
-    class Node
-      # @!visibility private
-      attr_accessor :depth, :impurity, :n_samples, :probs, :leaf, :leaf_id, :left, :right, :feature_id, :threshold
-
-      # Create a new node for decision tree.
-      #
-      # @param depth [Integer] The depth of the node in tree.
-      # @param impurity [Float] The impurity of the node.
-      # @param n_samples [Integer] The number of the samples in the node.
-      # @param probs [Float] The probability of the node.
-      # @param leaf [Boolean] The flag indicating whether the node is a leaf.
-      # @param leaf_id [Integer] The leaf index of the node.
-      # @param left [Node] The left node.
-      # @param right [Node] The right node.
-      # @param feature_id [Integer] The feature index used for evaluation.
-      # @param threshold [Float] The threshold value of the feature for splitting the node.
-      def initialize(depth: 0, impurity: 0.0, n_samples: 0, probs: 0.0,
-                     leaf: true, leaf_id: 0,
-                     left: nil, right: nil, feature_id: 0, threshold: 0.0)
-        @depth = depth
-        @impurity = impurity
-        @n_samples = n_samples
-        @probs = probs
-        @leaf = leaf
-        @leaf_id = leaf_id
-        @left = left
-        @right = right
-        @feature_id = feature_id
-        @threshold = threshold
-      end
-
-      # Dump marshal data.
-      # @return [Hash] The marshal data about Node
-      def marshal_dump
-        { depth: @depth,
-          impurity: @impurity,
-          n_samples: @n_samples,
-          probs: @probs,
-          leaf: @leaf,
-          leaf_id: @leaf_id,
-          left: @left,
-          right: @right,
-          feature_id: @feature_id,
-          threshold: @threshold }
-      end
-
-      # Load marshal data.
-      # @return [nil]
-      def marshal_load(obj)
-        @depth = obj[:depth]
-        @impurity = obj[:impurity]
-        @n_samples = obj[:n_samples]
-        @probs = obj[:probs]
-        @leaf = obj[:leaf]
-        @leaf_id = obj[:leaf_id]
-        @left = obj[:left]
-        @right = obj[:right]
-        @feature_id = obj[:feature_id]
-        @threshold = obj[:threshold]
-      end
-    end
-
     # DecisionTreeClassifier is a class that implements decision tree for classification.
     #
     # @example
@@ -96,7 +33,7 @@ module SVMKit
       # @return [Node]
       attr_reader :tree

-      # Return the random generator for performing random sampling in the Pegasos algorithm.
+      # Return the random generator for random selection of feature index.
       # @return [Random]
      attr_reader :rng
data/lib/svmkit/tree/decision_tree_regressor.rb ADDED
@@ -0,0 +1,252 @@
+# frozen_string_literal: true
+
+require 'svmkit/validation'
+require 'svmkit/base/base_estimator'
+require 'svmkit/base/regressor'
+require 'svmkit/tree/node'
+
+module SVMKit
+  module Tree
+    # DecisionTreeRegressor is a class that implements decision tree for regression.
+    #
+    # @example
+    #   estimator =
+    #     SVMKit::Tree::DecisionTreeRegressor.new(
+    #       max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+    #   estimator.fit(training_samples, training_values)
+    #   results = estimator.predict(testing_samples)
+    #
+    class DecisionTreeRegressor
+      include Base::BaseEstimator
+      include Base::Regressor
+      include Validation
+
+      # Return the importance for each feature.
+      # @return [Numo::DFloat] (size: n_features)
+      attr_reader :feature_importances
+
+      # Return the learned tree.
+      # @return [Node]
+      attr_reader :tree
+
+      # Return the random generator for random selection of feature index.
+      # @return [Random]
+      attr_reader :rng
+
+      # Return the values assigned to each leaf.
+      # @return [Numo::DFloat] (shape: [n_leaves, n_outputs])
+      attr_reader :leaf_values
+
+      # Create a new regressor with decision tree algorithm.
+      #
+      # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'mae' and 'mse'.
+      # @param max_depth [Integer] The maximum depth of the tree.
+      #   If nil is given, decision tree grows without concern for depth.
+      # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
+      #   If nil is given, number of leaves is not limited.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      # @param max_features [Integer] The number of features to consider when searching optimal split point.
+      #   If nil is given, split process considers all features.
+      # @param random_seed [Integer] The seed value used to initialize the random generator.
+      #   It is used to randomly determine the order of features when deciding the splitting point.
+      def initialize(criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
+                     random_seed: nil)
+        check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                          max_features: max_features, random_seed: random_seed)
+        check_params_integer(min_samples_leaf: min_samples_leaf)
+        check_params_string(criterion: criterion)
+        check_params_positive(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                              min_samples_leaf: min_samples_leaf, max_features: max_features)
+        @params = {}
+        @params[:criterion] = criterion
+        @params[:max_depth] = max_depth
+        @params[:max_leaf_nodes] = max_leaf_nodes
+        @params[:min_samples_leaf] = min_samples_leaf
+        @params[:max_features] = max_features
+        @params[:random_seed] = random_seed
+        @params[:random_seed] ||= srand
+        @criterion = :mse
+        @criterion = :mae if @params[:criterion] == 'mae'
+        @tree = nil
+        @feature_importances = nil
+        @n_leaves = nil
+        @leaf_values = nil
+        @rng = Random.new(@params[:random_seed])
+      end
+
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+      # @return [DecisionTreeRegressor] The learned regressor itself.
+      def fit(x, y)
+        check_sample_array(x)
+        check_tvalue_array(y)
+        check_sample_tvalue_size(x, y)
+        single_target = y.shape[1].nil?
+        y = y.expand_dims(1) if single_target
+        n_samples, n_features = x.shape
+        @params[:max_features] = n_features if @params[:max_features].nil?
+        @params[:max_features] = [@params[:max_features], n_features].min
+        build_tree(x, y)
+        @leaf_values = @leaf_values[true] if single_target
+        eval_importance(n_samples, n_features)
+        self
+      end
+
+      # Predict values for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+      # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
+      def predict(x)
+        check_sample_array(x)
+        @leaf_values.shape[1].nil? ? @leaf_values[apply(x)] : @leaf_values[apply(x), true]
+      end
+
+      # Return the index of the leaf that each sample reached.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+      # @return [Numo::Int32] (shape: [n_samples]) Leaf index for sample.
+      def apply(x)
+        check_sample_array(x)
+        Numo::Int32[*(Array.new(x.shape[0]) { |n| apply_at_node(@tree, x[n, true]) })]
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about DecisionTreeRegressor.
+      def marshal_dump
+        { params: @params,
+          criterion: @criterion,
+          tree: @tree,
+          feature_importances: @feature_importances,
+          leaf_values: @leaf_values,
+          rng: @rng }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @params = obj[:params]
+        @criterion = obj[:criterion]
+        @tree = obj[:tree]
+        @feature_importances = obj[:feature_importances]
+        @leaf_values = obj[:leaf_values]
+        @rng = obj[:rng]
+        nil
+      end
+
+      private
+
+      def apply_at_node(node, sample)
+        return node.leaf_id if node.leaf
+        return apply_at_node(node.left, sample) if node.right.nil?
+        return apply_at_node(node.right, sample) if node.left.nil?
+        if sample[node.feature_id] <= node.threshold
+          apply_at_node(node.left, sample)
+        else
+          apply_at_node(node.right, sample)
+        end
+      end
+
+      def build_tree(x, y)
+        @n_leaves = 0
+        @leaf_values = []
+        @tree = grow_node(0, x, y)
+        @leaf_values = Numo::DFloat.cast(@leaf_values)
+        nil
+      end
+
+      def grow_node(depth, x, y)
+        unless @params[:max_leaf_nodes].nil?
+          return nil if @n_leaves >= @params[:max_leaf_nodes]
+        end
+
+        n_samples, n_features = x.shape
+        return nil if n_samples <= @params[:min_samples_leaf]
+
+        node = Node.new(depth: depth, impurity: impurity(y), n_samples: n_samples)
+
+        return put_leaf(node, y) if (y - y.mean(0)).sum.abs.zero?
+
+        unless @params[:max_depth].nil?
+          return put_leaf(node, y) if depth == @params[:max_depth]
+        end
+
+        feature_id, threshold, left_ids, right_ids, max_gain =
+          rand_ids(n_features).map { |f_id| [f_id, *best_split(x[true, f_id], y)] }.max_by(&:last)
+        return put_leaf(node, y) if max_gain.nil? || max_gain.zero?
+
+        node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids, true])
+        node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids, true])
+        return put_leaf(node, y) if node.left.nil? && node.right.nil?
+
+        node.feature_id = feature_id
+        node.threshold = threshold
+        node.leaf = false
+        node
+      end
+
+      def put_leaf(node, values)
+        node.probs = nil
+        node.leaf = true
+        node.leaf_id = @n_leaves
+        @n_leaves += 1
+        @leaf_values.push(values.mean(0))
+        node
+      end
+
+      def rand_ids(n)
+        [*0...n].sample(@params[:max_features], random: @rng)
+      end
+
+      def best_split(features, values)
+        features.to_a.uniq.sort.each_cons(2).map do |l, r|
+          threshold = 0.5 * (l + r)
+          left_ids, right_ids = splited_ids(features, threshold)
+          [threshold, left_ids, right_ids, gain(values, values[left_ids], values[right_ids])]
+        end.max_by(&:last)
+      end
+
+      def splited_ids(features, threshold)
+        [features.le(threshold).where.to_a, features.gt(threshold).where.to_a]
+      end
+
+      def gain(values, values_left, values_right)
+        prob_left = values_left.shape[0].fdiv(values.shape[0])
+        prob_right = values_right.shape[0].fdiv(values.shape[0])
+        impurity(values) - prob_left * impurity(values_left) - prob_right * impurity(values_right)
+      end
+
+      def impurity(values)
+        send(@criterion, values)
+      end
+
+      def mse(values)
+        ((values - values.mean(0))**2).mean
+      end
+
+      def mae(values)
+        (values - values.mean(0)).abs.mean
+      end
+
+      def eval_importance(n_samples, n_features)
+        @feature_importances = Numo::DFloat.zeros(n_features)
+        eval_importance_at_node(@tree)
+        @feature_importances /= n_samples
+        normalizer = @feature_importances.sum
+        @feature_importances /= normalizer if normalizer > 0.0
+        nil
+      end
+
+      def eval_importance_at_node(node)
+        return nil if node.leaf
+        return nil if node.left.nil? || node.right.nil?
+        gain = node.n_samples * node.impurity -
+               node.left.n_samples * node.left.impurity - node.right.n_samples * node.right.impurity
+        @feature_importances[node.feature_id] += gain
+        eval_importance_at_node(node.left)
+        eval_importance_at_node(node.right)
+      end
+    end
+  end
+end
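
Prediction here is a lookup: `apply` routes each sample down the tree to a leaf, and `predict` returns the mean target value stored for that leaf in `leaf_values`. A small illustrative sketch on toy data:

```ruby
require 'svmkit'

x = Numo::DFloat.new(50, 2).rand
y = x[true, 0] * 2.0 # single-target toy values

tree = SVMKit::Tree::DecisionTreeRegressor.new(max_depth: 2, random_seed: 1)
tree.fit(x, y)

tree.apply(x[0...3, true])   # leaf index reached by each sample
tree.predict(x[0...3, true]) # mean of the training targets in that leaf
```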
data/lib/svmkit/tree/node.rb ADDED
@@ -0,0 +1,70 @@
+# frozen_string_literal: true
+
+module SVMKit
+  module Tree
+    # Node is a class that implements node used for construction of decision tree.
+    # This class is used for internal data structures.
+    class Node
+      # @!visibility private
+      attr_accessor :depth, :impurity, :n_samples, :probs, :leaf, :leaf_id, :left, :right, :feature_id, :threshold
+
+      # Create a new node for decision tree.
+      #
+      # @param depth [Integer] The depth of the node in tree.
+      # @param impurity [Float] The impurity of the node.
+      # @param n_samples [Integer] The number of the samples in the node.
+      # @param probs [Float] The probability of the node.
+      # @param leaf [Boolean] The flag indicating whether the node is a leaf.
+      # @param leaf_id [Integer] The leaf index of the node.
+      # @param left [Node] The left node.
+      # @param right [Node] The right node.
+      # @param feature_id [Integer] The feature index used for evaluation.
+      # @param threshold [Float] The threshold value of the feature for splitting the node.
+      def initialize(depth: 0, impurity: 0.0, n_samples: 0, probs: 0.0,
+                     leaf: true, leaf_id: 0,
+                     left: nil, right: nil, feature_id: 0, threshold: 0.0)
+        @depth = depth
+        @impurity = impurity
+        @n_samples = n_samples
+        @probs = probs
+        @leaf = leaf
+        @leaf_id = leaf_id
+        @left = left
+        @right = right
+        @feature_id = feature_id
+        @threshold = threshold
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about Node
+      def marshal_dump
+        { depth: @depth,
+          impurity: @impurity,
+          n_samples: @n_samples,
+          probs: @probs,
+          leaf: @leaf,
+          leaf_id: @leaf_id,
+          left: @left,
+          right: @right,
+          feature_id: @feature_id,
+          threshold: @threshold }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @depth = obj[:depth]
+        @impurity = obj[:impurity]
+        @n_samples = obj[:n_samples]
+        @probs = obj[:probs]
+        @leaf = obj[:leaf]
+        @leaf_id = obj[:leaf_id]
+        @left = obj[:left]
+        @right = obj[:right]
+        @feature_id = obj[:feature_id]
+        @threshold = obj[:threshold]
+        nil
+      end
+    end
+  end
+end
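
Because Node and the estimators that contain it implement `marshal_dump`/`marshal_load`, a trained model can be persisted with Ruby's built-in Marshal. A sketch, continuing from the decision tree example above:

```ruby
File.binwrite('tree.model', Marshal.dump(tree))
restored = Marshal.load(File.binread('tree.model'))
restored.predict(x[0...3, true]) # same predictions as the original tree
```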
data/lib/svmkit/version.rb CHANGED
@@ -3,5 +3,5 @@
 # SVMKit is a machine learning library in Ruby.
 module SVMKit
   # @!visibility private
-  VERSION = '0.3.1'.freeze
+  VERSION = '0.3.2'.freeze
 end
data/lib/svmkit.rb CHANGED
@@ -19,12 +19,16 @@ require 'svmkit/linear_model/svr'
 require 'svmkit/linear_model/logistic_regression'
 require 'svmkit/kernel_machine/kernel_svc'
 require 'svmkit/polynomial_model/factorization_machine_classifier'
+require 'svmkit/polynomial_model/factorization_machine_regressor'
 require 'svmkit/multiclass/one_vs_rest_classifier'
 require 'svmkit/nearest_neighbors/k_neighbors_classifier'
 require 'svmkit/nearest_neighbors/k_neighbors_regressor'
 require 'svmkit/naive_bayes/naive_bayes'
+require 'svmkit/tree/node'
 require 'svmkit/tree/decision_tree_classifier'
+require 'svmkit/tree/decision_tree_regressor'
 require 'svmkit/ensemble/random_forest_classifier'
+require 'svmkit/ensemble/random_forest_regressor'
 require 'svmkit/preprocessing/l2_normalizer'
 require 'svmkit/preprocessing/min_max_scaler'
 require 'svmkit/preprocessing/standard_scaler'
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: svmkit
 version: !ruby/object:Gem::Version
-  version: 0.3.1
+  version: 0.3.2
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-05-16 00:00:00.000000000 Z
+date: 2018-05-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: numo-narray
@@ -115,6 +115,7 @@ files:
 - lib/svmkit/base/transformer.rb
 - lib/svmkit/dataset.rb
 - lib/svmkit/ensemble/random_forest_classifier.rb
+- lib/svmkit/ensemble/random_forest_regressor.rb
 - lib/svmkit/evaluation_measure/accuracy.rb
 - lib/svmkit/evaluation_measure/f_score.rb
 - lib/svmkit/evaluation_measure/log_loss.rb
@@ -138,6 +139,7 @@ files:
 - lib/svmkit/nearest_neighbors/k_neighbors_regressor.rb
 - lib/svmkit/pairwise_metric.rb
 - lib/svmkit/polynomial_model/factorization_machine_classifier.rb
+- lib/svmkit/polynomial_model/factorization_machine_regressor.rb
 - lib/svmkit/preprocessing/l2_normalizer.rb
 - lib/svmkit/preprocessing/label_encoder.rb
 - lib/svmkit/preprocessing/min_max_scaler.rb
@@ -145,6 +147,8 @@ files:
 - lib/svmkit/preprocessing/standard_scaler.rb
 - lib/svmkit/probabilistic_output.rb
 - lib/svmkit/tree/decision_tree_classifier.rb
+- lib/svmkit/tree/decision_tree_regressor.rb
+- lib/svmkit/tree/node.rb
 - lib/svmkit/validation.rb
 - lib/svmkit/version.rb
 - svmkit.gemspec