rumale 0.8.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (85) hide show
  1. checksums.yaml +7 -0
  2. data/.coveralls.yml +1 -0
  3. data/.gitignore +20 -0
  4. data/.rspec +3 -0
  5. data/.rubocop.yml +47 -0
  6. data/.rubocop_todo.yml +58 -0
  7. data/.travis.yml +13 -0
  8. data/CHANGELOG.md +2 -0
  9. data/CODE_OF_CONDUCT.md +74 -0
  10. data/Gemfile +4 -0
  11. data/LICENSE.txt +23 -0
  12. data/README.md +175 -0
  13. data/Rakefile +6 -0
  14. data/bin/console +14 -0
  15. data/bin/setup +8 -0
  16. data/lib/rumale.rb +70 -0
  17. data/lib/rumale/base/base_estimator.rb +13 -0
  18. data/lib/rumale/base/classifier.rb +36 -0
  19. data/lib/rumale/base/cluster_analyzer.rb +31 -0
  20. data/lib/rumale/base/evaluator.rb +17 -0
  21. data/lib/rumale/base/regressor.rb +36 -0
  22. data/lib/rumale/base/splitter.rb +21 -0
  23. data/lib/rumale/base/transformer.rb +22 -0
  24. data/lib/rumale/clustering/dbscan.rb +125 -0
  25. data/lib/rumale/clustering/k_means.rb +138 -0
  26. data/lib/rumale/dataset.rb +110 -0
  27. data/lib/rumale/decomposition/nmf.rb +141 -0
  28. data/lib/rumale/decomposition/pca.rb +148 -0
  29. data/lib/rumale/ensemble/ada_boost_classifier.rb +196 -0
  30. data/lib/rumale/ensemble/ada_boost_regressor.rb +178 -0
  31. data/lib/rumale/ensemble/random_forest_classifier.rb +180 -0
  32. data/lib/rumale/ensemble/random_forest_regressor.rb +141 -0
  33. data/lib/rumale/evaluation_measure/accuracy.rb +29 -0
  34. data/lib/rumale/evaluation_measure/f_score.rb +50 -0
  35. data/lib/rumale/evaluation_measure/log_loss.rb +45 -0
  36. data/lib/rumale/evaluation_measure/mean_absolute_error.rb +29 -0
  37. data/lib/rumale/evaluation_measure/mean_squared_error.rb +29 -0
  38. data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +62 -0
  39. data/lib/rumale/evaluation_measure/precision.rb +50 -0
  40. data/lib/rumale/evaluation_measure/precision_recall.rb +91 -0
  41. data/lib/rumale/evaluation_measure/purity.rb +40 -0
  42. data/lib/rumale/evaluation_measure/r2_score.rb +43 -0
  43. data/lib/rumale/evaluation_measure/recall.rb +50 -0
  44. data/lib/rumale/kernel_approximation/rbf.rb +121 -0
  45. data/lib/rumale/kernel_machine/kernel_svc.rb +193 -0
  46. data/lib/rumale/linear_model/base_linear_model.rb +89 -0
  47. data/lib/rumale/linear_model/lasso.rb +136 -0
  48. data/lib/rumale/linear_model/linear_regression.rb +110 -0
  49. data/lib/rumale/linear_model/logistic_regression.rb +159 -0
  50. data/lib/rumale/linear_model/ridge.rb +110 -0
  51. data/lib/rumale/linear_model/svc.rb +183 -0
  52. data/lib/rumale/linear_model/svr.rb +122 -0
  53. data/lib/rumale/model_selection/cross_validation.rb +123 -0
  54. data/lib/rumale/model_selection/grid_search_cv.rb +247 -0
  55. data/lib/rumale/model_selection/k_fold.rb +76 -0
  56. data/lib/rumale/model_selection/stratified_k_fold.rb +94 -0
  57. data/lib/rumale/multiclass/one_vs_rest_classifier.rb +100 -0
  58. data/lib/rumale/naive_bayes/naive_bayes.rb +315 -0
  59. data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +111 -0
  60. data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +93 -0
  61. data/lib/rumale/optimizer/nadam.rb +90 -0
  62. data/lib/rumale/optimizer/rmsprop.rb +69 -0
  63. data/lib/rumale/optimizer/sgd.rb +65 -0
  64. data/lib/rumale/optimizer/yellow_fin.rb +144 -0
  65. data/lib/rumale/pairwise_metric.rb +91 -0
  66. data/lib/rumale/pipeline/pipeline.rb +197 -0
  67. data/lib/rumale/polynomial_model/base_factorization_machine.rb +99 -0
  68. data/lib/rumale/polynomial_model/factorization_machine_classifier.rb +197 -0
  69. data/lib/rumale/polynomial_model/factorization_machine_regressor.rb +131 -0
  70. data/lib/rumale/preprocessing/l2_normalizer.rb +62 -0
  71. data/lib/rumale/preprocessing/label_encoder.rb +94 -0
  72. data/lib/rumale/preprocessing/min_max_scaler.rb +92 -0
  73. data/lib/rumale/preprocessing/one_hot_encoder.rb +98 -0
  74. data/lib/rumale/preprocessing/standard_scaler.rb +86 -0
  75. data/lib/rumale/probabilistic_output.rb +112 -0
  76. data/lib/rumale/tree/base_decision_tree.rb +153 -0
  77. data/lib/rumale/tree/decision_tree_classifier.rb +163 -0
  78. data/lib/rumale/tree/decision_tree_regressor.rb +135 -0
  79. data/lib/rumale/tree/node.rb +70 -0
  80. data/lib/rumale/utils.rb +37 -0
  81. data/lib/rumale/validation.rb +79 -0
  82. data/lib/rumale/values.rb +13 -0
  83. data/lib/rumale/version.rb +6 -0
  84. data/rumale.gemspec +41 -0
  85. metadata +204 -0
@@ -0,0 +1,110 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'csv'
4
+
5
module Rumale
  # Module for loading and saving a dataset file.
  module Dataset
    class << self
      # Load a dataset with the libsvm file format into Numo::NArray.
      #
      # Each line is read as space-separated fields: a label (or a comma-separated list of labels)
      # followed by `index:value` pairs. Blank lines and empty fields (caused by repeated,
      # leading, or trailing spaces) are ignored.
      #
      # @param filename [String] A path to a dataset file.
      # @param zero_based [Boolean] Whether the column index starts from 0 (true) or 1 (false).
      # @param dtype [Numo::NArray] Data type of Numo::NArray for features to be loaded.
      #
      # @return [Array<Numo::NArray>]
      #   Returns array containing the (n_samples x n_features) matrix for feature vectors
      #   and (n_samples) vector for labels or target values.
      def load_libsvm_file(filename, zero_based: false, dtype: Numo::DFloat)
        ftvecs = []
        labels = []
        n_features = 0
        CSV.foreach(filename, col_sep: "\s", headers: false) do |line|
          # A blank line is parsed into a row without any non-nil field; there is nothing to load.
          next if line.compact.empty?

          label, ftvec, max_idx = parse_libsvm_line(line, zero_based)
          labels.push(label)
          ftvecs.push(ftvec)
          # max_idx is a zero-based column index, so the number of columns is max_idx + 1.
          n_features = [n_features, max_idx + 1].max
        end
        [convert_to_matrix(ftvecs, n_features, dtype), Numo::NArray.asarray(labels)]
      end

      # Dump the dataset with the libsvm file format.
      #
      # Only the first min(data.shape[0], labels.shape[0]) samples are written.
      # Feature values equal to zero are omitted from the output.
      #
      # @param data [Numo::NArray] (shape: [n_samples, n_features]) matrix consisting of feature vectors.
      # @param labels [Numo::NArray] (shape: [n_samples]) matrix consisting of labels or target values.
      # @param filename [String] A path to the output libsvm file.
      # @param zero_based [Boolean] Whether the column index starts from 0 (true) or 1 (false).
      def dump_libsvm_file(data, labels, filename, zero_based: false)
        n_samples = [data.shape[0], labels.shape[0]].min
        # A missing second dimension means one label per sample; otherwise each row is a label list.
        single_label = labels.shape[1].nil?
        label_type = detect_dtype(labels)
        value_type = detect_dtype(data)
        File.open(filename, 'w') do |file|
          n_samples.times do |n|
            label = single_label ? labels[n] : labels[n, true].to_a
            file.puts(dump_libsvm_line(label, data[n, true],
                                       label_type, value_type, zero_based))
          end
        end
      end

      private

      # Parse the fields of one libsvm line.
      #
      # @param line [Array<String, nil>] Fields of the line: the label first, then `index:value` pairs.
      #   nil fields are skipped.
      # @param zero_based [Boolean] Whether the column index in the file starts from 0 (true) or 1 (false).
      # @return [Array] The label, the list of [zero-based index, value] pairs,
      #   and the largest zero-based index found (-1 when the line has no features).
      def parse_libsvm_line(line, zero_based)
        # Drop nil fields up front so that one empty field does not cut the line short.
        label_token, *feature_tokens = line.compact
        label = parse_label(label_token)
        adj_idx = zero_based == false ? 1 : 0
        max_idx = -1
        ftvec = feature_tokens.map do |el|
          idx, val = el.split(':')
          idx = idx.to_i - adj_idx
          max_idx = idx if max_idx < idx
          [idx, parse_number(val)]
        end
        [label, ftvec, max_idx]
      end

      # Parse a label field, which holds a single value or a comma-separated list of values.
      #
      # @param label [String] The label field.
      # @return [Integer, Float, Array<Integer, Float>] A scalar for a single label, an array otherwise.
      def parse_label(label)
        lbl_arr = label.split(',').map { |lbl| parse_number(lbl) }
        lbl_arr.size > 1 ? lbl_arr : lbl_arr[0]
      end

      # Convert a numeric token into an Integer when its text is exactly an integer literal,
      # and into a Float otherwise.
      def parse_number(str)
        str.to_i.to_s == str ? str.to_i : str.to_f
      end

      # Build a dense matrix from sparse rows.
      #
      # @param data [Array<Array>] One list of [index, value] pairs per sample.
      # @param n_features [Integer] The number of columns of the matrix.
      # @param dtype [Numo::NArray] The class used to build the matrix via `asarray`.
      def convert_to_matrix(data, n_features, dtype)
        mat = data.map do |ft|
          vec = Array.new(n_features, 0)
          ft.each { |idx, val| vec[idx] = val }
          vec
        end
        dtype.asarray(mat)
      end

      # Choose the format directive used to print elements of the given array:
      # '%d' for integer arrays, '%.10g' for SFloat/DFloat arrays, and '%s' otherwise.
      def detect_dtype(data)
        arr_type_str = Numo::NArray.array_type(data).to_s
        type = '%s'
        type = '%d' if ['Numo::Int8', 'Numo::Int16', 'Numo::Int32', 'Numo::Int64'].include?(arr_type_str)
        type = '%d' if ['Numo::UInt8', 'Numo::UInt16', 'Numo::UInt32', 'Numo::UInt64'].include?(arr_type_str)
        type = '%.10g' if ['Numo::SFloat', 'Numo::DFloat'].include?(arr_type_str)
        type
      end

      # Format one sample as a libsvm line: the label followed by ` index:value` for every non-zero value.
      def dump_libsvm_line(label, ftvec, label_type, value_type, zero_based)
        offset = zero_based == false ? 1 : 0
        tokens = [dump_label(label, label_type.to_s)]
        ftvec.to_a.each_with_index do |val, n|
          tokens.push(format("%d:#{value_type}", n + offset, val)) if val != 0.0
        end
        tokens.join(' ')
      end

      # Format a label, joining the elements with commas when the label is an Array.
      def dump_label(label, label_type_str)
        if label.is_a?(Array)
          label.map { |lbl| format(label_type_str, lbl) }.join(',')
        else
          format(label_type_str, label)
        end
      end
    end
  end
end
@@ -0,0 +1,141 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/utils'
4
+ require 'rumale/base/base_estimator'
5
+ require 'rumale/base/transformer'
6
+
7
module Rumale
  module Decomposition
    # NMF is a class that implements Non-negative Matrix Factorization.
    #
    # @example
    #   decomposer = Rumale::Decomposition::NMF.new(n_components: 2)
    #   representation = decomposer.fit_transform(samples)
    #
    # *Reference*
    # - W. Xu, X. Liu, and Y.Gong, "Document Clustering Based On Non-negative Matrix Factorization," Proc. SIGIR' 03 , pp. 267--273, 2003.
    class NMF
      include Base::BaseEstimator
      include Base::Transformer

      # Returns the factorization matrix.
      # @return [Numo::DFloat] (shape: [n_components, n_features])
      attr_reader :components

      # Return the random generator.
      # @return [Random]
      attr_reader :rng

      # Create a new transformer with NMF.
      #
      # @param n_components [Integer] The number of components.
      # @param max_iter [Integer] The maximum number of iterations.
      # @param tol [Float] The tolerance of termination criterion.
      # @param eps [Float] A small value close to zero to avoid zero division error.
      # @param random_seed [Integer] The seed value used to initialize the random generator.
      def initialize(n_components: 2, max_iter: 500, tol: 1.0e-4, eps: 1.0e-16, random_seed: nil)
        check_params_integer(n_components: n_components, max_iter: max_iter)
        check_params_float(tol: tol, eps: eps)
        check_params_type_or_nil(Integer, random_seed: random_seed)
        check_params_positive(n_components: n_components, max_iter: max_iter, tol: tol, eps: eps)
        @params = {
          n_components: n_components,
          max_iter: max_iter,
          tol: tol,
          eps: eps,
          # Without an explicit seed, fall back to the value returned by Kernel#srand.
          random_seed: random_seed || srand
        }
        @components = nil
        @rng = Random.new(@params[:random_seed])
      end

      # Fit the model with given training data.
      #
      # @overload fit(x) -> NMF
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
      # @return [NMF] The learned transformer itself.
      def fit(x, _y = nil)
        check_sample_array(x)
        tap { partial_fit(x) }
      end

      # Fit the model with training data, and then transform them with the learned model.
      #
      # @overload fit_transform(x) -> Numo::DFloat
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
      # @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data
      def fit_transform(x, _y = nil)
        check_sample_array(x)
        partial_fit(x)
      end

      # Transform the given data with the learned model.
      # The learned components are kept as they are; only the coefficients for x are estimated.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The data to be transformed with the learned model.
      # @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data.
      def transform(x)
        check_sample_array(x)
        partial_fit(x, false)
      end

      # Inverse transform the given transformed data with the learned model.
      #
      # @param z [Numo::DFloat] (shape: [n_samples, n_components]) The data to be restored into original space with the learned model.
      # @return [Numo::DFloat] (shape: [n_samples, n_features]) The restored data.
      def inverse_transform(z)
        check_sample_array(z)
        z.dot(@components)
      end

      # Dump marshal data.
      # @return [Hash] The marshal data.
      def marshal_dump
        { params: @params, components: @components, rng: @rng }
      end

      # Load marshal data.
      # @return [nil]
      def marshal_load(obj)
        @params, @components, @rng = obj.values_at(:params, :components, :rng)
        nil
      end

      private

      # Run the multiplicative update rules on x and return the coefficient matrix.
      # When update_comps is false, @components is left untouched and only the coefficients are updated.
      def partial_fit(x, update_comps = true)
        n_components = @params[:n_components]
        eps = @params[:eps]
        n_samples, n_features = x.shape
        # Random initialization; the components are drawn before the coefficients.
        scale = Math.sqrt(x.mean / n_components)
        @components = Rumale::Utils.rand_uniform([n_components, n_features], @rng) * scale if update_comps
        weights = Rumale::Utils.rand_uniform([n_samples, n_components], @rng) * scale
        @params[:max_iter].times do
          # Multiplicative update of the components.
          if update_comps
            weights_t = weights.transpose
            @components *= weights_t.dot(x) / (weights_t.dot(weights).dot(@components) + eps)
          end
          # Multiplicative update of the coefficients.
          comps_t = @components.transpose
          weights *= x.dot(comps_t) / (weights.dot(@components).dot(comps_t) + eps)
          # Divide each component by its norm and multiply the coefficients by the same norm.
          norm = Numo::NMath.sqrt((@components**2).sum(1)) + eps
          @components /= norm.expand_dims(1) if update_comps
          weights *= norm
          # Stop once the mean squared reconstruction error per sample is below the tolerance.
          reconstruction_err = ((x - weights.dot(@components))**2).sum(1).mean
          break if reconstruction_err < @params[:tol]
        end
        weights
      end
    end
  end
end
@@ -0,0 +1,148 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/base/base_estimator'
4
+ require 'rumale/base/transformer'
5
+
6
module Rumale
  # Module for matrix decomposition algorithms.
  module Decomposition
    # PCA is a class that implements Principal Component Analysis.
    #
    # @example
    #   decomposer = Rumale::Decomposition::PCA.new(n_components: 2)
    #   representation = decomposer.fit_transform(samples)
    #
    # *Reference*
    # - A. Sharma and K K. Paliwal, "Fast principal component analysis using fixed-point algorithm," Pattern Recognition Letters, 28, pp. 1151--1155, 2007.
    class PCA
      include Base::BaseEstimator
      include Base::Transformer

      # Returns the principal components.
      # @return [Numo::DFloat] (shape: [n_components, n_features])
      attr_reader :components

      # Returns the mean vector.
      # @return [Numo::DFloat] (shape: [n_features])
      attr_reader :mean

      # Return the random generator.
      # @return [Random]
      attr_reader :rng

      # Create a new transformer with PCA.
      #
      # @param n_components [Integer] The number of principal components.
      # @param max_iter [Integer] The maximum number of iterations.
      # @param tol [Float] The tolerance of termination criterion.
      # @param random_seed [Integer] The seed value used to initialize the random generator.
      def initialize(n_components: 2, max_iter: 100, tol: 1.0e-4, random_seed: nil)
        check_params_integer(n_components: n_components, max_iter: max_iter)
        check_params_float(tol: tol)
        check_params_type_or_nil(Integer, random_seed: random_seed)
        check_params_positive(n_components: n_components, max_iter: max_iter, tol: tol)
        @params = {}
        @params[:n_components] = n_components
        @params[:max_iter] = max_iter
        @params[:tol] = tol
        @params[:random_seed] = random_seed
        @params[:random_seed] ||= srand
        @components = nil
        @mean = nil
        @rng = Random.new(@params[:random_seed])
      end

      # Fit the model with given training data.
      #
      # Components are estimated one at a time: a random vector is repeatedly multiplied by the
      # covariance matrix, made orthogonal to the components found so far, and normalized, until
      # its dot product with the previous iterate is within tol of 1 or max_iter is reached.
      #
      # @overload fit(x) -> PCA
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
      # @return [PCA] The learned transformer itself.
      def fit(x, _y = nil)
        check_sample_array(x)
        # initialize some variables.
        @components = nil
        n_samples, n_features = x.shape
        # centering.
        @mean = x.mean(0)
        centered_x = x - @mean
        # optimization.
        covariance_mat = centered_x.transpose.dot(centered_x) / (n_samples - 1)
        @params[:n_components].times do
          comp_vec = random_vec(n_features)
          @params[:max_iter].times do
            updated = orthogonalize(covariance_mat.dot(comp_vec))
            break if (updated.dot(comp_vec) - 1).abs < @params[:tol]
            comp_vec = updated
          end
          @components = @components.nil? ? comp_vec : Numo::NArray.vstack([@components, comp_vec])
        end
        self
      end

      # Fit the model with training data, and then transform them with the learned model.
      #
      # @overload fit_transform(x) -> Numo::DFloat
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
      # @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data
      def fit_transform(x, _y = nil)
        check_sample_array(x)
        fit(x).transform(x)
      end

      # Transform the given data with the learned model.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The data to be transformed with the learned model.
      # @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data.
      def transform(x)
        check_sample_array(x)
        (x - @mean).dot(@components.transpose)
      end

      # Inverse transform the given transformed data with the learned model.
      #
      # @param z [Numo::DFloat] (shape: [n_samples, n_components]) The data to be restored into original space with the learned model.
      # @return [Numo::DFloat] (shape: [n_samples, n_features]) The restored data.
      def inverse_transform(z)
        check_sample_array(z)
        # With a single component, @components is a vector; promote it to a one-row matrix.
        c = @components.shape[1].nil? ? @components.expand_dims(0) : @components
        z.dot(c) + @mean
      end

      # Dump marshal data.
      # @return [Hash] The marshal data.
      def marshal_dump
        { params: @params,
          components: @components,
          mean: @mean,
          rng: @rng }
      end

      # Load marshal data.
      # @return [nil]
      def marshal_load(obj)
        @params = obj[:params]
        @components = obj[:components]
        @mean = obj[:mean]
        @rng = obj[:rng]
        nil
      end

      private

      # Remove from pcvec its projections onto the components found so far, then scale it to unit length.
      #
      # @param pcvec [Numo::DFloat] (shape: [n_features]) The candidate component vector.
      # @return [Numo::DFloat] (shape: [n_features]) The orthogonalized and normalized vector.
      def orthogonalize(pcvec)
        unless @components.nil?
          delta = @components.dot(pcvec) * @components.transpose
          # With two or more stored components, delta holds one column per component; add them up.
          delta = delta.sum(1) unless delta.shape[1].nil?
          pcvec -= delta
        end
        # The small constant belongs to the denominator, where it guards against division by zero.
        # Without the parentheses it would be added to every element of the normalized vector.
        pcvec / (Math.sqrt((pcvec**2).sum.abs) + 1.0e-12)
      end

      # Draw a vector of n_features values from the estimator's random generator.
      def random_vec(n_features)
        Numo::DFloat[*(Array.new(n_features) { @rng.rand })]
      end
    end
  end
end
@@ -0,0 +1,196 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/values'
4
+ require 'rumale/utils'
5
+ require 'rumale/base/base_estimator'
6
+ require 'rumale/base/classifier'
7
+ require 'rumale/tree/decision_tree_classifier'
8
+
9
module Rumale
  module Ensemble
    # AdaBoostClassifier is a class that implements AdaBoost (SAMME.R) for classification.
    # This class uses decision tree for a weak learner.
    #
    # @example
    #   estimator =
    #     Rumale::Ensemble::AdaBoostClassifier.new(
    #       n_estimators: 10, criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
    #   estimator.fit(training_samples, training_labels)
    #   results = estimator.predict(testing_samples)
    #
    # *Reference*
    # - J. Zhu, S. Rosset, H. Zou, and T.Hashie, "Multi-class AdaBoost," Technical Report No. 430, Department of Statistics, University of Michigan, 2005.
    class AdaBoostClassifier
      include Base::BaseEstimator
      include Base::Classifier

      # Return the set of estimators.
      # @return [Array<DecisionTreeClassifier>]
      attr_reader :estimators

      # Return the class labels.
      # @return [Numo::Int32] (size: n_classes)
      attr_reader :classes

      # Return the importance for each feature.
      # @return [Numo::DFloat] (size: n_features)
      attr_reader :feature_importances

      # Return the random generator for random selection of feature index.
      # @return [Random]
      attr_reader :rng

      # Create a new classifier with AdaBoost.
      #
      # @param n_estimators [Integer] The maximum number of decision trees trained as weak learners.
      # @param criterion [String] The function to evaluate splitting point. Supported criteria are 'gini' and 'entropy'.
      # @param max_depth [Integer] The maximum depth of the tree.
      #   If nil is given, decision tree grows without concern for depth.
      # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
      #   If nil is given, number of leaves is not limited.
      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
      # @param max_features [Integer] The number of features to consider when searching optimal split point.
      #   If nil is given, split process considers all features.
      # @param random_seed [Integer] The seed value used to initialize the random generator.
      #   It is used to randomly determine the order of features when deciding splitting point.
      def initialize(n_estimators: 50,
                     criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
                     max_features: nil, random_seed: nil)
        check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
                                          max_features: max_features, random_seed: random_seed)
        check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
        check_params_string(criterion: criterion)
        check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
                              max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
                              max_features: max_features)
        @params = {}
        @params[:n_estimators] = n_estimators
        @params[:criterion] = criterion
        @params[:max_depth] = max_depth
        @params[:max_leaf_nodes] = max_leaf_nodes
        @params[:min_samples_leaf] = min_samples_leaf
        @params[:max_features] = max_features
        @params[:random_seed] = random_seed
        @params[:random_seed] ||= srand
        @estimators = nil
        @classes = nil
        @feature_importances = nil
        @rng = Random.new(@params[:random_seed])
      end

      # Fit the model with given training data.
      #
      # Boosting stops early when a resampled training set does not contain every class,
      # when a tree classifies all training samples correctly, or when the observation weights sum to zero.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
      # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
      # @return [AdaBoostClassifier] The learned classifier itself.
      def fit(x, y) # rubocop:disable Metrics/AbcSize
        check_sample_array(x)
        check_label_array(y)
        check_sample_label_size(x, y)
        ## Initialize some variables.
        n_samples, n_features = x.shape
        @estimators = []
        @feature_importances = Numo::DFloat.zeros(n_features)
        # Clamp max_features into [1, n_features]; nil means all features.
        @params[:max_features] = n_features unless @params[:max_features].is_a?(Integer)
        @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
        @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
        n_classes = @classes.shape[0]
        ## Boosting.
        classes_arr = @classes.to_a
        # Label coding: 1 for the true class and -1 / (n_classes - 1) for the others.
        y_codes = Numo::DFloat.zeros(n_samples, n_classes) - 1.fdiv(n_classes - 1)
        n_samples.times { |n| y_codes[n, classes_arr.index(y[n])] = 1.0 }
        observation_weights = Numo::DFloat.zeros(n_samples) + 1.fdiv(n_samples)
        @params[:n_estimators].times do |_t|
          # Fit classifier.
          ids = Rumale::Utils.choice_ids(n_samples, observation_weights, @rng)
          break if y[ids].to_a.uniq.size != n_classes
          tree = Tree::DecisionTreeClassifier.new(
            criterion: @params[:criterion], max_depth: @params[:max_depth],
            max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
            max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
          )
          tree.fit(x[ids, true], y[ids])
          # Calculate estimator error.
          proba = tree.predict_proba(x).clip(1.0e-15, nil)
          predicted = Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[proba[n, true].max_index] })
          inds = predicted.ne(y)
          error = (observation_weights * inds).sum / observation_weights.sum
          # Store model.
          @estimators.push(tree)
          @feature_importances += tree.feature_importances
          break if error.zero?
          # Update observation weights.
          log_proba = Numo::NMath.log(proba)
          observation_weights *= Numo::NMath.exp(-1.0 * (n_classes - 1).fdiv(n_classes) * (y_codes * log_proba).sum(1))
          observation_weights = observation_weights.clip(1.0e-15, nil)
          sum_observation_weights = observation_weights.sum
          break if sum_observation_weights.zero?
          observation_weights /= sum_observation_weights
        end
        # Normalize the accumulated importances; skip when the total is zero
        # (e.g. no tree was stored), which would otherwise turn every entry into NaN.
        sum_importances = @feature_importances.sum
        @feature_importances /= sum_importances unless sum_importances.zero?
        self
      end

      # Calculate confidence scores for samples.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence score per sample.
      def decision_function(x)
        check_sample_array(x)
        n_samples, = x.shape
        n_classes = @classes.size
        sum_probs = Numo::DFloat.zeros(n_samples, n_classes)
        @estimators.each do |tree|
          log_proba = Numo::NMath.log(tree.predict_proba(x).clip(1.0e-15, nil))
          sum_probs += (n_classes - 1) * (log_proba - 1.fdiv(n_classes) * Numo::DFloat[log_proba.sum(1)].transpose)
        end
        sum_probs /= @estimators.size
      end

      # Predict class labels for samples.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
      # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
      def predict(x)
        check_sample_array(x)
        n_samples, = x.shape
        probs = decision_function(x)
        Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[probs[n, true].max_index] })
      end

      # Predict probability for samples.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
      def predict_proba(x)
        check_sample_array(x)
        n_classes = @classes.size
        probs = Numo::NMath.exp(1.fdiv(n_classes - 1) * decision_function(x))
        # Normalize each row so that the class probabilities of a sample sum to one.
        sum_probs = probs.sum(1)
        probs /= Numo::DFloat[sum_probs].transpose
        probs
      end

      # Dump marshal data.
      # @return [Hash] The marshal data about AdaBoostClassifier.
      def marshal_dump
        { params: @params,
          estimators: @estimators,
          classes: @classes,
          feature_importances: @feature_importances,
          rng: @rng }
      end

      # Load marshal data.
      # @return [nil]
      def marshal_load(obj)
        @params = obj[:params]
        @estimators = obj[:estimators]
        @classes = obj[:classes]
        @feature_importances = obj[:feature_importances]
        @rng = obj[:rng]
        nil
      end
    end
  end
end