rumale 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. checksums.yaml +7 -0
  2. data/.coveralls.yml +1 -0
  3. data/.gitignore +20 -0
  4. data/.rspec +3 -0
  5. data/.rubocop.yml +47 -0
  6. data/.rubocop_todo.yml +58 -0
  7. data/.travis.yml +13 -0
  8. data/CHANGELOG.md +2 -0
  9. data/CODE_OF_CONDUCT.md +74 -0
  10. data/Gemfile +4 -0
  11. data/LICENSE.txt +23 -0
  12. data/README.md +175 -0
  13. data/Rakefile +6 -0
  14. data/bin/console +14 -0
  15. data/bin/setup +8 -0
  16. data/lib/rumale.rb +70 -0
  17. data/lib/rumale/base/base_estimator.rb +13 -0
  18. data/lib/rumale/base/classifier.rb +36 -0
  19. data/lib/rumale/base/cluster_analyzer.rb +31 -0
  20. data/lib/rumale/base/evaluator.rb +17 -0
  21. data/lib/rumale/base/regressor.rb +36 -0
  22. data/lib/rumale/base/splitter.rb +21 -0
  23. data/lib/rumale/base/transformer.rb +22 -0
  24. data/lib/rumale/clustering/dbscan.rb +125 -0
  25. data/lib/rumale/clustering/k_means.rb +138 -0
  26. data/lib/rumale/dataset.rb +110 -0
  27. data/lib/rumale/decomposition/nmf.rb +141 -0
  28. data/lib/rumale/decomposition/pca.rb +148 -0
  29. data/lib/rumale/ensemble/ada_boost_classifier.rb +196 -0
  30. data/lib/rumale/ensemble/ada_boost_regressor.rb +178 -0
  31. data/lib/rumale/ensemble/random_forest_classifier.rb +180 -0
  32. data/lib/rumale/ensemble/random_forest_regressor.rb +141 -0
  33. data/lib/rumale/evaluation_measure/accuracy.rb +29 -0
  34. data/lib/rumale/evaluation_measure/f_score.rb +50 -0
  35. data/lib/rumale/evaluation_measure/log_loss.rb +45 -0
  36. data/lib/rumale/evaluation_measure/mean_absolute_error.rb +29 -0
  37. data/lib/rumale/evaluation_measure/mean_squared_error.rb +29 -0
  38. data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +62 -0
  39. data/lib/rumale/evaluation_measure/precision.rb +50 -0
  40. data/lib/rumale/evaluation_measure/precision_recall.rb +91 -0
  41. data/lib/rumale/evaluation_measure/purity.rb +40 -0
  42. data/lib/rumale/evaluation_measure/r2_score.rb +43 -0
  43. data/lib/rumale/evaluation_measure/recall.rb +50 -0
  44. data/lib/rumale/kernel_approximation/rbf.rb +121 -0
  45. data/lib/rumale/kernel_machine/kernel_svc.rb +193 -0
  46. data/lib/rumale/linear_model/base_linear_model.rb +89 -0
  47. data/lib/rumale/linear_model/lasso.rb +136 -0
  48. data/lib/rumale/linear_model/linear_regression.rb +110 -0
  49. data/lib/rumale/linear_model/logistic_regression.rb +159 -0
  50. data/lib/rumale/linear_model/ridge.rb +110 -0
  51. data/lib/rumale/linear_model/svc.rb +183 -0
  52. data/lib/rumale/linear_model/svr.rb +122 -0
  53. data/lib/rumale/model_selection/cross_validation.rb +123 -0
  54. data/lib/rumale/model_selection/grid_search_cv.rb +247 -0
  55. data/lib/rumale/model_selection/k_fold.rb +76 -0
  56. data/lib/rumale/model_selection/stratified_k_fold.rb +94 -0
  57. data/lib/rumale/multiclass/one_vs_rest_classifier.rb +100 -0
  58. data/lib/rumale/naive_bayes/naive_bayes.rb +315 -0
  59. data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +111 -0
  60. data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +93 -0
  61. data/lib/rumale/optimizer/nadam.rb +90 -0
  62. data/lib/rumale/optimizer/rmsprop.rb +69 -0
  63. data/lib/rumale/optimizer/sgd.rb +65 -0
  64. data/lib/rumale/optimizer/yellow_fin.rb +144 -0
  65. data/lib/rumale/pairwise_metric.rb +91 -0
  66. data/lib/rumale/pipeline/pipeline.rb +197 -0
  67. data/lib/rumale/polynomial_model/base_factorization_machine.rb +99 -0
  68. data/lib/rumale/polynomial_model/factorization_machine_classifier.rb +197 -0
  69. data/lib/rumale/polynomial_model/factorization_machine_regressor.rb +131 -0
  70. data/lib/rumale/preprocessing/l2_normalizer.rb +62 -0
  71. data/lib/rumale/preprocessing/label_encoder.rb +94 -0
  72. data/lib/rumale/preprocessing/min_max_scaler.rb +92 -0
  73. data/lib/rumale/preprocessing/one_hot_encoder.rb +98 -0
  74. data/lib/rumale/preprocessing/standard_scaler.rb +86 -0
  75. data/lib/rumale/probabilistic_output.rb +112 -0
  76. data/lib/rumale/tree/base_decision_tree.rb +153 -0
  77. data/lib/rumale/tree/decision_tree_classifier.rb +163 -0
  78. data/lib/rumale/tree/decision_tree_regressor.rb +135 -0
  79. data/lib/rumale/tree/node.rb +70 -0
  80. data/lib/rumale/utils.rb +37 -0
  81. data/lib/rumale/validation.rb +79 -0
  82. data/lib/rumale/values.rb +13 -0
  83. data/lib/rumale/version.rb +6 -0
  84. data/rumale.gemspec +41 -0
  85. metadata +204 -0
data/lib/rumale/dataset.rb
@@ -0,0 +1,110 @@
+ # frozen_string_literal: true
+
+ require 'csv'
+
+ module Rumale
+   # Module for loading and saving a dataset file.
+   module Dataset
+     class << self
+       # Load a dataset with the libsvm file format into Numo::NArray.
+       #
+       # @param filename [String] A path to a dataset file.
+       # @param zero_based [Boolean] Whether the column index starts from 0 (true) or 1 (false).
+       # @param dtype [Numo::NArray] Data type of Numo::NArray for features to be loaded.
+       #
+       # @return [Array<Numo::NArray>]
+       #   Returns array containing the (n_samples x n_features) matrix for feature vectors
+       #   and (n_samples) vector for labels or target values.
+       def load_libsvm_file(filename, zero_based: false, dtype: Numo::DFloat)
+         ftvecs = []
+         labels = []
+         n_features = 0
+         CSV.foreach(filename, col_sep: "\s", headers: false) do |line|
+           label, ftvec, max_idx = parse_libsvm_line(line, zero_based)
+           labels.push(label)
+           ftvecs.push(ftvec)
+           n_features = max_idx if n_features < max_idx
+         end
+         [convert_to_matrix(ftvecs, n_features, dtype), Numo::NArray.asarray(labels)]
+       end
+
+       # Dump the dataset with the libsvm file format.
+       #
+       # @param data [Numo::NArray] (shape: [n_samples, n_features]) matrix consisting of feature vectors.
+       # @param labels [Numo::NArray] (shape: [n_samples]) matrix consisting of labels or target values.
+       # @param filename [String] A path to the output libsvm file.
+       # @param zero_based [Boolean] Whether the column index starts from 0 (true) or 1 (false).
+       def dump_libsvm_file(data, labels, filename, zero_based: false)
+         n_samples = [data.shape[0], labels.shape[0]].min
+         single_label = labels.shape[1].nil?
+         label_type = detect_dtype(labels)
+         value_type = detect_dtype(data)
+         File.open(filename, 'w') do |file|
+           n_samples.times do |n|
+             label = single_label ? labels[n] : labels[n, true].to_a
+             file.puts(dump_libsvm_line(label, data[n, true],
+                                        label_type, value_type, zero_based))
+           end
+         end
+       end
+
+       private
+
+       def parse_libsvm_line(line, zero_based)
+         label = parse_label(line.shift)
+         adj_idx = zero_based == false ? 1 : 0
+         max_idx = -1
+         ftvec = []
+         while (el = line.shift)
+           idx, val = el.split(':')
+           idx = idx.to_i - adj_idx
+           val = val.to_i.to_s == val ? val.to_i : val.to_f
+           max_idx = idx if max_idx < idx
+           ftvec.push([idx, val])
+         end
+         [label, ftvec, max_idx]
+       end
+
+       def parse_label(label)
+         lbl_arr = label.split(',').map { |lbl| lbl.to_i.to_s == lbl ? lbl.to_i : lbl.to_f }
+         lbl_arr.size > 1 ? lbl_arr : lbl_arr[0]
+       end
+
+       def convert_to_matrix(data, n_features, dtype)
+         mat = []
+         data.each do |ft|
+           vec = Array.new(n_features) { 0 }
+           ft.each { |el| vec[el[0]] = el[1] }
+           mat.push(vec)
+         end
+         dtype.asarray(mat)
+       end
+
+       def detect_dtype(data)
+         arr_type_str = Numo::NArray.array_type(data).to_s
+         type = '%s'
+         type = '%d' if ['Numo::Int8', 'Numo::Int16', 'Numo::Int32', 'Numo::Int64'].include?(arr_type_str)
+         type = '%d' if ['Numo::UInt8', 'Numo::UInt16', 'Numo::UInt32', 'Numo::UInt64'].include?(arr_type_str)
+         type = '%.10g' if ['Numo::SFloat', 'Numo::DFloat'].include?(arr_type_str)
+         type
+       end
+
+       def dump_libsvm_line(label, ftvec, label_type, value_type, zero_based)
+         line = dump_label(label, label_type.to_s)
+         ftvec.to_a.each_with_index do |val, n|
+           idx = n + (zero_based == false ? 1 : 0)
+           line += format(" %d:#{value_type}", idx, val) if val != 0.0
+         end
+         line
+       end
+
+       def dump_label(label, label_type_str)
+         if label.is_a?(Array)
+           label.map { |lbl| format(label_type_str, lbl) }.join(',')
+         else
+           format(label_type_str, label)
+         end
+       end
+     end
+   end
+ end
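
For reference, a minimal round trip through the two public methods above might look like the following; the file name and toy values are illustrative, not taken from the diff:

```ruby
require 'numo/narray'
require 'rumale'

# Hypothetical toy data: 3 samples, 2 features, integer labels.
x = Numo::DFloat[[1.0, 2.0], [0.0, 3.5], [4.2, 0.0]]
y = Numo::Int32[1, -1, 1]

# Write the data in libsvm format (1-based feature indices by default),
# then load it back. Zero-valued features are skipped on dump and come
# back as zeros in the dense matrix, so the round trip preserves x.
Rumale::Dataset.dump_libsvm_file(x, y, 'sample.t')
samples, labels = Rumale::Dataset.load_libsvm_file('sample.t')
```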
data/lib/rumale/decomposition/nmf.rb
@@ -0,0 +1,141 @@
+ # frozen_string_literal: true
+
+ require 'rumale/utils'
+ require 'rumale/base/base_estimator'
+ require 'rumale/base/transformer'
+
+ module Rumale
+   module Decomposition
+     # NMF is a class that implements Non-negative Matrix Factorization.
+     #
+     # @example
+     #   decomposer = Rumale::Decomposition::NMF.new(n_components: 2)
+     #   representation = decomposer.fit_transform(samples)
+     #
+     # *Reference*
+     # - W. Xu, X. Liu, and Y. Gong, "Document Clustering Based On Non-negative Matrix Factorization," Proc. SIGIR '03, pp. 267--273, 2003.
+     class NMF
+       include Base::BaseEstimator
+       include Base::Transformer
+
+       # Returns the factorization matrix.
+       # @return [Numo::DFloat] (shape: [n_components, n_features])
+       attr_reader :components
+
+       # Return the random generator.
+       # @return [Random]
+       attr_reader :rng
+
+       # Create a new transformer with NMF.
+       #
+       # @param n_components [Integer] The number of components.
+       # @param max_iter [Integer] The maximum number of iterations.
+       # @param tol [Float] The tolerance of termination criterion.
+       # @param eps [Float] A small value close to zero to avoid zero division error.
+       # @param random_seed [Integer] The seed value used to initialize the random generator.
+       def initialize(n_components: 2, max_iter: 500, tol: 1.0e-4, eps: 1.0e-16, random_seed: nil)
+         check_params_integer(n_components: n_components, max_iter: max_iter)
+         check_params_float(tol: tol, eps: eps)
+         check_params_type_or_nil(Integer, random_seed: random_seed)
+         check_params_positive(n_components: n_components, max_iter: max_iter, tol: tol, eps: eps)
+         @params = {}
+         @params[:n_components] = n_components
+         @params[:max_iter] = max_iter
+         @params[:tol] = tol
+         @params[:eps] = eps
+         @params[:random_seed] = random_seed
+         @params[:random_seed] ||= srand
+         @components = nil
+         @rng = Random.new(@params[:random_seed])
+       end
+
+       # Fit the model with given training data.
+       #
+       # @overload fit(x) -> NMF
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @return [NMF] The learned transformer itself.
+       def fit(x, _y = nil)
+         check_sample_array(x)
+         partial_fit(x)
+         self
+       end
+
+       # Fit the model with training data, and then transform them with the learned model.
+       #
+       # @overload fit_transform(x) -> Numo::DFloat
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data.
+       def fit_transform(x, _y = nil)
+         check_sample_array(x)
+         partial_fit(x)
+       end
+
+       # Transform the given data with the learned model.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The data to be transformed with the learned model.
+       # @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data.
+       def transform(x)
+         check_sample_array(x)
+         partial_fit(x, false)
+       end
+
+       # Inverse transform the given transformed data with the learned model.
+       #
+       # @param z [Numo::DFloat] (shape: [n_samples, n_components]) The data to be restored into original space with the learned model.
+       # @return [Numo::DFloat] (shape: [n_samples, n_features]) The restored data.
+       def inverse_transform(z)
+         check_sample_array(z)
+         z.dot(@components)
+       end
+
+       # Dump marshal data.
+       # @return [Hash] The marshal data.
+       def marshal_dump
+         { params: @params,
+           components: @components,
+           rng: @rng }
+       end
+
+       # Load marshal data.
+       # @return [nil]
+       def marshal_load(obj)
+         @params = obj[:params]
+         @components = obj[:components]
+         @rng = obj[:rng]
+         nil
+       end
+
+       private
+
+       def partial_fit(x, update_comps = true)
+         # initialize some variables.
+         n_samples, n_features = x.shape
+         scale = Math.sqrt(x.mean / @params[:n_components])
+         @components = Rumale::Utils.rand_uniform([@params[:n_components], n_features], @rng) * scale if update_comps
+         coefficients = Rumale::Utils.rand_uniform([n_samples, @params[:n_components]], @rng) * scale
+         # optimization.
+         @params[:max_iter].times do
+           # update
+           if update_comps
+             nume = coefficients.transpose.dot(x)
+             deno = coefficients.transpose.dot(coefficients).dot(@components) + @params[:eps]
+             @components *= (nume / deno)
+           end
+           nume = x.dot(@components.transpose)
+           deno = coefficients.dot(@components).dot(@components.transpose) + @params[:eps]
+           coefficients *= (nume / deno)
+           # normalize
+           norm = Numo::NMath.sqrt((@components**2).sum(1)) + @params[:eps]
+           @components /= norm.expand_dims(1) if update_comps
+           coefficients *= norm
+           # check convergence
+           err = ((x - coefficients.dot(@components))**2).sum(1).mean
+           break if err < @params[:tol]
+         end
+         coefficients
+       end
+     end
+   end
+ end
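
A short usage sketch of the NMF class above. The toy matrix is made up for illustration, and NMF assumes non-negative input:

```ruby
require 'numo/narray'
require 'rumale'

# Non-negative toy data: 4 samples, 3 features.
x = Numo::DFloat[[1.0, 0.5, 0.0], [0.2, 1.0, 0.3], [0.0, 0.4, 1.2], [0.9, 0.1, 0.7]]

decomposer = Rumale::Decomposition::NMF.new(n_components: 2, random_seed: 1)
z = decomposer.fit_transform(x)          # (4 x 2) coefficient matrix
x_rec = decomposer.inverse_transform(z)  # (4 x 3) approximation of x
err = ((x - x_rec)**2).mean              # mean squared reconstruction error
```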
data/lib/rumale/decomposition/pca.rb
@@ -0,0 +1,148 @@
+ # frozen_string_literal: true
+
+ require 'rumale/base/base_estimator'
+ require 'rumale/base/transformer'
+
+ module Rumale
+   # Module for matrix decomposition algorithms.
+   module Decomposition
+     # PCA is a class that implements Principal Component Analysis.
+     #
+     # @example
+     #   decomposer = Rumale::Decomposition::PCA.new(n_components: 2)
+     #   representation = decomposer.fit_transform(samples)
+     #
+     # *Reference*
+     # - A. Sharma and K. K. Paliwal, "Fast principal component analysis using fixed-point algorithm," Pattern Recognition Letters, 28, pp. 1151--1155, 2007.
+     class PCA
+       include Base::BaseEstimator
+       include Base::Transformer
+
+       # Returns the principal components.
+       # @return [Numo::DFloat] (shape: [n_components, n_features])
+       attr_reader :components
+
+       # Returns the mean vector.
+       # @return [Numo::DFloat] (shape: [n_features])
+       attr_reader :mean
+
+       # Return the random generator.
+       # @return [Random]
+       attr_reader :rng
+
+       # Create a new transformer with PCA.
+       #
+       # @param n_components [Integer] The number of principal components.
+       # @param max_iter [Integer] The maximum number of iterations.
+       # @param tol [Float] The tolerance of termination criterion.
+       # @param random_seed [Integer] The seed value used to initialize the random generator.
+       def initialize(n_components: 2, max_iter: 100, tol: 1.0e-4, random_seed: nil)
+         check_params_integer(n_components: n_components, max_iter: max_iter)
+         check_params_float(tol: tol)
+         check_params_type_or_nil(Integer, random_seed: random_seed)
+         check_params_positive(n_components: n_components, max_iter: max_iter, tol: tol)
+         @params = {}
+         @params[:n_components] = n_components
+         @params[:max_iter] = max_iter
+         @params[:tol] = tol
+         @params[:random_seed] = random_seed
+         @params[:random_seed] ||= srand
+         @components = nil
+         @mean = nil
+         @rng = Random.new(@params[:random_seed])
+       end
+
+       # Fit the model with given training data.
+       #
+       # @overload fit(x) -> PCA
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @return [PCA] The learned transformer itself.
+       def fit(x, _y = nil)
+         check_sample_array(x)
+         # initialize some variables.
+         @components = nil
+         n_samples, n_features = x.shape
+         # centering.
+         @mean = x.mean(0)
+         centered_x = x - @mean
+         # optimization.
+         covariance_mat = centered_x.transpose.dot(centered_x) / (n_samples - 1)
+         @params[:n_components].times do
+           comp_vec = random_vec(n_features)
+           @params[:max_iter].times do
+             updated = orthogonalize(covariance_mat.dot(comp_vec))
+             break if (updated.dot(comp_vec) - 1).abs < @params[:tol]
+             comp_vec = updated
+           end
+           @components = @components.nil? ? comp_vec : Numo::NArray.vstack([@components, comp_vec])
+         end
+         self
+       end
+
+       # Fit the model with training data, and then transform them with the learned model.
+       #
+       # @overload fit_transform(x) -> Numo::DFloat
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data.
+       def fit_transform(x, _y = nil)
+         check_sample_array(x)
+         fit(x).transform(x)
+       end
+
+       # Transform the given data with the learned model.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The data to be transformed with the learned model.
+       # @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data.
+       def transform(x)
+         check_sample_array(x)
+         (x - @mean).dot(@components.transpose)
+       end
+
+       # Inverse transform the given transformed data with the learned model.
+       #
+       # @param z [Numo::DFloat] (shape: [n_samples, n_components]) The data to be restored into original space with the learned model.
+       # @return [Numo::DFloat] (shape: [n_samples, n_features]) The restored data.
+       def inverse_transform(z)
+         check_sample_array(z)
+         c = @components.shape[1].nil? ? @components.expand_dims(0) : @components
+         z.dot(c) + @mean
+       end
+
+       # Dump marshal data.
+       # @return [Hash] The marshal data.
+       def marshal_dump
+         { params: @params,
+           components: @components,
+           mean: @mean,
+           rng: @rng }
+       end
+
+       # Load marshal data.
+       # @return [nil]
+       def marshal_load(obj)
+         @params = obj[:params]
+         @components = obj[:components]
+         @mean = obj[:mean]
+         @rng = obj[:rng]
+         nil
+       end
+
+       private
+
+       def orthogonalize(pcvec)
+         unless @components.nil?
+           delta = @components.dot(pcvec) * @components.transpose
+           delta = delta.sum(1) unless delta.shape[1].nil?
+           pcvec -= delta
+         end
+         pcvec / (Math.sqrt((pcvec**2).sum.abs) + 1.0e-12)
+       end
+
+       def random_vec(n_features)
+         Numo::DFloat[*(Array.new(n_features) { @rng.rand })]
+       end
+     end
+   end
+ end
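
A usage sketch for the PCA class above, with illustrative random data: fit_transform projects the samples onto the components learned by the fixed-point iteration, and inverse_transform maps them back to feature space.

```ruby
require 'numo/narray'
require 'rumale'

# Illustrative data: 100 samples, 5 features, uniform in [0, 1).
x = Numo::DFloat.new(100, 5).rand

decomposer = Rumale::Decomposition::PCA.new(n_components: 2, random_seed: 1)
z = decomposer.fit_transform(x)          # (100 x 2) projection
x_rec = decomposer.inverse_transform(z)  # (100 x 5) reconstruction from 2 components
```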
data/lib/rumale/ensemble/ada_boost_classifier.rb
@@ -0,0 +1,196 @@
+ # frozen_string_literal: true
+
+ require 'rumale/values'
+ require 'rumale/utils'
+ require 'rumale/base/base_estimator'
+ require 'rumale/base/classifier'
+ require 'rumale/tree/decision_tree_classifier'
+
+ module Rumale
+   module Ensemble
+     # AdaBoostClassifier is a class that implements AdaBoost (SAMME.R) for classification.
+     # This class uses decision tree for a weak learner.
+     #
+     # @example
+     #   estimator =
+     #     Rumale::Ensemble::AdaBoostClassifier.new(
+     #       n_estimators: 10, criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+     #   estimator.fit(training_samples, training_labels)
+     #   results = estimator.predict(testing_samples)
+     #
+     # *Reference*
+     # - J. Zhu, S. Rosset, H. Zou, and T. Hastie, "Multi-class AdaBoost," Technical Report No. 430, Department of Statistics, University of Michigan, 2005.
+     class AdaBoostClassifier
+       include Base::BaseEstimator
+       include Base::Classifier
+
+       # Return the set of estimators.
+       # @return [Array<DecisionTreeClassifier>]
+       attr_reader :estimators
+
+       # Return the class labels.
+       # @return [Numo::Int32] (size: n_classes)
+       attr_reader :classes
+
+       # Return the importance for each feature.
+       # @return [Numo::DFloat] (size: n_features)
+       attr_reader :feature_importances
+
+       # Return the random generator for random selection of feature index.
+       # @return [Random]
+       attr_reader :rng
+
+       # Create a new classifier with AdaBoost.
+       #
+       # @param n_estimators [Integer] The number of decision trees for constructing the ensemble.
+       # @param criterion [String] The function to evaluate splitting point. Supported criteria are 'gini' and 'entropy'.
+       # @param max_depth [Integer] The maximum depth of the tree.
+       #   If nil is given, decision tree grows without concern for depth.
+       # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
+       #   If nil is given, number of leaves is not limited.
+       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+       # @param max_features [Integer] The number of features to consider when searching optimal split point.
+       #   If nil is given, split process considers all features.
+       # @param random_seed [Integer] The seed value used to initialize the random generator.
+       #   It is used to randomly determine the order of features when deciding splitting point.
+       def initialize(n_estimators: 50,
+                      criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                      max_features: nil, random_seed: nil)
+         check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                           max_features: max_features, random_seed: random_seed)
+         check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
+         check_params_string(criterion: criterion)
+         check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
+                               max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
+                               max_features: max_features)
+         @params = {}
+         @params[:n_estimators] = n_estimators
+         @params[:criterion] = criterion
+         @params[:max_depth] = max_depth
+         @params[:max_leaf_nodes] = max_leaf_nodes
+         @params[:min_samples_leaf] = min_samples_leaf
+         @params[:max_features] = max_features
+         @params[:random_seed] = random_seed
+         @params[:random_seed] ||= srand
+         @estimators = nil
+         @classes = nil
+         @feature_importances = nil
+         @rng = Random.new(@params[:random_seed])
+       end
+
+       # Fit the model with given training data.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
+       # @return [AdaBoostClassifier] The learned classifier itself.
+       def fit(x, y) # rubocop:disable Metrics/AbcSize
+         check_sample_array(x)
+         check_label_array(y)
+         check_sample_label_size(x, y)
+         ## Initialize some variables.
+         n_samples, n_features = x.shape
+         @estimators = []
+         @feature_importances = Numo::DFloat.zeros(n_features)
+         @params[:max_features] = n_features unless @params[:max_features].is_a?(Integer)
+         @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+         @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
+         n_classes = @classes.shape[0]
+         ## Boosting.
+         classes_arr = @classes.to_a
+         y_codes = Numo::DFloat.zeros(n_samples, n_classes) - 1.fdiv(n_classes - 1)
+         n_samples.times { |n| y_codes[n, classes_arr.index(y[n])] = 1.0 }
+         observation_weights = Numo::DFloat.zeros(n_samples) + 1.fdiv(n_samples)
+         @params[:n_estimators].times do |_t|
+           # Fit classifier.
+           ids = Rumale::Utils.choice_ids(n_samples, observation_weights, @rng)
+           break if y[ids].to_a.uniq.size != n_classes
+           tree = Tree::DecisionTreeClassifier.new(
+             criterion: @params[:criterion], max_depth: @params[:max_depth],
+             max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+             max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
+           )
+           tree.fit(x[ids, true], y[ids])
+           # Calculate estimator error.
+           proba = tree.predict_proba(x).clip(1.0e-15, nil)
+           p = Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[proba[n, true].max_index] })
+           inds = p.ne(y)
+           error = (observation_weights * inds).sum / observation_weights.sum
+           # Store model.
+           @estimators.push(tree)
+           @feature_importances += tree.feature_importances
+           break if error.zero?
+           # Update observation weights.
+           log_proba = Numo::NMath.log(proba)
+           observation_weights *= Numo::NMath.exp(-1.0 * (n_classes - 1).fdiv(n_classes) * (y_codes * log_proba).sum(1))
+           observation_weights = observation_weights.clip(1.0e-15, nil)
+           sum_observation_weights = observation_weights.sum
+           break if sum_observation_weights.zero?
+           observation_weights /= sum_observation_weights
+         end
+         @feature_importances /= @feature_importances.sum
+         self
+       end
+
+       # Calculate confidence scores for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
+       # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence score per sample.
+       def decision_function(x)
+         check_sample_array(x)
+         n_samples, = x.shape
+         n_classes = @classes.size
+         sum_probs = Numo::DFloat.zeros(n_samples, n_classes)
+         @estimators.each do |tree|
+           log_proba = Numo::NMath.log(tree.predict_proba(x).clip(1.0e-15, nil))
+           sum_probs += (n_classes - 1) * (log_proba - 1.fdiv(n_classes) * Numo::DFloat[log_proba.sum(1)].transpose)
+         end
+         sum_probs /= @estimators.size
+       end
+
+       # Predict class labels for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+       # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
+       def predict(x)
+         check_sample_array(x)
+         n_samples, = x.shape
+         probs = decision_function(x)
+         Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[probs[n, true].max_index] })
+       end
+
+       # Predict probability for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
+       # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
+       def predict_proba(x)
+         check_sample_array(x)
+         n_classes = @classes.size
+         probs = Numo::NMath.exp(1.fdiv(n_classes - 1) * decision_function(x))
+         sum_probs = probs.sum(1)
+         probs /= Numo::DFloat[sum_probs].transpose
+         probs
+       end
+
+       # Dump marshal data.
+       # @return [Hash] The marshal data about AdaBoostClassifier.
+       def marshal_dump
+         { params: @params,
+           estimators: @estimators,
+           classes: @classes,
+           feature_importances: @feature_importances,
+           rng: @rng }
+       end
+
+       # Load marshal data.
+       # @return [nil]
+       def marshal_load(obj)
+         @params = obj[:params]
+         @estimators = obj[:estimators]
+         @classes = obj[:classes]
+         @feature_importances = obj[:feature_importances]
+         @rng = obj[:rng]
+         nil
+       end
+     end
+   end
+ end
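
Finally, a usage sketch for the AdaBoost classifier above. The data and labels are illustrative only; training accuracy is computed directly from the predictions rather than through any helper not shown in this hunk.

```ruby
require 'numo/narray'
require 'rumale'

# Illustrative two-class problem: 20 samples, 4 features.
x = Numo::DFloat.new(20, 4).rand
y = Numo::Int32.asarray(Array.new(20) { |n| n < 10 ? 0 : 1 })

estimator = Rumale::Ensemble::AdaBoostClassifier.new(
  n_estimators: 10, max_depth: 3, random_seed: 1
)
estimator.fit(x, y)

predicted = estimator.predict(x)              # Numo::Int32 labels
proba = estimator.predict_proba(x)            # (20 x 2) class probabilities
accuracy = predicted.eq(y).count.fdiv(y.size) # fraction of correct predictions
```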