rumale 0.23.3 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE.txt +5 -1
  3. data/README.md +3 -288
  4. data/lib/rumale/version.rb +1 -1
  5. data/lib/rumale.rb +20 -131
  6. metadata +252 -150
  7. data/CHANGELOG.md +0 -643
  8. data/CODE_OF_CONDUCT.md +0 -74
  9. data/ext/rumale/extconf.rb +0 -37
  10. data/ext/rumale/rumaleext.c +0 -545
  11. data/ext/rumale/rumaleext.h +0 -12
  12. data/lib/rumale/base/base_estimator.rb +0 -49
  13. data/lib/rumale/base/classifier.rb +0 -36
  14. data/lib/rumale/base/cluster_analyzer.rb +0 -31
  15. data/lib/rumale/base/evaluator.rb +0 -17
  16. data/lib/rumale/base/regressor.rb +0 -36
  17. data/lib/rumale/base/splitter.rb +0 -21
  18. data/lib/rumale/base/transformer.rb +0 -22
  19. data/lib/rumale/clustering/dbscan.rb +0 -123
  20. data/lib/rumale/clustering/gaussian_mixture.rb +0 -218
  21. data/lib/rumale/clustering/hdbscan.rb +0 -291
  22. data/lib/rumale/clustering/k_means.rb +0 -122
  23. data/lib/rumale/clustering/k_medoids.rb +0 -141
  24. data/lib/rumale/clustering/mini_batch_k_means.rb +0 -139
  25. data/lib/rumale/clustering/power_iteration.rb +0 -127
  26. data/lib/rumale/clustering/single_linkage.rb +0 -203
  27. data/lib/rumale/clustering/snn.rb +0 -76
  28. data/lib/rumale/clustering/spectral_clustering.rb +0 -115
  29. data/lib/rumale/dataset.rb +0 -246
  30. data/lib/rumale/decomposition/factor_analysis.rb +0 -150
  31. data/lib/rumale/decomposition/fast_ica.rb +0 -188
  32. data/lib/rumale/decomposition/nmf.rb +0 -124
  33. data/lib/rumale/decomposition/pca.rb +0 -159
  34. data/lib/rumale/ensemble/ada_boost_classifier.rb +0 -179
  35. data/lib/rumale/ensemble/ada_boost_regressor.rb +0 -160
  36. data/lib/rumale/ensemble/extra_trees_classifier.rb +0 -139
  37. data/lib/rumale/ensemble/extra_trees_regressor.rb +0 -125
  38. data/lib/rumale/ensemble/gradient_boosting_classifier.rb +0 -306
  39. data/lib/rumale/ensemble/gradient_boosting_regressor.rb +0 -237
  40. data/lib/rumale/ensemble/random_forest_classifier.rb +0 -189
  41. data/lib/rumale/ensemble/random_forest_regressor.rb +0 -153
  42. data/lib/rumale/ensemble/stacking_classifier.rb +0 -215
  43. data/lib/rumale/ensemble/stacking_regressor.rb +0 -163
  44. data/lib/rumale/ensemble/voting_classifier.rb +0 -126
  45. data/lib/rumale/ensemble/voting_regressor.rb +0 -82
  46. data/lib/rumale/evaluation_measure/accuracy.rb +0 -29
  47. data/lib/rumale/evaluation_measure/adjusted_rand_score.rb +0 -74
  48. data/lib/rumale/evaluation_measure/calinski_harabasz_score.rb +0 -56
  49. data/lib/rumale/evaluation_measure/davies_bouldin_score.rb +0 -53
  50. data/lib/rumale/evaluation_measure/explained_variance_score.rb +0 -39
  51. data/lib/rumale/evaluation_measure/f_score.rb +0 -50
  52. data/lib/rumale/evaluation_measure/function.rb +0 -147
  53. data/lib/rumale/evaluation_measure/log_loss.rb +0 -45
  54. data/lib/rumale/evaluation_measure/mean_absolute_error.rb +0 -29
  55. data/lib/rumale/evaluation_measure/mean_squared_error.rb +0 -29
  56. data/lib/rumale/evaluation_measure/mean_squared_log_error.rb +0 -29
  57. data/lib/rumale/evaluation_measure/median_absolute_error.rb +0 -30
  58. data/lib/rumale/evaluation_measure/mutual_information.rb +0 -49
  59. data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +0 -53
  60. data/lib/rumale/evaluation_measure/precision.rb +0 -50
  61. data/lib/rumale/evaluation_measure/precision_recall.rb +0 -96
  62. data/lib/rumale/evaluation_measure/purity.rb +0 -40
  63. data/lib/rumale/evaluation_measure/r2_score.rb +0 -43
  64. data/lib/rumale/evaluation_measure/recall.rb +0 -50
  65. data/lib/rumale/evaluation_measure/roc_auc.rb +0 -130
  66. data/lib/rumale/evaluation_measure/silhouette_score.rb +0 -82
  67. data/lib/rumale/feature_extraction/feature_hasher.rb +0 -110
  68. data/lib/rumale/feature_extraction/hash_vectorizer.rb +0 -155
  69. data/lib/rumale/feature_extraction/tfidf_transformer.rb +0 -113
  70. data/lib/rumale/kernel_approximation/nystroem.rb +0 -126
  71. data/lib/rumale/kernel_approximation/rbf.rb +0 -102
  72. data/lib/rumale/kernel_machine/kernel_fda.rb +0 -120
  73. data/lib/rumale/kernel_machine/kernel_pca.rb +0 -97
  74. data/lib/rumale/kernel_machine/kernel_ridge.rb +0 -82
  75. data/lib/rumale/kernel_machine/kernel_ridge_classifier.rb +0 -92
  76. data/lib/rumale/kernel_machine/kernel_svc.rb +0 -193
  77. data/lib/rumale/linear_model/base_sgd.rb +0 -285
  78. data/lib/rumale/linear_model/elastic_net.rb +0 -119
  79. data/lib/rumale/linear_model/lasso.rb +0 -115
  80. data/lib/rumale/linear_model/linear_regression.rb +0 -201
  81. data/lib/rumale/linear_model/logistic_regression.rb +0 -275
  82. data/lib/rumale/linear_model/nnls.rb +0 -137
  83. data/lib/rumale/linear_model/ridge.rb +0 -209
  84. data/lib/rumale/linear_model/svc.rb +0 -213
  85. data/lib/rumale/linear_model/svr.rb +0 -132
  86. data/lib/rumale/manifold/mds.rb +0 -155
  87. data/lib/rumale/manifold/tsne.rb +0 -222
  88. data/lib/rumale/metric_learning/fisher_discriminant_analysis.rb +0 -113
  89. data/lib/rumale/metric_learning/mlkr.rb +0 -161
  90. data/lib/rumale/metric_learning/neighbourhood_component_analysis.rb +0 -167
  91. data/lib/rumale/model_selection/cross_validation.rb +0 -125
  92. data/lib/rumale/model_selection/function.rb +0 -42
  93. data/lib/rumale/model_selection/grid_search_cv.rb +0 -225
  94. data/lib/rumale/model_selection/group_k_fold.rb +0 -93
  95. data/lib/rumale/model_selection/group_shuffle_split.rb +0 -115
  96. data/lib/rumale/model_selection/k_fold.rb +0 -81
  97. data/lib/rumale/model_selection/shuffle_split.rb +0 -90
  98. data/lib/rumale/model_selection/stratified_k_fold.rb +0 -99
  99. data/lib/rumale/model_selection/stratified_shuffle_split.rb +0 -118
  100. data/lib/rumale/model_selection/time_series_split.rb +0 -91
  101. data/lib/rumale/multiclass/one_vs_rest_classifier.rb +0 -83
  102. data/lib/rumale/naive_bayes/base_naive_bayes.rb +0 -47
  103. data/lib/rumale/naive_bayes/bernoulli_nb.rb +0 -82
  104. data/lib/rumale/naive_bayes/complement_nb.rb +0 -85
  105. data/lib/rumale/naive_bayes/gaussian_nb.rb +0 -69
  106. data/lib/rumale/naive_bayes/multinomial_nb.rb +0 -74
  107. data/lib/rumale/naive_bayes/negation_nb.rb +0 -71
  108. data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +0 -133
  109. data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +0 -108
  110. data/lib/rumale/nearest_neighbors/vp_tree.rb +0 -132
  111. data/lib/rumale/neural_network/adam.rb +0 -56
  112. data/lib/rumale/neural_network/base_mlp.rb +0 -248
  113. data/lib/rumale/neural_network/mlp_classifier.rb +0 -120
  114. data/lib/rumale/neural_network/mlp_regressor.rb +0 -90
  115. data/lib/rumale/pairwise_metric.rb +0 -152
  116. data/lib/rumale/pipeline/feature_union.rb +0 -69
  117. data/lib/rumale/pipeline/pipeline.rb +0 -175
  118. data/lib/rumale/preprocessing/bin_discretizer.rb +0 -93
  119. data/lib/rumale/preprocessing/binarizer.rb +0 -60
  120. data/lib/rumale/preprocessing/kernel_calculator.rb +0 -92
  121. data/lib/rumale/preprocessing/l1_normalizer.rb +0 -62
  122. data/lib/rumale/preprocessing/l2_normalizer.rb +0 -63
  123. data/lib/rumale/preprocessing/label_binarizer.rb +0 -89
  124. data/lib/rumale/preprocessing/label_encoder.rb +0 -79
  125. data/lib/rumale/preprocessing/max_abs_scaler.rb +0 -61
  126. data/lib/rumale/preprocessing/max_normalizer.rb +0 -62
  127. data/lib/rumale/preprocessing/min_max_scaler.rb +0 -76
  128. data/lib/rumale/preprocessing/one_hot_encoder.rb +0 -100
  129. data/lib/rumale/preprocessing/ordinal_encoder.rb +0 -109
  130. data/lib/rumale/preprocessing/polynomial_features.rb +0 -109
  131. data/lib/rumale/preprocessing/standard_scaler.rb +0 -71
  132. data/lib/rumale/probabilistic_output.rb +0 -114
  133. data/lib/rumale/tree/base_decision_tree.rb +0 -150
  134. data/lib/rumale/tree/decision_tree_classifier.rb +0 -150
  135. data/lib/rumale/tree/decision_tree_regressor.rb +0 -116
  136. data/lib/rumale/tree/extra_tree_classifier.rb +0 -107
  137. data/lib/rumale/tree/extra_tree_regressor.rb +0 -94
  138. data/lib/rumale/tree/gradient_tree_regressor.rb +0 -202
  139. data/lib/rumale/tree/node.rb +0 -39
  140. data/lib/rumale/utils.rb +0 -42
  141. data/lib/rumale/validation.rb +0 -128
  142. data/lib/rumale/values.rb +0 -13
@@ -1,109 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/base_estimator'
4
- require 'rumale/base/transformer'
5
-
6
- module Rumale
7
- module Preprocessing
8
- # Transform categorical features to integer values.
9
- #
10
- # @example
11
- # encoder = Rumale::Preprocessing::OrdinalEncoder.new
12
- # training_samples = [['left', 10], ['right', 15], ['right', 20]]
13
- # training_samples = Numo::NArray.asarray(training_samples)
14
- # encoder.fit(training_samples)
15
- # p encoder.categories
16
- # # [["left", "right"], [10, 15, 20]]
17
- # testing_samples = [['left', 20], ['right', 10]]
18
- # testing_samples = Numo::NArray.asarray(testing_samples)
19
- # encoded = encoder.transform(testing_samples)
20
- # p encoded
21
- # # Numo::DFloat#shape=[2,2]
22
- # # [[0, 2],
23
- # # [1, 0]]
24
- # p encoder.inverse_transform(encoded)
25
- # # Numo::RObject#shape=[2,2]
26
- # # [["left", 20],
27
- # # ["right", 10]]
28
- class OrdinalEncoder
29
- include Base::BaseEstimator
30
- include Base::Transformer
31
-
32
- # Return the array consisting of the categorical values for each feature.
33
- # @return [Array] (size: n_features)
34
- attr_reader :categories
35
-
36
- # Create a new encoder that transforms categorical features to integer values.
37
- #
38
- # @param categories [Nil/Array] The category list for each feature.
39
- # If nil is given, extracted categories from the training data by calling the fit method are used.
40
- def initialize(categories: nil)
41
- check_params_type_or_nil(Array, categories: categories)
42
- @categories = categories
43
- end
44
-
45
- # Fit encoder by extracting the category for each feature.
46
- #
47
- # @overload fit(x) -> OrdinalEncoder
48
- #
49
- # @param x [Numo::NArray] (shape: [n_samples, n_features]) The samples consisting of categorical features.
50
- # @return [OrdinalEncoder]
51
- def fit(x, _y = nil)
52
- raise TypeError, 'Expect class of sample matrix to be Numo::NArray' unless x.is_a?(Numo::NArray)
53
- raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2
54
-
55
- n_features = x.shape[1]
56
- @categories = Array.new(n_features) { |n| x[true, n].to_a.uniq.sort }
57
- self
58
- end
59
-
60
- # Fit encoder, then return encoded categorical features to integer values.
61
- #
62
- # @overload fit_transform(x) -> Numo::DFloat
63
- #
64
- # @param x [Numo::NArray] (shape: [n_samples, n_features]) The samples consisting of categorical features.
65
- # @return [Numo::DFloat] The encoded categorical features to integer values.
66
- def fit_transform(x, _y = nil)
67
- raise TypeError, 'Expect class of sample matrix to be Numo::NArray' unless x.is_a?(Numo::NArray)
68
- raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2
69
-
70
- fit(x).transform(x)
71
- end
72
-
73
- # Encode categorical features.
74
- #
75
- # @param x [Numo::NArray] (shape: [n_samples, n_features]) The samples consisting of categorical features.
76
- # @return [Numo::DFloat] The encoded categorical features to integer values.
77
- def transform(x)
78
- raise TypeError, 'Expect class of sample matrix to be Numo::NArray' unless x.is_a?(Numo::NArray)
79
- raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2
80
-
81
- n_features = x.shape[1]
82
- raise ArgumentError, 'Expect the number of features and the number of categories to be equal' if n_features != @categories.size
83
-
84
- transformed = Array.new(n_features) do |n|
85
- x[true, n].to_a.map { |v| @categories[n].index(v) }
86
- end
87
-
88
- Numo::DFloat.asarray(transformed.transpose)
89
- end
90
-
91
- # Decode values to categorical features.
92
- #
93
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples consisting of values transformed from categorical features.
94
- # @return [Numo::NArray] The decoded features.
95
- def inverse_transform(x)
96
- x = check_convert_sample_array(x)
97
-
98
- n_features = x.shape[1]
99
- raise ArgumentError, 'Expect the number of features and the number of categories to be equal' if n_features != @categories.size
100
-
101
- inv_transformed = Array.new(n_features) do |n|
102
- x[true, n].to_a.map { |i| @categories[n][i.to_i] }
103
- end
104
-
105
- Numo::NArray.asarray(inv_transformed.transpose)
106
- end
107
- end
108
- end
109
- end
@@ -1,109 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/base_estimator'
4
- require 'rumale/base/transformer'
5
-
6
- module Rumale
7
- module Preprocessing
8
- # Generating polynomial features from the given samples.
9
- #
10
- # @example
11
- # require 'rumale'
12
- #
13
- # transformer = Rumale::Preprocessing::PolynomialFeatures.new(degree: 2)
14
- # x = Numo::DFloat[[0, 1], [2, 3], [4, 5]]
15
- # z = transformer.fit_transform(x)
16
- # p z
17
- #
18
- # # Numo::DFloat#shape=[3,6]
19
- # # [[1, 0, 1, 0, 0, 1],
20
- # # [1, 2, 3, 4, 6, 9],
21
- # # [1, 4, 5, 16, 20, 25]]
22
- #
23
- # # If you want to perform polynomial regression, combine it with LinearRegression as follows:
24
- # ply = Rumale::Preprocessing::PolynomialFeatures.new(degree: 2)
25
- # reg = Rumale::LinearModel::LinearRegression.new(fit_bias: false, random_seed: 1)
26
- # pipeline = Rumale::Pipeline::Pipeline.new(steps: { trs: ply, est: reg })
27
- # pipeline.fit(training_samples, training_values)
28
- # results = pipeline.predict(testing_samples)
29
- #
30
- class PolynomialFeatures
31
- include Base::BaseEstimator
32
- include Base::Transformer
33
-
34
- # Return the number of polynomial features.
35
- # @return [Integer]
36
- attr_reader :n_output_features
37
-
38
- # Create a transformer for generating polynomial features.
39
- #
40
- # @param degree [Integer] The degree of polynomial features.
41
- def initialize(degree: 2)
42
- check_params_numeric(degree: degree)
43
- raise ArgumentError, 'Expect the value of degree parameter greater than or eqaul to 1.' if degree < 1
44
-
45
- @params = {}
46
- @params[:degree] = degree
47
- @n_output_features = nil
48
- end
49
-
50
- # Calculate the number of output polynomial features.
51
- #
52
- # @overload fit(x) -> PolynomialFeatures
53
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the number of output polynomial features.
54
- # @return [PolynomialFeatures]
55
- def fit(x, _y = nil)
56
- x = check_convert_sample_array(x)
57
- n_features = x.shape[1]
58
- @n_output_features = 1
59
- @params[:degree].times do |t|
60
- @n_output_features += Array.new(n_features) { |n| n }.repeated_combination(t + 1).size
61
- end
62
- self
63
- end
64
-
65
- # Calculate the number of polynomial features, and then transform samples to polynomial features.
66
- #
67
- # @overload fit_transform(x) -> Numo::DFloat
68
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the number of polynomial features
69
- # and be transformed.
70
- # @return [Numo::DFloat] (shape: [n_samples, n_output_features]) The transformed samples.
71
- def fit_transform(x, _y = nil)
72
- x = check_convert_sample_array(x)
73
- fit(x).transform(x)
74
- end
75
-
76
- # Transform the given samples to polynomial features.
77
- #
78
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be transformed.
79
- # @return [Numo::DFloat] (shape: [n_samples, n_output_features]) The transformed samples.
80
- def transform(x)
81
- x = check_convert_sample_array(x)
82
- # initialize transformed features
83
- n_samples, n_features = x.shape
84
- z = Numo::DFloat.zeros(n_samples, n_output_features)
85
- # bias
86
- z[true, 0] = 1
87
- curr_col = 1
88
- # itself
89
- z[true, 1..n_features] = x
90
- curr_col += n_features
91
- # high degree features
92
- curr_feat_ids = Array.new(n_features + 1) { |n| n + 1 }
93
- (1...@params[:degree]).each do
94
- next_feat_ids = []
95
- n_features.times do |d|
96
- f_range = curr_feat_ids[d]...curr_feat_ids.last
97
- next_col = curr_col + f_range.size
98
- z[true, curr_col...next_col] = z[true, f_range] * x[true, d..d]
99
- next_feat_ids.push(curr_col)
100
- curr_col = next_col
101
- end
102
- next_feat_ids.push(curr_col)
103
- curr_feat_ids = next_feat_ids
104
- end
105
- z
106
- end
107
- end
108
- end
109
- end
@@ -1,71 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/base_estimator'
4
- require 'rumale/base/transformer'
5
-
6
- module Rumale
7
- # This module consists of the classes that perform preprocessing.
8
- module Preprocessing
9
- # Normalize samples by centering and scaling to unit variance.
10
- #
11
- # @example
12
- # normalizer = Rumale::Preprocessing::StandardScaler.new
13
- # new_training_samples = normalizer.fit_transform(training_samples)
14
- # new_testing_samples = normalizer.transform(testing_samples)
15
- class StandardScaler
16
- include Base::BaseEstimator
17
- include Base::Transformer
18
-
19
- # Return the vector consisting of the mean value for each feature.
20
- # @return [Numo::DFloat] (shape: [n_features])
21
- attr_reader :mean_vec
22
-
23
- # Return the vector consisting of the standard deviation for each feature.
24
- # @return [Numo::DFloat] (shape: [n_features])
25
- attr_reader :std_vec
26
-
27
- # Create a new normalizer for centering and scaling to unit variance.
28
- def initialize
29
- @params = {}
30
- @mean_vec = nil
31
- @std_vec = nil
32
- end
33
-
34
- # Calculate the mean value and standard deviation of each feature for scaling.
35
- #
36
- # @overload fit(x) -> StandardScaler
37
- #
38
- # @param x [Numo::DFloat] (shape: [n_samples, n_features])
39
- # The samples to calculate the mean values and standard deviations.
40
- # @return [StandardScaler]
41
- def fit(x, _y = nil)
42
- x = check_convert_sample_array(x)
43
- @mean_vec = x.mean(0)
44
- @std_vec = x.stddev(0)
45
- self
46
- end
47
-
48
- # Calculate the mean values and standard deviations, and then normalize samples using them.
49
- #
50
- # @overload fit_transform(x) -> Numo::DFloat
51
- #
52
- # @param x [Numo::DFloat] (shape: [n_samples, n_features])
53
- # The samples to calculate the mean values and standard deviations.
54
- # @return [Numo::DFloat] The scaled samples.
55
- def fit_transform(x, _y = nil)
56
- x = check_convert_sample_array(x)
57
- fit(x).transform(x)
58
- end
59
-
60
- # Perform standardization of the given samples.
61
- #
62
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be scaled.
63
- # @return [Numo::DFloat] The scaled samples.
64
- def transform(x)
65
- x = check_convert_sample_array(x)
66
- n_samples, = x.shape
67
- (x - @mean_vec.tile(n_samples, 1)) / @std_vec.tile(n_samples, 1)
68
- end
69
- end
70
- end
71
- end
@@ -1,114 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Rumale
4
- # Module for calculating posterior class probabilities with SVM outputs.
5
- # This module is used for internal processes.
6
- #
7
- # @example
8
- # estimator = Rumale::LinearModel::SVC.new
9
- # estimator.fit(x, bin_y)
10
- # df = estimator.decision_function(x)
11
- # params = Rumale::ProbabilisticOutput.fit_sigmoid(df, bin_y)
12
- # probs = 1 / (Numo::NMath.exp(params[0] * df + params[1]) + 1)
13
- #
14
- # *Reference*
15
- # - Platt, J C., "Probabilistic Outputs for Support Vector Machines and Comparisons to Regularized Likelihood Methods," Adv. Large Margin Classifiers, pp. 61--74, 2000.
16
- # - Lin, H-T., Lin, C-J., and Weng, R C., "A Note on Platt's Probabilistic Outputs for Support Vector Machines," J. Machine Learning, Vol. 63 (3), pp. 267--276, 2007.
17
- module ProbabilisticOutput
18
- class << self
19
- # Fit the probabilistic model for binary SVM outputs.
20
- #
21
- # @param df [Numo::DFloat] (shape: [n_samples]) The outputs of decision function to be used for fitting the model.
22
- # @param bin_y [Numo::Int32] (shape: [n_samples]) The binary labels to be used for fitting the model.
23
- # @param max_iter [Integer] The maximum number of iterations.
24
- # @param min_step [Float] The minimum step of Newton's method.
25
- # @param sigma [Float] The parameter to prevent the Hessian matrix from becoming singular.
26
- # @return [Numo::DFloat] (shape: 2) The parameters of the model.
27
- def fit_sigmoid(df, bin_y, max_iter = 100, min_step = 1e-10, sigma = 1e-12)
28
- # Initialize some variables.
29
- n_samples = bin_y.size
30
- negative_label = bin_y.to_a.uniq.min
31
- pos = bin_y.ne(negative_label)
32
- neg = bin_y.eq(negative_label)
33
- n_pos_samples = pos.count
34
- n_neg_samples = neg.count
35
- target_probs = Numo::DFloat.zeros(n_samples)
36
- target_probs[pos] = (n_pos_samples + 1) / (n_pos_samples + 2.0)
37
- target_probs[neg] = 1 / (n_neg_samples + 2.0)
38
- alpha = 0.0
39
- beta = Math.log((n_neg_samples + 1) / (n_pos_samples + 1.0))
40
- err = error_function(target_probs, df, alpha, beta)
41
- # Optimize parameters for class probability calculation.
42
- old_grad_vec = Numo::DFloat.zeros(2)
43
- max_iter.times do
44
- # Calculate gradient and hessian matrix.
45
- probs = predicted_probs(df, alpha, beta)
46
- grad_vec = gradient(target_probs, probs, df)
47
- hess_mat = hessian_matrix(probs, df, sigma)
48
- break if grad_vec.abs.lt(1e-5).count == 2
49
- break if (old_grad_vec - grad_vec).abs.sum < 1e-5
50
-
51
- old_grad_vec = grad_vec
52
- # Calculate Newton directions.
53
- dirs_vec = directions(grad_vec, hess_mat)
54
- grad_dir = grad_vec.dot(dirs_vec)
55
- stepsize = 2.0
56
- while stepsize >= min_step
57
- stepsize *= 0.5
58
- new_alpha = alpha + stepsize * dirs_vec[0]
59
- new_beta = beta + stepsize * dirs_vec[1]
60
- new_err = error_function(target_probs, df, new_alpha, new_beta)
61
- next unless new_err < err + 0.0001 * stepsize * grad_dir
62
-
63
- alpha = new_alpha
64
- beta = new_beta
65
- err = new_err
66
- break
67
- end
68
- end
69
- Numo::DFloat[alpha, beta]
70
- end
71
-
72
- private
73
-
74
- def error_function(target_probs, df, alpha, beta)
75
- fn = alpha * df + beta
76
- pos = fn.ge(0.0)
77
- neg = fn.lt(0.0)
78
- err = 0.0
79
- err += (target_probs[pos] * fn[pos] + Numo::NMath.log(1 + Numo::NMath.exp(-fn[pos]))).sum if pos.count.positive?
80
- err += ((target_probs[neg] - 1) * fn[neg] + Numo::NMath.log(1 + Numo::NMath.exp(fn[neg]))).sum if neg.count.positive?
81
- err
82
- end
83
-
84
- def predicted_probs(df, alpha, beta)
85
- fn = alpha * df + beta
86
- pos = fn.ge(0.0)
87
- neg = fn.lt(0.0)
88
- probs = Numo::DFloat.zeros(df.shape[0])
89
- probs[pos] = Numo::NMath.exp(-fn[pos]) / (1 + Numo::NMath.exp(-fn[pos])) if pos.count.positive?
90
- probs[neg] = 1 / (1 + Numo::NMath.exp(fn[neg])) if neg.count.positive?
91
- probs
92
- end
93
-
94
- def gradient(target_probs, probs, df)
95
- sub = target_probs - probs
96
- Numo::DFloat[(df * sub).sum, sub.sum]
97
- end
98
-
99
- def hessian_matrix(probs, df, sigma)
100
- sub = probs * (1 - probs)
101
- h11 = (df**2 * sub).sum + sigma
102
- h22 = sub.sum + sigma
103
- h21 = (df * sub).sum
104
- Numo::DFloat[[h11, h21], [h21, h22]]
105
- end
106
-
107
- def directions(grad_vec, hess_mat)
108
- det = hess_mat[0, 0] * hess_mat[1, 1] - hess_mat[0, 1] * hess_mat[1, 0]
109
- inv_hess_mat = Numo::DFloat[[hess_mat[1, 1], -hess_mat[0, 1]], [-hess_mat[1, 0], hess_mat[0, 0]]] / det
110
- -inv_hess_mat.dot(grad_vec)
111
- end
112
- end
113
- end
114
- end
@@ -1,150 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/base_estimator'
4
- require 'rumale/tree/node'
5
- require 'rumale/rumaleext'
6
-
7
- module Rumale
8
- # This module consists of the classes that implement tree models.
9
- module Tree
10
- # BaseDecisionTree is an abstract class for implementation of decision tree-based estimator.
11
- # This class is used internally.
12
- class BaseDecisionTree
13
- include Base::BaseEstimator
14
-
15
- # Initialize a decision tree-based estimator.
16
- #
17
- # @param criterion [String] The function to evaluate splitting point.
18
- # @param max_depth [Integer] The maximum depth of the tree.
19
- # If nil is given, decision tree grows without concern for depth.
20
- # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
21
- # If nil is given, number of leaves is not limited.
22
- # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
23
- # @param max_features [Integer] The number of features to consider when searching optimal split point.
24
- # If nil is given, split process considers all features.
25
- # @param random_seed [Integer] The seed value used to initialize the random generator.
26
- # It is used to randomly determine the order of features when deciding splitting point.
27
- def initialize(criterion: nil, max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil, random_seed: nil)
28
- @params = {}
29
- @params[:criterion] = criterion
30
- @params[:max_depth] = max_depth
31
- @params[:max_leaf_nodes] = max_leaf_nodes
32
- @params[:min_samples_leaf] = min_samples_leaf
33
- @params[:max_features] = max_features
34
- @params[:random_seed] = random_seed
35
- @params[:random_seed] ||= srand
36
- @tree = nil
37
- @feature_importances = nil
38
- @n_leaves = nil
39
- @rng = Random.new(@params[:random_seed])
40
- end
41
-
42
- # Return the index of the leaf that each sample reached.
43
- #
44
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
45
- # @return [Numo::Int32] (shape: [n_samples]) Leaf index for sample.
46
- def apply(x)
47
- x = check_convert_sample_array(x)
48
- Numo::Int32[*(Array.new(x.shape[0]) { |n| partial_apply(@tree, x[n, true]) })]
49
- end
50
-
51
- private
52
-
53
- def partial_apply(tree, sample)
54
- node = tree
55
- until node.leaf
56
- # :nocov:
57
- node = if node.right.nil?
58
- node.left
59
- elsif node.left.nil?
60
- node.right
61
- # :nocov:
62
- else
63
- sample[node.feature_id] <= node.threshold ? node.left : node.right
64
- end
65
- end
66
- node.leaf_id
67
- end
68
-
69
- def build_tree(x, y)
70
- y = y.expand_dims(1).dup if y.shape[1].nil?
71
- @feature_ids = Array.new(x.shape[1]) { |v| v }
72
- @tree = grow_node(0, x, y, impurity(y))
73
- @feature_ids = nil
74
- nil
75
- end
76
-
77
- def grow_node(depth, x, y, impurity)
78
- # initialize node.
79
- n_samples = x.shape[0]
80
- node = Node.new(depth: depth, impurity: impurity, n_samples: n_samples)
81
-
82
- # terminate growing.
83
- return nil if !@params[:max_leaf_nodes].nil? && @n_leaves >= @params[:max_leaf_nodes]
84
- return nil if n_samples < @params[:min_samples_leaf]
85
- return put_leaf(node, y) if n_samples == @params[:min_samples_leaf]
86
- return put_leaf(node, y) if !@params[:max_depth].nil? && depth == @params[:max_depth]
87
- return put_leaf(node, y) if stop_growing?(y)
88
-
89
- # calculate optimal parameters.
90
- feature_id, left_imp, right_imp, threshold, gain =
91
- rand_ids.map { |n| [n, *best_split(x[true, n], y, impurity)] }.max_by(&:last)
92
-
93
- return put_leaf(node, y) if gain.nil? || gain.zero?
94
-
95
- left_ids = x[true, feature_id].le(threshold).where
96
- right_ids = x[true, feature_id].gt(threshold).where
97
- node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids, true], left_imp)
98
- node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids, true], right_imp)
99
-
100
- return put_leaf(node, y) if node.left.nil? && node.right.nil?
101
-
102
- node.feature_id = feature_id
103
- node.threshold = threshold
104
- node.leaf = false
105
- node
106
- end
107
-
108
- def stop_growing?(_y)
109
- raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
110
- end
111
-
112
- def put_leaf(_node, _y)
113
- raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
114
- end
115
-
116
- def rand_ids
117
- @feature_ids.sample(@params[:max_features], random: @sub_rng)
118
- end
119
-
120
- def best_split(_features, _y, _impurity)
121
- raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
122
- end
123
-
124
- def impurity(_y)
125
- raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
126
- end
127
-
128
- def eval_importance(n_samples, n_features)
129
- @feature_importances = Numo::DFloat.zeros(n_features)
130
- eval_importance_at_node(@tree)
131
- @feature_importances /= n_samples
132
- normalizer = @feature_importances.sum
133
- @feature_importances /= normalizer if normalizer > 0.0
134
- nil
135
- end
136
-
137
- def eval_importance_at_node(node)
138
- return nil if node.leaf
139
- return nil if node.left.nil? || node.right.nil?
140
-
141
- gain = node.n_samples * node.impurity -
142
- node.left.n_samples * node.left.impurity -
143
- node.right.n_samples * node.right.impurity
144
- @feature_importances[node.feature_id] += gain
145
- eval_importance_at_node(node.left)
146
- eval_importance_at_node(node.right)
147
- end
148
- end
149
- end
150
- end