rumale 0.23.3 → 0.24.0

Files changed (142)
  1. checksums.yaml +4 -4
  2. data/LICENSE.txt +5 -1
  3. data/README.md +3 -288
  4. data/lib/rumale/version.rb +1 -1
  5. data/lib/rumale.rb +20 -131
  6. metadata +252 -150
  7. data/CHANGELOG.md +0 -643
  8. data/CODE_OF_CONDUCT.md +0 -74
  9. data/ext/rumale/extconf.rb +0 -37
  10. data/ext/rumale/rumaleext.c +0 -545
  11. data/ext/rumale/rumaleext.h +0 -12
  12. data/lib/rumale/base/base_estimator.rb +0 -49
  13. data/lib/rumale/base/classifier.rb +0 -36
  14. data/lib/rumale/base/cluster_analyzer.rb +0 -31
  15. data/lib/rumale/base/evaluator.rb +0 -17
  16. data/lib/rumale/base/regressor.rb +0 -36
  17. data/lib/rumale/base/splitter.rb +0 -21
  18. data/lib/rumale/base/transformer.rb +0 -22
  19. data/lib/rumale/clustering/dbscan.rb +0 -123
  20. data/lib/rumale/clustering/gaussian_mixture.rb +0 -218
  21. data/lib/rumale/clustering/hdbscan.rb +0 -291
  22. data/lib/rumale/clustering/k_means.rb +0 -122
  23. data/lib/rumale/clustering/k_medoids.rb +0 -141
  24. data/lib/rumale/clustering/mini_batch_k_means.rb +0 -139
  25. data/lib/rumale/clustering/power_iteration.rb +0 -127
  26. data/lib/rumale/clustering/single_linkage.rb +0 -203
  27. data/lib/rumale/clustering/snn.rb +0 -76
  28. data/lib/rumale/clustering/spectral_clustering.rb +0 -115
  29. data/lib/rumale/dataset.rb +0 -246
  30. data/lib/rumale/decomposition/factor_analysis.rb +0 -150
  31. data/lib/rumale/decomposition/fast_ica.rb +0 -188
  32. data/lib/rumale/decomposition/nmf.rb +0 -124
  33. data/lib/rumale/decomposition/pca.rb +0 -159
  34. data/lib/rumale/ensemble/ada_boost_classifier.rb +0 -179
  35. data/lib/rumale/ensemble/ada_boost_regressor.rb +0 -160
  36. data/lib/rumale/ensemble/extra_trees_classifier.rb +0 -139
  37. data/lib/rumale/ensemble/extra_trees_regressor.rb +0 -125
  38. data/lib/rumale/ensemble/gradient_boosting_classifier.rb +0 -306
  39. data/lib/rumale/ensemble/gradient_boosting_regressor.rb +0 -237
  40. data/lib/rumale/ensemble/random_forest_classifier.rb +0 -189
  41. data/lib/rumale/ensemble/random_forest_regressor.rb +0 -153
  42. data/lib/rumale/ensemble/stacking_classifier.rb +0 -215
  43. data/lib/rumale/ensemble/stacking_regressor.rb +0 -163
  44. data/lib/rumale/ensemble/voting_classifier.rb +0 -126
  45. data/lib/rumale/ensemble/voting_regressor.rb +0 -82
  46. data/lib/rumale/evaluation_measure/accuracy.rb +0 -29
  47. data/lib/rumale/evaluation_measure/adjusted_rand_score.rb +0 -74
  48. data/lib/rumale/evaluation_measure/calinski_harabasz_score.rb +0 -56
  49. data/lib/rumale/evaluation_measure/davies_bouldin_score.rb +0 -53
  50. data/lib/rumale/evaluation_measure/explained_variance_score.rb +0 -39
  51. data/lib/rumale/evaluation_measure/f_score.rb +0 -50
  52. data/lib/rumale/evaluation_measure/function.rb +0 -147
  53. data/lib/rumale/evaluation_measure/log_loss.rb +0 -45
  54. data/lib/rumale/evaluation_measure/mean_absolute_error.rb +0 -29
  55. data/lib/rumale/evaluation_measure/mean_squared_error.rb +0 -29
  56. data/lib/rumale/evaluation_measure/mean_squared_log_error.rb +0 -29
  57. data/lib/rumale/evaluation_measure/median_absolute_error.rb +0 -30
  58. data/lib/rumale/evaluation_measure/mutual_information.rb +0 -49
  59. data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +0 -53
  60. data/lib/rumale/evaluation_measure/precision.rb +0 -50
  61. data/lib/rumale/evaluation_measure/precision_recall.rb +0 -96
  62. data/lib/rumale/evaluation_measure/purity.rb +0 -40
  63. data/lib/rumale/evaluation_measure/r2_score.rb +0 -43
  64. data/lib/rumale/evaluation_measure/recall.rb +0 -50
  65. data/lib/rumale/evaluation_measure/roc_auc.rb +0 -130
  66. data/lib/rumale/evaluation_measure/silhouette_score.rb +0 -82
  67. data/lib/rumale/feature_extraction/feature_hasher.rb +0 -110
  68. data/lib/rumale/feature_extraction/hash_vectorizer.rb +0 -155
  69. data/lib/rumale/feature_extraction/tfidf_transformer.rb +0 -113
  70. data/lib/rumale/kernel_approximation/nystroem.rb +0 -126
  71. data/lib/rumale/kernel_approximation/rbf.rb +0 -102
  72. data/lib/rumale/kernel_machine/kernel_fda.rb +0 -120
  73. data/lib/rumale/kernel_machine/kernel_pca.rb +0 -97
  74. data/lib/rumale/kernel_machine/kernel_ridge.rb +0 -82
  75. data/lib/rumale/kernel_machine/kernel_ridge_classifier.rb +0 -92
  76. data/lib/rumale/kernel_machine/kernel_svc.rb +0 -193
  77. data/lib/rumale/linear_model/base_sgd.rb +0 -285
  78. data/lib/rumale/linear_model/elastic_net.rb +0 -119
  79. data/lib/rumale/linear_model/lasso.rb +0 -115
  80. data/lib/rumale/linear_model/linear_regression.rb +0 -201
  81. data/lib/rumale/linear_model/logistic_regression.rb +0 -275
  82. data/lib/rumale/linear_model/nnls.rb +0 -137
  83. data/lib/rumale/linear_model/ridge.rb +0 -209
  84. data/lib/rumale/linear_model/svc.rb +0 -213
  85. data/lib/rumale/linear_model/svr.rb +0 -132
  86. data/lib/rumale/manifold/mds.rb +0 -155
  87. data/lib/rumale/manifold/tsne.rb +0 -222
  88. data/lib/rumale/metric_learning/fisher_discriminant_analysis.rb +0 -113
  89. data/lib/rumale/metric_learning/mlkr.rb +0 -161
  90. data/lib/rumale/metric_learning/neighbourhood_component_analysis.rb +0 -167
  91. data/lib/rumale/model_selection/cross_validation.rb +0 -125
  92. data/lib/rumale/model_selection/function.rb +0 -42
  93. data/lib/rumale/model_selection/grid_search_cv.rb +0 -225
  94. data/lib/rumale/model_selection/group_k_fold.rb +0 -93
  95. data/lib/rumale/model_selection/group_shuffle_split.rb +0 -115
  96. data/lib/rumale/model_selection/k_fold.rb +0 -81
  97. data/lib/rumale/model_selection/shuffle_split.rb +0 -90
  98. data/lib/rumale/model_selection/stratified_k_fold.rb +0 -99
  99. data/lib/rumale/model_selection/stratified_shuffle_split.rb +0 -118
  100. data/lib/rumale/model_selection/time_series_split.rb +0 -91
  101. data/lib/rumale/multiclass/one_vs_rest_classifier.rb +0 -83
  102. data/lib/rumale/naive_bayes/base_naive_bayes.rb +0 -47
  103. data/lib/rumale/naive_bayes/bernoulli_nb.rb +0 -82
  104. data/lib/rumale/naive_bayes/complement_nb.rb +0 -85
  105. data/lib/rumale/naive_bayes/gaussian_nb.rb +0 -69
  106. data/lib/rumale/naive_bayes/multinomial_nb.rb +0 -74
  107. data/lib/rumale/naive_bayes/negation_nb.rb +0 -71
  108. data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +0 -133
  109. data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +0 -108
  110. data/lib/rumale/nearest_neighbors/vp_tree.rb +0 -132
  111. data/lib/rumale/neural_network/adam.rb +0 -56
  112. data/lib/rumale/neural_network/base_mlp.rb +0 -248
  113. data/lib/rumale/neural_network/mlp_classifier.rb +0 -120
  114. data/lib/rumale/neural_network/mlp_regressor.rb +0 -90
  115. data/lib/rumale/pairwise_metric.rb +0 -152
  116. data/lib/rumale/pipeline/feature_union.rb +0 -69
  117. data/lib/rumale/pipeline/pipeline.rb +0 -175
  118. data/lib/rumale/preprocessing/bin_discretizer.rb +0 -93
  119. data/lib/rumale/preprocessing/binarizer.rb +0 -60
  120. data/lib/rumale/preprocessing/kernel_calculator.rb +0 -92
  121. data/lib/rumale/preprocessing/l1_normalizer.rb +0 -62
  122. data/lib/rumale/preprocessing/l2_normalizer.rb +0 -63
  123. data/lib/rumale/preprocessing/label_binarizer.rb +0 -89
  124. data/lib/rumale/preprocessing/label_encoder.rb +0 -79
  125. data/lib/rumale/preprocessing/max_abs_scaler.rb +0 -61
  126. data/lib/rumale/preprocessing/max_normalizer.rb +0 -62
  127. data/lib/rumale/preprocessing/min_max_scaler.rb +0 -76
  128. data/lib/rumale/preprocessing/one_hot_encoder.rb +0 -100
  129. data/lib/rumale/preprocessing/ordinal_encoder.rb +0 -109
  130. data/lib/rumale/preprocessing/polynomial_features.rb +0 -109
  131. data/lib/rumale/preprocessing/standard_scaler.rb +0 -71
  132. data/lib/rumale/probabilistic_output.rb +0 -114
  133. data/lib/rumale/tree/base_decision_tree.rb +0 -150
  134. data/lib/rumale/tree/decision_tree_classifier.rb +0 -150
  135. data/lib/rumale/tree/decision_tree_regressor.rb +0 -116
  136. data/lib/rumale/tree/extra_tree_classifier.rb +0 -107
  137. data/lib/rumale/tree/extra_tree_regressor.rb +0 -94
  138. data/lib/rumale/tree/gradient_tree_regressor.rb +0 -202
  139. data/lib/rumale/tree/node.rb +0 -39
  140. data/lib/rumale/utils.rb +0 -42
  141. data/lib/rumale/validation.rb +0 -128
  142. data/lib/rumale/values.rb +0 -13
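The pattern above — nearly every implementation file deleted, lib/rumale.rb shrinking from 131 lines to 20, and metadata gaining roughly a hundred lines of new dependencies — corresponds to the 0.24.0 restructuring, in which Rumale was split into per-module gems (rumale-core, rumale-preprocessing, rumale-tree, and so on) and the top-level rumale gem became a thin wrapper that depends on and requires them. The deletions in the hunks below are therefore relocations, not removed functionality.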
data/lib/rumale/preprocessing/ordinal_encoder.rb
@@ -1,109 +0,0 @@
- # frozen_string_literal: true
-
- require 'rumale/base/base_estimator'
- require 'rumale/base/transformer'
-
- module Rumale
-   module Preprocessing
-     # Transform categorical features to integer values.
-     #
-     # @example
-     #   encoder = Rumale::Preprocessing::OrdinalEncoder.new
-     #   training_samples = [['left', 10], ['right', 15], ['right', 20]]
-     #   training_samples = Numo::NArray.asarray(training_samples)
-     #   encoder.fit(training_samples)
-     #   p encoder.categories
-     #   # [["left", "right"], [10, 15, 20]]
-     #   testing_samples = [['left', 20], ['right', 10]]
-     #   testing_samples = Numo::NArray.asarray(testing_samples)
-     #   encoded = encoder.transform(testing_samples)
-     #   p encoded
-     #   # Numo::DFloat#shape=[2,2]
-     #   # [[0, 2],
-     #   #  [1, 0]]
-     #   p encoder.inverse_transform(encoded)
-     #   # Numo::RObject#shape=[2,2]
-     #   # [["left", 20],
-     #   #  ["right", 10]]
-     class OrdinalEncoder
-       include Base::BaseEstimator
-       include Base::Transformer
-
-       # Return the array of category values for each feature.
-       # @return [Array] (size: n_features)
-       attr_reader :categories
-
-       # Create a new encoder that transforms categorical features to integer values.
-       #
-       # @param categories [Nil/Array] The category list for each feature.
-       #   If nil is given, the categories extracted from the training data by calling the fit method are used.
-       def initialize(categories: nil)
-         check_params_type_or_nil(Array, categories: categories)
-         @categories = categories
-       end
-
-       # Fit the encoder by extracting the categories of each feature.
-       #
-       # @overload fit(x) -> OrdinalEncoder
-       #
-       # @param x [Numo::NArray] (shape: [n_samples, n_features]) The samples consisting of categorical features.
-       # @return [OrdinalEncoder]
-       def fit(x, _y = nil)
-         raise TypeError, 'Expect class of sample matrix to be Numo::NArray' unless x.is_a?(Numo::NArray)
-         raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2
-
-         n_features = x.shape[1]
-         @categories = Array.new(n_features) { |n| x[true, n].to_a.uniq.sort }
-         self
-       end
-
-       # Fit the encoder, then encode the categorical features as integer values.
-       #
-       # @overload fit_transform(x) -> Numo::DFloat
-       #
-       # @param x [Numo::NArray] (shape: [n_samples, n_features]) The samples consisting of categorical features.
-       # @return [Numo::DFloat] The categorical features encoded as integer values.
-       def fit_transform(x, _y = nil)
-         raise TypeError, 'Expect class of sample matrix to be Numo::NArray' unless x.is_a?(Numo::NArray)
-         raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2
-
-         fit(x).transform(x)
-       end
-
-       # Encode categorical features.
-       #
-       # @param x [Numo::NArray] (shape: [n_samples, n_features]) The samples consisting of categorical features.
-       # @return [Numo::DFloat] The categorical features encoded as integer values.
-       def transform(x)
-         raise TypeError, 'Expect class of sample matrix to be Numo::NArray' unless x.is_a?(Numo::NArray)
-         raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2
-
-         n_features = x.shape[1]
-         raise ArgumentError, 'Expect the number of features and the number of categories to be equal' if n_features != @categories.size
-
-         transformed = Array.new(n_features) do |n|
-           x[true, n].to_a.map { |v| @categories[n].index(v) }
-         end
-
-         Numo::DFloat.asarray(transformed.transpose)
-       end
-
-       # Decode values to categorical features.
-       #
-       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples consisting of values transformed from categorical features.
-       # @return [Numo::NArray] The decoded features.
-       def inverse_transform(x)
-         x = check_convert_sample_array(x)
-
-         n_features = x.shape[1]
-         raise ArgumentError, 'Expect the number of features and the number of categories to be equal' if n_features != @categories.size
-
-         inv_transformed = Array.new(n_features) do |n|
-           x[true, n].to_a.map { |i| @categories[n][i.to_i] }
-         end
-
-         Numo::NArray.asarray(inv_transformed.transpose)
-       end
-     end
-   end
- end
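For reference, a minimal usage sketch of the deleted class. It assumes the class moved unchanged into the rumale-preprocessing gem introduced by this release, whose entry point is the require path below:

    require 'rumale/preprocessing'

    encoder = Rumale::Preprocessing::OrdinalEncoder.new
    x = Numo::NArray.asarray([['left', 10], ['right', 15], ['right', 20]])
    encoder.fit(x)
    p encoder.categories
    # => [["left", "right"], [10, 15, 20]]
    # Each feature is encoded by its index in the sorted category list:
    p encoder.transform(Numo::NArray.asarray([['left', 20], ['right', 10]])).to_a
    # => [[0.0, 2.0], [1.0, 0.0]]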
data/lib/rumale/preprocessing/polynomial_features.rb
@@ -1,109 +0,0 @@
- # frozen_string_literal: true
-
- require 'rumale/base/base_estimator'
- require 'rumale/base/transformer'
-
- module Rumale
-   module Preprocessing
-     # Generating polynomial features from the given samples.
-     #
-     # @example
-     #   require 'rumale'
-     #
-     #   transformer = Rumale::Preprocessing::PolynomialFeatures.new(degree: 2)
-     #   x = Numo::DFloat[[0, 1], [2, 3], [4, 5]]
-     #   z = transformer.fit_transform(x)
-     #   p z
-     #
-     #   # Numo::DFloat#shape=[3,6]
-     #   # [[1, 0, 1, 0, 0, 1],
-     #   #  [1, 2, 3, 4, 6, 9],
-     #   #  [1, 4, 5, 16, 20, 25]]
-     #
-     #   # If you want to perform polynomial regression, combine it with LinearRegression as follows:
-     #   ply = Rumale::Preprocessing::PolynomialFeatures.new(degree: 2)
-     #   reg = Rumale::LinearModel::LinearRegression.new(fit_bias: false, random_seed: 1)
-     #   pipeline = Rumale::Pipeline::Pipeline.new(steps: { trs: ply, est: reg })
-     #   pipeline.fit(training_samples, training_values)
-     #   results = pipeline.predict(testing_samples)
-     #
-     class PolynomialFeatures
-       include Base::BaseEstimator
-       include Base::Transformer
-
-       # Return the number of polynomial features.
-       # @return [Integer]
-       attr_reader :n_output_features
-
-       # Create a transformer for generating polynomial features.
-       #
-       # @param degree [Integer] The degree of polynomial features.
-       def initialize(degree: 2)
-         check_params_numeric(degree: degree)
-         raise ArgumentError, 'Expect the value of degree parameter greater than or equal to 1.' if degree < 1
-
-         @params = {}
-         @params[:degree] = degree
-         @n_output_features = nil
-       end
-
-       # Calculate the number of output polynomial features.
-       #
-       # @overload fit(x) -> PolynomialFeatures
-       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the number of output polynomial features.
-       # @return [PolynomialFeatures]
-       def fit(x, _y = nil)
-         x = check_convert_sample_array(x)
-         n_features = x.shape[1]
-         @n_output_features = 1
-         @params[:degree].times do |t|
-           @n_output_features += Array.new(n_features) { |n| n }.repeated_combination(t + 1).size
-         end
-         self
-       end
-
-       # Calculate the number of polynomial features, and then transform samples to polynomial features.
-       #
-       # @overload fit_transform(x) -> Numo::DFloat
-       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the number of polynomial features
-       #   and be transformed.
-       # @return [Numo::DFloat] (shape: [n_samples, n_output_features]) The transformed samples.
-       def fit_transform(x, _y = nil)
-         x = check_convert_sample_array(x)
-         fit(x).transform(x)
-       end
-
-       # Transform the given samples to polynomial features.
-       #
-       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be transformed.
-       # @return [Numo::DFloat] (shape: [n_samples, n_output_features]) The transformed samples.
-       def transform(x)
-         x = check_convert_sample_array(x)
-         # initialize transformed features
-         n_samples, n_features = x.shape
-         z = Numo::DFloat.zeros(n_samples, n_output_features)
-         # bias
-         z[true, 0] = 1
-         curr_col = 1
-         # itself
-         z[true, 1..n_features] = x
-         curr_col += n_features
-         # high degree features
-         curr_feat_ids = Array.new(n_features + 1) { |n| n + 1 }
-         (1...@params[:degree]).each do
-           next_feat_ids = []
-           n_features.times do |d|
-             f_range = curr_feat_ids[d]...curr_feat_ids.last
-             next_col = curr_col + f_range.size
-             z[true, curr_col...next_col] = z[true, f_range] * x[true, d..d]
-             next_feat_ids.push(curr_col)
-             curr_col = next_col
-           end
-           next_feat_ids.push(curr_col)
-           curr_feat_ids = next_feat_ids
-         end
-         z
-       end
-     end
-   end
- end
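The count that fit computes above is one bias column plus, for each degree t from 1 up to the degree parameter, the number of size-t multisets of feature indices. A dependency-free sketch (the helper name is hypothetical) that reproduces it:

    # Count polynomial output features: 1 bias column plus the repeated
    # combinations of feature indices at each degree.
    def n_output_features(n_features, degree)
      1 + (1..degree).sum { |t| (0...n_features).to_a.repeated_combination(t).size }
    end

    p n_output_features(2, 2) # => 6, matching the [3,6] shape in the example above
    p n_output_features(3, 2) # => 10 (1 bias + 3 linear + 6 quadratic terms)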
data/lib/rumale/preprocessing/standard_scaler.rb
@@ -1,71 +0,0 @@
- # frozen_string_literal: true
-
- require 'rumale/base/base_estimator'
- require 'rumale/base/transformer'
-
- module Rumale
-   # This module consists of the classes that perform preprocessing.
-   module Preprocessing
-     # Normalize samples by centering and scaling to unit variance.
-     #
-     # @example
-     #   normalizer = Rumale::Preprocessing::StandardScaler.new
-     #   new_training_samples = normalizer.fit_transform(training_samples)
-     #   new_testing_samples = normalizer.transform(testing_samples)
-     class StandardScaler
-       include Base::BaseEstimator
-       include Base::Transformer
-
-       # Return the vector consisting of the mean value of each feature.
-       # @return [Numo::DFloat] (shape: [n_features])
-       attr_reader :mean_vec
-
-       # Return the vector consisting of the standard deviation of each feature.
-       # @return [Numo::DFloat] (shape: [n_features])
-       attr_reader :std_vec
-
-       # Create a new normalizer for centering and scaling to unit variance.
-       def initialize
-         @params = {}
-         @mean_vec = nil
-         @std_vec = nil
-       end
-
-       # Calculate the mean value and standard deviation of each feature for scaling.
-       #
-       # @overload fit(x) -> StandardScaler
-       #
-       # @param x [Numo::DFloat] (shape: [n_samples, n_features])
-       #   The samples to calculate the mean values and standard deviations.
-       # @return [StandardScaler]
-       def fit(x, _y = nil)
-         x = check_convert_sample_array(x)
-         @mean_vec = x.mean(0)
-         @std_vec = x.stddev(0)
-         self
-       end
-
-       # Calculate the mean values and standard deviations, and then normalize samples using them.
-       #
-       # @overload fit_transform(x) -> Numo::DFloat
-       #
-       # @param x [Numo::DFloat] (shape: [n_samples, n_features])
-       #   The samples to calculate the mean values and standard deviations.
-       # @return [Numo::DFloat] The scaled samples.
-       def fit_transform(x, _y = nil)
-         x = check_convert_sample_array(x)
-         fit(x).transform(x)
-       end
-
-       # Perform standardization of the given samples.
-       #
-       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be scaled.
-       # @return [Numo::DFloat] The scaled samples.
-       def transform(x)
-         x = check_convert_sample_array(x)
-         n_samples, = x.shape
-         (x - @mean_vec.tile(n_samples, 1)) / @std_vec.tile(n_samples, 1)
-       end
-     end
-   end
- end
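What fit and transform do above, spelled out with plain Numo (a sketch; broadcasting a per-feature vector against the sample matrix is equivalent to the tile calls in transform):

    require 'numo/narray'

    x = Numo::DFloat[[1, 2], [3, 4], [5, 6]]
    mean = x.mean(0)       # per-feature means => [3.0, 4.0]
    std  = x.stddev(0)     # per-feature standard deviations
    z = (x - mean) / std   # center, then scale to unit variance
    p z.mean(0).to_a       # => [0.0, 0.0]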
data/lib/rumale/probabilistic_output.rb
@@ -1,114 +0,0 @@
- # frozen_string_literal: true
-
- module Rumale
-   # Module for calculating posterior class probabilities with SVM outputs.
-   # This module is used for internal processes.
-   #
-   # @example
-   #   estimator = Rumale::LinearModel::SVC.new
-   #   estimator.fit(x, bin_y)
-   #   df = estimator.decision_function(x)
-   #   params = Rumale::ProbabilisticOutput.fit_sigmoid(df, bin_y)
-   #   probs = 1 / (Numo::NMath.exp(params[0] * df + params[1]) + 1)
-   #
-   # *Reference*
-   # - Platt, J C., "Probabilistic Outputs for Support Vector Machines and Comparisons to Regularized Likelihood Methods," Adv. Large Margin Classifiers, pp. 61--74, 2000.
-   # - Lin, H-T., Lin, C-J., and Weng, R C., "A Note on Platt's Probabilistic Outputs for Support Vector Machines," Machine Learning, Vol. 68 (3), pp. 267--276, 2007.
-   module ProbabilisticOutput
-     class << self
-       # Fit the probabilistic model for binary SVM outputs.
-       #
-       # @param df [Numo::DFloat] (shape: [n_samples]) The outputs of the decision function to be used for fitting the model.
-       # @param bin_y [Numo::Int32] (shape: [n_samples]) The binary labels to be used for fitting the model.
-       # @param max_iter [Integer] The maximum number of iterations.
-       # @param min_step [Float] The minimum step of Newton's method.
-       # @param sigma [Float] The parameter to prevent the Hessian matrix from becoming singular.
-       # @return [Numo::DFloat] (shape: 2) The parameters of the model.
-       def fit_sigmoid(df, bin_y, max_iter = 100, min_step = 1e-10, sigma = 1e-12)
-         # Initialize some variables.
-         n_samples = bin_y.size
-         negative_label = bin_y.to_a.uniq.min
-         pos = bin_y.ne(negative_label)
-         neg = bin_y.eq(negative_label)
-         n_pos_samples = pos.count
-         n_neg_samples = neg.count
-         target_probs = Numo::DFloat.zeros(n_samples)
-         target_probs[pos] = (n_pos_samples + 1) / (n_pos_samples + 2.0)
-         target_probs[neg] = 1 / (n_neg_samples + 2.0)
-         alpha = 0.0
-         beta = Math.log((n_neg_samples + 1) / (n_pos_samples + 1.0))
-         err = error_function(target_probs, df, alpha, beta)
-         # Optimize parameters for class probability calculation.
-         old_grad_vec = Numo::DFloat.zeros(2)
-         max_iter.times do
-           # Calculate gradient and hessian matrix.
-           probs = predicted_probs(df, alpha, beta)
-           grad_vec = gradient(target_probs, probs, df)
-           hess_mat = hessian_matrix(probs, df, sigma)
-           break if grad_vec.abs.lt(1e-5).count == 2
-           break if (old_grad_vec - grad_vec).abs.sum < 1e-5
-
-           old_grad_vec = grad_vec
-           # Calculate Newton directions.
-           dirs_vec = directions(grad_vec, hess_mat)
-           grad_dir = grad_vec.dot(dirs_vec)
-           stepsize = 2.0
-           while stepsize >= min_step
-             stepsize *= 0.5
-             new_alpha = alpha + stepsize * dirs_vec[0]
-             new_beta = beta + stepsize * dirs_vec[1]
-             new_err = error_function(target_probs, df, new_alpha, new_beta)
-             next unless new_err < err + 0.0001 * stepsize * grad_dir
-
-             alpha = new_alpha
-             beta = new_beta
-             err = new_err
-             break
-           end
-         end
-         Numo::DFloat[alpha, beta]
-       end
-
-       private
-
-       def error_function(target_probs, df, alpha, beta)
-         fn = alpha * df + beta
-         pos = fn.ge(0.0)
-         neg = fn.lt(0.0)
-         err = 0.0
-         err += (target_probs[pos] * fn[pos] + Numo::NMath.log(1 + Numo::NMath.exp(-fn[pos]))).sum if pos.count.positive?
-         err += ((target_probs[neg] - 1) * fn[neg] + Numo::NMath.log(1 + Numo::NMath.exp(fn[neg]))).sum if neg.count.positive?
-         err
-       end
-
-       def predicted_probs(df, alpha, beta)
-         fn = alpha * df + beta
-         pos = fn.ge(0.0)
-         neg = fn.lt(0.0)
-         probs = Numo::DFloat.zeros(df.shape[0])
-         probs[pos] = Numo::NMath.exp(-fn[pos]) / (1 + Numo::NMath.exp(-fn[pos])) if pos.count.positive?
-         probs[neg] = 1 / (1 + Numo::NMath.exp(fn[neg])) if neg.count.positive?
-         probs
-       end
-
-       def gradient(target_probs, probs, df)
-         sub = target_probs - probs
-         Numo::DFloat[(df * sub).sum, sub.sum]
-       end
-
-       def hessian_matrix(probs, df, sigma)
-         sub = probs * (1 - probs)
-         h11 = (df**2 * sub).sum + sigma
-         h22 = sub.sum + sigma
-         h21 = (df * sub).sum
-         Numo::DFloat[[h11, h21], [h21, h22]]
-       end
-
-       def directions(grad_vec, hess_mat)
-         det = hess_mat[0, 0] * hess_mat[1, 1] - hess_mat[0, 1] * hess_mat[1, 0]
-         inv_hess_mat = Numo::DFloat[[hess_mat[1, 1], -hess_mat[0, 1]], [-hess_mat[1, 0], hess_mat[0, 0]]] / det
-         -inv_hess_mat.dot(grad_vec)
-       end
-     end
-   end
- end
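fit_sigmoid fits Platt's model P(y = 1 | f) = 1 / (1 + exp(alpha * f + beta)) to decision-function outputs f by Newton's method. A sketch on synthetic scores, assuming the pre-split require 'rumale' entry point (the module is internal API, so treat it as such):

    require 'rumale'

    df    = Numo::DFloat[-2.0, -1.5, -0.2, 0.3, 1.1, 2.4] # decision-function outputs
    bin_y = Numo::Int32[-1, -1, -1, 1, 1, 1]              # binary labels
    alpha, beta = Rumale::ProbabilisticOutput.fit_sigmoid(df, bin_y).to_a
    # Posterior probabilities, as in the @example above; they increase
    # monotonically with df because the fitted alpha is negative.
    probs = 1 / (Numo::NMath.exp(alpha * df + beta) + 1)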
data/lib/rumale/tree/base_decision_tree.rb
@@ -1,150 +0,0 @@
- # frozen_string_literal: true
-
- require 'rumale/base/base_estimator'
- require 'rumale/tree/node'
- require 'rumale/rumaleext'
-
- module Rumale
-   # This module consists of the classes that implement tree models.
-   module Tree
-     # BaseDecisionTree is an abstract class for implementing decision tree-based estimators.
-     # This class is used internally.
-     class BaseDecisionTree
-       include Base::BaseEstimator
-
-       # Initialize a decision tree-based estimator.
-       #
-       # @param criterion [String] The function used to evaluate splitting points.
-       # @param max_depth [Integer] The maximum depth of the tree.
-       #   If nil is given, the decision tree grows without concern for depth.
-       # @param max_leaf_nodes [Integer] The maximum number of leaves on the decision tree.
-       #   If nil is given, the number of leaves is not limited.
-       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
-       # @param max_features [Integer] The number of features to consider when searching for the optimal split point.
-       #   If nil is given, the split process considers all features.
-       # @param random_seed [Integer] The seed value used to initialize the random generator.
-       #   It is used to randomly determine the order of features when deciding a splitting point.
-       def initialize(criterion: nil, max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil, random_seed: nil)
-         @params = {}
-         @params[:criterion] = criterion
-         @params[:max_depth] = max_depth
-         @params[:max_leaf_nodes] = max_leaf_nodes
-         @params[:min_samples_leaf] = min_samples_leaf
-         @params[:max_features] = max_features
-         @params[:random_seed] = random_seed
-         @params[:random_seed] ||= srand
-         @tree = nil
-         @feature_importances = nil
-         @n_leaves = nil
-         @rng = Random.new(@params[:random_seed])
-       end
-
-       # Return the index of the leaf that each sample reached.
-       #
-       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
-       # @return [Numo::Int32] (shape: [n_samples]) The leaf index for each sample.
-       def apply(x)
-         x = check_convert_sample_array(x)
-         Numo::Int32[*(Array.new(x.shape[0]) { |n| partial_apply(@tree, x[n, true]) })]
-       end
-
-       private
-
-       def partial_apply(tree, sample)
-         node = tree
-         until node.leaf
-           # :nocov:
-           node = if node.right.nil?
-                    node.left
-                  elsif node.left.nil?
-                    node.right
-                  # :nocov:
-                  else
-                    sample[node.feature_id] <= node.threshold ? node.left : node.right
-                  end
-         end
-         node.leaf_id
-       end
-
-       def build_tree(x, y)
-         y = y.expand_dims(1).dup if y.shape[1].nil?
-         @feature_ids = Array.new(x.shape[1]) { |v| v }
-         @tree = grow_node(0, x, y, impurity(y))
-         @feature_ids = nil
-         nil
-       end
-
-       def grow_node(depth, x, y, impurity)
-         # initialize node.
-         n_samples = x.shape[0]
-         node = Node.new(depth: depth, impurity: impurity, n_samples: n_samples)
-
-         # terminate growing.
-         return nil if !@params[:max_leaf_nodes].nil? && @n_leaves >= @params[:max_leaf_nodes]
-         return nil if n_samples < @params[:min_samples_leaf]
-         return put_leaf(node, y) if n_samples == @params[:min_samples_leaf]
-         return put_leaf(node, y) if !@params[:max_depth].nil? && depth == @params[:max_depth]
-         return put_leaf(node, y) if stop_growing?(y)
-
-         # calculate optimal parameters.
-         feature_id, left_imp, right_imp, threshold, gain =
-           rand_ids.map { |n| [n, *best_split(x[true, n], y, impurity)] }.max_by(&:last)
-
-         return put_leaf(node, y) if gain.nil? || gain.zero?
-
-         left_ids = x[true, feature_id].le(threshold).where
-         right_ids = x[true, feature_id].gt(threshold).where
-         node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids, true], left_imp)
-         node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids, true], right_imp)
-
-         return put_leaf(node, y) if node.left.nil? && node.right.nil?
-
-         node.feature_id = feature_id
-         node.threshold = threshold
-         node.leaf = false
-         node
-       end
-
-       def stop_growing?(_y)
-         raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
-       end
-
-       def put_leaf(_node, _y)
-         raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
-       end
-
-       def rand_ids
-         @feature_ids.sample(@params[:max_features], random: @sub_rng)
-       end
-
-       def best_split(_features, _y, _impurity)
-         raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
-       end
-
-       def impurity(_y)
-         raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
-       end
-
-       def eval_importance(n_samples, n_features)
-         @feature_importances = Numo::DFloat.zeros(n_features)
-         eval_importance_at_node(@tree)
-         @feature_importances /= n_samples
-         normalizer = @feature_importances.sum
-         @feature_importances /= normalizer if normalizer > 0.0
-         nil
-       end
-
-       def eval_importance_at_node(node)
-         return nil if node.leaf
-         return nil if node.left.nil? || node.right.nil?
-
-         gain = node.n_samples * node.impurity -
-                node.left.n_samples * node.left.impurity -
-                node.right.n_samples * node.right.impurity
-         @feature_importances[node.feature_id] += gain
-         eval_importance_at_node(node.left)
-         eval_importance_at_node(node.right)
-       end
-     end
-   end
- end
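A sketch of apply through a concrete subclass, assuming the pre-split require 'rumale' entry point; DecisionTreeClassifier (also removed in this release) inherits apply from this base class. The exact leaf ids depend on growth order:

    require 'rumale'

    x = Numo::DFloat[[-1.0, 0.0], [-0.5, 1.0], [0.5, 0.0], [1.0, 1.0]]
    y = Numo::Int32[0, 0, 1, 1]
    tree = Rumale::Tree::DecisionTreeClassifier.new(max_depth: 1, random_seed: 1)
    tree.fit(x, y)
    p tree.apply(x).to_a # leaf index reached by each sample, e.g. [0, 0, 1, 1]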