rumale 0.23.3 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE.txt +5 -1
  3. data/README.md +3 -288
  4. data/lib/rumale/version.rb +1 -1
  5. data/lib/rumale.rb +20 -131
  6. metadata +252 -150
  7. data/CHANGELOG.md +0 -643
  8. data/CODE_OF_CONDUCT.md +0 -74
  9. data/ext/rumale/extconf.rb +0 -37
  10. data/ext/rumale/rumaleext.c +0 -545
  11. data/ext/rumale/rumaleext.h +0 -12
  12. data/lib/rumale/base/base_estimator.rb +0 -49
  13. data/lib/rumale/base/classifier.rb +0 -36
  14. data/lib/rumale/base/cluster_analyzer.rb +0 -31
  15. data/lib/rumale/base/evaluator.rb +0 -17
  16. data/lib/rumale/base/regressor.rb +0 -36
  17. data/lib/rumale/base/splitter.rb +0 -21
  18. data/lib/rumale/base/transformer.rb +0 -22
  19. data/lib/rumale/clustering/dbscan.rb +0 -123
  20. data/lib/rumale/clustering/gaussian_mixture.rb +0 -218
  21. data/lib/rumale/clustering/hdbscan.rb +0 -291
  22. data/lib/rumale/clustering/k_means.rb +0 -122
  23. data/lib/rumale/clustering/k_medoids.rb +0 -141
  24. data/lib/rumale/clustering/mini_batch_k_means.rb +0 -139
  25. data/lib/rumale/clustering/power_iteration.rb +0 -127
  26. data/lib/rumale/clustering/single_linkage.rb +0 -203
  27. data/lib/rumale/clustering/snn.rb +0 -76
  28. data/lib/rumale/clustering/spectral_clustering.rb +0 -115
  29. data/lib/rumale/dataset.rb +0 -246
  30. data/lib/rumale/decomposition/factor_analysis.rb +0 -150
  31. data/lib/rumale/decomposition/fast_ica.rb +0 -188
  32. data/lib/rumale/decomposition/nmf.rb +0 -124
  33. data/lib/rumale/decomposition/pca.rb +0 -159
  34. data/lib/rumale/ensemble/ada_boost_classifier.rb +0 -179
  35. data/lib/rumale/ensemble/ada_boost_regressor.rb +0 -160
  36. data/lib/rumale/ensemble/extra_trees_classifier.rb +0 -139
  37. data/lib/rumale/ensemble/extra_trees_regressor.rb +0 -125
  38. data/lib/rumale/ensemble/gradient_boosting_classifier.rb +0 -306
  39. data/lib/rumale/ensemble/gradient_boosting_regressor.rb +0 -237
  40. data/lib/rumale/ensemble/random_forest_classifier.rb +0 -189
  41. data/lib/rumale/ensemble/random_forest_regressor.rb +0 -153
  42. data/lib/rumale/ensemble/stacking_classifier.rb +0 -215
  43. data/lib/rumale/ensemble/stacking_regressor.rb +0 -163
  44. data/lib/rumale/ensemble/voting_classifier.rb +0 -126
  45. data/lib/rumale/ensemble/voting_regressor.rb +0 -82
  46. data/lib/rumale/evaluation_measure/accuracy.rb +0 -29
  47. data/lib/rumale/evaluation_measure/adjusted_rand_score.rb +0 -74
  48. data/lib/rumale/evaluation_measure/calinski_harabasz_score.rb +0 -56
  49. data/lib/rumale/evaluation_measure/davies_bouldin_score.rb +0 -53
  50. data/lib/rumale/evaluation_measure/explained_variance_score.rb +0 -39
  51. data/lib/rumale/evaluation_measure/f_score.rb +0 -50
  52. data/lib/rumale/evaluation_measure/function.rb +0 -147
  53. data/lib/rumale/evaluation_measure/log_loss.rb +0 -45
  54. data/lib/rumale/evaluation_measure/mean_absolute_error.rb +0 -29
  55. data/lib/rumale/evaluation_measure/mean_squared_error.rb +0 -29
  56. data/lib/rumale/evaluation_measure/mean_squared_log_error.rb +0 -29
  57. data/lib/rumale/evaluation_measure/median_absolute_error.rb +0 -30
  58. data/lib/rumale/evaluation_measure/mutual_information.rb +0 -49
  59. data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +0 -53
  60. data/lib/rumale/evaluation_measure/precision.rb +0 -50
  61. data/lib/rumale/evaluation_measure/precision_recall.rb +0 -96
  62. data/lib/rumale/evaluation_measure/purity.rb +0 -40
  63. data/lib/rumale/evaluation_measure/r2_score.rb +0 -43
  64. data/lib/rumale/evaluation_measure/recall.rb +0 -50
  65. data/lib/rumale/evaluation_measure/roc_auc.rb +0 -130
  66. data/lib/rumale/evaluation_measure/silhouette_score.rb +0 -82
  67. data/lib/rumale/feature_extraction/feature_hasher.rb +0 -110
  68. data/lib/rumale/feature_extraction/hash_vectorizer.rb +0 -155
  69. data/lib/rumale/feature_extraction/tfidf_transformer.rb +0 -113
  70. data/lib/rumale/kernel_approximation/nystroem.rb +0 -126
  71. data/lib/rumale/kernel_approximation/rbf.rb +0 -102
  72. data/lib/rumale/kernel_machine/kernel_fda.rb +0 -120
  73. data/lib/rumale/kernel_machine/kernel_pca.rb +0 -97
  74. data/lib/rumale/kernel_machine/kernel_ridge.rb +0 -82
  75. data/lib/rumale/kernel_machine/kernel_ridge_classifier.rb +0 -92
  76. data/lib/rumale/kernel_machine/kernel_svc.rb +0 -193
  77. data/lib/rumale/linear_model/base_sgd.rb +0 -285
  78. data/lib/rumale/linear_model/elastic_net.rb +0 -119
  79. data/lib/rumale/linear_model/lasso.rb +0 -115
  80. data/lib/rumale/linear_model/linear_regression.rb +0 -201
  81. data/lib/rumale/linear_model/logistic_regression.rb +0 -275
  82. data/lib/rumale/linear_model/nnls.rb +0 -137
  83. data/lib/rumale/linear_model/ridge.rb +0 -209
  84. data/lib/rumale/linear_model/svc.rb +0 -213
  85. data/lib/rumale/linear_model/svr.rb +0 -132
  86. data/lib/rumale/manifold/mds.rb +0 -155
  87. data/lib/rumale/manifold/tsne.rb +0 -222
  88. data/lib/rumale/metric_learning/fisher_discriminant_analysis.rb +0 -113
  89. data/lib/rumale/metric_learning/mlkr.rb +0 -161
  90. data/lib/rumale/metric_learning/neighbourhood_component_analysis.rb +0 -167
  91. data/lib/rumale/model_selection/cross_validation.rb +0 -125
  92. data/lib/rumale/model_selection/function.rb +0 -42
  93. data/lib/rumale/model_selection/grid_search_cv.rb +0 -225
  94. data/lib/rumale/model_selection/group_k_fold.rb +0 -93
  95. data/lib/rumale/model_selection/group_shuffle_split.rb +0 -115
  96. data/lib/rumale/model_selection/k_fold.rb +0 -81
  97. data/lib/rumale/model_selection/shuffle_split.rb +0 -90
  98. data/lib/rumale/model_selection/stratified_k_fold.rb +0 -99
  99. data/lib/rumale/model_selection/stratified_shuffle_split.rb +0 -118
  100. data/lib/rumale/model_selection/time_series_split.rb +0 -91
  101. data/lib/rumale/multiclass/one_vs_rest_classifier.rb +0 -83
  102. data/lib/rumale/naive_bayes/base_naive_bayes.rb +0 -47
  103. data/lib/rumale/naive_bayes/bernoulli_nb.rb +0 -82
  104. data/lib/rumale/naive_bayes/complement_nb.rb +0 -85
  105. data/lib/rumale/naive_bayes/gaussian_nb.rb +0 -69
  106. data/lib/rumale/naive_bayes/multinomial_nb.rb +0 -74
  107. data/lib/rumale/naive_bayes/negation_nb.rb +0 -71
  108. data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +0 -133
  109. data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +0 -108
  110. data/lib/rumale/nearest_neighbors/vp_tree.rb +0 -132
  111. data/lib/rumale/neural_network/adam.rb +0 -56
  112. data/lib/rumale/neural_network/base_mlp.rb +0 -248
  113. data/lib/rumale/neural_network/mlp_classifier.rb +0 -120
  114. data/lib/rumale/neural_network/mlp_regressor.rb +0 -90
  115. data/lib/rumale/pairwise_metric.rb +0 -152
  116. data/lib/rumale/pipeline/feature_union.rb +0 -69
  117. data/lib/rumale/pipeline/pipeline.rb +0 -175
  118. data/lib/rumale/preprocessing/bin_discretizer.rb +0 -93
  119. data/lib/rumale/preprocessing/binarizer.rb +0 -60
  120. data/lib/rumale/preprocessing/kernel_calculator.rb +0 -92
  121. data/lib/rumale/preprocessing/l1_normalizer.rb +0 -62
  122. data/lib/rumale/preprocessing/l2_normalizer.rb +0 -63
  123. data/lib/rumale/preprocessing/label_binarizer.rb +0 -89
  124. data/lib/rumale/preprocessing/label_encoder.rb +0 -79
  125. data/lib/rumale/preprocessing/max_abs_scaler.rb +0 -61
  126. data/lib/rumale/preprocessing/max_normalizer.rb +0 -62
  127. data/lib/rumale/preprocessing/min_max_scaler.rb +0 -76
  128. data/lib/rumale/preprocessing/one_hot_encoder.rb +0 -100
  129. data/lib/rumale/preprocessing/ordinal_encoder.rb +0 -109
  130. data/lib/rumale/preprocessing/polynomial_features.rb +0 -109
  131. data/lib/rumale/preprocessing/standard_scaler.rb +0 -71
  132. data/lib/rumale/probabilistic_output.rb +0 -114
  133. data/lib/rumale/tree/base_decision_tree.rb +0 -150
  134. data/lib/rumale/tree/decision_tree_classifier.rb +0 -150
  135. data/lib/rumale/tree/decision_tree_regressor.rb +0 -116
  136. data/lib/rumale/tree/extra_tree_classifier.rb +0 -107
  137. data/lib/rumale/tree/extra_tree_regressor.rb +0 -94
  138. data/lib/rumale/tree/gradient_tree_regressor.rb +0 -202
  139. data/lib/rumale/tree/node.rb +0 -39
  140. data/lib/rumale/utils.rb +0 -42
  141. data/lib/rumale/validation.rb +0 -128
  142. data/lib/rumale/values.rb +0 -13
@@ -1,150 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/tree/base_decision_tree'
4
- require 'rumale/base/classifier'
5
-
6
- module Rumale
7
- module Tree
8
# DecisionTreeClassifier is a decision tree estimator for classification tasks.
#
# @example
#   estimator =
#     Rumale::Tree::DecisionTreeClassifier.new(
#       criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
#   estimator.fit(training_samples, training_labels)
#   results = estimator.predict(testing_samples)
#
class DecisionTreeClassifier < BaseDecisionTree
  include Base::Classifier
  include ExtDecisionTreeClassifier

  # Return the class labels.
  # @return [Numo::Int32] (size: n_classes)
  attr_reader :classes

  # Return the importance for each feature.
  # @return [Numo::DFloat] (size: n_features)
  attr_reader :feature_importances

  # Return the learned tree.
  # @return [Node]
  attr_reader :tree

  # Return the random generator for random selection of feature index.
  # @return [Random]
  attr_reader :rng

  # Return the labels assigned to each leaf.
  # @return [Numo::Int32] (size: n_leafs)
  attr_reader :leaf_labels

  # Create a new classifier with decision tree algorithm.
  #
  # @param criterion [String] The function used to evaluate a splitting point.
  #   Supported criteria are 'gini' and 'entropy'.
  # @param max_depth [Integer] The maximum depth of the tree.
  #   If nil is given, the tree grows without concern for depth.
  # @param max_leaf_nodes [Integer] The maximum number of leaves on the tree.
  #   If nil is given, the number of leaves is not limited.
  # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
  # @param max_features [Integer] The number of features to consider when searching for the optimal split point.
  #   If nil is given, the split process considers all features.
  # @param random_seed [Integer] The seed value used to initialize the random generator.
  #   It is used to randomly determine the order of features when deciding a splitting point.
  def initialize(criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
                 random_seed: nil)
    check_params_numeric_or_nil(
      max_depth: max_depth, max_leaf_nodes: max_leaf_nodes, max_features: max_features, random_seed: random_seed
    )
    check_params_numeric(min_samples_leaf: min_samples_leaf)
    check_params_string(criterion: criterion)
    check_params_positive(
      max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
      min_samples_leaf: min_samples_leaf, max_features: max_features
    )
    # Bare super forwards all keyword arguments unchanged to the parent constructor.
    super
    @leaf_labels = nil
  end

  # Fit the model with given training data.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
  # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
  # @return [DecisionTreeClassifier] The learned classifier itself.
  def fit(x, y)
    x = check_convert_sample_array(x)
    y = check_convert_label_array(y)
    check_sample_label_size(x, y)

    n_samples, n_features = x.shape
    # Default to all features, and never consider more features than the data has.
    requested = @params[:max_features].nil? ? n_features : @params[:max_features]
    @params[:max_features] = [requested, n_features].min

    sorted_labels = y.to_a.uniq.sort
    @classes = Numo::Int32.asarray(sorted_labels)
    @n_leaves = 0
    @leaf_labels = []
    @sub_rng = @rng.dup

    # The tree is grown on class indices (positions in the sorted label list), not on raw labels.
    class_ids = y.map { |label| sorted_labels.index(label) }
    build_tree(x, class_ids)
    eval_importance(n_samples, n_features)
    @leaf_labels = Numo::Int32[*@leaf_labels]
    self
  end

  # Predict class labels for samples.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
  # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
  def predict(x)
    x = check_convert_sample_array(x)
    leaf_ids = apply(x)
    @leaf_labels[leaf_ids].dup
  end

  # Predict probability for samples.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
  # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
  def predict_proba(x)
    x = check_convert_sample_array(x)
    rows = Array.new(x.shape[0]) { |i| partial_predict_proba(@tree, x[i, true]) }
    Numo::DFloat[*rows]
  end

  private

  # Walk from the given node down to a leaf for one sample and return that leaf's class probabilities.
  def partial_predict_proba(tree, sample)
    current = tree
    until current.leaf
      # :nocov:
      current = if current.right.nil?
                  current.left
                elsif current.left.nil?
                  current.right
                # :nocov:
                elsif sample[current.feature_id] <= current.threshold
                  current.left
                else
                  current.right
                end
    end
    current.probs
  end

  # A node is pure, and needs no further splitting, when it holds a single distinct class index.
  def stop_growing?(y)
    distinct = y[true, 0].to_a.uniq
    distinct.size == 1
  end

  # Turn the node into a leaf: store its class distribution and record the majority label.
  def put_leaf(node, y)
    counts = y.flatten.bincount(minlength: @classes.size)
    node.probs = counts / node.n_samples.to_f
    node.leaf = true
    node.leaf_id = @n_leaves
    @n_leaves += 1
    @leaf_labels.push(@classes[node.probs.max_index])
    node
  end

  # Delegate the split search for one feature column to find_split_params (not defined in this class).
  def best_split(features, y, whole_impurity)
    find_split_params(
      @params[:criterion], whole_impurity, features.sort_index, features, y[true, 0], @classes.size
    )
  end

  # Delegate the impurity computation for the given labels to node_impurity (not defined in this class).
  def impurity(y)
    node_impurity(@params[:criterion], y[true, 0].dup, y.shape[0], @classes.size)
  end
end
149
- end
150
- end
@@ -1,116 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/tree/base_decision_tree'
4
- require 'rumale/base/regressor'
5
-
6
- module Rumale
7
- module Tree
8
- # DecisionTreeRegressor is a class that implements decision tree for regression.
9
- #
10
- # @example
11
- # estimator =
12
- # Rumale::Tree::DecisionTreeRegressor.new(
13
- # max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
14
- # estimator.fit(training_samples, traininig_values)
15
- # results = estimator.predict(testing_samples)
16
- #
17
- class DecisionTreeRegressor < BaseDecisionTree
18
- include Base::Regressor
19
- include ExtDecisionTreeRegressor
20
-
21
- # Return the importance for each feature.
22
- # @return [Numo::DFloat] (size: n_features)
23
- attr_reader :feature_importances
24
-
25
- # Return the learned tree.
26
- # @return [Node]
27
- attr_reader :tree
28
-
29
- # Return the random generator for random selection of feature index.
30
- # @return [Random]
31
- attr_reader :rng
32
-
33
- # Return the values assigned each leaf.
34
- # @return [Numo::DFloat] (shape: [n_leafs, n_outputs])
35
- attr_reader :leaf_values
36
-
37
- # Create a new regressor with decision tree algorithm.
38
- #
39
- # @param criterion [String] The function to evaluate spliting point. Supported criteria are 'mae' and 'mse'.
40
- # @param max_depth [Integer] The maximum depth of the tree.
41
- # If nil is given, decision tree grows without concern for depth.
42
- # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
43
- # If nil is given, number of leaves is not limited.
44
- # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
45
- # @param max_features [Integer] The number of features to consider when searching optimal split point.
46
- # If nil is given, split process considers all features.
47
- # @param random_seed [Integer] The seed value using to initialize the random generator.
48
- # It is used to randomly determine the order of features when deciding spliting point.
49
- def initialize(criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
50
- random_seed: nil)
51
- check_params_numeric_or_nil(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
52
- max_features: max_features, random_seed: random_seed)
53
- check_params_numeric(min_samples_leaf: min_samples_leaf)
54
- check_params_string(criterion: criterion)
55
- check_params_positive(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
56
- min_samples_leaf: min_samples_leaf, max_features: max_features)
57
- super
58
- @leaf_values = nil
59
- end
60
-
61
- # Fit the model with given training data.
62
- #
63
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
64
- # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The taget values to be used for fitting the model.
65
- # @return [DecisionTreeRegressor] The learned regressor itself.
66
- def fit(x, y)
67
- x = check_convert_sample_array(x)
68
- y = check_convert_tvalue_array(y)
69
- check_sample_tvalue_size(x, y)
70
- n_samples, n_features = x.shape
71
- @params[:max_features] = n_features if @params[:max_features].nil?
72
- @params[:max_features] = [@params[:max_features], n_features].min
73
- @n_leaves = 0
74
- @leaf_values = []
75
- @sub_rng = @rng.dup
76
- build_tree(x, y)
77
- eval_importance(n_samples, n_features)
78
- @leaf_values = Numo::DFloat.cast(@leaf_values)
79
- @leaf_values = @leaf_values.flatten.dup if @leaf_values.shape[1] == 1
80
- self
81
- end
82
-
83
- # Predict values for samples.
84
- #
85
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
86
- # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
87
- def predict(x)
88
- x = check_convert_sample_array(x)
89
- @leaf_values.shape[1].nil? ? @leaf_values[apply(x)].dup : @leaf_values[apply(x), true].dup
90
- end
91
-
92
- private
93
-
94
- def stop_growing?(y)
95
- y.to_a.uniq.size == 1
96
- end
97
-
98
- def put_leaf(node, y)
99
- node.probs = nil
100
- node.leaf = true
101
- node.leaf_id = @n_leaves
102
- @n_leaves += 1
103
- @leaf_values.push(y.mean(0))
104
- node
105
- end
106
-
107
- def best_split(f, y, impurity)
108
- find_split_params(@params[:criterion], impurity, f.sort_index, f, y)
109
- end
110
-
111
- def impurity(y)
112
- node_impurity(@params[:criterion], y.to_a)
113
- end
114
- end
115
- end
116
- end
@@ -1,107 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/tree/decision_tree_classifier'
4
-
5
- module Rumale
6
- module Tree
7
# ExtraTreeClassifier is an extra randomized tree estimator for classification tasks.
# It differs from its parent class only in how a split threshold is chosen: the threshold
# is drawn at random instead of being searched for.
#
# @example
#   estimator =
#     Rumale::Tree::ExtraTreeClassifier.new(
#       criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
#   estimator.fit(training_samples, training_labels)
#   results = estimator.predict(testing_samples)
#
# *Reference*
# - Geurts, P., Ernst, D., and Wehenkel, L., "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
class ExtraTreeClassifier < DecisionTreeClassifier
  # Return the class labels.
  # @return [Numo::Int32] (size: n_classes)
  attr_reader :classes

  # Return the importance for each feature.
  # @return [Numo::DFloat] (size: n_features)
  attr_reader :feature_importances

  # Return the learned tree.
  # @return [Node]
  attr_reader :tree

  # Return the random generator for random selection of feature index.
  # @return [Random]
  attr_reader :rng

  # Return the labels assigned to each leaf.
  # @return [Numo::Int32] (size: n_leafs)
  attr_reader :leaf_labels

  # Create a new classifier with extra randomized tree algorithm.
  #
  # @param criterion [String] The function used to evaluate a splitting point.
  #   Supported criteria are 'gini' and 'entropy'.
  # @param max_depth [Integer] The maximum depth of the tree.
  #   If nil is given, the extra tree grows without concern for depth.
  # @param max_leaf_nodes [Integer] The maximum number of leaves on the extra tree.
  #   If nil is given, the number of leaves is not limited.
  # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
  # @param max_features [Integer] The number of features to consider when searching for the optimal split point.
  #   If nil is given, the split process considers all features.
  # @param random_seed [Integer] The seed value used to initialize the random generator.
  #   It is used to randomly determine the order of features when deciding a splitting point.
  def initialize(criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
                 random_seed: nil)
    check_params_numeric_or_nil(
      max_depth: max_depth, max_leaf_nodes: max_leaf_nodes, max_features: max_features, random_seed: random_seed
    )
    check_params_numeric(min_samples_leaf: min_samples_leaf)
    check_params_string(criterion: criterion)
    check_params_positive(
      max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
      min_samples_leaf: min_samples_leaf, max_features: max_features
    )
    # Bare super forwards all keyword arguments unchanged to the parent constructor.
    super
  end

  # Fit the model with given training data.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
  # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
  # @return [ExtraTreeClassifier] The learned classifier itself.
  def fit(x, y)
    samples = check_convert_sample_array(x)
    labels = check_convert_label_array(y)
    check_sample_label_size(samples, labels)
    super(samples, labels)
  end

  # Predict class labels for samples.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
  # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
  def predict(x)
    super(check_convert_sample_array(x))
  end

  # Predict probability for samples.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
  # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
  def predict_proba(x)
    super(check_convert_sample_array(x))
  end

  private

  # Draw a random threshold between the feature's minimum and maximum, then score the resulting
  # partition. Returns [left impurity, right impurity, threshold, gain].
  def best_split(features, y, whole_impurity)
    n_samples = y.shape[0]
    threshold = @sub_rng.rand(features.min..features.max)
    left = features.le(threshold).where
    right = features.gt(threshold).where
    # An empty side contributes no impurity.
    left_impurity = left.empty? ? 0.0 : impurity(y[left, true])
    right_impurity = right.empty? ? 0.0 : impurity(y[right, true])
    # Gain is the parent impurity minus the size-weighted child impurities.
    gain = whole_impurity -
           left_impurity * left.size.fdiv(n_samples) -
           right_impurity * right.size.fdiv(n_samples)
    [left_impurity, right_impurity, threshold, gain]
  end
end
106
- end
107
- end
@@ -1,94 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/tree/decision_tree_regressor'
4
-
5
- module Rumale
6
- module Tree
7
# ExtraTreeRegressor is an extra randomized tree estimator for regression tasks.
# It differs from its parent class only in how a split threshold is chosen: the threshold
# is drawn at random instead of being searched for.
#
# @example
#   estimator =
#     Rumale::Tree::ExtraTreeRegressor.new(
#       max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
#   estimator.fit(training_samples, training_values)
#   results = estimator.predict(testing_samples)
#
# *Reference*
# - Geurts, P., Ernst, D., and Wehenkel, L., "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
class ExtraTreeRegressor < DecisionTreeRegressor
  # Return the importance for each feature.
  # @return [Numo::DFloat] (size: n_features)
  attr_reader :feature_importances

  # Return the learned tree.
  # @return [Node]
  attr_reader :tree

  # Return the random generator for random selection of feature index.
  # @return [Random]
  attr_reader :rng

  # Return the values assigned to each leaf.
  # @return [Numo::DFloat] (shape: [n_leafs, n_outputs])
  attr_reader :leaf_values

  # Create a new regressor with extra randomized tree algorithm.
  #
  # @param criterion [String] The function used to evaluate a splitting point.
  #   Supported criteria are 'mae' and 'mse'.
  # @param max_depth [Integer] The maximum depth of the tree.
  #   If nil is given, the extra tree grows without concern for depth.
  # @param max_leaf_nodes [Integer] The maximum number of leaves on the extra tree.
  #   If nil is given, the number of leaves is not limited.
  # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
  # @param max_features [Integer] The number of features to consider when searching for the optimal split point.
  #   If nil is given, the split process considers all features.
  # @param random_seed [Integer] The seed value used to initialize the random generator.
  #   It is used to randomly determine the order of features when deciding a splitting point.
  def initialize(criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
                 random_seed: nil)
    check_params_numeric_or_nil(
      max_depth: max_depth, max_leaf_nodes: max_leaf_nodes, max_features: max_features, random_seed: random_seed
    )
    check_params_numeric(min_samples_leaf: min_samples_leaf)
    check_params_string(criterion: criterion)
    check_params_positive(
      max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
      min_samples_leaf: min_samples_leaf, max_features: max_features
    )
    # Bare super forwards all keyword arguments unchanged to the parent constructor.
    super
  end

  # Fit the model with given training data.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
  # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
  # @return [ExtraTreeRegressor] The learned regressor itself.
  def fit(x, y)
    samples = check_convert_sample_array(x)
    targets = check_convert_tvalue_array(y)
    check_sample_tvalue_size(samples, targets)
    super(samples, targets)
  end

  # Predict values for samples.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
  # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
  def predict(x)
    super(check_convert_sample_array(x))
  end

  private

  # Draw a random threshold between the feature's minimum and maximum, then score the resulting
  # partition. Returns [left impurity, right impurity, threshold, gain].
  def best_split(features, y, whole_impurity)
    n_samples = y.shape[0]
    threshold = @sub_rng.rand(features.min..features.max)
    left = features.le(threshold).where
    right = features.gt(threshold).where
    # An empty side contributes no impurity.
    left_impurity = left.empty? ? 0.0 : impurity(y[left, true])
    right_impurity = right.empty? ? 0.0 : impurity(y[right, true])
    # Gain is the parent impurity minus the size-weighted child impurities.
    gain = whole_impurity -
           left_impurity * left.size.fdiv(n_samples) -
           right_impurity * right.size.fdiv(n_samples)
    [left_impurity, right_impurity, threshold, gain]
  end
end
93
- end
94
- end
@@ -1,202 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/base_estimator'
4
- require 'rumale/base/regressor'
5
- require 'rumale/rumaleext'
6
- require 'rumale/tree/node'
7
-
8
- module Rumale
9
- module Tree
10
# GradientTreeRegressor is a class that implements decision tree for regression with exact greedy algorithm.
# This class is used internally for estimators with gradient tree boosting.
#
# *Reference*
# - Friedman, J H., "Greedy Function Approximation: A Gradient Boosting Machine," Annals of Statistics, 29 (5), pp. 1189--1232, 2001.
# - Friedman, J H., "Stochastic Gradient Boosting," Computational Statistics and Data Analysis, 38 (4), pp. 367--378, 2002.
# - Chen, T., and Guestrin, C., "XGBoost: A Scalable Tree Boosting System," Proc. KDD'16, pp. 785--794, 2016.
class GradientTreeRegressor
  include Base::BaseEstimator
  include Base::Regressor
  include ExtGradientTreeRegressor

  # Return the importance for each feature.
  # The feature importances are calculated based on the numbers of times the feature is used for splitting.
  # @return [Numo::DFloat] (shape: [n_features])
  attr_reader :feature_importances

  # Return the learned tree.
  # @return [Node]
  attr_reader :tree

  # Return the random generator for random selection of feature index.
  # @return [Random]
  attr_reader :rng

  # Return the values assigned to each leaf.
  # @return [Numo::DFloat] (shape: [n_leaves])
  attr_reader :leaf_weights

  # Initialize a gradient tree regressor
  #
  # @param reg_lambda [Float] The L2 regularization term on weight.
  # @param shrinkage_rate [Float] The shrinkage rate for weight.
  # @param max_depth [Integer] The maximum depth of the tree.
  #   If nil is given, decision tree grows without concern for depth.
  # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
  #   If nil is given, number of leaves is not limited.
  # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
  # @param max_features [Integer] The number of features to consider when searching optimal split point.
  #   If nil is given, split process considers all features.
  # @param random_seed [Integer] The seed value used to initialize the random generator.
  #   It is used to randomly determine the order of features when deciding a splitting point.
  def initialize(reg_lambda: 0.0, shrinkage_rate: 1.0,
                 max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil, random_seed: nil)
    check_params_numeric_or_nil(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
                                max_features: max_features, random_seed: random_seed)
    check_params_numeric(reg_lambda: reg_lambda, shrinkage_rate: shrinkage_rate, min_samples_leaf: min_samples_leaf)
    check_params_positive(reg_lambda: reg_lambda, shrinkage_rate: shrinkage_rate,
                          max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
                          min_samples_leaf: min_samples_leaf, max_features: max_features)
    @params = {}
    @params[:reg_lambda] = reg_lambda
    @params[:shrinkage_rate] = shrinkage_rate
    @params[:max_depth] = max_depth
    @params[:max_leaf_nodes] = max_leaf_nodes
    @params[:min_samples_leaf] = min_samples_leaf
    @params[:max_features] = max_features
    @params[:random_seed] = random_seed
    # When no seed is given, use the value returned by Kernel#srand as the seed.
    @params[:random_seed] ||= srand
    @tree = nil
    @feature_importances = nil
    @n_leaves = nil
    @leaf_weights = nil
    @rng = Random.new(@params[:random_seed])
  end

  # Fit the model with given training data.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
  # @param y [Numo::DFloat] (shape: [n_samples]) The target values to be used for fitting the model.
  # @param g [Numo::DFloat] (shape: [n_samples]) The gradient of loss function.
  # @param h [Numo::DFloat] (shape: [n_samples]) The hessian of loss function.
  # @raise [ArgumentError] If g or h does not have the same number of samples as x.
  # @return [GradientTreeRegressor] The learned regressor itself.
  def fit(x, y, g, h)
    x = check_convert_sample_array(x)
    y = check_convert_tvalue_array(y)
    g = check_convert_tvalue_array(g)
    h = check_convert_tvalue_array(h)
    check_sample_tvalue_size(x, y)
    # g and h are indexed with the same row ids as x while the tree grows, so a length
    # mismatch is rejected here instead of surfacing later inside the split search.
    n_samples = x.shape[0]
    unless g.shape[0] == n_samples && h.shape[0] == n_samples
      raise ArgumentError, 'Expect the gradient and hessian to have the same number of samples as the training data.'
    end

    # Initialize some variables.
    n_features = x.shape[1]
    @params[:max_features] ||= n_features
    @n_leaves = 0
    @leaf_weights = []
    @feature_importances = Numo::DFloat.zeros(n_features)
    @sub_rng = @rng.dup
    # Build tree.
    build_tree(x, y, g, h)
    @leaf_weights = Numo::DFloat[*@leaf_weights]
    self
  end

  # Predict values for samples.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
  # @return [Numo::DFloat] (size: n_samples) Predicted values per sample.
  def predict(x)
    x = check_convert_sample_array(x)
    @leaf_weights[apply(x)].dup
  end

  # Return the index of the leaf that each sample reached.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to assign to leaves.
  # @return [Numo::Int32] (shape: [n_samples]) Leaf index for sample.
  def apply(x)
    x = check_convert_sample_array(x)
    Numo::Int32[*(Array.new(x.shape[0]) { |n| partial_apply(@tree, x[n, true]) })]
  end

  private

  # Walk from the given node down to a leaf for one sample and return that leaf's id.
  def partial_apply(tree, sample)
    node = tree
    until node.leaf
      # A node may have only one child because grow_node can return nil for a branch.
      # :nocov:
      node = if node.right.nil?
               node.left
             elsif node.left.nil?
               node.right
             # :nocov:
             else
               sample[node.feature_id] <= node.threshold ? node.left : node.right
             end
    end
    node.leaf_id
  end

  # Grow the whole tree from the root. @feature_ids is only needed while the tree is being built.
  def build_tree(x, y, g, h)
    @feature_ids = Array.new(x.shape[1]) { |v| v }
    @tree = grow_node(0, x, y, g, h)
    @feature_ids = nil
    nil
  end

  # Recursively grow the node at the given depth.
  # Returns the node, or nil when the leaf budget is exhausted or the node has too few samples.
  def grow_node(depth, x, y, g, h) # rubocop:disable Metrics/AbcSize
    # initialize some variables.
    sum_g = g.sum
    sum_h = h.sum
    n_samples = x.shape[0]
    node = Node.new(depth: depth, n_samples: n_samples)

    # terminate growing.
    return nil if !@params[:max_leaf_nodes].nil? && @n_leaves >= @params[:max_leaf_nodes]
    return nil if n_samples < @params[:min_samples_leaf]
    return put_leaf(node, sum_g, sum_h) if n_samples == @params[:min_samples_leaf]
    return put_leaf(node, sum_g, sum_h) if !@params[:max_depth].nil? && depth == @params[:max_depth]
    return put_leaf(node, sum_g, sum_h) if stop_growing?(y)

    # calculate optimal parameters: pick the candidate feature whose best split has the largest gain.
    feature_id, threshold, gain = rand_ids.map { |n| [n, *best_split(x[true, n], g, h, sum_g, sum_h)] }.max_by(&:last)

    return put_leaf(node, sum_g, sum_h) if gain.nil? || gain.zero?

    left_ids = x[true, feature_id].le(threshold).where
    right_ids = x[true, feature_id].gt(threshold).where
    node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids], g[left_ids], h[left_ids])
    node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids], g[right_ids], h[right_ids])

    # Neither child could be grown, so this node becomes a leaf itself.
    return put_leaf(node, sum_g, sum_h) if node.left.nil? && node.right.nil?

    @feature_importances[feature_id] += 1.0

    node.feature_id = feature_id
    node.threshold = threshold
    node.leaf = false
    node
  end

  # A node needs no further splitting when all of its target values are identical.
  def stop_growing?(y)
    y.to_a.uniq.size == 1
  end

  # Turn the node into a leaf and record its weight:
  # -shrinkage_rate * sum_g / (sum_h + reg_lambda).
  def put_leaf(node, sum_g, sum_h)
    node.probs = nil
    node.leaf = true
    node.leaf_id = @n_leaves
    weight = -@params[:shrinkage_rate] * sum_g / (sum_h + @params[:reg_lambda])
    @leaf_weights.push(weight)
    @n_leaves += 1
    node
  end

  # Delegate the split search for one feature column to find_split_params (not defined in this class).
  def best_split(f, g, h, sum_g, sum_h)
    find_split_params(f.sort_index, f, g, h, sum_g, sum_h, @params[:reg_lambda])
  end

  # Randomly pick up to max_features candidate feature indices using the per-fit generator.
  def rand_ids
    @feature_ids.sample(@params[:max_features], random: @sub_rng)
  end
end
201
- end
202
- end