rumale 0.23.3 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE.txt +5 -1
- data/README.md +3 -288
- data/lib/rumale/version.rb +1 -1
- data/lib/rumale.rb +20 -131
- metadata +252 -150
- data/CHANGELOG.md +0 -643
- data/CODE_OF_CONDUCT.md +0 -74
- data/ext/rumale/extconf.rb +0 -37
- data/ext/rumale/rumaleext.c +0 -545
- data/ext/rumale/rumaleext.h +0 -12
- data/lib/rumale/base/base_estimator.rb +0 -49
- data/lib/rumale/base/classifier.rb +0 -36
- data/lib/rumale/base/cluster_analyzer.rb +0 -31
- data/lib/rumale/base/evaluator.rb +0 -17
- data/lib/rumale/base/regressor.rb +0 -36
- data/lib/rumale/base/splitter.rb +0 -21
- data/lib/rumale/base/transformer.rb +0 -22
- data/lib/rumale/clustering/dbscan.rb +0 -123
- data/lib/rumale/clustering/gaussian_mixture.rb +0 -218
- data/lib/rumale/clustering/hdbscan.rb +0 -291
- data/lib/rumale/clustering/k_means.rb +0 -122
- data/lib/rumale/clustering/k_medoids.rb +0 -141
- data/lib/rumale/clustering/mini_batch_k_means.rb +0 -139
- data/lib/rumale/clustering/power_iteration.rb +0 -127
- data/lib/rumale/clustering/single_linkage.rb +0 -203
- data/lib/rumale/clustering/snn.rb +0 -76
- data/lib/rumale/clustering/spectral_clustering.rb +0 -115
- data/lib/rumale/dataset.rb +0 -246
- data/lib/rumale/decomposition/factor_analysis.rb +0 -150
- data/lib/rumale/decomposition/fast_ica.rb +0 -188
- data/lib/rumale/decomposition/nmf.rb +0 -124
- data/lib/rumale/decomposition/pca.rb +0 -159
- data/lib/rumale/ensemble/ada_boost_classifier.rb +0 -179
- data/lib/rumale/ensemble/ada_boost_regressor.rb +0 -160
- data/lib/rumale/ensemble/extra_trees_classifier.rb +0 -139
- data/lib/rumale/ensemble/extra_trees_regressor.rb +0 -125
- data/lib/rumale/ensemble/gradient_boosting_classifier.rb +0 -306
- data/lib/rumale/ensemble/gradient_boosting_regressor.rb +0 -237
- data/lib/rumale/ensemble/random_forest_classifier.rb +0 -189
- data/lib/rumale/ensemble/random_forest_regressor.rb +0 -153
- data/lib/rumale/ensemble/stacking_classifier.rb +0 -215
- data/lib/rumale/ensemble/stacking_regressor.rb +0 -163
- data/lib/rumale/ensemble/voting_classifier.rb +0 -126
- data/lib/rumale/ensemble/voting_regressor.rb +0 -82
- data/lib/rumale/evaluation_measure/accuracy.rb +0 -29
- data/lib/rumale/evaluation_measure/adjusted_rand_score.rb +0 -74
- data/lib/rumale/evaluation_measure/calinski_harabasz_score.rb +0 -56
- data/lib/rumale/evaluation_measure/davies_bouldin_score.rb +0 -53
- data/lib/rumale/evaluation_measure/explained_variance_score.rb +0 -39
- data/lib/rumale/evaluation_measure/f_score.rb +0 -50
- data/lib/rumale/evaluation_measure/function.rb +0 -147
- data/lib/rumale/evaluation_measure/log_loss.rb +0 -45
- data/lib/rumale/evaluation_measure/mean_absolute_error.rb +0 -29
- data/lib/rumale/evaluation_measure/mean_squared_error.rb +0 -29
- data/lib/rumale/evaluation_measure/mean_squared_log_error.rb +0 -29
- data/lib/rumale/evaluation_measure/median_absolute_error.rb +0 -30
- data/lib/rumale/evaluation_measure/mutual_information.rb +0 -49
- data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +0 -53
- data/lib/rumale/evaluation_measure/precision.rb +0 -50
- data/lib/rumale/evaluation_measure/precision_recall.rb +0 -96
- data/lib/rumale/evaluation_measure/purity.rb +0 -40
- data/lib/rumale/evaluation_measure/r2_score.rb +0 -43
- data/lib/rumale/evaluation_measure/recall.rb +0 -50
- data/lib/rumale/evaluation_measure/roc_auc.rb +0 -130
- data/lib/rumale/evaluation_measure/silhouette_score.rb +0 -82
- data/lib/rumale/feature_extraction/feature_hasher.rb +0 -110
- data/lib/rumale/feature_extraction/hash_vectorizer.rb +0 -155
- data/lib/rumale/feature_extraction/tfidf_transformer.rb +0 -113
- data/lib/rumale/kernel_approximation/nystroem.rb +0 -126
- data/lib/rumale/kernel_approximation/rbf.rb +0 -102
- data/lib/rumale/kernel_machine/kernel_fda.rb +0 -120
- data/lib/rumale/kernel_machine/kernel_pca.rb +0 -97
- data/lib/rumale/kernel_machine/kernel_ridge.rb +0 -82
- data/lib/rumale/kernel_machine/kernel_ridge_classifier.rb +0 -92
- data/lib/rumale/kernel_machine/kernel_svc.rb +0 -193
- data/lib/rumale/linear_model/base_sgd.rb +0 -285
- data/lib/rumale/linear_model/elastic_net.rb +0 -119
- data/lib/rumale/linear_model/lasso.rb +0 -115
- data/lib/rumale/linear_model/linear_regression.rb +0 -201
- data/lib/rumale/linear_model/logistic_regression.rb +0 -275
- data/lib/rumale/linear_model/nnls.rb +0 -137
- data/lib/rumale/linear_model/ridge.rb +0 -209
- data/lib/rumale/linear_model/svc.rb +0 -213
- data/lib/rumale/linear_model/svr.rb +0 -132
- data/lib/rumale/manifold/mds.rb +0 -155
- data/lib/rumale/manifold/tsne.rb +0 -222
- data/lib/rumale/metric_learning/fisher_discriminant_analysis.rb +0 -113
- data/lib/rumale/metric_learning/mlkr.rb +0 -161
- data/lib/rumale/metric_learning/neighbourhood_component_analysis.rb +0 -167
- data/lib/rumale/model_selection/cross_validation.rb +0 -125
- data/lib/rumale/model_selection/function.rb +0 -42
- data/lib/rumale/model_selection/grid_search_cv.rb +0 -225
- data/lib/rumale/model_selection/group_k_fold.rb +0 -93
- data/lib/rumale/model_selection/group_shuffle_split.rb +0 -115
- data/lib/rumale/model_selection/k_fold.rb +0 -81
- data/lib/rumale/model_selection/shuffle_split.rb +0 -90
- data/lib/rumale/model_selection/stratified_k_fold.rb +0 -99
- data/lib/rumale/model_selection/stratified_shuffle_split.rb +0 -118
- data/lib/rumale/model_selection/time_series_split.rb +0 -91
- data/lib/rumale/multiclass/one_vs_rest_classifier.rb +0 -83
- data/lib/rumale/naive_bayes/base_naive_bayes.rb +0 -47
- data/lib/rumale/naive_bayes/bernoulli_nb.rb +0 -82
- data/lib/rumale/naive_bayes/complement_nb.rb +0 -85
- data/lib/rumale/naive_bayes/gaussian_nb.rb +0 -69
- data/lib/rumale/naive_bayes/multinomial_nb.rb +0 -74
- data/lib/rumale/naive_bayes/negation_nb.rb +0 -71
- data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +0 -133
- data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +0 -108
- data/lib/rumale/nearest_neighbors/vp_tree.rb +0 -132
- data/lib/rumale/neural_network/adam.rb +0 -56
- data/lib/rumale/neural_network/base_mlp.rb +0 -248
- data/lib/rumale/neural_network/mlp_classifier.rb +0 -120
- data/lib/rumale/neural_network/mlp_regressor.rb +0 -90
- data/lib/rumale/pairwise_metric.rb +0 -152
- data/lib/rumale/pipeline/feature_union.rb +0 -69
- data/lib/rumale/pipeline/pipeline.rb +0 -175
- data/lib/rumale/preprocessing/bin_discretizer.rb +0 -93
- data/lib/rumale/preprocessing/binarizer.rb +0 -60
- data/lib/rumale/preprocessing/kernel_calculator.rb +0 -92
- data/lib/rumale/preprocessing/l1_normalizer.rb +0 -62
- data/lib/rumale/preprocessing/l2_normalizer.rb +0 -63
- data/lib/rumale/preprocessing/label_binarizer.rb +0 -89
- data/lib/rumale/preprocessing/label_encoder.rb +0 -79
- data/lib/rumale/preprocessing/max_abs_scaler.rb +0 -61
- data/lib/rumale/preprocessing/max_normalizer.rb +0 -62
- data/lib/rumale/preprocessing/min_max_scaler.rb +0 -76
- data/lib/rumale/preprocessing/one_hot_encoder.rb +0 -100
- data/lib/rumale/preprocessing/ordinal_encoder.rb +0 -109
- data/lib/rumale/preprocessing/polynomial_features.rb +0 -109
- data/lib/rumale/preprocessing/standard_scaler.rb +0 -71
- data/lib/rumale/probabilistic_output.rb +0 -114
- data/lib/rumale/tree/base_decision_tree.rb +0 -150
- data/lib/rumale/tree/decision_tree_classifier.rb +0 -150
- data/lib/rumale/tree/decision_tree_regressor.rb +0 -116
- data/lib/rumale/tree/extra_tree_classifier.rb +0 -107
- data/lib/rumale/tree/extra_tree_regressor.rb +0 -94
- data/lib/rumale/tree/gradient_tree_regressor.rb +0 -202
- data/lib/rumale/tree/node.rb +0 -39
- data/lib/rumale/utils.rb +0 -42
- data/lib/rumale/validation.rb +0 -128
- data/lib/rumale/values.rb +0 -13
|
@@ -1,109 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'rumale/base/base_estimator'
|
|
4
|
-
require 'rumale/base/transformer'
|
|
5
|
-
|
|
6
|
-
module Rumale
|
|
7
|
-
module Preprocessing
|
|
8
|
-
# Transform categorical features to integer values.
|
|
9
|
-
#
|
|
10
|
-
# @example
|
|
11
|
-
# encoder = Rumale::Preprocessing::OrdinalEncoder.new
|
|
12
|
-
# training_samples = [['left', 10], ['right', 15], ['right', 20]]
|
|
13
|
-
# training_samples = Numo::NArray.asarray(training_samples)
|
|
14
|
-
# encoder.fit(training_samples)
|
|
15
|
-
# p encoder.categories
|
|
16
|
-
# # [["left", "right"], [10, 15, 20]]
|
|
17
|
-
# testing_samples = [['left', 20], ['right', 10]]
|
|
18
|
-
# testing_samples = Numo::NArray.asarray(testing_samples)
|
|
19
|
-
# encoded = encoder.transform(testing_samples)
|
|
20
|
-
# p encoded
|
|
21
|
-
# # Numo::DFloat#shape=[2,2]
|
|
22
|
-
# # [[0, 2],
|
|
23
|
-
# # [1, 0]]
|
|
24
|
-
# p encoder.inverse_transform(encoded)
|
|
25
|
-
# # Numo::RObject#shape=[2,2]
|
|
26
|
-
# # [["left", 20],
|
|
27
|
-
# # ["right", 10]]
|
|
28
|
-
class OrdinalEncoder
|
|
29
|
-
include Base::BaseEstimator
|
|
30
|
-
include Base::Transformer
|
|
31
|
-
|
|
32
|
-
# Return the array consisting of the categorical values for each feature.
|
|
33
|
-
# @return [Array] (size: n_features)
|
|
34
|
-
attr_reader :categories
|
|
35
|
-
|
|
36
|
-
# Create a new encoder that transforms categorical features to integer values.
|
|
37
|
-
#
|
|
38
|
-
# @param categories [Nil/Array] The category list for each feature.
|
|
39
|
-
# If nil is given, extracted categories from the training data by calling the fit method are used.
|
|
40
|
-
def initialize(categories: nil)
|
|
41
|
-
check_params_type_or_nil(Array, categories: categories)
|
|
42
|
-
@categories = categories
|
|
43
|
-
end
|
|
44
|
-
|
|
45
|
-
# Fit encoder by extracting the category for each feature.
|
|
46
|
-
#
|
|
47
|
-
# @overload fit(x) -> OrdinalEncoder
|
|
48
|
-
#
|
|
49
|
-
# @param x [Numo::NArray] (shape: [n_samples, n_features]) The samples consisting of categorical features.
|
|
50
|
-
# @return [OrdinalEncoder]
|
|
51
|
-
def fit(x, _y = nil)
|
|
52
|
-
raise TypeError, 'Expect class of sample matrix to be Numo::NArray' unless x.is_a?(Numo::NArray)
|
|
53
|
-
raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2
|
|
54
|
-
|
|
55
|
-
n_features = x.shape[1]
|
|
56
|
-
@categories = Array.new(n_features) { |n| x[true, n].to_a.uniq.sort }
|
|
57
|
-
self
|
|
58
|
-
end
|
|
59
|
-
|
|
60
|
-
# Fit encoder, then return encoded categorical features to integer values.
|
|
61
|
-
#
|
|
62
|
-
# @overload fit_transform(x) -> Numo::DFloat
|
|
63
|
-
#
|
|
64
|
-
# @param x [Numo::NArray] (shape: [n_samples, n_features]) The samples consisting of categorical features.
|
|
65
|
-
# @return [Numo::DFloat] The encoded categorical features to integer values.
|
|
66
|
-
def fit_transform(x, _y = nil)
|
|
67
|
-
raise TypeError, 'Expect class of sample matrix to be Numo::NArray' unless x.is_a?(Numo::NArray)
|
|
68
|
-
raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2
|
|
69
|
-
|
|
70
|
-
fit(x).transform(x)
|
|
71
|
-
end
|
|
72
|
-
|
|
73
|
-
# Encode categorical features.
|
|
74
|
-
#
|
|
75
|
-
# @param x [Numo::NArray] (shape: [n_samples, n_features]) The samples consisting of categorical features.
|
|
76
|
-
# @return [Numo::DFloat] The encoded categorical features to integer values.
|
|
77
|
-
def transform(x)
|
|
78
|
-
raise TypeError, 'Expect class of sample matrix to be Numo::NArray' unless x.is_a?(Numo::NArray)
|
|
79
|
-
raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2
|
|
80
|
-
|
|
81
|
-
n_features = x.shape[1]
|
|
82
|
-
raise ArgumentError, 'Expect the number of features and the number of categories to be equal' if n_features != @categories.size
|
|
83
|
-
|
|
84
|
-
transformed = Array.new(n_features) do |n|
|
|
85
|
-
x[true, n].to_a.map { |v| @categories[n].index(v) }
|
|
86
|
-
end
|
|
87
|
-
|
|
88
|
-
Numo::DFloat.asarray(transformed.transpose)
|
|
89
|
-
end
|
|
90
|
-
|
|
91
|
-
# Decode values to categorical features.
|
|
92
|
-
#
|
|
93
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples consisting of values transformed from categorical features.
|
|
94
|
-
# @return [Numo::NArray] The decoded features.
|
|
95
|
-
def inverse_transform(x)
|
|
96
|
-
x = check_convert_sample_array(x)
|
|
97
|
-
|
|
98
|
-
n_features = x.shape[1]
|
|
99
|
-
raise ArgumentError, 'Expect the number of features and the number of categories to be equal' if n_features != @categories.size
|
|
100
|
-
|
|
101
|
-
inv_transformed = Array.new(n_features) do |n|
|
|
102
|
-
x[true, n].to_a.map { |i| @categories[n][i.to_i] }
|
|
103
|
-
end
|
|
104
|
-
|
|
105
|
-
Numo::NArray.asarray(inv_transformed.transpose)
|
|
106
|
-
end
|
|
107
|
-
end
|
|
108
|
-
end
|
|
109
|
-
end
|
|
@@ -1,109 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'rumale/base/base_estimator'
|
|
4
|
-
require 'rumale/base/transformer'
|
|
5
|
-
|
|
6
|
-
module Rumale
|
|
7
|
-
module Preprocessing
|
|
8
|
-
# Generating polynomial features from the given samples.
|
|
9
|
-
#
|
|
10
|
-
# @example
|
|
11
|
-
# require 'rumale'
|
|
12
|
-
#
|
|
13
|
-
# transformer = Rumale::Preprocessing::PolynomialFeatures.new(degree: 2)
|
|
14
|
-
# x = Numo::DFloat[[0, 1], [2, 3], [4, 5]]
|
|
15
|
-
# z = transformer.fit_transform(x)
|
|
16
|
-
# p z
|
|
17
|
-
#
|
|
18
|
-
# # Numo::DFloat#shape=[3,6]
|
|
19
|
-
# # [[1, 0, 1, 0, 0, 1],
|
|
20
|
-
# # [1, 2, 3, 4, 6, 9],
|
|
21
|
-
# # [1, 4, 5, 16, 20, 25]]
|
|
22
|
-
#
|
|
23
|
-
# # If you want to perform polynomial regression, combine it with LinearRegression as follows:
|
|
24
|
-
# ply = Rumale::Preprocessing::PolynomialFeatures.new(degree: 2)
|
|
25
|
-
# reg = Rumale::LinearModel::LinearRegression.new(fit_bias: false, random_seed: 1)
|
|
26
|
-
# pipeline = Rumale::Pipeline::Pipeline.new(steps: { trs: ply, est: reg })
|
|
27
|
-
# pipeline.fit(training_samples, training_values)
|
|
28
|
-
# results = pipeline.predict(testing_samples)
|
|
29
|
-
#
|
|
30
|
-
class PolynomialFeatures
|
|
31
|
-
include Base::BaseEstimator
|
|
32
|
-
include Base::Transformer
|
|
33
|
-
|
|
34
|
-
# Return the number of polynomial features.
|
|
35
|
-
# @return [Integer]
|
|
36
|
-
attr_reader :n_output_features
|
|
37
|
-
|
|
38
|
-
# Create a transformer for generating polynomial features.
|
|
39
|
-
#
|
|
40
|
-
# @param degree [Integer] The degree of polynomial features.
|
|
41
|
-
def initialize(degree: 2)
|
|
42
|
-
check_params_numeric(degree: degree)
|
|
43
|
-
raise ArgumentError, 'Expect the value of degree parameter greater than or eqaul to 1.' if degree < 1
|
|
44
|
-
|
|
45
|
-
@params = {}
|
|
46
|
-
@params[:degree] = degree
|
|
47
|
-
@n_output_features = nil
|
|
48
|
-
end
|
|
49
|
-
|
|
50
|
-
# Calculate the number of output polynomial features.
|
|
51
|
-
#
|
|
52
|
-
# @overload fit(x) -> PolynomialFeatures
|
|
53
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the number of output polynomial features.
|
|
54
|
-
# @return [PolynomialFeatures]
|
|
55
|
-
def fit(x, _y = nil)
|
|
56
|
-
x = check_convert_sample_array(x)
|
|
57
|
-
n_features = x.shape[1]
|
|
58
|
-
@n_output_features = 1
|
|
59
|
-
@params[:degree].times do |t|
|
|
60
|
-
@n_output_features += Array.new(n_features) { |n| n }.repeated_combination(t + 1).size
|
|
61
|
-
end
|
|
62
|
-
self
|
|
63
|
-
end
|
|
64
|
-
|
|
65
|
-
# Calculate the number of polynomial features, and then transform samples to polynomial features.
|
|
66
|
-
#
|
|
67
|
-
# @overload fit_transform(x) -> Numo::DFloat
|
|
68
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the number of polynomial features
|
|
69
|
-
# and be transformed.
|
|
70
|
-
# @return [Numo::DFloat] (shape: [n_samples, n_output_features]) The transformed samples.
|
|
71
|
-
def fit_transform(x, _y = nil)
|
|
72
|
-
x = check_convert_sample_array(x)
|
|
73
|
-
fit(x).transform(x)
|
|
74
|
-
end
|
|
75
|
-
|
|
76
|
-
# Transform the given samples to polynomial features.
|
|
77
|
-
#
|
|
78
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be transformed.
|
|
79
|
-
# @return [Numo::DFloat] (shape: [n_samples, n_output_features]) The transformed samples.
|
|
80
|
-
def transform(x)
|
|
81
|
-
x = check_convert_sample_array(x)
|
|
82
|
-
# initialize transformed features
|
|
83
|
-
n_samples, n_features = x.shape
|
|
84
|
-
z = Numo::DFloat.zeros(n_samples, n_output_features)
|
|
85
|
-
# bias
|
|
86
|
-
z[true, 0] = 1
|
|
87
|
-
curr_col = 1
|
|
88
|
-
# itself
|
|
89
|
-
z[true, 1..n_features] = x
|
|
90
|
-
curr_col += n_features
|
|
91
|
-
# high degree features
|
|
92
|
-
curr_feat_ids = Array.new(n_features + 1) { |n| n + 1 }
|
|
93
|
-
(1...@params[:degree]).each do
|
|
94
|
-
next_feat_ids = []
|
|
95
|
-
n_features.times do |d|
|
|
96
|
-
f_range = curr_feat_ids[d]...curr_feat_ids.last
|
|
97
|
-
next_col = curr_col + f_range.size
|
|
98
|
-
z[true, curr_col...next_col] = z[true, f_range] * x[true, d..d]
|
|
99
|
-
next_feat_ids.push(curr_col)
|
|
100
|
-
curr_col = next_col
|
|
101
|
-
end
|
|
102
|
-
next_feat_ids.push(curr_col)
|
|
103
|
-
curr_feat_ids = next_feat_ids
|
|
104
|
-
end
|
|
105
|
-
z
|
|
106
|
-
end
|
|
107
|
-
end
|
|
108
|
-
end
|
|
109
|
-
end
|
|
@@ -1,71 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'rumale/base/base_estimator'
|
|
4
|
-
require 'rumale/base/transformer'
|
|
5
|
-
|
|
6
|
-
module Rumale
|
|
7
|
-
# This module consists of the classes that perform preprocessings.
|
|
8
|
-
module Preprocessing
|
|
9
|
-
# Normalize samples by centering and scaling to unit variance.
|
|
10
|
-
#
|
|
11
|
-
# @example
|
|
12
|
-
# normalizer = Rumale::Preprocessing::StandardScaler.new
|
|
13
|
-
# new_training_samples = normalizer.fit_transform(training_samples)
|
|
14
|
-
# new_testing_samples = normalizer.transform(testing_samples)
|
|
15
|
-
class StandardScaler
|
|
16
|
-
include Base::BaseEstimator
|
|
17
|
-
include Base::Transformer
|
|
18
|
-
|
|
19
|
-
# Return the vector consisting of the mean value for each feature.
|
|
20
|
-
# @return [Numo::DFloat] (shape: [n_features])
|
|
21
|
-
attr_reader :mean_vec
|
|
22
|
-
|
|
23
|
-
# Return the vector consisting of the standard deviation for each feature.
|
|
24
|
-
# @return [Numo::DFloat] (shape: [n_features])
|
|
25
|
-
attr_reader :std_vec
|
|
26
|
-
|
|
27
|
-
# Create a new normalizer for centering and scaling to unit variance.
|
|
28
|
-
def initialize
|
|
29
|
-
@params = {}
|
|
30
|
-
@mean_vec = nil
|
|
31
|
-
@std_vec = nil
|
|
32
|
-
end
|
|
33
|
-
|
|
34
|
-
# Calculate the mean value and standard deviation of each feature for scaling.
|
|
35
|
-
#
|
|
36
|
-
# @overload fit(x) -> StandardScaler
|
|
37
|
-
#
|
|
38
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features])
|
|
39
|
-
# The samples to calculate the mean values and standard deviations.
|
|
40
|
-
# @return [StandardScaler]
|
|
41
|
-
def fit(x, _y = nil)
|
|
42
|
-
x = check_convert_sample_array(x)
|
|
43
|
-
@mean_vec = x.mean(0)
|
|
44
|
-
@std_vec = x.stddev(0)
|
|
45
|
-
self
|
|
46
|
-
end
|
|
47
|
-
|
|
48
|
-
# Calculate the mean values and standard deviations, and then normalize samples using them.
|
|
49
|
-
#
|
|
50
|
-
# @overload fit_transform(x) -> Numo::DFloat
|
|
51
|
-
#
|
|
52
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features])
|
|
53
|
-
# The samples to calculate the mean values and standard deviations.
|
|
54
|
-
# @return [Numo::DFloat] The scaled samples.
|
|
55
|
-
def fit_transform(x, _y = nil)
|
|
56
|
-
x = check_convert_sample_array(x)
|
|
57
|
-
fit(x).transform(x)
|
|
58
|
-
end
|
|
59
|
-
|
|
60
|
-
# Perform standardization of the given samples.
|
|
61
|
-
#
|
|
62
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be scaled.
|
|
63
|
-
# @return [Numo::DFloat] The scaled samples.
|
|
64
|
-
def transform(x)
|
|
65
|
-
x = check_convert_sample_array(x)
|
|
66
|
-
n_samples, = x.shape
|
|
67
|
-
(x - @mean_vec.tile(n_samples, 1)) / @std_vec.tile(n_samples, 1)
|
|
68
|
-
end
|
|
69
|
-
end
|
|
70
|
-
end
|
|
71
|
-
end
|
|
@@ -1,114 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Rumale
|
|
4
|
-
# Module for calculating posterior class probabilities with SVM outputs.
|
|
5
|
-
# This module is used for internal processes.
|
|
6
|
-
#
|
|
7
|
-
# @example
|
|
8
|
-
# estimator = Rumale::LinearModel::SVC.new
|
|
9
|
-
# estimator.fit(x, bin_y)
|
|
10
|
-
# df = estimator.decision_function(x)
|
|
11
|
-
# params = Rumale::ProbabilisticOutput.fit_sigmoid(df, bin_y)
|
|
12
|
-
# probs = 1 / (Numo::NMath.exp(params[0] * df + params[1]) + 1)
|
|
13
|
-
#
|
|
14
|
-
# *Reference*
|
|
15
|
-
# - Platt, J C., "Probabilistic Outputs for Support Vector Machines and Comparisons to Regularized Likelihood Methods," Adv. Large Margin Classifiers, pp. 61--74, 2000.
|
|
16
|
-
# - Lin, H-T., Lin, C-J., and Weng, R C., "A Note on Platt's Probabilistic Outputs for Support Vector Machines," J. Machine Learning, Vol. 63 (3), pp. 267--276, 2007.
|
|
17
|
-
module ProbabilisticOutput
|
|
18
|
-
class << self
|
|
19
|
-
# Fit the probabilistic model for binary SVM outputs.
|
|
20
|
-
#
|
|
21
|
-
# @param df [Numo::DFloat] (shape: [n_samples]) The outputs of decision function to be used for fitting the model.
|
|
22
|
-
# @param bin_y [Numo::Int32] (shape: [n_samples]) The binary labels to be used for fitting the model.
|
|
23
|
-
# @param max_iter [Integer] The maximum number of iterations.
|
|
24
|
-
# @param min_step [Float] The minimum step of Newton's method.
|
|
25
|
-
# @param sigma [Float] The parameter that prevents the Hessian matrix from becoming singular.
|
|
26
|
-
# @return [Numo::DFloat] (shape: 2) The parameters of the model.
|
|
27
|
-
def fit_sigmoid(df, bin_y, max_iter = 100, min_step = 1e-10, sigma = 1e-12)
|
|
28
|
-
# Initialize some variables.
|
|
29
|
-
n_samples = bin_y.size
|
|
30
|
-
negative_label = bin_y.to_a.uniq.min
|
|
31
|
-
pos = bin_y.ne(negative_label)
|
|
32
|
-
neg = bin_y.eq(negative_label)
|
|
33
|
-
n_pos_samples = pos.count
|
|
34
|
-
n_neg_samples = neg.count
|
|
35
|
-
target_probs = Numo::DFloat.zeros(n_samples)
|
|
36
|
-
target_probs[pos] = (n_pos_samples + 1) / (n_pos_samples + 2.0)
|
|
37
|
-
target_probs[neg] = 1 / (n_neg_samples + 2.0)
|
|
38
|
-
alpha = 0.0
|
|
39
|
-
beta = Math.log((n_neg_samples + 1) / (n_pos_samples + 1.0))
|
|
40
|
-
err = error_function(target_probs, df, alpha, beta)
|
|
41
|
-
# Optimize parameters for class probability calculation.
|
|
42
|
-
old_grad_vec = Numo::DFloat.zeros(2)
|
|
43
|
-
max_iter.times do
|
|
44
|
-
# Calculate gradient and hessian matrix.
|
|
45
|
-
probs = predicted_probs(df, alpha, beta)
|
|
46
|
-
grad_vec = gradient(target_probs, probs, df)
|
|
47
|
-
hess_mat = hessian_matrix(probs, df, sigma)
|
|
48
|
-
break if grad_vec.abs.lt(1e-5).count == 2
|
|
49
|
-
break if (old_grad_vec - grad_vec).abs.sum < 1e-5
|
|
50
|
-
|
|
51
|
-
old_grad_vec = grad_vec
|
|
52
|
-
# Calculate Newton directions.
|
|
53
|
-
dirs_vec = directions(grad_vec, hess_mat)
|
|
54
|
-
grad_dir = grad_vec.dot(dirs_vec)
|
|
55
|
-
stepsize = 2.0
|
|
56
|
-
while stepsize >= min_step
|
|
57
|
-
stepsize *= 0.5
|
|
58
|
-
new_alpha = alpha + stepsize * dirs_vec[0]
|
|
59
|
-
new_beta = beta + stepsize * dirs_vec[1]
|
|
60
|
-
new_err = error_function(target_probs, df, new_alpha, new_beta)
|
|
61
|
-
next unless new_err < err + 0.0001 * stepsize * grad_dir
|
|
62
|
-
|
|
63
|
-
alpha = new_alpha
|
|
64
|
-
beta = new_beta
|
|
65
|
-
err = new_err
|
|
66
|
-
break
|
|
67
|
-
end
|
|
68
|
-
end
|
|
69
|
-
Numo::DFloat[alpha, beta]
|
|
70
|
-
end
|
|
71
|
-
|
|
72
|
-
private
|
|
73
|
-
|
|
74
|
-
def error_function(target_probs, df, alpha, beta)
|
|
75
|
-
fn = alpha * df + beta
|
|
76
|
-
pos = fn.ge(0.0)
|
|
77
|
-
neg = fn.lt(0.0)
|
|
78
|
-
err = 0.0
|
|
79
|
-
err += (target_probs[pos] * fn[pos] + Numo::NMath.log(1 + Numo::NMath.exp(-fn[pos]))).sum if pos.count.positive?
|
|
80
|
-
err += ((target_probs[neg] - 1) * fn[neg] + Numo::NMath.log(1 + Numo::NMath.exp(fn[neg]))).sum if neg.count.positive?
|
|
81
|
-
err
|
|
82
|
-
end
|
|
83
|
-
|
|
84
|
-
def predicted_probs(df, alpha, beta)
|
|
85
|
-
fn = alpha * df + beta
|
|
86
|
-
pos = fn.ge(0.0)
|
|
87
|
-
neg = fn.lt(0.0)
|
|
88
|
-
probs = Numo::DFloat.zeros(df.shape[0])
|
|
89
|
-
probs[pos] = Numo::NMath.exp(-fn[pos]) / (1 + Numo::NMath.exp(-fn[pos])) if pos.count.positive?
|
|
90
|
-
probs[neg] = 1 / (1 + Numo::NMath.exp(fn[neg])) if neg.count.positive?
|
|
91
|
-
probs
|
|
92
|
-
end
|
|
93
|
-
|
|
94
|
-
def gradient(target_probs, probs, df)
|
|
95
|
-
sub = target_probs - probs
|
|
96
|
-
Numo::DFloat[(df * sub).sum, sub.sum]
|
|
97
|
-
end
|
|
98
|
-
|
|
99
|
-
def hessian_matrix(probs, df, sigma)
|
|
100
|
-
sub = probs * (1 - probs)
|
|
101
|
-
h11 = (df**2 * sub).sum + sigma
|
|
102
|
-
h22 = sub.sum + sigma
|
|
103
|
-
h21 = (df * sub).sum
|
|
104
|
-
Numo::DFloat[[h11, h21], [h21, h22]]
|
|
105
|
-
end
|
|
106
|
-
|
|
107
|
-
def directions(grad_vec, hess_mat)
|
|
108
|
-
det = hess_mat[0, 0] * hess_mat[1, 1] - hess_mat[0, 1] * hess_mat[1, 0]
|
|
109
|
-
inv_hess_mat = Numo::DFloat[[hess_mat[1, 1], -hess_mat[0, 1]], [-hess_mat[1, 0], hess_mat[0, 0]]] / det
|
|
110
|
-
-inv_hess_mat.dot(grad_vec)
|
|
111
|
-
end
|
|
112
|
-
end
|
|
113
|
-
end
|
|
114
|
-
end
|
|
@@ -1,150 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'rumale/base/base_estimator'
|
|
4
|
-
require 'rumale/tree/node'
|
|
5
|
-
require 'rumale/rumaleext'
|
|
6
|
-
|
|
7
|
-
module Rumale
|
|
8
|
-
# This module consists of the classes that implement tree models.
|
|
9
|
-
module Tree
|
|
10
|
-
# BaseDecisionTree is an abstract class for implementation of decision tree-based estimator.
|
|
11
|
-
# This class is used internally.
|
|
12
|
-
class BaseDecisionTree
|
|
13
|
-
include Base::BaseEstimator
|
|
14
|
-
|
|
15
|
-
# Initialize a decision tree-based estimator.
|
|
16
|
-
#
|
|
17
|
-
# @param criterion [String] The function to evaluate the splitting point.
|
|
18
|
-
# @param max_depth [Integer] The maximum depth of the tree.
|
|
19
|
-
# If nil is given, decision tree grows without concern for depth.
|
|
20
|
-
# @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
|
|
21
|
-
# If nil is given, number of leaves is not limited.
|
|
22
|
-
# @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
|
|
23
|
-
# @param max_features [Integer] The number of features to consider when searching optimal split point.
|
|
24
|
-
# If nil is given, split process considers all features.
|
|
25
|
-
# @param random_seed [Integer] The seed value used to initialize the random generator.
|
|
26
|
-
# It is used to randomly determine the order of features when deciding the splitting point.
|
|
27
|
-
def initialize(criterion: nil, max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil, random_seed: nil)
|
|
28
|
-
@params = {}
|
|
29
|
-
@params[:criterion] = criterion
|
|
30
|
-
@params[:max_depth] = max_depth
|
|
31
|
-
@params[:max_leaf_nodes] = max_leaf_nodes
|
|
32
|
-
@params[:min_samples_leaf] = min_samples_leaf
|
|
33
|
-
@params[:max_features] = max_features
|
|
34
|
-
@params[:random_seed] = random_seed
|
|
35
|
-
@params[:random_seed] ||= srand
|
|
36
|
-
@tree = nil
|
|
37
|
-
@feature_importances = nil
|
|
38
|
-
@n_leaves = nil
|
|
39
|
-
@rng = Random.new(@params[:random_seed])
|
|
40
|
-
end
|
|
41
|
-
|
|
42
|
-
# Return the index of the leaf that each sample reached.
|
|
43
|
-
#
|
|
44
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
|
|
45
|
-
# @return [Numo::Int32] (shape: [n_samples]) Leaf index for sample.
|
|
46
|
-
def apply(x)
|
|
47
|
-
x = check_convert_sample_array(x)
|
|
48
|
-
Numo::Int32[*(Array.new(x.shape[0]) { |n| partial_apply(@tree, x[n, true]) })]
|
|
49
|
-
end
|
|
50
|
-
|
|
51
|
-
private
|
|
52
|
-
|
|
53
|
-
def partial_apply(tree, sample)
|
|
54
|
-
node = tree
|
|
55
|
-
until node.leaf
|
|
56
|
-
# :nocov:
|
|
57
|
-
node = if node.right.nil?
|
|
58
|
-
node.left
|
|
59
|
-
elsif node.left.nil?
|
|
60
|
-
node.right
|
|
61
|
-
# :nocov:
|
|
62
|
-
else
|
|
63
|
-
sample[node.feature_id] <= node.threshold ? node.left : node.right
|
|
64
|
-
end
|
|
65
|
-
end
|
|
66
|
-
node.leaf_id
|
|
67
|
-
end
|
|
68
|
-
|
|
69
|
-
def build_tree(x, y)
|
|
70
|
-
y = y.expand_dims(1).dup if y.shape[1].nil?
|
|
71
|
-
@feature_ids = Array.new(x.shape[1]) { |v| v }
|
|
72
|
-
@tree = grow_node(0, x, y, impurity(y))
|
|
73
|
-
@feature_ids = nil
|
|
74
|
-
nil
|
|
75
|
-
end
|
|
76
|
-
|
|
77
|
-
def grow_node(depth, x, y, impurity)
|
|
78
|
-
# initialize node.
|
|
79
|
-
n_samples = x.shape[0]
|
|
80
|
-
node = Node.new(depth: depth, impurity: impurity, n_samples: n_samples)
|
|
81
|
-
|
|
82
|
-
# terminate growing.
|
|
83
|
-
return nil if !@params[:max_leaf_nodes].nil? && @n_leaves >= @params[:max_leaf_nodes]
|
|
84
|
-
return nil if n_samples < @params[:min_samples_leaf]
|
|
85
|
-
return put_leaf(node, y) if n_samples == @params[:min_samples_leaf]
|
|
86
|
-
return put_leaf(node, y) if !@params[:max_depth].nil? && depth == @params[:max_depth]
|
|
87
|
-
return put_leaf(node, y) if stop_growing?(y)
|
|
88
|
-
|
|
89
|
-
# calculate optimal parameters.
|
|
90
|
-
feature_id, left_imp, right_imp, threshold, gain =
|
|
91
|
-
rand_ids.map { |n| [n, *best_split(x[true, n], y, impurity)] }.max_by(&:last)
|
|
92
|
-
|
|
93
|
-
return put_leaf(node, y) if gain.nil? || gain.zero?
|
|
94
|
-
|
|
95
|
-
left_ids = x[true, feature_id].le(threshold).where
|
|
96
|
-
right_ids = x[true, feature_id].gt(threshold).where
|
|
97
|
-
node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids, true], left_imp)
|
|
98
|
-
node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids, true], right_imp)
|
|
99
|
-
|
|
100
|
-
return put_leaf(node, y) if node.left.nil? && node.right.nil?
|
|
101
|
-
|
|
102
|
-
node.feature_id = feature_id
|
|
103
|
-
node.threshold = threshold
|
|
104
|
-
node.leaf = false
|
|
105
|
-
node
|
|
106
|
-
end
|
|
107
|
-
|
|
108
|
-
def stop_growing?(_y)
|
|
109
|
-
raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
|
|
110
|
-
end
|
|
111
|
-
|
|
112
|
-
def put_leaf(_node, _y)
|
|
113
|
-
raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
|
|
114
|
-
end
|
|
115
|
-
|
|
116
|
-
def rand_ids
|
|
117
|
-
@feature_ids.sample(@params[:max_features], random: @sub_rng)
|
|
118
|
-
end
|
|
119
|
-
|
|
120
|
-
def best_split(_features, _y, _impurity)
|
|
121
|
-
raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
|
|
122
|
-
end
|
|
123
|
-
|
|
124
|
-
def impurity(_y)
|
|
125
|
-
raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
|
|
126
|
-
end
|
|
127
|
-
|
|
128
|
-
def eval_importance(n_samples, n_features)
|
|
129
|
-
@feature_importances = Numo::DFloat.zeros(n_features)
|
|
130
|
-
eval_importance_at_node(@tree)
|
|
131
|
-
@feature_importances /= n_samples
|
|
132
|
-
normalizer = @feature_importances.sum
|
|
133
|
-
@feature_importances /= normalizer if normalizer > 0.0
|
|
134
|
-
nil
|
|
135
|
-
end
|
|
136
|
-
|
|
137
|
-
def eval_importance_at_node(node)
|
|
138
|
-
return nil if node.leaf
|
|
139
|
-
return nil if node.left.nil? || node.right.nil?
|
|
140
|
-
|
|
141
|
-
gain = node.n_samples * node.impurity -
|
|
142
|
-
node.left.n_samples * node.left.impurity -
|
|
143
|
-
node.right.n_samples * node.right.impurity
|
|
144
|
-
@feature_importances[node.feature_id] += gain
|
|
145
|
-
eval_importance_at_node(node.left)
|
|
146
|
-
eval_importance_at_node(node.right)
|
|
147
|
-
end
|
|
148
|
-
end
|
|
149
|
-
end
|
|
150
|
-
end
|