rumale 0.23.3 → 0.24.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/LICENSE.txt +5 -1
- data/README.md +3 -288
- data/lib/rumale/version.rb +1 -1
- data/lib/rumale.rb +20 -131
- metadata +252 -150
- data/CHANGELOG.md +0 -643
- data/CODE_OF_CONDUCT.md +0 -74
- data/ext/rumale/extconf.rb +0 -37
- data/ext/rumale/rumaleext.c +0 -545
- data/ext/rumale/rumaleext.h +0 -12
- data/lib/rumale/base/base_estimator.rb +0 -49
- data/lib/rumale/base/classifier.rb +0 -36
- data/lib/rumale/base/cluster_analyzer.rb +0 -31
- data/lib/rumale/base/evaluator.rb +0 -17
- data/lib/rumale/base/regressor.rb +0 -36
- data/lib/rumale/base/splitter.rb +0 -21
- data/lib/rumale/base/transformer.rb +0 -22
- data/lib/rumale/clustering/dbscan.rb +0 -123
- data/lib/rumale/clustering/gaussian_mixture.rb +0 -218
- data/lib/rumale/clustering/hdbscan.rb +0 -291
- data/lib/rumale/clustering/k_means.rb +0 -122
- data/lib/rumale/clustering/k_medoids.rb +0 -141
- data/lib/rumale/clustering/mini_batch_k_means.rb +0 -139
- data/lib/rumale/clustering/power_iteration.rb +0 -127
- data/lib/rumale/clustering/single_linkage.rb +0 -203
- data/lib/rumale/clustering/snn.rb +0 -76
- data/lib/rumale/clustering/spectral_clustering.rb +0 -115
- data/lib/rumale/dataset.rb +0 -246
- data/lib/rumale/decomposition/factor_analysis.rb +0 -150
- data/lib/rumale/decomposition/fast_ica.rb +0 -188
- data/lib/rumale/decomposition/nmf.rb +0 -124
- data/lib/rumale/decomposition/pca.rb +0 -159
- data/lib/rumale/ensemble/ada_boost_classifier.rb +0 -179
- data/lib/rumale/ensemble/ada_boost_regressor.rb +0 -160
- data/lib/rumale/ensemble/extra_trees_classifier.rb +0 -139
- data/lib/rumale/ensemble/extra_trees_regressor.rb +0 -125
- data/lib/rumale/ensemble/gradient_boosting_classifier.rb +0 -306
- data/lib/rumale/ensemble/gradient_boosting_regressor.rb +0 -237
- data/lib/rumale/ensemble/random_forest_classifier.rb +0 -189
- data/lib/rumale/ensemble/random_forest_regressor.rb +0 -153
- data/lib/rumale/ensemble/stacking_classifier.rb +0 -215
- data/lib/rumale/ensemble/stacking_regressor.rb +0 -163
- data/lib/rumale/ensemble/voting_classifier.rb +0 -126
- data/lib/rumale/ensemble/voting_regressor.rb +0 -82
- data/lib/rumale/evaluation_measure/accuracy.rb +0 -29
- data/lib/rumale/evaluation_measure/adjusted_rand_score.rb +0 -74
- data/lib/rumale/evaluation_measure/calinski_harabasz_score.rb +0 -56
- data/lib/rumale/evaluation_measure/davies_bouldin_score.rb +0 -53
- data/lib/rumale/evaluation_measure/explained_variance_score.rb +0 -39
- data/lib/rumale/evaluation_measure/f_score.rb +0 -50
- data/lib/rumale/evaluation_measure/function.rb +0 -147
- data/lib/rumale/evaluation_measure/log_loss.rb +0 -45
- data/lib/rumale/evaluation_measure/mean_absolute_error.rb +0 -29
- data/lib/rumale/evaluation_measure/mean_squared_error.rb +0 -29
- data/lib/rumale/evaluation_measure/mean_squared_log_error.rb +0 -29
- data/lib/rumale/evaluation_measure/median_absolute_error.rb +0 -30
- data/lib/rumale/evaluation_measure/mutual_information.rb +0 -49
- data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +0 -53
- data/lib/rumale/evaluation_measure/precision.rb +0 -50
- data/lib/rumale/evaluation_measure/precision_recall.rb +0 -96
- data/lib/rumale/evaluation_measure/purity.rb +0 -40
- data/lib/rumale/evaluation_measure/r2_score.rb +0 -43
- data/lib/rumale/evaluation_measure/recall.rb +0 -50
- data/lib/rumale/evaluation_measure/roc_auc.rb +0 -130
- data/lib/rumale/evaluation_measure/silhouette_score.rb +0 -82
- data/lib/rumale/feature_extraction/feature_hasher.rb +0 -110
- data/lib/rumale/feature_extraction/hash_vectorizer.rb +0 -155
- data/lib/rumale/feature_extraction/tfidf_transformer.rb +0 -113
- data/lib/rumale/kernel_approximation/nystroem.rb +0 -126
- data/lib/rumale/kernel_approximation/rbf.rb +0 -102
- data/lib/rumale/kernel_machine/kernel_fda.rb +0 -120
- data/lib/rumale/kernel_machine/kernel_pca.rb +0 -97
- data/lib/rumale/kernel_machine/kernel_ridge.rb +0 -82
- data/lib/rumale/kernel_machine/kernel_ridge_classifier.rb +0 -92
- data/lib/rumale/kernel_machine/kernel_svc.rb +0 -193
- data/lib/rumale/linear_model/base_sgd.rb +0 -285
- data/lib/rumale/linear_model/elastic_net.rb +0 -119
- data/lib/rumale/linear_model/lasso.rb +0 -115
- data/lib/rumale/linear_model/linear_regression.rb +0 -201
- data/lib/rumale/linear_model/logistic_regression.rb +0 -275
- data/lib/rumale/linear_model/nnls.rb +0 -137
- data/lib/rumale/linear_model/ridge.rb +0 -209
- data/lib/rumale/linear_model/svc.rb +0 -213
- data/lib/rumale/linear_model/svr.rb +0 -132
- data/lib/rumale/manifold/mds.rb +0 -155
- data/lib/rumale/manifold/tsne.rb +0 -222
- data/lib/rumale/metric_learning/fisher_discriminant_analysis.rb +0 -113
- data/lib/rumale/metric_learning/mlkr.rb +0 -161
- data/lib/rumale/metric_learning/neighbourhood_component_analysis.rb +0 -167
- data/lib/rumale/model_selection/cross_validation.rb +0 -125
- data/lib/rumale/model_selection/function.rb +0 -42
- data/lib/rumale/model_selection/grid_search_cv.rb +0 -225
- data/lib/rumale/model_selection/group_k_fold.rb +0 -93
- data/lib/rumale/model_selection/group_shuffle_split.rb +0 -115
- data/lib/rumale/model_selection/k_fold.rb +0 -81
- data/lib/rumale/model_selection/shuffle_split.rb +0 -90
- data/lib/rumale/model_selection/stratified_k_fold.rb +0 -99
- data/lib/rumale/model_selection/stratified_shuffle_split.rb +0 -118
- data/lib/rumale/model_selection/time_series_split.rb +0 -91
- data/lib/rumale/multiclass/one_vs_rest_classifier.rb +0 -83
- data/lib/rumale/naive_bayes/base_naive_bayes.rb +0 -47
- data/lib/rumale/naive_bayes/bernoulli_nb.rb +0 -82
- data/lib/rumale/naive_bayes/complement_nb.rb +0 -85
- data/lib/rumale/naive_bayes/gaussian_nb.rb +0 -69
- data/lib/rumale/naive_bayes/multinomial_nb.rb +0 -74
- data/lib/rumale/naive_bayes/negation_nb.rb +0 -71
- data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +0 -133
- data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +0 -108
- data/lib/rumale/nearest_neighbors/vp_tree.rb +0 -132
- data/lib/rumale/neural_network/adam.rb +0 -56
- data/lib/rumale/neural_network/base_mlp.rb +0 -248
- data/lib/rumale/neural_network/mlp_classifier.rb +0 -120
- data/lib/rumale/neural_network/mlp_regressor.rb +0 -90
- data/lib/rumale/pairwise_metric.rb +0 -152
- data/lib/rumale/pipeline/feature_union.rb +0 -69
- data/lib/rumale/pipeline/pipeline.rb +0 -175
- data/lib/rumale/preprocessing/bin_discretizer.rb +0 -93
- data/lib/rumale/preprocessing/binarizer.rb +0 -60
- data/lib/rumale/preprocessing/kernel_calculator.rb +0 -92
- data/lib/rumale/preprocessing/l1_normalizer.rb +0 -62
- data/lib/rumale/preprocessing/l2_normalizer.rb +0 -63
- data/lib/rumale/preprocessing/label_binarizer.rb +0 -89
- data/lib/rumale/preprocessing/label_encoder.rb +0 -79
- data/lib/rumale/preprocessing/max_abs_scaler.rb +0 -61
- data/lib/rumale/preprocessing/max_normalizer.rb +0 -62
- data/lib/rumale/preprocessing/min_max_scaler.rb +0 -76
- data/lib/rumale/preprocessing/one_hot_encoder.rb +0 -100
- data/lib/rumale/preprocessing/ordinal_encoder.rb +0 -109
- data/lib/rumale/preprocessing/polynomial_features.rb +0 -109
- data/lib/rumale/preprocessing/standard_scaler.rb +0 -71
- data/lib/rumale/probabilistic_output.rb +0 -114
- data/lib/rumale/tree/base_decision_tree.rb +0 -150
- data/lib/rumale/tree/decision_tree_classifier.rb +0 -150
- data/lib/rumale/tree/decision_tree_regressor.rb +0 -116
- data/lib/rumale/tree/extra_tree_classifier.rb +0 -107
- data/lib/rumale/tree/extra_tree_regressor.rb +0 -94
- data/lib/rumale/tree/gradient_tree_regressor.rb +0 -202
- data/lib/rumale/tree/node.rb +0 -39
- data/lib/rumale/utils.rb +0 -42
- data/lib/rumale/validation.rb +0 -128
- data/lib/rumale/values.rb +0 -13
@@ -1,109 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'rumale/base/base_estimator'
|
4
|
-
require 'rumale/base/transformer'
|
5
|
-
|
6
|
-
module Rumale
|
7
|
-
module Preprocessing
|
8
|
-
# Transform categorical features to integer values.
|
9
|
-
#
|
10
|
-
# @example
|
11
|
-
# encoder = Rumale::Preprocessing::OrdinalEncoder.new
|
12
|
-
# training_samples = [['left', 10], ['right', 15], ['right', 20]]
|
13
|
-
# training_samples = Numo::NArray.asarray(training_samples)
|
14
|
-
# encoder.fit(training_samples)
|
15
|
-
# p encoder.categories
|
16
|
-
# # [["left", "right"], [10, 15, 20]]
|
17
|
-
# testing_samples = [['left', 20], ['right', 10]]
|
18
|
-
# testing_samples = Numo::NArray.asarray(testing_samples)
|
19
|
-
# encoded = encoder.transform(testing_samples)
|
20
|
-
# p encoded
|
21
|
-
# # Numo::DFloat#shape=[2,2]
|
22
|
-
# # [[0, 2],
|
23
|
-
# # [1, 0]]
|
24
|
-
# p encoder.inverse_transform(encoded)
|
25
|
-
# # Numo::RObject#shape=[2,2]
|
26
|
-
# # [["left", 20],
|
27
|
-
# # ["right", 10]]
|
28
|
-
class OrdinalEncoder
|
29
|
-
include Base::BaseEstimator
|
30
|
-
include Base::Transformer
|
31
|
-
|
32
|
-
# Return the array consisting of the categorical values for each feature.
|
33
|
-
# @return [Array] (size: n_features)
|
34
|
-
attr_reader :categories
|
35
|
-
|
36
|
-
# Create a new encoder that transforms categorical features to integer values.
|
37
|
-
#
|
38
|
-
# @param categories [Nil/Array] The category list for each feature.
|
39
|
-
# If nil is given, extracted categories from the training data by calling the fit method are used.
|
40
|
-
def initialize(categories: nil)
|
41
|
-
check_params_type_or_nil(Array, categories: categories)
|
42
|
-
@categories = categories
|
43
|
-
end
|
44
|
-
|
45
|
-
# Fit encoder by extracting the category for each feature.
|
46
|
-
#
|
47
|
-
# @overload fit(x) -> OrdinalEncoder
|
48
|
-
#
|
49
|
-
# @param x [Numo::NArray] (shape: [n_samples, n_features]) The samples consisting of categorical features.
|
50
|
-
# @return [OrdinalEncoder]
|
51
|
-
def fit(x, _y = nil)
|
52
|
-
raise TypeError, 'Expect class of sample matrix to be Numo::NArray' unless x.is_a?(Numo::NArray)
|
53
|
-
raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2
|
54
|
-
|
55
|
-
n_features = x.shape[1]
|
56
|
-
@categories = Array.new(n_features) { |n| x[true, n].to_a.uniq.sort }
|
57
|
-
self
|
58
|
-
end
|
59
|
-
|
60
|
-
# Fit encoder, then return encoded categorical features to integer values.
|
61
|
-
#
|
62
|
-
# @overload fit_transform(x) -> Numo::DFloat
|
63
|
-
#
|
64
|
-
# @param x [Numo::NArray] (shape: [n_samples, n_features]) The samples consisting of categorical features.
|
65
|
-
# @return [Numo::DFloat] The encoded categorical features to integer values.
|
66
|
-
def fit_transform(x, _y = nil)
|
67
|
-
raise TypeError, 'Expect class of sample matrix to be Numo::NArray' unless x.is_a?(Numo::NArray)
|
68
|
-
raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2
|
69
|
-
|
70
|
-
fit(x).transform(x)
|
71
|
-
end
|
72
|
-
|
73
|
-
# Encode categorical features.
|
74
|
-
#
|
75
|
-
# @param x [Numo::NArray] (shape: [n_samples, n_features]) The samples consisting of categorical features.
|
76
|
-
# @return [Numo::DFloat] The encoded categorical features to integer values.
|
77
|
-
def transform(x)
|
78
|
-
raise TypeError, 'Expect class of sample matrix to be Numo::NArray' unless x.is_a?(Numo::NArray)
|
79
|
-
raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2
|
80
|
-
|
81
|
-
n_features = x.shape[1]
|
82
|
-
raise ArgumentError, 'Expect the number of features and the number of categories to be equal' if n_features != @categories.size
|
83
|
-
|
84
|
-
transformed = Array.new(n_features) do |n|
|
85
|
-
x[true, n].to_a.map { |v| @categories[n].index(v) }
|
86
|
-
end
|
87
|
-
|
88
|
-
Numo::DFloat.asarray(transformed.transpose)
|
89
|
-
end
|
90
|
-
|
91
|
-
# Decode values to categorical features.
|
92
|
-
#
|
93
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples consisting of values transformed from categorical features.
|
94
|
-
# @return [Numo::NArray] The decoded features.
|
95
|
-
def inverse_transform(x)
|
96
|
-
x = check_convert_sample_array(x)
|
97
|
-
|
98
|
-
n_features = x.shape[1]
|
99
|
-
raise ArgumentError, 'Expect the number of features and the number of categories to be equal' if n_features != @categories.size
|
100
|
-
|
101
|
-
inv_transformed = Array.new(n_features) do |n|
|
102
|
-
x[true, n].to_a.map { |i| @categories[n][i.to_i] }
|
103
|
-
end
|
104
|
-
|
105
|
-
Numo::NArray.asarray(inv_transformed.transpose)
|
106
|
-
end
|
107
|
-
end
|
108
|
-
end
|
109
|
-
end
|
@@ -1,109 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'rumale/base/base_estimator'
|
4
|
-
require 'rumale/base/transformer'
|
5
|
-
|
6
|
-
module Rumale
|
7
|
-
module Preprocessing
|
8
|
-
# Generating polynomial features from the given samples.
|
9
|
-
#
|
10
|
-
# @example
|
11
|
-
# require 'rumale'
|
12
|
-
#
|
13
|
-
# transformer = Rumale::Preprocessing::PolynomialFeatures.new(degree: 2)
|
14
|
-
# x = Numo::DFloat[[0, 1], [2, 3], [4, 5]]
|
15
|
-
# z = transformer.fit_transform(x)
|
16
|
-
# p z
|
17
|
-
#
|
18
|
-
# # Numo::DFloat#shape=[3,6]
|
19
|
-
# # [[1, 0, 1, 0, 0, 1],
|
20
|
-
# # [1, 2, 3, 4, 6, 9],
|
21
|
-
# # [1, 4, 5, 16, 20, 25]]
|
22
|
-
#
|
23
|
-
# # If you want to perform polynomial regression, combine it with LinearRegression as follows:
|
24
|
-
# ply = Rumale::Preprocessing::PolynomialFeatures.new(degree: 2)
|
25
|
-
# reg = Rumale::LinearModel::LinearRegression.new(fit_bias: false, random_seed: 1)
|
26
|
-
# pipeline = Rumale::Pipeline::Pipeline.new(steps: { trs: ply, est: reg })
|
27
|
-
# pipeline.fit(training_samples, training_values)
|
28
|
-
# results = pipeline.predict(testing_samples)
|
29
|
-
#
|
30
|
-
class PolynomialFeatures
|
31
|
-
include Base::BaseEstimator
|
32
|
-
include Base::Transformer
|
33
|
-
|
34
|
-
# Return the number of polynomial features.
|
35
|
-
# @return [Integer]
|
36
|
-
attr_reader :n_output_features
|
37
|
-
|
38
|
-
# Create a transformer for generating polynomial features.
|
39
|
-
#
|
40
|
-
# @param degree [Integer] The degree of polynomial features.
|
41
|
-
def initialize(degree: 2)
|
42
|
-
check_params_numeric(degree: degree)
|
43
|
-
raise ArgumentError, 'Expect the value of degree parameter greater than or eqaul to 1.' if degree < 1
|
44
|
-
|
45
|
-
@params = {}
|
46
|
-
@params[:degree] = degree
|
47
|
-
@n_output_features = nil
|
48
|
-
end
|
49
|
-
|
50
|
-
# Calculate the number of output polynomial features.
|
51
|
-
#
|
52
|
-
# @overload fit(x) -> PolynomialFeatures
|
53
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the number of output polynomial features.
|
54
|
-
# @return [PolynomialFeatures]
|
55
|
-
def fit(x, _y = nil)
|
56
|
-
x = check_convert_sample_array(x)
|
57
|
-
n_features = x.shape[1]
|
58
|
-
@n_output_features = 1
|
59
|
-
@params[:degree].times do |t|
|
60
|
-
@n_output_features += Array.new(n_features) { |n| n }.repeated_combination(t + 1).size
|
61
|
-
end
|
62
|
-
self
|
63
|
-
end
|
64
|
-
|
65
|
-
# Calculate the number of polynomial features, and then transform samples to polynomial features.
|
66
|
-
#
|
67
|
-
# @overload fit_transform(x) -> Numo::DFloat
|
68
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the number of polynomial features
|
69
|
-
# and be transformed.
|
70
|
-
# @return [Numo::DFloat] (shape: [n_samples, n_output_features]) The transformed samples.
|
71
|
-
def fit_transform(x, _y = nil)
|
72
|
-
x = check_convert_sample_array(x)
|
73
|
-
fit(x).transform(x)
|
74
|
-
end
|
75
|
-
|
76
|
-
# Transform the given samples to polynomial features.
|
77
|
-
#
|
78
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be transformed.
|
79
|
-
# @return [Numo::DFloat] (shape: [n_samples, n_output_features]) The transformed samples.
|
80
|
-
def transform(x)
|
81
|
-
x = check_convert_sample_array(x)
|
82
|
-
# initialize transformed features
|
83
|
-
n_samples, n_features = x.shape
|
84
|
-
z = Numo::DFloat.zeros(n_samples, n_output_features)
|
85
|
-
# bias
|
86
|
-
z[true, 0] = 1
|
87
|
-
curr_col = 1
|
88
|
-
# itself
|
89
|
-
z[true, 1..n_features] = x
|
90
|
-
curr_col += n_features
|
91
|
-
# high degree features
|
92
|
-
curr_feat_ids = Array.new(n_features + 1) { |n| n + 1 }
|
93
|
-
(1...@params[:degree]).each do
|
94
|
-
next_feat_ids = []
|
95
|
-
n_features.times do |d|
|
96
|
-
f_range = curr_feat_ids[d]...curr_feat_ids.last
|
97
|
-
next_col = curr_col + f_range.size
|
98
|
-
z[true, curr_col...next_col] = z[true, f_range] * x[true, d..d]
|
99
|
-
next_feat_ids.push(curr_col)
|
100
|
-
curr_col = next_col
|
101
|
-
end
|
102
|
-
next_feat_ids.push(curr_col)
|
103
|
-
curr_feat_ids = next_feat_ids
|
104
|
-
end
|
105
|
-
z
|
106
|
-
end
|
107
|
-
end
|
108
|
-
end
|
109
|
-
end
|
@@ -1,71 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'rumale/base/base_estimator'
|
4
|
-
require 'rumale/base/transformer'
|
5
|
-
|
6
|
-
module Rumale
|
7
|
-
# This module consists of the classes that perform preprocessings.
|
8
|
-
module Preprocessing
|
9
|
-
# Normalize samples by centering and scaling to unit variance.
|
10
|
-
#
|
11
|
-
# @example
|
12
|
-
# normalizer = Rumale::Preprocessing::StandardScaler.new
|
13
|
-
# new_training_samples = normalizer.fit_transform(training_samples)
|
14
|
-
# new_testing_samples = normalizer.transform(testing_samples)
|
15
|
-
class StandardScaler
|
16
|
-
include Base::BaseEstimator
|
17
|
-
include Base::Transformer
|
18
|
-
|
19
|
-
# Return the vector consisting of the mean value for each feature.
|
20
|
-
# @return [Numo::DFloat] (shape: [n_features])
|
21
|
-
attr_reader :mean_vec
|
22
|
-
|
23
|
-
# Return the vector consisting of the standard deviation for each feature.
|
24
|
-
# @return [Numo::DFloat] (shape: [n_features])
|
25
|
-
attr_reader :std_vec
|
26
|
-
|
27
|
-
# Create a new normalizer for centering and scaling to unit variance.
|
28
|
-
def initialize
|
29
|
-
@params = {}
|
30
|
-
@mean_vec = nil
|
31
|
-
@std_vec = nil
|
32
|
-
end
|
33
|
-
|
34
|
-
# Calculate the mean value and standard deviation of each feature for scaling.
|
35
|
-
#
|
36
|
-
# @overload fit(x) -> StandardScaler
|
37
|
-
#
|
38
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features])
|
39
|
-
# The samples to calculate the mean values and standard deviations.
|
40
|
-
# @return [StandardScaler]
|
41
|
-
def fit(x, _y = nil)
|
42
|
-
x = check_convert_sample_array(x)
|
43
|
-
@mean_vec = x.mean(0)
|
44
|
-
@std_vec = x.stddev(0)
|
45
|
-
self
|
46
|
-
end
|
47
|
-
|
48
|
-
# Calculate the mean values and standard deviations, and then normalize samples using them.
|
49
|
-
#
|
50
|
-
# @overload fit_transform(x) -> Numo::DFloat
|
51
|
-
#
|
52
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features])
|
53
|
-
# The samples to calculate the mean values and standard deviations.
|
54
|
-
# @return [Numo::DFloat] The scaled samples.
|
55
|
-
def fit_transform(x, _y = nil)
|
56
|
-
x = check_convert_sample_array(x)
|
57
|
-
fit(x).transform(x)
|
58
|
-
end
|
59
|
-
|
60
|
-
# Perform standardization of the given samples.
|
61
|
-
#
|
62
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be scaled.
|
63
|
-
# @return [Numo::DFloat] The scaled samples.
|
64
|
-
def transform(x)
|
65
|
-
x = check_convert_sample_array(x)
|
66
|
-
n_samples, = x.shape
|
67
|
-
(x - @mean_vec.tile(n_samples, 1)) / @std_vec.tile(n_samples, 1)
|
68
|
-
end
|
69
|
-
end
|
70
|
-
end
|
71
|
-
end
|
@@ -1,114 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Rumale
|
4
|
-
# Module for calculating posterior class probabilities with SVM outputs.
|
5
|
-
# This module is used for internal processes.
|
6
|
-
#
|
7
|
-
# @example
|
8
|
-
# estimator = Rumale::LinearModel::SVC.new
|
9
|
-
# estimator.fit(x, bin_y)
|
10
|
-
# df = estimator.decision_function(x)
|
11
|
-
# params = Rumale::ProbabilisticOutput.fit_sigmoid(df, bin_y)
|
12
|
-
# probs = 1 / (Numo::NMath.exp(params[0] * df + params[1]) + 1)
|
13
|
-
#
|
14
|
-
# *Reference*
|
15
|
-
# - Platt, J C., "Probabilistic Outputs for Support Vector Machines and Comparisons to Regularized Likelihood Methods," Adv. Large Margin Classifiers, pp. 61--74, 2000.
|
16
|
-
# - Lin, H-T., Lin, C-J., and Weng, R C., "A Note on Platt's Probabilistic Outputs for Support Vector Machines," J. Machine Learning, Vol. 63 (3), pp. 267--276, 2007.
|
17
|
-
module ProbabilisticOutput
|
18
|
-
class << self
|
19
|
-
# Fit the probabilistic model for binary SVM outputs.
|
20
|
-
#
|
21
|
-
# @param df [Numo::DFloat] (shape: [n_samples]) The outputs of decision function to be used for fitting the model.
|
22
|
-
# @param bin_y [Numo::Int32] (shape: [n_samples]) The binary labels to be used for fitting the model.
|
23
|
-
# @param max_iter [Integer] The maximum number of iterations.
|
24
|
-
# @param min_step [Float] The minimum step of Newton's method.
|
25
|
-
# @param sigma [Float] The parameter to avoid hessian matrix from becoming singular matrix.
|
26
|
-
# @return [Numo::DFloat] (shape: 2) The parameters of the model.
|
27
|
-
def fit_sigmoid(df, bin_y, max_iter = 100, min_step = 1e-10, sigma = 1e-12)
|
28
|
-
# Initialize some variables.
|
29
|
-
n_samples = bin_y.size
|
30
|
-
negative_label = bin_y.to_a.uniq.min
|
31
|
-
pos = bin_y.ne(negative_label)
|
32
|
-
neg = bin_y.eq(negative_label)
|
33
|
-
n_pos_samples = pos.count
|
34
|
-
n_neg_samples = neg.count
|
35
|
-
target_probs = Numo::DFloat.zeros(n_samples)
|
36
|
-
target_probs[pos] = (n_pos_samples + 1) / (n_pos_samples + 2.0)
|
37
|
-
target_probs[neg] = 1 / (n_neg_samples + 2.0)
|
38
|
-
alpha = 0.0
|
39
|
-
beta = Math.log((n_neg_samples + 1) / (n_pos_samples + 1.0))
|
40
|
-
err = error_function(target_probs, df, alpha, beta)
|
41
|
-
# Optimize parameters for class probability calculation.
|
42
|
-
old_grad_vec = Numo::DFloat.zeros(2)
|
43
|
-
max_iter.times do
|
44
|
-
# Calculate gradient and hessian matrix.
|
45
|
-
probs = predicted_probs(df, alpha, beta)
|
46
|
-
grad_vec = gradient(target_probs, probs, df)
|
47
|
-
hess_mat = hessian_matrix(probs, df, sigma)
|
48
|
-
break if grad_vec.abs.lt(1e-5).count == 2
|
49
|
-
break if (old_grad_vec - grad_vec).abs.sum < 1e-5
|
50
|
-
|
51
|
-
old_grad_vec = grad_vec
|
52
|
-
# Calculate Newton directions.
|
53
|
-
dirs_vec = directions(grad_vec, hess_mat)
|
54
|
-
grad_dir = grad_vec.dot(dirs_vec)
|
55
|
-
stepsize = 2.0
|
56
|
-
while stepsize >= min_step
|
57
|
-
stepsize *= 0.5
|
58
|
-
new_alpha = alpha + stepsize * dirs_vec[0]
|
59
|
-
new_beta = beta + stepsize * dirs_vec[1]
|
60
|
-
new_err = error_function(target_probs, df, new_alpha, new_beta)
|
61
|
-
next unless new_err < err + 0.0001 * stepsize * grad_dir
|
62
|
-
|
63
|
-
alpha = new_alpha
|
64
|
-
beta = new_beta
|
65
|
-
err = new_err
|
66
|
-
break
|
67
|
-
end
|
68
|
-
end
|
69
|
-
Numo::DFloat[alpha, beta]
|
70
|
-
end
|
71
|
-
|
72
|
-
private
|
73
|
-
|
74
|
-
def error_function(target_probs, df, alpha, beta)
|
75
|
-
fn = alpha * df + beta
|
76
|
-
pos = fn.ge(0.0)
|
77
|
-
neg = fn.lt(0.0)
|
78
|
-
err = 0.0
|
79
|
-
err += (target_probs[pos] * fn[pos] + Numo::NMath.log(1 + Numo::NMath.exp(-fn[pos]))).sum if pos.count.positive?
|
80
|
-
err += ((target_probs[neg] - 1) * fn[neg] + Numo::NMath.log(1 + Numo::NMath.exp(fn[neg]))).sum if neg.count.positive?
|
81
|
-
err
|
82
|
-
end
|
83
|
-
|
84
|
-
def predicted_probs(df, alpha, beta)
|
85
|
-
fn = alpha * df + beta
|
86
|
-
pos = fn.ge(0.0)
|
87
|
-
neg = fn.lt(0.0)
|
88
|
-
probs = Numo::DFloat.zeros(df.shape[0])
|
89
|
-
probs[pos] = Numo::NMath.exp(-fn[pos]) / (1 + Numo::NMath.exp(-fn[pos])) if pos.count.positive?
|
90
|
-
probs[neg] = 1 / (1 + Numo::NMath.exp(fn[neg])) if neg.count.positive?
|
91
|
-
probs
|
92
|
-
end
|
93
|
-
|
94
|
-
def gradient(target_probs, probs, df)
|
95
|
-
sub = target_probs - probs
|
96
|
-
Numo::DFloat[(df * sub).sum, sub.sum]
|
97
|
-
end
|
98
|
-
|
99
|
-
def hessian_matrix(probs, df, sigma)
|
100
|
-
sub = probs * (1 - probs)
|
101
|
-
h11 = (df**2 * sub).sum + sigma
|
102
|
-
h22 = sub.sum + sigma
|
103
|
-
h21 = (df * sub).sum
|
104
|
-
Numo::DFloat[[h11, h21], [h21, h22]]
|
105
|
-
end
|
106
|
-
|
107
|
-
def directions(grad_vec, hess_mat)
|
108
|
-
det = hess_mat[0, 0] * hess_mat[1, 1] - hess_mat[0, 1] * hess_mat[1, 0]
|
109
|
-
inv_hess_mat = Numo::DFloat[[hess_mat[1, 1], -hess_mat[0, 1]], [-hess_mat[1, 0], hess_mat[0, 0]]] / det
|
110
|
-
-inv_hess_mat.dot(grad_vec)
|
111
|
-
end
|
112
|
-
end
|
113
|
-
end
|
114
|
-
end
|
@@ -1,150 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'rumale/base/base_estimator'
|
4
|
-
require 'rumale/tree/node'
|
5
|
-
require 'rumale/rumaleext'
|
6
|
-
|
7
|
-
module Rumale
|
8
|
-
# This module consists of the classes that implement tree models.
|
9
|
-
module Tree
|
10
|
-
# BaseDecisionTree is an abstract class for implementation of decision tree-based estimator.
|
11
|
-
# This class is used internally.
|
12
|
-
class BaseDecisionTree
|
13
|
-
include Base::BaseEstimator
|
14
|
-
|
15
|
-
# Initialize a decision tree-based estimator.
|
16
|
-
#
|
17
|
-
# @param criterion [String] The function to evaluate the splitting point.
|
18
|
-
# @param max_depth [Integer] The maximum depth of the tree.
|
19
|
-
# If nil is given, decision tree grows without concern for depth.
|
20
|
-
# @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
|
21
|
-
# If nil is given, number of leaves is not limited.
|
22
|
-
# @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
|
23
|
-
# @param max_features [Integer] The number of features to consider when searching optimal split point.
|
24
|
-
# If nil is given, split process considers all features.
|
25
|
-
# @param random_seed [Integer] The seed value using to initialize the random generator.
|
26
|
-
# It is used to randomly determine the order of features when deciding the splitting point.
|
27
|
-
def initialize(criterion: nil, max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil, random_seed: nil)
|
28
|
-
@params = {}
|
29
|
-
@params[:criterion] = criterion
|
30
|
-
@params[:max_depth] = max_depth
|
31
|
-
@params[:max_leaf_nodes] = max_leaf_nodes
|
32
|
-
@params[:min_samples_leaf] = min_samples_leaf
|
33
|
-
@params[:max_features] = max_features
|
34
|
-
@params[:random_seed] = random_seed
|
35
|
-
@params[:random_seed] ||= srand
|
36
|
-
@tree = nil
|
37
|
-
@feature_importances = nil
|
38
|
-
@n_leaves = nil
|
39
|
-
@rng = Random.new(@params[:random_seed])
|
40
|
-
end
|
41
|
-
|
42
|
-
# Return the index of the leaf that each sample reached.
|
43
|
-
#
|
44
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
|
45
|
-
# @return [Numo::Int32] (shape: [n_samples]) Leaf index for sample.
|
46
|
-
def apply(x)
|
47
|
-
x = check_convert_sample_array(x)
|
48
|
-
Numo::Int32[*(Array.new(x.shape[0]) { |n| partial_apply(@tree, x[n, true]) })]
|
49
|
-
end
|
50
|
-
|
51
|
-
private
|
52
|
-
|
53
|
-
def partial_apply(tree, sample)
|
54
|
-
node = tree
|
55
|
-
until node.leaf
|
56
|
-
# :nocov:
|
57
|
-
node = if node.right.nil?
|
58
|
-
node.left
|
59
|
-
elsif node.left.nil?
|
60
|
-
node.right
|
61
|
-
# :nocov:
|
62
|
-
else
|
63
|
-
sample[node.feature_id] <= node.threshold ? node.left : node.right
|
64
|
-
end
|
65
|
-
end
|
66
|
-
node.leaf_id
|
67
|
-
end
|
68
|
-
|
69
|
-
def build_tree(x, y)
|
70
|
-
y = y.expand_dims(1).dup if y.shape[1].nil?
|
71
|
-
@feature_ids = Array.new(x.shape[1]) { |v| v }
|
72
|
-
@tree = grow_node(0, x, y, impurity(y))
|
73
|
-
@feature_ids = nil
|
74
|
-
nil
|
75
|
-
end
|
76
|
-
|
77
|
-
def grow_node(depth, x, y, impurity)
|
78
|
-
# initialize node.
|
79
|
-
n_samples = x.shape[0]
|
80
|
-
node = Node.new(depth: depth, impurity: impurity, n_samples: n_samples)
|
81
|
-
|
82
|
-
# terminate growing.
|
83
|
-
return nil if !@params[:max_leaf_nodes].nil? && @n_leaves >= @params[:max_leaf_nodes]
|
84
|
-
return nil if n_samples < @params[:min_samples_leaf]
|
85
|
-
return put_leaf(node, y) if n_samples == @params[:min_samples_leaf]
|
86
|
-
return put_leaf(node, y) if !@params[:max_depth].nil? && depth == @params[:max_depth]
|
87
|
-
return put_leaf(node, y) if stop_growing?(y)
|
88
|
-
|
89
|
-
# calculate optimal parameters.
|
90
|
-
feature_id, left_imp, right_imp, threshold, gain =
|
91
|
-
rand_ids.map { |n| [n, *best_split(x[true, n], y, impurity)] }.max_by(&:last)
|
92
|
-
|
93
|
-
return put_leaf(node, y) if gain.nil? || gain.zero?
|
94
|
-
|
95
|
-
left_ids = x[true, feature_id].le(threshold).where
|
96
|
-
right_ids = x[true, feature_id].gt(threshold).where
|
97
|
-
node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids, true], left_imp)
|
98
|
-
node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids, true], right_imp)
|
99
|
-
|
100
|
-
return put_leaf(node, y) if node.left.nil? && node.right.nil?
|
101
|
-
|
102
|
-
node.feature_id = feature_id
|
103
|
-
node.threshold = threshold
|
104
|
-
node.leaf = false
|
105
|
-
node
|
106
|
-
end
|
107
|
-
|
108
|
-
def stop_growing?(_y)
|
109
|
-
raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
|
110
|
-
end
|
111
|
-
|
112
|
-
def put_leaf(_node, _y)
|
113
|
-
raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
|
114
|
-
end
|
115
|
-
|
116
|
-
def rand_ids
|
117
|
-
@feature_ids.sample(@params[:max_features], random: @sub_rng)
|
118
|
-
end
|
119
|
-
|
120
|
-
def best_split(_features, _y, _impurity)
|
121
|
-
raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
|
122
|
-
end
|
123
|
-
|
124
|
-
def impurity(_y)
|
125
|
-
raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
|
126
|
-
end
|
127
|
-
|
128
|
-
def eval_importance(n_samples, n_features)
|
129
|
-
@feature_importances = Numo::DFloat.zeros(n_features)
|
130
|
-
eval_importance_at_node(@tree)
|
131
|
-
@feature_importances /= n_samples
|
132
|
-
normalizer = @feature_importances.sum
|
133
|
-
@feature_importances /= normalizer if normalizer > 0.0
|
134
|
-
nil
|
135
|
-
end
|
136
|
-
|
137
|
-
def eval_importance_at_node(node)
|
138
|
-
return nil if node.leaf
|
139
|
-
return nil if node.left.nil? || node.right.nil?
|
140
|
-
|
141
|
-
gain = node.n_samples * node.impurity -
|
142
|
-
node.left.n_samples * node.left.impurity -
|
143
|
-
node.right.n_samples * node.right.impurity
|
144
|
-
@feature_importances[node.feature_id] += gain
|
145
|
-
eval_importance_at_node(node.left)
|
146
|
-
eval_importance_at_node(node.right)
|
147
|
-
end
|
148
|
-
end
|
149
|
-
end
|
150
|
-
end
|