rumale 0.8.0
- checksums.yaml +7 -0
- data/.coveralls.yml +1 -0
- data/.gitignore +20 -0
- data/.rspec +3 -0
- data/.rubocop.yml +47 -0
- data/.rubocop_todo.yml +58 -0
- data/.travis.yml +13 -0
- data/CHANGELOG.md +2 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +23 -0
- data/README.md +175 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/rumale.rb +70 -0
- data/lib/rumale/base/base_estimator.rb +13 -0
- data/lib/rumale/base/classifier.rb +36 -0
- data/lib/rumale/base/cluster_analyzer.rb +31 -0
- data/lib/rumale/base/evaluator.rb +17 -0
- data/lib/rumale/base/regressor.rb +36 -0
- data/lib/rumale/base/splitter.rb +21 -0
- data/lib/rumale/base/transformer.rb +22 -0
- data/lib/rumale/clustering/dbscan.rb +125 -0
- data/lib/rumale/clustering/k_means.rb +138 -0
- data/lib/rumale/dataset.rb +110 -0
- data/lib/rumale/decomposition/nmf.rb +141 -0
- data/lib/rumale/decomposition/pca.rb +148 -0
- data/lib/rumale/ensemble/ada_boost_classifier.rb +196 -0
- data/lib/rumale/ensemble/ada_boost_regressor.rb +178 -0
- data/lib/rumale/ensemble/random_forest_classifier.rb +180 -0
- data/lib/rumale/ensemble/random_forest_regressor.rb +141 -0
- data/lib/rumale/evaluation_measure/accuracy.rb +29 -0
- data/lib/rumale/evaluation_measure/f_score.rb +50 -0
- data/lib/rumale/evaluation_measure/log_loss.rb +45 -0
- data/lib/rumale/evaluation_measure/mean_absolute_error.rb +29 -0
- data/lib/rumale/evaluation_measure/mean_squared_error.rb +29 -0
- data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +62 -0
- data/lib/rumale/evaluation_measure/precision.rb +50 -0
- data/lib/rumale/evaluation_measure/precision_recall.rb +91 -0
- data/lib/rumale/evaluation_measure/purity.rb +40 -0
- data/lib/rumale/evaluation_measure/r2_score.rb +43 -0
- data/lib/rumale/evaluation_measure/recall.rb +50 -0
- data/lib/rumale/kernel_approximation/rbf.rb +121 -0
- data/lib/rumale/kernel_machine/kernel_svc.rb +193 -0
- data/lib/rumale/linear_model/base_linear_model.rb +89 -0
- data/lib/rumale/linear_model/lasso.rb +136 -0
- data/lib/rumale/linear_model/linear_regression.rb +110 -0
- data/lib/rumale/linear_model/logistic_regression.rb +159 -0
- data/lib/rumale/linear_model/ridge.rb +110 -0
- data/lib/rumale/linear_model/svc.rb +183 -0
- data/lib/rumale/linear_model/svr.rb +122 -0
- data/lib/rumale/model_selection/cross_validation.rb +123 -0
- data/lib/rumale/model_selection/grid_search_cv.rb +247 -0
- data/lib/rumale/model_selection/k_fold.rb +76 -0
- data/lib/rumale/model_selection/stratified_k_fold.rb +94 -0
- data/lib/rumale/multiclass/one_vs_rest_classifier.rb +100 -0
- data/lib/rumale/naive_bayes/naive_bayes.rb +315 -0
- data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +111 -0
- data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +93 -0
- data/lib/rumale/optimizer/nadam.rb +90 -0
- data/lib/rumale/optimizer/rmsprop.rb +69 -0
- data/lib/rumale/optimizer/sgd.rb +65 -0
- data/lib/rumale/optimizer/yellow_fin.rb +144 -0
- data/lib/rumale/pairwise_metric.rb +91 -0
- data/lib/rumale/pipeline/pipeline.rb +197 -0
- data/lib/rumale/polynomial_model/base_factorization_machine.rb +99 -0
- data/lib/rumale/polynomial_model/factorization_machine_classifier.rb +197 -0
- data/lib/rumale/polynomial_model/factorization_machine_regressor.rb +131 -0
- data/lib/rumale/preprocessing/l2_normalizer.rb +62 -0
- data/lib/rumale/preprocessing/label_encoder.rb +94 -0
- data/lib/rumale/preprocessing/min_max_scaler.rb +92 -0
- data/lib/rumale/preprocessing/one_hot_encoder.rb +98 -0
- data/lib/rumale/preprocessing/standard_scaler.rb +86 -0
- data/lib/rumale/probabilistic_output.rb +112 -0
- data/lib/rumale/tree/base_decision_tree.rb +153 -0
- data/lib/rumale/tree/decision_tree_classifier.rb +163 -0
- data/lib/rumale/tree/decision_tree_regressor.rb +135 -0
- data/lib/rumale/tree/node.rb +70 -0
- data/lib/rumale/utils.rb +37 -0
- data/lib/rumale/validation.rb +79 -0
- data/lib/rumale/values.rb +13 -0
- data/lib/rumale/version.rb +6 -0
- data/rumale.gemspec +41 -0
- metadata +204 -0
data/lib/rumale/polynomial_model/factorization_machine_regressor.rb
@@ -0,0 +1,131 @@
+# frozen_string_literal: true
+
+require 'rumale/base/regressor'
+require 'rumale/polynomial_model/base_factorization_machine'
+
+module Rumale
+  module PolynomialModel
+    # FactorizationMachineRegressor is a class that implements Factorization Machine
+    # with stochastic gradient descent (SGD) optimization.
+    #
+    # @example
+    #   estimator =
+    #     Rumale::PolynomialModel::FactorizationMachineRegressor.new(
+    #       n_factors: 10, reg_param_linear: 0.1, reg_param_factor: 0.1,
+    #       max_iter: 5000, batch_size: 50, random_seed: 1)
+    #   estimator.fit(training_samples, training_values)
+    #   results = estimator.predict(testing_samples)
+    #
+    # *Reference*
+    # - S. Rendle, "Factorization Machines with libFM," ACM TIST, vol. 3 (3), pp. 57:1--57:22, 2012.
+    # - S. Rendle, "Factorization Machines," Proc. ICDM'10, pp. 995--1000, 2010.
+    class FactorizationMachineRegressor < BaseFactorizationMachine
+      include Base::Regressor
+
+      # Return the factor matrix for Factorization Machine.
+      # @return [Numo::DFloat] (shape: [n_outputs, n_factors, n_features])
+      attr_reader :factor_mat
+
+      # Return the weight vector for Factorization Machine.
+      # @return [Numo::DFloat] (shape: [n_outputs, n_features])
+      attr_reader :weight_vec
+
+      # Return the bias term for Factorization Machine.
+      # @return [Numo::DFloat] (shape: [n_outputs])
+      attr_reader :bias_term
+
+      # Return the random generator for random sampling.
+      # @return [Random]
+      attr_reader :rng
+
+      # Create a new regressor with Factorization Machine.
+      #
+      # @param n_factors [Integer] The number of factors.
+      # @param reg_param_linear [Float] The regularization parameter for the linear model.
+      # @param reg_param_factor [Float] The regularization parameter for the factor matrix.
+      # @param max_iter [Integer] The maximum number of iterations.
+      # @param batch_size [Integer] The size of the mini batches.
+      # @param optimizer [Optimizer] The optimizer to calculate adaptive learning rate.
+      #   If nil is given, Nadam is used.
+      # @param random_seed [Integer] The seed value used to initialize the random generator.
+      def initialize(n_factors: 2, reg_param_linear: 1.0, reg_param_factor: 1.0,
+                     max_iter: 1000, batch_size: 10, optimizer: nil, random_seed: nil)
+        check_params_float(reg_param_linear: reg_param_linear, reg_param_factor: reg_param_factor)
+        check_params_integer(n_factors: n_factors, max_iter: max_iter, batch_size: batch_size)
+        check_params_type_or_nil(Integer, random_seed: random_seed)
+        check_params_positive(n_factors: n_factors, reg_param_linear: reg_param_linear, reg_param_factor: reg_param_factor,
+                              max_iter: max_iter, batch_size: batch_size)
+        keywd_args = method(:initialize).parameters.map { |_t, arg| [arg, binding.local_variable_get(arg)] }.to_h.merge(loss: nil)
+        super(keywd_args)
+      end
+
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+      # @return [FactorizationMachineRegressor] The learned regressor itself.
+      def fit(x, y)
+        check_sample_array(x)
+        check_tvalue_array(y)
+        check_sample_tvalue_size(x, y)
+
+        n_outputs = y.shape[1].nil? ? 1 : y.shape[1]
+        _n_samples, n_features = x.shape
+
+        if n_outputs > 1
+          @factor_mat = Numo::DFloat.zeros(n_outputs, @params[:n_factors], n_features)
+          @weight_vec = Numo::DFloat.zeros(n_outputs, n_features)
+          @bias_term = Numo::DFloat.zeros(n_outputs)
+          n_outputs.times { |n| @factor_mat[n, true, true], @weight_vec[n, true], @bias_term[n] = partial_fit(x, y[true, n]) }
+        else
+          @factor_mat, @weight_vec, @bias_term = partial_fit(x, y)
+        end
+
+        self
+      end
+
+      # Predict values for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+      # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
+      def predict(x)
+        check_sample_array(x)
+        linear_term = @bias_term + x.dot(@weight_vec.transpose)
+        factor_term = if @weight_vec.shape[1].nil?
+                        0.5 * (@factor_mat.dot(x.transpose)**2 - (@factor_mat**2).dot(x.transpose**2)).sum(0)
+                      else
+                        0.5 * (@factor_mat.dot(x.transpose)**2 - (@factor_mat**2).dot(x.transpose**2)).sum(1).transpose
+                      end
+        linear_term + factor_term
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about FactorizationMachineRegressor.
+      def marshal_dump
+        { params: @params,
+          factor_mat: @factor_mat,
+          weight_vec: @weight_vec,
+          bias_term: @bias_term,
+          rng: @rng }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @params = obj[:params]
+        @factor_mat = obj[:factor_mat]
+        @weight_vec = obj[:weight_vec]
+        @bias_term = obj[:bias_term]
+        @rng = obj[:rng]
+        nil
+      end
+
+      private
+
+      def loss_gradient(x, ex_x, y, factor, weight)
+        z = ex_x.dot(weight) + 0.5 * (factor.dot(x.transpose)**2 - (factor**2).dot(x.transpose**2)).sum(0)
+        2.0 * (z - y)
+      end
+    end
+  end
+end
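
The `predict` method above evaluates the standard degree-2 FM model, y(x) = b + w·x + 0.5 Σ_f [(v_f·x)² − (v_f²)·(x²)]. The following is a minimal sketch of that same computation with plain Numo arrays, for a single-output model; the parameter values and sample data are illustrative, not learned ones.

```ruby
require 'numo/narray'

# Illustrative FM parameters for n_features = 3, n_factors = 2.
bias   = 0.5
weight = Numo::DFloat[0.2, -0.1, 0.3]                # shape: [n_features]
factor = Numo::DFloat[[0.1, 0.2, 0.0],               # shape: [n_factors, n_features]
                      [0.0, -0.3, 0.1]]
x = Numo::DFloat[[1.0, 2.0, 0.5]]                    # shape: [n_samples, n_features]

# Linear part: b + w.x for each sample.
linear_term = bias + x.dot(weight)
# Pairwise-interaction part, computed with the O(k*n) identity used in predict.
factor_term = 0.5 * (factor.dot(x.transpose)**2 - (factor**2).dot(x.transpose**2)).sum(0)
pp linear_term + factor_term                         # predicted values per sample
```

The `.sum(0)` collapses the factor axis, which is why the multi-output branch in `predict` sums over axis 1 instead: there the leading axis indexes outputs.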
data/lib/rumale/preprocessing/l2_normalizer.rb
@@ -0,0 +1,62 @@
+# frozen_string_literal: true
+
+require 'rumale/base/base_estimator'
+require 'rumale/base/transformer'
+
+module Rumale
+  # This module consists of the classes that perform preprocessings.
+  module Preprocessing
+    # Normalize samples to unit L2-norm.
+    #
+    # @example
+    #   normalizer = Rumale::Preprocessing::L2Normalizer.new
+    #   new_samples = normalizer.fit_transform(samples)
+    class L2Normalizer
+      include Base::BaseEstimator
+      include Base::Transformer
+
+      # Return the vector consisting of the L2-norm of each sample.
+      # @return [Numo::DFloat] (shape: [n_samples])
+      attr_reader :norm_vec # :nodoc:
+
+      # Create a new normalizer for normalizing to unit L2-norm.
+      def initialize
+        @params = {}
+        @norm_vec = nil
+      end
+
+      # Calculate L2-norms of each sample.
+      #
+      # @overload fit(x) -> L2Normalizer
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate L2-norms.
+      # @return [L2Normalizer]
+      def fit(x, _y = nil)
+        check_sample_array(x)
+        @norm_vec = Numo::NMath.sqrt((x**2).sum(1))
+        self
+      end
+
+      # Calculate L2-norms of each sample, and then normalize samples to unit L2-norm.
+      #
+      # @overload fit_transform(x) -> Numo::DFloat
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate L2-norms.
+      # @return [Numo::DFloat] The normalized samples.
+      def fit_transform(x, _y = nil)
+        check_sample_array(x)
+        fit(x)
+        x / @norm_vec.tile(x.shape[1], 1).transpose
+      end
+
+      # Calculate L2-norms of each sample, and then normalize samples to unit L2-norm.
+      # This method calls the fit_transform method. This method exists for the Pipeline class.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate L2-norms.
+      # @return [Numo::DFloat] The normalized samples.
+      def transform(x)
+        fit_transform(x)
+      end
+    end
+  end
+end
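
`fit_transform` simply divides every row by that row's L2-norm. A small sketch with toy data (assuming the `rumale` gem is installed):

```ruby
require 'rumale'

x = Numo::DFloat[[3.0, 4.0], [1.0, 0.0]]
normalizer = Rumale::Preprocessing::L2Normalizer.new
normalized = normalizer.fit_transform(x)

pp normalized                # => [[0.6, 0.8], [1.0, 0.0]]
pp (normalized**2).sum(1)    # => [1.0, 1.0]  (every row now has unit length)
pp normalizer.norm_vec       # => [5.0, 1.0]  (the per-sample norms used as divisors)
```

Note that `transform` refits on whatever it receives, unlike the other scalers in this release, so it is stateless in practice; `norm_vec` only reflects the last input seen.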
data/lib/rumale/preprocessing/label_encoder.rb
@@ -0,0 +1,94 @@
+# frozen_string_literal: true
+
+require 'rumale/base/base_estimator'
+require 'rumale/base/transformer'
+
+module Rumale
+  module Preprocessing
+    # Encode labels to values between 0 and n_classes - 1.
+    #
+    # @example
+    #   encoder = Rumale::Preprocessing::LabelEncoder.new
+    #   labels = Numo::Int32[1, 8, 8, 15, 0]
+    #   encoded_labels = encoder.fit_transform(labels)
+    #   # > pp encoded_labels
+    #   # Numo::Int32#shape=[5]
+    #   # [1, 2, 2, 3, 0]
+    #   decoded_labels = encoder.inverse_transform(encoded_labels)
+    #   # > pp decoded_labels
+    #   # [1, 8, 8, 15, 0]
+    class LabelEncoder
+      include Base::BaseEstimator
+      include Base::Transformer
+
+      # Return the class labels.
+      # @return [Array] (size: [n_classes])
+      attr_reader :classes
+
+      # Create a new encoder for encoding labels to values between 0 and n_classes - 1.
+      def initialize
+        @params = {}
+        @classes = nil
+      end
+
+      # Fit label-encoder to labels.
+      #
+      # @overload fit(x) -> LabelEncoder
+      #
+      # @param x [Array] (shape: [n_samples]) The labels to fit label-encoder.
+      # @return [LabelEncoder]
+      def fit(x, _y = nil)
+        x = x.to_a if x.is_a?(Numo::NArray)
+        check_params_type(Array, x: x)
+        @classes = x.sort.uniq
+        self
+      end
+
+      # Fit label-encoder to labels, then return encoded labels.
+      #
+      # @overload fit_transform(x) -> Numo::Int32
+      #
+      # @param x [Array] (shape: [n_samples]) The labels to fit label-encoder.
+      # @return [Numo::Int32] The encoded labels.
+      def fit_transform(x, _y = nil)
+        x = x.to_a if x.is_a?(Numo::NArray)
+        check_params_type(Array, x: x)
+        fit(x).transform(x)
+      end
+
+      # Encode labels.
+      #
+      # @param x [Array] (shape: [n_samples]) The labels to be encoded.
+      # @return [Numo::Int32] The encoded labels.
+      def transform(x)
+        x = x.to_a if x.is_a?(Numo::NArray)
+        check_params_type(Array, x: x)
+        Numo::Int32[*(x.map { |v| @classes.index(v) })]
+      end
+
+      # Decode encoded labels.
+      #
+      # @param x [Numo::Int32] (shape: [n_samples]) The labels to be decoded.
+      # @return [Array] The decoded labels.
+      def inverse_transform(x)
+        check_label_array(x)
+        x.to_a.map { |n| @classes[n] }
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about LabelEncoder.
+      def marshal_dump
+        { params: @params,
+          classes: @classes }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @params = obj[:params]
+        @classes = obj[:classes]
+        nil
+      end
+    end
+  end
+end
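
Because `fit` converts its input to a plain `Array`, sorts it, and takes the unique values, any mutually comparable labels work, not just integers. A small sketch with string labels (toy data, assuming the gem is installed):

```ruby
require 'rumale'

encoder = Rumale::Preprocessing::LabelEncoder.new
labels  = %w[cat dog cat bird]

encoded = encoder.fit_transform(labels)
pp encoded                               # => Numo::Int32[1, 2, 1, 0]
pp encoder.classes                       # => ["bird", "cat", "dog"] (sorted unique labels)
pp encoder.inverse_transform(encoded)    # => ["cat", "dog", "cat", "bird"]
```

One caveat visible in `transform`: it uses `Array#index` per label, which is linear in `n_classes`, and an unseen label encodes to `nil` and raises inside the `Numo::Int32[...]` construction.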
data/lib/rumale/preprocessing/min_max_scaler.rb
@@ -0,0 +1,92 @@
+# frozen_string_literal: true
+
+require 'rumale/base/base_estimator'
+require 'rumale/base/transformer'
+
+module Rumale
+  # This module consists of the classes that perform preprocessings.
+  module Preprocessing
+    # Normalize samples by scaling each feature to a given range.
+    #
+    # @example
+    #   normalizer = Rumale::Preprocessing::MinMaxScaler.new(feature_range: [0.0, 1.0])
+    #   new_training_samples = normalizer.fit_transform(training_samples)
+    #   new_testing_samples = normalizer.transform(testing_samples)
+    class MinMaxScaler
+      include Base::BaseEstimator
+      include Base::Transformer
+
+      # Return the vector consisting of the minimum value for each feature.
+      # @return [Numo::DFloat] (shape: [n_features])
+      attr_reader :min_vec
+
+      # Return the vector consisting of the maximum value for each feature.
+      # @return [Numo::DFloat] (shape: [n_features])
+      attr_reader :max_vec
+
+      # Create a new normalizer for scaling each feature to a given range.
+      #
+      # @param feature_range [Array<Float>] The desired range of samples.
+      def initialize(feature_range: [0.0, 1.0])
+        check_params_type(Array, feature_range: feature_range)
+        @params = {}
+        @params[:feature_range] = feature_range
+        @min_vec = nil
+        @max_vec = nil
+      end
+
+      # Calculate the minimum and maximum value of each feature for scaling.
+      #
+      # @overload fit(x) -> MinMaxScaler
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the minimum and maximum values.
+      # @return [MinMaxScaler]
+      def fit(x, _y = nil)
+        check_sample_array(x)
+        @min_vec = x.min(0)
+        @max_vec = x.max(0)
+        self
+      end
+
+      # Calculate the minimum and maximum values, and then normalize samples to feature_range.
+      #
+      # @overload fit_transform(x) -> Numo::DFloat
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the minimum and maximum values.
+      # @return [Numo::DFloat] The scaled samples.
+      def fit_transform(x, _y = nil)
+        check_sample_array(x)
+        fit(x).transform(x)
+      end
+
+      # Scale the given samples according to feature_range.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be scaled.
+      # @return [Numo::DFloat] The scaled samples.
+      def transform(x)
+        check_sample_array(x)
+        n_samples, = x.shape
+        dif_vec = @max_vec - @min_vec
+        nx = (x - @min_vec.tile(n_samples, 1)) / dif_vec.tile(n_samples, 1)
+        nx * (@params[:feature_range][1] - @params[:feature_range][0]) + @params[:feature_range][0]
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about MinMaxScaler.
+      def marshal_dump
+        { params: @params,
+          min_vec: @min_vec,
+          max_vec: @max_vec }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @params = obj[:params]
+        @min_vec = obj[:min_vec]
+        @max_vec = obj[:max_vec]
+        nil
+      end
+    end
+  end
+end
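
`transform` applies the usual min-max formula per feature, (x − min) / (max − min), then maps the result into `feature_range`. A small sketch comparing the class to the formula written out by hand (toy data; the manual version leans on Numo broadcasting where the class uses explicit `tile`):

```ruby
require 'rumale'

x = Numo::DFloat[[1.0, -1.0], [3.0, 0.0], [5.0, 1.0]]
scaler = Rumale::Preprocessing::MinMaxScaler.new(feature_range: [0.0, 1.0])
pp scaler.fit_transform(x)
# => [[0.0, 0.0], [0.5, 0.5], [1.0, 1.0]]

# The same computation per feature, relying on broadcasting a
# [n_features] vector over the rows of a [n_samples, n_features] matrix:
lo, hi = 0.0, 1.0
manual = (x - x.min(0)) / (x.max(0) - x.min(0)) * (hi - lo) + lo
pp manual
```

As with scikit-learn's scaler of the same name, `transform` on test data can produce values outside `feature_range` when the test data exceeds the training minimum or maximum.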
data/lib/rumale/preprocessing/one_hot_encoder.rb
@@ -0,0 +1,98 @@
+# frozen_string_literal: true
+
+require 'rumale/base/base_estimator'
+require 'rumale/base/transformer'
+
+module Rumale
+  module Preprocessing
+    # Encode categorical integer features to one-hot-vectors.
+    #
+    # @example
+    #   encoder = Rumale::Preprocessing::OneHotEncoder.new
+    #   labels = Numo::Int32[0, 0, 2, 3, 2, 1]
+    #   one_hot_vectors = encoder.fit_transform(labels)
+    #   # > pp one_hot_vectors
+    #   # Numo::DFloat#shape=[6,4]
+    #   # [[1, 0, 0, 0],
+    #   #  [1, 0, 0, 0],
+    #   #  [0, 0, 1, 0],
+    #   #  [0, 0, 0, 1],
+    #   #  [0, 0, 1, 0],
+    #   #  [0, 1, 0, 0]]
+    class OneHotEncoder
+      include Base::BaseEstimator
+      include Base::Transformer
+
+      # Return the number of values for each feature.
+      # @return [Numo::Int32] (shape: [n_features])
+      attr_reader :n_values
+
+      # Return the indices to feature ranges.
+      # @return [Numo::Int32] (shape: [n_features + 1])
+      attr_reader :feature_indices
+
+      # Create a new encoder for encoding categorical integer features to one-hot-vectors.
+      def initialize
+        @params = {}
+        @n_values = nil
+        @feature_indices = nil
+      end
+
+      # Fit one-hot-encoder to samples.
+      #
+      # @overload fit(x) -> OneHotEncoder
+      #
+      # @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to fit one-hot-encoder.
+      # @return [OneHotEncoder]
+      def fit(x, _y = nil)
+        check_params_type(Numo::Int32, x: x)
+        @n_values = x.max(0) + 1
+        @feature_indices = Numo::Int32.hstack([[0], @n_values]).cumsum
+        self
+      end
+
+      # Fit one-hot-encoder to samples, then encode samples into one-hot-vectors.
+      #
+      # @overload fit_transform(x) -> Numo::DFloat
+      #
+      # @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to encode into one-hot-vectors.
+      # @return [Numo::DFloat] The one-hot-vectors.
+      def fit_transform(x, _y = nil)
+        check_params_type(Numo::Int32, x: x)
+        fit(x).transform(x)
+      end
+
+      # Encode samples into one-hot-vectors.
+      #
+      # @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to encode into one-hot-vectors.
+      # @return [Numo::DFloat] The one-hot-vectors.
+      def transform(x)
+        check_params_type(Numo::Int32, x: x)
+        n_samples, n_features = x.shape
+        n_features = 1 if n_features.nil?
+        column_indices = (x + @feature_indices[0...-1]).flatten.to_a
+        row_indices = Numo::Int32.new(n_samples).seq.repeat(n_features).to_a
+        codes = Numo::DFloat.zeros(n_samples, @feature_indices[-1])
+        row_indices.zip(column_indices).each { |r, c| codes[r, c] = 1.0 }
+        codes
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about OneHotEncoder.
+      def marshal_dump
+        { params: @params,
+          n_values: @n_values,
+          feature_indices: @feature_indices }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @params = obj[:params]
+        @n_values = obj[:n_values]
+        @feature_indices = obj[:feature_indices]
+        nil
+      end
+    end
+  end
+end
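
The encoding gives each feature its own block of output columns: `fit` infers `n_values` as `max + 1` per feature, and `feature_indices` holds the cumulative column offsets, so `transform` places the one for value `v` of feature `j` at column `feature_indices[j] + v`. A small sketch with two features (toy data, assuming the gem is installed):

```ruby
require 'rumale'

# Feature 0 takes values 0..1, feature 1 takes values 0..2.
x = Numo::Int32[[0, 2], [1, 0], [0, 1]]
encoder = Rumale::Preprocessing::OneHotEncoder.new
codes = encoder.fit_transform(x)

pp encoder.n_values        # => Numo::Int32[2, 3]    (max + 1 per feature)
pp encoder.feature_indices # => Numo::Int32[0, 2, 5] (column offset of each feature's block)
pp codes
# => [[1, 0, 0, 0, 1],     # feature 0 = 0 -> col 0; feature 1 = 2 -> col 2 + 2 = 4
#     [0, 1, 1, 0, 0],     # feature 0 = 1 -> col 1; feature 1 = 0 -> col 2
#     [1, 0, 0, 1, 0]]     # feature 0 = 0 -> col 0; feature 1 = 1 -> col 3
```

Since value ranges are learned from the fitted data, transforming samples with larger values than seen at fit time would index past the allocated columns, so inputs should be encoded to the same categorical range as the training set.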