rumale-preprocessing 0.24.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE.txt +27 -0
- data/README.md +34 -0
- data/lib/rumale/preprocessing/bin_discretizer.rb +97 -0
- data/lib/rumale/preprocessing/binarizer.rb +65 -0
- data/lib/rumale/preprocessing/kernel_calculator.rb +98 -0
- data/lib/rumale/preprocessing/l1_normalizer.rb +67 -0
- data/lib/rumale/preprocessing/l2_normalizer.rb +68 -0
- data/lib/rumale/preprocessing/label_binarizer.rb +86 -0
- data/lib/rumale/preprocessing/label_encoder.rb +75 -0
- data/lib/rumale/preprocessing/max_abs_scaler.rb +65 -0
- data/lib/rumale/preprocessing/max_normalizer.rb +67 -0
- data/lib/rumale/preprocessing/min_max_scaler.rb +78 -0
- data/lib/rumale/preprocessing/one_hot_encoder.rb +94 -0
- data/lib/rumale/preprocessing/ordinal_encoder.rb +111 -0
- data/lib/rumale/preprocessing/polynomial_features.rb +114 -0
- data/lib/rumale/preprocessing/standard_scaler.rb +74 -0
- data/lib/rumale/preprocessing/version.rb +10 -0
- data/lib/rumale/preprocessing.rb +19 -0
- metadata +97 -0
@@ -0,0 +1,65 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/estimator'
|
4
|
+
require 'rumale/base/transformer'
|
5
|
+
require 'rumale/validation'
|
6
|
+
|
7
|
+
module Rumale
|
8
|
+
module Preprocessing
|
9
|
+
# Normalize samples by scaling each feature with its maximum absolute value.
|
10
|
+
#
|
11
|
+
# @example
|
12
|
+
# require 'rumale/preprocessing/max_abs_scaler'
|
13
|
+
#
|
14
|
+
# normalizer = Rumale::Preprocessing::MaxAbsScaler.new
|
15
|
+
# new_training_samples = normalizer.fit_transform(training_samples)
|
16
|
+
# new_testing_samples = normalizer.transform(testing_samples)
|
17
|
+
class MaxAbsScaler < ::Rumale::Base::Estimator
  include ::Rumale::Base::Transformer

  # Return the vector consists of the maximum absolute value for each feature.
  # @return [Numo::DFloat] (shape: [n_features])
  attr_reader :max_abs_vec

  # Creates a new normalizer for scaling each feature with its maximum absolute value.
  def initialize # rubocop:disable Lint/UselessMethodDefinition
    super()
  end

  # Calculate the maximum absolute value of each feature for scaling.
  #
  # @overload fit(x) -> MaxAbsScaler
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate maximum absolute value for each feature.
  # @return [MaxAbsScaler]
  def fit(x, _y = nil)
    x = ::Rumale::Validation.check_convert_sample_array(x)

    @max_abs_vec = x.abs.max(0)
    self
  end

  # Calculate the maximum absolute value for each feature, and then normalize samples.
  #
  # @overload fit_transform(x) -> Numo::DFloat
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate maximum absolute value for each feature.
  # @return [Numo::DFloat] The scaled samples.
  def fit_transform(x, _y = nil)
    x = ::Rumale::Validation.check_convert_sample_array(x)

    fit(x).transform(x)
  end

  # Perform scaling the given samples with maximum absolute value for each feature.
  # A feature whose fitted maximum absolute value is zero is divided by one
  # instead, so it is passed through unchanged rather than divided by zero.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be scaled.
  # @return [Numo::DFloat] The scaled samples.
  def transform(x)
    x = ::Rumale::Validation.check_convert_sample_array(x)

    # Work on a copy so that the vector exposed by max_abs_vec keeps the fitted values.
    scale_vec = @max_abs_vec.dup
    # Same zero handling as MinMaxScaler and MaxNormalizer: replace zero divisors with one.
    scale_vec[scale_vec.eq(0)] = 1.0
    x / scale_vec
  end
end
|
64
|
+
end
|
65
|
+
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/estimator'
|
4
|
+
require 'rumale/base/transformer'
|
5
|
+
require 'rumale/validation'
|
6
|
+
|
7
|
+
module Rumale
|
8
|
+
module Preprocessing
|
9
|
+
# Normalize samples with the maximum of the absolute values.
|
10
|
+
#
|
11
|
+
# @example
|
12
|
+
# require 'rumale/preprocessing/max_normalizer'
|
13
|
+
#
|
14
|
+
# normalizer = Rumale::Preprocessing::MaxNormalizer.new
|
15
|
+
# new_samples = normalizer.fit_transform(samples)
|
16
|
+
class MaxNormalizer < ::Rumale::Base::Estimator
  include ::Rumale::Base::Transformer

  # Return the vector consists of the maximum norm for each sample.
  # @return [Numo::DFloat] (shape: [n_samples])
  attr_reader :norm_vec # :nodoc:

  # Create a new normalizer for normalizing to max-norm.
  def initialize # rubocop:disable Lint/UselessMethodDefinition
    super()
  end

  # Calculate the max-norm (largest absolute value) of each sample.
  # Norms equal to zero are stored as one.
  #
  # @overload fit(x) -> MaxNormalizer
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the maximum norms.
  # @return [MaxNormalizer]
  def fit(x, _y = nil)
    samples = ::Rumale::Validation.check_convert_sample_array(x)

    max_norms = samples.abs.max(1)
    zero_mask = max_norms.eq(0)
    max_norms[zero_mask] = 1
    @norm_vec = max_norms
    self
  end

  # Calculate the max-norm of each sample, and then divide each sample by its norm.
  #
  # @overload fit_transform(x) -> Numo::DFloat
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate maximum norms.
  # @return [Numo::DFloat] The normalized samples.
  def fit_transform(x, _y = nil)
    samples = ::Rumale::Validation.check_convert_sample_array(x)

    fit(samples)
    # Turn the per-sample norms into a column so that each row is divided by its own norm.
    divisors = @norm_vec.expand_dims(1)
    samples / divisors
  end

  # Calculate the max-norm of each sample, and then divide each sample by its norm.
  # This method simply delegates to fit_transform; it exists for the Pipeline class.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate maximum norms.
  # @return [Numo::DFloat] The normalized samples.
  def transform(x)
    samples = ::Rumale::Validation.check_convert_sample_array(x)

    fit_transform(samples)
  end
end
|
66
|
+
end
|
67
|
+
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/estimator'
|
4
|
+
require 'rumale/base/transformer'
|
5
|
+
require 'rumale/validation'
|
6
|
+
|
7
|
+
module Rumale
|
8
|
+
# This module consists of the classes that perform preprocessings.
|
9
|
+
module Preprocessing
|
10
|
+
# Normalize samples by scaling each feature to a given range.
|
11
|
+
#
|
12
|
+
# @example
|
13
|
+
# require 'rumale/preprocessing/min_max_scaler'
|
14
|
+
#
|
15
|
+
# normalizer = Rumale::Preprocessing::MinMaxScaler.new(feature_range: [0.0, 1.0])
|
16
|
+
# new_training_samples = normalizer.fit_transform(training_samples)
|
17
|
+
# new_testing_samples = normalizer.transform(testing_samples)
|
18
|
+
class MinMaxScaler < ::Rumale::Base::Estimator
  include ::Rumale::Base::Transformer

  # Return the vector consists of the minimum value for each feature.
  # @return [Numo::DFloat] (shape: [n_features])
  attr_reader :min_vec

  # Return the vector consists of the maximum value for each feature.
  # @return [Numo::DFloat] (shape: [n_features])
  attr_reader :max_vec

  # Creates a new normalizer for scaling each feature to a given range.
  #
  # @param feature_range [Array<Float>] The desired range of samples.
  def initialize(feature_range: [0.0, 1.0])
    super()
    @params = { feature_range: feature_range }
  end

  # Calculate the minimum and maximum value of each feature for scaling.
  #
  # @overload fit(x) -> MinMaxScaler
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the minimum and maximum values.
  # @return [MinMaxScaler]
  def fit(x, _y = nil)
    samples = ::Rumale::Validation.check_convert_sample_array(x)

    @min_vec = samples.min(0)
    @max_vec = samples.max(0)
    self
  end

  # Calculate the minimum and maximum values, and then normalize samples to feature_range.
  #
  # @overload fit_transform(x) -> Numo::DFloat
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the minimum and maximum values.
  # @return [Numo::DFloat] The scaled samples.
  def fit_transform(x, _y = nil)
    samples = ::Rumale::Validation.check_convert_sample_array(x)

    fit(samples).transform(samples)
  end

  # Perform scaling the given samples according to feature_range.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be scaled.
  # @return [Numo::DFloat] The scaled samples.
  def transform(x)
    samples = ::Rumale::Validation.check_convert_sample_array(x)

    n_samples = samples.shape[0]
    span_vec = @max_vec - @min_vec
    # Features with identical minimum and maximum are divided by one instead of zero.
    span_vec[span_vec.eq(0)] = 1.0
    unit_scaled = (samples - @min_vec.tile(n_samples, 1)) / span_vec.tile(n_samples, 1)
    # Stretch and shift the unit-scaled values onto the requested range.
    range = @params[:feature_range]
    unit_scaled * (range[1] - range[0]) + range[0]
  end
end
|
77
|
+
end
|
78
|
+
end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/estimator'
|
4
|
+
require 'rumale/base/transformer'
|
5
|
+
|
6
|
+
module Rumale
|
7
|
+
module Preprocessing
|
8
|
+
# Encode categorical integer features to one-hot-vectors.
|
9
|
+
#
|
10
|
+
# @example
|
11
|
+
# require 'rumale/preprocessing/one_hot_encoder'
|
12
|
+
#
|
13
|
+
# encoder = Rumale::Preprocessing::OneHotEncoder.new
|
14
|
+
# labels = Numo::Int32[0, 0, 2, 3, 2, 1]
|
15
|
+
# one_hot_vectors = encoder.fit_transform(labels)
|
16
|
+
# # > pp one_hot_vectors
|
17
|
+
# # Numo::DFloat#shape[6, 4]
|
18
|
+
# # [[1, 0, 0, 0],
|
19
|
+
# # [1, 0, 0, 0],
|
20
|
+
# # [0, 0, 1, 0],
|
21
|
+
# # [0, 0, 0, 1],
|
22
|
+
# # [0, 0, 1, 0],
|
23
|
+
# # [0, 1, 0, 0]]
|
24
|
+
class OneHotEncoder < ::Rumale::Base::Estimator
  include ::Rumale::Base::Transformer

  # Return the number of values for each feature (the maximum value plus one).
  # @return [Numo::Int32] (shape: [n_features])
  attr_reader :n_values

  # Return the indices for feature values that actually occur in the training set.
  # @return [Numo::Int32]
  attr_reader :active_features

  # Return the indices to feature ranges.
  # @return [Numo::Int32] (shape: [n_features + 1])
  attr_reader :feature_indices

  # Create a new encoder for encoding categorical integer features to one-hot-vectors
  def initialize # rubocop:disable Lint/UselessMethodDefinition
    super()
  end

  # Fit one-hot-encoder to samples.
  #
  # @overload fit(x) -> OneHotEncoder
  # @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to fit one-hot-encoder.
  # @return [OneHotEncoder]
  # @raise [ArgumentError] If the samples contain a negative value.
  def fit(x, _y = nil)
    reject_negative_values(x)

    @n_values = x.max(0) + 1
    # Cumulative offsets: the start column of each feature's block, plus the total width.
    @feature_indices = Numo::Int32.hstack([[0], @n_values]).cumsum
    # Keep only the columns that are set at least once in the training samples.
    column_totals = encode(x, @feature_indices).sum(axis: 0)
    @active_features = column_totals.ne(0).where
    self
  end

  # Fit one-hot-encoder to samples, then encode samples into one-hot-vectors
  #
  # @overload fit_transform(x) -> Numo::DFloat
  #
  # @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to encode into one-hot-vectors.
  # @return [Numo::DFloat] The one-hot-vectors.
  # @raise [ArgumentError] If the samples contain a negative value.
  def fit_transform(x, _y = nil)
    reject_negative_values(x)

    fit(x).transform(x)
  end

  # Encode samples into one-hot-vectors.
  #
  # @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to encode into one-hot-vectors.
  # @return [Numo::DFloat] The one-hot-vectors.
  # @raise [ArgumentError] If the samples contain a negative value.
  def transform(x)
    reject_negative_values(x)

    all_codes = encode(x, @feature_indices)
    all_codes[true, @active_features].dup
  end

  private

  # Raise unless every value of the given samples is non-negative.
  def reject_negative_values(x)
    return unless x.lt(0).any?

    raise ArgumentError, 'Expected the input samples only consists of non-negative integer values.'
  end

  # Build the one-hot matrix over all columns described by indices.
  def encode(x, indices)
    n_samples, n_features = x.shape
    # A 1-D input has no second dimension; treat it as a single feature.
    n_features ||= 1
    col_indices = (x + indices[0...-1]).flatten.to_a
    row_indices = Numo::Int32.new(n_samples).seq.repeat(n_features).to_a
    codes = Numo::DFloat.zeros(n_samples, indices[-1])
    row_indices.each_with_index do |row, pos|
      codes[row, col_indices[pos]] = 1.0
    end
    codes
  end
end
|
93
|
+
end
|
94
|
+
end
|
@@ -0,0 +1,111 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/estimator'
|
4
|
+
require 'rumale/base/transformer'
|
5
|
+
|
6
|
+
module Rumale
|
7
|
+
module Preprocessing
|
8
|
+
# Transform categorical features to integer values.
|
9
|
+
#
|
10
|
+
# @example
|
11
|
+
# require 'rumale/preprocessing/ordinal_encoder'
|
12
|
+
#
|
13
|
+
# encoder = Rumale::Preprocessing::OrdinalEncoder.new
|
14
|
+
# training_samples = [['left', 10], ['right', 15], ['right', 20]]
|
15
|
+
# training_samples = Numo::NArray.asarray(training_samples)
|
16
|
+
# encoder.fit(training_samples)
|
17
|
+
# p encoder.categories
|
18
|
+
# # [["left", "right"], [10, 15, 20]]
|
19
|
+
# testing_samples = [['left', 20], ['right', 10]]
|
20
|
+
# testing_samples = Numo::NArray.asarray(testing_samples)
|
21
|
+
# encoded = encoder.transform(testing_samples)
|
22
|
+
# p encoded
|
23
|
+
# # Numo::DFloat#shape=[2,2]
|
24
|
+
# # [[0, 2],
|
25
|
+
# # [1, 0]]
|
26
|
+
# p encoder.inverse_transform(encoded)
|
27
|
+
# # Numo::RObject#shape=[2,2]
|
28
|
+
# # [["left", 20],
|
29
|
+
# # ["right", 10]]
|
30
|
+
class OrdinalEncoder < ::Rumale::Base::Estimator
  include ::Rumale::Base::Transformer

  # Return the array consists of categorical value each feature.
  # @return [Array] (size: n_features)
  attr_reader :categories

  # Create a new encoder that transform categorical features to integer values.
  #
  # @param categories [Nil/Array] The category list for each feature.
  #   If nil is given, extracted categories from the training data by calling the fit method are used.
  def initialize(categories: nil)
    super()
    @categories = categories
  end

  # Fit encoder by extracting the category for each feature.
  # The categories of each feature are its unique values in sorted order.
  #
  # @overload fit(x) -> OrdinalEncoder
  #
  # @param x [Numo::NArray] (shape: [n_samples, n_features]) The samples consisting of categorical features.
  # @return [OrdinalEncoder]
  # @raise [ArgumentError] If the sample matrix is not a 2-D array.
  def fit(x, _y = nil)
    raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2

    n_features = x.shape[1]
    @categories = Array.new(n_features) { |n| x[true, n].to_a.uniq.sort }
    self
  end

  # Fit encoder, then return encoded categorical features to integer values.
  #
  # @overload fit_transform(x) -> Numo::DFloat
  #
  # @param x [Numo::NArray] (shape: [n_samples, n_features]) The samples consisting of categorical features.
  # @return [Numo::DFloat] The encoded categorical features to integer values.
  # @raise [ArgumentError] If the sample matrix is not a 2-D array.
  def fit_transform(x, _y = nil)
    raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2

    fit(x).transform(x)
  end

  # Encode categorical features.
  # Each value is replaced by its position in the category list of its feature.
  #
  # @param x [Numo::NArray] (shape: [n_samples, n_features]) The samples consisting of categorical features.
  # @return [Numo::DFloat] The encoded categorical features to integer values.
  # @raise [ArgumentError] If the sample matrix is not a 2-D array, if its number of features
  #   differs from the number of category lists, or if a value is not in the category list of its feature.
  def transform(x)
    raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2

    n_features = x.shape[1]
    if n_features != @categories.size
      raise ArgumentError,
            'Expect the number of features and the number of categories to be equal'
    end

    transformed = Array.new(n_features) do |n|
      x[true, n].to_a.map do |v|
        # Array#index returns nil for a value missing from the category list; fail
        # with a clear message instead of passing nil on to the numeric conversion.
        @categories[n].index(v) ||
          raise(ArgumentError, "Unknown category #{v.inspect} found in feature #{n}")
      end
    end

    Numo::DFloat.asarray(transformed.transpose)
  end

  # Decode values to categorical features.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples consisting of values transformed from categorical features.
  # @return [Numo::NArray] The decoded features.
  # @raise [ArgumentError] If the sample matrix is not a 2-D array, or if its number of features
  #   differs from the number of category lists.
  def inverse_transform(x)
    # Same shape check as fit and transform, so a non-2-D input reports the real problem.
    raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2

    n_features = x.shape[1]
    if n_features != @categories.size
      raise ArgumentError,
            'Expect the number of features and the number of categories to be equal'
    end

    inv_transformed = Array.new(n_features) do |n|
      x[true, n].to_a.map { |i| @categories[n][i.to_i] }
    end

    Numo::NArray.asarray(inv_transformed.transpose)
  end
end
|
110
|
+
end
|
111
|
+
end
|
@@ -0,0 +1,114 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/estimator'
|
4
|
+
require 'rumale/base/transformer'
|
5
|
+
require 'rumale/validation'
|
6
|
+
|
7
|
+
module Rumale
|
8
|
+
module Preprocessing
|
9
|
+
# Generating polynomial features from the given samples.
|
10
|
+
#
|
11
|
+
# @example
|
12
|
+
# require 'rumale/preprocessing/polynomial_features'
|
13
|
+
#
|
14
|
+
# transformer = Rumale::Preprocessing::PolynomialFeatures.new(degree: 2)
|
15
|
+
# x = Numo::DFloat[[0, 1], [2, 3], [4, 5]]
|
16
|
+
# z = transformer.fit_transform(x)
|
17
|
+
# p z
|
18
|
+
#
|
19
|
+
# # Numo::DFloat#shape=[3,6]
|
20
|
+
# # [[1, 0, 1, 0, 0, 1],
|
21
|
+
# # [1, 2, 3, 4, 6, 9],
|
22
|
+
# # [1, 4, 5, 16, 20, 25]]
|
23
|
+
#
|
24
|
+
# # If you want to perform polynomial regression, combine it with LinearRegression as follows:
|
25
|
+
# require 'rumale/preprocessing/polynomial_features'
|
26
|
+
# require 'rumale/linear_model/linear_regression'
|
27
|
+
# require 'rumale/pipeline/pipeline'
|
28
|
+
#
|
29
|
+
# ply = Rumale::Preprocessing::PolynomialFeatures.new(degree: 2)
|
30
|
+
# reg = Rumale::LinearModel::LinearRegression.new(fit_bias: false, random_seed: 1)
|
31
|
+
# pipeline = Rumale::Pipeline::Pipeline.new(steps: { trs: ply, est: reg })
|
32
|
+
# pipeline.fit(training_samples, training_values)
|
33
|
+
# results = pipeline.predict(testing_samples)
|
34
|
+
#
|
35
|
+
class PolynomialFeatures < ::Rumale::Base::Estimator
  include ::Rumale::Base::Transformer

  # Return the number of polynomial features.
  # @return [Integer]
  attr_reader :n_output_features

  # Create a transformer for generating polynomial features.
  #
  # @param degree [Integer] The degree of polynomial features.
  # @raise [ArgumentError] If degree is less than 1.
  def initialize(degree: 2)
    raise ArgumentError, 'Expect the value of degree parameter greater than or equal to 1.' if degree < 1

    super()
    @params = { degree: degree }
  end

  # Calculate the number of output polynomial features.
  #
  # @overload fit(x) -> PolynomialFeatures
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the number of output polynomial features.
  # @return [PolynomialFeatures]
  def fit(x, _y = nil)
    x = ::Rumale::Validation.check_convert_sample_array(x)

    n_features = x.shape[1]
    # The index list does not depend on the degree, so it is built once.
    feature_ids = Array.new(n_features) { |n| n }
    # One bias column plus, for every degree up to the configured one, the number of
    # combinations with repetition of the feature indices.
    n_combinations = @params[:degree].times.sum do |t|
      feature_ids.repeated_combination(t + 1).size
    end
    @n_output_features = 1 + n_combinations
    self
  end

  # Calculate the number of polynomial features, and then transform samples to polynomial features.
  #
  # @overload fit_transform(x) -> Numo::DFloat
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the number of polynomial features
  #   and be transformed.
  # @return [Numo::DFloat] (shape: [n_samples, n_output_features]) The transformed samples.
  def fit_transform(x, _y = nil)
    x = ::Rumale::Validation.check_convert_sample_array(x)

    fit(x).transform(x)
  end

  # Transform the given samples to polynomial features.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be transformed.
  # @return [Numo::DFloat] (shape: [n_samples, n_output_features]) The transformed samples.
  def transform(x)
    x = ::Rumale::Validation.check_convert_sample_array(x)

    # initialize transformed features
    n_samples, n_features = x.shape
    z = Numo::DFloat.zeros(n_samples, n_output_features)
    # bias
    z[true, 0] = 1
    curr_col = 1
    # itself
    z[true, 1..n_features] = x
    curr_col += n_features
    # high degree features
    # curr_feat_ids[d] is the first column of the previous degree's terms that start
    # with feature d; the last entry is the column just past the previous degree's terms.
    curr_feat_ids = Array.new(n_features + 1) { |n| n + 1 }
    (1...@params[:degree]).each do
      next_feat_ids = []
      n_features.times do |d|
        # Multiply feature d with the previous-degree terms from its own start column
        # onwards, and write the products into the next free columns.
        f_range = curr_feat_ids[d]...curr_feat_ids.last
        next_col = curr_col + f_range.size
        z[true, curr_col...next_col] = z[true, f_range] * x[true, d..d]
        next_feat_ids.push(curr_col)
        curr_col = next_col
      end
      next_feat_ids.push(curr_col)
      curr_feat_ids = next_feat_ids
    end
    z
  end
end
|
113
|
+
end
|
114
|
+
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/estimator'
|
4
|
+
require 'rumale/base/transformer'
|
5
|
+
require 'rumale/validation'
|
6
|
+
|
7
|
+
module Rumale
|
8
|
+
# This module consists of the classes that perform preprocessings.
|
9
|
+
module Preprocessing
|
10
|
+
# Normalize samples by centering and scaling to unit variance.
|
11
|
+
#
|
12
|
+
# @example
|
13
|
+
# require 'rumale/preprocessing/standard_scaler'
|
14
|
+
#
|
15
|
+
# normalizer = Rumale::Preprocessing::StandardScaler.new
|
16
|
+
# new_training_samples = normalizer.fit_transform(training_samples)
|
17
|
+
# new_testing_samples = normalizer.transform(testing_samples)
|
18
|
+
class StandardScaler < ::Rumale::Base::Estimator
  include ::Rumale::Base::Transformer

  # Return the vector consists of the mean value for each feature.
  # @return [Numo::DFloat] (shape: [n_features])
  attr_reader :mean_vec

  # Return the vector consists of the standard deviation for each feature.
  # @return [Numo::DFloat] (shape: [n_features])
  attr_reader :std_vec

  # Create a new normalizer for centering and scaling to unit variance.
  def initialize # rubocop:disable Lint/UselessMethodDefinition
    super()
  end

  # Calculate the mean value and standard deviation of each feature for scaling.
  #
  # @overload fit(x) -> StandardScaler
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features])
  #   The samples to calculate the mean values and standard deviations.
  # @return [StandardScaler]
  def fit(x, _y = nil)
    x = ::Rumale::Validation.check_convert_sample_array(x)

    @mean_vec = x.mean(0)
    @std_vec = x.stddev(0)
    self
  end

  # Calculate the mean values and standard deviations, and then normalize samples using them.
  #
  # @overload fit_transform(x) -> Numo::DFloat
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features])
  #   The samples to calculate the mean values and standard deviations.
  # @return [Numo::DFloat] The scaled samples.
  def fit_transform(x, _y = nil)
    x = ::Rumale::Validation.check_convert_sample_array(x)

    fit(x).transform(x)
  end

  # Perform standardization the given samples.
  # A feature whose fitted standard deviation is zero is only centered
  # (divided by one) rather than divided by zero.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be scaled.
  # @return [Numo::DFloat] The scaled samples.
  def transform(x)
    x = ::Rumale::Validation.check_convert_sample_array(x)

    n_samples, = x.shape
    # Work on a copy so that the vector exposed by std_vec keeps the fitted values.
    scale_vec = @std_vec.dup
    # Same zero handling as MinMaxScaler: replace zero divisors with one.
    scale_vec[scale_vec.eq(0)] = 1.0
    (x - @mean_vec.tile(n_samples, 1)) / scale_vec.tile(n_samples, 1)
  end
end
|
73
|
+
end
|
74
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'numo/narray'
|
4
|
+
|
5
|
+
require_relative 'preprocessing/bin_discretizer'
|
6
|
+
require_relative 'preprocessing/binarizer'
|
7
|
+
require_relative 'preprocessing/kernel_calculator'
|
8
|
+
require_relative 'preprocessing/l1_normalizer'
|
9
|
+
require_relative 'preprocessing/l2_normalizer'
|
10
|
+
require_relative 'preprocessing/label_binarizer'
|
11
|
+
require_relative 'preprocessing/label_encoder'
|
12
|
+
require_relative 'preprocessing/max_abs_scaler'
|
13
|
+
require_relative 'preprocessing/max_normalizer'
|
14
|
+
require_relative 'preprocessing/min_max_scaler'
|
15
|
+
require_relative 'preprocessing/one_hot_encoder'
|
16
|
+
require_relative 'preprocessing/ordinal_encoder'
|
17
|
+
require_relative 'preprocessing/polynomial_features'
|
18
|
+
require_relative 'preprocessing/standard_scaler'
|
19
|
+
require_relative 'preprocessing/version'
|