rumale-preprocessing 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +27 -0
- data/README.md +34 -0
- data/lib/rumale/preprocessing/bin_discretizer.rb +97 -0
- data/lib/rumale/preprocessing/binarizer.rb +65 -0
- data/lib/rumale/preprocessing/kernel_calculator.rb +98 -0
- data/lib/rumale/preprocessing/l1_normalizer.rb +67 -0
- data/lib/rumale/preprocessing/l2_normalizer.rb +68 -0
- data/lib/rumale/preprocessing/label_binarizer.rb +86 -0
- data/lib/rumale/preprocessing/label_encoder.rb +75 -0
- data/lib/rumale/preprocessing/max_abs_scaler.rb +65 -0
- data/lib/rumale/preprocessing/max_normalizer.rb +67 -0
- data/lib/rumale/preprocessing/min_max_scaler.rb +78 -0
- data/lib/rumale/preprocessing/one_hot_encoder.rb +94 -0
- data/lib/rumale/preprocessing/ordinal_encoder.rb +111 -0
- data/lib/rumale/preprocessing/polynomial_features.rb +114 -0
- data/lib/rumale/preprocessing/standard_scaler.rb +74 -0
- data/lib/rumale/preprocessing/version.rb +10 -0
- data/lib/rumale/preprocessing.rb +19 -0
- metadata +97 -0
@@ -0,0 +1,65 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/estimator'
|
4
|
+
require 'rumale/base/transformer'
|
5
|
+
require 'rumale/validation'
|
6
|
+
|
7
|
+
module Rumale
  module Preprocessing
    # Normalize samples by scaling each feature with its maximum absolute value.
    #
    # @example
    #   require 'rumale/preprocessing/max_abs_scaler'
    #
    #   normalizer = Rumale::Preprocessing::MaxAbsScaler.new
    #   new_training_samples = normalizer.fit_transform(training_samples)
    #   new_testing_samples = normalizer.transform(testing_samples)
    class MaxAbsScaler < ::Rumale::Base::Estimator
      include ::Rumale::Base::Transformer

      # Return the vector consists of the maximum absolute value for each feature.
      # The stored values are the raw maxima computed by fit (a zero stays zero here).
      # @return [Numo::DFloat] (shape: [n_features])
      attr_reader :max_abs_vec

      # Creates a new normalizer for scaling each feature with its maximum absolute value.
      def initialize # rubocop:disable Lint/UselessMethodDefinition
        super()
      end

      # Calculate the maximum absolute value of each feature for scaling.
      #
      # @overload fit(x) -> MaxAbsScaler
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate maximum absolute value for each feature.
      # @return [MaxAbsScaler]
      def fit(x, _y = nil)
        x = ::Rumale::Validation.check_convert_sample_array(x)

        # Reduce along axis 0 (samples) so that one value per feature remains.
        @max_abs_vec = x.abs.max(0)
        self
      end

      # Calculate the maximum absolute value for each feature, and then normalize samples.
      #
      # @overload fit_transform(x) -> Numo::DFloat
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate maximum absolute value for each feature.
      # @return [Numo::DFloat] The scaled samples.
      def fit_transform(x, _y = nil)
        x = ::Rumale::Validation.check_convert_sample_array(x)

        fit(x).transform(x)
      end

      # Perform scaling the given samples with maximum absolute value for each feature.
      # Features whose fitted maximum absolute value is zero are left unscaled (divided by 1).
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be scaled.
      # @return [Numo::DFloat] The scaled samples.
      def transform(x)
        x = ::Rumale::Validation.check_convert_sample_array(x)

        # A feature that is zero in every training sample has a maximum absolute value of 0.
        # Dividing by it would produce NaN (0 / 0) or Infinity, so divide such features by 1.
        # The guard is applied to a copy to keep the max_abs_vec attribute untouched.
        scale_vec = @max_abs_vec.dup
        scale_vec[scale_vec.eq(0)] = 1.0
        x / scale_vec
      end
    end
  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/estimator'
|
4
|
+
require 'rumale/base/transformer'
|
5
|
+
require 'rumale/validation'
|
6
|
+
|
7
|
+
module Rumale
  module Preprocessing
    # Normalize each sample by the largest absolute value found among its features (max-norm).
    #
    # @example
    #   require 'rumale/preprocessing/max_normalizer'
    #
    #   normalizer = Rumale::Preprocessing::MaxNormalizer.new
    #   new_samples = normalizer.fit_transform(samples)
    class MaxNormalizer < ::Rumale::Base::Estimator
      include ::Rumale::Base::Transformer

      # Return the vector consisting of the max-norm of each sample given to the latest fit.
      # Norms that were zero are stored as 1.
      # @return [Numo::DFloat] (shape: [n_samples])
      attr_reader :norm_vec # :nodoc:

      # Create a new normalizer for normalizing samples to unit max-norm.
      def initialize # rubocop:disable Lint/UselessMethodDefinition
        super()
      end

      # Calculate the max-norm of each sample.
      #
      # @overload fit(x) -> MaxNormalizer
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the maximum norms.
      # @return [MaxNormalizer]
      def fit(x, _y = nil)
        samples = ::Rumale::Validation.check_convert_sample_array(x)

        # Reduce along axis 1 (features) so that one norm per sample remains.
        row_norms = samples.abs.max(1)
        # Replace zero norms by 1 so that all-zero samples can be divided safely.
        row_norms[row_norms.eq(0)] = 1
        @norm_vec = row_norms
        self
      end

      # Calculate the max-norm of each sample, and then divide each sample by its norm.
      #
      # @overload fit_transform(x) -> Numo::DFloat
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate maximum norms.
      # @return [Numo::DFloat] The normalized samples.
      def fit_transform(x, _y = nil)
        samples = ::Rumale::Validation.check_convert_sample_array(x)

        # expand_dims(1) turns the norm vector into a column so that it divides row-wise.
        samples / fit(samples).norm_vec.expand_dims(1)
      end

      # Calculate the max-norm of each sample, and then divide each sample by its norm.
      # This method simply delegates to fit_transform; it exists for the Pipeline class.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate maximum norms.
      # @return [Numo::DFloat] The normalized samples.
      def transform(x)
        fit_transform(::Rumale::Validation.check_convert_sample_array(x))
      end
    end
  end
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/estimator'
|
4
|
+
require 'rumale/base/transformer'
|
5
|
+
require 'rumale/validation'
|
6
|
+
|
7
|
+
module Rumale
  # This module consists of the classes that perform preprocessings.
  module Preprocessing
    # Normalize samples by linearly mapping each feature onto a given range.
    #
    # @example
    #   require 'rumale/preprocessing/min_max_scaler'
    #
    #   normalizer = Rumale::Preprocessing::MinMaxScaler.new(feature_range: [0.0, 1.0])
    #   new_training_samples = normalizer.fit_transform(training_samples)
    #   new_testing_samples = normalizer.transform(testing_samples)
    class MinMaxScaler < ::Rumale::Base::Estimator
      include ::Rumale::Base::Transformer

      # Return the vector consisting of the minimum value of each feature.
      # @return [Numo::DFloat] (shape: [n_features])
      attr_reader :min_vec

      # Return the vector consisting of the maximum value of each feature.
      # @return [Numo::DFloat] (shape: [n_features])
      attr_reader :max_vec

      # Creates a new normalizer for scaling each feature to a given range.
      #
      # @param feature_range [Array<Float>] The desired range of samples, given as [lower, upper].
      def initialize(feature_range: [0.0, 1.0])
        super()
        @params = { feature_range: feature_range }
      end

      # Calculate the minimum and maximum value of each feature for scaling.
      #
      # @overload fit(x) -> MinMaxScaler
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the minimum and maximum values.
      # @return [MinMaxScaler]
      def fit(x, _y = nil)
        samples = ::Rumale::Validation.check_convert_sample_array(x)

        # Both reductions run along axis 0 (samples), leaving one value per feature.
        @min_vec = samples.min(0)
        @max_vec = samples.max(0)
        self
      end

      # Calculate the minimum and maximum values, and then normalize samples to feature_range.
      #
      # @overload fit_transform(x) -> Numo::DFloat
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the minimum and maximum values.
      # @return [Numo::DFloat] The scaled samples.
      def fit_transform(x, _y = nil)
        samples = ::Rumale::Validation.check_convert_sample_array(x)

        fit(samples).transform(samples)
      end

      # Perform scaling the given samples according to feature_range.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be scaled.
      # @return [Numo::DFloat] The scaled samples.
      def transform(x)
        samples = ::Rumale::Validation.check_convert_sample_array(x)

        n_rows = samples.shape[0]
        # Width of each feature; a zero width (constant feature) is replaced by 1 to avoid dividing by zero.
        widths = @max_vec - @min_vec
        widths[widths.eq(0)] = 1.0
        offsets = @min_vec.tile(n_rows, 1)
        unit_scaled = (samples - offsets) / widths.tile(n_rows, 1)
        # Stretch and shift the unit-scaled values onto the requested range.
        range = @params[:feature_range]
        unit_scaled * (range[1] - range[0]) + range[0]
      end
    end
  end
end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/estimator'
|
4
|
+
require 'rumale/base/transformer'
|
5
|
+
|
6
|
+
module Rumale
  module Preprocessing
    # Encode categorical integer features to one-hot-vectors.
    #
    # @example
    #   require 'rumale/preprocessing/one_hot_encoder'
    #
    #   encoder = Rumale::Preprocessing::OneHotEncoder.new
    #   labels = Numo::Int32[0, 0, 2, 3, 2, 1]
    #   one_hot_vectors = encoder.fit_transform(labels)
    #   # > pp one_hot_vectors
    #   # Numo::DFloat#shape[6, 4]
    #   # [[1, 0, 0, 0],
    #   #  [1, 0, 0, 0],
    #   #  [0, 0, 1, 0],
    #   #  [0, 0, 0, 1],
    #   #  [0, 0, 1, 0],
    #   #  [0, 1, 0, 0]]
    class OneHotEncoder < ::Rumale::Base::Estimator
      include ::Rumale::Base::Transformer

      # Return the number of values for each feature (the maximum value seen in fit plus one).
      # @return [Numo::Int32] (shape: [n_features])
      attr_reader :n_values

      # Return the indices for feature values that actually occur in the training set.
      # @return [Numo::Int32]
      attr_reader :active_features

      # Return the indices to feature ranges.
      # @return [Numo::Int32] (shape: [n_features + 1])
      attr_reader :feature_indices

      # Create a new encoder for encoding categorical integer features to one-hot-vectors
      def initialize # rubocop:disable Lint/UselessMethodDefinition
        super()
      end

      # Fit one-hot-encoder to samples.
      #
      # @overload fit(x) -> OneHotEncoder
      # @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to fit one-hot-encoder.
      # @return [OneHotEncoder]
      # @raise [ArgumentError] If the samples contain a negative value.
      def fit(x, _y = nil)
        raise ArgumentError, 'Expected the input samples only consists of non-negative integer values.' if x.lt(0).any?

        @n_values = x.max(0) + 1
        # Cumulative offsets: feature j occupies columns feature_indices[j]...feature_indices[j + 1].
        @feature_indices = Numo::Int32.hstack([[0], @n_values]).cumsum
        # Keep only the columns that are set by at least one training sample.
        @active_features = encode(x, @feature_indices).sum(axis: 0).ne(0).where
        self
      end

      # Fit one-hot-encoder to samples, then encode samples into one-hot-vectors
      #
      # @overload fit_transform(x) -> Numo::DFloat
      #
      # @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to encode into one-hot-vectors.
      # @return [Numo::DFloat] The one-hot-vectors.
      # @raise [ArgumentError] If the samples contain a negative value.
      def fit_transform(x, _y = nil)
        # fit performs the non-negative check, so it is not repeated here.
        fit(x).transform(x)
      end

      # Encode samples into one-hot-vectors.
      #
      # @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to encode into one-hot-vectors.
      # @return [Numo::DFloat] The one-hot-vectors.
      # @raise [ArgumentError] If the samples contain a negative value, or a value larger than
      #   the maximum value of the corresponding feature in the samples given to fit.
      def transform(x)
        raise ArgumentError, 'Expected the input samples only consists of non-negative integer values.' if x.lt(0).any?

        # A value of n_values or more would be offset into the column range of the next feature
        # (or past the last column), so it is rejected instead of being encoded incorrectly.
        if x.ge(@n_values).any?
          raise ArgumentError,
                'Expected the input samples not to exceed the maximum value of each feature in the training samples.'
        end

        codes = encode(x, @feature_indices)
        codes[true, @active_features].dup
      end

      private

      # Build the dense one-hot matrix covering every column of every feature range.
      # The row/column pairing below relies on x being flattened row by row.
      def encode(x, indices)
        n_samples, n_features = x.shape
        # A 1-D input has no second dimension; it is handled as a single feature.
        n_features = 1 if n_features.nil?
        # Shift each value by the starting column of its feature.
        col_indices = (x + indices[0...-1]).flatten.to_a
        row_indices = Numo::Int32.new(n_samples).seq.repeat(n_features).to_a
        codes = Numo::DFloat.zeros(n_samples, indices[-1])
        row_indices.zip(col_indices).each { |r, c| codes[r, c] = 1.0 }
        codes
      end
    end
  end
end
|
@@ -0,0 +1,111 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/estimator'
|
4
|
+
require 'rumale/base/transformer'
|
5
|
+
|
6
|
+
module Rumale
  module Preprocessing
    # Transform categorical features to integer values.
    #
    # @example
    #   require 'rumale/preprocessing/ordinal_encoder'
    #
    #   encoder = Rumale::Preprocessing::OrdinalEncoder.new
    #   training_samples = [['left', 10], ['right', 15], ['right', 20]]
    #   training_samples = Numo::NArray.asarray(training_samples)
    #   encoder.fit(training_samples)
    #   p encoder.categories
    #   # [["left", "right"], [10, 15, 20]]
    #   testing_samples = [['left', 20], ['right', 10]]
    #   testing_samples = Numo::NArray.asarray(testing_samples)
    #   encoded = encoder.transform(testing_samples)
    #   p encoded
    #   # Numo::DFloat#shape=[2,2]
    #   # [[0, 2],
    #   #  [1, 0]]
    #   p encoder.inverse_transform(encoded)
    #   # Numo::RObject#shape=[2,2]
    #   # [["left", 20],
    #   #  ["right", 10]]
    class OrdinalEncoder < ::Rumale::Base::Estimator
      include ::Rumale::Base::Transformer

      # Return the array consists of categorical value each feature.
      # @return [Array] (size: n_features)
      attr_reader :categories

      # Create a new encoder that transform categorical features to integer values.
      #
      # @param categories [Nil/Array] The category list for each feature.
      #   If nil is given, extracted categories from the training data by calling the fit method are used.
      def initialize(categories: nil)
        super()
        @categories = categories
      end

      # Fit encoder by extracting the category for each feature.
      #
      # @overload fit(x) -> OrdinalEncoder
      #
      # @param x [Numo::NArray] (shape: [n_samples, n_features]) The samples consisting of categorical features.
      # @return [OrdinalEncoder]
      # @raise [ArgumentError] If x is not a 2-D array.
      def fit(x, _y = nil)
        raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2

        n_features = x.shape[1]
        # The sorted unique values of each column; the position in this list is the encoded value.
        @categories = Array.new(n_features) { |n| x[true, n].to_a.uniq.sort }
        self
      end

      # Fit encoder, then return encoded categorical features to integer values.
      #
      # @overload fit_transform(x) -> Numo::DFloat
      #
      # @param x [Numo::NArray] (shape: [n_samples, n_features]) The samples consisting of categorical features.
      # @return [Numo::DFloat] The encoded categorical features to integer values.
      # @raise [ArgumentError] If x is not a 2-D array.
      def fit_transform(x, _y = nil)
        raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2

        fit(x).transform(x)
      end

      # Encode categorical features.
      #
      # @param x [Numo::NArray] (shape: [n_samples, n_features]) The samples consisting of categorical features.
      # @return [Numo::DFloat] The encoded categorical features to integer values.
      # @raise [ArgumentError] If x is not a 2-D array, if the number of features differs from
      #   the number of category lists, or if a value is not contained in the category list of its feature.
      def transform(x)
        raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2

        n_features = x.shape[1]
        if n_features != @categories.size
          raise ArgumentError,
                'Expect the number of features and the number of categories to be equal'
        end

        transformed = Array.new(n_features) do |n|
          x[true, n].to_a.map do |v|
            # Array#index returns nil for a value that is not in the category list.
            # Reject it here rather than passing nil on to Numo::DFloat.asarray.
            @categories[n].index(v) ||
              raise(ArgumentError, "Expect all values to be known categories, but found #{v.inspect} in feature #{n}")
          end
        end

        # transformed is feature-major; transpose it back to [n_samples, n_features].
        Numo::DFloat.asarray(transformed.transpose)
      end

      # Decode values to categorical features.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples consisting of values transformed from categorical features.
      # @return [Numo::NArray] The decoded features.
      # @raise [ArgumentError] If x is not a 2-D array, or if the number of features differs from
      #   the number of category lists.
      def inverse_transform(x)
        raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2

        n_features = x.shape[1]
        if n_features != @categories.size
          raise ArgumentError,
                'Expect the number of features and the number of categories to be equal'
        end

        inv_transformed = Array.new(n_features) do |n|
          # Encoded values are used as positions in the category list of the feature.
          x[true, n].to_a.map { |i| @categories[n][i.to_i] }
        end

        Numo::NArray.asarray(inv_transformed.transpose)
      end
    end
  end
end
|
@@ -0,0 +1,114 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/estimator'
|
4
|
+
require 'rumale/base/transformer'
|
5
|
+
require 'rumale/validation'
|
6
|
+
|
7
|
+
module Rumale
  module Preprocessing
    # Generate polynomial features (a bias column, the features, and their products) from samples.
    #
    # @example
    #   require 'rumale/preprocessing/polynomial_features'
    #
    #   transformer = Rumale::Preprocessing::PolynomialFeatures.new(degree: 2)
    #   x = Numo::DFloat[[0, 1], [2, 3], [4, 5]]
    #   z = transformer.fit_transform(x)
    #   p z
    #
    #   # Numo::DFloat#shape=[3,6]
    #   # [[1, 0, 1, 0, 0, 1],
    #   #  [1, 2, 3, 4, 6, 9],
    #   #  [1, 4, 5, 16, 20, 25]]
    #
    #   # If you want to perform polynomial regression, combine it with LinearRegression as follows:
    #   require 'rumale/preprocessing/polynomial_features'
    #   require 'rumale/linear_model/linear_regression'
    #   require 'rumale/pipeline/pipeline'
    #
    #   ply = Rumale::Preprocessing::PolynomialFeatures.new(degree: 2)
    #   reg = Rumale::LinearModel::LinearRegression.new(fit_bias: false, random_seed: 1)
    #   pipeline = Rumale::Pipeline::Pipeline.new(steps: { trs: ply, est: reg })
    #   pipeline.fit(training_samples, training_values)
    #   results = pipeline.predict(testing_samples)
    #
    class PolynomialFeatures < ::Rumale::Base::Estimator
      include ::Rumale::Base::Transformer

      # Return the number of polynomial features (including the bias column).
      # @return [Integer]
      attr_reader :n_output_features

      # Create a transformer for generating polynomial features.
      #
      # @param degree [Integer] The degree of polynomial features.
      # @raise [ArgumentError] If degree is less than 1.
      def initialize(degree: 2)
        raise ArgumentError, 'Expect the value of degree parameter greater than or eqaul to 1.' if degree < 1

        super()
        @params = { degree: degree }
      end

      # Calculate the number of output polynomial features.
      #
      # @overload fit(x) -> PolynomialFeatures
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the number of output polynomial features.
      # @return [PolynomialFeatures]
      def fit(x, _y = nil)
        samples = ::Rumale::Validation.check_convert_sample_array(x)

        feature_ids = (0...samples.shape[1]).to_a
        # One bias column plus, for each order up to degree, the number of
        # combinations with repetition of the features taken at that order.
        n_terms = @params[:degree].times.sum { |t| feature_ids.repeated_combination(t + 1).size }
        @n_output_features = 1 + n_terms
        self
      end

      # Calculate the number of polynomial features, and then transform samples to polynomial features.
      #
      # @overload fit_transform(x) -> Numo::DFloat
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the number of polynomial features
      #   and be transformed.
      # @return [Numo::DFloat] (shape: [n_samples, n_output_features]) The transformed samples.
      def fit_transform(x, _y = nil)
        samples = ::Rumale::Validation.check_convert_sample_array(x)

        fit(samples).transform(samples)
      end

      # Transform the given samples to polynomial features.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be transformed.
      # @return [Numo::DFloat] (shape: [n_samples, n_output_features]) The transformed samples.
      def transform(x)
        samples = ::Rumale::Validation.check_convert_sample_array(x)

        n_samples, n_features = samples.shape
        poly = Numo::DFloat.zeros(n_samples, n_output_features)
        # Column 0 is the bias term; columns 1..n_features are the features themselves.
        poly[true, 0] = 1
        poly[true, 1..n_features] = samples
        write_col = n_features + 1
        # block_starts[d] is the first column of the previous order's terms that feature d
        # is multiplied with; the final entry marks the end of the previous order.
        block_starts = (1..(n_features + 1)).to_a
        (1...@params[:degree]).each do |_order|
          upcoming_starts = []
          (0...n_features).each do |d|
            source = block_starts[d]...block_starts.last
            target = write_col...(write_col + source.size)
            # d..d keeps the feature as a column so that it multiplies every source column.
            poly[true, target] = poly[true, source] * samples[true, d..d]
            upcoming_starts << write_col
            write_col = target.end
          end
          upcoming_starts << write_col
          block_starts = upcoming_starts
        end
        poly
      end
    end
  end
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/estimator'
|
4
|
+
require 'rumale/base/transformer'
|
5
|
+
require 'rumale/validation'
|
6
|
+
|
7
|
+
module Rumale
  # This module consists of the classes that perform preprocessings.
  module Preprocessing
    # Normalize samples by centering and scaling to unit variance.
    #
    # @example
    #   require 'rumale/preprocessing/standard_scaler'
    #
    #   normalizer = Rumale::Preprocessing::StandardScaler.new
    #   new_training_samples = normalizer.fit_transform(training_samples)
    #   new_testing_samples = normalizer.transform(testing_samples)
    class StandardScaler < ::Rumale::Base::Estimator
      include ::Rumale::Base::Transformer

      # Return the vector consists of the mean value for each feature.
      # @return [Numo::DFloat] (shape: [n_features])
      attr_reader :mean_vec

      # Return the vector consists of the standard deviation for each feature.
      # The stored values are the raw deviations computed by fit (a zero stays zero here).
      # @return [Numo::DFloat] (shape: [n_features])
      attr_reader :std_vec

      # Create a new normalizer for centering and scaling to unit variance.
      def initialize # rubocop:disable Lint/UselessMethodDefinition
        super()
      end

      # Calculate the mean value and standard deviation of each feature for scaling.
      #
      # @overload fit(x) -> StandardScaler
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features])
      #   The samples to calculate the mean values and standard deviations.
      # @return [StandardScaler]
      def fit(x, _y = nil)
        x = ::Rumale::Validation.check_convert_sample_array(x)

        # Both statistics are taken along axis 0 (samples), leaving one value per feature.
        @mean_vec = x.mean(0)
        @std_vec = x.stddev(0)
        self
      end

      # Calculate the mean values and standard deviations, and then normalize samples using them.
      #
      # @overload fit_transform(x) -> Numo::DFloat
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features])
      #   The samples to calculate the mean values and standard deviations.
      # @return [Numo::DFloat] The scaled samples.
      def fit_transform(x, _y = nil)
        x = ::Rumale::Validation.check_convert_sample_array(x)

        fit(x).transform(x)
      end

      # Perform standardization the given samples.
      # Features whose fitted standard deviation is zero are only centered (divided by 1).
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be scaled.
      # @return [Numo::DFloat] The scaled samples.
      def transform(x)
        x = ::Rumale::Validation.check_convert_sample_array(x)

        n_samples, = x.shape
        # A constant feature has a standard deviation of 0. Dividing by it would produce
        # NaN (0 / 0) or Infinity, so divide such features by 1.
        # The guard is applied to a copy to keep the std_vec attribute untouched.
        scale_vec = @std_vec.dup
        scale_vec[scale_vec.eq(0)] = 1.0
        (x - @mean_vec.tile(n_samples, 1)) / scale_vec.tile(n_samples, 1)
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'numo/narray'
|
4
|
+
|
5
|
+
require_relative 'preprocessing/bin_discretizer'
|
6
|
+
require_relative 'preprocessing/binarizer'
|
7
|
+
require_relative 'preprocessing/kernel_calculator'
|
8
|
+
require_relative 'preprocessing/l1_normalizer'
|
9
|
+
require_relative 'preprocessing/l2_normalizer'
|
10
|
+
require_relative 'preprocessing/label_binarizer'
|
11
|
+
require_relative 'preprocessing/label_encoder'
|
12
|
+
require_relative 'preprocessing/max_abs_scaler'
|
13
|
+
require_relative 'preprocessing/max_normalizer'
|
14
|
+
require_relative 'preprocessing/min_max_scaler'
|
15
|
+
require_relative 'preprocessing/one_hot_encoder'
|
16
|
+
require_relative 'preprocessing/ordinal_encoder'
|
17
|
+
require_relative 'preprocessing/polynomial_features'
|
18
|
+
require_relative 'preprocessing/standard_scaler'
|
19
|
+
require_relative 'preprocessing/version'
|