svmkit 0.7.3 → 0.8.1
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- checksums.yaml +4 -4
- data/.gitignore +0 -9
- data/.rspec +1 -0
- data/.travis.yml +4 -12
- data/LICENSE.txt +1 -1
- data/README.md +11 -13
- data/lib/svmkit.rb +3 -66
- data/svmkit.gemspec +12 -7
- metadata +16 -81
- data/.coveralls.yml +0 -1
- data/.rubocop.yml +0 -47
- data/.rubocop_todo.yml +0 -58
- data/HISTORY.md +0 -168
- data/lib/svmkit/base/base_estimator.rb +0 -13
- data/lib/svmkit/base/classifier.rb +0 -34
- data/lib/svmkit/base/cluster_analyzer.rb +0 -29
- data/lib/svmkit/base/evaluator.rb +0 -13
- data/lib/svmkit/base/regressor.rb +0 -34
- data/lib/svmkit/base/splitter.rb +0 -17
- data/lib/svmkit/base/transformer.rb +0 -18
- data/lib/svmkit/clustering/dbscan.rb +0 -127
- data/lib/svmkit/clustering/k_means.rb +0 -140
- data/lib/svmkit/dataset.rb +0 -109
- data/lib/svmkit/decomposition/nmf.rb +0 -147
- data/lib/svmkit/decomposition/pca.rb +0 -150
- data/lib/svmkit/ensemble/ada_boost_classifier.rb +0 -198
- data/lib/svmkit/ensemble/ada_boost_regressor.rb +0 -180
- data/lib/svmkit/ensemble/random_forest_classifier.rb +0 -182
- data/lib/svmkit/ensemble/random_forest_regressor.rb +0 -143
- data/lib/svmkit/evaluation_measure/accuracy.rb +0 -30
- data/lib/svmkit/evaluation_measure/f_score.rb +0 -51
- data/lib/svmkit/evaluation_measure/log_loss.rb +0 -46
- data/lib/svmkit/evaluation_measure/mean_absolute_error.rb +0 -30
- data/lib/svmkit/evaluation_measure/mean_squared_error.rb +0 -30
- data/lib/svmkit/evaluation_measure/normalized_mutual_information.rb +0 -63
- data/lib/svmkit/evaluation_measure/precision.rb +0 -51
- data/lib/svmkit/evaluation_measure/precision_recall.rb +0 -91
- data/lib/svmkit/evaluation_measure/purity.rb +0 -41
- data/lib/svmkit/evaluation_measure/r2_score.rb +0 -44
- data/lib/svmkit/evaluation_measure/recall.rb +0 -51
- data/lib/svmkit/kernel_approximation/rbf.rb +0 -136
- data/lib/svmkit/kernel_machine/kernel_svc.rb +0 -194
- data/lib/svmkit/linear_model/lasso.rb +0 -138
- data/lib/svmkit/linear_model/linear_regression.rb +0 -112
- data/lib/svmkit/linear_model/logistic_regression.rb +0 -161
- data/lib/svmkit/linear_model/ridge.rb +0 -112
- data/lib/svmkit/linear_model/sgd_linear_estimator.rb +0 -89
- data/lib/svmkit/linear_model/svc.rb +0 -184
- data/lib/svmkit/linear_model/svr.rb +0 -123
- data/lib/svmkit/model_selection/cross_validation.rb +0 -121
- data/lib/svmkit/model_selection/grid_search_cv.rb +0 -247
- data/lib/svmkit/model_selection/k_fold.rb +0 -77
- data/lib/svmkit/model_selection/stratified_k_fold.rb +0 -95
- data/lib/svmkit/multiclass/one_vs_rest_classifier.rb +0 -101
- data/lib/svmkit/naive_bayes/naive_bayes.rb +0 -316
- data/lib/svmkit/nearest_neighbors/k_neighbors_classifier.rb +0 -112
- data/lib/svmkit/nearest_neighbors/k_neighbors_regressor.rb +0 -94
- data/lib/svmkit/optimizer/nadam.rb +0 -90
- data/lib/svmkit/optimizer/rmsprop.rb +0 -69
- data/lib/svmkit/optimizer/sgd.rb +0 -65
- data/lib/svmkit/optimizer/yellow_fin.rb +0 -144
- data/lib/svmkit/pairwise_metric.rb +0 -91
- data/lib/svmkit/pipeline/pipeline.rb +0 -197
- data/lib/svmkit/polynomial_model/factorization_machine_classifier.rb +0 -262
- data/lib/svmkit/polynomial_model/factorization_machine_regressor.rb +0 -194
- data/lib/svmkit/preprocessing/l2_normalizer.rb +0 -63
- data/lib/svmkit/preprocessing/label_encoder.rb +0 -95
- data/lib/svmkit/preprocessing/min_max_scaler.rb +0 -93
- data/lib/svmkit/preprocessing/one_hot_encoder.rb +0 -99
- data/lib/svmkit/preprocessing/standard_scaler.rb +0 -87
- data/lib/svmkit/probabilistic_output.rb +0 -112
- data/lib/svmkit/tree/decision_tree_classifier.rb +0 -276
- data/lib/svmkit/tree/decision_tree_regressor.rb +0 -251
- data/lib/svmkit/tree/node.rb +0 -70
- data/lib/svmkit/utils.rb +0 -22
- data/lib/svmkit/validation.rb +0 -79
- data/lib/svmkit/values.rb +0 -13
- data/lib/svmkit/version.rb +0 -7
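
Every implementation file under data/lib/svmkit is deleted in 0.8.1, while data/lib/svmkit.rb shrinks to a few lines and the gemspec picks up new dependencies. This is consistent with the project's rename to Rumale, with the 0.8.x series acting as a thin compatibility shim. A plausible sketch of the resulting lib/svmkit.rb (hypothetical; the new file's contents are not shown in this diff):

# Hypothetical shim, not the verbatim 0.8.1 file: delegate the whole
# SVMKit namespace to the successor gem.
require 'rumale'
SVMKit = Rumale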
data/lib/svmkit/polynomial_model/factorization_machine_regressor.rb
@@ -1,194 +0,0 @@
-# frozen_string_literal: true
-
-require 'svmkit/validation'
-require 'svmkit/base/base_estimator'
-require 'svmkit/base/regressor'
-require 'svmkit/optimizer/nadam'
-
-module SVMKit
-  module PolynomialModel
-    # FactorizationMachineRegressor is a class that implements Factorization Machine
-    # with stochastic gradient descent (SGD) optimization.
-    #
-    # @example
-    #   estimator =
-    #     SVMKit::PolynomialModel::FactorizationMachineRegressor.new(
-    #      n_factors: 10, reg_param_linear: 0.1, reg_param_factor: 0.1,
-    #      max_iter: 5000, batch_size: 50, random_seed: 1)
-    #   estimator.fit(training_samples, traininig_values)
-    #   results = estimator.predict(testing_samples)
-    #
-    # *Reference*
-    # - S. Rendle, "Factorization Machines with libFM," ACM TIST, vol. 3 (3), pp. 57:1--57:22, 2012.
-    # - S. Rendle, "Factorization Machines," Proc. ICDM'10, pp. 995--1000, 2010.
-    class FactorizationMachineRegressor
-      include Base::BaseEstimator
-      include Base::Regressor
-      include Validation
-
-      # Return the factor matrix for Factorization Machine.
-      # @return [Numo::DFloat] (shape: [n_outputs, n_factors, n_features])
-      attr_reader :factor_mat
-
-      # Return the weight vector for Factorization Machine.
-      # @return [Numo::DFloat] (shape: [n_outputs, n_features])
-      attr_reader :weight_vec
-
-      # Return the bias term for Factoriazation Machine.
-      # @return [Numo::DFloat] (shape: [n_outputs])
-      attr_reader :bias_term
-
-      # Return the random generator for random sampling.
-      # @return [Random]
-      attr_reader :rng
-
-      # Create a new regressor with Factorization Machine.
-      #
-      # @param n_factors [Integer] The maximum number of iterations.
-      # @param reg_param_linear [Float] The regularization parameter for linear model.
-      # @param reg_param_factor [Float] The regularization parameter for factor matrix.
-      # @param max_iter [Integer] The maximum number of iterations.
-      # @param batch_size [Integer] The size of the mini batches.
-      # @param optimizer [Optimizer] The optimizer to calculate adaptive learning rate.
-      #   If nil is given, Nadam is used.
-      # @param random_seed [Integer] The seed value using to initialize the random generator.
-      def initialize(n_factors: 2, reg_param_linear: 1.0, reg_param_factor: 1.0,
-                     max_iter: 1000, batch_size: 10, optimizer: nil, random_seed: nil)
-        check_params_float(reg_param_linear: reg_param_linear, reg_param_factor: reg_param_factor)
-        check_params_integer(n_factors: n_factors, max_iter: max_iter, batch_size: batch_size)
-        check_params_type_or_nil(Integer, random_seed: random_seed)
-        check_params_positive(n_factors: n_factors, reg_param_linear: reg_param_linear, reg_param_factor: reg_param_factor,
-                              max_iter: max_iter, batch_size: batch_size)
-        @params = {}
-        @params[:n_factors] = n_factors
-        @params[:reg_param_linear] = reg_param_linear
-        @params[:reg_param_factor] = reg_param_factor
-        @params[:max_iter] = max_iter
-        @params[:batch_size] = batch_size
-        @params[:optimizer] = optimizer
-        @params[:optimizer] ||= Optimizer::Nadam.new
-        @params[:random_seed] = random_seed
-        @params[:random_seed] ||= srand
-        @factor_mat = nil
-        @weight_vec = nil
-        @bias_term = nil
-        @rng = Random.new(@params[:random_seed])
-      end
-
-      # Fit the model with given training data.
-      #
-      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
-      # @param y [Numo::Int32] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
-      # @return [FactorizationMachineRegressor] The learned regressor itself.
-      def fit(x, y)
-        check_sample_array(x)
-        check_tvalue_array(y)
-        check_sample_tvalue_size(x, y)
-
-        n_outputs = y.shape[1].nil? ? 1 : y.shape[1]
-        _n_samples, n_features = x.shape
-
-        if n_outputs > 1
-          @factor_mat = Numo::DFloat.zeros(n_outputs, @params[:n_factors], n_features)
-          @weight_vec = Numo::DFloat.zeros(n_outputs, n_features)
-          @bias_term = Numo::DFloat.zeros(n_outputs)
-          n_outputs.times { |n| @factor_mat[n, true, true], @weight_vec[n, true], @bias_term[n] = single_fit(x, y[true, n]) }
-        else
-          @factor_mat, @weight_vec, @bias_term = single_fit(x, y)
-        end
-
-        self
-      end
-
-      # Predict values for samples.
-      #
-      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
-      # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
-      def predict(x)
-        check_sample_array(x)
-        linear_term = @bias_term + x.dot(@weight_vec.transpose)
-        factor_term = if @weight_vec.shape[1].nil?
-                        0.5 * (@factor_mat.dot(x.transpose)**2 - (@factor_mat**2).dot(x.transpose**2)).sum(0)
-                      else
-                        0.5 * (@factor_mat.dot(x.transpose)**2 - (@factor_mat**2).dot(x.transpose**2)).sum(1).transpose
-                      end
-        linear_term + factor_term
-      end
-
-      # Dump marshal data.
-      # @return [Hash] The marshal data about FactorizationMachineRegressor.
-      def marshal_dump
-        { params: @params,
-          factor_mat: @factor_mat,
-          weight_vec: @weight_vec,
-          bias_term: @bias_term,
-          rng: @rng }
-      end
-
-      # Load marshal data.
-      # @return [nil]
-      def marshal_load(obj)
-        @params = obj[:params]
-        @factor_mat = obj[:factor_mat]
-        @weight_vec = obj[:weight_vec]
-        @bias_term = obj[:bias_term]
-        @rng = obj[:rng]
-        nil
-      end
-
-      private
-
-      def single_fit(x, y)
-        # Initialize some variables.
-        n_samples, n_features = x.shape
-        rand_ids = [*0...n_samples].shuffle(random: @rng)
-        weight_vec = Numo::DFloat.zeros(n_features + 1)
-        factor_mat = Numo::DFloat.zeros(@params[:n_factors], n_features)
-        weight_optimizer = @params[:optimizer].dup
-        factor_optimizers = Array.new(@params[:n_factors]) { @params[:optimizer].dup }
-        # Start optimization.
-        @params[:max_iter].times do |_t|
-          # Random sampling.
-          subset_ids = rand_ids.shift(@params[:batch_size])
-          rand_ids.concat(subset_ids)
-          data = x[subset_ids, true]
-          ex_data = expand_feature(data)
-          values = y[subset_ids]
-          # Calculate gradients for loss function.
-          loss_grad = loss_gradient(data, ex_data, values, factor_mat, weight_vec)
-          next if loss_grad.ne(0.0).count.zero?
-          # Update each parameter.
-          weight_vec = weight_optimizer.call(weight_vec, weight_gradient(loss_grad, ex_data, weight_vec))
-          @params[:n_factors].times do |n|
-            factor_mat[n, true] = factor_optimizers[n].call(factor_mat[n, true],
-                                                            factor_gradient(loss_grad, data, factor_mat[n, true]))
-          end
-        end
-        [factor_mat, *split_weight_vec_bias(weight_vec)]
-      end
-
-      def loss_gradient(x, ex_x, y, factor, weight)
-        z = ex_x.dot(weight) + 0.5 * (factor.dot(x.transpose)**2 - (factor**2).dot(x.transpose**2)).sum(0)
-        2.0 * (z - y)
-      end
-
-      def weight_gradient(loss_grad, data, weight)
-        (loss_grad.expand_dims(1) * data).mean(0) + @params[:reg_param_linear] * weight
-      end
-
-      def factor_gradient(loss_grad, data, factor)
-        (loss_grad.expand_dims(1) * (data * data.dot(factor).expand_dims(1) - factor * (data**2))).mean(0) + @params[:reg_param_factor] * factor
-      end
-
-      def expand_feature(x)
-        Numo::NArray.hstack([x, Numo::DFloat.ones([x.shape[0], 1])])
-      end
-
-      def split_weight_vec_bias(weight_vec)
-        weights = weight_vec[0...-1].dup
-        bias = weight_vec[-1]
-        [weights, bias]
-      end
-    end
-  end
-end
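
The predict and loss_gradient methods above use Rendle's O(k * n) rewrite of the pairwise interaction term: sum over i < j of (v_i . v_j) * x_i * x_j equals 0.5 * sum over factors f of ((v_f . x)**2 - (v_f**2 . x**2)). A minimal sketch verifying the identity with numo-narray (not part of the diff; variable names are illustrative):

require 'numo/narray'

# Compare the factored form against the naive double loop over feature pairs.
# v_i below is the i-th column of v, i.e. feature i's factor vector.
n_factors, n_features = 4, 6
v = Numo::DFloat.new(n_factors, n_features).rand - 0.5  # factor matrix
x = Numo::DFloat.new(n_features).rand                   # one sample

fast = 0.5 * (v.dot(x)**2 - (v**2).dot(x**2)).sum

naive = 0.0
n_features.times do |i|
  ((i + 1)...n_features).each do |j|
    naive += v[true, i].dot(v[true, j]) * x[i] * x[j]
  end
end

puts (fast - naive).abs  # negligibly small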
data/lib/svmkit/preprocessing/l2_normalizer.rb
@@ -1,63 +0,0 @@
-# frozen_string_literal: true
-
-require 'svmkit/validation'
-require 'svmkit/base/base_estimator'
-require 'svmkit/base/transformer'
-
-module SVMKit
-  # This module consists of the classes that perform preprocessings.
-  module Preprocessing
-    # Normalize samples to unit L2-norm.
-    #
-    # @example
-    #   normalizer = SVMKit::Preprocessing::StandardScaler.new
-    #   new_samples = normalizer.fit_transform(samples)
-    class L2Normalizer
-      include Base::BaseEstimator
-      include Base::Transformer
-
-      # Return the vector consists of L2-norm for each sample.
-      # @return [Numo::DFloat] (shape: [n_samples])
-      attr_reader :norm_vec # :nodoc:
-
-      # Create a new normalizer for normaliing to unit L2-norm.
-      def initialize
-        @params = {}
-        @norm_vec = nil
-      end
-
-      # Calculate L2-norms of each sample.
-      #
-      # @overload fit(x) -> L2Normalizer
-      #
-      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate L2-norms.
-      # @return [L2Normalizer]
-      def fit(x, _y = nil)
-        SVMKit::Validation.check_sample_array(x)
-        @norm_vec = Numo::NMath.sqrt((x**2).sum(1))
-        self
-      end
-
-      # Calculate L2-norms of each sample, and then normalize samples to unit L2-norm.
-      #
-      # @overload fit_transform(x) -> Numo::DFloat
-      #
-      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate L2-norms.
-      # @return [Numo::DFloat] The normalized samples.
-      def fit_transform(x, _y = nil)
-        SVMKit::Validation.check_sample_array(x)
-        fit(x)
-        x / @norm_vec.tile(x.shape[1], 1).transpose
-      end
-
-      # Calculate L2-norms of each sample, and then normalize samples to unit L2-norm.
-      # This method calls the fit_transform method. This method exists for the Pipeline class.
-      #
-      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate L2-norms.
-      # @return [Numo::DFloat] The normalized samples.
-      def transform(x)
-        fit_transform(x)
-      end
-    end
-  end
-end
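
The fit_transform above divides each row by its L2 norm; with Numo broadcasting, the tile/transpose pair can be written more directly. A minimal sketch (not part of the diff):

require 'numo/narray'

x = Numo::DFloat[[3.0, 4.0], [1.0, 0.0]]
norms = Numo::NMath.sqrt((x**2).sum(1))   # per-sample L2 norms => [5, 1]
puts (x / norms.expand_dims(1)).inspect   # each row rescaled to unit norm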
data/lib/svmkit/preprocessing/label_encoder.rb
@@ -1,95 +0,0 @@
-# frozen_string_literal: true
-
-require 'svmkit/validation'
-require 'svmkit/base/base_estimator'
-require 'svmkit/base/transformer'
-
-module SVMKit
-  module Preprocessing
-    # Encode labels to values between 0 and n_classes - 1.
-    #
-    # @example
-    #   encoder = SVMKit::Preprocessing::LabelEncoder.new
-    #   labels = Numo::Int32[1, 8, 8, 15, 0]
-    #   encoded_labels = encoder.fit_transform(labels)
-    #   # > pp encoded_labels
-    #   # Numo::Int32#shape=[5]
-    #   # [1, 2, 2, 3, 0]
-    #   decoded_labels = encoder.inverse_transform(encoded_labels)
-    #   # > pp decoded_labels
-    #   # [1, 8, 8, 15, 0]
-    class LabelEncoder
-      include Base::BaseEstimator
-      include Base::Transformer
-
-      # Return the class labels.
-      # @return [Array] (size: [n_classes])
-      attr_reader :classes
-
-      # Create a new encoder for encoding labels to values between 0 and n_classes - 1.
-      def initialize
-        @params = {}
-        @classes = nil
-      end
-
-      # Fit label-encoder to labels.
-      #
-      # @overload fit(x) -> LabelEncoder
-      #
-      # @param x [Array] (shape: [n_samples]) The labels to fit label-encoder.
-      # @return [LabelEncoder]
-      def fit(x, _y = nil)
-        x = x.to_a if x.is_a?(Numo::NArray)
-        SVMKit::Validation.check_params_type(Array, x: x)
-        @classes = x.sort.uniq
-        self
-      end
-
-      # Fit label-encoder to labels, then return encoded labels.
-      #
-      # @overload fit_transform(x) -> Numo::DFloat
-      #
-      # @param x [Array] (shape: [n_samples]) The labels to fit label-encoder.
-      # @return [Numo::Int32] The encoded labels.
-      def fit_transform(x, _y = nil)
-        x = x.to_a if x.is_a?(Numo::NArray)
-        SVMKit::Validation.check_params_type(Array, x: x)
-        fit(x).transform(x)
-      end
-
-      # Encode labels.
-      #
-      # @param x [Array] (shape: [n_samples]) The labels to be encoded.
-      # @return [Numo::Int32] The encoded labels.
-      def transform(x)
-        x = x.to_a if x.is_a?(Numo::NArray)
-        SVMKit::Validation.check_params_type(Array, x: x)
-        Numo::Int32[*(x.map { |v| @classes.index(v) })]
-      end
-
-      # Decode encoded labels.
-      #
-      # @param x [Numo::Int32] (shape: [n_samples]) The labels to be decoded.
-      # @return [Array] The decoded labels.
-      def inverse_transform(x)
-        SVMKit::Validation.check_label_array(x)
-        x.to_a.map { |n| @classes[n] }
-      end
-
-      # Dump marshal data.
-      # @return [Hash] The marshal data about LabelEncoder
-      def marshal_dump
-        { params: @params,
-          classes: @classes }
-      end
-
-      # Load marshal data.
-      # @return [nil]
-      def marshal_load(obj)
-        @params = obj[:params]
-        @classes = obj[:classes]
-        nil
-      end
-    end
-  end
-end
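
The encoding is just index lookup into the sorted unique labels, so the whole scheme fits in a few lines of plain Ruby (not part of the diff; values match the class's own @example):

labels  = [1, 8, 8, 15, 0]
classes = labels.sort.uniq                      # => [0, 1, 8, 15]
encoded = labels.map { |v| classes.index(v) }   # => [1, 2, 2, 3, 0]
decoded = encoded.map { |n| classes[n] }        # => [1, 8, 8, 15, 0]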
data/lib/svmkit/preprocessing/min_max_scaler.rb
@@ -1,93 +0,0 @@
-# frozen_string_literal: true
-
-require 'svmkit/validation'
-require 'svmkit/base/base_estimator'
-require 'svmkit/base/transformer'
-
-module SVMKit
-  # This module consists of the classes that perform preprocessings.
-  module Preprocessing
-    # Normalize samples by scaling each feature to a given range.
-    #
-    # @example
-    #   normalizer = SVMKit::Preprocessing::MinMaxScaler.new(feature_range: [0.0, 1.0])
-    #   new_training_samples = normalizer.fit_transform(training_samples)
-    #   new_testing_samples = normalizer.transform(testing_samples)
-    class MinMaxScaler
-      include Base::BaseEstimator
-      include Base::Transformer
-
-      # Return the vector consists of the minimum value for each feature.
-      # @return [Numo::DFloat] (shape: [n_features])
-      attr_reader :min_vec
-
-      # Return the vector consists of the maximum value for each feature.
-      # @return [Numo::DFloat] (shape: [n_features])
-      attr_reader :max_vec
-
-      # Creates a new normalizer for scaling each feature to a given range.
-      #
-      # @param feature_range [Array<Float>] The desired range of samples.
-      def initialize(feature_range: [0.0, 1.0])
-        SVMKit::Validation.check_params_type(Array, feature_range: feature_range)
-        @params = {}
-        @params[:feature_range] = feature_range
-        @min_vec = nil
-        @max_vec = nil
-      end
-
-      # Calculate the minimum and maximum value of each feature for scaling.
-      #
-      # @overload fit(x) -> MinMaxScaler
-      #
-      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the minimum and maximum values.
-      # @return [MinMaxScaler]
-      def fit(x, _y = nil)
-        SVMKit::Validation.check_sample_array(x)
-        @min_vec = x.min(0)
-        @max_vec = x.max(0)
-        self
-      end
-
-      # Calculate the minimum and maximum values, and then normalize samples to feature_range.
-      #
-      # @overload fit_transform(x) -> Numo::DFloat
-      #
-      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the minimum and maximum values.
-      # @return [Numo::DFloat] The scaled samples.
-      def fit_transform(x, _y = nil)
-        SVMKit::Validation.check_sample_array(x)
-        fit(x).transform(x)
-      end
-
-      # Perform scaling the given samples according to feature_range.
-      #
-      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be scaled.
-      # @return [Numo::DFloat] The scaled samples.
-      def transform(x)
-        SVMKit::Validation.check_sample_array(x)
-        n_samples, = x.shape
-        dif_vec = @max_vec - @min_vec
-        nx = (x - @min_vec.tile(n_samples, 1)) / dif_vec.tile(n_samples, 1)
-        nx * (@params[:feature_range][1] - @params[:feature_range][0]) + @params[:feature_range][0]
-      end
-
-      # Dump marshal data.
-      # @return [Hash] The marshal data about MinMaxScaler.
-      def marshal_dump
-        { params: @params,
-          min_vec: @min_vec,
-          max_vec: @max_vec }
-      end
-
-      # Load marshal data.
-      # @return [nil]
-      def marshal_load(obj)
-        @params = obj[:params]
-        @min_vec = obj[:min_vec]
-        @max_vec = obj[:max_vec]
-        nil
-      end
-    end
-  end
-end
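
The transform above is the standard min-max formula, (x - min) / (max - min), rescaled into feature_range. A minimal numo-narray sketch (not part of the diff; broadcasting stands in for the tile calls):

require 'numo/narray'

x = Numo::DFloat[[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]]
min_vec = x.min(0)
max_vec = x.max(0)
lo, hi = 0.0, 1.0                         # feature_range
nx = (x - min_vec) / (max_vec - min_vec)  # each column mapped to [0, 1]
puts (nx * (hi - lo) + lo).inspect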
data/lib/svmkit/preprocessing/one_hot_encoder.rb
@@ -1,99 +0,0 @@
-# frozen_string_literal: true
-
-require 'svmkit/validation'
-require 'svmkit/base/base_estimator'
-require 'svmkit/base/transformer'
-
-module SVMKit
-  module Preprocessing
-    # Encode categorical integer features to one-hot-vectors.
-    #
-    # @example
-    #   encoder = SVMKit::Preprocessing::OneHotEncoder.new
-    #   labels = Numo::Int32[0, 0, 2, 3, 2, 1]
-    #   one_hot_vectors = encoder.fit_transform(labels)
-    #   # > pp one_hot_vectors
-    #   # Numo::DFloat#shape[6, 4]
-    #   # [[1, 0, 0, 0],
-    #   #  [1, 0, 0, 0],
-    #   #  [0, 0, 1, 0],
-    #   #  [0, 0, 0, 1],
-    #   #  [0, 0, 1, 0],
-    #   #  [0, 1, 0, 0]]
-    class OneHotEncoder
-      include Base::BaseEstimator
-      include Base::Transformer
-
-      # Return the maximum values for each feature.
-      # @return [Numo::Int32] (shape: [n_features])
-      attr_reader :n_values
-
-      # Return the indices to feature ranges.
-      # @return [Numo::Int32] (shape: [n_features + 1])
-      attr_reader :feature_indices
-
-      # Create a new encoder for encoding categorical integer features to one-hot-vectors
-      def initialize
-        @params = {}
-        @n_values = nil
-        @feature_indices = nil
-      end
-
-      # Fit one-hot-encoder to samples.
-      #
-      # @overload fit(x) -> OneHotEncoder
-      #
-      # @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to fit one-hot-encoder.
-      # @return [OneHotEncoder]
-      def fit(x, _y = nil)
-        SVMKit::Validation.check_params_type(Numo::Int32, x: x)
-        @n_values = x.max(0) + 1
-        @feature_indices = Numo::Int32.hstack([[0], @n_values]).cumsum
-        self
-      end
-
-      # Fit one-hot-encoder to samples, then encode samples into one-hot-vectors
-      #
-      # @overload fit_transform(x) -> Numo::DFloat
-      #
-      # @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to encode into one-hot-vectors.
-      # @return [Numo::DFloat] The one-hot-vectors.
-      def fit_transform(x, _y = nil)
-        SVMKit::Validation.check_params_type(Numo::Int32, x: x)
-        fit(x).transform(x)
-      end
-
-      # Encode samples into one-hot-vectors.
-      #
-      # @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to encode into one-hot-vectors.
-      # @return [Numo::DFloat] The one-hot-vectors.
-      def transform(x)
-        SVMKit::Validation.check_params_type(Numo::Int32, x: x)
-        n_samples, n_features = x.shape
-        n_features = 1 if n_features.nil?
-        column_indices = (x + @feature_indices[0...-1]).flatten.to_a
-        row_indices = Numo::Int32.new(n_samples).seq.repeat(n_features).to_a
-        codes = Numo::DFloat.zeros(n_samples, @feature_indices[-1])
-        row_indices.zip(column_indices).each { |r, c| codes[r, c] = 1.0 }
-        codes
-      end
-
-      # Dump marshal data.
-      # @return [Hash] The marshal data about OneHotEncoder.
-      def marshal_dump
-        { params: @params,
-          n_values: @n_values,
-          feature_indices: @feature_indices }
-      end
-
-      # Load marshal data.
-      # @return [nil]
-      def marshal_load(obj)
-        @params = obj[:params]
-        @n_values = obj[:n_values]
-        @feature_indices = obj[:feature_indices]
-        nil
-      end
-    end
-  end
-end
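
The encoder above lays out one block of columns per feature, with block offsets taken from the cumulative sum of n_values; the index arithmetic in fit and transform reduces to the following sketch (not part of the diff):

require 'numo/narray'

x = Numo::Int32[[0, 1], [2, 0]]  # 2 samples, 2 categorical features
n_values = x.max(0) + 1          # => [3, 2]: 3 columns for feature 0, 2 for feature 1
feature_indices = Numo::Int32.hstack([[0], n_values]).cumsum  # => [0, 3, 5]
column_indices = (x + feature_indices[0...-1]).flatten.to_a   # => [0, 4, 2, 3]
# Sample [2, 0] sets columns 2 and 3 of a 5-column row: [0, 0, 1, 1, 0]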