svmkit 0.7.3 → 0.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -9
  3. data/.rspec +1 -0
  4. data/.travis.yml +4 -12
  5. data/LICENSE.txt +1 -1
  6. data/README.md +11 -13
  7. data/lib/svmkit.rb +3 -66
  8. data/svmkit.gemspec +12 -7
  9. metadata +16 -81
  10. data/.coveralls.yml +0 -1
  11. data/.rubocop.yml +0 -47
  12. data/.rubocop_todo.yml +0 -58
  13. data/HISTORY.md +0 -168
  14. data/lib/svmkit/base/base_estimator.rb +0 -13
  15. data/lib/svmkit/base/classifier.rb +0 -34
  16. data/lib/svmkit/base/cluster_analyzer.rb +0 -29
  17. data/lib/svmkit/base/evaluator.rb +0 -13
  18. data/lib/svmkit/base/regressor.rb +0 -34
  19. data/lib/svmkit/base/splitter.rb +0 -17
  20. data/lib/svmkit/base/transformer.rb +0 -18
  21. data/lib/svmkit/clustering/dbscan.rb +0 -127
  22. data/lib/svmkit/clustering/k_means.rb +0 -140
  23. data/lib/svmkit/dataset.rb +0 -109
  24. data/lib/svmkit/decomposition/nmf.rb +0 -147
  25. data/lib/svmkit/decomposition/pca.rb +0 -150
  26. data/lib/svmkit/ensemble/ada_boost_classifier.rb +0 -198
  27. data/lib/svmkit/ensemble/ada_boost_regressor.rb +0 -180
  28. data/lib/svmkit/ensemble/random_forest_classifier.rb +0 -182
  29. data/lib/svmkit/ensemble/random_forest_regressor.rb +0 -143
  30. data/lib/svmkit/evaluation_measure/accuracy.rb +0 -30
  31. data/lib/svmkit/evaluation_measure/f_score.rb +0 -51
  32. data/lib/svmkit/evaluation_measure/log_loss.rb +0 -46
  33. data/lib/svmkit/evaluation_measure/mean_absolute_error.rb +0 -30
  34. data/lib/svmkit/evaluation_measure/mean_squared_error.rb +0 -30
  35. data/lib/svmkit/evaluation_measure/normalized_mutual_information.rb +0 -63
  36. data/lib/svmkit/evaluation_measure/precision.rb +0 -51
  37. data/lib/svmkit/evaluation_measure/precision_recall.rb +0 -91
  38. data/lib/svmkit/evaluation_measure/purity.rb +0 -41
  39. data/lib/svmkit/evaluation_measure/r2_score.rb +0 -44
  40. data/lib/svmkit/evaluation_measure/recall.rb +0 -51
  41. data/lib/svmkit/kernel_approximation/rbf.rb +0 -136
  42. data/lib/svmkit/kernel_machine/kernel_svc.rb +0 -194
  43. data/lib/svmkit/linear_model/lasso.rb +0 -138
  44. data/lib/svmkit/linear_model/linear_regression.rb +0 -112
  45. data/lib/svmkit/linear_model/logistic_regression.rb +0 -161
  46. data/lib/svmkit/linear_model/ridge.rb +0 -112
  47. data/lib/svmkit/linear_model/sgd_linear_estimator.rb +0 -89
  48. data/lib/svmkit/linear_model/svc.rb +0 -184
  49. data/lib/svmkit/linear_model/svr.rb +0 -123
  50. data/lib/svmkit/model_selection/cross_validation.rb +0 -121
  51. data/lib/svmkit/model_selection/grid_search_cv.rb +0 -247
  52. data/lib/svmkit/model_selection/k_fold.rb +0 -77
  53. data/lib/svmkit/model_selection/stratified_k_fold.rb +0 -95
  54. data/lib/svmkit/multiclass/one_vs_rest_classifier.rb +0 -101
  55. data/lib/svmkit/naive_bayes/naive_bayes.rb +0 -316
  56. data/lib/svmkit/nearest_neighbors/k_neighbors_classifier.rb +0 -112
  57. data/lib/svmkit/nearest_neighbors/k_neighbors_regressor.rb +0 -94
  58. data/lib/svmkit/optimizer/nadam.rb +0 -90
  59. data/lib/svmkit/optimizer/rmsprop.rb +0 -69
  60. data/lib/svmkit/optimizer/sgd.rb +0 -65
  61. data/lib/svmkit/optimizer/yellow_fin.rb +0 -144
  62. data/lib/svmkit/pairwise_metric.rb +0 -91
  63. data/lib/svmkit/pipeline/pipeline.rb +0 -197
  64. data/lib/svmkit/polynomial_model/factorization_machine_classifier.rb +0 -262
  65. data/lib/svmkit/polynomial_model/factorization_machine_regressor.rb +0 -194
  66. data/lib/svmkit/preprocessing/l2_normalizer.rb +0 -63
  67. data/lib/svmkit/preprocessing/label_encoder.rb +0 -95
  68. data/lib/svmkit/preprocessing/min_max_scaler.rb +0 -93
  69. data/lib/svmkit/preprocessing/one_hot_encoder.rb +0 -99
  70. data/lib/svmkit/preprocessing/standard_scaler.rb +0 -87
  71. data/lib/svmkit/probabilistic_output.rb +0 -112
  72. data/lib/svmkit/tree/decision_tree_classifier.rb +0 -276
  73. data/lib/svmkit/tree/decision_tree_regressor.rb +0 -251
  74. data/lib/svmkit/tree/node.rb +0 -70
  75. data/lib/svmkit/utils.rb +0 -22
  76. data/lib/svmkit/validation.rb +0 -79
  77. data/lib/svmkit/values.rb +0 -13
  78. data/lib/svmkit/version.rb +0 -7
@@ -1,194 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'svmkit/validation'
4
- require 'svmkit/base/base_estimator'
5
- require 'svmkit/base/regressor'
6
- require 'svmkit/optimizer/nadam'
7
-
8
- module SVMKit
9
- module PolynomialModel
10
- # FactorizationMachineRegressor is a class that implements Factorization Machine
11
- # with stochastic gradient descent (SGD) optimization.
12
- #
13
- # @example
14
- # estimator =
15
- # SVMKit::PolynomialModel::FactorizationMachineRegressor.new(
16
- # n_factors: 10, reg_param_linear: 0.1, reg_param_factor: 0.1,
17
- # max_iter: 5000, batch_size: 50, random_seed: 1)
18
- # estimator.fit(training_samples, training_values)
19
- # results = estimator.predict(testing_samples)
20
- #
21
- # *Reference*
22
- # - S. Rendle, "Factorization Machines with libFM," ACM TIST, vol. 3 (3), pp. 57:1--57:22, 2012.
23
- # - S. Rendle, "Factorization Machines," Proc. ICDM'10, pp. 995--1000, 2010.
24
- class FactorizationMachineRegressor
25
- include Base::BaseEstimator
26
- include Base::Regressor
27
- include Validation
28
-
29
- # Return the factor matrix for Factorization Machine.
30
- # @return [Numo::DFloat] (shape: [n_outputs, n_factors, n_features])
31
- attr_reader :factor_mat
32
-
33
- # Return the weight vector for Factorization Machine.
34
- # @return [Numo::DFloat] (shape: [n_outputs, n_features])
35
- attr_reader :weight_vec
36
-
37
- # Return the bias term for Factorization Machine.
38
- # @return [Numo::DFloat] (shape: [n_outputs])
39
- attr_reader :bias_term
40
-
41
- # Return the random generator for random sampling.
42
- # @return [Random]
43
- attr_reader :rng
44
-
45
- # Create a new regressor with Factorization Machine.
46
- #
47
- # @param n_factors [Integer] The number of factors (rows of the factor matrix).
48
- # @param reg_param_linear [Float] The regularization parameter for linear model.
49
- # @param reg_param_factor [Float] The regularization parameter for factor matrix.
50
- # @param max_iter [Integer] The maximum number of iterations.
51
- # @param batch_size [Integer] The size of the mini batches.
52
- # @param optimizer [Optimizer] The optimizer to calculate adaptive learning rate.
53
- # If nil is given, Nadam is used.
54
- # @param random_seed [Integer] The seed value used to initialize the random generator.
55
- def initialize(n_factors: 2, reg_param_linear: 1.0, reg_param_factor: 1.0,
56
- max_iter: 1000, batch_size: 10, optimizer: nil, random_seed: nil)
57
- check_params_float(reg_param_linear: reg_param_linear, reg_param_factor: reg_param_factor)
58
- check_params_integer(n_factors: n_factors, max_iter: max_iter, batch_size: batch_size)
59
- check_params_type_or_nil(Integer, random_seed: random_seed)
60
- check_params_positive(n_factors: n_factors, reg_param_linear: reg_param_linear, reg_param_factor: reg_param_factor,
61
- max_iter: max_iter, batch_size: batch_size)
62
- @params = {}
63
- @params[:n_factors] = n_factors
64
- @params[:reg_param_linear] = reg_param_linear
65
- @params[:reg_param_factor] = reg_param_factor
66
- @params[:max_iter] = max_iter
67
- @params[:batch_size] = batch_size
68
- @params[:optimizer] = optimizer
69
- @params[:optimizer] ||= Optimizer::Nadam.new
70
- @params[:random_seed] = random_seed
71
- @params[:random_seed] ||= srand
72
- @factor_mat = nil
73
- @weight_vec = nil
74
- @bias_term = nil
75
- @rng = Random.new(@params[:random_seed])
76
- end
77
-
78
- # Fit the model with given training data.
79
- #
80
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
81
- # @param y [Numo::Int32] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
82
- # @return [FactorizationMachineRegressor] The learned regressor itself.
83
- def fit(x, y)
84
- check_sample_array(x)
85
- check_tvalue_array(y)
86
- check_sample_tvalue_size(x, y)
87
-
88
- n_outputs = y.shape[1].nil? ? 1 : y.shape[1]
89
- _n_samples, n_features = x.shape
90
-
91
- if n_outputs > 1
92
- @factor_mat = Numo::DFloat.zeros(n_outputs, @params[:n_factors], n_features)
93
- @weight_vec = Numo::DFloat.zeros(n_outputs, n_features)
94
- @bias_term = Numo::DFloat.zeros(n_outputs)
95
- n_outputs.times { |n| @factor_mat[n, true, true], @weight_vec[n, true], @bias_term[n] = single_fit(x, y[true, n]) }
96
- else
97
- @factor_mat, @weight_vec, @bias_term = single_fit(x, y)
98
- end
99
-
100
- self
101
- end
102
-
103
- # Predict values for samples.
104
- #
105
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
106
- # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
107
- def predict(x)
108
- check_sample_array(x)
109
- linear_term = @bias_term + x.dot(@weight_vec.transpose)
110
- factor_term = if @weight_vec.shape[1].nil?
111
- 0.5 * (@factor_mat.dot(x.transpose)**2 - (@factor_mat**2).dot(x.transpose**2)).sum(0)
112
- else
113
- 0.5 * (@factor_mat.dot(x.transpose)**2 - (@factor_mat**2).dot(x.transpose**2)).sum(1).transpose
114
- end
115
- linear_term + factor_term
116
- end
117
-
118
- # Dump marshal data.
119
- # @return [Hash] The marshal data about FactorizationMachineRegressor.
120
- def marshal_dump
121
- { params: @params,
122
- factor_mat: @factor_mat,
123
- weight_vec: @weight_vec,
124
- bias_term: @bias_term,
125
- rng: @rng }
126
- end
127
-
128
- # Load marshal data.
129
- # @return [nil]
130
- def marshal_load(obj)
131
- @params = obj[:params]
132
- @factor_mat = obj[:factor_mat]
133
- @weight_vec = obj[:weight_vec]
134
- @bias_term = obj[:bias_term]
135
- @rng = obj[:rng]
136
- nil
137
- end
138
-
139
- private
140
-
141
- def single_fit(x, y)
142
- # Initialize some variables.
143
- n_samples, n_features = x.shape
144
- rand_ids = [*0...n_samples].shuffle(random: @rng)
145
- weight_vec = Numo::DFloat.zeros(n_features + 1)
146
- factor_mat = Numo::DFloat.zeros(@params[:n_factors], n_features)
147
- weight_optimizer = @params[:optimizer].dup
148
- factor_optimizers = Array.new(@params[:n_factors]) { @params[:optimizer].dup }
149
- # Start optimization.
150
- @params[:max_iter].times do |_t|
151
- # Random sampling.
152
- subset_ids = rand_ids.shift(@params[:batch_size])
153
- rand_ids.concat(subset_ids)
154
- data = x[subset_ids, true]
155
- ex_data = expand_feature(data)
156
- values = y[subset_ids]
157
- # Calculate gradients for loss function.
158
- loss_grad = loss_gradient(data, ex_data, values, factor_mat, weight_vec)
159
- next if loss_grad.ne(0.0).count.zero?
160
- # Update each parameter.
161
- weight_vec = weight_optimizer.call(weight_vec, weight_gradient(loss_grad, ex_data, weight_vec))
162
- @params[:n_factors].times do |n|
163
- factor_mat[n, true] = factor_optimizers[n].call(factor_mat[n, true],
164
- factor_gradient(loss_grad, data, factor_mat[n, true]))
165
- end
166
- end
167
- [factor_mat, *split_weight_vec_bias(weight_vec)]
168
- end
169
-
170
- def loss_gradient(x, ex_x, y, factor, weight)
171
- z = ex_x.dot(weight) + 0.5 * (factor.dot(x.transpose)**2 - (factor**2).dot(x.transpose**2)).sum(0)
172
- 2.0 * (z - y)
173
- end
174
-
175
- def weight_gradient(loss_grad, data, weight)
176
- (loss_grad.expand_dims(1) * data).mean(0) + @params[:reg_param_linear] * weight
177
- end
178
-
179
- def factor_gradient(loss_grad, data, factor)
180
- (loss_grad.expand_dims(1) * (data * data.dot(factor).expand_dims(1) - factor * (data**2))).mean(0) + @params[:reg_param_factor] * factor
181
- end
182
-
183
- def expand_feature(x)
184
- Numo::NArray.hstack([x, Numo::DFloat.ones([x.shape[0], 1])])
185
- end
186
-
187
- def split_weight_vec_bias(weight_vec)
188
- weights = weight_vec[0...-1].dup
189
- bias = weight_vec[-1]
190
- [weights, bias]
191
- end
192
- end
193
- end
194
- end
@@ -1,63 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'svmkit/validation'
4
- require 'svmkit/base/base_estimator'
5
- require 'svmkit/base/transformer'
6
-
7
- module SVMKit
8
- # This module consists of the classes that perform preprocessing.
9
- module Preprocessing
10
- # Normalize samples to unit L2-norm.
11
- #
12
- # @example
13
- # normalizer = SVMKit::Preprocessing::L2Normalizer.new
14
- # new_samples = normalizer.fit_transform(samples)
15
- class L2Normalizer
16
- include Base::BaseEstimator
17
- include Base::Transformer
18
-
19
- # Return the vector consisting of the L2-norm of each sample.
20
- # @return [Numo::DFloat] (shape: [n_samples])
21
- attr_reader :norm_vec # :nodoc:
22
-
23
- # Create a new normalizer for normalizing to unit L2-norm.
24
- def initialize
25
- @params = {}
26
- @norm_vec = nil
27
- end
28
-
29
- # Calculate L2-norms of each sample.
30
- #
31
- # @overload fit(x) -> L2Normalizer
32
- #
33
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate L2-norms.
34
- # @return [L2Normalizer]
35
- def fit(x, _y = nil)
36
- SVMKit::Validation.check_sample_array(x)
37
- @norm_vec = Numo::NMath.sqrt((x**2).sum(1))
38
- self
39
- end
40
-
41
- # Calculate L2-norms of each sample, and then normalize samples to unit L2-norm.
42
- #
43
- # @overload fit_transform(x) -> Numo::DFloat
44
- #
45
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate L2-norms.
46
- # @return [Numo::DFloat] The normalized samples.
47
- def fit_transform(x, _y = nil)
48
- SVMKit::Validation.check_sample_array(x)
49
- fit(x)
50
- x / @norm_vec.tile(x.shape[1], 1).transpose
51
- end
52
-
53
- # Calculate L2-norms of each sample, and then normalize samples to unit L2-norm.
54
- # This method calls the fit_transform method. This method exists for the Pipeline class.
55
- #
56
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate L2-norms.
57
- # @return [Numo::DFloat] The normalized samples.
58
- def transform(x)
59
- fit_transform(x)
60
- end
61
- end
62
- end
63
- end
@@ -1,95 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'svmkit/validation'
4
- require 'svmkit/base/base_estimator'
5
- require 'svmkit/base/transformer'
6
-
7
- module SVMKit
8
- module Preprocessing
9
- # Encode labels to values between 0 and n_classes - 1.
10
- #
11
- # @example
12
- # encoder = SVMKit::Preprocessing::LabelEncoder.new
13
- # labels = Numo::Int32[1, 8, 8, 15, 0]
14
- # encoded_labels = encoder.fit_transform(labels)
15
- # # > pp encoded_labels
16
- # # Numo::Int32#shape=[5]
17
- # # [1, 2, 2, 3, 0]
18
- # decoded_labels = encoder.inverse_transform(encoded_labels)
19
- # # > pp decoded_labels
20
- # # [1, 8, 8, 15, 0]
21
- class LabelEncoder
22
- include Base::BaseEstimator
23
- include Base::Transformer
24
-
25
- # Return the class labels.
26
- # @return [Array] (size: [n_classes])
27
- attr_reader :classes
28
-
29
- # Create a new encoder for encoding labels to values between 0 and n_classes - 1.
30
- def initialize
31
- @params = {}
32
- @classes = nil
33
- end
34
-
35
- # Fit label-encoder to labels.
36
- #
37
- # @overload fit(x) -> LabelEncoder
38
- #
39
- # @param x [Array] (shape: [n_samples]) The labels to fit label-encoder.
40
- # @return [LabelEncoder]
41
- def fit(x, _y = nil)
42
- x = x.to_a if x.is_a?(Numo::NArray)
43
- SVMKit::Validation.check_params_type(Array, x: x)
44
- @classes = x.sort.uniq
45
- self
46
- end
47
-
48
- # Fit label-encoder to labels, then return encoded labels.
49
- #
50
- # @overload fit_transform(x) -> Numo::Int32
51
- #
52
- # @param x [Array] (shape: [n_samples]) The labels to fit label-encoder.
53
- # @return [Numo::Int32] The encoded labels.
54
- def fit_transform(x, _y = nil)
55
- x = x.to_a if x.is_a?(Numo::NArray)
56
- SVMKit::Validation.check_params_type(Array, x: x)
57
- fit(x).transform(x)
58
- end
59
-
60
- # Encode labels.
61
- #
62
- # @param x [Array] (shape: [n_samples]) The labels to be encoded.
63
- # @return [Numo::Int32] The encoded labels.
64
- def transform(x)
65
- x = x.to_a if x.is_a?(Numo::NArray)
66
- SVMKit::Validation.check_params_type(Array, x: x)
67
- Numo::Int32[*(x.map { |v| @classes.index(v) })]
68
- end
69
-
70
- # Decode encoded labels.
71
- #
72
- # @param x [Numo::Int32] (shape: [n_samples]) The labels to be decoded.
73
- # @return [Array] The decoded labels.
74
- def inverse_transform(x)
75
- SVMKit::Validation.check_label_array(x)
76
- x.to_a.map { |n| @classes[n] }
77
- end
78
-
79
- # Dump marshal data.
80
- # @return [Hash] The marshal data about LabelEncoder
81
- def marshal_dump
82
- { params: @params,
83
- classes: @classes }
84
- end
85
-
86
- # Load marshal data.
87
- # @return [nil]
88
- def marshal_load(obj)
89
- @params = obj[:params]
90
- @classes = obj[:classes]
91
- nil
92
- end
93
- end
94
- end
95
- end
@@ -1,93 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'svmkit/validation'
4
- require 'svmkit/base/base_estimator'
5
- require 'svmkit/base/transformer'
6
-
7
- module SVMKit
8
- # This module consists of the classes that perform preprocessing.
9
- module Preprocessing
10
- # Normalize samples by scaling each feature to a given range.
11
- #
12
- # @example
13
- # normalizer = SVMKit::Preprocessing::MinMaxScaler.new(feature_range: [0.0, 1.0])
14
- # new_training_samples = normalizer.fit_transform(training_samples)
15
- # new_testing_samples = normalizer.transform(testing_samples)
16
- class MinMaxScaler
17
- include Base::BaseEstimator
18
- include Base::Transformer
19
-
20
- # Return the vector consisting of the minimum value for each feature.
21
- # @return [Numo::DFloat] (shape: [n_features])
22
- attr_reader :min_vec
23
-
24
- # Return the vector consisting of the maximum value for each feature.
25
- # @return [Numo::DFloat] (shape: [n_features])
26
- attr_reader :max_vec
27
-
28
- # Creates a new normalizer for scaling each feature to a given range.
29
- #
30
- # @param feature_range [Array<Float>] The desired range of samples.
31
- def initialize(feature_range: [0.0, 1.0])
32
- SVMKit::Validation.check_params_type(Array, feature_range: feature_range)
33
- @params = {}
34
- @params[:feature_range] = feature_range
35
- @min_vec = nil
36
- @max_vec = nil
37
- end
38
-
39
- # Calculate the minimum and maximum value of each feature for scaling.
40
- #
41
- # @overload fit(x) -> MinMaxScaler
42
- #
43
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the minimum and maximum values.
44
- # @return [MinMaxScaler]
45
- def fit(x, _y = nil)
46
- SVMKit::Validation.check_sample_array(x)
47
- @min_vec = x.min(0)
48
- @max_vec = x.max(0)
49
- self
50
- end
51
-
52
- # Calculate the minimum and maximum values, and then normalize samples to feature_range.
53
- #
54
- # @overload fit_transform(x) -> Numo::DFloat
55
- #
56
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the minimum and maximum values.
57
- # @return [Numo::DFloat] The scaled samples.
58
- def fit_transform(x, _y = nil)
59
- SVMKit::Validation.check_sample_array(x)
60
- fit(x).transform(x)
61
- end
62
-
63
- # Perform scaling the given samples according to feature_range.
64
- #
65
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be scaled.
66
- # @return [Numo::DFloat] The scaled samples.
67
- def transform(x)
68
- SVMKit::Validation.check_sample_array(x)
69
- n_samples, = x.shape
70
- dif_vec = @max_vec - @min_vec
71
- nx = (x - @min_vec.tile(n_samples, 1)) / dif_vec.tile(n_samples, 1)
72
- nx * (@params[:feature_range][1] - @params[:feature_range][0]) + @params[:feature_range][0]
73
- end
74
-
75
- # Dump marshal data.
76
- # @return [Hash] The marshal data about MinMaxScaler.
77
- def marshal_dump
78
- { params: @params,
79
- min_vec: @min_vec,
80
- max_vec: @max_vec }
81
- end
82
-
83
- # Load marshal data.
84
- # @return [nil]
85
- def marshal_load(obj)
86
- @params = obj[:params]
87
- @min_vec = obj[:min_vec]
88
- @max_vec = obj[:max_vec]
89
- nil
90
- end
91
- end
92
- end
93
- end
@@ -1,99 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'svmkit/validation'
4
- require 'svmkit/base/base_estimator'
5
- require 'svmkit/base/transformer'
6
-
7
- module SVMKit
8
- module Preprocessing
9
- # Encode categorical integer features to one-hot-vectors.
10
- #
11
- # @example
12
- # encoder = SVMKit::Preprocessing::OneHotEncoder.new
13
- # labels = Numo::Int32[0, 0, 2, 3, 2, 1]
14
- # one_hot_vectors = encoder.fit_transform(labels)
15
- # # > pp one_hot_vectors
16
- # # Numo::DFloat#shape=[6, 4]
17
- # # [[1, 0, 0, 0],
18
- # # [1, 0, 0, 0],
19
- # # [0, 0, 1, 0],
20
- # # [0, 0, 0, 1],
21
- # # [0, 0, 1, 0],
22
- # # [0, 1, 0, 0]]
23
- class OneHotEncoder
24
- include Base::BaseEstimator
25
- include Base::Transformer
26
-
27
- # Return the maximum values for each feature.
28
- # @return [Numo::Int32] (shape: [n_features])
29
- attr_reader :n_values
30
-
31
- # Return the indices to feature ranges.
32
- # @return [Numo::Int32] (shape: [n_features + 1])
33
- attr_reader :feature_indices
34
-
35
- # Create a new encoder for encoding categorical integer features to one-hot-vectors
36
- def initialize
37
- @params = {}
38
- @n_values = nil
39
- @feature_indices = nil
40
- end
41
-
42
- # Fit one-hot-encoder to samples.
43
- #
44
- # @overload fit(x) -> OneHotEncoder
45
- #
46
- # @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to fit one-hot-encoder.
47
- # @return [OneHotEncoder]
48
- def fit(x, _y = nil)
49
- SVMKit::Validation.check_params_type(Numo::Int32, x: x)
50
- @n_values = x.max(0) + 1
51
- @feature_indices = Numo::Int32.hstack([[0], @n_values]).cumsum
52
- self
53
- end
54
-
55
- # Fit one-hot-encoder to samples, then encode samples into one-hot-vectors
56
- #
57
- # @overload fit_transform(x) -> Numo::DFloat
58
- #
59
- # @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to encode into one-hot-vectors.
60
- # @return [Numo::DFloat] The one-hot-vectors.
61
- def fit_transform(x, _y = nil)
62
- SVMKit::Validation.check_params_type(Numo::Int32, x: x)
63
- fit(x).transform(x)
64
- end
65
-
66
- # Encode samples into one-hot-vectors.
67
- #
68
- # @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to encode into one-hot-vectors.
69
- # @return [Numo::DFloat] The one-hot-vectors.
70
- def transform(x)
71
- SVMKit::Validation.check_params_type(Numo::Int32, x: x)
72
- n_samples, n_features = x.shape
73
- n_features = 1 if n_features.nil?
74
- column_indices = (x + @feature_indices[0...-1]).flatten.to_a
75
- row_indices = Numo::Int32.new(n_samples).seq.repeat(n_features).to_a
76
- codes = Numo::DFloat.zeros(n_samples, @feature_indices[-1])
77
- row_indices.zip(column_indices).each { |r, c| codes[r, c] = 1.0 }
78
- codes
79
- end
80
-
81
- # Dump marshal data.
82
- # @return [Hash] The marshal data about OneHotEncoder.
83
- def marshal_dump
84
- { params: @params,
85
- n_values: @n_values,
86
- feature_indices: @feature_indices }
87
- end
88
-
89
- # Load marshal data.
90
- # @return [nil]
91
- def marshal_load(obj)
92
- @params = obj[:params]
93
- @n_values = obj[:n_values]
94
- @feature_indices = obj[:feature_indices]
95
- nil
96
- end
97
- end
98
- end
99
- end