svmkit 0.2.8 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -45,26 +45,30 @@ module SVMKit
45
45
  # @param bias_scale [Float] The scale of the bias term.
46
46
  # @param max_iter [Integer] The maximum number of iterations.
47
47
  # @param batch_size [Integer] The size of the mini batches.
48
+ # @param probability [Boolean] The flag indicating whether to perform probability estimation.
48
49
  # @param normalize [Boolean] The flag indicating whether to normalize the weight vector.
49
50
  # @param random_seed [Integer] The seed value used to initialize the random generator.
50
51
  def initialize(reg_param: 1.0, fit_bias: false, bias_scale: 1.0,
51
- max_iter: 100, batch_size: 50, normalize: true, random_seed: nil)
52
+ max_iter: 100, batch_size: 50, probability: false, normalize: true, random_seed: nil)
52
53
  SVMKit::Validation.check_params_float(reg_param: reg_param, bias_scale: bias_scale)
53
54
  SVMKit::Validation.check_params_integer(max_iter: max_iter, batch_size: batch_size)
54
- SVMKit::Validation.check_params_boolean(fit_bias: fit_bias, normalize: normalize)
55
+ SVMKit::Validation.check_params_boolean(fit_bias: fit_bias, probability: probability, normalize: normalize)
55
56
  SVMKit::Validation.check_params_type_or_nil(Integer, random_seed: random_seed)
56
-
57
+ SVMKit::Validation.check_params_positive(reg_param: reg_param, bias_scale: bias_scale, max_iter: max_iter,
58
+ batch_size: batch_size)
57
59
  @params = {}
58
60
  @params[:reg_param] = reg_param
59
61
  @params[:fit_bias] = fit_bias
60
62
  @params[:bias_scale] = bias_scale
61
63
  @params[:max_iter] = max_iter
62
64
  @params[:batch_size] = batch_size
65
+ @params[:probability] = probability
63
66
  @params[:normalize] = normalize
64
67
  @params[:random_seed] = random_seed
65
68
  @params[:random_seed] ||= srand
66
69
  @weight_vec = nil
67
70
  @bias_term = nil
71
+ @prob_param = nil
68
72
  @classes = nil
69
73
  @rng = Random.new(@params[:random_seed])
70
74
  end
@@ -77,6 +81,7 @@ module SVMKit
77
81
  def fit(x, y)
78
82
  SVMKit::Validation.check_sample_array(x)
79
83
  SVMKit::Validation.check_label_array(y)
84
+ SVMKit::Validation.check_sample_label_size(x, y)
80
85
 
81
86
  @classes = Numo::Int32[*y.to_a.uniq.sort]
82
87
  n_classes = @classes.size
@@ -85,16 +90,27 @@ module SVMKit
85
90
  if n_classes > 2
86
91
  @weight_vec = Numo::DFloat.zeros(n_classes, n_features)
87
92
  @bias_term = Numo::DFloat.zeros(n_classes)
93
+ @prob_param = Numo::DFloat.zeros(n_classes, 2)
88
94
  n_classes.times do |n|
89
95
  bin_y = Numo::Int32.cast(y.eq(@classes[n])) * 2 - 1
90
96
  weight, bias = binary_fit(x, bin_y)
91
97
  @weight_vec[n, true] = weight
92
98
  @bias_term[n] = bias
99
+ @prob_param[n, true] = if @params[:probability]
100
+ SVMKit::ProbabilisticOutput.fit_sigmoid(x.dot(weight.transpose) + bias, bin_y)
101
+ else
102
+ Numo::DFloat[1, 0]
103
+ end
93
104
  end
94
105
  else
95
106
  negative_label = y.to_a.uniq.sort.first
96
107
  bin_y = Numo::Int32.cast(y.ne(negative_label)) * 2 - 1
97
108
  @weight_vec, @bias_term = binary_fit(x, bin_y)
109
+ @prob_param = if @params[:probability]
110
+ SVMKit::ProbabilisticOutput.fit_sigmoid(x.dot(@weight_vec.transpose) + @bias_term, bin_y)
111
+ else
112
+ Numo::DFloat[1, 0]
113
+ end
98
114
  end
99
115
 
100
116
  self
@@ -124,12 +140,32 @@ module SVMKit
124
140
  Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[decision_values[n, true].max_index] })
125
141
  end
126
142
 
143
+ # Predict probability for samples.
144
+ #
145
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
146
+ # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
147
+ def predict_proba(x)
148
+ SVMKit::Validation.check_sample_array(x)
149
+
150
+ if @classes.size > 2
151
+ probs = 1.0 / (Numo::NMath.exp(@prob_param[true, 0] * decision_function(x) + @prob_param[true, 1]) + 1.0)
152
+ return (probs.transpose / probs.sum(axis: 1)).transpose
153
+ end
154
+
155
+ n_samples, = x.shape
156
+ probs = Numo::DFloat.zeros(n_samples, 2)
157
+ probs[true, 1] = 1.0 / (Numo::NMath.exp(@prob_param[0] * decision_function(x) + @prob_param[1]) + 1.0)
158
+ probs[true, 0] = 1.0 - probs[true, 1]
159
+ probs
160
+ end
161
+
127
162
  # Dump marshal data.
128
163
  # @return [Hash] The marshal data about SVC.
129
164
  def marshal_dump
130
165
  { params: @params,
131
166
  weight_vec: @weight_vec,
132
167
  bias_term: @bias_term,
168
+ prob_param: @prob_param,
133
169
  classes: @classes,
134
170
  rng: @rng }
135
171
  end
@@ -140,6 +176,7 @@ module SVMKit
140
176
  @params = obj[:params]
141
177
  @weight_vec = obj[:weight_vec]
142
178
  @bias_term = obj[:bias_term]
179
+ @prob_param = obj[:prob_param]
143
180
  @classes = obj[:classes]
144
181
  @rng = obj[:rng]
145
182
  nil
@@ -159,11 +196,13 @@ module SVMKit
159
196
  # random sampling
160
197
  subset_ids = rand_ids.shift(@params[:batch_size])
161
198
  rand_ids.concat(subset_ids)
162
- target_ids = subset_ids.map { |n| n if weight_vec.dot(samples[n, true]) * bin_y[n] < 1 }.compact
163
- n_subsamples = target_ids.size
164
- next if n_subsamples.zero?
199
+ sub_samples = samples[subset_ids, true]
200
+ sub_bin_y = bin_y[subset_ids]
201
+ target_ids = (sub_samples.dot(weight_vec.transpose) * sub_bin_y).lt(1.0).where
202
+ n_targets = target_ids.size
203
+ next if n_targets.zero?
165
204
  # update the weight vector.
166
- mean_vec = samples[target_ids, true].transpose.dot(bin_y[target_ids]) / n_subsamples
205
+ mean_vec = sub_samples[target_ids, true].transpose.dot(sub_bin_y[target_ids]) / n_targets
167
206
  weight_vec -= learning_rate(t) * (@params[:reg_param] * weight_vec - mean_vec)
168
207
  # scale the weight vector.
169
208
  normalize_weight_vec(weight_vec) if @params[:normalize]
@@ -62,6 +62,7 @@ module SVMKit
62
62
  def perform(x, y)
63
63
  SVMKit::Validation.check_sample_array(x)
64
64
  SVMKit::Validation.check_label_array(y)
65
+ SVMKit::Validation.check_sample_label_size(x, y)
65
66
  # Initialize the report of cross validation.
66
67
  report = { test_score: [], train_score: nil, fit_time: [] }
67
68
  report[:train_score] = [] if @return_train_score
@@ -81,9 +82,12 @@ module SVMKit
81
82
  if @evaluator.nil?
82
83
  report[:test_score].push(@estimator.score(test_x, test_y))
83
84
  report[:train_score].push(@estimator.score(train_x, train_y)) if @return_train_score
85
+ elsif log_loss?
86
+ report[:test_score].push(@evaluator.score(test_y, @estimator.predict_proba(test_x)))
87
+ report[:train_score].push(@evaluator.score(train_y, @estimator.predict_proba(train_x))) if @return_train_score
84
88
  else
85
89
  report[:test_score].push(@evaluator.score(test_y, @estimator.predict(test_x)))
86
- report[:train_score].push(@estimator.score(train_x, @estimator.predict(train_x))) if @return_train_score
90
+ report[:train_score].push(@evaluator.score(train_y, @estimator.predict(train_x))) if @return_train_score
87
91
  end
88
92
  end
89
93
  report
@@ -96,6 +100,10 @@ module SVMKit
96
100
  class_name = @estimator.params[:estimator].class.to_s if class_name.include?('Multiclass')
97
101
  class_name.include?('KernelMachine')
98
102
  end
103
+
104
+ def log_loss?
105
+ @evaluator.is_a?(SVMKit::EvaluationMeasure::LogLoss)
106
+ end
99
107
  end
100
108
  end
101
109
  end
@@ -35,7 +35,7 @@ module SVMKit
35
35
  SVMKit::Validation.check_params_integer(n_splits: n_splits)
36
36
  SVMKit::Validation.check_params_boolean(shuffle: shuffle)
37
37
  SVMKit::Validation.check_params_type_or_nil(Integer, random_seed: random_seed)
38
-
38
+ SVMKit::Validation.check_params_positive(n_splits: n_splits)
39
39
  @n_splits = n_splits
40
40
  @shuffle = shuffle
41
41
  @random_seed = random_seed
@@ -35,7 +35,7 @@ module SVMKit
35
35
  SVMKit::Validation.check_params_integer(n_splits: n_splits)
36
36
  SVMKit::Validation.check_params_boolean(shuffle: shuffle)
37
37
  SVMKit::Validation.check_params_type_or_nil(Integer, random_seed: random_seed)
38
-
38
+ SVMKit::Validation.check_params_positive(n_splits: n_splits)
39
39
  @n_splits = n_splits
40
40
  @shuffle = shuffle
41
41
  @random_seed = random_seed
@@ -51,9 +51,10 @@ module SVMKit
51
51
  # @param y [Numo::Int32] (shape: [n_samples])
52
52
  # The labels to be used to generate data indices for stratified K-fold cross validation.
53
53
  # @return [Array] The set of data indices for constructing the training and testing dataset in each fold.
54
- def split(x, y) # rubocop:disable Lint/UnusedMethodArgument
54
+ def split(x, y)
55
55
  SVMKit::Validation.check_sample_array(x)
56
56
  SVMKit::Validation.check_label_array(y)
57
+ SVMKit::Validation.check_sample_label_size(x, y)
57
58
  # Check the number of samples in each class.
58
59
  unless valid_n_splits?(y)
59
60
  raise ArgumentError,
@@ -48,6 +48,7 @@ module SVMKit
48
48
  def fit(x, y)
49
49
  SVMKit::Validation.check_sample_array(x)
50
50
  SVMKit::Validation.check_label_array(y)
51
+ SVMKit::Validation.check_sample_label_size(x, y)
51
52
  y_arr = y.to_a
52
53
  @classes = Numo::Int32.asarray(y_arr.uniq.sort)
53
54
  @estimators = @classes.to_a.map do |label|
@@ -80,6 +80,7 @@ module SVMKit
80
80
  def fit(x, y)
81
81
  SVMKit::Validation.check_sample_array(x)
82
82
  SVMKit::Validation.check_label_array(y)
83
+ SVMKit::Validation.check_sample_label_size(x, y)
83
84
  n_samples, = x.shape
84
85
  @classes = Numo::Int32[*y.to_a.uniq.sort]
85
86
  @class_priors = Numo::DFloat[*@classes.to_a.map { |l| y.eq(l).count / n_samples.to_f }]
@@ -154,6 +155,7 @@ module SVMKit
154
155
  # @param smoothing_param [Float] The Laplace smoothing parameter.
155
156
  def initialize(smoothing_param: 1.0)
156
157
  SVMKit::Validation.check_params_float(smoothing_param: smoothing_param)
158
+ SVMKit::Validation.check_params_positive(smoothing_param: smoothing_param)
157
159
  @params = {}
158
160
  @params[:smoothing_param] = smoothing_param
159
161
  end
@@ -167,6 +169,7 @@ module SVMKit
167
169
  def fit(x, y)
168
170
  SVMKit::Validation.check_sample_array(x)
169
171
  SVMKit::Validation.check_label_array(y)
172
+ SVMKit::Validation.check_sample_label_size(x, y)
170
173
  n_samples, = x.shape
171
174
  @classes = Numo::Int32[*y.to_a.uniq.sort]
172
175
  @class_priors = Numo::DFloat[*@classes.to_a.map { |l| y.eq(l).count / n_samples.to_f }]
@@ -241,6 +244,7 @@ module SVMKit
241
244
  # @param bin_threshold [Float] The threshold for binarizing of features.
242
245
  def initialize(smoothing_param: 1.0, bin_threshold: 0.0)
243
246
  SVMKit::Validation.check_params_float(smoothing_param: smoothing_param, bin_threshold: bin_threshold)
247
+ SVMKit::Validation.check_params_positive(smoothing_param: smoothing_param)
244
248
  @params = {}
245
249
  @params[:smoothing_param] = smoothing_param
246
250
  @params[:bin_threshold] = bin_threshold
@@ -255,6 +259,7 @@ module SVMKit
255
259
  def fit(x, y)
256
260
  SVMKit::Validation.check_sample_array(x)
257
261
  SVMKit::Validation.check_label_array(y)
262
+ SVMKit::Validation.check_sample_label_size(x, y)
258
263
  n_samples, = x.shape
259
264
  bin_x = Numo::DFloat[*x.gt(@params[:bin_threshold])]
260
265
  @classes = Numo::Int32[*y.to_a.uniq.sort]
@@ -36,6 +36,7 @@ module SVMKit
36
36
  # @param n_neighbors [Integer] The number of neighbors.
37
37
  def initialize(n_neighbors: 5)
38
38
  SVMKit::Validation.check_params_integer(n_neighbors: n_neighbors)
39
+ SVMKit::Validation.check_params_positive(n_neighbors: n_neighbors)
39
40
  @params = {}
40
41
  @params[:n_neighbors] = n_neighbors
41
42
  @prototypes = nil
@@ -51,6 +52,7 @@ module SVMKit
51
52
  def fit(x, y)
52
53
  SVMKit::Validation.check_sample_array(x)
53
54
  SVMKit::Validation.check_label_array(y)
55
+ SVMKit::Validation.check_sample_label_size(x, y)
54
56
  @prototypes = Numo::DFloat.asarray(x.to_a)
55
57
  @labels = Numo::Int32.asarray(y.to_a)
56
58
  @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
@@ -63,7 +63,9 @@ module SVMKit
63
63
  SVMKit::Validation.check_params_integer(n_factors: n_factors, max_iter: max_iter, batch_size: batch_size)
64
64
  SVMKit::Validation.check_params_string(loss: loss)
65
65
  SVMKit::Validation.check_params_type_or_nil(Integer, random_seed: random_seed)
66
-
66
+ SVMKit::Validation.check_params_positive(n_factors: n_factors, reg_param_bias: reg_param_bias,
67
+ reg_param_weight: reg_param_weight, reg_param_factor: reg_param_factor,
68
+ max_iter: max_iter, batch_size: batch_size)
67
69
  @params = {}
68
70
  @params[:n_factors] = n_factors
69
71
  @params[:loss] = loss
@@ -90,6 +92,7 @@ module SVMKit
90
92
  def fit(x, y)
91
93
  SVMKit::Validation.check_sample_array(x)
92
94
  SVMKit::Validation.check_label_array(y)
95
+ SVMKit::Validation.check_sample_label_size(x, y)
93
96
 
94
97
  @classes = Numo::Int32[*y.to_a.uniq.sort]
95
98
  n_classes = @classes.size
@@ -0,0 +1,94 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'svmkit/base/base_estimator'
4
+ require 'svmkit/base/transformer'
5
+
6
+ module SVMKit
7
+ module Preprocessing
8
+ # Encode labels to values between 0 and n_classes - 1.
9
+ #
10
+ # @example
11
+ # encoder = SVMKit::Preprocessing::LabelEncoder.new
12
+ # labels = Numo::Int32[1, 8, 8, 15, 0]
13
+ # encoded_labels = encoder.fit_transform(labels)
14
+ # # > pp encoded_labels
15
+ # # Numo::Int32#shape=[5]
16
+ # # [1, 2, 2, 3, 0]
17
+ # decoded_labels = encoder.inverse_transform(encoded_labels)
18
+ # # > pp decoded_labels
19
+ # # [1, 8, 8, 15, 0]
20
+ class LabelEncoder
21
+ include Base::BaseEstimator
22
+ include Base::Transformer
23
+
24
+ # Return the class labels.
25
+ # @return [Array] (size: [n_classes])
26
+ attr_reader :classes
27
+
28
+ # Create a new encoder for encoding labels to values between 0 and n_classes - 1.
29
+ def initialize
30
+ @params = {}
31
+ @classes = nil
32
+ end
33
+
34
+ # Fit label-encoder to labels.
35
+ #
36
+ # @overload fit(x) -> LabelEncoder
37
+ #
38
+ # @param x [Array] (shape: [n_samples]) The labels to fit label-encoder.
39
+ # @return [LabelEncoder]
40
+ def fit(x, _y = nil)
41
+ x = x.to_a if x.is_a?(Numo::NArray)
42
+ SVMKit::Validation.check_params_type(Array, x: x)
43
+ @classes = x.sort.uniq
44
+ self
45
+ end
46
+
47
+ # Fit label-encoder to labels, then return encoded labels.
48
+ #
49
+ # @overload fit_transform(x) -> Numo::Int32
50
+ #
51
+ # @param x [Array] (shape: [n_samples]) The labels to fit label-encoder.
52
+ # @return [Numo::Int32] The encoded labels.
53
+ def fit_transform(x, _y = nil)
54
+ x = x.to_a if x.is_a?(Numo::NArray)
55
+ SVMKit::Validation.check_params_type(Array, x: x)
56
+ fit(x).transform(x)
57
+ end
58
+
59
+ # Encode labels.
60
+ #
61
+ # @param x [Array] (shape: [n_samples]) The labels to be encoded.
62
+ # @return [Numo::Int32] The encoded labels.
63
+ def transform(x)
64
+ x = x.to_a if x.is_a?(Numo::NArray)
65
+ SVMKit::Validation.check_params_type(Array, x: x)
66
+ Numo::Int32[*(x.map { |v| @classes.index(v) })]
67
+ end
68
+
69
+ # Decode encoded labels.
70
+ #
71
+ # @param x [Numo::Int32] (shape: [n_samples]) The labels to be decoded.
72
+ # @return [Array] The decoded labels.
73
+ def inverse_transform(x)
74
+ SVMKit::Validation.check_label_array(x)
75
+ x.to_a.map { |n| @classes[n] }
76
+ end
77
+
78
+ # Dump marshal data.
79
+ # @return [Hash] The marshal data about LabelEncoder.
80
+ def marshal_dump
81
+ { params: @params,
82
+ classes: @classes }
83
+ end
84
+
85
+ # Load marshal data.
86
+ # @return [nil]
87
+ def marshal_load(obj)
88
+ @params = obj[:params]
89
+ @classes = obj[:classes]
90
+ nil
91
+ end
92
+ end
93
+ end
94
+ end
@@ -0,0 +1,98 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'svmkit/base/base_estimator'
4
+ require 'svmkit/base/transformer'
5
+
6
+ module SVMKit
7
+ module Preprocessing
8
+ # Encode categorical integer features to one-hot-vectors.
9
+ #
10
+ # @example
11
+ # encoder = SVMKit::Preprocessing::OneHotEncoder.new
12
+ # labels = Numo::Int32[0, 0, 2, 3, 2, 1]
13
+ # one_hot_vectors = encoder.fit_transform(labels)
14
+ # # > pp one_hot_vectors
15
+ # # Numo::DFloat#shape=[6, 4]
16
+ # # [[1, 0, 0, 0],
17
+ # # [1, 0, 0, 0],
18
+ # # [0, 0, 1, 0],
19
+ # # [0, 0, 0, 1],
20
+ # # [0, 0, 1, 0],
21
+ # # [0, 1, 0, 0]]
22
+ class OneHotEncoder
23
+ include Base::BaseEstimator
24
+ include Base::Transformer
25
+
26
+ # Return the number of values for each feature (the maximum value plus one).
27
+ # @return [Numo::Int32] (shape: [n_features])
28
+ attr_reader :n_values
29
+
30
+ # Return the indices to feature ranges.
31
+ # @return [Numo::Int32] (shape: [n_features + 1])
32
+ attr_reader :feature_indices
33
+
34
+ # Create a new encoder for encoding categorical integer features to one-hot-vectors.
35
+ def initialize
36
+ @params = {}
37
+ @n_values = nil
38
+ @feature_indices = nil
39
+ end
40
+
41
+ # Fit one-hot-encoder to samples.
42
+ #
43
+ # @overload fit(x) -> OneHotEncoder
44
+ #
45
+ # @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to fit one-hot-encoder.
46
+ # @return [OneHotEncoder]
47
+ def fit(x, _y = nil)
48
+ SVMKit::Validation.check_params_type(Numo::Int32, x: x)
49
+ @n_values = x.max(0) + 1
50
+ @feature_indices = Numo::Int32.hstack([[0], @n_values]).cumsum
51
+ self
52
+ end
53
+
54
+ # Fit one-hot-encoder to samples, then encode samples into one-hot-vectors.
55
+ #
56
+ # @overload fit_transform(x) -> Numo::DFloat
57
+ #
58
+ # @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to encode into one-hot-vectors.
59
+ # @return [Numo::DFloat] The one-hot-vectors.
60
+ def fit_transform(x, _y = nil)
61
+ SVMKit::Validation.check_params_type(Numo::Int32, x: x)
62
+ fit(x).transform(x)
63
+ end
64
+
65
+ # Encode samples into one-hot-vectors.
66
+ #
67
+ # @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to encode into one-hot-vectors.
68
+ # @return [Numo::DFloat] The one-hot-vectors.
69
+ def transform(x)
70
+ SVMKit::Validation.check_params_type(Numo::Int32, x: x)
71
+ n_samples, n_features = x.shape
72
+ n_features = 1 if n_features.nil?
73
+ column_indices = (x + @feature_indices[0...-1]).flatten.to_a
74
+ row_indices = Numo::Int32.new(n_samples).seq.repeat(n_features).to_a
75
+ codes = Numo::DFloat.zeros(n_samples, @feature_indices[-1])
76
+ row_indices.zip(column_indices).each { |r, c| codes[r, c] = 1.0 }
77
+ codes
78
+ end
79
+
80
+ # Dump marshal data.
81
+ # @return [Hash] The marshal data about OneHotEncoder.
82
+ def marshal_dump
83
+ { params: @params,
84
+ n_values: @n_values,
85
+ feature_indices: @feature_indices }
86
+ end
87
+
88
+ # Load marshal data.
89
+ # @return [nil]
90
+ def marshal_load(obj)
91
+ @params = obj[:params]
92
+ @n_values = obj[:n_values]
93
+ @feature_indices = obj[:feature_indices]
94
+ nil
95
+ end
96
+ end
97
+ end
98
+ end