svmkit 0.1.3 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/HISTORY.md +4 -0
- data/README.md +3 -5
- data/lib/svmkit.rb +4 -6
- data/lib/svmkit/dataset.rb +90 -0
- data/lib/svmkit/kernel_approximation/rbf.rb +28 -35
- data/lib/svmkit/kernel_machine/kernel_svc.rb +27 -34
- data/lib/svmkit/linear_model/logistic_regression.rb +43 -35
- data/lib/svmkit/linear_model/{pegasos_svc.rb → svc.rb} +45 -39
- data/lib/svmkit/multiclass/one_vs_rest_classifier.rb +20 -31
- data/lib/svmkit/pairwise_metric.rb +20 -20
- data/lib/svmkit/preprocessing/l2_normalizer.rb +9 -12
- data/lib/svmkit/preprocessing/min_max_scaler.rb +17 -24
- data/lib/svmkit/preprocessing/standard_scaler.rb +16 -17
- data/lib/svmkit/version.rb +1 -1
- data/svmkit.gemspec +15 -3
- metadata +43 -9
- data/lib/svmkit/utils.rb +0 -24
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4a53bee5e11b90721544b873d144b149b38aafe1
|
4
|
+
data.tar.gz: f1ded6552e6cbdd8af3c29c4d8d403d3c8a62128
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c3e3073f1afd4470cc21e1241d1f3666bbfefcd871700f711cfe377bb04c490f2f3ff10bc4d8ef764e05e0015faf09aef114e45ad0affde35a18641b064ed389
|
7
|
+
data.tar.gz: 90144eea5e5f848dffb1325cd4576f27dbf61e5032917493e4957f5acba96489cc2b556f88a2078f5f2d9b5d2842c32454e6e56c003bfd2e36f6d5263cefc4c6
|
data/HISTORY.md
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
# 0.2.0
|
2
|
+
- Migrated the linear algebra library to Numo::NArray.
|
3
|
+
- Added module for loading and saving libsvm format file.
|
4
|
+
|
1
5
|
# 0.1.3
|
2
6
|
- Added class for Kernel Support Vector Machine with Pegasos algorithm.
|
3
7
|
- Added module for calculating pairwise kernel functions and Euclidean distances.
|
data/README.md
CHANGED
@@ -30,9 +30,8 @@ Training phase:
|
|
30
30
|
|
31
31
|
```ruby
|
32
32
|
require 'svmkit'
|
33
|
-
require 'libsvmloader'
|
34
33
|
|
35
|
-
samples, labels =
|
34
|
+
samples, labels = SVMKit::Dataset.load_libsvm_file('pendigits')
|
36
35
|
|
37
36
|
normalizer = SVMKit::Preprocessing::MinMaxScaler.new
|
38
37
|
normalized = normalizer.fit_transform(samples)
|
@@ -41,7 +40,7 @@ transformer = SVMKit::KernelApproximation::RBF.new(gamma: 2.0, n_components: 102
|
|
41
40
|
transformed = transformer.fit_transform(normalized)
|
42
41
|
|
43
42
|
base_classifier =
|
44
|
-
SVMKit::LinearModel::
|
43
|
+
SVMKit::LinearModel::SVC.new(reg_param: 1.0, max_iter: 1000, batch_size: 20, random_seed: 1)
|
45
44
|
classifier = SVMKit::Multiclass::OneVsRestClassifier.new(estimator: base_classifier)
|
46
45
|
classifier.fit(transformed, labels)
|
47
46
|
|
@@ -54,9 +53,8 @@ Testing phase:
|
|
54
53
|
|
55
54
|
```ruby
|
56
55
|
require 'svmkit'
|
57
|
-
require 'libsvmloader'
|
58
56
|
|
59
|
-
samples, labels =
|
57
|
+
samples, labels = SVMKit::Dataset.load_libsvm_file('pendigits.t')
|
60
58
|
|
61
59
|
normalizer = Marshal.load(File.binread('trained_normalizer.dat'))
|
62
60
|
transformer = Marshal.load(File.binread('trained_transformer.dat'))
|
data/lib/svmkit.rb
CHANGED
@@ -1,16 +1,14 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
rescue LoadError
|
4
|
-
end
|
1
|
+
|
2
|
+
require 'numo/narray'
|
5
3
|
|
6
4
|
require 'svmkit/version'
|
7
|
-
require 'svmkit/utils'
|
8
5
|
require 'svmkit/pairwise_metric'
|
6
|
+
require 'svmkit/dataset'
|
9
7
|
require 'svmkit/base/base_estimator'
|
10
8
|
require 'svmkit/base/classifier'
|
11
9
|
require 'svmkit/base/transformer'
|
12
10
|
require 'svmkit/kernel_approximation/rbf'
|
13
|
-
require 'svmkit/linear_model/
|
11
|
+
require 'svmkit/linear_model/svc'
|
14
12
|
require 'svmkit/linear_model/logistic_regression'
|
15
13
|
require 'svmkit/kernel_machine/kernel_svc'
|
16
14
|
require 'svmkit/multiclass/one_vs_rest_classifier'
|
@@ -0,0 +1,90 @@
|
|
1
|
+
module SVMKit
  # Module for loading and saving a dataset file.
  module Dataset
    # Names of the Numo array types that are written with an integer format.
    INTEGER_TYPE_NAMES = %w[
      Numo::Int8 Numo::Int16 Numo::Int32 Numo::Int64
      Numo::UInt8 Numo::UInt16 Numo::UInt32 Numo::UInt64
    ].freeze
    # Names of the Numo array types that are written with a floating point format.
    FLOAT_TYPE_NAMES = %w[Numo::SFloat Numo::DFloat].freeze
    # Matches a token that is a (possibly signed) decimal integer, e.g. "3", "-1", "+1".
    INTEGER_PATTERN = /\A[+-]?\d+\z/
    private_constant :INTEGER_TYPE_NAMES, :FLOAT_TYPE_NAMES, :INTEGER_PATTERN

    class << self
      # Load a dataset with the libsvm file format into Numo::NArray.
      #
      # Blank lines are skipped. The number of columns of the returned matrix is
      # the largest column index found in the file plus one (after converting
      # the indices to zero-based), so every row has the same length.
      #
      # @param filename [String] A path to a dataset file.
      # @param zero_based [Boolean] Whether the column index starts from 0 (true) or 1 (false).
      #
      # @return [Array<Numo::NArray>]
      #   Returns array containing the (n_samples x n_features) matrix for feature vectors
      #   and (n_samples) vector for labels or target values.
      def load_libsvm_file(filename, zero_based: false)
        ftvecs = []
        labels = []
        n_features = 0
        File.foreach(filename) do |line|
          # A blank line carries no sample; parsing it would add a bogus 0.0 label.
          next if line.strip.empty?
          label, ftvec, max_idx = parse_libsvm_line(line, zero_based)
          labels.push(label)
          ftvecs.push(ftvec)
          # max_idx is a zero-based index, so the column count is max_idx + 1.
          n_features = [n_features, max_idx + 1].max
        end
        [convert_to_matrix(ftvecs, n_features), Numo::NArray.asarray(labels)]
      end

      # Dump the dataset with the libsvm file format.
      #
      # Only the first min(data.shape[0], labels.shape[0]) samples are written,
      # and zero-valued features are omitted from each line.
      #
      # @param data [Numo::NArray] (shape: [n_samples, n_features]) matrix consisting of feature vectors.
      # @param labels [Numo::NArray] (shape: [n_samples]) vector consisting of labels or target values.
      # @param filename [String] A path to the output libsvm file.
      # @param zero_based [Boolean] Whether the column index starts from 0 (true) or 1 (false).
      def dump_libsvm_file(data, labels, filename, zero_based: false)
        n_samples = [data.shape[0], labels.shape[0]].min
        label_type = detect_dtype(labels)
        value_type = detect_dtype(data)
        File.open(filename, 'w') do |file|
          n_samples.times do |n|
            file.puts(dump_libsvm_line(labels[n], data[n, true], label_type, value_type, zero_based))
          end
        end
      end

      private

      # Parse one line ("label idx:val idx:val ...") of a libsvm file.
      # Returns the label, the list of [zero-based index, value] pairs, and the
      # largest zero-based index on the line (-1 when the line has no features).
      def parse_libsvm_line(line, zero_based)
        offset = zero_based ? 0 : 1
        tokens = line.split
        label = parse_number(tokens.shift)
        ftvec = tokens.map do |el|
          idx, val = el.split(':')
          [idx.to_i - offset, parse_number(val)]
        end
        max_idx = ftvec.map(&:first).max || -1
        [label, ftvec, max_idx]
      end

      # Convert a numeric token into an Integer when it is a (signed) decimal
      # integer, and into a Float otherwise. Signed tokens such as "+1" are
      # treated as integers so that "+1" and "-1" labels get the same type.
      def parse_number(token)
        str = token.to_s
        str =~ INTEGER_PATTERN ? str.to_i : str.to_f
      end

      # Expand the sparse [index, value] lists into dense rows of n_features
      # columns (missing entries are 0) and convert them into Numo::NArray.
      def convert_to_matrix(data, n_features)
        mat = data.map do |ft|
          vec = Array.new(n_features, 0)
          ft.each { |idx, val| vec[idx] = val }
          vec
        end
        Numo::NArray.asarray(mat)
      end

      # Choose the format directive used for writing the elements of the given
      # array: '%d' for integer types, '%.10g' for float types, '%s' otherwise.
      def detect_dtype(data)
        arr_type_str = Numo::NArray.array_type(data).to_s
        return '%d' if INTEGER_TYPE_NAMES.include?(arr_type_str)
        return '%.10g' if FLOAT_TYPE_NAMES.include?(arr_type_str)
        '%s'
      end

      # Build one line of a libsvm file from a label and a dense feature vector.
      # Features equal to zero are not written.
      def dump_libsvm_line(label, ftvec, label_type, value_type, zero_based)
        offset = zero_based ? 0 : 1
        entry_format = " %d:#{value_type}"
        entries = ftvec.to_a.each_with_index.map do |val, n|
          val == 0.0 ? '' : format(entry_format, n + offset, val)
        end
        format(label_type.to_s, label) + entries.join
      end
    end
  end
end
|
@@ -17,19 +17,12 @@ module SVMKit
|
|
17
17
|
include Base::BaseEstimator
|
18
18
|
include Base::Transformer
|
19
19
|
|
20
|
-
# @!visibility private
|
21
|
-
DEFAULT_PARAMS = {
|
22
|
-
gamma: 1.0,
|
23
|
-
n_components: 128,
|
24
|
-
random_seed: nil
|
25
|
-
}.freeze
|
26
|
-
|
27
20
|
# Return the random matrix for transformation.
|
28
|
-
# @return [
|
21
|
+
# @return [Numo::DFloat] (shape: [n_features, n_components])
|
29
22
|
attr_reader :random_mat
|
30
23
|
|
31
24
|
# Return the random vector for transformation.
|
32
|
-
# @return [
|
25
|
+
# @return [Numo::DFloat] (shape: [n_components])
|
33
26
|
attr_reader :random_vec
|
34
27
|
|
35
28
|
# Return the random generator for transformation.
|
@@ -38,14 +31,14 @@ module SVMKit
|
|
38
31
|
|
39
32
|
# Create a new transformer for mapping to RBF kernel feature space.
|
40
33
|
#
|
41
|
-
# @
|
42
|
-
#
|
43
|
-
# @param
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
self.params =
|
34
|
+
# @param gamma [Float] The parameter of RBF kernel: exp(-gamma * x^2).
|
35
|
+
# @param n_components [Integer] The number of dimensions of the RBF kernel feature space.
|
36
|
+
# @param random_seed [Integer] The seed value used to initialize the random generator.
|
37
|
+
def initialize(gamma: 1.0, n_components: 128, random_seed: nil)
|
38
|
+
self.params = {}
|
39
|
+
self.params[:gamma] = gamma
|
40
|
+
self.params[:n_components] = n_components
|
41
|
+
self.params[:random_seed] = random_seed
|
49
42
|
self.params[:random_seed] ||= srand
|
50
43
|
@rng = Random.new(self.params[:random_seed])
|
51
44
|
@random_mat = nil
|
@@ -56,7 +49,7 @@ module SVMKit
|
|
56
49
|
#
|
57
50
|
# @overload fit(x) -> RBF
|
58
51
|
#
|
59
|
-
# @param x [
|
52
|
+
# @param x [Numo::NArray] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
60
53
|
# This method uses only the number of features of the data.
|
61
54
|
# @return [RBF] The learned transformer itself.
|
62
55
|
def fit(x, _y = nil)
|
@@ -64,40 +57,40 @@ module SVMKit
|
|
64
57
|
params[:n_components] = 2 * n_features if params[:n_components] <= 0
|
65
58
|
@random_mat = rand_normal([n_features, params[:n_components]]) * (2.0 * params[:gamma])**0.5
|
66
59
|
n_half_components = params[:n_components] / 2
|
67
|
-
@random_vec =
|
68
|
-
|
60
|
+
@random_vec = Numo::DFloat.zeros(params[:n_components] - n_half_components).concatenate(
|
61
|
+
Numo::DFloat.ones(n_half_components) * (0.5 * Math::PI)
|
69
62
|
)
|
70
63
|
self
|
71
64
|
end
|
72
65
|
|
73
66
|
# Fit the model with training data, and then transform them with the learned model.
|
74
67
|
#
|
75
|
-
# @overload fit_transform(x) ->
|
68
|
+
# @overload fit_transform(x) -> Numo::DFloat
|
76
69
|
#
|
77
|
-
# @param x [
|
78
|
-
# @return [
|
70
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
71
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data
|
79
72
|
def fit_transform(x, _y = nil)
|
80
73
|
fit(x).transform(x)
|
81
74
|
end
|
82
75
|
|
83
76
|
# Transform the given data with the learned model.
|
84
77
|
#
|
85
|
-
# @overload transform(x) ->
|
78
|
+
# @overload transform(x) -> Numo::DFloat
|
86
79
|
#
|
87
|
-
# @param x [
|
88
|
-
# @return [
|
80
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The data to be transformed with the learned model.
|
81
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data.
|
89
82
|
def transform(x)
|
90
83
|
n_samples, = x.shape
|
91
|
-
projection = x.dot(@random_mat) + @random_vec.
|
92
|
-
|
84
|
+
projection = x.dot(@random_mat) + @random_vec.tile(n_samples, 1)
|
85
|
+
Numo::NMath.sin(projection) * ((2.0 / params[:n_components])**0.5)
|
93
86
|
end
|
94
87
|
|
95
88
|
# Dump marshal data.
|
96
89
|
# @return [Hash] The marshal data about RBF.
|
97
90
|
def marshal_dump
|
98
91
|
{ params: params,
|
99
|
-
random_mat:
|
100
|
-
random_vec:
|
92
|
+
random_mat: @random_mat,
|
93
|
+
random_vec: @random_vec,
|
101
94
|
rng: @rng }
|
102
95
|
end
|
103
96
|
|
@@ -105,8 +98,8 @@ module SVMKit
|
|
105
98
|
# @return [nil]
|
106
99
|
def marshal_load(obj)
|
107
100
|
self.params = obj[:params]
|
108
|
-
@random_mat =
|
109
|
-
@random_vec =
|
101
|
+
@random_mat = obj[:random_mat]
|
102
|
+
@random_vec = obj[:random_vec]
|
110
103
|
@rng = obj[:rng]
|
111
104
|
nil
|
112
105
|
end
|
@@ -115,15 +108,15 @@ module SVMKit
|
|
115
108
|
|
116
109
|
# Generate the uniform random matrix with the given shape.
|
117
110
|
def rand_uniform(shape)
|
118
|
-
rnd_vals = Array.new(
|
119
|
-
|
111
|
+
rnd_vals = Array.new(shape.inject(:*)) { @rng.rand }
|
112
|
+
Numo::DFloat.asarray(rnd_vals).reshape(shape[0], shape[1])
|
120
113
|
end
|
121
114
|
|
122
115
|
# Generate the normal random matrix with the given shape, mean, and standard deviation.
|
123
116
|
def rand_normal(shape, mu = 0.0, sigma = 1.0)
|
124
117
|
a = rand_uniform(shape)
|
125
118
|
b = rand_uniform(shape)
|
126
|
-
((
|
119
|
+
(Numo::NMath.sqrt(Numo::NMath.log(a) * -2.0) * Numo::NMath.sin(b * 2.0 * Math::PI)) * sigma + mu
|
127
120
|
end
|
128
121
|
end
|
129
122
|
end
|
@@ -2,7 +2,7 @@ require 'svmkit/base/base_estimator'
|
|
2
2
|
require 'svmkit/base/classifier'
|
3
3
|
|
4
4
|
module SVMKit
|
5
|
-
# This module consists of the classes that implement
|
5
|
+
# This module consists of the classes that implement kernel method-based estimator.
|
6
6
|
module KernelMachine
|
7
7
|
# KernelSVC is a class that implements (Nonlinear) Kernel Support Vector Classifier with the Pegasos algorithm.
|
8
8
|
#
|
@@ -20,15 +20,8 @@ module SVMKit
|
|
20
20
|
include Base::BaseEstimator
|
21
21
|
include Base::Classifier
|
22
22
|
|
23
|
-
# @!visibility private
|
24
|
-
DEFAULT_PARAMS = {
|
25
|
-
reg_param: 1.0,
|
26
|
-
max_iter: 1000,
|
27
|
-
random_seed: nil
|
28
|
-
}.freeze
|
29
|
-
|
30
23
|
# Return the weight vector for Kernel SVC.
|
31
|
-
# @return [
|
24
|
+
# @return [Numo::DFloat] (shape: [n_training_samples])
|
32
25
|
attr_reader :weight_vec
|
33
26
|
|
34
27
|
# Return the random generator for performing random sampling in the Pegasos algorithm.
|
@@ -37,14 +30,14 @@ module SVMKit
|
|
37
30
|
|
38
31
|
# Create a new classifier with Kernel Support Vector Machine by the Pegasos algorithm.
|
39
32
|
#
|
40
|
-
# @
|
41
|
-
#
|
42
|
-
# @param
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
self.params =
|
33
|
+
# @param reg_param [Float] The regularization parameter.
|
34
|
+
# @param max_iter [Integer] The maximum number of iterations.
|
35
|
+
# @param random_seed [Integer] The seed value used to initialize the random generator.
|
36
|
+
def initialize(reg_param: 1.0, max_iter: 1000, random_seed: nil)
|
37
|
+
self.params = {}
|
38
|
+
self.params[:reg_param] = reg_param
|
39
|
+
self.params[:max_iter] = max_iter
|
40
|
+
self.params[:random_seed] = random_seed
|
48
41
|
self.params[:random_seed] ||= srand
|
49
42
|
@weight_vec = nil
|
50
43
|
@rng = Random.new(self.params[:random_seed])
|
@@ -52,74 +45,74 @@ module SVMKit
|
|
52
45
|
|
53
46
|
# Fit the model with given training data.
|
54
47
|
#
|
55
|
-
# @param x [
|
48
|
+
# @param x [Numo::DFloat] (shape: [n_training_samples, n_training_samples])
|
56
49
|
# The kernel matrix of the training data to be used for fitting the model.
|
57
|
-
# @param y [
|
50
|
+
# @param y [Numo::Int32] (shape: [n_training_samples]) The labels to be used for fitting the model.
|
58
51
|
# @return [KernelSVC] The learned classifier itself.
|
59
52
|
def fit(x, y)
|
60
53
|
# Generate binary labels
|
61
|
-
negative_label = y.uniq.sort.shift
|
62
|
-
bin_y = y.
|
54
|
+
negative_label = y.to_a.uniq.sort.shift
|
55
|
+
bin_y = y.to_a.map { |l| l != negative_label ? 1 : -1 }
|
63
56
|
# Initialize some variables.
|
64
57
|
n_training_samples = x.shape[0]
|
65
58
|
rand_ids = []
|
66
|
-
weight_vec =
|
59
|
+
weight_vec = Numo::DFloat.zeros(n_training_samples)
|
67
60
|
# Start optimization.
|
68
61
|
params[:max_iter].times do |t|
|
69
62
|
# random sampling
|
70
63
|
rand_ids = [*0...n_training_samples].shuffle(random: @rng) if rand_ids.empty?
|
71
64
|
target_id = rand_ids.shift
|
72
65
|
# update the weight vector
|
73
|
-
func = (weight_vec * bin_y[target_id]).dot(x
|
66
|
+
func = (weight_vec * bin_y[target_id]).dot(x[target_id, true].transpose).to_f
|
74
67
|
func *= bin_y[target_id] / (params[:reg_param] * (t + 1))
|
75
68
|
weight_vec[target_id] += 1.0 if func < 1.0
|
76
69
|
end
|
77
70
|
# Store the learned model.
|
78
|
-
@weight_vec = weight_vec *
|
71
|
+
@weight_vec = weight_vec * Numo::DFloat.asarray(bin_y)
|
79
72
|
self
|
80
73
|
end
|
81
74
|
|
82
75
|
# Calculate confidence scores for samples.
|
83
76
|
#
|
84
|
-
# @param x [
|
77
|
+
# @param x [Numo::DFloat] (shape: [n_testing_samples, n_training_samples])
|
85
78
|
# The kernel matrix between testing samples and training samples to compute the scores.
|
86
|
-
# @return [
|
79
|
+
# @return [Numo::DFloat] (shape: [n_testing_samples]) Confidence score per sample.
|
87
80
|
def decision_function(x)
|
88
81
|
@weight_vec.dot(x.transpose)
|
89
82
|
end
|
90
83
|
|
91
84
|
# Predict class labels for samples.
|
92
85
|
#
|
93
|
-
# @param x [
|
86
|
+
# @param x [Numo::DFloat] (shape: [n_testing_samples, n_training_samples])
|
94
87
|
# The kernel matrix between testing samples and training samples to predict the labels.
|
95
|
-
# @return [
|
88
|
+
# @return [Numo::Int32] (shape: [n_testing_samples]) Predicted class label per sample.
|
96
89
|
def predict(x)
|
97
|
-
decision_function(x).map { |v| v >= 0 ? 1 : -1 }
|
90
|
+
Numo::Int32.cast(decision_function(x).map { |v| v >= 0 ? 1 : -1 })
|
98
91
|
end
|
99
92
|
|
100
93
|
# Calculate the mean accuracy of the given testing data.
|
101
94
|
#
|
102
|
-
# @param x [
|
95
|
+
# @param x [Numo::DFloat] (shape: [n_testing_samples, n_training_samples])
|
103
96
|
# The kernel matrix between testing samples and training samples.
|
104
|
-
# @param y [
|
97
|
+
# @param y [Numo::Int32] (shape: [n_testing_samples]) True labels for testing data.
|
105
98
|
# @return [Float] Mean accuracy
|
106
99
|
def score(x, y)
|
107
100
|
p = predict(x)
|
108
|
-
n_hits = (y.
|
101
|
+
n_hits = (y.to_a.map.with_index { |l, n| l == p[n] ? 1 : 0 }).inject(:+)
|
109
102
|
n_hits / y.size.to_f
|
110
103
|
end
|
111
104
|
|
112
105
|
# Dump marshal data.
|
113
106
|
# @return [Hash] The marshal data about KernelSVC.
|
114
107
|
def marshal_dump
|
115
|
-
{ params: params, weight_vec:
|
108
|
+
{ params: params, weight_vec: @weight_vec, rng: @rng }
|
116
109
|
end
|
117
110
|
|
118
111
|
# Load marshal data.
|
119
112
|
# @return [nil]
|
120
113
|
def marshal_load(obj)
|
121
114
|
self.params = obj[:params]
|
122
|
-
@weight_vec =
|
115
|
+
@weight_vec = obj[:weight_vec]
|
123
116
|
@rng = obj[:rng]
|
124
117
|
nil
|
125
118
|
end
|
@@ -31,7 +31,7 @@ module SVMKit
|
|
31
31
|
}.freeze
|
32
32
|
|
33
33
|
# Return the weight vector for Logistic Regression.
|
34
|
-
# @return [
|
34
|
+
# @return [Numo::DFloat] (shape: [n_features])
|
35
35
|
attr_reader :weight_vec
|
36
36
|
|
37
37
|
# Return the bias term (a.k.a. intercept) for Logistic Regression.
|
@@ -44,18 +44,21 @@ module SVMKit
|
|
44
44
|
|
45
45
|
# Create a new classifier with Logistic Regression by the SGD optimization.
|
46
46
|
#
|
47
|
-
# @
|
48
|
-
#
|
49
|
-
# @param
|
50
|
-
# @option params [Float] :reg_param (1.0) The regularization parameter.
|
51
|
-
# @option params [Boolean] :fit_bias (false) The flag indicating whether to fit the bias term.
|
52
|
-
# @option params [Float] :bias_scale (1.0) The scale of the bias term.
|
47
|
+
# @param reg_param [Float] The regularization parameter.
|
48
|
+
# @param fit_bias [Boolean] The flag indicating whether to fit the bias term.
|
49
|
+
# @param bias_scale [Float] The scale of the bias term.
|
53
50
|
# If fit_bias is true, the feature vector v becomes [v; bias_scale].
|
54
|
-
# @
|
55
|
-
# @
|
56
|
-
# @
|
57
|
-
def initialize(
|
58
|
-
self.params =
|
51
|
+
# @param max_iter [Integer] The maximum number of iterations.
|
52
|
+
# @param batch_size [Integer] The size of the mini batches.
|
53
|
+
# @param random_seed [Integer] The seed value used to initialize the random generator.
|
54
|
+
def initialize(reg_param: 1.0, fit_bias: false, bias_scale: 1.0, max_iter: 100, batch_size: 50, random_seed: nil)
|
55
|
+
self.params = {}
|
56
|
+
self.params[:reg_param] = reg_param
|
57
|
+
self.params[:fit_bias] = fit_bias
|
58
|
+
self.params[:bias_scale] = bias_scale
|
59
|
+
self.params[:max_iter] = max_iter
|
60
|
+
self.params[:batch_size] = batch_size
|
61
|
+
self.params[:random_seed] = random_seed
|
59
62
|
self.params[:random_seed] ||= srand
|
60
63
|
@weight_vec = nil
|
61
64
|
@bias_term = 0.0
|
@@ -64,21 +67,25 @@ module SVMKit
|
|
64
67
|
|
65
68
|
# Fit the model with given training data.
|
66
69
|
#
|
67
|
-
# @param x [
|
68
|
-
# @param y [
|
70
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
71
|
+
# @param y [Numo::Int32] (shape: [n_samples]) The categorical variables (e.g. labels)
|
69
72
|
# to be used for fitting the model.
|
70
73
|
# @return [LogisticRegression] The learned classifier itself.
|
71
74
|
def fit(x, y)
|
72
75
|
# Generate binary labels.
|
73
|
-
negative_label = y.uniq.sort.shift
|
74
|
-
bin_y = y.
|
76
|
+
negative_label = y.to_a.uniq.sort.shift
|
77
|
+
bin_y = y.to_a.map { |l| l != negative_label ? 1 : 0 }
|
75
78
|
# Expand feature vectors for bias term.
|
76
79
|
samples = x
|
77
|
-
|
80
|
+
if params[:fit_bias]
|
81
|
+
samples = Numo::NArray.hstack(
|
82
|
+
[samples, Numo::DFloat.ones([x.shape[0], 1]) * params[:bias_scale]]
|
83
|
+
)
|
84
|
+
end
|
78
85
|
# Initialize some variables.
|
79
86
|
n_samples, n_features = samples.shape
|
80
87
|
rand_ids = [*0..n_samples - 1].shuffle(random: @rng)
|
81
|
-
weight_vec =
|
88
|
+
weight_vec = Numo::DFloat.zeros(n_features)
|
82
89
|
# Start optimization.
|
83
90
|
params[:max_iter].times do |t|
|
84
91
|
# random sampling
|
@@ -86,16 +93,17 @@ module SVMKit
|
|
86
93
|
rand_ids.concat(subset_ids)
|
87
94
|
# update the weight vector.
|
88
95
|
eta = 1.0 / (params[:reg_param] * (t + 1))
|
89
|
-
mean_vec =
|
96
|
+
mean_vec = Numo::DFloat.zeros(n_features)
|
90
97
|
subset_ids.each do |n|
|
91
|
-
z = weight_vec.dot(samples
|
98
|
+
z = weight_vec.dot(samples[n, true])
|
92
99
|
coef = bin_y[n] / (1.0 + Math.exp(bin_y[n] * z))
|
93
|
-
mean_vec += samples
|
100
|
+
mean_vec += samples[n, true] * coef
|
94
101
|
end
|
95
102
|
mean_vec *= eta / params[:batch_size]
|
96
103
|
weight_vec = weight_vec * (1.0 - eta * params[:reg_param]) + mean_vec
|
97
104
|
# scale the weight vector.
|
98
|
-
|
105
|
+
norm = Math.sqrt(weight_vec.dot(weight_vec))
|
106
|
+
scaler = (1.0 / params[:reg_param]**0.5) / (norm + 1.0e-12)
|
99
107
|
weight_vec *= [1.0, scaler].min
|
100
108
|
end
|
101
109
|
# Store the learned model.
|
@@ -111,51 +119,51 @@ module SVMKit
|
|
111
119
|
|
112
120
|
# Calculate confidence scores for samples.
|
113
121
|
#
|
114
|
-
# @param x [
|
115
|
-
# @return [
|
122
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
|
123
|
+
# @return [Numo::DFloat] (shape: [n_samples]) Confidence score per sample.
|
116
124
|
def decision_function(x)
|
117
|
-
w = ((@weight_vec.dot(x.transpose) + @bias_term) * -1.0)
|
125
|
+
w = Numo::NMath.exp(((@weight_vec.dot(x.transpose) + @bias_term) * -1.0)) + 1.0
|
118
126
|
w.map { |v| 1.0 / v }
|
119
127
|
end
|
120
128
|
|
121
129
|
# Predict class labels for samples.
|
122
130
|
#
|
123
|
-
# @param x [
|
124
|
-
# @return [
|
131
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
|
132
|
+
# @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
|
125
133
|
def predict(x)
|
126
|
-
decision_function(x).map { |v| v >= 0.5 ? 1 : -1 }
|
134
|
+
Numo::Int32.cast(decision_function(x).map { |v| v >= 0.5 ? 1 : -1 })
|
127
135
|
end
|
128
136
|
|
129
137
|
# Predict probability for samples.
|
130
138
|
#
|
131
|
-
# @param x [
|
132
|
-
# @return [
|
139
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
|
140
|
+
# @return [Numo::DFloat] (shape: [n_samples]) Predicted probability per sample.
|
133
141
|
def predict_proba(x)
|
134
142
|
decision_function(x)
|
135
143
|
end
|
136
144
|
|
137
145
|
# Calculate the mean accuracy of the given testing data.
|
138
146
|
#
|
139
|
-
# @param x [
|
140
|
-
# @param y [
|
147
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) Testing data.
|
148
|
+
# @param y [Numo::Int32] (shape: [n_samples]) True labels for testing data.
|
141
149
|
# @return [Float] Mean accuracy
|
142
150
|
def score(x, y)
|
143
151
|
p = predict(x)
|
144
|
-
n_hits = (y.
|
152
|
+
n_hits = (y.to_a.map.with_index { |l, n| l == p[n] ? 1 : 0 }).inject(:+)
|
145
153
|
n_hits / y.size.to_f
|
146
154
|
end
|
147
155
|
|
148
156
|
# Dump marshal data.
|
149
157
|
# @return [Hash] The marshal data about LogisticRegression.
|
150
158
|
def marshal_dump
|
151
|
-
{ params: params, weight_vec:
|
159
|
+
{ params: params, weight_vec: @weight_vec, bias_term: @bias_term, rng: @rng }
|
152
160
|
end
|
153
161
|
|
154
162
|
# Load marshal data.
|
155
163
|
# @return [nil]
|
156
164
|
def marshal_load(obj)
|
157
165
|
self.params = obj[:params]
|
158
|
-
@weight_vec =
|
166
|
+
@weight_vec = obj[:weight_vec]
|
159
167
|
@bias_term = obj[:bias_term]
|
160
168
|
@rng = obj[:rng]
|
161
169
|
nil
|