svmkit 0.7.3 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -9
- data/.rspec +1 -0
- data/.travis.yml +4 -12
- data/LICENSE.txt +1 -1
- data/README.md +11 -13
- data/lib/svmkit.rb +3 -66
- data/svmkit.gemspec +12 -7
- metadata +16 -81
- data/.coveralls.yml +0 -1
- data/.rubocop.yml +0 -47
- data/.rubocop_todo.yml +0 -58
- data/HISTORY.md +0 -168
- data/lib/svmkit/base/base_estimator.rb +0 -13
- data/lib/svmkit/base/classifier.rb +0 -34
- data/lib/svmkit/base/cluster_analyzer.rb +0 -29
- data/lib/svmkit/base/evaluator.rb +0 -13
- data/lib/svmkit/base/regressor.rb +0 -34
- data/lib/svmkit/base/splitter.rb +0 -17
- data/lib/svmkit/base/transformer.rb +0 -18
- data/lib/svmkit/clustering/dbscan.rb +0 -127
- data/lib/svmkit/clustering/k_means.rb +0 -140
- data/lib/svmkit/dataset.rb +0 -109
- data/lib/svmkit/decomposition/nmf.rb +0 -147
- data/lib/svmkit/decomposition/pca.rb +0 -150
- data/lib/svmkit/ensemble/ada_boost_classifier.rb +0 -198
- data/lib/svmkit/ensemble/ada_boost_regressor.rb +0 -180
- data/lib/svmkit/ensemble/random_forest_classifier.rb +0 -182
- data/lib/svmkit/ensemble/random_forest_regressor.rb +0 -143
- data/lib/svmkit/evaluation_measure/accuracy.rb +0 -30
- data/lib/svmkit/evaluation_measure/f_score.rb +0 -51
- data/lib/svmkit/evaluation_measure/log_loss.rb +0 -46
- data/lib/svmkit/evaluation_measure/mean_absolute_error.rb +0 -30
- data/lib/svmkit/evaluation_measure/mean_squared_error.rb +0 -30
- data/lib/svmkit/evaluation_measure/normalized_mutual_information.rb +0 -63
- data/lib/svmkit/evaluation_measure/precision.rb +0 -51
- data/lib/svmkit/evaluation_measure/precision_recall.rb +0 -91
- data/lib/svmkit/evaluation_measure/purity.rb +0 -41
- data/lib/svmkit/evaluation_measure/r2_score.rb +0 -44
- data/lib/svmkit/evaluation_measure/recall.rb +0 -51
- data/lib/svmkit/kernel_approximation/rbf.rb +0 -136
- data/lib/svmkit/kernel_machine/kernel_svc.rb +0 -194
- data/lib/svmkit/linear_model/lasso.rb +0 -138
- data/lib/svmkit/linear_model/linear_regression.rb +0 -112
- data/lib/svmkit/linear_model/logistic_regression.rb +0 -161
- data/lib/svmkit/linear_model/ridge.rb +0 -112
- data/lib/svmkit/linear_model/sgd_linear_estimator.rb +0 -89
- data/lib/svmkit/linear_model/svc.rb +0 -184
- data/lib/svmkit/linear_model/svr.rb +0 -123
- data/lib/svmkit/model_selection/cross_validation.rb +0 -121
- data/lib/svmkit/model_selection/grid_search_cv.rb +0 -247
- data/lib/svmkit/model_selection/k_fold.rb +0 -77
- data/lib/svmkit/model_selection/stratified_k_fold.rb +0 -95
- data/lib/svmkit/multiclass/one_vs_rest_classifier.rb +0 -101
- data/lib/svmkit/naive_bayes/naive_bayes.rb +0 -316
- data/lib/svmkit/nearest_neighbors/k_neighbors_classifier.rb +0 -112
- data/lib/svmkit/nearest_neighbors/k_neighbors_regressor.rb +0 -94
- data/lib/svmkit/optimizer/nadam.rb +0 -90
- data/lib/svmkit/optimizer/rmsprop.rb +0 -69
- data/lib/svmkit/optimizer/sgd.rb +0 -65
- data/lib/svmkit/optimizer/yellow_fin.rb +0 -144
- data/lib/svmkit/pairwise_metric.rb +0 -91
- data/lib/svmkit/pipeline/pipeline.rb +0 -197
- data/lib/svmkit/polynomial_model/factorization_machine_classifier.rb +0 -262
- data/lib/svmkit/polynomial_model/factorization_machine_regressor.rb +0 -194
- data/lib/svmkit/preprocessing/l2_normalizer.rb +0 -63
- data/lib/svmkit/preprocessing/label_encoder.rb +0 -95
- data/lib/svmkit/preprocessing/min_max_scaler.rb +0 -93
- data/lib/svmkit/preprocessing/one_hot_encoder.rb +0 -99
- data/lib/svmkit/preprocessing/standard_scaler.rb +0 -87
- data/lib/svmkit/probabilistic_output.rb +0 -112
- data/lib/svmkit/tree/decision_tree_classifier.rb +0 -276
- data/lib/svmkit/tree/decision_tree_regressor.rb +0 -251
- data/lib/svmkit/tree/node.rb +0 -70
- data/lib/svmkit/utils.rb +0 -22
- data/lib/svmkit/validation.rb +0 -79
- data/lib/svmkit/values.rb +0 -13
- data/lib/svmkit/version.rb +0 -7
@@ -1,140 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'svmkit/validation'
|
4
|
-
require 'svmkit/base/base_estimator'
|
5
|
-
require 'svmkit/base/cluster_analyzer'
|
6
|
-
require 'svmkit/pairwise_metric'
|
7
|
-
|
8
|
-
module SVMKit
  # This module consists of classes that implement cluster analysis methods.
  module Clustering
    # KMeans is a class that implements K-Means cluster analysis.
    # The current implementation uses the Euclidean distance for analyzing the clusters.
    #
    # @example
    #   analyzer = SVMKit::Clustering::KMeans.new(n_clusters: 10, max_iter: 50)
    #   cluster_labels = analyzer.fit_predict(samples)
    #
    # *Reference*
    # - D. Arthur and S. Vassilvitskii, "k-means++: the advantages of careful seeding," Proc. SODA'07, pp. 1027--1035, 2007.
    class KMeans
      include Base::BaseEstimator
      include Base::ClusterAnalyzer
      include Validation

      # Return the centroids.
      # @return [Numo::DFloat] (shape: [n_clusters, n_features])
      attr_reader :cluster_centers

      # Return the random generator.
      # @return [Random]
      attr_reader :rng

      # Create a new cluster analyzer with K-Means method.
      #
      # @param n_clusters [Integer] The number of clusters.
      # @param init [String] The initialization method for centroids ('random' or 'k-means++').
      #   Any value other than 'random' is treated as 'k-means++'.
      # @param max_iter [Integer] The maximum number of iterations.
      # @param tol [Float] The tolerance of termination criterion.
      # @param random_seed [Integer] The seed value used to initialize the random generator.
      def initialize(n_clusters: 8, init: 'k-means++', max_iter: 50, tol: 1.0e-4, random_seed: nil)
        check_params_integer(n_clusters: n_clusters, max_iter: max_iter)
        check_params_float(tol: tol)
        check_params_string(init: init)
        check_params_type_or_nil(Integer, random_seed: random_seed)
        check_params_positive(n_clusters: n_clusters, max_iter: max_iter)
        @params = {}
        @params[:n_clusters] = n_clusters
        @params[:init] = init == 'random' ? 'random' : 'k-means++'
        @params[:max_iter] = max_iter
        @params[:tol] = tol
        @params[:random_seed] = random_seed
        # When no seed is given, take one from Kernel#srand so the value is recorded in the parameters.
        @params[:random_seed] ||= srand
        @cluster_centers = nil
        @rng = Random.new(@params[:random_seed])
      end

      # Analyze clusters with given training data.
      #
      # @overload fit(x) -> KMeans
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
      # @return [KMeans] The learned cluster analyzer itself.
      def fit(x, _y = nil)
        check_sample_array(x)
        init_cluster_centers(x)
        @params[:max_iter].times do
          cluster_labels = assign_cluster(x)
          old_centers = @cluster_centers.dup
          @params[:n_clusters].times do |n|
            assigned_bits = cluster_labels.eq(n)
            # A cluster without any assigned sample keeps its previous centroid.
            next unless assigned_bits.count > 0

            @cluster_centers[n, true] = x[assigned_bits.where, true].mean(axis: 0)
          end
          # Mean Euclidean displacement of the centroids in this iteration.
          error = Numo::NMath.sqrt(((old_centers - @cluster_centers)**2).sum(axis: 1)).mean
          break if error <= @params[:tol]
        end
        self
      end

      # Predict cluster labels for samples.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the cluster label.
      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
      def predict(x)
        check_sample_array(x)
        assign_cluster(x)
      end

      # Analyze clusters and assign samples to clusters.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
      def fit_predict(x)
        # fit and predict both validate x, so no extra check is needed here.
        fit(x).predict(x)
      end

      # Dump marshal data.
      # @return [Hash] The marshal data.
      def marshal_dump
        { params: @params,
          cluster_centers: @cluster_centers,
          rng: @rng }
      end

      # Load marshal data.
      # @return [nil]
      def marshal_load(obj)
        @params = obj[:params]
        @cluster_centers = obj[:cluster_centers]
        @rng = obj[:rng]
        nil
      end

      private

      # Return, for every sample, the index of the nearest centroid.
      def assign_cluster(x)
        distance_matrix = PairwiseMetric.euclidean_distance(x, @cluster_centers)
        # min_index(axis: 1) yields positions in the flattened matrix; subtracting the
        # start offset of each row (0, n_centers, 2 * n_centers, ...) leaves the column index.
        row_offsets = Numo::Int32[*0.step(distance_matrix.size - 1, @cluster_centers.shape[0])]
        distance_matrix.min_index(axis: 1) - row_offsets
      end

      # Choose the initial centroids from the samples.
      #
      # The centroids are first drawn at random from the samples. With 'k-means++' the
      # centroids after the first one are then re-drawn one by one, with probability
      # proportional to the squared distance to the nearest centroid chosen so far.
      def init_cluster_centers(x)
        # random initialize
        n_samples = x.shape[0]
        rand_id = [*0...n_samples].sample(@params[:n_clusters], random: @rng)
        @cluster_centers = x[rand_id, true].dup
        return unless @params[:init] == 'k-means++'

        # k-means++ initialize
        (1...@params[:n_clusters]).each do |n|
          distance_matrix = PairwiseMetric.euclidean_distance(x, @cluster_centers[0...n, true])
          min_distances = distance_matrix.flatten[distance_matrix.min_index(axis: 1)]
          sq_distances = min_distances**2
          cum_probs = (sq_distances / sq_distances.sum).cumsum
          selected_id = cum_probs.gt(@rng.rand).where.to_a.first
          # No cumulative probability exceeds the draw when all distances are zero
          # (0 / 0 gives NaN) or when rounding leaves the last value just below the draw.
          # Fall back to the sample farthest from the current centroids instead of indexing with nil.
          selected_id ||= min_distances.max_index
          @cluster_centers[n, true] = x[selected_id, true].dup
        end
      end
    end
  end
end
|
data/lib/svmkit/dataset.rb
DELETED
@@ -1,109 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'csv'
|
4
|
-
|
5
|
-
module SVMKit
  # Module for loading and saving a dataset file.
  module Dataset
    class << self
      # Load a dataset with the libsvm file format into Numo::NArray.
      #
      # @param filename [String] A path to a dataset file.
      # @param zero_based [Boolean] Whether the column index starts from 0 (true) or 1 (false).
      #
      # @return [Array<Numo::NArray>]
      #   Returns array containing the (n_samples x n_features) matrix for feature vectors
      #   and (n_samples) vector for labels or target values.
      def load_libsvm_file(filename, zero_based: false)
        ftvecs = []
        labels = []
        # Largest zero-based column index seen in the file; -1 while no feature has been read.
        last_column = -1
        CSV.foreach(filename, col_sep: "\s", headers: false) do |line|
          label, ftvec, max_idx = parse_libsvm_line(line, zero_based)
          labels.push(label)
          ftvecs.push(ftvec)
          last_column = max_idx if last_column < max_idx
        end
        # The number of columns is the largest index plus one, so that every row is
        # built with the full width instead of depending on ragged rows being padded.
        [convert_to_matrix(ftvecs, last_column + 1), Numo::NArray.asarray(labels)]
      end

      # Dump the dataset with the libsvm file format.
      #
      # @param data [Numo::NArray] (shape: [n_samples, n_features]) matrix consisting of feature vectors.
      # @param labels [Numo::NArray] (shape: [n_samples]) matrix consisting of labels or target values.
      # @param filename [String] A path to the output libsvm file.
      # @param zero_based [Boolean] Whether the column index starts from 0 (true) or 1 (false).
      def dump_libsvm_file(data, labels, filename, zero_based: false)
        n_samples = [data.shape[0], labels.shape[0]].min
        # A one-dimensional label array holds a single label per sample.
        single_label = labels.shape[1].nil?
        label_type = detect_dtype(labels)
        value_type = detect_dtype(data)
        File.open(filename, 'w') do |file|
          n_samples.times do |n|
            label = single_label ? labels[n] : labels[n, true].to_a
            file.puts(dump_libsvm_line(label, data[n, true],
                                       label_type, value_type, zero_based))
          end
        end
      end

      private

      # Split one parsed row into its label, its [index, value] pairs and the largest
      # zero-based index found (-1 when the row has no features).
      # The given row array is consumed.
      def parse_libsvm_line(line, zero_based)
        label = parse_label(line.shift)
        adj_idx = zero_based == false ? 1 : 0
        max_idx = -1
        ftvec = []
        while (el = line.shift)
          idx, val = el.split(':')
          idx = idx.to_i - adj_idx
          max_idx = idx if max_idx < idx
          ftvec.push([idx, parse_number(val)])
        end
        [label, ftvec, max_idx]
      end

      # Parse a label field; comma separated labels are returned as an Array.
      def parse_label(label)
        lbl_arr = label.split(',').map { |lbl| parse_number(lbl) }
        lbl_arr.size > 1 ? lbl_arr : lbl_arr[0]
      end

      # Convert a numeric token into an Integer when it is written as an optionally
      # signed run of digits (so that "+1" and "-1" both become integers), and into
      # a Float otherwise.
      def parse_number(str)
        str =~ /\A[+-]?\d+\z/ ? str.to_i : str.to_f
      end

      # Expand the sparse [index, value] rows into a dense matrix with n_features columns.
      def convert_to_matrix(data, n_features)
        mat = data.map do |ft|
          vec = Array.new(n_features) { 0 }
          ft.each { |idx, val| vec[idx] = val }
          vec
        end
        Numo::NArray.asarray(mat)
      end

      # Choose the format directive used to print the elements of the given array.
      def detect_dtype(data)
        arr_type_str = Numo::NArray.array_type(data).to_s
        case arr_type_str
        when 'Numo::Int8', 'Numo::Int16', 'Numo::Int32', 'Numo::Int64',
             'Numo::UInt8', 'Numo::UInt16', 'Numo::UInt32', 'Numo::UInt64'
          '%d'
        when 'Numo::SFloat', 'Numo::DFloat'
          '%.10g'
        else
          '%s'
        end
      end

      # Build one output line: the label followed by " index:value" for every non-zero value.
      def dump_libsvm_line(label, ftvec, label_type, value_type, zero_based)
        line = dump_label(label, label_type.to_s)
        first_idx = zero_based == false ? 1 : 0
        ftvec.to_a.each_with_index do |val, n|
          line += format(" %d:#{value_type}", n + first_idx, val) if val != 0.0
        end
        line
      end

      # Format a single label, or join multiple labels with commas.
      def dump_label(label, label_type_str)
        return format(label_type_str, label) unless label.is_a?(Array)

        label.map { |lbl| format(label_type_str, lbl) }.join(',')
      end
    end
  end
end
|
@@ -1,147 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'svmkit/validation'
|
4
|
-
require 'svmkit/base/base_estimator'
|
5
|
-
require 'svmkit/base/transformer'
|
6
|
-
|
7
|
-
module SVMKit
  module Decomposition
    # NMF is a class that implements Non-negative Matrix Factorization.
    #
    # @example
    #   decomposer = SVMKit::Decomposition::NMF.new(n_components: 2)
    #   representation = decomposer.fit_transform(samples)
    #
    # *Reference*
    # - W. Xu, X. Liu, and Y.Gong, "Document Clustering Based On Non-negative Matrix Factorization," Proc. SIGIR' 03 , pp. 267--273, 2003.
    class NMF
      include Base::BaseEstimator
      include Base::Transformer
      include Validation

      # Returns the factorization matrix.
      # @return [Numo::DFloat] (shape: [n_components, n_features])
      attr_reader :components

      # Return the random generator.
      # @return [Random]
      attr_reader :rng

      # Create a new transformer with NMF.
      #
      # @param n_components [Integer] The number of components.
      # @param max_iter [Integer] The maximum number of iterations.
      # @param tol [Float] The tolerance of termination criterion.
      # @param eps [Float] A small value close to zero to avoid zero division error.
      # @param random_seed [Integer] The seed value used to initialize the random generator.
      def initialize(n_components: 2, max_iter: 500, tol: 1.0e-4, eps: 1.0e-16, random_seed: nil)
        check_params_integer(n_components: n_components, max_iter: max_iter)
        check_params_float(tol: tol, eps: eps)
        check_params_type_or_nil(Integer, random_seed: random_seed)
        check_params_positive(n_components: n_components, max_iter: max_iter, tol: tol, eps: eps)
        @params = {}
        @params[:n_components] = n_components
        @params[:max_iter] = max_iter
        @params[:tol] = tol
        @params[:eps] = eps
        @params[:random_seed] = random_seed
        # When no seed is given, take one from Kernel#srand so the value is recorded in the parameters.
        @params[:random_seed] ||= srand
        @components = nil
        @rng = Random.new(@params[:random_seed])
      end

      # Fit the model with given training data.
      #
      # @overload fit(x) -> NMF
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
      # @return [NMF] The learned transformer itself.
      def fit(x, _y = nil)
        check_sample_array(x)
        partial_fit(x)
        self
      end

      # Fit the model with training data, and then transform them with the learned model.
      #
      # @overload fit_transform(x) -> Numo::DFloat
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
      # @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data
      def fit_transform(x, _y = nil)
        check_sample_array(x)
        partial_fit(x)
      end

      # Transform the given data with the learned model.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The data to be transformed with the learned model.
      # @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data.
      def transform(x)
        check_sample_array(x)
        # Only the coefficients are optimized; the learned components stay fixed.
        partial_fit(x, false)
      end

      # Inverse transform the given transformed data with the learned model.
      #
      # @param z [Numo::DFloat] (shape: [n_samples, n_components]) The data to be restored into original space with the learned model.
      # @return [Numo::DFloat] (shape: [n_samples, n_features]) The restored data.
      def inverse_transform(z)
        check_sample_array(z)
        z.dot(@components)
      end

      # Dump marshal data.
      # @return [Hash] The marshal data.
      def marshal_dump
        { params: @params,
          components: @components,
          rng: @rng }
      end

      # Load marshal data.
      # @return [nil]
      def marshal_load(obj)
        @params = obj[:params]
        @components = obj[:components]
        @rng = obj[:rng]
        nil
      end

      private

      # Run the multiplicative update iterations and return the coefficient matrix
      # (shape: [n_samples, n_components]).
      #
      # When update_comps is true the components are re-initialized and optimized
      # together with the coefficients; when false the existing components are left
      # untouched and only the coefficients are optimized.
      def partial_fit(x, update_comps = true)
        # initialize some variables.
        n_samples, n_features = x.shape
        scale = Math.sqrt(x.mean / @params[:n_components])
        @components = rand_uniform([@params[:n_components], n_features]) * scale if update_comps
        coefficients = rand_uniform([n_samples, @params[:n_components]]) * scale
        # optimization.
        @params[:max_iter].times do
          # update
          if update_comps
            nume = coefficients.transpose.dot(x)
            deno = coefficients.transpose.dot(coefficients).dot(@components) + @params[:eps]
            @components *= (nume / deno)
          end
          nume = x.dot(@components.transpose)
          deno = coefficients.dot(@components).dot(@components.transpose) + @params[:eps]
          coefficients *= (nume / deno)
          # normalize
          # Dividing each component by its norm and multiplying the matching coefficient
          # column by the same norm keeps coefficients.dot(@components) unchanged, so the
          # two steps are only applied together. With fixed components neither is applied.
          if update_comps
            norm = Numo::NMath.sqrt((@components**2).sum(1)) + @params[:eps]
            @components /= norm.expand_dims(1)
            coefficients *= norm
          end
          # check convergence
          err = ((x - coefficients.dot(@components))**2).sum(1).mean
          break if err < @params[:tol]
        end
        coefficients
      end

      # Return a matrix of the given two-element shape filled with values drawn from @rng.
      def rand_uniform(shape)
        rnd_vals = Array.new(shape.inject(:*)) { @rng.rand }
        Numo::DFloat.asarray(rnd_vals).reshape(shape[0], shape[1])
      end
    end
  end
end
|
@@ -1,150 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'svmkit/validation'
|
4
|
-
require 'svmkit/base/base_estimator'
|
5
|
-
require 'svmkit/base/transformer'
|
6
|
-
|
7
|
-
module SVMKit
  # Module for matrix decomposition algorithms.
  module Decomposition
    # PCA is a class that implements Principal Component Analysis.
    #
    # @example
    #   decomposer = SVMKit::Decomposition::PCA.new(n_components: 2)
    #   representation = decomposer.fit_transform(samples)
    #
    # *Reference*
    # - A. Sharma and K K. Paliwal, "Fast principal component analysis using fixed-point algorithm," Pattern Recognition Letters, 28, pp. 1151--1155, 2007.
    class PCA
      include Base::BaseEstimator
      include Base::Transformer
      include Validation

      # Returns the principal components.
      # @return [Numo::DFloat] (shape: [n_components, n_features])
      attr_reader :components

      # Returns the mean vector.
      # @return [Numo::DFloat] (shape: [n_features])
      attr_reader :mean

      # Return the random generator.
      # @return [Random]
      attr_reader :rng

      # Create a new transformer with PCA.
      #
      # @param n_components [Integer] The number of principal components.
      # @param max_iter [Integer] The maximum number of iterations.
      # @param tol [Float] The tolerance of termination criterion.
      # @param random_seed [Integer] The seed value used to initialize the random generator.
      def initialize(n_components: 2, max_iter: 100, tol: 1.0e-4, random_seed: nil)
        check_params_integer(n_components: n_components, max_iter: max_iter)
        check_params_float(tol: tol)
        check_params_type_or_nil(Integer, random_seed: random_seed)
        check_params_positive(n_components: n_components, max_iter: max_iter, tol: tol)
        @params = {}
        @params[:n_components] = n_components
        @params[:max_iter] = max_iter
        @params[:tol] = tol
        @params[:random_seed] = random_seed
        # When no seed is given, take one from Kernel#srand so the value is recorded in the parameters.
        @params[:random_seed] ||= srand
        @components = nil
        @mean = nil
        @rng = Random.new(@params[:random_seed])
      end

      # Fit the model with given training data.
      #
      # @overload fit(x) -> PCA
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
      # @return [PCA] The learned transformer itself.
      def fit(x, _y = nil)
        check_sample_array(x)
        # initialize some variables.
        @components = nil
        n_samples, n_features = x.shape
        # centering.
        @mean = x.mean(0)
        centered_x = x - @mean
        # optimization.
        covariance_mat = centered_x.transpose.dot(centered_x) / (n_samples - 1)
        @params[:n_components].times do
          comp_vec = random_vec(n_features)
          @params[:max_iter].times do
            updated = orthogonalize(covariance_mat.dot(comp_vec))
            # Both vectors have unit length, so a dot product close to 1 means the
            # direction no longer changes.
            break if (updated.dot(comp_vec) - 1).abs < @params[:tol]

            comp_vec = updated
          end
          # The first component is stored as a vector; later ones are stacked as rows.
          @components = @components.nil? ? comp_vec : Numo::NArray.vstack([@components, comp_vec])
        end
        self
      end

      # Fit the model with training data, and then transform them with the learned model.
      #
      # @overload fit_transform(x) -> Numo::DFloat
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
      # @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data
      def fit_transform(x, _y = nil)
        check_sample_array(x)
        fit(x).transform(x)
      end

      # Transform the given data with the learned model.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The data to be transformed with the learned model.
      # @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data.
      def transform(x)
        check_sample_array(x)
        (x - @mean).dot(@components.transpose)
      end

      # Inverse transform the given transformed data with the learned model.
      #
      # @param z [Numo::DFloat] (shape: [n_samples, n_components]) The data to be restored into original space with the learned model.
      # @return [Numo::DFloat] (shape: [n_samples, n_features]) The restored data.
      def inverse_transform(z)
        check_sample_array(z)
        # A single learned component is stored as a vector; treat it as a one-row matrix.
        c = @components.shape[1].nil? ? @components.expand_dims(0) : @components
        z.dot(c) + @mean
      end

      # Dump marshal data.
      # @return [Hash] The marshal data.
      def marshal_dump
        { params: @params,
          components: @components,
          mean: @mean,
          rng: @rng }
      end

      # Load marshal data.
      # @return [nil]
      def marshal_load(obj)
        @params = obj[:params]
        @components = obj[:components]
        @mean = obj[:mean]
        @rng = obj[:rng]
        nil
      end

      private

      # Remove from pcvec its projection onto the components found so far and
      # rescale the remainder to unit length.
      def orthogonalize(pcvec)
        unless @components.nil?
          delta = @components.dot(pcvec) * @components.transpose
          # With several components the per-component projections are columns; add them up.
          delta = delta.sum(1) unless delta.shape[1].nil?
          pcvec -= delta
        end
        # The small constant guards the division against a zero norm, so it belongs
        # to the denominator rather than being added to every element of the result.
        pcvec / (Math.sqrt((pcvec**2).sum.abs) + 1.0e-12)
      end

      # Return a vector of n_features values drawn from @rng.
      def random_vec(n_features)
        Numo::DFloat[*(Array.new(n_features) { @rng.rand })]
      end
    end
  end
end
|