svmkit 0.5.2 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 917f85878296b940b497f13253e3d3b03047be8f154d554116c2629aaeea55dd
4
- data.tar.gz: 16308e4638b15a55843f15b4e0d97886f27aae0cc236c59c590a8f9fe7f0e5c6
3
+ metadata.gz: d8486463886064e5aa5169dbed20101a01d01101483226cf96e38b377144e153
4
+ data.tar.gz: 79807cdbe9f10fba17cd91dbcd68c713180fd986e9b6fab575a849c762bb0d2e
5
5
  SHA512:
6
- metadata.gz: d390d3ef0d7b06676e6d3c34479939b4a99ee01472816eacbe49fd3f40224ef5984620dfe6d335fb5b15e7213d3b0d17ba9441766e7cdd08c8bad9bff669db8d
7
- data.tar.gz: ab2239c0d1297e18e31940e763875ac24668d8c4c3f30355f06bc5ed305c247ff0328e1d584c5ab70ce77d4d2f946dcc5f72f1eb4c3a25d9b0dcd38e1d246182
6
+ metadata.gz: 2ec6c45c99bda82813644b7b4a043a4dda27810e24e10b4588182cbc3dea106eb05ac013354177861ab4abf552d89670ea1f0d39d50976a232feedc0daee9030
7
+ data.tar.gz: 373a566f294bab3d6fb232516f5720a877b6e20999c3253ae49044d759f83b06999fef4e259df91b950fe49c395042c1a74675fadb326460a7e65d54048ca325
data/HISTORY.md CHANGED
@@ -1,3 +1,7 @@
1
+ # 0.6.0
2
+ - Add class for Principal Component Analysis.
3
+ - Add class for Non-negative Matrix Factorization.
4
+
1
5
  # 0.5.2
2
6
  - Add class for DBSCAN clustering.
3
7
 
data/README.md CHANGED
@@ -10,7 +10,8 @@ SVMKit provides machine learning algorithms with interfaces similar to Scikit-Le
10
10
  SVMKit currently supports Linear / Kernel Support Vector Machine,
11
11
  Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
12
12
  Naive Bayes, Decision Tree, Random Forest, K-nearest neighbor classifier,
13
- K-Means, DBSCAN and cross-validation.
13
+ K-Means, DBSCAN, Principal Component Analysis, Non-negative Matrix Factorization
14
+ and cross-validation.
14
15
 
15
16
  ## Installation
16
17
 
@@ -39,6 +39,8 @@ require 'svmkit/ensemble/random_forest_classifier'
39
39
  require 'svmkit/ensemble/random_forest_regressor'
40
40
  require 'svmkit/clustering/k_means'
41
41
  require 'svmkit/clustering/dbscan'
42
+ require 'svmkit/decomposition/pca'
43
+ require 'svmkit/decomposition/nmf'
42
44
  require 'svmkit/preprocessing/l2_normalizer'
43
45
  require 'svmkit/preprocessing/min_max_scaler'
44
46
  require 'svmkit/preprocessing/standard_scaler'
@@ -0,0 +1,147 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'svmkit/validation'
4
+ require 'svmkit/base/base_estimator'
5
+ require 'svmkit/base/transformer'
6
+
7
+ module SVMKit
8
+ module Decomposition
9
+ # NMF is a class that implements Non-negative Matrix Factorization.
10
+ #
11
+ # @example
12
+ # decomposer = SVMKit::Decomposition::NMF.new(n_components: 2)
13
+ #   representation = decomposer.fit_transform(samples)
14
+ #
15
+ # *Reference*
16
+ # - W. Xu, X. Liu, and Y. Gong, "Document Clustering Based On Non-negative Matrix Factorization," Proc. SIGIR '03, pp. 267--273, 2003.
17
+ class NMF
18
+ include Base::BaseEstimator
19
+ include Base::Transformer
20
+ include Validation
21
+
22
+ # Returns the factorization matrix.
23
+ # @return [Numo::DFloat] (shape: [n_components, n_features])
24
+ attr_reader :components
25
+
26
+ # Return the random generator.
27
+ # @return [Random]
28
+ attr_reader :rng
29
+
30
+ # Create a new transformer with NMF.
31
+ #
32
+ # @param n_components [Integer] The number of components.
33
+ # @param max_iter [Integer] The maximum number of iterations.
34
+ # @param tol [Float] The tolerance of termination criterion.
35
+ # @param eps [Float] A small value close to zero to avoid zero division error.
36
+ # @param random_seed [Integer] The seed value using to initialize the random generator.
37
+ def initialize(n_components: 2, max_iter: 500, tol: 1.0e-4, eps: 1.0e-16, random_seed: nil)
38
+ check_params_integer(n_components: n_components, max_iter: max_iter)
39
+ check_params_float(tol: tol, eps: eps)
40
+ check_params_type_or_nil(Integer, random_seed: random_seed)
41
+ check_params_positive(n_components: n_components, max_iter: max_iter, tol: tol, eps: eps)
42
+ @params = {}
43
+ @params[:n_components] = n_components
44
+ @params[:max_iter] = max_iter
45
+ @params[:tol] = tol
46
+ @params[:eps] = eps
47
+ @params[:random_seed] = random_seed
48
+ @params[:random_seed] ||= srand
49
+ @components = nil
50
+ @rng = Random.new(@params[:random_seed])
51
+ end
52
+
53
+ # Fit the model with given training data.
54
+ #
55
+ # @overload fit(x) -> NMF
56
+ #
57
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
58
+ # @return [NMF] The learned transformer itself.
59
+ def fit(x, _y = nil)
60
+ check_sample_array(x)
61
+ partial_fit(x)
62
+ self
63
+ end
64
+
65
+ # Fit the model with training data, and then transform them with the learned model.
66
+ #
67
+ # @overload fit_transform(x) -> Numo::DFloat
68
+ #
69
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
70
+ # @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data
71
+ def fit_transform(x, _y = nil)
72
+ check_sample_array(x)
73
+ partial_fit(x)
74
+ end
75
+
76
+ # Transform the given data with the learned model.
77
+ #
78
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The data to be transformed with the learned model.
79
+ # @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data.
80
+ def transform(x)
81
+ check_sample_array(x)
82
+ partial_fit(x, false)
83
+ end
84
+
85
+ # Inverse transform the given transformed data with the learned model.
86
+ #
87
+ # @param z [Numo::DFloat] (shape: [n_samples, n_components]) The data to be restored into original space with the learned model.
88
+ # @return [Numo::DFloat] (shape: [n_samples, n_features]) The restored data.
89
+ def inverse_transform(z)
90
+ check_sample_array(z)
91
+ z.dot(@components)
92
+ end
93
+
94
+ # Dump marshal data.
95
+ # @return [Hash] The marshal data.
96
+ def marshal_dump
97
+ { params: @params,
98
+ components: @components,
99
+ rng: @rng }
100
+ end
101
+
102
+ # Load marshal data.
103
+ # @return [nil]
104
+ def marshal_load(obj)
105
+ @params = obj[:params]
106
+ @components = obj[:components]
107
+ @rng = obj[:rng]
108
+ nil
109
+ end
110
+
111
+ private
112
+
113
+ def partial_fit(x, update_comps = true)
114
+ # initialize some variables.
115
+ n_samples, n_features = x.shape
116
+ scale = Math.sqrt(x.mean / @params[:n_components])
117
+ @components = rand_uniform([@params[:n_components], n_features]) * scale if update_comps
118
+ coefficients = rand_uniform([n_samples, @params[:n_components]]) * scale
119
+ # optimization.
120
+ @params[:max_iter].times do
121
+ # update
122
+ if update_comps
123
+ nume = coefficients.transpose.dot(x)
124
+ deno = (coefficients.transpose.dot(coefficients)).dot(@components) + @params[:eps]
125
+ @components *= (nume / deno)
126
+ end
127
+ nume = x.dot(@components.transpose)
128
+ deno = (coefficients.dot(@components)).dot(@components.transpose) + @params[:eps]
129
+ coefficients *= (nume / deno)
130
+ # normalize
131
+ norm = Numo::NMath.sqrt((@components**2).sum(1)) + @params[:eps]
132
+ @components /= norm.expand_dims(1) if update_comps
133
+ coefficients *= norm
134
+ # check convergence
135
+ err = ((x - coefficients.dot(@components))**2).sum(1).mean
136
+ break if err < @params[:tol]
137
+ end
138
+ coefficients
139
+ end
140
+
141
+ def rand_uniform(shape)
142
+ rnd_vals = Array.new(shape.inject(:*)) { @rng.rand }
143
+ Numo::DFloat.asarray(rnd_vals).reshape(shape[0], shape[1])
144
+ end
145
+ end
146
+ end
147
+ end
@@ -0,0 +1,150 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'svmkit/validation'
4
+ require 'svmkit/base/base_estimator'
5
+ require 'svmkit/base/transformer'
6
+
7
+ module SVMKit
8
+ # Module for matrix decomposition algorithms.
9
+ module Decomposition
10
+ # PCA is a class that implements Principal Component Analysis.
11
+ #
12
+ # @example
13
+ # decomposer = SVMKit::Decomposition::PCA.new(n_components: 2)
14
+ #   representation = decomposer.fit_transform(samples)
15
+ #
16
+ # *Reference*
17
+ # - A. Sharma and K K. Paliwal, "Fast principal component analysis using fixed-point algorithm," Pattern Recognition Letters, 28, pp. 1151--1155, 2007.
18
+ class PCA
19
+ include Base::BaseEstimator
20
+ include Base::Transformer
21
+ include Validation
22
+
23
+ # Returns the principal components.
24
+ # @return [Numo::DFloat] (shape: [n_components, n_features])
25
+ attr_reader :components
26
+
27
+ # Returns the mean vector.
28
+ # @return [Numo::DFloat] (shape: [n_features])
29
+ attr_reader :mean
30
+
31
+ # Return the random generator.
32
+ # @return [Random]
33
+ attr_reader :rng
34
+
35
+ # Create a new transformer with PCA.
36
+ #
37
+ # @param n_components [Integer] The number of principal components.
38
+ # @param max_iter [Integer] The maximum number of iterations.
39
+ # @param tol [Float] The tolerance of termination criterion.
40
+ # @param random_seed [Integer] The seed value using to initialize the random generator.
41
+ def initialize(n_components: 2, max_iter: 100, tol: 1.0e-4, random_seed: nil)
42
+ check_params_integer(n_components: n_components, max_iter: max_iter)
43
+ check_params_float(tol: tol)
44
+ check_params_type_or_nil(Integer, random_seed: random_seed)
45
+ check_params_positive(n_components: n_components, max_iter: max_iter, tol: tol)
46
+ @params = {}
47
+ @params[:n_components] = n_components
48
+ @params[:max_iter] = max_iter
49
+ @params[:tol] = tol
50
+ @params[:random_seed] = random_seed
51
+ @params[:random_seed] ||= srand
52
+ @components = nil
53
+ @mean = nil
54
+ @rng = Random.new(@params[:random_seed])
55
+ end
56
+
57
+ # Fit the model with given training data.
58
+ #
59
+ # @overload fit(x) -> PCA
60
+ #
61
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
62
+ # @return [PCA] The learned transformer itself.
63
+ def fit(x, _y = nil)
64
+ check_sample_array(x)
65
+ # initialize some variables.
66
+ @components = nil
67
+ n_samples, n_features = x.shape
68
+ # centering.
69
+ @mean = x.mean(0)
70
+ centered_x = x - @mean
71
+ # optimization.
72
+ covariance_mat = centered_x.transpose.dot(centered_x) / (n_samples - 1)
73
+ @params[:n_components].times do
74
+ comp_vec = random_vec(n_features)
75
+ @params[:max_iter].times do
76
+ updated = orthogonalize(covariance_mat.dot(comp_vec))
77
+ break if (updated.dot(comp_vec) - 1).abs < @params[:tol]
78
+ comp_vec = updated
79
+ end
80
+ @components = @components.nil? ? comp_vec : Numo::NArray.vstack([@components, comp_vec])
81
+ end
82
+ self
83
+ end
84
+
85
+ # Fit the model with training data, and then transform them with the learned model.
86
+ #
87
+ # @overload fit_transform(x) -> Numo::DFloat
88
+ #
89
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
90
+ # @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data
91
+ def fit_transform(x, _y = nil)
92
+ check_sample_array(x)
93
+ fit(x).transform(x)
94
+ end
95
+
96
+ # Transform the given data with the learned model.
97
+ #
98
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The data to be transformed with the learned model.
99
+ # @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data.
100
+ def transform(x)
101
+ check_sample_array(x)
102
+ (x - @mean).dot(@components.transpose)
103
+ end
104
+
105
+ # Inverse transform the given transformed data with the learned model.
106
+ #
107
+ # @param z [Numo::DFloat] (shape: [n_samples, n_components]) The data to be restored into original space with the learned model.
108
+ # @return [Numo::DFloat] (shape: [n_samples, n_features]) The restored data.
109
+ def inverse_transform(z)
110
+ check_sample_array(z)
111
+ c = @components.shape[1].nil? ? @components.expand_dims(0) : @components
112
+ z.dot(c) + @mean
113
+ end
114
+
115
+ # Dump marshal data.
116
+ # @return [Hash] The marshal data.
117
+ def marshal_dump
118
+ { params: @params,
119
+ components: @components,
120
+ mean: @mean,
121
+ rng: @rng }
122
+ end
123
+
124
+ # Load marshal data.
125
+ # @return [nil]
126
+ def marshal_load(obj)
127
+ @params = obj[:params]
128
+ @components = obj[:components]
129
+ @mean = obj[:mean]
130
+ @rng = obj[:rng]
131
+ nil
132
+ end
133
+
134
+ private
135
+
136
+ def orthogonalize(pcvec)
137
+ unless @components.nil?
138
+ delta = @components.dot(pcvec) * @components.transpose
139
+ delta = delta.sum(1) unless delta.shape[1].nil?
140
+ pcvec -= delta
141
+ end
142
+ pcvec / Math.sqrt((pcvec**2).sum.abs) + 1.0e-12
143
+ end
144
+
145
+ def random_vec(n_features)
146
+ Numo::DFloat[*(Array.new(n_features) { @rng.rand })]
147
+ end
148
+ end
149
+ end
150
+ end
@@ -3,5 +3,5 @@
3
3
  # SVMKit is a machine learning library in Ruby.
4
4
  module SVMKit
5
5
  # @!visibility private
6
- VERSION = '0.5.2'.freeze
6
+ VERSION = '0.6.0'.freeze
7
7
  end
@@ -18,7 +18,8 @@ SVMKit provides machine learning algorithms with interfaces similar to Scikit-Le
18
18
  SVMKit currently supports Linear / Kernel Support Vector Machine,
19
19
  Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
20
20
  Naive Bayes, Decision Tree, Random Forest, K-nearest neighbor algorithm,
21
- K-Means, DBSCAN and cross-validation.
21
+ K-Means, DBSCAN, Principal Component Analysis, Non-negative Matrix Factorization
22
+ and cross-validation.
22
23
  MSG
23
24
  spec.homepage = 'https://github.com/yoshoku/svmkit'
24
25
  spec.license = 'BSD-2-Clause'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: svmkit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.2
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-06-23 00:00:00.000000000 Z
11
+ date: 2018-06-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: numo-narray
@@ -86,7 +86,8 @@ description: |
86
86
  SVMKit currently supports Linear / Kernel Support Vector Machine,
87
87
  Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
88
88
  Naive Bayes, Decision Tree, Random Forest, K-nearest neighbor algorithm,
89
- K-Means, DBSCAN and cross-validation.
89
+ K-Means, DBSCAN, Principal Component Analysis, Non-negative Matrix Factorization
90
+ and cross-validation.
90
91
  email:
91
92
  - yoshoku@outlook.com
92
93
  executables: []
@@ -118,6 +119,8 @@ files:
118
119
  - lib/svmkit/clustering/dbscan.rb
119
120
  - lib/svmkit/clustering/k_means.rb
120
121
  - lib/svmkit/dataset.rb
122
+ - lib/svmkit/decomposition/nmf.rb
123
+ - lib/svmkit/decomposition/pca.rb
121
124
  - lib/svmkit/ensemble/random_forest_classifier.rb
122
125
  - lib/svmkit/ensemble/random_forest_regressor.rb
123
126
  - lib/svmkit/evaluation_measure/accuracy.rb