rumale 0.13.4 → 0.13.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 522eaabfd67ced29bf275fb6f5cec019ff60e3d5
4
- data.tar.gz: 0eb97f58c3764bdcbf448f9a392f8f5091ce418d
3
+ metadata.gz: b9c82fecc8a90ec9d4379249b749e9257b2738c3
4
+ data.tar.gz: 1b4ca375174ee9d1c16a35d50e68ee9b5bd471c5
5
5
  SHA512:
6
- metadata.gz: bf5a3caf614b08813aa4b11673da758778191847ba6fe4c4144cae7da1dd8e4b3ec3eac1367d54b78a00a7afd5ae1ae047fa84c58954b0e7d0571a9442a10380
7
- data.tar.gz: 8bdb25aaec7304f12595673d3fa915cc1739ef14fedd89210205f53325de5a96076b4cf718f8d7ca15fdec1f55d5f9c65cd256781ec2d60462a48221525ad068
6
+ metadata.gz: 3d9bc3c21b951f5738cbf8a2e947d94d6fd735ebbcdc41fdb450d8c4f4f28a9788100c595b1b29bbe81e7124f0ca2b5703c4d6107730fab6f9ab6fe67db4fa1d
7
+ data.tar.gz: 2f93f78169dd8f694ca65634f3e4619ad83df72a59d0e2eacd8c686b2fdc82011f53b6b49e7c18e1c9a9d86904c77709d9743689e39e8db697f469a77a336fe7
@@ -1,3 +1,7 @@
1
+ # 0.13.5
2
+ - Add transformer class for [Factor Analysis](https://yoshoku.github.io/rumale/doc/Rumale/Decomposition/FactorAnalysis.html).
3
+ - Add covariance_type parameter to [Rumale::Clustering::GaussianMixture](https://yoshoku.github.io/rumale/doc/Rumale/Clustering/GaussianMixture.html).
4
+
1
5
  # 0.13.4
2
6
  - Add cluster analysis class for [HDBSCAN](https://yoshoku.github.io/rumale/doc/Rumale/Clustering/HDBSCAN.html).
3
7
  - Add cluster analysis class for [spectral clustering](https://yoshoku.github.io/rumale/doc/Rumale/Clustering/SpectralClustering.html).
@@ -69,6 +69,7 @@ require 'rumale/clustering/spectral_clustering'
69
69
  require 'rumale/clustering/single_linkage'
70
70
  require 'rumale/decomposition/pca'
71
71
  require 'rumale/decomposition/nmf'
72
+ require 'rumale/decomposition/factor_analysis'
72
73
  require 'rumale/manifold/tsne'
73
74
  require 'rumale/manifold/mds'
74
75
  require 'rumale/preprocessing/l2_normalizer'
@@ -3,17 +3,20 @@
3
3
  require 'rumale/base/base_estimator'
4
4
  require 'rumale/base/cluster_analyzer'
5
5
  require 'rumale/preprocessing/label_binarizer'
6
- require 'rumale/pairwise_metric'
7
6
 
8
7
  module Rumale
9
8
  module Clustering
10
9
  # GaussianMixture is a class that implements cluster analysis with gaussian mixture model.
11
- # The current implementation uses only the diagonal elements of covariance matrices to represent mixture parameters
12
- # without using full elements.
13
10
  #
14
11
  # @example
15
12
  # analyzer = Rumale::Clustering::GaussianMixture.new(n_clusters: 10, max_iter: 50)
16
13
  # cluster_labels = analyzer.fit_predict(samples)
14
+ #
15
+ # # If Numo::Linalg is installed, you can specify 'full' for the type of covariance option.
16
+ # require 'numo/linalg/autoloader'
17
+ # analyzer = Rumale::Clustering::GaussianMixture.new(n_clusters: 10, max_iter: 50, covariance_type: 'full')
18
+ # cluster_labels = analyzer.fit_predict(samples)
19
+ #
17
20
  class GaussianMixture
18
21
  include Base::BaseEstimator
19
22
  include Base::ClusterAnalyzer
@@ -31,18 +34,19 @@ module Rumale
31
34
  attr_reader :means
32
35
 
33
36
  # Return the diagonal elements of covariance matrix of each cluster.
34
- # @return [Numo::DFloat] (shape: [n_clusters, n_features])
37
+ # @return [Numo::DFloat] (shape: [n_clusters, n_features] if 'diag', [n_clusters, n_features, n_features] if 'full')
35
38
  attr_reader :covariances
36
39
 
37
40
  # Create a new cluster analyzer with gaussian mixture model.
38
41
  #
39
42
  # @param n_clusters [Integer] The number of clusters.
40
43
  # @param init [String] The initialization method for centroids ('random' or 'k-means++').
44
+ # @param covariance_type [String] The type of covariance parameter to be used ('diag' or 'full').
41
45
  # @param max_iter [Integer] The maximum number of iterations.
42
46
  # @param tol [Float] The tolerance of termination criterion.
43
47
  # @param reg_covar [Float] The non-negative regularization to the diagonal of covariance.
44
48
  # @param random_seed [Integer] The seed value using to initialize the random generator.
45
- def initialize(n_clusters: 8, init: 'k-means++', max_iter: 50, tol: 1.0e-4, reg_covar: 1.0e-6, random_seed: nil)
49
+ def initialize(n_clusters: 8, init: 'k-means++', covariance_type: 'diag', max_iter: 50, tol: 1.0e-4, reg_covar: 1.0e-6, random_seed: nil)
46
50
  check_params_integer(n_clusters: n_clusters, max_iter: max_iter)
47
51
  check_params_float(tol: tol)
48
52
  check_params_string(init: init)
@@ -51,6 +55,7 @@ module Rumale
51
55
  @params = {}
52
56
  @params[:n_clusters] = n_clusters
53
57
  @params[:init] = init == 'random' ? 'random' : 'k-means++'
58
+ @params[:covariance_type] = covariance_type == 'full' ? 'full' : 'diag'
54
59
  @params[:max_iter] = max_iter
55
60
  @params[:tol] = tol
56
61
  @params[:reg_covar] = reg_covar
@@ -70,14 +75,16 @@ module Rumale
70
75
  # @return [GaussianMixture] The learned cluster analyzer itself.
71
76
  def fit(x, _y = nil)
72
77
  check_sample_array(x)
78
+ check_enable_linalg('fit')
79
+
73
80
  n_samples = x.shape[0]
74
81
  memberships = init_memberships(x)
75
82
  @params[:max_iter].times do |t|
76
83
  @n_iter = t
77
84
  @weights = calc_weights(n_samples, memberships)
78
85
  @means = calc_means(x, memberships)
79
- @covariances = calc_diag_covariances(x, @means, memberships) + @params[:reg_covar]
80
- new_memberships = calc_memberships(x, @weights, @means, @covariances)
86
+ @covariances = calc_covariances(x, @means, memberships, @params[:reg_covar], @params[:covariance_type])
87
+ new_memberships = calc_memberships(x, @weights, @means, @covariances, @params[:covariance_type])
81
88
  error = (memberships - new_memberships).abs.max
82
89
  break if error <= @params[:tol]
83
90
  memberships = new_memberships.dup
@@ -91,7 +98,9 @@ module Rumale
91
98
  # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
92
99
  def predict(x)
93
100
  check_sample_array(x)
94
- memberships = calc_memberships(x, @weights, @means, @covariances)
101
+ check_enable_linalg('predict')
102
+
103
+ memberships = calc_memberships(x, @weights, @means, @covariances, @params[:covariance_type])
95
104
  assign_cluster(memberships)
96
105
  end
97
106
 
@@ -101,6 +110,8 @@ module Rumale
101
110
  # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
102
111
  def fit_predict(x)
103
112
  check_sample_array(x)
113
+ check_enable_linalg('fit_predict')
114
+
104
115
  fit(x).predict(x)
105
116
  end
106
117
 
@@ -141,15 +152,14 @@ module Rumale
141
152
  Numo::DFloat.cast(encoder.fit_transform(cluster_ids))
142
153
  end
143
154
 
144
- def calc_memberships(x, weights, means, diag_cov)
155
+ def calc_memberships(x, weights, means, covars, covar_type)
145
156
  n_samples = x.shape[0]
146
157
  n_clusters = means.shape[0]
147
158
  memberships = Numo::DFloat.zeros(n_samples, n_clusters)
148
159
  n_clusters.times do |n|
149
160
  centered = x - means[n, true]
150
- inv_cov = 1.0 / diag_cov[n, true]
151
- sqrt_det_cov = 1.0 / Math.sqrt(diag_cov[n, true].prod)
152
- memberships[true, n] = weights[n] * sqrt_det_cov * Numo::NMath.exp(-0.5 * (centered * inv_cov * centered).sum(1))
161
+ covar = covar_type == 'full' ? covars[n, true, true] : covars[n, true]
162
+ memberships[true, n] = calc_unnormalized_membership(centered, weights[n], covar, covar_type)
153
163
  end
154
164
  memberships / memberships.sum(1).expand_dims(1)
155
165
  end
@@ -162,13 +172,67 @@ module Rumale
162
172
  memberships.transpose.dot(x) / memberships.sum(0).expand_dims(1)
163
173
  end
164
174
 
165
- def calc_diag_covariances(x, means, memberships)
175
+ def calc_covariances(x, means, memberships, reg_cover, covar_type)
176
+ if covar_type == 'full'
177
+ calc_full_covariances(x, means, reg_cover, memberships)
178
+ else
179
+ calc_diag_covariances(x, means, reg_cover, memberships)
180
+ end
181
+ end
182
+
183
+ def calc_diag_covariances(x, means, reg_cover, memberships)
166
184
  n_clusters = means.shape[0]
167
185
  diag_cov = Array.new(n_clusters) do |n|
168
186
  centered = x - means[n, true]
169
187
  memberships[true, n].dot(centered**2) / memberships[true, n].sum
170
188
  end
171
- Numo::DFloat.asarray(diag_cov)
189
+ Numo::DFloat.asarray(diag_cov) + reg_cover
190
+ end
191
+
192
+ def calc_full_covariances(x, means, reg_cover, memberships)
193
+ n_features = x.shape[1]
194
+ n_clusters = means.shape[0]
195
+ cov_mats = Numo::DFloat.zeros(n_clusters, n_features, n_features)
196
+ reg_mat = Numo::DFloat.eye(n_features) * reg_cover
197
+ n_clusters.times do |n|
198
+ centered = x - means[n, true]
199
+ members = memberships[true, n]
200
+ cov_mats[n, true, true] = reg_mat + (centered.transpose * members).dot(centered) / members.sum
201
+ end
202
+ cov_mats
203
+ end
204
+
205
+ def calc_unnormalized_membership(centered, weight, covar, covar_type)
206
+ inv_covar = calc_inv_covariance(covar, covar_type)
207
+ inv_sqrt_det_covar = calc_inv_sqrt_det_covariance(covar, covar_type)
208
+ distances = if covar_type == 'full'
209
+ (centered.dot(inv_covar) * centered).sum(1)
210
+ else
211
+ (centered * inv_covar * centered).sum(1)
212
+ end
213
+ weight * inv_sqrt_det_covar * Numo::NMath.exp(-0.5 * distances)
214
+ end
215
+
216
+ def calc_inv_covariance(covar, covar_type)
217
+ if covar_type == 'full'
218
+ Numo::Linalg.inv(covar)
219
+ else
220
+ 1.0 / covar
221
+ end
222
+ end
223
+
224
+ def calc_inv_sqrt_det_covariance(covar, covar_type)
225
+ if covar_type == 'full'
226
+ 1.0 / Math.sqrt(Numo::Linalg.det(covar))
227
+ else
228
+ 1.0 / Math.sqrt(covar.prod)
229
+ end
230
+ end
231
+
232
+ def check_enable_linalg(method_name)
233
+ if (@params[:covariance_type] == 'full') && !enable_linalg?
234
+ raise "GaussianMixture##{method_name} requires Numo::Linalg when covariance_type is 'full' but that is not loaded."
235
+ end
172
236
  end
173
237
  end
174
238
  end
@@ -0,0 +1,170 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/base/base_estimator'
4
+ require 'rumale/base/transformer'
5
+ require 'rumale/utils'
6
+
7
+ module Rumale
8
+ module Decomposition
9
+ # FactorAnalysis is a class that implements factor analysis with the EM algorithm.
10
+ #
11
+ # @example
12
+ # require 'numo/linalg/autoloader'
13
+ # decomposer = Rumale::Decomposition::FactorAnalysis.new(n_components: 2)
14
+ # representation = decomposer.fit_transform(samples)
15
+ #
16
+ # *Reference*
17
+ # - D. Barber, "Bayesian Reasoning and Machine Learning," Cambridge University Press, 2012.
18
+ class FactorAnalysis
19
+ include Base::BaseEstimator
20
+ include Base::Transformer
21
+
22
+ # Returns the mean vector.
23
+ # @return [Numo::DFloat] (shape: [n_features])
24
+ attr_reader :mean
25
+
26
+ # Returns the estimated noise variance for each feature.
27
+ # @return [Numo::DFloat] (shape: [n_features])
28
+ attr_reader :noise_variance
29
+
30
+ # Returns the components with maximum variance.
31
+ # @return [Numo::DFloat] (shape: [n_components, n_features])
32
+ attr_reader :components
33
+
34
+ # Returns the log likelihood at each iteration.
35
+ # @return [Numo::DFloat] (shape: [n_iter])
36
+ attr_reader :loglike
37
+
38
+ # Return the number of iterations run for optimization
39
+ # @return [Integer]
40
+ attr_reader :n_iter
41
+
42
+ # Create a new transformer with factor analysis.
43
+ #
44
+ # @param n_components [Integer] The number of components (dimensionality of latent space).
45
+ # @param max_iter [Integer] The maximum number of iterations.
46
+ # @param tol [Float/Nil] The tolerance of termination criterion for EM algorithm.
47
+ # If nil is given, iterate EM steps up to the maximum number of iterations.
48
+ def initialize(n_components: 2, max_iter: 100, tol: 1e-8)
49
+ check_params_integer(n_components: n_components, max_iter: max_iter)
50
+ check_params_type_or_nil(Float, tol: tol)
51
+ check_params_positive(n_components: n_components, max_iter: max_iter)
52
+ @params = {}
53
+ @params[:n_components] = n_components
54
+ @params[:max_iter] = max_iter
55
+ @params[:tol] = tol
56
+ @mean = nil
57
+ @noise_variance = nil
58
+ @components = nil
59
+ @loglike = nil
60
+ @n_iter = nil
61
+ end
62
+
63
+ # Fit the model with given training data.
64
+ #
65
+ # @overload fit(x) -> FactorAnalysis
66
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
67
+ # @return [FactorAnalysis] The learned transformer itself.
68
+ def fit(x, _y = nil)
69
+ raise 'FactorAnalysis#fit requires Numo::Linalg but that is not loaded.' unless enable_linalg?
70
+
71
+ # initialize some variables.
72
+ n_samples, n_features = x.shape
73
+ @mean = x.mean(0)
74
+ centered_x = x - @mean
75
+ cov_mat = centered_x.transpose.dot(centered_x) / n_samples
76
+ sample_vars = x.var(0)
77
+ sqrt_n_samples = Math.sqrt(n_samples)
78
+ @noise_variance = Numo::DFloat.ones(n_features)
79
+
80
+ # run optimization.
81
+ old_loglike = 0.0
82
+ @n_iter = 0
83
+ @loglike = [] unless @params[:tol].nil?
84
+ @params[:max_iter].times do |t|
85
+ @n_iter = t + 1
86
+ sqrt_noise_variance = Numo::NMath.sqrt(@noise_variance)
87
+ scaled_x = centered_x / (sqrt_noise_variance * sqrt_n_samples + 1e-12)
88
+ s, u = truncate_svd(scaled_x, @params[:n_components])
89
+ scaler = Numo::NMath.sqrt(Numo::DFloat.maximum(s**2 - 1.0, 0.0))
90
+ @components = (sqrt_noise_variance.diag.dot(u) * scaler).transpose.dup
91
+ @noise_variance = Numo::DFloat.maximum(sample_vars - @components.transpose.dot(@components).diagonal, 1e-12)
92
+ next if @params[:tol].nil?
93
+ new_loglike = log_likelihood(cov_mat, @components, @noise_variance)
94
+ @loglike.push(new_loglike)
95
+ break if (old_loglike - new_loglike).abs <= @params[:tol]
96
+ old_loglike = new_loglike
97
+ end
98
+
99
+ @loglike = Numo::DFloat.cast(@loglike) unless @params[:tol].nil?
100
+ @components = @components[0, true].dup if @params[:n_components] == 1
101
+ self
102
+ end
103
+
104
+ # Fit the model with training data, and then transform them with the learned model.
105
+ #
106
+ # @overload fit_transform(x) -> Numo::DFloat
107
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
108
+ # @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data
109
+ def fit_transform(x, _y = nil)
110
+ check_sample_array(x)
111
+ raise 'FactorAnalysis#fit_transform requires Numo::Linalg but that is not loaded.' unless enable_linalg?
112
+
113
+ fit(x).transform(x)
114
+ end
115
+
116
+ # Transform the given data with the learned model.
117
+ #
118
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The data to be transformed with the learned model.
119
+ # @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data.
120
+ def transform(x)
121
+ check_sample_array(x)
122
+ raise 'FactorAnalysis#transform requires Numo::Linalg but that is not loaded.' unless enable_linalg?
123
+
124
+ factors = @params[:n_components] == 1 ? @components.expand_dims(0) : @components
125
+ centered_x = x - @mean
126
+ beta = Numo::Linalg.inv(Numo::DFloat.eye(factors.shape[0]) + (factors / @noise_variance).dot(factors.transpose))
127
+ z = centered_x.dot((beta.dot(factors) / @noise_variance).transpose)
128
+ @params[:n_components] == 1 ? z[true, 0].dup : z
129
+ end
130
+
131
+ # Dump marshal data.
132
+ # @return [Hash] The marshal data.
133
+ def marshal_dump
134
+ { params: @params,
135
+ mean: @mean,
136
+ noise_variance: @noise_variance,
137
+ components: @components,
138
+ loglike: @loglike,
139
+ n_iter: @n_iter }
140
+ end
141
+
142
+ # Load marshal data.
143
+ # @return [nil]
144
+ def marshal_load(obj)
145
+ @params = obj[:params]
146
+ @mean = obj[:mean]
147
+ @noise_variance = obj[:noise_variance]
148
+ @components = obj[:components]
149
+ @loglike = obj[:loglike]
150
+ @n_iter = obj[:n_iter]
151
+ end
152
+
153
+ private
154
+
155
+ def log_likelihood(cov_mat, factors, noise_vars)
156
+ n_samples = noise_vars.size
157
+ fact_cov_mat = factors.transpose.dot(factors) + noise_vars.diag
158
+ n_samples.fdiv(2) * Math.log(Numo::Linalg.det(fact_cov_mat)) + Numo::Linalg.inv(fact_cov_mat).dot(cov_mat).trace
159
+ end
160
+
161
+ def truncate_svd(x, k)
162
+ m = x.shape[1]
163
+ eig_vals, eig_vecs = Numo::Linalg.eigh(x.transpose.dot(x), vals_range: (m - k)...m)
164
+ s = Numo::NMath.sqrt(eig_vals.reverse.dup)
165
+ u = eig_vecs.reverse(1).dup
166
+ [s, u]
167
+ end
168
+ end
169
+ end
170
+ end
@@ -3,5 +3,5 @@
3
3
  # Rumale is a machine learning library in Ruby.
4
4
  module Rumale
5
5
  # The version of Rumale you are using.
6
- VERSION = '0.13.4'
6
+ VERSION = '0.13.5'
7
7
  end
@@ -46,10 +46,10 @@ Gem::Specification.new do |spec|
46
46
  spec.add_runtime_dependency 'numo-narray', '>= 0.9.1'
47
47
 
48
48
  spec.add_development_dependency 'bundler', '~> 2.0'
49
- spec.add_development_dependency 'coveralls', '>= 0.8.23'
49
+ spec.add_development_dependency 'coveralls', '~> 0.8'
50
50
  spec.add_development_dependency 'numo-linalg', '>= 0.1.4'
51
51
  spec.add_development_dependency 'parallel', '>= 1.17.0'
52
- spec.add_development_dependency 'rake', '~> 12.0'
52
+ spec.add_development_dependency 'rake', '~> 10.0'
53
53
  spec.add_development_dependency 'rake-compiler', '~> 1.0'
54
54
  spec.add_development_dependency 'rspec', '~> 3.0'
55
55
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rumale
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.13.4
4
+ version: 0.13.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-09-22 00:00:00.000000000 Z
11
+ date: 2019-10-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: numo-narray
@@ -42,16 +42,16 @@ dependencies:
42
42
  name: coveralls
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - ">="
45
+ - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: 0.8.23
47
+ version: '0.8'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - ">="
52
+ - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: 0.8.23
54
+ version: '0.8'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: numo-linalg
57
57
  requirement: !ruby/object:Gem::Requirement
@@ -86,14 +86,14 @@ dependencies:
86
86
  requirements:
87
87
  - - "~>"
88
88
  - !ruby/object:Gem::Version
89
- version: '12.0'
89
+ version: '10.0'
90
90
  type: :development
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
94
  - - "~>"
95
95
  - !ruby/object:Gem::Version
96
- version: '12.0'
96
+ version: '10.0'
97
97
  - !ruby/object:Gem::Dependency
98
98
  name: rake-compiler
99
99
  requirement: !ruby/object:Gem::Requirement
@@ -171,6 +171,7 @@ files:
171
171
  - lib/rumale/clustering/snn.rb
172
172
  - lib/rumale/clustering/spectral_clustering.rb
173
173
  - lib/rumale/dataset.rb
174
+ - lib/rumale/decomposition/factor_analysis.rb
174
175
  - lib/rumale/decomposition/nmf.rb
175
176
  - lib/rumale/decomposition/pca.rb
176
177
  - lib/rumale/ensemble/ada_boost_classifier.rb