rumale 0.13.4 → 0.13.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 522eaabfd67ced29bf275fb6f5cec019ff60e3d5
4
- data.tar.gz: 0eb97f58c3764bdcbf448f9a392f8f5091ce418d
3
+ metadata.gz: b9c82fecc8a90ec9d4379249b749e9257b2738c3
4
+ data.tar.gz: 1b4ca375174ee9d1c16a35d50e68ee9b5bd471c5
5
5
  SHA512:
6
- metadata.gz: bf5a3caf614b08813aa4b11673da758778191847ba6fe4c4144cae7da1dd8e4b3ec3eac1367d54b78a00a7afd5ae1ae047fa84c58954b0e7d0571a9442a10380
7
- data.tar.gz: 8bdb25aaec7304f12595673d3fa915cc1739ef14fedd89210205f53325de5a96076b4cf718f8d7ca15fdec1f55d5f9c65cd256781ec2d60462a48221525ad068
6
+ metadata.gz: 3d9bc3c21b951f5738cbf8a2e947d94d6fd735ebbcdc41fdb450d8c4f4f28a9788100c595b1b29bbe81e7124f0ca2b5703c4d6107730fab6f9ab6fe67db4fa1d
7
+ data.tar.gz: 2f93f78169dd8f694ca65634f3e4619ad83df72a59d0e2eacd8c686b2fdc82011f53b6b49e7c18e1c9a9d86904c77709d9743689e39e8db697f469a77a336fe7
@@ -1,3 +1,7 @@
1
+ # 0.13.5
2
+ - Add transformer class for [Factor Analysis](https://yoshoku.github.io/rumale/doc/Rumale/Decomposition/FactorAnalysis.html).
3
+ - Add covariance_type parameter to [Rumale::Clustering::GaussianMixture](https://yoshoku.github.io/rumale/doc/Rumale/Clustering/GaussianMixture.html).
4
+
1
5
  # 0.13.4
2
6
  - Add cluster analysis class for [HDBSCAN](https://yoshoku.github.io/rumale/doc/Rumale/Clustering/HDBSCAN.html).
3
7
  - Add cluster analysis class for [spectral clustering](https://yoshoku.github.io/rumale/doc/Rumale/Clustering/SpectralClustering.html).
@@ -69,6 +69,7 @@ require 'rumale/clustering/spectral_clustering'
69
69
  require 'rumale/clustering/single_linkage'
70
70
  require 'rumale/decomposition/pca'
71
71
  require 'rumale/decomposition/nmf'
72
+ require 'rumale/decomposition/factor_analysis'
72
73
  require 'rumale/manifold/tsne'
73
74
  require 'rumale/manifold/mds'
74
75
  require 'rumale/preprocessing/l2_normalizer'
@@ -3,17 +3,20 @@
3
3
  require 'rumale/base/base_estimator'
4
4
  require 'rumale/base/cluster_analyzer'
5
5
  require 'rumale/preprocessing/label_binarizer'
6
- require 'rumale/pairwise_metric'
7
6
 
8
7
  module Rumale
9
8
  module Clustering
10
9
  # GaussianMixture is a class that implements cluster analysis with gaussian mixture model.
11
- # The current implementation uses only the diagonal elements of covariance matrices to represent mixture parameters
12
- # without using full elements.
13
10
  #
14
11
  # @example
15
12
  # analyzer = Rumale::Clustering::GaussianMixture.new(n_clusters: 10, max_iter: 50)
16
13
  # cluster_labels = analyzer.fit_predict(samples)
14
+ #
15
+ # # If Numo::Linalg is installed, you can specify 'full' for the type of covariance option.
16
+ # require 'numo/linalg/autoloader'
17
+ # analyzer = Rumale::Clustering::GaussianMixture.new(n_clusters: 10, max_iter: 50, covariance_type: 'full')
18
+ # cluster_labels = analyzer.fit_predict(samples)
19
+ #
17
20
  class GaussianMixture
18
21
  include Base::BaseEstimator
19
22
  include Base::ClusterAnalyzer
@@ -31,18 +34,19 @@ module Rumale
31
34
  attr_reader :means
32
35
 
33
36
  # Return the diagonal elements of covariance matrix of each cluster.
34
- # @return [Numo::DFloat] (shape: [n_clusters, n_features])
37
+ # @return [Numo::DFloat] (shape: [n_clusters, n_features] if 'diag', [n_clusters, n_features, n_features] if 'full')
35
38
  attr_reader :covariances
36
39
 
37
40
  # Create a new cluster analyzer with gaussian mixture model.
38
41
  #
39
42
  # @param n_clusters [Integer] The number of clusters.
40
43
  # @param init [String] The initialization method for centroids ('random' or 'k-means++').
44
+ # @param covariance_type [String] The type of covariance parameter to be used ('diag' or 'full').
41
45
  # @param max_iter [Integer] The maximum number of iterations.
42
46
  # @param tol [Float] The tolerance of termination criterion.
43
47
  # @param reg_covar [Float] The non-negative regularization to the diagonal of covariance.
44
48
  # @param random_seed [Integer] The seed value using to initialize the random generator.
45
- def initialize(n_clusters: 8, init: 'k-means++', max_iter: 50, tol: 1.0e-4, reg_covar: 1.0e-6, random_seed: nil)
49
+ def initialize(n_clusters: 8, init: 'k-means++', covariance_type: 'diag', max_iter: 50, tol: 1.0e-4, reg_covar: 1.0e-6, random_seed: nil)
46
50
  check_params_integer(n_clusters: n_clusters, max_iter: max_iter)
47
51
  check_params_float(tol: tol)
48
52
  check_params_string(init: init)
@@ -51,6 +55,7 @@ module Rumale
51
55
  @params = {}
52
56
  @params[:n_clusters] = n_clusters
53
57
  @params[:init] = init == 'random' ? 'random' : 'k-means++'
58
+ @params[:covariance_type] = covariance_type == 'full' ? 'full' : 'diag'
54
59
  @params[:max_iter] = max_iter
55
60
  @params[:tol] = tol
56
61
  @params[:reg_covar] = reg_covar
@@ -70,14 +75,16 @@ module Rumale
70
75
  # @return [GaussianMixture] The learned cluster analyzer itself.
71
76
  def fit(x, _y = nil)
72
77
  check_sample_array(x)
78
+ check_enable_linalg('fit')
79
+
73
80
  n_samples = x.shape[0]
74
81
  memberships = init_memberships(x)
75
82
  @params[:max_iter].times do |t|
76
83
  @n_iter = t
77
84
  @weights = calc_weights(n_samples, memberships)
78
85
  @means = calc_means(x, memberships)
79
- @covariances = calc_diag_covariances(x, @means, memberships) + @params[:reg_covar]
80
- new_memberships = calc_memberships(x, @weights, @means, @covariances)
86
+ @covariances = calc_covariances(x, @means, memberships, @params[:reg_covar], @params[:covariance_type])
87
+ new_memberships = calc_memberships(x, @weights, @means, @covariances, @params[:covariance_type])
81
88
  error = (memberships - new_memberships).abs.max
82
89
  break if error <= @params[:tol]
83
90
  memberships = new_memberships.dup
@@ -91,7 +98,9 @@ module Rumale
91
98
  # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
92
99
  def predict(x)
93
100
  check_sample_array(x)
94
- memberships = calc_memberships(x, @weights, @means, @covariances)
101
+ check_enable_linalg('predict')
102
+
103
+ memberships = calc_memberships(x, @weights, @means, @covariances, @params[:covariance_type])
95
104
  assign_cluster(memberships)
96
105
  end
97
106
 
@@ -101,6 +110,8 @@ module Rumale
101
110
  # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
102
111
  def fit_predict(x)
103
112
  check_sample_array(x)
113
+ check_enable_linalg('fit_predict')
114
+
104
115
  fit(x).predict(x)
105
116
  end
106
117
 
@@ -141,15 +152,14 @@ module Rumale
141
152
  Numo::DFloat.cast(encoder.fit_transform(cluster_ids))
142
153
  end
143
154
 
144
- def calc_memberships(x, weights, means, diag_cov)
155
+ def calc_memberships(x, weights, means, covars, covar_type)
145
156
  n_samples = x.shape[0]
146
157
  n_clusters = means.shape[0]
147
158
  memberships = Numo::DFloat.zeros(n_samples, n_clusters)
148
159
  n_clusters.times do |n|
149
160
  centered = x - means[n, true]
150
- inv_cov = 1.0 / diag_cov[n, true]
151
- sqrt_det_cov = 1.0 / Math.sqrt(diag_cov[n, true].prod)
152
- memberships[true, n] = weights[n] * sqrt_det_cov * Numo::NMath.exp(-0.5 * (centered * inv_cov * centered).sum(1))
161
+ covar = covar_type == 'full' ? covars[n, true, true] : covars[n, true]
162
+ memberships[true, n] = calc_unnormalized_membership(centered, weights[n], covar, covar_type)
153
163
  end
154
164
  memberships / memberships.sum(1).expand_dims(1)
155
165
  end
@@ -162,13 +172,67 @@ module Rumale
162
172
  memberships.transpose.dot(x) / memberships.sum(0).expand_dims(1)
163
173
  end
164
174
 
165
- def calc_diag_covariances(x, means, memberships)
175
+ def calc_covariances(x, means, memberships, reg_cover, covar_type)
176
+ if covar_type == 'full'
177
+ calc_full_covariances(x, means, reg_cover, memberships)
178
+ else
179
+ calc_diag_covariances(x, means, reg_cover, memberships)
180
+ end
181
+ end
182
+
183
+ def calc_diag_covariances(x, means, reg_cover, memberships)
166
184
  n_clusters = means.shape[0]
167
185
  diag_cov = Array.new(n_clusters) do |n|
168
186
  centered = x - means[n, true]
169
187
  memberships[true, n].dot(centered**2) / memberships[true, n].sum
170
188
  end
171
- Numo::DFloat.asarray(diag_cov)
189
+ Numo::DFloat.asarray(diag_cov) + reg_cover
190
+ end
191
+
192
+ def calc_full_covariances(x, means, reg_cover, memberships)
193
+ n_features = x.shape[1]
194
+ n_clusters = means.shape[0]
195
+ cov_mats = Numo::DFloat.zeros(n_clusters, n_features, n_features)
196
+ reg_mat = Numo::DFloat.eye(n_features) * reg_cover
197
+ n_clusters.times do |n|
198
+ centered = x - means[n, true]
199
+ members = memberships[true, n]
200
+ cov_mats[n, true, true] = reg_mat + (centered.transpose * members).dot(centered) / members.sum
201
+ end
202
+ cov_mats
203
+ end
204
+
205
+ def calc_unnormalized_membership(centered, weight, covar, covar_type)
206
+ inv_covar = calc_inv_covariance(covar, covar_type)
207
+ inv_sqrt_det_covar = calc_inv_sqrt_det_covariance(covar, covar_type)
208
+ distances = if covar_type == 'full'
209
+ (centered.dot(inv_covar) * centered).sum(1)
210
+ else
211
+ (centered * inv_covar * centered).sum(1)
212
+ end
213
+ weight * inv_sqrt_det_covar * Numo::NMath.exp(-0.5 * distances)
214
+ end
215
+
216
+ def calc_inv_covariance(covar, covar_type)
217
+ if covar_type == 'full'
218
+ Numo::Linalg.inv(covar)
219
+ else
220
+ 1.0 / covar
221
+ end
222
+ end
223
+
224
+ def calc_inv_sqrt_det_covariance(covar, covar_type)
225
+ if covar_type == 'full'
226
+ 1.0 / Math.sqrt(Numo::Linalg.det(covar))
227
+ else
228
+ 1.0 / Math.sqrt(covar.prod)
229
+ end
230
+ end
231
+
232
+ def check_enable_linalg(method_name)
233
+ if (@params[:covariance_type] == 'full') && !enable_linalg?
234
+ raise "GaussianMixture##{method_name} requires Numo::Linalg when covariance_type is 'full' but that is not loaded."
235
+ end
172
236
  end
173
237
  end
174
238
  end
@@ -0,0 +1,170 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/base/base_estimator'
4
+ require 'rumale/base/transformer'
5
+ require 'rumale/utils'
6
+
7
+ module Rumale
8
+ module Decomposition
9
+ # FactorAnalysis is a class that implements factor analysis with EM algorithm.
10
+ #
11
+ # @example
12
+ # require 'numo/linalg/autoloader'
13
+ # decomposer = Rumale::Decomposition::FactorAnalysis.new(n_components: 2)
14
+ # representation = decomposer.fit_transform(samples)
15
+ #
16
+ # *Reference*
17
+ # - D. Barber, "Bayesian Reasoning and Machine Learning," Cambridge University Press, 2012.
18
+ class FactorAnalysis
19
+ include Base::BaseEstimator
20
+ include Base::Transformer
21
+
22
+ # Returns the mean vector.
23
+ # @return [Numo::DFloat] (shape: [n_features])
24
+ attr_reader :mean
25
+
26
+ # Returns the estimated noise variance for each feature.
27
+ # @return [Numo::DFloat] (shape: [n_features])
28
+ attr_reader :noise_variance
29
+
30
+ # Returns the components with maximum variance.
31
+ # @return [Numo::DFloat] (shape: [n_components, n_features])
32
+ attr_reader :components
33
+
34
+ # Returns the log likelihood at each iteration.
35
+ # @return [Numo::DFloat] (shape: [n_iter])
36
+ attr_reader :loglike
37
+
38
+ # Return the number of iterations run for optimization
39
+ # @return [Integer]
40
+ attr_reader :n_iter
41
+
42
+ # Create a new transformer with factor analysis.
43
+ #
44
+ # @param n_components [Integer] The number of components (dimensionality of latent space).
45
+ # @param max_iter [Integer] The maximum number of iterations.
46
+ # @param tol [Float/Nil] The tolerance of termination criterion for EM algorithm.
47
+ # If nil is given, iterate EM steps up to the maximum number of iterations.
48
+ def initialize(n_components: 2, max_iter: 100, tol: 1e-8)
49
+ check_params_integer(n_components: n_components, max_iter: max_iter)
50
+ check_params_type_or_nil(Float, tol: tol)
51
+ check_params_positive(n_components: n_components, max_iter: max_iter)
52
+ @params = {}
53
+ @params[:n_components] = n_components
54
+ @params[:max_iter] = max_iter
55
+ @params[:tol] = tol
56
+ @mean = nil
57
+ @noise_variance = nil
58
+ @components = nil
59
+ @loglike = nil
60
+ @n_iter = nil
61
+ end
62
+
63
+ # Fit the model with given training data.
64
+ #
65
+ # @overload fit(x) -> FactorAnalysis
66
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
67
+ # @return [FactorAnalysis] The learned transformer itself.
68
+ def fit(x, _y = nil)
69
+ raise 'FactorAnalysis#fit requires Numo::Linalg but that is not loaded.' unless enable_linalg?
70
+
71
+ # initialize some variables.
72
+ n_samples, n_features = x.shape
73
+ @mean = x.mean(0)
74
+ centered_x = x - @mean
75
+ cov_mat = centered_x.transpose.dot(centered_x) / n_samples
76
+ sample_vars = x.var(0)
77
+ sqrt_n_samples = Math.sqrt(n_samples)
78
+ @noise_variance = Numo::DFloat.ones(n_features)
79
+
80
+ # run optimization.
81
+ old_loglike = 0.0
82
+ @n_iter = 0
83
+ @loglike = [] unless @params[:tol].nil?
84
+ @params[:max_iter].times do |t|
85
+ @n_iter = t + 1
86
+ sqrt_noise_variance = Numo::NMath.sqrt(@noise_variance)
87
+ scaled_x = centered_x / (sqrt_noise_variance * sqrt_n_samples + 1e-12)
88
+ s, u = truncate_svd(scaled_x, @params[:n_components])
89
+ scaler = Numo::NMath.sqrt(Numo::DFloat.maximum(s**2 - 1.0, 0.0))
90
+ @components = (sqrt_noise_variance.diag.dot(u) * scaler).transpose.dup
91
+ @noise_variance = Numo::DFloat.maximum(sample_vars - @components.transpose.dot(@components).diagonal, 1e-12)
92
+ next if @params[:tol].nil?
93
+ new_loglike = log_likelihood(cov_mat, @components, @noise_variance)
94
+ @loglike.push(new_loglike)
95
+ break if (old_loglike - new_loglike).abs <= @params[:tol]
96
+ old_loglike = new_loglike
97
+ end
98
+
99
+ @loglike = Numo::DFloat.cast(@loglike) unless @params[:tol].nil?
100
+ @components = @components[0, true].dup if @params[:n_components] == 1
101
+ self
102
+ end
103
+
104
+ # Fit the model with training data, and then transform them with the learned model.
105
+ #
106
+ # @overload fit_transform(x) -> Numo::DFloat
107
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
108
+ # @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data
109
+ def fit_transform(x, _y = nil)
110
+ check_sample_array(x)
111
+ raise 'FactorAnalysis#fit_transform requires Numo::Linalg but that is not loaded.' unless enable_linalg?
112
+
113
+ fit(x).transform(x)
114
+ end
115
+
116
+ # Transform the given data with the learned model.
117
+ #
118
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The data to be transformed with the learned model.
119
+ # @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data.
120
+ def transform(x)
121
+ check_sample_array(x)
122
+ raise 'FactorAnalysis#transform requires Numo::Linalg but that is not loaded.' unless enable_linalg?
123
+
124
+ factors = @params[:n_components] == 1 ? @components.expand_dims(0) : @components
125
+ centered_x = x - @mean
126
+ beta = Numo::Linalg.inv(Numo::DFloat.eye(factors.shape[0]) + (factors / @noise_variance).dot(factors.transpose))
127
+ z = centered_x.dot((beta.dot(factors) / @noise_variance).transpose)
128
+ @params[:n_components] == 1 ? z[true, 0].dup : z
129
+ end
130
+
131
+ # Dump marshal data.
132
+ # @return [Hash] The marshal data.
133
+ def marshal_dump
134
+ { params: @params,
135
+ mean: @mean,
136
+ noise_variance: @noise_variance,
137
+ components: @components,
138
+ loglike: @loglike,
139
+ n_iter: @n_iter }
140
+ end
141
+
142
+ # Load marshal data.
143
+ # @return [nil]
144
+ def marshal_load(obj)
145
+ @params = obj[:params]
146
+ @mean = obj[:mean]
147
+ @noise_variance = obj[:noise_variance]
148
+ @components = obj[:components]
149
+ @loglike = obj[:loglike]
150
+ @n_iter = obj[:n_iter]
151
+ end
152
+
153
+ private
154
+
155
+ def log_likelihood(cov_mat, factors, noise_vars)
156
+ n_samples = noise_vars.size
157
+ fact_cov_mat = factors.transpose.dot(factors) + noise_vars.diag
158
+ n_samples.fdiv(2) * Math.log(Numo::Linalg.det(fact_cov_mat)) + Numo::Linalg.inv(fact_cov_mat).dot(cov_mat).trace
159
+ end
160
+
161
+ def truncate_svd(x, k)
162
+ m = x.shape[1]
163
+ eig_vals, eig_vecs = Numo::Linalg.eigh(x.transpose.dot(x), vals_range: (m - k)...m)
164
+ s = Numo::NMath.sqrt(eig_vals.reverse.dup)
165
+ u = eig_vecs.reverse(1).dup
166
+ [s, u]
167
+ end
168
+ end
169
+ end
170
+ end
@@ -3,5 +3,5 @@
3
3
  # Rumale is a machine learning library in Ruby.
4
4
  module Rumale
5
5
  # The version of Rumale you are using.
6
- VERSION = '0.13.4'
6
+ VERSION = '0.13.5'
7
7
  end
@@ -46,10 +46,10 @@ Gem::Specification.new do |spec|
46
46
  spec.add_runtime_dependency 'numo-narray', '>= 0.9.1'
47
47
 
48
48
  spec.add_development_dependency 'bundler', '~> 2.0'
49
- spec.add_development_dependency 'coveralls', '>= 0.8.23'
49
+ spec.add_development_dependency 'coveralls', '~> 0.8'
50
50
  spec.add_development_dependency 'numo-linalg', '>= 0.1.4'
51
51
  spec.add_development_dependency 'parallel', '>= 1.17.0'
52
- spec.add_development_dependency 'rake', '~> 12.0'
52
+ spec.add_development_dependency 'rake', '~> 10.0'
53
53
  spec.add_development_dependency 'rake-compiler', '~> 1.0'
54
54
  spec.add_development_dependency 'rspec', '~> 3.0'
55
55
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rumale
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.13.4
4
+ version: 0.13.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-09-22 00:00:00.000000000 Z
11
+ date: 2019-10-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: numo-narray
@@ -42,16 +42,16 @@ dependencies:
42
42
  name: coveralls
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - ">="
45
+ - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: 0.8.23
47
+ version: '0.8'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - ">="
52
+ - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: 0.8.23
54
+ version: '0.8'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: numo-linalg
57
57
  requirement: !ruby/object:Gem::Requirement
@@ -86,14 +86,14 @@ dependencies:
86
86
  requirements:
87
87
  - - "~>"
88
88
  - !ruby/object:Gem::Version
89
- version: '12.0'
89
+ version: '10.0'
90
90
  type: :development
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
94
  - - "~>"
95
95
  - !ruby/object:Gem::Version
96
- version: '12.0'
96
+ version: '10.0'
97
97
  - !ruby/object:Gem::Dependency
98
98
  name: rake-compiler
99
99
  requirement: !ruby/object:Gem::Requirement
@@ -171,6 +171,7 @@ files:
171
171
  - lib/rumale/clustering/snn.rb
172
172
  - lib/rumale/clustering/spectral_clustering.rb
173
173
  - lib/rumale/dataset.rb
174
+ - lib/rumale/decomposition/factor_analysis.rb
174
175
  - lib/rumale/decomposition/nmf.rb
175
176
  - lib/rumale/decomposition/pca.rb
176
177
  - lib/rumale/ensemble/ada_boost_classifier.rb