rumale 0.12.1 → 0.12.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e069b026a9236e69924051af75689ba292632b50
4
- data.tar.gz: b98e3260e947f66936a7149a8b09eb24bb5219fe
3
+ metadata.gz: 59005b59f6a6a195fbe200260e0c74008fa532fe
4
+ data.tar.gz: 00e40ea656556bd5a42bf7d96674e9b758ec7460
5
5
  SHA512:
6
- metadata.gz: 2ff9dac0d7d633064fc3f4b5127a639a4617357c219fc20ad38c79048981c555fdddebc3f8d16047798bdfc658292e98cbc04d1f26733bcebf4a6412ca84f3ab
7
- data.tar.gz: 9ad31638ea47d527a0af109a8cda2dcb44d0296ce31a88509833e4f8a116f49601dbf6353606fca080ed378425f5ce759c440791e8eccd90743ed65b14b1ef86
6
+ metadata.gz: 59ef5edcd1b435260e79792ed592d3b044a17c83ed23705c5f065cd46d916bb761cc8b4e1759c76cf86febf8839925ba53322ede8b29f57bdb7e7d656f92104b
7
+ data.tar.gz: 2ef45761c87c14882532c27e957d83adc2e40719df03a8cb497da09429713ddf946d1e6ba8bdcb73536cf6daed49c098c851b3bae2a7d828dd3efad42a712360
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
1
+ # 0.12.2
2
+ - Add class for cluster analysis with Gaussian Mixture Model.
3
+ - Add encoder class for categorical features.
4
+
1
5
  # 0.12.1
2
6
  - Refactor kernel support vector classifier.
3
7
  - Refactor random sampling on tree estimators.
data/README.md CHANGED
@@ -6,7 +6,7 @@
6
6
  [![Coverage Status](https://coveralls.io/repos/github/yoshoku/rumale/badge.svg?branch=master)](https://coveralls.io/github/yoshoku/rumale?branch=master)
7
7
  [![Gem Version](https://badge.fury.io/rb/rumale.svg)](https://badge.fury.io/rb/rumale)
8
8
  [![BSD 2-Clause License](https://img.shields.io/badge/License-BSD%202--Clause-orange.svg)](https://github.com/yoshoku/rumale/blob/master/LICENSE.txt)
9
- [![Documentation](http://img.shields.io/badge/docs-rdoc.info-blue.svg)](https://www.rubydoc.info/gems/rumale/0.12.1)
9
+ [![Documentation](http://img.shields.io/badge/docs-rdoc.info-blue.svg)](https://www.rubydoc.info/gems/rumale/0.12.2)
10
10
 
11
11
  Rumale (**Ru**by **ma**chine **le**arning) is a machine learning library in Ruby.
12
12
  Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
data/lib/rumale.rb CHANGED
@@ -57,6 +57,7 @@ require 'rumale/ensemble/random_forest_regressor'
57
57
  require 'rumale/ensemble/extra_trees_classifier'
58
58
  require 'rumale/ensemble/extra_trees_regressor'
59
59
  require 'rumale/clustering/k_means'
60
+ require 'rumale/clustering/gaussian_mixture'
60
61
  require 'rumale/clustering/dbscan'
61
62
  require 'rumale/decomposition/pca'
62
63
  require 'rumale/decomposition/nmf'
@@ -68,6 +69,7 @@ require 'rumale/preprocessing/standard_scaler'
68
69
  require 'rumale/preprocessing/bin_discretizer'
69
70
  require 'rumale/preprocessing/label_encoder'
70
71
  require 'rumale/preprocessing/one_hot_encoder'
72
+ require 'rumale/preprocessing/ordinal_encoder'
71
73
  require 'rumale/model_selection/k_fold'
72
74
  require 'rumale/model_selection/stratified_k_fold'
73
75
  require 'rumale/model_selection/shuffle_split'
@@ -0,0 +1,174 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/base/base_estimator'
4
+ require 'rumale/base/cluster_analyzer'
5
+ require 'rumale/pairwise_metric'
6
+
7
+ module Rumale
8
+ module Clustering
9
+ # GaussianMixture is a class that implements cluster analysis with gaussian mixture model.
10
+ # The current implementation uses only the diagonal elements of covariance matrices to represent mixture parameters
11
+ # without using full elements.
12
+ #
13
+ # @example
14
+ # analyzer = Rumale::Clustering::GaussianMixture.new(n_clusters: 10, max_iter: 50)
15
+ # cluster_labels = analyzer.fit_predict(samples)
16
+ class GaussianMixture
17
+ include Base::BaseEstimator
18
+ include Base::ClusterAnalyzer
19
+
20
+ # Return the number of iterations to convergence.
21
+ # @return [Integer]
22
+ attr_reader :n_iter
23
+
24
+ # Return the weight of each cluster.
25
+ # @return [Numo::DFloat] (shape: [n_clusters])
26
+ attr_reader :weights
27
+
28
+ # Return the mean of each cluster.
29
+ # @return [Numo::DFloat] (shape: [n_clusters, n_features])
30
+ attr_reader :means
31
+
32
+ # Return the diagonal elements of covariance matrix of each cluster.
33
+ # @return [Numo::DFloat] (shape: [n_clusters, n_features])
34
+ attr_reader :covariances
35
+
36
+ # Create a new cluster analyzer with gaussian mixture model.
37
+ #
38
+ # @param n_clusters [Integer] The number of clusters.
39
+ # @param init [String] The initialization method for centroids ('random' or 'k-means++').
40
+ # @param max_iter [Integer] The maximum number of iterations.
41
+ # @param tol [Float] The tolerance of termination criterion.
42
+ # @param reg_covar [Float] The non-negative regularization to the diagonal of covariance.
43
+ # @param random_seed [Integer] The seed value used to initialize the random generator.
44
+ def initialize(n_clusters: 8, init: 'k-means++', max_iter: 50, tol: 1.0e-4, reg_covar: 1.0e-6, random_seed: nil)
45
+ check_params_integer(n_clusters: n_clusters, max_iter: max_iter)
46
+ check_params_float(tol: tol)
47
+ check_params_string(init: init)
48
+ check_params_type_or_nil(Integer, random_seed: random_seed)
49
+ check_params_positive(n_clusters: n_clusters, max_iter: max_iter)
50
+ @params = {}
51
+ @params[:n_clusters] = n_clusters
52
+ @params[:init] = init == 'random' ? 'random' : 'k-means++'
53
+ @params[:max_iter] = max_iter
54
+ @params[:tol] = tol
55
+ @params[:reg_covar] = reg_covar
56
+ @params[:random_seed] = random_seed
57
+ @params[:random_seed] ||= srand
58
+ @n_iter = nil
59
+ @weights = nil
60
+ @means = nil
61
+ @covariances = nil
62
+ end
63
+
64
+ # Analyze clusters with given training data.
65
+ #
66
+ # @overload fit(x) -> GaussianMixture
67
+ #
68
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
69
+ # @return [GaussianMixture] The learned cluster analyzer itself.
70
+ def fit(x, _y = nil)
71
+ check_sample_array(x)
72
+ n_samples = x.shape[0]
73
+ memberships = init_memberships(x)
74
+ @params[:max_iter].times do |t|
75
+ @n_iter = t
76
+ @weights = calc_weights(n_samples, memberships)
77
+ @means = calc_means(x, memberships)
78
+ @covariances = calc_diag_covariances(x, @means, memberships) + @params[:reg_covar]
79
+ new_memberships = calc_memberships(x, @weights, @means, @covariances)
80
+ error = (memberships - new_memberships).abs.max
81
+ break if error <= @params[:tol]
82
+ memberships = new_memberships.dup
83
+ end
84
+ self
85
+ end
86
+
87
+ # Predict cluster labels for samples.
88
+ #
89
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the cluster label.
90
+ # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
91
+ def predict(x)
92
+ check_sample_array(x)
93
+ memberships = calc_memberships(x, @weights, @means, @covariances)
94
+ assign_cluster(memberships)
95
+ end
96
+
97
+ # Analyze clusters and assign samples to clusters.
98
+ #
99
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
100
+ # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
101
+ def fit_predict(x)
102
+ check_sample_array(x)
103
+ fit(x).predict(x)
104
+ end
105
+
106
+ # Dump marshal data.
107
+ # @return [Hash] The marshal data.
108
+ def marshal_dump
109
+ { params: @params,
110
+ n_iter: @n_iter,
111
+ weights: @weights,
112
+ means: @means,
113
+ covariances: @covariances }
114
+ end
115
+
116
+ # Load marshal data.
117
+ # @return [nil]
118
+ def marshal_load(obj)
119
+ @params = obj[:params]
120
+ @n_iter = obj[:n_iter]
121
+ @weights = obj[:weights]
122
+ @means = obj[:means]
123
+ @covariances = obj[:covariances]
124
+ nil
125
+ end
126
+
127
+ private
128
+
129
+ def assign_cluster(memberships)
130
+ n_clusters = memberships.shape[1]
131
+ memberships.max_index(axis: 1) - Numo::Int32[*0.step(memberships.size - 1, n_clusters)]
132
+ end
133
+
134
+ def init_memberships(x)
135
+ kmeans = Rumale::Clustering::KMeans.new(
136
+ n_clusters: @params[:n_clusters], init: @params[:init], max_iter: 0, random_seed: @params[:random_seed]
137
+ )
138
+ cluster_ids = kmeans.fit_predict(x)
139
+ encoder = Rumale::Preprocessing::OneHotEncoder.new
140
+ encoder.fit_transform(cluster_ids)
141
+ end
142
+
143
+ def calc_memberships(x, weights, means, diag_cov)
144
+ n_samples = x.shape[0]
145
+ n_clusters = means.shape[0]
146
+ memberships = Numo::DFloat.zeros(n_samples, n_clusters)
147
+ n_clusters.times do |n|
148
+ centered = x - means[n, true]
149
+ inv_cov = 1.0 / diag_cov[n, true]
150
+ sqrt_det_cov = 1.0 / Math.sqrt(diag_cov[n, true].prod)
151
+ memberships[true, n] = weights[n] * sqrt_det_cov * Numo::NMath.exp(-0.5 * (centered * inv_cov * centered).sum(1))
152
+ end
153
+ memberships / memberships.sum(1).expand_dims(1)
154
+ end
155
+
156
+ def calc_weights(n_samples, memberships)
157
+ memberships.sum(0) / n_samples
158
+ end
159
+
160
+ def calc_means(x, memberships)
161
+ memberships.transpose.dot(x) / memberships.sum(0).expand_dims(1)
162
+ end
163
+
164
+ def calc_diag_covariances(x, means, memberships)
165
+ n_clusters = means.shape[0]
166
+ diag_cov = Array.new(n_clusters) do |n|
167
+ centered = x - means[n, true]
168
+ memberships[true, n].dot(centered**2) / memberships[true, n].sum
169
+ end
170
+ Numo::DFloat.asarray(diag_cov)
171
+ end
172
+ end
173
+ end
174
+ end
@@ -0,0 +1,120 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/base/base_estimator'
4
+ require 'rumale/base/transformer'
5
+
6
+ module Rumale
7
+ module Preprocessing
8
+ # Transform categorical features to integer values.
9
+ #
10
+ # @example
11
+ # encoder = Rumale::Preprocessing::OrdinalEncoder.new
12
+ # training_samples = [['left', 10], ['right', 15], ['right', 20]]
13
+ # training_samples = Numo::NArray.asarray(training_samples)
14
+ # encoder.fit(training_samples)
15
+ # p encoder.categories
16
+ # # [["left", "right"], [10, 15, 20]]
17
+ # testing_samples = [['left', 20], ['right', 10]]
18
+ # testing_samples = Numo::NArray.asarray(testing_samples)
19
+ # encoded = encoder.transform(testing_samples)
20
+ # p encoded
21
+ # # Numo::DFloat#shape=[2,2]
22
+ # # [[0, 2],
23
+ # # [1, 0]]
24
+ # p encoder.inverse_transform(encoded)
25
+ # # Numo::RObject#shape=[2,2]
26
+ # # [["left", 20],
27
+ # # ["right", 10]]
28
+ class OrdinalEncoder
29
+ include Base::BaseEstimator
30
+ include Base::Transformer
31
+
32
+ # Return the array consisting of the categorical values for each feature.
33
+ # @return [Array] (size: n_features)
34
+ attr_reader :categories
35
+
36
+ # Create a new encoder that transforms categorical features to integer values.
37
+ #
38
+ # @param categories [Nil/Array] The category list for each feature.
39
+ # If nil is given, extracted categories from the training data by calling the fit method are used.
40
+ def initialize(categories: nil)
41
+ check_params_type_or_nil(Array, categories: categories)
42
+ @categories = categories
43
+ end
44
+
45
+ # Fit encoder by extracting the category for each feature.
46
+ #
47
+ # @overload fit(x) -> OrdinalEncoder
48
+ #
49
+ # @param x [Numo::NArray] (shape: [n_samples, n_features]) The samples consisting of categorical features.
50
+ # @return [OrdinalEncoder]
51
+ def fit(x, _y = nil)
52
+ raise TypeError, 'Expect class of sample matrix to be Numo::NArray' unless x.is_a?(Numo::NArray)
53
+ raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2
54
+ n_features = x.shape[1]
55
+ @categories = Array.new(n_features) { |n| x[true, n].to_a.uniq.sort }
56
+ self
57
+ end
58
+
59
+ # Fit encoder, then return encoded categorical features to integer values.
60
+ #
61
+ # @overload fit_transform(x) -> Numo::DFloat
62
+ #
63
+ # @param x [Numo::NArray] (shape: [n_samples, n_features]) The samples consisting of categorical features.
64
+ # @return [Numo::DFloat] The encoded categorical features to integer values.
65
+ def fit_transform(x, _y = nil)
66
+ raise TypeError, 'Expect class of sample matrix to be Numo::NArray' unless x.is_a?(Numo::NArray)
67
+ raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2
68
+ fit(x).transform(x)
69
+ end
70
+
71
+ # Encode categorical features.
72
+ #
73
+ # @param x [Numo::NArray] (shape: [n_samples, n_features]) The samples consisting of categorical features.
74
+ # @return [Numo::DFloat] The encoded categorical features to integer values.
75
+ def transform(x)
76
+ raise TypeError, 'Expect class of sample matrix to be Numo::NArray' unless x.is_a?(Numo::NArray)
77
+ raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2
78
+
79
+ n_features = x.shape[1]
80
+ raise ArgumentError, 'Expect the number of features and the number of categories to be equal' if n_features != @categories.size
81
+
82
+ transformed = Array.new(n_features) do |n|
83
+ x[true, n].to_a.map { |v| @categories[n].index(v) }
84
+ end
85
+
86
+ Numo::DFloat.asarray(transformed.transpose)
87
+ end
88
+
89
+ # Decode values to categorical features.
90
+ #
91
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples consisting of values transformed from categorical features.
92
+ # @return [Numo::NArray] The decoded features.
93
+ def inverse_transform(x)
94
+ check_sample_array(x)
95
+
96
+ n_features = x.shape[1]
97
+ raise ArgumentError, 'Expect the number of features and the number of categories to be equal' if n_features != @categories.size
98
+
99
+ inv_transformed = Array.new(n_features) do |n|
100
+ x[true, n].to_a.map { |i| @categories[n][i.to_i] }
101
+ end
102
+
103
+ Numo::NArray.asarray(inv_transformed.transpose)
104
+ end
105
+
106
+ # Dump marshal data.
107
+ # @return [Hash] The marshal data about OrdinalEncoder.
108
+ def marshal_dump
109
+ { categories: @categories }
110
+ end
111
+
112
+ # Load marshal data.
113
+ # @return [nil]
114
+ def marshal_load(obj)
115
+ @categories = obj[:categories]
116
+ nil
117
+ end
118
+ end
119
+ end
120
+ end
@@ -70,7 +70,7 @@ module Rumale
70
70
 
71
71
  def grow_node(depth, x, y, impurity)
72
72
  # initialize node.
73
- n_samples, n_features = x.shape
73
+ n_samples = x.shape[0]
74
74
  node = Node.new(depth: depth, impurity: impurity, n_samples: n_samples)
75
75
 
76
76
  # terminate growing.
@@ -155,7 +155,7 @@ module Rumale
155
155
  def build_tree(x, y, g, h)
156
156
  @feature_ids = Array.new(x.shape[1]) { |v| v }
157
157
  @tree = grow_node(0, x, y, g, h)
158
- @feature_ids = nil
158
+ @feature_ids = nil
159
159
  nil
160
160
  end
161
161
 
@@ -163,7 +163,7 @@ module Rumale
163
163
  # initialize some variables.
164
164
  sum_g = g.sum
165
165
  sum_h = h.sum
166
- n_samples, n_features = x.shape
166
+ n_samples = x.shape[0]
167
167
  node = Node.new(depth: depth, n_samples: n_samples)
168
168
 
169
169
  # terminate growing.
@@ -3,5 +3,5 @@
3
3
  # Rumale is a machine learning library in Ruby.
4
4
  module Rumale
5
5
  # The version of Rumale you are using.
6
- VERSION = '0.12.1'
6
+ VERSION = '0.12.2'
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rumale
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.12.1
4
+ version: 0.12.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-06-08 00:00:00.000000000 Z
11
+ date: 2019-06-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: numo-narray
@@ -147,6 +147,7 @@ files:
147
147
  - lib/rumale/base/splitter.rb
148
148
  - lib/rumale/base/transformer.rb
149
149
  - lib/rumale/clustering/dbscan.rb
150
+ - lib/rumale/clustering/gaussian_mixture.rb
150
151
  - lib/rumale/clustering/k_means.rb
151
152
  - lib/rumale/dataset.rb
152
153
  - lib/rumale/decomposition/nmf.rb
@@ -213,6 +214,7 @@ files:
213
214
  - lib/rumale/preprocessing/max_abs_scaler.rb
214
215
  - lib/rumale/preprocessing/min_max_scaler.rb
215
216
  - lib/rumale/preprocessing/one_hot_encoder.rb
217
+ - lib/rumale/preprocessing/ordinal_encoder.rb
216
218
  - lib/rumale/preprocessing/standard_scaler.rb
217
219
  - lib/rumale/probabilistic_output.rb
218
220
  - lib/rumale/tree/base_decision_tree.rb