rumale 0.12.1 → 0.12.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e069b026a9236e69924051af75689ba292632b50
4
- data.tar.gz: b98e3260e947f66936a7149a8b09eb24bb5219fe
3
+ metadata.gz: 59005b59f6a6a195fbe200260e0c74008fa532fe
4
+ data.tar.gz: 00e40ea656556bd5a42bf7d96674e9b758ec7460
5
5
  SHA512:
6
- metadata.gz: 2ff9dac0d7d633064fc3f4b5127a639a4617357c219fc20ad38c79048981c555fdddebc3f8d16047798bdfc658292e98cbc04d1f26733bcebf4a6412ca84f3ab
7
- data.tar.gz: 9ad31638ea47d527a0af109a8cda2dcb44d0296ce31a88509833e4f8a116f49601dbf6353606fca080ed378425f5ce759c440791e8eccd90743ed65b14b1ef86
6
+ metadata.gz: 59ef5edcd1b435260e79792ed592d3b044a17c83ed23705c5f065cd46d916bb761cc8b4e1759c76cf86febf8839925ba53322ede8b29f57bdb7e7d656f92104b
7
+ data.tar.gz: 2ef45761c87c14882532c27e957d83adc2e40719df03a8cb497da09429713ddf946d1e6ba8bdcb73536cf6daed49c098c851b3bae2a7d828dd3efad42a712360
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
1
+ # 0.12.2
2
+ - Add class for cluster analysis with Gaussian Mixture Model.
3
+ - Add encoder class for categorical features.
4
+
1
5
  # 0.12.1
2
6
  - Refactor kernel support vector classifier.
3
7
  - Refactor random sampling on tree estimators.
data/README.md CHANGED
@@ -6,7 +6,7 @@
6
6
  [![Coverage Status](https://coveralls.io/repos/github/yoshoku/rumale/badge.svg?branch=master)](https://coveralls.io/github/yoshoku/rumale?branch=master)
7
7
  [![Gem Version](https://badge.fury.io/rb/rumale.svg)](https://badge.fury.io/rb/rumale)
8
8
  [![BSD 2-Clause License](https://img.shields.io/badge/License-BSD%202--Clause-orange.svg)](https://github.com/yoshoku/rumale/blob/master/LICENSE.txt)
9
- [![Documentation](http://img.shields.io/badge/docs-rdoc.info-blue.svg)](https://www.rubydoc.info/gems/rumale/0.12.1)
9
+ [![Documentation](http://img.shields.io/badge/docs-rdoc.info-blue.svg)](https://www.rubydoc.info/gems/rumale/0.12.2)
10
10
 
11
11
  Rumale (**Ru**by **ma**chine **le**arning) is a machine learning library in Ruby.
12
12
  Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
data/lib/rumale.rb CHANGED
@@ -57,6 +57,7 @@ require 'rumale/ensemble/random_forest_regressor'
57
57
  require 'rumale/ensemble/extra_trees_classifier'
58
58
  require 'rumale/ensemble/extra_trees_regressor'
59
59
  require 'rumale/clustering/k_means'
60
+ require 'rumale/clustering/gaussian_mixture'
60
61
  require 'rumale/clustering/dbscan'
61
62
  require 'rumale/decomposition/pca'
62
63
  require 'rumale/decomposition/nmf'
@@ -68,6 +69,7 @@ require 'rumale/preprocessing/standard_scaler'
68
69
  require 'rumale/preprocessing/bin_discretizer'
69
70
  require 'rumale/preprocessing/label_encoder'
70
71
  require 'rumale/preprocessing/one_hot_encoder'
72
+ require 'rumale/preprocessing/ordinal_encoder'
71
73
  require 'rumale/model_selection/k_fold'
72
74
  require 'rumale/model_selection/stratified_k_fold'
73
75
  require 'rumale/model_selection/shuffle_split'
@@ -0,0 +1,174 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/base/base_estimator'
4
+ require 'rumale/base/cluster_analyzer'
5
+ require 'rumale/pairwise_metric'
6
+
7
+ module Rumale
8
+ module Clustering
9
# GaussianMixture is a class that implements cluster analysis with gaussian mixture model.
# The current implementation uses only the diagonal elements of covariance matrices to represent
# mixture parameters without using full elements.
#
# @example
#   analyzer = Rumale::Clustering::GaussianMixture.new(n_clusters: 10, max_iter: 50)
#   cluster_labels = analyzer.fit_predict(samples)
class GaussianMixture
  include Base::BaseEstimator
  include Base::ClusterAnalyzer

  # Return the number of iterations performed by the last call to fit (1-based count).
  # @return [Integer]
  attr_reader :n_iter

  # Return the weight of each cluster.
  # @return [Numo::DFloat] (shape: [n_clusters])
  attr_reader :weights

  # Return the mean of each cluster.
  # @return [Numo::DFloat] (shape: [n_clusters, n_features])
  attr_reader :means

  # Return the diagonal elements of covariance matrix of each cluster.
  # @return [Numo::DFloat] (shape: [n_clusters, n_features])
  attr_reader :covariances

  # Create a new cluster analyzer with gaussian mixture model.
  #
  # @param n_clusters [Integer] The number of clusters.
  # @param init [String] The initialization method for centroids ('random' or 'k-means++').
  #   Any value other than 'random' falls back to 'k-means++'.
  # @param max_iter [Integer] The maximum number of iterations.
  # @param tol [Float] The tolerance of termination criterion.
  # @param reg_covar [Float] The non-negative regularization to the diagonal of covariance.
  # @param random_seed [Integer] The seed value using to initialize the random generator.
  # @raise [ArgumentError] If reg_covar is negative.
  def initialize(n_clusters: 8, init: 'k-means++', max_iter: 50, tol: 1.0e-4, reg_covar: 1.0e-6, random_seed: nil)
    # NOTE(review): the check_params_* helpers are presumably supplied by the included Base modules.
    check_params_integer(n_clusters: n_clusters, max_iter: max_iter)
    check_params_float(tol: tol)
    check_params_string(init: init)
    check_params_type_or_nil(Integer, random_seed: random_seed)
    check_params_positive(n_clusters: n_clusters, max_iter: max_iter)
    # A negative regularizer could make the variances negative and break the density computation.
    raise ArgumentError, 'Expect reg_covar to be non-negative value' if reg_covar.negative?

    @params = {
      n_clusters: n_clusters,
      init: init == 'random' ? 'random' : 'k-means++',
      max_iter: max_iter,
      tol: tol,
      reg_covar: reg_covar,
      # Kernel#srand is only invoked when no seed was supplied.
      random_seed: random_seed || srand
    }
    @n_iter = nil
    @weights = nil
    @means = nil
    @covariances = nil
  end

  # Analysis clusters with given training data.
  #
  # Alternates parameter re-estimation (weights, means, diagonal covariances) with membership
  # re-computation until the largest absolute membership change is at most tol, or max_iter
  # iterations have been run.
  #
  # @overload fit(x) -> GaussianMixture
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
  # @return [GaussianMixture] The learned cluster analyzer itself.
  def fit(x, _y = nil)
    check_sample_array(x)
    n_samples = x.shape[0]
    memberships = init_memberships(x)
    @params[:max_iter].times do |t|
      # t is a zero-based index; store the count of iterations actually performed.
      @n_iter = t + 1
      @weights = calc_weights(n_samples, memberships)
      @means = calc_means(x, memberships)
      @covariances = calc_diag_covariances(x, @means, memberships) + @params[:reg_covar]
      new_memberships = calc_memberships(x, @weights, @means, @covariances)
      error = (memberships - new_memberships).abs.max
      break if error <= @params[:tol]

      memberships = new_memberships
    end
    self
  end

  # Predict cluster labels for samples.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the cluster label.
  # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
  def predict(x)
    check_sample_array(x)
    memberships = calc_memberships(x, @weights, @means, @covariances)
    assign_cluster(memberships)
  end

  # Analysis clusters and assign samples to clusters.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
  # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
  def fit_predict(x)
    check_sample_array(x)
    fit(x).predict(x)
  end

  # Dump marshal data.
  # @return [Hash] The marshal data.
  def marshal_dump
    { params: @params,
      n_iter: @n_iter,
      weights: @weights,
      means: @means,
      covariances: @covariances }
  end

  # Load marshal data.
  # @return [nil]
  def marshal_load(obj)
    @params = obj[:params]
    @n_iter = obj[:n_iter]
    @weights = obj[:weights]
    @means = obj[:means]
    @covariances = obj[:covariances]
    nil
  end

  private

  # Pick, for every sample (row), the cluster (column) with the largest membership.
  def assign_cluster(memberships)
    n_clusters = memberships.shape[1]
    # max_index is treated here as a position in the flattened array, so each row's starting
    # offset is subtracted to recover the column (cluster) index.
    row_offsets = Numo::Int32[*0.step(memberships.size - 1, n_clusters)]
    memberships.max_index(axis: 1) - row_offsets
  end

  # Build the initial hard (one-hot) memberships from a KMeans instance created with max_iter: 0.
  def init_memberships(x)
    kmeans = Rumale::Clustering::KMeans.new(
      n_clusters: @params[:n_clusters], init: @params[:init], max_iter: 0, random_seed: @params[:random_seed]
    )
    cluster_ids = kmeans.fit_predict(x)
    encoder = Rumale::Preprocessing::OneHotEncoder.new
    encoder.fit_transform(cluster_ids)
  end

  # Compute the normalized membership of every sample to every cluster.
  #
  # The weighted diagonal-Gaussian densities are evaluated in the log domain and shifted by the
  # per-row maximum before exponentiation, so that a tiny determinant or a large distance does not
  # underflow/overflow into Infinity or NaN. The (2*pi)^(-d/2) factor is omitted because it is
  # common to all clusters and cancels in the row normalization.
  def calc_memberships(x, weights, means, diag_cov)
    n_samples = x.shape[0]
    n_clusters = means.shape[0]
    log_probs = Numo::DFloat.zeros(n_samples, n_clusters)
    n_clusters.times do |n|
      centered = x - means[n, true]
      inv_cov = 1.0 / diag_cov[n, true]
      # log(weight / sqrt(det(cov))) for a diagonal covariance.
      log_norm = Math.log(weights[n]) - 0.5 * Numo::NMath.log(diag_cov[n, true]).sum
      log_probs[true, n] = -0.5 * (centered * inv_cov * centered).sum(1) + log_norm
    end
    shifted = log_probs - log_probs.max(axis: 1).expand_dims(1)
    memberships = Numo::NMath.exp(shifted)
    memberships / memberships.sum(1).expand_dims(1)
  end

  # Mixture weight of each cluster: its total membership divided by the number of samples.
  def calc_weights(n_samples, memberships)
    memberships.sum(0) / n_samples
  end

  # Membership-weighted mean of the samples for each cluster.
  def calc_means(x, memberships)
    memberships.transpose.dot(x) / memberships.sum(0).expand_dims(1)
  end

  # Membership-weighted per-feature variance (diagonal covariance) for each cluster.
  def calc_diag_covariances(x, means, memberships)
    n_clusters = means.shape[0]
    diag_cov = Array.new(n_clusters) do |n|
      centered = x - means[n, true]
      memberships[true, n].dot(centered**2) / memberships[true, n].sum
    end
    Numo::DFloat.asarray(diag_cov)
  end
end
173
+ end
174
+ end
@@ -0,0 +1,120 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/base/base_estimator'
4
+ require 'rumale/base/transformer'
5
+
6
+ module Rumale
7
+ module Preprocessing
8
# Transform categorical features to integer values.
#
# @example
#   encoder = Rumale::Preprocessing::OrdinalEncoder.new
#   training_samples = [['left', 10], ['right', 15], ['right', 20]]
#   training_samples = Numo::NArray.asarray(training_samples)
#   encoder.fit(training_samples)
#   p encoder.categories
#   # [["left", "right"], [10, 15, 20]]
#   testing_samples = [['left', 20], ['right', 10]]
#   testing_samples = Numo::NArray.asarray(testing_samples)
#   encoded = encoder.transform(testing_samples)
#   p encoded
#   # Numo::DFloat#shape=[2,2]
#   # [[0, 2],
#   #  [1, 0]]
#   p encoder.inverse_transform(encoded)
#   # Numo::RObject#shape=[2,2]
#   # [["left", 20],
#   #  ["right", 10]]
class OrdinalEncoder
  include Base::BaseEstimator
  include Base::Transformer

  # Return the array consists of categorical value each feature.
  # @return [Array] (size: n_features)
  attr_reader :categories

  # Create a new encoder that transform categorical features to integer values.
  #
  # @param categories [Nil/Array] The category list for each feature.
  #   If nil is given, extracted categories from the training data by calling the fit method are used.
  def initialize(categories: nil)
    check_params_type_or_nil(Array, categories: categories)
    @categories = categories
  end

  # Fit encoder by extracting the category for each feature.
  # The categories of each feature are its unique values in sorted order; any previously held
  # category list is replaced.
  #
  # @overload fit(x) -> OrdinalEncoder
  #
  # @param x [Numo::NArray] (shape: [n_samples, n_features]) The samples consisting of categorical features.
  # @return [OrdinalEncoder] The fitted encoder itself.
  # @raise [TypeError] If x is not a Numo::NArray.
  # @raise [ArgumentError] If x is not a 2-D array.
  def fit(x, _y = nil)
    raise TypeError, 'Expect class of sample matrix to be Numo::NArray' unless x.is_a?(Numo::NArray)
    raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2

    n_features = x.shape[1]
    @categories = Array.new(n_features) { |n| x[true, n].to_a.uniq.sort }
    self
  end

  # Fit encoder, then return encoded categorical features to integer values.
  #
  # @overload fit_transform(x) -> Numo::DFloat
  #
  # @param x [Numo::NArray] (shape: [n_samples, n_features]) The samples consisting of categorical features.
  # @return [Numo::DFloat] The encoded categorical features to integer values.
  # @raise [TypeError] If x is not a Numo::NArray.
  # @raise [ArgumentError] If x is not a 2-D array.
  def fit_transform(x, _y = nil)
    # fit performs the type and shape validation before anything else happens.
    fit(x).transform(x)
  end

  # Encode categorical features.
  # Each value is replaced by its position in the category list of its feature.
  #
  # @param x [Numo::NArray] (shape: [n_samples, n_features]) The samples consisting of categorical features.
  # @return [Numo::DFloat] The encoded categorical features to integer values.
  # @raise [TypeError] If x is not a Numo::NArray.
  # @raise [ArgumentError] If x is not a 2-D array, if its number of features differs from the
  #   number of category lists, or if a value does not appear in the category list of its feature.
  def transform(x)
    raise TypeError, 'Expect class of sample matrix to be Numo::NArray' unless x.is_a?(Numo::NArray)
    raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2

    n_features = x.shape[1]
    raise ArgumentError, 'Expect the number of features and the number of categories to be equal' if n_features != @categories.size

    transformed = Array.new(n_features) do |n|
      x[true, n].to_a.map do |v|
        # Array#index returns nil for an unseen value; fail loudly instead of handing nil to Numo.
        @categories[n].index(v) ||
          raise(ArgumentError, "Expect all values to be known categories: #{v.inspect} is unknown for feature #{n}")
      end
    end

    # The codes were collected feature by feature, so transpose back to [n_samples, n_features].
    Numo::DFloat.asarray(transformed.transpose)
  end

  # Decode values to categorical features.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples consisting of values transformed from categorical features.
  # @return [Numo::NArray] The decoded features.
  # @raise [ArgumentError] If the number of features differs from the number of category lists,
  #   or if a code does not point inside the category list of its feature.
  def inverse_transform(x)
    check_sample_array(x)

    n_features = x.shape[1]
    raise ArgumentError, 'Expect the number of features and the number of categories to be equal' if n_features != @categories.size

    inv_transformed = Array.new(n_features) do |n|
      feature_categories = @categories[n]
      x[true, n].to_a.map do |i|
        code = i.to_i
        # A negative code would wrap around and a too-large code would decode to nil.
        unless code.between?(0, feature_categories.size - 1)
          raise ArgumentError, "Expect all codes to be valid category indices: #{i.inspect} is out of range for feature #{n}"
        end

        feature_categories[code]
      end
    end

    Numo::NArray.asarray(inv_transformed.transpose)
  end

  # Dump marshal data.
  # @return [Hash] The marshal data about OrdinalEncoder.
  def marshal_dump
    { categories: @categories }
  end

  # Load marshal data.
  # @return [nil]
  def marshal_load(obj)
    @categories = obj[:categories]
    nil
  end
end
119
+ end
120
+ end
@@ -70,7 +70,7 @@ module Rumale
70
70
 
71
71
  def grow_node(depth, x, y, impurity)
72
72
  # initialize node.
73
- n_samples, n_features = x.shape
73
+ n_samples = x.shape[0]
74
74
  node = Node.new(depth: depth, impurity: impurity, n_samples: n_samples)
75
75
 
76
76
  # terminate growing.
@@ -155,7 +155,7 @@ module Rumale
155
155
  def build_tree(x, y, g, h)
156
156
  @feature_ids = Array.new(x.shape[1]) { |v| v }
157
157
  @tree = grow_node(0, x, y, g, h)
158
- @feature_ids = nil
158
+ @feature_ids = nil
159
159
  nil
160
160
  end
161
161
 
@@ -163,7 +163,7 @@ module Rumale
163
163
  # initialize some variables.
164
164
  sum_g = g.sum
165
165
  sum_h = h.sum
166
- n_samples, n_features = x.shape
166
+ n_samples = x.shape[0]
167
167
  node = Node.new(depth: depth, n_samples: n_samples)
168
168
 
169
169
  # terminate growing.
@@ -3,5 +3,5 @@
3
3
  # Rumale is a machine learning library in Ruby.
4
4
  module Rumale
5
5
  # The version of Rumale you are using.
6
- VERSION = '0.12.1'
6
+ VERSION = '0.12.2'
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rumale
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.12.1
4
+ version: 0.12.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-06-08 00:00:00.000000000 Z
11
+ date: 2019-06-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: numo-narray
@@ -147,6 +147,7 @@ files:
147
147
  - lib/rumale/base/splitter.rb
148
148
  - lib/rumale/base/transformer.rb
149
149
  - lib/rumale/clustering/dbscan.rb
150
+ - lib/rumale/clustering/gaussian_mixture.rb
150
151
  - lib/rumale/clustering/k_means.rb
151
152
  - lib/rumale/dataset.rb
152
153
  - lib/rumale/decomposition/nmf.rb
@@ -213,6 +214,7 @@ files:
213
214
  - lib/rumale/preprocessing/max_abs_scaler.rb
214
215
  - lib/rumale/preprocessing/min_max_scaler.rb
215
216
  - lib/rumale/preprocessing/one_hot_encoder.rb
217
+ - lib/rumale/preprocessing/ordinal_encoder.rb
216
218
  - lib/rumale/preprocessing/standard_scaler.rb
217
219
  - lib/rumale/probabilistic_output.rb
218
220
  - lib/rumale/tree/base_decision_tree.rb