rumale 0.12.1 → 0.12.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +1 -1
- data/lib/rumale.rb +2 -0
- data/lib/rumale/clustering/gaussian_mixture.rb +174 -0
- data/lib/rumale/preprocessing/ordinal_encoder.rb +120 -0
- data/lib/rumale/tree/base_decision_tree.rb +1 -1
- data/lib/rumale/tree/gradient_tree_regressor.rb +2 -2
- data/lib/rumale/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 59005b59f6a6a195fbe200260e0c74008fa532fe
+  data.tar.gz: 00e40ea656556bd5a42bf7d96674e9b758ec7460
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 59ef5edcd1b435260e79792ed592d3b044a17c83ed23705c5f065cd46d916bb761cc8b4e1759c76cf86febf8839925ba53322ede8b29f57bdb7e7d656f92104b
+  data.tar.gz: 2ef45761c87c14882532c27e957d83adc2e40719df03a8cb497da09429713ddf946d1e6ba8bdcb73536cf6daed49c098c851b3bae2a7d828dd3efad42a712360
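The digests above are the checksums recorded inside the packaged gem for its metadata.gz and data.tar.gz members. The sketch below is not part of the diff; it assumes the gem has already been fetched and unpacked (for example `gem fetch rumale -v 0.12.2 && tar -xf rumale-0.12.2.gem`, since a .gem file is a plain tar archive) and simply recomputes the SHA512 digests so they can be compared against checksums.yaml.

```ruby
# Hedged sketch: recompute the SHA512 digests of the unpacked gem members.
# Assumes metadata.gz and data.tar.gz sit in the current directory after
# unpacking rumale-0.12.2.gem.
require 'digest'

%w[metadata.gz data.tar.gz].each do |member|
  puts "#{member}: #{Digest::SHA512.file(member).hexdigest}"
end
```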
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -6,7 +6,7 @@
 [](https://coveralls.io/github/yoshoku/rumale?branch=master)
 [](https://badge.fury.io/rb/rumale)
 [](https://github.com/yoshoku/rumale/blob/master/LICENSE.txt)
-[](https://www.rubydoc.info/gems/rumale/0.12.1)
+[](https://www.rubydoc.info/gems/rumale/0.12.2)
 
 Rumale (**Ru**by **ma**chine **le**arning) is a machine learning library in Ruby.
 Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
data/lib/rumale.rb
CHANGED
@@ -57,6 +57,7 @@ require 'rumale/ensemble/random_forest_regressor'
 require 'rumale/ensemble/extra_trees_classifier'
 require 'rumale/ensemble/extra_trees_regressor'
 require 'rumale/clustering/k_means'
+require 'rumale/clustering/gaussian_mixture'
 require 'rumale/clustering/dbscan'
 require 'rumale/decomposition/pca'
 require 'rumale/decomposition/nmf'
@@ -68,6 +69,7 @@ require 'rumale/preprocessing/standard_scaler'
 require 'rumale/preprocessing/bin_discretizer'
 require 'rumale/preprocessing/label_encoder'
 require 'rumale/preprocessing/one_hot_encoder'
+require 'rumale/preprocessing/ordinal_encoder'
 require 'rumale/model_selection/k_fold'
 require 'rumale/model_selection/stratified_k_fold'
 require 'rumale/model_selection/shuffle_split'
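With these two requires added to the top-level entry point, a plain `require 'rumale'` is enough to load the new classes. A minimal check, assuming rumale 0.12.2 is installed:

```ruby
# Minimal sketch: requiring the gem's entry point should expose both
# classes introduced in 0.12.2.
require 'rumale'

puts defined?(Rumale::Clustering::GaussianMixture)   # => "constant"
puts defined?(Rumale::Preprocessing::OrdinalEncoder) # => "constant"
```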
data/lib/rumale/clustering/gaussian_mixture.rb
ADDED
@@ -0,0 +1,174 @@
+# frozen_string_literal: true
+
+require 'rumale/base/base_estimator'
+require 'rumale/base/cluster_analyzer'
+require 'rumale/pairwise_metric'
+
+module Rumale
+  module Clustering
+    # GaussianMixture is a class that implements cluster analysis with gaussian mixture model.
+    # The current implementation uses only the diagonal elements of covariance matrices to represent mixture parameters
+    # without using full elements.
+    #
+    # @example
+    #   analyzer = Rumale::Clustering::GaussianMixture.new(n_clusters: 10, max_iter: 50)
+    #   cluster_labels = analyzer.fit_predict(samples)
+    class GaussianMixture
+      include Base::BaseEstimator
+      include Base::ClusterAnalyzer
+
+      # Return the number of iterations to covergence.
+      # @return [Integer]
+      attr_reader :n_iter
+
+      # Return the weight of each cluster.
+      # @return [Numo::DFloat] (shape: [n_clusters])
+      attr_reader :weights
+
+      # Return the mean of each cluster.
+      # @return [Numo::DFloat] (shape: [n_clusters, n_features])
+      attr_reader :means
+
+      # Return the diagonal elements of covariance matrix of each cluster.
+      # @return [Numo::DFloat] (shape: [n_clusters, n_features])
+      attr_reader :covariances
+
+      # Create a new cluster analyzer with gaussian mixture model.
+      #
+      # @param n_clusters [Integer] The number of clusters.
+      # @param init [String] The initialization method for centroids ('random' or 'k-means++').
+      # @param max_iter [Integer] The maximum number of iterations.
+      # @param tol [Float] The tolerance of termination criterion.
+      # @param reg_covar [Float] The non-negative regularization to the diagonal of covariance.
+      # @param random_seed [Integer] The seed value using to initialize the random generator.
+      def initialize(n_clusters: 8, init: 'k-means++', max_iter: 50, tol: 1.0e-4, reg_covar: 1.0e-6, random_seed: nil)
+        check_params_integer(n_clusters: n_clusters, max_iter: max_iter)
+        check_params_float(tol: tol)
+        check_params_string(init: init)
+        check_params_type_or_nil(Integer, random_seed: random_seed)
+        check_params_positive(n_clusters: n_clusters, max_iter: max_iter)
+        @params = {}
+        @params[:n_clusters] = n_clusters
+        @params[:init] = init == 'random' ? 'random' : 'k-means++'
+        @params[:max_iter] = max_iter
+        @params[:tol] = tol
+        @params[:reg_covar] = reg_covar
+        @params[:random_seed] = random_seed
+        @params[:random_seed] ||= srand
+        @n_iter = nil
+        @weights = nil
+        @means = nil
+        @covariances = nil
+      end
+
+      # Analysis clusters with given training data.
+      #
+      # @overload fit(x) -> GaussianMixture
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
+      # @return [GaussianMixture] The learned cluster analyzer itself.
+      def fit(x, _y = nil)
+        check_sample_array(x)
+        n_samples = x.shape[0]
+        memberships = init_memberships(x)
+        @params[:max_iter].times do |t|
+          @n_iter = t
+          @weights = calc_weights(n_samples, memberships)
+          @means = calc_means(x, memberships)
+          @covariances = calc_diag_covariances(x, @means, memberships) + @params[:reg_covar]
+          new_memberships = calc_memberships(x, @weights, @means, @covariances)
+          error = (memberships - new_memberships).abs.max
+          break if error <= @params[:tol]
+          memberships = new_memberships.dup
+        end
+        self
+      end
+
+      # Predict cluster labels for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the cluster label.
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
+      def predict(x)
+        check_sample_array(x)
+        memberships = calc_memberships(x, @weights, @means, @covariances)
+        assign_cluster(memberships)
+      end
+
+      # Analysis clusters and assign samples to clusters.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
+      def fit_predict(x)
+        check_sample_array(x)
+        fit(x).predict(x)
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data.
+      def marshal_dump
+        { params: @params,
+          n_iter: @n_iter,
+          weights: @weights,
+          means: @means,
+          covariances: @covariances }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @params = obj[:params]
+        @n_iter = obj[:n_iter]
+        @weights = obj[:weights]
+        @means = obj[:means]
+        @covariances = obj[:covariances]
+        nil
+      end
+
+      private
+
+      def assign_cluster(memberships)
+        n_clusters = memberships.shape[1]
+        memberships.max_index(axis: 1) - Numo::Int32[*0.step(memberships.size - 1, n_clusters)]
+      end
+
+      def init_memberships(x)
+        kmeans = Rumale::Clustering::KMeans.new(
+          n_clusters: @params[:n_clusters], init: @params[:init], max_iter: 0, random_seed: @params[:random_seed]
+        )
+        cluster_ids = kmeans.fit_predict(x)
+        encoder = Rumale::Preprocessing::OneHotEncoder.new
+        encoder.fit_transform(cluster_ids)
+      end
+
+      def calc_memberships(x, weights, means, diag_cov)
+        n_samples = x.shape[0]
+        n_clusters = means.shape[0]
+        memberships = Numo::DFloat.zeros(n_samples, n_clusters)
+        n_clusters.times do |n|
+          centered = x - means[n, true]
+          inv_cov = 1.0 / diag_cov[n, true]
+          sqrt_det_cov = 1.0 / Math.sqrt(diag_cov[n, true].prod)
+          memberships[true, n] = weights[n] * sqrt_det_cov * Numo::NMath.exp(-0.5 * (centered * inv_cov * centered).sum(1))
+        end
+        memberships / memberships.sum(1).expand_dims(1)
+      end
+
+      def calc_weights(n_samples, memberships)
+        memberships.sum(0) / n_samples
+      end
+
+      def calc_means(x, memberships)
+        memberships.transpose.dot(x) / memberships.sum(0).expand_dims(1)
+      end
+
+      def calc_diag_covariances(x, means, memberships)
+        n_clusters = means.shape[0]
+        diag_cov = Array.new(n_clusters) do |n|
+          centered = x - means[n, true]
+          memberships[true, n].dot(centered**2) / memberships[true, n].sum
+        end
+        Numo::DFloat.asarray(diag_cov)
+      end
+    end
+  end
+end
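The class runs a diagonal-covariance EM loop: memberships are initialized from a single k-means step, and each iteration recomputes cluster weights, means, and diagonal covariances until the largest membership change falls below `tol`. The sketch below exercises the public API on a synthetic two-blob dataset; the data generation with `rand_norm` and `vstack` is my own illustration and not part of the diff.

```ruby
# Hedged usage sketch for the new GaussianMixture analyzer (rumale 0.12.2).
# The two Gaussian blobs are synthetic illustration data.
require 'rumale'
require 'numo/narray'

blob_a = Numo::DFloat.new(50, 2).rand_norm(0.0, 1.0) # 50 samples around (0, 0)
blob_b = Numo::DFloat.new(50, 2).rand_norm(5.0, 1.0) # 50 samples around (5, 5)
samples = Numo::NArray.vstack([blob_a, blob_b])

analyzer = Rumale::Clustering::GaussianMixture.new(n_clusters: 2, max_iter: 50, random_seed: 1)
labels = analyzer.fit_predict(samples)

p analyzer.n_iter       # iterations until the membership change dropped below tol
p analyzer.weights      # shape: [2]    mixing weight of each cluster
p analyzer.covariances  # shape: [2, 2] diagonal covariance elements only
p labels.to_a.first(10) # cluster label per sample
```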
data/lib/rumale/preprocessing/ordinal_encoder.rb
ADDED
@@ -0,0 +1,120 @@
+# frozen_string_literal: true
+
+require 'rumale/base/base_estimator'
+require 'rumale/base/transformer'
+
+module Rumale
+  module Preprocessing
+    # Transfrom categorical features to integer values.
+    #
+    # @example
+    #   encoder = Rumale::Preprocessing::OrdinalEncoder.new
+    #   training_samples = [['left', 10], ['right', 15], ['right', 20]]
+    #   training_samples = Numo::NArray.asarray(training_samples)
+    #   encoder.fit(training_samples)
+    #   p encoder.categories
+    #   # [["left", "right"], [10, 15, 20]]
+    #   testing_samples = [['left', 20], ['right', 10]]
+    #   testing_samples = Numo::NArray.asarray(testing_samples)
+    #   encoded = encoder.transform(testing_samples)
+    #   p encoded
+    #   # Numo::DFloat#shape=[2,2]
+    #   # [[0, 2],
+    #   #  [1, 0]]
+    #   p encoder.inverse_transform(encoded)
+    #   # Numo::RObject#shape=[2,2]
+    #   # [["left", 20],
+    #   #  ["right", 10]]
+    class OrdinalEncoder
+      include Base::BaseEstimator
+      include Base::Transformer
+
+      # Return the array consists of categorical value each feature.
+      # @return [Array] (size: n_features)
+      attr_reader :categories
+
+      # Create a new encoder that transform categorical features to integer values.
+      #
+      # @param categories [Nil/Array] The category list for each feature.
+      #   If nil is given, extracted categories from the training data by calling the fit method are used.
+      def initialize(categories: nil)
+        check_params_type_or_nil(Array, categories: categories)
+        @categories = categories
+      end
+
+      # Fit encoder by extracting the category for each feature.
+      #
+      # @overload fit(x) -> OrdinalEncoder
+      #
+      # @param x [Numo::NArray] (shape: [n_samples, n_features]) The samples consisting of categorical features.
+      # @return [LabelEncoder]
+      def fit(x, _y = nil)
+        raise TypeError, 'Expect class of sample matrix to be Numo::NArray' unless x.is_a?(Numo::NArray)
+        raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2
+        n_features = x.shape[1]
+        @categories = Array.new(n_features) { |n| x[true, n].to_a.uniq.sort }
+        self
+      end
+
+      # Fit encoder, then return encoded categorical features to integer values.
+      #
+      # @overload fit_transform(x) -> Numo::DFloat
+      #
+      # @param x [Numo::NArray] (shape: [n_samples, n_features]) The samples consisting of categorical features.
+      # @return [Numo::DFloat] The encoded categorical features to integer values.
+      def fit_transform(x, _y = nil)
+        raise TypeError, 'Expect class of sample matrix to be Numo::NArray' unless x.is_a?(Numo::NArray)
+        raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2
+        fit(x).transform(x)
+      end
+
+      # Encode categorical features.
+      #
+      # @param x [Numo::NArray] (shape: [n_samples, n_features]) The samples consisting of categorical features.
+      # @return [Numo::DFloat] The encoded categorical features to integer values.
+      def transform(x)
+        raise TypeError, 'Expect class of sample matrix to be Numo::NArray' unless x.is_a?(Numo::NArray)
+        raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2
+
+        n_features = x.shape[1]
+        raise ArgumentError, 'Expect the number of features and the number of categories to be equal' if n_features != @categories.size
+
+        transformed = Array.new(n_features) do |n|
+          x[true, n].to_a.map { |v| @categories[n].index(v) }
+        end
+
+        Numo::DFloat.asarray(transformed.transpose)
+      end
+
+      # Decode values to categorical features.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples consisting of values transformed from categorical features.
+      # @return [Numo::NArray] The decoded features.
+      def inverse_transform(x)
+        check_sample_array(x)
+
+        n_features = x.shape[1]
+        raise ArgumentError, 'Expect the number of features and the number of categories to be equal' if n_features != @categories.size
+
+        inv_transformed = Array.new(n_features) do |n|
+          x[true, n].to_a.map { |i| @categories[n][i.to_i] }
+        end
+
+        Numo::NArray.asarray(inv_transformed.transpose)
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about OrdinalEncoder.
+      def marshal_dump
+        { categories: @categories }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @categories = obj[:categories]
+        nil
+      end
+    end
+  end
+end
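Beyond the fit/transform round trip shown in the class's @example, the constructor also accepts an explicit `categories:` list, and as the code above suggests, `transform` and `inverse_transform` only consult that list, so no `fit` call is needed when it is supplied. A small hedged sketch; the fixed category list here is my own illustration, not part of the diff:

```ruby
# Hedged sketch: OrdinalEncoder with a predefined category list per feature,
# so the encoder can transform without fitting first.
require 'rumale'
require 'numo/narray'

encoder = Rumale::Preprocessing::OrdinalEncoder.new(
  categories: [%w[left right], [10, 15, 20]]
)

samples = Numo::NArray.asarray([['left', 20], ['right', 10]])
encoded = encoder.transform(samples)
p encoded.to_a                            # => [[0.0, 2.0], [1.0, 0.0]]
p encoder.inverse_transform(encoded).to_a # => [["left", 20], ["right", 10]]
```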
data/lib/rumale/tree/gradient_tree_regressor.rb
CHANGED
@@ -155,7 +155,7 @@ module Rumale
       def build_tree(x, y, g, h)
         @feature_ids = Array.new(x.shape[1]) { |v| v }
         @tree = grow_node(0, x, y, g, h)
-
+        @feature_ids = nil
         nil
       end
 
@@ -163,7 +163,7 @@ module Rumale
         # intialize some variables.
         sum_g = g.sum
         sum_h = h.sum
-        n_samples
+        n_samples = x.shape[0]
         node = Node.new(depth: depth, n_samples: n_samples)
 
         # terminate growing.
data/lib/rumale/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rumale
 version: !ruby/object:Gem::Version
-  version: 0.12.1
+  version: 0.12.2
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-06-
+date: 2019-06-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: numo-narray
@@ -147,6 +147,7 @@ files:
 - lib/rumale/base/splitter.rb
 - lib/rumale/base/transformer.rb
 - lib/rumale/clustering/dbscan.rb
+- lib/rumale/clustering/gaussian_mixture.rb
 - lib/rumale/clustering/k_means.rb
 - lib/rumale/dataset.rb
 - lib/rumale/decomposition/nmf.rb
@@ -213,6 +214,7 @@ files:
 - lib/rumale/preprocessing/max_abs_scaler.rb
 - lib/rumale/preprocessing/min_max_scaler.rb
 - lib/rumale/preprocessing/one_hot_encoder.rb
+- lib/rumale/preprocessing/ordinal_encoder.rb
 - lib/rumale/preprocessing/standard_scaler.rb
 - lib/rumale/probabilistic_output.rb
 - lib/rumale/tree/base_decision_tree.rb