rumale 0.12.2 → 0.12.3
This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +2 -2
- data/lib/rumale.rb +1 -0
- data/lib/rumale/clustering/power_iteration.rb +129 -0
- data/lib/rumale/dataset.rb +78 -0
- data/lib/rumale/pairwise_metric.rb +1 -2
- data/lib/rumale/version.rb +1 -1
- data/rumale.gemspec +1 -1
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6d9c7691afd71e50df0c05d726a535a7f5dd426f
+  data.tar.gz: 3c2ac53df9060b7ff8abc62717c2ae06c1adebca
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9686efe5c0126f29672b60047af8e604e4811916fd86a0f9151fe6e9b6e7ee292f4bbd33eae1dab8b2af1941910a52fbb68a0938f280bbce4e4c57faedf3215b
+  data.tar.gz: 2bf6c6c1fb42ab8290cc5471c4417f4aac8a07fdc23c607ffc10855bf3941f9b532e669374e5ad237f67bf5495902290284fc0697143c721e484e6e4ac39753a
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -6,14 +6,14 @@
 [](https://coveralls.io/github/yoshoku/rumale?branch=master)
 [](https://badge.fury.io/rb/rumale)
 [](https://github.com/yoshoku/rumale/blob/master/LICENSE.txt)
-[](https://www.rubydoc.info/gems/rumale/0.12.2)
+[](https://www.rubydoc.info/gems/rumale/0.12.3)
 
 Rumale (**Ru**by **ma**chine **le**arning) is a machine learning library in Ruby.
 Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
 Rumale supports Linear / Kernel Support Vector Machine,
 Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
 Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor classifier,
-K-Means, DBSCAN, t-SNE, Principal Component Analysis, and Non-negative Matrix Factorization.
+K-Means, Gaussian Mixture Model, DBSCAN, Power Iteration Clustering, t-SNE, Principal Component Analysis, and Non-negative Matrix Factorization.
 
 This project was formerly known as "SVMKit".
 If you are using SVMKit, please install Rumale and replace `SVMKit` constants with `Rumale`.
data/lib/rumale.rb
CHANGED
@@ -59,6 +59,7 @@ require 'rumale/ensemble/extra_trees_regressor'
 require 'rumale/clustering/k_means'
 require 'rumale/clustering/gaussian_mixture'
 require 'rumale/clustering/dbscan'
+require 'rumale/clustering/power_iteration'
 require 'rumale/decomposition/pca'
 require 'rumale/decomposition/nmf'
 require 'rumale/manifold/tsne'
data/lib/rumale/clustering/power_iteration.rb
ADDED
@@ -0,0 +1,129 @@
+# frozen_string_literal: true
+
+require 'rumale/base/base_estimator'
+require 'rumale/base/cluster_analyzer'
+require 'rumale/pairwise_metric'
+
+module Rumale
+  module Clustering
+    # PowerIteration is a class that implements power iteration clustering.
+    #
+    # @example
+    #   analyzer = Rumale::Clustering::PowerIteration.new(n_clusters: 10, gamma: 8.0, max_iter: 1000)
+    #   cluster_labels = analyzer.fit_predict(samples)
+    #
+    # *Reference*
+    # - F. Lin and W W. Cohen, "Power Iteration Clustering," Proc. ICML'10, pp. 655--662, 2010.
+    class PowerIteration
+      include Base::BaseEstimator
+      include Base::ClusterAnalyzer
+
+      # Return the data in embedded space.
+      # @return [Numo::DFloat] (shape: [n_samples])
+      attr_reader :embedding
+
+      # Return the number of iterations run for optimization
+      # @return [Integer]
+      attr_reader :n_iter
+
+      # Create a new cluster analyzer with power iteration clustering.
+      #
+      # @param n_clusters [Integer] The number of clusters.
+      # @param affinity [String] The representation of affinity matrix ('rbf' or 'precomputed').
+      # @param gamma [Float] The parameter of rbf kernel, if nil it is 1 / n_features.
+      #   If affinity = 'precomputed', this parameter is ignored.
+      # @param init [String] The initialization method for centroids of K-Means clustering ('random' or 'k-means++').
+      # @param max_iter [Integer] The maximum number of iterations.
+      # @param tol [Float] The tolerance of termination criterion.
+      # @param eps [Float] A small value close to zero to avoid zero division error.
+      # @param random_seed [Integer] The seed value using to initialize the random generator.
+      def initialize(n_clusters: 8, affinity: 'rbf', gamma: nil, init: 'k-means++', max_iter: 1000, tol: 1.0e-8, eps: 1.0e-5, random_seed: nil)
+        check_params_integer(n_clusters: n_clusters, max_iter: max_iter)
+        check_params_float(tol: tol, eps: eps)
+        check_params_string(affinity: affinity, init: init)
+        check_params_type_or_nil(Float, gamma: gamma)
+        check_params_type_or_nil(Integer, random_seed: random_seed)
+        check_params_positive(n_clusters: n_clusters, max_iter: max_iter, tol: tol, eps: eps)
+        @params = {}
+        @params[:n_clusters] = n_clusters
+        @params[:affinity] = affinity
+        @params[:gamma] = gamma
+        @params[:init] = init == 'random' ? 'random' : 'k-means++'
+        @params[:max_iter] = max_iter
+        @params[:tol] = tol
+        @params[:eps] = eps
+        @params[:random_seed] = random_seed
+        @params[:random_seed] ||= srand
+        @embedding = nil
+        @n_iter = nil
+      end
+
+      # Analysis clusters with given training data.
+      #
+      # @overload fit(x) -> PowerClustering
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
+      #   If the metric is 'precomputed', x must be a square affinity matrix (shape: [n_samples, n_samples]).
+      # @return [PowerIteration] The learned cluster analyzer itself.
+      def fit(x, _y = nil)
+        check_sample_array(x)
+        raise ArgumentError, 'Expect the input affinity matrix to be square.' if @params[:affinity] == 'precomputed' && x.shape[0] != x.shape[1]
+        # initialize some variables.
+        affinity_mat = @params[:metric] == 'precomputed' ? x : Rumale::PairwiseMetric.rbf_kernel(x, nil, @params[:gamma])
+        affinity_mat[affinity_mat.diag_indices] = 0.0
+        n_samples = affinity_mat.shape[0]
+        tol = @params[:tol].fdiv(n_samples)
+        # calculate normalized affinity matrix.
+        degrees = affinity_mat.sum(axis: 1)
+        normalized_affinity_mat = (1.0 / degrees).diag.dot(affinity_mat)
+        # initialize embedding space.
+        @embedding = degrees / degrees.sum
+        # optimization
+        @n_iter = 0
+        error = Numo::DFloat.ones(n_samples)
+        @params[:max_iter].times do |t|
+          @n_iter = t + 1
+          new_embedding = normalized_affinity_mat.dot(@embedding)
+          new_embedding /= new_embedding.abs.sum
+          new_error = (new_embedding - @embedding).abs
+          break if (new_error - error).abs.max <= tol
+          @embedding = new_embedding
+          error = new_error
+        end
+        self
+      end
+
+      # Analysis clusters and assign samples to clusters.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
+      #   If the metric is 'precomputed', x must be a square affinity matrix (shape: [n_samples, n_samples]).
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
+      def fit_predict(x)
+        check_sample_array(x)
+        fit(x)
+        kmeans = Rumale::Clustering::KMeans.new(
+          n_clusters: @params[:n_clusters], init: @params[:init],
+          max_iter: @params[:max_iter], tol: @params[:tol], random_seed: @params[:random_seed]
+        )
+        kmeans.fit_predict(@embedding.expand_dims(1))
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data.
+      def marshal_dump
+        { params: @params,
+          embedding: @embedding,
+          n_iter: @n_iter }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @params = obj[:params]
+        @embedding = obj[:embedding]
+        @n_iter = obj[:n_iter]
+        nil
+      end
+    end
+  end
+end
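Since the new class follows the usual Rumale estimator interface and is wired into `require 'rumale'` above, the example from its docstring can be exercised directly. A minimal sketch, assuming `samples` is any Numo::DFloat of shape [n_samples, n_features] and reusing the illustrative parameter values from the `@example` tag:

```ruby
require 'rumale'

# Toy input; in practice `samples` would be real feature vectors.
samples = Numo::DFloat.new(100, 2).rand

# Parameter values are the ones shown in the class docstring.
analyzer = Rumale::Clustering::PowerIteration.new(n_clusters: 10, gamma: 8.0, max_iter: 1000)
cluster_labels = analyzer.fit_predict(samples) # => Numo::Int32, shape: [100]

# The one-dimensional embedding found by power iteration and the number of
# iterations actually run are exposed as attributes.
p analyzer.embedding.shape # => [100]
p analyzer.n_iter
```

Internally, `fit` builds an RBF affinity matrix, row-normalizes it, and repeatedly multiplies the embedding vector by it until the change in the per-sample error falls below `tol / n_samples`; `fit_predict` then runs K-Means on the resulting one-dimensional embedding.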
data/lib/rumale/dataset.rb
CHANGED
@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 
 require 'csv'
+require 'rumale/validation'
 
 module Rumale
   # Module for loading and saving a dataset file.
@@ -48,6 +49,83 @@ module Rumale
         end
       end
 
+      # Generate a two-dimensional data set consisting of an inner circle and an outer circle.
+      #
+      # @param n_samples [Integer] The number of samples.
+      # @param shuffle [Boolean] The flag indicating whether to shuffle the dataset
+      # @param noise [Float] The standard deviaion of gaussian noise added to the data.
+      #   If nil is given, no noise is added.
+      # @param factor [Float] The scale factor between inner and outer circles. The interval of factor is (0, 1).
+      # @random_seed [Integer] The seed value using to initialize the random generator.
+      def make_circles(n_samples, shuffle: true, noise: nil, factor: 0.8, random_seed: nil)
+        Rumale::Validation.check_params_integer(n_samples: n_samples)
+        Rumale::Validation.check_params_boolean(shuffle: shuffle)
+        Rumale::Validation.check_params_type_or_nil(Float, noise: noise)
+        Rumale::Validation.check_params_float(factor: factor)
+        Rumale::Validation.check_params_type_or_nil(Integer, random_seed: random_seed)
+        raise ArgumentError, 'The number of samples must be more than 2.' if n_samples <= 1
+        raise RangeError, 'The interval of factor is (0, 1).' if factor <= 0 || factor >= 1
+        # initialize some variables.
+        rs = random_seed
+        rs ||= srand
+        rng = Random.new(rs)
+        n_samples_out = n_samples.fdiv(2).to_i
+        n_samples_in = n_samples - n_samples_out
+        # make two circles.
+        linsp_out = Numo::DFloat.linspace(0, 2 * Math::PI, n_samples_out)
+        linsp_in = Numo::DFloat.linspace(0, 2 * Math::PI, n_samples_in)
+        circle_out = Numo::DFloat[Numo::NMath.cos(linsp_out), Numo::NMath.sin(linsp_out)].transpose
+        circle_in = Numo::DFloat[Numo::NMath.cos(linsp_in), Numo::NMath.sin(linsp_in)].transpose
+        x = Numo::DFloat.vstack([circle_out, factor * circle_in])
+        y = Numo::Int32.hstack([Numo::Int32.zeros(n_samples_out), Numo::Int32.ones(n_samples_in)])
+        # shuffle data indices.
+        if shuffle
+          rand_ids = [*0...n_samples].shuffle(random: rng.dup)
+          x = x[rand_ids, true].dup
+          y = y[rand_ids].dup
+        end
+        # add gaussian noise.
+        x += Rumale::Utils.rand_normal(x.shape, rng.dup, 0.0, noise) unless noise.nil?
+        [x, y]
+      end
+
+      # Generate a two-dimensional data set consisting of two half circles shifted.
+      #
+      # @param n_samples [Integer] The number of samples.
+      # @param shuffle [Boolean] The flag indicating whether to shuffle the dataset
+      # @param noise [Float] The standard deviaion of gaussian noise added to the data.
+      #   If nil is given, no noise is added.
+      # @random_seed [Integer] The seed value using to initialize the random generator.
+      def make_moons(n_samples, shuffle: true, noise: nil, random_seed: nil)
+        Rumale::Validation.check_params_integer(n_samples: n_samples)
+        Rumale::Validation.check_params_boolean(shuffle: shuffle)
+        Rumale::Validation.check_params_type_or_nil(Float, noise: noise)
+        Rumale::Validation.check_params_type_or_nil(Integer, random_seed: random_seed)
+        raise ArgumentError, 'The number of samples must be more than 2.' if n_samples <= 1
+        # initialize some variables.
+        rs = random_seed
+        rs ||= srand
+        rng = Random.new(rs)
+        n_samples_out = n_samples.fdiv(2).to_i
+        n_samples_in = n_samples - n_samples_out
+        # make two half circles.
+        linsp_out = Numo::DFloat.linspace(0, Math::PI, n_samples_out)
+        linsp_in = Numo::DFloat.linspace(0, Math::PI, n_samples_in)
+        circle_out = Numo::DFloat[Numo::NMath.cos(linsp_out), Numo::NMath.sin(linsp_out)].transpose
+        circle_in = Numo::DFloat[1 - Numo::NMath.cos(linsp_in), 1 - Numo::NMath.sin(linsp_in) - 0.5].transpose
+        x = Numo::DFloat.vstack([circle_out, circle_in])
+        y = Numo::Int32.hstack([Numo::Int32.zeros(n_samples_out), Numo::Int32.ones(n_samples_in)])
+        # shuffle data indices.
+        if shuffle
+          rand_ids = [*0...n_samples].shuffle(random: rng.dup)
+          x = x[rand_ids, true].dup
+          y = y[rand_ids].dup
+        end
+        # add gaussian noise.
+        x += Rumale::Utils.rand_normal(x.shape, rng.dup, 0.0, noise) unless noise.nil?
+        [x, y]
+      end
+
       private
 
       def parse_libsvm_line(line, zero_based)
|
|
52
52
|
Rumale::Validation.check_sample_array(x)
|
53
53
|
Rumale::Validation.check_sample_array(y)
|
54
54
|
Rumale::Validation.check_params_float(gamma: gamma)
|
55
|
-
|
56
|
-
Numo::NMath.exp((distance_matrix**2) * -gamma)
|
55
|
+
Numo::NMath.exp(-gamma * squared_error(x, y).abs)
|
57
56
|
end
|
58
57
|
|
59
58
|
# Calculate the linear kernel between x and y.
|
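This hunk rewrites `rbf_kernel` to evaluate exp(-gamma * ||a - b||**2) from `squared_error` directly rather than exponentiating a squared distance matrix. A small sketch of the public call, with an illustrative gamma (the positional call `rbf_kernel(x, nil, gamma)` matches its use in the new PowerIteration class above):

```ruby
require 'rumale'

x = Numo::DFloat.new(5, 3).rand

# Kernel matrix between all pairs of rows of x; the gamma value is illustrative.
kernel_mat = Rumale::PairwiseMetric.rbf_kernel(x, nil, 2.0)

p kernel_mat.shape  # => [5, 5]
p kernel_mat[0, 0]  # => 1.0, since ||a - a||**2 == 0
```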
data/lib/rumale/version.rb
CHANGED
data/rumale.gemspec
CHANGED
@@ -19,7 +19,7 @@ Gem::Specification.new do |spec|
     Rumale currently supports Linear / Kernel Support Vector Machine,
     Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
     Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
-    K-Means, DBSCAN, t-SNE, Principal Component Analysis, and Non-negative Matrix Factorization.
+    K-Means, Gaussian Mixture Model, DBSCAN, Power Iteration Clustering, t-SNE, Principal Component Analysis, and Non-negative Matrix Factorization.
   MSG
   spec.homepage = 'https://github.com/yoshoku/rumale'
   spec.license = 'BSD-2-Clause'
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rumale
 version: !ruby/object:Gem::Version
-  version: 0.12.2
+  version: 0.12.3
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-06-
+date: 2019-06-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: numo-narray
@@ -114,7 +114,7 @@ description: |
   Rumale currently supports Linear / Kernel Support Vector Machine,
   Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
   Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
-  K-Means, DBSCAN, t-SNE, Principal Component Analysis, and Non-negative Matrix Factorization.
+  K-Means, Gaussian Mixture Model, DBSCAN, Power Iteration Clustering, t-SNE, Principal Component Analysis, and Non-negative Matrix Factorization.
 email:
 - yoshoku@outlook.com
 executables: []
@@ -149,6 +149,7 @@ files:
 - lib/rumale/clustering/dbscan.rb
 - lib/rumale/clustering/gaussian_mixture.rb
 - lib/rumale/clustering/k_means.rb
+- lib/rumale/clustering/power_iteration.rb
 - lib/rumale/dataset.rb
 - lib/rumale/decomposition/nmf.rb
 - lib/rumale/decomposition/pca.rb
@@ -249,7 +250,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.
+rubygems_version: 2.6.14.4
 signing_key:
 specification_version: 4
 summary: Rumale is a machine learning library in Ruby. Rumale provides machine learning