rumale 0.12.2 → 0.12.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +2 -2
- data/lib/rumale.rb +1 -0
- data/lib/rumale/clustering/power_iteration.rb +129 -0
- data/lib/rumale/dataset.rb +78 -0
- data/lib/rumale/pairwise_metric.rb +1 -2
- data/lib/rumale/version.rb +1 -1
- data/rumale.gemspec +1 -1
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6d9c7691afd71e50df0c05d726a535a7f5dd426f
|
4
|
+
data.tar.gz: 3c2ac53df9060b7ff8abc62717c2ae06c1adebca
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9686efe5c0126f29672b60047af8e604e4811916fd86a0f9151fe6e9b6e7ee292f4bbd33eae1dab8b2af1941910a52fbb68a0938f280bbce4e4c57faedf3215b
|
7
|
+
data.tar.gz: 2bf6c6c1fb42ab8290cc5471c4417f4aac8a07fdc23c607ffc10855bf3941f9b532e669374e5ad237f67bf5495902290284fc0697143c721e484e6e4ac39753a
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -6,14 +6,14 @@
|
|
6
6
|
[![Coverage Status](https://coveralls.io/repos/github/yoshoku/rumale/badge.svg?branch=master)](https://coveralls.io/github/yoshoku/rumale?branch=master)
|
7
7
|
[![Gem Version](https://badge.fury.io/rb/rumale.svg)](https://badge.fury.io/rb/rumale)
|
8
8
|
[![BSD 2-Clause License](https://img.shields.io/badge/License-BSD%202--Clause-orange.svg)](https://github.com/yoshoku/rumale/blob/master/LICENSE.txt)
|
9
|
-
[![Documentation](http://img.shields.io/badge/docs-rdoc.info-blue.svg)](https://www.rubydoc.info/gems/rumale/0.12.2)
|
9
|
+
[![Documentation](http://img.shields.io/badge/docs-rdoc.info-blue.svg)](https://www.rubydoc.info/gems/rumale/0.12.3)
|
10
10
|
|
11
11
|
Rumale (**Ru**by **ma**chine **le**arning) is a machine learning library in Ruby.
|
12
12
|
Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
|
13
13
|
Rumale supports Linear / Kernel Support Vector Machine,
|
14
14
|
Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
|
15
15
|
Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor classifier,
|
16
|
-
K-Means, DBSCAN, t-SNE, Principal Component Analysis, and Non-negative Matrix Factorization.
|
16
|
+
K-Means, Gaussian Mixture Model, DBSCAN, Power Iteration Clustering, t-SNE, Principal Component Analysis, and Non-negative Matrix Factorization.
|
17
17
|
|
18
18
|
This project was formerly known as "SVMKit".
|
19
19
|
If you are using SVMKit, please install Rumale and replace `SVMKit` constants with `Rumale`.
|
data/lib/rumale.rb
CHANGED
@@ -59,6 +59,7 @@ require 'rumale/ensemble/extra_trees_regressor'
|
|
59
59
|
require 'rumale/clustering/k_means'
|
60
60
|
require 'rumale/clustering/gaussian_mixture'
|
61
61
|
require 'rumale/clustering/dbscan'
|
62
|
+
require 'rumale/clustering/power_iteration'
|
62
63
|
require 'rumale/decomposition/pca'
|
63
64
|
require 'rumale/decomposition/nmf'
|
64
65
|
require 'rumale/manifold/tsne'
|
@@ -0,0 +1,129 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/base_estimator'
|
4
|
+
require 'rumale/base/cluster_analyzer'
|
5
|
+
require 'rumale/pairwise_metric'
|
6
|
+
|
7
|
+
module Rumale
|
8
|
+
module Clustering
|
9
|
+
# PowerIteration is a class that implements power iteration clustering.
|
10
|
+
#
|
11
|
+
# @example
|
12
|
+
# analyzer = Rumale::Clustering::PowerIteration.new(n_clusters: 10, gamma: 8.0, max_iter: 1000)
|
13
|
+
# cluster_labels = analyzer.fit_predict(samples)
|
14
|
+
#
|
15
|
+
# *Reference*
|
16
|
+
# - F. Lin and W. W. Cohen, "Power Iteration Clustering," Proc. ICML'10, pp. 655--662, 2010.
|
17
|
+
class PowerIteration
|
18
|
+
include Base::BaseEstimator
|
19
|
+
include Base::ClusterAnalyzer
|
20
|
+
|
21
|
+
# Return the data in embedded space.
|
22
|
+
# @return [Numo::DFloat] (shape: [n_samples])
|
23
|
+
attr_reader :embedding
|
24
|
+
|
25
|
+
# Return the number of iterations run for optimization
|
26
|
+
# @return [Integer]
|
27
|
+
attr_reader :n_iter
|
28
|
+
|
29
|
+
# Create a new cluster analyzer with power iteration clustering.
|
30
|
+
#
|
31
|
+
# @param n_clusters [Integer] The number of clusters.
|
32
|
+
# @param affinity [String] The representation of affinity matrix ('rbf' or 'precomputed').
|
33
|
+
# @param gamma [Float] The parameter of rbf kernel, if nil it is 1 / n_features.
|
34
|
+
# If affinity = 'precomputed', this parameter is ignored.
|
35
|
+
# @param init [String] The initialization method for centroids of K-Means clustering ('random' or 'k-means++').
|
36
|
+
# @param max_iter [Integer] The maximum number of iterations.
|
37
|
+
# @param tol [Float] The tolerance of termination criterion.
|
38
|
+
# @param eps [Float] A small value close to zero to avoid zero division error.
|
39
|
+
# @param random_seed [Integer] The seed value using to initialize the random generator.
|
40
|
+
def initialize(n_clusters: 8, affinity: 'rbf', gamma: nil, init: 'k-means++', max_iter: 1000, tol: 1.0e-8, eps: 1.0e-5, random_seed: nil)
|
41
|
+
check_params_integer(n_clusters: n_clusters, max_iter: max_iter)
|
42
|
+
check_params_float(tol: tol, eps: eps)
|
43
|
+
check_params_string(affinity: affinity, init: init)
|
44
|
+
check_params_type_or_nil(Float, gamma: gamma)
|
45
|
+
check_params_type_or_nil(Integer, random_seed: random_seed)
|
46
|
+
check_params_positive(n_clusters: n_clusters, max_iter: max_iter, tol: tol, eps: eps)
|
47
|
+
@params = {}
|
48
|
+
@params[:n_clusters] = n_clusters
|
49
|
+
@params[:affinity] = affinity
|
50
|
+
@params[:gamma] = gamma
|
51
|
+
@params[:init] = init == 'random' ? 'random' : 'k-means++'
|
52
|
+
@params[:max_iter] = max_iter
|
53
|
+
@params[:tol] = tol
|
54
|
+
@params[:eps] = eps
|
55
|
+
@params[:random_seed] = random_seed
|
56
|
+
@params[:random_seed] ||= srand
|
57
|
+
@embedding = nil
|
58
|
+
@n_iter = nil
|
59
|
+
end
|
60
|
+
|
61
|
+
# Analyze clusters with the given training data.
|
62
|
+
#
|
63
|
+
# @overload fit(x) -> PowerIteration
|
64
|
+
#
|
65
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
|
66
|
+
# If the affinity is 'precomputed', x must be a square affinity matrix (shape: [n_samples, n_samples]).
|
67
|
+
# @return [PowerIteration] The learned cluster analyzer itself.
|
68
|
+
def fit(x, _y = nil)
|
69
|
+
check_sample_array(x)
|
70
|
+
raise ArgumentError, 'Expect the input affinity matrix to be square.' if @params[:affinity] == 'precomputed' && x.shape[0] != x.shape[1]
|
71
|
+
# initialize some variables.
|
72
|
+
affinity_mat = @params[:metric] == 'precomputed' ? x : Rumale::PairwiseMetric.rbf_kernel(x, nil, @params[:gamma])
|
73
|
+
affinity_mat[affinity_mat.diag_indices] = 0.0
|
74
|
+
n_samples = affinity_mat.shape[0]
|
75
|
+
tol = @params[:tol].fdiv(n_samples)
|
76
|
+
# calculate normalized affinity matrix.
|
77
|
+
degrees = affinity_mat.sum(axis: 1)
|
78
|
+
normalized_affinity_mat = (1.0 / degrees).diag.dot(affinity_mat)
|
79
|
+
# initialize embedding space.
|
80
|
+
@embedding = degrees / degrees.sum
|
81
|
+
# optimization
|
82
|
+
@n_iter = 0
|
83
|
+
error = Numo::DFloat.ones(n_samples)
|
84
|
+
@params[:max_iter].times do |t|
|
85
|
+
@n_iter = t + 1
|
86
|
+
new_embedding = normalized_affinity_mat.dot(@embedding)
|
87
|
+
new_embedding /= new_embedding.abs.sum
|
88
|
+
new_error = (new_embedding - @embedding).abs
|
89
|
+
break if (new_error - error).abs.max <= tol
|
90
|
+
@embedding = new_embedding
|
91
|
+
error = new_error
|
92
|
+
end
|
93
|
+
self
|
94
|
+
end
|
95
|
+
|
96
|
+
# Analyze clusters and assign samples to clusters.
|
97
|
+
#
|
98
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
|
99
|
+
# If the affinity is 'precomputed', x must be a square affinity matrix (shape: [n_samples, n_samples]).
|
100
|
+
# @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
|
101
|
+
def fit_predict(x)
|
102
|
+
check_sample_array(x)
|
103
|
+
fit(x)
|
104
|
+
kmeans = Rumale::Clustering::KMeans.new(
|
105
|
+
n_clusters: @params[:n_clusters], init: @params[:init],
|
106
|
+
max_iter: @params[:max_iter], tol: @params[:tol], random_seed: @params[:random_seed]
|
107
|
+
)
|
108
|
+
kmeans.fit_predict(@embedding.expand_dims(1))
|
109
|
+
end
|
110
|
+
|
111
|
+
# Dump marshal data.
|
112
|
+
# @return [Hash] The marshal data.
|
113
|
+
def marshal_dump
|
114
|
+
{ params: @params,
|
115
|
+
embedding: @embedding,
|
116
|
+
n_iter: @n_iter }
|
117
|
+
end
|
118
|
+
|
119
|
+
# Load marshal data.
|
120
|
+
# @return [nil]
|
121
|
+
def marshal_load(obj)
|
122
|
+
@params = obj[:params]
|
123
|
+
@embedding = obj[:embedding]
|
124
|
+
@n_iter = obj[:n_iter]
|
125
|
+
nil
|
126
|
+
end
|
127
|
+
end
|
128
|
+
end
|
129
|
+
end
|
data/lib/rumale/dataset.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'csv'
|
4
|
+
require 'rumale/validation'
|
4
5
|
|
5
6
|
module Rumale
|
6
7
|
# Module for loading and saving a dataset file.
|
@@ -48,6 +49,83 @@ module Rumale
|
|
48
49
|
end
|
49
50
|
end
|
50
51
|
|
52
|
+
# Generate a two-dimensional data set consisting of an inner circle and an outer circle.
|
53
|
+
#
|
54
|
+
# @param n_samples [Integer] The number of samples.
|
55
|
+
# @param shuffle [Boolean] The flag indicating whether to shuffle the dataset
|
56
|
+
# @param noise [Float] The standard deviation of gaussian noise added to the data.
|
57
|
+
# If nil is given, no noise is added.
|
58
|
+
# @param factor [Float] The scale factor between inner and outer circles. The interval of factor is (0, 1).
|
59
|
+
# @param random_seed [Integer] The seed value used to initialize the random generator.
|
60
|
+
def make_circles(n_samples, shuffle: true, noise: nil, factor: 0.8, random_seed: nil)
|
61
|
+
Rumale::Validation.check_params_integer(n_samples: n_samples)
|
62
|
+
Rumale::Validation.check_params_boolean(shuffle: shuffle)
|
63
|
+
Rumale::Validation.check_params_type_or_nil(Float, noise: noise)
|
64
|
+
Rumale::Validation.check_params_float(factor: factor)
|
65
|
+
Rumale::Validation.check_params_type_or_nil(Integer, random_seed: random_seed)
|
66
|
+
raise ArgumentError, 'The number of samples must be more than 2.' if n_samples <= 1
|
67
|
+
raise RangeError, 'The interval of factor is (0, 1).' if factor <= 0 || factor >= 1
|
68
|
+
# initialize some variables.
|
69
|
+
rs = random_seed
|
70
|
+
rs ||= srand
|
71
|
+
rng = Random.new(rs)
|
72
|
+
n_samples_out = n_samples.fdiv(2).to_i
|
73
|
+
n_samples_in = n_samples - n_samples_out
|
74
|
+
# make two circles.
|
75
|
+
linsp_out = Numo::DFloat.linspace(0, 2 * Math::PI, n_samples_out)
|
76
|
+
linsp_in = Numo::DFloat.linspace(0, 2 * Math::PI, n_samples_in)
|
77
|
+
circle_out = Numo::DFloat[Numo::NMath.cos(linsp_out), Numo::NMath.sin(linsp_out)].transpose
|
78
|
+
circle_in = Numo::DFloat[Numo::NMath.cos(linsp_in), Numo::NMath.sin(linsp_in)].transpose
|
79
|
+
x = Numo::DFloat.vstack([circle_out, factor * circle_in])
|
80
|
+
y = Numo::Int32.hstack([Numo::Int32.zeros(n_samples_out), Numo::Int32.ones(n_samples_in)])
|
81
|
+
# shuffle data indices.
|
82
|
+
if shuffle
|
83
|
+
rand_ids = [*0...n_samples].shuffle(random: rng.dup)
|
84
|
+
x = x[rand_ids, true].dup
|
85
|
+
y = y[rand_ids].dup
|
86
|
+
end
|
87
|
+
# add gaussian noise.
|
88
|
+
x += Rumale::Utils.rand_normal(x.shape, rng.dup, 0.0, noise) unless noise.nil?
|
89
|
+
[x, y]
|
90
|
+
end
|
91
|
+
|
92
|
+
# Generate a two-dimensional data set consisting of two half circles shifted.
|
93
|
+
#
|
94
|
+
# @param n_samples [Integer] The number of samples.
|
95
|
+
# @param shuffle [Boolean] The flag indicating whether to shuffle the dataset
|
96
|
+
# @param noise [Float] The standard deviation of gaussian noise added to the data.
|
97
|
+
# If nil is given, no noise is added.
|
98
|
+
# @param random_seed [Integer] The seed value used to initialize the random generator.
|
99
|
+
def make_moons(n_samples, shuffle: true, noise: nil, random_seed: nil)
|
100
|
+
Rumale::Validation.check_params_integer(n_samples: n_samples)
|
101
|
+
Rumale::Validation.check_params_boolean(shuffle: shuffle)
|
102
|
+
Rumale::Validation.check_params_type_or_nil(Float, noise: noise)
|
103
|
+
Rumale::Validation.check_params_type_or_nil(Integer, random_seed: random_seed)
|
104
|
+
raise ArgumentError, 'The number of samples must be more than 2.' if n_samples <= 1
|
105
|
+
# initialize some variables.
|
106
|
+
rs = random_seed
|
107
|
+
rs ||= srand
|
108
|
+
rng = Random.new(rs)
|
109
|
+
n_samples_out = n_samples.fdiv(2).to_i
|
110
|
+
n_samples_in = n_samples - n_samples_out
|
111
|
+
# make two half circles.
|
112
|
+
linsp_out = Numo::DFloat.linspace(0, Math::PI, n_samples_out)
|
113
|
+
linsp_in = Numo::DFloat.linspace(0, Math::PI, n_samples_in)
|
114
|
+
circle_out = Numo::DFloat[Numo::NMath.cos(linsp_out), Numo::NMath.sin(linsp_out)].transpose
|
115
|
+
circle_in = Numo::DFloat[1 - Numo::NMath.cos(linsp_in), 1 - Numo::NMath.sin(linsp_in) - 0.5].transpose
|
116
|
+
x = Numo::DFloat.vstack([circle_out, circle_in])
|
117
|
+
y = Numo::Int32.hstack([Numo::Int32.zeros(n_samples_out), Numo::Int32.ones(n_samples_in)])
|
118
|
+
# shuffle data indices.
|
119
|
+
if shuffle
|
120
|
+
rand_ids = [*0...n_samples].shuffle(random: rng.dup)
|
121
|
+
x = x[rand_ids, true].dup
|
122
|
+
y = y[rand_ids].dup
|
123
|
+
end
|
124
|
+
# add gaussian noise.
|
125
|
+
x += Rumale::Utils.rand_normal(x.shape, rng.dup, 0.0, noise) unless noise.nil?
|
126
|
+
[x, y]
|
127
|
+
end
|
128
|
+
|
51
129
|
private
|
52
130
|
|
53
131
|
def parse_libsvm_line(line, zero_based)
|
@@ -52,8 +52,7 @@ module Rumale
|
|
52
52
|
Rumale::Validation.check_sample_array(x)
|
53
53
|
Rumale::Validation.check_sample_array(y)
|
54
54
|
Rumale::Validation.check_params_float(gamma: gamma)
|
55
|
-
|
56
|
-
Numo::NMath.exp((distance_matrix**2) * -gamma)
|
55
|
+
Numo::NMath.exp(-gamma * squared_error(x, y).abs)
|
57
56
|
end
|
58
57
|
|
59
58
|
# Calculate the linear kernel between x and y.
|
data/lib/rumale/version.rb
CHANGED
data/rumale.gemspec
CHANGED
@@ -19,7 +19,7 @@ Gem::Specification.new do |spec|
|
|
19
19
|
Rumale currently supports Linear / Kernel Support Vector Machine,
|
20
20
|
Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
|
21
21
|
Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
|
22
|
-
K-Means, DBSCAN, t-SNE, Principal Component Analysis, and Non-negative Matrix Factorization.
|
22
|
+
K-Means, Gaussian Mixture Model, DBSCAN, Power Iteration Clustering, t-SNE, Principal Component Analysis, and Non-negative Matrix Factorization.
|
23
23
|
MSG
|
24
24
|
spec.homepage = 'https://github.com/yoshoku/rumale'
|
25
25
|
spec.license = 'BSD-2-Clause'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rumale
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.12.2
|
4
|
+
version: 0.12.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-06-
|
11
|
+
date: 2019-06-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: numo-narray
|
@@ -114,7 +114,7 @@ description: |
|
|
114
114
|
Rumale currently supports Linear / Kernel Support Vector Machine,
|
115
115
|
Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
|
116
116
|
Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
|
117
|
-
K-Means, DBSCAN, t-SNE, Principal Component Analysis, and Non-negative Matrix Factorization.
|
117
|
+
K-Means, Gaussian Mixture Model, DBSCAN, Power Iteration Clustering, t-SNE, Principal Component Analysis, and Non-negative Matrix Factorization.
|
118
118
|
email:
|
119
119
|
- yoshoku@outlook.com
|
120
120
|
executables: []
|
@@ -149,6 +149,7 @@ files:
|
|
149
149
|
- lib/rumale/clustering/dbscan.rb
|
150
150
|
- lib/rumale/clustering/gaussian_mixture.rb
|
151
151
|
- lib/rumale/clustering/k_means.rb
|
152
|
+
- lib/rumale/clustering/power_iteration.rb
|
152
153
|
- lib/rumale/dataset.rb
|
153
154
|
- lib/rumale/decomposition/nmf.rb
|
154
155
|
- lib/rumale/decomposition/pca.rb
|
@@ -249,7 +250,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
249
250
|
version: '0'
|
250
251
|
requirements: []
|
251
252
|
rubyforge_project:
|
252
|
-
rubygems_version: 2.
|
253
|
+
rubygems_version: 2.6.14.4
|
253
254
|
signing_key:
|
254
255
|
specification_version: 4
|
255
256
|
summary: Rumale is a machine learning library in Ruby. Rumale provides machine learning
|