rumale 0.13.6 → 0.13.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/lib/rumale.rb +3 -0
- data/lib/rumale/evaluation_measure/calinski_harabasz_score.rb +56 -0
- data/lib/rumale/evaluation_measure/davies_bouldin_score.rb +53 -0
- data/lib/rumale/evaluation_measure/silhouette_score.rb +80 -0
- data/lib/rumale/version.rb +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6b7030e096e10df1a95d79aefe6d275c0ca16406
|
4
|
+
data.tar.gz: 85f6b809e41ecb4743df7e07b99bc3fcf13710e1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: efb25c2ea461c3ceb9b8d500a4ef77dd4db6e4a2c21400009e8fd940bd28dab6d4a725a320a5e3cc1a1b7676626d03b9c2ef73c9c02246c29e122461680488d7
|
7
|
+
data.tar.gz: 264c10852a7eb01ddb075c87969625f2cb82eb0bfa3050e21bf41f355d8213808c3684a63a0e7e95480965e0737ae76a85babee60b0a638274c31bc1741774b9
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
# 0.13.7
|
2
|
+
- Add some evaluator classes for clustering.
|
3
|
+
- SilhouetteScore
|
4
|
+
- CalinskiHarabaszScore
|
5
|
+
- DaviesBouldinScore
|
6
|
+
|
1
7
|
# 0.13.6
|
2
8
|
- Add transformer class for [FastICA](https://yoshoku.github.io/rumale/doc/Rumale/Decomposition/FastICA.html).
|
3
9
|
- Fix a typo on README ([#13](https://github.com/yoshoku/rumale/pull/13)).
|
data/lib/rumale.rb
CHANGED
@@ -104,3 +104,6 @@ require 'rumale/evaluation_measure/adjusted_rand_score'
|
|
104
104
|
require 'rumale/evaluation_measure/purity'
|
105
105
|
require 'rumale/evaluation_measure/mutual_information'
|
106
106
|
require 'rumale/evaluation_measure/normalized_mutual_information'
|
107
|
+
require 'rumale/evaluation_measure/silhouette_score'
|
108
|
+
require 'rumale/evaluation_measure/davies_bouldin_score'
|
109
|
+
require 'rumale/evaluation_measure/calinski_harabasz_score'
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/evaluator'
|
4
|
+
|
5
|
+
module Rumale
|
6
|
+
module EvaluationMeasure
|
7
|
+
# CalinskiHarabaszScore is a class that calculates the Calinski and Harabasz score.
|
8
|
+
#
|
9
|
+
# @example
|
10
|
+
# evaluator = Rumale::EvaluationMeasure::CalinskiHarabaszScore.new
|
11
|
+
# puts evaluator.score(x, predicted)
|
12
|
+
#
|
13
|
+
# *Reference*
|
14
|
+
# - T. Calinski and J. Harabasz, "A dendrite method for cluster analysis," Communications in Statistics, Vol. 3 (1), pp. 1--27, 1972.
|
15
|
+
class CalinskiHarabaszScore
  include Base::Evaluator

  # Computes the Calinski and Harabasz score: the between-cluster sum of squares
  # divided by (n_clusters - 1), over the within-cluster sum of squares divided
  # by (n_samples - n_clusters).
  #
  # NOTE(review): when y holds a single distinct label the divisor
  # (n_clusters - 1) is zero, so the result is presumably not a finite
  # number -- confirm whether callers rely on this.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be used for calculating score.
  # @param y [Numo::Int32] (shape: [n_samples]) The predicted labels for each sample.
  # @return [Float] The Calinski and Harabasz score. It is 1.0 when the within-cluster sum of squares is zero.
  def score(x, y)
    check_sample_array(x)
    check_label_array(y)
    check_sample_label_size(x, y)

    cluster_ids = y.to_a.uniq.sort
    n_clusters = cluster_ids.size
    n_samples, n_features = x.shape

    # First pass: store each cluster centroid and accumulate the squared
    # deviations of the members from their own centroid.
    centers = Numo::DFloat.zeros(n_clusters, n_features)
    within_ss = 0.0
    cluster_ids.each_with_index do |label, idx|
      members = x[y.eq(label), true]
      center = members.mean(0)
      centers[idx, true] = center
      within_ss += ((members - center)**2).sum
    end

    return 1.0 if within_ss.zero?

    # Second pass: squared distance of every centroid from the overall mean,
    # weighted by the number of samples in the cluster.
    grand_mean = x.mean(0)
    between_ss = 0.0
    cluster_ids.each_with_index do |label, idx|
      cluster_size = y.eq(label).count
      between_ss += cluster_size * ((centers[idx, true] - grand_mean)**2).sum
    end

    between_var = between_ss / (n_clusters - 1)
    within_var = within_ss / (n_samples - n_clusters)
    between_var / within_var
  end
end
|
55
|
+
end
|
56
|
+
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/evaluator'
|
4
|
+
require 'rumale/pairwise_metric'
|
5
|
+
|
6
|
+
module Rumale
|
7
|
+
module EvaluationMeasure
|
8
|
+
# DaviesBouldinScore is a class that calculates the Davies-Bouldin score.
|
9
|
+
#
|
10
|
+
# @example
|
11
|
+
# evaluator = Rumale::EvaluationMeasure::DaviesBouldinScore.new
|
12
|
+
# puts evaluator.score(x, predicted)
|
13
|
+
#
|
14
|
+
# *Reference*
|
15
|
+
# - D L. Davies and D W. Bouldin, "A Cluster Separation Measure," IEEE Trans. Pattern Analysis and Machine Intelligence, Vol. PAMI-1, No. 2, pp. 224--227, 1979.
|
16
|
+
class DaviesBouldinScore
  include Base::Evaluator

  # Computes the Davies-Bouldin score: for every cluster, the largest value of
  # (scatter_i + scatter_j) / distance(centroid_i, centroid_j) over the other
  # clusters, averaged over all clusters.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be used for calculating score.
  # @param y [Numo::Int32] (shape: [n_samples]) The predicted labels for each sample.
  # @return [Float] The Davies-Bouldin score.
  def score(x, y)
    check_sample_array(x)
    check_label_array(y)
    check_sample_label_size(x, y)

    cluster_ids = y.to_a.uniq.sort
    n_clusters = cluster_ids.size

    scatter = Numo::DFloat.zeros(n_clusters)
    centers = Numo::DFloat.zeros(n_clusters, x.shape[1])

    # Per cluster: its centroid and the mean distance of its members to that
    # centroid, as returned by Rumale::PairwiseMetric.euclidean_distance.
    cluster_ids.each_with_index do |label, idx|
      members = x[y.eq(label), true]
      center = members.mean(0)
      centers[idx, true] = center
      scatter[idx] = Rumale::PairwiseMetric.euclidean_distance(members, center.expand_dims(0)).mean
    end

    separation = Rumale::PairwiseMetric.euclidean_distance(centers)
    # A zero centroid distance would divide by zero; an infinite divisor makes
    # the corresponding ratio zero instead.
    separation[separation.eq(0)] = Float::INFINITY
    ratios = (scatter.expand_dims(1) + scatter) / separation
    # Exclude the diagonal so a cluster is never selected against itself by max.
    ratios[ratios.diag_indices] = -Float::INFINITY
    ratios.max(0).mean
  end
end
|
52
|
+
end
|
53
|
+
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/evaluator'
|
4
|
+
require 'rumale/pairwise_metric'
|
5
|
+
|
6
|
+
module Rumale
|
7
|
+
module EvaluationMeasure
|
8
|
+
# SilhouetteScore is a class that calculates the Silhouette Coefficient.
|
9
|
+
#
|
10
|
+
# @example
|
11
|
+
# evaluator = Rumale::EvaluationMeasure::SilhouetteScore.new
|
12
|
+
# puts evaluator.score(x, predicted)
|
13
|
+
#
|
14
|
+
# *Reference*
|
15
|
+
# - P J. Rousseeuw, "Silhouettes: A graphical aid to the interpretation and validation of cluster analysis," Journal of Computational and Applied Mathematics, Vol. 20, pp. 53--65, 1987.
|
16
|
+
class SilhouetteScore
  include Base::Evaluator

  # Create a new evaluator that calculates the silhouette coefficient.
  #
  # @param metric [String] The metric to calculate the silhouette coefficient.
  #   If metric is 'euclidean', Euclidean distance is used for dissimilarity between sample points.
  #   If metric is 'precomputed', the score method expects to be given a distance matrix.
  #   Any value other than 'precomputed' is treated as 'euclidean' by the score method.
  def initialize(metric: 'euclidean')
    check_params_string(metric: metric)
    @metric = metric
  end

  # Calculates the silhouette coefficient.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be used for calculating score.
  #   If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
  # @param y [Numo::Int32] (shape: [n_samples]) The predicted labels for each sample.
  # @raise [ArgumentError] If the metric is 'precomputed' and x is not a square matrix.
  # @return [Float] The mean of silhouette coefficient.
  def score(x, y)
    check_sample_array(x)
    check_label_array(y)
    check_sample_label_size(x, y)
    # A precomputed matrix is indexed with the label mask on both axes below,
    # so it has to be n_samples x n_samples.
    if @metric == 'precomputed' && x.shape[0] != x.shape[1]
      raise ArgumentError, 'Expect the input distance matrix to be square.'
    end

    dist_mat = @metric == 'precomputed' ? x : Rumale::PairwiseMetric.euclidean_distance(x)

    labels = y.to_a.uniq.sort
    n_samples = dist_mat.shape[0]
    # Membership masks are computed once and shared by every pass below.
    cluster_masks = labels.map { |label| y.eq(label) }

    # Mean distance from each sample to the other members of its own cluster.
    # Samples in single-member clusters keep an intra distance of zero and are
    # masked out so that their coefficient becomes zero.
    intra_dists = Numo::DFloat.zeros(n_samples)
    mask = Numo::DFloat.ones(n_samples)
    cluster_masks.each do |cls_pos|
      sz_cluster = cls_pos.count
      if sz_cluster > 1
        cls_dist_mat = dist_mat[cls_pos, cls_pos].dup
        # The distance of a sample to itself must not contribute to the mean.
        cls_dist_mat[cls_dist_mat.diag_indices] = 0.0
        intra_dists[cls_pos] = cls_dist_mat.sum(0) / (sz_cluster - 1)
      else
        mask[cls_pos] = 0
      end
    end

    # Smallest mean distance from each sample to the members of any other cluster.
    inter_dists = Numo::DFloat.zeros(n_samples) + Float::INFINITY
    cluster_masks.each_with_index do |cls_pos, m|
      cluster_masks.each_with_index do |not_cls_pos, n|
        next if m == n

        inter_dists[cls_pos] = Numo::DFloat.minimum(
          inter_dists[cls_pos], dist_mat[cls_pos, not_cls_pos].mean(1)
        )
      end
    end

    silhouettes = mask * ((inter_dists - intra_dists) / Numo::DFloat.maximum(inter_dists, intra_dists))
    # NaN entries (e.g. from 0 / 0 or Infinity / Infinity) count as zero.
    silhouettes[silhouettes.isnan] = 0.0

    silhouettes.mean
  end
end
|
79
|
+
end
|
80
|
+
end
|
data/lib/rumale/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rumale
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.13.6
|
4
|
+
version: 0.13.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-11-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: numo-narray
|
@@ -185,6 +185,8 @@ files:
|
|
185
185
|
- lib/rumale/ensemble/random_forest_regressor.rb
|
186
186
|
- lib/rumale/evaluation_measure/accuracy.rb
|
187
187
|
- lib/rumale/evaluation_measure/adjusted_rand_score.rb
|
188
|
+
- lib/rumale/evaluation_measure/calinski_harabasz_score.rb
|
189
|
+
- lib/rumale/evaluation_measure/davies_bouldin_score.rb
|
188
190
|
- lib/rumale/evaluation_measure/explained_variance_score.rb
|
189
191
|
- lib/rumale/evaluation_measure/f_score.rb
|
190
192
|
- lib/rumale/evaluation_measure/log_loss.rb
|
@@ -200,6 +202,7 @@ files:
|
|
200
202
|
- lib/rumale/evaluation_measure/r2_score.rb
|
201
203
|
- lib/rumale/evaluation_measure/recall.rb
|
202
204
|
- lib/rumale/evaluation_measure/roc_auc.rb
|
205
|
+
- lib/rumale/evaluation_measure/silhouette_score.rb
|
203
206
|
- lib/rumale/kernel_approximation/rbf.rb
|
204
207
|
- lib/rumale/kernel_machine/kernel_pca.rb
|
205
208
|
- lib/rumale/kernel_machine/kernel_ridge.rb
|