kmeans-crystal 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/kmeans-crystal.rb +79 -10
- metadata +3 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c25646b44067f4a43cca87296ef5d04ff77272c5
|
4
|
+
data.tar.gz: 83e65a93b07dff510d80df973affc7671f565198
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 52d5da1622b0362238fab40aa34b64a948e381066afc90547f52747aed3220d21cb0980879790bc16bc0a0e9e685b33cef880067bd879c3fae88f17010928837
|
7
|
+
data.tar.gz: 719f3197099ad25d359d1758a48cfcd3fe7913f332d31fc931a291075ae795b555dfaa35b2968793dcb170bedb1d34f0ebc9af28dacb8aee05b79c97f2bf8e3c
|
data/lib/kmeans-crystal.rb
CHANGED
@@ -1,24 +1,46 @@
|
|
1
1
|
module KMeansCrystal
|
2
|
+
|
3
|
+
# Distance measures usable by the clustering code.
#
# Every measure exposes a class-level `distance(a, b)` taking two
# equally sized numeric vectors (anything responding to `size` and `[]`)
# and returning the distance between them as a Float.
module Measure
  # Euclidean (L2) distance: square root of the summed squared differences.
  class Euclidean
    # @param a [Array<Numeric>] first vector; its size drives the iteration
    # @param b [Array<Numeric>] second vector, read at the same indices as +a+
    # @return [Float] the straight-line distance between +a+ and +b+
    def self.distance(a, b)
      squared = a.size.times.inject(0.0) { |sum, i| sum + (a[i] - b[i])**2 }
      Math.sqrt(squared)
    end
  end

  # Manhattan (L1, taxicab) distance: the sum of absolute differences.
  class Manhattan
    # @param a [Array<Numeric>] first vector; its size drives the iteration
    # @param b [Array<Numeric>] second vector, read at the same indices as +a+
    # @return [Float] the sum of per-dimension absolute differences
    def self.distance(a, b)
      # No square root here: taking sqrt of the L1 sum (as this method used
      # to) does not yield the Manhattan distance. sqrt is monotonic, so the
      # relative ordering of distances is unchanged by this correction; only
      # the reported magnitudes differ.
      a.size.times.inject(0.0) { |sum, i| sum + (a[i] - b[i]).abs }
    end
  end
end
|
22
|
+
|
23
|
+
|
2
24
|
class Cluster
|
3
25
|
attr_reader :centroid
|
4
26
|
attr_reader :entries
|
5
27
|
attr_accessor :name
|
6
28
|
|
7
|
-
def initialize(name, centroid, vector_name)
|
29
|
+
def initialize(name, centroid, vector_name, measure)
|
8
30
|
@name = name
|
9
31
|
@centroid = centroid
|
10
32
|
@entries = Array.new
|
11
33
|
@vector_name = vector_name
|
34
|
+
@measure = measure
|
12
35
|
end
|
13
36
|
|
14
37
|
def output
|
15
|
-
|
38
|
+
output_entries = @entries.map{|e| e[:distance] = distance(e); e }
|
39
|
+
return { name: @name, centroid: @centroid, entries: output_entries }
|
16
40
|
end
|
17
41
|
|
18
42
|
def distance(entry)
|
19
|
-
|
20
|
-
@centroid.size.times{|i| sum += (@centroid[i]-entry[@vector_name][i])**2}
|
21
|
-
return Math.sqrt(sum)
|
43
|
+
return @measure.distance(@centroid, entry[@vector_name])
|
22
44
|
end
|
23
45
|
|
24
46
|
def update_centroid
|
@@ -37,13 +59,37 @@ end
|
|
37
59
|
|
38
60
|
|
39
61
|
class Model
|
40
|
-
def initialize(cluster_num, entries,
|
62
|
+
def initialize(cluster_num, entries, **params)
|
41
63
|
raise 'too less cluster_num to evaluate k-means' if entries.size < cluster_num
|
64
|
+
|
42
65
|
@cluster_num = cluster_num
|
43
66
|
@entries = entries
|
44
|
-
@vector_name = vector_name
|
45
67
|
|
46
|
-
|
68
|
+
@vector_name = case params[:vector_name]
|
69
|
+
when nil
|
70
|
+
:features
|
71
|
+
else
|
72
|
+
params[:vector_name]
|
73
|
+
end
|
74
|
+
|
75
|
+
@measure = case params[:distance]
|
76
|
+
when 'manhattan'
|
77
|
+
Measure::Manhattan
|
78
|
+
when 'euclidean',nil
|
79
|
+
Measure::Euclidean
|
80
|
+
else
|
81
|
+
raise 'incorrect value for distance'
|
82
|
+
end
|
83
|
+
|
84
|
+
init_centroids = case params[:init_centroids]
|
85
|
+
when 'random'
|
86
|
+
@entries.sample(@cluster_num).map{|x| x[@vector_name]}
|
87
|
+
when 'kmeans++',nil
|
88
|
+
kmeans_pp(@entries, @cluster_num)
|
89
|
+
else
|
90
|
+
raise 'incorrect value for init_centroids'
|
91
|
+
end
|
92
|
+
|
47
93
|
@clusters = new_clusters(init_centroids)
|
48
94
|
end
|
49
95
|
|
@@ -96,7 +142,7 @@ class Model
|
|
96
142
|
def new_clusters(centroids)
|
97
143
|
clusters = Array.new
|
98
144
|
centroids.each_with_index do |centroid, i|
|
99
|
-
clusters << Cluster.new("cluster#{i}", centroid, @vector_name)
|
145
|
+
clusters << Cluster.new("cluster#{i}", centroid, @vector_name, @measure)
|
100
146
|
end
|
101
147
|
return clusters
|
102
148
|
end
|
@@ -104,9 +150,32 @@ class Model
|
|
104
150
|
def new_clusters_from_old(clusters)
|
105
151
|
arr = Array.new
|
106
152
|
clusters.each do |cluster|
|
107
|
-
arr << Cluster.new(cluster.name, cluster.update_centroid, @vector_name)
|
153
|
+
arr << Cluster.new(cluster.name, cluster.update_centroid, @vector_name, @measure)
|
108
154
|
end
|
109
155
|
return arr
|
110
156
|
end
|
157
|
+
|
158
|
+
# Chooses the initial centroids for the model.
#
# NOTE: despite its name this is not the D^2-weighted seeding of k-means++.
# It samples +cluster_num+ feature vectors at random and then overwrites the
# first min(2**dimension, cluster_num) of them with corners of the data's
# axis-aligned bounding box. Corner +i+ is selected by the binary digits of
# +i+ (most significant digit = dimension 0): a 0 digit picks that
# dimension's minimum, a 1 digit its maximum. Any centroids beyond the
# available corners stay as the randomly sampled vectors.
#
# @param entries [Array<Hash>] entries holding a feature vector under
#   +@vector_name+; they are only read, never modified
# @param cluster_num [Integer] number of centroids to produce
# @return [Array<Array<Numeric>>] the initial centroids; empty when nothing
#   could be sampled
def kmeans_pp(entries, cluster_num)
  # Copy each sampled vector. Without the copy the centroids are the very
  # same Array objects stored in +entries+, and the corner assignment below
  # would silently rewrite the caller's data points.
  init_centroids = entries.sample(cluster_num).map { |entry| entry[@vector_name].dup }
  return init_centroids if init_centroids.empty?

  features = entries.map { |entry| entry[@vector_name] }
  dimension = features.first.size

  # bounds[d] == [min, max] of the d-th coordinate over all entries.
  bounds = Array.new(dimension) { |d| features.map { |f| f[d] }.minmax }

  corner_count = [2**dimension, init_centroids.size].min
  corner_count.times do |i|
    dimension.times do |d|
      # Integer#[] reads a single bit; bit (dimension - 1 - d) is the d-th
      # digit of i when written as a zero-padded binary string.
      init_centroids[i][d] = bounds[d][i[dimension - 1 - d]]
    end
  end

  init_centroids
end
|
111
180
|
end
|
112
181
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kmeans-crystal
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ireullin
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-04-14 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: |-
|
14
14
|
This library for data clustering implements the k-means algorithm. With the library, you can monitor the model’s training process and end the training once the result has converged.
|
@@ -40,10 +40,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
40
40
|
version: '0'
|
41
41
|
requirements: []
|
42
42
|
rubyforge_project:
|
43
|
-
rubygems_version: 2.
|
43
|
+
rubygems_version: 2.2.2
|
44
44
|
signing_key:
|
45
45
|
specification_version: 4
|
46
46
|
summary: With this library, you can monitor the model’s training process and end the
|
47
47
|
training if the result is converged.
|
48
48
|
test_files: []
|
49
|
-
has_rdoc:
|