kmeans-crystal 1.0.1 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/kmeans-crystal.rb +79 -10
- metadata +3 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c25646b44067f4a43cca87296ef5d04ff77272c5
|
4
|
+
data.tar.gz: 83e65a93b07dff510d80df973affc7671f565198
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 52d5da1622b0362238fab40aa34b64a948e381066afc90547f52747aed3220d21cb0980879790bc16bc0a0e9e685b33cef880067bd879c3fae88f17010928837
|
7
|
+
data.tar.gz: 719f3197099ad25d359d1758a48cfcd3fe7913f332d31fc931a291075ae795b555dfaa35b2968793dcb170bedb1d34f0ebc9af28dacb8aee05b79c97f2bf8e3c
|
data/lib/kmeans-crystal.rb
CHANGED
@@ -1,24 +1,46 @@
|
|
1
1
|
module KMeansCrystal
|
2
|
+
|
3
|
+
module Measure
  # Euclidean (straight-line, L2) distance.
  class Euclidean
    # Computes the Euclidean distance between two vectors.
    #
    # @param a [Array<Numeric>] first vector; its size decides how many
    #   components are compared
    # @param b [Array<Numeric>] second vector, read at indices 0...a.size
    # @return [Float] square root of the summed squared differences
    def self.distance(a, b)
      sum = 0.0
      a.size.times { |i| sum += (a[i] - b[i])**2 }
      Math.sqrt(sum)
    end
  end

  # Manhattan (city-block, L1) distance.
  class Manhattan
    # Computes the Manhattan distance between two vectors.
    #
    # @param a [Array<Numeric>] first vector; its size decides how many
    #   components are compared
    # @param b [Array<Numeric>] second vector, read at indices 0...a.size
    # @return [Float] sum of the absolute component differences
    def self.distance(a, b)
      sum = 0.0
      a.size.times { |i| sum += (a[i] - b[i]).abs }
      # The L1 distance is the plain sum of absolute differences; taking a
      # square root (as the Euclidean measure does) would yield a different,
      # non-Manhattan quantity.
      sum
    end
  end
end
|
22
|
+
|
23
|
+
|
2
24
|
class Cluster
|
3
25
|
attr_reader :centroid
|
4
26
|
attr_reader :entries
|
5
27
|
attr_accessor :name
|
6
28
|
|
7
|
-
def initialize(name, centroid, vector_name)
|
29
|
+
def initialize(name, centroid, vector_name, measure)
|
8
30
|
@name = name
|
9
31
|
@centroid = centroid
|
10
32
|
@entries = Array.new
|
11
33
|
@vector_name = vector_name
|
34
|
+
@measure = measure
|
12
35
|
end
|
13
36
|
|
14
37
|
def output
|
15
|
-
|
38
|
+
output_entries = @entries.map{|e| e[:distance] = distance(e); e }
|
39
|
+
return { name: @name, centroid: @centroid, entries: output_entries }
|
16
40
|
end
|
17
41
|
|
18
42
|
def distance(entry)
|
19
|
-
|
20
|
-
@centroid.size.times{|i| sum += (@centroid[i]-entry[@vector_name][i])**2}
|
21
|
-
return Math.sqrt(sum)
|
43
|
+
return @measure.distance(@centroid, entry[@vector_name])
|
22
44
|
end
|
23
45
|
|
24
46
|
def update_centroid
|
@@ -37,13 +59,37 @@ end
|
|
37
59
|
|
38
60
|
|
39
61
|
class Model
|
40
|
-
def initialize(cluster_num, entries,
|
62
|
+
def initialize(cluster_num, entries, **params)
|
41
63
|
raise 'too less cluster_num to evaluate k-means' if entries.size < cluster_num
|
64
|
+
|
42
65
|
@cluster_num = cluster_num
|
43
66
|
@entries = entries
|
44
|
-
@vector_name = vector_name
|
45
67
|
|
46
|
-
|
68
|
+
@vector_name = case params[:vector_name]
|
69
|
+
when nil
|
70
|
+
:features
|
71
|
+
else
|
72
|
+
params[:vector_name]
|
73
|
+
end
|
74
|
+
|
75
|
+
@measure = case params[:distance]
|
76
|
+
when 'manhattan'
|
77
|
+
Measure::Manhattan
|
78
|
+
when 'euclidean',nil
|
79
|
+
Measure::Euclidean
|
80
|
+
else
|
81
|
+
raise 'incorrect value for distance'
|
82
|
+
end
|
83
|
+
|
84
|
+
init_centroids = case params[:init_centroids]
|
85
|
+
when 'random'
|
86
|
+
@entries.sample(@cluster_num).map{|x| x[@vector_name]}
|
87
|
+
when 'kmeans++',nil
|
88
|
+
kmeans_pp(@entries, @cluster_num)
|
89
|
+
else
|
90
|
+
raise 'incorrect value for init_centroids'
|
91
|
+
end
|
92
|
+
|
47
93
|
@clusters = new_clusters(init_centroids)
|
48
94
|
end
|
49
95
|
|
@@ -96,7 +142,7 @@ class Model
|
|
96
142
|
def new_clusters(centroids)
|
97
143
|
clusters = Array.new
|
98
144
|
centroids.each_with_index do |centroid, i|
|
99
|
-
clusters << Cluster.new("cluster#{i}", centroid, @vector_name)
|
145
|
+
clusters << Cluster.new("cluster#{i}", centroid, @vector_name, @measure)
|
100
146
|
end
|
101
147
|
return clusters
|
102
148
|
end
|
@@ -104,9 +150,32 @@ class Model
|
|
104
150
|
def new_clusters_from_old(clusters)
|
105
151
|
arr = Array.new
|
106
152
|
clusters.each do |cluster|
|
107
|
-
arr << Cluster.new(cluster.name, cluster.update_centroid, @vector_name)
|
153
|
+
arr << Cluster.new(cluster.name, cluster.update_centroid, @vector_name, @measure)
|
108
154
|
end
|
109
155
|
return arr
|
110
156
|
end
|
157
|
+
|
158
|
+
# Chooses the initial centroids for the model.
#
# Starts from +cluster_num+ randomly sampled feature vectors, then replaces
# the first min(cluster_num, 2**dimension) of them with corners of the
# data's bounding box: centroid +i+ takes, for each dimension +d+, the
# minimum or the maximum observed value depending on digit +d+ of +i+
# written as a zero-padded binary string of length +dimension+.
# Any centroids beyond 2**dimension stay at their sampled positions.
#
# NOTE(review): despite the name, this is a deterministic bounding-box
# seeding, not the probabilistic k-means++ procedure - confirm intent.
#
# @param entries [Array<Hash>] data points; each holds its feature vector
#   under the key stored in @vector_name
# @param cluster_num [Integer] number of centroids to produce
# @return [Array<Array<Numeric>>] the initial centroid vectors
def kmeans_pp(entries, cluster_num)
  features = entries.map { |x| x[@vector_name] }
  dimension = features.first.size

  # init_val[d] is the [min, max] pair observed along dimension d.
  init_val = Array.new(dimension) { |d| features.map { |f| f[d] }.minmax }

  # Copy every sampled vector: the components are overwritten below, and
  # writing into the sampled objects themselves would corrupt the entries'
  # own feature vectors.
  init_centroids = entries.sample(cluster_num).map { |x| x[@vector_name].dup }

  corner_count = [init_centroids.size, 2**dimension].min
  corner_count.times do |i|
    # One binary digit per dimension: "0" selects the min, "1" the max.
    offset = i.to_s(2).rjust(dimension, "0")
    dimension.times do |d|
      max_or_min = offset[d].to_i
      init_centroids[i][d] = init_val[d][max_or_min]
    end
  end

  init_centroids
end
|
111
180
|
end
|
112
181
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kmeans-crystal
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ireullin
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-04-14 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: |-
|
14
14
|
The library for data clustering is implemented by k-means algorithm. With the library, you can monitor the model’s training process and end the training if the result is converged.
|
@@ -40,10 +40,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
40
40
|
version: '0'
|
41
41
|
requirements: []
|
42
42
|
rubyforge_project:
|
43
|
-
rubygems_version: 2.
|
43
|
+
rubygems_version: 2.2.2
|
44
44
|
signing_key:
|
45
45
|
specification_version: 4
|
46
46
|
summary: With this library, you can monitor the model’s training process and end the
|
47
47
|
training if the result is converged.
|
48
48
|
test_files: []
|
49
|
-
has_rdoc:
|