buncher 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +1 -0
- data/lib/buncher.rb +214 -2
- metadata +2 -2
data/Rakefile
CHANGED
data/lib/buncher.rb
CHANGED
@@ -1,4 +1,216 @@
|
|
1
1
|
require 'buncher/buncher'
|
2
|
-
|
3
|
-
|
2
|
+
require 'array'
|
3
|
+
module Buncher
|
4
|
+
VERSION = "0.0.2"
|
5
|
+
# Your cluster needs to look like this. Subclass it, implement +sum+,
# +calculate_center+ and +distance_squared+, make a bunch of instances and
# pass them in. It's ok to pass in empty elements to start.
class Cluster
  attr_accessor :elements
  attr_accessor :center

  # center   - the value used as this cluster's center
  # elements - the elements currently assigned to the cluster
  def initialize(center, elements=[])
    self.center = center
    self.elements = elements
  end

  # Number of dimensions (read by Buncher.calc_aK). This base
  # implementation always returns 2.
  def ndim
    2
  end

  # Empties the element list in place.
  def clear
    elements.clear
  end

  # To be implemented in a subclass.
  def sum
    raise "calculate the sum of the elements"
  end

  # To be implemented in a subclass.
  def calculate_center
    raise "calculate the center as an average of the elements"
  end

  # To be implemented in a subclass.
  def distance_squared(element)
    raise "Distance between center and element - Implemented in a subclass"
  end

  # Sum of the squared distances between the center and every element
  # (0 for an empty cluster).
  def distortion
    # The accumulator has to be carried forward; without `acc +` only the
    # distance of the last element would be returned.
    elements.inject(0) { |acc, ele| acc + distance_squared(ele) }
  end

  # Returns the squared distance to the closest element of the given list
  # and removes that element from the list. On ties the earliest element
  # wins. The list must not be empty.
  def closest!(elements)
    min_distance = nil
    min_index = nil
    elements.each_with_index do |element, index|
      distance = distance_squared(element)
      if min_distance.nil? || min_distance > distance
        min_distance = distance
        min_index = index
      end
    end
    elements.delete_at(min_index)
    min_distance
  end

  # some useful math

  # 0.5 * (1 + erf(z / sqrt(2))); 1.4142135623730951 is sqrt(2).
  def cdf(z)
    0.5 * (1.0 + Math.erf((z * 1.0) / 1.4142135623730951))
  end
end
|
58
|
+
|
59
|
+
# Split an array into nb_parts consecutive groups whose sizes differ by at
# most one; the leading groups receive the extra elements.
# taken from http://apidock.com/rails/v3.2.8/Array/in_groups
def self.split_array_into_parts(array, nb_parts)
  base_size, leftover = array.size.divmod(nb_parts)
  offset = 0

  nb_parts.times.map do |part|
    # the first `leftover` groups are one element longer
    size = part < leftover ? base_size + 1 : base_size
    chunk = array.slice(offset, size)
    offset += size
    chunk
  end
end
|
75
|
+
|
76
|
+
# Adds up, over all new clusters, the value each cluster's closest! returns
# for the list of old centers. closest! removes the matched entry, so
# old_centers is consumed by this call.
def self.distance(old_centers, new_clusters)
  total = 0
  new_clusters.each do |cluster|
    total += cluster.closest!(old_centers)
  end
  total
end
|
79
|
+
|
80
|
+
# Repeats the assign/recompute cycle until the centers stop moving:
# clear every cluster, hand the elements to nb_jobs jobs that assign each
# element to its closest center, recompute the centers, and stop once the
# summed distance between the previous centers and the new clusters drops
# below 0.0001. Returns the (mutated) centers.
def self.run(centers, elements, nb_jobs)
  previous_centers = nil
  while true
    # clear the centers first
    centers.each(&:clear)

    # create jobs, one per slice of the elements
    slices = split_array_into_parts(elements, nb_jobs)
    jobs = nb_jobs.times.map { |i| Job.new(centers, slices[i]) }

    if nb_jobs <= 1
      jobs.each(&:run)
    else
      # run jobs in parallel
      queue = Cabiri::JobQueue.new
      jobs.each_index do |i|
        queue.add(i) { jobs[i].run }
      end
      queue.start(nb_jobs)
    end

    centers.each(&:calculate_center)

    # the first pass has nothing to compare against
    if previous_centers
      movement = distance(previous_centers, centers)
      break if movement < 0.0001
    end
    previous_centers = centers.map(&:center)
  end
  centers
end
|
112
|
+
|
113
|
+
# job that will be used for parallelization with Cabiri
#
# Holds a list of centers and a slice of the elements; running the job
# appends every element to the element list of its closest center.
class Job
  attr_accessor :centers, :elements

  def initialize(centers, elements)
    @centers = centers
    @elements = elements
  end

  # Assigns the elements and returns the centers, each now carrying the
  # elements assigned to it.
  def run
    assignElementsToClosestCenter
    @centers
  end

  # For each element, finds the center with the smallest distance_squared
  # (the earliest center wins on ties) and appends the element to it.
  def assignElementsToClosestCenter
    @elements.each do |element|
      winner = nil
      winner_distance = nil

      @centers.each do |candidate|
        candidate_distance = candidate.distance_squared(element)
        next unless winner_distance.nil? || candidate_distance < winner_distance

        winner = candidate
        winner_distance = candidate_distance
      end

      winner.elements << element
    end
  end
end
|
144
|
+
|
145
|
+
# from the kmeans++ algorithm for choosing centers. returns a list of centers
#
# cluster_clazz  - class whose instances are built with +new(element)+ and
#                  respond to +distance_squared(element)+
# elements       - candidate elements; the caller's array is not modified
# number_centers - how many centers to pick (at least one is always picked)
#
# The first center is a uniformly random element. Every further center is
# drawn with a probability weighted by its squared distance to the nearest
# center chosen so far. Newer centers are placed at the front of the result.
#
# Raises ArgumentError when more centers are requested than there are
# elements to pick from.
def self.choose_centers(cluster_clazz, elements, number_centers)
  candidates = elements.dup
  first = candidates.sample
  centers = [cluster_clazz.new(first)]

  # Remove only ONE occurrence of the chosen element, so that duplicate
  # points remain available as later centers (Array#delete would drop all).
  first_index = candidates.index(first)
  candidates.delete_at(first_index) if first_index

  (2..number_centers).each do
    if candidates.empty?
      raise ArgumentError, "cannot choose #{number_centers} centers from #{elements.size} elements"
    end

    # pair every candidate with its squared distance to the nearest center
    weighted = candidates.map do |candidate|
      nearest = centers.inject(Float::MAX) do |best, center|
        [center.distance_squared(candidate), best].min
      end
      [nearest, candidate]
    end
    total = weighted.inject(0) { |acc, pair| acc + pair.first }

    dice = rand(0..total)
    chosen = weighted.detect do |weight, _candidate|
      dice -= weight
      dice <= 0
    end
    # Floating point rounding can leave a tiny positive remainder after the
    # last subtraction; fall back to the last candidate in that case.
    chosen ||= weighted.last
    next_center = chosen.last

    centers.unshift(cluster_clazz.new(next_center))
    candidates.delete_at(candidates.index(next_center))
  end
  centers
end
|
167
|
+
|
168
|
+
# Weight factor used by fK for the current number of centers.
#
# With exactly two centers the value is 1 - 3 / (4 * ndim), where ndim is
# read from the first center. For any other count it is derived from the
# previous value: last_aK + (1 - last_aK) / 6.
#
# centers - the current centers
# last_aK - the value returned for the previous (one smaller) number of
#           centers, or nil when there is none. With nil the value is
#           rebuilt by starting from the two-center formula, which lets
#           the evaluation begin at more than two centers.
def self.calc_aK(centers, last_aK)
  two_center_weight = lambda { 1.0 - 3.0 / (4.0 * centers.first.ndim) }
  return two_center_weight.call if centers.size == 2

  # No previous value: replay the recurrence from two centers up to
  # (centers.size - 1) instead of failing on nil.
  previous = last_aK || (3...centers.size).inject(two_center_weight.call) do |weight, _|
    weight + (1.0 - weight) / 6
  end
  previous + (1.0 - previous) / 6
end
|
175
|
+
|
176
|
+
# Evaluation function from here - http://www.ee.columbia.edu/~dpwe/papers/PhamDN05-kmeans.pdf
#
# centers - the clusters of the current solution
# last_sK - summed distortion of the previous solution (or nil)
# last_aK - weight returned for the previous solution (or nil)
#
# Returns [fK, sK, aK]: the evaluation value, the summed distortion of the
# given centers and the weight for this number of centers. The evaluation
# value is 1 when there is a single center or no usable previous distortion.
def self.fK(centers, last_sK, last_aK)
  total_distortion = 0
  centers.each { |cluster| total_distortion += cluster.distortion }

  weight = centers.size > 1 ? calc_aK(centers, last_aK) : nil

  nothing_to_compare = centers.size == 1 || (last_sK || 0).zero?
  return [1, total_distortion, weight || 0] if nothing_to_compare

  [total_distortion / (last_sK * weight), total_distortion, weight]
end
|
187
|
+
|
188
|
+
# run the clustering algorithm until have calculated the current number of clusters, taken from this paper:
# http://papers.nips.cc/paper/2526-learning-the-k-in-k-means.pdf
#
# start         - smallest number of clusters to try
# cluster_clazz - cluster class handed to choose_centers
# elements      - the elements to cluster
# threads       - number of jobs passed to run (default 1)
#
# Tries every cluster count from start up to max(1, elements.size / 4),
# scores each solution with fK and returns the centers with the smallest
# score. If even the smallest score is above 0.85 the first solution tried
# is returned instead ("not clustered"). When a block is given it is called
# with (elements, centers, initial_centers) after each run.
#
# Returns an empty array when start is larger than the biggest cluster
# count that would be tried, i.e. when no solution was computed at all.
def self.cluster(start,cluster_clazz,elements,threads=1)
  solutions = {}
  # try all the sizes of clusters up to #elements. Ok, sure we could probably do something like 25% .. ok, I did
  # that.
  not_clustered = last_sK = last_aK = last_fK = nil
  max_clusters = [1, elements.size / 4].max
  (start..max_clusters).each do |number_clusters|
    initial_centers = choose_centers(cluster_clazz, elements, number_clusters)
    centers = initial_centers.map(&:dup)
    centers = run(centers, elements, threads)
    yield(elements, centers, initial_centers) if block_given?
    not_clustered ||= centers
    last_fK, last_sK, last_aK = fK(centers, last_sK, last_aK)
    # NOTE(review): this prints on every round; kept because callers may rely on the output.
    puts "#{number_clusters}: fK() = #{last_fK}, last_sK=#{last_sK} last_aK=#{last_aK} "
    # a later solution with an identical score replaces the earlier one
    solutions[last_fK] = centers
  end

  # Nothing was tried (start > max_clusters): there is no score to compare,
  # so return no clusters instead of failing on a nil score.
  return [] if solutions.empty?

  min_fK = solutions.keys.min
  if min_fK > 0.85
    not_clustered # ie, not clustered at all
  else
    solutions[min_fK]
  end
end
|
4
216
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: buncher
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2015-04-
|
12
|
+
date: 2015-04-19 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: minitest
|