buncher 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +1 -0
- data/lib/buncher.rb +214 -2
- metadata +2 -2
data/Rakefile
CHANGED
data/lib/buncher.rb
CHANGED
@@ -1,4 +1,216 @@
|
|
1
1
|
require 'buncher/buncher'
|
2
|
-
|
3
|
-
|
2
|
+
require 'array'
|
3
|
+
module Buncher
|
4
|
+
VERSION = "0.0.2"
|
5
|
+
# Your cluster class needs to look like this: subclass it and implement the methods that raise. Make a bunch of them and pass them in. It's OK to start with empty elements.
|
6
|
+
# Base class for one cluster: a center plus the elements currently assigned
# to it. #sum, #calculate_center and #distance_squared only raise here and
# are expected to be overridden by a subclass.
class Cluster
  attr_accessor :elements
  attr_accessor :center

  # center   - the initial center of this cluster
  # elements - the elements assigned to this cluster (empty by default)
  def initialize(center, elements = [])
    self.center = center
    self.elements = elements
  end

  # Number of dimensions of the data. Hard-coded to 2 in this base class;
  # override it when clustering data of another dimensionality.
  def ndim
    2
  end

  # Empties the element list in place (the same array object is kept).
  def clear
    elements.clear
  end

  # Placeholder - raises until overridden.
  def sum
    raise "calculate the sum of the elements"
  end

  # Placeholder - raises until overridden.
  def calculate_center
    raise "calculate the center as an average of the elements"
  end

  # Placeholder - raises until overridden. Must return the squared distance
  # between the center and the given element.
  def distance_squared(element)
    raise "Distance between center and element - Implemented in a subclass"
  end

  # Sum of the squared distances between the center and every element.
  # Returns 0 for an empty cluster.
  def distortion
    # The accumulator has to be added in; without it the block returned only
    # the last element's distance instead of the total.
    elements.inject(0) { |acc, ele| acc + distance_squared(ele) }
  end

  # Returns the squared distance to the closest element of the given list and
  # removes that element from the list. On ties the first closest element is
  # the one removed. Returns nil (and removes nothing) for an empty list.
  def closest!(elements)
    return nil if elements.empty?

    min_distance = nil
    min_index = nil
    elements.each_with_index do |element, index|
      distance = distance_squared(element)
      if min_distance.nil? || min_distance > distance
        min_distance = distance
        min_index = index
      end
    end
    elements.delete_at(min_index)
    min_distance
  end

  # some useful math
  # Cumulative distribution function of the standard normal distribution.
  # 1.4142135623730951 is the square root of 2.
  def cdf(z)
    0.5 * (1.0 + Math.erf((z * 1.0) / 1.4142135623730951))
  end
end
|
58
|
+
|
59
|
+
# split array into several equal sized parts
|
60
|
+
# taken from http://apidock.com/rails/v3.2.8/Array/in_groups
|
61
|
+
# Splits array into nb_parts consecutive slices whose sizes differ by at most
# one; the leading slices receive the extra elements. When there are fewer
# elements than parts, the trailing slices are empty arrays.
def self.split_array_into_parts(array, nb_parts)
  base_size, remainder = array.size.divmod(nb_parts)
  offset = 0

  nb_parts.times.map do |part|
    size = part < remainder ? base_size + 1 : base_size
    chunk = array.slice(offset, size)
    offset += size
    chunk
  end
end
|
75
|
+
|
76
|
+
# Adds up, over all new clusters, the value returned by closest! when each
# cluster is matched against the old centers. closest! removes the matched
# entry, so old_centers is consumed by this call.
def self.distance(old_centers, new_clusters)
  total = 0
  new_clusters.each do |cluster|
    total += cluster.closest!(old_centers)
  end
  total
end
|
79
|
+
|
80
|
+
# Repeats the assign-elements / recompute-centers cycle until the value
# returned by distance() for the previous and the new centers drops below
# 0.0001. Returns the centers array that was passed in; its members are
# mutated along the way.
#
# centers  - cluster objects responding to clear, calculate_center and center
# elements - the elements to distribute over the centers
# nb_jobs  - number of Job objects the elements are split over; with more
#            than one job they are handed to a Cabiri::JobQueue
#            (NOTE(review): Cabiri is not required in the visible part of
#            this file - confirm it is loaded elsewhere)
def self.run(centers, elements, nb_jobs)
  previous_centers = nil
  converged = false

  until converged
    # every iteration starts with empty clusters
    centers.each(&:clear)

    # one job per slice of the elements
    slices = split_array_into_parts(elements, nb_jobs)
    jobs = nb_jobs.times.map { |i| Job.new(centers, slices[i]) }

    if nb_jobs > 1
      # run jobs in parallel
      queue = Cabiri::JobQueue.new
      jobs.each_with_index do |job, i|
        queue.add(i) { job.run }
      end
      queue.start(nb_jobs)
    else
      jobs.each(&:run)
    end

    centers.each(&:calculate_center)

    # The first pass has nothing to compare against, so it never converges.
    converged = previous_centers ? distance(previous_centers, centers) < 0.0001 : false
    previous_centers = centers.map(&:center) unless converged
  end

  centers
end
|
112
|
+
|
113
|
+
# job that will be used for parallelization with Cabiri
|
114
|
+
# Unit of work: assigns a slice of the elements to the closest of the given
# centers.
class Job
  attr_accessor :centers, :elements

  # centers  - objects responding to distance_squared(element) and elements
  # elements - the elements this job is responsible for
  def initialize(centers, elements)
    @centers = centers
    @elements = elements
  end

  # Performs the assignment and returns the centers, each of which now also
  # holds the elements of this job that are closest to it.
  def run
    assignElementsToClosestCenter
    @centers
  end

  # Appends every element to the element list of the center with the smallest
  # squared distance; on a tie the earlier center in the list wins.
  def assignElementsToClosestCenter
    @elements.each do |element|
      nearest = nil
      shortest = nil

      @centers.each do |candidate|
        current = candidate.distance_squared(element)
        next unless shortest.nil? || current < shortest

        nearest = candidate
        shortest = current
      end

      nearest.elements << element
    end
  end
end
|
144
|
+
|
145
|
+
# from the kmeans++ algorithm for choosing centers. returns a list of centers
|
146
|
+
# Picks the initial centers. The first one is a random element; every further
# one is drawn from the remaining elements with a weight equal to the squared
# distance to the nearest center chosen so far.
#
# cluster_clazz  - class whose instances are built with new(element) and
#                  respond to distance_squared(element)
# elements       - the candidate elements; the caller's array is not modified
# number_centers - how many centers are wanted
#
# Returns an array of cluster_clazz instances, most recently chosen first.
# When there are fewer distinct elements than number_centers the result is
# shorter than requested rather than failing.
def self.choose_centers(cluster_clazz, elements, number_centers)
  first_seed = elements.sample
  candidates = elements.dup
  centers = [cluster_clazz.new(first_seed)]
  candidates.delete(first_seed)

  (2..number_centers).each do
    # Nothing left to draw from: stop instead of building a center from nil.
    break if candidates.empty?

    # weight of a candidate = squared distance to its nearest chosen center
    weights = candidates.map do |candidate|
      centers.map { |center| center.distance_squared(candidate) }.min
    end
    total = weights.inject(0) { |acc, weight| acc + weight }

    dice = rand(0..total)
    # Walk the cumulative weights until the dice is used up. Floating point
    # rounding can leave a tiny remainder after the last candidate, so fall
    # back to the last one instead of ending up with no pick at all.
    picked = weights.index { |weight| (dice -= weight) <= 0 } || weights.size - 1

    centers.unshift(cluster_clazz.new(candidates.delete_at(picked)))
  end

  centers
end
|
167
|
+
|
168
|
+
# Weight factor used by fK for the current number of clusters.
#
# centers - the current clusters; the first one is asked for ndim
# last_aK - the factor computed for the previous number of clusters, or nil
#           when there is none (e.g. clustering started above two clusters)
#
# With exactly two centers the value is 1 - 3 / (4 * ndim). Otherwise each
# further cluster moves the previous value one sixth of the way towards 1.
# When no previous value is available it is rebuilt from the two-center value
# instead of failing on nil.
def self.calc_aK(centers, last_aK)
  if centers.size == 2
    1.0 - 3.0 / (4.0 * centers.first.ndim)
  elsif last_aK
    last_aK + (1.0 - last_aK) / 6
  else
    two_center_value = 1.0 - 3.0 / (4.0 * centers.first.ndim)
    (3..centers.size).inject(two_center_value) { |a_k, _| a_k + (1.0 - a_k) / 6 }
  end
end
|
175
|
+
|
176
|
+
# Evaluation function for one clustering result.
# from here - http://www.ee.columbia.edu/~dpwe/papers/PhamDN05-kmeans.pdf
#
# centers - the clusters of the current solution (each responds to distortion)
# last_sK - summed distortion of the previous solution, or nil
# last_aK - weight factor of the previous solution, or nil
#
# Returns [fK, sK, aK]: the score, the summed distortion of this solution and
# the weight factor used. The score is 1 for a single cluster or when the
# previous distortion is missing or zero.
def self.fK(centers, last_sK, last_aK)
  sK = 0
  centers.each { |cluster| sK += cluster.distortion }

  aK = centers.size > 1 ? calc_aK(centers, last_aK) : nil

  return [1, sK, aK || 0] if centers.size == 1 || (last_sK || 0).zero?

  [sK / (last_sK * aK), sK, aK]
end
|
187
|
+
|
188
|
+
# run the clustering algorithm for each candidate number of clusters until we have found the correct number of clusters, taken from this paper:
|
189
|
+
# http://papers.nips.cc/paper/2526-learning-the-k-in-k-means.pdf
|
190
|
+
# Clusters the elements for every cluster count from start up to a quarter of
# the number of elements (at least 1) and returns the solution with the
# lowest fK score.
#
# start         - the smallest number of clusters to try
# cluster_clazz - the cluster class handed to choose_centers
# elements      - the elements to cluster
# threads       - number of jobs handed to run (1 by default)
#
# Yields (elements, centers, initial_centers) after each cluster count when a
# block is given. Prints one progress line per cluster count to stdout.
#
# Returns the centers of the best solution, or the first solution computed
# when even the best score is above 0.85 (ie, not clustered at all). Returns
# an empty array when no cluster count could be tried because start is larger
# than the maximum.
def self.cluster(start, cluster_clazz, elements, threads = 1)
  solutions = {}
  # try all the sizes of clusters up to 25% of #elements
  not_clustered = last_sK = last_aK = last_fK = nil
  max_clusters = [1, elements.size / 4].max

  (start..max_clusters).each do |number_clusters|
    initial_centers = choose_centers(cluster_clazz, elements, number_clusters)
    centers = run(initial_centers.map(&:dup), elements, threads)
    yield(elements, centers, initial_centers) if block_given?
    not_clustered ||= centers
    last_fK, last_sK, last_aK = fK(centers, last_sK, last_aK)
    puts "#{number_clusters}: fK() = #{last_fK}, last_sK=#{last_sK} last_aK=#{last_aK} "
    solutions[last_fK] = centers
  end

  # Nothing was tried (start > max_clusters), so there is no score to compare;
  # previously this fell through to comparing nil with 0.85.
  return [] if solutions.empty?

  min_fK = solutions.keys.min
  if min_fK > 0.85
    not_clustered # ie, not clustered at all
  else
    solutions[min_fK]
  end
end
|
4
216
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: buncher
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2015-04-
|
12
|
+
date: 2015-04-19 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: minitest
|