buncher 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. data/Rakefile +1 -0
  2. data/lib/buncher.rb +214 -2
  3. metadata +2 -2
data/Rakefile CHANGED
@@ -3,6 +3,7 @@
3
3
  require "rubygems"
4
4
  require "hoe"
5
5
 
6
+ Hoe.plugin :git
6
7
  Hoe.plugin :compiler
7
8
  # Hoe.plugin :gem_prelude_sucks
8
9
  # Hoe.plugin :inline
@@ -1,4 +1,216 @@
1
1
  require 'buncher/buncher'
2
- class Buncher
3
- VERSION = "0.0.1"
2
+ require 'array'
3
+ module Buncher
4
+ VERSION = "0.0.2"
5
# Base class for the clusters handed to Buncher. Subclass it, implement
# +distance_squared+ and +calculate_center+ (and +sum+ if you use it), create
# one instance per starting center and pass them in. Starting with an empty
# element list is fine.
class Cluster
  attr_accessor :elements
  attr_accessor :center

  # center   - the current center of this cluster (type is up to the subclass).
  # elements - the members assigned to this cluster; the Array is stored by
  #            reference, so +clear+ also empties the caller's Array.
  def initialize(center, elements = [])
    self.center = center
    self.elements = elements
  end

  # Number of dimensions of the clustered data. Override when it is not 2.
  def ndim
    2
  end

  # Empties the member list in place.
  def clear
    elements.clear
  end

  # Subclass hook. Raises RuntimeError until overridden.
  def sum
    raise "calculate the sum of the elements"
  end

  # Subclass hook. Raises RuntimeError until overridden.
  def calculate_center
    raise "calculate the center as an average of the elements"
  end

  # Subclass hook: squared distance between +center+ and +element+.
  # Raises RuntimeError until overridden.
  def distance_squared(element)
    raise "Distance between center and element - Implemented in a subclass"
  end

  # Sum of the squared distances of every member to the center.
  # The previous version dropped the accumulator and therefore returned only
  # the distance of the last member.
  def distortion
    elements.inject(0) { |total, element| total + distance_squared(element) }
  end

  # Finds the entry of +elements+ closest to this cluster's center, removes it
  # from +elements+ (the argument is mutated) and returns its squared distance.
  # On ties the first closest entry is removed. Returns nil, and removes
  # nothing, when +elements+ is empty.
  def closest!(elements)
    distances = elements.map { |element| distance_squared(element) }
    nearest = distances.min
    return nil if nearest.nil?

    elements.delete_at(distances.index(nearest))
    nearest
  end

  # some useful math

  # Cumulative distribution function of the standard normal distribution,
  # computed as 0.5 * (1 + erf(z / sqrt(2))).
  def cdf(z)
    0.5 * (1.0 + Math.erf((z * 1.0) / 1.4142135623730951))
  end
end
58
+
59
# Splits +array+ into +nb_parts+ consecutive slices whose sizes differ by at
# most one; the leading slices take the leftover elements.
# Adapted from http://apidock.com/rails/v3.2.8/Array/in_groups
#
# array    - the Array to split (not modified).
# nb_parts - how many slices to produce. 0 raises ZeroDivisionError.
#
# Returns an Array holding +nb_parts+ Arrays (some may be empty).
def self.split_array_into_parts(array, nb_parts)
  base_size, leftover = array.size.divmod(nb_parts)
  offset = 0

  nb_parts.times.map do |index|
    # the first +leftover+ slices are one element longer than the rest
    size = index < leftover ? base_size + 1 : base_size
    part = array.slice(offset, size)
    offset += size
    part
  end
end
75
+
76
# Total movement between two iterations: every cluster in +new_clusters+ is
# matched, in order, with the closest still-unmatched entry of +old_centers+
# and the squared distances are summed.
#
# old_centers  - Array of previous center values. It is no longer modified;
#                the matching consumes a private copy.
# new_clusters - Array of objects responding to +closest!(list)+.
#
# Returns the summed squared distance (0 for no clusters).
def self.distance(old_centers, new_clusters)
  unmatched = old_centers.dup
  new_clusters.inject(0) { |total, cluster| total + cluster.closest!(unmatched) }
end
79
+
80
# Runs assignment/update passes until the centers stop moving.
#
# Each pass empties every cluster, assigns all elements to their closest
# center (split over +nb_jobs+ Job objects), lets every cluster recompute its
# center and then compares the new centers with those of the previous pass.
# The loop ends once the summed squared movement drops below 0.0001.
#
# centers  - Array of cluster objects (clear, calculate_center, center,
#            closest!, distance_squared, elements); modified in place.
# elements - Array of the elements to cluster.
# nb_jobs  - number of jobs to split the assignment into; values above 1 are
#            handed to a Cabiri::JobQueue.
#            NOTE(review): Cabiri is not required in the visible part of this
#            file - confirm it is loaded elsewhere.
#
# Returns +centers+.
def self.run(centers, elements, nb_jobs)
  old_centers = nil

  loop do
    # clear the centers first
    centers.each(&:clear)

    # one job per slice of the elements
    jobs = split_array_into_parts(elements, nb_jobs).map { |part| Job.new(centers, part) }

    if nb_jobs > 1
      # run jobs in parallel
      queue = Cabiri::JobQueue.new
      jobs.each_with_index { |job, index| queue.add(index) { job.run } }
      queue.start(nb_jobs)
    else
      jobs.each(&:run)
    end

    centers.each(&:calculate_center)
    # the first pass has nothing to compare against
    break if old_centers && distance(old_centers, centers) < 0.0001

    old_centers = centers.map(&:center)
  end

  centers
end
112
+
113
# Unit of work used for parallelization with Cabiri: assigns one slice of the
# elements to whichever of the shared centers is closest.
class Job
  attr_accessor :centers
  attr_accessor :elements

  # centers  - Array of cluster objects responding to +distance_squared+ and
  #            +elements+.
  # elements - the slice of elements this job is responsible for.
  def initialize(centers, elements)
    @centers = centers
    @elements = elements
  end

  # Performs the assignment and returns the centers, each of which now also
  # lists the elements of this job that fell to it.
  def run
    assignElementsToClosestCenter
    @centers
  end

  # Appends every element to the +elements+ list of its closest center.
  # On equal distances the center that comes first in +centers+ wins.
  def assignElementsToClosestCenter
    @elements.each do |element|
      nearest = @centers.min_by { |center| center.distance_squared(element) }
      nearest.elements << element
    end
  end

  # snake_case spelling; the camelCase name is kept for existing callers
  alias assign_elements_to_closest_center assignElementsToClosestCenter
end
144
+
145
# Picks starting centers following the kmeans++ seeding scheme: the first
# center is a uniformly random element, every further center is drawn from
# the remaining elements with a chance proportional to its squared distance
# to the nearest center chosen so far.
#
# cluster_clazz  - class whose instances are built with +new(element)+ and
#                  respond to +distance_squared(element)+.
# elements       - Array of candidate elements (not modified).
# number_centers - how many centers to return.
#
# Returns an Array of +cluster_clazz+ instances, most recently chosen first.
# Raises ArgumentError when more centers are requested than there are
# elements to choose from.
def self.choose_centers(cluster_clazz, elements, number_centers)
  first = elements.sample
  remaining = elements.dup
  centers = [cluster_clazz.new(first)]

  # remove exactly one occurrence; Array#delete would also drop duplicates
  first_index = remaining.index(first)
  remaining.delete_at(first_index) if first_index

  (2..number_centers).each do
    if remaining.empty?
      raise ArgumentError, "cannot choose #{number_centers} centers from #{elements.size} elements"
    end

    # pair every candidate with its squared distance to the nearest center
    weighted = remaining.map do |candidate|
      [centers.map { |center| center.distance_squared(candidate) }.min, candidate]
    end
    total = weighted.inject(0) { |acc, (weight, _)| acc + weight }

    # walk the candidates until the accumulated weight passes the dice roll;
    # fall back to the last candidate if float rounding leaves a remainder
    dice = rand(0..total)
    _, next_center = weighted.find { |weight, _| (dice -= weight) <= 0 } || weighted.last

    centers.unshift(cluster_clazz.new(next_center))
    remaining.delete_at(remaining.index(next_center))
  end

  centers
end
167
+
168
# Weight factor a(K) of the f(K) measure:
#   a(2) = 1 - 3 / (4 * ndim)
#   a(K) = a(K-1) + (1 - a(K-1)) / 6   for any other K
#
# centers - Array of clusters; its size is K and +ndim+ is read from the
#           first entry.
# last_aK - a(K-1) from the previous round. When nil (no previous round, e.g.
#           the search did not start at K <= 2) the value is rebuilt from
#           a(2) instead of raising NoMethodError.
#
# Returns a(K) as a Float.
def self.calc_aK(centers, last_aK)
  base = -> { 1.0 - 3.0 / (4.0 * centers.first.ndim) }
  return base.call if centers.size == 2

  # (3...K) applies the recurrence K-3 times, turning a(2) into a(K-1)
  previous = last_aK || (3...centers.size).inject(base.call) { |alpha, _| alpha + (1.0 - alpha) / 6 }
  previous + (1.0 - previous) / 6
end
175
+
176
# Evaluation function f(K) used to judge a clustering with K centers,
# from http://www.ee.columbia.edu/~dpwe/papers/PhamDN05-kmeans.pdf
#
# centers - Array of clusters responding to +distortion+; its size is K.
# last_sK - summed distortion S(K-1) of the previous round, or nil.
# last_aK - weight a(K-1) of the previous round, or nil.
#
# Returns [f(K), S(K), a(K)]. f(K) is 1 for a single center or when there is
# no (or a zero) previous distortion; otherwise S(K) / (a(K) * S(K-1)).
# a(K) is reported as 0 when it was not computed (single center).
def self.fK(centers, last_sK, last_aK)
  sK = centers.inject(0) { |total, cluster| total + cluster.distortion }
  aK = calc_aK(centers, last_aK) if centers.size > 1

  return [1, sK, aK || 0] if centers.size == 1 || (last_sK || 0).zero?

  [sK / (last_sK * aK), sK, aK]
end
187
+
188
# Clusters +elements+ for every cluster count from +start+ up to a quarter of
# the number of elements (at least 1) and keeps the solution whose f(K)
# value, as computed by +fK+, is lowest.
# Reference given by the original author:
# http://papers.nips.cc/paper/2526-learning-the-k-in-k-means.pdf
#
# start         - first number of clusters to try.
# cluster_clazz - cluster class handed to +choose_centers+.
# elements      - Array of the elements to cluster.
# threads       - number of jobs handed to +run+ (default 1).
#
# Yields (elements, centers, initial_centers) after each cluster count when a
# block is given. One progress line per cluster count is written with +puts+.
#
# Returns the Array of clusters of the best solution. When even the lowest
# f(K) is above 0.85 the data is treated as not clustered and the solution of
# the first cluster count tried is returned. Returns an empty Array when
# +start+ is larger than the maximum cluster count, so nothing was tried
# (this used to raise NoMethodError).
def self.cluster(start, cluster_clazz, elements, threads = 1)
  solutions = {}
  not_clustered = last_sK = last_aK = last_fK = nil
  max_clusters = [1, elements.size / 4].max

  (start..max_clusters).each do |number_clusters|
    initial_centers = choose_centers(cluster_clazz, elements, number_clusters)
    centers = run(initial_centers.map(&:dup), elements, threads)
    yield(elements, centers, initial_centers) if block_given?

    not_clustered ||= centers
    last_fK, last_sK, last_aK = fK(centers, last_sK, last_aK)
    puts "#{number_clusters}: fK() = #{last_fK}, last_sK=#{last_sK} last_aK=#{last_aK} "
    # keyed by f(K): a later solution with the same value replaces the earlier
    solutions[last_fK] = centers
  end

  return [] if solutions.empty?

  min_fK = solutions.keys.min
  min_fK > 0.85 ? not_clustered : solutions[min_fK]
end
4
216
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: buncher
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2015-04-18 00:00:00.000000000 Z
12
+ date: 2015-04-19 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: minitest