buncher 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. data/Rakefile +1 -0
  2. data/lib/buncher.rb +214 -2
  3. metadata +2 -2
data/Rakefile CHANGED
@@ -3,6 +3,7 @@
3
3
  require "rubygems"
4
4
  require "hoe"
5
5
 
6
+ Hoe.plugin :git
6
7
  Hoe.plugin :compiler
7
8
  # Hoe.plugin :gem_prelude_sucks
8
9
  # Hoe.plugin :inline
@@ -1,4 +1,216 @@
1
1
  require 'buncher/buncher'
2
- class Buncher
3
- VERSION = "0.0.1"
2
+ require 'array'
3
+ module Buncher
4
+ VERSION = "0.0.2"
5
+ # your cluster needs to look like this. Make a bunch of them and pass them in. It's ok to pass in empty elements to start.
6
# Base class for a cluster: a center plus the elements currently assigned to it.
# Subclasses provide the geometry by overriding +sum+, +calculate_center+ and
# +distance_squared+; the versions defined here only raise.
class Cluster
  attr_accessor :elements
  attr_accessor :center

  # center   - the cluster center, in whatever representation the subclass uses.
  # elements - elements initially assigned to this cluster (none by default).
  def initialize(center, elements = [])
    self.center = center
    self.elements = elements
  end

  # Number of dimensions of the data; this base implementation always returns 2.
  def ndim
    2
  end

  # Empties the element list in place.
  def clear
    elements.clear
  end

  # Placeholder: subclasses are expected to return the sum of the elements.
  def sum
    raise "calculate the sum of the elements"
  end

  # Placeholder: subclasses are expected to compute the center as an average
  # of the elements.
  def calculate_center
    raise "calculate the center as an average of the elements"
  end

  # Placeholder: subclasses return the squared distance between the center and
  # +element+.
  def distance_squared(element)
    raise "Distance between center and element - Implemented in a subclass"
  end

  # Total squared distance from the center to every assigned element.
  # Returns 0 when the cluster has no elements.
  def distortion
    # The accumulator must be added in; returning only the block value would
    # yield the distance of the last element instead of the total.
    elements.inject(0) { |acc, ele| acc + distance_squared(ele) }
  end

  # Finds the entry of +elements+ closest to this cluster's center, removes it
  # from +elements+ (the caller's array is mutated) and returns its squared
  # distance. On ties the earliest entry wins.
  # Returns nil, leaving the list untouched, when +elements+ is empty.
  def closest!(elements)
    return nil if elements.empty?

    min_distance = nil
    min_index = nil
    elements.each_with_index do |element, index|
      distance = distance_squared(element)
      next unless min_distance.nil? || min_distance > distance

      min_distance = distance
      min_index = index
    end
    elements.delete_at(min_index)
    min_distance
  end

  # Standard normal cumulative distribution function evaluated at +z+.
  def cdf(z)
    # 1.4142135623730951 is sqrt(2)
    0.5 * (1.0 + Math.erf((z * 1.0) / 1.4142135623730951))
  end
end
58
+
59
+ # split array into several equal sized parts
60
+ # taken from http://apidock.com/rails/v3.2.8/Array/in_groups
61
# Splits +array+ into +nb_parts+ consecutive slices whose sizes differ by at
# most one; when the size does not divide evenly the leading slices receive
# the extra elements. Returns an array of +nb_parts+ arrays.
def self.split_array_into_parts(array, nb_parts)
  base_size, leftover = array.size.divmod(nb_parts)
  offset = 0

  nb_parts.times.map do |part|
    # the first `leftover` parts are one element longer
    size = part < leftover ? base_size + 1 : base_size
    chunk = array.slice(offset, size)
    offset += size
    chunk
  end
end
75
+
76
# Adds up, over every cluster in +new_clusters+, the value returned by
# cluster.closest!(old_centers). With the Cluster class in this file each
# closest! call removes the matched entry, so +old_centers+ is consumed.
def self.distance(old_centers, new_clusters)
  total = 0
  new_clusters.each do |cluster|
    total += cluster.closest!(old_centers)
  end
  total
end
79
+
80
# Repeats the assign/re-center cycle until the centers stop moving.
#
# centers  - cluster objects; cleared, refilled and re-centered on every pass.
# elements - the elements to distribute over the clusters.
# nb_jobs  - number of Job objects the elements are split across; with more
#            than one, the jobs are handed to a Cabiri::JobQueue.
#
# Returns +centers+ once the summed distance between the previous and the
# current centers drops below 0.0001.
def self.run(centers, elements, nb_jobs)
  previous_centers = nil
  converged = false

  until converged
    # every pass starts from empty clusters
    centers.each(&:clear)

    # one job per slice of the elements
    slices = split_array_into_parts(elements, nb_jobs)
    jobs = nb_jobs.times.map { |i| Job.new(centers, slices[i]) }

    if nb_jobs > 1
      # run jobs through the Cabiri queue
      queue = Cabiri::JobQueue.new
      jobs.each_with_index do |job, i|
        queue.add(i) { job.run }
      end
      queue.start(nb_jobs)
    else
      jobs.each(&:run)
    end

    centers.each(&:calculate_center)

    # the first pass has nothing to compare against, so it never converges
    converged = previous_centers && distance(previous_centers, centers) < 0.0001
    previous_centers = centers.map(&:center) unless converged
  end

  centers
end
112
+
113
+ # job that will be used for parallelization with Cabiri
114
# Unit of work for one slice of the elements: appends each element to the
# +elements+ list of whichever center reports the smallest distance_squared.
class Job
  attr_accessor :centers
  attr_accessor :elements

  # centers  - the cluster objects elements are assigned to.
  # elements - the slice of elements this job is responsible for.
  def initialize(centers, elements)
    @centers = centers
    @elements = elements
  end

  # Performs the assignment and returns the centers, whose element lists now
  # include this job's elements.
  def run
    assignElementsToClosestCenter
    @centers
  end

  # Appends every element of this job to its nearest center.
  def assignElementsToClosestCenter
    @elements.each do |element|
      nearest_center(element).elements << element
    end
  end

  private

  # Returns the center with the smallest squared distance to +element+;
  # the earliest center wins a tie. Returns nil when there are no centers.
  def nearest_center(element)
    winner = nil
    winner_distance = nil
    @centers.each do |candidate|
      candidate_distance = candidate.distance_squared(element)
      next unless winner_distance.nil? || candidate_distance < winner_distance

      winner = candidate
      winner_distance = candidate_distance
    end
    winner
  end
end
144
+
145
+ # from the kmeans++ algorithm for choosing centers. returns a list of centers
146
# Picks initial cluster centers in the style of k-means++: the first center
# is a uniformly random element, and every further center is drawn with a
# weight equal to its squared distance from the nearest center chosen so far.
#
# cluster_clazz  - class instantiated as cluster_clazz.new(element) for each
#                  chosen center; instances must respond to distance_squared.
# elements       - candidate elements; the caller's array is not modified.
# number_centers - how many centers to pick.
#
# Returns an array of cluster_clazz instances, most recently chosen first.
# Fewer than +number_centers+ are returned when +elements+ runs out.
def self.choose_centers(cluster_clazz, elements, number_centers)
  pool = elements.dup
  seed = pool.sample
  centers = [cluster_clazz.new(seed)]

  # Remove only ONE occurrence of the seed so that duplicate elements stay
  # available as candidates.
  seed_index = pool.index(seed)
  pool.delete_at(seed_index) if seed_index

  (2..number_centers).each do
    break if pool.empty?

    # pair every remaining candidate with its distance to the nearest center
    weighted = pool.map do |candidate|
      nearest = centers.inject(Float::MAX) do |best, center|
        [center.distance_squared(candidate), best].min
      end
      [nearest, candidate]
    end
    total = weighted.inject(0) { |acc, (weight, _candidate)| acc + weight }

    # walk the cumulative weights until the random draw is used up; fall back
    # to the last candidate if float rounding leaves a tiny positive remainder
    dice = rand(0..total)
    picked = weighted.find { |weight, _candidate| (dice -= weight) <= 0 } || weighted.last
    next_center = picked.last

    centers.unshift(cluster_clazz.new(next_center))
    pool.delete_at(pool.index(next_center))
  end

  centers
end
167
+
168
# Weight factor a_K used by fK.
#
# For exactly two centers it is 1 - 3 / (4 * ndim), with ndim taken from the
# first center. For any other count it follows the recurrence
# a_K = a_(K-1) + (1 - a_(K-1)) / 6.
#
# centers - the current clusters (the first must respond to ndim).
# last_aK - a_(K-1) from the previous round, or nil when there was none
#           (e.g. clustering started above K = 2); the recurrence is then
#           unrolled from the two-center value instead of failing on nil.
#
# Returns the weight as a Float.
def self.calc_aK(centers, last_aK)
  if centers.size == 2 || last_aK.nil?
    weight = 1.0 - 3.0 / (4.0 * centers.first.ndim)
    # zero iterations for two centers; K - 2 recurrence steps otherwise
    (centers.size - 2).times { weight += (1.0 - weight) / 6 }
    weight
  else
    last_aK + (1.0 - last_aK) / 6
  end
end
175
+
176
# Evaluation function f(K), following
# http://www.ee.columbia.edu/~dpwe/papers/PhamDN05-kmeans.pdf
#
# centers - the clusters of the current solution (each responds to distortion).
# last_sK - total distortion of the previous solution, or nil.
# last_aK - weight of the previous solution, forwarded to calc_aK.
#
# Returns [f, sK, aK]: f is 1 for a single cluster or when the previous
# distortion is nil/zero, otherwise sK / (last_sK * aK). aK is reported as 0
# when it was not computed (single cluster).
def self.fK(centers, last_sK, last_aK)
  total_distortion = 0
  centers.each { |cluster| total_distortion += cluster.distortion }

  weight = centers.size > 1 ? calc_aK(centers, last_aK) : nil

  # nothing to compare against: a single cluster, or no previous distortion
  return [1, total_distortion, weight || 0] if centers.size == 1 || (last_sK || 0).zero?

  [total_distortion / (last_sK * weight), total_distortion, weight]
end
187
+
188
+ # Run the clustering algorithm once for each candidate number of clusters and keep the best solution; the approach is taken from this paper:
189
+ # http://papers.nips.cc/paper/2526-learning-the-k-in-k-means.pdf
190
# Clusters +elements+ for every cluster count from +start+ up to a quarter of
# the number of elements (at least 1) and returns the solution with the
# smallest fK value.
#
# start         - first number of clusters to try.
# cluster_clazz - cluster class passed on to choose_centers.
# elements      - the elements to cluster.
# threads       - number of jobs passed on to run (default 1).
#
# When a block is given it is called after every round with
# (elements, centers, initial_centers).
#
# Returns the centers of the best solution; the centers of the first round
# when even the best fK exceeds 0.85 (i.e. the data is treated as not
# clustered); or [] when the range of cluster counts is empty.
def self.cluster(start, cluster_clazz, elements, threads = 1)
  not_clustered = last_sK = last_aK = nil
  best_fK = best_centers = nil

  # Trying every size up to #elements is pointless; stop at 25%.
  max_clusters = [1, elements.size / 4].max

  (start..max_clusters).each do |number_clusters|
    initial_centers = choose_centers(cluster_clazz, elements, number_clusters)
    centers = run(initial_centers.map(&:dup), elements, threads)
    yield(elements, centers, initial_centers) if block_given?

    not_clustered ||= centers
    last_fK, last_sK, last_aK = fK(centers, last_sK, last_aK)
    puts "#{number_clusters}: fK() = #{last_fK}, last_sK=#{last_sK} last_aK=#{last_aK} "

    # `<=` lets a later round win a tie, as overwriting a hash entry keyed by
    # the fK value would.
    if best_fK.nil? || last_fK <= best_fK
      best_fK = last_fK
      best_centers = centers
    end
  end

  # start > max_clusters: no round was run, so there is nothing to compare
  return [] if best_fK.nil?

  best_fK > 0.85 ? not_clustered : best_centers
end
4
216
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: buncher
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2015-04-18 00:00:00.000000000 Z
12
+ date: 2015-04-19 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: minitest