buncher 0.0.2 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/Manifest.txt CHANGED
@@ -3,6 +3,6 @@ History.txt
3
3
  Manifest.txt
4
4
  README.txt
5
5
  Rakefile
6
- bin/buncher
6
+ lib/buncher/buncher.bundle
7
7
  lib/buncher.rb
8
8
  test/test_buncher.rb
data/Rakefile CHANGED
@@ -4,6 +4,7 @@ require "rubygems"
4
4
  require "hoe"
5
5
 
6
6
  Hoe.plugin :git
7
+ Hoe.plugin :debugging # for gdb
7
8
  Hoe.plugin :compiler
8
9
  # Hoe.plugin :gem_prelude_sucks
9
10
  # Hoe.plugin :inline
@@ -13,7 +14,7 @@ Hoe.plugin :minitest
13
14
  # Hoe.plugin :rdoc
14
15
 
15
16
  Hoe.spec "buncher" do
16
- developer("Robert Mathews", "rob@justsoftwareconsulting.com")
17
+ developer("Robert Mathews", "rob@drync.com")
17
18
  self.extra_dev_deps
18
19
 
19
20
  license "MIT" # this should match the license in the README
@@ -6,6 +6,7 @@ extension_name = 'buncher'
6
6
 
7
7
  # The destination
8
8
  dir_config(extension_name)
9
-
9
+ # add some standard template libraries (headers only)
10
+ dir_config('boost','ext/boost/1.57.0')
10
11
  # Do the work
11
12
  create_makefile(extension_name)
Binary file
data/lib/buncher.rb CHANGED
@@ -1,7 +1,6 @@
1
1
  require 'buncher/buncher'
2
- require 'array'
3
2
  module Buncher
4
- VERSION = "0.0.2"
3
+ VERSION = "1.0.0"
5
4
  # Your cluster needs to look like this. Make a bunch of them and pass them in. It's OK to pass in empty elements to start.
6
5
  class Cluster
7
6
  attr_accessor :elements
@@ -11,49 +10,15 @@ module Buncher
11
10
  self.elements = elements
12
11
  end
13
12
 
14
- def ndim
15
- 2
16
- end
17
-
18
13
  def clear
19
14
  elements.clear
20
15
  end
21
16
 
22
- def sum
23
- raise "calculate the sum of the elements"
24
- end
25
-
26
- def calculate_center
27
- raise "calculate the center as an average of the elements"
28
- end
29
- def distance_squared(element)
30
- raise "Distance between center and element - Implemented in a subclass"
31
- end
32
-
33
- def distortion
34
- elements.inject(0) {|acc, ele| distance_squared(ele)}
35
- end
36
-
37
- # return the distance to the closest element and remove the element from the list
38
- def closest!(elements)
39
- min_distance=nil
40
- min_index=nil
41
- elements.each_with_index do |element, index|
42
- distance = distance_squared(element)
43
- if(min_distance.nil? || min_distance > distance)
44
- min_distance = distance
45
- min_index = index
46
- end
47
- end
48
- elements.delete_at(min_index)
49
- min_distance
17
+ def size
18
+ elements.size
50
19
  end
51
20
 
52
-
53
- # some useful math
54
- def cdf(z)
55
- (0.5 * (1.0 + Math.erf((z*1.0)/1.4142135623730951)))
56
- end
21
+ # distortion - C++ code
57
22
  end
58
23
 
59
24
  # split array into several equal sized parts
@@ -73,101 +38,9 @@ module Buncher
73
38
  groups
74
39
  end
75
40
 
76
- def self.distance(old_centers, new_clusters)
77
- new_clusters.inject(0) {|acc, cluster| acc + cluster.closest!(old_centers)}
78
- end
79
-
80
- def self.run(centers, elements, nb_jobs)
81
- old_centers=nil
82
- count=0
83
- while(true) do
84
- # clear the centers first
85
- centers.map(&:clear)
86
-
87
- # create jobs
88
- jobs = []
89
- elements_for_jobs = split_array_into_parts(elements, nb_jobs)
90
- nb_jobs.times do |i|
91
- jobs << Job.new(centers, elements_for_jobs[i])
92
- end
93
-
94
- if(nb_jobs > 1)
95
- # run jobs in parallel
96
- queue = Cabiri::JobQueue.new
97
- nb_jobs.times do |i|
98
- queue.add(i) { jobs[i].run }
99
- end
100
- queue.start(nb_jobs)
101
- else
102
- jobs.map(&:run)
103
- end
104
- centers.map(&:calculate_center)
105
- distance = distance(old_centers,centers) if old_centers
106
- # puts "iteration #{count+=1}: distance #{distance}" if old_centers
107
- break if old_centers && distance < 0.0001
108
- old_centers=centers.map(&:center)
109
- end
110
- centers
111
- end
112
-
113
- # job that will be used for parallelization with Cabiri
114
- class Job
115
- attr_accessor :centers
116
- attr_accessor :elements
117
-
118
- def initialize(centers, elements)
119
- @centers = centers
120
- @elements = elements
121
- end
122
-
123
- def run
124
- assignElementsToClosestCenter # this is center, with list of elements
125
- @centers
126
- end
127
-
128
- def assignElementsToClosestCenter
129
- @elements.each do |element|
130
- best_center = nil
131
- best_distance = nil
132
-
133
- @centers.each do |center|
134
- distance = center.distance_squared(element)
135
- if best_distance.nil? or distance < best_distance
136
- best_center = center
137
- best_distance = distance
138
- end
139
- end
140
- best_center.elements << element
141
- end
142
- end
143
- end
144
-
145
- # from the kmeans++ algorithm for choosing centers. returns a list of centers
146
- def self.choose_centers(cluster_clazz, elements, number_centers)
147
- ele = elements.sample(1).first
148
- elements = elements.dup
149
- centers = [cluster_clazz.new(ele)]
150
- # puts "center 1 is #{centers.first.center}"
151
- elements.delete(ele)
152
- (2..number_centers).each do |index|
153
- probability_distribution=[]
154
- sum=elements.inject(0) {|sum, ele| distance=centers.inject(Float::MAX) {|acc,center|[center.distance_squared(ele),acc].min};sum+=distance;probability_distribution <<[distance,ele];sum}
155
- dice = rand(0..sum)
156
- # puts "dice=#{dice}, sum=#{sum}"
157
- # puts "distribution"
158
- # probability_distribution.each {|key,val| puts "#{key} : #{val}"}
159
- # puts
160
- ignore, next_center = probability_distribution.detect {|prob,ele|dice-=prob; dice <=0}
161
- # puts "center #{index} is #{next_center}"
162
- centers.unshift(cluster_clazz.new(next_center))
163
- elements.delete_at(elements.index(next_center))
164
- end
165
- centers
166
- end
167
-
168
41
  def self.calc_aK(centers, last_aK)
169
42
  if(centers.size == 2)
170
- 1.0-3.0/(4.0*centers.first.ndim)
43
+ 1.0-3.0/(4.0*centers.first.center.size)
171
44
  else
172
45
  last_aK + (1.0 - last_aK) / 6
173
46
  end
@@ -187,7 +60,7 @@ module Buncher
187
60
 
188
61
  # run the clustering algorithm until we have calculated the number of clusters (the k in k-means); the method is taken from this paper:
189
62
  # http://papers.nips.cc/paper/2526-learning-the-k-in-k-means.pdf
190
- def self.cluster(start,cluster_clazz,elements,threads=1)
63
+ def self.cluster(elements)
191
64
  changed=true
192
65
  round=0
193
66
  solutions={}
@@ -195,16 +68,17 @@ module Buncher
195
68
  # that.
196
69
  not_clustered = last_sK = last_aK =last_fK=nil
197
70
  max_clusters=[1,elements.size/4].max
198
- (start..max_clusters).each do |number_clusters|
199
- initial_centers = choose_centers(cluster_clazz, elements, number_clusters)
71
+ (1..max_clusters).each do |number_clusters|
72
+ initial_centers = choose_centers(elements, number_clusters) # C++ Native code
200
73
  centers = initial_centers.map(&:dup)
201
- centers = run(centers,elements,threads)
74
+ centers = kmeans(centers,elements) ## C++ Native code
202
75
  yield(elements, centers, initial_centers) if block_given?
203
76
  not_clustered ||=centers
204
77
  last_fK, last_sK, last_aK = fK(centers,last_sK, last_aK)
205
- puts "#{number_clusters}: fK() = #{last_fK}, last_sK=#{last_sK} last_aK=#{last_aK} "
78
+ puts "summary #{number_clusters}: fK() = #{last_fK}, last_sK=#{last_sK} last_aK=#{last_aK} "
79
+ puts
206
80
  solutions[last_fK]=centers
207
- # break if number_clusters == 3 ## debugging
81
+ # break if number_clusters == 2 ## debugging
208
82
  end
209
83
  min_fK =solutions.keys.sort.first
210
84
  if min_fK > 0.85
data/test/test_buncher.rb CHANGED
@@ -1,9 +1,121 @@
1
- gem "minitest"
2
- require "minitest/autorun"
3
1
  require "buncher"
2
+ require 'rubystats'
3
+ require 'gnuplot'
4
+
5
+ def dump(centers)
6
+ puts "centers are"
7
+ centers.each {|ccc| puts "center #{ccc.center.inspect} #{ccc.elements[0]}, #{ccc.elements[1]}"}
8
+ end
9
+
10
+ def init_data(number_points, number_clusters)
11
+ points_per_cluster = number_points.to_f / number_clusters
12
+ elements=[]
13
+ extra=0
14
+ index=0
15
+ seed=[[0,1],[0,0],[1,0]]
16
+ number_clusters.times do
17
+ # gens = [Rubystats::NormalDistribution.new(rand(0..1), 0.05), Rubystats::NormalDistribution.new(rand(0..1), 0.05)]
18
+ gens = [Rubystats::NormalDistribution.new(seed[index][0], 0.05), Rubystats::NormalDistribution.new(seed[index][1], 0.05)]
19
+ index+=1
20
+ extra+= points_per_cluster - points_per_cluster.floor
21
+ points = gens.map {|gen|gen.rng(points_per_cluster.floor+extra.floor)}
22
+ points.first.each_index {|iii|elements << [points[0][iii],points[1][iii]]}
23
+ extra-=1.0 if(extra > 1)
24
+ end
25
+ elements
26
+ end
27
+
28
+ def plot(file_name,points,centers, initial_centers=nil)
29
+ Gnuplot.open do |gp|
30
+ Gnuplot::Plot.new( gp ) do |plot|
31
+
32
+ plot.title "Cluster Plot (#{centers.size})"
33
+ plot.xlabel "x"
34
+ plot.ylabel "y"
35
+ plot.terminal "gif"
36
+ plot.output file_name
37
+
38
+ x = points.map(&:first)
39
+ y = points.map(&:last)
40
+
41
+ plot.data << Gnuplot::DataSet.new( [x, y] ) do |ds|
42
+ ds.with = "points"
43
+ ds.notitle
44
+ end
45
+
46
+ unless initial_centers.nil?
47
+ x = initial_centers.map(&:center).map(&:first)
48
+ y = initial_centers.map(&:center).map(&:last)
49
+ plot.data << Gnuplot::DataSet.new( [x, y] ) do |ds|
50
+ ds.with = "points"
51
+ ds.title="initial centers"
52
+ end
53
+ end
54
+
55
+ x = centers.map(&:center).map(&:first)
56
+ y = centers.map(&:center).map(&:last)
57
+ plot.data << Gnuplot::DataSet.new( [x, y] ) do |ds|
58
+ ds.with = "points"
59
+ ds.title="centers"
60
+ end
61
+ end
62
+ end
63
+ file_name
64
+ end
4
65
 
5
66
  class TestBuncher < Minitest::Test
6
- def test_sanity
7
- assert_equal 'hello world', Buncher.hello_world
67
+ WORKING=<<-'EOS'
68
+ def test_should_find_the_one_center
69
+ elements = [[1,1]]
70
+ starting_centers = elements.sample(1).map {|aaa| Buncher::Cluster.new(aaa)}
71
+ new_centers = Buncher::kmeans(starting_centers, elements)
72
+ dump(new_centers)
73
+ assert_in_delta(new_centers.first.center[0],1.0,0.01)
74
+ assert_in_delta(new_centers.first.center[1],1.0,0.01)
75
+ end
76
+
77
+ def test_choose_centers_wrapper
78
+ elements = [[1,1]]
79
+ new_centers = Buncher::choose_centers(elements, 1)
80
+ dump(new_centers)
81
+ assert_in_delta(new_centers.first.center[1],1.0,0.01)
82
+ assert_in_delta(new_centers.first.center[1],1.0,0.01)
83
+ end
84
+ def test_choose_centers_wrapper2
85
+ elements = init_data(100,3)
86
+ new_centers = Buncher::choose_centers(elements, 3)
87
+ dump(new_centers)
88
+ assert_equal(new_centers.size,3)
89
+ end
90
+ def test_should_find_one_cluster
91
+ elements = [[1,1]]
92
+ new_centers = Buncher::cluster(elements)
93
+ dump(new_centers)
94
+ assert_equal(new_centers.size,1)
95
+ end
96
+ def test_choose_centers_wrapper2
97
+ elements = init_data(100,3)
98
+ new_centers = Buncher::choose_centers(elements, 3)
99
+ plot("/tmp/kmeans_initial_#{new_centers.size}.png",elements, new_centers)
100
+ dump(new_centers)
101
+ `open /tmp/kmeans_initial_#{new_centers.size}.png`
102
+ assert_equal(new_centers.size,3)
103
+ end
104
+ EOS
105
+
106
+ def test_gaussian_distribution_of_100_points_in_3_clusters
107
+ 12.times do |run|
108
+ # srand(843284148793854177950180651080082381)
109
+ elements = init_data(100,3)
110
+ # elements.each {|eee| puts "#{eee[0]},#{eee[1]}"}
111
+ # new_centers = Buncher::cluster(elements) {|elements,centers, initial_centers| puts "run #{run} setup";plot("/tmp/#{run}_centers_#{centers.size}.png",elements,initial_centers)}
112
+ new_centers = Buncher::cluster(elements) {|elements,centers, initial_centers|
113
+ plot("/tmp/#{run}_centers_#{centers.size}.png",elements,centers, initial_centers)
114
+ }
115
+ puts "run #{run}: k is #{new_centers.size}, seed was #{srand}"
116
+ puts "ERROR "*4 if new_centers.size != 3
117
+ puts
118
+ assert_equal(3,new_centers.size)
119
+ end
8
120
  end
9
121
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: buncher
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 1.0.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2015-04-19 00:00:00.000000000 Z
12
+ date: 2015-04-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: minitest
@@ -18,7 +18,7 @@ dependencies:
18
18
  requirements:
19
19
  - - ~>
20
20
  - !ruby/object:Gem::Version
21
- version: '5.4'
21
+ version: '5.6'
22
22
  type: :development
23
23
  prerelease: false
24
24
  version_requirements: !ruby/object:Gem::Requirement
@@ -26,7 +26,7 @@ dependencies:
26
26
  requirements:
27
27
  - - ~>
28
28
  - !ruby/object:Gem::Version
29
- version: '5.4'
29
+ version: '5.6'
30
30
  - !ruby/object:Gem::Dependency
31
31
  name: rdoc
32
32
  requirement: !ruby/object:Gem::Requirement
@@ -82,7 +82,7 @@ description: ! 'buncher implements a variant of the popular k-means clustering a
82
82
 
83
83
  in order to find the best value of K.'
84
84
  email:
85
- - rob@justsoftwareconsulting.com
85
+ - rob@drync.com
86
86
  executables: []
87
87
  extensions:
88
88
  - ext/buncher/extconf.rb
@@ -96,6 +96,7 @@ files:
96
96
  - Manifest.txt
97
97
  - README.txt
98
98
  - Rakefile
99
+ - lib/buncher/buncher.bundle
99
100
  - lib/buncher.rb
100
101
  - test/test_buncher.rb
101
102
  - ext/buncher/extconf.rb