buncher 0.0.2 → 1.0.0

This diff shows the changes between publicly available versions of a package as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
data/Manifest.txt CHANGED
@@ -3,6 +3,6 @@ History.txt
3
3
  Manifest.txt
4
4
  README.txt
5
5
  Rakefile
6
- bin/buncher
6
+ lib/buncher/buncher.bundle
7
7
  lib/buncher.rb
8
8
  test/test_buncher.rb
data/Rakefile CHANGED
@@ -4,6 +4,7 @@ require "rubygems"
4
4
  require "hoe"
5
5
 
6
6
  Hoe.plugin :git
7
+ Hoe.plugin :debugging # for gdb
7
8
  Hoe.plugin :compiler
8
9
  # Hoe.plugin :gem_prelude_sucks
9
10
  # Hoe.plugin :inline
@@ -13,7 +14,7 @@ Hoe.plugin :minitest
13
14
  # Hoe.plugin :rdoc
14
15
 
15
16
  Hoe.spec "buncher" do
16
- developer("Robert Mathews", "rob@justsoftwareconsulting.com")
17
+ developer("Robert Mathews", "rob@drync.com")
17
18
  self.extra_dev_deps
18
19
 
19
20
  license "MIT" # this should match the license in the README
@@ -6,6 +6,7 @@ extension_name = 'buncher'
6
6
 
7
7
  # The destination
8
8
  dir_config(extension_name)
9
-
9
+ # add some standard template libraries (headers only)
10
+ dir_config('boost','ext/boost/1.57.0')
10
11
  # Do the work
11
12
  create_makefile(extension_name)
Binary file
data/lib/buncher.rb CHANGED
@@ -1,7 +1,6 @@
1
1
  require 'buncher/buncher'
2
- require 'array'
3
2
  module Buncher
4
- VERSION = "0.0.2"
3
+ VERSION = "1.0.0"
5
4
  # your cluster needs to look like this. Make a bunch of them and pass them in. It's ok to pass in empty elements to start.
6
5
  class Cluster
7
6
  attr_accessor :elements
@@ -11,49 +10,15 @@ module Buncher
11
10
  self.elements = elements
12
11
  end
13
12
 
14
- def ndim
15
- 2
16
- end
17
-
18
13
  def clear
19
14
  elements.clear
20
15
  end
21
16
 
22
- def sum
23
- raise "calculate the sum of the elements"
24
- end
25
-
26
- def calculate_center
27
- raise "calculate the center as an average of the elements"
28
- end
29
- def distance_squared(element)
30
- raise "Distance between center and element - Implemented in a subclass"
31
- end
32
-
33
- def distortion
34
- elements.inject(0) {|acc, ele| distance_squared(ele)}
35
- end
36
-
37
- # return the distance to the closest element and remove the element from the list
38
- def closest!(elements)
39
- min_distance=nil
40
- min_index=nil
41
- elements.each_with_index do |element, index|
42
- distance = distance_squared(element)
43
- if(min_distance.nil? || min_distance > distance)
44
- min_distance = distance
45
- min_index = index
46
- end
47
- end
48
- elements.delete_at(min_index)
49
- min_distance
17
+ def size
18
+ elements.size
50
19
  end
51
20
 
52
-
53
- # some useful math
54
- def cdf(z)
55
- (0.5 * (1.0 + Math.erf((z*1.0)/1.4142135623730951)))
56
- end
21
+ # distortion - C++ code
57
22
  end
58
23
 
59
24
  # split array into several equal sized parts
@@ -73,101 +38,9 @@ module Buncher
73
38
  groups
74
39
  end
75
40
 
76
- def self.distance(old_centers, new_clusters)
77
- new_clusters.inject(0) {|acc, cluster| acc + cluster.closest!(old_centers)}
78
- end
79
-
80
- def self.run(centers, elements, nb_jobs)
81
- old_centers=nil
82
- count=0
83
- while(true) do
84
- # clear the centers first
85
- centers.map(&:clear)
86
-
87
- # create jobs
88
- jobs = []
89
- elements_for_jobs = split_array_into_parts(elements, nb_jobs)
90
- nb_jobs.times do |i|
91
- jobs << Job.new(centers, elements_for_jobs[i])
92
- end
93
-
94
- if(nb_jobs > 1)
95
- # run jobs in parallel
96
- queue = Cabiri::JobQueue.new
97
- nb_jobs.times do |i|
98
- queue.add(i) { jobs[i].run }
99
- end
100
- queue.start(nb_jobs)
101
- else
102
- jobs.map(&:run)
103
- end
104
- centers.map(&:calculate_center)
105
- distance = distance(old_centers,centers) if old_centers
106
- # puts "iteration #{count+=1}: distance #{distance}" if old_centers
107
- break if old_centers && distance < 0.0001
108
- old_centers=centers.map(&:center)
109
- end
110
- centers
111
- end
112
-
113
- # job that will be used for parallelization with Cabiri
114
- class Job
115
- attr_accessor :centers
116
- attr_accessor :elements
117
-
118
- def initialize(centers, elements)
119
- @centers = centers
120
- @elements = elements
121
- end
122
-
123
- def run
124
- assignElementsToClosestCenter # this is center, with list of elements
125
- @centers
126
- end
127
-
128
- def assignElementsToClosestCenter
129
- @elements.each do |element|
130
- best_center = nil
131
- best_distance = nil
132
-
133
- @centers.each do |center|
134
- distance = center.distance_squared(element)
135
- if best_distance.nil? or distance < best_distance
136
- best_center = center
137
- best_distance = distance
138
- end
139
- end
140
- best_center.elements << element
141
- end
142
- end
143
- end
144
-
145
- # from the kmeans++ algorithm for choosing centers. returns a list of centers
146
- def self.choose_centers(cluster_clazz, elements, number_centers)
147
- ele = elements.sample(1).first
148
- elements = elements.dup
149
- centers = [cluster_clazz.new(ele)]
150
- # puts "center 1 is #{centers.first.center}"
151
- elements.delete(ele)
152
- (2..number_centers).each do |index|
153
- probability_distribution=[]
154
- sum=elements.inject(0) {|sum, ele| distance=centers.inject(Float::MAX) {|acc,center|[center.distance_squared(ele),acc].min};sum+=distance;probability_distribution <<[distance,ele];sum}
155
- dice = rand(0..sum)
156
- # puts "dice=#{dice}, sum=#{sum}"
157
- # puts "distribution"
158
- # probability_distribution.each {|key,val| puts "#{key} : #{val}"}
159
- # puts
160
- ignore, next_center = probability_distribution.detect {|prob,ele|dice-=prob; dice <=0}
161
- # puts "center #{index} is #{next_center}"
162
- centers.unshift(cluster_clazz.new(next_center))
163
- elements.delete_at(elements.index(next_center))
164
- end
165
- centers
166
- end
167
-
168
41
  def self.calc_aK(centers, last_aK)
169
42
  if(centers.size == 2)
170
- 1.0-3.0/(4.0*centers.first.ndim)
43
+ 1.0-3.0/(4.0*centers.first.center.size)
171
44
  else
172
45
  last_aK + (1.0 - last_aK) / 6
173
46
  end
@@ -187,7 +60,7 @@ module Buncher
187
60
 
188
61
  # run the clustering algorithm until have calculated the current number of clusters, taken from this paper:
189
62
  # http://papers.nips.cc/paper/2526-learning-the-k-in-k-means.pdf
190
- def self.cluster(start,cluster_clazz,elements,threads=1)
63
+ def self.cluster(elements)
191
64
  changed=true
192
65
  round=0
193
66
  solutions={}
@@ -195,16 +68,17 @@ module Buncher
195
68
  # that.
196
69
  not_clustered = last_sK = last_aK =last_fK=nil
197
70
  max_clusters=[1,elements.size/4].max
198
- (start..max_clusters).each do |number_clusters|
199
- initial_centers = choose_centers(cluster_clazz, elements, number_clusters)
71
+ (1..max_clusters).each do |number_clusters|
72
+ initial_centers = choose_centers(elements, number_clusters) # C++ Native code
200
73
  centers = initial_centers.map(&:dup)
201
- centers = run(centers,elements,threads)
74
+ centers = kmeans(centers,elements) ## C++ Native code
202
75
  yield(elements, centers, initial_centers) if block_given?
203
76
  not_clustered ||=centers
204
77
  last_fK, last_sK, last_aK = fK(centers,last_sK, last_aK)
205
- puts "#{number_clusters}: fK() = #{last_fK}, last_sK=#{last_sK} last_aK=#{last_aK} "
78
+ puts "summary #{number_clusters}: fK() = #{last_fK}, last_sK=#{last_sK} last_aK=#{last_aK} "
79
+ puts
206
80
  solutions[last_fK]=centers
207
- # break if number_clusters == 3 ## debugging
81
+ # break if number_clusters == 2 ## debugging
208
82
  end
209
83
  min_fK =solutions.keys.sort.first
210
84
  if min_fK > 0.85
data/test/test_buncher.rb CHANGED
@@ -1,9 +1,121 @@
1
- gem "minitest"
2
- require "minitest/autorun"
3
1
  require "buncher"
2
+ require 'rubystats'
3
+ require 'gnuplot'
4
+
5
+ def dump(centers)
6
+ puts "centers are"
7
+ centers.each {|ccc| puts "center #{ccc.center.inspect} #{ccc.elements[0]}, #{ccc.elements[1]}"}
8
+ end
9
+
10
+ def init_data(number_points, number_clusters)
11
+ points_per_cluster = number_points.to_f / number_clusters
12
+ elements=[]
13
+ extra=0
14
+ index=0
15
+ seed=[[0,1],[0,0],[1,0]]
16
+ number_clusters.times do
17
+ # gens = [Rubystats::NormalDistribution.new(rand(0..1), 0.05), Rubystats::NormalDistribution.new(rand(0..1), 0.05)]
18
+ gens = [Rubystats::NormalDistribution.new(seed[index][0], 0.05), Rubystats::NormalDistribution.new(seed[index][1], 0.05)]
19
+ index+=1
20
+ extra+= points_per_cluster - points_per_cluster.floor
21
+ points = gens.map {|gen|gen.rng(points_per_cluster.floor+extra.floor)}
22
+ points.first.each_index {|iii|elements << [points[0][iii],points[1][iii]]}
23
+ extra-=1.0 if(extra > 1)
24
+ end
25
+ elements
26
+ end
27
+
28
+ def plot(file_name,points,centers, initial_centers=nil)
29
+ Gnuplot.open do |gp|
30
+ Gnuplot::Plot.new( gp ) do |plot|
31
+
32
+ plot.title "Cluster Plot (#{centers.size})"
33
+ plot.xlabel "x"
34
+ plot.ylabel "y"
35
+ plot.terminal "gif"
36
+ plot.output file_name
37
+
38
+ x = points.map(&:first)
39
+ y = points.map(&:last)
40
+
41
+ plot.data << Gnuplot::DataSet.new( [x, y] ) do |ds|
42
+ ds.with = "points"
43
+ ds.notitle
44
+ end
45
+
46
+ unless initial_centers.nil?
47
+ x = initial_centers.map(&:center).map(&:first)
48
+ y = initial_centers.map(&:center).map(&:last)
49
+ plot.data << Gnuplot::DataSet.new( [x, y] ) do |ds|
50
+ ds.with = "points"
51
+ ds.title="initial centers"
52
+ end
53
+ end
54
+
55
+ x = centers.map(&:center).map(&:first)
56
+ y = centers.map(&:center).map(&:last)
57
+ plot.data << Gnuplot::DataSet.new( [x, y] ) do |ds|
58
+ ds.with = "points"
59
+ ds.title="centers"
60
+ end
61
+ end
62
+ end
63
+ file_name
64
+ end
4
65
 
5
66
  class TestBuncher < Minitest::Test
6
- def test_sanity
7
- assert_equal 'hello world', Buncher.hello_world
67
+ WORKING=<<-'EOS'
68
+ def test_should_find_the_one_center
69
+ elements = [[1,1]]
70
+ starting_centers = elements.sample(1).map {|aaa| Buncher::Cluster.new(aaa)}
71
+ new_centers = Buncher::kmeans(starting_centers, elements)
72
+ dump(new_centers)
73
+ assert_in_delta(new_centers.first.center[0],1.0,0.01)
74
+ assert_in_delta(new_centers.first.center[1],1.0,0.01)
75
+ end
76
+
77
+ def test_choose_centers_wrapper
78
+ elements = [[1,1]]
79
+ new_centers = Buncher::choose_centers(elements, 1)
80
+ dump(new_centers)
81
+ assert_in_delta(new_centers.first.center[1],1.0,0.01)
82
+ assert_in_delta(new_centers.first.center[1],1.0,0.01)
83
+ end
84
+ def test_choose_centers_wrapper2
85
+ elements = init_data(100,3)
86
+ new_centers = Buncher::choose_centers(elements, 3)
87
+ dump(new_centers)
88
+ assert_equal(new_centers.size,3)
89
+ end
90
+ def test_should_find_one_cluster
91
+ elements = [[1,1]]
92
+ new_centers = Buncher::cluster(elements)
93
+ dump(new_centers)
94
+ assert_equal(new_centers.size,1)
95
+ end
96
+ def test_choose_centers_wrapper2
97
+ elements = init_data(100,3)
98
+ new_centers = Buncher::choose_centers(elements, 3)
99
+ plot("/tmp/kmeans_initial_#{new_centers.size}.png",elements, new_centers)
100
+ dump(new_centers)
101
+ `open /tmp/kmeans_initial_#{new_centers.size}.png`
102
+ assert_equal(new_centers.size,3)
103
+ end
104
+ EOS
105
+
106
+ def test_gaussian_distribution_of_100_points_in_3_clusters
107
+ 12.times do |run|
108
+ # srand(843284148793854177950180651080082381)
109
+ elements = init_data(100,3)
110
+ # elements.each {|eee| puts "#{eee[0]},#{eee[1]}"}
111
+ # new_centers = Buncher::cluster(elements) {|elements,centers, initial_centers| puts "run #{run} setup";plot("/tmp/#{run}_centers_#{centers.size}.png",elements,initial_centers)}
112
+ new_centers = Buncher::cluster(elements) {|elements,centers, initial_centers|
113
+ plot("/tmp/#{run}_centers_#{centers.size}.png",elements,centers, initial_centers)
114
+ }
115
+ puts "run #{run}: k is #{new_centers.size}, seed was #{srand}"
116
+ puts "ERROR "*4 if new_centers.size != 3
117
+ puts
118
+ assert_equal(3,new_centers.size)
119
+ end
8
120
  end
9
121
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: buncher
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 1.0.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2015-04-19 00:00:00.000000000 Z
12
+ date: 2015-04-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: minitest
@@ -18,7 +18,7 @@ dependencies:
18
18
  requirements:
19
19
  - - ~>
20
20
  - !ruby/object:Gem::Version
21
- version: '5.4'
21
+ version: '5.6'
22
22
  type: :development
23
23
  prerelease: false
24
24
  version_requirements: !ruby/object:Gem::Requirement
@@ -26,7 +26,7 @@ dependencies:
26
26
  requirements:
27
27
  - - ~>
28
28
  - !ruby/object:Gem::Version
29
- version: '5.4'
29
+ version: '5.6'
30
30
  - !ruby/object:Gem::Dependency
31
31
  name: rdoc
32
32
  requirement: !ruby/object:Gem::Requirement
@@ -82,7 +82,7 @@ description: ! 'buncher implements a variant of the popular k-means clustering a
82
82
 
83
83
  in order to find the best value of K.'
84
84
  email:
85
- - rob@justsoftwareconsulting.com
85
+ - rob@drync.com
86
86
  executables: []
87
87
  extensions:
88
88
  - ext/buncher/extconf.rb
@@ -96,6 +96,7 @@ files:
96
96
  - Manifest.txt
97
97
  - README.txt
98
98
  - Rakefile
99
+ - lib/buncher/buncher.bundle
99
100
  - lib/buncher.rb
100
101
  - test/test_buncher.rb
101
102
  - ext/buncher/extconf.rb