buncher 0.0.2 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Manifest.txt +1 -1
- data/Rakefile +2 -1
- data/ext/buncher/extconf.rb +2 -1
- data/lib/buncher/buncher.bundle +0 -0
- data/lib/buncher.rb +12 -138
- data/test/test_buncher.rb +116 -4
- metadata +6 -5
data/Manifest.txt
CHANGED
data/Rakefile
CHANGED
@@ -4,6 +4,7 @@ require "rubygems"
|
|
4
4
|
require "hoe"
|
5
5
|
|
6
6
|
Hoe.plugin :git
|
7
|
+
Hoe.plugin :debugging # for gdb
|
7
8
|
Hoe.plugin :compiler
|
8
9
|
# Hoe.plugin :gem_prelude_sucks
|
9
10
|
# Hoe.plugin :inline
|
@@ -13,7 +14,7 @@ Hoe.plugin :minitest
|
|
13
14
|
# Hoe.plugin :rdoc
|
14
15
|
|
15
16
|
Hoe.spec "buncher" do
|
16
|
-
developer("Robert Mathews", "rob@
|
17
|
+
developer("Robert Mathews", "rob@drync.com")
|
17
18
|
self.extra_dev_deps
|
18
19
|
|
19
20
|
license "MIT" # this should match the license in the README
|
data/ext/buncher/extconf.rb
CHANGED
Binary file
|
data/lib/buncher.rb
CHANGED
@@ -1,7 +1,6 @@
|
|
1
1
|
require 'buncher/buncher'
|
2
|
-
require 'array'
|
3
2
|
module Buncher
|
4
|
-
VERSION = "0.0.2"
|
3
|
+
VERSION = "1.0.0"
|
5
4
|
# your cluster needs to look like this. Make a bunch of them and pass them in. It's ok to pass in empty elements to start.
|
6
5
|
class Cluster
|
7
6
|
attr_accessor :elements
|
@@ -11,49 +10,15 @@ module Buncher
|
|
11
10
|
self.elements = elements
|
12
11
|
end
|
13
12
|
|
14
|
-
def ndim
|
15
|
-
2
|
16
|
-
end
|
17
|
-
|
18
13
|
def clear
|
19
14
|
elements.clear
|
20
15
|
end
|
21
16
|
|
22
|
-
def
|
23
|
-
|
24
|
-
end
|
25
|
-
|
26
|
-
def calculate_center
|
27
|
-
raise "calculate the center as an average of the elements"
|
28
|
-
end
|
29
|
-
def distance_squared(element)
|
30
|
-
raise "Distance between center and element - Implemented in a subclass"
|
31
|
-
end
|
32
|
-
|
33
|
-
def distortion
|
34
|
-
elements.inject(0) {|acc, ele| distance_squared(ele)}
|
35
|
-
end
|
36
|
-
|
37
|
-
# return the distance to the closest element and remove the element from the list
|
38
|
-
def closest!(elements)
|
39
|
-
min_distance=nil
|
40
|
-
min_index=nil
|
41
|
-
elements.each_with_index do |element, index|
|
42
|
-
distance = distance_squared(element)
|
43
|
-
if(min_distance.nil? || min_distance > distance)
|
44
|
-
min_distance = distance
|
45
|
-
min_index = index
|
46
|
-
end
|
47
|
-
end
|
48
|
-
elements.delete_at(min_index)
|
49
|
-
min_distance
|
17
|
+
def size
|
18
|
+
elements.size
|
50
19
|
end
|
51
20
|
|
52
|
-
|
53
|
-
# some useful math
|
54
|
-
def cdf(z)
|
55
|
-
(0.5 * (1.0 + Math.erf((z*1.0)/1.4142135623730951)))
|
56
|
-
end
|
21
|
+
# distortion - C++ code
|
57
22
|
end
|
58
23
|
|
59
24
|
# split array into several equal sized parts
|
@@ -73,101 +38,9 @@ module Buncher
|
|
73
38
|
groups
|
74
39
|
end
|
75
40
|
|
76
|
-
def self.distance(old_centers, new_clusters)
|
77
|
-
new_clusters.inject(0) {|acc, cluster| acc + cluster.closest!(old_centers)}
|
78
|
-
end
|
79
|
-
|
80
|
-
def self.run(centers, elements, nb_jobs)
|
81
|
-
old_centers=nil
|
82
|
-
count=0
|
83
|
-
while(true) do
|
84
|
-
# clear the centers first
|
85
|
-
centers.map(&:clear)
|
86
|
-
|
87
|
-
# create jobs
|
88
|
-
jobs = []
|
89
|
-
elements_for_jobs = split_array_into_parts(elements, nb_jobs)
|
90
|
-
nb_jobs.times do |i|
|
91
|
-
jobs << Job.new(centers, elements_for_jobs[i])
|
92
|
-
end
|
93
|
-
|
94
|
-
if(nb_jobs > 1)
|
95
|
-
# run jobs in parallel
|
96
|
-
queue = Cabiri::JobQueue.new
|
97
|
-
nb_jobs.times do |i|
|
98
|
-
queue.add(i) { jobs[i].run }
|
99
|
-
end
|
100
|
-
queue.start(nb_jobs)
|
101
|
-
else
|
102
|
-
jobs.map(&:run)
|
103
|
-
end
|
104
|
-
centers.map(&:calculate_center)
|
105
|
-
distance = distance(old_centers,centers) if old_centers
|
106
|
-
# puts "iteration #{count+=1}: distance #{distance}" if old_centers
|
107
|
-
break if old_centers && distance < 0.0001
|
108
|
-
old_centers=centers.map(&:center)
|
109
|
-
end
|
110
|
-
centers
|
111
|
-
end
|
112
|
-
|
113
|
-
# job that will be used for parallelization with Cabiri
|
114
|
-
class Job
|
115
|
-
attr_accessor :centers
|
116
|
-
attr_accessor :elements
|
117
|
-
|
118
|
-
def initialize(centers, elements)
|
119
|
-
@centers = centers
|
120
|
-
@elements = elements
|
121
|
-
end
|
122
|
-
|
123
|
-
def run
|
124
|
-
assignElementsToClosestCenter # this is center, with list of elements
|
125
|
-
@centers
|
126
|
-
end
|
127
|
-
|
128
|
-
def assignElementsToClosestCenter
|
129
|
-
@elements.each do |element|
|
130
|
-
best_center = nil
|
131
|
-
best_distance = nil
|
132
|
-
|
133
|
-
@centers.each do |center|
|
134
|
-
distance = center.distance_squared(element)
|
135
|
-
if best_distance.nil? or distance < best_distance
|
136
|
-
best_center = center
|
137
|
-
best_distance = distance
|
138
|
-
end
|
139
|
-
end
|
140
|
-
best_center.elements << element
|
141
|
-
end
|
142
|
-
end
|
143
|
-
end
|
144
|
-
|
145
|
-
# from the kmeans++ algorithm for choosing centers. returns a list of centers
|
146
|
-
def self.choose_centers(cluster_clazz, elements, number_centers)
|
147
|
-
ele = elements.sample(1).first
|
148
|
-
elements = elements.dup
|
149
|
-
centers = [cluster_clazz.new(ele)]
|
150
|
-
# puts "center 1 is #{centers.first.center}"
|
151
|
-
elements.delete(ele)
|
152
|
-
(2..number_centers).each do |index|
|
153
|
-
probability_distribution=[]
|
154
|
-
sum=elements.inject(0) {|sum, ele| distance=centers.inject(Float::MAX) {|acc,center|[center.distance_squared(ele),acc].min};sum+=distance;probability_distribution <<[distance,ele];sum}
|
155
|
-
dice = rand(0..sum)
|
156
|
-
# puts "dice=#{dice}, sum=#{sum}"
|
157
|
-
# puts "distribution"
|
158
|
-
# probability_distribution.each {|key,val| puts "#{key} : #{val}"}
|
159
|
-
# puts
|
160
|
-
ignore, next_center = probability_distribution.detect {|prob,ele|dice-=prob; dice <=0}
|
161
|
-
# puts "center #{index} is #{next_center}"
|
162
|
-
centers.unshift(cluster_clazz.new(next_center))
|
163
|
-
elements.delete_at(elements.index(next_center))
|
164
|
-
end
|
165
|
-
centers
|
166
|
-
end
|
167
|
-
|
168
41
|
def self.calc_aK(centers, last_aK)
|
169
42
|
if(centers.size == 2)
|
170
|
-
1.0-3.0/(4.0*centers.first.
|
43
|
+
1.0-3.0/(4.0*centers.first.center.size)
|
171
44
|
else
|
172
45
|
last_aK + (1.0 - last_aK) / 6
|
173
46
|
end
|
@@ -187,7 +60,7 @@ module Buncher
|
|
187
60
|
|
188
61
|
# run the clustering algorithm until have calculated the current number of clusters, taken from this paper:
|
189
62
|
# http://papers.nips.cc/paper/2526-learning-the-k-in-k-means.pdf
|
190
|
-
def self.cluster(
|
63
|
+
def self.cluster(elements)
|
191
64
|
changed=true
|
192
65
|
round=0
|
193
66
|
solutions={}
|
@@ -195,16 +68,17 @@ module Buncher
|
|
195
68
|
# that.
|
196
69
|
not_clustered = last_sK = last_aK =last_fK=nil
|
197
70
|
max_clusters=[1,elements.size/4].max
|
198
|
-
(
|
199
|
-
initial_centers = choose_centers(
|
71
|
+
(1..max_clusters).each do |number_clusters|
|
72
|
+
initial_centers = choose_centers(elements, number_clusters) # C++ Native code
|
200
73
|
centers = initial_centers.map(&:dup)
|
201
|
-
centers =
|
74
|
+
centers = kmeans(centers,elements) ## C++ Native code
|
202
75
|
yield(elements, centers, initial_centers) if block_given?
|
203
76
|
not_clustered ||=centers
|
204
77
|
last_fK, last_sK, last_aK = fK(centers,last_sK, last_aK)
|
205
|
-
puts "#{number_clusters}: fK() = #{last_fK}, last_sK=#{last_sK} last_aK=#{last_aK} "
|
78
|
+
puts "summary #{number_clusters}: fK() = #{last_fK}, last_sK=#{last_sK} last_aK=#{last_aK} "
|
79
|
+
puts
|
206
80
|
solutions[last_fK]=centers
|
207
|
-
# break if number_clusters ==
|
81
|
+
# break if number_clusters == 2 ## debugging
|
208
82
|
end
|
209
83
|
min_fK =solutions.keys.sort.first
|
210
84
|
if min_fK > 0.85
|
data/test/test_buncher.rb
CHANGED
@@ -1,9 +1,121 @@
|
|
1
|
-
gem "minitest"
|
2
|
-
require "minitest/autorun"
|
3
1
|
require "buncher"
|
2
|
+
require 'rubystats'
|
3
|
+
require 'gnuplot'
|
4
|
+
|
5
|
+
def dump(centers)
|
6
|
+
puts "centers are"
|
7
|
+
centers.each {|ccc| puts "center #{ccc.center.inspect} #{ccc.elements[0]}, #{ccc.elements[1]}"}
|
8
|
+
end
|
9
|
+
|
10
|
+
def init_data(number_points, number_clusters)
|
11
|
+
points_per_cluster = number_points.to_f / number_clusters
|
12
|
+
elements=[]
|
13
|
+
extra=0
|
14
|
+
index=0
|
15
|
+
seed=[[0,1],[0,0],[1,0]]
|
16
|
+
number_clusters.times do
|
17
|
+
# gens = [Rubystats::NormalDistribution.new(rand(0..1), 0.05), Rubystats::NormalDistribution.new(rand(0..1), 0.05)]
|
18
|
+
gens = [Rubystats::NormalDistribution.new(seed[index][0], 0.05), Rubystats::NormalDistribution.new(seed[index][1], 0.05)]
|
19
|
+
index+=1
|
20
|
+
extra+= points_per_cluster - points_per_cluster.floor
|
21
|
+
points = gens.map {|gen|gen.rng(points_per_cluster.floor+extra.floor)}
|
22
|
+
points.first.each_index {|iii|elements << [points[0][iii],points[1][iii]]}
|
23
|
+
extra-=1.0 if(extra > 1)
|
24
|
+
end
|
25
|
+
elements
|
26
|
+
end
|
27
|
+
|
28
|
+
def plot(file_name,points,centers, initial_centers=nil)
|
29
|
+
Gnuplot.open do |gp|
|
30
|
+
Gnuplot::Plot.new( gp ) do |plot|
|
31
|
+
|
32
|
+
plot.title "Cluster Plot (#{centers.size})"
|
33
|
+
plot.xlabel "x"
|
34
|
+
plot.ylabel "y"
|
35
|
+
plot.terminal "gif"
|
36
|
+
plot.output file_name
|
37
|
+
|
38
|
+
x = points.map(&:first)
|
39
|
+
y = points.map(&:last)
|
40
|
+
|
41
|
+
plot.data << Gnuplot::DataSet.new( [x, y] ) do |ds|
|
42
|
+
ds.with = "points"
|
43
|
+
ds.notitle
|
44
|
+
end
|
45
|
+
|
46
|
+
unless initial_centers.nil?
|
47
|
+
x = initial_centers.map(&:center).map(&:first)
|
48
|
+
y = initial_centers.map(&:center).map(&:last)
|
49
|
+
plot.data << Gnuplot::DataSet.new( [x, y] ) do |ds|
|
50
|
+
ds.with = "points"
|
51
|
+
ds.title="initial centers"
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
x = centers.map(&:center).map(&:first)
|
56
|
+
y = centers.map(&:center).map(&:last)
|
57
|
+
plot.data << Gnuplot::DataSet.new( [x, y] ) do |ds|
|
58
|
+
ds.with = "points"
|
59
|
+
ds.title="centers"
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
63
|
+
file_name
|
64
|
+
end
|
4
65
|
|
5
66
|
class TestBuncher < Minitest::Test
|
6
|
-
|
7
|
-
|
67
|
+
WORKING=<<-'EOS'
|
68
|
+
def test_should_find_the_one_center
|
69
|
+
elements = [[1,1]]
|
70
|
+
starting_centers = elements.sample(1).map {|aaa| Buncher::Cluster.new(aaa)}
|
71
|
+
new_centers = Buncher::kmeans(starting_centers, elements)
|
72
|
+
dump(new_centers)
|
73
|
+
assert_in_delta(new_centers.first.center[0],1.0,0.01)
|
74
|
+
assert_in_delta(new_centers.first.center[1],1.0,0.01)
|
75
|
+
end
|
76
|
+
|
77
|
+
def test_choose_centers_wrapper
|
78
|
+
elements = [[1,1]]
|
79
|
+
new_centers = Buncher::choose_centers(elements, 1)
|
80
|
+
dump(new_centers)
|
81
|
+
assert_in_delta(new_centers.first.center[1],1.0,0.01)
|
82
|
+
assert_in_delta(new_centers.first.center[1],1.0,0.01)
|
83
|
+
end
|
84
|
+
def test_choose_centers_wrapper2
|
85
|
+
elements = init_data(100,3)
|
86
|
+
new_centers = Buncher::choose_centers(elements, 3)
|
87
|
+
dump(new_centers)
|
88
|
+
assert_equal(new_centers.size,3)
|
89
|
+
end
|
90
|
+
def test_should_find_one_cluster
|
91
|
+
elements = [[1,1]]
|
92
|
+
new_centers = Buncher::cluster(elements)
|
93
|
+
dump(new_centers)
|
94
|
+
assert_equal(new_centers.size,1)
|
95
|
+
end
|
96
|
+
def test_choose_centers_wrapper2
|
97
|
+
elements = init_data(100,3)
|
98
|
+
new_centers = Buncher::choose_centers(elements, 3)
|
99
|
+
plot("/tmp/kmeans_initial_#{new_centers.size}.png",elements, new_centers)
|
100
|
+
dump(new_centers)
|
101
|
+
`open /tmp/kmeans_initial_#{new_centers.size}.png`
|
102
|
+
assert_equal(new_centers.size,3)
|
103
|
+
end
|
104
|
+
EOS
|
105
|
+
|
106
|
+
def test_gaussian_distribution_of_100_points_in_3_clusters
|
107
|
+
12.times do |run|
|
108
|
+
# srand(843284148793854177950180651080082381)
|
109
|
+
elements = init_data(100,3)
|
110
|
+
# elements.each {|eee| puts "#{eee[0]},#{eee[1]}"}
|
111
|
+
# new_centers = Buncher::cluster(elements) {|elements,centers, initial_centers| puts "run #{run} setup";plot("/tmp/#{run}_centers_#{centers.size}.png",elements,initial_centers)}
|
112
|
+
new_centers = Buncher::cluster(elements) {|elements,centers, initial_centers|
|
113
|
+
plot("/tmp/#{run}_centers_#{centers.size}.png",elements,centers, initial_centers)
|
114
|
+
}
|
115
|
+
puts "run #{run}: k is #{new_centers.size}, seed was #{srand}"
|
116
|
+
puts "ERROR "*4 if new_centers.size != 3
|
117
|
+
puts
|
118
|
+
assert_equal(3,new_centers.size)
|
119
|
+
end
|
8
120
|
end
|
9
121
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: buncher
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 1.0.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2015-04-
|
12
|
+
date: 2015-04-24 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: minitest
|
@@ -18,7 +18,7 @@ dependencies:
|
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
20
20
|
- !ruby/object:Gem::Version
|
21
|
-
version: '5.
|
21
|
+
version: '5.6'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
24
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -26,7 +26,7 @@ dependencies:
|
|
26
26
|
requirements:
|
27
27
|
- - ~>
|
28
28
|
- !ruby/object:Gem::Version
|
29
|
-
version: '5.
|
29
|
+
version: '5.6'
|
30
30
|
- !ruby/object:Gem::Dependency
|
31
31
|
name: rdoc
|
32
32
|
requirement: !ruby/object:Gem::Requirement
|
@@ -82,7 +82,7 @@ description: ! 'buncher implements a variant of the popular k-means clustering a
|
|
82
82
|
|
83
83
|
in order to find the best value of K.'
|
84
84
|
email:
|
85
|
-
- rob@
|
85
|
+
- rob@drync.com
|
86
86
|
executables: []
|
87
87
|
extensions:
|
88
88
|
- ext/buncher/extconf.rb
|
@@ -96,6 +96,7 @@ files:
|
|
96
96
|
- Manifest.txt
|
97
97
|
- README.txt
|
98
98
|
- Rakefile
|
99
|
+
- lib/buncher/buncher.bundle
|
99
100
|
- lib/buncher.rb
|
100
101
|
- test/test_buncher.rb
|
101
102
|
- ext/buncher/extconf.rb
|