buncher 0.0.2 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Manifest.txt +1 -1
- data/Rakefile +2 -1
- data/ext/buncher/extconf.rb +2 -1
- data/lib/buncher/buncher.bundle +0 -0
- data/lib/buncher.rb +12 -138
- data/test/test_buncher.rb +116 -4
- metadata +6 -5
data/Manifest.txt
CHANGED
data/Rakefile
CHANGED
@@ -4,6 +4,7 @@ require "rubygems"
|
|
4
4
|
require "hoe"
|
5
5
|
|
6
6
|
Hoe.plugin :git
|
7
|
+
Hoe.plugin :debugging # for gdb
|
7
8
|
Hoe.plugin :compiler
|
8
9
|
# Hoe.plugin :gem_prelude_sucks
|
9
10
|
# Hoe.plugin :inline
|
@@ -13,7 +14,7 @@ Hoe.plugin :minitest
|
|
13
14
|
# Hoe.plugin :rdoc
|
14
15
|
|
15
16
|
Hoe.spec "buncher" do
|
16
|
-
developer("Robert Mathews", "rob@
|
17
|
+
developer("Robert Mathews", "rob@drync.com")
|
17
18
|
self.extra_dev_deps
|
18
19
|
|
19
20
|
license "MIT" # this should match the license in the README
|
data/ext/buncher/extconf.rb
CHANGED
Binary file
|
data/lib/buncher.rb
CHANGED
@@ -1,7 +1,6 @@
|
|
1
1
|
require 'buncher/buncher'
|
2
|
-
require 'array'
|
3
2
|
module Buncher
|
4
|
-
VERSION = "0.0.2"
|
3
|
+
VERSION = "1.0.0"
|
5
4
|
# your cluster needs to look like this. Make a bunch of them and pass them in. It's ok to pass in empty elements to start.
|
6
5
|
class Cluster
|
7
6
|
attr_accessor :elements
|
@@ -11,49 +10,15 @@ module Buncher
|
|
11
10
|
self.elements = elements
|
12
11
|
end
|
13
12
|
|
14
|
-
def ndim
|
15
|
-
2
|
16
|
-
end
|
17
|
-
|
18
13
|
def clear
|
19
14
|
elements.clear
|
20
15
|
end
|
21
16
|
|
22
|
-
def
|
23
|
-
|
24
|
-
end
|
25
|
-
|
26
|
-
def calculate_center
|
27
|
-
raise "calculate the center as an average of the elements"
|
28
|
-
end
|
29
|
-
def distance_squared(element)
|
30
|
-
raise "Distance between center and element - Implemented in a subclass"
|
31
|
-
end
|
32
|
-
|
33
|
-
def distortion
|
34
|
-
elements.inject(0) {|acc, ele| distance_squared(ele)}
|
35
|
-
end
|
36
|
-
|
37
|
-
# return the distance to the closest element and remove the element from the list
|
38
|
-
def closest!(elements)
|
39
|
-
min_distance=nil
|
40
|
-
min_index=nil
|
41
|
-
elements.each_with_index do |element, index|
|
42
|
-
distance = distance_squared(element)
|
43
|
-
if(min_distance.nil? || min_distance > distance)
|
44
|
-
min_distance = distance
|
45
|
-
min_index = index
|
46
|
-
end
|
47
|
-
end
|
48
|
-
elements.delete_at(min_index)
|
49
|
-
min_distance
|
17
|
+
def size
|
18
|
+
elements.size
|
50
19
|
end
|
51
20
|
|
52
|
-
|
53
|
-
# some useful math
|
54
|
-
def cdf(z)
|
55
|
-
(0.5 * (1.0 + Math.erf((z*1.0)/1.4142135623730951)))
|
56
|
-
end
|
21
|
+
# distortion - C++ code
|
57
22
|
end
|
58
23
|
|
59
24
|
# split array into several equal sized parts
|
@@ -73,101 +38,9 @@ module Buncher
|
|
73
38
|
groups
|
74
39
|
end
|
75
40
|
|
76
|
-
def self.distance(old_centers, new_clusters)
|
77
|
-
new_clusters.inject(0) {|acc, cluster| acc + cluster.closest!(old_centers)}
|
78
|
-
end
|
79
|
-
|
80
|
-
def self.run(centers, elements, nb_jobs)
|
81
|
-
old_centers=nil
|
82
|
-
count=0
|
83
|
-
while(true) do
|
84
|
-
# clear the centers first
|
85
|
-
centers.map(&:clear)
|
86
|
-
|
87
|
-
# create jobs
|
88
|
-
jobs = []
|
89
|
-
elements_for_jobs = split_array_into_parts(elements, nb_jobs)
|
90
|
-
nb_jobs.times do |i|
|
91
|
-
jobs << Job.new(centers, elements_for_jobs[i])
|
92
|
-
end
|
93
|
-
|
94
|
-
if(nb_jobs > 1)
|
95
|
-
# run jobs in parallel
|
96
|
-
queue = Cabiri::JobQueue.new
|
97
|
-
nb_jobs.times do |i|
|
98
|
-
queue.add(i) { jobs[i].run }
|
99
|
-
end
|
100
|
-
queue.start(nb_jobs)
|
101
|
-
else
|
102
|
-
jobs.map(&:run)
|
103
|
-
end
|
104
|
-
centers.map(&:calculate_center)
|
105
|
-
distance = distance(old_centers,centers) if old_centers
|
106
|
-
# puts "iteration #{count+=1}: distance #{distance}" if old_centers
|
107
|
-
break if old_centers && distance < 0.0001
|
108
|
-
old_centers=centers.map(&:center)
|
109
|
-
end
|
110
|
-
centers
|
111
|
-
end
|
112
|
-
|
113
|
-
# job that will be used for parallelization with Cabiri
|
114
|
-
class Job
|
115
|
-
attr_accessor :centers
|
116
|
-
attr_accessor :elements
|
117
|
-
|
118
|
-
def initialize(centers, elements)
|
119
|
-
@centers = centers
|
120
|
-
@elements = elements
|
121
|
-
end
|
122
|
-
|
123
|
-
def run
|
124
|
-
assignElementsToClosestCenter # this is center, with list of elements
|
125
|
-
@centers
|
126
|
-
end
|
127
|
-
|
128
|
-
def assignElementsToClosestCenter
|
129
|
-
@elements.each do |element|
|
130
|
-
best_center = nil
|
131
|
-
best_distance = nil
|
132
|
-
|
133
|
-
@centers.each do |center|
|
134
|
-
distance = center.distance_squared(element)
|
135
|
-
if best_distance.nil? or distance < best_distance
|
136
|
-
best_center = center
|
137
|
-
best_distance = distance
|
138
|
-
end
|
139
|
-
end
|
140
|
-
best_center.elements << element
|
141
|
-
end
|
142
|
-
end
|
143
|
-
end
|
144
|
-
|
145
|
-
# from the kmeans++ algorithm for choosing centers. returns a list of centers
|
146
|
-
def self.choose_centers(cluster_clazz, elements, number_centers)
|
147
|
-
ele = elements.sample(1).first
|
148
|
-
elements = elements.dup
|
149
|
-
centers = [cluster_clazz.new(ele)]
|
150
|
-
# puts "center 1 is #{centers.first.center}"
|
151
|
-
elements.delete(ele)
|
152
|
-
(2..number_centers).each do |index|
|
153
|
-
probability_distribution=[]
|
154
|
-
sum=elements.inject(0) {|sum, ele| distance=centers.inject(Float::MAX) {|acc,center|[center.distance_squared(ele),acc].min};sum+=distance;probability_distribution <<[distance,ele];sum}
|
155
|
-
dice = rand(0..sum)
|
156
|
-
# puts "dice=#{dice}, sum=#{sum}"
|
157
|
-
# puts "distribution"
|
158
|
-
# probability_distribution.each {|key,val| puts "#{key} : #{val}"}
|
159
|
-
# puts
|
160
|
-
ignore, next_center = probability_distribution.detect {|prob,ele|dice-=prob; dice <=0}
|
161
|
-
# puts "center #{index} is #{next_center}"
|
162
|
-
centers.unshift(cluster_clazz.new(next_center))
|
163
|
-
elements.delete_at(elements.index(next_center))
|
164
|
-
end
|
165
|
-
centers
|
166
|
-
end
|
167
|
-
|
168
41
|
def self.calc_aK(centers, last_aK)
|
169
42
|
if(centers.size == 2)
|
170
|
-
1.0-3.0/(4.0*centers.first.
|
43
|
+
1.0-3.0/(4.0*centers.first.center.size)
|
171
44
|
else
|
172
45
|
last_aK + (1.0 - last_aK) / 6
|
173
46
|
end
|
@@ -187,7 +60,7 @@ module Buncher
|
|
187
60
|
|
188
61
|
# run the clustering algorithm until have calculated the current number of clusters, taken from this paper:
|
189
62
|
# http://papers.nips.cc/paper/2526-learning-the-k-in-k-means.pdf
|
190
|
-
def self.cluster(
|
63
|
+
def self.cluster(elements)
|
191
64
|
changed=true
|
192
65
|
round=0
|
193
66
|
solutions={}
|
@@ -195,16 +68,17 @@ module Buncher
|
|
195
68
|
# that.
|
196
69
|
not_clustered = last_sK = last_aK =last_fK=nil
|
197
70
|
max_clusters=[1,elements.size/4].max
|
198
|
-
(
|
199
|
-
initial_centers = choose_centers(
|
71
|
+
(1..max_clusters).each do |number_clusters|
|
72
|
+
initial_centers = choose_centers(elements, number_clusters) # C++ Native code
|
200
73
|
centers = initial_centers.map(&:dup)
|
201
|
-
centers =
|
74
|
+
centers = kmeans(centers,elements) ## C++ Native code
|
202
75
|
yield(elements, centers, initial_centers) if block_given?
|
203
76
|
not_clustered ||=centers
|
204
77
|
last_fK, last_sK, last_aK = fK(centers,last_sK, last_aK)
|
205
|
-
puts "#{number_clusters}: fK() = #{last_fK}, last_sK=#{last_sK} last_aK=#{last_aK} "
|
78
|
+
puts "summary #{number_clusters}: fK() = #{last_fK}, last_sK=#{last_sK} last_aK=#{last_aK} "
|
79
|
+
puts
|
206
80
|
solutions[last_fK]=centers
|
207
|
-
# break if number_clusters ==
|
81
|
+
# break if number_clusters == 2 ## debugging
|
208
82
|
end
|
209
83
|
min_fK =solutions.keys.sort.first
|
210
84
|
if min_fK > 0.85
|
data/test/test_buncher.rb
CHANGED
@@ -1,9 +1,121 @@
|
|
1
|
-
gem "minitest"
|
2
|
-
require "minitest/autorun"
|
3
1
|
require "buncher"
|
2
|
+
require 'rubystats'
|
3
|
+
require 'gnuplot'
|
4
|
+
|
5
|
+
def dump(centers)
|
6
|
+
puts "centers are"
|
7
|
+
centers.each {|ccc| puts "center #{ccc.center.inspect} #{ccc.elements[0]}, #{ccc.elements[1]}"}
|
8
|
+
end
|
9
|
+
|
10
|
+
def init_data(number_points, number_clusters)
|
11
|
+
points_per_cluster = number_points.to_f / number_clusters
|
12
|
+
elements=[]
|
13
|
+
extra=0
|
14
|
+
index=0
|
15
|
+
seed=[[0,1],[0,0],[1,0]]
|
16
|
+
number_clusters.times do
|
17
|
+
# gens = [Rubystats::NormalDistribution.new(rand(0..1), 0.05), Rubystats::NormalDistribution.new(rand(0..1), 0.05)]
|
18
|
+
gens = [Rubystats::NormalDistribution.new(seed[index][0], 0.05), Rubystats::NormalDistribution.new(seed[index][1], 0.05)]
|
19
|
+
index+=1
|
20
|
+
extra+= points_per_cluster - points_per_cluster.floor
|
21
|
+
points = gens.map {|gen|gen.rng(points_per_cluster.floor+extra.floor)}
|
22
|
+
points.first.each_index {|iii|elements << [points[0][iii],points[1][iii]]}
|
23
|
+
extra-=1.0 if(extra > 1)
|
24
|
+
end
|
25
|
+
elements
|
26
|
+
end
|
27
|
+
|
28
|
+
def plot(file_name,points,centers, initial_centers=nil)
|
29
|
+
Gnuplot.open do |gp|
|
30
|
+
Gnuplot::Plot.new( gp ) do |plot|
|
31
|
+
|
32
|
+
plot.title "Cluster Plot (#{centers.size})"
|
33
|
+
plot.xlabel "x"
|
34
|
+
plot.ylabel "y"
|
35
|
+
plot.terminal "gif"
|
36
|
+
plot.output file_name
|
37
|
+
|
38
|
+
x = points.map(&:first)
|
39
|
+
y = points.map(&:last)
|
40
|
+
|
41
|
+
plot.data << Gnuplot::DataSet.new( [x, y] ) do |ds|
|
42
|
+
ds.with = "points"
|
43
|
+
ds.notitle
|
44
|
+
end
|
45
|
+
|
46
|
+
unless initial_centers.nil?
|
47
|
+
x = initial_centers.map(&:center).map(&:first)
|
48
|
+
y = initial_centers.map(&:center).map(&:last)
|
49
|
+
plot.data << Gnuplot::DataSet.new( [x, y] ) do |ds|
|
50
|
+
ds.with = "points"
|
51
|
+
ds.title="initial centers"
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
x = centers.map(&:center).map(&:first)
|
56
|
+
y = centers.map(&:center).map(&:last)
|
57
|
+
plot.data << Gnuplot::DataSet.new( [x, y] ) do |ds|
|
58
|
+
ds.with = "points"
|
59
|
+
ds.title="centers"
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
63
|
+
file_name
|
64
|
+
end
|
4
65
|
|
5
66
|
class TestBuncher < Minitest::Test
|
6
|
-
|
7
|
-
|
67
|
+
WORKING=<<-'EOS'
|
68
|
+
def test_should_find_the_one_center
|
69
|
+
elements = [[1,1]]
|
70
|
+
starting_centers = elements.sample(1).map {|aaa| Buncher::Cluster.new(aaa)}
|
71
|
+
new_centers = Buncher::kmeans(starting_centers, elements)
|
72
|
+
dump(new_centers)
|
73
|
+
assert_in_delta(new_centers.first.center[0],1.0,0.01)
|
74
|
+
assert_in_delta(new_centers.first.center[1],1.0,0.01)
|
75
|
+
end
|
76
|
+
|
77
|
+
def test_choose_centers_wrapper
|
78
|
+
elements = [[1,1]]
|
79
|
+
new_centers = Buncher::choose_centers(elements, 1)
|
80
|
+
dump(new_centers)
|
81
|
+
assert_in_delta(new_centers.first.center[1],1.0,0.01)
|
82
|
+
assert_in_delta(new_centers.first.center[1],1.0,0.01)
|
83
|
+
end
|
84
|
+
def test_choose_centers_wrapper2
|
85
|
+
elements = init_data(100,3)
|
86
|
+
new_centers = Buncher::choose_centers(elements, 3)
|
87
|
+
dump(new_centers)
|
88
|
+
assert_equal(new_centers.size,3)
|
89
|
+
end
|
90
|
+
def test_should_find_one_cluster
|
91
|
+
elements = [[1,1]]
|
92
|
+
new_centers = Buncher::cluster(elements)
|
93
|
+
dump(new_centers)
|
94
|
+
assert_equal(new_centers.size,1)
|
95
|
+
end
|
96
|
+
def test_choose_centers_wrapper2
|
97
|
+
elements = init_data(100,3)
|
98
|
+
new_centers = Buncher::choose_centers(elements, 3)
|
99
|
+
plot("/tmp/kmeans_initial_#{new_centers.size}.png",elements, new_centers)
|
100
|
+
dump(new_centers)
|
101
|
+
`open /tmp/kmeans_initial_#{new_centers.size}.png`
|
102
|
+
assert_equal(new_centers.size,3)
|
103
|
+
end
|
104
|
+
EOS
|
105
|
+
|
106
|
+
def test_gaussian_distribution_of_100_points_in_3_clusters
|
107
|
+
12.times do |run|
|
108
|
+
# srand(843284148793854177950180651080082381)
|
109
|
+
elements = init_data(100,3)
|
110
|
+
# elements.each {|eee| puts "#{eee[0]},#{eee[1]}"}
|
111
|
+
# new_centers = Buncher::cluster(elements) {|elements,centers, initial_centers| puts "run #{run} setup";plot("/tmp/#{run}_centers_#{centers.size}.png",elements,initial_centers)}
|
112
|
+
new_centers = Buncher::cluster(elements) {|elements,centers, initial_centers|
|
113
|
+
plot("/tmp/#{run}_centers_#{centers.size}.png",elements,centers, initial_centers)
|
114
|
+
}
|
115
|
+
puts "run #{run}: k is #{new_centers.size}, seed was #{srand}"
|
116
|
+
puts "ERROR "*4 if new_centers.size != 3
|
117
|
+
puts
|
118
|
+
assert_equal(3,new_centers.size)
|
119
|
+
end
|
8
120
|
end
|
9
121
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: buncher
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 1.0.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2015-04-
|
12
|
+
date: 2015-04-24 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: minitest
|
@@ -18,7 +18,7 @@ dependencies:
|
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
20
20
|
- !ruby/object:Gem::Version
|
21
|
-
version: '5.
|
21
|
+
version: '5.6'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
24
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -26,7 +26,7 @@ dependencies:
|
|
26
26
|
requirements:
|
27
27
|
- - ~>
|
28
28
|
- !ruby/object:Gem::Version
|
29
|
-
version: '5.
|
29
|
+
version: '5.6'
|
30
30
|
- !ruby/object:Gem::Dependency
|
31
31
|
name: rdoc
|
32
32
|
requirement: !ruby/object:Gem::Requirement
|
@@ -82,7 +82,7 @@ description: ! 'buncher implements a variant of the popular k-means clustering a
|
|
82
82
|
|
83
83
|
in order to find the best value of K.'
|
84
84
|
email:
|
85
|
-
- rob@
|
85
|
+
- rob@drync.com
|
86
86
|
executables: []
|
87
87
|
extensions:
|
88
88
|
- ext/buncher/extconf.rb
|
@@ -96,6 +96,7 @@ files:
|
|
96
96
|
- Manifest.txt
|
97
97
|
- README.txt
|
98
98
|
- Rakefile
|
99
|
+
- lib/buncher/buncher.bundle
|
99
100
|
- lib/buncher.rb
|
100
101
|
- test/test_buncher.rb
|
101
102
|
- ext/buncher/extconf.rb
|