kmeans-clustering 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/kmeans-clustering.rb +118 -0
- metadata +62 -0
@@ -0,0 +1,118 @@
|
|
1
|
+
module KMeansClustering
|
2
|
+
require 'cabiri'
|
3
|
+
|
4
|
+
# Module-level hooks. Callers assign a callable (anything responding to
# +call+) to each of these before clustering; the code in this file invokes
# them as follows:
#   calcSum.call(array)                     - combines an array of elements or partial sums
#   calcAverage.call(sum, nb_elements)      - turns a sum and a count into a center
#   calcDistanceSquared.call(center, element)
class << self
  attr_accessor :calcSum, :calcAverage, :calcDistanceSquared
end
|
10
|
+
|
11
|
+
# Splits +array+ into +nb_parts+ consecutive groups whose sizes differ by at
# most one; the larger groups come first. Modelled on Rails' Array#in_groups
# (http://apidock.com/rails/v3.2.8/Array/in_groups), without padding.
#
# array    - the Array to split (not modified)
# nb_parts - number of groups to produce; must be a positive Integer
#
# Returns an Array of exactly +nb_parts+ Arrays. When +array+ has fewer
# elements than +nb_parts+, the trailing groups are empty.
# Raises ArgumentError when +nb_parts+ is not a positive Integer (zero used to
# surface as a ZeroDivisionError, a negative count silently returned []).
def self.split_array_into_parts(array, nb_parts)
  unless nb_parts.is_a?(Integer) && nb_parts > 0
    raise ArgumentError, "nb_parts must be a positive integer (got #{nb_parts.inspect})"
  end

  base_size, remainder = array.size.divmod(nb_parts)
  start = 0

  (0...nb_parts).map do |index|
    # the first +remainder+ groups each take one extra element
    length = index < remainder ? base_size + 1 : base_size
    group = array.slice(start, length)
    start += length
    group
  end
end
|
27
|
+
|
28
|
+
# Runs +nb_iterations+ rounds of k-means. In each round the elements are
# split into +nb_jobs+ parts, every part is assigned to its closest centers
# by a Job run through a Cabiri::JobQueue, and the per-job partial sums are
# merged into new centers.
#
# centers       - initial centers. They are used as Hash keys when merging the
#                 job results, so they must compare by value (eql?/hash).
#                 NOTE(review): job results presumably come back from Cabiri
#                 as copies - confirm.
# elements      - Array of elements to cluster
# nb_iterations - number of rounds to run
# nb_jobs       - number of parts/jobs per round
#
# Requires KMeansClustering.calcSum, calcAverage and calcDistanceSquared to
# be set to callables beforehand.
#
# Returns the Array of centers after the last round, in the same order as the
# input centers. A center that attracted no element in a round is kept
# unchanged (it used to be dropped, silently reducing the number of
# clusters). Centers that are equal to each other map to the same new center.
def self.run(centers, elements, nb_iterations, nb_jobs)
  nb_iterations.times do
    # create one job per part of the elements
    jobs = split_array_into_parts(elements, nb_jobs).map do |part|
      Job.new(centers, part)
    end

    # run jobs in parallel
    queue = Cabiri::JobQueue.new
    jobs.each_with_index do |job, index|
      queue.add(index) { job.run }
    end
    queue.start(nb_jobs)

    # group the partial aggregates of all jobs by center
    partials_by_center = Hash.new { |hash, key| hash[key] = [] }
    queue.finished_jobs.values.each do |finished_job|
      finished_job.result.each do |center, partial|
        partials_by_center[center] << partial
      end
    end

    # calculate new centers
    centers = centers.map do |center|
      # key? does not trigger the default block, so empty clusters stay absent
      next center unless partials_by_center.key?(center)

      partials = partials_by_center[center]
      sum = KMeansClustering.calcSum.call(partials.collect { |partial| partial[:sum] })
      count = partials.collect { |partial| partial[:nb_elements] }.inject(0, :+)
      KMeansClustering.calcAverage.call(sum, count)
    end
  end

  centers
end
|
72
|
+
|
73
|
+
# Unit of work used for parallelization with Cabiri: assigns a slice of the
# elements to their closest centers and pre-aggregates the result per center.
#
# Relies on KMeansClustering.calcDistanceSquared and KMeansClustering.calcSum
# being set to callables.
class Job
  attr_accessor :centers, :elements

  # centers  - the centers to compare every element against
  # elements - the elements this job is responsible for
  def initialize(centers, elements)
    @centers = centers
    @elements = elements
  end

  # Returns a plain Hash mapping each center that attracted at least one
  # element to {:sum => <calcSum of its elements>, :nb_elements => <count>}.
  def run
    aggregateProximityData(assignElementsToClosestCenter)
  end

  # Groups the elements by their closest center.
  #
  # Returns a Hash center => Array of elements; centers without elements are
  # absent. If @centers is empty, every element ends up under the nil key.
  def assignElementsToClosestCenter
    # look the callable up once instead of once per (center, element) pair
    distance_squared = KMeansClustering.calcDistanceSquared
    results = Hash.new { |hash, key| hash[key] = [] }

    @elements.each do |element|
      best_center = nil
      best_distance = nil

      @centers.each do |center|
        distance = distance_squared.call(center, element)
        # strict comparison: on a tie the earlier center wins
        if best_distance.nil? || distance < best_distance
          best_center = center
          best_distance = distance
        end
      end

      results[best_center] << element
    end

    results
  end

  # Reduces a center => elements Hash to a center => partial aggregate Hash.
  #
  # data - Hash as returned by assignElementsToClosestCenter
  #
  # Returns a Hash center => {:sum => ..., :nb_elements => ...}.
  def aggregateProximityData(data)
    results = {}
    data.each do |center, elements|
      results[center] = {
        :sum => KMeansClustering.calcSum.call(elements),
        :nb_elements => elements.length
      }
    end
    results
  end
end
|
118
|
+
end
|
metadata
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: kmeans-clustering
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Tom Van Eyck
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-02-17 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: cabiri
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 0.0.7
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 0.0.7
|
30
|
+
description: A simple Ruby gem for parallelized k-means clustering.
|
31
|
+
email: tomvaneyck@gmail.com
|
32
|
+
executables: []
|
33
|
+
extensions: []
|
34
|
+
extra_rdoc_files: []
|
35
|
+
files:
|
36
|
+
- lib/kmeans-clustering.rb
|
37
|
+
homepage: https://github.com/vaneyckt/kmeans-clustering
|
38
|
+
licenses: []
|
39
|
+
post_install_message:
|
40
|
+
rdoc_options: []
|
41
|
+
require_paths:
|
42
|
+
- lib
|
43
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
44
|
+
none: false
|
45
|
+
requirements:
|
46
|
+
- - ! '>='
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: '0'
|
49
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
requirements: []
|
56
|
+
rubyforge_project:
|
57
|
+
rubygems_version: 1.8.24
|
58
|
+
signing_key:
|
59
|
+
specification_version: 3
|
60
|
+
summary: A simple Ruby gem for parallelized k-means clustering.
|
61
|
+
test_files: []
|
62
|
+
has_rdoc:
|