kmeans-clustering 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/kmeans-clustering.rb +118 -0
- metadata +62 -0
@@ -0,0 +1,118 @@
|
|
1
|
+
module KMeansClustering
|
2
|
+
require 'cabiri'
|
3
|
+
|
4
|
+
# Module-level configuration. Callers assign an object responding to #call
# (e.g. a lambda) to each of these before invoking KMeansClustering.run:
#
# * calcSum             - called with one array; receives either the elements
#                         assigned to a center, or the partial sums that the
#                         jobs produced for a center. Its result is passed on
#                         to calcAverage.
# * calcAverage         - called with (sum, nb_elements); its result becomes a
#                         new center.
# * calcDistanceSquared - called with (center, element); results are compared
#                         with `<` to find the closest center.
class << self
attr_accessor :calcSum, :calcAverage, :calcDistanceSquared
end
|
10
|
+
|
11
|
+
# Splits +array+ into +nb_parts+ consecutive slices whose sizes differ by at
# most one; the leading slices receive the extra elements.
# Adapted from http://apidock.com/rails/v3.2.8/Array/in_groups
#
# array    - the Array to split (it is not modified).
# nb_parts - Integer number of slices to produce.
#
# Returns an Array of +nb_parts+ Arrays. When there are more parts than
# elements, the trailing slices are empty arrays.
# Raises ZeroDivisionError when +nb_parts+ is 0.
def self.split_array_into_parts(array, nb_parts)
base_length, remainder = array.size.divmod(nb_parts)
offset = 0

nb_parts.times.map do |index|
# the first +remainder+ slices each take one extra element
length = index < remainder ? base_length + 1 : base_length
part = array.slice(offset, length)
offset += length
part
end
end
|
27
|
+
|
28
|
+
# Runs +nb_iterations+ rounds of k-means, spreading the assignment step of
# every round over +nb_jobs+ Cabiri jobs.
#
# centers       - Array of initial centers.
# elements      - Array of elements to cluster.
# nb_iterations - Integer number of rounds to perform.
# nb_jobs       - Integer number of jobs the elements are split across; also
#                 passed to Cabiri::JobQueue#start.
#
# Returns the Array of centers produced by the last round (the +centers+
# argument itself when +nb_iterations+ is 0).
#
# NOTE: new centers are built only from the centers that show up in the job
# results. A center that no element was assigned to is therefore absent from
# the next round, and the order of the returned centers follows the order in
# which centers are first seen while walking the finished jobs.
def self.run(centers, elements, nb_iterations, nb_jobs)
nb_iterations.times do
# create one job per slice of the elements; each job gets all current centers
jobs = split_array_into_parts(elements, nb_jobs).map do |part|
Job.new(centers, part)
end

# run jobs in parallel
queue = Cabiri::JobQueue.new
jobs.each_with_index do |job, i|
queue.add(i) { job.run }
end
queue.start(nb_jobs)

# collect, per center, the partial aggregates reported by the jobs
partials_by_center = Hash.new { |h, k| h[k] = [] }
queue.finished_jobs.values.each do |finished_job|
finished_job.result.each do |center, partial|
partials_by_center[center] << partial
end
end

# combine the partial aggregates of each center into its new center
centers = partials_by_center.map do |_center, partials|
total = KMeansClustering::calcSum.call(partials.map { |d| d[:sum] })
count = partials.map { |d| d[:nb_elements] }.inject(0, :+)
KMeansClustering::calcAverage.call(total, count)
end
end

centers
end
|
72
|
+
|
73
|
+
# Unit of work that is handed to Cabiri for parallelization: assigns a slice
# of the elements to their closest centers and pre-aggregates the outcome.
class Job
attr_accessor :centers, :elements

# centers  - Array of all current centers.
# elements - Array with the elements this job is responsible for.
def initialize(centers, elements)
@centers = centers
@elements = elements
end

# Returns a Hash mapping each center that received at least one element to
# {:sum => <calcSum result>, :nb_elements => <number of elements>}.
def run
aggregateProximityData(assignElementsToClosestCenter)
end

# Groups the elements by the center with the smallest squared distance, as
# computed by KMeansClustering::calcDistanceSquared. When several centers
# are equally close, the one that comes first in +centers+ wins.
#
# Returns a Hash of center => Array of elements. Centers without elements
# have no entry; with an empty list of centers every element ends up
# under the nil key.
def assignElementsToClosestCenter
results = Hash.new { |h, k| h[k] = [] }

@elements.each do |element|
best_center = nil
best_distance = nil

@centers.each do |center|
distance = KMeansClustering::calcDistanceSquared.call(center, element)
# strict comparison keeps the earliest center on ties
if best_distance.nil? || distance < best_distance
best_center = center
best_distance = distance
end
end

results[best_center] << element
end

results
end

# Reduces every center's elements to their sum (via
# KMeansClustering::calcSum) and their count.
#
# data - Hash of center => Array of elements.
#
# Returns a Hash of center => {:sum => ..., :nb_elements => ...}.
def aggregateProximityData(data)
results = {}
data.each do |center, elements|
results[center] = {
:sum => KMeansClustering::calcSum.call(elements),
:nb_elements => elements.length
}
end
results
end
end
|
118
|
+
end
|
metadata
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: kmeans-clustering
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Tom Van Eyck
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-02-17 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: cabiri
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 0.0.7
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 0.0.7
|
30
|
+
description: A simple Ruby gem for parallelized k-means clustering.
|
31
|
+
email: tomvaneyck@gmail.com
|
32
|
+
executables: []
|
33
|
+
extensions: []
|
34
|
+
extra_rdoc_files: []
|
35
|
+
files:
|
36
|
+
- lib/kmeans-clustering.rb
|
37
|
+
homepage: https://github.com/vaneyckt/kmeans-clustering
|
38
|
+
licenses: []
|
39
|
+
post_install_message:
|
40
|
+
rdoc_options: []
|
41
|
+
require_paths:
|
42
|
+
- lib
|
43
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
44
|
+
none: false
|
45
|
+
requirements:
|
46
|
+
- - ! '>='
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: '0'
|
49
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
requirements: []
|
56
|
+
rubyforge_project:
|
57
|
+
rubygems_version: 1.8.24
|
58
|
+
signing_key:
|
59
|
+
specification_version: 3
|
60
|
+
summary: A simple Ruby gem for parallelized k-means clustering.
|
61
|
+
test_files: []
|
62
|
+
has_rdoc:
|