kmeans-clustering 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2) hide show
  1. data/lib/kmeans-clustering.rb +118 -0
  2. metadata +62 -0
@@ -0,0 +1,118 @@
1
+ module KMeansClustering
2
+ require 'cabiri'
3
+
4
# add static attributes through attr_accessor
#
# Each attribute is expected to hold a callable (anything responding to #call)
# that the caller assigns before clustering:
#   calcSum             - called with an array of values, returns their sum
#   calcAverage         - called with (sum, number_of_elements)
#   calcDistanceSquared - called with (center, element); results are compared with <
class << self
  attr_accessor :calcSum, :calcAverage, :calcDistanceSquared
end
10
+
11
# split array into several equal sized parts
# taken from http://apidock.com/rails/v3.2.8/Array/in_groups
#
# Returns an array of nb_parts consecutive slices of array. When the size is
# not evenly divisible, the first (array.size % nb_parts) slices get one extra
# element; when there are more parts than elements, trailing slices are empty.
# Raises ZeroDivisionError when nb_parts is 0.
def self.split_array_into_parts(array, nb_parts)
  base_size, remainder = array.size.divmod(nb_parts)
  offset = 0

  nb_parts.times.map do |part_index|
    # the leading `remainder` parts absorb the leftover elements
    part_size = part_index < remainder ? base_size + 1 : base_size
    part = array.slice(offset, part_size)
    offset += part_size
    part
  end
end
27
+
28
# Runs nb_iterations rounds of k-means and returns the resulting centers.
#
# Each round splits elements into nb_jobs slices, lets one Job per slice
# assign its elements to the nearest center and pre-aggregate them, then
# merges the partial aggregates and derives the new centers through
# KMeansClustering.calcSum and KMeansClustering.calcAverage.
#
# Note: the new centers are rebuilt only from centers that appear in the job
# results, so a center that attracted no element in a round is not carried
# over to the next one. With nb_iterations == 0 the given centers are
# returned unchanged.
def self.run(centers, elements, nb_iterations, nb_jobs)
  nb_iterations.times do
    # create jobs: one per slice, each seeing the current centers
    slices = split_array_into_parts(elements, nb_jobs)
    jobs = nb_jobs.times.map { |i| Job.new(centers, slices[i]) }

    # run jobs in parallel
    queue = Cabiri::JobQueue.new
    jobs.each_with_index do |job, i|
      queue.add(i) { job.run }
    end
    queue.start(nb_jobs)

    # group the per-job partial aggregates by center
    partials_by_center = Hash.new { |h, k| h[k] = [] }
    queue.finished_jobs.values.each do |finished_job|
      finished_job.result.each do |center, partial|
        partials_by_center[center] << partial
      end
    end

    # calculate sum and nb elements for each center
    totals = partials_by_center.values.map do |partials|
      sum = KMeansClustering.calcSum.call(partials.map { |partial| partial[:sum] })
      count = partials.map { |partial| partial[:nb_elements] }.inject(0, :+)
      [sum, count]
    end

    # calculate new centers
    centers = totals.map { |sum, count| KMeansClustering.calcAverage.call(sum, count) }
  end

  centers
end
72
+
73
# job that will be used for parallelization with Cabiri
#
# A Job holds the current centers and one slice of the elements. Running it
# assigns every element to its nearest center and reduces each group to a
# partial aggregate (sum and element count).
class Job
  attr_accessor :centers, :elements

  def initialize(centers, elements)
    @centers = centers
    @elements = elements
  end

  # Returns a hash mapping each center that received at least one element to
  # {:sum => <calcSum of its elements>, :nb_elements => <count>}.
  def run
    aggregateProximityData(assignElementsToClosestCenter)
  end

  # Groups @elements by nearest center. Returns a hash of center => elements;
  # centers that attract no element do not appear as keys.
  def assignElementsToClosestCenter
    groups = Hash.new { |h, k| h[k] = [] }
    @elements.each_with_object(groups) do |element, acc|
      acc[closest_center(element)] << element
    end
  end

  # Reduces each center's element list to its sum (via
  # KMeansClustering.calcSum) and its length.
  def aggregateProximityData(data)
    data.each_with_object({}) do |(center, elements), results|
      sum = KMeansClustering.calcSum.call(elements)
      results[center] = { :sum => sum, :nb_elements => elements.length }
    end
  end

  private

  # Returns the center with the smallest KMeansClustering.calcDistanceSquared
  # to element; on ties the earliest center in @centers wins. Returns nil
  # when @centers is empty.
  def closest_center(element)
    best_center = nil
    best_distance = nil

    @centers.each do |center|
      distance = KMeansClustering.calcDistanceSquared.call(center, element)
      next unless best_distance.nil? || distance < best_distance

      best_center = center
      best_distance = distance
    end

    best_center
  end
end
118
+ end
metadata ADDED
@@ -0,0 +1,62 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: kmeans-clustering
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Tom Van Eyck
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-02-17 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: cabiri
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: 0.0.7
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: 0.0.7
30
+ description: A simple Ruby gem for parallelized k-means clustering.
31
+ email: tomvaneyck@gmail.com
32
+ executables: []
33
+ extensions: []
34
+ extra_rdoc_files: []
35
+ files:
36
+ - lib/kmeans-clustering.rb
37
+ homepage: https://github.com/vaneyckt/kmeans-clustering
38
+ licenses: []
39
+ post_install_message:
40
+ rdoc_options: []
41
+ require_paths:
42
+ - lib
43
+ required_ruby_version: !ruby/object:Gem::Requirement
44
+ none: false
45
+ requirements:
46
+ - - ! '>='
47
+ - !ruby/object:Gem::Version
48
+ version: '0'
49
+ required_rubygems_version: !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ requirements: []
56
+ rubyforge_project:
57
+ rubygems_version: 1.8.24
58
+ signing_key:
59
+ specification_version: 3
60
+ summary: A simple Ruby gem for parallelized k-means clustering.
61
+ test_files: []
62
+ has_rdoc: