kmeans-clustering 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. data/lib/kmeans-clustering.rb +118 -0
  2. metadata +62 -0
@@ -0,0 +1,118 @@
1
+ module KMeansClustering
2
+ require 'cabiri'
3
+
4
# Module-level attributes holding the three callables this library invokes
# with .call (sum, average and squared distance); nothing in this file assigns
# them, so they have to be set before KMeansClustering.run is used.
class << self
  attr_accessor :calcSum, :calcAverage, :calcDistanceSquared
end
10
+
11
# Cut +array+ into +nb_parts+ consecutive slices whose sizes differ by at most
# one element; when the size does not divide evenly, the leading slices take
# the extra elements. Slices are empty once the array is exhausted.
# Adapted from http://apidock.com/rails/v3.2.8/Array/in_groups
#
# array    - the array to cut up
# nb_parts - how many slices to produce
#
# Returns an array holding +nb_parts+ arrays.
def self.split_array_into_parts(array, nb_parts)
  base_size, surplus = array.size.divmod(nb_parts)
  offset = 0

  nb_parts.times.map do |part_index|
    # the first +surplus+ slices each carry one element more than the rest
    part_size = part_index < surplus ? base_size + 1 : base_size
    part = array.slice(offset, part_size)
    offset += part_size
    part
  end
end
27
+
28
# Perform +nb_iterations+ rounds of k-means, spreading the assignment step of
# every round over +nb_jobs+ jobs run through a Cabiri::JobQueue.
#
# centers       - the starting centers
# elements      - the elements to cluster
# nb_iterations - number of rounds to perform
# nb_jobs       - number of jobs (and slices of +elements+) per round
#
# Returns the centers produced by the last round (the given +centers+ when
# +nb_iterations+ is zero).
#
# NOTE(review): a new center is only computed for centers that show up as a
# key in at least one job result, so the returned array can hold fewer centers
# than were passed in -- confirm this is intended.
def self.run(centers, elements, nb_iterations, nb_jobs)
  nb_iterations.times do
    # one job per slice of the elements, all comparing against the current centers
    jobs = split_array_into_parts(elements, nb_jobs).map do |slice|
      Job.new(centers, slice)
    end

    # queue every job under its index, then start the queue
    queue = Cabiri::JobQueue.new
    jobs.each_with_index do |job, job_id|
      queue.add(job_id) { job.run }
    end
    queue.start(nb_jobs)

    # collect the partial results of all jobs, grouped by center
    partials_by_center = Hash.new { |hash, key| hash[key] = [] }
    queue.finished_jobs.values.each do |finished_job|
      finished_job.result.each do |center, partial|
        partials_by_center[center] << partial
      end
    end

    # merge the partial sums and element counts of each center
    totals = {}
    counts = {}
    partials_by_center.each do |center, partials|
      totals[center] = KMeansClustering.calcSum.call(partials.map { |partial| partial[:sum] })
      counts[center] = partials.map { |partial| partial[:nb_elements] }.inject(0, :+)
    end

    # the averages become the centers used by the next round
    centers = totals.map do |center, total|
      KMeansClustering.calcAverage.call(total, counts[center])
    end
  end

  centers
end
72
+
73
# Unit of work used for parallelization with Cabiri: assigns a slice of the
# elements to their closest centers and condenses every group into a partial
# sum plus an element count.
class Job
  attr_accessor :centers, :elements

  # centers  - the centers the elements are compared against
  # elements - the elements this job is responsible for
  def initialize(centers, elements)
    @centers = centers
    @elements = elements
  end

  # Returns a hash with one entry per center that had an element assigned to
  # it, each value being {:sum => ..., :nb_elements => ...}.
  def run
    aggregateProximityData(assignElementsToClosestCenter)
  end

  # Groups the elements by the center with the smallest value reported by
  # KMeansClustering.calcDistanceSquared; on a tie the center listed first
  # wins. Elements are filed under nil when @centers is empty.
  def assignElementsToClosestCenter
    groups = Hash.new { |hash, key| hash[key] = [] }

    @elements.each do |element|
      nearest = nil
      shortest = nil

      @centers.each do |candidate|
        separation = KMeansClustering.calcDistanceSquared.call(candidate, element)
        # only a strictly smaller distance replaces the current best
        next unless shortest.nil? || separation < shortest

        nearest = candidate
        shortest = separation
      end

      groups[nearest] << element
    end

    groups
  end

  # Reduces each center's list of elements to its KMeansClustering.calcSum
  # result and its length.
  #
  # data - pairs of center and the elements assigned to it
  def aggregateProximityData(data)
    data.each_with_object({}) do |(center, members), summary|
      total = KMeansClustering.calcSum.call(members)
      summary[center] = {:sum => total, :nb_elements => members.length}
    end
  end
end
118
+ end
metadata ADDED
@@ -0,0 +1,62 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: kmeans-clustering
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Tom Van Eyck
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-02-17 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: cabiri
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: 0.0.7
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: 0.0.7
30
+ description: A simple Ruby gem for parallelized k-means clustering.
31
+ email: tomvaneyck@gmail.com
32
+ executables: []
33
+ extensions: []
34
+ extra_rdoc_files: []
35
+ files:
36
+ - lib/kmeans-clustering.rb
37
+ homepage: https://github.com/vaneyckt/kmeans-clustering
38
+ licenses: []
39
+ post_install_message:
40
+ rdoc_options: []
41
+ require_paths:
42
+ - lib
43
+ required_ruby_version: !ruby/object:Gem::Requirement
44
+ none: false
45
+ requirements:
46
+ - - ! '>='
47
+ - !ruby/object:Gem::Version
48
+ version: '0'
49
+ required_rubygems_version: !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ requirements: []
56
+ rubyforge_project:
57
+ rubygems_version: 1.8.24
58
+ signing_key:
59
+ specification_version: 3
60
+ summary: A simple Ruby gem for parallelized k-means clustering.
61
+ test_files: []
62
+ has_rdoc: