cluda 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/cluda/cluda_common.rb +79 -0
- data/lib/cluda/distances/chebyshev.rb +1 -1
- data/lib/cluda/distances/distance.rb +0 -15
- data/lib/cluda/distances/euclidean.rb +1 -1
- data/lib/cluda/distances/manhattan.rb +1 -1
- data/lib/cluda/kmeans.rb +66 -26
- data/lib/cluda.rb +1 -0
- metadata +3 -2
@@ -0,0 +1,79 @@
|
|
1
|
+
module Cluda
|
2
|
+
class InvalidPoint < RuntimeError; end
|
3
|
+
class InvalidCentroid < RuntimeError; end
|
4
|
+
class InvalidSmartPoint < RuntimeError; end
|
5
|
+
class InvalidDistanceMethod < RuntimeError; end
|
6
|
+
|
7
|
+
# For an output given by CluDA calculate the means for each centroid
|
8
|
+
#
|
9
|
+
#Example:
|
10
|
+
# > clusters = {{:x=>2, :y=>2}=>
|
11
|
+
# [{:x=>1, :y=>1, :distance=>1.4142135623730951},
|
12
|
+
# {:x=>2, :y=>1, :distance=>1.0},
|
13
|
+
# {:x=>1, :y=>2, :distance=>1.0},
|
14
|
+
# {:x=>2, :y=>2, :distance=>0.0}]}
|
15
|
+
# > Cluda.median_for_centroids(clusters)
|
16
|
+
#Arguments:
|
17
|
+
# points: ( Hash )
|
18
|
+
def self.median_for_centroids ( points )
|
19
|
+
points.keys.each do | centroid |
|
20
|
+
validate_smart_points( points[centroid] )
|
21
|
+
end
|
22
|
+
|
23
|
+
points.keys.map do |centroid|
|
24
|
+
centroid.merge( median: median_for_centroid( centroid, points ) )
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
protected
|
29
|
+
|
30
|
+
def self.valid_class?( name )
|
31
|
+
['euclidean', 'chebyshev', 'manhattan'].include?( name.downcase )
|
32
|
+
end
|
33
|
+
|
34
|
+
def self.validate_centroids( centroids )
|
35
|
+
centroids = centroids.is_a?(Array) ? centroids : [ centroids ]
|
36
|
+
|
37
|
+
validate(centroids)
|
38
|
+
|
39
|
+
centroids.each do |point|
|
40
|
+
raise InvalidCentroid unless point.include?(:median)
|
41
|
+
end
|
42
|
+
|
43
|
+
centroids
|
44
|
+
end
|
45
|
+
|
46
|
+
def self.validate_smart_points( points )
|
47
|
+
points = points.is_a?(Array) ? points : [ points ]
|
48
|
+
|
49
|
+
validate( points )
|
50
|
+
|
51
|
+
points.each do |point|
|
52
|
+
raise InvalidSmartPoint unless point.include?(:distance)
|
53
|
+
end
|
54
|
+
|
55
|
+
points
|
56
|
+
end
|
57
|
+
|
58
|
+
def self.validate( data )
|
59
|
+
points = data.is_a?(Array) ? data : [ data ]
|
60
|
+
points.each do |point|
|
61
|
+
raise InvalidPoint unless point.is_a?(Hash) &&
|
62
|
+
point.include?(:x) && point.include?(:y) &&
|
63
|
+
point[:x].is_a?(Numeric) && point[:y].is_a?(Numeric)
|
64
|
+
end
|
65
|
+
|
66
|
+
points
|
67
|
+
end
|
68
|
+
|
69
|
+
def self.median_for_centroid( centroid, points )
|
70
|
+
median( points[centroid].map{ |point| point[:distance] } )
|
71
|
+
end
|
72
|
+
|
73
|
+
def self.median( list )
|
74
|
+
sorted_list = list.sort
|
75
|
+
len = list.size
|
76
|
+
|
77
|
+
sorted_list[( (len / 2 ) + 0.5 ).floor]
|
78
|
+
end
|
79
|
+
end
|
@@ -1,24 +1,9 @@
|
|
1
1
|
module Cluda
|
2
|
-
class InvalidPoint < RuntimeError; end
|
3
|
-
|
4
2
|
class Distance
|
5
3
|
extend Math
|
6
4
|
|
7
5
|
def self.distance(x0, x)
|
8
6
|
raise ::NotImplementedError.new("You must implement distance method")
|
9
7
|
end
|
10
|
-
|
11
|
-
protected
|
12
|
-
|
13
|
-
def self.validate( data )
|
14
|
-
points = data.is_a?(Array) ? data : [ data ]
|
15
|
-
points.each do |point|
|
16
|
-
raise InvalidPoint unless point.is_a?(Hash) &&
|
17
|
-
point.include?(:x) && point.include?(:y) &&
|
18
|
-
point[:x].is_a?(Numeric) && point[:y].is_a?(Numeric)
|
19
|
-
end
|
20
|
-
|
21
|
-
points
|
22
|
-
end
|
23
8
|
end
|
24
9
|
end
|
data/lib/cluda/kmeans.rb
CHANGED
@@ -3,24 +3,61 @@ require 'cluda/distances/euclidean'
|
|
3
3
|
require 'cluda/distances/chebyshev'
|
4
4
|
|
5
5
|
module Cluda
|
6
|
-
class InvalidDistanceMethod < RuntimeError; end
|
7
|
-
|
8
6
|
class Kmeans
|
9
|
-
|
10
|
-
|
7
|
+
|
8
|
+
DEFAULT_OPTS = { k: 1,
|
9
|
+
centroids: nil,
|
10
|
+
distance_method: 'euclidean',
|
11
|
+
be_smart: false,
|
12
|
+
margin_distance_percentage: 0,
|
13
|
+
max_iterations: 50 }
|
14
|
+
|
15
|
+
#Classify the points using KMeans as the clustering algorithm
|
16
|
+
#
|
17
|
+
#Example:
|
18
|
+
# >> points = [ { x: 1, y: 1}, { x: 2, y: 1}, { x: 1, y: 2}, { x: 2, y: 2}, { x: 4, y: 6}, { x: 5, y: 7}, { x: 5, y: 6}, { x: 5, y: 5}, { x: 6, y: 6}, { x: 6, y: 5}]
|
19
|
+
# >> Cluda::Kmeans.classify( points, k: 1, distance_method: 'euclidean', be_smart: true, max_iterations: 50)
|
20
|
+
#Arguments:
|
21
|
+
# list: (Array [Hash] )
|
22
|
+
# k: (Numeric) *optional*
|
23
|
+
# centroids: (Array) *optional*
|
24
|
+
# distance_method: (String) *optional*
|
25
|
+
# be_smart: (Boolean) *optional* [If you want CluDA to be smart you have to specify the centroids ]
|
26
|
+
# margin_distance_percentage: (Numeric) *optional* [Between 0 and 1]
|
27
|
+
# max_iterations: (Numeric) *optional*
|
28
|
+
def self.classify( list, opts = {} )
|
29
|
+
@opts = DEFAULT_OPTS.merge(opts)
|
30
|
+
|
31
|
+
raise Cluda::InvalidDistanceMethod unless Cluda::valid_class?(@opts[:distance_method])
|
32
|
+
|
33
|
+
_class = Cluda.const_get( @opts[:distance_method].downcase.capitalize )
|
11
34
|
|
12
|
-
|
13
|
-
|
35
|
+
Cluda.validate( list )
|
36
|
+
Cluda.validate_centroids( @opts[:centroids] ) if @opts[:be_smart]
|
14
37
|
|
15
38
|
iter = 1
|
16
|
-
|
17
|
-
centroids
|
39
|
+
max_iterations = @opts[:max_iterations]
|
40
|
+
centroids = @opts[:centroids].nil? || @opts[:centroids].empty? ? initialize_centroids( list , @opts[:k]) : process_centroids( @opts[:centroids] )
|
41
|
+
previous_centroids = nil
|
42
|
+
smart_clustering = @opts[:be_smart]
|
43
|
+
margin_distance_percentage = @opts[:margin_distance_percentage]
|
18
44
|
|
19
45
|
while (iter < max_iterations) && (previous_centroids != centroids)
|
20
46
|
output = init_output(centroids)
|
21
|
-
|
47
|
+
margin = smart_clustering ? @median_centroid * margin_distance_percentage : 0
|
48
|
+
|
22
49
|
list.each do |point|
|
23
|
-
|
50
|
+
centroid, distance = nearest_centroid(point, centroids, _class)
|
51
|
+
|
52
|
+
if smart_clustering && distance > ( @median_centroid + margin )
|
53
|
+
@median_centroid = distance
|
54
|
+
centroids << point
|
55
|
+
create_centroid(point, output)
|
56
|
+
centroid = point
|
57
|
+
distance = 0
|
58
|
+
end
|
59
|
+
|
60
|
+
output[centroid] << point.merge( distance: distance )
|
24
61
|
end
|
25
62
|
|
26
63
|
iter += 1
|
@@ -36,7 +73,7 @@ module Cluda
|
|
36
73
|
def self.nearest_centroid(point, centroids, _class = Cluda::Euclidean )
|
37
74
|
return nil if centroids.empty?
|
38
75
|
|
39
|
-
|
76
|
+
Cluda.validate( point )
|
40
77
|
|
41
78
|
nearest_centroid = centroids[0]
|
42
79
|
min_distance = _class.distance(point, nearest_centroid)
|
@@ -49,11 +86,11 @@ module Cluda
|
|
49
86
|
end
|
50
87
|
end
|
51
88
|
|
52
|
-
nearest_centroid
|
89
|
+
[nearest_centroid, min_distance]
|
53
90
|
end
|
54
91
|
|
55
|
-
def self.initialize_centroids( list , k
|
56
|
-
|
92
|
+
def self.initialize_centroids( list , k )
|
93
|
+
Cluda.validate( list )
|
57
94
|
|
58
95
|
return [] if list.empty? || k > list.size
|
59
96
|
|
@@ -62,21 +99,22 @@ module Cluda
|
|
62
99
|
|
63
100
|
private
|
64
101
|
|
65
|
-
def self.valid_class?( name )
|
66
|
-
['euclidean', 'chebyshev', 'manhattan'].include?( name.downcase )
|
67
|
-
end
|
68
|
-
|
69
102
|
def self.init_output(centroids)
|
70
103
|
centroids.each_with_object({}) do |centroid, memo|
|
71
104
|
memo[centroid] = []
|
72
105
|
end
|
73
106
|
end
|
74
107
|
|
75
|
-
def self.
|
76
|
-
|
77
|
-
|
108
|
+
def self.create_centroid( centroid, output )
|
109
|
+
output[centroid] = []
|
110
|
+
end
|
78
111
|
|
79
|
-
|
112
|
+
def self.process_centroids(centroids)
|
113
|
+
centroids.each_with_object([]) do |point, memo|
|
114
|
+
@median_centroid = point[:median] if @median_centroid.nil? || @median_centroid < point[:median]
|
115
|
+
|
116
|
+
memo << { x: point[:x], y: point[:y] }
|
117
|
+
end
|
80
118
|
end
|
81
119
|
|
82
120
|
def self.get_key_values( points, key )
|
@@ -85,11 +123,13 @@ module Cluda
|
|
85
123
|
|
86
124
|
def self.move_centroids( output )
|
87
125
|
output.map do |(key, value)|
|
88
|
-
|
89
|
-
|
126
|
+
unless value.empty?
|
127
|
+
x = Cluda.median( get_key_values(value, :x) )
|
128
|
+
y = Cluda.median( get_key_values(value, :y) )
|
90
129
|
|
91
|
-
|
92
|
-
|
130
|
+
{ x: x, y: y }
|
131
|
+
end
|
132
|
+
end.compact
|
93
133
|
end
|
94
134
|
end
|
95
135
|
end
|
data/lib/cluda.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cluda
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2014-
|
12
|
+
date: 2014-02-06 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
@@ -50,6 +50,7 @@ extensions: []
|
|
50
50
|
extra_rdoc_files: []
|
51
51
|
files:
|
52
52
|
- lib/cluda.rb
|
53
|
+
- lib/cluda/cluda_common.rb
|
53
54
|
- lib/cluda/kmeans.rb
|
54
55
|
- lib/cluda/distances/distance.rb
|
55
56
|
- lib/cluda/distances/euclidean.rb
|