cluda 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,79 @@
1
module Cluda
  class InvalidPoint < RuntimeError; end
  class InvalidCentroid < RuntimeError; end
  class InvalidSmartPoint < RuntimeError; end
  class InvalidDistanceMethod < RuntimeError; end

  # Lower-cased names of the distance methods accepted by valid_class?.
  DISTANCE_METHODS = %w[euclidean chebyshev manhattan].freeze

  # For an output given by CluDA, calculate the median member distance of
  # each centroid.
  #
  #Example:
  #  > clusters = {{:x=>2, :y=>2}=>
  #                 [{:x=>1, :y=>1, :distance=>1.4142135623730951},
  #                  {:x=>2, :y=>1, :distance=>1.0},
  #                  {:x=>1, :y=>2, :distance=>1.0},
  #                  {:x=>2, :y=>2, :distance=>0.0}]}
  #  > Cluda.median_for_centroids(clusters)
  #  => [{:x=>2, :y=>2, :median=>1.0}]
  #Arguments:
  #  points: ( Hash ) centroid => Array of points, each carrying :distance
  #Returns:
  #  Array with one Hash per centroid: the centroid merged with :median
  #Raises:
  #  InvalidPoint       when a member is not a Hash with numeric :x and :y
  #  InvalidSmartPoint  when a member has no :distance key
  def self.median_for_centroids(points)
    # Validate every cluster up front so nothing is computed from bad input.
    points.each_value { |members| validate_smart_points(members) }

    points.keys.map do |centroid|
      centroid.merge(median: median_for_centroid(centroid, points))
    end
  end

  # NOTE: the helpers below are deliberately public. They are invoked with an
  # explicit receiver from other classes (e.g. Cluda.validate, Cluda.median),
  # and a bare visibility keyword would not apply to `def self.` methods anyway.

  # Tells whether +name+ (case-insensitive) is a supported distance method.
  # Anything that cannot be down-cased (nil, numbers, ...) is simply not valid.
  def self.valid_class?(name)
    return false unless name.respond_to?(:downcase)

    DISTANCE_METHODS.include?(name.downcase.to_s)
  end

  # Checks that every centroid is a valid point that also has a :median key.
  # A single centroid may be given instead of an Array.
  #
  #Returns:
  #  the centroids as an Array
  #Raises:
  #  InvalidPoint, InvalidCentroid
  def self.validate_centroids(centroids)
    centroids = [centroids] unless centroids.is_a?(Array)

    validate(centroids)

    centroids.each do |point|
      next if point.key?(:median)

      raise InvalidCentroid, "centroid without :median: #{point.inspect}"
    end

    centroids
  end

  # Checks that every point is a valid point that also has a :distance key.
  # A single point may be given instead of an Array.
  #
  #Returns:
  #  the points as an Array
  #Raises:
  #  InvalidPoint, InvalidSmartPoint
  def self.validate_smart_points(points)
    points = [points] unless points.is_a?(Array)

    validate(points)

    points.each do |point|
      next if point.key?(:distance)

      raise InvalidSmartPoint, "point without :distance: #{point.inspect}"
    end

    points
  end

  # Checks that every element is a Hash holding numeric :x and :y values.
  # A single point may be given instead of an Array.
  #
  #Returns:
  #  the points as an Array
  #Raises:
  #  InvalidPoint
  def self.validate(data)
    points = data.is_a?(Array) ? data : [data]

    points.each do |point|
      well_formed = point.is_a?(Hash) &&
                    [:x, :y].all? { |axis| point.key?(axis) && point[axis].is_a?(Numeric) }

      raise InvalidPoint, "invalid point: #{point.inspect}" unless well_formed
    end

    points
  end

  # Median of the :distance values of the points stored under +centroid+.
  def self.median_for_centroid(centroid, points)
    median(points[centroid].map { |point| point[:distance] })
  end

  # Middle element of the sorted list.
  #
  # For an even number of elements this is the upper of the two middle
  # elements (no averaging is done); for an empty list it is nil.
  def self.median(list)
    list.sort[list.size / 2]
  end
end
@@ -2,7 +2,7 @@ module Cluda
2
2
  class Chebyshev < Distance
3
3
 
4
4
  def self.distance(x0, x)
5
- validate( [x0,x] )
5
+ Cluda.validate( [x0,x] )
6
6
 
7
7
  [ (x0[:x] - x[:x]).abs, (x0[:y] - x[:y]).abs ].max
8
8
  end
@@ -1,24 +1,9 @@
1
1
  module Cluda
2
- class InvalidPoint < RuntimeError; end
3
-
4
2
  class Distance
5
3
  extend Math
6
4
 
7
5
  def self.distance(x0, x)
8
6
  raise ::NotImplementedError.new("You must implement distance method")
9
7
  end
10
-
11
- protected
12
-
13
- def self.validate( data )
14
- points = data.is_a?(Array) ? data : [ data ]
15
- points.each do |point|
16
- raise InvalidPoint unless point.is_a?(Hash) &&
17
- point.include?(:x) && point.include?(:y) &&
18
- point[:x].is_a?(Numeric) && point[:y].is_a?(Numeric)
19
- end
20
-
21
- points
22
- end
23
8
  end
24
9
  end
@@ -2,7 +2,7 @@ module Cluda
2
2
  class Euclidean < Distance
3
3
 
4
4
  def self.distance(x0, x)
5
- validate( [x0,x] )
5
+ Cluda.validate( [x0,x] )
6
6
 
7
7
  sqrt( (x0[:x] - x[:x]) ** 2 + (x0[:y] - x[:y]) ** 2 )
8
8
  end
@@ -2,7 +2,7 @@ module Cluda
2
2
  class Manhattan < Distance
3
3
 
4
4
  def self.distance(x0, x)
5
- validate( [x0,x] )
5
+ Cluda.validate( [x0,x] )
6
6
 
7
7
  (x0[:x] - x[:x]).abs + (x0[:y] - x[:y]).abs
8
8
  end
data/lib/cluda/kmeans.rb CHANGED
@@ -3,24 +3,61 @@ require 'cluda/distances/euclidean'
3
3
  require 'cluda/distances/chebyshev'
4
4
 
5
5
  module Cluda
6
- class InvalidDistanceMethod < RuntimeError; end
7
-
8
6
  class Kmeans
9
- def self.classify( list, k, class_name = 'euclidean', max_iterations = 50 )
10
- raise InvalidDistanceMethod unless valid_class?(class_name)
7
+
8
# Default options for classify; the options passed by the caller are merged
# over these. Frozen so the shared defaults cannot be mutated by accident.
DEFAULT_OPTS = { k: 1,
                 centroids: nil,
                 distance_method: 'euclidean',
                 be_smart: false,
                 margin_distance_percentage: 0,
                 max_iterations: 50 }.freeze
14
+
15
+ #Classify the points using KMeans as the clustering algorithm
16
+ #
17
+ #Example:
18
+ # >> points = [ { x: 1, y: 1}, { x: 2, y: 1}, { x: 1, y: 2}, { x: 2, y: 2}, { x: 4, y: 6}, { x: 5, y: 7}, { x: 5, y: 6}, { x: 5, y: 5}, { x: 6, y: 6}, { x: 6, y: 5}]
19
+ # >> Cluda::Kmeans.classify( points, k: 1, distance_method: 'euclidean', be_smart: true, max_iterations: 50)
20
+ #Arguments:
21
+ # list: (Array [Hash] )
22
+ # k: (Numeric) *optional*
23
+ # centroids: (Array) *optional*
24
+ # distance_method: (String) *optional*
25
+ # be_smart: (Boolean) *optional* [If you want CluDA to be smart you have to specify the centroids ]
26
+ # margin_distance_percentage: (Numeric) *optional* [Between 0 and 1]
27
+ # max_iterations: (Numeric) *optional*
28
+ def self.classify( list, opts = {} )
29
+ @opts = DEFAULT_OPTS.merge(opts)
30
+
31
+ raise Cluda::InvalidDistanceMethod unless Cluda::valid_class?(@opts[:distance_method])
32
+
33
+ _class = Cluda.const_get( @opts[:distance_method].downcase.capitalize )
11
34
 
12
- _class = Cluda.const_get( class_name.downcase.capitalize )
13
- _class.validate( list )
35
+ Cluda.validate( list )
36
+ Cluda.validate_centroids( @opts[:centroids] ) if @opts[:be_smart]
14
37
 
15
38
  iter = 1
16
- previous_centroids = nil
17
- centroids = initialize_centroids( list , k, _class )
39
+ max_iterations = @opts[:max_iterations]
40
+ centroids = @opts[:centroids].nil? || @opts[:centroids].empty? ? initialize_centroids( list , @opts[:k]) : process_centroids( @opts[:centroids] )
41
+ previous_centroids = nil
42
+ smart_clustering = @opts[:be_smart]
43
+ margin_distance_percentage = @opts[:margin_distance_percentage]
18
44
 
19
45
  while (iter < max_iterations) && (previous_centroids != centroids)
20
46
  output = init_output(centroids)
21
-
47
+ margin = smart_clustering ? @median_centroid * margin_distance_percentage : 0
48
+
22
49
  list.each do |point|
23
- output[nearest_centroid(point, centroids, _class)] << point
50
+ centroid, distance = nearest_centroid(point, centroids, _class)
51
+
52
+ if smart_clustering && distance > ( @median_centroid + margin )
53
+ @median_centroid = distance
54
+ centroids << point
55
+ create_centroid(point, output)
56
+ centroid = point
57
+ distance = 0
58
+ end
59
+
60
+ output[centroid] << point.merge( distance: distance )
24
61
  end
25
62
 
26
63
  iter += 1
@@ -36,7 +73,7 @@ module Cluda
36
73
  def self.nearest_centroid(point, centroids, _class = Cluda::Euclidean )
37
74
  return nil if centroids.empty?
38
75
 
39
- _class.validate( point )
76
+ Cluda.validate( point )
40
77
 
41
78
  nearest_centroid = centroids[0]
42
79
  min_distance = _class.distance(point, nearest_centroid)
@@ -49,11 +86,11 @@ module Cluda
49
86
  end
50
87
  end
51
88
 
52
- nearest_centroid
89
+ [nearest_centroid, min_distance]
53
90
  end
54
91
 
55
- def self.initialize_centroids( list , k, _class = Cluda::Euclidean )
56
- _class.validate( list )
92
+ def self.initialize_centroids( list , k )
93
+ Cluda.validate( list )
57
94
 
58
95
  return [] if list.empty? || k > list.size
59
96
 
@@ -62,21 +99,22 @@ module Cluda
62
99
 
63
100
  private
64
101
 
65
- def self.valid_class?( name )
66
- ['euclidean', 'chebyshev', 'manhattan'].include?( name.downcase )
67
- end
68
-
69
102
  def self.init_output(centroids)
70
103
  centroids.each_with_object({}) do |centroid, memo|
71
104
  memo[centroid] = []
72
105
  end
73
106
  end
74
107
 
75
- def self.median( list )
76
- sorted_list = list.sort
77
- len = list.size
108
# Registers +centroid+ in +output+ with a fresh, empty list of points,
# replacing whatever was stored under that key before.
def self.create_centroid( centroid, output )
  output.store( centroid, [] )
end
78
111
 
79
- sorted_list[( (len / 2 ) + 0.5 ).floor]
112
# Reduces the given centroids to their coordinates and records the largest
# :median found among them in @median_centroid.
#
# The maximum is recomputed from scratch on every call so that a value left
# behind by an earlier call cannot leak into this one; centroids that carry
# no :median are ignored when looking for the maximum.
#
#Arguments:
#  centroids: (Array [Hash]) points with :x, :y and optionally :median
#Returns:
#  Array of { x:, y: } hashes, in the same order as +centroids+
def self.process_centroids(centroids)
  @median_centroid = centroids.map { |point| point[:median] }.compact.max

  centroids.map { |point| { x: point[:x], y: point[:y] } }
end
81
119
 
82
120
  def self.get_key_values( points, key )
@@ -85,11 +123,13 @@ module Cluda
85
123
 
86
124
  def self.move_centroids( output )
87
125
  output.map do |(key, value)|
88
- x = median( get_key_values(value, :x) )
89
- y = median( get_key_values(value, :y) )
126
+ unless value.empty?
127
+ x = Cluda.median( get_key_values(value, :x) )
128
+ y = Cluda.median( get_key_values(value, :y) )
90
129
 
91
- { x: x, y: y }
92
- end
130
+ { x: x, y: y }
131
+ end
132
+ end.compact
93
133
  end
94
134
  end
95
135
  end
data/lib/cluda.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  $:.unshift File.expand_path('..', __FILE__ )
2
2
 
3
3
  require 'cluda/distances/distance'
4
+ require 'cluda/cluda_common'
4
5
  require 'cluda/kmeans'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cluda
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2014-01-29 00:00:00.000000000 Z
12
+ date: 2014-02-06 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
@@ -50,6 +50,7 @@ extensions: []
50
50
  extra_rdoc_files: []
51
51
  files:
52
52
  - lib/cluda.rb
53
+ - lib/cluda/cluda_common.rb
53
54
  - lib/cluda/kmeans.rb
54
55
  - lib/cluda/distances/distance.rb
55
56
  - lib/cluda/distances/euclidean.rb