cluda 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,79 @@
# Shared error classes, validation helpers and median utilities used by the
# CluDA clustering code (e.g. Cluda::Kmeans and the distance classes).
module Cluda
  class InvalidPoint < RuntimeError; end
  class InvalidCentroid < RuntimeError; end
  class InvalidSmartPoint < RuntimeError; end
  class InvalidDistanceMethod < RuntimeError; end

  # Names of the distance methods accepted by valid_class? (lower-case).
  DISTANCE_METHODS = %w[euclidean chebyshev manhattan].freeze

  # For an output given by CluDA calculate the median distance for each centroid
  #
  #Example:
  #   > clusters = {{:x=>2, :y=>2}=>
  #                   [{:x=>1, :y=>1, :distance=>1.4142135623730951},
  #                    {:x=>2, :y=>1, :distance=>1.0},
  #                    {:x=>1, :y=>2, :distance=>1.0},
  #                    {:x=>2, :y=>2, :distance=>0.0}]}
  #   > Cluda.median_for_centroids(clusters)
  #   => [{:x=>2, :y=>2, :median=>1.0}]
  #Arguments:
  #   points: ( Hash ) centroid => Array of points carrying a :distance key
  #Returns:
  #   Array with one Hash per centroid: the centroid merged with its :median
  #Raises:
  #   InvalidPoint / InvalidSmartPoint when a clustered point is malformed
  def self.median_for_centroids( points )
    # Validate every cluster up front so nothing is computed on bad input.
    points.each_value do |members|
      validate_smart_points( members )
    end

    points.keys.map do |centroid|
      centroid.merge( median: median_for_centroid( centroid, points ) )
    end
  end

  # NOTE: the helpers below are intentionally public. A bare `protected`
  # keyword does not change the visibility of `def self.` methods, and the
  # other CluDA files call them as Cluda.validate, Cluda.median,
  # Cluda.validate_centroids and Cluda.valid_class?.

  # True when +name+ is a supported distance method (case-insensitive).
  # Strings and Symbols are accepted; nil or any other value yields false
  # instead of raising NoMethodError.
  def self.valid_class?( name )
    DISTANCE_METHODS.include?( name.to_s.downcase )
  end

  # Ensure every centroid is a valid point that also carries a :median key.
  # A single centroid is wrapped into an Array.
  #
  #Returns:
  #   Array of the validated centroids
  #Raises:
  #   InvalidPoint when a centroid is not a valid point
  #   InvalidCentroid when a centroid has no :median key
  def self.validate_centroids( centroids )
    centroids = centroids.is_a?(Array) ? centroids : [ centroids ]

    validate( centroids )

    centroids.each do |centroid|
      raise InvalidCentroid unless centroid.key?(:median)
    end

    centroids
  end

  # Ensure every point is a valid point that also carries a :distance key.
  # A single point is wrapped into an Array.
  #
  #Returns:
  #   Array of the validated points
  #Raises:
  #   InvalidPoint when an element is not a valid point
  #   InvalidSmartPoint when a point has no :distance key
  def self.validate_smart_points( points )
    points = points.is_a?(Array) ? points : [ points ]

    validate( points )

    points.each do |point|
      raise InvalidSmartPoint unless point.key?(:distance)
    end

    points
  end

  # Ensure +data+ (a single point or an Array of points) only contains
  # Hashes with Numeric :x and :y values.
  #
  #Returns:
  #   Array of the validated points
  #Raises:
  #   InvalidPoint on the first element that does not match
  def self.validate( data )
    # Array(data) is deliberately not used: it would turn a Hash into pairs.
    points = data.is_a?(Array) ? data : [ data ]

    points.each do |point|
      well_formed = point.is_a?(Hash) &&
                    point[:x].is_a?(Numeric) &&
                    point[:y].is_a?(Numeric)

      raise InvalidPoint unless well_formed
    end

    points
  end

  # Median of the :distance values of the points assigned to +centroid+.
  def self.median_for_centroid( centroid, points )
    median( points[centroid].map { |point| point[:distance] } )
  end

  # Middle element of the sorted +list+.
  # For an even-sized list this is the upper of the two middle elements
  # (no averaging); for an empty list it is nil.
  def self.median( list )
    list.sort[list.size / 2]
  end
end
@@ -2,7 +2,7 @@ module Cluda
2
2
  class Chebyshev < Distance
3
3
 
4
4
  def self.distance(x0, x)
5
- validate( [x0,x] )
5
+ Cluda.validate( [x0,x] )
6
6
 
7
7
  [ (x0[:x] - x[:x]).abs, (x0[:y] - x[:y]).abs ].max
8
8
  end
@@ -1,24 +1,9 @@
1
1
  module Cluda
2
- class InvalidPoint < RuntimeError; end
3
-
4
2
  class Distance
5
3
  extend Math
6
4
 
7
5
  def self.distance(x0, x)
8
6
  raise ::NotImplementedError.new("You must implement distance method")
9
7
  end
10
-
11
- protected
12
-
13
- def self.validate( data )
14
- points = data.is_a?(Array) ? data : [ data ]
15
- points.each do |point|
16
- raise InvalidPoint unless point.is_a?(Hash) &&
17
- point.include?(:x) && point.include?(:y) &&
18
- point[:x].is_a?(Numeric) && point[:y].is_a?(Numeric)
19
- end
20
-
21
- points
22
- end
23
8
  end
24
9
  end
@@ -2,7 +2,7 @@ module Cluda
2
2
  class Euclidean < Distance
3
3
 
4
4
  def self.distance(x0, x)
5
- validate( [x0,x] )
5
+ Cluda.validate( [x0,x] )
6
6
 
7
7
  sqrt( (x0[:x] - x[:x]) ** 2 + (x0[:y] - x[:y]) ** 2 )
8
8
  end
@@ -2,7 +2,7 @@ module Cluda
2
2
  class Manhattan < Distance
3
3
 
4
4
  def self.distance(x0, x)
5
- validate( [x0,x] )
5
+ Cluda.validate( [x0,x] )
6
6
 
7
7
  (x0[:x] - x[:x]).abs + (x0[:y] - x[:y]).abs
8
8
  end
data/lib/cluda/kmeans.rb CHANGED
@@ -3,24 +3,61 @@ require 'cluda/distances/euclidean'
3
3
  require 'cluda/distances/chebyshev'
4
4
 
5
5
  module Cluda
6
- class InvalidDistanceMethod < RuntimeError; end
7
-
8
6
  class Kmeans
9
- def self.classify( list, k, class_name = 'euclidean', max_iterations = 50 )
10
- raise InvalidDistanceMethod unless valid_class?(class_name)
7
+
8
+ DEFAULT_OPTS = { k: 1,
9
+ centroids: nil,
10
+ distance_method: 'euclidean',
11
+ be_smart: false,
12
+ margin_distance_percentage: 0,
13
+ max_iterations: 50 }
14
+
15
+ #Classify the points using KMeans as the clustering algorithm
16
+ #
17
+ #Example:
18
+ # >> points = [ { x: 1, y: 1}, { x: 2, y: 1}, { x: 1, y: 2}, { x: 2, y: 2}, { x: 4, y: 6}, { x: 5, y: 7}, { x: 5, y: 6}, { x: 5, y: 5}, { x: 6, y: 6}, { x: 6, y: 5}]
19
+ # >> Cluda::Kmeans.classify( points, k: 1, distance_method: 'euclidean', be_smart: true, max_iterations: 50)
20
+ #Arguments:
21
+ # list: (Array [Hash] )
22
+ # k: (Numeric) *optional*
23
+ # centroids: (Array) *optional*
24
+ # distance_method: (String) *optional*
25
+ # be_smart: (Boolean) *optional* [If you want CluDA to be smart you have to specify the centroids ]
26
+ # margin_distance_percentage: (Numeric) *optional* [Between 0 and 1]
27
+ # max_iterations: (Numeric) *optional*
28
+ def self.classify( list, opts = {} )
29
+ @opts = DEFAULT_OPTS.merge(opts)
30
+
31
+ raise Cluda::InvalidDistanceMethod unless Cluda::valid_class?(@opts[:distance_method])
32
+
33
+ _class = Cluda.const_get( @opts[:distance_method].downcase.capitalize )
11
34
 
12
- _class = Cluda.const_get( class_name.downcase.capitalize )
13
- _class.validate( list )
35
+ Cluda.validate( list )
36
+ Cluda.validate_centroids( @opts[:centroids] ) if @opts[:be_smart]
14
37
 
15
38
  iter = 1
16
- previous_centroids = nil
17
- centroids = initialize_centroids( list , k, _class )
39
+ max_iterations = @opts[:max_iterations]
40
+ centroids = @opts[:centroids].nil? || @opts[:centroids].empty? ? initialize_centroids( list , @opts[:k]) : process_centroids( @opts[:centroids] )
41
+ previous_centroids = nil
42
+ smart_clustering = @opts[:be_smart]
43
+ margin_distance_percentage = @opts[:margin_distance_percentage]
18
44
 
19
45
  while (iter < max_iterations) && (previous_centroids != centroids)
20
46
  output = init_output(centroids)
21
-
47
+ margin = smart_clustering ? @median_centroid * margin_distance_percentage : 0
48
+
22
49
  list.each do |point|
23
- output[nearest_centroid(point, centroids, _class)] << point
50
+ centroid, distance = nearest_centroid(point, centroids, _class)
51
+
52
+ if smart_clustering && distance > ( @median_centroid + margin )
53
+ @median_centroid = distance
54
+ centroids << point
55
+ create_centroid(point, output)
56
+ centroid = point
57
+ distance = 0
58
+ end
59
+
60
+ output[centroid] << point.merge( distance: distance )
24
61
  end
25
62
 
26
63
  iter += 1
@@ -36,7 +73,7 @@ module Cluda
36
73
  def self.nearest_centroid(point, centroids, _class = Cluda::Euclidean )
37
74
  return nil if centroids.empty?
38
75
 
39
- _class.validate( point )
76
+ Cluda.validate( point )
40
77
 
41
78
  nearest_centroid = centroids[0]
42
79
  min_distance = _class.distance(point, nearest_centroid)
@@ -49,11 +86,11 @@ module Cluda
49
86
  end
50
87
  end
51
88
 
52
- nearest_centroid
89
+ [nearest_centroid, min_distance]
53
90
  end
54
91
 
55
- def self.initialize_centroids( list , k, _class = Cluda::Euclidean )
56
- _class.validate( list )
92
+ def self.initialize_centroids( list , k )
93
+ Cluda.validate( list )
57
94
 
58
95
  return [] if list.empty? || k > list.size
59
96
 
@@ -62,21 +99,22 @@ module Cluda
62
99
 
63
100
  private
64
101
 
65
- def self.valid_class?( name )
66
- ['euclidean', 'chebyshev', 'manhattan'].include?( name.downcase )
67
- end
68
-
69
102
  def self.init_output(centroids)
70
103
  centroids.each_with_object({}) do |centroid, memo|
71
104
  memo[centroid] = []
72
105
  end
73
106
  end
74
107
 
75
- def self.median( list )
76
- sorted_list = list.sort
77
- len = list.size
108
+ def self.create_centroid( centroid, output )
109
+ output[centroid] = []
110
+ end
78
111
 
79
- sorted_list[( (len / 2 ) + 0.5 ).floor]
112
+ def self.process_centroids(centroids)
113
+ centroids.each_with_object([]) do |point, memo|
114
+ @median_centroid = point[:median] if @median_centroid.nil? || @median_centroid < point[:median]
115
+
116
+ memo << { x: point[:x], y: point[:y] }
117
+ end
80
118
  end
81
119
 
82
120
  def self.get_key_values( points, key )
@@ -85,11 +123,13 @@ module Cluda
85
123
 
86
124
  def self.move_centroids( output )
87
125
  output.map do |(key, value)|
88
- x = median( get_key_values(value, :x) )
89
- y = median( get_key_values(value, :y) )
126
+ unless value.empty?
127
+ x = Cluda.median( get_key_values(value, :x) )
128
+ y = Cluda.median( get_key_values(value, :y) )
90
129
 
91
- { x: x, y: y }
92
- end
130
+ { x: x, y: y }
131
+ end
132
+ end.compact
93
133
  end
94
134
  end
95
135
  end
data/lib/cluda.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  $:.unshift File.expand_path('..', __FILE__ )
2
2
 
3
3
  require 'cluda/distances/distance'
4
+ require 'cluda/cluda_common'
4
5
  require 'cluda/kmeans'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cluda
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2014-01-29 00:00:00.000000000 Z
12
+ date: 2014-02-06 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
@@ -50,6 +50,7 @@ extensions: []
50
50
  extra_rdoc_files: []
51
51
  files:
52
52
  - lib/cluda.rb
53
+ - lib/cluda/cluda_common.rb
53
54
  - lib/cluda/kmeans.rb
54
55
  - lib/cluda/distances/distance.rb
55
56
  - lib/cluda/distances/euclidean.rb