cluda 0.0.2 → 0.1.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 84fc5617e8ce06d8b83eec5c0e6bfc1ea7cf1c43246f5040078324cf754089c7
4
+ data.tar.gz: 812430fa9d168cdd552b059ffaa2933e930a3d461d28d9e2991a36bf1e44a42c
5
+ SHA512:
6
+ metadata.gz: ff054734dc37f4a806dc3a2635e26eec634138abd6a1ceca982b6c54bc6910be36b2ccf00b5be6d7636e56ebf82b6ca122db783f8529ed8f972fdb3b944c10dd
7
+ data.tar.gz: b9e3fd5d71c2db328303288e17991cd892912f24e85fef0c3f0555a0bd06d42a35bb48ee609423b829f10aa015756946136052cd7fb8c04f5f0e0b67010b8ed0
@@ -1,5 +1,7 @@
1
- $:.unshift File.expand_path('..', __FILE__ )
1
+ # frozen_string_literal: true
2
+
3
+ $LOAD_PATH.unshift File.expand_path(__dir__)
2
4
 
3
5
  require 'cluda/distances/distance'
4
6
  require 'cluda/cluda_common'
5
- require 'cluda/kmeans'
7
+ require 'cluda/kmeans'
@@ -1,79 +1,78 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Cluda
4
+ module_function
5
+
2
6
  class InvalidPoint < RuntimeError; end
3
7
  class InvalidCentroid < RuntimeError; end
4
8
  class InvalidSmartPoint < RuntimeError; end
5
9
  class InvalidDistanceMethod < RuntimeError; end
6
-
10
+
7
11
  # For an output given by CluDA calculate the means for each centroid
8
12
  #
9
- #Example:
13
+ # Example:
10
14
  # > clusters = {{:x=>2, :y=>2}=>
11
15
  # [{:x=>1, :y=>1, :distance=>1.4142135623730951},
12
16
  # {:x=>2, :y=>1, :distance=>1.0},
13
17
  # {:x=>1, :y=>2, :distance=>1.0},
14
18
  # {:x=>2, :y=>2, :distance=>0.0}]}
15
19
  # > Cluda.median_for_centroids(clusters)
16
- #Arguments:
20
+ # Arguments:
17
21
  # points: ( Hash )
18
- def self.median_for_centroids ( points )
19
- points.keys.each do | centroid |
20
- validate_smart_points( points[centroid] )
22
+ def median_for_centroids(points)
23
+ points.each_key do |centroid|
24
+ validate_smart_points(points[centroid])
21
25
  end
22
26
 
23
27
  points.keys.map do |centroid|
24
- centroid.merge( median: median_for_centroid( centroid, points ) )
28
+ centroid.merge(median: median_for_centroid(centroid, points))
25
29
  end
26
30
  end
27
-
28
- protected
29
-
30
- def self.valid_class?( name )
31
- ['euclidean', 'chebyshev', 'manhattan'].include?( name.downcase )
31
+
32
+ def valid_class?(name)
33
+ %w[euclidean chebyshev manhattan].include?(name.downcase)
32
34
  end
33
35
 
34
- def self.validate_centroids( centroids )
35
- centroids = centroids.is_a?(Array) ? centroids : [ centroids ]
36
-
36
+ def validate_centroids(centroids)
37
+ centroids = centroids.is_a?(Array) ? centroids : [centroids]
38
+
37
39
  validate(centroids)
38
-
40
+
39
41
  centroids.each do |point|
40
- raise InvalidCentroid unless point.include?(:median)
42
+ raise InvalidCentroid unless point.include?(:median)
41
43
  end
42
44
 
43
45
  centroids
44
46
  end
45
-
46
- def self.validate_smart_points( points )
47
- points = points.is_a?(Array) ? points : [ points ]
48
-
49
- validate( points )
50
-
47
+
48
+ def validate_smart_points(points)
49
+ points = points.is_a?(Array) ? points : [points]
50
+
51
+ validate(points)
52
+
51
53
  points.each do |point|
52
- raise InvalidSmartPoint unless point.include?(:distance)
54
+ raise InvalidSmartPoint unless point.include?(:distance)
53
55
  end
54
56
 
55
57
  points
56
58
  end
57
-
58
- def self.validate( data )
59
- points = data.is_a?(Array) ? data : [ data ]
60
- points.each do |point|
61
- raise InvalidPoint unless point.is_a?(Hash) &&
62
- point.include?(:x) && point.include?(:y) &&
63
- point[:x].is_a?(Numeric) && point[:y].is_a?(Numeric)
64
- end
65
59
 
66
- points
60
+ def validate(data)
61
+ data = [data] unless data.is_a?(Array)
62
+
63
+ data.each do |point|
64
+ raise InvalidPoint unless point.is_a?(Hash) && point[:x].is_a?(Numeric) && point[:y].is_a?(Numeric)
65
+ end
67
66
  end
68
67
 
69
- def self.median_for_centroid( centroid, points )
70
- median( points[centroid].map{ |point| point[:distance] } )
68
+ def median_for_centroid(centroid, points)
69
+ median(points[centroid].map { |point| point[:distance] })
71
70
  end
72
71
 
73
- def self.median( list )
72
+ def median(list)
74
73
  sorted_list = list.sort
75
74
  len = list.size
76
75
 
77
- sorted_list[( (len / 2 ) + 0.5 ).floor]
76
+ sorted_list[((len / 2) + 0.5).floor]
78
77
  end
79
78
  end
@@ -1,10 +1,11 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Cluda
2
- class Chebyshev < Distance
3
-
4
+ class Chebyshev < Distance
4
5
  def self.distance(x0, x)
5
- Cluda.validate( [x0,x] )
6
+ Cluda.validate([x0, x])
6
7
 
7
- [ (x0[:x] - x[:x]).abs, (x0[:y] - x[:y]).abs ].max
8
+ [(x0[:x] - x[:x]).abs, (x0[:y] - x[:y]).abs].max
8
9
  end
9
10
  end
10
11
  end
@@ -1,9 +1,11 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Cluda
2
- class Distance
4
+ class Distance
3
5
  extend Math
4
-
5
- def self.distance(x0, x)
6
- raise ::NotImplementedError.new("You must implement distance method")
6
+
7
+ def self.distance(_x0, _x)
8
+ raise ::NotImplementedError, 'You must implement distance method'
7
9
  end
8
10
  end
9
11
  end
@@ -1,10 +1,11 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Cluda
2
- class Euclidean < Distance
3
-
4
+ class Euclidean < Distance
4
5
  def self.distance(x0, x)
5
- Cluda.validate( [x0,x] )
6
+ Cluda.validate([x0, x])
6
7
 
7
- sqrt( (x0[:x] - x[:x]) ** 2 + (x0[:y] - x[:y]) ** 2 )
8
+ sqrt((x0[:x] - x[:x])**2 + (x0[:y] - x[:y])**2)
8
9
  end
9
10
  end
10
11
  end
@@ -1,8 +1,9 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Cluda
2
- class Manhattan < Distance
3
-
4
+ class Manhattan < Distance
4
5
  def self.distance(x0, x)
5
- Cluda.validate( [x0,x] )
6
+ Cluda.validate([x0, x])
6
7
 
7
8
  (x0[:x] - x[:x]).abs + (x0[:y] - x[:y]).abs
8
9
  end
@@ -1,135 +1,169 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'cluda/distances/manhattan'
2
4
  require 'cluda/distances/euclidean'
3
5
  require 'cluda/distances/chebyshev'
4
6
 
5
7
  module Cluda
6
8
  class Kmeans
7
-
8
- DEFAULT_OPTS = { k: 1,
9
- centroids: nil,
10
- distance_method: 'euclidean',
9
+ DEFAULT_OPTS = { k: 1,
10
+ centroids: nil,
11
+ distance_method: 'euclidean',
11
12
  be_smart: false,
12
13
  margin_distance_percentage: 0,
13
- max_iterations: 50 }
14
-
15
- #Classify the points using KMeans as the clustering algorithm
16
- #
17
- #Example:
18
- # >> points = [ { x: 1, y: 1}, { x: 2, y: 1}, { x: 1, y: 2}, { x: 2, y: 2}, { x: 4, y: 6}, { x: 5, y: 7}, { x: 5, y: 6}, { x: 5, y: 5}, { x: 6, y: 6}, { x: 6, y: 5}]
19
- # >> Cluda::Kmeans.classify( points, k: 1, distance_method: 'euclidean', be_smart: true, max_iterations: 50)
20
- #Arguments:
21
- # list: (Array [Hash] )
22
- # k: (Numeric) *optional*
23
- # centroids: (Array) *optional*
24
- # distance_method: (String) *optional*
25
- # be_smart: (Boolean) *optional* [If you want CluDA to be smart you have to specify the centroids ]
26
- # margin_distance_percentage: (Numeric) *optional* [Between 0 and 1]
27
- # max_iterations: (Numeric) *optional*
28
- def self.classify( list, opts = {} )
29
- @opts = DEFAULT_OPTS.merge(opts)
30
-
31
- raise Cluda::InvalidDistanceMethod unless Cluda::valid_class?(@opts[:distance_method])
32
-
33
- _class = Cluda.const_get( @opts[:distance_method].downcase.capitalize )
34
-
35
- Cluda.validate( list )
36
- Cluda.validate_centroids( @opts[:centroids] ) if @opts[:be_smart]
37
-
38
- iter = 1
39
- max_iterations = @opts[:max_iterations]
40
- centroids = @opts[:centroids].nil? || @opts[:centroids].empty? ? initialize_centroids( list , @opts[:k]) : process_centroids( @opts[:centroids] )
41
- previous_centroids = nil
42
- smart_clustering = @opts[:be_smart]
43
- margin_distance_percentage = @opts[:margin_distance_percentage]
44
-
45
- while (iter < max_iterations) && (previous_centroids != centroids)
46
- output = init_output(centroids)
47
- margin = smart_clustering ? @median_centroid * margin_distance_percentage : 0
48
-
49
- list.each do |point|
50
- centroid, distance = nearest_centroid(point, centroids, _class)
51
-
52
- if smart_clustering && distance > ( @median_centroid + margin )
53
- @median_centroid = distance
54
- centroids << point
55
- create_centroid(point, output)
56
- centroid = point
57
- distance = 0
14
+ max_iterations: 50 }.freeze
15
+
16
+ class << self
17
+ # Classify the points using KMeans as the clustering algorithm
18
+ #
19
+ # Example:
20
+ # >> points = [
21
+ # { x: 1, y: 1},
22
+ # { x: 2, y: 1},
23
+ # { x: 1, y: 2},
24
+ # { x: 2, y: 2},
25
+ # { x: 4, y: 6},
26
+ # { x: 5, y: 7},
27
+ # { x: 5, y: 6},
28
+ # { x: 5, y: 5},
29
+ # { x: 6, y: 6},
30
+ # { x: 6, y: 5}
31
+ # ]
32
+ # >> Cluda::Kmeans.classify( points, k: 1, distance_method: 'euclidean', be_smart: true, max_iterations: 50)
33
+ # Arguments:
34
+ # list: (Array [Hash] )
35
+ # k: (Numeric) *optional*
36
+ # centroids: (Array) *optional*
37
+ # distance_method: (String) *optional*
38
+ # [If you want CluDA to be smart you have to specify the centroids ]
39
+ # be_smart: (Boolean) *optional*
40
+ # margin_distance_percentage: (Numeric) *optional* [Between 0 and 1]
41
+ # max_iterations: (Numeric) *optional*
42
+ def classify(list, opts = {})
43
+ Cluda.validate(list)
44
+ Cluda.validate_centroids(opts[:centroids]) if opts[:be_smart]
45
+
46
+ config = generate_config(list, opts)
47
+
48
+ raise Cluda::InvalidDistanceMethod unless Cluda.valid_class?(config[:distance_method])
49
+
50
+ calculate_clusters(list, **config)
51
+ end
52
+
53
+ def generate_config(list, opts)
54
+ config = DEFAULT_OPTS.merge(opts)
55
+
56
+ centroids_present?(config) ? process_centroids(config) : initialize_centroids(list, config)
57
+
58
+ config[:margin] = config[:be_smart] ? config[:median_centroid] * config[:margin_distance_percentage] : 0
59
+
60
+ config
61
+ end
62
+
63
+ def process_centroids(config)
64
+ config[:centroids].each do |point|
65
+ if config[:median_centroid].nil? || config[:median_centroid] < point[:median]
66
+ config[:median_centroid] = point[:median]
58
67
  end
59
68
 
60
- output[centroid] << point.merge( distance: distance )
69
+ point.delete_if { |k, _| !%i[x y].include? k }
61
70
  end
62
71
 
63
- iter += 1
64
- previous_centroids = centroids
65
- centroids = move_centroids( output )
72
+ config
66
73
  end
67
74
 
68
- output
69
- end
75
+ def initialize_centroids(list, config)
76
+ return config if list.empty? || config[:k] > list.size
77
+
78
+ config[:centroids] = list.shuffle(random: Random.new(rand(0...config[:k])))[0...config[:k]]
79
+
80
+ config
81
+ end
82
+
83
+ def nearest_centroid(point, centroids, klass = Cluda::Euclidean)
84
+ return nil if centroids.empty?
85
+
86
+ Cluda.validate(point)
70
87
 
71
- protected
72
-
73
- def self.nearest_centroid(point, centroids, _class = Cluda::Euclidean )
74
- return nil if centroids.empty?
75
-
76
- Cluda.validate( point )
77
-
78
- nearest_centroid = centroids[0]
79
- min_distance = _class.distance(point, nearest_centroid)
80
-
81
- centroids.each do |centroid|
82
- new_distance = _class.distance(point, centroid)
83
- if new_distance < min_distance
84
- min_distance = new_distance
85
- nearest_centroid = centroid
88
+ nearest_centroid = centroids[0]
89
+ min_distance = klass.distance(point, nearest_centroid)
90
+
91
+ centroids.each do |centroid|
92
+ new_distance = klass.distance(point, centroid)
93
+ if new_distance < min_distance
94
+ min_distance = new_distance
95
+ nearest_centroid = centroid
96
+ end
86
97
  end
98
+
99
+ [nearest_centroid, min_distance]
87
100
  end
88
101
 
89
- [nearest_centroid, min_distance]
90
- end
102
+ private
91
103
 
92
- def self.initialize_centroids( list , k )
93
- Cluda.validate( list )
104
+ def calculate_clusters(list, centroids:, distance_method:, **config)
105
+ cluster = {}
94
106
 
95
- return [] if list.empty? || k > list.size
107
+ previous_centroids = nil
108
+ klass = Cluda.const_get(distance_method.downcase.capitalize)
96
109
 
97
- list.shuffle( random: Random.new.rand(0...k) )[0...k]
98
- end
110
+ config[:max_iterations].times do
111
+ break if previous_centroids == centroids
99
112
 
100
- private
113
+ cluster = assign_points_to_centroids(list, centroids, klass, config)
101
114
 
102
- def self.init_output(centroids)
103
- centroids.each_with_object({}) do |centroid, memo|
104
- memo[centroid] = []
115
+ previous_centroids = centroids
116
+ centroids = move_centroids(cluster)
117
+ end
118
+
119
+ cluster
105
120
  end
106
- end
107
121
 
108
- def self.create_centroid( centroid, output )
109
- output[centroid] = []
110
- end
122
+ def centroids_present?(opts)
123
+ !(opts[:centroids].nil? || opts[:centroids].empty?)
124
+ end
125
+
126
+ def init_cluster(centroids)
127
+ centroids.each_with_object({}) do |centroid, memo|
128
+ memo[centroid] = []
129
+ end
130
+ end
111
131
 
112
- def self.process_centroids(centroids)
113
- centroids.each_with_object([]) do |point, memo|
114
- @median_centroid = point[:median] if @median_centroid.nil? || @median_centroid < point[:median]
115
-
116
- memo << { x: point[:x], y: point[:y] }
132
+ def create_centroid(centroid, output)
133
+ output[centroid] = []
117
134
  end
118
- end
119
135
 
120
- def self.get_key_values( points, key )
121
- points.map { |point| point[key] }
122
- end
123
-
124
- def self.move_centroids( output )
125
- output.map do |(key, value)|
126
- unless value.empty?
127
- x = Cluda.median( get_key_values(value, :x) )
128
- y = Cluda.median( get_key_values(value, :y) )
129
-
136
+ def get_key_values(points, key)
137
+ points.map { |point| point[key] }
138
+ end
139
+
140
+ def move_centroids(output)
141
+ output.map do |(_key, value)|
142
+ next if value.empty?
143
+
144
+ x = Cluda.median(get_key_values(value, :x))
145
+ y = Cluda.median(get_key_values(value, :y))
146
+
130
147
  { x: x, y: y }
148
+ end.compact
149
+ end
150
+
151
+ def assign_points_to_centroids(list, centroids, klass, config)
152
+ list.each_with_object({}) do |point, cluster|
153
+ centroid, distance = nearest_centroid(point, centroids, klass)
154
+
155
+ if config[:be_smart] && distance > (config[:median_centroid] + config[:margin])
156
+ config[:median_centroid] = distance
157
+ centroids << point
158
+ create_centroid(point, cluster)
159
+ centroid = point
160
+ distance = 0
161
+ end
162
+
163
+ cluster[centroid] ||= []
164
+ cluster[centroid] << point.merge(distance: distance)
131
165
  end
132
- end.compact
166
+ end
133
167
  end
134
168
  end
135
169
  end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Cluda
4
+ VERSION = '0.1.0'
5
+ end
metadata CHANGED
@@ -1,84 +1,92 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cluda
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
5
- prerelease:
4
+ version: 0.1.0
6
5
  platform: ruby
7
6
  authors:
8
- - Enrique Figuerola
7
+ - Enrique M Figuerola Gomez
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2014-02-06 00:00:00.000000000 Z
11
+ date: 2020-03-19 00:00:00.000000000 Z
13
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rake
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '12.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '12.3'
14
27
  - !ruby/object:Gem::Dependency
15
28
  name: rspec
16
29
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
30
  requirements:
19
- - - ! '>='
31
+ - - "~>"
20
32
  - !ruby/object:Gem::Version
21
- version: 2.11.0
33
+ version: '3.8'
22
34
  type: :development
23
35
  prerelease: false
24
36
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
37
  requirements:
27
- - - ! '>='
38
+ - - "~>"
28
39
  - !ruby/object:Gem::Version
29
- version: 2.11.0
40
+ version: '3.8'
30
41
  - !ruby/object:Gem::Dependency
31
- name: rake
42
+ name: rubocop
32
43
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
44
  requirements:
35
- - - ! '>='
45
+ - - "~>"
36
46
  - !ruby/object:Gem::Version
37
- version: '0'
47
+ version: '0.70'
38
48
  type: :development
39
49
  prerelease: false
40
50
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
51
  requirements:
43
- - - ! '>='
52
+ - - "~>"
44
53
  - !ruby/object:Gem::Version
45
- version: '0'
54
+ version: '0.70'
46
55
  description: CLustering Data Analysis gem
47
- email: hard_rock15@msn.com
56
+ email: me@emfigo.com
48
57
  executables: []
49
58
  extensions: []
50
59
  extra_rdoc_files: []
51
60
  files:
52
61
  - lib/cluda.rb
53
62
  - lib/cluda/cluda_common.rb
54
- - lib/cluda/kmeans.rb
63
+ - lib/cluda/distances/chebyshev.rb
55
64
  - lib/cluda/distances/distance.rb
56
65
  - lib/cluda/distances/euclidean.rb
57
66
  - lib/cluda/distances/manhattan.rb
58
- - lib/cluda/distances/chebyshev.rb
67
+ - lib/cluda/kmeans.rb
68
+ - lib/cluda/version.rb
59
69
  homepage: https://github.com/emfigo/cluda
60
70
  licenses:
61
71
  - MIT
72
+ metadata: {}
62
73
  post_install_message:
63
74
  rdoc_options: []
64
75
  require_paths:
65
76
  - lib
66
77
  required_ruby_version: !ruby/object:Gem::Requirement
67
- none: false
68
78
  requirements:
69
- - - ! '>='
79
+ - - ">="
70
80
  - !ruby/object:Gem::Version
71
- version: '0'
81
+ version: '2.3'
72
82
  required_rubygems_version: !ruby/object:Gem::Requirement
73
- none: false
74
83
  requirements:
75
- - - ! '>='
84
+ - - ">="
76
85
  - !ruby/object:Gem::Version
77
86
  version: '0'
78
87
  requirements: []
79
- rubyforge_project:
80
- rubygems_version: 1.8.23
88
+ rubygems_version: 3.0.3
81
89
  signing_key:
82
- specification_version: 3
90
+ specification_version: 4
83
91
  summary: CLuDA
84
92
  test_files: []