cluda 0.0.2 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/cluda.rb +4 -2
- data/lib/cluda/cluda_common.rb +36 -37
- data/lib/cluda/distances/chebyshev.rb +5 -4
- data/lib/cluda/distances/distance.rb +6 -4
- data/lib/cluda/distances/euclidean.rb +5 -4
- data/lib/cluda/distances/manhattan.rb +4 -3
- data/lib/cluda/kmeans.rb +136 -102
- data/lib/cluda/version.rb +5 -0
- metadata +36 -28
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 84fc5617e8ce06d8b83eec5c0e6bfc1ea7cf1c43246f5040078324cf754089c7
|
4
|
+
data.tar.gz: 812430fa9d168cdd552b059ffaa2933e930a3d461d28d9e2991a36bf1e44a42c
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ff054734dc37f4a806dc3a2635e26eec634138abd6a1ceca982b6c54bc6910be36b2ccf00b5be6d7636e56ebf82b6ca122db783f8529ed8f972fdb3b944c10dd
|
7
|
+
data.tar.gz: b9e3fd5d71c2db328303288e17991cd892912f24e85fef0c3f0555a0bd06d42a35bb48ee609423b829f10aa015756946136052cd7fb8c04f5f0e0b67010b8ed0
|
data/lib/cluda.rb
CHANGED
data/lib/cluda/cluda_common.rb
CHANGED
@@ -1,79 +1,78 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Cluda
|
4
|
+
module_function
|
5
|
+
|
2
6
|
class InvalidPoint < RuntimeError; end
|
3
7
|
class InvalidCentroid < RuntimeError; end
|
4
8
|
class InvalidSmartPoint < RuntimeError; end
|
5
9
|
class InvalidDistanceMethod < RuntimeError; end
|
6
|
-
|
10
|
+
|
7
11
|
# For an output given by CluDA calculate the means for each centroid
|
8
12
|
#
|
9
|
-
#Example:
|
13
|
+
# Example:
|
10
14
|
# > clusters = {{:x=>2, :y=>2}=>
|
11
15
|
# [{:x=>1, :y=>1, :distance=>1.4142135623730951},
|
12
16
|
# {:x=>2, :y=>1, :distance=>1.0},
|
13
17
|
# {:x=>1, :y=>2, :distance=>1.0},
|
14
18
|
# {:x=>2, :y=>2, :distance=>0.0}]}
|
15
19
|
# > Cluda.median_for_centroids(clusters)
|
16
|
-
#Arguments:
|
20
|
+
# Arguments:
|
17
21
|
# points: ( Hash )
|
18
|
-
def
|
19
|
-
points.
|
20
|
-
validate_smart_points(
|
22
|
+
def median_for_centroids(points)
|
23
|
+
points.each_key do |centroid|
|
24
|
+
validate_smart_points(points[centroid])
|
21
25
|
end
|
22
26
|
|
23
27
|
points.keys.map do |centroid|
|
24
|
-
centroid.merge(
|
28
|
+
centroid.merge(median: median_for_centroid(centroid, points))
|
25
29
|
end
|
26
30
|
end
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
def self.valid_class?( name )
|
31
|
-
['euclidean', 'chebyshev', 'manhattan'].include?( name.downcase )
|
31
|
+
|
32
|
+
def valid_class?(name)
|
33
|
+
%w[euclidean chebyshev manhattan].include?(name.downcase)
|
32
34
|
end
|
33
35
|
|
34
|
-
def
|
35
|
-
centroids = centroids.is_a?(Array) ? centroids : [
|
36
|
-
|
36
|
+
def validate_centroids(centroids)
|
37
|
+
centroids = centroids.is_a?(Array) ? centroids : [centroids]
|
38
|
+
|
37
39
|
validate(centroids)
|
38
|
-
|
40
|
+
|
39
41
|
centroids.each do |point|
|
40
|
-
raise InvalidCentroid unless
|
42
|
+
raise InvalidCentroid unless point.include?(:median)
|
41
43
|
end
|
42
44
|
|
43
45
|
centroids
|
44
46
|
end
|
45
|
-
|
46
|
-
def
|
47
|
-
points = points.is_a?(Array) ? points : [
|
48
|
-
|
49
|
-
validate(
|
50
|
-
|
47
|
+
|
48
|
+
def validate_smart_points(points)
|
49
|
+
points = points.is_a?(Array) ? points : [points]
|
50
|
+
|
51
|
+
validate(points)
|
52
|
+
|
51
53
|
points.each do |point|
|
52
|
-
raise InvalidSmartPoint unless
|
54
|
+
raise InvalidSmartPoint unless point.include?(:distance)
|
53
55
|
end
|
54
56
|
|
55
57
|
points
|
56
58
|
end
|
57
|
-
|
58
|
-
def self.validate( data )
|
59
|
-
points = data.is_a?(Array) ? data : [ data ]
|
60
|
-
points.each do |point|
|
61
|
-
raise InvalidPoint unless point.is_a?(Hash) &&
|
62
|
-
point.include?(:x) && point.include?(:y) &&
|
63
|
-
point[:x].is_a?(Numeric) && point[:y].is_a?(Numeric)
|
64
|
-
end
|
65
59
|
|
66
|
-
|
60
|
+
def validate(data)
|
61
|
+
data = [data] unless data.is_a?(Array)
|
62
|
+
|
63
|
+
data.each do |point|
|
64
|
+
raise InvalidPoint unless point.is_a?(Hash) && point[:x].is_a?(Numeric) && point[:y].is_a?(Numeric)
|
65
|
+
end
|
67
66
|
end
|
68
67
|
|
69
|
-
def
|
70
|
-
median(
|
68
|
+
def median_for_centroid(centroid, points)
|
69
|
+
median(points[centroid].map { |point| point[:distance] })
|
71
70
|
end
|
72
71
|
|
73
|
-
def
|
72
|
+
def median(list)
|
74
73
|
sorted_list = list.sort
|
75
74
|
len = list.size
|
76
75
|
|
77
|
-
sorted_list[(
|
76
|
+
sorted_list[((len / 2) + 0.5).floor]
|
78
77
|
end
|
79
78
|
end
|
@@ -1,10 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Cluda
|
2
|
-
class Chebyshev <
|
3
|
-
|
4
|
+
class Chebyshev < Distance
|
4
5
|
def self.distance(x0, x)
|
5
|
-
Cluda.validate(
|
6
|
+
Cluda.validate([x0, x])
|
6
7
|
|
7
|
-
[
|
8
|
+
[(x0[:x] - x[:x]).abs, (x0[:y] - x[:y]).abs].max
|
8
9
|
end
|
9
10
|
end
|
10
11
|
end
|
@@ -1,9 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Cluda
|
2
|
-
class Distance
|
4
|
+
class Distance
|
3
5
|
extend Math
|
4
|
-
|
5
|
-
def self.distance(
|
6
|
-
raise ::NotImplementedError
|
6
|
+
|
7
|
+
def self.distance(_x0, _x)
|
8
|
+
raise ::NotImplementedError, 'You must implement distance method'
|
7
9
|
end
|
8
10
|
end
|
9
11
|
end
|
@@ -1,10 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Cluda
|
2
|
-
class Euclidean <
|
3
|
-
|
4
|
+
class Euclidean < Distance
|
4
5
|
def self.distance(x0, x)
|
5
|
-
Cluda.validate(
|
6
|
+
Cluda.validate([x0, x])
|
6
7
|
|
7
|
-
sqrt(
|
8
|
+
sqrt((x0[:x] - x[:x])**2 + (x0[:y] - x[:y])**2)
|
8
9
|
end
|
9
10
|
end
|
10
11
|
end
|
data/lib/cluda/kmeans.rb
CHANGED
@@ -1,135 +1,169 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'cluda/distances/manhattan'
|
2
4
|
require 'cluda/distances/euclidean'
|
3
5
|
require 'cluda/distances/chebyshev'
|
4
6
|
|
5
7
|
module Cluda
|
6
8
|
class Kmeans
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
distance_method: 'euclidean',
|
9
|
+
DEFAULT_OPTS = { k: 1,
|
10
|
+
centroids: nil,
|
11
|
+
distance_method: 'euclidean',
|
11
12
|
be_smart: false,
|
12
13
|
margin_distance_percentage: 0,
|
13
|
-
max_iterations: 50 }
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
list
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
14
|
+
max_iterations: 50 }.freeze
|
15
|
+
|
16
|
+
class << self
|
17
|
+
# Classify the points using KMeans as the clustering algorithm
|
18
|
+
#
|
19
|
+
# Example:
|
20
|
+
# >> points = [
|
21
|
+
# { x: 1, y: 1},
|
22
|
+
# { x: 2, y: 1},
|
23
|
+
# { x: 1, y: 2},
|
24
|
+
# { x: 2, y: 2},
|
25
|
+
# { x: 4, y: 6},
|
26
|
+
# { x: 5, y: 7},
|
27
|
+
# { x: 5, y: 6},
|
28
|
+
# { x: 5, y: 5},
|
29
|
+
# { x: 6, y: 6},
|
30
|
+
# { x: 6, y: 5}
|
31
|
+
# ]
|
32
|
+
# >> Cluda::Kmeans.classify( points, k: 1, distance_method: 'euclidean', be_smart: true, max_iterations: 50)
|
33
|
+
# Arguments:
|
34
|
+
# list: (Array [Hash] )
|
35
|
+
# k: (Numeric) *optional*
|
36
|
+
# centroids: (Array) *optional*
|
37
|
+
# distance_method: (String) *optional*
|
38
|
+
# [If you want CluDA to be smart you have to specify the centroids ]
|
39
|
+
# be_smart: (Boolean) *optional*
|
40
|
+
# margin_distance_percentage: (Numeric) *optional* [Between 0 and 1]
|
41
|
+
# max_iterations: (Numeric) *optional*
|
42
|
+
def classify(list, opts = {})
|
43
|
+
Cluda.validate(list)
|
44
|
+
Cluda.validate_centroids(opts[:centroids]) if opts[:be_smart]
|
45
|
+
|
46
|
+
config = generate_config(list, opts)
|
47
|
+
|
48
|
+
raise Cluda::InvalidDistanceMethod unless Cluda.valid_class?(config[:distance_method])
|
49
|
+
|
50
|
+
calculate_clusters(list, **config)
|
51
|
+
end
|
52
|
+
|
53
|
+
def generate_config(list, opts)
|
54
|
+
config = DEFAULT_OPTS.merge(opts)
|
55
|
+
|
56
|
+
centroids_present?(config) ? process_centroids(config) : initialize_centroids(list, config)
|
57
|
+
|
58
|
+
config[:margin] = config[:be_smart] ? config[:median_centroid] * config[:margin_distance_percentage] : 0
|
59
|
+
|
60
|
+
config
|
61
|
+
end
|
62
|
+
|
63
|
+
def process_centroids(config)
|
64
|
+
config[:centroids].each do |point|
|
65
|
+
if config[:median_centroid].nil? || config[:median_centroid] < point[:median]
|
66
|
+
config[:median_centroid] = point[:median]
|
58
67
|
end
|
59
68
|
|
60
|
-
|
69
|
+
point.delete_if { |k, _| !%i[x y].include? k }
|
61
70
|
end
|
62
71
|
|
63
|
-
|
64
|
-
previous_centroids = centroids
|
65
|
-
centroids = move_centroids( output )
|
72
|
+
config
|
66
73
|
end
|
67
74
|
|
68
|
-
|
69
|
-
|
75
|
+
def initialize_centroids(list, config)
|
76
|
+
return config if list.empty? || config[:k] > list.size
|
77
|
+
|
78
|
+
config[:centroids] = list.shuffle(random: Random.new(rand(0...config[:k])))[0...config[:k]]
|
79
|
+
|
80
|
+
config
|
81
|
+
end
|
82
|
+
|
83
|
+
def nearest_centroid(point, centroids, klass = Cluda::Euclidean)
|
84
|
+
return nil if centroids.empty?
|
85
|
+
|
86
|
+
Cluda.validate(point)
|
70
87
|
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
centroids.each do |centroid|
|
82
|
-
new_distance = _class.distance(point, centroid)
|
83
|
-
if new_distance < min_distance
|
84
|
-
min_distance = new_distance
|
85
|
-
nearest_centroid = centroid
|
88
|
+
nearest_centroid = centroids[0]
|
89
|
+
min_distance = klass.distance(point, nearest_centroid)
|
90
|
+
|
91
|
+
centroids.each do |centroid|
|
92
|
+
new_distance = klass.distance(point, centroid)
|
93
|
+
if new_distance < min_distance
|
94
|
+
min_distance = new_distance
|
95
|
+
nearest_centroid = centroid
|
96
|
+
end
|
86
97
|
end
|
98
|
+
|
99
|
+
[nearest_centroid, min_distance]
|
87
100
|
end
|
88
101
|
|
89
|
-
|
90
|
-
end
|
102
|
+
private
|
91
103
|
|
92
|
-
|
93
|
-
|
104
|
+
def calculate_clusters(list, centroids:, distance_method:, **config)
|
105
|
+
cluster = {}
|
94
106
|
|
95
|
-
|
107
|
+
previous_centroids = nil
|
108
|
+
klass = Cluda.const_get(distance_method.downcase.capitalize)
|
96
109
|
|
97
|
-
|
98
|
-
|
110
|
+
config[:max_iterations].times do
|
111
|
+
break if previous_centroids == centroids
|
99
112
|
|
100
|
-
|
113
|
+
cluster = assign_points_to_centroids(list, centroids, klass, config)
|
101
114
|
|
102
|
-
|
103
|
-
|
104
|
-
|
115
|
+
previous_centroids = centroids
|
116
|
+
centroids = move_centroids(cluster)
|
117
|
+
end
|
118
|
+
|
119
|
+
cluster
|
105
120
|
end
|
106
|
-
end
|
107
121
|
|
108
|
-
|
109
|
-
|
110
|
-
|
122
|
+
def centroids_present?(opts)
|
123
|
+
!(opts[:centroids].nil? || opts[:centroids].empty?)
|
124
|
+
end
|
125
|
+
|
126
|
+
def init_cluster(centroids)
|
127
|
+
centroids.each_with_object({}) do |centroid, memo|
|
128
|
+
memo[centroid] = []
|
129
|
+
end
|
130
|
+
end
|
111
131
|
|
112
|
-
|
113
|
-
|
114
|
-
@median_centroid = point[:median] if @median_centroid.nil? || @median_centroid < point[:median]
|
115
|
-
|
116
|
-
memo << { x: point[:x], y: point[:y] }
|
132
|
+
def create_centroid(centroid, output)
|
133
|
+
output[centroid] = []
|
117
134
|
end
|
118
|
-
end
|
119
135
|
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
136
|
+
def get_key_values(points, key)
|
137
|
+
points.map { |point| point[key] }
|
138
|
+
end
|
139
|
+
|
140
|
+
def move_centroids(output)
|
141
|
+
output.map do |(_key, value)|
|
142
|
+
next if value.empty?
|
143
|
+
|
144
|
+
x = Cluda.median(get_key_values(value, :x))
|
145
|
+
y = Cluda.median(get_key_values(value, :y))
|
146
|
+
|
130
147
|
{ x: x, y: y }
|
148
|
+
end.compact
|
149
|
+
end
|
150
|
+
|
151
|
+
def assign_points_to_centroids(list, centroids, klass, config)
|
152
|
+
list.each_with_object({}) do |point, cluster|
|
153
|
+
centroid, distance = nearest_centroid(point, centroids, klass)
|
154
|
+
|
155
|
+
if config[:be_smart] && distance > (config[:median_centroid] + config[:margin])
|
156
|
+
config[:median_centroid] = distance
|
157
|
+
centroids << point
|
158
|
+
create_centroid(point, cluster)
|
159
|
+
centroid = point
|
160
|
+
distance = 0
|
161
|
+
end
|
162
|
+
|
163
|
+
cluster[centroid] ||= []
|
164
|
+
cluster[centroid] << point.merge(distance: distance)
|
131
165
|
end
|
132
|
-
end
|
166
|
+
end
|
133
167
|
end
|
134
168
|
end
|
135
169
|
end
|
metadata
CHANGED
@@ -1,84 +1,92 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cluda
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
5
|
-
prerelease:
|
4
|
+
version: 0.1.0
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
|
-
- Enrique Figuerola
|
7
|
+
- Enrique M Figuerola Gomez
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 2020-03-19 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rake
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '12.3'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '12.3'
|
14
27
|
- !ruby/object:Gem::Dependency
|
15
28
|
name: rspec
|
16
29
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
30
|
requirements:
|
19
|
-
- -
|
31
|
+
- - "~>"
|
20
32
|
- !ruby/object:Gem::Version
|
21
|
-
version:
|
33
|
+
version: '3.8'
|
22
34
|
type: :development
|
23
35
|
prerelease: false
|
24
36
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
37
|
requirements:
|
27
|
-
- -
|
38
|
+
- - "~>"
|
28
39
|
- !ruby/object:Gem::Version
|
29
|
-
version:
|
40
|
+
version: '3.8'
|
30
41
|
- !ruby/object:Gem::Dependency
|
31
|
-
name:
|
42
|
+
name: rubocop
|
32
43
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
44
|
requirements:
|
35
|
-
- -
|
45
|
+
- - "~>"
|
36
46
|
- !ruby/object:Gem::Version
|
37
|
-
version: '0'
|
47
|
+
version: '0.70'
|
38
48
|
type: :development
|
39
49
|
prerelease: false
|
40
50
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
51
|
requirements:
|
43
|
-
- -
|
52
|
+
- - "~>"
|
44
53
|
- !ruby/object:Gem::Version
|
45
|
-
version: '0'
|
54
|
+
version: '0.70'
|
46
55
|
description: CLustering Data Analysis gem
|
47
|
-
email:
|
56
|
+
email: me@emfigo.com
|
48
57
|
executables: []
|
49
58
|
extensions: []
|
50
59
|
extra_rdoc_files: []
|
51
60
|
files:
|
52
61
|
- lib/cluda.rb
|
53
62
|
- lib/cluda/cluda_common.rb
|
54
|
-
- lib/cluda/
|
63
|
+
- lib/cluda/distances/chebyshev.rb
|
55
64
|
- lib/cluda/distances/distance.rb
|
56
65
|
- lib/cluda/distances/euclidean.rb
|
57
66
|
- lib/cluda/distances/manhattan.rb
|
58
|
-
- lib/cluda/
|
67
|
+
- lib/cluda/kmeans.rb
|
68
|
+
- lib/cluda/version.rb
|
59
69
|
homepage: https://github.com/emfigo/cluda
|
60
70
|
licenses:
|
61
71
|
- MIT
|
72
|
+
metadata: {}
|
62
73
|
post_install_message:
|
63
74
|
rdoc_options: []
|
64
75
|
require_paths:
|
65
76
|
- lib
|
66
77
|
required_ruby_version: !ruby/object:Gem::Requirement
|
67
|
-
none: false
|
68
78
|
requirements:
|
69
|
-
- -
|
79
|
+
- - ">="
|
70
80
|
- !ruby/object:Gem::Version
|
71
|
-
version: '
|
81
|
+
version: '2.3'
|
72
82
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
73
|
-
none: false
|
74
83
|
requirements:
|
75
|
-
- -
|
84
|
+
- - ">="
|
76
85
|
- !ruby/object:Gem::Version
|
77
86
|
version: '0'
|
78
87
|
requirements: []
|
79
|
-
|
80
|
-
rubygems_version: 1.8.23
|
88
|
+
rubygems_version: 3.0.3
|
81
89
|
signing_key:
|
82
|
-
specification_version:
|
90
|
+
specification_version: 4
|
83
91
|
summary: CLuDA
|
84
92
|
test_files: []
|