cluda 0.0.2 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/cluda.rb +4 -2
- data/lib/cluda/cluda_common.rb +36 -37
- data/lib/cluda/distances/chebyshev.rb +5 -4
- data/lib/cluda/distances/distance.rb +6 -4
- data/lib/cluda/distances/euclidean.rb +5 -4
- data/lib/cluda/distances/manhattan.rb +4 -3
- data/lib/cluda/kmeans.rb +136 -102
- data/lib/cluda/version.rb +5 -0
- metadata +36 -28
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 84fc5617e8ce06d8b83eec5c0e6bfc1ea7cf1c43246f5040078324cf754089c7
|
4
|
+
data.tar.gz: 812430fa9d168cdd552b059ffaa2933e930a3d461d28d9e2991a36bf1e44a42c
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ff054734dc37f4a806dc3a2635e26eec634138abd6a1ceca982b6c54bc6910be36b2ccf00b5be6d7636e56ebf82b6ca122db783f8529ed8f972fdb3b944c10dd
|
7
|
+
data.tar.gz: b9e3fd5d71c2db328303288e17991cd892912f24e85fef0c3f0555a0bd06d42a35bb48ee609423b829f10aa015756946136052cd7fb8c04f5f0e0b67010b8ed0
|
data/lib/cluda.rb
CHANGED
data/lib/cluda/cluda_common.rb
CHANGED
@@ -1,79 +1,78 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Cluda
|
4
|
+
module_function
|
5
|
+
|
2
6
|
class InvalidPoint < RuntimeError; end
|
3
7
|
class InvalidCentroid < RuntimeError; end
|
4
8
|
class InvalidSmartPoint < RuntimeError; end
|
5
9
|
class InvalidDistanceMethod < RuntimeError; end
|
6
|
-
|
10
|
+
|
7
11
|
# For an output given by CluDA calculate the means for each centroid
|
8
12
|
#
|
9
|
-
#Example:
|
13
|
+
# Example:
|
10
14
|
# > clusters = {{:x=>2, :y=>2}=>
|
11
15
|
# [{:x=>1, :y=>1, :distance=>1.4142135623730951},
|
12
16
|
# {:x=>2, :y=>1, :distance=>1.0},
|
13
17
|
# {:x=>1, :y=>2, :distance=>1.0},
|
14
18
|
# {:x=>2, :y=>2, :distance=>0.0}]}
|
15
19
|
# > Cluda.median_for_centroids(clusters)
|
16
|
-
#Arguments:
|
20
|
+
# Arguments:
|
17
21
|
# points: ( Hash )
|
18
|
-
def
|
19
|
-
points.
|
20
|
-
validate_smart_points(
|
22
|
+
def median_for_centroids(points)
|
23
|
+
points.each_key do |centroid|
|
24
|
+
validate_smart_points(points[centroid])
|
21
25
|
end
|
22
26
|
|
23
27
|
points.keys.map do |centroid|
|
24
|
-
centroid.merge(
|
28
|
+
centroid.merge(median: median_for_centroid(centroid, points))
|
25
29
|
end
|
26
30
|
end
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
def self.valid_class?( name )
|
31
|
-
['euclidean', 'chebyshev', 'manhattan'].include?( name.downcase )
|
31
|
+
|
32
|
+
def valid_class?(name)
|
33
|
+
%w[euclidean chebyshev manhattan].include?(name.downcase)
|
32
34
|
end
|
33
35
|
|
34
|
-
def
|
35
|
-
centroids = centroids.is_a?(Array) ? centroids : [
|
36
|
-
|
36
|
+
def validate_centroids(centroids)
|
37
|
+
centroids = centroids.is_a?(Array) ? centroids : [centroids]
|
38
|
+
|
37
39
|
validate(centroids)
|
38
|
-
|
40
|
+
|
39
41
|
centroids.each do |point|
|
40
|
-
raise InvalidCentroid unless
|
42
|
+
raise InvalidCentroid unless point.include?(:median)
|
41
43
|
end
|
42
44
|
|
43
45
|
centroids
|
44
46
|
end
|
45
|
-
|
46
|
-
def
|
47
|
-
points = points.is_a?(Array) ? points : [
|
48
|
-
|
49
|
-
validate(
|
50
|
-
|
47
|
+
|
48
|
+
def validate_smart_points(points)
|
49
|
+
points = points.is_a?(Array) ? points : [points]
|
50
|
+
|
51
|
+
validate(points)
|
52
|
+
|
51
53
|
points.each do |point|
|
52
|
-
raise InvalidSmartPoint unless
|
54
|
+
raise InvalidSmartPoint unless point.include?(:distance)
|
53
55
|
end
|
54
56
|
|
55
57
|
points
|
56
58
|
end
|
57
|
-
|
58
|
-
def self.validate( data )
|
59
|
-
points = data.is_a?(Array) ? data : [ data ]
|
60
|
-
points.each do |point|
|
61
|
-
raise InvalidPoint unless point.is_a?(Hash) &&
|
62
|
-
point.include?(:x) && point.include?(:y) &&
|
63
|
-
point[:x].is_a?(Numeric) && point[:y].is_a?(Numeric)
|
64
|
-
end
|
65
59
|
|
66
|
-
|
60
|
+
def validate(data)
|
61
|
+
data = [data] unless data.is_a?(Array)
|
62
|
+
|
63
|
+
data.each do |point|
|
64
|
+
raise InvalidPoint unless point.is_a?(Hash) && point[:x].is_a?(Numeric) && point[:y].is_a?(Numeric)
|
65
|
+
end
|
67
66
|
end
|
68
67
|
|
69
|
-
def
|
70
|
-
median(
|
68
|
+
def median_for_centroid(centroid, points)
|
69
|
+
median(points[centroid].map { |point| point[:distance] })
|
71
70
|
end
|
72
71
|
|
73
|
-
def
|
72
|
+
def median(list)
|
74
73
|
sorted_list = list.sort
|
75
74
|
len = list.size
|
76
75
|
|
77
|
-
sorted_list[(
|
76
|
+
sorted_list[((len / 2) + 0.5).floor]
|
78
77
|
end
|
79
78
|
end
|
@@ -1,10 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Cluda
|
2
|
-
class Chebyshev <
|
3
|
-
|
4
|
+
class Chebyshev < Distance
|
4
5
|
def self.distance(x0, x)
|
5
|
-
Cluda.validate(
|
6
|
+
Cluda.validate([x0, x])
|
6
7
|
|
7
|
-
[
|
8
|
+
[(x0[:x] - x[:x]).abs, (x0[:y] - x[:y]).abs].max
|
8
9
|
end
|
9
10
|
end
|
10
11
|
end
|
@@ -1,9 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Cluda
|
2
|
-
class Distance
|
4
|
+
class Distance
|
3
5
|
extend Math
|
4
|
-
|
5
|
-
def self.distance(
|
6
|
-
raise ::NotImplementedError
|
6
|
+
|
7
|
+
def self.distance(_x0, _x)
|
8
|
+
raise ::NotImplementedError, 'You must implement distance method'
|
7
9
|
end
|
8
10
|
end
|
9
11
|
end
|
@@ -1,10 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Cluda
|
2
|
-
class Euclidean <
|
3
|
-
|
4
|
+
class Euclidean < Distance
|
4
5
|
def self.distance(x0, x)
|
5
|
-
Cluda.validate(
|
6
|
+
Cluda.validate([x0, x])
|
6
7
|
|
7
|
-
sqrt(
|
8
|
+
sqrt((x0[:x] - x[:x])**2 + (x0[:y] - x[:y])**2)
|
8
9
|
end
|
9
10
|
end
|
10
11
|
end
|
data/lib/cluda/kmeans.rb
CHANGED
@@ -1,135 +1,169 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'cluda/distances/manhattan'
|
2
4
|
require 'cluda/distances/euclidean'
|
3
5
|
require 'cluda/distances/chebyshev'
|
4
6
|
|
5
7
|
module Cluda
|
6
8
|
class Kmeans
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
distance_method: 'euclidean',
|
9
|
+
DEFAULT_OPTS = { k: 1,
|
10
|
+
centroids: nil,
|
11
|
+
distance_method: 'euclidean',
|
11
12
|
be_smart: false,
|
12
13
|
margin_distance_percentage: 0,
|
13
|
-
max_iterations: 50 }
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
list
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
14
|
+
max_iterations: 50 }.freeze
|
15
|
+
|
16
|
+
class << self
|
17
|
+
# Classify the points using KMeans as the clustering algorithm
|
18
|
+
#
|
19
|
+
# Example:
|
20
|
+
# >> points = [
|
21
|
+
# { x: 1, y: 1},
|
22
|
+
# { x: 2, y: 1},
|
23
|
+
# { x: 1, y: 2},
|
24
|
+
# { x: 2, y: 2},
|
25
|
+
# { x: 4, y: 6},
|
26
|
+
# { x: 5, y: 7},
|
27
|
+
# { x: 5, y: 6},
|
28
|
+
# { x: 5, y: 5},
|
29
|
+
# { x: 6, y: 6},
|
30
|
+
# { x: 6, y: 5}
|
31
|
+
# ]
|
32
|
+
# >> Cluda::Kmeans.classify( points, k: 1, distance_method: 'euclidean', be_smart: true, max_iterations: 50)
|
33
|
+
# Arguments:
|
34
|
+
# list: (Array [Hash] )
|
35
|
+
# k: (Numeric) *optional*
|
36
|
+
# centroids: (Array) *optional*
|
37
|
+
# distance_method: (String) *optional*
|
38
|
+
# [If you want CluDA to be smart you have to specify the centroids ]
|
39
|
+
# be_smart: (Boolean) *optional*
|
40
|
+
# margin_distance_percentage: (Numeric) *optional* [Between 0 and 1]
|
41
|
+
# max_iterations: (Numeric) *optional*
|
42
|
+
def classify(list, opts = {})
|
43
|
+
Cluda.validate(list)
|
44
|
+
Cluda.validate_centroids(opts[:centroids]) if opts[:be_smart]
|
45
|
+
|
46
|
+
config = generate_config(list, opts)
|
47
|
+
|
48
|
+
raise Cluda::InvalidDistanceMethod unless Cluda.valid_class?(config[:distance_method])
|
49
|
+
|
50
|
+
calculate_clusters(list, **config)
|
51
|
+
end
|
52
|
+
|
53
|
+
def generate_config(list, opts)
|
54
|
+
config = DEFAULT_OPTS.merge(opts)
|
55
|
+
|
56
|
+
centroids_present?(config) ? process_centroids(config) : initialize_centroids(list, config)
|
57
|
+
|
58
|
+
config[:margin] = config[:be_smart] ? config[:median_centroid] * config[:margin_distance_percentage] : 0
|
59
|
+
|
60
|
+
config
|
61
|
+
end
|
62
|
+
|
63
|
+
def process_centroids(config)
|
64
|
+
config[:centroids].each do |point|
|
65
|
+
if config[:median_centroid].nil? || config[:median_centroid] < point[:median]
|
66
|
+
config[:median_centroid] = point[:median]
|
58
67
|
end
|
59
68
|
|
60
|
-
|
69
|
+
point.delete_if { |k, _| !%i[x y].include? k }
|
61
70
|
end
|
62
71
|
|
63
|
-
|
64
|
-
previous_centroids = centroids
|
65
|
-
centroids = move_centroids( output )
|
72
|
+
config
|
66
73
|
end
|
67
74
|
|
68
|
-
|
69
|
-
|
75
|
+
def initialize_centroids(list, config)
|
76
|
+
return config if list.empty? || config[:k] > list.size
|
77
|
+
|
78
|
+
config[:centroids] = list.shuffle(random: Random.new(rand(0...config[:k])))[0...config[:k]]
|
79
|
+
|
80
|
+
config
|
81
|
+
end
|
82
|
+
|
83
|
+
def nearest_centroid(point, centroids, klass = Cluda::Euclidean)
|
84
|
+
return nil if centroids.empty?
|
85
|
+
|
86
|
+
Cluda.validate(point)
|
70
87
|
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
centroids.each do |centroid|
|
82
|
-
new_distance = _class.distance(point, centroid)
|
83
|
-
if new_distance < min_distance
|
84
|
-
min_distance = new_distance
|
85
|
-
nearest_centroid = centroid
|
88
|
+
nearest_centroid = centroids[0]
|
89
|
+
min_distance = klass.distance(point, nearest_centroid)
|
90
|
+
|
91
|
+
centroids.each do |centroid|
|
92
|
+
new_distance = klass.distance(point, centroid)
|
93
|
+
if new_distance < min_distance
|
94
|
+
min_distance = new_distance
|
95
|
+
nearest_centroid = centroid
|
96
|
+
end
|
86
97
|
end
|
98
|
+
|
99
|
+
[nearest_centroid, min_distance]
|
87
100
|
end
|
88
101
|
|
89
|
-
|
90
|
-
end
|
102
|
+
private
|
91
103
|
|
92
|
-
|
93
|
-
|
104
|
+
def calculate_clusters(list, centroids:, distance_method:, **config)
|
105
|
+
cluster = {}
|
94
106
|
|
95
|
-
|
107
|
+
previous_centroids = nil
|
108
|
+
klass = Cluda.const_get(distance_method.downcase.capitalize)
|
96
109
|
|
97
|
-
|
98
|
-
|
110
|
+
config[:max_iterations].times do
|
111
|
+
break if previous_centroids == centroids
|
99
112
|
|
100
|
-
|
113
|
+
cluster = assign_points_to_centroids(list, centroids, klass, config)
|
101
114
|
|
102
|
-
|
103
|
-
|
104
|
-
|
115
|
+
previous_centroids = centroids
|
116
|
+
centroids = move_centroids(cluster)
|
117
|
+
end
|
118
|
+
|
119
|
+
cluster
|
105
120
|
end
|
106
|
-
end
|
107
121
|
|
108
|
-
|
109
|
-
|
110
|
-
|
122
|
+
def centroids_present?(opts)
|
123
|
+
!(opts[:centroids].nil? || opts[:centroids].empty?)
|
124
|
+
end
|
125
|
+
|
126
|
+
def init_cluster(centroids)
|
127
|
+
centroids.each_with_object({}) do |centroid, memo|
|
128
|
+
memo[centroid] = []
|
129
|
+
end
|
130
|
+
end
|
111
131
|
|
112
|
-
|
113
|
-
|
114
|
-
@median_centroid = point[:median] if @median_centroid.nil? || @median_centroid < point[:median]
|
115
|
-
|
116
|
-
memo << { x: point[:x], y: point[:y] }
|
132
|
+
def create_centroid(centroid, output)
|
133
|
+
output[centroid] = []
|
117
134
|
end
|
118
|
-
end
|
119
135
|
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
136
|
+
def get_key_values(points, key)
|
137
|
+
points.map { |point| point[key] }
|
138
|
+
end
|
139
|
+
|
140
|
+
def move_centroids(output)
|
141
|
+
output.map do |(_key, value)|
|
142
|
+
next if value.empty?
|
143
|
+
|
144
|
+
x = Cluda.median(get_key_values(value, :x))
|
145
|
+
y = Cluda.median(get_key_values(value, :y))
|
146
|
+
|
130
147
|
{ x: x, y: y }
|
148
|
+
end.compact
|
149
|
+
end
|
150
|
+
|
151
|
+
def assign_points_to_centroids(list, centroids, klass, config)
|
152
|
+
list.each_with_object({}) do |point, cluster|
|
153
|
+
centroid, distance = nearest_centroid(point, centroids, klass)
|
154
|
+
|
155
|
+
if config[:be_smart] && distance > (config[:median_centroid] + config[:margin])
|
156
|
+
config[:median_centroid] = distance
|
157
|
+
centroids << point
|
158
|
+
create_centroid(point, cluster)
|
159
|
+
centroid = point
|
160
|
+
distance = 0
|
161
|
+
end
|
162
|
+
|
163
|
+
cluster[centroid] ||= []
|
164
|
+
cluster[centroid] << point.merge(distance: distance)
|
131
165
|
end
|
132
|
-
end
|
166
|
+
end
|
133
167
|
end
|
134
168
|
end
|
135
169
|
end
|
metadata
CHANGED
@@ -1,84 +1,92 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cluda
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
5
|
-
prerelease:
|
4
|
+
version: 0.1.0
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
|
-
- Enrique Figuerola
|
7
|
+
- Enrique M Figuerola Gomez
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 2020-03-19 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rake
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '12.3'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '12.3'
|
14
27
|
- !ruby/object:Gem::Dependency
|
15
28
|
name: rspec
|
16
29
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
30
|
requirements:
|
19
|
-
- -
|
31
|
+
- - "~>"
|
20
32
|
- !ruby/object:Gem::Version
|
21
|
-
version:
|
33
|
+
version: '3.8'
|
22
34
|
type: :development
|
23
35
|
prerelease: false
|
24
36
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
37
|
requirements:
|
27
|
-
- -
|
38
|
+
- - "~>"
|
28
39
|
- !ruby/object:Gem::Version
|
29
|
-
version:
|
40
|
+
version: '3.8'
|
30
41
|
- !ruby/object:Gem::Dependency
|
31
|
-
name:
|
42
|
+
name: rubocop
|
32
43
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
44
|
requirements:
|
35
|
-
- -
|
45
|
+
- - "~>"
|
36
46
|
- !ruby/object:Gem::Version
|
37
|
-
version: '0'
|
47
|
+
version: '0.70'
|
38
48
|
type: :development
|
39
49
|
prerelease: false
|
40
50
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
51
|
requirements:
|
43
|
-
- -
|
52
|
+
- - "~>"
|
44
53
|
- !ruby/object:Gem::Version
|
45
|
-
version: '0'
|
54
|
+
version: '0.70'
|
46
55
|
description: CLustering Data Analysis gem
|
47
|
-
email:
|
56
|
+
email: me@emfigo.com
|
48
57
|
executables: []
|
49
58
|
extensions: []
|
50
59
|
extra_rdoc_files: []
|
51
60
|
files:
|
52
61
|
- lib/cluda.rb
|
53
62
|
- lib/cluda/cluda_common.rb
|
54
|
-
- lib/cluda/
|
63
|
+
- lib/cluda/distances/chebyshev.rb
|
55
64
|
- lib/cluda/distances/distance.rb
|
56
65
|
- lib/cluda/distances/euclidean.rb
|
57
66
|
- lib/cluda/distances/manhattan.rb
|
58
|
-
- lib/cluda/
|
67
|
+
- lib/cluda/kmeans.rb
|
68
|
+
- lib/cluda/version.rb
|
59
69
|
homepage: https://github.com/emfigo/cluda
|
60
70
|
licenses:
|
61
71
|
- MIT
|
72
|
+
metadata: {}
|
62
73
|
post_install_message:
|
63
74
|
rdoc_options: []
|
64
75
|
require_paths:
|
65
76
|
- lib
|
66
77
|
required_ruby_version: !ruby/object:Gem::Requirement
|
67
|
-
none: false
|
68
78
|
requirements:
|
69
|
-
- -
|
79
|
+
- - ">="
|
70
80
|
- !ruby/object:Gem::Version
|
71
|
-
version: '
|
81
|
+
version: '2.3'
|
72
82
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
73
|
-
none: false
|
74
83
|
requirements:
|
75
|
-
- -
|
84
|
+
- - ">="
|
76
85
|
- !ruby/object:Gem::Version
|
77
86
|
version: '0'
|
78
87
|
requirements: []
|
79
|
-
|
80
|
-
rubygems_version: 1.8.23
|
88
|
+
rubygems_version: 3.0.3
|
81
89
|
signing_key:
|
82
|
-
specification_version:
|
90
|
+
specification_version: 4
|
83
91
|
summary: CLuDA
|
84
92
|
test_files: []
|