kmeans-clusterer 0.5.3 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/kmeans-clusterer.rb +21 -21
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
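metadata.gz: [...]  # old SHA1 digest is missing from this extracted diff
|
4
|
-
data.tar.gz: [...]  # old SHA1 digest is missing from this extracted diff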
|
3
|
+
metadata.gz: cf47337db7688eba2c31a5117e2f8c071415759a
|
4
|
+
data.tar.gz: 844c11b63adf3d1e5a554bebcdbde8f251c45d96
|
5
5
|
SHA512:
|
6
|
-
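metadata.gz: [...]  # old SHA512 digest is missing from this extracted diff
|
7
|
-
data.tar.gz: [...]  # old SHA512 digest is missing from this extracted diff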
|
6
|
+
metadata.gz: 78750f92bd336135d5118c061dea3e6d56cce8500642f18aa8b4dd0c5dd2953eda2c47691348685ff591d5c33b795aff93c52fd6869d53701cc3c46b661b9eee
|
7
|
+
data.tar.gz: 70470a45c59ad789a08f3f73a189de3958e6202b4c4fc5459eae967d2cd3ea4653ac20fc1dc2f343fe795829307f845680975fbaeeb35e5b62335ffc3006fd76
|
data/lib/kmeans-clusterer.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require 'narray'
|
2
2
|
|
3
3
|
class KMeansClusterer
|
4
|
+
TYPECODE = { double: NArray::DFLOAT, single: NArray::SFLOAT }
|
5
|
+
|
4
6
|
module Scaler
|
5
7
|
def self.mean data
|
6
8
|
data.mean(1)
|
@@ -12,8 +14,8 @@ class KMeansClusterer
|
|
12
14
|
std
|
13
15
|
end
|
14
16
|
|
15
|
-
def self.scale data, mean = nil, std = nil
|
16
|
-
data = NArray.cast(data, [...]  # removed line is truncated here in this extracted diff; old second argument not shown
|
17
|
+
def self.scale data, mean = nil, std = nil, typecode = nil
|
18
|
+
data = NArray.cast(data, typecode)
|
17
19
|
mean ||= self.mean(data)
|
18
20
|
std ||= self.std(data)
|
19
21
|
data = (data - mean) / std
|
@@ -64,27 +66,24 @@ class KMeansClusterer
|
|
64
66
|
point.cluster = self
|
65
67
|
@points << point
|
66
68
|
end
|
67
|
-
|
68
|
-
def points_narray
|
69
|
-
NArray.cast @points.map(&:data)
|
70
|
-
end
|
71
69
|
end
|
72
70
|
|
73
71
|
|
74
|
-
DEFAULT_OPTS = { scale_data: false, runs: 10, log: false, init: :kmpp}
|
72
|
+
DEFAULT_OPTS = { scale_data: false, runs: 10, log: false, init: :kmpp, float_precision: :double }
|
75
73
|
|
76
74
|
def self.run k, data, opts = {}
|
77
75
|
opts = DEFAULT_OPTS.merge(opts)
|
78
76
|
|
79
77
|
opts[:k] = k
|
78
|
+
opts[:typecode] = TYPECODE[opts[:float_precision]]
|
80
79
|
|
81
80
|
if opts[:scale_data]
|
82
|
-
data, mean, std = Scaler.scale(data)
|
81
|
+
data, mean, std = Scaler.scale(data, nil, nil, opts[:typecode])
|
83
82
|
opts[:mean] = mean
|
84
83
|
opts[:std] = std
|
85
84
|
end
|
86
85
|
|
87
|
-
opts[:points_matrix] = NMatrix.cast(data, [...]  # removed line is truncated here in this extracted diff; old second argument not shown
|
86
|
+
opts[:points_matrix] = NMatrix.cast(data, opts[:typecode])
|
88
87
|
opts[:row_norms] = opts[:points_matrix].map {|v| v**2}.sum(0)
|
89
88
|
|
90
89
|
bestrun = nil
|
@@ -119,6 +118,7 @@ class KMeansClusterer
|
|
119
118
|
@mean = opts[:mean]
|
120
119
|
@std = opts[:std]
|
121
120
|
@scale_data = opts[:scale_data]
|
121
|
+
@typecode = opts[:typecode]
|
122
122
|
|
123
123
|
init_centroids
|
124
124
|
end
|
@@ -144,7 +144,7 @@ class KMeansClusterer
|
|
144
144
|
updated_centroids = []
|
145
145
|
|
146
146
|
@k.times do |i|
|
147
|
-
centroid = NArray.cast(@centroids[true, i].flatten)
|
147
|
+
centroid = NArray.cast(@centroids[true, i].flatten, @typecode)
|
148
148
|
point_ids = @cluster_point_ids[i]
|
149
149
|
|
150
150
|
if point_ids.empty?
|
@@ -159,7 +159,7 @@ class KMeansClusterer
|
|
159
159
|
updated_centroids << newcenter
|
160
160
|
end
|
161
161
|
|
162
|
-
@centroids = NMatrix.cast updated_centroids
|
162
|
+
@centroids = NMatrix.cast updated_centroids, @typecode
|
163
163
|
|
164
164
|
break if moves.max < 0.001 # i.e., no movement
|
165
165
|
break if @iterations >= 300
|
@@ -179,8 +179,8 @@ class KMeansClusterer
|
|
179
179
|
end
|
180
180
|
|
181
181
|
def predict data
|
182
|
-
data, _m, _s = Scaler.scale(data, @mean, @std) if @scale_data
|
183
|
-
data = NMatrix.cast(data, [...]  # removed line is truncated here in this extracted diff; old second argument not shown
|
182
|
+
data, _m, _s = Scaler.scale(data, @mean, @std, @typecode) if @scale_data
|
183
|
+
data = NMatrix.cast(data, @typecode)
|
184
184
|
distances = distance(@centroids, data, nil)
|
185
185
|
data.shape[1].times.map do |i|
|
186
186
|
distances[i, true].sort_index[0] # index of closest cluster
|
@@ -223,7 +223,7 @@ class KMeansClusterer
|
|
223
223
|
private
|
224
224
|
def wrap_point point
|
225
225
|
return point if point.is_a?(Point)
|
226
|
-
Point.new(0, NArray.[...]  # removed line is truncated here in this extracted diff; old NArray call not shown
|
226
|
+
Point.new(0, NArray.cast(point, @typecode))
|
227
227
|
end
|
228
228
|
|
229
229
|
def dissimilarity points, point
|
@@ -259,7 +259,7 @@ class KMeansClusterer
|
|
259
259
|
d2 << min_distance**2
|
260
260
|
end
|
261
261
|
|
262
|
-
d2 = NArray.[...]  # removed line is truncated here in this extracted diff; old NArray call not shown
|
262
|
+
d2 = NArray.cast(d2, @typecode)
|
263
263
|
probs = d2 / d2.sum
|
264
264
|
cumprobs = probs.cumsum
|
265
265
|
r = rand
|
@@ -271,7 +271,7 @@ class KMeansClusterer
|
|
271
271
|
end
|
272
272
|
|
273
273
|
def custom_centroid_init
|
274
|
-
@centroids = NMatrix.cast @init
|
274
|
+
@centroids = NMatrix.cast @init, @typecode
|
275
275
|
@k = @init.length
|
276
276
|
end
|
277
277
|
|
@@ -289,14 +289,14 @@ class KMeansClusterer
|
|
289
289
|
|
290
290
|
def set_points
|
291
291
|
@points = @points_count.times.map do |i|
|
292
|
-
data = NArray.cast @points_matrix[true, i].flatten
|
292
|
+
data = NArray.cast @points_matrix[true, i].flatten, @typecode
|
293
293
|
Point.new(i, data, @labels[i])
|
294
294
|
end
|
295
295
|
end
|
296
296
|
|
297
297
|
def set_clusters
|
298
298
|
@clusters = @k.times.map do |i|
|
299
|
-
centroid = NArray.cast @centroids[true, i].flatten
|
299
|
+
centroid = NArray.cast @centroids[true, i].flatten, @typecode
|
300
300
|
c = Cluster.new i, Point.new(-i, centroid)
|
301
301
|
@cluster_point_ids[i].each do |p|
|
302
302
|
c << @points[p]
|
@@ -322,17 +322,17 @@ class KMeansClusterer
|
|
322
322
|
end
|
323
323
|
|
324
324
|
def get_point i
|
325
|
-
NArray.cast @points_matrix[true, i].flatten
|
325
|
+
NArray.cast @points_matrix[true, i].flatten, @typecode
|
326
326
|
end
|
327
327
|
|
328
328
|
def get_centroid i
|
329
|
-
NArray.cast(@centroids[true, i].flatten)
|
329
|
+
NArray.cast(@centroids[true, i].flatten, @typecode)
|
330
330
|
end
|
331
331
|
|
332
332
|
def get_points_for_centroid i
|
333
333
|
point_ids = @cluster_point_ids[i]
|
334
334
|
points = @points_matrix[true, point_ids]
|
335
|
-
points.empty? ? NArray.[...]  # removed line is truncated here in this extracted diff; rest of the old ternary not shown
|
335
|
+
points.empty? ? NArray.sfloat(0) : NArray.cast(points, @typecode)
|
336
336
|
end
|
337
337
|
|
338
338
|
def distance x, y, yy = @row_norms
|