kmeans-clusterer 0.5.3 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/kmeans-clusterer.rb +21 -21
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cf47337db7688eba2c31a5117e2f8c071415759a
|
4
|
+
data.tar.gz: 844c11b63adf3d1e5a554bebcdbde8f251c45d96
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 78750f92bd336135d5118c061dea3e6d56cce8500642f18aa8b4dd0c5dd2953eda2c47691348685ff591d5c33b795aff93c52fd6869d53701cc3c46b661b9eee
|
7
|
+
data.tar.gz: 70470a45c59ad789a08f3f73a189de3958e6202b4c4fc5459eae967d2cd3ea4653ac20fc1dc2f343fe795829307f845680975fbaeeb35e5b62335ffc3006fd76
|
data/lib/kmeans-clusterer.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require 'narray'
|
2
2
|
|
3
3
|
class KMeansClusterer
|
4
|
+
TYPECODE = { double: NArray::DFLOAT, single: NArray::SFLOAT }
|
5
|
+
|
4
6
|
module Scaler
|
5
7
|
def self.mean data
|
6
8
|
data.mean(1)
|
@@ -12,8 +14,8 @@ class KMeansClusterer
|
|
12
14
|
std
|
13
15
|
end
|
14
16
|
|
15
|
-
def self.scale data, mean = nil, std = nil
|
16
|
-
data = NArray.cast(data,
|
17
|
+
def self.scale data, mean = nil, std = nil, typecode = nil
|
18
|
+
data = NArray.cast(data, typecode)
|
17
19
|
mean ||= self.mean(data)
|
18
20
|
std ||= self.std(data)
|
19
21
|
data = (data - mean) / std
|
@@ -64,27 +66,24 @@ class KMeansClusterer
|
|
64
66
|
point.cluster = self
|
65
67
|
@points << point
|
66
68
|
end
|
67
|
-
|
68
|
-
def points_narray
|
69
|
-
NArray.cast @points.map(&:data)
|
70
|
-
end
|
71
69
|
end
|
72
70
|
|
73
71
|
|
74
|
-
DEFAULT_OPTS = { scale_data: false, runs: 10, log: false, init: :kmpp}
|
72
|
+
DEFAULT_OPTS = { scale_data: false, runs: 10, log: false, init: :kmpp, float_precision: :double }
|
75
73
|
|
76
74
|
def self.run k, data, opts = {}
|
77
75
|
opts = DEFAULT_OPTS.merge(opts)
|
78
76
|
|
79
77
|
opts[:k] = k
|
78
|
+
opts[:typecode] = TYPECODE[opts[:float_precision]]
|
80
79
|
|
81
80
|
if opts[:scale_data]
|
82
|
-
data, mean, std = Scaler.scale(data)
|
81
|
+
data, mean, std = Scaler.scale(data, nil, nil, opts[:typecode])
|
83
82
|
opts[:mean] = mean
|
84
83
|
opts[:std] = std
|
85
84
|
end
|
86
85
|
|
87
|
-
opts[:points_matrix] = NMatrix.cast(data,
|
86
|
+
opts[:points_matrix] = NMatrix.cast(data, opts[:typecode])
|
88
87
|
opts[:row_norms] = opts[:points_matrix].map {|v| v**2}.sum(0)
|
89
88
|
|
90
89
|
bestrun = nil
|
@@ -119,6 +118,7 @@ class KMeansClusterer
|
|
119
118
|
@mean = opts[:mean]
|
120
119
|
@std = opts[:std]
|
121
120
|
@scale_data = opts[:scale_data]
|
121
|
+
@typecode = opts[:typecode]
|
122
122
|
|
123
123
|
init_centroids
|
124
124
|
end
|
@@ -144,7 +144,7 @@ class KMeansClusterer
|
|
144
144
|
updated_centroids = []
|
145
145
|
|
146
146
|
@k.times do |i|
|
147
|
-
centroid = NArray.cast(@centroids[true, i].flatten)
|
147
|
+
centroid = NArray.cast(@centroids[true, i].flatten, @typecode)
|
148
148
|
point_ids = @cluster_point_ids[i]
|
149
149
|
|
150
150
|
if point_ids.empty?
|
@@ -159,7 +159,7 @@ class KMeansClusterer
|
|
159
159
|
updated_centroids << newcenter
|
160
160
|
end
|
161
161
|
|
162
|
-
@centroids = NMatrix.cast updated_centroids
|
162
|
+
@centroids = NMatrix.cast updated_centroids, @typecode
|
163
163
|
|
164
164
|
break if moves.max < 0.001 # i.e., no movement
|
165
165
|
break if @iterations >= 300
|
@@ -179,8 +179,8 @@ class KMeansClusterer
|
|
179
179
|
end
|
180
180
|
|
181
181
|
def predict data
|
182
|
-
data, _m, _s = Scaler.scale(data, @mean, @std) if @scale_data
|
183
|
-
data = NMatrix.cast(data,
|
182
|
+
data, _m, _s = Scaler.scale(data, @mean, @std, @typecode) if @scale_data
|
183
|
+
data = NMatrix.cast(data, @typecode)
|
184
184
|
distances = distance(@centroids, data, nil)
|
185
185
|
data.shape[1].times.map do |i|
|
186
186
|
distances[i, true].sort_index[0] # index of closest cluster
|
@@ -223,7 +223,7 @@ class KMeansClusterer
|
|
223
223
|
private
|
224
224
|
def wrap_point point
|
225
225
|
return point if point.is_a?(Point)
|
226
|
-
Point.new(0, NArray.
|
226
|
+
Point.new(0, NArray.cast(point, @typecode))
|
227
227
|
end
|
228
228
|
|
229
229
|
def dissimilarity points, point
|
@@ -259,7 +259,7 @@ class KMeansClusterer
|
|
259
259
|
d2 << min_distance**2
|
260
260
|
end
|
261
261
|
|
262
|
-
d2 = NArray.
|
262
|
+
d2 = NArray.cast(d2, @typecode)
|
263
263
|
probs = d2 / d2.sum
|
264
264
|
cumprobs = probs.cumsum
|
265
265
|
r = rand
|
@@ -271,7 +271,7 @@ class KMeansClusterer
|
|
271
271
|
end
|
272
272
|
|
273
273
|
def custom_centroid_init
|
274
|
-
@centroids = NMatrix.cast @init
|
274
|
+
@centroids = NMatrix.cast @init, @typecode
|
275
275
|
@k = @init.length
|
276
276
|
end
|
277
277
|
|
@@ -289,14 +289,14 @@ class KMeansClusterer
|
|
289
289
|
|
290
290
|
def set_points
|
291
291
|
@points = @points_count.times.map do |i|
|
292
|
-
data = NArray.cast @points_matrix[true, i].flatten
|
292
|
+
data = NArray.cast @points_matrix[true, i].flatten, @typecode
|
293
293
|
Point.new(i, data, @labels[i])
|
294
294
|
end
|
295
295
|
end
|
296
296
|
|
297
297
|
def set_clusters
|
298
298
|
@clusters = @k.times.map do |i|
|
299
|
-
centroid = NArray.cast @centroids[true, i].flatten
|
299
|
+
centroid = NArray.cast @centroids[true, i].flatten, @typecode
|
300
300
|
c = Cluster.new i, Point.new(-i, centroid)
|
301
301
|
@cluster_point_ids[i].each do |p|
|
302
302
|
c << @points[p]
|
@@ -322,17 +322,17 @@ class KMeansClusterer
|
|
322
322
|
end
|
323
323
|
|
324
324
|
def get_point i
|
325
|
-
NArray.cast @points_matrix[true, i].flatten
|
325
|
+
NArray.cast @points_matrix[true, i].flatten, @typecode
|
326
326
|
end
|
327
327
|
|
328
328
|
def get_centroid i
|
329
|
-
NArray.cast(@centroids[true, i].flatten)
|
329
|
+
NArray.cast(@centroids[true, i].flatten, @typecode)
|
330
330
|
end
|
331
331
|
|
332
332
|
def get_points_for_centroid i
|
333
333
|
point_ids = @cluster_point_ids[i]
|
334
334
|
points = @points_matrix[true, point_ids]
|
335
|
-
points.empty? ? NArray.
|
335
|
+
points.empty? ? NArray.sfloat(0) : NArray.cast(points, @typecode)
|
336
336
|
end
|
337
337
|
|
338
338
|
def distance x, y, yy = @row_norms
|