flock 0.3.1 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +29 -7
- data/VERSION +1 -1
- data/examples/treecluster.rb +13 -0
- data/ext/flock.c +97 -2
- data/flock.gemspec +4 -3
- data/lib/flock.rb +16 -13
- metadata +6 -4
data/README.rdoc
CHANGED
@@ -4,11 +4,15 @@ Ruby bindings to {Cluster 3.0}[http://bonsai.hgc.jp/~mdehoon/software/cluster/so
|
|
4
4
|
|
5
5
|
== Description
|
6
6
|
|
7
|
-
Provides bindings to K-Means clustering in Cluster 3.0
|
7
|
+
Provides bindings to clustering methods in Cluster 3.0.
|
8
|
+
|
9
|
+
* K-Means
|
10
|
+
* Kohonen Self-Organizing Maps
|
11
|
+
* Tree Cluster or Hierarchical Clustering
|
8
12
|
|
9
13
|
== Synopsis
|
10
14
|
|
11
|
-
===
|
15
|
+
=== Specify vectors explicitly
|
12
16
|
|
13
17
|
require 'pp'
|
14
18
|
require 'flock'
|
@@ -69,8 +73,18 @@ Provides bindings to K-Means clustering in Cluster 3.0
|
|
69
73
|
weights: Array.new(13) {1.0},
|
70
74
|
)
|
71
75
|
|
76
|
+
pp Flock.treecluster(
|
77
|
+
6,
|
78
|
+
data,
|
79
|
+
mask,
|
80
|
+
method: Flock::METHOD_AVERAGE,
|
81
|
+
metric: Flock::METRIC_EUCLIDIAN,
|
82
|
+
transpose: 0,
|
83
|
+
weights: Array.new(13) {1.0},
|
84
|
+
)
|
85
|
+
|
72
86
|
|
73
|
-
=== Sparse and
|
87
|
+
=== Sparse data and clustering string labels
|
74
88
|
|
75
89
|
require 'pp'
|
76
90
|
require 'flock'
|
@@ -88,7 +102,7 @@ Provides bindings to K-Means clustering in Cluster 3.0
|
|
88
102
|
|
89
103
|
data = []
|
90
104
|
|
91
|
-
# a much simpler way to cluster text
|
105
|
+
# a much simpler way to cluster text labels.
|
92
106
|
data << %w(apple orange)
|
93
107
|
data << %w(black white)
|
94
108
|
data << %w(white cyan)
|
@@ -97,9 +111,15 @@ Provides bindings to K-Means clustering in Cluster 3.0
|
|
97
111
|
|
98
112
|
# additional options such as metric and iterations can be passed in a hash.
|
99
113
|
pp Flock.sparse_kmeans(2, data)
|
114
|
+
pp Flock.sparse_treecluster(2, data)
|
115
|
+
|
100
116
|
|
101
117
|
=== Self-Organizing Map
|
102
118
|
|
119
|
+
Self-Organizing Maps (SOM) require that you specify a 2D grid on which data points can cluster. Some of the
|
120
|
+
grid points may be empty and others might have clusters mapped to them. There is no need to provide
|
121
|
+
a fixed number of clusters, as K-Means and Tree Cluster require.
|
122
|
+
|
103
123
|
require 'pp'
|
104
124
|
require 'flock'
|
105
125
|
|
@@ -116,11 +136,13 @@ Provides bindings to K-Means clustering in Cluster 3.0
|
|
116
136
|
# additional options such as metric and iterations can be passed in a hash.
|
117
137
|
pp Flock.sparse_self_organizing_map(2, 2, data)
|
118
138
|
|
119
|
-
|
139
|
+
Note: SOM clustering provides the 2D grid coordinate for each vector instead of an integer cluster value
|
140
|
+
for each vector like K-Means and Tree Cluster methods do.
|
120
141
|
|
121
|
-
|
142
|
+
== TODO
|
122
143
|
|
123
|
-
*
|
144
|
+
* {K-Tree clustering}[http://arxiv.org/pdf/1001.0827v1]
|
145
|
+
* Use Sparse Matrix instead of converting sparse data into dense matrices.
|
124
146
|
|
125
147
|
= License
|
126
148
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.3.1
|
1
|
+
0.4.0
|
data/ext/flock.c
CHANGED
@@ -218,8 +218,12 @@ VALUE rb_som(int argc, VALUE *argv, VALUE self) {
|
|
218
218
|
VALUE cluster = rb_ary_new();
|
219
219
|
VALUE centroid = rb_ary_new();
|
220
220
|
|
221
|
-
for (i = 0; i < dimx; i++)
|
222
|
-
|
221
|
+
for (i = 0; i < dimx; i++) {
|
222
|
+
VALUE gridpoint = rb_ary_new();
|
223
|
+
rb_ary_push(gridpoint, INT2NUM(ccluster[i][0]));
|
224
|
+
rb_ary_push(gridpoint, INT2NUM(ccluster[i][1]));
|
225
|
+
rb_ary_push(cluster, gridpoint);
|
226
|
+
}
|
223
227
|
|
224
228
|
for (i = 0; i < nxgrid; i++) {
|
225
229
|
for (j = 0; j < nygrid; j++) {
|
@@ -256,6 +260,96 @@ VALUE rb_som(int argc, VALUE *argv, VALUE self) {
|
|
256
260
|
return result;
|
257
261
|
}
|
258
262
|
|
263
|
+
/*
 * Frees the C-side buffers built by rb_treecluster. Tolerates partially
 * built matrices: the row tables are calloc'ed, so rows that were never
 * allocated are NULL, and free(NULL) is a no-op.
 */
static void treecluster_release(int nrows, double **cdata, int **cmask, double *cweights, int *ccluster) {
    int i;

    for (i = 0; i < nrows; i++) {
        if (cdata) free(cdata[i]);
        if (cmask) free(cmask[i]);
    }
    free(cdata);
    free(cmask);
    free(cweights);
    free(ccluster);
}

/*
 * Validates the shape of data, mask and weights and runs every Ruby-side
 * numeric conversion once, discarding the results. This is done before any
 * C memory is allocated, so a conversion that raises cannot leak buffers.
 * Rows longer than ncols stay accepted (extra entries are ignored), as before.
 */
static void treecluster_check_args(VALUE data, VALUE mask, VALUE weights, int nrows, int ncols, int nweights) {
    int i, j;

    if (!NIL_P(mask) && RARRAY_LEN(mask) < nrows)
        rb_raise(rb_eArgError, "mask should have one row per data row");

    for (i = 0; i < nrows; i++) {
        VALUE row = rb_ary_entry(data, i);
        if (TYPE(row) != T_ARRAY || RARRAY_LEN(row) < ncols)
            rb_raise(rb_eArgError, "data should be an array of arrays, each at least as long as the first row");
        for (j = 0; j < ncols; j++)
            (void)NUM2DBL(rb_Float(rb_ary_entry(row, j)));

        if (!NIL_P(mask)) {
            VALUE mrow = rb_ary_entry(mask, i);
            if (TYPE(mrow) != T_ARRAY || RARRAY_LEN(mrow) < ncols)
                rb_raise(rb_eArgError, "mask should be an array of arrays with the same dimensions as data");
            for (j = 0; j < ncols; j++)
                (void)NUM2INT(rb_Integer(rb_ary_entry(mrow, j)));
        }
    }

    if (!NIL_P(weights)) {
        if (TYPE(weights) != T_ARRAY || RARRAY_LEN(weights) < nweights)
            rb_raise(rb_eArgError, "weights should be an array with one entry per dimension used for the distance");
        for (i = 0; i < nweights; i++)
            (void)NUM2DBL(rb_Float(rb_ary_entry(weights, i)));
    }
}

/*
 * Flock.treecluster(size, data, mask = nil, options = nil)
 *
 * Builds a hierarchical clustering tree with treecluster() and cuts it into
 * `size` clusters with cuttree().
 *
 *   size    - number of clusters; must be >= 1 and <= the number of clustered
 *             elements (rows, or columns when :transpose is set).
 *   data    - array of arrays of numbers.
 *   mask    - optional array of arrays of integers shaped like data; when nil
 *             every cell is treated as present (mask value 1).
 *   options - optional hash with :transpose, :method, :metric and :weights.
 *
 * Returns a hash { :cluster => [cluster id per clustered element] }.
 * Raises ArgumentError on malformed arguments and NoMemoryError when a buffer
 * or the tree cannot be allocated.
 */
VALUE rb_treecluster(int argc, VALUE *argv, VALUE self) {
    VALUE size, data, mask, weights, options, result, cluster;
    int transpose, method, dist;
    int i, j, ok, nrows, ncols, nsets, dimx, dimy;
    size_t width;
    double **cdata, *cweights;
    int **cmask, *ccluster;
    Node *tree;

    rb_scan_args(argc, argv, "22", &size, &data, &mask, &options);

    /* The first row defines the column count, so it must exist and be an array. */
    if (TYPE(data) != T_ARRAY || RARRAY_LEN(data) == 0 || TYPE(rb_ary_entry(data, 0)) != T_ARRAY)
        rb_raise(rb_eArgError, "data should be an array of arrays");

    if (!NIL_P(mask) && TYPE(mask) != T_ARRAY)
        rb_raise(rb_eArgError, "mask should be an array of arrays");

    if (NIL_P(size))
        rb_raise(rb_eArgError, "size should be > 0 and <= data size");

    transpose = opt_int_value(options, "transpose", 0);
    /*
     * Linkage method handed to treecluster(); defaults to 'a'.
     * Per the C Clustering Library manual:
     * s = single, m = maximum (complete), a = average, c = centroid linkage.
     */
    method = opt_int_value(options, "method", 'a');
    /*
     * Distance metric:
     * e = euclidean
     * b = city-block distance
     * c = correlation
     * a = absolute value of the correlation
     * u = uncentered correlation
     * x = absolute uncentered correlation
     * s = spearman's rank correlation
     * k = kendall's tau
     */
    dist    = opt_int_value(options, "metric", 'e');
    weights = NIL_P(options) ? Qnil : rb_hash_aref(options, ID2SYM(rb_intern("weights")));

    nrows = RARRAY_LEN(data);
    ncols = RARRAY_LEN(rb_ary_entry(data, 0));
    nsets = NUM2INT(rb_Integer(size));

    /* dimx: number of elements being clustered; dimy: values per element. */
    dimx = transpose ? ncols : nrows;
    dimy = transpose ? nrows : ncols;

    /*
     * cuttree() is run over dimx elements, so that is the bound for size.
     * Non-positive sizes are rejected, as the message has always promised.
     */
    if (nsets < 1 || nsets > dimx)
        rb_raise(rb_eArgError, "size should be > 0 and <= data size");

    /* Weights apply along the dimension the distance is computed over (dimy). */
    treecluster_check_args(data, mask, weights, nrows, ncols, dimy);

    /* Never request zero bytes: malloc(0) may return NULL and look like OOM. */
    width = ncols > 0 ? (size_t)ncols : 1;

    cdata    = (double **)calloc(nrows, sizeof(double *));
    cmask    = (int **)calloc(nrows, sizeof(int *));
    cweights = (double *)malloc(sizeof(double) * (dimy > 0 ? (size_t)dimy : 1));
    ccluster = (int *)malloc(sizeof(int) * dimx);

    ok = cdata && cmask && cweights && ccluster;
    for (i = 0; ok && i < nrows; i++) {
        cdata[i] = (double *)malloc(sizeof(double) * width);
        cmask[i] = (int *)malloc(sizeof(int) * width);
        ok = cdata[i] && cmask[i];
    }

    if (!ok) {
        treecluster_release(nrows, cdata, cmask, cweights, ccluster);
        rb_raise(rb_eNoMemError, "tree cluster ran out of memory");
    }

    for (i = 0; i < nrows; i++) {
        VALUE row  = rb_ary_entry(data, i);
        VALUE mrow = NIL_P(mask) ? Qnil : rb_ary_entry(mask, i);
        for (j = 0; j < ncols; j++) {
            cdata[i][j] = NUM2DBL(rb_Float(rb_ary_entry(row, j)));
            cmask[i][j] = NIL_P(mrow) ? 1 : NUM2INT(rb_Integer(rb_ary_entry(mrow, j)));
        }
    }

    for (i = 0; i < dimy; i++)
        cweights[i] = NIL_P(weights) ? 1.0 : NUM2DBL(rb_Float(rb_ary_entry(weights, i)));

    tree = treecluster(nrows, ncols, cdata, cmask, cweights, transpose, dist, method, 0);

    if (!tree) {
        treecluster_release(nrows, cdata, cmask, cweights, ccluster);
        rb_raise(rb_eNoMemError, "tree cluster ran out of memory");
    }

    cuttree(dimx, tree, nsets, ccluster);
    free(tree);

    result  = rb_hash_new();
    cluster = rb_ary_new();

    for (i = 0; i < dimx; i++)
        rb_ary_push(cluster, INT2NUM(ccluster[i]));

    rb_hash_aset(result, ID2SYM(rb_intern("cluster")), cluster);

    treecluster_release(nrows, cdata, cmask, cweights, ccluster);

    return result;
}
|
259
353
|
|
260
354
|
VALUE rb_distance(VALUE vec1, VALUE vec2, distance_fn fn) {
|
261
355
|
uint32_t size;
|
@@ -334,6 +428,7 @@ void Init_flock(void) {
|
|
334
428
|
mFlock = rb_define_module("Flock");
|
335
429
|
rb_define_module_function(mFlock, "kmeans", RUBY_METHOD_FUNC(rb_kmeans), -1);
|
336
430
|
rb_define_module_function(mFlock, "self_organizing_map", RUBY_METHOD_FUNC(rb_som), -1);
|
431
|
+
rb_define_module_function(mFlock, "treecluster", RUBY_METHOD_FUNC(rb_treecluster), -1);
|
337
432
|
|
338
433
|
rb_define_const(mFlock, "METHOD_AVERAGE", INT2NUM('a'));
|
339
434
|
rb_define_const(mFlock, "METHOD_MEDIAN", INT2NUM('m'));
|
data/flock.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{flock}
|
8
|
-
s.version = "0.3.1"
|
8
|
+
s.version = "0.4.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Bharanee Rathna"]
|
12
|
-
s.date = %q{2011-04-
|
12
|
+
s.date = %q{2011-04-27}
|
13
13
|
s.description = %q{A thin ruby binding to Cluster 3.0}
|
14
14
|
s.email = ["deepfryed@gmail.com"]
|
15
15
|
s.extensions = ["ext/extconf.rb"]
|
@@ -35,7 +35,8 @@ Gem::Specification.new do |s|
|
|
35
35
|
s.test_files = [
|
36
36
|
"examples/sparse.rb",
|
37
37
|
"examples/som.rb",
|
38
|
-
"examples/dense.rb"
|
38
|
+
"examples/dense.rb",
|
39
|
+
"examples/treecluster.rb"
|
39
40
|
]
|
40
41
|
|
41
42
|
if s.respond_to? :specification_version then
|
data/lib/flock.rb
CHANGED
@@ -21,27 +21,30 @@ module Flock
|
|
21
21
|
[dims,data]
|
22
22
|
end
|
23
23
|
|
24
|
-
def self.
|
24
|
+
# Converts sparse input into dense rows and re-indexes the optional weights
# so that they line up with the dense columns.
#
# sparse_data - rows given as Arrays or as Hashes. The format is detected from
#               the first row and the rows are handed to sparse_array_to_data
#               or sparse_hash_to_data respectively.
# weights     - optional collection of [label, weight] pairs (typically a Hash).
#
# Returns [data, weights]. weights is nil when none were given, otherwise an
# Array with one entry per dimension, defaulting to 1 for unweighted labels.
# Raises ArgumentError when a weight names a label that is not in the data.
def self.densify sparse_data, weights = nil
  dims, data = sparse_data[0].kind_of?(Array) ? sparse_array_to_data(sparse_data) : sparse_hash_to_data(sparse_data)

  if weights
    resampled = Array.new(dims.size) {1}
    weights.each do |label, weight|
      # dims is presumably the label => column index map built by the
      # sparse_*_to_data helpers (they are defined outside this block).
      index = dims[label]
      # Previously an unknown label fell through to resampled[nil] = weight,
      # which failed with an unhelpful nil-to-integer conversion error.
      raise ArgumentError, "weight given for unknown dimension #{label.inspect}" unless index
      resampled[index] = weight
    end
    weights = resampled
  end

  [data, weights]
end
|
35
|
+
|
36
|
+
# K-Means clustering on sparse input.
#
# size        - number of clusters, passed through to kmeans.
# sparse_data - sparse rows, converted to dense rows by densify.
# options     - optional hash forwarded to kmeans; :weights, when present,
#               is re-indexed by densify before being forwarded.
#
# Returns whatever kmeans returns.
def self.sparse_kmeans size, sparse_data, options = {}
  data, weights = densify(sparse_data, options[:weights])
  # Hand kmeans a copy: writing the dense weights back into the caller's hash
  # would corrupt it for any later sparse_* call that reuses the same options.
  kmeans(size, data, nil, options.merge(:weights => weights))
end
|
35
40
|
|
36
41
|
# Self-Organizing Map clustering on sparse input.
#
# nx, ny      - grid dimensions, passed through to self_organizing_map.
# sparse_data - sparse rows, converted to dense rows by densify.
# options     - optional hash forwarded to self_organizing_map; :weights,
#               when present, is re-indexed by densify before being forwarded.
#
# Returns whatever self_organizing_map returns.
def self.sparse_self_organizing_map nx, ny, sparse_data, options = {}
  data, weights = densify(sparse_data, options[:weights])
  # Forward a copy so the caller's options hash keeps its original
  # label-keyed weights and can be reused for another sparse_* call.
  self_organizing_map(nx, ny, data, nil, options.merge(:weights => weights))
end
|
45
|
+
|
46
|
+
# Tree (hierarchical) clustering on sparse input.
#
# size        - number of clusters, passed through to treecluster.
# sparse_data - sparse rows, converted to dense rows by densify.
# options     - optional hash forwarded to treecluster; :weights, when
#               present, is re-indexed by densify before being forwarded.
#
# Returns whatever treecluster returns.
def self.sparse_treecluster size, sparse_data, options = {}
  data, weights = densify(sparse_data, options[:weights])
  # Forward a copy instead of overwriting options[:weights] in place, so the
  # caller's hash is left untouched and stays reusable.
  treecluster(size, data, nil, options.merge(:weights => weights))
end
|
47
50
|
end
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
-
|
8
|
-
-
|
9
|
-
version: 0.3.1
|
7
|
+
- 4
|
8
|
+
- 0
|
9
|
+
version: 0.4.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Bharanee Rathna
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-04-
|
17
|
+
date: 2011-04-27 00:00:00 +10:00
|
18
18
|
default_executable:
|
19
19
|
dependencies: []
|
20
20
|
|
@@ -40,6 +40,7 @@ files:
|
|
40
40
|
- examples/sparse.rb
|
41
41
|
- examples/som.rb
|
42
42
|
- examples/dense.rb
|
43
|
+
- examples/treecluster.rb
|
43
44
|
has_rdoc: true
|
44
45
|
homepage: http://github.com/deepfryed/flock
|
45
46
|
licenses: []
|
@@ -76,3 +77,4 @@ test_files:
|
|
76
77
|
- examples/sparse.rb
|
77
78
|
- examples/som.rb
|
78
79
|
- examples/dense.rb
|
80
|
+
- examples/treecluster.rb
|