flock 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +29 -7
- data/VERSION +1 -1
- data/examples/treecluster.rb +13 -0
- data/ext/flock.c +97 -2
- data/flock.gemspec +4 -3
- data/lib/flock.rb +16 -13
- metadata +6 -4
data/README.rdoc
CHANGED
@@ -4,11 +4,15 @@ Ruby bindings to {Cluster 3.0}[http://bonsai.hgc.jp/~mdehoon/software/cluster/so
|
|
4
4
|
|
5
5
|
== Description
|
6
6
|
|
7
|
-
Provides bindings to K-Means clustering in Cluster 3.0
|
7
|
+
Provides bindings to clustering methods in Cluster 3.0.
|
8
|
+
|
9
|
+
* K-Means
|
10
|
+
* Kohonen Self-Organizing Maps
|
11
|
+
* Tree Cluster or Hierarchical Clustering
|
8
12
|
|
9
13
|
== Synopsis
|
10
14
|
|
11
|
-
===
|
15
|
+
=== Specify vectors explicitly
|
12
16
|
|
13
17
|
require 'pp'
|
14
18
|
require 'flock'
|
@@ -69,8 +73,18 @@ Provides bindings to K-Means clustering in Cluster 3.0
|
|
69
73
|
weights: Array.new(13) {1.0},
|
70
74
|
)
|
71
75
|
|
76
|
+
pp Flock.treecluster(
|
77
|
+
6,
|
78
|
+
data,
|
79
|
+
mask,
|
80
|
+
method: Flock::METHOD_AVERAGE,
|
81
|
+
metric: Flock::METRIC_EUCLIDIAN,
|
82
|
+
transpose: 0,
|
83
|
+
weights: Array.new(13) {1.0},
|
84
|
+
)
|
85
|
+
|
72
86
|
|
73
|
-
=== Sparse and
|
87
|
+
=== Sparse data and clustering string labels
|
74
88
|
|
75
89
|
require 'pp'
|
76
90
|
require 'flock'
|
@@ -88,7 +102,7 @@ Provides bindings to K-Means clustering in Cluster 3.0
|
|
88
102
|
|
89
103
|
data = []
|
90
104
|
|
91
|
-
# a much simpler way to cluster text
|
105
|
+
# a much simpler way to cluster text labels.
|
92
106
|
data << %w(apple orange)
|
93
107
|
data << %w(black white)
|
94
108
|
data << %w(white cyan)
|
@@ -97,9 +111,15 @@ Provides bindings to K-Means clustering in Cluster 3.0
|
|
97
111
|
|
98
112
|
# additional options such as metric, iterations can be passed in a hash.
|
99
113
|
pp Flock.sparse_kmeans(2, data)
|
114
|
+
pp Flock.sparse_treecluster(2, data)
|
115
|
+
|
100
116
|
|
101
117
|
=== Self-Organizing Map
|
102
118
|
|
119
|
+
Self-Organizing Maps (SOM) require that you specify a 2D grid on which data points can cluster. Some of the
|
120
|
+
grid points may be empty and others might have clusters mapped to them. There is no need to provide
|
121
|
+
a fixed number of clusters, as K-Means and Tree Cluster require.
|
122
|
+
|
103
123
|
require 'pp'
|
104
124
|
require 'flock'
|
105
125
|
|
@@ -116,11 +136,13 @@ Provides bindings to K-Means clustering in Cluster 3.0
|
|
116
136
|
# additional options such as metric, iterations can be passed in a hash.
|
117
137
|
pp Flock.sparse_self_organizing_map(2, 2, data)
|
118
138
|
|
119
|
-
|
139
|
+
Note: SOM clustering provides the 2D grid coordinate for each vector instead of an integer cluster value
|
140
|
+
for each vector like K-Means and Tree Cluster methods do.
|
120
141
|
|
121
|
-
|
142
|
+
== TODO
|
122
143
|
|
123
|
-
*
|
144
|
+
* {K-Tree clustering}[http://arxiv.org/pdf/1001.0827v1]
|
145
|
+
* Use Sparse Matrix instead of converting sparse data into dense matrices.
|
124
146
|
|
125
147
|
= License
|
126
148
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.3.1
|
1
|
+
0.4.0
|
data/ext/flock.c
CHANGED
@@ -218,8 +218,12 @@ VALUE rb_som(int argc, VALUE *argv, VALUE self) {
|
|
218
218
|
VALUE cluster = rb_ary_new();
|
219
219
|
VALUE centroid = rb_ary_new();
|
220
220
|
|
221
|
-
for (i = 0; i < dimx; i++)
|
222
|
-
|
221
|
+
for (i = 0; i < dimx; i++) {
|
222
|
+
VALUE gridpoint = rb_ary_new();
|
223
|
+
rb_ary_push(gridpoint, INT2NUM(ccluster[i][0]));
|
224
|
+
rb_ary_push(gridpoint, INT2NUM(ccluster[i][1]));
|
225
|
+
rb_ary_push(cluster, gridpoint);
|
226
|
+
}
|
223
227
|
|
224
228
|
for (i = 0; i < nxgrid; i++) {
|
225
229
|
for (j = 0; j < nygrid; j++) {
|
@@ -256,6 +260,96 @@ VALUE rb_som(int argc, VALUE *argv, VALUE self) {
|
|
256
260
|
return result;
|
257
261
|
}
|
258
262
|
|
263
|
+
VALUE rb_treecluster(int argc, VALUE *argv, VALUE self) {
|
264
|
+
VALUE size, data, mask, weights, options;
|
265
|
+
rb_scan_args(argc, argv, "22", &size, &data, &mask, &options);
|
266
|
+
|
267
|
+
if (TYPE(data) != T_ARRAY)
|
268
|
+
rb_raise(rb_eArgError, "data should be an array of arrays");
|
269
|
+
|
270
|
+
if (!NIL_P(mask) && TYPE(mask) != T_ARRAY)
|
271
|
+
rb_raise(rb_eArgError, "mask should be an array of arrays");
|
272
|
+
|
273
|
+
if (NIL_P(size) || NUM2INT(rb_Integer(size)) > RARRAY_LEN(data))
|
274
|
+
rb_raise(rb_eArgError, "size should be > 0 and <= data size");
|
275
|
+
|
276
|
+
int transpose = opt_int_value(options, "transpose", 0);
|
277
|
+
// s = single linkage, m = maximum (complete) linkage, a = average linkage, c = centroid linkage
|
278
|
+
int method = opt_int_value(options, "method", 'a');
|
279
|
+
// e = euclidian,
|
280
|
+
// b = city-block distance
|
281
|
+
// c = correlation
|
282
|
+
// a = absolute value of the correlation
|
283
|
+
// u = uncentered correlation
|
284
|
+
// x = absolute uncentered correlation
|
285
|
+
// s = spearman's rank correlation
|
286
|
+
// k = kendall's tau
|
287
|
+
int dist = opt_int_value(options, "metric", 'e');
|
288
|
+
|
289
|
+
int i,j;
|
290
|
+
int nrows = RARRAY_LEN(data);
|
291
|
+
int ncols = RARRAY_LEN(rb_ary_entry(data, 0));
|
292
|
+
int nsets = NUM2INT(rb_Integer(size));
|
293
|
+
|
294
|
+
double **cdata = (double**)malloc(sizeof(double*)*nrows);
|
295
|
+
int **cmask = (int **)malloc(sizeof(int *)*nrows);
|
296
|
+
double *cweights = (double *)malloc(sizeof(double )*ncols);
|
297
|
+
|
298
|
+
int *ccluster, dimx = nrows, dimy = ncols;
|
299
|
+
|
300
|
+
for (i = 0; i < nrows; i++) {
|
301
|
+
cdata[i] = (double*)malloc(sizeof(double)*ncols);
|
302
|
+
cmask[i] = (int *)malloc(sizeof(int )*ncols);
|
303
|
+
for (j = 0; j < ncols; j++) {
|
304
|
+
cdata[i][j] = NUM2DBL(rb_Float(rb_ary_entry(rb_ary_entry(data, i), j)));
|
305
|
+
cmask[i][j] = NIL_P(mask) ? 1 : NUM2INT(rb_Integer(rb_ary_entry(rb_ary_entry(mask, i), j)));
|
306
|
+
}
|
307
|
+
}
|
308
|
+
|
309
|
+
weights = NIL_P(options) ? Qnil : rb_hash_aref(options, ID2SYM(rb_intern("weights")));
|
310
|
+
for (i = 0; i < ncols; i++) {
|
311
|
+
cweights[i] = NIL_P(weights) ? 1.0 : NUM2DBL(rb_Float(rb_ary_entry(weights, i)));
|
312
|
+
}
|
313
|
+
|
314
|
+
if (transpose) {
|
315
|
+
dimx = ncols;
|
316
|
+
dimy = nrows;
|
317
|
+
}
|
318
|
+
|
319
|
+
ccluster = (int *)malloc(sizeof(int)*dimx);
|
320
|
+
|
321
|
+
Node *tree = treecluster(nrows, ncols, cdata, cmask, cweights, transpose, dist, method, 0);
|
322
|
+
VALUE result = Qnil, cluster;
|
323
|
+
|
324
|
+
if (tree) {
|
325
|
+
cuttree(dimx, tree, nsets, ccluster);
|
326
|
+
|
327
|
+
result = rb_hash_new();
|
328
|
+
cluster = rb_ary_new();
|
329
|
+
|
330
|
+
for (i = 0; i < dimx; i++)
|
331
|
+
rb_ary_push(cluster, INT2NUM(ccluster[i]));
|
332
|
+
|
333
|
+
rb_hash_aset(result, ID2SYM(rb_intern("cluster")), cluster);
|
334
|
+
}
|
335
|
+
|
336
|
+
for (i = 0; i < nrows; i++) {
|
337
|
+
free(cdata[i]);
|
338
|
+
free(cmask[i]);
|
339
|
+
}
|
340
|
+
|
341
|
+
free(cdata);
|
342
|
+
free(cmask);
|
343
|
+
free(cweights);
|
344
|
+
free(ccluster);
|
345
|
+
|
346
|
+
if (tree)
|
347
|
+
free(tree);
|
348
|
+
else
|
349
|
+
rb_raise(rb_eNoMemError, "tree cluster ran out of memory");
|
350
|
+
|
351
|
+
return result;
|
352
|
+
}
|
259
353
|
|
260
354
|
VALUE rb_distance(VALUE vec1, VALUE vec2, distance_fn fn) {
|
261
355
|
uint32_t size;
|
@@ -334,6 +428,7 @@ void Init_flock(void) {
|
|
334
428
|
mFlock = rb_define_module("Flock");
|
335
429
|
rb_define_module_function(mFlock, "kmeans", RUBY_METHOD_FUNC(rb_kmeans), -1);
|
336
430
|
rb_define_module_function(mFlock, "self_organizing_map", RUBY_METHOD_FUNC(rb_som), -1);
|
431
|
+
rb_define_module_function(mFlock, "treecluster", RUBY_METHOD_FUNC(rb_treecluster), -1);
|
337
432
|
|
338
433
|
rb_define_const(mFlock, "METHOD_AVERAGE", INT2NUM('a'));
|
339
434
|
rb_define_const(mFlock, "METHOD_MEDIAN", INT2NUM('m'));
|
data/flock.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{flock}
|
8
|
-
s.version = "0.3.1"
|
8
|
+
s.version = "0.4.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Bharanee Rathna"]
|
12
|
-
s.date = %q{2011-04-
|
12
|
+
s.date = %q{2011-04-27}
|
13
13
|
s.description = %q{A thin ruby binding to Cluster 3.0}
|
14
14
|
s.email = ["deepfryed@gmail.com"]
|
15
15
|
s.extensions = ["ext/extconf.rb"]
|
@@ -35,7 +35,8 @@ Gem::Specification.new do |s|
|
|
35
35
|
s.test_files = [
|
36
36
|
"examples/sparse.rb",
|
37
37
|
"examples/som.rb",
|
38
|
-
"examples/dense.rb"
|
38
|
+
"examples/dense.rb",
|
39
|
+
"examples/treecluster.rb"
|
39
40
|
]
|
40
41
|
|
41
42
|
if s.respond_to? :specification_version then
|
data/lib/flock.rb
CHANGED
@@ -21,27 +21,30 @@ module Flock
|
|
21
21
|
[dims,data]
|
22
22
|
end
|
23
23
|
|
24
|
-
def self.sparse_kmeans size, sparse_data, options = {}
|
24
|
+
def self.densify sparse_data, weights = nil
|
25
25
|
dims, data = sparse_data[0].kind_of?(Array) ? sparse_array_to_data(sparse_data) : sparse_hash_to_data(sparse_data)
|
26
26
|
|
27
|
-
if
|
28
|
-
|
29
|
-
|
30
|
-
|
27
|
+
if weights
|
28
|
+
resampled = Array.new(dims.size) {1}
|
29
|
+
weights.each {|k,v| resampled[dims[k]] = v }
|
30
|
+
weights = resampled
|
31
31
|
end
|
32
32
|
|
33
|
+
[data, weights]
|
34
|
+
end
|
35
|
+
|
36
|
+
def self.sparse_kmeans size, sparse_data, options = {}
|
37
|
+
data, options[:weights] = densify(sparse_data, options[:weights])
|
33
38
|
kmeans(size, data, nil, options)
|
34
39
|
end
|
35
40
|
|
36
41
|
def self.sparse_self_organizing_map nx, ny, sparse_data, options = {}
|
37
|
-
|
38
|
-
|
39
|
-
if options.key?(:weights)
|
40
|
-
weights = Array.new(dims.size) {1}
|
41
|
-
options[:weights].each {|k,v| weights[dims[k]] = v }
|
42
|
-
options[:weights] = weights
|
43
|
-
end
|
44
|
-
|
42
|
+
data, options[:weights] = densify(sparse_data, options[:weights])
|
45
43
|
self_organizing_map(nx, ny, data, nil, options)
|
46
44
|
end
|
45
|
+
|
46
|
+
def self.sparse_treecluster size, sparse_data, options = {}
|
47
|
+
data, options[:weights] = densify(sparse_data, options[:weights])
|
48
|
+
treecluster(size, data, nil, options)
|
49
|
+
end
|
47
50
|
end
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
- 3
|
8
|
-
- 1
|
9
|
-
version: 0.3.1
|
7
|
+
- 4
|
8
|
+
- 0
|
9
|
+
version: 0.4.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Bharanee Rathna
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-04-
|
17
|
+
date: 2011-04-27 00:00:00 +10:00
|
18
18
|
default_executable:
|
19
19
|
dependencies: []
|
20
20
|
|
@@ -40,6 +40,7 @@ files:
|
|
40
40
|
- examples/sparse.rb
|
41
41
|
- examples/som.rb
|
42
42
|
- examples/dense.rb
|
43
|
+
- examples/treecluster.rb
|
43
44
|
has_rdoc: true
|
44
45
|
homepage: http://github.com/deepfryed/flock
|
45
46
|
licenses: []
|
@@ -76,3 +77,4 @@ test_files:
|
|
76
77
|
- examples/sparse.rb
|
77
78
|
- examples/som.rb
|
78
79
|
- examples/dense.rb
|
80
|
+
- examples/treecluster.rb
|