flock 0.3.1 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -4,11 +4,15 @@ Ruby bindings to {Cluster 3.0}[http://bonsai.hgc.jp/~mdehoon/software/cluster/so
4
4
 
5
5
  == Description
6
6
 
7
- Provides bindings to K-Means clustering in Cluster 3.0
7
+ Provides bindings to clustering methods in Cluster 3.0.
8
+
9
+ * K-Means
10
+ * Kohonen Self-Organizing Maps
11
+ * Tree Cluster or Hierarchical Clustering
8
12
 
9
13
  == Synopsis
10
14
 
11
- === Numeric Data
15
+ === Specify vectors explicitly
12
16
 
13
17
  require 'pp'
14
18
  require 'flock'
@@ -69,8 +73,18 @@ Provides bindings to K-Means clustering in Cluster 3.0
69
73
  weights: Array.new(13) {1.0},
70
74
  )
71
75
 
76
+ pp Flock.treecluster(
77
+ 6,
78
+ data,
79
+ mask,
80
+ method: Flock::METHOD_AVERAGE,
81
+ metric: Flock::METRIC_EUCLIDIAN,
82
+ transpose: 0,
83
+ weights: Array.new(13) {1.0},
84
+ )
85
+
72
86
 
73
- === Sparse and Non-Numeric data
87
+ === Sparse data and clustering string labels
74
88
 
75
89
  require 'pp'
76
90
  require 'flock'
@@ -88,7 +102,7 @@ Provides bindings to K-Means clustering in Cluster 3.0
88
102
 
89
103
  data = []
90
104
 
91
- # a much simpler way to cluster text
105
+ # a much simpler way to cluster text labels.
92
106
  data << %w(apple orange)
93
107
  data << %w(black white)
94
108
  data << %w(white cyan)
@@ -97,9 +111,15 @@ Provides bindings to K-Means clustering in Cluster 3.0
97
111
 
98
112
  # additional options such as metric, iterations can be passed in a hash.
99
113
  pp Flock.sparse_kmeans(2, data)
114
+ pp Flock.sparse_treecluster(2, data)
115
+
100
116
 
101
117
  === Self-Organizing Map
102
118
 
119
+ Self-Organizing Maps (SOM) require that you specify a 2D grid on which data points can cluster. Some of the
120
+ grid points may be empty and others might have clusters mapped to them. Unlike K-Means or Tree Cluster,
121
+ there is no need to specify the number of clusters in advance.
122
+
103
123
  require 'pp'
104
124
  require 'flock'
105
125
 
@@ -116,11 +136,13 @@ Provides bindings to K-Means clustering in Cluster 3.0
116
136
  # additional options such as metric, iterations can be passed in a hash.
117
137
  pp Flock.sparse_self_organizing_map(2, 2, data)
118
138
 
119
- == TODO
139
+ Note: SOM clustering returns a 2D grid coordinate for each vector, whereas the K-Means and Tree Cluster
140
+ methods return an integer cluster value for each vector.
120
141
 
121
- Bindings to,
142
+ == TODO
122
143
 
123
- * Hierarchical clustering (treecluster)
144
+ * {K-Tree clustering}[http://arxiv.org/pdf/1001.0827v1]
145
+ * Use Sparse Matrix instead of converting sparse data into dense matrices.
124
146
 
125
147
  = License
126
148
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.1
1
+ 0.4.0
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'pp'
4
+ require 'flock'
5
+
6
+ data = []
7
+ data << %w(apple orange)
8
+ data << %w(black white)
9
+ data << %w(white cyan)
10
+ data << %w(orange)
11
+ data << %w(apple)
12
+
13
+ pp Flock.sparse_treecluster(2, data)
@@ -218,8 +218,12 @@ VALUE rb_som(int argc, VALUE *argv, VALUE self) {
218
218
  VALUE cluster = rb_ary_new();
219
219
  VALUE centroid = rb_ary_new();
220
220
 
221
- for (i = 0; i < dimx; i++)
222
- rb_ary_push(cluster, INT2NUM(ccluster[i][0] * nxgrid + ccluster[i][1]));
221
+ for (i = 0; i < dimx; i++) {
222
+ VALUE gridpoint = rb_ary_new();
223
+ rb_ary_push(gridpoint, INT2NUM(ccluster[i][0]));
224
+ rb_ary_push(gridpoint, INT2NUM(ccluster[i][1]));
225
+ rb_ary_push(cluster, gridpoint);
226
+ }
223
227
 
224
228
  for (i = 0; i < nxgrid; i++) {
225
229
  for (j = 0; j < nygrid; j++) {
@@ -256,6 +260,96 @@ VALUE rb_som(int argc, VALUE *argv, VALUE self) {
256
260
  return result;
257
261
  }
258
262
 
263
+ VALUE rb_treecluster(int argc, VALUE *argv, VALUE self) {
264
+ VALUE size, data, mask, weights, options;
265
+ rb_scan_args(argc, argv, "22", &size, &data, &mask, &options);
266
+
267
+ if (TYPE(data) != T_ARRAY)
268
+ rb_raise(rb_eArgError, "data should be an array of arrays");
269
+
270
+ if (!NIL_P(mask) && TYPE(mask) != T_ARRAY)
271
+ rb_raise(rb_eArgError, "mask should be an array of arrays");
272
+
273
+ if (NIL_P(size) || NUM2INT(rb_Integer(size)) > RARRAY_LEN(data))
274
+ rb_raise(rb_eArgError, "size should be > 0 and <= data size");
275
+
276
+ int transpose = opt_int_value(options, "transpose", 0);
277
+ // s = single linkage, m = maximum (complete) linkage, a = average linkage, c = centroid linkage
278
+ int method = opt_int_value(options, "method", 'a');
279
+ // e = Euclidean distance
280
+ // b = city-block distance
281
+ // c = correlation
282
+ // a = absolute value of the correlation
283
+ // u = uncentered correlation
284
+ // x = absolute uncentered correlation
285
+ // s = spearman's rank correlation
286
+ // k = kendall's tau
287
+ int dist = opt_int_value(options, "metric", 'e');
288
+
289
+ int i,j;
290
+ int nrows = RARRAY_LEN(data);
291
+ int ncols = RARRAY_LEN(rb_ary_entry(data, 0));
292
+ int nsets = NUM2INT(rb_Integer(size));
293
+
294
+ double **cdata = (double**)malloc(sizeof(double*)*nrows);
295
+ int **cmask = (int **)malloc(sizeof(int *)*nrows);
296
+ double *cweights = (double *)malloc(sizeof(double )*ncols);
297
+
298
+ int *ccluster, dimx = nrows, dimy = ncols;
299
+
300
+ for (i = 0; i < nrows; i++) {
301
+ cdata[i] = (double*)malloc(sizeof(double)*ncols);
302
+ cmask[i] = (int *)malloc(sizeof(int )*ncols);
303
+ for (j = 0; j < ncols; j++) {
304
+ cdata[i][j] = NUM2DBL(rb_Float(rb_ary_entry(rb_ary_entry(data, i), j)));
305
+ cmask[i][j] = NIL_P(mask) ? 1 : NUM2INT(rb_Integer(rb_ary_entry(rb_ary_entry(mask, i), j)));
306
+ }
307
+ }
308
+
309
+ weights = NIL_P(options) ? Qnil : rb_hash_aref(options, ID2SYM(rb_intern("weights")));
310
+ for (i = 0; i < ncols; i++) {
311
+ cweights[i] = NIL_P(weights) ? 1.0 : NUM2DBL(rb_Float(rb_ary_entry(weights, i)));
312
+ }
313
+
314
+ if (transpose) {
315
+ dimx = ncols;
316
+ dimy = nrows;
317
+ }
318
+
319
+ ccluster = (int *)malloc(sizeof(int)*dimx);
320
+
321
+ Node *tree = treecluster(nrows, ncols, cdata, cmask, cweights, transpose, dist, method, 0);
322
+ VALUE result = Qnil, cluster;
323
+
324
+ if (tree) {
325
+ cuttree(dimx, tree, nsets, ccluster);
326
+
327
+ result = rb_hash_new();
328
+ cluster = rb_ary_new();
329
+
330
+ for (i = 0; i < dimx; i++)
331
+ rb_ary_push(cluster, INT2NUM(ccluster[i]));
332
+
333
+ rb_hash_aset(result, ID2SYM(rb_intern("cluster")), cluster);
334
+ }
335
+
336
+ for (i = 0; i < nrows; i++) {
337
+ free(cdata[i]);
338
+ free(cmask[i]);
339
+ }
340
+
341
+ free(cdata);
342
+ free(cmask);
343
+ free(cweights);
344
+ free(ccluster);
345
+
346
+ if (tree)
347
+ free(tree);
348
+ else
349
+ rb_raise(rb_eNoMemError, "tree cluster ran out of memory");
350
+
351
+ return result;
352
+ }
259
353
 
260
354
  VALUE rb_distance(VALUE vec1, VALUE vec2, distance_fn fn) {
261
355
  uint32_t size;
@@ -334,6 +428,7 @@ void Init_flock(void) {
334
428
  mFlock = rb_define_module("Flock");
335
429
  rb_define_module_function(mFlock, "kmeans", RUBY_METHOD_FUNC(rb_kmeans), -1);
336
430
  rb_define_module_function(mFlock, "self_organizing_map", RUBY_METHOD_FUNC(rb_som), -1);
431
+ rb_define_module_function(mFlock, "treecluster", RUBY_METHOD_FUNC(rb_treecluster), -1);
337
432
 
338
433
  rb_define_const(mFlock, "METHOD_AVERAGE", INT2NUM('a'));
339
434
  rb_define_const(mFlock, "METHOD_MEDIAN", INT2NUM('m'));
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{flock}
8
- s.version = "0.3.1"
8
+ s.version = "0.4.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Bharanee Rathna"]
12
- s.date = %q{2011-04-24}
12
+ s.date = %q{2011-04-27}
13
13
  s.description = %q{A thin ruby binding to Cluster 3.0}
14
14
  s.email = ["deepfryed@gmail.com"]
15
15
  s.extensions = ["ext/extconf.rb"]
@@ -35,7 +35,8 @@ Gem::Specification.new do |s|
35
35
  s.test_files = [
36
36
  "examples/sparse.rb",
37
37
  "examples/som.rb",
38
- "examples/dense.rb"
38
+ "examples/dense.rb",
39
+ "examples/treecluster.rb"
39
40
  ]
40
41
 
41
42
  if s.respond_to? :specification_version then
@@ -21,27 +21,30 @@ module Flock
21
21
  [dims,data]
22
22
  end
23
23
 
24
- def self.sparse_kmeans size, sparse_data, options = {}
24
+ def self.densify sparse_data, weights = nil
25
25
  dims, data = sparse_data[0].kind_of?(Array) ? sparse_array_to_data(sparse_data) : sparse_hash_to_data(sparse_data)
26
26
 
27
- if options.key?(:weights)
28
- weights = Array.new(dims.size) {1}
29
- options[:weights].each {|k,v| weights[dims[k]] = v }
30
- options[:weights] = weights
27
+ if weights
28
+ resampled = Array.new(dims.size) {1}
29
+ weights.each {|k,v| resampled[dims[k]] = v }
30
+ weights = resampled
31
31
  end
32
32
 
33
+ [data, weights]
34
+ end
35
+
36
+ def self.sparse_kmeans size, sparse_data, options = {}
37
+ data, options[:weights] = densify(sparse_data, options[:weights])
33
38
  kmeans(size, data, nil, options)
34
39
  end
35
40
 
36
41
  def self.sparse_self_organizing_map nx, ny, sparse_data, options = {}
37
- dims, data = sparse_data[0].kind_of?(Array) ? sparse_array_to_data(sparse_data) : sparse_hash_to_data(sparse_data)
38
-
39
- if options.key?(:weights)
40
- weights = Array.new(dims.size) {1}
41
- options[:weights].each {|k,v| weights[dims[k]] = v }
42
- options[:weights] = weights
43
- end
44
-
42
+ data, options[:weights] = densify(sparse_data, options[:weights])
45
43
  self_organizing_map(nx, ny, data, nil, options)
46
44
  end
45
+
46
+ def self.sparse_treecluster size, sparse_data, options = {}
47
+ data, options[:weights] = densify(sparse_data, options[:weights])
48
+ treecluster(size, data, nil, options)
49
+ end
47
50
  end
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
- - 3
8
- - 1
9
- version: 0.3.1
7
+ - 4
8
+ - 0
9
+ version: 0.4.0
10
10
  platform: ruby
11
11
  authors:
12
12
  - Bharanee Rathna
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-04-24 00:00:00 +10:00
17
+ date: 2011-04-27 00:00:00 +10:00
18
18
  default_executable:
19
19
  dependencies: []
20
20
 
@@ -40,6 +40,7 @@ files:
40
40
  - examples/sparse.rb
41
41
  - examples/som.rb
42
42
  - examples/dense.rb
43
+ - examples/treecluster.rb
43
44
  has_rdoc: true
44
45
  homepage: http://github.com/deepfryed/flock
45
46
  licenses: []
@@ -76,3 +77,4 @@ test_files:
76
77
  - examples/sparse.rb
77
78
  - examples/som.rb
78
79
  - examples/dense.rb
80
+ - examples/treecluster.rb