flock 0.3.1 → 0.4.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -4,11 +4,15 @@ Ruby bindings to {Cluster 3.0}[http://bonsai.hgc.jp/~mdehoon/software/cluster/so
4
4
 
5
5
  == Description
6
6
 
7
- Provides bindings to K-Means clustering in Cluster 3.0
7
+ Provides bindings to clustering methods in Cluster 3.0.
8
+
9
+ * K-Means
10
+ * Kohonen Self-Organizing Maps
11
+ * Tree Cluster (Hierarchical Clustering)
8
12
 
9
13
  == Synopsis
10
14
 
11
- === Numeric Data
15
+ === Specify vectors explicitly
12
16
 
13
17
  require 'pp'
14
18
  require 'flock'
@@ -69,8 +73,18 @@ Provides bindings to K-Means clustering in Cluster 3.0
69
73
  weights: Array.new(13) {1.0},
70
74
  )
71
75
 
76
+ pp Flock.treecluster(
77
+ 6,
78
+ data,
79
+ mask,
80
+ method: Flock::METHOD_AVERAGE,
81
+ metric: Flock::METRIC_EUCLIDIAN,
82
+ transpose: 0,
83
+ weights: Array.new(13) {1.0},
84
+ )
85
+
72
86
 
73
- === Sparse and Non-Numeric data
87
+ === Sparse data and clustering string labels
74
88
 
75
89
  require 'pp'
76
90
  require 'flock'
@@ -88,7 +102,7 @@ Provides bindings to K-Means clustering in Cluster 3.0
88
102
 
89
103
  data = []
90
104
 
91
- # a much simpler way to cluster text
105
+ # a much simpler way to cluster text labels.
92
106
  data << %w(apple orange)
93
107
  data << %w(black white)
94
108
  data << %w(white cyan)
@@ -97,9 +111,15 @@ Provides bindings to K-Means clustering in Cluster 3.0
97
111
 
98
112
  # additional options such as metric, iterations can be passed in a hash.
99
113
  pp Flock.sparse_kmeans(2, data)
114
+ pp Flock.sparse_treecluster(2, data)
115
+
100
116
 
101
117
  === Self-Organizing Map
102
118
 
119
+ Self-Organizing Maps (SOM) require that you specify a 2D grid on which data points can cluster. Some of the
120
+ grid points may be empty and others might have clusters mapped to them. Unlike K-Means or Tree Cluster,
121
+ there is no need to specify the number of clusters in advance.
122
+
103
123
  require 'pp'
104
124
  require 'flock'
105
125
 
@@ -116,11 +136,13 @@ Provides bindings to K-Means clustering in Cluster 3.0
116
136
  # additional options such as metric, iterations can be passed in a hash.
117
137
  pp Flock.sparse_self_organizing_map(2, 2, data)
118
138
 
119
- == TODO
139
+ Note: SOM clustering returns a 2D grid coordinate for each vector, rather than the integer cluster index
140
+ that the K-Means and Tree Cluster methods return.
120
141
 
121
- Bindings to,
142
+ == TODO
122
143
 
123
- * Hierarchical clustering (treecluster)
144
+ * {K-Tree clustering}[http://arxiv.org/pdf/1001.0827v1]
145
+ * Use Sparse Matrix instead of converting sparse data into dense matrices.
124
146
 
125
147
  = License
126
148
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.1
1
+ 0.4.0
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'pp'
4
+ require 'flock'
5
+
6
+ data = []
7
+ data << %w(apple orange)
8
+ data << %w(black white)
9
+ data << %w(white cyan)
10
+ data << %w(orange)
11
+ data << %w(apple)
12
+
13
+ pp Flock.sparse_treecluster(2, data)
@@ -218,8 +218,12 @@ VALUE rb_som(int argc, VALUE *argv, VALUE self) {
218
218
  VALUE cluster = rb_ary_new();
219
219
  VALUE centroid = rb_ary_new();
220
220
 
221
- for (i = 0; i < dimx; i++)
222
- rb_ary_push(cluster, INT2NUM(ccluster[i][0] * nxgrid + ccluster[i][1]));
221
+ for (i = 0; i < dimx; i++) {
222
+ VALUE gridpoint = rb_ary_new();
223
+ rb_ary_push(gridpoint, INT2NUM(ccluster[i][0]));
224
+ rb_ary_push(gridpoint, INT2NUM(ccluster[i][1]));
225
+ rb_ary_push(cluster, gridpoint);
226
+ }
223
227
 
224
228
  for (i = 0; i < nxgrid; i++) {
225
229
  for (j = 0; j < nygrid; j++) {
@@ -256,6 +260,96 @@ VALUE rb_som(int argc, VALUE *argv, VALUE self) {
256
260
  return result;
257
261
  }
258
262
 
263
+ VALUE rb_treecluster(int argc, VALUE *argv, VALUE self) {
264
+ VALUE size, data, mask, weights, options;
265
+ rb_scan_args(argc, argv, "22", &size, &data, &mask, &options);
266
+
267
+ if (TYPE(data) != T_ARRAY)
268
+ rb_raise(rb_eArgError, "data should be an array of arrays");
269
+
270
+ if (!NIL_P(mask) && TYPE(mask) != T_ARRAY)
271
+ rb_raise(rb_eArgError, "mask should be an array of arrays");
272
+
273
+ if (NIL_P(size) || NUM2INT(rb_Integer(size)) > RARRAY_LEN(data))
274
+ rb_raise(rb_eArgError, "size should be > 0 and <= data size");
275
+
276
+ int transpose = opt_int_value(options, "transpose", 0);
277
+ // s = single linkage, m = maximum (complete) linkage, a = average linkage, c = centroid linkage
278
+ int method = opt_int_value(options, "method", 'a');
279
+ // e = euclidean,
280
+ // b = city-block distance
281
+ // c = correlation
282
+ // a = absolute value of the correlation
283
+ // u = uncentered correlation
284
+ // x = absolute uncentered correlation
285
+ // s = spearman's rank correlation
286
+ // k = kendall's tau
287
+ int dist = opt_int_value(options, "metric", 'e');
288
+
289
+ int i,j;
290
+ int nrows = RARRAY_LEN(data);
291
+ int ncols = RARRAY_LEN(rb_ary_entry(data, 0));
292
+ int nsets = NUM2INT(rb_Integer(size));
293
+
294
+ double **cdata = (double**)malloc(sizeof(double*)*nrows);
295
+ int **cmask = (int **)malloc(sizeof(int *)*nrows);
296
+ double *cweights = (double *)malloc(sizeof(double )*ncols);
297
+
298
+ int *ccluster, dimx = nrows, dimy = ncols;
299
+
300
+ for (i = 0; i < nrows; i++) {
301
+ cdata[i] = (double*)malloc(sizeof(double)*ncols);
302
+ cmask[i] = (int *)malloc(sizeof(int )*ncols);
303
+ for (j = 0; j < ncols; j++) {
304
+ cdata[i][j] = NUM2DBL(rb_Float(rb_ary_entry(rb_ary_entry(data, i), j)));
305
+ cmask[i][j] = NIL_P(mask) ? 1 : NUM2INT(rb_Integer(rb_ary_entry(rb_ary_entry(mask, i), j)));
306
+ }
307
+ }
308
+
309
+ weights = NIL_P(options) ? Qnil : rb_hash_aref(options, ID2SYM(rb_intern("weights")));
310
+ for (i = 0; i < ncols; i++) {
311
+ cweights[i] = NIL_P(weights) ? 1.0 : NUM2DBL(rb_Float(rb_ary_entry(weights, i)));
312
+ }
313
+
314
+ if (transpose) {
315
+ dimx = ncols;
316
+ dimy = nrows;
317
+ }
318
+
319
+ ccluster = (int *)malloc(sizeof(int)*dimx);
320
+
321
+ Node *tree = treecluster(nrows, ncols, cdata, cmask, cweights, transpose, dist, method, 0);
322
+ VALUE result = Qnil, cluster;
323
+
324
+ if (tree) {
325
+ cuttree(dimx, tree, nsets, ccluster);
326
+
327
+ result = rb_hash_new();
328
+ cluster = rb_ary_new();
329
+
330
+ for (i = 0; i < dimx; i++)
331
+ rb_ary_push(cluster, INT2NUM(ccluster[i]));
332
+
333
+ rb_hash_aset(result, ID2SYM(rb_intern("cluster")), cluster);
334
+ }
335
+
336
+ for (i = 0; i < nrows; i++) {
337
+ free(cdata[i]);
338
+ free(cmask[i]);
339
+ }
340
+
341
+ free(cdata);
342
+ free(cmask);
343
+ free(cweights);
344
+ free(ccluster);
345
+
346
+ if (tree)
347
+ free(tree);
348
+ else
349
+ rb_raise(rb_eNoMemError, "tree cluster ran out of memory");
350
+
351
+ return result;
352
+ }
259
353
 
260
354
  VALUE rb_distance(VALUE vec1, VALUE vec2, distance_fn fn) {
261
355
  uint32_t size;
@@ -334,6 +428,7 @@ void Init_flock(void) {
334
428
  mFlock = rb_define_module("Flock");
335
429
  rb_define_module_function(mFlock, "kmeans", RUBY_METHOD_FUNC(rb_kmeans), -1);
336
430
  rb_define_module_function(mFlock, "self_organizing_map", RUBY_METHOD_FUNC(rb_som), -1);
431
+ rb_define_module_function(mFlock, "treecluster", RUBY_METHOD_FUNC(rb_treecluster), -1);
337
432
 
338
433
  rb_define_const(mFlock, "METHOD_AVERAGE", INT2NUM('a'));
339
434
  rb_define_const(mFlock, "METHOD_MEDIAN", INT2NUM('m'));
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{flock}
8
- s.version = "0.3.1"
8
+ s.version = "0.4.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Bharanee Rathna"]
12
- s.date = %q{2011-04-24}
12
+ s.date = %q{2011-04-27}
13
13
  s.description = %q{A thin ruby binding to Cluster 3.0}
14
14
  s.email = ["deepfryed@gmail.com"]
15
15
  s.extensions = ["ext/extconf.rb"]
@@ -35,7 +35,8 @@ Gem::Specification.new do |s|
35
35
  s.test_files = [
36
36
  "examples/sparse.rb",
37
37
  "examples/som.rb",
38
- "examples/dense.rb"
38
+ "examples/dense.rb",
39
+ "examples/treecluster.rb"
39
40
  ]
40
41
 
41
42
  if s.respond_to? :specification_version then
@@ -21,27 +21,30 @@ module Flock
21
21
  [dims,data]
22
22
  end
23
23
 
24
- def self.sparse_kmeans size, sparse_data, options = {}
24
+ def self.densify sparse_data, weights = nil
25
25
  dims, data = sparse_data[0].kind_of?(Array) ? sparse_array_to_data(sparse_data) : sparse_hash_to_data(sparse_data)
26
26
 
27
- if options.key?(:weights)
28
- weights = Array.new(dims.size) {1}
29
- options[:weights].each {|k,v| weights[dims[k]] = v }
30
- options[:weights] = weights
27
+ if weights
28
+ resampled = Array.new(dims.size) {1}
29
+ weights.each {|k,v| resampled[dims[k]] = v }
30
+ weights = resampled
31
31
  end
32
32
 
33
+ [data, weights]
34
+ end
35
+
36
+ def self.sparse_kmeans size, sparse_data, options = {}
37
+ data, options[:weights] = densify(sparse_data, options[:weights])
33
38
  kmeans(size, data, nil, options)
34
39
  end
35
40
 
36
41
  def self.sparse_self_organizing_map nx, ny, sparse_data, options = {}
37
- dims, data = sparse_data[0].kind_of?(Array) ? sparse_array_to_data(sparse_data) : sparse_hash_to_data(sparse_data)
38
-
39
- if options.key?(:weights)
40
- weights = Array.new(dims.size) {1}
41
- options[:weights].each {|k,v| weights[dims[k]] = v }
42
- options[:weights] = weights
43
- end
44
-
42
+ data, options[:weights] = densify(sparse_data, options[:weights])
45
43
  self_organizing_map(nx, ny, data, nil, options)
46
44
  end
45
+
46
+ def self.sparse_treecluster size, sparse_data, options = {}
47
+ data, options[:weights] = densify(sparse_data, options[:weights])
48
+ treecluster(size, data, nil, options)
49
+ end
47
50
  end
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
- - 3
8
- - 1
9
- version: 0.3.1
7
+ - 4
8
+ - 0
9
+ version: 0.4.0
10
10
  platform: ruby
11
11
  authors:
12
12
  - Bharanee Rathna
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-04-24 00:00:00 +10:00
17
+ date: 2011-04-27 00:00:00 +10:00
18
18
  default_executable:
19
19
  dependencies: []
20
20
 
@@ -40,6 +40,7 @@ files:
40
40
  - examples/sparse.rb
41
41
  - examples/som.rb
42
42
  - examples/dense.rb
43
+ - examples/treecluster.rb
43
44
  has_rdoc: true
44
45
  homepage: http://github.com/deepfryed/flock
45
46
  licenses: []
@@ -76,3 +77,4 @@ test_files:
76
77
  - examples/sparse.rb
77
78
  - examples/som.rb
78
79
  - examples/dense.rb
80
+ - examples/treecluster.rb