rbcluster 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,93 @@
1
+ /******************************************************************************/
2
+ /* The C Clustering Library.
3
+ * Copyright (C) 2002 Michiel Jan Laurens de Hoon.
4
+ *
5
+ * This library was written at the Laboratory of DNA Information Analysis,
6
+ * Human Genome Center, Institute of Medical Science, University of Tokyo,
7
+ * 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
8
+ * Contact: mdehoon 'AT' gsc.riken.jp
9
+ *
10
+ * Permission to use, copy, modify, and distribute this software and its
11
+ * documentation with or without modifications and for any purpose and
12
+ * without fee is hereby granted, provided that any copyright notices
13
+ * appear in all copies and that both those copyright notices and this
14
+ * permission notice appear in supporting documentation, and that the
15
+ * names of the contributors or copyright holders not be used in
16
+ * advertising or publicity pertaining to distribution of the software
17
+ * without specific prior permission.
18
+ *
19
+ * THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
20
+ * WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
21
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
22
+ * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
23
+ * OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
24
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
25
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
26
+ * OR PERFORMANCE OF THIS SOFTWARE.
27
+ *
28
+ */
29
+
30
+ #ifndef min
31
+ #define min(x, y) ((x) < (y) ? (x) : (y))
32
+ #endif
33
+ #ifndef max
34
+ #define max(x, y) ((x) > (y) ? (x) : (y))
35
+ #endif
36
+
37
+ #ifdef WINDOWS
38
+ # include <windows.h>
39
+ #endif
40
+
41
+ #define CLUSTERVERSION "1.50"
42
+
43
+ /* Chapter 2 */
44
+ double clusterdistance (int nrows, int ncolumns, double** data, int** mask,
45
+ double weight[], int n1, int n2, int index1[], int index2[], char dist,
46
+ char method, int transpose);
47
+ double** distancematrix (int ngenes, int ndata, double** data,
48
+ int** mask, double* weight, char dist, int transpose);
49
+
50
+ /* Chapter 3 */
51
+ int getclustercentroids(int nclusters, int nrows, int ncolumns,
52
+ double** data, int** mask, int clusterid[], double** cdata, int** cmask,
53
+ int transpose, char method);
54
+ void getclustermedoids(int nclusters, int nelements, double** distance,
55
+ int clusterid[], int centroids[], double errors[]);
56
+ void kcluster (int nclusters, int ngenes, int ndata, double** data,
57
+ int** mask, double weight[], int transpose, int npass, char method, char dist,
58
+ int clusterid[], double* error, int* ifound);
59
+ void kmedoids (int nclusters, int nelements, double** distance,
60
+ int npass, int clusterid[], double* error, int* ifound);
61
+
62
+ /* Chapter 4 */
63
+ typedef struct {int left; int right; double distance;} Node;
64
+ /*
65
+ * A Node struct describes a single node in a tree created by hierarchical
66
+ * clustering. The tree can be represented by an array of n Node structs,
67
+ * where n is the number of elements minus one. The integers left and right
68
+ * in each Node struct refer to the two elements or subnodes that are joined
69
+ * in this node. The original elements are numbered 0..nelements-1, and the
70
+ * nodes -1..-(nelements-1). For each node, distance contains the distance
71
+ * between the two subnodes that were joined.
72
+ */
73
+
74
+ Node* treecluster (int nrows, int ncolumns, double** data, int** mask,
75
+ double weight[], int transpose, char dist, char method, double** distmatrix);
76
+ void cuttree (int nelements, Node* tree, int nclusters, int clusterid[]);
77
+
78
+ /* Chapter 5 */
79
+ void somcluster (int nrows, int ncolumns, double** data, int** mask,
80
+ const double weight[], int transpose, int nxnodes, int nynodes,
81
+ double inittau, int niter, char dist, double*** celldata,
82
+ int clusterid[][2]);
83
+
84
+ /* Chapter 6 */
85
+ int pca(int m, int n, double** u, double** v, double* w);
86
+
87
+ /* Utility routines, currently undocumented */
88
+ void sort(int n, const double data[], int index[]);
89
+ double mean(int n, double x[]);
90
+ double median (int n, double x[]);
91
+
92
+ double* calculate_weights(int nrows, int ncolumns, double** data, int** mask,
93
+ double weights[], int transpose, char dist, double cutoff, double exponent);
@@ -0,0 +1,6 @@
1
+ require 'mkmf'
2
+
3
+ $CFLAGS << " #{ENV["CFLAGS"]} -std=c99"
4
+ RbConfig::MAKEFILE_CONFIG['CC'] = ENV['CC'] if ENV['CC']
5
+
6
+ create_makefile "rbcluster/rbcluster"
@@ -0,0 +1,775 @@
1
+ #include <stdlib.h>
2
+ #include <stdio.h>
3
+ #include "ruby.h"
4
+ #include "cluster.h"
5
+
6
+ // missing on 1.8
7
+ #ifndef DBL2NUM
8
+ #define DBL2NUM( dbl_val ) rb_float_new( dbl_val )
9
+ #endif
10
+
11
+ VALUE rbcluster_mCluster = Qnil;
12
+ VALUE rbcluster_cNode = Qnil;
13
+
14
+ VALUE rbcluster_rows2rb(double** data, int nrows, int ncols) {
15
+ VALUE rows = rb_ary_new2((long)nrows);
16
+ VALUE cols;
17
+ int i, j;
18
+
19
+ for(i = 0; i < nrows; ++i) {
20
+ cols = rb_ary_new2((long)ncols);
21
+ rb_ary_push(rows, cols);
22
+ for(j = 0; j < ncols; ++j) {
23
+ rb_ary_push(cols, DBL2NUM(data[i][j]));
24
+ }
25
+ }
26
+
27
+ return rows;
28
+ }
29
+
30
+ double* rbcluster_ary_to_doubles(VALUE data, int len) {
31
+ Check_Type(data, T_ARRAY);
32
+
33
+ if(RARRAY_LEN(data) != len) {
34
+ rb_raise(rb_eArgError, "expected Array to have %d entries, got %ld", len, RARRAY_LEN(data));
35
+ }
36
+
37
+ double* result = malloc(len*sizeof(double));
38
+ for(int i = 0; i < len; ++i) {
39
+ result[i] = NUM2DBL(rb_ary_entry(data, i));
40
+ }
41
+
42
+ return result;
43
+ }
44
+
45
+ double** rbcluster_ary_to_rows(VALUE data, int* nrows, int* ncols) {
46
+ Check_Type(data, T_ARRAY);
47
+ long rows, cols;
48
+
49
+ rows = RARRAY_LEN(data);
50
+ if(rows == 0) {
51
+ rb_raise(rb_eArgError, "no data given");
52
+ }
53
+
54
+ VALUE first_element = rb_ary_entry(data, 0);
55
+ Check_Type(first_element, T_ARRAY);
56
+ cols = RARRAY_LEN(first_element);
57
+
58
+ double** result = malloc((rows)*sizeof(double*));
59
+
60
+ VALUE row;
61
+ int i, j;
62
+
63
+ for(i = 0; i < rows; ++i) {
64
+ result[i] = malloc((cols)*sizeof(double));
65
+ row = rb_ary_entry(data, i);
66
+
67
+ Check_Type(row, T_ARRAY);
68
+ if(RARRAY_LEN(row) != cols) {
69
+ rb_raise(rb_eArgError, "expected %ld columns, row has %ld", cols, RARRAY_LEN(row));
70
+ }
71
+
72
+ for(j = 0; j < cols; ++j) {
73
+ result[i][j] = NUM2DBL(rb_ary_entry(row, j));
74
+ }
75
+ }
76
+
77
+ *nrows = (int)rows;
78
+ *ncols = (int)cols;
79
+
80
+ return result;
81
+ }
82
+
83
+ void rbcluster_free_rows(double** data, int nrows) {
84
+ for(int i = 0; i < nrows; ++i)
85
+ free(data[i]);
86
+
87
+ free(data);
88
+ }
89
+
90
+ int** rbcluster_create_mask(int nrows, int ncols) {
91
+ int** mask = malloc(nrows*sizeof(int*));
92
+
93
+ for(int i = 0; i < nrows; ++i) {
94
+ mask[i] = malloc(ncols*sizeof(int));
95
+ for(int n = 0; n < ncols; ++n) {
96
+ mask[i][n] = 1;
97
+ }
98
+ }
99
+
100
+ return mask;
101
+ }
102
+
103
+ void rbcluster_free_mask(int** mask, int nrows) {
104
+ for(int i = 0; i < nrows; ++i)
105
+ free(mask[i]);
106
+
107
+ free(mask);
108
+ }
109
+
110
+ double* rbcluster_create_weight(int ncols) {
111
+ double* weight = malloc(ncols*sizeof(double));
112
+
113
+ for (int i = 0; i < ncols; i++)
114
+ weight[i] = 1.0;
115
+
116
+ return weight;
117
+ }
118
+
119
+ VALUE rbcluster_ints2rb(int* ints, long nrows) {
120
+ VALUE ary = rb_ary_new2(nrows);
121
+
122
+ for(int i = 0; i < nrows; ++i) {
123
+ rb_ary_push(ary, INT2FIX(ints[i]));
124
+ }
125
+
126
+ return ary;
127
+ }
128
+
129
+ VALUE rbcluster_doubles2rb(double* arr, long nrows) {
130
+ VALUE ary = rb_ary_new2(nrows);
131
+
132
+ for(int i = 0; i < nrows; ++i) {
133
+ rb_ary_push(ary, DBL2NUM(arr[i]));
134
+ }
135
+
136
+ return ary;
137
+ }
138
+
139
+
140
+ /*
141
+ option hash parsing
142
+ */
143
+
144
+ void rbcluster_parse_mask(VALUE opts, int** mask, int nrows, int ncols) {
145
+ VALUE val = rb_hash_aref(opts, ID2SYM(rb_intern("mask")));
146
+
147
+ if(NIL_P(val))
148
+ return;
149
+
150
+ // TODO: better error message
151
+ Check_Type(val, T_ARRAY);
152
+ VALUE row;
153
+ int i, j;
154
+
155
+ for(i = 0; i < nrows; ++i) {
156
+ row = rb_ary_entry(val, i);
157
+ Check_Type(row, T_ARRAY);
158
+ for(j = 0; j < ncols; ++j) {
159
+ mask[i][j] = FIX2INT(rb_ary_entry(row, j));
160
+ }
161
+ }
162
+ }
163
+
164
+ void rbcluster_parse_weight(VALUE opts, double** weight, int ncols) {
165
+ VALUE val = rb_hash_aref(opts, ID2SYM(rb_intern("weight")));
166
+
167
+ if(val != Qnil) {
168
+ free(*weight);
169
+ *weight = rbcluster_ary_to_doubles(val, ncols);
170
+ }
171
+ }
172
+
173
+ void rbcluster_parse_int(VALUE opts, const char* key, int* out) {
174
+ VALUE val = rb_hash_aref(opts, ID2SYM(rb_intern(key)));
175
+ if(val != Qnil) {
176
+ Check_Type(val, T_FIXNUM);
177
+ *out = FIX2INT(val);
178
+ }
179
+ }
180
+
181
+ void rbcluster_parse_double(VALUE opts, const char* key, double* out) {
182
+ VALUE val = rb_hash_aref(opts, ID2SYM(rb_intern(key)));
183
+ if(val != Qnil) {
184
+ Check_Type(val, T_FLOAT);
185
+ *out = NUM2DBL(val);
186
+ }
187
+ }
188
+
189
+ void rbcluster_parse_char(VALUE opts, const char* key, char* out) {
190
+ VALUE val = rb_hash_aref(opts, ID2SYM(rb_intern(key)));
191
+ if(val != Qnil) {
192
+ Check_Type(val, T_STRING);
193
+ *out = RSTRING_PTR(val)[0];
194
+ }
195
+ }
196
+
197
+ void rbcluster_parse_bool(VALUE opts, const char* key, int* out) {
198
+ VALUE val = rb_hash_aref(opts, ID2SYM(rb_intern(key)));
199
+ if(val != Qnil) {
200
+ *out = val ? 1 : 0;
201
+ }
202
+ }
203
+
204
+
205
+ /*
206
+ main function
207
+ */
208
+
209
+ VALUE rbcluster_kcluster(int argc, VALUE* argv, VALUE self) {
210
+ VALUE arr, opts;
211
+ int nrows, ncols, i, j;
212
+
213
+ rb_scan_args(argc, argv, "11", &arr, &opts);
214
+
215
+ double** data = rbcluster_ary_to_rows(arr, &nrows, &ncols);
216
+ int** mask = rbcluster_create_mask(nrows, ncols);
217
+
218
+ // defaults
219
+ int nclusters = 2;
220
+ int transpose = 0;
221
+ int npass = 1;
222
+ char method = 'a';
223
+ char dist = 'e';
224
+ double* weight = rbcluster_create_weight(nrows);
225
+
226
+ int* clusterid = malloc(nrows*sizeof(int));
227
+ int ifound = 0;
228
+ double error;
229
+
230
+ // options
231
+ if(opts != Qnil) {
232
+ Check_Type(opts, T_HASH);
233
+ VALUE val;
234
+
235
+ rbcluster_parse_int(opts, "clusters", &nclusters);
236
+ rbcluster_parse_mask(opts, mask, nrows, ncols);
237
+ rbcluster_parse_weight(opts, &weight, ncols);
238
+ rbcluster_parse_bool(opts, "transpose", &transpose);
239
+ rbcluster_parse_int(opts, "passes", &npass);
240
+ rbcluster_parse_char(opts, "method", &method);
241
+ rbcluster_parse_char(opts, "dist", &dist);
242
+ }
243
+
244
+ kcluster(
245
+ nclusters,
246
+ nrows,
247
+ ncols,
248
+ data,
249
+ mask,
250
+ weight,
251
+ transpose,
252
+ npass,
253
+ method,
254
+ dist,
255
+ clusterid,
256
+ &error,
257
+ &ifound
258
+ );
259
+
260
+ VALUE result = rbcluster_ints2rb(clusterid, nrows);
261
+
262
+ rbcluster_free_rows(data, nrows);
263
+ rbcluster_free_mask(mask, nrows);
264
+
265
+ free(weight);
266
+ free(clusterid);
267
+
268
+ return rb_ary_new3(3, result, rb_float_new(error), INT2NUM(ifound));
269
+ }
270
+
271
+ VALUE rbcluster_kmedoids(int argc, VALUE* argv, VALUE self) {
272
+ VALUE data, opts;
273
+
274
+ rb_scan_args(argc, argv, "11", &data, &opts);
275
+ Check_Type(data, T_ARRAY);
276
+
277
+ int nitems = (int)RARRAY_LEN(data);
278
+ int nclusters = 2;
279
+ int npass = 1;
280
+
281
+ // populate 'distances' from the input Array
282
+ double** distances = malloc(nitems*sizeof(double*));
283
+ int i, j;
284
+ VALUE row, num;
285
+
286
+ for(i = 0; i < nitems; ++i) {
287
+ row = rb_ary_entry(data, i);
288
+ // TODO: better error message
289
+ Check_Type(row, T_ARRAY);
290
+ if(RARRAY_LEN(row) != i) {
291
+ rb_raise(rb_eArgError,
292
+ "expected row %d to have exactly %d elements, got %ld", i, i, RARRAY_LEN(row));
293
+ }
294
+
295
+ if(i == 0) {
296
+ distances[i] = NULL;
297
+ } else {
298
+ distances[i] = malloc(i*sizeof(double));
299
+ }
300
+
301
+ for(j = 0; j < i; ++j) {
302
+ distances[i][j] = NUM2DBL(rb_ary_entry(row, j));
303
+ }
304
+ }
305
+
306
+ if(opts != Qnil) {
307
+ rbcluster_parse_int(opts, "clusters", &nclusters);
308
+ rbcluster_parse_int(opts, "passes", &npass);
309
+ // TODO: initialid
310
+ }
311
+
312
+ int* clusterid = malloc(nitems*sizeof(int));
313
+ double error;
314
+ int ifound;
315
+
316
+ // void kmedoids (int nclusters, int nelements, double** distance,
317
+ // int npass, int clusterid[], double* error, int* ifound);
318
+ kmedoids(
319
+ nclusters,
320
+ nitems,
321
+ distances,
322
+ npass,
323
+ clusterid,
324
+ &error,
325
+ &ifound
326
+ );
327
+
328
+ VALUE result = rbcluster_ints2rb(clusterid, nitems);
329
+ free(clusterid);
330
+ for(i = 1; i < nitems; ++i) free(distances[i]);
331
+
332
+ return rb_ary_new3(3, result, rb_float_new(error), INT2NUM(ifound));
333
+ }
334
+
335
+ VALUE rbcluster_median(VALUE self, VALUE ary) {
336
+ Check_Type(ary, T_ARRAY);
337
+
338
+ long len = RARRAY_LEN(ary);
339
+ double arr[len];
340
+ int i;
341
+ VALUE num;
342
+
343
+ for(i = 0; i < len; ++i) {
344
+ num = rb_ary_entry(ary, i);
345
+ arr[i] = NUM2DBL(num);
346
+ }
347
+
348
+ return rb_float_new(median((int)len, arr));
349
+ }
350
+
351
+ VALUE rbcluster_mean(VALUE self, VALUE ary) {
352
+ Check_Type(ary, T_ARRAY);
353
+
354
+ long len = RARRAY_LEN(ary);
355
+ double arr[len];
356
+ int i;
357
+ VALUE num;
358
+
359
+ for(i = 0; i < len; ++i) {
360
+ num = rb_ary_entry(ary, i);
361
+ arr[i] = NUM2DBL(num);
362
+ }
363
+
364
+ return rb_float_new(mean((int)len, arr));
365
+ }
366
+
367
+ VALUE rbcluster_distancematrix(int argc, VALUE* argv, VALUE self) {
368
+ VALUE data, opts;
369
+ int nrows, ncols, i, j;
370
+
371
+ rb_scan_args(argc, argv, "11", &data, &opts);
372
+ double** rows = rbcluster_ary_to_rows(data, &nrows, &ncols);
373
+
374
+ char dist = 'e';
375
+ int transpose = 0;
376
+ int** mask = rbcluster_create_mask(nrows, ncols);
377
+ double* weight = rbcluster_create_weight(ncols);
378
+
379
+ if(opts != Qnil) {
380
+ Check_Type(opts, T_HASH);
381
+ VALUE val;
382
+
383
+ rbcluster_parse_mask(opts, mask, nrows, ncols);
384
+ rbcluster_parse_weight(opts, &weight, ncols);
385
+ rbcluster_parse_char(opts, "dist", &dist);
386
+ rbcluster_parse_bool(opts, "transpose", &transpose);
387
+ }
388
+
389
+ VALUE result = Qnil;
390
+ double** distances = distancematrix(
391
+ nrows,
392
+ ncols,
393
+ rows,
394
+ mask,
395
+ weight,
396
+ dist,
397
+ transpose
398
+ );
399
+
400
+ if(distances) {
401
+ result = rb_ary_new();
402
+ for(i = 0; i < nrows; ++i) {
403
+ VALUE row = rb_ary_new();
404
+
405
+ for(j = 0; j < i; ++j){
406
+ rb_ary_push(row, rb_float_new(distances[i][j]));
407
+ }
408
+
409
+ // first row is NULL
410
+ if(i != 0) {
411
+ free(distances[i]);
412
+ }
413
+
414
+ rb_ary_push(result, row);
415
+ }
416
+ }
417
+
418
+ free(weight);
419
+ rbcluster_free_rows(rows, nrows);
420
+ rbcluster_free_mask(mask, nrows);
421
+
422
+ return result;
423
+ }
424
+
425
+ int* rbcluster_parse_index(VALUE arr, int* len) {
426
+ Check_Type(arr, T_ARRAY);
427
+ long length = RARRAY_LEN(arr);
428
+
429
+ int* result = malloc(length*sizeof(int));
430
+
431
+ for(int i = 0; i < length; ++i) {
432
+ result[i] = FIX2INT(rb_ary_entry(arr, i));
433
+ }
434
+
435
+ *len = (int)length;
436
+ return result;
437
+ }
438
+
439
+ VALUE rbcluster_clusterdistance(int argc, VALUE* argv, VALUE self) {
440
+ VALUE data, index1, index2, opts;
441
+ int nrows, ncols;
442
+
443
+ rb_scan_args(argc, argv, "31", &data, &index1, &index2, &opts);
444
+ double** rows = rbcluster_ary_to_rows(data, &nrows, &ncols);
445
+
446
+ int nidx1, nidx2;
447
+ int* idx1 = rbcluster_parse_index(index1, &nidx1);
448
+ int* idx2 = rbcluster_parse_index(index2, &nidx2);
449
+
450
+ int** mask = rbcluster_create_mask(nrows, ncols);
451
+ double* weight = rbcluster_create_weight(ncols);
452
+ char method = 'a';
453
+ char dist = 'e';
454
+ int transpose = 0;
455
+
456
+ if(opts != Qnil) {
457
+ rbcluster_parse_mask(opts, mask, nrows, ncols);
458
+ rbcluster_parse_weight(opts, &weight, ncols);
459
+ rbcluster_parse_char(opts, "dist", &dist);
460
+ rbcluster_parse_char(opts, "method", &method);
461
+ rbcluster_parse_bool(opts, "transpose", &transpose);
462
+ }
463
+
464
+ double result = clusterdistance(
465
+ nrows,
466
+ ncols,
467
+ rows,
468
+ mask,
469
+ weight,
470
+ nidx1,
471
+ nidx2,
472
+ idx1,
473
+ idx2,
474
+ dist,
475
+ method,
476
+ transpose
477
+ );
478
+
479
+ free(weight);
480
+ free(idx1);
481
+ free(idx2);
482
+ rbcluster_free_rows(rows, nrows);
483
+ rbcluster_free_mask(mask, nrows);
484
+
485
+ return rb_float_new(result);
486
+ }
487
+
488
+ VALUE rbcluster_create_node(Node* node) {
489
+ VALUE args[3];
490
+
491
+ args[0] = INT2NUM(node->left);
492
+ args[1] = INT2NUM(node->right);
493
+ args[2] = DBL2NUM(node->distance);
494
+
495
+ return rb_class_new_instance(3, args, rbcluster_cNode);
496
+ }
497
+
498
+ VALUE rbcluster_node_initialize(int argc, VALUE* argv, VALUE self) {
499
+ VALUE left, right, distance;
500
+
501
+ rb_scan_args(argc, argv, "21", &left, &right, &distance);
502
+
503
+ rb_ivar_set(self, rb_intern("@left"), left);
504
+ rb_ivar_set(self, rb_intern("@right"), right);
505
+ rb_ivar_set(self, rb_intern("@distance"), distance);
506
+
507
+ return self;
508
+ }
509
+
510
+ double*** rbcluster_create_celldata(int nxgrid, int nygrid, int ndata) {
511
+ double*** celldata = calloc(nxgrid*nygrid*ndata, sizeof(double**));
512
+ int i, j;
513
+
514
+ for (i = 0; i < nxgrid; i++)
515
+ { celldata[i] = calloc(nygrid*ndata, sizeof(double*));
516
+ for (j = 0; j < nygrid; j++)
517
+ celldata[i][j] = calloc(ndata, sizeof(double));
518
+ }
519
+
520
+ return celldata;
521
+ }
522
+
523
+ void rbcluster_free_celldata(double*** celldata, int nxgrid, int nygrid) {
524
+ int i, j;
525
+
526
+ for (i = 0; i < nxgrid; i++) {
527
+ for (j = 0; j < nygrid; j++) {
528
+ free(celldata[i][j]);
529
+ }
530
+ }
531
+
532
+ for (i = 0; i < nxgrid; i++)
533
+ free(celldata[i]);
534
+
535
+ free(celldata);
536
+ }
537
+
538
+ VALUE rbcluster_treecluster(int argc, VALUE* argv, VALUE self) {
539
+ VALUE data, opts;
540
+ rb_scan_args(argc, argv, "11", &data, &opts);
541
+
542
+ int nrows, ncols;
543
+ double** rows = rbcluster_ary_to_rows(data, &nrows, &ncols);
544
+
545
+ int** mask = rbcluster_create_mask(nrows, ncols);
546
+ double* weight = rbcluster_create_weight(ncols);
547
+ int transpose = 0;
548
+ char dist = 'e';
549
+ char method = 'a';
550
+
551
+ // TODO: allow passing a distance matrix instead of data.
552
+ double** distmatrix = NULL;
553
+
554
+ // options
555
+ if(opts != Qnil) {
556
+ rbcluster_parse_mask(opts, mask, nrows, ncols);
557
+ rbcluster_parse_weight(opts, &weight, ncols);
558
+ rbcluster_parse_char(opts, "dist", &dist);
559
+ rbcluster_parse_char(opts, "method", &method);
560
+ rbcluster_parse_bool(opts, "transpose", &transpose);
561
+
562
+ if(TYPE(opts) == T_HASH && rb_hash_aref(opts, ID2SYM(rb_intern("distancematrix"))) != Qnil) {
563
+ rb_raise(rb_eNotImpError, "passing a distance matrix to treecluster() is not yet implemented");
564
+ }
565
+ }
566
+
567
+ Node* tree = treecluster(
568
+ nrows,
569
+ ncols,
570
+ rows,
571
+ mask,
572
+ weight,
573
+ transpose,
574
+ dist,
575
+ method,
576
+ distmatrix
577
+ );
578
+
579
+ VALUE result = rb_ary_new2(nrows - 1);
580
+ for(int i = 0; i < nrows - 1; ++i) {
581
+ rb_ary_push(result, rbcluster_create_node(&tree[i]));
582
+ }
583
+
584
+ free(tree);
585
+ free(weight);
586
+ rbcluster_free_rows(rows, nrows);
587
+ rbcluster_free_mask(mask, nrows);
588
+
589
+ return result;
590
+ }
591
+
592
+ VALUE rbcluster_somcluster(int argc, VALUE* argv, VALUE self) {
593
+ VALUE data, opts;
594
+
595
+ rb_scan_args(argc, argv, "11", &data, &opts);
596
+
597
+ int nrows, ncols;
598
+ double** rows = rbcluster_ary_to_rows(data, &nrows, &ncols);
599
+ int** mask = rbcluster_create_mask(nrows, ncols);
600
+ double* weight = rbcluster_create_weight(ncols);
601
+
602
+ int transpose = 0;
603
+ char dist = 'e';
604
+ int nxgrid = 2;
605
+ int nygrid = 1;
606
+ double inittau = 0.02;
607
+ int niter = 1;
608
+
609
+ if(opts != Qnil) {
610
+ rbcluster_parse_mask(opts, mask, nrows, ncols);
611
+ rbcluster_parse_weight(opts, &weight, ncols);
612
+ rbcluster_parse_bool(opts, "transpose", &transpose);
613
+ rbcluster_parse_int(opts, "nxgrid", &nxgrid);
614
+ rbcluster_parse_int(opts, "nygrid", &nygrid);
615
+ rbcluster_parse_double(opts, "inittau", &inittau);
616
+ rbcluster_parse_int(opts, "niter", &niter);
617
+ rbcluster_parse_char(opts, "dist", &dist);
618
+ }
619
+
620
+ int i, j, k;
621
+ double*** celldata = rbcluster_create_celldata(nygrid, nxgrid, ncols);
622
+
623
+ int clusterid[nrows][2];
624
+
625
+ somcluster(
626
+ nrows,
627
+ ncols,
628
+ rows,
629
+ mask,
630
+ weight,
631
+ transpose,
632
+ nxgrid,
633
+ nygrid,
634
+ inittau,
635
+ niter,
636
+ dist,
637
+ celldata,
638
+ clusterid
639
+ );
640
+
641
+ VALUE rb_celldata = rb_ary_new2(nxgrid);
642
+ VALUE rb_clusterid = rb_ary_new2(nrows);
643
+
644
+ VALUE iarr, jarr;
645
+
646
+ for(i = 0; i < nxgrid; ++i) {
647
+ iarr = rb_ary_new2(nygrid);
648
+ for(j = 0; j < nygrid; ++j) {
649
+ jarr = rb_ary_new2(ncols);
650
+ for(k = 0; k < ncols; ++k) {
651
+ rb_ary_push(jarr, rb_float_new(celldata[i][j][k]));
652
+ }
653
+ rb_ary_push(iarr, jarr);
654
+ }
655
+ rb_ary_push(rb_celldata, iarr);
656
+ }
657
+
658
+ VALUE inner_arr;
659
+ for(i = 0; i < nrows; ++i) {
660
+ inner_arr = rb_ary_new2(2);
661
+ rb_ary_push(inner_arr, INT2FIX(clusterid[i][0]));
662
+ rb_ary_push(inner_arr, INT2FIX(clusterid[i][1]));
663
+
664
+ rb_ary_push(rb_clusterid, inner_arr);
665
+ }
666
+
667
+ free(weight);
668
+ rbcluster_free_rows(rows, nrows);
669
+ rbcluster_free_mask(mask, nrows);
670
+ rbcluster_free_celldata(celldata, nxgrid, nygrid);
671
+
672
+ return rb_ary_new3(2, rb_clusterid, rb_celldata);
673
+ }
674
+
675
+ void print_doubles(double* vals, int len) {
676
+ puts("[");
677
+ for(int i = 0; i < len; ++i) {
678
+ printf("\t%d: %f\n", i, vals[i]);
679
+ }
680
+ puts("]");
681
+ }
682
+
683
+ void print_double_matrix(double** vals, int nrows, int ncols) {
684
+ puts("[");
685
+ for(int i = 0; i < nrows; ++i) {
686
+ printf("\t[ ");
687
+ for(int j = 0; j < ncols; ++j) {
688
+ printf("%f%c", vals[i][j], j == ncols - 1 ? ' ' : ',');
689
+ }
690
+ printf("\t]\n");
691
+ }
692
+ puts("]");
693
+ }
694
+
695
+ VALUE rbcluster_pca(VALUE self, VALUE data) {
696
+ int nrows, ncols, i, j;
697
+
698
+ double** u = rbcluster_ary_to_rows(data, &nrows, &ncols);
699
+ int ndata = min(nrows, ncols);
700
+
701
+ double** v = malloc(ndata*sizeof(double*));
702
+ for(i = 0; i < ndata; ++i) {
703
+ v[i] = malloc(ndata*sizeof(double));
704
+ }
705
+ double* w = malloc(ndata*sizeof(double));
706
+ double* means = malloc(ncols*sizeof(double));
707
+
708
+ // calculate the mean of each column
709
+ for(j = 0; j < ncols; ++j) {
710
+ means[j] = 0.0;
711
+ for(i = 0; i < nrows; ++i) {
712
+ means[j] += u[i][j];
713
+ }
714
+
715
+ means[j] /= nrows;
716
+ }
717
+
718
+ // subtract the mean of each column
719
+ for(i = 0; i < nrows; ++i) {
720
+ for(j = 0; j < ncols; ++j) {
721
+ u[i][j] = u[i][j] - means[j];
722
+ }
723
+ }
724
+
725
+ int ok = pca(nrows, ncols, u, v, w);
726
+ if(ok == -1) {
727
+ rb_raise(rb_eStandardError, "could not allocate memory");
728
+ } else if(ok > 0) {
729
+ rb_raise(rb_eStandardError, "svd failed to converge");
730
+ }
731
+
732
+ VALUE mean = rbcluster_doubles2rb(means, ncols);
733
+ VALUE eigenvalues = rbcluster_doubles2rb(w, ndata);
734
+ VALUE coordinates = Qnil;
735
+ VALUE pc = Qnil;
736
+
737
+ if(nrows >= ncols) {
738
+ coordinates = rbcluster_rows2rb(u, nrows, ncols);
739
+ pc = rbcluster_rows2rb(v, ndata, ndata);
740
+ } else {
741
+ pc = rbcluster_rows2rb(u, nrows, ncols);
742
+ coordinates = rbcluster_rows2rb(v, ndata, ndata);
743
+ }
744
+
745
+ rbcluster_free_rows(u, nrows);
746
+ rbcluster_free_rows(v, ndata);
747
+
748
+ free(w);
749
+ free(means);
750
+
751
+ return rb_ary_new3(4, mean, coordinates, pc, eigenvalues);
752
+ }
753
+
754
+ void Init_rbcluster() {
755
+ rbcluster_mCluster = rb_define_module("Cluster");
756
+ rbcluster_cNode = rb_define_class_under(rbcluster_mCluster, "Node", rb_cObject);
757
+
758
+ rb_define_attr(rbcluster_cNode, "left", 1, 1);
759
+ rb_define_attr(rbcluster_cNode, "right", 1, 1);
760
+ rb_define_attr(rbcluster_cNode, "distance", 1, 1);
761
+ rb_define_method(rbcluster_cNode, "initialize", rbcluster_node_initialize, -1);
762
+
763
+ rb_define_singleton_method(rbcluster_mCluster, "median", rbcluster_median, 1);
764
+ rb_define_singleton_method(rbcluster_mCluster, "mean", rbcluster_mean, 1);
765
+
766
+ rb_define_singleton_method(rbcluster_mCluster, "kcluster", rbcluster_kcluster, -1);
767
+ rb_define_singleton_method(rbcluster_mCluster, "distancematrix", rbcluster_distancematrix, -1);
768
+ rb_define_singleton_method(rbcluster_mCluster, "kmedoids", rbcluster_kmedoids, -1);
769
+ rb_define_singleton_method(rbcluster_mCluster, "clusterdistance", rbcluster_clusterdistance, -1);
770
+ rb_define_singleton_method(rbcluster_mCluster, "treecluster", rbcluster_treecluster, -1);
771
+ rb_define_singleton_method(rbcluster_mCluster, "somcluster", rbcluster_somcluster, -1);
772
+ rb_define_singleton_method(rbcluster_mCluster, "pca", rbcluster_pca, 1);
773
+
774
+ rb_define_const(rbcluster_mCluster, "C_VERSION", rb_str_new2(CLUSTERVERSION));
775
+ }