rbcluster 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,93 @@
1
+ /******************************************************************************/
2
+ /* The C Clustering Library.
3
+ * Copyright (C) 2002 Michiel Jan Laurens de Hoon.
4
+ *
5
+ * This library was written at the Laboratory of DNA Information Analysis,
6
+ * Human Genome Center, Institute of Medical Science, University of Tokyo,
7
+ * 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
8
+ * Contact: mdehoon 'AT' gsc.riken.jp
9
+ *
10
+ * Permission to use, copy, modify, and distribute this software and its
11
+ * documentation with or without modifications and for any purpose and
12
+ * without fee is hereby granted, provided that any copyright notices
13
+ * appear in all copies and that both those copyright notices and this
14
+ * permission notice appear in supporting documentation, and that the
15
+ * names of the contributors or copyright holders not be used in
16
+ * advertising or publicity pertaining to distribution of the software
17
+ * without specific prior permission.
18
+ *
19
+ * THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
20
+ * WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
21
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
22
+ * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
23
+ * OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
24
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
25
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
26
+ * OR PERFORMANCE OF THIS SOFTWARE.
27
+ *
28
+ */
29
+
30
+ #ifndef min
31
+ #define min(x, y) ((x) < (y) ? (x) : (y))
32
+ #endif
33
+ #ifndef max
34
+ #define max(x, y) ((x) > (y) ? (x) : (y))
35
+ #endif
36
+
37
+ #ifdef WINDOWS
38
+ # include <windows.h>
39
+ #endif
40
+
41
+ #define CLUSTERVERSION "1.50"
42
+
43
+ /* Chapter 2 */
44
+ double clusterdistance (int nrows, int ncolumns, double** data, int** mask,
45
+ double weight[], int n1, int n2, int index1[], int index2[], char dist,
46
+ char method, int transpose);
47
+ double** distancematrix (int ngenes, int ndata, double** data,
48
+ int** mask, double* weight, char dist, int transpose);
49
+
50
+ /* Chapter 3 */
51
+ int getclustercentroids(int nclusters, int nrows, int ncolumns,
52
+ double** data, int** mask, int clusterid[], double** cdata, int** cmask,
53
+ int transpose, char method);
54
+ void getclustermedoids(int nclusters, int nelements, double** distance,
55
+ int clusterid[], int centroids[], double errors[]);
56
+ void kcluster (int nclusters, int ngenes, int ndata, double** data,
57
+ int** mask, double weight[], int transpose, int npass, char method, char dist,
58
+ int clusterid[], double* error, int* ifound);
59
+ void kmedoids (int nclusters, int nelements, double** distance,
60
+ int npass, int clusterid[], double* error, int* ifound);
61
+
62
+ /* Chapter 4 */
63
+ typedef struct {int left; int right; double distance;} Node;
64
+ /*
65
+ * A Node struct describes a single node in a tree created by hierarchical
66
+ * clustering. The tree can be represented by an array of n Node structs,
67
+ * where n is the number of elements minus one. The integers left and right
68
+ * in each Node struct refer to the two elements or subnodes that are joined
69
+ * in this node. The original elements are numbered 0..nelements-1, and the
70
+ * nodes -1..-(nelements-1). For each node, distance contains the distance
71
+ * between the two subnodes that were joined.
72
+ */
73
+
74
+ Node* treecluster (int nrows, int ncolumns, double** data, int** mask,
75
+ double weight[], int transpose, char dist, char method, double** distmatrix);
76
+ void cuttree (int nelements, Node* tree, int nclusters, int clusterid[]);
77
+
78
+ /* Chapter 5 */
79
+ void somcluster (int nrows, int ncolumns, double** data, int** mask,
80
+ const double weight[], int transpose, int nxnodes, int nynodes,
81
+ double inittau, int niter, char dist, double*** celldata,
82
+ int clusterid[][2]);
83
+
84
+ /* Chapter 6 */
85
+ int pca(int m, int n, double** u, double** v, double* w);
86
+
87
+ /* Utility routines, currently undocumented */
88
+ void sort(int n, const double data[], int index[]);
89
+ double mean(int n, double x[]);
90
+ double median (int n, double x[]);
91
+
92
+ double* calculate_weights(int nrows, int ncolumns, double** data, int** mask,
93
+ double weights[], int transpose, char dist, double cutoff, double exponent);
@@ -0,0 +1,6 @@
1
+ require 'mkmf'
2
+
3
+ $CFLAGS << " #{ENV["CFLAGS"]} -std=c99"
4
+ RbConfig::MAKEFILE_CONFIG['CC'] = ENV['CC'] if ENV['CC']
5
+
6
+ create_makefile "rbcluster/rbcluster"
@@ -0,0 +1,775 @@
1
+ #include <stdlib.h>
2
+ #include <stdio.h>
3
+ #include "ruby.h"
4
+ #include "cluster.h"
5
+
6
+ // missing on 1.8
7
+ #ifndef DBL2NUM
8
+ #define DBL2NUM( dbl_val ) rb_float_new( dbl_val )
9
+ #endif
10
+
11
+ VALUE rbcluster_mCluster = Qnil;
12
+ VALUE rbcluster_cNode = Qnil;
13
+
14
+ VALUE rbcluster_rows2rb(double** data, int nrows, int ncols) {
15
+ VALUE rows = rb_ary_new2((long)nrows);
16
+ VALUE cols;
17
+ int i, j;
18
+
19
+ for(i = 0; i < nrows; ++i) {
20
+ cols = rb_ary_new2((long)ncols);
21
+ rb_ary_push(rows, cols);
22
+ for(j = 0; j < ncols; ++j) {
23
+ rb_ary_push(cols, DBL2NUM(data[i][j]));
24
+ }
25
+ }
26
+
27
+ return rows;
28
+ }
29
+
30
+ double* rbcluster_ary_to_doubles(VALUE data, int len) {
31
+ Check_Type(data, T_ARRAY);
32
+
33
+ if(RARRAY_LEN(data) != len) {
34
+ rb_raise(rb_eArgError, "expected Array to have %d entries, got %ld", len, RARRAY_LEN(data));
35
+ }
36
+
37
+ double* result = malloc(len*sizeof(double));
38
+ for(int i = 0; i < len; ++i) {
39
+ result[i] = NUM2DBL(rb_ary_entry(data, i));
40
+ }
41
+
42
+ return result;
43
+ }
44
+
45
+ double** rbcluster_ary_to_rows(VALUE data, int* nrows, int* ncols) {
46
+ Check_Type(data, T_ARRAY);
47
+ long rows, cols;
48
+
49
+ rows = RARRAY_LEN(data);
50
+ if(rows == 0) {
51
+ rb_raise(rb_eArgError, "no data given");
52
+ }
53
+
54
+ VALUE first_element = rb_ary_entry(data, 0);
55
+ Check_Type(first_element, T_ARRAY);
56
+ cols = RARRAY_LEN(first_element);
57
+
58
+ double** result = malloc((rows)*sizeof(double*));
59
+
60
+ VALUE row;
61
+ int i, j;
62
+
63
+ for(i = 0; i < rows; ++i) {
64
+ result[i] = malloc((cols)*sizeof(double));
65
+ row = rb_ary_entry(data, i);
66
+
67
+ Check_Type(row, T_ARRAY);
68
+ if(RARRAY_LEN(row) != cols) {
69
+ rb_raise(rb_eArgError, "expected %ld columns, row has %ld", cols, RARRAY_LEN(row));
70
+ }
71
+
72
+ for(j = 0; j < cols; ++j) {
73
+ result[i][j] = NUM2DBL(rb_ary_entry(row, j));
74
+ }
75
+ }
76
+
77
+ *nrows = (int)rows;
78
+ *ncols = (int)cols;
79
+
80
+ return result;
81
+ }
82
+
83
+ void rbcluster_free_rows(double** data, int nrows) {
84
+ for(int i = 0; i < nrows; ++i)
85
+ free(data[i]);
86
+
87
+ free(data);
88
+ }
89
+
90
+ int** rbcluster_create_mask(int nrows, int ncols) {
91
+ int** mask = malloc(nrows*sizeof(int*));
92
+
93
+ for(int i = 0; i < nrows; ++i) {
94
+ mask[i] = malloc(ncols*sizeof(int));
95
+ for(int n = 0; n < ncols; ++n) {
96
+ mask[i][n] = 1;
97
+ }
98
+ }
99
+
100
+ return mask;
101
+ }
102
+
103
+ void rbcluster_free_mask(int** mask, int nrows) {
104
+ for(int i = 0; i < nrows; ++i)
105
+ free(mask[i]);
106
+
107
+ free(mask);
108
+ }
109
+
110
+ double* rbcluster_create_weight(int ncols) {
111
+ double* weight = malloc(ncols*sizeof(double));
112
+
113
+ for (int i = 0; i < ncols; i++)
114
+ weight[i] = 1.0;
115
+
116
+ return weight;
117
+ }
118
+
119
+ VALUE rbcluster_ints2rb(int* ints, long nrows) {
120
+ VALUE ary = rb_ary_new2(nrows);
121
+
122
+ for(int i = 0; i < nrows; ++i) {
123
+ rb_ary_push(ary, INT2FIX(ints[i]));
124
+ }
125
+
126
+ return ary;
127
+ }
128
+
129
+ VALUE rbcluster_doubles2rb(double* arr, long nrows) {
130
+ VALUE ary = rb_ary_new2(nrows);
131
+
132
+ for(int i = 0; i < nrows; ++i) {
133
+ rb_ary_push(ary, DBL2NUM(arr[i]));
134
+ }
135
+
136
+ return ary;
137
+ }
138
+
139
+
140
+ /*
141
+ option hash parsing
142
+ */
143
+
144
+ void rbcluster_parse_mask(VALUE opts, int** mask, int nrows, int ncols) {
145
+ VALUE val = rb_hash_aref(opts, ID2SYM(rb_intern("mask")));
146
+
147
+ if(NIL_P(val))
148
+ return;
149
+
150
+ // TODO: better error message
151
+ Check_Type(val, T_ARRAY);
152
+ VALUE row;
153
+ int i, j;
154
+
155
+ for(i = 0; i < nrows; ++i) {
156
+ row = rb_ary_entry(val, i);
157
+ Check_Type(row, T_ARRAY);
158
+ for(j = 0; j < ncols; ++j) {
159
+ mask[i][j] = FIX2INT(rb_ary_entry(row, j));
160
+ }
161
+ }
162
+ }
163
+
164
+ void rbcluster_parse_weight(VALUE opts, double** weight, int ncols) {
165
+ VALUE val = rb_hash_aref(opts, ID2SYM(rb_intern("weight")));
166
+
167
+ if(val != Qnil) {
168
+ free(*weight);
169
+ *weight = rbcluster_ary_to_doubles(val, ncols);
170
+ }
171
+ }
172
+
173
+ void rbcluster_parse_int(VALUE opts, const char* key, int* out) {
174
+ VALUE val = rb_hash_aref(opts, ID2SYM(rb_intern(key)));
175
+ if(val != Qnil) {
176
+ Check_Type(val, T_FIXNUM);
177
+ *out = FIX2INT(val);
178
+ }
179
+ }
180
+
181
+ void rbcluster_parse_double(VALUE opts, const char* key, double* out) {
182
+ VALUE val = rb_hash_aref(opts, ID2SYM(rb_intern(key)));
183
+ if(val != Qnil) {
184
+ Check_Type(val, T_FLOAT);
185
+ *out = NUM2DBL(val);
186
+ }
187
+ }
188
+
189
+ void rbcluster_parse_char(VALUE opts, const char* key, char* out) {
190
+ VALUE val = rb_hash_aref(opts, ID2SYM(rb_intern(key)));
191
+ if(val != Qnil) {
192
+ Check_Type(val, T_STRING);
193
+ *out = RSTRING_PTR(val)[0];
194
+ }
195
+ }
196
+
197
+ void rbcluster_parse_bool(VALUE opts, const char* key, int* out) {
198
+ VALUE val = rb_hash_aref(opts, ID2SYM(rb_intern(key)));
199
+ if(val != Qnil) {
200
+ *out = val ? 1 : 0;
201
+ }
202
+ }
203
+
204
+
205
+ /*
206
+ main function
207
+ */
208
+
209
+ VALUE rbcluster_kcluster(int argc, VALUE* argv, VALUE self) {
210
+ VALUE arr, opts;
211
+ int nrows, ncols, i, j;
212
+
213
+ rb_scan_args(argc, argv, "11", &arr, &opts);
214
+
215
+ double** data = rbcluster_ary_to_rows(arr, &nrows, &ncols);
216
+ int** mask = rbcluster_create_mask(nrows, ncols);
217
+
218
+ // defaults
219
+ int nclusters = 2;
220
+ int transpose = 0;
221
+ int npass = 1;
222
+ char method = 'a';
223
+ char dist = 'e';
224
+ double* weight = rbcluster_create_weight(nrows);
225
+
226
+ int* clusterid = malloc(nrows*sizeof(int));
227
+ int ifound = 0;
228
+ double error;
229
+
230
+ // options
231
+ if(opts != Qnil) {
232
+ Check_Type(opts, T_HASH);
233
+ VALUE val;
234
+
235
+ rbcluster_parse_int(opts, "clusters", &nclusters);
236
+ rbcluster_parse_mask(opts, mask, nrows, ncols);
237
+ rbcluster_parse_weight(opts, &weight, ncols);
238
+ rbcluster_parse_bool(opts, "transpose", &transpose);
239
+ rbcluster_parse_int(opts, "passes", &npass);
240
+ rbcluster_parse_char(opts, "method", &method);
241
+ rbcluster_parse_char(opts, "dist", &dist);
242
+ }
243
+
244
+ kcluster(
245
+ nclusters,
246
+ nrows,
247
+ ncols,
248
+ data,
249
+ mask,
250
+ weight,
251
+ transpose,
252
+ npass,
253
+ method,
254
+ dist,
255
+ clusterid,
256
+ &error,
257
+ &ifound
258
+ );
259
+
260
+ VALUE result = rbcluster_ints2rb(clusterid, nrows);
261
+
262
+ rbcluster_free_rows(data, nrows);
263
+ rbcluster_free_mask(mask, nrows);
264
+
265
+ free(weight);
266
+ free(clusterid);
267
+
268
+ return rb_ary_new3(3, result, rb_float_new(error), INT2NUM(ifound));
269
+ }
270
+
271
+ VALUE rbcluster_kmedoids(int argc, VALUE* argv, VALUE self) {
272
+ VALUE data, opts;
273
+
274
+ rb_scan_args(argc, argv, "11", &data, &opts);
275
+ Check_Type(data, T_ARRAY);
276
+
277
+ int nitems = (int)RARRAY_LEN(data);
278
+ int nclusters = 2;
279
+ int npass = 1;
280
+
281
+ // populate 'distances' from the input Array
282
+ double** distances = malloc(nitems*sizeof(double*));
283
+ int i, j;
284
+ VALUE row, num;
285
+
286
+ for(i = 0; i < nitems; ++i) {
287
+ row = rb_ary_entry(data, i);
288
+ // TODO: better error message
289
+ Check_Type(row, T_ARRAY);
290
+ if(RARRAY_LEN(row) != i) {
291
+ rb_raise(rb_eArgError,
292
+ "expected row %d to have exactly %d elements, got %ld", i, i, RARRAY_LEN(row));
293
+ }
294
+
295
+ if(i == 0) {
296
+ distances[i] = NULL;
297
+ } else {
298
+ distances[i] = malloc(i*sizeof(double));
299
+ }
300
+
301
+ for(j = 0; j < i; ++j) {
302
+ distances[i][j] = NUM2DBL(rb_ary_entry(row, j));
303
+ }
304
+ }
305
+
306
+ if(opts != Qnil) {
307
+ rbcluster_parse_int(opts, "clusters", &nclusters);
308
+ rbcluster_parse_int(opts, "passes", &npass);
309
+ // TODO: initialid
310
+ }
311
+
312
+ int* clusterid = malloc(nitems*sizeof(int));
313
+ double error;
314
+ int ifound;
315
+
316
+ // void kmedoids (int nclusters, int nelements, double** distance,
317
+ // int npass, int clusterid[], double* error, int* ifound);
318
+ kmedoids(
319
+ nclusters,
320
+ nitems,
321
+ distances,
322
+ npass,
323
+ clusterid,
324
+ &error,
325
+ &ifound
326
+ );
327
+
328
+ VALUE result = rbcluster_ints2rb(clusterid, nitems);
329
+ free(clusterid);
330
+ for(i = 1; i < nitems; ++i) free(distances[i]);
331
+
332
+ return rb_ary_new3(3, result, rb_float_new(error), INT2NUM(ifound));
333
+ }
334
+
335
+ VALUE rbcluster_median(VALUE self, VALUE ary) {
336
+ Check_Type(ary, T_ARRAY);
337
+
338
+ long len = RARRAY_LEN(ary);
339
+ double arr[len];
340
+ int i;
341
+ VALUE num;
342
+
343
+ for(i = 0; i < len; ++i) {
344
+ num = rb_ary_entry(ary, i);
345
+ arr[i] = NUM2DBL(num);
346
+ }
347
+
348
+ return rb_float_new(median((int)len, arr));
349
+ }
350
+
351
+ VALUE rbcluster_mean(VALUE self, VALUE ary) {
352
+ Check_Type(ary, T_ARRAY);
353
+
354
+ long len = RARRAY_LEN(ary);
355
+ double arr[len];
356
+ int i;
357
+ VALUE num;
358
+
359
+ for(i = 0; i < len; ++i) {
360
+ num = rb_ary_entry(ary, i);
361
+ arr[i] = NUM2DBL(num);
362
+ }
363
+
364
+ return rb_float_new(mean((int)len, arr));
365
+ }
366
+
367
+ VALUE rbcluster_distancematrix(int argc, VALUE* argv, VALUE self) {
368
+ VALUE data, opts;
369
+ int nrows, ncols, i, j;
370
+
371
+ rb_scan_args(argc, argv, "11", &data, &opts);
372
+ double** rows = rbcluster_ary_to_rows(data, &nrows, &ncols);
373
+
374
+ char dist = 'e';
375
+ int transpose = 0;
376
+ int** mask = rbcluster_create_mask(nrows, ncols);
377
+ double* weight = rbcluster_create_weight(ncols);
378
+
379
+ if(opts != Qnil) {
380
+ Check_Type(opts, T_HASH);
381
+ VALUE val;
382
+
383
+ rbcluster_parse_mask(opts, mask, nrows, ncols);
384
+ rbcluster_parse_weight(opts, &weight, ncols);
385
+ rbcluster_parse_char(opts, "dist", &dist);
386
+ rbcluster_parse_bool(opts, "transpose", &transpose);
387
+ }
388
+
389
+ VALUE result = Qnil;
390
+ double** distances = distancematrix(
391
+ nrows,
392
+ ncols,
393
+ rows,
394
+ mask,
395
+ weight,
396
+ dist,
397
+ transpose
398
+ );
399
+
400
+ if(distances) {
401
+ result = rb_ary_new();
402
+ for(i = 0; i < nrows; ++i) {
403
+ VALUE row = rb_ary_new();
404
+
405
+ for(j = 0; j < i; ++j){
406
+ rb_ary_push(row, rb_float_new(distances[i][j]));
407
+ }
408
+
409
+ // first row is NULL
410
+ if(i != 0) {
411
+ free(distances[i]);
412
+ }
413
+
414
+ rb_ary_push(result, row);
415
+ }
416
+ }
417
+
418
+ free(weight);
419
+ rbcluster_free_rows(rows, nrows);
420
+ rbcluster_free_mask(mask, nrows);
421
+
422
+ return result;
423
+ }
424
+
425
+ int* rbcluster_parse_index(VALUE arr, int* len) {
426
+ Check_Type(arr, T_ARRAY);
427
+ long length = RARRAY_LEN(arr);
428
+
429
+ int* result = malloc(length*sizeof(int));
430
+
431
+ for(int i = 0; i < length; ++i) {
432
+ result[i] = FIX2INT(rb_ary_entry(arr, i));
433
+ }
434
+
435
+ *len = (int)length;
436
+ return result;
437
+ }
438
+
439
+ VALUE rbcluster_clusterdistance(int argc, VALUE* argv, VALUE self) {
440
+ VALUE data, index1, index2, opts;
441
+ int nrows, ncols;
442
+
443
+ rb_scan_args(argc, argv, "31", &data, &index1, &index2, &opts);
444
+ double** rows = rbcluster_ary_to_rows(data, &nrows, &ncols);
445
+
446
+ int nidx1, nidx2;
447
+ int* idx1 = rbcluster_parse_index(index1, &nidx1);
448
+ int* idx2 = rbcluster_parse_index(index2, &nidx2);
449
+
450
+ int** mask = rbcluster_create_mask(nrows, ncols);
451
+ double* weight = rbcluster_create_weight(ncols);
452
+ char method = 'a';
453
+ char dist = 'e';
454
+ int transpose = 0;
455
+
456
+ if(opts != Qnil) {
457
+ rbcluster_parse_mask(opts, mask, nrows, ncols);
458
+ rbcluster_parse_weight(opts, &weight, ncols);
459
+ rbcluster_parse_char(opts, "dist", &dist);
460
+ rbcluster_parse_char(opts, "method", &method);
461
+ rbcluster_parse_bool(opts, "transpose", &transpose);
462
+ }
463
+
464
+ double result = clusterdistance(
465
+ nrows,
466
+ ncols,
467
+ rows,
468
+ mask,
469
+ weight,
470
+ nidx1,
471
+ nidx2,
472
+ idx1,
473
+ idx2,
474
+ dist,
475
+ method,
476
+ transpose
477
+ );
478
+
479
+ free(weight);
480
+ free(idx1);
481
+ free(idx2);
482
+ rbcluster_free_rows(rows, nrows);
483
+ rbcluster_free_mask(mask, nrows);
484
+
485
+ return rb_float_new(result);
486
+ }
487
+
488
+ VALUE rbcluster_create_node(Node* node) {
489
+ VALUE args[3];
490
+
491
+ args[0] = INT2NUM(node->left);
492
+ args[1] = INT2NUM(node->right);
493
+ args[2] = DBL2NUM(node->distance);
494
+
495
+ return rb_class_new_instance(3, args, rbcluster_cNode);
496
+ }
497
+
498
+ VALUE rbcluster_node_initialize(int argc, VALUE* argv, VALUE self) {
499
+ VALUE left, right, distance;
500
+
501
+ rb_scan_args(argc, argv, "21", &left, &right, &distance);
502
+
503
+ rb_ivar_set(self, rb_intern("@left"), left);
504
+ rb_ivar_set(self, rb_intern("@right"), right);
505
+ rb_ivar_set(self, rb_intern("@distance"), distance);
506
+
507
+ return self;
508
+ }
509
+
510
+ double*** rbcluster_create_celldata(int nxgrid, int nygrid, int ndata) {
511
+ double*** celldata = calloc(nxgrid*nygrid*ndata, sizeof(double**));
512
+ int i, j;
513
+
514
+ for (i = 0; i < nxgrid; i++)
515
+ { celldata[i] = calloc(nygrid*ndata, sizeof(double*));
516
+ for (j = 0; j < nygrid; j++)
517
+ celldata[i][j] = calloc(ndata, sizeof(double));
518
+ }
519
+
520
+ return celldata;
521
+ }
522
+
523
+ void rbcluster_free_celldata(double*** celldata, int nxgrid, int nygrid) {
524
+ int i, j;
525
+
526
+ for (i = 0; i < nxgrid; i++) {
527
+ for (j = 0; j < nygrid; j++) {
528
+ free(celldata[i][j]);
529
+ }
530
+ }
531
+
532
+ for (i = 0; i < nxgrid; i++)
533
+ free(celldata[i]);
534
+
535
+ free(celldata);
536
+ }
537
+
538
+ VALUE rbcluster_treecluster(int argc, VALUE* argv, VALUE self) {
539
+ VALUE data, opts;
540
+ rb_scan_args(argc, argv, "11", &data, &opts);
541
+
542
+ int nrows, ncols;
543
+ double** rows = rbcluster_ary_to_rows(data, &nrows, &ncols);
544
+
545
+ int** mask = rbcluster_create_mask(nrows, ncols);
546
+ double* weight = rbcluster_create_weight(ncols);
547
+ int transpose = 0;
548
+ char dist = 'e';
549
+ char method = 'a';
550
+
551
+ // TODO: allow passing a distance matrix instead of data.
552
+ double** distmatrix = NULL;
553
+
554
+ // options
555
+ if(opts != Qnil) {
556
+ rbcluster_parse_mask(opts, mask, nrows, ncols);
557
+ rbcluster_parse_weight(opts, &weight, ncols);
558
+ rbcluster_parse_char(opts, "dist", &dist);
559
+ rbcluster_parse_char(opts, "method", &method);
560
+ rbcluster_parse_bool(opts, "transpose", &transpose);
561
+
562
+ if(TYPE(opts) == T_HASH && rb_hash_aref(opts, ID2SYM(rb_intern("distancematrix"))) != Qnil) {
563
+ rb_raise(rb_eNotImpError, "passing a distance matrix to treecluster() is not yet implemented");
564
+ }
565
+ }
566
+
567
+ Node* tree = treecluster(
568
+ nrows,
569
+ ncols,
570
+ rows,
571
+ mask,
572
+ weight,
573
+ transpose,
574
+ dist,
575
+ method,
576
+ distmatrix
577
+ );
578
+
579
+ VALUE result = rb_ary_new2(nrows - 1);
580
+ for(int i = 0; i < nrows - 1; ++i) {
581
+ rb_ary_push(result, rbcluster_create_node(&tree[i]));
582
+ }
583
+
584
+ free(tree);
585
+ free(weight);
586
+ rbcluster_free_rows(rows, nrows);
587
+ rbcluster_free_mask(mask, nrows);
588
+
589
+ return result;
590
+ }
591
+
592
+ VALUE rbcluster_somcluster(int argc, VALUE* argv, VALUE self) {
593
+ VALUE data, opts;
594
+
595
+ rb_scan_args(argc, argv, "11", &data, &opts);
596
+
597
+ int nrows, ncols;
598
+ double** rows = rbcluster_ary_to_rows(data, &nrows, &ncols);
599
+ int** mask = rbcluster_create_mask(nrows, ncols);
600
+ double* weight = rbcluster_create_weight(ncols);
601
+
602
+ int transpose = 0;
603
+ char dist = 'e';
604
+ int nxgrid = 2;
605
+ int nygrid = 1;
606
+ double inittau = 0.02;
607
+ int niter = 1;
608
+
609
+ if(opts != Qnil) {
610
+ rbcluster_parse_mask(opts, mask, nrows, ncols);
611
+ rbcluster_parse_weight(opts, &weight, ncols);
612
+ rbcluster_parse_bool(opts, "transpose", &transpose);
613
+ rbcluster_parse_int(opts, "nxgrid", &nxgrid);
614
+ rbcluster_parse_int(opts, "nygrid", &nygrid);
615
+ rbcluster_parse_double(opts, "inittau", &inittau);
616
+ rbcluster_parse_int(opts, "niter", &niter);
617
+ rbcluster_parse_char(opts, "dist", &dist);
618
+ }
619
+
620
+ int i, j, k;
621
+ double*** celldata = rbcluster_create_celldata(nygrid, nxgrid, ncols);
622
+
623
+ int clusterid[nrows][2];
624
+
625
+ somcluster(
626
+ nrows,
627
+ ncols,
628
+ rows,
629
+ mask,
630
+ weight,
631
+ transpose,
632
+ nxgrid,
633
+ nygrid,
634
+ inittau,
635
+ niter,
636
+ dist,
637
+ celldata,
638
+ clusterid
639
+ );
640
+
641
+ VALUE rb_celldata = rb_ary_new2(nxgrid);
642
+ VALUE rb_clusterid = rb_ary_new2(nrows);
643
+
644
+ VALUE iarr, jarr;
645
+
646
+ for(i = 0; i < nxgrid; ++i) {
647
+ iarr = rb_ary_new2(nygrid);
648
+ for(j = 0; j < nygrid; ++j) {
649
+ jarr = rb_ary_new2(ncols);
650
+ for(k = 0; k < ncols; ++k) {
651
+ rb_ary_push(jarr, rb_float_new(celldata[i][j][k]));
652
+ }
653
+ rb_ary_push(iarr, jarr);
654
+ }
655
+ rb_ary_push(rb_celldata, iarr);
656
+ }
657
+
658
+ VALUE inner_arr;
659
+ for(i = 0; i < nrows; ++i) {
660
+ inner_arr = rb_ary_new2(2);
661
+ rb_ary_push(inner_arr, INT2FIX(clusterid[i][0]));
662
+ rb_ary_push(inner_arr, INT2FIX(clusterid[i][1]));
663
+
664
+ rb_ary_push(rb_clusterid, inner_arr);
665
+ }
666
+
667
+ free(weight);
668
+ rbcluster_free_rows(rows, nrows);
669
+ rbcluster_free_mask(mask, nrows);
670
+ rbcluster_free_celldata(celldata, nxgrid, nygrid);
671
+
672
+ return rb_ary_new3(2, rb_clusterid, rb_celldata);
673
+ }
674
+
675
+ void print_doubles(double* vals, int len) {
676
+ puts("[");
677
+ for(int i = 0; i < len; ++i) {
678
+ printf("\t%d: %f\n", i, vals[i]);
679
+ }
680
+ puts("]");
681
+ }
682
+
683
+ void print_double_matrix(double** vals, int nrows, int ncols) {
684
+ puts("[");
685
+ for(int i = 0; i < nrows; ++i) {
686
+ printf("\t[ ");
687
+ for(int j = 0; j < ncols; ++j) {
688
+ printf("%f%c", vals[i][j], j == ncols - 1 ? ' ' : ',');
689
+ }
690
+ printf("\t]\n");
691
+ }
692
+ puts("]");
693
+ }
694
+
695
+ VALUE rbcluster_pca(VALUE self, VALUE data) {
696
+ int nrows, ncols, i, j;
697
+
698
+ double** u = rbcluster_ary_to_rows(data, &nrows, &ncols);
699
+ int ndata = min(nrows, ncols);
700
+
701
+ double** v = malloc(ndata*sizeof(double*));
702
+ for(i = 0; i < ndata; ++i) {
703
+ v[i] = malloc(ndata*sizeof(double));
704
+ }
705
+ double* w = malloc(ndata*sizeof(double));
706
+ double* means = malloc(ncols*sizeof(double));
707
+
708
+ // calculate the mean of each column
709
+ for(j = 0; j < ncols; ++j) {
710
+ means[j] = 0.0;
711
+ for(i = 0; i < nrows; ++i) {
712
+ means[j] += u[i][j];
713
+ }
714
+
715
+ means[j] /= nrows;
716
+ }
717
+
718
+ // subtract the mean of each column
719
+ for(i = 0; i < nrows; ++i) {
720
+ for(j = 0; j < ncols; ++j) {
721
+ u[i][j] = u[i][j] - means[j];
722
+ }
723
+ }
724
+
725
+ int ok = pca(nrows, ncols, u, v, w);
726
+ if(ok == -1) {
727
+ rb_raise(rb_eStandardError, "could not allocate memory");
728
+ } else if(ok > 0) {
729
+ rb_raise(rb_eStandardError, "svd failed to converge");
730
+ }
731
+
732
+ VALUE mean = rbcluster_doubles2rb(means, ncols);
733
+ VALUE eigenvalues = rbcluster_doubles2rb(w, ndata);
734
+ VALUE coordinates = Qnil;
735
+ VALUE pc = Qnil;
736
+
737
+ if(nrows >= ncols) {
738
+ coordinates = rbcluster_rows2rb(u, nrows, ncols);
739
+ pc = rbcluster_rows2rb(v, ndata, ndata);
740
+ } else {
741
+ pc = rbcluster_rows2rb(u, nrows, ncols);
742
+ coordinates = rbcluster_rows2rb(v, ndata, ndata);
743
+ }
744
+
745
+ rbcluster_free_rows(u, nrows);
746
+ rbcluster_free_rows(v, ndata);
747
+
748
+ free(w);
749
+ free(means);
750
+
751
+ return rb_ary_new3(4, mean, coordinates, pc, eigenvalues);
752
+ }
753
+
754
+ void Init_rbcluster() {
755
+ rbcluster_mCluster = rb_define_module("Cluster");
756
+ rbcluster_cNode = rb_define_class_under(rbcluster_mCluster, "Node", rb_cObject);
757
+
758
+ rb_define_attr(rbcluster_cNode, "left", 1, 1);
759
+ rb_define_attr(rbcluster_cNode, "right", 1, 1);
760
+ rb_define_attr(rbcluster_cNode, "distance", 1, 1);
761
+ rb_define_method(rbcluster_cNode, "initialize", rbcluster_node_initialize, -1);
762
+
763
+ rb_define_singleton_method(rbcluster_mCluster, "median", rbcluster_median, 1);
764
+ rb_define_singleton_method(rbcluster_mCluster, "mean", rbcluster_mean, 1);
765
+
766
+ rb_define_singleton_method(rbcluster_mCluster, "kcluster", rbcluster_kcluster, -1);
767
+ rb_define_singleton_method(rbcluster_mCluster, "distancematrix", rbcluster_distancematrix, -1);
768
+ rb_define_singleton_method(rbcluster_mCluster, "kmedoids", rbcluster_kmedoids, -1);
769
+ rb_define_singleton_method(rbcluster_mCluster, "clusterdistance", rbcluster_clusterdistance, -1);
770
+ rb_define_singleton_method(rbcluster_mCluster, "treecluster", rbcluster_treecluster, -1);
771
+ rb_define_singleton_method(rbcluster_mCluster, "somcluster", rbcluster_somcluster, -1);
772
+ rb_define_singleton_method(rbcluster_mCluster, "pca", rbcluster_pca, 1);
773
+
774
+ rb_define_const(rbcluster_mCluster, "C_VERSION", rb_str_new2(CLUSTERVERSION));
775
+ }