rbcluster 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +9 -0
- data/.travis.yml +6 -0
- data/Gemfile +4 -0
- data/LICENSE +29 -0
- data/README.md +54 -0
- data/Rakefile +17 -0
- data/examples/simple_kcluster.rb +10 -0
- data/ext/rbcluster/cluster.c +4598 -0
- data/ext/rbcluster/cluster.h +93 -0
- data/ext/rbcluster/extconf.rb +6 -0
- data/ext/rbcluster/rbcluster.c +775 -0
- data/lib/rbcluster.rb +5 -0
- data/lib/rbcluster/tree.rb +20 -0
- data/lib/rbcluster/version.rb +3 -0
- data/rbcluster.gemspec +24 -0
- data/spec/clustercentroids_spec.rb +6 -0
- data/spec/clusterdistance_spec.rb +106 -0
- data/spec/clustermedoids_spec.rb +6 -0
- data/spec/cuttree_spec.rb +6 -0
- data/spec/kcluster_spec.rb +95 -0
- data/spec/kmedoids_spec.rb +86 -0
- data/spec/median_mean_spec.rb +26 -0
- data/spec/node_spec.rb +27 -0
- data/spec/pca_spec.rb +113 -0
- data/spec/somcluster_spec.rb +81 -0
- data/spec/spec_helper.rb +3 -0
- data/spec/treecluster_spec.rb +412 -0
- metadata +110 -0
@@ -0,0 +1,93 @@
|
|
1
|
+
/******************************************************************************/
|
2
|
+
/* The C Clustering Library.
|
3
|
+
* Copyright (C) 2002 Michiel Jan Laurens de Hoon.
|
4
|
+
*
|
5
|
+
* This library was written at the Laboratory of DNA Information Analysis,
|
6
|
+
* Human Genome Center, Institute of Medical Science, University of Tokyo,
|
7
|
+
* 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
|
8
|
+
* Contact: mdehoon 'AT' gsc.riken.jp
|
9
|
+
*
|
10
|
+
* Permission to use, copy, modify, and distribute this software and its
|
11
|
+
* documentation with or without modifications and for any purpose and
|
12
|
+
* without fee is hereby granted, provided that any copyright notices
|
13
|
+
* appear in all copies and that both those copyright notices and this
|
14
|
+
* permission notice appear in supporting documentation, and that the
|
15
|
+
* names of the contributors or copyright holders not be used in
|
16
|
+
* advertising or publicity pertaining to distribution of the software
|
17
|
+
* without specific prior permission.
|
18
|
+
*
|
19
|
+
* THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
|
20
|
+
* WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
|
21
|
+
* WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
|
22
|
+
* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
|
23
|
+
* OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
|
24
|
+
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
|
25
|
+
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
|
26
|
+
* OR PERFORMANCE OF THIS SOFTWARE.
|
27
|
+
*
|
28
|
+
*/
|
29
|
+
|
30
|
+
#ifndef min
|
31
|
+
#define min(x, y) ((x) < (y) ? (x) : (y))
|
32
|
+
#endif
|
33
|
+
#ifndef max
|
34
|
+
#define max(x, y) ((x) > (y) ? (x) : (y))
|
35
|
+
#endif
|
36
|
+
|
37
|
+
#ifdef WINDOWS
|
38
|
+
# include <windows.h>
|
39
|
+
#endif
|
40
|
+
|
41
|
+
#define CLUSTERVERSION "1.50"
|
42
|
+
|
43
|
+
/* Chapter 2 */
|
44
|
+
double clusterdistance (int nrows, int ncolumns, double** data, int** mask,
|
45
|
+
double weight[], int n1, int n2, int index1[], int index2[], char dist,
|
46
|
+
char method, int transpose);
|
47
|
+
double** distancematrix (int ngenes, int ndata, double** data,
|
48
|
+
int** mask, double* weight, char dist, int transpose);
|
49
|
+
|
50
|
+
/* Chapter 3 */
|
51
|
+
int getclustercentroids(int nclusters, int nrows, int ncolumns,
|
52
|
+
double** data, int** mask, int clusterid[], double** cdata, int** cmask,
|
53
|
+
int transpose, char method);
|
54
|
+
void getclustermedoids(int nclusters, int nelements, double** distance,
|
55
|
+
int clusterid[], int centroids[], double errors[]);
|
56
|
+
void kcluster (int nclusters, int ngenes, int ndata, double** data,
|
57
|
+
int** mask, double weight[], int transpose, int npass, char method, char dist,
|
58
|
+
int clusterid[], double* error, int* ifound);
|
59
|
+
void kmedoids (int nclusters, int nelements, double** distance,
|
60
|
+
int npass, int clusterid[], double* error, int* ifound);
|
61
|
+
|
62
|
+
/* Chapter 4 */
|
63
|
+
typedef struct {int left; int right; double distance;} Node;
|
64
|
+
/*
|
65
|
+
* A Node struct describes a single node in a tree created by hierarchical
|
66
|
+
* clustering. The tree can be represented by an array of n Node structs,
|
67
|
+
* where n is the number of elements minus one. The integers left and right
|
68
|
+
* in each Node struct refer to the two elements or subnodes that are joined
|
69
|
+
* in this node. The original elements are numbered 0..nelements-1, and the
|
70
|
+
* nodes -1..-(nelements-1). For each node, distance contains the distance
|
71
|
+
* between the two subnodes that were joined.
|
72
|
+
*/
|
73
|
+
|
74
|
+
Node* treecluster (int nrows, int ncolumns, double** data, int** mask,
|
75
|
+
double weight[], int transpose, char dist, char method, double** distmatrix);
|
76
|
+
void cuttree (int nelements, Node* tree, int nclusters, int clusterid[]);
|
77
|
+
|
78
|
+
/* Chapter 5 */
|
79
|
+
void somcluster (int nrows, int ncolumns, double** data, int** mask,
|
80
|
+
const double weight[], int transpose, int nxnodes, int nynodes,
|
81
|
+
double inittau, int niter, char dist, double*** celldata,
|
82
|
+
int clusterid[][2]);
|
83
|
+
|
84
|
+
/* Chapter 6 */
|
85
|
+
int pca(int m, int n, double** u, double** v, double* w);
|
86
|
+
|
87
|
+
/* Utility routines, currently undocumented */
|
88
|
+
void sort(int n, const double data[], int index[]);
|
89
|
+
double mean(int n, double x[]);
|
90
|
+
double median (int n, double x[]);
|
91
|
+
|
92
|
+
double* calculate_weights(int nrows, int ncolumns, double** data, int** mask,
|
93
|
+
double weights[], int transpose, char dist, double cutoff, double exponent);
|
@@ -0,0 +1,775 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include <stdio.h>
|
3
|
+
#include "ruby.h"
|
4
|
+
#include "cluster.h"
|
5
|
+
|
6
|
+
// missing on 1.8
|
7
|
+
#ifndef DBL2NUM
|
8
|
+
#define DBL2NUM( dbl_val ) rb_float_new( dbl_val )
|
9
|
+
#endif
|
10
|
+
|
11
|
+
VALUE rbcluster_mCluster = Qnil;
|
12
|
+
VALUE rbcluster_cNode = Qnil;
|
13
|
+
|
14
|
+
VALUE rbcluster_rows2rb(double** data, int nrows, int ncols) {
|
15
|
+
VALUE rows = rb_ary_new2((long)nrows);
|
16
|
+
VALUE cols;
|
17
|
+
int i, j;
|
18
|
+
|
19
|
+
for(i = 0; i < nrows; ++i) {
|
20
|
+
cols = rb_ary_new2((long)ncols);
|
21
|
+
rb_ary_push(rows, cols);
|
22
|
+
for(j = 0; j < ncols; ++j) {
|
23
|
+
rb_ary_push(cols, DBL2NUM(data[i][j]));
|
24
|
+
}
|
25
|
+
}
|
26
|
+
|
27
|
+
return rows;
|
28
|
+
}
|
29
|
+
|
30
|
+
double* rbcluster_ary_to_doubles(VALUE data, int len) {
|
31
|
+
Check_Type(data, T_ARRAY);
|
32
|
+
|
33
|
+
if(RARRAY_LEN(data) != len) {
|
34
|
+
rb_raise(rb_eArgError, "expected Array to have %d entries, got %ld", len, RARRAY_LEN(data));
|
35
|
+
}
|
36
|
+
|
37
|
+
double* result = malloc(len*sizeof(double));
|
38
|
+
for(int i = 0; i < len; ++i) {
|
39
|
+
result[i] = NUM2DBL(rb_ary_entry(data, i));
|
40
|
+
}
|
41
|
+
|
42
|
+
return result;
|
43
|
+
}
|
44
|
+
|
45
|
+
double** rbcluster_ary_to_rows(VALUE data, int* nrows, int* ncols) {
|
46
|
+
Check_Type(data, T_ARRAY);
|
47
|
+
long rows, cols;
|
48
|
+
|
49
|
+
rows = RARRAY_LEN(data);
|
50
|
+
if(rows == 0) {
|
51
|
+
rb_raise(rb_eArgError, "no data given");
|
52
|
+
}
|
53
|
+
|
54
|
+
VALUE first_element = rb_ary_entry(data, 0);
|
55
|
+
Check_Type(first_element, T_ARRAY);
|
56
|
+
cols = RARRAY_LEN(first_element);
|
57
|
+
|
58
|
+
double** result = malloc((rows)*sizeof(double*));
|
59
|
+
|
60
|
+
VALUE row;
|
61
|
+
int i, j;
|
62
|
+
|
63
|
+
for(i = 0; i < rows; ++i) {
|
64
|
+
result[i] = malloc((cols)*sizeof(double));
|
65
|
+
row = rb_ary_entry(data, i);
|
66
|
+
|
67
|
+
Check_Type(row, T_ARRAY);
|
68
|
+
if(RARRAY_LEN(row) != cols) {
|
69
|
+
rb_raise(rb_eArgError, "expected %ld columns, row has %ld", cols, RARRAY_LEN(row));
|
70
|
+
}
|
71
|
+
|
72
|
+
for(j = 0; j < cols; ++j) {
|
73
|
+
result[i][j] = NUM2DBL(rb_ary_entry(row, j));
|
74
|
+
}
|
75
|
+
}
|
76
|
+
|
77
|
+
*nrows = (int)rows;
|
78
|
+
*ncols = (int)cols;
|
79
|
+
|
80
|
+
return result;
|
81
|
+
}
|
82
|
+
|
83
|
+
void rbcluster_free_rows(double** data, int nrows) {
|
84
|
+
for(int i = 0; i < nrows; ++i)
|
85
|
+
free(data[i]);
|
86
|
+
|
87
|
+
free(data);
|
88
|
+
}
|
89
|
+
|
90
|
+
int** rbcluster_create_mask(int nrows, int ncols) {
|
91
|
+
int** mask = malloc(nrows*sizeof(int*));
|
92
|
+
|
93
|
+
for(int i = 0; i < nrows; ++i) {
|
94
|
+
mask[i] = malloc(ncols*sizeof(int));
|
95
|
+
for(int n = 0; n < ncols; ++n) {
|
96
|
+
mask[i][n] = 1;
|
97
|
+
}
|
98
|
+
}
|
99
|
+
|
100
|
+
return mask;
|
101
|
+
}
|
102
|
+
|
103
|
+
void rbcluster_free_mask(int** mask, int nrows) {
|
104
|
+
for(int i = 0; i < nrows; ++i)
|
105
|
+
free(mask[i]);
|
106
|
+
|
107
|
+
free(mask);
|
108
|
+
}
|
109
|
+
|
110
|
+
double* rbcluster_create_weight(int ncols) {
|
111
|
+
double* weight = malloc(ncols*sizeof(double));
|
112
|
+
|
113
|
+
for (int i = 0; i < ncols; i++)
|
114
|
+
weight[i] = 1.0;
|
115
|
+
|
116
|
+
return weight;
|
117
|
+
}
|
118
|
+
|
119
|
+
VALUE rbcluster_ints2rb(int* ints, long nrows) {
|
120
|
+
VALUE ary = rb_ary_new2(nrows);
|
121
|
+
|
122
|
+
for(int i = 0; i < nrows; ++i) {
|
123
|
+
rb_ary_push(ary, INT2FIX(ints[i]));
|
124
|
+
}
|
125
|
+
|
126
|
+
return ary;
|
127
|
+
}
|
128
|
+
|
129
|
+
VALUE rbcluster_doubles2rb(double* arr, long nrows) {
|
130
|
+
VALUE ary = rb_ary_new2(nrows);
|
131
|
+
|
132
|
+
for(int i = 0; i < nrows; ++i) {
|
133
|
+
rb_ary_push(ary, DBL2NUM(arr[i]));
|
134
|
+
}
|
135
|
+
|
136
|
+
return ary;
|
137
|
+
}
|
138
|
+
|
139
|
+
|
140
|
+
/*
|
141
|
+
option hash parsing
|
142
|
+
*/
|
143
|
+
|
144
|
+
void rbcluster_parse_mask(VALUE opts, int** mask, int nrows, int ncols) {
|
145
|
+
VALUE val = rb_hash_aref(opts, ID2SYM(rb_intern("mask")));
|
146
|
+
|
147
|
+
if(NIL_P(val))
|
148
|
+
return;
|
149
|
+
|
150
|
+
// TODO: better error message
|
151
|
+
Check_Type(val, T_ARRAY);
|
152
|
+
VALUE row;
|
153
|
+
int i, j;
|
154
|
+
|
155
|
+
for(i = 0; i < nrows; ++i) {
|
156
|
+
row = rb_ary_entry(val, i);
|
157
|
+
Check_Type(row, T_ARRAY);
|
158
|
+
for(j = 0; j < ncols; ++j) {
|
159
|
+
mask[i][j] = FIX2INT(rb_ary_entry(row, j));
|
160
|
+
}
|
161
|
+
}
|
162
|
+
}
|
163
|
+
|
164
|
+
void rbcluster_parse_weight(VALUE opts, double** weight, int ncols) {
|
165
|
+
VALUE val = rb_hash_aref(opts, ID2SYM(rb_intern("weight")));
|
166
|
+
|
167
|
+
if(val != Qnil) {
|
168
|
+
free(*weight);
|
169
|
+
*weight = rbcluster_ary_to_doubles(val, ncols);
|
170
|
+
}
|
171
|
+
}
|
172
|
+
|
173
|
+
void rbcluster_parse_int(VALUE opts, const char* key, int* out) {
|
174
|
+
VALUE val = rb_hash_aref(opts, ID2SYM(rb_intern(key)));
|
175
|
+
if(val != Qnil) {
|
176
|
+
Check_Type(val, T_FIXNUM);
|
177
|
+
*out = FIX2INT(val);
|
178
|
+
}
|
179
|
+
}
|
180
|
+
|
181
|
+
void rbcluster_parse_double(VALUE opts, const char* key, double* out) {
|
182
|
+
VALUE val = rb_hash_aref(opts, ID2SYM(rb_intern(key)));
|
183
|
+
if(val != Qnil) {
|
184
|
+
Check_Type(val, T_FLOAT);
|
185
|
+
*out = NUM2DBL(val);
|
186
|
+
}
|
187
|
+
}
|
188
|
+
|
189
|
+
void rbcluster_parse_char(VALUE opts, const char* key, char* out) {
|
190
|
+
VALUE val = rb_hash_aref(opts, ID2SYM(rb_intern(key)));
|
191
|
+
if(val != Qnil) {
|
192
|
+
Check_Type(val, T_STRING);
|
193
|
+
*out = RSTRING_PTR(val)[0];
|
194
|
+
}
|
195
|
+
}
|
196
|
+
|
197
|
+
void rbcluster_parse_bool(VALUE opts, const char* key, int* out) {
|
198
|
+
VALUE val = rb_hash_aref(opts, ID2SYM(rb_intern(key)));
|
199
|
+
if(val != Qnil) {
|
200
|
+
*out = val ? 1 : 0;
|
201
|
+
}
|
202
|
+
}
|
203
|
+
|
204
|
+
|
205
|
+
/*
|
206
|
+
main function
|
207
|
+
*/
|
208
|
+
|
209
|
+
VALUE rbcluster_kcluster(int argc, VALUE* argv, VALUE self) {
|
210
|
+
VALUE arr, opts;
|
211
|
+
int nrows, ncols, i, j;
|
212
|
+
|
213
|
+
rb_scan_args(argc, argv, "11", &arr, &opts);
|
214
|
+
|
215
|
+
double** data = rbcluster_ary_to_rows(arr, &nrows, &ncols);
|
216
|
+
int** mask = rbcluster_create_mask(nrows, ncols);
|
217
|
+
|
218
|
+
// defaults
|
219
|
+
int nclusters = 2;
|
220
|
+
int transpose = 0;
|
221
|
+
int npass = 1;
|
222
|
+
char method = 'a';
|
223
|
+
char dist = 'e';
|
224
|
+
double* weight = rbcluster_create_weight(nrows);
|
225
|
+
|
226
|
+
int* clusterid = malloc(nrows*sizeof(int));
|
227
|
+
int ifound = 0;
|
228
|
+
double error;
|
229
|
+
|
230
|
+
// options
|
231
|
+
if(opts != Qnil) {
|
232
|
+
Check_Type(opts, T_HASH);
|
233
|
+
VALUE val;
|
234
|
+
|
235
|
+
rbcluster_parse_int(opts, "clusters", &nclusters);
|
236
|
+
rbcluster_parse_mask(opts, mask, nrows, ncols);
|
237
|
+
rbcluster_parse_weight(opts, &weight, ncols);
|
238
|
+
rbcluster_parse_bool(opts, "transpose", &transpose);
|
239
|
+
rbcluster_parse_int(opts, "passes", &npass);
|
240
|
+
rbcluster_parse_char(opts, "method", &method);
|
241
|
+
rbcluster_parse_char(opts, "dist", &dist);
|
242
|
+
}
|
243
|
+
|
244
|
+
kcluster(
|
245
|
+
nclusters,
|
246
|
+
nrows,
|
247
|
+
ncols,
|
248
|
+
data,
|
249
|
+
mask,
|
250
|
+
weight,
|
251
|
+
transpose,
|
252
|
+
npass,
|
253
|
+
method,
|
254
|
+
dist,
|
255
|
+
clusterid,
|
256
|
+
&error,
|
257
|
+
&ifound
|
258
|
+
);
|
259
|
+
|
260
|
+
VALUE result = rbcluster_ints2rb(clusterid, nrows);
|
261
|
+
|
262
|
+
rbcluster_free_rows(data, nrows);
|
263
|
+
rbcluster_free_mask(mask, nrows);
|
264
|
+
|
265
|
+
free(weight);
|
266
|
+
free(clusterid);
|
267
|
+
|
268
|
+
return rb_ary_new3(3, result, rb_float_new(error), INT2NUM(ifound));
|
269
|
+
}
|
270
|
+
|
271
|
+
VALUE rbcluster_kmedoids(int argc, VALUE* argv, VALUE self) {
|
272
|
+
VALUE data, opts;
|
273
|
+
|
274
|
+
rb_scan_args(argc, argv, "11", &data, &opts);
|
275
|
+
Check_Type(data, T_ARRAY);
|
276
|
+
|
277
|
+
int nitems = (int)RARRAY_LEN(data);
|
278
|
+
int nclusters = 2;
|
279
|
+
int npass = 1;
|
280
|
+
|
281
|
+
// populate 'distances' from the input Array
|
282
|
+
double** distances = malloc(nitems*sizeof(double*));
|
283
|
+
int i, j;
|
284
|
+
VALUE row, num;
|
285
|
+
|
286
|
+
for(i = 0; i < nitems; ++i) {
|
287
|
+
row = rb_ary_entry(data, i);
|
288
|
+
// TODO: better error message
|
289
|
+
Check_Type(row, T_ARRAY);
|
290
|
+
if(RARRAY_LEN(row) != i) {
|
291
|
+
rb_raise(rb_eArgError,
|
292
|
+
"expected row %d to have exactly %d elements, got %ld", i, i, RARRAY_LEN(row));
|
293
|
+
}
|
294
|
+
|
295
|
+
if(i == 0) {
|
296
|
+
distances[i] = NULL;
|
297
|
+
} else {
|
298
|
+
distances[i] = malloc(i*sizeof(double));
|
299
|
+
}
|
300
|
+
|
301
|
+
for(j = 0; j < i; ++j) {
|
302
|
+
distances[i][j] = NUM2DBL(rb_ary_entry(row, j));
|
303
|
+
}
|
304
|
+
}
|
305
|
+
|
306
|
+
if(opts != Qnil) {
|
307
|
+
rbcluster_parse_int(opts, "clusters", &nclusters);
|
308
|
+
rbcluster_parse_int(opts, "passes", &npass);
|
309
|
+
// TODO: initialid
|
310
|
+
}
|
311
|
+
|
312
|
+
int* clusterid = malloc(nitems*sizeof(int));
|
313
|
+
double error;
|
314
|
+
int ifound;
|
315
|
+
|
316
|
+
// void kmedoids (int nclusters, int nelements, double** distance,
|
317
|
+
// int npass, int clusterid[], double* error, int* ifound);
|
318
|
+
kmedoids(
|
319
|
+
nclusters,
|
320
|
+
nitems,
|
321
|
+
distances,
|
322
|
+
npass,
|
323
|
+
clusterid,
|
324
|
+
&error,
|
325
|
+
&ifound
|
326
|
+
);
|
327
|
+
|
328
|
+
VALUE result = rbcluster_ints2rb(clusterid, nitems);
|
329
|
+
free(clusterid);
|
330
|
+
for(i = 1; i < nitems; ++i) free(distances[i]);
|
331
|
+
|
332
|
+
return rb_ary_new3(3, result, rb_float_new(error), INT2NUM(ifound));
|
333
|
+
}
|
334
|
+
|
335
|
+
VALUE rbcluster_median(VALUE self, VALUE ary) {
|
336
|
+
Check_Type(ary, T_ARRAY);
|
337
|
+
|
338
|
+
long len = RARRAY_LEN(ary);
|
339
|
+
double arr[len];
|
340
|
+
int i;
|
341
|
+
VALUE num;
|
342
|
+
|
343
|
+
for(i = 0; i < len; ++i) {
|
344
|
+
num = rb_ary_entry(ary, i);
|
345
|
+
arr[i] = NUM2DBL(num);
|
346
|
+
}
|
347
|
+
|
348
|
+
return rb_float_new(median((int)len, arr));
|
349
|
+
}
|
350
|
+
|
351
|
+
VALUE rbcluster_mean(VALUE self, VALUE ary) {
|
352
|
+
Check_Type(ary, T_ARRAY);
|
353
|
+
|
354
|
+
long len = RARRAY_LEN(ary);
|
355
|
+
double arr[len];
|
356
|
+
int i;
|
357
|
+
VALUE num;
|
358
|
+
|
359
|
+
for(i = 0; i < len; ++i) {
|
360
|
+
num = rb_ary_entry(ary, i);
|
361
|
+
arr[i] = NUM2DBL(num);
|
362
|
+
}
|
363
|
+
|
364
|
+
return rb_float_new(mean((int)len, arr));
|
365
|
+
}
|
366
|
+
|
367
|
+
VALUE rbcluster_distancematrix(int argc, VALUE* argv, VALUE self) {
|
368
|
+
VALUE data, opts;
|
369
|
+
int nrows, ncols, i, j;
|
370
|
+
|
371
|
+
rb_scan_args(argc, argv, "11", &data, &opts);
|
372
|
+
double** rows = rbcluster_ary_to_rows(data, &nrows, &ncols);
|
373
|
+
|
374
|
+
char dist = 'e';
|
375
|
+
int transpose = 0;
|
376
|
+
int** mask = rbcluster_create_mask(nrows, ncols);
|
377
|
+
double* weight = rbcluster_create_weight(ncols);
|
378
|
+
|
379
|
+
if(opts != Qnil) {
|
380
|
+
Check_Type(opts, T_HASH);
|
381
|
+
VALUE val;
|
382
|
+
|
383
|
+
rbcluster_parse_mask(opts, mask, nrows, ncols);
|
384
|
+
rbcluster_parse_weight(opts, &weight, ncols);
|
385
|
+
rbcluster_parse_char(opts, "dist", &dist);
|
386
|
+
rbcluster_parse_bool(opts, "transpose", &transpose);
|
387
|
+
}
|
388
|
+
|
389
|
+
VALUE result = Qnil;
|
390
|
+
double** distances = distancematrix(
|
391
|
+
nrows,
|
392
|
+
ncols,
|
393
|
+
rows,
|
394
|
+
mask,
|
395
|
+
weight,
|
396
|
+
dist,
|
397
|
+
transpose
|
398
|
+
);
|
399
|
+
|
400
|
+
if(distances) {
|
401
|
+
result = rb_ary_new();
|
402
|
+
for(i = 0; i < nrows; ++i) {
|
403
|
+
VALUE row = rb_ary_new();
|
404
|
+
|
405
|
+
for(j = 0; j < i; ++j){
|
406
|
+
rb_ary_push(row, rb_float_new(distances[i][j]));
|
407
|
+
}
|
408
|
+
|
409
|
+
// first row is NULL
|
410
|
+
if(i != 0) {
|
411
|
+
free(distances[i]);
|
412
|
+
}
|
413
|
+
|
414
|
+
rb_ary_push(result, row);
|
415
|
+
}
|
416
|
+
}
|
417
|
+
|
418
|
+
free(weight);
|
419
|
+
rbcluster_free_rows(rows, nrows);
|
420
|
+
rbcluster_free_mask(mask, nrows);
|
421
|
+
|
422
|
+
return result;
|
423
|
+
}
|
424
|
+
|
425
|
+
int* rbcluster_parse_index(VALUE arr, int* len) {
|
426
|
+
Check_Type(arr, T_ARRAY);
|
427
|
+
long length = RARRAY_LEN(arr);
|
428
|
+
|
429
|
+
int* result = malloc(length*sizeof(int));
|
430
|
+
|
431
|
+
for(int i = 0; i < length; ++i) {
|
432
|
+
result[i] = FIX2INT(rb_ary_entry(arr, i));
|
433
|
+
}
|
434
|
+
|
435
|
+
*len = (int)length;
|
436
|
+
return result;
|
437
|
+
}
|
438
|
+
|
439
|
+
VALUE rbcluster_clusterdistance(int argc, VALUE* argv, VALUE self) {
|
440
|
+
VALUE data, index1, index2, opts;
|
441
|
+
int nrows, ncols;
|
442
|
+
|
443
|
+
rb_scan_args(argc, argv, "31", &data, &index1, &index2, &opts);
|
444
|
+
double** rows = rbcluster_ary_to_rows(data, &nrows, &ncols);
|
445
|
+
|
446
|
+
int nidx1, nidx2;
|
447
|
+
int* idx1 = rbcluster_parse_index(index1, &nidx1);
|
448
|
+
int* idx2 = rbcluster_parse_index(index2, &nidx2);
|
449
|
+
|
450
|
+
int** mask = rbcluster_create_mask(nrows, ncols);
|
451
|
+
double* weight = rbcluster_create_weight(ncols);
|
452
|
+
char method = 'a';
|
453
|
+
char dist = 'e';
|
454
|
+
int transpose = 0;
|
455
|
+
|
456
|
+
if(opts != Qnil) {
|
457
|
+
rbcluster_parse_mask(opts, mask, nrows, ncols);
|
458
|
+
rbcluster_parse_weight(opts, &weight, ncols);
|
459
|
+
rbcluster_parse_char(opts, "dist", &dist);
|
460
|
+
rbcluster_parse_char(opts, "method", &method);
|
461
|
+
rbcluster_parse_bool(opts, "transpose", &transpose);
|
462
|
+
}
|
463
|
+
|
464
|
+
double result = clusterdistance(
|
465
|
+
nrows,
|
466
|
+
ncols,
|
467
|
+
rows,
|
468
|
+
mask,
|
469
|
+
weight,
|
470
|
+
nidx1,
|
471
|
+
nidx2,
|
472
|
+
idx1,
|
473
|
+
idx2,
|
474
|
+
dist,
|
475
|
+
method,
|
476
|
+
transpose
|
477
|
+
);
|
478
|
+
|
479
|
+
free(weight);
|
480
|
+
free(idx1);
|
481
|
+
free(idx2);
|
482
|
+
rbcluster_free_rows(rows, nrows);
|
483
|
+
rbcluster_free_mask(mask, nrows);
|
484
|
+
|
485
|
+
return rb_float_new(result);
|
486
|
+
}
|
487
|
+
|
488
|
+
VALUE rbcluster_create_node(Node* node) {
|
489
|
+
VALUE args[3];
|
490
|
+
|
491
|
+
args[0] = INT2NUM(node->left);
|
492
|
+
args[1] = INT2NUM(node->right);
|
493
|
+
args[2] = DBL2NUM(node->distance);
|
494
|
+
|
495
|
+
return rb_class_new_instance(3, args, rbcluster_cNode);
|
496
|
+
}
|
497
|
+
|
498
|
+
VALUE rbcluster_node_initialize(int argc, VALUE* argv, VALUE self) {
|
499
|
+
VALUE left, right, distance;
|
500
|
+
|
501
|
+
rb_scan_args(argc, argv, "21", &left, &right, &distance);
|
502
|
+
|
503
|
+
rb_ivar_set(self, rb_intern("@left"), left);
|
504
|
+
rb_ivar_set(self, rb_intern("@right"), right);
|
505
|
+
rb_ivar_set(self, rb_intern("@distance"), distance);
|
506
|
+
|
507
|
+
return self;
|
508
|
+
}
|
509
|
+
|
510
|
+
double*** rbcluster_create_celldata(int nxgrid, int nygrid, int ndata) {
|
511
|
+
double*** celldata = calloc(nxgrid*nygrid*ndata, sizeof(double**));
|
512
|
+
int i, j;
|
513
|
+
|
514
|
+
for (i = 0; i < nxgrid; i++)
|
515
|
+
{ celldata[i] = calloc(nygrid*ndata, sizeof(double*));
|
516
|
+
for (j = 0; j < nygrid; j++)
|
517
|
+
celldata[i][j] = calloc(ndata, sizeof(double));
|
518
|
+
}
|
519
|
+
|
520
|
+
return celldata;
|
521
|
+
}
|
522
|
+
|
523
|
+
void rbcluster_free_celldata(double*** celldata, int nxgrid, int nygrid) {
|
524
|
+
int i, j;
|
525
|
+
|
526
|
+
for (i = 0; i < nxgrid; i++) {
|
527
|
+
for (j = 0; j < nygrid; j++) {
|
528
|
+
free(celldata[i][j]);
|
529
|
+
}
|
530
|
+
}
|
531
|
+
|
532
|
+
for (i = 0; i < nxgrid; i++)
|
533
|
+
free(celldata[i]);
|
534
|
+
|
535
|
+
free(celldata);
|
536
|
+
}
|
537
|
+
|
538
|
+
VALUE rbcluster_treecluster(int argc, VALUE* argv, VALUE self) {
|
539
|
+
VALUE data, opts;
|
540
|
+
rb_scan_args(argc, argv, "11", &data, &opts);
|
541
|
+
|
542
|
+
int nrows, ncols;
|
543
|
+
double** rows = rbcluster_ary_to_rows(data, &nrows, &ncols);
|
544
|
+
|
545
|
+
int** mask = rbcluster_create_mask(nrows, ncols);
|
546
|
+
double* weight = rbcluster_create_weight(ncols);
|
547
|
+
int transpose = 0;
|
548
|
+
char dist = 'e';
|
549
|
+
char method = 'a';
|
550
|
+
|
551
|
+
// TODO: allow passing a distance matrix instead of data.
|
552
|
+
double** distmatrix = NULL;
|
553
|
+
|
554
|
+
// options
|
555
|
+
if(opts != Qnil) {
|
556
|
+
rbcluster_parse_mask(opts, mask, nrows, ncols);
|
557
|
+
rbcluster_parse_weight(opts, &weight, ncols);
|
558
|
+
rbcluster_parse_char(opts, "dist", &dist);
|
559
|
+
rbcluster_parse_char(opts, "method", &method);
|
560
|
+
rbcluster_parse_bool(opts, "transpose", &transpose);
|
561
|
+
|
562
|
+
if(TYPE(opts) == T_HASH && rb_hash_aref(opts, ID2SYM(rb_intern("distancematrix"))) != Qnil) {
|
563
|
+
rb_raise(rb_eNotImpError, "passing a distance matrix to treecluster() is not yet implemented");
|
564
|
+
}
|
565
|
+
}
|
566
|
+
|
567
|
+
Node* tree = treecluster(
|
568
|
+
nrows,
|
569
|
+
ncols,
|
570
|
+
rows,
|
571
|
+
mask,
|
572
|
+
weight,
|
573
|
+
transpose,
|
574
|
+
dist,
|
575
|
+
method,
|
576
|
+
distmatrix
|
577
|
+
);
|
578
|
+
|
579
|
+
VALUE result = rb_ary_new2(nrows - 1);
|
580
|
+
for(int i = 0; i < nrows - 1; ++i) {
|
581
|
+
rb_ary_push(result, rbcluster_create_node(&tree[i]));
|
582
|
+
}
|
583
|
+
|
584
|
+
free(tree);
|
585
|
+
free(weight);
|
586
|
+
rbcluster_free_rows(rows, nrows);
|
587
|
+
rbcluster_free_mask(mask, nrows);
|
588
|
+
|
589
|
+
return result;
|
590
|
+
}
|
591
|
+
|
592
|
+
VALUE rbcluster_somcluster(int argc, VALUE* argv, VALUE self) {
|
593
|
+
VALUE data, opts;
|
594
|
+
|
595
|
+
rb_scan_args(argc, argv, "11", &data, &opts);
|
596
|
+
|
597
|
+
int nrows, ncols;
|
598
|
+
double** rows = rbcluster_ary_to_rows(data, &nrows, &ncols);
|
599
|
+
int** mask = rbcluster_create_mask(nrows, ncols);
|
600
|
+
double* weight = rbcluster_create_weight(ncols);
|
601
|
+
|
602
|
+
int transpose = 0;
|
603
|
+
char dist = 'e';
|
604
|
+
int nxgrid = 2;
|
605
|
+
int nygrid = 1;
|
606
|
+
double inittau = 0.02;
|
607
|
+
int niter = 1;
|
608
|
+
|
609
|
+
if(opts != Qnil) {
|
610
|
+
rbcluster_parse_mask(opts, mask, nrows, ncols);
|
611
|
+
rbcluster_parse_weight(opts, &weight, ncols);
|
612
|
+
rbcluster_parse_bool(opts, "transpose", &transpose);
|
613
|
+
rbcluster_parse_int(opts, "nxgrid", &nxgrid);
|
614
|
+
rbcluster_parse_int(opts, "nygrid", &nygrid);
|
615
|
+
rbcluster_parse_double(opts, "inittau", &inittau);
|
616
|
+
rbcluster_parse_int(opts, "niter", &niter);
|
617
|
+
rbcluster_parse_char(opts, "dist", &dist);
|
618
|
+
}
|
619
|
+
|
620
|
+
int i, j, k;
|
621
|
+
double*** celldata = rbcluster_create_celldata(nygrid, nxgrid, ncols);
|
622
|
+
|
623
|
+
int clusterid[nrows][2];
|
624
|
+
|
625
|
+
somcluster(
|
626
|
+
nrows,
|
627
|
+
ncols,
|
628
|
+
rows,
|
629
|
+
mask,
|
630
|
+
weight,
|
631
|
+
transpose,
|
632
|
+
nxgrid,
|
633
|
+
nygrid,
|
634
|
+
inittau,
|
635
|
+
niter,
|
636
|
+
dist,
|
637
|
+
celldata,
|
638
|
+
clusterid
|
639
|
+
);
|
640
|
+
|
641
|
+
VALUE rb_celldata = rb_ary_new2(nxgrid);
|
642
|
+
VALUE rb_clusterid = rb_ary_new2(nrows);
|
643
|
+
|
644
|
+
VALUE iarr, jarr;
|
645
|
+
|
646
|
+
for(i = 0; i < nxgrid; ++i) {
|
647
|
+
iarr = rb_ary_new2(nygrid);
|
648
|
+
for(j = 0; j < nygrid; ++j) {
|
649
|
+
jarr = rb_ary_new2(ncols);
|
650
|
+
for(k = 0; k < ncols; ++k) {
|
651
|
+
rb_ary_push(jarr, rb_float_new(celldata[i][j][k]));
|
652
|
+
}
|
653
|
+
rb_ary_push(iarr, jarr);
|
654
|
+
}
|
655
|
+
rb_ary_push(rb_celldata, iarr);
|
656
|
+
}
|
657
|
+
|
658
|
+
VALUE inner_arr;
|
659
|
+
for(i = 0; i < nrows; ++i) {
|
660
|
+
inner_arr = rb_ary_new2(2);
|
661
|
+
rb_ary_push(inner_arr, INT2FIX(clusterid[i][0]));
|
662
|
+
rb_ary_push(inner_arr, INT2FIX(clusterid[i][1]));
|
663
|
+
|
664
|
+
rb_ary_push(rb_clusterid, inner_arr);
|
665
|
+
}
|
666
|
+
|
667
|
+
free(weight);
|
668
|
+
rbcluster_free_rows(rows, nrows);
|
669
|
+
rbcluster_free_mask(mask, nrows);
|
670
|
+
rbcluster_free_celldata(celldata, nxgrid, nygrid);
|
671
|
+
|
672
|
+
return rb_ary_new3(2, rb_clusterid, rb_celldata);
|
673
|
+
}
|
674
|
+
|
675
|
+
void print_doubles(double* vals, int len) {
|
676
|
+
puts("[");
|
677
|
+
for(int i = 0; i < len; ++i) {
|
678
|
+
printf("\t%d: %f\n", i, vals[i]);
|
679
|
+
}
|
680
|
+
puts("]");
|
681
|
+
}
|
682
|
+
|
683
|
+
void print_double_matrix(double** vals, int nrows, int ncols) {
|
684
|
+
puts("[");
|
685
|
+
for(int i = 0; i < nrows; ++i) {
|
686
|
+
printf("\t[ ");
|
687
|
+
for(int j = 0; j < ncols; ++j) {
|
688
|
+
printf("%f%c", vals[i][j], j == ncols - 1 ? ' ' : ',');
|
689
|
+
}
|
690
|
+
printf("\t]\n");
|
691
|
+
}
|
692
|
+
puts("]");
|
693
|
+
}
|
694
|
+
|
695
|
+
VALUE rbcluster_pca(VALUE self, VALUE data) {
|
696
|
+
int nrows, ncols, i, j;
|
697
|
+
|
698
|
+
double** u = rbcluster_ary_to_rows(data, &nrows, &ncols);
|
699
|
+
int ndata = min(nrows, ncols);
|
700
|
+
|
701
|
+
double** v = malloc(ndata*sizeof(double*));
|
702
|
+
for(i = 0; i < ndata; ++i) {
|
703
|
+
v[i] = malloc(ndata*sizeof(double));
|
704
|
+
}
|
705
|
+
double* w = malloc(ndata*sizeof(double));
|
706
|
+
double* means = malloc(ncols*sizeof(double));
|
707
|
+
|
708
|
+
// calculate the mean of each column
|
709
|
+
for(j = 0; j < ncols; ++j) {
|
710
|
+
means[j] = 0.0;
|
711
|
+
for(i = 0; i < nrows; ++i) {
|
712
|
+
means[j] += u[i][j];
|
713
|
+
}
|
714
|
+
|
715
|
+
means[j] /= nrows;
|
716
|
+
}
|
717
|
+
|
718
|
+
// subtract the mean of each column
|
719
|
+
for(i = 0; i < nrows; ++i) {
|
720
|
+
for(j = 0; j < ncols; ++j) {
|
721
|
+
u[i][j] = u[i][j] - means[j];
|
722
|
+
}
|
723
|
+
}
|
724
|
+
|
725
|
+
int ok = pca(nrows, ncols, u, v, w);
|
726
|
+
if(ok == -1) {
|
727
|
+
rb_raise(rb_eStandardError, "could not allocate memory");
|
728
|
+
} else if(ok > 0) {
|
729
|
+
rb_raise(rb_eStandardError, "svd failed to converge");
|
730
|
+
}
|
731
|
+
|
732
|
+
VALUE mean = rbcluster_doubles2rb(means, ncols);
|
733
|
+
VALUE eigenvalues = rbcluster_doubles2rb(w, ndata);
|
734
|
+
VALUE coordinates = Qnil;
|
735
|
+
VALUE pc = Qnil;
|
736
|
+
|
737
|
+
if(nrows >= ncols) {
|
738
|
+
coordinates = rbcluster_rows2rb(u, nrows, ncols);
|
739
|
+
pc = rbcluster_rows2rb(v, ndata, ndata);
|
740
|
+
} else {
|
741
|
+
pc = rbcluster_rows2rb(u, nrows, ncols);
|
742
|
+
coordinates = rbcluster_rows2rb(v, ndata, ndata);
|
743
|
+
}
|
744
|
+
|
745
|
+
rbcluster_free_rows(u, nrows);
|
746
|
+
rbcluster_free_rows(v, ndata);
|
747
|
+
|
748
|
+
free(w);
|
749
|
+
free(means);
|
750
|
+
|
751
|
+
return rb_ary_new3(4, mean, coordinates, pc, eigenvalues);
|
752
|
+
}
|
753
|
+
|
754
|
+
void Init_rbcluster() {
|
755
|
+
rbcluster_mCluster = rb_define_module("Cluster");
|
756
|
+
rbcluster_cNode = rb_define_class_under(rbcluster_mCluster, "Node", rb_cObject);
|
757
|
+
|
758
|
+
rb_define_attr(rbcluster_cNode, "left", 1, 1);
|
759
|
+
rb_define_attr(rbcluster_cNode, "right", 1, 1);
|
760
|
+
rb_define_attr(rbcluster_cNode, "distance", 1, 1);
|
761
|
+
rb_define_method(rbcluster_cNode, "initialize", rbcluster_node_initialize, -1);
|
762
|
+
|
763
|
+
rb_define_singleton_method(rbcluster_mCluster, "median", rbcluster_median, 1);
|
764
|
+
rb_define_singleton_method(rbcluster_mCluster, "mean", rbcluster_mean, 1);
|
765
|
+
|
766
|
+
rb_define_singleton_method(rbcluster_mCluster, "kcluster", rbcluster_kcluster, -1);
|
767
|
+
rb_define_singleton_method(rbcluster_mCluster, "distancematrix", rbcluster_distancematrix, -1);
|
768
|
+
rb_define_singleton_method(rbcluster_mCluster, "kmedoids", rbcluster_kmedoids, -1);
|
769
|
+
rb_define_singleton_method(rbcluster_mCluster, "clusterdistance", rbcluster_clusterdistance, -1);
|
770
|
+
rb_define_singleton_method(rbcluster_mCluster, "treecluster", rbcluster_treecluster, -1);
|
771
|
+
rb_define_singleton_method(rbcluster_mCluster, "somcluster", rbcluster_somcluster, -1);
|
772
|
+
rb_define_singleton_method(rbcluster_mCluster, "pca", rbcluster_pca, 1);
|
773
|
+
|
774
|
+
rb_define_const(rbcluster_mCluster, "C_VERSION", rb_str_new2(CLUSTERVERSION));
|
775
|
+
}
|