flock 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +9 -7
- data/VERSION +1 -1
- data/examples/som.rb +13 -0
- data/examples/sparse.rb +5 -5
- data/ext/cluster.c +20 -28
- data/ext/cluster.h +14 -4
- data/ext/flock.c +251 -21
- data/flock.gemspec +3 -2
- data/lib/flock.rb +13 -1
- metadata +6 -4
data/README.rdoc
CHANGED
@@ -76,17 +76,19 @@ Provides bindings to K-Means clustering in Cluster 3.0
|
|
76
76
|
require 'flock'
|
77
77
|
|
78
78
|
data = []
|
79
|
-
|
80
|
-
|
81
|
-
data << {
|
82
|
-
data << {
|
83
|
-
data << {
|
79
|
+
|
80
|
+
# keys don't need to be numeric
|
81
|
+
data << { 1 => 0.5, 2 => 0.5 }
|
82
|
+
data << { 3 => 1, 4 => 1 }
|
83
|
+
data << { 4 => 1, 5 => 0.3 }
|
84
|
+
data << { 2 => 0.75 }
|
85
|
+
data << { 1 => 0.60 }
|
84
86
|
|
85
87
|
pp Flock.sparse_kmeans(2, data)
|
86
88
|
|
87
|
-
# or even more simply (defaults to 1)
|
88
|
-
|
89
89
|
data = []
|
90
|
+
|
91
|
+
# a much simpler way to cluster text
|
90
92
|
data << %w(apple orange)
|
91
93
|
data << %w(black white)
|
92
94
|
data << %w(white cyan)
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.1
|
1
|
+
0.3.0
|
data/examples/som.rb
ADDED
data/examples/sparse.rb
CHANGED
@@ -4,11 +4,11 @@ require 'pp'
|
|
4
4
|
require 'flock'
|
5
5
|
|
6
6
|
data = []
|
7
|
-
data << {
|
8
|
-
data << {
|
9
|
-
data << {
|
10
|
-
data << {
|
11
|
-
data << {
|
7
|
+
data << { 1 => 0.5, 2 => 0.5 }
|
8
|
+
data << { 3 => 1, 4 => 1 }
|
9
|
+
data << { 4 => 1, 5 => 0.3 }
|
10
|
+
data << { 2 => 0.75 }
|
11
|
+
data << { 1 => 0.60 }
|
12
12
|
|
13
13
|
pp Flock.sparse_kmeans(2, data)
|
14
14
|
|
data/ext/cluster.c
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
* Human Genome Center, Institute of Medical Science, University of Tokyo,
|
6
6
|
* 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
|
7
7
|
* Contact: mdehoon 'AT' gsc.riken.jp
|
8
|
-
*
|
8
|
+
*
|
9
9
|
* Permission to use, copy, modify, and distribute this software and its
|
10
10
|
* documentation with or without modifications and for any purpose and
|
11
11
|
* without fee is hereby granted, provided that any copyright notices
|
@@ -14,7 +14,7 @@
|
|
14
14
|
* names of the contributors or copyright holders not be used in
|
15
15
|
* advertising or publicity pertaining to distribution of the software
|
16
16
|
* without specific prior permission.
|
17
|
-
*
|
17
|
+
*
|
18
18
|
* THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
|
19
19
|
* WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
|
20
20
|
* WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
|
@@ -23,7 +23,7 @@
|
|
23
23
|
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
|
24
24
|
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
|
25
25
|
* OR PERFORMANCE OF THIS SOFTWARE.
|
26
|
-
*
|
26
|
+
*
|
27
27
|
*/
|
28
28
|
|
29
29
|
#include <time.h>
|
@@ -334,7 +334,7 @@ static int svd(int m, int n, double** u, double w[], double** vt)
|
|
334
334
|
* A=usv of a real m by n rectangular matrix, where m is greater
|
335
335
|
* than or equal to n. Householder bidiagonalization and a variant
|
336
336
|
* of the QR algorithm are used.
|
337
|
-
*
|
337
|
+
*
|
338
338
|
*
|
339
339
|
* On input.
|
340
340
|
*
|
@@ -929,10 +929,9 @@ positive integer if the singular value decomposition fails to converge.
|
|
929
929
|
|
930
930
|
/* ********************************************************************* */
|
931
931
|
|
932
|
-
static
|
933
932
|
double euclid (int n, double** data1, double** data2, int** mask1, int** mask2,
|
934
933
|
const double weight[], int index1, int index2, int transpose)
|
935
|
-
|
934
|
+
|
936
935
|
/*
|
937
936
|
Purpose
|
938
937
|
=======
|
@@ -1004,7 +1003,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
|
|
1004
1003
|
|
1005
1004
|
/* ********************************************************************* */
|
1006
1005
|
|
1007
|
-
static
|
1008
1006
|
double cityblock (int n, double** data1, double** data2, int** mask1,
|
1009
1007
|
int** mask2, const double weight[], int index1, int index2, int transpose)
|
1010
1008
|
|
@@ -1080,7 +1078,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
|
|
1080
1078
|
|
1081
1079
|
/* ********************************************************************* */
|
1082
1080
|
|
1083
|
-
static
|
1084
1081
|
double correlation (int n, double** data1, double** data2, int** mask1,
|
1085
1082
|
int** mask2, const double weight[], int index1, int index2, int transpose)
|
1086
1083
|
/*
|
@@ -1180,7 +1177,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
|
|
1180
1177
|
|
1181
1178
|
/* ********************************************************************* */
|
1182
1179
|
|
1183
|
-
static
|
1184
1180
|
double acorrelation (int n, double** data1, double** data2, int** mask1,
|
1185
1181
|
int** mask2, const double weight[], int index1, int index2, int transpose)
|
1186
1182
|
/*
|
@@ -1279,7 +1275,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
|
|
1279
1275
|
|
1280
1276
|
/* ********************************************************************* */
|
1281
1277
|
|
1282
|
-
static
|
1283
1278
|
double ucorrelation (int n, double** data1, double** data2, int** mask1,
|
1284
1279
|
int** mask2, const double weight[], int index1, int index2, int transpose)
|
1285
1280
|
/*
|
@@ -1374,7 +1369,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
|
|
1374
1369
|
|
1375
1370
|
/* ********************************************************************* */
|
1376
1371
|
|
1377
|
-
static
|
1378
1372
|
double uacorrelation (int n, double** data1, double** data2, int** mask1,
|
1379
1373
|
int** mask2, const double weight[], int index1, int index2, int transpose)
|
1380
1374
|
/*
|
@@ -1469,7 +1463,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
|
|
1469
1463
|
|
1470
1464
|
/* ********************************************************************* */
|
1471
1465
|
|
1472
|
-
static
|
1473
1466
|
double spearman (int n, double** data1, double** data2, int** mask1,
|
1474
1467
|
int** mask2, const double weight[], int index1, int index2, int transpose)
|
1475
1468
|
/*
|
@@ -1597,7 +1590,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
|
|
1597
1590
|
|
1598
1591
|
/* ********************************************************************* */
|
1599
1592
|
|
1600
|
-
static
|
1601
1593
|
double kendall (int n, double** data1, double** data2, int** mask1, int** mask2,
|
1602
1594
|
const double weight[], int index1, int index2, int transpose)
|
1603
1595
|
/*
|
@@ -1708,7 +1700,7 @@ Otherwise, the distance between two columns in the matrix is calculated.
|
|
1708
1700
|
|
1709
1701
|
/* ********************************************************************* */
|
1710
1702
|
|
1711
|
-
static double(*setmetric(char dist))
|
1703
|
+
static double(*setmetric(char dist))
|
1712
1704
|
(int, double**, double**, int**, int**, const double[], int, int, int)
|
1713
1705
|
{ switch(dist)
|
1714
1706
|
{ case 'e': return &euclid;
|
@@ -2203,7 +2195,7 @@ calculating the medians.
|
|
2203
2195
|
}
|
2204
2196
|
}
|
2205
2197
|
}
|
2206
|
-
|
2198
|
+
|
2207
2199
|
/* ********************************************************************* */
|
2208
2200
|
|
2209
2201
|
int getclustercentroids(int nclusters, int nrows, int ncolumns,
|
@@ -2427,7 +2419,7 @@ kmeans(int nclusters, int nrows, int ncolumns, double** data, int** mask,
|
|
2427
2419
|
break; /* Identical solution found; break out of this loop */
|
2428
2420
|
}
|
2429
2421
|
|
2430
|
-
if (npass<=1)
|
2422
|
+
if (npass<=1)
|
2431
2423
|
{ *error = total;
|
2432
2424
|
break;
|
2433
2425
|
}
|
@@ -2532,7 +2524,7 @@ kmedians(int nclusters, int nrows, int ncolumns, double** data, int** mask,
|
|
2532
2524
|
break; /* Identical solution found; break out of this loop */
|
2533
2525
|
}
|
2534
2526
|
|
2535
|
-
if (npass<=1)
|
2527
|
+
if (npass<=1)
|
2536
2528
|
{ *error = total;
|
2537
2529
|
break;
|
2538
2530
|
}
|
@@ -2603,7 +2595,7 @@ of the matrix are clustered.
|
|
2603
2595
|
|
2604
2596
|
npass (input) int
|
2605
2597
|
The number of times clustering is performed. Clustering is performed npass
|
2606
|
-
times, each time starting from a different (random) initial assignment of
|
2598
|
+
times, each time starting from a different (random) initial assignment of
|
2607
2599
|
genes to clusters. The clustering solution with the lowest within-cluster sum
|
2608
2600
|
of distances is chosen.
|
2609
2601
|
If npass==0, then the clustering algorithm will be run once, where the initial
|
@@ -2697,7 +2689,7 @@ number of clusters is larger than the number of elements being clustered,
|
|
2697
2689
|
return;
|
2698
2690
|
}
|
2699
2691
|
}
|
2700
|
-
|
2692
|
+
|
2701
2693
|
if (method=='m')
|
2702
2694
|
{ double* cache = malloc(nelements*sizeof(double));
|
2703
2695
|
if(cache)
|
@@ -3105,7 +3097,7 @@ weights array, the function returns NULL.
|
|
3105
3097
|
|
3106
3098
|
/* ******************************************************************** */
|
3107
3099
|
|
3108
|
-
void cuttree (int nelements, Node* tree, int nclusters, int clusterid[])
|
3100
|
+
void cuttree (int nelements, Node* tree, int nclusters, int clusterid[])
|
3109
3101
|
|
3110
3102
|
/*
|
3111
3103
|
Purpose
|
@@ -3160,7 +3152,7 @@ error occured, all elements in clusterid are set to -1.
|
|
3160
3152
|
}
|
3161
3153
|
for (i = 0; i < n; i++) nodeid[i] = -1;
|
3162
3154
|
for (i = n-1; i >= 0; i--)
|
3163
|
-
{ if(nodeid[i]<0)
|
3155
|
+
{ if(nodeid[i]<0)
|
3164
3156
|
{ j = icluster;
|
3165
3157
|
nodeid[i] = j;
|
3166
3158
|
icluster++;
|
@@ -3269,7 +3261,7 @@ If a memory error occurs, pclcluster returns NULL.
|
|
3269
3261
|
if(!makedatamask(nelements, ndata, &newdata, &newmask))
|
3270
3262
|
{ free(result);
|
3271
3263
|
free(distid);
|
3272
|
-
return NULL;
|
3264
|
+
return NULL;
|
3273
3265
|
}
|
3274
3266
|
|
3275
3267
|
for (i = 0; i < nelements; i++) distid[i] = i;
|
@@ -3313,7 +3305,7 @@ If a memory error occurs, pclcluster returns NULL.
|
|
3313
3305
|
free(mask[is]);
|
3314
3306
|
data[is] = data[nnodes-inode];
|
3315
3307
|
mask[is] = mask[nnodes-inode];
|
3316
|
-
|
3308
|
+
|
3317
3309
|
/* Fix the distances */
|
3318
3310
|
distid[is] = distid[nnodes-inode];
|
3319
3311
|
for (i = 0; i < is; i++)
|
@@ -3334,7 +3326,7 @@ If a memory error occurs, pclcluster returns NULL.
|
|
3334
3326
|
free(data);
|
3335
3327
|
free(mask);
|
3336
3328
|
free(distid);
|
3337
|
-
|
3329
|
+
|
3338
3330
|
return result;
|
3339
3331
|
}
|
3340
3332
|
|
@@ -3829,7 +3821,7 @@ If a memory error occurs, treecluster returns NULL.
|
|
3829
3821
|
for (i = 1; i < nelements; i++) free(distmatrix[i]);
|
3830
3822
|
free (distmatrix);
|
3831
3823
|
}
|
3832
|
-
|
3824
|
+
|
3833
3825
|
return result;
|
3834
3826
|
}
|
3835
3827
|
|
@@ -4037,7 +4029,7 @@ void somworker (int nrows, int ncolumns, double** data, int** mask,
|
|
4037
4029
|
static
|
4038
4030
|
void somassign (int nrows, int ncolumns, double** data, int** mask,
|
4039
4031
|
const double weights[], int transpose, int nxgrid, int nygrid,
|
4040
|
-
double*** celldata, char dist, int clusterid
|
4032
|
+
double*** celldata, char dist, int **clusterid)
|
4041
4033
|
/* Collect clusterids */
|
4042
4034
|
{ const int ndata = (transpose==0) ? ncolumns : nrows;
|
4043
4035
|
int i,j;
|
@@ -4121,7 +4113,7 @@ void somassign (int nrows, int ncolumns, double** data, int** mask,
|
|
4121
4113
|
|
4122
4114
|
void somcluster (int nrows, int ncolumns, double** data, int** mask,
|
4123
4115
|
const double weight[], int transpose, int nxgrid, int nygrid,
|
4124
|
-
double inittau, int niter, char dist, double*** celldata, int clusterid
|
4116
|
+
double inittau, int niter, char dist, double*** celldata, int **clusterid)
|
4125
4117
|
/*
|
4126
4118
|
|
4127
4119
|
Purpose
|
@@ -4235,7 +4227,7 @@ somcluster.
|
|
4235
4227
|
double clusterdistance (int nrows, int ncolumns, double** data,
|
4236
4228
|
int** mask, double weight[], int n1, int n2, int index1[], int index2[],
|
4237
4229
|
char dist, char method, int transpose)
|
4238
|
-
|
4230
|
+
|
4239
4231
|
/*
|
4240
4232
|
Purpose
|
4241
4233
|
=======
|
data/ext/cluster.h
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
* Human Genome Center, Institute of Medical Science, University of Tokyo,
|
7
7
|
* 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
|
8
8
|
* Contact: mdehoon 'AT' gsc.riken.jp
|
9
|
-
*
|
9
|
+
*
|
10
10
|
* Permission to use, copy, modify, and distribute this software and its
|
11
11
|
* documentation with or without modifications and for any purpose and
|
12
12
|
* without fee is hereby granted, provided that any copyright notices
|
@@ -15,7 +15,7 @@
|
|
15
15
|
* names of the contributors or copyright holders not be used in
|
16
16
|
* advertising or publicity pertaining to distribution of the software
|
17
17
|
* without specific prior permission.
|
18
|
-
*
|
18
|
+
*
|
19
19
|
* THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
|
20
20
|
* WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
|
21
21
|
* WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
|
@@ -24,7 +24,7 @@
|
|
24
24
|
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
|
25
25
|
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
|
26
26
|
* OR PERFORMANCE OF THIS SOFTWARE.
|
27
|
-
*
|
27
|
+
*
|
28
28
|
*/
|
29
29
|
|
30
30
|
#ifndef min
|
@@ -79,7 +79,7 @@ void cuttree (int nelements, Node* tree, int nclusters, int clusterid[]);
|
|
79
79
|
void somcluster (int nrows, int ncolumns, double** data, int** mask,
|
80
80
|
const double weight[], int transpose, int nxnodes, int nynodes,
|
81
81
|
double inittau, int niter, char dist, double*** celldata,
|
82
|
-
int clusterid
|
82
|
+
int **clusterid);
|
83
83
|
|
84
84
|
/* Chapter 6 */
|
85
85
|
int pca(int m, int n, double** u, double** v, double* w);
|
@@ -91,3 +91,13 @@ double median (int n, double x[]);
|
|
91
91
|
|
92
92
|
double* calculate_weights(int nrows, int ncolumns, double** data, int** mask,
|
93
93
|
double weights[], int transpose, char dist, double cutoff, double exponent);
|
94
|
+
|
95
|
+
/* distance functions */
|
96
|
+
extern double euclid (int, double**, double**, int**, int**, const double [], int, int, int);
|
97
|
+
extern double cityblock(int, double**, double**, int**, int**, const double [], int, int, int);
|
98
|
+
extern double correlation(int, double**, double**, int**, int**, const double [], int, int, int);
|
99
|
+
extern double acorrelation(int, double**, double**, int**, int**, const double [], int, int, int);
|
100
|
+
extern double ucorrelation(int, double**, double**, int**, int**, const double [], int, int, int);
|
101
|
+
extern double uacorrelation(int, double**, double**, int**, int**, const double [], int, int, int);
|
102
|
+
extern double spearman(int, double**, double**, int**, int**, const double [], int, int, int);
|
103
|
+
extern double kendall(int, double**, double**, int**, int**, const double [], int, int, int);
|
data/ext/flock.c
CHANGED
@@ -5,6 +5,7 @@
|
|
5
5
|
#define CONST_GET(scope, constant) (rb_funcall(scope, ID_CONST_GET, 1, rb_str_new2(constant)))
|
6
6
|
|
7
7
|
static VALUE mFlock;
|
8
|
+
typedef double (*distance_fn)(int, double**, double**, int**, int**, const double [], int, int, int);
|
8
9
|
|
9
10
|
int opt_int_value(VALUE option, char *key, int def) {
|
10
11
|
if (NIL_P(option)) return def;
|
@@ -13,6 +14,13 @@ int opt_int_value(VALUE option, char *key, int def) {
|
|
13
14
|
return NIL_P(value) ? def : NUM2INT(value);
|
14
15
|
}
|
15
16
|
|
17
|
+
int opt_double_value(VALUE option, char *key, double def) {
|
18
|
+
if (NIL_P(option)) return def;
|
19
|
+
|
20
|
+
VALUE value = rb_hash_aref(option, ID2SYM(rb_intern(key)));
|
21
|
+
return NIL_P(value) ? def : NUM2DBL(value);
|
22
|
+
}
|
23
|
+
|
16
24
|
VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
|
17
25
|
VALUE size, data, mask, weights, options;
|
18
26
|
rb_scan_args(argc, argv, "22", &size, &data, &mask, &options);
|
@@ -26,6 +34,20 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
|
|
26
34
|
if (NIL_P(size) || NUM2INT(rb_Integer(size)) > RARRAY_LEN(data))
|
27
35
|
rb_raise(rb_eArgError, "size should be > 0 and <= data size");
|
28
36
|
|
37
|
+
int transpose = opt_int_value(options, "transpose", 0);
|
38
|
+
int npass = opt_int_value(options, "iterations", 1000);
|
39
|
+
// a = average, m = means
|
40
|
+
int method = opt_int_value(options, "method", 'a');
|
41
|
+
// e = euclidian,
|
42
|
+
// b = city-block distance
|
43
|
+
// c = correlation
|
44
|
+
// a = absolute value of the correlation
|
45
|
+
// u = uncentered correlation
|
46
|
+
// x = absolute uncentered correlation
|
47
|
+
// s = spearman's rank correlation
|
48
|
+
// k = kendall's tau
|
49
|
+
int dist = opt_int_value(options, "metric", 'e');
|
50
|
+
|
29
51
|
int i,j;
|
30
52
|
int nrows = RARRAY_LEN(data);
|
31
53
|
int ncols = RARRAY_LEN(rb_ary_entry(data, 0));
|
@@ -33,16 +55,14 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
|
|
33
55
|
|
34
56
|
double **cdata = (double**)malloc(sizeof(double*)*nrows);
|
35
57
|
int **cmask = (int **)malloc(sizeof(int *)*nrows);
|
36
|
-
double **ccentroid = (double**)malloc(sizeof(double*)*nrows);
|
37
|
-
int **ccentroid_mask = (int **)malloc(sizeof(int *)*nrows);
|
38
58
|
double *cweights = (double *)malloc(sizeof(double )*ncols);
|
39
|
-
|
59
|
+
|
60
|
+
double **ccentroid;
|
61
|
+
int *ccluster, **ccentroid_mask, dimx = nrows, dimy = ncols, cdimx = nsets, cdimy = ncols;
|
40
62
|
|
41
63
|
for (i = 0; i < nrows; i++) {
|
42
64
|
cdata[i] = (double*)malloc(sizeof(double)*ncols);
|
43
65
|
cmask[i] = (int *)malloc(sizeof(int )*ncols);
|
44
|
-
ccentroid[i] = (double*)malloc(sizeof(double)*ncols);
|
45
|
-
ccentroid_mask[i] = (int *)malloc(sizeof(int )*ncols);
|
46
66
|
for (j = 0; j < ncols; j++) {
|
47
67
|
cdata[i][j] = NUM2DBL(rb_Float(rb_ary_entry(rb_ary_entry(data, i), j)));
|
48
68
|
cmask[i][j] = NIL_P(mask) ? 1 : NUM2INT(rb_Integer(rb_ary_entry(rb_ary_entry(mask, i), j)));
|
@@ -54,25 +74,27 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
|
|
54
74
|
cweights[i] = NIL_P(weights) ? 1.0 : NUM2DBL(rb_Float(rb_ary_entry(weights, i)));
|
55
75
|
}
|
56
76
|
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
77
|
+
if (transpose) {
|
78
|
+
dimx = ncols;
|
79
|
+
dimy = nrows;
|
80
|
+
cdimx = nrows;
|
81
|
+
cdimy = nsets;
|
82
|
+
}
|
83
|
+
|
84
|
+
ccluster = (int *)malloc(sizeof(int )*dimx);
|
85
|
+
ccentroid = (double**)malloc(sizeof(double*)*cdimx);
|
86
|
+
ccentroid_mask = (int **)malloc(sizeof(int *)*cdimx);
|
87
|
+
|
88
|
+
for (i = 0; i < cdimx; i++) {
|
89
|
+
ccentroid[i] = (double*)malloc(sizeof(double)*cdimy);
|
90
|
+
ccentroid_mask[i] = (int *)malloc(sizeof(int )*cdimy);
|
91
|
+
}
|
70
92
|
|
71
93
|
int ifound;
|
72
94
|
double error;
|
95
|
+
|
73
96
|
kcluster(nsets,
|
74
97
|
nrows, ncols, cdata, cmask, cweights, transpose, npass, method, dist, ccluster, &error, &ifound);
|
75
|
-
|
76
98
|
getclustercentroids(nsets,
|
77
99
|
nrows, ncols, cdata, cmask, ccluster, ccentroid, ccentroid_mask, transpose, method);
|
78
100
|
|
@@ -80,10 +102,12 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
|
|
80
102
|
VALUE cluster = rb_ary_new();
|
81
103
|
VALUE centroid = rb_ary_new();
|
82
104
|
|
83
|
-
for (i = 0; i <
|
105
|
+
for (i = 0; i < dimx; i++)
|
84
106
|
rb_ary_push(cluster, INT2NUM(ccluster[i]));
|
107
|
+
|
108
|
+
for (i = 0; i < cdimx; i++) {
|
85
109
|
VALUE point = rb_ary_new();
|
86
|
-
for (j = 0; j <
|
110
|
+
for (j = 0; j < cdimy; j++)
|
87
111
|
rb_ary_push(point, DBL2NUM(ccentroid[i][j]));
|
88
112
|
rb_ary_push(centroid, point);
|
89
113
|
}
|
@@ -96,6 +120,9 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
|
|
96
120
|
for (i = 0; i < nrows; i++) {
|
97
121
|
free(cdata[i]);
|
98
122
|
free(cmask[i]);
|
123
|
+
}
|
124
|
+
|
125
|
+
for (i = 0; i < cdimx; i++) {
|
99
126
|
free(ccentroid[i]);
|
100
127
|
free(ccentroid_mask[i]);
|
101
128
|
}
|
@@ -110,9 +137,203 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
|
|
110
137
|
return result;
|
111
138
|
}
|
112
139
|
|
140
|
+
VALUE rb_som(int argc, VALUE *argv, VALUE self) {
|
141
|
+
VALUE nx, ny, data, mask, weights, options;
|
142
|
+
rb_scan_args(argc, argv, "32", &nx, &ny, &data, &mask, &options);
|
143
|
+
|
144
|
+
if (TYPE(data) != T_ARRAY)
|
145
|
+
rb_raise(rb_eArgError, "data should be an array of arrays");
|
146
|
+
|
147
|
+
if (!NIL_P(mask) && TYPE(mask) != T_ARRAY)
|
148
|
+
rb_raise(rb_eArgError, "mask should be an array of arrays");
|
149
|
+
|
150
|
+
if (NIL_P(nx) || NUM2INT(rb_Integer(nx)) <= 0)
|
151
|
+
rb_raise(rb_eArgError, "nx should be > 0");
|
152
|
+
|
153
|
+
if (NIL_P(ny) || NUM2INT(rb_Integer(ny)) <= 0)
|
154
|
+
rb_raise(rb_eArgError, "ny should be > 0");
|
155
|
+
|
156
|
+
int nxgrid = NUM2INT(rb_Integer(nx));
|
157
|
+
int nygrid = NUM2INT(rb_Integer(ny));
|
158
|
+
int transpose = opt_int_value(options, "transpose", 0);
|
159
|
+
int npass = opt_int_value(options, "iterations", 1000);
|
160
|
+
|
161
|
+
// e = euclidian,
|
162
|
+
// b = city-block distance
|
163
|
+
// c = correlation
|
164
|
+
// a = absolute value of the correlation
|
165
|
+
// u = uncentered correlation
|
166
|
+
// x = absolute uncentered correlation
|
167
|
+
// s = spearman's rank correlation
|
168
|
+
// k = kendall's tau
|
169
|
+
int dist = opt_int_value(options, "metric", 'e');
|
170
|
+
double tau = opt_double_value(options, "tau", 1.0);
|
171
|
+
|
172
|
+
int i, j, k;
|
173
|
+
int nrows = RARRAY_LEN(data);
|
174
|
+
int ncols = RARRAY_LEN(rb_ary_entry(data, 0));
|
175
|
+
|
176
|
+
double **cdata = (double**)malloc(sizeof(double*)*nrows);
|
177
|
+
int **cmask = (int **)malloc(sizeof(int *)*nrows);
|
178
|
+
double *cweights = (double *)malloc(sizeof(double )*ncols);
|
179
|
+
|
180
|
+
int **ccluster;
|
181
|
+
double ***ccelldata;
|
182
|
+
int dimx = nrows, dimy = ncols;
|
183
|
+
|
184
|
+
if (transpose) {
|
185
|
+
dimx = ncols;
|
186
|
+
dimy = nrows;
|
187
|
+
}
|
188
|
+
|
189
|
+
ccluster = (int **)malloc(sizeof(int*)*dimx);
|
190
|
+
for (i = 0; i < dimx; i++)
|
191
|
+
ccluster[i] = (int*)malloc(sizeof(int)*2);
|
192
|
+
|
193
|
+
for (i = 0; i < nrows; i++) {
|
194
|
+
cdata[i] = (double*)malloc(sizeof(double)*ncols);
|
195
|
+
cmask[i] = (int *)malloc(sizeof(int )*ncols);
|
196
|
+
for (j = 0; j < ncols; j++) {
|
197
|
+
cdata[i][j] = NUM2DBL(rb_Float(rb_ary_entry(rb_ary_entry(data, i), j)));
|
198
|
+
cmask[i][j] = NIL_P(mask) ? 1 : NUM2INT(rb_Integer(rb_ary_entry(rb_ary_entry(mask, i), j)));
|
199
|
+
}
|
200
|
+
}
|
201
|
+
|
202
|
+
weights = NIL_P(options) ? Qnil : rb_hash_aref(options, ID2SYM(rb_intern("weights")));
|
203
|
+
for (i = 0; i < ncols; i++) {
|
204
|
+
cweights[i] = NIL_P(weights) ? 1.0 : NUM2DBL(rb_Float(rb_ary_entry(weights, i)));
|
205
|
+
}
|
206
|
+
|
207
|
+
ccelldata = (double***)malloc(sizeof(double**)*nxgrid);
|
208
|
+
for (i = 0; i < nxgrid; i++) {
|
209
|
+
ccelldata[i] = (double **)malloc(sizeof(double*)*nygrid);
|
210
|
+
for (j = 0; j < nygrid; j++)
|
211
|
+
ccelldata[i][j] = (double *)malloc(sizeof(double)*dimy);
|
212
|
+
}
|
213
|
+
|
214
|
+
somcluster(nrows, ncols, cdata, cmask, cweights, transpose,
|
215
|
+
nxgrid, nygrid, tau, npass, dist, ccelldata, ccluster);
|
216
|
+
|
217
|
+
VALUE result = rb_hash_new();
|
218
|
+
VALUE cluster = rb_ary_new();
|
219
|
+
VALUE centroid = rb_ary_new();
|
220
|
+
|
221
|
+
for (i = 0; i < dimx; i++)
|
222
|
+
rb_ary_push(cluster, INT2NUM(ccluster[i][0] * nxgrid + ccluster[i][1]));
|
223
|
+
|
224
|
+
for (i = 0; i < nxgrid; i++) {
|
225
|
+
for (j = 0; j < nygrid; j++) {
|
226
|
+
VALUE point = rb_ary_new();
|
227
|
+
for (k = 0; k < dimy; k++)
|
228
|
+
rb_ary_push(point, DBL2NUM(ccelldata[i][j][k]));
|
229
|
+
rb_ary_push(centroid, point);
|
230
|
+
}
|
231
|
+
}
|
232
|
+
|
233
|
+
rb_hash_aset(result, ID2SYM(rb_intern("cluster")), cluster);
|
234
|
+
rb_hash_aset(result, ID2SYM(rb_intern("centroid")), centroid);
|
235
|
+
|
236
|
+
for (i = 0; i < nrows; i++) {
|
237
|
+
free(cdata[i]);
|
238
|
+
free(cmask[i]);
|
239
|
+
}
|
240
|
+
|
241
|
+
for (i = 0; i < dimx; i++)
|
242
|
+
free(ccluster[i]);
|
243
|
+
|
244
|
+
for (i = 0; i < nxgrid; i++) {
|
245
|
+
for (j = 0; j < nygrid; j++)
|
246
|
+
free(ccelldata[i][j]);
|
247
|
+
free(ccelldata[i]);
|
248
|
+
}
|
249
|
+
|
250
|
+
free(cdata);
|
251
|
+
free(cmask);
|
252
|
+
free(ccelldata);
|
253
|
+
free(cweights);
|
254
|
+
free(ccluster);
|
255
|
+
|
256
|
+
return result;
|
257
|
+
}
|
258
|
+
|
259
|
+
|
260
|
+
VALUE rb_distance(VALUE vec1, VALUE vec2, distance_fn fn) {
|
261
|
+
uint32_t size;
|
262
|
+
double *data1, *data2, *weight, dist;
|
263
|
+
int *mask, i;
|
264
|
+
|
265
|
+
if (TYPE(vec1) != T_ARRAY)
|
266
|
+
rb_raise(rb_eArgError, "vector1 should be an array");
|
267
|
+
|
268
|
+
if (TYPE(vec2) != T_ARRAY)
|
269
|
+
rb_raise(rb_eArgError, "vector2 should be an array");
|
270
|
+
|
271
|
+
size = RARRAY_LEN(vec1);
|
272
|
+
|
273
|
+
if (size != RARRAY_LEN(vec2))
|
274
|
+
rb_raise(rb_eArgError, "vector1 & vector2 dimensions mismatch");
|
275
|
+
|
276
|
+
if (size < 1)
|
277
|
+
rb_raise(rb_eArgError, "dimension should be greater than 0");
|
278
|
+
|
279
|
+
data1 = (double *)malloc(sizeof(double)*size);
|
280
|
+
data2 = (double *)malloc(sizeof(double)*size);
|
281
|
+
weight = (double *)malloc(sizeof(double)*size);
|
282
|
+
mask = (int *)malloc(sizeof(int)*size);
|
283
|
+
|
284
|
+
for (i = 0; i < size; i++) {
|
285
|
+
mask[i] = 1;
|
286
|
+
weight[i] = 1;
|
287
|
+
data1[i] = NUM2DBL(rb_ary_entry(vec1, i));
|
288
|
+
data2[i] = NUM2DBL(rb_ary_entry(vec2, i));
|
289
|
+
}
|
290
|
+
|
291
|
+
dist = fn(size, &data1, &data2, &mask, &mask, weight, 0, 0, 0);
|
292
|
+
free(mask);
|
293
|
+
free(weight);
|
294
|
+
free(data2);
|
295
|
+
free(data1);
|
296
|
+
|
297
|
+
return DBL2NUM(dist);
|
298
|
+
}
|
299
|
+
|
300
|
+
VALUE rb_euclid(VALUE self, VALUE vec1, VALUE vec2) {
|
301
|
+
return rb_distance(vec1, vec2, euclid);
|
302
|
+
}
|
303
|
+
|
304
|
+
VALUE rb_cityblock(VALUE self, VALUE vec1, VALUE vec2) {
|
305
|
+
return rb_distance(vec1, vec2, cityblock);
|
306
|
+
}
|
307
|
+
|
308
|
+
VALUE rb_correlation(VALUE self, VALUE vec1, VALUE vec2) {
|
309
|
+
return rb_distance(vec1, vec2, correlation);
|
310
|
+
}
|
311
|
+
|
312
|
+
VALUE rb_ucorrelation(VALUE self, VALUE vec1, VALUE vec2) {
|
313
|
+
return rb_distance(vec1, vec2, ucorrelation);
|
314
|
+
}
|
315
|
+
|
316
|
+
VALUE rb_acorrelation(VALUE self, VALUE vec1, VALUE vec2) {
|
317
|
+
return rb_distance(vec1, vec2, acorrelation);
|
318
|
+
}
|
319
|
+
|
320
|
+
VALUE rb_uacorrelation(VALUE self, VALUE vec1, VALUE vec2) {
|
321
|
+
return rb_distance(vec1, vec2, uacorrelation);
|
322
|
+
}
|
323
|
+
|
324
|
+
VALUE rb_spearman(VALUE self, VALUE vec1, VALUE vec2) {
|
325
|
+
return rb_distance(vec1, vec2, spearman);
|
326
|
+
}
|
327
|
+
|
328
|
+
VALUE rb_kendall(VALUE self, VALUE vec1, VALUE vec2) {
|
329
|
+
return rb_distance(vec1, vec2, kendall);
|
330
|
+
}
|
331
|
+
|
332
|
+
|
113
333
|
void Init_flock(void) {
|
114
334
|
mFlock = rb_define_module("Flock");
|
115
335
|
rb_define_module_function(mFlock, "kmeans", RUBY_METHOD_FUNC(rb_kmeans), -1);
|
336
|
+
rb_define_module_function(mFlock, "self_organizing_map", RUBY_METHOD_FUNC(rb_som), -1);
|
116
337
|
|
117
338
|
rb_define_const(mFlock, "METHOD_AVERAGE", INT2NUM('a'));
|
118
339
|
rb_define_const(mFlock, "METHOD_MEDIAN", INT2NUM('m'));
|
@@ -125,4 +346,13 @@ void Init_flock(void) {
|
|
125
346
|
rb_define_const(mFlock, "METRIC_ABSOLUTE_UNCENTERED_CORRELATION", INT2NUM('x'));
|
126
347
|
rb_define_const(mFlock, "METRIC_SPEARMAN", INT2NUM('s'));
|
127
348
|
rb_define_const(mFlock, "METRIC_KENDALL", INT2NUM('k'));
|
349
|
+
|
350
|
+
rb_define_module_function(mFlock, "euclidian_distance", RUBY_METHOD_FUNC(rb_euclid), 2);
|
351
|
+
rb_define_module_function(mFlock, "cityblock_distance", RUBY_METHOD_FUNC(rb_cityblock), 2);
|
352
|
+
rb_define_module_function(mFlock, "correlation_distance", RUBY_METHOD_FUNC(rb_correlation), 2);
|
353
|
+
rb_define_module_function(mFlock, "absolute_correlation_distance", RUBY_METHOD_FUNC(rb_acorrelation), 2);
|
354
|
+
rb_define_module_function(mFlock, "uncentered_correlation_distance", RUBY_METHOD_FUNC(rb_ucorrelation), 2);
|
355
|
+
rb_define_module_function(mFlock, "absolute_uncentered_correlation_distance", RUBY_METHOD_FUNC(rb_uacorrelation), 2);
|
356
|
+
rb_define_module_function(mFlock, "spearman_distance", RUBY_METHOD_FUNC(rb_spearman), 2);
|
357
|
+
rb_define_module_function(mFlock, "kendall_distance", RUBY_METHOD_FUNC(rb_kendall), 2);
|
128
358
|
}
|
data/flock.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{flock}
|
8
|
-
s.version = "0.2.1"
|
8
|
+
s.version = "0.3.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Bharanee Rathna"]
|
12
|
-
s.date = %q{2011-
|
12
|
+
s.date = %q{2011-04-24}
|
13
13
|
s.description = %q{A thin ruby binding to Cluster 3.0}
|
14
14
|
s.email = ["deepfryed@gmail.com"]
|
15
15
|
s.extensions = ["ext/extconf.rb"]
|
@@ -34,6 +34,7 @@ Gem::Specification.new do |s|
|
|
34
34
|
s.summary = %q{Ruby bindings to Cluster 3.0.}
|
35
35
|
s.test_files = [
|
36
36
|
"examples/sparse.rb",
|
37
|
+
"examples/som.rb",
|
37
38
|
"examples/dense.rb"
|
38
39
|
]
|
39
40
|
|
data/lib/flock.rb
CHANGED
@@ -21,7 +21,7 @@ module Flock
|
|
21
21
|
[dims,data]
|
22
22
|
end
|
23
23
|
|
24
|
-
def self.sparse_kmeans size, sparse_data, options={}
|
24
|
+
def self.sparse_kmeans size, sparse_data, options = {}
|
25
25
|
dims, data = sparse_data[0].kind_of?(Array) ? sparse_array_to_data(sparse_data) : sparse_hash_to_data(sparse_data)
|
26
26
|
|
27
27
|
if options.key?(:weights)
|
@@ -32,4 +32,16 @@ module Flock
|
|
32
32
|
|
33
33
|
kmeans(size, data, nil, options)
|
34
34
|
end
|
35
|
+
|
36
|
+
def self.sparse_self_organizing_map nx, ny, sparse_data, options = {}
|
37
|
+
dims, data = sparse_data[0].kind_of?(Array) ? sparse_array_to_data(sparse_data) : sparse_hash_to_data(sparse_data)
|
38
|
+
|
39
|
+
if options.key?(:weights)
|
40
|
+
weights = Array.new(dims.size) {1}
|
41
|
+
options[:weights].each {|k,v| weights[dims[k]] = v }
|
42
|
+
options[:weights] = weights
|
43
|
+
end
|
44
|
+
|
45
|
+
self_organizing_map(nx, ny, data, nil, options)
|
46
|
+
end
|
35
47
|
end
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
- 2
|
8
|
-
- 1
|
9
|
-
version: 0.2.1
|
7
|
+
- 3
|
8
|
+
- 0
|
9
|
+
version: 0.3.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Bharanee Rathna
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-
|
17
|
+
date: 2011-04-24 00:00:00 +10:00
|
18
18
|
default_executable:
|
19
19
|
dependencies: []
|
20
20
|
|
@@ -38,6 +38,7 @@ files:
|
|
38
38
|
- flock.gemspec
|
39
39
|
- lib/flock.rb
|
40
40
|
- examples/sparse.rb
|
41
|
+
- examples/som.rb
|
41
42
|
- examples/dense.rb
|
42
43
|
has_rdoc: true
|
43
44
|
homepage: http://github.com/deepfryed/flock
|
@@ -73,4 +74,5 @@ specification_version: 3
|
|
73
74
|
summary: Ruby bindings to Cluster 3.0.
|
74
75
|
test_files:
|
75
76
|
- examples/sparse.rb
|
77
|
+
- examples/som.rb
|
76
78
|
- examples/dense.rb
|