flock 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +9 -7
- data/VERSION +1 -1
- data/examples/som.rb +13 -0
- data/examples/sparse.rb +5 -5
- data/ext/cluster.c +20 -28
- data/ext/cluster.h +14 -4
- data/ext/flock.c +251 -21
- data/flock.gemspec +3 -2
- data/lib/flock.rb +13 -1
- metadata +6 -4
data/README.rdoc
CHANGED
@@ -76,17 +76,19 @@ Provides bindings to K-Means clustering in Cluster 3.0
|
|
76
76
|
require 'flock'
|
77
77
|
|
78
78
|
data = []
|
79
|
-
|
80
|
-
|
81
|
-
data << {
|
82
|
-
data << {
|
83
|
-
data << {
|
79
|
+
|
80
|
+
# keys don't need to be numeric
|
81
|
+
data << { 1 => 0.5, 2 => 0.5 }
|
82
|
+
data << { 3 => 1, 4 => 1 }
|
83
|
+
data << { 4 => 1, 5 => 0.3 }
|
84
|
+
data << { 2 => 0.75 }
|
85
|
+
data << { 1 => 0.60 }
|
84
86
|
|
85
87
|
pp Flock.sparse_kmeans(2, data)
|
86
88
|
|
87
|
-
# or even more simply (defaults to 1)
|
88
|
-
|
89
89
|
data = []
|
90
|
+
|
91
|
+
# a much simpler way to cluster text
|
90
92
|
data << %w(apple orange)
|
91
93
|
data << %w(black white)
|
92
94
|
data << %w(white cyan)
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.1
|
1
|
+
0.3.0
|
data/examples/som.rb
ADDED
data/examples/sparse.rb
CHANGED
@@ -4,11 +4,11 @@ require 'pp'
|
|
4
4
|
require 'flock'
|
5
5
|
|
6
6
|
data = []
|
7
|
-
data << {
|
8
|
-
data << {
|
9
|
-
data << {
|
10
|
-
data << {
|
11
|
-
data << {
|
7
|
+
data << { 1 => 0.5, 2 => 0.5 }
|
8
|
+
data << { 3 => 1, 4 => 1 }
|
9
|
+
data << { 4 => 1, 5 => 0.3 }
|
10
|
+
data << { 2 => 0.75 }
|
11
|
+
data << { 1 => 0.60 }
|
12
12
|
|
13
13
|
pp Flock.sparse_kmeans(2, data)
|
14
14
|
|
data/ext/cluster.c
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
* Human Genome Center, Institute of Medical Science, University of Tokyo,
|
6
6
|
* 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
|
7
7
|
* Contact: mdehoon 'AT' gsc.riken.jp
|
8
|
-
*
|
8
|
+
*
|
9
9
|
* Permission to use, copy, modify, and distribute this software and its
|
10
10
|
* documentation with or without modifications and for any purpose and
|
11
11
|
* without fee is hereby granted, provided that any copyright notices
|
@@ -14,7 +14,7 @@
|
|
14
14
|
* names of the contributors or copyright holders not be used in
|
15
15
|
* advertising or publicity pertaining to distribution of the software
|
16
16
|
* without specific prior permission.
|
17
|
-
*
|
17
|
+
*
|
18
18
|
* THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
|
19
19
|
* WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
|
20
20
|
* WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
|
@@ -23,7 +23,7 @@
|
|
23
23
|
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
|
24
24
|
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
|
25
25
|
* OR PERFORMANCE OF THIS SOFTWARE.
|
26
|
-
*
|
26
|
+
*
|
27
27
|
*/
|
28
28
|
|
29
29
|
#include <time.h>
|
@@ -334,7 +334,7 @@ static int svd(int m, int n, double** u, double w[], double** vt)
|
|
334
334
|
* A=usv of a real m by n rectangular matrix, where m is greater
|
335
335
|
* than or equal to n. Householder bidiagonalization and a variant
|
336
336
|
* of the QR algorithm are used.
|
337
|
-
*
|
337
|
+
*
|
338
338
|
*
|
339
339
|
* On input.
|
340
340
|
*
|
@@ -929,10 +929,9 @@ positive integer if the singular value decomposition fails to converge.
|
|
929
929
|
|
930
930
|
/* ********************************************************************* */
|
931
931
|
|
932
|
-
static
|
933
932
|
double euclid (int n, double** data1, double** data2, int** mask1, int** mask2,
|
934
933
|
const double weight[], int index1, int index2, int transpose)
|
935
|
-
|
934
|
+
|
936
935
|
/*
|
937
936
|
Purpose
|
938
937
|
=======
|
@@ -1004,7 +1003,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
|
|
1004
1003
|
|
1005
1004
|
/* ********************************************************************* */
|
1006
1005
|
|
1007
|
-
static
|
1008
1006
|
double cityblock (int n, double** data1, double** data2, int** mask1,
|
1009
1007
|
int** mask2, const double weight[], int index1, int index2, int transpose)
|
1010
1008
|
|
@@ -1080,7 +1078,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
|
|
1080
1078
|
|
1081
1079
|
/* ********************************************************************* */
|
1082
1080
|
|
1083
|
-
static
|
1084
1081
|
double correlation (int n, double** data1, double** data2, int** mask1,
|
1085
1082
|
int** mask2, const double weight[], int index1, int index2, int transpose)
|
1086
1083
|
/*
|
@@ -1180,7 +1177,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
|
|
1180
1177
|
|
1181
1178
|
/* ********************************************************************* */
|
1182
1179
|
|
1183
|
-
static
|
1184
1180
|
double acorrelation (int n, double** data1, double** data2, int** mask1,
|
1185
1181
|
int** mask2, const double weight[], int index1, int index2, int transpose)
|
1186
1182
|
/*
|
@@ -1279,7 +1275,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
|
|
1279
1275
|
|
1280
1276
|
/* ********************************************************************* */
|
1281
1277
|
|
1282
|
-
static
|
1283
1278
|
double ucorrelation (int n, double** data1, double** data2, int** mask1,
|
1284
1279
|
int** mask2, const double weight[], int index1, int index2, int transpose)
|
1285
1280
|
/*
|
@@ -1374,7 +1369,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
|
|
1374
1369
|
|
1375
1370
|
/* ********************************************************************* */
|
1376
1371
|
|
1377
|
-
static
|
1378
1372
|
double uacorrelation (int n, double** data1, double** data2, int** mask1,
|
1379
1373
|
int** mask2, const double weight[], int index1, int index2, int transpose)
|
1380
1374
|
/*
|
@@ -1469,7 +1463,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
|
|
1469
1463
|
|
1470
1464
|
/* ********************************************************************* */
|
1471
1465
|
|
1472
|
-
static
|
1473
1466
|
double spearman (int n, double** data1, double** data2, int** mask1,
|
1474
1467
|
int** mask2, const double weight[], int index1, int index2, int transpose)
|
1475
1468
|
/*
|
@@ -1597,7 +1590,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
|
|
1597
1590
|
|
1598
1591
|
/* ********************************************************************* */
|
1599
1592
|
|
1600
|
-
static
|
1601
1593
|
double kendall (int n, double** data1, double** data2, int** mask1, int** mask2,
|
1602
1594
|
const double weight[], int index1, int index2, int transpose)
|
1603
1595
|
/*
|
@@ -1708,7 +1700,7 @@ Otherwise, the distance between two columns in the matrix is calculated.
|
|
1708
1700
|
|
1709
1701
|
/* ********************************************************************* */
|
1710
1702
|
|
1711
|
-
static double(*setmetric(char dist))
|
1703
|
+
static double(*setmetric(char dist))
|
1712
1704
|
(int, double**, double**, int**, int**, const double[], int, int, int)
|
1713
1705
|
{ switch(dist)
|
1714
1706
|
{ case 'e': return &euclid;
|
@@ -2203,7 +2195,7 @@ calculating the medians.
|
|
2203
2195
|
}
|
2204
2196
|
}
|
2205
2197
|
}
|
2206
|
-
|
2198
|
+
|
2207
2199
|
/* ********************************************************************* */
|
2208
2200
|
|
2209
2201
|
int getclustercentroids(int nclusters, int nrows, int ncolumns,
|
@@ -2427,7 +2419,7 @@ kmeans(int nclusters, int nrows, int ncolumns, double** data, int** mask,
|
|
2427
2419
|
break; /* Identical solution found; break out of this loop */
|
2428
2420
|
}
|
2429
2421
|
|
2430
|
-
if (npass<=1)
|
2422
|
+
if (npass<=1)
|
2431
2423
|
{ *error = total;
|
2432
2424
|
break;
|
2433
2425
|
}
|
@@ -2532,7 +2524,7 @@ kmedians(int nclusters, int nrows, int ncolumns, double** data, int** mask,
|
|
2532
2524
|
break; /* Identical solution found; break out of this loop */
|
2533
2525
|
}
|
2534
2526
|
|
2535
|
-
if (npass<=1)
|
2527
|
+
if (npass<=1)
|
2536
2528
|
{ *error = total;
|
2537
2529
|
break;
|
2538
2530
|
}
|
@@ -2603,7 +2595,7 @@ of the matrix are clustered.
|
|
2603
2595
|
|
2604
2596
|
npass (input) int
|
2605
2597
|
The number of times clustering is performed. Clustering is performed npass
|
2606
|
-
times, each time starting from a different (random) initial assignment of
|
2598
|
+
times, each time starting from a different (random) initial assignment of
|
2607
2599
|
genes to clusters. The clustering solution with the lowest within-cluster sum
|
2608
2600
|
of distances is chosen.
|
2609
2601
|
If npass==0, then the clustering algorithm will be run once, where the initial
|
@@ -2697,7 +2689,7 @@ number of clusters is larger than the number of elements being clustered,
|
|
2697
2689
|
return;
|
2698
2690
|
}
|
2699
2691
|
}
|
2700
|
-
|
2692
|
+
|
2701
2693
|
if (method=='m')
|
2702
2694
|
{ double* cache = malloc(nelements*sizeof(double));
|
2703
2695
|
if(cache)
|
@@ -3105,7 +3097,7 @@ weights array, the function returns NULL.
|
|
3105
3097
|
|
3106
3098
|
/* ******************************************************************** */
|
3107
3099
|
|
3108
|
-
void cuttree (int nelements, Node* tree, int nclusters, int clusterid[])
|
3100
|
+
void cuttree (int nelements, Node* tree, int nclusters, int clusterid[])
|
3109
3101
|
|
3110
3102
|
/*
|
3111
3103
|
Purpose
|
@@ -3160,7 +3152,7 @@ error occured, all elements in clusterid are set to -1.
|
|
3160
3152
|
}
|
3161
3153
|
for (i = 0; i < n; i++) nodeid[i] = -1;
|
3162
3154
|
for (i = n-1; i >= 0; i--)
|
3163
|
-
{ if(nodeid[i]<0)
|
3155
|
+
{ if(nodeid[i]<0)
|
3164
3156
|
{ j = icluster;
|
3165
3157
|
nodeid[i] = j;
|
3166
3158
|
icluster++;
|
@@ -3269,7 +3261,7 @@ If a memory error occurs, pclcluster returns NULL.
|
|
3269
3261
|
if(!makedatamask(nelements, ndata, &newdata, &newmask))
|
3270
3262
|
{ free(result);
|
3271
3263
|
free(distid);
|
3272
|
-
return NULL;
|
3264
|
+
return NULL;
|
3273
3265
|
}
|
3274
3266
|
|
3275
3267
|
for (i = 0; i < nelements; i++) distid[i] = i;
|
@@ -3313,7 +3305,7 @@ If a memory error occurs, pclcluster returns NULL.
|
|
3313
3305
|
free(mask[is]);
|
3314
3306
|
data[is] = data[nnodes-inode];
|
3315
3307
|
mask[is] = mask[nnodes-inode];
|
3316
|
-
|
3308
|
+
|
3317
3309
|
/* Fix the distances */
|
3318
3310
|
distid[is] = distid[nnodes-inode];
|
3319
3311
|
for (i = 0; i < is; i++)
|
@@ -3334,7 +3326,7 @@ If a memory error occurs, pclcluster returns NULL.
|
|
3334
3326
|
free(data);
|
3335
3327
|
free(mask);
|
3336
3328
|
free(distid);
|
3337
|
-
|
3329
|
+
|
3338
3330
|
return result;
|
3339
3331
|
}
|
3340
3332
|
|
@@ -3829,7 +3821,7 @@ If a memory error occurs, treecluster returns NULL.
|
|
3829
3821
|
for (i = 1; i < nelements; i++) free(distmatrix[i]);
|
3830
3822
|
free (distmatrix);
|
3831
3823
|
}
|
3832
|
-
|
3824
|
+
|
3833
3825
|
return result;
|
3834
3826
|
}
|
3835
3827
|
|
@@ -4037,7 +4029,7 @@ void somworker (int nrows, int ncolumns, double** data, int** mask,
|
|
4037
4029
|
static
|
4038
4030
|
void somassign (int nrows, int ncolumns, double** data, int** mask,
|
4039
4031
|
const double weights[], int transpose, int nxgrid, int nygrid,
|
4040
|
-
double*** celldata, char dist, int clusterid
|
4032
|
+
double*** celldata, char dist, int **clusterid)
|
4041
4033
|
/* Collect clusterids */
|
4042
4034
|
{ const int ndata = (transpose==0) ? ncolumns : nrows;
|
4043
4035
|
int i,j;
|
@@ -4121,7 +4113,7 @@ void somassign (int nrows, int ncolumns, double** data, int** mask,
|
|
4121
4113
|
|
4122
4114
|
void somcluster (int nrows, int ncolumns, double** data, int** mask,
|
4123
4115
|
const double weight[], int transpose, int nxgrid, int nygrid,
|
4124
|
-
double inittau, int niter, char dist, double*** celldata, int clusterid
|
4116
|
+
double inittau, int niter, char dist, double*** celldata, int **clusterid)
|
4125
4117
|
/*
|
4126
4118
|
|
4127
4119
|
Purpose
|
@@ -4235,7 +4227,7 @@ somcluster.
|
|
4235
4227
|
double clusterdistance (int nrows, int ncolumns, double** data,
|
4236
4228
|
int** mask, double weight[], int n1, int n2, int index1[], int index2[],
|
4237
4229
|
char dist, char method, int transpose)
|
4238
|
-
|
4230
|
+
|
4239
4231
|
/*
|
4240
4232
|
Purpose
|
4241
4233
|
=======
|
data/ext/cluster.h
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
* Human Genome Center, Institute of Medical Science, University of Tokyo,
|
7
7
|
* 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
|
8
8
|
* Contact: mdehoon 'AT' gsc.riken.jp
|
9
|
-
*
|
9
|
+
*
|
10
10
|
* Permission to use, copy, modify, and distribute this software and its
|
11
11
|
* documentation with or without modifications and for any purpose and
|
12
12
|
* without fee is hereby granted, provided that any copyright notices
|
@@ -15,7 +15,7 @@
|
|
15
15
|
* names of the contributors or copyright holders not be used in
|
16
16
|
* advertising or publicity pertaining to distribution of the software
|
17
17
|
* without specific prior permission.
|
18
|
-
*
|
18
|
+
*
|
19
19
|
* THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
|
20
20
|
* WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
|
21
21
|
* WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
|
@@ -24,7 +24,7 @@
|
|
24
24
|
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
|
25
25
|
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
|
26
26
|
* OR PERFORMANCE OF THIS SOFTWARE.
|
27
|
-
*
|
27
|
+
*
|
28
28
|
*/
|
29
29
|
|
30
30
|
#ifndef min
|
@@ -79,7 +79,7 @@ void cuttree (int nelements, Node* tree, int nclusters, int clusterid[]);
|
|
79
79
|
void somcluster (int nrows, int ncolumns, double** data, int** mask,
|
80
80
|
const double weight[], int transpose, int nxnodes, int nynodes,
|
81
81
|
double inittau, int niter, char dist, double*** celldata,
|
82
|
-
int clusterid
|
82
|
+
int **clusterid);
|
83
83
|
|
84
84
|
/* Chapter 6 */
|
85
85
|
int pca(int m, int n, double** u, double** v, double* w);
|
@@ -91,3 +91,13 @@ double median (int n, double x[]);
|
|
91
91
|
|
92
92
|
double* calculate_weights(int nrows, int ncolumns, double** data, int** mask,
|
93
93
|
double weights[], int transpose, char dist, double cutoff, double exponent);
|
94
|
+
|
95
|
+
/* distance functions */
|
96
|
+
extern double euclid (int, double**, double**, int**, int**, const double [], int, int, int);
|
97
|
+
extern double cityblock(int, double**, double**, int**, int**, const double [], int, int, int);
|
98
|
+
extern double correlation(int, double**, double**, int**, int**, const double [], int, int, int);
|
99
|
+
extern double acorrelation(int, double**, double**, int**, int**, const double [], int, int, int);
|
100
|
+
extern double ucorrelation(int, double**, double**, int**, int**, const double [], int, int, int);
|
101
|
+
extern double uacorrelation(int, double**, double**, int**, int**, const double [], int, int, int);
|
102
|
+
extern double spearman(int, double**, double**, int**, int**, const double [], int, int, int);
|
103
|
+
extern double kendall(int, double**, double**, int**, int**, const double [], int, int, int);
|
data/ext/flock.c
CHANGED
@@ -5,6 +5,7 @@
|
|
5
5
|
#define CONST_GET(scope, constant) (rb_funcall(scope, ID_CONST_GET, 1, rb_str_new2(constant)))
|
6
6
|
|
7
7
|
static VALUE mFlock;
|
8
|
+
typedef double (*distance_fn)(int, double**, double**, int**, int**, const double [], int, int, int);
|
8
9
|
|
9
10
|
int opt_int_value(VALUE option, char *key, int def) {
|
10
11
|
if (NIL_P(option)) return def;
|
@@ -13,6 +14,13 @@ int opt_int_value(VALUE option, char *key, int def) {
|
|
13
14
|
return NIL_P(value) ? def : NUM2INT(value);
|
14
15
|
}
|
15
16
|
|
17
|
+
int opt_double_value(VALUE option, char *key, double def) {
|
18
|
+
if (NIL_P(option)) return def;
|
19
|
+
|
20
|
+
VALUE value = rb_hash_aref(option, ID2SYM(rb_intern(key)));
|
21
|
+
return NIL_P(value) ? def : NUM2DBL(value);
|
22
|
+
}
|
23
|
+
|
16
24
|
VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
|
17
25
|
VALUE size, data, mask, weights, options;
|
18
26
|
rb_scan_args(argc, argv, "22", &size, &data, &mask, &options);
|
@@ -26,6 +34,20 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
|
|
26
34
|
if (NIL_P(size) || NUM2INT(rb_Integer(size)) > RARRAY_LEN(data))
|
27
35
|
rb_raise(rb_eArgError, "size should be > 0 and <= data size");
|
28
36
|
|
37
|
+
int transpose = opt_int_value(options, "transpose", 0);
|
38
|
+
int npass = opt_int_value(options, "iterations", 1000);
|
39
|
+
// a = average, m = means
|
40
|
+
int method = opt_int_value(options, "method", 'a');
|
41
|
+
// e = euclidian,
|
42
|
+
// b = city-block distance
|
43
|
+
// c = correlation
|
44
|
+
// a = absolute value of the correlation
|
45
|
+
// u = uncentered correlation
|
46
|
+
// x = absolute uncentered correlation
|
47
|
+
// s = spearman's rank correlation
|
48
|
+
// k = kendall's tau
|
49
|
+
int dist = opt_int_value(options, "metric", 'e');
|
50
|
+
|
29
51
|
int i,j;
|
30
52
|
int nrows = RARRAY_LEN(data);
|
31
53
|
int ncols = RARRAY_LEN(rb_ary_entry(data, 0));
|
@@ -33,16 +55,14 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
|
|
33
55
|
|
34
56
|
double **cdata = (double**)malloc(sizeof(double*)*nrows);
|
35
57
|
int **cmask = (int **)malloc(sizeof(int *)*nrows);
|
36
|
-
double **ccentroid = (double**)malloc(sizeof(double*)*nrows);
|
37
|
-
int **ccentroid_mask = (int **)malloc(sizeof(int *)*nrows);
|
38
58
|
double *cweights = (double *)malloc(sizeof(double )*ncols);
|
39
|
-
|
59
|
+
|
60
|
+
double **ccentroid;
|
61
|
+
int *ccluster, **ccentroid_mask, dimx = nrows, dimy = ncols, cdimx = nsets, cdimy = ncols;
|
40
62
|
|
41
63
|
for (i = 0; i < nrows; i++) {
|
42
64
|
cdata[i] = (double*)malloc(sizeof(double)*ncols);
|
43
65
|
cmask[i] = (int *)malloc(sizeof(int )*ncols);
|
44
|
-
ccentroid[i] = (double*)malloc(sizeof(double)*ncols);
|
45
|
-
ccentroid_mask[i] = (int *)malloc(sizeof(int )*ncols);
|
46
66
|
for (j = 0; j < ncols; j++) {
|
47
67
|
cdata[i][j] = NUM2DBL(rb_Float(rb_ary_entry(rb_ary_entry(data, i), j)));
|
48
68
|
cmask[i][j] = NIL_P(mask) ? 1 : NUM2INT(rb_Integer(rb_ary_entry(rb_ary_entry(mask, i), j)));
|
@@ -54,25 +74,27 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
|
|
54
74
|
cweights[i] = NIL_P(weights) ? 1.0 : NUM2DBL(rb_Float(rb_ary_entry(weights, i)));
|
55
75
|
}
|
56
76
|
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
77
|
+
if (transpose) {
|
78
|
+
dimx = ncols;
|
79
|
+
dimy = nrows;
|
80
|
+
cdimx = nrows;
|
81
|
+
cdimy = nsets;
|
82
|
+
}
|
83
|
+
|
84
|
+
ccluster = (int *)malloc(sizeof(int )*dimx);
|
85
|
+
ccentroid = (double**)malloc(sizeof(double*)*cdimx);
|
86
|
+
ccentroid_mask = (int **)malloc(sizeof(int *)*cdimx);
|
87
|
+
|
88
|
+
for (i = 0; i < cdimx; i++) {
|
89
|
+
ccentroid[i] = (double*)malloc(sizeof(double)*cdimy);
|
90
|
+
ccentroid_mask[i] = (int *)malloc(sizeof(int )*cdimy);
|
91
|
+
}
|
70
92
|
|
71
93
|
int ifound;
|
72
94
|
double error;
|
95
|
+
|
73
96
|
kcluster(nsets,
|
74
97
|
nrows, ncols, cdata, cmask, cweights, transpose, npass, method, dist, ccluster, &error, &ifound);
|
75
|
-
|
76
98
|
getclustercentroids(nsets,
|
77
99
|
nrows, ncols, cdata, cmask, ccluster, ccentroid, ccentroid_mask, transpose, method);
|
78
100
|
|
@@ -80,10 +102,12 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
|
|
80
102
|
VALUE cluster = rb_ary_new();
|
81
103
|
VALUE centroid = rb_ary_new();
|
82
104
|
|
83
|
-
for (i = 0; i <
|
105
|
+
for (i = 0; i < dimx; i++)
|
84
106
|
rb_ary_push(cluster, INT2NUM(ccluster[i]));
|
107
|
+
|
108
|
+
for (i = 0; i < cdimx; i++) {
|
85
109
|
VALUE point = rb_ary_new();
|
86
|
-
for (j = 0; j <
|
110
|
+
for (j = 0; j < cdimy; j++)
|
87
111
|
rb_ary_push(point, DBL2NUM(ccentroid[i][j]));
|
88
112
|
rb_ary_push(centroid, point);
|
89
113
|
}
|
@@ -96,6 +120,9 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
|
|
96
120
|
for (i = 0; i < nrows; i++) {
|
97
121
|
free(cdata[i]);
|
98
122
|
free(cmask[i]);
|
123
|
+
}
|
124
|
+
|
125
|
+
for (i = 0; i < cdimx; i++) {
|
99
126
|
free(ccentroid[i]);
|
100
127
|
free(ccentroid_mask[i]);
|
101
128
|
}
|
@@ -110,9 +137,203 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
|
|
110
137
|
return result;
|
111
138
|
}
|
112
139
|
|
140
|
+
VALUE rb_som(int argc, VALUE *argv, VALUE self) {
|
141
|
+
VALUE nx, ny, data, mask, weights, options;
|
142
|
+
rb_scan_args(argc, argv, "32", &nx, &ny, &data, &mask, &options);
|
143
|
+
|
144
|
+
if (TYPE(data) != T_ARRAY)
|
145
|
+
rb_raise(rb_eArgError, "data should be an array of arrays");
|
146
|
+
|
147
|
+
if (!NIL_P(mask) && TYPE(mask) != T_ARRAY)
|
148
|
+
rb_raise(rb_eArgError, "mask should be an array of arrays");
|
149
|
+
|
150
|
+
if (NIL_P(nx) || NUM2INT(rb_Integer(nx)) <= 0)
|
151
|
+
rb_raise(rb_eArgError, "nx should be > 0");
|
152
|
+
|
153
|
+
if (NIL_P(ny) || NUM2INT(rb_Integer(ny)) <= 0)
|
154
|
+
rb_raise(rb_eArgError, "ny should be > 0");
|
155
|
+
|
156
|
+
int nxgrid = NUM2INT(rb_Integer(nx));
|
157
|
+
int nygrid = NUM2INT(rb_Integer(ny));
|
158
|
+
int transpose = opt_int_value(options, "transpose", 0);
|
159
|
+
int npass = opt_int_value(options, "iterations", 1000);
|
160
|
+
|
161
|
+
// e = euclidian,
|
162
|
+
// b = city-block distance
|
163
|
+
// c = correlation
|
164
|
+
// a = absolute value of the correlation
|
165
|
+
// u = uncentered correlation
|
166
|
+
// x = absolute uncentered correlation
|
167
|
+
// s = spearman's rank correlation
|
168
|
+
// k = kendall's tau
|
169
|
+
int dist = opt_int_value(options, "metric", 'e');
|
170
|
+
double tau = opt_double_value(options, "tau", 1.0);
|
171
|
+
|
172
|
+
int i, j, k;
|
173
|
+
int nrows = RARRAY_LEN(data);
|
174
|
+
int ncols = RARRAY_LEN(rb_ary_entry(data, 0));
|
175
|
+
|
176
|
+
double **cdata = (double**)malloc(sizeof(double*)*nrows);
|
177
|
+
int **cmask = (int **)malloc(sizeof(int *)*nrows);
|
178
|
+
double *cweights = (double *)malloc(sizeof(double )*ncols);
|
179
|
+
|
180
|
+
int **ccluster;
|
181
|
+
double ***ccelldata;
|
182
|
+
int dimx = nrows, dimy = ncols;
|
183
|
+
|
184
|
+
if (transpose) {
|
185
|
+
dimx = ncols;
|
186
|
+
dimy = nrows;
|
187
|
+
}
|
188
|
+
|
189
|
+
ccluster = (int **)malloc(sizeof(int*)*dimx);
|
190
|
+
for (i = 0; i < dimx; i++)
|
191
|
+
ccluster[i] = (int*)malloc(sizeof(int)*2);
|
192
|
+
|
193
|
+
for (i = 0; i < nrows; i++) {
|
194
|
+
cdata[i] = (double*)malloc(sizeof(double)*ncols);
|
195
|
+
cmask[i] = (int *)malloc(sizeof(int )*ncols);
|
196
|
+
for (j = 0; j < ncols; j++) {
|
197
|
+
cdata[i][j] = NUM2DBL(rb_Float(rb_ary_entry(rb_ary_entry(data, i), j)));
|
198
|
+
cmask[i][j] = NIL_P(mask) ? 1 : NUM2INT(rb_Integer(rb_ary_entry(rb_ary_entry(mask, i), j)));
|
199
|
+
}
|
200
|
+
}
|
201
|
+
|
202
|
+
weights = NIL_P(options) ? Qnil : rb_hash_aref(options, ID2SYM(rb_intern("weights")));
|
203
|
+
for (i = 0; i < ncols; i++) {
|
204
|
+
cweights[i] = NIL_P(weights) ? 1.0 : NUM2DBL(rb_Float(rb_ary_entry(weights, i)));
|
205
|
+
}
|
206
|
+
|
207
|
+
ccelldata = (double***)malloc(sizeof(double**)*nxgrid);
|
208
|
+
for (i = 0; i < nxgrid; i++) {
|
209
|
+
ccelldata[i] = (double **)malloc(sizeof(double*)*nygrid);
|
210
|
+
for (j = 0; j < nygrid; j++)
|
211
|
+
ccelldata[i][j] = (double *)malloc(sizeof(double)*dimy);
|
212
|
+
}
|
213
|
+
|
214
|
+
somcluster(nrows, ncols, cdata, cmask, cweights, transpose,
|
215
|
+
nxgrid, nygrid, tau, npass, dist, ccelldata, ccluster);
|
216
|
+
|
217
|
+
VALUE result = rb_hash_new();
|
218
|
+
VALUE cluster = rb_ary_new();
|
219
|
+
VALUE centroid = rb_ary_new();
|
220
|
+
|
221
|
+
for (i = 0; i < dimx; i++)
|
222
|
+
rb_ary_push(cluster, INT2NUM(ccluster[i][0] * nxgrid + ccluster[i][1]));
|
223
|
+
|
224
|
+
for (i = 0; i < nxgrid; i++) {
|
225
|
+
for (j = 0; j < nygrid; j++) {
|
226
|
+
VALUE point = rb_ary_new();
|
227
|
+
for (k = 0; k < dimy; k++)
|
228
|
+
rb_ary_push(point, DBL2NUM(ccelldata[i][j][k]));
|
229
|
+
rb_ary_push(centroid, point);
|
230
|
+
}
|
231
|
+
}
|
232
|
+
|
233
|
+
rb_hash_aset(result, ID2SYM(rb_intern("cluster")), cluster);
|
234
|
+
rb_hash_aset(result, ID2SYM(rb_intern("centroid")), centroid);
|
235
|
+
|
236
|
+
for (i = 0; i < nrows; i++) {
|
237
|
+
free(cdata[i]);
|
238
|
+
free(cmask[i]);
|
239
|
+
}
|
240
|
+
|
241
|
+
for (i = 0; i < dimx; i++)
|
242
|
+
free(ccluster[i]);
|
243
|
+
|
244
|
+
for (i = 0; i < nxgrid; i++) {
|
245
|
+
for (j = 0; j < nygrid; j++)
|
246
|
+
free(ccelldata[i][j]);
|
247
|
+
free(ccelldata[i]);
|
248
|
+
}
|
249
|
+
|
250
|
+
free(cdata);
|
251
|
+
free(cmask);
|
252
|
+
free(ccelldata);
|
253
|
+
free(cweights);
|
254
|
+
free(ccluster);
|
255
|
+
|
256
|
+
return result;
|
257
|
+
}
|
258
|
+
|
259
|
+
|
260
|
+
VALUE rb_distance(VALUE vec1, VALUE vec2, distance_fn fn) {
|
261
|
+
uint32_t size;
|
262
|
+
double *data1, *data2, *weight, dist;
|
263
|
+
int *mask, i;
|
264
|
+
|
265
|
+
if (TYPE(vec1) != T_ARRAY)
|
266
|
+
rb_raise(rb_eArgError, "vector1 should be an array");
|
267
|
+
|
268
|
+
if (TYPE(vec2) != T_ARRAY)
|
269
|
+
rb_raise(rb_eArgError, "vector2 should be an array");
|
270
|
+
|
271
|
+
size = RARRAY_LEN(vec1);
|
272
|
+
|
273
|
+
if (size != RARRAY_LEN(vec2))
|
274
|
+
rb_raise(rb_eArgError, "vector1 & vector2 dimensions mismatch");
|
275
|
+
|
276
|
+
if (size < 1)
|
277
|
+
rb_raise(rb_eArgError, "dimension should be greater than 0");
|
278
|
+
|
279
|
+
data1 = (double *)malloc(sizeof(double)*size);
|
280
|
+
data2 = (double *)malloc(sizeof(double)*size);
|
281
|
+
weight = (double *)malloc(sizeof(double)*size);
|
282
|
+
mask = (int *)malloc(sizeof(int)*size);
|
283
|
+
|
284
|
+
for (i = 0; i < size; i++) {
|
285
|
+
mask[i] = 1;
|
286
|
+
weight[i] = 1;
|
287
|
+
data1[i] = NUM2DBL(rb_ary_entry(vec1, i));
|
288
|
+
data2[i] = NUM2DBL(rb_ary_entry(vec2, i));
|
289
|
+
}
|
290
|
+
|
291
|
+
dist = fn(size, &data1, &data2, &mask, &mask, weight, 0, 0, 0);
|
292
|
+
free(mask);
|
293
|
+
free(weight);
|
294
|
+
free(data2);
|
295
|
+
free(data1);
|
296
|
+
|
297
|
+
return DBL2NUM(dist);
|
298
|
+
}
|
299
|
+
|
300
|
+
VALUE rb_euclid(VALUE self, VALUE vec1, VALUE vec2) {
|
301
|
+
return rb_distance(vec1, vec2, euclid);
|
302
|
+
}
|
303
|
+
|
304
|
+
VALUE rb_cityblock(VALUE self, VALUE vec1, VALUE vec2) {
|
305
|
+
return rb_distance(vec1, vec2, cityblock);
|
306
|
+
}
|
307
|
+
|
308
|
+
VALUE rb_correlation(VALUE self, VALUE vec1, VALUE vec2) {
|
309
|
+
return rb_distance(vec1, vec2, correlation);
|
310
|
+
}
|
311
|
+
|
312
|
+
VALUE rb_ucorrelation(VALUE self, VALUE vec1, VALUE vec2) {
|
313
|
+
return rb_distance(vec1, vec2, ucorrelation);
|
314
|
+
}
|
315
|
+
|
316
|
+
VALUE rb_acorrelation(VALUE self, VALUE vec1, VALUE vec2) {
|
317
|
+
return rb_distance(vec1, vec2, acorrelation);
|
318
|
+
}
|
319
|
+
|
320
|
+
VALUE rb_uacorrelation(VALUE self, VALUE vec1, VALUE vec2) {
|
321
|
+
return rb_distance(vec1, vec2, uacorrelation);
|
322
|
+
}
|
323
|
+
|
324
|
+
VALUE rb_spearman(VALUE self, VALUE vec1, VALUE vec2) {
|
325
|
+
return rb_distance(vec1, vec2, spearman);
|
326
|
+
}
|
327
|
+
|
328
|
+
VALUE rb_kendall(VALUE self, VALUE vec1, VALUE vec2) {
|
329
|
+
return rb_distance(vec1, vec2, kendall);
|
330
|
+
}
|
331
|
+
|
332
|
+
|
113
333
|
void Init_flock(void) {
|
114
334
|
mFlock = rb_define_module("Flock");
|
115
335
|
rb_define_module_function(mFlock, "kmeans", RUBY_METHOD_FUNC(rb_kmeans), -1);
|
336
|
+
rb_define_module_function(mFlock, "self_organizing_map", RUBY_METHOD_FUNC(rb_som), -1);
|
116
337
|
|
117
338
|
rb_define_const(mFlock, "METHOD_AVERAGE", INT2NUM('a'));
|
118
339
|
rb_define_const(mFlock, "METHOD_MEDIAN", INT2NUM('m'));
|
@@ -125,4 +346,13 @@ void Init_flock(void) {
|
|
125
346
|
rb_define_const(mFlock, "METRIC_ABSOLUTE_UNCENTERED_CORRELATION", INT2NUM('x'));
|
126
347
|
rb_define_const(mFlock, "METRIC_SPEARMAN", INT2NUM('s'));
|
127
348
|
rb_define_const(mFlock, "METRIC_KENDALL", INT2NUM('k'));
|
349
|
+
|
350
|
+
rb_define_module_function(mFlock, "euclidian_distance", RUBY_METHOD_FUNC(rb_euclid), 2);
|
351
|
+
rb_define_module_function(mFlock, "cityblock_distance", RUBY_METHOD_FUNC(rb_cityblock), 2);
|
352
|
+
rb_define_module_function(mFlock, "correlation_distance", RUBY_METHOD_FUNC(rb_correlation), 2);
|
353
|
+
rb_define_module_function(mFlock, "absolute_correlation_distance", RUBY_METHOD_FUNC(rb_acorrelation), 2);
|
354
|
+
rb_define_module_function(mFlock, "uncentered_correlation_distance", RUBY_METHOD_FUNC(rb_ucorrelation), 2);
|
355
|
+
rb_define_module_function(mFlock, "absolute_uncentered_correlation_distance", RUBY_METHOD_FUNC(rb_uacorrelation), 2);
|
356
|
+
rb_define_module_function(mFlock, "spearman_distance", RUBY_METHOD_FUNC(rb_spearman), 2);
|
357
|
+
rb_define_module_function(mFlock, "kendall_distance", RUBY_METHOD_FUNC(rb_kendall), 2);
|
128
358
|
}
|
data/flock.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{flock}
|
8
|
-
s.version = "0.2.1"
|
8
|
+
s.version = "0.3.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Bharanee Rathna"]
|
12
|
-
s.date = %q{2011-
|
12
|
+
s.date = %q{2011-04-24}
|
13
13
|
s.description = %q{A thin ruby binding to Cluster 3.0}
|
14
14
|
s.email = ["deepfryed@gmail.com"]
|
15
15
|
s.extensions = ["ext/extconf.rb"]
|
@@ -34,6 +34,7 @@ Gem::Specification.new do |s|
|
|
34
34
|
s.summary = %q{Ruby bindings to Cluster 3.0.}
|
35
35
|
s.test_files = [
|
36
36
|
"examples/sparse.rb",
|
37
|
+
"examples/som.rb",
|
37
38
|
"examples/dense.rb"
|
38
39
|
]
|
39
40
|
|
data/lib/flock.rb
CHANGED
@@ -21,7 +21,7 @@ module Flock
|
|
21
21
|
[dims,data]
|
22
22
|
end
|
23
23
|
|
24
|
-
def self.sparse_kmeans size, sparse_data, options={}
|
24
|
+
def self.sparse_kmeans size, sparse_data, options = {}
|
25
25
|
dims, data = sparse_data[0].kind_of?(Array) ? sparse_array_to_data(sparse_data) : sparse_hash_to_data(sparse_data)
|
26
26
|
|
27
27
|
if options.key?(:weights)
|
@@ -32,4 +32,16 @@ module Flock
|
|
32
32
|
|
33
33
|
kmeans(size, data, nil, options)
|
34
34
|
end
|
35
|
+
|
36
|
+
def self.sparse_self_organizing_map nx, ny, sparse_data, options = {}
|
37
|
+
dims, data = sparse_data[0].kind_of?(Array) ? sparse_array_to_data(sparse_data) : sparse_hash_to_data(sparse_data)
|
38
|
+
|
39
|
+
if options.key?(:weights)
|
40
|
+
weights = Array.new(dims.size) {1}
|
41
|
+
options[:weights].each {|k,v| weights[dims[k]] = v }
|
42
|
+
options[:weights] = weights
|
43
|
+
end
|
44
|
+
|
45
|
+
self_organizing_map(nx, ny, data, nil, options)
|
46
|
+
end
|
35
47
|
end
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
- 2
|
8
|
-
- 1
|
9
|
-
version: 0.2.1
|
7
|
+
- 3
|
8
|
+
- 0
|
9
|
+
version: 0.3.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Bharanee Rathna
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-
|
17
|
+
date: 2011-04-24 00:00:00 +10:00
|
18
18
|
default_executable:
|
19
19
|
dependencies: []
|
20
20
|
|
@@ -38,6 +38,7 @@ files:
|
|
38
38
|
- flock.gemspec
|
39
39
|
- lib/flock.rb
|
40
40
|
- examples/sparse.rb
|
41
|
+
- examples/som.rb
|
41
42
|
- examples/dense.rb
|
42
43
|
has_rdoc: true
|
43
44
|
homepage: http://github.com/deepfryed/flock
|
@@ -73,4 +74,5 @@ specification_version: 3
|
|
73
74
|
summary: Ruby bindings to Cluster 3.0.
|
74
75
|
test_files:
|
75
76
|
- examples/sparse.rb
|
77
|
+
- examples/som.rb
|
76
78
|
- examples/dense.rb
|