flock 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -76,17 +76,19 @@ Provides bindings to K-Means clustering in Cluster 3.0
76
76
  require 'flock'
77
77
 
78
78
  data = []
79
- data << { apple: 1, orange: 1 }
80
- data << { black: 1, white: 1 }
81
- data << { white: 1, cyan: 1 }
82
- data << { orange: 1 }
83
- data << { apple: 1 }
79
+
80
+ # keys don't need to be numeric
81
+ data << { 1 => 0.5, 2 => 0.5 }
82
+ data << { 3 => 1, 4 => 1 }
83
+ data << { 4 => 1, 5 => 0.3 }
84
+ data << { 2 => 0.75 }
85
+ data << { 1 => 0.60 }
84
86
 
85
87
  pp Flock.sparse_kmeans(2, data)
86
88
 
87
- # or even more simply (defaults to 1)
88
-
89
89
  data = []
90
+
91
+ # a much simpler way to cluster text
90
92
  data << %w(apple orange)
91
93
  data << %w(black white)
92
94
  data << %w(white cyan)
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.1
1
+ 0.3.0
@@ -0,0 +1,13 @@
1
#!/usr/bin/ruby

# Example: cluster a few word lists on a 2x2 self organizing map and
# pretty-print whatever Flock.sparse_self_organizing_map returns.

require 'pp'
require 'flock'

# Each entry is one sparse sample, given as the list of words it contains.
data = [
  %w(orange apple),
  %w(black white),
  %w(white cyan),
  %w(orange),
  %w(apple)
]

pp Flock.sparse_self_organizing_map(2, 2, data)
@@ -4,11 +4,11 @@ require 'pp'
4
4
  require 'flock'
5
5
 
6
6
  data = []
7
- data << { apple: 1, orange: 1 }
8
- data << { black: 1, white: 1 }
9
- data << { white: 1, cyan: 1 }
10
- data << { orange: 1 }
11
- data << { apple: 1 }
7
+ data << { 1 => 0.5, 2 => 0.5 }
8
+ data << { 3 => 1, 4 => 1 }
9
+ data << { 4 => 1, 5 => 0.3 }
10
+ data << { 2 => 0.75 }
11
+ data << { 1 => 0.60 }
12
12
 
13
13
  pp Flock.sparse_kmeans(2, data)
14
14
 
@@ -5,7 +5,7 @@
5
5
  * Human Genome Center, Institute of Medical Science, University of Tokyo,
6
6
  * 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
7
7
  * Contact: mdehoon 'AT' gsc.riken.jp
8
- *
8
+ *
9
9
  * Permission to use, copy, modify, and distribute this software and its
10
10
  * documentation with or without modifications and for any purpose and
11
11
  * without fee is hereby granted, provided that any copyright notices
@@ -14,7 +14,7 @@
14
14
  * names of the contributors or copyright holders not be used in
15
15
  * advertising or publicity pertaining to distribution of the software
16
16
  * without specific prior permission.
17
- *
17
+ *
18
18
  * THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
19
19
  * WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
20
20
  * WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
@@ -23,7 +23,7 @@
23
23
  * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
24
24
  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
25
25
  * OR PERFORMANCE OF THIS SOFTWARE.
26
- *
26
+ *
27
27
  */
28
28
 
29
29
  #include <time.h>
@@ -334,7 +334,7 @@ static int svd(int m, int n, double** u, double w[], double** vt)
334
334
  * A=usv of a real m by n rectangular matrix, where m is greater
335
335
  * than or equal to n. Householder bidiagonalization and a variant
336
336
  * of the QR algorithm are used.
337
- *
337
+ *
338
338
  *
339
339
  * On input.
340
340
  *
@@ -929,10 +929,9 @@ positive integer if the singular value decomposition fails to converge.
929
929
 
930
930
  /* ********************************************************************* */
931
931
 
932
- static
933
932
  double euclid (int n, double** data1, double** data2, int** mask1, int** mask2,
934
933
  const double weight[], int index1, int index2, int transpose)
935
-
934
+
936
935
  /*
937
936
  Purpose
938
937
  =======
@@ -1004,7 +1003,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
1004
1003
 
1005
1004
  /* ********************************************************************* */
1006
1005
 
1007
- static
1008
1006
  double cityblock (int n, double** data1, double** data2, int** mask1,
1009
1007
  int** mask2, const double weight[], int index1, int index2, int transpose)
1010
1008
 
@@ -1080,7 +1078,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
1080
1078
 
1081
1079
  /* ********************************************************************* */
1082
1080
 
1083
- static
1084
1081
  double correlation (int n, double** data1, double** data2, int** mask1,
1085
1082
  int** mask2, const double weight[], int index1, int index2, int transpose)
1086
1083
  /*
@@ -1180,7 +1177,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
1180
1177
 
1181
1178
  /* ********************************************************************* */
1182
1179
 
1183
- static
1184
1180
  double acorrelation (int n, double** data1, double** data2, int** mask1,
1185
1181
  int** mask2, const double weight[], int index1, int index2, int transpose)
1186
1182
  /*
@@ -1279,7 +1275,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
1279
1275
 
1280
1276
  /* ********************************************************************* */
1281
1277
 
1282
- static
1283
1278
  double ucorrelation (int n, double** data1, double** data2, int** mask1,
1284
1279
  int** mask2, const double weight[], int index1, int index2, int transpose)
1285
1280
  /*
@@ -1374,7 +1369,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
1374
1369
 
1375
1370
  /* ********************************************************************* */
1376
1371
 
1377
- static
1378
1372
  double uacorrelation (int n, double** data1, double** data2, int** mask1,
1379
1373
  int** mask2, const double weight[], int index1, int index2, int transpose)
1380
1374
  /*
@@ -1469,7 +1463,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
1469
1463
 
1470
1464
  /* ********************************************************************* */
1471
1465
 
1472
- static
1473
1466
  double spearman (int n, double** data1, double** data2, int** mask1,
1474
1467
  int** mask2, const double weight[], int index1, int index2, int transpose)
1475
1468
  /*
@@ -1597,7 +1590,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
1597
1590
 
1598
1591
  /* ********************************************************************* */
1599
1592
 
1600
- static
1601
1593
  double kendall (int n, double** data1, double** data2, int** mask1, int** mask2,
1602
1594
  const double weight[], int index1, int index2, int transpose)
1603
1595
  /*
@@ -1708,7 +1700,7 @@ Otherwise, the distance between two columns in the matrix is calculated.
1708
1700
 
1709
1701
  /* ********************************************************************* */
1710
1702
 
1711
- static double(*setmetric(char dist))
1703
+ static double(*setmetric(char dist))
1712
1704
  (int, double**, double**, int**, int**, const double[], int, int, int)
1713
1705
  { switch(dist)
1714
1706
  { case 'e': return &euclid;
@@ -2203,7 +2195,7 @@ calculating the medians.
2203
2195
  }
2204
2196
  }
2205
2197
  }
2206
-
2198
+
2207
2199
  /* ********************************************************************* */
2208
2200
 
2209
2201
  int getclustercentroids(int nclusters, int nrows, int ncolumns,
@@ -2427,7 +2419,7 @@ kmeans(int nclusters, int nrows, int ncolumns, double** data, int** mask,
2427
2419
  break; /* Identical solution found; break out of this loop */
2428
2420
  }
2429
2421
 
2430
- if (npass<=1)
2422
+ if (npass<=1)
2431
2423
  { *error = total;
2432
2424
  break;
2433
2425
  }
@@ -2532,7 +2524,7 @@ kmedians(int nclusters, int nrows, int ncolumns, double** data, int** mask,
2532
2524
  break; /* Identical solution found; break out of this loop */
2533
2525
  }
2534
2526
 
2535
- if (npass<=1)
2527
+ if (npass<=1)
2536
2528
  { *error = total;
2537
2529
  break;
2538
2530
  }
@@ -2603,7 +2595,7 @@ of the matrix are clustered.
2603
2595
 
2604
2596
  npass (input) int
2605
2597
  The number of times clustering is performed. Clustering is performed npass
2606
- times, each time starting from a different (random) initial assignment of
2598
+ times, each time starting from a different (random) initial assignment of
2607
2599
  genes to clusters. The clustering solution with the lowest within-cluster sum
2608
2600
  of distances is chosen.
2609
2601
  If npass==0, then the clustering algorithm will be run once, where the initial
@@ -2697,7 +2689,7 @@ number of clusters is larger than the number of elements being clustered,
2697
2689
  return;
2698
2690
  }
2699
2691
  }
2700
-
2692
+
2701
2693
  if (method=='m')
2702
2694
  { double* cache = malloc(nelements*sizeof(double));
2703
2695
  if(cache)
@@ -3105,7 +3097,7 @@ weights array, the function returns NULL.
3105
3097
 
3106
3098
  /* ******************************************************************** */
3107
3099
 
3108
- void cuttree (int nelements, Node* tree, int nclusters, int clusterid[])
3100
+ void cuttree (int nelements, Node* tree, int nclusters, int clusterid[])
3109
3101
 
3110
3102
  /*
3111
3103
  Purpose
@@ -3160,7 +3152,7 @@ error occured, all elements in clusterid are set to -1.
3160
3152
  }
3161
3153
  for (i = 0; i < n; i++) nodeid[i] = -1;
3162
3154
  for (i = n-1; i >= 0; i--)
3163
- { if(nodeid[i]<0)
3155
+ { if(nodeid[i]<0)
3164
3156
  { j = icluster;
3165
3157
  nodeid[i] = j;
3166
3158
  icluster++;
@@ -3269,7 +3261,7 @@ If a memory error occurs, pclcluster returns NULL.
3269
3261
  if(!makedatamask(nelements, ndata, &newdata, &newmask))
3270
3262
  { free(result);
3271
3263
  free(distid);
3272
- return NULL;
3264
+ return NULL;
3273
3265
  }
3274
3266
 
3275
3267
  for (i = 0; i < nelements; i++) distid[i] = i;
@@ -3313,7 +3305,7 @@ If a memory error occurs, pclcluster returns NULL.
3313
3305
  free(mask[is]);
3314
3306
  data[is] = data[nnodes-inode];
3315
3307
  mask[is] = mask[nnodes-inode];
3316
-
3308
+
3317
3309
  /* Fix the distances */
3318
3310
  distid[is] = distid[nnodes-inode];
3319
3311
  for (i = 0; i < is; i++)
@@ -3334,7 +3326,7 @@ If a memory error occurs, pclcluster returns NULL.
3334
3326
  free(data);
3335
3327
  free(mask);
3336
3328
  free(distid);
3337
-
3329
+
3338
3330
  return result;
3339
3331
  }
3340
3332
 
@@ -3829,7 +3821,7 @@ If a memory error occurs, treecluster returns NULL.
3829
3821
  for (i = 1; i < nelements; i++) free(distmatrix[i]);
3830
3822
  free (distmatrix);
3831
3823
  }
3832
-
3824
+
3833
3825
  return result;
3834
3826
  }
3835
3827
 
@@ -4037,7 +4029,7 @@ void somworker (int nrows, int ncolumns, double** data, int** mask,
4037
4029
  static
4038
4030
  void somassign (int nrows, int ncolumns, double** data, int** mask,
4039
4031
  const double weights[], int transpose, int nxgrid, int nygrid,
4040
- double*** celldata, char dist, int clusterid[][2])
4032
+ double*** celldata, char dist, int **clusterid)
4041
4033
  /* Collect clusterids */
4042
4034
  { const int ndata = (transpose==0) ? ncolumns : nrows;
4043
4035
  int i,j;
@@ -4121,7 +4113,7 @@ void somassign (int nrows, int ncolumns, double** data, int** mask,
4121
4113
 
4122
4114
  void somcluster (int nrows, int ncolumns, double** data, int** mask,
4123
4115
  const double weight[], int transpose, int nxgrid, int nygrid,
4124
- double inittau, int niter, char dist, double*** celldata, int clusterid[][2])
4116
+ double inittau, int niter, char dist, double*** celldata, int **clusterid)
4125
4117
  /*
4126
4118
 
4127
4119
  Purpose
@@ -4235,7 +4227,7 @@ somcluster.
4235
4227
  double clusterdistance (int nrows, int ncolumns, double** data,
4236
4228
  int** mask, double weight[], int n1, int n2, int index1[], int index2[],
4237
4229
  char dist, char method, int transpose)
4238
-
4230
+
4239
4231
  /*
4240
4232
  Purpose
4241
4233
  =======
@@ -6,7 +6,7 @@
6
6
  * Human Genome Center, Institute of Medical Science, University of Tokyo,
7
7
  * 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
8
8
  * Contact: mdehoon 'AT' gsc.riken.jp
9
- *
9
+ *
10
10
  * Permission to use, copy, modify, and distribute this software and its
11
11
  * documentation with or without modifications and for any purpose and
12
12
  * without fee is hereby granted, provided that any copyright notices
@@ -15,7 +15,7 @@
15
15
  * names of the contributors or copyright holders not be used in
16
16
  * advertising or publicity pertaining to distribution of the software
17
17
  * without specific prior permission.
18
- *
18
+ *
19
19
  * THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
20
20
  * WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
21
21
  * WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
@@ -24,7 +24,7 @@
24
24
  * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
25
25
  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
26
26
  * OR PERFORMANCE OF THIS SOFTWARE.
27
- *
27
+ *
28
28
  */
29
29
 
30
30
  #ifndef min
@@ -79,7 +79,7 @@ void cuttree (int nelements, Node* tree, int nclusters, int clusterid[]);
79
79
  void somcluster (int nrows, int ncolumns, double** data, int** mask,
80
80
  const double weight[], int transpose, int nxnodes, int nynodes,
81
81
  double inittau, int niter, char dist, double*** celldata,
82
- int clusterid[][2]);
82
+ int **clusterid);
83
83
 
84
84
  /* Chapter 6 */
85
85
  int pca(int m, int n, double** u, double** v, double* w);
@@ -91,3 +91,13 @@ double median (int n, double x[]);
91
91
 
92
92
  double* calculate_weights(int nrows, int ncolumns, double** data, int** mask,
93
93
  double weights[], int transpose, char dist, double cutoff, double exponent);
94
+
95
+ /* distance functions */
96
+ extern double euclid (int, double**, double**, int**, int**, const double [], int, int, int);
97
+ extern double cityblock(int, double**, double**, int**, int**, const double [], int, int, int);
98
+ extern double correlation(int, double**, double**, int**, int**, const double [], int, int, int);
99
+ extern double acorrelation(int, double**, double**, int**, int**, const double [], int, int, int);
100
+ extern double ucorrelation(int, double**, double**, int**, int**, const double [], int, int, int);
101
+ extern double uacorrelation(int, double**, double**, int**, int**, const double [], int, int, int);
102
+ extern double spearman(int, double**, double**, int**, int**, const double [], int, int, int);
103
+ extern double kendall(int, double**, double**, int**, int**, const double [], int, int, int);
@@ -5,6 +5,7 @@
5
5
  #define CONST_GET(scope, constant) (rb_funcall(scope, ID_CONST_GET, 1, rb_str_new2(constant)))
6
6
 
7
7
  static VALUE mFlock;
8
+ typedef double (*distance_fn)(int, double**, double**, int**, int**, const double [], int, int, int);
8
9
 
9
10
  int opt_int_value(VALUE option, char *key, int def) {
10
11
  if (NIL_P(option)) return def;
@@ -13,6 +14,13 @@ int opt_int_value(VALUE option, char *key, int def) {
13
14
  return NIL_P(value) ? def : NUM2INT(value);
14
15
  }
15
16
 
17
+ int opt_double_value(VALUE option, char *key, double def) {
18
+ if (NIL_P(option)) return def;
19
+
20
+ VALUE value = rb_hash_aref(option, ID2SYM(rb_intern(key)));
21
+ return NIL_P(value) ? def : NUM2DBL(value);
22
+ }
23
+
16
24
  VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
17
25
  VALUE size, data, mask, weights, options;
18
26
  rb_scan_args(argc, argv, "22", &size, &data, &mask, &options);
@@ -26,6 +34,20 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
26
34
  if (NIL_P(size) || NUM2INT(rb_Integer(size)) > RARRAY_LEN(data))
27
35
  rb_raise(rb_eArgError, "size should be > 0 and <= data size");
28
36
 
37
+ int transpose = opt_int_value(options, "transpose", 0);
38
+ int npass = opt_int_value(options, "iterations", 1000);
39
+ // a = average, m = median
40
+ int method = opt_int_value(options, "method", 'a');
41
+ // e = Euclidean,
42
+ // b = city-block distance
43
+ // c = correlation
44
+ // a = absolute value of the correlation
45
+ // u = uncentered correlation
46
+ // x = absolute uncentered correlation
47
+ // s = spearman's rank correlation
48
+ // k = kendall's tau
49
+ int dist = opt_int_value(options, "metric", 'e');
50
+
29
51
  int i,j;
30
52
  int nrows = RARRAY_LEN(data);
31
53
  int ncols = RARRAY_LEN(rb_ary_entry(data, 0));
@@ -33,16 +55,14 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
33
55
 
34
56
  double **cdata = (double**)malloc(sizeof(double*)*nrows);
35
57
  int **cmask = (int **)malloc(sizeof(int *)*nrows);
36
- double **ccentroid = (double**)malloc(sizeof(double*)*nrows);
37
- int **ccentroid_mask = (int **)malloc(sizeof(int *)*nrows);
38
58
  double *cweights = (double *)malloc(sizeof(double )*ncols);
39
- int *ccluster = (int *)malloc(sizeof(int )*nrows);
59
+
60
+ double **ccentroid;
61
+ int *ccluster, **ccentroid_mask, dimx = nrows, dimy = ncols, cdimx = nsets, cdimy = ncols;
40
62
 
41
63
  for (i = 0; i < nrows; i++) {
42
64
  cdata[i] = (double*)malloc(sizeof(double)*ncols);
43
65
  cmask[i] = (int *)malloc(sizeof(int )*ncols);
44
- ccentroid[i] = (double*)malloc(sizeof(double)*ncols);
45
- ccentroid_mask[i] = (int *)malloc(sizeof(int )*ncols);
46
66
  for (j = 0; j < ncols; j++) {
47
67
  cdata[i][j] = NUM2DBL(rb_Float(rb_ary_entry(rb_ary_entry(data, i), j)));
48
68
  cmask[i][j] = NIL_P(mask) ? 1 : NUM2INT(rb_Integer(rb_ary_entry(rb_ary_entry(mask, i), j)));
@@ -54,25 +74,27 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
54
74
  cweights[i] = NIL_P(weights) ? 1.0 : NUM2DBL(rb_Float(rb_ary_entry(weights, i)));
55
75
  }
56
76
 
57
- int transpose = opt_int_value(options, "transpose", 0);
58
- int npass = opt_int_value(options, "iterations", 1000);
59
- // a = average, m = means
60
- int method = opt_int_value(options, "method", 'a');
61
- // e = euclidian,
62
- // b = city-block distance
63
- // c = correlation
64
- // a = absolute value of the correlation
65
- // u = uncentered correlation
66
- // x = absolute uncentered correlation
67
- // s = spearman's rank correlation
68
- // k = kendall's tau
69
- int dist = opt_int_value(options, "metric", 'e');
77
+ if (transpose) {
78
+ dimx = ncols;
79
+ dimy = nrows;
80
+ cdimx = nrows;
81
+ cdimy = nsets;
82
+ }
83
+
84
+ ccluster = (int *)malloc(sizeof(int )*dimx);
85
+ ccentroid = (double**)malloc(sizeof(double*)*cdimx);
86
+ ccentroid_mask = (int **)malloc(sizeof(int *)*cdimx);
87
+
88
+ for (i = 0; i < cdimx; i++) {
89
+ ccentroid[i] = (double*)malloc(sizeof(double)*cdimy);
90
+ ccentroid_mask[i] = (int *)malloc(sizeof(int )*cdimy);
91
+ }
70
92
 
71
93
  int ifound;
72
94
  double error;
95
+
73
96
  kcluster(nsets,
74
97
  nrows, ncols, cdata, cmask, cweights, transpose, npass, method, dist, ccluster, &error, &ifound);
75
-
76
98
  getclustercentroids(nsets,
77
99
  nrows, ncols, cdata, cmask, ccluster, ccentroid, ccentroid_mask, transpose, method);
78
100
 
@@ -80,10 +102,12 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
80
102
  VALUE cluster = rb_ary_new();
81
103
  VALUE centroid = rb_ary_new();
82
104
 
83
- for (i = 0; i < nrows; i++) {
105
+ for (i = 0; i < dimx; i++)
84
106
  rb_ary_push(cluster, INT2NUM(ccluster[i]));
107
+
108
+ for (i = 0; i < cdimx; i++) {
85
109
  VALUE point = rb_ary_new();
86
- for (j = 0; j < ncols; j++)
110
+ for (j = 0; j < cdimy; j++)
87
111
  rb_ary_push(point, DBL2NUM(ccentroid[i][j]));
88
112
  rb_ary_push(centroid, point);
89
113
  }
@@ -96,6 +120,9 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
96
120
  for (i = 0; i < nrows; i++) {
97
121
  free(cdata[i]);
98
122
  free(cmask[i]);
123
+ }
124
+
125
+ for (i = 0; i < cdimx; i++) {
99
126
  free(ccentroid[i]);
100
127
  free(ccentroid_mask[i]);
101
128
  }
@@ -110,9 +137,203 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
110
137
  return result;
111
138
  }
112
139
 
140
+ VALUE rb_som(int argc, VALUE *argv, VALUE self) {
141
+ VALUE nx, ny, data, mask, weights, options;
142
+ rb_scan_args(argc, argv, "32", &nx, &ny, &data, &mask, &options);
143
+
144
+ if (TYPE(data) != T_ARRAY)
145
+ rb_raise(rb_eArgError, "data should be an array of arrays");
146
+
147
+ if (!NIL_P(mask) && TYPE(mask) != T_ARRAY)
148
+ rb_raise(rb_eArgError, "mask should be an array of arrays");
149
+
150
+ if (NIL_P(nx) || NUM2INT(rb_Integer(nx)) <= 0)
151
+ rb_raise(rb_eArgError, "nx should be > 0");
152
+
153
+ if (NIL_P(ny) || NUM2INT(rb_Integer(ny)) <= 0)
154
+ rb_raise(rb_eArgError, "ny should be > 0");
155
+
156
+ int nxgrid = NUM2INT(rb_Integer(nx));
157
+ int nygrid = NUM2INT(rb_Integer(ny));
158
+ int transpose = opt_int_value(options, "transpose", 0);
159
+ int npass = opt_int_value(options, "iterations", 1000);
160
+
161
+ // e = euclidian,
162
+ // b = city-block distance
163
+ // c = correlation
164
+ // a = absolute value of the correlation
165
+ // u = uncentered correlation
166
+ // x = absolute uncentered correlation
167
+ // s = spearman's rank correlation
168
+ // k = kendall's tau
169
+ int dist = opt_int_value(options, "metric", 'e');
170
+ double tau = opt_double_value(options, "tau", 1.0);
171
+
172
+ int i, j, k;
173
+ int nrows = RARRAY_LEN(data);
174
+ int ncols = RARRAY_LEN(rb_ary_entry(data, 0));
175
+
176
+ double **cdata = (double**)malloc(sizeof(double*)*nrows);
177
+ int **cmask = (int **)malloc(sizeof(int *)*nrows);
178
+ double *cweights = (double *)malloc(sizeof(double )*ncols);
179
+
180
+ int **ccluster;
181
+ double ***ccelldata;
182
+ int dimx = nrows, dimy = ncols;
183
+
184
+ if (transpose) {
185
+ dimx = ncols;
186
+ dimy = nrows;
187
+ }
188
+
189
+ ccluster = (int **)malloc(sizeof(int*)*dimx);
190
+ for (i = 0; i < dimx; i++)
191
+ ccluster[i] = (int*)malloc(sizeof(int)*2);
192
+
193
+ for (i = 0; i < nrows; i++) {
194
+ cdata[i] = (double*)malloc(sizeof(double)*ncols);
195
+ cmask[i] = (int *)malloc(sizeof(int )*ncols);
196
+ for (j = 0; j < ncols; j++) {
197
+ cdata[i][j] = NUM2DBL(rb_Float(rb_ary_entry(rb_ary_entry(data, i), j)));
198
+ cmask[i][j] = NIL_P(mask) ? 1 : NUM2INT(rb_Integer(rb_ary_entry(rb_ary_entry(mask, i), j)));
199
+ }
200
+ }
201
+
202
+ weights = NIL_P(options) ? Qnil : rb_hash_aref(options, ID2SYM(rb_intern("weights")));
203
+ for (i = 0; i < ncols; i++) {
204
+ cweights[i] = NIL_P(weights) ? 1.0 : NUM2DBL(rb_Float(rb_ary_entry(weights, i)));
205
+ }
206
+
207
+ ccelldata = (double***)malloc(sizeof(double**)*nxgrid);
208
+ for (i = 0; i < nxgrid; i++) {
209
+ ccelldata[i] = (double **)malloc(sizeof(double*)*nygrid);
210
+ for (j = 0; j < nygrid; j++)
211
+ ccelldata[i][j] = (double *)malloc(sizeof(double)*dimy);
212
+ }
213
+
214
+ somcluster(nrows, ncols, cdata, cmask, cweights, transpose,
215
+ nxgrid, nygrid, tau, npass, dist, ccelldata, ccluster);
216
+
217
+ VALUE result = rb_hash_new();
218
+ VALUE cluster = rb_ary_new();
219
+ VALUE centroid = rb_ary_new();
220
+
221
+ for (i = 0; i < dimx; i++)
222
+ rb_ary_push(cluster, INT2NUM(ccluster[i][0] * nxgrid + ccluster[i][1]));
223
+
224
+ for (i = 0; i < nxgrid; i++) {
225
+ for (j = 0; j < nygrid; j++) {
226
+ VALUE point = rb_ary_new();
227
+ for (k = 0; k < dimy; k++)
228
+ rb_ary_push(point, DBL2NUM(ccelldata[i][j][k]));
229
+ rb_ary_push(centroid, point);
230
+ }
231
+ }
232
+
233
+ rb_hash_aset(result, ID2SYM(rb_intern("cluster")), cluster);
234
+ rb_hash_aset(result, ID2SYM(rb_intern("centroid")), centroid);
235
+
236
+ for (i = 0; i < nrows; i++) {
237
+ free(cdata[i]);
238
+ free(cmask[i]);
239
+ }
240
+
241
+ for (i = 0; i < dimx; i++)
242
+ free(ccluster[i]);
243
+
244
+ for (i = 0; i < nxgrid; i++) {
245
+ for (j = 0; j < nygrid; j++)
246
+ free(ccelldata[i][j]);
247
+ free(ccelldata[i]);
248
+ }
249
+
250
+ free(cdata);
251
+ free(cmask);
252
+ free(ccelldata);
253
+ free(cweights);
254
+ free(ccluster);
255
+
256
+ return result;
257
+ }
258
+
259
+
260
+ VALUE rb_distance(VALUE vec1, VALUE vec2, distance_fn fn) {
261
+ uint32_t size;
262
+ double *data1, *data2, *weight, dist;
263
+ int *mask, i;
264
+
265
+ if (TYPE(vec1) != T_ARRAY)
266
+ rb_raise(rb_eArgError, "vector1 should be an array");
267
+
268
+ if (TYPE(vec2) != T_ARRAY)
269
+ rb_raise(rb_eArgError, "vector2 should be an array");
270
+
271
+ size = RARRAY_LEN(vec1);
272
+
273
+ if (size != RARRAY_LEN(vec2))
274
+ rb_raise(rb_eArgError, "vector1 & vector2 dimensions mismatch");
275
+
276
+ if (size < 1)
277
+ rb_raise(rb_eArgError, "dimension should be greater than 0");
278
+
279
+ data1 = (double *)malloc(sizeof(double)*size);
280
+ data2 = (double *)malloc(sizeof(double)*size);
281
+ weight = (double *)malloc(sizeof(double)*size);
282
+ mask = (int *)malloc(sizeof(int)*size);
283
+
284
+ for (i = 0; i < size; i++) {
285
+ mask[i] = 1;
286
+ weight[i] = 1;
287
+ data1[i] = NUM2DBL(rb_ary_entry(vec1, i));
288
+ data2[i] = NUM2DBL(rb_ary_entry(vec2, i));
289
+ }
290
+
291
+ dist = fn(size, &data1, &data2, &mask, &mask, weight, 0, 0, 0);
292
+ free(mask);
293
+ free(weight);
294
+ free(data2);
295
+ free(data1);
296
+
297
+ return DBL2NUM(dist);
298
+ }
299
+
300
+ VALUE rb_euclid(VALUE self, VALUE vec1, VALUE vec2) {
301
+ return rb_distance(vec1, vec2, euclid);
302
+ }
303
+
304
+ VALUE rb_cityblock(VALUE self, VALUE vec1, VALUE vec2) {
305
+ return rb_distance(vec1, vec2, cityblock);
306
+ }
307
+
308
+ VALUE rb_correlation(VALUE self, VALUE vec1, VALUE vec2) {
309
+ return rb_distance(vec1, vec2, correlation);
310
+ }
311
+
312
+ VALUE rb_ucorrelation(VALUE self, VALUE vec1, VALUE vec2) {
313
+ return rb_distance(vec1, vec2, ucorrelation);
314
+ }
315
+
316
+ VALUE rb_acorrelation(VALUE self, VALUE vec1, VALUE vec2) {
317
+ return rb_distance(vec1, vec2, acorrelation);
318
+ }
319
+
320
+ VALUE rb_uacorrelation(VALUE self, VALUE vec1, VALUE vec2) {
321
+ return rb_distance(vec1, vec2, uacorrelation);
322
+ }
323
+
324
+ VALUE rb_spearman(VALUE self, VALUE vec1, VALUE vec2) {
325
+ return rb_distance(vec1, vec2, spearman);
326
+ }
327
+
328
+ VALUE rb_kendall(VALUE self, VALUE vec1, VALUE vec2) {
329
+ return rb_distance(vec1, vec2, kendall);
330
+ }
331
+
332
+
113
333
  void Init_flock(void) {
114
334
  mFlock = rb_define_module("Flock");
115
335
  rb_define_module_function(mFlock, "kmeans", RUBY_METHOD_FUNC(rb_kmeans), -1);
336
+ rb_define_module_function(mFlock, "self_organizing_map", RUBY_METHOD_FUNC(rb_som), -1);
116
337
 
117
338
  rb_define_const(mFlock, "METHOD_AVERAGE", INT2NUM('a'));
118
339
  rb_define_const(mFlock, "METHOD_MEDIAN", INT2NUM('m'));
@@ -125,4 +346,13 @@ void Init_flock(void) {
125
346
  rb_define_const(mFlock, "METRIC_ABSOLUTE_UNCENTERED_CORRELATION", INT2NUM('x'));
126
347
  rb_define_const(mFlock, "METRIC_SPEARMAN", INT2NUM('s'));
127
348
  rb_define_const(mFlock, "METRIC_KENDALL", INT2NUM('k'));
349
+
350
+ rb_define_module_function(mFlock, "euclidian_distance", RUBY_METHOD_FUNC(rb_euclid), 2);
351
+ rb_define_module_function(mFlock, "cityblock_distance", RUBY_METHOD_FUNC(rb_cityblock), 2);
352
+ rb_define_module_function(mFlock, "correlation_distance", RUBY_METHOD_FUNC(rb_correlation), 2);
353
+ rb_define_module_function(mFlock, "absolute_correlation_distance", RUBY_METHOD_FUNC(rb_acorrelation), 2);
354
+ rb_define_module_function(mFlock, "uncentered_correlation_distance", RUBY_METHOD_FUNC(rb_ucorrelation), 2);
355
+ rb_define_module_function(mFlock, "absolute_uncentered_correlation_distance", RUBY_METHOD_FUNC(rb_uacorrelation), 2);
356
+ rb_define_module_function(mFlock, "spearman_distance", RUBY_METHOD_FUNC(rb_spearman), 2);
357
+ rb_define_module_function(mFlock, "kendall_distance", RUBY_METHOD_FUNC(rb_kendall), 2);
128
358
  }
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{flock}
8
- s.version = "0.2.1"
8
+ s.version = "0.3.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Bharanee Rathna"]
12
- s.date = %q{2011-02-19}
12
+ s.date = %q{2011-04-24}
13
13
  s.description = %q{A thin ruby binding to Cluster 3.0}
14
14
  s.email = ["deepfryed@gmail.com"]
15
15
  s.extensions = ["ext/extconf.rb"]
@@ -34,6 +34,7 @@ Gem::Specification.new do |s|
34
34
  s.summary = %q{Ruby bindings to Cluster 3.0.}
35
35
  s.test_files = [
36
36
  "examples/sparse.rb",
37
+ "examples/som.rb",
37
38
  "examples/dense.rb"
38
39
  ]
39
40
 
@@ -21,7 +21,7 @@ module Flock
21
21
  [dims,data]
22
22
  end
23
23
 
24
- def self.sparse_kmeans size, sparse_data, options={}
24
+ def self.sparse_kmeans size, sparse_data, options = {}
25
25
  dims, data = sparse_data[0].kind_of?(Array) ? sparse_array_to_data(sparse_data) : sparse_hash_to_data(sparse_data)
26
26
 
27
27
  if options.key?(:weights)
@@ -32,4 +32,16 @@ module Flock
32
32
 
33
33
  kmeans(size, data, nil, options)
34
34
  end
35
+
36
# Clusters sparse data on an nx by ny self organizing map.
#
# sparse_data is an array whose items are either arrays (converted with
# sparse_array_to_data) or hashes (converted with sparse_hash_to_data); the
# first item decides which conversion is used.
#
# options are handed on to self_organizing_map. When options[:weights] is
# present, each of its key/value pairs sets the weight of the dimension that
# key was assigned; dimensions that are not mentioned keep a weight of 1.
#
# The caller's options hash is never modified, so it can be reused for
# further calls.
#
# Returns whatever self_organizing_map returns.
def self.sparse_self_organizing_map nx, ny, sparse_data, options = {}
  dims, data = sparse_data[0].kind_of?(Array) ? sparse_array_to_data(sparse_data) : sparse_hash_to_data(sparse_data)

  if options.key?(:weights)
    weights = Array.new(dims.size) {1}
    options[:weights].each {|k,v| weights[dims[k]] = v }
    # merge builds a new hash: assigning into options would replace the
    # caller's sparse weights with the dense array and make a second call
    # with the same hash fail.
    options = options.merge(:weights => weights)
  end

  self_organizing_map(nx, ny, data, nil, options)
end
35
47
  end
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
- - 2
8
- - 1
9
- version: 0.2.1
7
+ - 3
8
+ - 0
9
+ version: 0.3.0
10
10
  platform: ruby
11
11
  authors:
12
12
  - Bharanee Rathna
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-02-19 00:00:00 +11:00
17
+ date: 2011-04-24 00:00:00 +10:00
18
18
  default_executable:
19
19
  dependencies: []
20
20
 
@@ -38,6 +38,7 @@ files:
38
38
  - flock.gemspec
39
39
  - lib/flock.rb
40
40
  - examples/sparse.rb
41
+ - examples/som.rb
41
42
  - examples/dense.rb
42
43
  has_rdoc: true
43
44
  homepage: http://github.com/deepfryed/flock
@@ -73,4 +74,5 @@ specification_version: 3
73
74
  summary: Ruby bindings to Cluster 3.0.
74
75
  test_files:
75
76
  - examples/sparse.rb
77
+ - examples/som.rb
76
78
  - examples/dense.rb